{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.998916106655105,
"eval_steps": 500,
"global_step": 2306,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008671146759158899,
"grad_norm": 5.864805612767453e+19,
"learning_rate": 1.2987012987012986e-06,
"loss": 1.5218,
"step": 1
},
{
"epoch": 0.004335573379579449,
"grad_norm": 0.5226513429859231,
"learning_rate": 6.493506493506493e-06,
"loss": 1.36,
"step": 5
},
{
"epoch": 0.008671146759158898,
"grad_norm": 0.7095719390962212,
"learning_rate": 1.2987012987012986e-05,
"loss": 1.342,
"step": 10
},
{
"epoch": 0.013006720138738348,
"grad_norm": 0.429950690175044,
"learning_rate": 1.9480519480519476e-05,
"loss": 1.3738,
"step": 15
},
{
"epoch": 0.017342293518317797,
"grad_norm": 0.25369452359038985,
"learning_rate": 2.5974025974025972e-05,
"loss": 1.2215,
"step": 20
},
{
"epoch": 0.021677866897897247,
"grad_norm": 14.366536861443981,
"learning_rate": 3.246753246753247e-05,
"loss": 1.2357,
"step": 25
},
{
"epoch": 0.026013440277476697,
"grad_norm": 0.2687159676086448,
"learning_rate": 3.896103896103895e-05,
"loss": 1.1712,
"step": 30
},
{
"epoch": 0.030349013657056147,
"grad_norm": 0.15608265987863726,
"learning_rate": 4.545454545454545e-05,
"loss": 1.1152,
"step": 35
},
{
"epoch": 0.03468458703663559,
"grad_norm": 0.11635461961055855,
"learning_rate": 5.1948051948051944e-05,
"loss": 1.1033,
"step": 40
},
{
"epoch": 0.03902016041621505,
"grad_norm": 0.11449246594254398,
"learning_rate": 5.8441558441558436e-05,
"loss": 1.0896,
"step": 45
},
{
"epoch": 0.04335573379579449,
"grad_norm": 0.11032771125077512,
"learning_rate": 6.493506493506494e-05,
"loss": 1.0795,
"step": 50
},
{
"epoch": 0.04769130717537394,
"grad_norm": 0.10732618594326016,
"learning_rate": 7.142857142857142e-05,
"loss": 1.1107,
"step": 55
},
{
"epoch": 0.05202688055495339,
"grad_norm": 0.11532043672084052,
"learning_rate": 7.79220779220779e-05,
"loss": 1.0941,
"step": 60
},
{
"epoch": 0.05636245393453284,
"grad_norm": 0.10484657398976702,
"learning_rate": 8.441558441558442e-05,
"loss": 1.0636,
"step": 65
},
{
"epoch": 0.06069802731411229,
"grad_norm": 0.11736262481626532,
"learning_rate": 9.09090909090909e-05,
"loss": 1.0939,
"step": 70
},
{
"epoch": 0.06503360069369174,
"grad_norm": 0.14275729093576736,
"learning_rate": 9.740259740259739e-05,
"loss": 1.0876,
"step": 75
},
{
"epoch": 0.06936917407327119,
"grad_norm": 0.15749002462194958,
"learning_rate": 0.00010389610389610389,
"loss": 1.073,
"step": 80
},
{
"epoch": 0.07370474745285063,
"grad_norm": 0.19946818080973713,
"learning_rate": 0.00011038961038961037,
"loss": 1.0935,
"step": 85
},
{
"epoch": 0.0780403208324301,
"grad_norm": 0.5992707412202727,
"learning_rate": 0.00011688311688311687,
"loss": 1.1826,
"step": 90
},
{
"epoch": 0.08237589421200954,
"grad_norm": 0.19153297117483123,
"learning_rate": 0.00012337662337662337,
"loss": 1.1079,
"step": 95
},
{
"epoch": 0.08671146759158899,
"grad_norm": 0.23132363172618897,
"learning_rate": 0.00012987012987012987,
"loss": 1.1509,
"step": 100
},
{
"epoch": 0.09104704097116843,
"grad_norm": 13.74435400048776,
"learning_rate": 0.00013636363636363634,
"loss": 1.271,
"step": 105
},
{
"epoch": 0.09538261435074788,
"grad_norm": 0.5226174167068346,
"learning_rate": 0.00014285714285714284,
"loss": 1.1902,
"step": 110
},
{
"epoch": 0.09971818773032734,
"grad_norm": 0.3629740083051462,
"learning_rate": 0.00014935064935064934,
"loss": 1.1643,
"step": 115
},
{
"epoch": 0.10405376110990679,
"grad_norm": 0.22433655763478755,
"learning_rate": 0.0001558441558441558,
"loss": 1.155,
"step": 120
},
{
"epoch": 0.10838933448948623,
"grad_norm": 0.1335411736809728,
"learning_rate": 0.0001623376623376623,
"loss": 1.1243,
"step": 125
},
{
"epoch": 0.11272490786906568,
"grad_norm": 0.20752640927016786,
"learning_rate": 0.00016883116883116884,
"loss": 1.1104,
"step": 130
},
{
"epoch": 0.11706048124864514,
"grad_norm": 0.12168629703712475,
"learning_rate": 0.0001753246753246753,
"loss": 1.114,
"step": 135
},
{
"epoch": 0.12139605462822459,
"grad_norm": 0.26232391688421475,
"learning_rate": 0.0001818181818181818,
"loss": 1.1204,
"step": 140
},
{
"epoch": 0.12573162800780402,
"grad_norm": 0.16457276786988467,
"learning_rate": 0.00018831168831168828,
"loss": 1.1442,
"step": 145
},
{
"epoch": 0.13006720138738348,
"grad_norm": 0.15538288562486247,
"learning_rate": 0.00019480519480519478,
"loss": 1.1111,
"step": 150
},
{
"epoch": 0.13440277476696294,
"grad_norm": 0.201009562048921,
"learning_rate": 0.0002012987012987013,
"loss": 1.1284,
"step": 155
},
{
"epoch": 0.13873834814654237,
"grad_norm": 0.10830435883952598,
"learning_rate": 0.00020779220779220778,
"loss": 1.13,
"step": 160
},
{
"epoch": 0.14307392152612183,
"grad_norm": 0.11463870554451829,
"learning_rate": 0.00021428571428571427,
"loss": 1.1146,
"step": 165
},
{
"epoch": 0.14740949490570127,
"grad_norm": 0.12316133988511195,
"learning_rate": 0.00022077922077922075,
"loss": 1.1238,
"step": 170
},
{
"epoch": 0.15174506828528073,
"grad_norm": 0.19826475153806197,
"learning_rate": 0.00022727272727272725,
"loss": 1.2731,
"step": 175
},
{
"epoch": 0.1560806416648602,
"grad_norm": 0.928027899524197,
"learning_rate": 0.00023376623376623374,
"loss": 1.1896,
"step": 180
},
{
"epoch": 0.16041621504443962,
"grad_norm": 67.14385763593648,
"learning_rate": 0.00024025974025974024,
"loss": 3.7592,
"step": 185
},
{
"epoch": 0.16475178842401908,
"grad_norm": 172.25626988972317,
"learning_rate": 0.00024675324675324674,
"loss": 6.1514,
"step": 190
},
{
"epoch": 0.1690873618035985,
"grad_norm": 8.384860694083399,
"learning_rate": 0.0002532467532467532,
"loss": 8.2367,
"step": 195
},
{
"epoch": 0.17342293518317797,
"grad_norm": 15.817512331978708,
"learning_rate": 0.00025974025974025974,
"loss": 10.8846,
"step": 200
},
{
"epoch": 0.17775850856275743,
"grad_norm": 3.4669948773987103,
"learning_rate": 0.0002662337662337662,
"loss": 9.8149,
"step": 205
},
{
"epoch": 0.18209408194233687,
"grad_norm": 6.211867615520024,
"learning_rate": 0.0002727272727272727,
"loss": 10.4091,
"step": 210
},
{
"epoch": 0.18642965532191633,
"grad_norm": 8.673657491918139,
"learning_rate": 0.0002792207792207792,
"loss": 10.3271,
"step": 215
},
{
"epoch": 0.19076522870149576,
"grad_norm": 2.503560404486804,
"learning_rate": 0.0002857142857142857,
"loss": 12.6414,
"step": 220
},
{
"epoch": 0.19510080208107522,
"grad_norm": 3.8638747380263703,
"learning_rate": 0.00029220779220779215,
"loss": 8.4926,
"step": 225
},
{
"epoch": 0.19943637546065468,
"grad_norm": 1.0101210010336394,
"learning_rate": 0.0002987012987012987,
"loss": 7.6948,
"step": 230
},
{
"epoch": 0.2037719488402341,
"grad_norm": 0.8467011565643021,
"learning_rate": 0.0002999972492985145,
"loss": 7.5188,
"step": 235
},
{
"epoch": 0.20810752221981357,
"grad_norm": 0.5245296390809255,
"learning_rate": 0.0002999860747466326,
"loss": 7.4533,
"step": 240
},
{
"epoch": 0.212443095599393,
"grad_norm": 0.42147554363918227,
"learning_rate": 0.0002999663050653897,
"loss": 7.4204,
"step": 245
},
{
"epoch": 0.21677866897897247,
"grad_norm": 0.38921879090695144,
"learning_rate": 0.00029993794138771085,
"loss": 7.4012,
"step": 250
},
{
"epoch": 0.22111424235855193,
"grad_norm": 0.2418087757057217,
"learning_rate": 0.0002999009853390101,
"loss": 7.3999,
"step": 255
},
{
"epoch": 0.22544981573813136,
"grad_norm": 0.35618630058079676,
"learning_rate": 0.0002998554390370975,
"loss": 7.3883,
"step": 260
},
{
"epoch": 0.22978538911771082,
"grad_norm": 0.18510696993654735,
"learning_rate": 0.0002998013050920577,
"loss": 7.3686,
"step": 265
},
{
"epoch": 0.23412096249729028,
"grad_norm": 0.21788844504555988,
"learning_rate": 0.0002997385866061005,
"loss": 7.3719,
"step": 270
},
{
"epoch": 0.2384565358768697,
"grad_norm": 0.2255775965595111,
"learning_rate": 0.00029966728717338294,
"loss": 7.3634,
"step": 275
},
{
"epoch": 0.24279210925644917,
"grad_norm": 0.21067089388376176,
"learning_rate": 0.0002995874108798032,
"loss": 7.3456,
"step": 280
},
{
"epoch": 0.2471276826360286,
"grad_norm": 0.1534328867763096,
"learning_rate": 0.00029949896230276675,
"loss": 7.3761,
"step": 285
},
{
"epoch": 0.25146325601560804,
"grad_norm": 0.12732733473622673,
"learning_rate": 0.000299401946510924,
"loss": 7.3546,
"step": 290
},
{
"epoch": 0.2557988293951875,
"grad_norm": 0.13550982122013763,
"learning_rate": 0.0002992963690638794,
"loss": 7.3462,
"step": 295
},
{
"epoch": 0.26013440277476696,
"grad_norm": 0.17601177463870957,
"learning_rate": 0.0002991822360118736,
"loss": 7.3682,
"step": 300
},
{
"epoch": 0.2644699761543464,
"grad_norm": 0.14421180444911577,
"learning_rate": 0.00029905955389543604,
"loss": 7.3557,
"step": 305
},
{
"epoch": 0.2688055495339259,
"grad_norm": 0.185975203761939,
"learning_rate": 0.00029892832974501044,
"loss": 7.356,
"step": 310
},
{
"epoch": 0.2731411229135053,
"grad_norm": 0.14867246915893478,
"learning_rate": 0.00029878857108055185,
"loss": 7.3347,
"step": 315
},
{
"epoch": 0.27747669629308475,
"grad_norm": 0.1334709517444074,
"learning_rate": 0.00029864028591109593,
"loss": 7.375,
"step": 320
},
{
"epoch": 0.2818122696726642,
"grad_norm": 0.1642667927001403,
"learning_rate": 0.00029848348273429947,
"loss": 7.3474,
"step": 325
},
{
"epoch": 0.28614784305224367,
"grad_norm": 0.17688345956299092,
"learning_rate": 0.0002983181705359541,
"loss": 7.3567,
"step": 330
},
{
"epoch": 0.2904834164318231,
"grad_norm": 0.16088931478744917,
"learning_rate": 0.00029814435878947076,
"loss": 7.3632,
"step": 335
},
{
"epoch": 0.29481898981140253,
"grad_norm": 0.19798325540549802,
"learning_rate": 0.000297962057455337,
"loss": 7.3831,
"step": 340
},
{
"epoch": 0.299154563190982,
"grad_norm": 0.1608484618987419,
"learning_rate": 0.0002977712769805465,
"loss": 7.3528,
"step": 345
},
{
"epoch": 0.30349013657056145,
"grad_norm": 0.1484369817347631,
"learning_rate": 0.00029757202829799986,
"loss": 7.3502,
"step": 350
},
{
"epoch": 0.3078257099501409,
"grad_norm": 0.22145679449709124,
"learning_rate": 0.0002973643228258784,
"loss": 7.3133,
"step": 355
},
{
"epoch": 0.3121612833297204,
"grad_norm": 0.2122717500901206,
"learning_rate": 0.0002971481724669898,
"loss": 7.3684,
"step": 360
},
{
"epoch": 0.3164968567092998,
"grad_norm": 0.14900310266228523,
"learning_rate": 0.0002969235896080861,
"loss": 7.3474,
"step": 365
},
{
"epoch": 0.32083243008887924,
"grad_norm": 0.17278164384767145,
"learning_rate": 0.0002966905871191534,
"loss": 7.3683,
"step": 370
},
{
"epoch": 0.3251680034684587,
"grad_norm": 0.20274199845375984,
"learning_rate": 0.0002964491783526749,
"loss": 7.3476,
"step": 375
},
{
"epoch": 0.32950357684803816,
"grad_norm": 0.13540893972237644,
"learning_rate": 0.00029619937714286547,
"loss": 7.3424,
"step": 380
},
{
"epoch": 0.3338391502276176,
"grad_norm": 0.16792713197591935,
"learning_rate": 0.0002959411978048787,
"loss": 7.3629,
"step": 385
},
{
"epoch": 0.338174723607197,
"grad_norm": 0.16598160402305692,
"learning_rate": 0.00029567465513398694,
"loss": 7.3435,
"step": 390
},
{
"epoch": 0.3425102969867765,
"grad_norm": 0.15743754774094762,
"learning_rate": 0.00029539976440473304,
"loss": 7.3405,
"step": 395
},
{
"epoch": 0.34684587036635595,
"grad_norm": 0.1928431563375656,
"learning_rate": 0.00029511654137005534,
"loss": 7.3398,
"step": 400
},
{
"epoch": 0.3511814437459354,
"grad_norm": 0.4234041342810784,
"learning_rate": 0.00029482500226038467,
"loss": 7.3163,
"step": 405
},
{
"epoch": 0.35551701712551487,
"grad_norm": 1.6964467666559244,
"learning_rate": 0.00029452516378271446,
"loss": 7.4424,
"step": 410
},
{
"epoch": 0.35985259050509427,
"grad_norm": 1.1952504992783122,
"learning_rate": 0.00029421704311964316,
"loss": 7.3051,
"step": 415
},
{
"epoch": 0.36418816388467373,
"grad_norm": 0.9910699747566197,
"learning_rate": 0.0002939006579283898,
"loss": 7.1588,
"step": 420
},
{
"epoch": 0.3685237372642532,
"grad_norm": 0.5010649163848268,
"learning_rate": 0.00029357602633978185,
"loss": 7.0579,
"step": 425
},
{
"epoch": 0.37285931064383265,
"grad_norm": 0.3223794058961508,
"learning_rate": 0.0002932431669572163,
"loss": 6.9952,
"step": 430
},
{
"epoch": 0.3771948840234121,
"grad_norm": 0.39385777708948017,
"learning_rate": 0.00029290209885559363,
"loss": 6.9317,
"step": 435
},
{
"epoch": 0.3815304574029915,
"grad_norm": 0.6906673045815623,
"learning_rate": 0.00029255284158022474,
"loss": 6.9197,
"step": 440
},
{
"epoch": 0.385866030782571,
"grad_norm": 0.5699230517220503,
"learning_rate": 0.00029219541514571075,
"loss": 6.9122,
"step": 445
},
{
"epoch": 0.39020160416215044,
"grad_norm": 0.3399949766360826,
"learning_rate": 0.00029182984003479613,
"loss": 6.8496,
"step": 450
},
{
"epoch": 0.3945371775417299,
"grad_norm": 0.46915620454457757,
"learning_rate": 0.00029145613719719484,
"loss": 6.8021,
"step": 455
},
{
"epoch": 0.39887275092130936,
"grad_norm": 0.7795294645969753,
"learning_rate": 0.0002910743280483899,
"loss": 6.7266,
"step": 460
},
{
"epoch": 0.40320832430088877,
"grad_norm": 1.1660550286252116,
"learning_rate": 0.00029068443446840606,
"loss": 6.8039,
"step": 465
},
{
"epoch": 0.4075438976804682,
"grad_norm": 1.9954850065496847,
"learning_rate": 0.0002902864788005559,
"loss": 6.7036,
"step": 470
},
{
"epoch": 0.4118794710600477,
"grad_norm": 1.3144540313281778,
"learning_rate": 0.00028988048385015955,
"loss": 6.6625,
"step": 475
},
{
"epoch": 0.41621504443962715,
"grad_norm": 0.5550135068925496,
"learning_rate": 0.00028946647288323766,
"loss": 6.5448,
"step": 480
},
{
"epoch": 0.4205506178192066,
"grad_norm": 0.30284368817332974,
"learning_rate": 0.0002890444696251783,
"loss": 6.4523,
"step": 485
},
{
"epoch": 0.424886191198786,
"grad_norm": 0.5328756605828856,
"learning_rate": 0.0002886144982593771,
"loss": 6.3727,
"step": 490
},
{
"epoch": 0.4292217645783655,
"grad_norm": 1.2843902165543528,
"learning_rate": 0.0002881765834258516,
"loss": 6.471,
"step": 495
},
{
"epoch": 0.43355733795794493,
"grad_norm": 0.7139034044101961,
"learning_rate": 0.00028773075021982917,
"loss": 6.3271,
"step": 500
},
{
"epoch": 0.4378929113375244,
"grad_norm": 0.41272252446113744,
"learning_rate": 0.00028727702419030883,
"loss": 6.2754,
"step": 505
},
{
"epoch": 0.44222848471710385,
"grad_norm": 0.24680608205710353,
"learning_rate": 0.00028681543133859716,
"loss": 6.1946,
"step": 510
},
{
"epoch": 0.4465640580966833,
"grad_norm": 0.9592076263299766,
"learning_rate": 0.0002863459981168184,
"loss": 6.1744,
"step": 515
},
{
"epoch": 0.4508996314762627,
"grad_norm": 2.2348898675903532,
"learning_rate": 0.0002858687514263983,
"loss": 6.112,
"step": 520
},
{
"epoch": 0.4552352048558422,
"grad_norm": 0.5214216239256872,
"learning_rate": 0.00028538371861652284,
"loss": 6.1034,
"step": 525
},
{
"epoch": 0.45957077823542164,
"grad_norm": 0.6212645654417902,
"learning_rate": 0.00028489092748257066,
"loss": 6.0164,
"step": 530
},
{
"epoch": 0.4639063516150011,
"grad_norm": 0.7124953084880589,
"learning_rate": 0.0002843904062645204,
"loss": 5.986,
"step": 535
},
{
"epoch": 0.46824192499458056,
"grad_norm": 0.3376451437228419,
"learning_rate": 0.0002838821836453323,
"loss": 5.9095,
"step": 540
},
{
"epoch": 0.47257749837415997,
"grad_norm": 2.464753993204931,
"learning_rate": 0.0002833662887493045,
"loss": 5.9207,
"step": 545
},
{
"epoch": 0.4769130717537394,
"grad_norm": 0.8049577940385247,
"learning_rate": 0.00028284275114040395,
"loss": 5.9179,
"step": 550
},
{
"epoch": 0.4812486451333189,
"grad_norm": 1.3390472579975088,
"learning_rate": 0.0002823116008205725,
"loss": 5.9107,
"step": 555
},
{
"epoch": 0.48558421851289835,
"grad_norm": 0.6905384313954396,
"learning_rate": 0.00028177286822800713,
"loss": 5.796,
"step": 560
},
{
"epoch": 0.4899197918924778,
"grad_norm": 2.8514904783073898,
"learning_rate": 0.0002812265842354162,
"loss": 5.7603,
"step": 565
},
{
"epoch": 0.4942553652720572,
"grad_norm": 1.5555920419031897,
"learning_rate": 0.0002806727801482498,
"loss": 5.9134,
"step": 570
},
{
"epoch": 0.4985909386516367,
"grad_norm": 0.8043247332715469,
"learning_rate": 0.000280111487702906,
"loss": 5.8254,
"step": 575
},
{
"epoch": 0.5029265120312161,
"grad_norm": 0.5751977544216561,
"learning_rate": 0.0002795427390649119,
"loss": 5.7081,
"step": 580
},
{
"epoch": 0.5072620854107955,
"grad_norm": 0.6822440874804724,
"learning_rate": 0.00027896656682708094,
"loss": 5.6121,
"step": 585
},
{
"epoch": 0.511597658790375,
"grad_norm": 0.8797162911577192,
"learning_rate": 0.0002783830040076444,
"loss": 5.5998,
"step": 590
},
{
"epoch": 0.5159332321699545,
"grad_norm": 0.4760553301829561,
"learning_rate": 0.0002777920840483596,
"loss": 5.5739,
"step": 595
},
{
"epoch": 0.5202688055495339,
"grad_norm": 1.2070114825643932,
"learning_rate": 0.0002771938408125936,
"loss": 5.539,
"step": 600
},
{
"epoch": 0.5246043789291134,
"grad_norm": 0.3589877318117702,
"learning_rate": 0.00027658830858338245,
"loss": 5.5504,
"step": 605
},
{
"epoch": 0.5289399523086928,
"grad_norm": 0.7944121659454099,
"learning_rate": 0.0002759755220614664,
"loss": 5.5072,
"step": 610
},
{
"epoch": 0.5332755256882723,
"grad_norm": 0.7202458729236075,
"learning_rate": 0.00027535551636330175,
"loss": 5.454,
"step": 615
},
{
"epoch": 0.5376110990678518,
"grad_norm": 2.106706884389399,
"learning_rate": 0.0002747283270190482,
"loss": 5.4935,
"step": 620
},
{
"epoch": 0.5419466724474312,
"grad_norm": 1.3364284227174121,
"learning_rate": 0.0002740939899705327,
"loss": 5.5994,
"step": 625
},
{
"epoch": 0.5462822458270106,
"grad_norm": 0.6407462262016212,
"learning_rate": 0.00027345254156918976,
"loss": 5.5447,
"step": 630
},
{
"epoch": 0.55061781920659,
"grad_norm": 0.7271335411877718,
"learning_rate": 0.00027280401857397854,
"loss": 5.4461,
"step": 635
},
{
"epoch": 0.5549533925861695,
"grad_norm": 0.9256391048561129,
"learning_rate": 0.0002721484581492762,
"loss": 5.3663,
"step": 640
},
{
"epoch": 0.559288965965749,
"grad_norm": 0.7708081453999853,
"learning_rate": 0.00027148589786274793,
"loss": 5.3796,
"step": 645
},
{
"epoch": 0.5636245393453284,
"grad_norm": 0.7777461095797896,
"learning_rate": 0.00027081637568319446,
"loss": 5.2963,
"step": 650
},
{
"epoch": 0.5679601127249079,
"grad_norm": 0.5329435487662679,
"learning_rate": 0.00027013992997837585,
"loss": 5.2219,
"step": 655
},
{
"epoch": 0.5722956861044873,
"grad_norm": 0.6524870578047196,
"learning_rate": 0.0002694565995128132,
"loss": 5.2601,
"step": 660
},
{
"epoch": 0.5766312594840668,
"grad_norm": 1.2050259366493623,
"learning_rate": 0.0002687664234455667,
"loss": 5.2788,
"step": 665
},
{
"epoch": 0.5809668328636463,
"grad_norm": 0.6205369626056482,
"learning_rate": 0.00026806944132799196,
"loss": 5.1169,
"step": 670
},
{
"epoch": 0.5853024062432257,
"grad_norm": 0.9580684303122623,
"learning_rate": 0.0002673656931014735,
"loss": 5.1311,
"step": 675
},
{
"epoch": 0.5896379796228051,
"grad_norm": 0.9227742616515185,
"learning_rate": 0.00026665521909513545,
"loss": 5.1194,
"step": 680
},
{
"epoch": 0.5939735530023845,
"grad_norm": 0.35677362077396757,
"learning_rate": 0.00026593806002353086,
"loss": 5.0662,
"step": 685
},
{
"epoch": 0.598309126381964,
"grad_norm": 0.5456219742352171,
"learning_rate": 0.0002652142569843083,
"loss": 4.9998,
"step": 690
},
{
"epoch": 0.6026446997615434,
"grad_norm": 2.0462321590818453,
"learning_rate": 0.0002644838514558568,
"loss": 5.0121,
"step": 695
},
{
"epoch": 0.6069802731411229,
"grad_norm": 1.2389697363524088,
"learning_rate": 0.00026374688529492887,
"loss": 4.9563,
"step": 700
},
{
"epoch": 0.6113158465207024,
"grad_norm": 0.6691025924761106,
"learning_rate": 0.0002630034007342416,
"loss": 4.9738,
"step": 705
},
{
"epoch": 0.6156514199002818,
"grad_norm": 0.6906310139155549,
"learning_rate": 0.00026225344038005707,
"loss": 4.9986,
"step": 710
},
{
"epoch": 0.6199869932798613,
"grad_norm": 0.9744576847003475,
"learning_rate": 0.00026149704720974004,
"loss": 4.9758,
"step": 715
},
{
"epoch": 0.6243225666594407,
"grad_norm": 1.4069257744518648,
"learning_rate": 0.0002607342645692955,
"loss": 4.9898,
"step": 720
},
{
"epoch": 0.6286581400390202,
"grad_norm": 1.2113043966119588,
"learning_rate": 0.0002599651361708846,
"loss": 4.947,
"step": 725
},
{
"epoch": 0.6329937134185996,
"grad_norm": 0.7232403329008882,
"learning_rate": 0.0002591897060903197,
"loss": 4.8734,
"step": 730
},
{
"epoch": 0.637329286798179,
"grad_norm": 0.5694346606113075,
"learning_rate": 0.0002584080187645384,
"loss": 4.8135,
"step": 735
},
{
"epoch": 0.6416648601777585,
"grad_norm": 0.4648121706598134,
"learning_rate": 0.00025762011898905723,
"loss": 4.8169,
"step": 740
},
{
"epoch": 0.6460004335573379,
"grad_norm": 0.57832813623837,
"learning_rate": 0.00025682605191540447,
"loss": 4.7676,
"step": 745
},
{
"epoch": 0.6503360069369174,
"grad_norm": 0.690363584833736,
"learning_rate": 0.00025602586304853265,
"loss": 4.7134,
"step": 750
},
{
"epoch": 0.6546715803164969,
"grad_norm": 0.2557886503788499,
"learning_rate": 0.000255219598244211,
"loss": 4.7075,
"step": 755
},
{
"epoch": 0.6590071536960763,
"grad_norm": 0.7529134213339467,
"learning_rate": 0.00025440730370639744,
"loss": 4.65,
"step": 760
},
{
"epoch": 0.6633427270756558,
"grad_norm": 0.5243957618805375,
"learning_rate": 0.00025358902598459097,
"loss": 4.6432,
"step": 765
},
{
"epoch": 0.6676783004552352,
"grad_norm": 0.8067461754194933,
"learning_rate": 0.00025276481197116397,
"loss": 4.6508,
"step": 770
},
{
"epoch": 0.6720138738348147,
"grad_norm": 1.2265176252828778,
"learning_rate": 0.00025193470889867505,
"loss": 4.6586,
"step": 775
},
{
"epoch": 0.676349447214394,
"grad_norm": 1.0124995665289962,
"learning_rate": 0.00025109876433716236,
"loss": 4.5788,
"step": 780
},
{
"epoch": 0.6806850205939735,
"grad_norm": 0.9858327008465355,
"learning_rate": 0.0002502570261914174,
"loss": 4.5459,
"step": 785
},
{
"epoch": 0.685020593973553,
"grad_norm": 0.675937090658665,
"learning_rate": 0.0002494095426982399,
"loss": 4.5489,
"step": 790
},
{
"epoch": 0.6893561673531324,
"grad_norm": 0.9300433002492945,
"learning_rate": 0.0002485563624236736,
"loss": 4.55,
"step": 795
},
{
"epoch": 0.6936917407327119,
"grad_norm": 0.5489664608896133,
"learning_rate": 0.0002476975342602229,
"loss": 4.4796,
"step": 800
},
{
"epoch": 0.6980273141122914,
"grad_norm": 0.826574591896581,
"learning_rate": 0.00024683310742405106,
"loss": 4.4609,
"step": 805
},
{
"epoch": 0.7023628874918708,
"grad_norm": 1.1037971961437545,
"learning_rate": 0.00024596313145216033,
"loss": 4.5026,
"step": 810
},
{
"epoch": 0.7066984608714503,
"grad_norm": 0.4898602321700531,
"learning_rate": 0.0002450876561995523,
"loss": 4.4346,
"step": 815
},
{
"epoch": 0.7110340342510297,
"grad_norm": 0.5951952065642176,
"learning_rate": 0.00024420673183637146,
"loss": 4.4397,
"step": 820
},
{
"epoch": 0.7153696076306092,
"grad_norm": 0.8199632042459121,
"learning_rate": 0.00024332040884503023,
"loss": 4.4169,
"step": 825
},
{
"epoch": 0.7197051810101885,
"grad_norm": 0.7484859334102708,
"learning_rate": 0.00024242873801731552,
"loss": 4.4214,
"step": 830
},
{
"epoch": 0.724040754389768,
"grad_norm": 0.3847206228382832,
"learning_rate": 0.0002415317704514785,
"loss": 4.4005,
"step": 835
},
{
"epoch": 0.7283763277693475,
"grad_norm": 0.25033596262216246,
"learning_rate": 0.0002406295575493061,
"loss": 4.2858,
"step": 840
},
{
"epoch": 0.7327119011489269,
"grad_norm": 0.6631031175545227,
"learning_rate": 0.00023972215101317545,
"loss": 4.2667,
"step": 845
},
{
"epoch": 0.7370474745285064,
"grad_norm": 0.9152978939845885,
"learning_rate": 0.00023880960284309116,
"loss": 4.2363,
"step": 850
},
{
"epoch": 0.7413830479080858,
"grad_norm": 1.6470126913016125,
"learning_rate": 0.000237891965333705,
"loss": 4.2788,
"step": 855
},
{
"epoch": 0.7457186212876653,
"grad_norm": 0.9797191599004809,
"learning_rate": 0.00023696929107131962,
"loss": 4.3022,
"step": 860
},
{
"epoch": 0.7500541946672448,
"grad_norm": 0.6420083064783988,
"learning_rate": 0.00023604163293087447,
"loss": 4.2127,
"step": 865
},
{
"epoch": 0.7543897680468242,
"grad_norm": 0.3430659331012604,
"learning_rate": 0.0002351090440729163,
"loss": 4.183,
"step": 870
},
{
"epoch": 0.7587253414264037,
"grad_norm": 0.5456932919947513,
"learning_rate": 0.00023417157794055233,
"loss": 4.1664,
"step": 875
},
{
"epoch": 0.763060914805983,
"grad_norm": 0.6297764897631516,
"learning_rate": 0.0002332292882563877,
"loss": 4.1577,
"step": 880
},
{
"epoch": 0.7673964881855625,
"grad_norm": 0.7190804662247191,
"learning_rate": 0.00023228222901944693,
"loss": 4.1005,
"step": 885
},
{
"epoch": 0.771732061565142,
"grad_norm": 0.7661623579650987,
"learning_rate": 0.00023133045450207952,
"loss": 4.1292,
"step": 890
},
{
"epoch": 0.7760676349447214,
"grad_norm": 1.0103277074141193,
"learning_rate": 0.00023037401924684946,
"loss": 4.1244,
"step": 895
},
{
"epoch": 0.7804032083243009,
"grad_norm": 0.6247976412624747,
"learning_rate": 0.0002294129780634101,
"loss": 4.1062,
"step": 900
},
{
"epoch": 0.7847387817038803,
"grad_norm": 0.6772595937883702,
"learning_rate": 0.00022844738602536275,
"loss": 4.0618,
"step": 905
},
{
"epoch": 0.7890743550834598,
"grad_norm": 0.6184206318420459,
"learning_rate": 0.00022747729846710085,
"loss": 4.0676,
"step": 910
},
{
"epoch": 0.7934099284630393,
"grad_norm": 0.638858720650207,
"learning_rate": 0.0002265027709806391,
"loss": 4.0643,
"step": 915
},
{
"epoch": 0.7977455018426187,
"grad_norm": 0.6957940244542513,
"learning_rate": 0.00022552385941242736,
"loss": 4.0841,
"step": 920
},
{
"epoch": 0.8020810752221982,
"grad_norm": 0.9335514321597355,
"learning_rate": 0.00022454061986015047,
"loss": 4.0154,
"step": 925
},
{
"epoch": 0.8064166486017775,
"grad_norm": 0.4531070372158566,
"learning_rate": 0.0002235531086695137,
"loss": 3.9897,
"step": 930
},
{
"epoch": 0.810752221981357,
"grad_norm": 0.9518970467477127,
"learning_rate": 0.00022256138243101337,
"loss": 3.9785,
"step": 935
},
{
"epoch": 0.8150877953609365,
"grad_norm": 0.7222836717307966,
"learning_rate": 0.00022156549797669434,
"loss": 3.9408,
"step": 940
},
{
"epoch": 0.8194233687405159,
"grad_norm": 0.4376277363109283,
"learning_rate": 0.00022056551237689277,
"loss": 3.9633,
"step": 945
},
{
"epoch": 0.8237589421200954,
"grad_norm": 0.40106634394410035,
"learning_rate": 0.00021956148293696584,
"loss": 3.9324,
"step": 950
},
{
"epoch": 0.8280945154996748,
"grad_norm": 0.4672307168059042,
"learning_rate": 0.00021855346719400787,
"loss": 3.9066,
"step": 955
},
{
"epoch": 0.8324300888792543,
"grad_norm": 0.9110683889580367,
"learning_rate": 0.00021754152291355284,
"loss": 3.8493,
"step": 960
},
{
"epoch": 0.8367656622588338,
"grad_norm": 0.8166157046078413,
"learning_rate": 0.0002165257080862643,
"loss": 3.8129,
"step": 965
},
{
"epoch": 0.8411012356384132,
"grad_norm": 0.5406920005139866,
"learning_rate": 0.00021550608092461208,
"loss": 3.8946,
"step": 970
},
{
"epoch": 0.8454368090179927,
"grad_norm": 0.9346807879316044,
"learning_rate": 0.00021448269985953634,
"loss": 3.8407,
"step": 975
},
{
"epoch": 0.849772382397572,
"grad_norm": 0.5449137337223598,
"learning_rate": 0.00021345562353709905,
"loss": 3.8459,
"step": 980
},
{
"epoch": 0.8541079557771515,
"grad_norm": 0.5017093393188926,
"learning_rate": 0.00021242491081512329,
"loss": 3.8334,
"step": 985
},
{
"epoch": 0.858443529156731,
"grad_norm": 0.3346538476585638,
"learning_rate": 0.00021139062075982038,
"loss": 3.7552,
"step": 990
},
{
"epoch": 0.8627791025363104,
"grad_norm": 0.3170992203885485,
"learning_rate": 0.00021035281264240491,
"loss": 3.7351,
"step": 995
},
{
"epoch": 0.8671146759158899,
"grad_norm": 0.6053179178799784,
"learning_rate": 0.00020931154593569813,
"loss": 3.7225,
"step": 1000
},
{
"epoch": 0.8714502492954693,
"grad_norm": 1.3058143067536492,
"learning_rate": 0.00020826688031072,
"loss": 3.7079,
"step": 1005
},
{
"epoch": 0.8757858226750488,
"grad_norm": 0.5266184524547373,
"learning_rate": 0.00020721887563326924,
"loss": 3.7352,
"step": 1010
},
{
"epoch": 0.8801213960546282,
"grad_norm": 0.420302054844465,
"learning_rate": 0.0002061675919604932,
"loss": 3.6589,
"step": 1015
},
{
"epoch": 0.8844569694342077,
"grad_norm": 0.7488296555492675,
"learning_rate": 0.00020511308953744578,
"loss": 3.6358,
"step": 1020
},
{
"epoch": 0.8887925428137872,
"grad_norm": 0.6601885350762652,
"learning_rate": 0.0002040554287936352,
"loss": 3.6682,
"step": 1025
},
{
"epoch": 0.8931281161933666,
"grad_norm": 0.3880230100654199,
"learning_rate": 0.000202994670339561,
"loss": 3.6391,
"step": 1030
},
{
"epoch": 0.897463689572946,
"grad_norm": 0.3917175363168505,
"learning_rate": 0.00020193087496324068,
"loss": 3.6016,
"step": 1035
},
{
"epoch": 0.9017992629525254,
"grad_norm": 0.46356273931086533,
"learning_rate": 0.00020086410362672608,
"loss": 3.5906,
"step": 1040
},
{
"epoch": 0.9061348363321049,
"grad_norm": 1.1143119522475413,
"learning_rate": 0.00019979441746261007,
"loss": 3.6533,
"step": 1045
},
{
"epoch": 0.9104704097116844,
"grad_norm": 0.9967538144738599,
"learning_rate": 0.0001987218777705231,
"loss": 3.6323,
"step": 1050
},
{
"epoch": 0.9148059830912638,
"grad_norm": 0.44096052107987527,
"learning_rate": 0.0001976465460136204,
"loss": 3.5632,
"step": 1055
},
{
"epoch": 0.9191415564708433,
"grad_norm": 0.2828790472438776,
"learning_rate": 0.0001965684838150598,
"loss": 3.5499,
"step": 1060
},
{
"epoch": 0.9234771298504227,
"grad_norm": 0.3974061570448302,
"learning_rate": 0.00019548775295447047,
"loss": 3.5173,
"step": 1065
},
{
"epoch": 0.9278127032300022,
"grad_norm": 0.33203900964680516,
"learning_rate": 0.00019440441536441202,
"loss": 3.514,
"step": 1070
},
{
"epoch": 0.9321482766095817,
"grad_norm": 0.5937750161188399,
"learning_rate": 0.00019331853312682613,
"loss": 3.4923,
"step": 1075
},
{
"epoch": 0.9364838499891611,
"grad_norm": 0.45882557948133224,
"learning_rate": 0.00019223016846947843,
"loss": 3.4693,
"step": 1080
},
{
"epoch": 0.9408194233687405,
"grad_norm": 0.49337347425087247,
"learning_rate": 0.00019113938376239247,
"loss": 3.4604,
"step": 1085
},
{
"epoch": 0.9451549967483199,
"grad_norm": 0.35941699726534343,
"learning_rate": 0.00019004624151427568,
"loss": 3.4682,
"step": 1090
},
{
"epoch": 0.9494905701278994,
"grad_norm": 0.30818796537571425,
"learning_rate": 0.0001889508043689372,
"loss": 3.4252,
"step": 1095
},
{
"epoch": 0.9538261435074789,
"grad_norm": 0.7001199109738344,
"learning_rate": 0.00018785313510169782,
"loss": 3.4065,
"step": 1100
},
{
"epoch": 0.9581617168870583,
"grad_norm": 0.9768195615514486,
"learning_rate": 0.0001867532966157929,
"loss": 3.4084,
"step": 1105
},
{
"epoch": 0.9624972902666378,
"grad_norm": 0.5379630797492526,
"learning_rate": 0.0001856513519387673,
"loss": 3.4402,
"step": 1110
},
{
"epoch": 0.9668328636462172,
"grad_norm": 0.6853314266411482,
"learning_rate": 0.0001845473642188637,
"loss": 3.411,
"step": 1115
},
{
"epoch": 0.9711684370257967,
"grad_norm": 0.28464215705198403,
"learning_rate": 0.00018344139672140384,
"loss": 3.396,
"step": 1120
},
{
"epoch": 0.9755040104053762,
"grad_norm": 0.3931004534338328,
"learning_rate": 0.00018233351282516283,
"loss": 3.3599,
"step": 1125
},
{
"epoch": 0.9798395837849556,
"grad_norm": 0.4836933366464829,
"learning_rate": 0.00018122377601873733,
"loss": 3.3365,
"step": 1130
},
{
"epoch": 0.984175157164535,
"grad_norm": 0.4500537170978018,
"learning_rate": 0.00018011224989690727,
"loss": 3.3036,
"step": 1135
},
{
"epoch": 0.9885107305441144,
"grad_norm": 0.2859298083164817,
"learning_rate": 0.00017899899815699134,
"loss": 3.2616,
"step": 1140
},
{
"epoch": 0.9928463039236939,
"grad_norm": 0.7548873266247055,
"learning_rate": 0.00017788408459519674,
"loss": 3.2599,
"step": 1145
},
{
"epoch": 0.9971818773032733,
"grad_norm": 0.32602384410109714,
"learning_rate": 0.00017676757310296356,
"loss": 3.2946,
"step": 1150
},
{
"epoch": 1.0008671146759158,
"grad_norm": 1.0724922966955157,
"learning_rate": 0.00017564952766330308,
"loss": 3.2325,
"step": 1155
},
{
"epoch": 1.0052026880554954,
"grad_norm": 0.3056819985122965,
"learning_rate": 0.00017453001234713107,
"loss": 3.2937,
"step": 1160
},
{
"epoch": 1.0095382614350747,
"grad_norm": 0.36438507475227383,
"learning_rate": 0.0001734090913095966,
"loss": 3.2332,
"step": 1165
},
{
"epoch": 1.0138738348146543,
"grad_norm": 0.30915725984980597,
"learning_rate": 0.00017228682878640508,
"loss": 3.2364,
"step": 1170
},
{
"epoch": 1.0182094081942337,
"grad_norm": 0.33647494500052355,
"learning_rate": 0.0001711632890901374,
"loss": 3.2003,
"step": 1175
},
{
"epoch": 1.0225449815738132,
"grad_norm": 0.312162857766132,
"learning_rate": 0.00017003853660656435,
"loss": 3.1807,
"step": 1180
},
{
"epoch": 1.0268805549533926,
"grad_norm": 0.28432133622360134,
"learning_rate": 0.00016891263579095698,
"loss": 3.1668,
"step": 1185
},
{
"epoch": 1.031216128332972,
"grad_norm": 0.27204342913825097,
"learning_rate": 0.0001677856511643928,
"loss": 3.1283,
"step": 1190
},
{
"epoch": 1.0355517017125515,
"grad_norm": 0.47073942395246565,
"learning_rate": 0.00016665764731005838,
"loss": 3.0741,
"step": 1195
},
{
"epoch": 1.0398872750921309,
"grad_norm": 0.48701637427424527,
"learning_rate": 0.0001655286888695484,
"loss": 3.079,
"step": 1200
},
{
"epoch": 1.0442228484717104,
"grad_norm": 0.43725300587861615,
"learning_rate": 0.0001643988405391612,
"loss": 3.1095,
"step": 1205
},
{
"epoch": 1.0485584218512898,
"grad_norm": 0.4519430510361337,
"learning_rate": 0.00016326816706619136,
"loss": 3.0779,
"step": 1210
},
{
"epoch": 1.0528939952308694,
"grad_norm": 0.3697980433575495,
"learning_rate": 0.00016213673324521913,
"loss": 3.1321,
"step": 1215
},
{
"epoch": 1.0572295686104487,
"grad_norm": 0.3772355822034261,
"learning_rate": 0.00016100460391439749,
"loss": 3.0517,
"step": 1220
},
{
"epoch": 1.0615651419900283,
"grad_norm": 0.3795892280711555,
"learning_rate": 0.0001598718439517364,
"loss": 3.0278,
"step": 1225
},
{
"epoch": 1.0659007153696076,
"grad_norm": 0.2889179402680778,
"learning_rate": 0.0001587385182713849,
"loss": 3.0402,
"step": 1230
},
{
"epoch": 1.0702362887491872,
"grad_norm": 0.3755593221213261,
"learning_rate": 0.0001576046918199112,
"loss": 2.994,
"step": 1235
},
{
"epoch": 1.0745718621287665,
"grad_norm": 0.3476317696753179,
"learning_rate": 0.0001564704295725808,
"loss": 3.0468,
"step": 1240
},
{
"epoch": 1.078907435508346,
"grad_norm": 0.32852407166347947,
"learning_rate": 0.00015533579652963288,
"loss": 2.9539,
"step": 1245
},
{
"epoch": 1.0832430088879255,
"grad_norm": 0.1822141394179994,
"learning_rate": 0.00015420085771255566,
"loss": 3.0026,
"step": 1250
},
{
"epoch": 1.0875785822675048,
"grad_norm": 0.270319630849222,
"learning_rate": 0.00015306567816036006,
"loss": 2.976,
"step": 1255
},
{
"epoch": 1.0919141556470844,
"grad_norm": 0.5149999760979279,
"learning_rate": 0.00015193032292585247,
"loss": 2.9326,
"step": 1260
},
{
"epoch": 1.0962497290266637,
"grad_norm": 0.4445856562229245,
"learning_rate": 0.00015079485707190717,
"loss": 2.9483,
"step": 1265
},
{
"epoch": 1.1005853024062433,
"grad_norm": 0.5660752187512902,
"learning_rate": 0.00014965934566773753,
"loss": 2.9209,
"step": 1270
},
{
"epoch": 1.1049208757858227,
"grad_norm": 0.36107453692493174,
"learning_rate": 0.00014852385378516712,
"loss": 2.9059,
"step": 1275
},
{
"epoch": 1.1092564491654022,
"grad_norm": 0.8307960065897146,
"learning_rate": 0.00014738844649490106,
"loss": 2.9135,
"step": 1280
},
{
"epoch": 1.1135920225449816,
"grad_norm": 0.32930556448154574,
"learning_rate": 0.0001462531888627966,
"loss": 2.931,
"step": 1285
},
{
"epoch": 1.117927595924561,
"grad_norm": 0.45094251331146196,
"learning_rate": 0.00014511814594613461,
"loss": 2.8794,
"step": 1290
},
{
"epoch": 1.1222631693041405,
"grad_norm": 1.1023315045514466,
"learning_rate": 0.00014398338278989167,
"loss": 2.8964,
"step": 1295
},
{
"epoch": 1.1265987426837198,
"grad_norm": 0.6884577647184886,
"learning_rate": 0.00014284896442301218,
"loss": 2.9186,
"step": 1300
},
{
"epoch": 1.1309343160632994,
"grad_norm": 0.6853011714731221,
"learning_rate": 0.00014171495585468195,
"loss": 2.9093,
"step": 1305
},
{
"epoch": 1.1352698894428788,
"grad_norm": 0.4444521764738651,
"learning_rate": 0.000140581422070603,
"loss": 2.8949,
"step": 1310
},
{
"epoch": 1.1396054628224583,
"grad_norm": 0.36607451431757926,
"learning_rate": 0.00013944842802926904,
"loss": 2.8727,
"step": 1315
},
{
"epoch": 1.1439410362020377,
"grad_norm": 0.20621050239222513,
"learning_rate": 0.00013831603865824328,
"loss": 2.8068,
"step": 1320
},
{
"epoch": 1.1482766095816173,
"grad_norm": 0.29548591941866714,
"learning_rate": 0.00013718431885043772,
"loss": 2.8033,
"step": 1325
},
{
"epoch": 1.1526121829611966,
"grad_norm": 0.3480007759133749,
"learning_rate": 0.000136053333460394,
"loss": 2.8303,
"step": 1330
},
{
"epoch": 1.156947756340776,
"grad_norm": 0.2962266371943945,
"learning_rate": 0.0001349231473005673,
"loss": 2.7893,
"step": 1335
},
{
"epoch": 1.1612833297203555,
"grad_norm": 0.23448406684507436,
"learning_rate": 0.00013379382513761175,
"loss": 2.7797,
"step": 1340
},
{
"epoch": 1.1656189030999349,
"grad_norm": 0.3234043699614117,
"learning_rate": 0.00013266543168866934,
"loss": 2.7607,
"step": 1345
},
{
"epoch": 1.1699544764795144,
"grad_norm": 0.3404947812121631,
"learning_rate": 0.0001315380316176609,
"loss": 2.7567,
"step": 1350
},
{
"epoch": 1.1742900498590938,
"grad_norm": 0.6066048669028199,
"learning_rate": 0.0001304116895315805,
"loss": 2.7501,
"step": 1355
},
{
"epoch": 1.1786256232386734,
"grad_norm": 0.44548332421799974,
"learning_rate": 0.00012928646997679326,
"loss": 2.7475,
"step": 1360
},
{
"epoch": 1.1829611966182527,
"grad_norm": 0.3939467687702201,
"learning_rate": 0.00012816243743533624,
"loss": 2.7117,
"step": 1365
},
{
"epoch": 1.1872967699978323,
"grad_norm": 0.3011307288220261,
"learning_rate": 0.00012703965632122327,
"loss": 2.7543,
"step": 1370
},
{
"epoch": 1.1916323433774116,
"grad_norm": 0.22501315848677994,
"learning_rate": 0.00012591819097675382,
"loss": 2.7462,
"step": 1375
},
{
"epoch": 1.1959679167569912,
"grad_norm": 0.32722976515069685,
"learning_rate": 0.0001247981056688254,
"loss": 2.6968,
"step": 1380
},
{
"epoch": 1.2003034901365706,
"grad_norm": 0.1590488680202715,
"learning_rate": 0.00012367946458525099,
"loss": 2.7045,
"step": 1385
},
{
"epoch": 1.20463906351615,
"grad_norm": 0.3152582769975424,
"learning_rate": 0.00012256233183108068,
"loss": 2.6789,
"step": 1390
},
{
"epoch": 1.2089746368957295,
"grad_norm": 0.22965781079535302,
"learning_rate": 0.00012144677142492789,
"loss": 2.7101,
"step": 1395
},
{
"epoch": 1.2133102102753088,
"grad_norm": 0.4684583447317611,
"learning_rate": 0.00012033284729530057,
"loss": 2.6259,
"step": 1400
},
{
"epoch": 1.2176457836548884,
"grad_norm": 0.22363991138693204,
"learning_rate": 0.00011922062327693832,
"loss": 2.6717,
"step": 1405
},
{
"epoch": 1.2219813570344678,
"grad_norm": 0.33497001761272166,
"learning_rate": 0.00011811016310715355,
"loss": 2.6517,
"step": 1410
},
{
"epoch": 1.2263169304140473,
"grad_norm": 0.20548859646502277,
"learning_rate": 0.00011700153042217931,
"loss": 2.6677,
"step": 1415
},
{
"epoch": 1.2306525037936267,
"grad_norm": 0.2652083246967948,
"learning_rate": 0.00011589478875352255,
"loss": 2.6543,
"step": 1420
},
{
"epoch": 1.2349880771732062,
"grad_norm": 0.2529645672839692,
"learning_rate": 0.00011479000152432319,
"loss": 2.6205,
"step": 1425
},
{
"epoch": 1.2393236505527856,
"grad_norm": 0.2601375038926653,
"learning_rate": 0.0001136872320457197,
"loss": 2.6102,
"step": 1430
},
{
"epoch": 1.2436592239323652,
"grad_norm": 0.19250161537810334,
"learning_rate": 0.00011258654351322107,
"loss": 2.631,
"step": 1435
},
{
"epoch": 1.2479947973119445,
"grad_norm": 0.35217150816617415,
"learning_rate": 0.00011148799900308509,
"loss": 2.6013,
"step": 1440
},
{
"epoch": 1.2523303706915239,
"grad_norm": 0.8877189979150436,
"learning_rate": 0.00011039166146870383,
"loss": 2.6335,
"step": 1445
},
{
"epoch": 1.2566659440711034,
"grad_norm": 0.6027781279490154,
"learning_rate": 0.00010929759373699613,
"loss": 2.6011,
"step": 1450
},
{
"epoch": 1.2610015174506828,
"grad_norm": 0.29684477637886336,
"learning_rate": 0.00010820585850480696,
"loss": 2.6083,
"step": 1455
},
{
"epoch": 1.2653370908302624,
"grad_norm": 0.22866123103951785,
"learning_rate": 0.00010711651833531463,
"loss": 2.6249,
"step": 1460
},
{
"epoch": 1.2696726642098417,
"grad_norm": 0.2570183068878416,
"learning_rate": 0.00010602963565444577,
"loss": 2.5858,
"step": 1465
},
{
"epoch": 1.2740082375894213,
"grad_norm": 0.31391058369721064,
"learning_rate": 0.00010494527274729748,
"loss": 2.5606,
"step": 1470
},
{
"epoch": 1.2783438109690006,
"grad_norm": 0.3203458432202096,
"learning_rate": 0.00010386349175456825,
"loss": 2.5637,
"step": 1475
},
{
"epoch": 1.28267938434858,
"grad_norm": 0.30834075039079006,
"learning_rate": 0.00010278435466899714,
"loss": 2.6011,
"step": 1480
},
{
"epoch": 1.2870149577281595,
"grad_norm": 0.22399943173892975,
"learning_rate": 0.00010170792333181084,
"loss": 2.5288,
"step": 1485
},
{
"epoch": 1.2913505311077391,
"grad_norm": 0.3319547375893817,
"learning_rate": 0.00010063425942917974,
"loss": 2.5375,
"step": 1490
},
{
"epoch": 1.2956861044873185,
"grad_norm": 0.3315527934802732,
"learning_rate": 9.956342448868354e-05,
"loss": 2.5274,
"step": 1495
},
{
"epoch": 1.3000216778668978,
"grad_norm": 0.3580645974138616,
"learning_rate": 9.849547987578457e-05,
"loss": 2.5585,
"step": 1500
},
{
"epoch": 1.3043572512464774,
"grad_norm": 0.2780635499992022,
"learning_rate": 9.743048679031163e-05,
"loss": 2.5291,
"step": 1505
},
{
"epoch": 1.3086928246260567,
"grad_norm": 0.40375989395133016,
"learning_rate": 9.636850626295282e-05,
"loss": 2.517,
"step": 1510
},
{
"epoch": 1.3130283980056363,
"grad_norm": 0.3527800248976021,
"learning_rate": 9.530959915175796e-05,
"loss": 2.5277,
"step": 1515
},
{
"epoch": 1.3173639713852157,
"grad_norm": 0.23822673694395258,
"learning_rate": 9.425382613865107e-05,
"loss": 2.5014,
"step": 1520
},
{
"epoch": 1.3216995447647952,
"grad_norm": 0.1747138201174964,
"learning_rate": 9.32012477259531e-05,
"loss": 2.4866,
"step": 1525
},
{
"epoch": 1.3260351181443746,
"grad_norm": 0.1954668050935581,
"learning_rate": 9.215192423291463e-05,
"loss": 2.5021,
"step": 1530
},
{
"epoch": 1.330370691523954,
"grad_norm": 0.19057547870092642,
"learning_rate": 9.110591579225906e-05,
"loss": 2.5044,
"step": 1535
},
{
"epoch": 1.3347062649035335,
"grad_norm": 0.2745708957809259,
"learning_rate": 9.006328234673701e-05,
"loss": 2.5073,
"step": 1540
},
{
"epoch": 1.339041838283113,
"grad_norm": 0.20139794408374664,
"learning_rate": 8.90240836456909e-05,
"loss": 2.5033,
"step": 1545
},
{
"epoch": 1.3433774116626924,
"grad_norm": 0.201183788959332,
"learning_rate": 8.798837924163098e-05,
"loss": 2.4782,
"step": 1550
},
{
"epoch": 1.3477129850422718,
"grad_norm": 0.3799911989499351,
"learning_rate": 8.695622848682291e-05,
"loss": 2.4951,
"step": 1555
},
{
"epoch": 1.3520485584218513,
"grad_norm": 0.1990146960863876,
"learning_rate": 8.592769052988607e-05,
"loss": 2.4901,
"step": 1560
},
{
"epoch": 1.3563841318014307,
"grad_norm": 0.34068891992062217,
"learning_rate": 8.490282431240416e-05,
"loss": 2.4522,
"step": 1565
},
{
"epoch": 1.3607197051810103,
"grad_norm": 0.2291955529635031,
"learning_rate": 8.388168856554777e-05,
"loss": 2.4203,
"step": 1570
},
{
"epoch": 1.3650552785605896,
"grad_norm": 0.3248337694954652,
"learning_rate": 8.286434180670822e-05,
"loss": 2.4868,
"step": 1575
},
{
"epoch": 1.3693908519401692,
"grad_norm": 0.2990054213661462,
"learning_rate": 8.185084233614444e-05,
"loss": 2.4363,
"step": 1580
},
{
"epoch": 1.3737264253197485,
"grad_norm": 0.233583205490779,
"learning_rate": 8.084124823364204e-05,
"loss": 2.4807,
"step": 1585
},
{
"epoch": 1.3780619986993279,
"grad_norm": 0.2518607909224173,
"learning_rate": 7.983561735518474e-05,
"loss": 2.4358,
"step": 1590
},
{
"epoch": 1.3823975720789075,
"grad_norm": 0.19984297609600038,
"learning_rate": 7.883400732963913e-05,
"loss": 2.478,
"step": 1595
},
{
"epoch": 1.3867331454584868,
"grad_norm": 0.1648080690690456,
"learning_rate": 7.783647555545217e-05,
"loss": 2.442,
"step": 1600
},
{
"epoch": 1.3910687188380664,
"grad_norm": 0.17610648998979445,
"learning_rate": 7.684307919736158e-05,
"loss": 2.41,
"step": 1605
},
{
"epoch": 1.3954042922176457,
"grad_norm": 0.14818666921811272,
"learning_rate": 7.585387518312028e-05,
"loss": 2.4206,
"step": 1610
},
{
"epoch": 1.3997398655972253,
"grad_norm": 0.2001322045633342,
"learning_rate": 7.486892020023406e-05,
"loss": 2.3821,
"step": 1615
},
{
"epoch": 1.4040754389768046,
"grad_norm": 0.2200377253653553,
"learning_rate": 7.388827069271276e-05,
"loss": 2.4257,
"step": 1620
},
{
"epoch": 1.408411012356384,
"grad_norm": 0.1844230292041018,
"learning_rate": 7.291198285783602e-05,
"loss": 2.4135,
"step": 1625
},
{
"epoch": 1.4127465857359636,
"grad_norm": 0.1929745133150067,
"learning_rate": 7.194011264293254e-05,
"loss": 2.3777,
"step": 1630
},
{
"epoch": 1.4170821591155431,
"grad_norm": 0.2915968829982784,
"learning_rate": 7.097271574217421e-05,
"loss": 2.4181,
"step": 1635
},
{
"epoch": 1.4214177324951225,
"grad_norm": 0.269578671519446,
"learning_rate": 7.000984759338422e-05,
"loss": 2.3788,
"step": 1640
},
{
"epoch": 1.4257533058747018,
"grad_norm": 0.19512593934978045,
"learning_rate": 6.905156337486045e-05,
"loss": 2.391,
"step": 1645
},
{
"epoch": 1.4300888792542814,
"grad_norm": 0.2593023962861574,
"learning_rate": 6.809791800221313e-05,
"loss": 2.3963,
"step": 1650
},
{
"epoch": 1.4344244526338608,
"grad_norm": 0.14901754063689115,
"learning_rate": 6.714896612521794e-05,
"loss": 2.3976,
"step": 1655
},
{
"epoch": 1.4387600260134403,
"grad_norm": 0.302187906502475,
"learning_rate": 6.620476212468424e-05,
"loss": 2.4194,
"step": 1660
},
{
"epoch": 1.4430955993930197,
"grad_norm": 0.24180384264466487,
"learning_rate": 6.526536010933874e-05,
"loss": 2.4295,
"step": 1665
},
{
"epoch": 1.4474311727725993,
"grad_norm": 0.22169225675339646,
"learning_rate": 6.433081391272467e-05,
"loss": 2.3976,
"step": 1670
},
{
"epoch": 1.4517667461521786,
"grad_norm": 0.2799481789977155,
"learning_rate": 6.340117709011693e-05,
"loss": 2.392,
"step": 1675
},
{
"epoch": 1.456102319531758,
"grad_norm": 0.28103864837065845,
"learning_rate": 6.247650291545287e-05,
"loss": 2.3708,
"step": 1680
},
{
"epoch": 1.4604378929113375,
"grad_norm": 0.24593789213337805,
"learning_rate": 6.155684437827931e-05,
"loss": 2.4043,
"step": 1685
},
{
"epoch": 1.464773466290917,
"grad_norm": 0.19968974812237159,
"learning_rate": 6.064225418071632e-05,
"loss": 2.3784,
"step": 1690
},
{
"epoch": 1.4691090396704964,
"grad_norm": 0.2665198920246072,
"learning_rate": 5.9732784734436554e-05,
"loss": 2.387,
"step": 1695
},
{
"epoch": 1.4734446130500758,
"grad_norm": 0.33954513870533093,
"learning_rate": 5.882848815766189e-05,
"loss": 2.3659,
"step": 1700
},
{
"epoch": 1.4777801864296554,
"grad_norm": 0.3174285529770626,
"learning_rate": 5.792941627217707e-05,
"loss": 2.3703,
"step": 1705
},
{
"epoch": 1.4821157598092347,
"grad_norm": 0.21514280209891976,
"learning_rate": 5.703562060035951e-05,
"loss": 2.3311,
"step": 1710
},
{
"epoch": 1.4864513331888143,
"grad_norm": 0.26026170716900454,
"learning_rate": 5.614715236222702e-05,
"loss": 2.3534,
"step": 1715
},
{
"epoch": 1.4907869065683936,
"grad_norm": 0.2072827021307388,
"learning_rate": 5.52640624725026e-05,
"loss": 2.362,
"step": 1720
},
{
"epoch": 1.4951224799479732,
"grad_norm": 0.18636459137372047,
"learning_rate": 5.4386401537696536e-05,
"loss": 2.367,
"step": 1725
},
{
"epoch": 1.4994580533275526,
"grad_norm": 0.2616321440338591,
"learning_rate": 5.3514219853206464e-05,
"loss": 2.3517,
"step": 1730
},
{
"epoch": 1.503793626707132,
"grad_norm": 0.17660365833901004,
"learning_rate": 5.264756740043511e-05,
"loss": 2.3366,
"step": 1735
},
{
"epoch": 1.5081292000867115,
"grad_norm": 0.14325753486785228,
"learning_rate": 5.178649384392603e-05,
"loss": 2.3628,
"step": 1740
},
{
"epoch": 1.512464773466291,
"grad_norm": 0.16306886697377787,
"learning_rate": 5.093104852851749e-05,
"loss": 2.3403,
"step": 1745
},
{
"epoch": 1.5168003468458704,
"grad_norm": 0.17105852766007526,
"learning_rate": 5.008128047651488e-05,
"loss": 2.3193,
"step": 1750
},
{
"epoch": 1.5211359202254497,
"grad_norm": 0.20749830784985143,
"learning_rate": 4.923723838488117e-05,
"loss": 2.3519,
"step": 1755
},
{
"epoch": 1.5254714936050293,
"grad_norm": 0.3157660095405772,
"learning_rate": 4.839897062244638e-05,
"loss": 2.3197,
"step": 1760
},
{
"epoch": 1.5298070669846087,
"grad_norm": 0.18219178823449922,
"learning_rate": 4.756652522713599e-05,
"loss": 2.3279,
"step": 1765
},
{
"epoch": 1.534142640364188,
"grad_norm": 0.12524649727104128,
"learning_rate": 4.673994990321752e-05,
"loss": 2.3019,
"step": 1770
},
{
"epoch": 1.5384782137437676,
"grad_norm": 0.1636956452865002,
"learning_rate": 4.591929201856727e-05,
"loss": 2.2859,
"step": 1775
},
{
"epoch": 1.5428137871233472,
"grad_norm": 0.21967450962811522,
"learning_rate": 4.5104598601955805e-05,
"loss": 2.3095,
"step": 1780
},
{
"epoch": 1.5471493605029265,
"grad_norm": 0.19863296073280773,
"learning_rate": 4.4295916340352625e-05,
"loss": 2.2826,
"step": 1785
},
{
"epoch": 1.5514849338825059,
"grad_norm": 0.15602204927659435,
"learning_rate": 4.349329157625088e-05,
"loss": 2.3522,
"step": 1790
},
{
"epoch": 1.5558205072620854,
"grad_norm": 0.15284396668890557,
"learning_rate": 4.269677030501184e-05,
"loss": 2.3546,
"step": 1795
},
{
"epoch": 1.560156080641665,
"grad_norm": 0.16000621000672735,
"learning_rate": 4.1906398172228704e-05,
"loss": 2.3456,
"step": 1800
},
{
"epoch": 1.5644916540212443,
"grad_norm": 0.5401953444913549,
"learning_rate": 4.112222047111111e-05,
"loss": 2.3475,
"step": 1805
},
{
"epoch": 1.5688272274008237,
"grad_norm": 0.18490516183802747,
"learning_rate": 4.034428213988946e-05,
"loss": 2.3064,
"step": 1810
},
{
"epoch": 1.5731628007804033,
"grad_norm": 0.2648705938773556,
"learning_rate": 3.957262775923969e-05,
"loss": 2.3087,
"step": 1815
},
{
"epoch": 1.5774983741599826,
"grad_norm": 0.20317358607574193,
"learning_rate": 3.8807301549728435e-05,
"loss": 2.292,
"step": 1820
},
{
"epoch": 1.581833947539562,
"grad_norm": 0.18781915869958946,
"learning_rate": 3.804834736927918e-05,
"loss": 2.3321,
"step": 1825
},
{
"epoch": 1.5861695209191415,
"grad_norm": 0.22963664052911378,
"learning_rate": 3.7295808710658594e-05,
"loss": 2.3105,
"step": 1830
},
{
"epoch": 1.5905050942987211,
"grad_norm": 0.16547609467569466,
"learning_rate": 3.654972869898435e-05,
"loss": 2.3441,
"step": 1835
},
{
"epoch": 1.5948406676783005,
"grad_norm": 0.15450897850919312,
"learning_rate": 3.581015008925367e-05,
"loss": 2.2963,
"step": 1840
},
{
"epoch": 1.5991762410578798,
"grad_norm": 0.2368783078996937,
"learning_rate": 3.507711526389331e-05,
"loss": 2.2701,
"step": 1845
},
{
"epoch": 1.6035118144374594,
"grad_norm": 0.23275669114568637,
"learning_rate": 3.4350666230330684e-05,
"loss": 2.3027,
"step": 1850
},
{
"epoch": 1.607847387817039,
"grad_norm": 0.23123161162094558,
"learning_rate": 3.363084461858659e-05,
"loss": 2.3271,
"step": 1855
},
{
"epoch": 1.6121829611966183,
"grad_norm": 0.19114205349654592,
"learning_rate": 3.291769167888971e-05,
"loss": 2.3085,
"step": 1860
},
{
"epoch": 1.6165185345761977,
"grad_norm": 0.19601739480919383,
"learning_rate": 3.221124827931248e-05,
"loss": 2.297,
"step": 1865
},
{
"epoch": 1.6208541079557772,
"grad_norm": 0.14221216745928258,
"learning_rate": 3.151155490342917e-05,
"loss": 2.2855,
"step": 1870
},
{
"epoch": 1.6251896813353566,
"grad_norm": 0.1584865862659709,
"learning_rate": 3.081865164799613e-05,
"loss": 2.2614,
"step": 1875
},
{
"epoch": 1.629525254714936,
"grad_norm": 0.1414839158508025,
"learning_rate": 3.0132578220653648e-05,
"loss": 2.2795,
"step": 1880
},
{
"epoch": 1.6338608280945155,
"grad_norm": 0.1254646077472122,
"learning_rate": 2.9453373937650664e-05,
"loss": 2.2965,
"step": 1885
},
{
"epoch": 1.638196401474095,
"grad_norm": 0.16882648715438078,
"learning_rate": 2.8781077721591828e-05,
"loss": 2.3278,
"step": 1890
},
{
"epoch": 1.6425319748536744,
"grad_norm": 0.18395517918844081,
"learning_rate": 2.811572809920669e-05,
"loss": 2.2801,
"step": 1895
},
{
"epoch": 1.6468675482332538,
"grad_norm": 0.1309521536420322,
"learning_rate": 2.7457363199142062e-05,
"loss": 2.2852,
"step": 1900
},
{
"epoch": 1.6512031216128333,
"grad_norm": 0.12473429715301326,
"learning_rate": 2.680602074977708e-05,
"loss": 2.259,
"step": 1905
},
{
"epoch": 1.655538694992413,
"grad_norm": 0.1312047736460762,
"learning_rate": 2.6161738077060924e-05,
"loss": 2.2868,
"step": 1910
},
{
"epoch": 1.659874268371992,
"grad_norm": 0.14567444941618224,
"learning_rate": 2.552455210237398e-05,
"loss": 2.2633,
"step": 1915
},
{
"epoch": 1.6642098417515716,
"grad_norm": 0.13883443962415717,
"learning_rate": 2.4894499340411968e-05,
"loss": 2.2541,
"step": 1920
},
{
"epoch": 1.6685454151311512,
"grad_norm": 0.15369994803840648,
"learning_rate": 2.427161589709337e-05,
"loss": 2.2996,
"step": 1925
},
{
"epoch": 1.6728809885107305,
"grad_norm": 0.21447874396163935,
"learning_rate": 2.365593746749041e-05,
"loss": 2.2679,
"step": 1930
},
{
"epoch": 1.6772165618903099,
"grad_norm": 0.18513858979140632,
"learning_rate": 2.3047499333783558e-05,
"loss": 2.2658,
"step": 1935
},
{
"epoch": 1.6815521352698894,
"grad_norm": 0.18391090938011884,
"learning_rate": 2.244633636323946e-05,
"loss": 2.2907,
"step": 1940
},
{
"epoch": 1.685887708649469,
"grad_norm": 0.16891631837991977,
"learning_rate": 2.1852483006212978e-05,
"loss": 2.2478,
"step": 1945
},
{
"epoch": 1.6902232820290484,
"grad_norm": 0.1518091020243819,
"learning_rate": 2.126597329417293e-05,
"loss": 2.2473,
"step": 1950
},
{
"epoch": 1.6945588554086277,
"grad_norm": 0.17161647974253239,
"learning_rate": 2.068684083775185e-05,
"loss": 2.2537,
"step": 1955
},
{
"epoch": 1.6988944287882073,
"grad_norm": 0.14187218785526223,
"learning_rate": 2.0115118824819914e-05,
"loss": 2.2616,
"step": 1960
},
{
"epoch": 1.7032300021677866,
"grad_norm": 0.11929928350263867,
"learning_rate": 1.9550840018583153e-05,
"loss": 2.2694,
"step": 1965
},
{
"epoch": 1.707565575547366,
"grad_norm": 0.1245533165549466,
"learning_rate": 1.899403675570576e-05,
"loss": 2.2595,
"step": 1970
},
{
"epoch": 1.7119011489269456,
"grad_norm": 0.11773193996812116,
"learning_rate": 1.844474094445705e-05,
"loss": 2.2604,
"step": 1975
},
{
"epoch": 1.7162367223065251,
"grad_norm": 0.1419568607634185,
"learning_rate": 1.7902984062883053e-05,
"loss": 2.2311,
"step": 1980
},
{
"epoch": 1.7205722956861045,
"grad_norm": 0.16983875746382313,
"learning_rate": 1.736879715700243e-05,
"loss": 2.2403,
"step": 1985
},
{
"epoch": 1.7249078690656838,
"grad_norm": 0.12131945112331936,
"learning_rate": 1.684221083902746e-05,
"loss": 2.2474,
"step": 1990
},
{
"epoch": 1.7292434424452634,
"grad_norm": 0.44071395908424116,
"learning_rate": 1.6323255285609722e-05,
"loss": 2.2337,
"step": 1995
},
{
"epoch": 1.733579015824843,
"grad_norm": 0.23334817695713936,
"learning_rate": 1.5811960236110855e-05,
"loss": 2.2489,
"step": 2000
},
{
"epoch": 1.7379145892044223,
"grad_norm": 0.2111669535853138,
"learning_rate": 1.530835499089821e-05,
"loss": 2.2269,
"step": 2005
},
{
"epoch": 1.7422501625840017,
"grad_norm": 0.18179903995116767,
"learning_rate": 1.4812468409665884e-05,
"loss": 2.2706,
"step": 2010
},
{
"epoch": 1.7465857359635812,
"grad_norm": 0.12994276828633272,
"learning_rate": 1.432432890978074e-05,
"loss": 2.2688,
"step": 2015
},
{
"epoch": 1.7509213093431606,
"grad_norm": 0.20415942563901335,
"learning_rate": 1.3843964464654018e-05,
"loss": 2.2725,
"step": 2020
},
{
"epoch": 1.75525688272274,
"grad_norm": 0.13423972154083932,
"learning_rate": 1.3371402602138242e-05,
"loss": 2.2614,
"step": 2025
},
{
"epoch": 1.7595924561023195,
"grad_norm": 0.21592839787669263,
"learning_rate": 1.2906670402949703e-05,
"loss": 2.2278,
"step": 2030
},
{
"epoch": 1.763928029481899,
"grad_norm": 0.1195430112602107,
"learning_rate": 1.2449794499116567e-05,
"loss": 2.2434,
"step": 2035
},
{
"epoch": 1.7682636028614784,
"grad_norm": 0.1200861082736604,
"learning_rate": 1.200080107245278e-05,
"loss": 2.2547,
"step": 2040
},
{
"epoch": 1.7725991762410578,
"grad_norm": 0.11672129617620956,
"learning_rate": 1.1559715853057516e-05,
"loss": 2.2196,
"step": 2045
},
{
"epoch": 1.7769347496206374,
"grad_norm": 0.11729537916907379,
"learning_rate": 1.1126564117840819e-05,
"loss": 2.2613,
"step": 2050
},
{
"epoch": 1.781270323000217,
"grad_norm": 0.10450327625927365,
"learning_rate": 1.0701370689075094e-05,
"loss": 2.244,
"step": 2055
},
{
"epoch": 1.7856058963797963,
"grad_norm": 0.13649256070362986,
"learning_rate": 1.0284159932972524e-05,
"loss": 2.2222,
"step": 2060
},
{
"epoch": 1.7899414697593756,
"grad_norm": 0.11456259112458032,
"learning_rate": 9.87495575828875e-06,
"loss": 2.2401,
"step": 2065
},
{
"epoch": 1.7942770431389552,
"grad_norm": 0.12074570004540981,
"learning_rate": 9.473781614952918e-06,
"loss": 2.2401,
"step": 2070
},
{
"epoch": 1.7986126165185345,
"grad_norm": 0.12686179535370362,
"learning_rate": 9.080660492723663e-06,
"loss": 2.2295,
"step": 2075
},
{
"epoch": 1.802948189898114,
"grad_norm": 0.1300450179470363,
"learning_rate": 8.695614919871679e-06,
"loss": 2.2569,
"step": 2080
},
{
"epoch": 1.8072837632776935,
"grad_norm": 0.11937907719387332,
"learning_rate": 8.31866696188887e-06,
"loss": 2.2294,
"step": 2085
},
{
"epoch": 1.811619336657273,
"grad_norm": 0.11764431526486717,
"learning_rate": 7.949838220223664e-06,
"loss": 2.217,
"step": 2090
},
{
"epoch": 1.8159549100368524,
"grad_norm": 0.1234907525541496,
"learning_rate": 7.589149831043212e-06,
"loss": 2.217,
"step": 2095
},
{
"epoch": 1.8202904834164317,
"grad_norm": 0.1517422441513512,
"learning_rate": 7.236622464022151e-06,
"loss": 2.2453,
"step": 2100
},
{
"epoch": 1.8246260567960113,
"grad_norm": 0.33341778139897915,
"learning_rate": 6.892276321158058e-06,
"loss": 2.2356,
"step": 2105
},
{
"epoch": 1.8289616301755909,
"grad_norm": 0.11642132397770094,
"learning_rate": 6.556131135613818e-06,
"loss": 2.2423,
"step": 2110
},
{
"epoch": 1.83329720355517,
"grad_norm": 0.1407587899536127,
"learning_rate": 6.2282061705868025e-06,
"loss": 2.203,
"step": 2115
},
{
"epoch": 1.8376327769347496,
"grad_norm": 0.13035967664280043,
"learning_rate": 5.908520218204832e-06,
"loss": 2.1993,
"step": 2120
},
{
"epoch": 1.8419683503143292,
"grad_norm": 0.09453247400951749,
"learning_rate": 5.597091598449438e-06,
"loss": 2.228,
"step": 2125
},
{
"epoch": 1.8463039236939085,
"grad_norm": 0.12255843625223094,
"learning_rate": 5.293938158105904e-06,
"loss": 2.2373,
"step": 2130
},
{
"epoch": 1.8506394970734878,
"grad_norm": 0.10290208504992138,
"learning_rate": 4.999077269740581e-06,
"loss": 2.1896,
"step": 2135
},
{
"epoch": 1.8549750704530674,
"grad_norm": 0.10229541966122209,
"learning_rate": 4.712525830705338e-06,
"loss": 2.2202,
"step": 2140
},
{
"epoch": 1.859310643832647,
"grad_norm": 0.1058407353264288,
"learning_rate": 4.4343002621692155e-06,
"loss": 2.2105,
"step": 2145
},
{
"epoch": 1.8636462172122263,
"grad_norm": 0.12091080320995722,
"learning_rate": 4.164416508177398e-06,
"loss": 2.2192,
"step": 2150
},
{
"epoch": 1.8679817905918057,
"grad_norm": 0.11922564089206397,
"learning_rate": 3.902890034737527e-06,
"loss": 2.2558,
"step": 2155
},
{
"epoch": 1.8723173639713853,
"grad_norm": 0.09787488554546948,
"learning_rate": 3.649735828933409e-06,
"loss": 2.1973,
"step": 2160
},
{
"epoch": 1.8766529373509646,
"grad_norm": 0.12237194203335397,
"learning_rate": 3.4049683980661214e-06,
"loss": 2.2213,
"step": 2165
},
{
"epoch": 1.880988510730544,
"grad_norm": 0.09606403673967055,
"learning_rate": 3.168601768822726e-06,
"loss": 2.1992,
"step": 2170
},
{
"epoch": 1.8853240841101235,
"grad_norm": 0.09953286517322664,
"learning_rate": 2.940649486472396e-06,
"loss": 2.2528,
"step": 2175
},
{
"epoch": 1.889659657489703,
"grad_norm": 0.10835068342395951,
"learning_rate": 2.72112461409022e-06,
"loss": 2.2531,
"step": 2180
},
{
"epoch": 1.8939952308692825,
"grad_norm": 0.10266855095126044,
"learning_rate": 2.510039731808533e-06,
"loss": 2.269,
"step": 2185
},
{
"epoch": 1.8983308042488618,
"grad_norm": 0.10341900472056524,
"learning_rate": 2.3074069360961623e-06,
"loss": 2.2062,
"step": 2190
},
{
"epoch": 1.9026663776284414,
"grad_norm": 0.10842551877471102,
"learning_rate": 2.1132378390650463e-06,
"loss": 2.2534,
"step": 2195
},
{
"epoch": 1.907001951008021,
"grad_norm": 0.10059763675561238,
"learning_rate": 1.9275435678048845e-06,
"loss": 2.2473,
"step": 2200
},
{
"epoch": 1.9113375243876003,
"grad_norm": 0.10768116633333848,
"learning_rate": 1.7503347637454479e-06,
"loss": 2.2552,
"step": 2205
},
{
"epoch": 1.9156730977671796,
"grad_norm": 0.09414809031667026,
"learning_rate": 1.5816215820467992e-06,
"loss": 2.2367,
"step": 2210
},
{
"epoch": 1.9200086711467592,
"grad_norm": 0.10690488271519041,
"learning_rate": 1.4214136910172925e-06,
"loss": 2.2253,
"step": 2215
},
{
"epoch": 1.9243442445263386,
"grad_norm": 0.10872884844513397,
"learning_rate": 1.2697202715595822e-06,
"loss": 2.2289,
"step": 2220
},
{
"epoch": 1.928679817905918,
"grad_norm": 0.1254620474939429,
"learning_rate": 1.126550016644412e-06,
"loss": 2.2164,
"step": 2225
},
{
"epoch": 1.9330153912854975,
"grad_norm": 0.1025587696285709,
"learning_rate": 9.919111308125449e-07,
"loss": 2.2039,
"step": 2230
},
{
"epoch": 1.937350964665077,
"grad_norm": 0.09872117993630683,
"learning_rate": 8.65811329704541e-07,
"loss": 2.2492,
"step": 2235
},
{
"epoch": 1.9416865380446564,
"grad_norm": 0.1085639607949179,
"learning_rate": 7.482578396185934e-07,
"loss": 2.2449,
"step": 2240
},
{
"epoch": 1.9460221114242358,
"grad_norm": 0.09973275488637519,
"learning_rate": 6.392573970964432e-07,
"loss": 2.2074,
"step": 2245
},
{
"epoch": 1.9503576848038153,
"grad_norm": 0.11503330375288269,
"learning_rate": 5.388162485373548e-07,
"loss": 2.2473,
"step": 2250
},
{
"epoch": 1.954693258183395,
"grad_norm": 0.11928896227233268,
"learning_rate": 4.4694014984010264e-07,
"loss": 2.2128,
"step": 2255
},
{
"epoch": 1.9590288315629742,
"grad_norm": 0.08668925207942389,
"learning_rate": 3.6363436607313446e-07,
"loss": 2.2183,
"step": 2260
},
{
"epoch": 1.9633644049425536,
"grad_norm": 0.0953840596497321,
"learning_rate": 2.889036711729298e-07,
"loss": 2.2397,
"step": 2265
},
{
"epoch": 1.9676999783221332,
"grad_norm": 0.10418411546023475,
"learning_rate": 2.2275234767030193e-07,
"loss": 2.2146,
"step": 2270
},
{
"epoch": 1.9720355517017125,
"grad_norm": 0.09712806974011044,
"learning_rate": 1.6518418644507758e-07,
"loss": 2.2166,
"step": 2275
},
{
"epoch": 1.9763711250812919,
"grad_norm": 0.12331447239619826,
"learning_rate": 1.1620248650878739e-07,
"loss": 2.2371,
"step": 2280
},
{
"epoch": 1.9807066984608714,
"grad_norm": 0.09998286807181965,
"learning_rate": 7.581005481566704e-08,
"loss": 2.2271,
"step": 2285
},
{
"epoch": 1.985042271840451,
"grad_norm": 0.09866409081999304,
"learning_rate": 4.4009206101786043e-08,
"loss": 2.2148,
"step": 2290
},
{
"epoch": 1.9893778452200304,
"grad_norm": 0.10987887336557695,
"learning_rate": 2.0801762752387097e-08,
"loss": 2.2046,
"step": 2295
},
{
"epoch": 1.9937134185996097,
"grad_norm": 0.10003495023127681,
"learning_rate": 6.189054697436357e-09,
"loss": 2.1954,
"step": 2300
},
{
"epoch": 1.9980489919791893,
"grad_norm": 0.09752064885234579,
"learning_rate": 1.7191933545102067e-10,
"loss": 2.2409,
"step": 2305
},
{
"epoch": 1.998916106655105,
"step": 2306,
"total_flos": 1.542232840692197e+19,
"train_loss": 3.6333630210094006,
"train_runtime": 27233.3224,
"train_samples_per_second": 2.71,
"train_steps_per_second": 0.085
}
],
"logging_steps": 5,
"max_steps": 2306,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.542232840692197e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}