{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.998916106655105, "eval_steps": 500, "global_step": 2306, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008671146759158899, "grad_norm": 5.864805612767453e+19, "learning_rate": 1.2987012987012986e-06, "loss": 1.5218, "step": 1 }, { "epoch": 0.004335573379579449, "grad_norm": 0.5226513429859231, "learning_rate": 6.493506493506493e-06, "loss": 1.36, "step": 5 }, { "epoch": 0.008671146759158898, "grad_norm": 0.7095719390962212, "learning_rate": 1.2987012987012986e-05, "loss": 1.342, "step": 10 }, { "epoch": 0.013006720138738348, "grad_norm": 0.429950690175044, "learning_rate": 1.9480519480519476e-05, "loss": 1.3738, "step": 15 }, { "epoch": 0.017342293518317797, "grad_norm": 0.25369452359038985, "learning_rate": 2.5974025974025972e-05, "loss": 1.2215, "step": 20 }, { "epoch": 0.021677866897897247, "grad_norm": 14.366536861443981, "learning_rate": 3.246753246753247e-05, "loss": 1.2357, "step": 25 }, { "epoch": 0.026013440277476697, "grad_norm": 0.2687159676086448, "learning_rate": 3.896103896103895e-05, "loss": 1.1712, "step": 30 }, { "epoch": 0.030349013657056147, "grad_norm": 0.15608265987863726, "learning_rate": 4.545454545454545e-05, "loss": 1.1152, "step": 35 }, { "epoch": 0.03468458703663559, "grad_norm": 0.11635461961055855, "learning_rate": 5.1948051948051944e-05, "loss": 1.1033, "step": 40 }, { "epoch": 0.03902016041621505, "grad_norm": 0.11449246594254398, "learning_rate": 5.8441558441558436e-05, "loss": 1.0896, "step": 45 }, { "epoch": 0.04335573379579449, "grad_norm": 0.11032771125077512, "learning_rate": 6.493506493506494e-05, "loss": 1.0795, "step": 50 }, { "epoch": 0.04769130717537394, "grad_norm": 0.10732618594326016, "learning_rate": 7.142857142857142e-05, "loss": 1.1107, "step": 55 }, { "epoch": 0.05202688055495339, "grad_norm": 0.11532043672084052, "learning_rate": 7.79220779220779e-05, "loss": 1.0941, "step": 60 }, { "epoch": 0.05636245393453284, "grad_norm": 0.10484657398976702, "learning_rate": 8.441558441558442e-05, "loss": 1.0636, "step": 65 }, { "epoch": 0.06069802731411229, "grad_norm": 0.11736262481626532, "learning_rate": 9.09090909090909e-05, "loss": 1.0939, "step": 70 }, { "epoch": 0.06503360069369174, "grad_norm": 0.14275729093576736, "learning_rate": 9.740259740259739e-05, "loss": 1.0876, "step": 75 }, { "epoch": 0.06936917407327119, "grad_norm": 0.15749002462194958, "learning_rate": 0.00010389610389610389, "loss": 1.073, "step": 80 }, { "epoch": 0.07370474745285063, "grad_norm": 0.19946818080973713, "learning_rate": 0.00011038961038961037, "loss": 1.0935, "step": 85 }, { "epoch": 0.0780403208324301, "grad_norm": 0.5992707412202727, "learning_rate": 0.00011688311688311687, "loss": 1.1826, "step": 90 }, { "epoch": 0.08237589421200954, "grad_norm": 0.19153297117483123, "learning_rate": 0.00012337662337662337, "loss": 1.1079, "step": 95 }, { "epoch": 0.08671146759158899, "grad_norm": 0.23132363172618897, "learning_rate": 0.00012987012987012987, "loss": 1.1509, "step": 100 }, { "epoch": 0.09104704097116843, "grad_norm": 13.74435400048776, "learning_rate": 0.00013636363636363634, "loss": 1.271, "step": 105 }, { "epoch": 0.09538261435074788, "grad_norm": 0.5226174167068346, "learning_rate": 0.00014285714285714284, "loss": 1.1902, "step": 110 }, { "epoch": 0.09971818773032734, "grad_norm": 0.3629740083051462, "learning_rate": 0.00014935064935064934, "loss": 1.1643, "step": 115 }, { "epoch": 0.10405376110990679, "grad_norm": 0.22433655763478755, "learning_rate": 0.0001558441558441558, "loss": 1.155, "step": 120 }, { "epoch": 0.10838933448948623, "grad_norm": 0.1335411736809728, "learning_rate": 0.0001623376623376623, "loss": 1.1243, "step": 125 }, { "epoch": 0.11272490786906568, "grad_norm": 0.20752640927016786, "learning_rate": 0.00016883116883116884, "loss": 1.1104, "step": 130 }, { "epoch": 0.11706048124864514, "grad_norm": 0.12168629703712475, "learning_rate": 0.0001753246753246753, "loss": 1.114, "step": 135 }, { "epoch": 0.12139605462822459, "grad_norm": 0.26232391688421475, "learning_rate": 0.0001818181818181818, "loss": 1.1204, "step": 140 }, { "epoch": 0.12573162800780402, "grad_norm": 0.16457276786988467, "learning_rate": 0.00018831168831168828, "loss": 1.1442, "step": 145 }, { "epoch": 0.13006720138738348, "grad_norm": 0.15538288562486247, "learning_rate": 0.00019480519480519478, "loss": 1.1111, "step": 150 }, { "epoch": 0.13440277476696294, "grad_norm": 0.201009562048921, "learning_rate": 0.0002012987012987013, "loss": 1.1284, "step": 155 }, { "epoch": 0.13873834814654237, "grad_norm": 0.10830435883952598, "learning_rate": 0.00020779220779220778, "loss": 1.13, "step": 160 }, { "epoch": 0.14307392152612183, "grad_norm": 0.11463870554451829, "learning_rate": 0.00021428571428571427, "loss": 1.1146, "step": 165 }, { "epoch": 0.14740949490570127, "grad_norm": 0.12316133988511195, "learning_rate": 0.00022077922077922075, "loss": 1.1238, "step": 170 }, { "epoch": 0.15174506828528073, "grad_norm": 0.19826475153806197, "learning_rate": 0.00022727272727272725, "loss": 1.2731, "step": 175 }, { "epoch": 0.1560806416648602, "grad_norm": 0.928027899524197, "learning_rate": 0.00023376623376623374, "loss": 1.1896, "step": 180 }, { "epoch": 0.16041621504443962, "grad_norm": 67.14385763593648, "learning_rate": 0.00024025974025974024, "loss": 3.7592, "step": 185 }, { "epoch": 0.16475178842401908, "grad_norm": 172.25626988972317, "learning_rate": 0.00024675324675324674, "loss": 6.1514, "step": 190 }, { "epoch": 0.1690873618035985, "grad_norm": 8.384860694083399, "learning_rate": 0.0002532467532467532, "loss": 8.2367, "step": 195 }, { "epoch": 0.17342293518317797, "grad_norm": 15.817512331978708, "learning_rate": 0.00025974025974025974, "loss": 10.8846, "step": 200 }, { "epoch": 0.17775850856275743, "grad_norm": 3.4669948773987103, "learning_rate": 0.0002662337662337662, "loss": 9.8149, "step": 205 }, { "epoch": 0.18209408194233687, "grad_norm": 6.211867615520024, "learning_rate": 0.0002727272727272727, "loss": 10.4091, "step": 210 }, { "epoch": 0.18642965532191633, "grad_norm": 8.673657491918139, "learning_rate": 0.0002792207792207792, "loss": 10.3271, "step": 215 }, { "epoch": 0.19076522870149576, "grad_norm": 2.503560404486804, "learning_rate": 0.0002857142857142857, "loss": 12.6414, "step": 220 }, { "epoch": 0.19510080208107522, "grad_norm": 3.8638747380263703, "learning_rate": 0.00029220779220779215, "loss": 8.4926, "step": 225 }, { "epoch": 0.19943637546065468, "grad_norm": 1.0101210010336394, "learning_rate": 0.0002987012987012987, "loss": 7.6948, "step": 230 }, { "epoch": 0.2037719488402341, "grad_norm": 0.8467011565643021, "learning_rate": 0.0002999972492985145, "loss": 7.5188, "step": 235 }, { "epoch": 0.20810752221981357, "grad_norm": 0.5245296390809255, "learning_rate": 0.0002999860747466326, "loss": 7.4533, "step": 240 }, { "epoch": 0.212443095599393, "grad_norm": 0.42147554363918227, "learning_rate": 0.0002999663050653897, "loss": 7.4204, "step": 245 }, { "epoch": 0.21677866897897247, "grad_norm": 0.38921879090695144, "learning_rate": 0.00029993794138771085, "loss": 7.4012, "step": 250 }, { "epoch": 0.22111424235855193, "grad_norm": 0.2418087757057217, "learning_rate": 0.0002999009853390101, "loss": 7.3999, "step": 255 }, { "epoch": 0.22544981573813136, "grad_norm": 0.35618630058079676, "learning_rate": 0.0002998554390370975, "loss": 7.3883, "step": 260 }, { "epoch": 0.22978538911771082, "grad_norm": 0.18510696993654735, "learning_rate": 0.0002998013050920577, "loss": 7.3686, "step": 265 }, { "epoch": 0.23412096249729028, "grad_norm": 0.21788844504555988, "learning_rate": 0.0002997385866061005, "loss": 7.3719, "step": 270 }, { "epoch": 0.2384565358768697, "grad_norm": 0.2255775965595111, "learning_rate": 0.00029966728717338294, "loss": 7.3634, "step": 275 }, { "epoch": 0.24279210925644917, "grad_norm": 0.21067089388376176, "learning_rate": 0.0002995874108798032, "loss": 7.3456, "step": 280 }, { "epoch": 0.2471276826360286, "grad_norm": 0.1534328867763096, "learning_rate": 0.00029949896230276675, "loss": 7.3761, "step": 285 }, { "epoch": 0.25146325601560804, "grad_norm": 0.12732733473622673, "learning_rate": 0.000299401946510924, "loss": 7.3546, "step": 290 }, { "epoch": 0.2557988293951875, "grad_norm": 0.13550982122013763, "learning_rate": 0.0002992963690638794, "loss": 7.3462, "step": 295 }, { "epoch": 0.26013440277476696, "grad_norm": 0.17601177463870957, "learning_rate": 0.0002991822360118736, "loss": 7.3682, "step": 300 }, { "epoch": 0.2644699761543464, "grad_norm": 0.14421180444911577, "learning_rate": 0.00029905955389543604, "loss": 7.3557, "step": 305 }, { "epoch": 0.2688055495339259, "grad_norm": 0.185975203761939, "learning_rate": 0.00029892832974501044, "loss": 7.356, "step": 310 }, { "epoch": 0.2731411229135053, "grad_norm": 0.14867246915893478, "learning_rate": 0.00029878857108055185, "loss": 7.3347, "step": 315 }, { "epoch": 0.27747669629308475, "grad_norm": 0.1334709517444074, "learning_rate": 0.00029864028591109593, "loss": 7.375, "step": 320 }, { "epoch": 0.2818122696726642, "grad_norm": 0.1642667927001403, "learning_rate": 0.00029848348273429947, "loss": 7.3474, "step": 325 }, { "epoch": 0.28614784305224367, "grad_norm": 0.17688345956299092, "learning_rate": 0.0002983181705359541, "loss": 7.3567, "step": 330 }, { "epoch": 0.2904834164318231, "grad_norm": 0.16088931478744917, "learning_rate": 0.00029814435878947076, "loss": 7.3632, "step": 335 }, { "epoch": 0.29481898981140253, "grad_norm": 0.19798325540549802, "learning_rate": 0.000297962057455337, "loss": 7.3831, "step": 340 }, { "epoch": 0.299154563190982, "grad_norm": 0.1608484618987419, "learning_rate": 0.0002977712769805465, "loss": 7.3528, "step": 345 }, { "epoch": 0.30349013657056145, "grad_norm": 0.1484369817347631, "learning_rate": 0.00029757202829799986, "loss": 7.3502, "step": 350 }, { "epoch": 0.3078257099501409, "grad_norm": 0.22145679449709124, "learning_rate": 0.0002973643228258784, "loss": 7.3133, "step": 355 }, { "epoch": 0.3121612833297204, "grad_norm": 0.2122717500901206, "learning_rate": 0.0002971481724669898, "loss": 7.3684, "step": 360 }, { "epoch": 0.3164968567092998, "grad_norm": 0.14900310266228523, "learning_rate": 0.0002969235896080861, "loss": 7.3474, "step": 365 }, { "epoch": 0.32083243008887924, "grad_norm": 0.17278164384767145, "learning_rate": 0.0002966905871191534, "loss": 7.3683, "step": 370 }, { "epoch": 0.3251680034684587, "grad_norm": 0.20274199845375984, "learning_rate": 0.0002964491783526749, "loss": 7.3476, "step": 375 }, { "epoch": 0.32950357684803816, "grad_norm": 0.13540893972237644, "learning_rate": 0.00029619937714286547, "loss": 7.3424, "step": 380 }, { "epoch": 0.3338391502276176, "grad_norm": 0.16792713197591935, "learning_rate": 0.0002959411978048787, "loss": 7.3629, "step": 385 }, { "epoch": 0.338174723607197, "grad_norm": 0.16598160402305692, "learning_rate": 0.00029567465513398694, "loss": 7.3435, "step": 390 }, { "epoch": 0.3425102969867765, "grad_norm": 0.15743754774094762, "learning_rate": 0.00029539976440473304, "loss": 7.3405, "step": 395 }, { "epoch": 0.34684587036635595, "grad_norm": 0.1928431563375656, "learning_rate": 0.00029511654137005534, "loss": 7.3398, "step": 400 }, { "epoch": 0.3511814437459354, "grad_norm": 0.4234041342810784, "learning_rate": 0.00029482500226038467, "loss": 7.3163, "step": 405 }, { "epoch": 0.35551701712551487, "grad_norm": 1.6964467666559244, "learning_rate": 0.00029452516378271446, "loss": 7.4424, "step": 410 }, { "epoch": 0.35985259050509427, "grad_norm": 1.1952504992783122, "learning_rate": 0.00029421704311964316, "loss": 7.3051, "step": 415 }, { "epoch": 0.36418816388467373, "grad_norm": 0.9910699747566197, "learning_rate": 0.0002939006579283898, "loss": 7.1588, "step": 420 }, { "epoch": 0.3685237372642532, "grad_norm": 0.5010649163848268, "learning_rate": 0.00029357602633978185, "loss": 7.0579, "step": 425 }, { "epoch": 0.37285931064383265, "grad_norm": 0.3223794058961508, "learning_rate": 0.0002932431669572163, "loss": 6.9952, "step": 430 }, { "epoch": 0.3771948840234121, "grad_norm": 0.39385777708948017, "learning_rate": 0.00029290209885559363, "loss": 6.9317, "step": 435 }, { "epoch": 0.3815304574029915, "grad_norm": 0.6906673045815623, "learning_rate": 0.00029255284158022474, "loss": 6.9197, "step": 440 }, { "epoch": 0.385866030782571, "grad_norm": 0.5699230517220503, "learning_rate": 0.00029219541514571075, "loss": 6.9122, "step": 445 }, { "epoch": 0.39020160416215044, "grad_norm": 0.3399949766360826, "learning_rate": 0.00029182984003479613, "loss": 6.8496, "step": 450 }, { "epoch": 0.3945371775417299, "grad_norm": 0.46915620454457757, "learning_rate": 0.00029145613719719484, "loss": 6.8021, "step": 455 }, { "epoch": 0.39887275092130936, "grad_norm": 0.7795294645969753, "learning_rate": 0.0002910743280483899, "loss": 6.7266, "step": 460 }, { "epoch": 0.40320832430088877, "grad_norm": 1.1660550286252116, "learning_rate": 0.00029068443446840606, "loss": 6.8039, "step": 465 }, { "epoch": 0.4075438976804682, "grad_norm": 1.9954850065496847, "learning_rate": 0.0002902864788005559, "loss": 6.7036, "step": 470 }, { "epoch": 0.4118794710600477, "grad_norm": 1.3144540313281778, "learning_rate": 0.00028988048385015955, "loss": 6.6625, "step": 475 }, { "epoch": 0.41621504443962715, "grad_norm": 0.5550135068925496, "learning_rate": 0.00028946647288323766, "loss": 6.5448, "step": 480 }, { "epoch": 0.4205506178192066, "grad_norm": 0.30284368817332974, "learning_rate": 0.0002890444696251783, "loss": 6.4523, "step": 485 }, { "epoch": 0.424886191198786, "grad_norm": 0.5328756605828856, "learning_rate": 0.0002886144982593771, "loss": 6.3727, "step": 490 }, { "epoch": 0.4292217645783655, "grad_norm": 1.2843902165543528, "learning_rate": 0.0002881765834258516, "loss": 6.471, "step": 495 }, { "epoch": 0.43355733795794493, "grad_norm": 0.7139034044101961, "learning_rate": 0.00028773075021982917, "loss": 6.3271, "step": 500 }, { "epoch": 0.4378929113375244, "grad_norm": 0.41272252446113744, "learning_rate": 0.00028727702419030883, "loss": 6.2754, "step": 505 }, { "epoch": 0.44222848471710385, "grad_norm": 0.24680608205710353, "learning_rate": 0.00028681543133859716, "loss": 6.1946, "step": 510 }, { "epoch": 0.4465640580966833, "grad_norm": 0.9592076263299766, "learning_rate": 0.0002863459981168184, "loss": 6.1744, "step": 515 }, { "epoch": 0.4508996314762627, "grad_norm": 2.2348898675903532, "learning_rate": 0.0002858687514263983, "loss": 6.112, "step": 520 }, { "epoch": 0.4552352048558422, "grad_norm": 0.5214216239256872, "learning_rate": 0.00028538371861652284, "loss": 6.1034, "step": 525 }, { "epoch": 0.45957077823542164, "grad_norm": 0.6212645654417902, "learning_rate": 0.00028489092748257066, "loss": 6.0164, "step": 530 }, { "epoch": 0.4639063516150011, "grad_norm": 0.7124953084880589, "learning_rate": 0.0002843904062645204, "loss": 5.986, "step": 535 }, { "epoch": 0.46824192499458056, "grad_norm": 0.3376451437228419, "learning_rate": 0.0002838821836453323, "loss": 5.9095, "step": 540 }, { "epoch": 0.47257749837415997, "grad_norm": 2.464753993204931, "learning_rate": 0.0002833662887493045, "loss": 5.9207, "step": 545 }, { "epoch": 0.4769130717537394, "grad_norm": 0.8049577940385247, "learning_rate": 0.00028284275114040395, "loss": 5.9179, "step": 550 }, { "epoch": 0.4812486451333189, "grad_norm": 1.3390472579975088, "learning_rate": 0.0002823116008205725, "loss": 5.9107, "step": 555 }, { "epoch": 0.48558421851289835, "grad_norm": 0.6905384313954396, "learning_rate": 0.00028177286822800713, "loss": 5.796, "step": 560 }, { "epoch": 0.4899197918924778, "grad_norm": 2.8514904783073898, "learning_rate": 0.0002812265842354162, "loss": 5.7603, "step": 565 }, { "epoch": 0.4942553652720572, "grad_norm": 1.5555920419031897, "learning_rate": 0.0002806727801482498, "loss": 5.9134, "step": 570 }, { "epoch": 0.4985909386516367, "grad_norm": 0.8043247332715469, "learning_rate": 0.000280111487702906, "loss": 5.8254, "step": 575 }, { "epoch": 0.5029265120312161, "grad_norm": 0.5751977544216561, "learning_rate": 0.0002795427390649119, "loss": 5.7081, "step": 580 }, { "epoch": 0.5072620854107955, "grad_norm": 0.6822440874804724, "learning_rate": 0.00027896656682708094, "loss": 5.6121, "step": 585 }, { "epoch": 0.511597658790375, "grad_norm": 0.8797162911577192, "learning_rate": 0.0002783830040076444, "loss": 5.5998, "step": 590 }, { "epoch": 0.5159332321699545, "grad_norm": 0.4760553301829561, "learning_rate": 0.0002777920840483596, "loss": 5.5739, "step": 595 }, { "epoch": 0.5202688055495339, "grad_norm": 1.2070114825643932, "learning_rate": 0.0002771938408125936, "loss": 5.539, "step": 600 }, { "epoch": 0.5246043789291134, "grad_norm": 0.3589877318117702, "learning_rate": 0.00027658830858338245, "loss": 5.5504, "step": 605 }, { "epoch": 0.5289399523086928, "grad_norm": 0.7944121659454099, "learning_rate": 0.0002759755220614664, "loss": 5.5072, "step": 610 }, { "epoch": 0.5332755256882723, "grad_norm": 0.7202458729236075, "learning_rate": 0.00027535551636330175, "loss": 5.454, "step": 615 }, { "epoch": 0.5376110990678518, "grad_norm": 2.106706884389399, "learning_rate": 0.0002747283270190482, "loss": 5.4935, "step": 620 }, { "epoch": 0.5419466724474312, "grad_norm": 1.3364284227174121, "learning_rate": 0.0002740939899705327, "loss": 5.5994, "step": 625 }, { "epoch": 0.5462822458270106, "grad_norm": 0.6407462262016212, "learning_rate": 0.00027345254156918976, "loss": 5.5447, "step": 630 }, { "epoch": 0.55061781920659, "grad_norm": 0.7271335411877718, "learning_rate": 0.00027280401857397854, "loss": 5.4461, "step": 635 }, { "epoch": 0.5549533925861695, "grad_norm": 0.9256391048561129, "learning_rate": 0.0002721484581492762, "loss": 5.3663, "step": 640 }, { "epoch": 0.559288965965749, "grad_norm": 0.7708081453999853, "learning_rate": 0.00027148589786274793, "loss": 5.3796, "step": 645 }, { "epoch": 0.5636245393453284, "grad_norm": 0.7777461095797896, "learning_rate": 0.00027081637568319446, "loss": 5.2963, "step": 650 }, { "epoch": 0.5679601127249079, "grad_norm": 0.5329435487662679, "learning_rate": 0.00027013992997837585, "loss": 5.2219, "step": 655 }, { "epoch": 0.5722956861044873, "grad_norm": 0.6524870578047196, "learning_rate": 0.0002694565995128132, "loss": 5.2601, "step": 660 }, { "epoch": 0.5766312594840668, "grad_norm": 1.2050259366493623, "learning_rate": 0.0002687664234455667, "loss": 5.2788, "step": 665 }, { "epoch": 0.5809668328636463, "grad_norm": 0.6205369626056482, "learning_rate": 0.00026806944132799196, "loss": 5.1169, "step": 670 }, { "epoch": 0.5853024062432257, "grad_norm": 0.9580684303122623, "learning_rate": 0.0002673656931014735, "loss": 5.1311, "step": 675 }, { "epoch": 0.5896379796228051, "grad_norm": 0.9227742616515185, "learning_rate": 0.00026665521909513545, "loss": 5.1194, "step": 680 }, { "epoch": 0.5939735530023845, "grad_norm": 0.35677362077396757, "learning_rate": 0.00026593806002353086, "loss": 5.0662, "step": 685 }, { "epoch": 0.598309126381964, "grad_norm": 0.5456219742352171, "learning_rate": 0.0002652142569843083, "loss": 4.9998, "step": 690 }, { "epoch": 0.6026446997615434, "grad_norm": 2.0462321590818453, "learning_rate": 0.0002644838514558568, "loss": 5.0121, "step": 695 }, { "epoch": 0.6069802731411229, "grad_norm": 1.2389697363524088, "learning_rate": 0.00026374688529492887, "loss": 4.9563, "step": 700 }, { "epoch": 0.6113158465207024, "grad_norm": 0.6691025924761106, "learning_rate": 0.0002630034007342416, "loss": 4.9738, "step": 705 }, { "epoch": 0.6156514199002818, "grad_norm": 0.6906310139155549, "learning_rate": 0.00026225344038005707, "loss": 4.9986, "step": 710 }, { "epoch": 0.6199869932798613, "grad_norm": 0.9744576847003475, "learning_rate": 0.00026149704720974004, "loss": 4.9758, "step": 715 }, { "epoch": 0.6243225666594407, "grad_norm": 1.4069257744518648, "learning_rate": 0.0002607342645692955, "loss": 4.9898, "step": 720 }, { "epoch": 0.6286581400390202, "grad_norm": 1.2113043966119588, "learning_rate": 0.0002599651361708846, "loss": 4.947, "step": 725 }, { "epoch": 0.6329937134185996, "grad_norm": 0.7232403329008882, "learning_rate": 0.0002591897060903197, "loss": 4.8734, "step": 730 }, { "epoch": 0.637329286798179, "grad_norm": 0.5694346606113075, "learning_rate": 0.0002584080187645384, "loss": 4.8135, "step": 735 }, { "epoch": 0.6416648601777585, "grad_norm": 0.4648121706598134, "learning_rate": 0.00025762011898905723, "loss": 4.8169, "step": 740 }, { "epoch": 0.6460004335573379, "grad_norm": 0.57832813623837, "learning_rate": 0.00025682605191540447, "loss": 4.7676, "step": 745 }, { "epoch": 0.6503360069369174, "grad_norm": 0.690363584833736, "learning_rate": 0.00025602586304853265, "loss": 4.7134, "step": 750 }, { "epoch": 0.6546715803164969, "grad_norm": 0.2557886503788499, "learning_rate": 0.000255219598244211, "loss": 4.7075, "step": 755 }, { "epoch": 0.6590071536960763, "grad_norm": 0.7529134213339467, "learning_rate": 0.00025440730370639744, "loss": 4.65, "step": 760 }, { "epoch": 0.6633427270756558, "grad_norm": 0.5243957618805375, "learning_rate": 0.00025358902598459097, "loss": 4.6432, "step": 765 }, { "epoch": 0.6676783004552352, "grad_norm": 0.8067461754194933, "learning_rate": 0.00025276481197116397, "loss": 4.6508, "step": 770 }, { "epoch": 0.6720138738348147, "grad_norm": 1.2265176252828778, "learning_rate": 0.00025193470889867505, "loss": 4.6586, "step": 775 }, { "epoch": 0.676349447214394, "grad_norm": 1.0124995665289962, "learning_rate": 0.00025109876433716236, "loss": 4.5788, "step": 780 }, { "epoch": 0.6806850205939735, "grad_norm": 0.9858327008465355, "learning_rate": 0.0002502570261914174, "loss": 4.5459, "step": 785 }, { "epoch": 0.685020593973553, "grad_norm": 0.675937090658665, "learning_rate": 0.0002494095426982399, "loss": 4.5489, "step": 790 }, { "epoch": 0.6893561673531324, "grad_norm": 0.9300433002492945, "learning_rate": 0.0002485563624236736, "loss": 4.55, "step": 795 }, { "epoch": 0.6936917407327119, "grad_norm": 0.5489664608896133, "learning_rate": 0.0002476975342602229, "loss": 4.4796, "step": 800 }, { "epoch": 0.6980273141122914, "grad_norm": 0.826574591896581, "learning_rate": 0.00024683310742405106, "loss": 4.4609, "step": 805 }, { "epoch": 0.7023628874918708, "grad_norm": 1.1037971961437545, "learning_rate": 0.00024596313145216033, "loss": 4.5026, "step": 810 }, { "epoch": 0.7066984608714503, "grad_norm": 0.4898602321700531, "learning_rate": 0.0002450876561995523, "loss": 4.4346, "step": 815 }, { "epoch": 0.7110340342510297, "grad_norm": 0.5951952065642176, "learning_rate": 0.00024420673183637146, "loss": 4.4397, "step": 820 }, { "epoch": 0.7153696076306092, "grad_norm": 0.8199632042459121, "learning_rate": 0.00024332040884503023, "loss": 4.4169, "step": 825 }, { "epoch": 0.7197051810101885, "grad_norm": 0.7484859334102708, "learning_rate": 0.00024242873801731552, "loss": 4.4214, "step": 830 }, { "epoch": 0.724040754389768, "grad_norm": 0.3847206228382832, "learning_rate": 0.0002415317704514785, "loss": 4.4005, "step": 835 }, { "epoch": 0.7283763277693475, "grad_norm": 0.25033596262216246, "learning_rate": 0.0002406295575493061, "loss": 4.2858, "step": 840 }, { "epoch": 0.7327119011489269, "grad_norm": 0.6631031175545227, "learning_rate": 0.00023972215101317545, "loss": 4.2667, "step": 845 }, { "epoch": 0.7370474745285064, "grad_norm": 0.9152978939845885, "learning_rate": 0.00023880960284309116, "loss": 4.2363, "step": 850 }, { "epoch": 0.7413830479080858, "grad_norm": 1.6470126913016125, "learning_rate": 0.000237891965333705, "loss": 4.2788, "step": 855 }, { "epoch": 0.7457186212876653, "grad_norm": 0.9797191599004809, "learning_rate": 0.00023696929107131962, "loss": 4.3022, "step": 860 }, { "epoch": 0.7500541946672448, "grad_norm": 0.6420083064783988, "learning_rate": 0.00023604163293087447, "loss": 4.2127, "step": 865 }, { "epoch": 0.7543897680468242, "grad_norm": 0.3430659331012604, "learning_rate": 0.0002351090440729163, "loss": 4.183, "step": 870 }, { "epoch": 0.7587253414264037, "grad_norm": 0.5456932919947513, "learning_rate": 0.00023417157794055233, "loss": 4.1664, "step": 875 }, { "epoch": 0.763060914805983, "grad_norm": 0.6297764897631516, "learning_rate": 0.0002332292882563877, "loss": 4.1577, "step": 880 }, { "epoch": 0.7673964881855625, "grad_norm": 0.7190804662247191, "learning_rate": 0.00023228222901944693, "loss": 4.1005, "step": 885 }, { "epoch": 0.771732061565142, "grad_norm": 0.7661623579650987, "learning_rate": 0.00023133045450207952, "loss": 4.1292, "step": 890 }, { "epoch": 0.7760676349447214, "grad_norm": 1.0103277074141193, "learning_rate": 0.00023037401924684946, "loss": 4.1244, "step": 895 }, { "epoch": 0.7804032083243009, "grad_norm": 0.6247976412624747, "learning_rate": 0.0002294129780634101, "loss": 4.1062, "step": 900 }, { "epoch": 0.7847387817038803, "grad_norm": 0.6772595937883702, "learning_rate": 0.00022844738602536275, "loss": 4.0618, "step": 905 }, { "epoch": 0.7890743550834598, "grad_norm": 0.6184206318420459, "learning_rate": 0.00022747729846710085, "loss": 4.0676, "step": 910 }, { "epoch": 0.7934099284630393, "grad_norm": 0.638858720650207, "learning_rate": 0.0002265027709806391, "loss": 4.0643, "step": 915 }, { "epoch": 0.7977455018426187, "grad_norm": 0.6957940244542513, "learning_rate": 0.00022552385941242736, "loss": 4.0841, "step": 920 }, { "epoch": 0.8020810752221982, "grad_norm": 0.9335514321597355, "learning_rate": 0.00022454061986015047, "loss": 4.0154, "step": 925 }, { "epoch": 0.8064166486017775, "grad_norm": 0.4531070372158566, "learning_rate": 0.0002235531086695137, "loss": 3.9897, "step": 930 }, { "epoch": 0.810752221981357, "grad_norm": 0.9518970467477127, "learning_rate": 0.00022256138243101337, "loss": 3.9785, "step": 935 }, { "epoch": 0.8150877953609365, "grad_norm": 0.7222836717307966, "learning_rate": 0.00022156549797669434, "loss": 3.9408, "step": 940 }, { "epoch": 0.8194233687405159, "grad_norm": 0.4376277363109283, "learning_rate": 0.00022056551237689277, "loss": 3.9633, "step": 945 }, { "epoch": 0.8237589421200954, "grad_norm": 0.40106634394410035, "learning_rate": 0.00021956148293696584, "loss": 3.9324, "step": 950 }, { "epoch": 0.8280945154996748, "grad_norm": 0.4672307168059042, "learning_rate": 0.00021855346719400787, "loss": 3.9066, "step": 955 }, { "epoch": 0.8324300888792543, "grad_norm": 0.9110683889580367, "learning_rate": 0.00021754152291355284, "loss": 3.8493, "step": 960 }, { "epoch": 0.8367656622588338, "grad_norm": 0.8166157046078413, "learning_rate": 0.0002165257080862643, "loss": 3.8129, "step": 965 }, { "epoch": 0.8411012356384132, "grad_norm": 0.5406920005139866, "learning_rate": 0.00021550608092461208, "loss": 3.8946, "step": 970 }, { "epoch": 0.8454368090179927, "grad_norm": 0.9346807879316044, "learning_rate": 0.00021448269985953634, "loss": 3.8407, "step": 975 }, { "epoch": 0.849772382397572, "grad_norm": 0.5449137337223598, "learning_rate": 0.00021345562353709905, "loss": 3.8459, "step": 980 }, { "epoch": 0.8541079557771515, "grad_norm": 0.5017093393188926, "learning_rate": 0.00021242491081512329, "loss": 3.8334, "step": 985 }, { "epoch": 0.858443529156731, "grad_norm": 0.3346538476585638, "learning_rate": 0.00021139062075982038, "loss": 3.7552, "step": 990 }, { "epoch": 0.8627791025363104, "grad_norm": 0.3170992203885485, "learning_rate": 0.00021035281264240491, "loss": 3.7351, "step": 995 }, { "epoch": 0.8671146759158899, "grad_norm": 0.6053179178799784, "learning_rate": 0.00020931154593569813, "loss": 3.7225, "step": 1000 }, { "epoch": 0.8714502492954693, "grad_norm": 1.3058143067536492, "learning_rate": 0.00020826688031072, "loss": 3.7079, "step": 1005 }, { "epoch": 0.8757858226750488, "grad_norm": 0.5266184524547373, "learning_rate": 0.00020721887563326924, "loss": 3.7352, "step": 1010 }, { "epoch": 0.8801213960546282, "grad_norm": 0.420302054844465, "learning_rate": 0.0002061675919604932, "loss": 3.6589, "step": 1015 }, { "epoch": 0.8844569694342077, "grad_norm": 0.7488296555492675, "learning_rate": 0.00020511308953744578, "loss": 3.6358, "step": 1020 }, { "epoch": 0.8887925428137872, "grad_norm": 0.6601885350762652, "learning_rate": 0.0002040554287936352, "loss": 3.6682, "step": 1025 }, { "epoch": 0.8931281161933666, "grad_norm": 0.3880230100654199, "learning_rate": 0.000202994670339561, "loss": 3.6391, "step": 1030 }, { "epoch": 0.897463689572946, "grad_norm": 0.3917175363168505, "learning_rate": 0.00020193087496324068, "loss": 3.6016, "step": 1035 }, { "epoch": 0.9017992629525254, "grad_norm": 0.46356273931086533, "learning_rate": 0.00020086410362672608, "loss": 3.5906, "step": 1040 }, { "epoch": 0.9061348363321049, "grad_norm": 1.1143119522475413, "learning_rate": 0.00019979441746261007, "loss": 3.6533, "step": 1045 }, { "epoch": 0.9104704097116844, "grad_norm": 0.9967538144738599, "learning_rate": 0.0001987218777705231, "loss": 3.6323, "step": 1050 }, { "epoch": 0.9148059830912638, "grad_norm": 0.44096052107987527, "learning_rate": 0.0001976465460136204, "loss": 3.5632, "step": 1055 }, { "epoch": 0.9191415564708433, "grad_norm": 0.2828790472438776, "learning_rate": 0.0001965684838150598, "loss": 3.5499, "step": 1060 }, { "epoch": 0.9234771298504227, "grad_norm": 0.3974061570448302, "learning_rate": 0.00019548775295447047, "loss": 3.5173, "step": 1065 }, { "epoch": 0.9278127032300022, "grad_norm": 0.33203900964680516, "learning_rate": 0.00019440441536441202, "loss": 3.514, "step": 1070 }, { "epoch": 0.9321482766095817, "grad_norm": 0.5937750161188399, "learning_rate": 0.00019331853312682613, "loss": 3.4923, "step": 1075 }, { "epoch": 0.9364838499891611, "grad_norm": 0.45882557948133224, "learning_rate": 0.00019223016846947843, "loss": 3.4693, "step": 1080 }, { "epoch": 0.9408194233687405, "grad_norm": 0.49337347425087247, "learning_rate": 0.00019113938376239247, "loss": 3.4604, "step": 1085 }, { "epoch": 0.9451549967483199, "grad_norm": 0.35941699726534343, "learning_rate": 0.00019004624151427568, "loss": 3.4682, "step": 1090 }, { "epoch": 0.9494905701278994, "grad_norm": 0.30818796537571425, "learning_rate": 0.0001889508043689372, "loss": 3.4252, "step": 1095 }, { "epoch": 0.9538261435074789, "grad_norm": 0.7001199109738344, "learning_rate": 0.00018785313510169782, "loss": 3.4065, "step": 1100 }, { "epoch": 0.9581617168870583, "grad_norm": 0.9768195615514486, "learning_rate": 0.0001867532966157929, "loss": 3.4084, "step": 1105 }, { "epoch": 0.9624972902666378, "grad_norm": 0.5379630797492526, "learning_rate": 0.0001856513519387673, "loss": 3.4402, "step": 1110 }, { "epoch": 0.9668328636462172, "grad_norm": 0.6853314266411482, "learning_rate": 0.0001845473642188637, "loss": 3.411, "step": 1115 }, { "epoch": 0.9711684370257967, "grad_norm": 0.28464215705198403, "learning_rate": 0.00018344139672140384, "loss": 3.396, "step": 1120 }, { "epoch": 0.9755040104053762, "grad_norm": 0.3931004534338328, "learning_rate": 0.00018233351282516283, "loss": 3.3599, "step": 1125 }, { "epoch": 0.9798395837849556, "grad_norm": 0.4836933366464829, "learning_rate": 0.00018122377601873733, "loss": 3.3365, "step": 1130 }, { "epoch": 0.984175157164535, "grad_norm": 0.4500537170978018, "learning_rate": 0.00018011224989690727, "loss": 3.3036, "step": 1135 }, { "epoch": 0.9885107305441144, "grad_norm": 0.2859298083164817, "learning_rate": 0.00017899899815699134, "loss": 3.2616, "step": 1140 }, { "epoch": 0.9928463039236939, "grad_norm": 0.7548873266247055, "learning_rate": 0.00017788408459519674, "loss": 3.2599, "step": 1145 }, { "epoch": 0.9971818773032733, "grad_norm": 0.32602384410109714, "learning_rate": 0.00017676757310296356, "loss": 3.2946, "step": 1150 }, { "epoch": 1.0008671146759158, "grad_norm": 1.0724922966955157, "learning_rate": 0.00017564952766330308, "loss": 3.2325, "step": 1155 }, { "epoch": 1.0052026880554954, "grad_norm": 0.3056819985122965, "learning_rate": 0.00017453001234713107, "loss": 3.2937, "step": 1160 }, { "epoch": 1.0095382614350747, "grad_norm": 0.36438507475227383, "learning_rate": 0.0001734090913095966, "loss": 3.2332, "step": 1165 }, { "epoch": 1.0138738348146543, "grad_norm": 0.30915725984980597, "learning_rate": 0.00017228682878640508, "loss": 3.2364, "step": 1170 }, { "epoch": 1.0182094081942337, "grad_norm": 0.33647494500052355, "learning_rate": 0.0001711632890901374, "loss": 3.2003, "step": 1175 }, { "epoch": 1.0225449815738132, "grad_norm": 0.312162857766132, "learning_rate": 0.00017003853660656435, "loss": 3.1807, "step": 1180 }, { "epoch": 1.0268805549533926, "grad_norm": 0.28432133622360134, "learning_rate": 0.00016891263579095698, "loss": 3.1668, "step": 1185 }, { "epoch": 1.031216128332972, "grad_norm": 0.27204342913825097, "learning_rate": 0.0001677856511643928, "loss": 3.1283, "step": 1190 }, { "epoch": 1.0355517017125515, "grad_norm": 0.47073942395246565, "learning_rate": 0.00016665764731005838, "loss": 3.0741, "step": 1195 }, { "epoch": 1.0398872750921309, "grad_norm": 0.48701637427424527, "learning_rate": 0.0001655286888695484, "loss": 3.079, "step": 1200 }, { "epoch": 1.0442228484717104, "grad_norm": 0.43725300587861615, "learning_rate": 0.0001643988405391612, "loss": 3.1095, "step": 1205 }, { "epoch": 1.0485584218512898, "grad_norm": 0.4519430510361337, "learning_rate": 0.00016326816706619136, "loss": 3.0779, "step": 1210 }, { "epoch": 1.0528939952308694, "grad_norm": 0.3697980433575495, "learning_rate": 0.00016213673324521913, "loss": 3.1321, "step": 1215 }, { "epoch": 1.0572295686104487, "grad_norm": 0.3772355822034261, "learning_rate": 0.00016100460391439749, "loss": 3.0517, "step": 1220 }, { "epoch": 1.0615651419900283, "grad_norm": 0.3795892280711555, "learning_rate": 0.0001598718439517364, "loss": 3.0278, "step": 1225 }, { "epoch": 1.0659007153696076, "grad_norm": 0.2889179402680778, "learning_rate": 0.0001587385182713849, "loss": 3.0402, "step": 1230 }, { "epoch": 1.0702362887491872, "grad_norm": 0.3755593221213261, "learning_rate": 0.0001576046918199112, "loss": 2.994, "step": 1235 }, { "epoch": 1.0745718621287665, "grad_norm": 0.3476317696753179, "learning_rate": 0.0001564704295725808, "loss": 3.0468, "step": 1240 }, { "epoch": 1.078907435508346, "grad_norm": 0.32852407166347947, "learning_rate": 0.00015533579652963288, "loss": 2.9539, "step": 1245 }, { "epoch": 1.0832430088879255, "grad_norm": 0.1822141394179994, "learning_rate": 0.00015420085771255566, "loss": 3.0026, "step": 1250 }, { "epoch": 1.0875785822675048, "grad_norm": 0.270319630849222, "learning_rate": 0.00015306567816036006, "loss": 2.976, "step": 1255 }, { "epoch": 1.0919141556470844, "grad_norm": 0.5149999760979279, "learning_rate": 0.00015193032292585247, "loss": 2.9326, "step": 1260 }, { "epoch": 1.0962497290266637, "grad_norm": 0.4445856562229245, "learning_rate": 0.00015079485707190717, "loss": 2.9483, "step": 1265 }, { "epoch": 1.1005853024062433, "grad_norm": 0.5660752187512902, "learning_rate": 0.00014965934566773753, "loss": 2.9209, "step": 1270 }, { "epoch": 1.1049208757858227, "grad_norm": 0.36107453692493174, "learning_rate": 0.00014852385378516712, "loss": 2.9059, "step": 1275 }, { "epoch": 1.1092564491654022, "grad_norm": 0.8307960065897146, "learning_rate": 0.00014738844649490106, "loss": 2.9135, "step": 1280 }, { "epoch": 1.1135920225449816, "grad_norm": 0.32930556448154574, "learning_rate": 0.0001462531888627966, "loss": 2.931, "step": 1285 }, { "epoch": 1.117927595924561, "grad_norm": 0.45094251331146196, "learning_rate": 0.00014511814594613461, "loss": 2.8794, "step": 1290 }, { "epoch": 1.1222631693041405, "grad_norm": 1.1023315045514466, "learning_rate": 0.00014398338278989167, "loss": 2.8964, "step": 1295 }, { "epoch": 1.1265987426837198, "grad_norm": 0.6884577647184886, "learning_rate": 0.00014284896442301218, "loss": 2.9186, "step": 1300 }, { "epoch": 1.1309343160632994, "grad_norm": 0.6853011714731221, "learning_rate": 0.00014171495585468195, "loss": 2.9093, "step": 1305 }, { "epoch": 1.1352698894428788, "grad_norm": 0.4444521764738651, "learning_rate": 0.000140581422070603, "loss": 2.8949, "step": 1310 }, { "epoch": 1.1396054628224583, "grad_norm": 0.36607451431757926, "learning_rate": 0.00013944842802926904, "loss": 2.8727, "step": 1315 }, { "epoch": 1.1439410362020377, "grad_norm": 0.20621050239222513, "learning_rate": 0.00013831603865824328, "loss": 2.8068, "step": 1320 }, { "epoch": 1.1482766095816173, "grad_norm": 0.29548591941866714, "learning_rate": 0.00013718431885043772, "loss": 2.8033, "step": 1325 }, { "epoch": 1.1526121829611966, "grad_norm": 0.3480007759133749, "learning_rate": 0.000136053333460394, "loss": 2.8303, "step": 1330 }, { "epoch": 1.156947756340776, "grad_norm": 0.2962266371943945, "learning_rate": 0.0001349231473005673, "loss": 2.7893, "step": 1335 }, { "epoch": 1.1612833297203555, "grad_norm": 0.23448406684507436, "learning_rate": 0.00013379382513761175, "loss": 2.7797, "step": 1340 }, { "epoch": 1.1656189030999349, "grad_norm": 0.3234043699614117, "learning_rate": 0.00013266543168866934, "loss": 2.7607, "step": 1345 }, { "epoch": 1.1699544764795144, "grad_norm": 0.3404947812121631, "learning_rate": 0.0001315380316176609, "loss": 2.7567, "step": 1350 }, { "epoch": 1.1742900498590938, "grad_norm": 0.6066048669028199, "learning_rate": 0.0001304116895315805, "loss": 2.7501, "step": 1355 }, { "epoch": 1.1786256232386734, "grad_norm": 0.44548332421799974, "learning_rate": 0.00012928646997679326, "loss": 2.7475, "step": 1360 }, { "epoch": 1.1829611966182527, "grad_norm": 0.3939467687702201, "learning_rate": 0.00012816243743533624, "loss": 2.7117, "step": 1365 }, { "epoch": 1.1872967699978323, "grad_norm": 0.3011307288220261, "learning_rate": 0.00012703965632122327, "loss": 2.7543, "step": 1370 }, { "epoch": 1.1916323433774116, "grad_norm": 0.22501315848677994, "learning_rate": 0.00012591819097675382, "loss": 2.7462, "step": 1375 }, { "epoch": 1.1959679167569912, "grad_norm": 0.32722976515069685, "learning_rate": 0.0001247981056688254, "loss": 2.6968, "step": 1380 }, { "epoch": 1.2003034901365706, "grad_norm": 0.1590488680202715, "learning_rate": 0.00012367946458525099, "loss": 2.7045, "step": 1385 }, { "epoch": 1.20463906351615, "grad_norm": 0.3152582769975424, "learning_rate": 0.00012256233183108068, "loss": 2.6789, "step": 1390 }, { "epoch": 1.2089746368957295, "grad_norm": 0.22965781079535302, "learning_rate": 0.00012144677142492789, "loss": 2.7101, "step": 1395 }, { "epoch": 1.2133102102753088, "grad_norm": 0.4684583447317611, "learning_rate": 0.00012033284729530057, "loss": 2.6259, "step": 1400 }, { "epoch": 1.2176457836548884, "grad_norm": 0.22363991138693204, "learning_rate": 0.00011922062327693832, "loss": 2.6717, "step": 1405 }, { "epoch": 1.2219813570344678, "grad_norm": 0.33497001761272166, "learning_rate": 0.00011811016310715355, "loss": 2.6517, "step": 1410 }, { "epoch": 1.2263169304140473, "grad_norm": 0.20548859646502277, "learning_rate": 0.00011700153042217931, "loss": 2.6677, "step": 1415 }, { "epoch": 1.2306525037936267, "grad_norm": 0.2652083246967948, "learning_rate": 0.00011589478875352255, "loss": 2.6543, "step": 1420 }, { "epoch": 1.2349880771732062, "grad_norm": 0.2529645672839692, "learning_rate": 0.00011479000152432319, "loss": 2.6205, "step": 1425 }, { "epoch": 1.2393236505527856, "grad_norm": 0.2601375038926653, "learning_rate": 0.0001136872320457197, "loss": 2.6102, "step": 1430 }, { "epoch": 1.2436592239323652, "grad_norm": 0.19250161537810334, "learning_rate": 0.00011258654351322107, "loss": 2.631, "step": 1435 }, { "epoch": 1.2479947973119445, "grad_norm": 0.35217150816617415, "learning_rate": 0.00011148799900308509, "loss": 2.6013, "step": 1440 }, { "epoch": 1.2523303706915239, "grad_norm": 0.8877189979150436, "learning_rate": 0.00011039166146870383, "loss": 2.6335, "step": 1445 }, { "epoch": 1.2566659440711034, "grad_norm": 0.6027781279490154, "learning_rate": 0.00010929759373699613, "loss": 2.6011, "step": 1450 }, { "epoch": 1.2610015174506828, "grad_norm": 0.29684477637886336, "learning_rate": 0.00010820585850480696, "loss": 2.6083, "step": 1455 }, { "epoch": 1.2653370908302624, "grad_norm": 0.22866123103951785, "learning_rate": 0.00010711651833531463, "loss": 2.6249, "step": 1460 }, { "epoch": 1.2696726642098417, "grad_norm": 0.2570183068878416, "learning_rate": 0.00010602963565444577, "loss": 2.5858, "step": 1465 }, { "epoch": 1.2740082375894213, "grad_norm": 0.31391058369721064, "learning_rate": 0.00010494527274729748, "loss": 2.5606, "step": 1470 }, { "epoch": 1.2783438109690006, "grad_norm": 0.3203458432202096, "learning_rate": 0.00010386349175456825, "loss": 2.5637, "step": 1475 }, { "epoch": 1.28267938434858, "grad_norm": 0.30834075039079006, "learning_rate": 0.00010278435466899714, "loss": 2.6011, "step": 1480 }, { "epoch": 1.2870149577281595, "grad_norm": 0.22399943173892975, "learning_rate": 0.00010170792333181084, "loss": 2.5288, "step": 1485 }, { "epoch": 1.2913505311077391, "grad_norm": 0.3319547375893817, "learning_rate": 0.00010063425942917974, "loss": 2.5375, "step": 1490 }, { "epoch": 1.2956861044873185, "grad_norm": 0.3315527934802732, "learning_rate": 9.956342448868354e-05, "loss": 2.5274, "step": 1495 }, { "epoch": 1.3000216778668978, "grad_norm": 0.3580645974138616, "learning_rate": 9.849547987578457e-05, "loss": 2.5585, "step": 1500 }, { "epoch": 1.3043572512464774, "grad_norm": 0.2780635499992022, "learning_rate": 9.743048679031163e-05, "loss": 2.5291, "step": 1505 }, { "epoch": 1.3086928246260567, "grad_norm": 0.40375989395133016, "learning_rate": 9.636850626295282e-05, "loss": 2.517, "step": 1510 }, { "epoch": 1.3130283980056363, "grad_norm": 0.3527800248976021, "learning_rate": 9.530959915175796e-05, "loss": 2.5277, "step": 1515 }, { "epoch": 1.3173639713852157, "grad_norm": 0.23822673694395258, "learning_rate": 9.425382613865107e-05, "loss": 2.5014, "step": 1520 }, { "epoch": 1.3216995447647952, "grad_norm": 0.1747138201174964, "learning_rate": 9.32012477259531e-05, "loss": 2.4866, "step": 1525 }, { "epoch": 1.3260351181443746, "grad_norm": 0.1954668050935581, "learning_rate": 9.215192423291463e-05, "loss": 2.5021, "step": 1530 }, { "epoch": 1.330370691523954, "grad_norm": 0.19057547870092642, "learning_rate": 9.110591579225906e-05, "loss": 2.5044, "step": 1535 }, { "epoch": 1.3347062649035335, "grad_norm": 0.2745708957809259, "learning_rate": 9.006328234673701e-05, "loss": 2.5073, "step": 1540 }, { "epoch": 1.339041838283113, "grad_norm": 0.20139794408374664, "learning_rate": 8.90240836456909e-05, "loss": 2.5033, "step": 1545 }, { "epoch": 1.3433774116626924, "grad_norm": 0.201183788959332, "learning_rate": 8.798837924163098e-05, "loss": 2.4782, "step": 1550 }, { "epoch": 1.3477129850422718, "grad_norm": 0.3799911989499351, "learning_rate": 8.695622848682291e-05, "loss": 2.4951, "step": 1555 }, { "epoch": 1.3520485584218513, "grad_norm": 0.1990146960863876, "learning_rate": 8.592769052988607e-05, "loss": 2.4901, "step": 1560 }, { "epoch": 1.3563841318014307, "grad_norm": 0.34068891992062217, "learning_rate": 8.490282431240416e-05, "loss": 2.4522, "step": 1565 }, { "epoch": 1.3607197051810103, "grad_norm": 0.2291955529635031, "learning_rate": 8.388168856554777e-05, "loss": 2.4203, "step": 1570 }, { "epoch": 1.3650552785605896, "grad_norm": 0.3248337694954652, "learning_rate": 8.286434180670822e-05, "loss": 2.4868, "step": 1575 }, { "epoch": 1.3693908519401692, "grad_norm": 0.2990054213661462, "learning_rate": 8.185084233614444e-05, "loss": 2.4363, "step": 1580 }, { "epoch": 1.3737264253197485, "grad_norm": 0.233583205490779, "learning_rate": 8.084124823364204e-05, "loss": 2.4807, "step": 1585 }, { "epoch": 1.3780619986993279, "grad_norm": 0.2518607909224173, "learning_rate": 7.983561735518474e-05, "loss": 2.4358, "step": 1590 }, { "epoch": 1.3823975720789075, "grad_norm": 0.19984297609600038, "learning_rate": 7.883400732963913e-05, "loss": 2.478, "step": 1595 }, { "epoch": 1.3867331454584868, "grad_norm": 0.1648080690690456, "learning_rate": 7.783647555545217e-05, "loss": 2.442, "step": 1600 }, { "epoch": 1.3910687188380664, "grad_norm": 0.17610648998979445, "learning_rate": 7.684307919736158e-05, "loss": 2.41, "step": 1605 }, { "epoch": 1.3954042922176457, "grad_norm": 0.14818666921811272, "learning_rate": 7.585387518312028e-05, "loss": 2.4206, "step": 1610 }, { "epoch": 1.3997398655972253, "grad_norm": 0.2001322045633342, "learning_rate": 7.486892020023406e-05, "loss": 2.3821, "step": 1615 }, { "epoch": 1.4040754389768046, "grad_norm": 0.2200377253653553, "learning_rate": 7.388827069271276e-05, "loss": 2.4257, "step": 1620 }, { "epoch": 1.408411012356384, "grad_norm": 0.1844230292041018, "learning_rate": 7.291198285783602e-05, "loss": 2.4135, "step": 1625 }, { "epoch": 1.4127465857359636, "grad_norm": 0.1929745133150067, "learning_rate": 7.194011264293254e-05, "loss": 2.3777, "step": 1630 }, { "epoch": 1.4170821591155431, "grad_norm": 0.2915968829982784, "learning_rate": 7.097271574217421e-05, "loss": 2.4181, "step": 1635 }, { "epoch": 1.4214177324951225, "grad_norm": 0.269578671519446, "learning_rate": 7.000984759338422e-05, "loss": 2.3788, "step": 1640 }, { "epoch": 1.4257533058747018, "grad_norm": 0.19512593934978045, "learning_rate": 6.905156337486045e-05, "loss": 2.391, "step": 1645 }, { "epoch": 1.4300888792542814, "grad_norm": 0.2593023962861574, "learning_rate": 6.809791800221313e-05, "loss": 2.3963, "step": 1650 }, { "epoch": 1.4344244526338608, "grad_norm": 0.14901754063689115, "learning_rate": 6.714896612521794e-05, "loss": 2.3976, "step": 1655 }, { "epoch": 1.4387600260134403, "grad_norm": 0.302187906502475, "learning_rate": 6.620476212468424e-05, "loss": 2.4194, "step": 1660 }, { "epoch": 1.4430955993930197, "grad_norm": 0.24180384264466487, "learning_rate": 6.526536010933874e-05, "loss": 2.4295, "step": 1665 }, { "epoch": 1.4474311727725993, "grad_norm": 0.22169225675339646, "learning_rate": 6.433081391272467e-05, "loss": 2.3976, "step": 1670 }, { "epoch": 1.4517667461521786, "grad_norm": 0.2799481789977155, "learning_rate": 6.340117709011693e-05, "loss": 2.392, "step": 1675 }, { "epoch": 1.456102319531758, "grad_norm": 0.28103864837065845, "learning_rate": 6.247650291545287e-05, "loss": 2.3708, "step": 1680 }, { "epoch": 1.4604378929113375, "grad_norm": 0.24593789213337805, "learning_rate": 6.155684437827931e-05, "loss": 2.4043, "step": 1685 }, { "epoch": 1.464773466290917, "grad_norm": 0.19968974812237159, "learning_rate": 6.064225418071632e-05, "loss": 2.3784, "step": 1690 }, { "epoch": 1.4691090396704964, "grad_norm": 0.2665198920246072, "learning_rate": 5.9732784734436554e-05, "loss": 2.387, "step": 1695 }, { "epoch": 1.4734446130500758, "grad_norm": 0.33954513870533093, "learning_rate": 5.882848815766189e-05, "loss": 2.3659, "step": 1700 }, { "epoch": 1.4777801864296554, "grad_norm": 0.3174285529770626, "learning_rate": 5.792941627217707e-05, "loss": 2.3703, "step": 1705 }, { "epoch": 1.4821157598092347, "grad_norm": 0.21514280209891976, "learning_rate": 5.703562060035951e-05, "loss": 2.3311, "step": 1710 }, { "epoch": 1.4864513331888143, "grad_norm": 0.26026170716900454, "learning_rate": 5.614715236222702e-05, "loss": 2.3534, "step": 1715 }, { "epoch": 1.4907869065683936, "grad_norm": 0.2072827021307388, "learning_rate": 5.52640624725026e-05, "loss": 2.362, "step": 1720 }, { "epoch": 1.4951224799479732, "grad_norm": 0.18636459137372047, "learning_rate": 5.4386401537696536e-05, "loss": 2.367, "step": 1725 }, { "epoch": 1.4994580533275526, "grad_norm": 0.2616321440338591, "learning_rate": 5.3514219853206464e-05, "loss": 2.3517, "step": 1730 }, { "epoch": 1.503793626707132, "grad_norm": 0.17660365833901004, "learning_rate": 5.264756740043511e-05, "loss": 2.3366, "step": 1735 }, { "epoch": 1.5081292000867115, "grad_norm": 0.14325753486785228, "learning_rate": 5.178649384392603e-05, "loss": 2.3628, "step": 1740 }, { "epoch": 1.512464773466291, "grad_norm": 0.16306886697377787, "learning_rate": 5.093104852851749e-05, "loss": 2.3403, "step": 1745 }, { "epoch": 1.5168003468458704, "grad_norm": 0.17105852766007526, "learning_rate": 5.008128047651488e-05, "loss": 2.3193, "step": 1750 }, { "epoch": 1.5211359202254497, "grad_norm": 0.20749830784985143, "learning_rate": 4.923723838488117e-05, "loss": 2.3519, "step": 1755 }, { "epoch": 1.5254714936050293, "grad_norm": 0.3157660095405772, "learning_rate": 4.839897062244638e-05, "loss": 2.3197, "step": 1760 }, { "epoch": 1.5298070669846087, "grad_norm": 0.18219178823449922, "learning_rate": 4.756652522713599e-05, "loss": 2.3279, "step": 1765 }, { "epoch": 1.534142640364188, "grad_norm": 0.12524649727104128, "learning_rate": 4.673994990321752e-05, "loss": 2.3019, "step": 1770 }, { "epoch": 1.5384782137437676, "grad_norm": 0.1636956452865002, "learning_rate": 4.591929201856727e-05, "loss": 2.2859, "step": 1775 }, { "epoch": 1.5428137871233472, "grad_norm": 0.21967450962811522, "learning_rate": 4.5104598601955805e-05, "loss": 2.3095, "step": 1780 }, { "epoch": 1.5471493605029265, "grad_norm": 0.19863296073280773, "learning_rate": 4.4295916340352625e-05, "loss": 2.2826, "step": 1785 }, { "epoch": 1.5514849338825059, "grad_norm": 0.15602204927659435, "learning_rate": 4.349329157625088e-05, "loss": 2.3522, "step": 1790 }, { "epoch": 1.5558205072620854, "grad_norm": 0.15284396668890557, "learning_rate": 4.269677030501184e-05, "loss": 2.3546, "step": 1795 }, { "epoch": 1.560156080641665, "grad_norm": 0.16000621000672735, "learning_rate": 4.1906398172228704e-05, "loss": 2.3456, "step": 1800 }, { "epoch": 1.5644916540212443, "grad_norm": 0.5401953444913549, "learning_rate": 4.112222047111111e-05, "loss": 2.3475, "step": 1805 }, { "epoch": 1.5688272274008237, "grad_norm": 0.18490516183802747, "learning_rate": 4.034428213988946e-05, "loss": 2.3064, "step": 1810 }, { "epoch": 1.5731628007804033, "grad_norm": 0.2648705938773556, "learning_rate": 3.957262775923969e-05, "loss": 2.3087, "step": 1815 }, { "epoch": 1.5774983741599826, "grad_norm": 0.20317358607574193, "learning_rate": 3.8807301549728435e-05, "loss": 2.292, "step": 1820 }, { "epoch": 1.581833947539562, "grad_norm": 0.18781915869958946, "learning_rate": 3.804834736927918e-05, "loss": 2.3321, "step": 1825 }, { "epoch": 1.5861695209191415, "grad_norm": 0.22963664052911378, "learning_rate": 3.7295808710658594e-05, "loss": 2.3105, "step": 1830 }, { "epoch": 1.5905050942987211, "grad_norm": 0.16547609467569466, "learning_rate": 3.654972869898435e-05, "loss": 2.3441, "step": 1835 }, { "epoch": 1.5948406676783005, "grad_norm": 0.15450897850919312, "learning_rate": 3.581015008925367e-05, "loss": 2.2963, "step": 1840 }, { "epoch": 1.5991762410578798, "grad_norm": 0.2368783078996937, "learning_rate": 3.507711526389331e-05, "loss": 2.2701, "step": 1845 }, { "epoch": 1.6035118144374594, "grad_norm": 0.23275669114568637, "learning_rate": 3.4350666230330684e-05, "loss": 2.3027, "step": 1850 }, { "epoch": 1.607847387817039, "grad_norm": 0.23123161162094558, "learning_rate": 3.363084461858659e-05, "loss": 2.3271, "step": 1855 }, { "epoch": 1.6121829611966183, "grad_norm": 0.19114205349654592, "learning_rate": 3.291769167888971e-05, "loss": 2.3085, "step": 1860 }, { "epoch": 1.6165185345761977, "grad_norm": 0.19601739480919383, "learning_rate": 3.221124827931248e-05, "loss": 2.297, "step": 1865 }, { "epoch": 1.6208541079557772, "grad_norm": 0.14221216745928258, "learning_rate": 3.151155490342917e-05, "loss": 2.2855, "step": 1870 }, { "epoch": 1.6251896813353566, "grad_norm": 0.1584865862659709, "learning_rate": 3.081865164799613e-05, "loss": 2.2614, "step": 1875 }, { "epoch": 1.629525254714936, "grad_norm": 0.1414839158508025, "learning_rate": 3.0132578220653648e-05, "loss": 2.2795, "step": 1880 }, { "epoch": 1.6338608280945155, "grad_norm": 0.1254646077472122, "learning_rate": 2.9453373937650664e-05, "loss": 2.2965, "step": 1885 }, { "epoch": 1.638196401474095, "grad_norm": 0.16882648715438078, "learning_rate": 2.8781077721591828e-05, "loss": 2.3278, "step": 1890 }, { "epoch": 1.6425319748536744, "grad_norm": 0.18395517918844081, "learning_rate": 2.811572809920669e-05, "loss": 2.2801, "step": 1895 }, { "epoch": 1.6468675482332538, "grad_norm": 0.1309521536420322, "learning_rate": 2.7457363199142062e-05, "loss": 2.2852, "step": 1900 }, { "epoch": 1.6512031216128333, "grad_norm": 0.12473429715301326, "learning_rate": 2.680602074977708e-05, "loss": 2.259, "step": 1905 }, { "epoch": 1.655538694992413, "grad_norm": 0.1312047736460762, "learning_rate": 2.6161738077060924e-05, "loss": 2.2868, "step": 1910 }, { "epoch": 1.659874268371992, "grad_norm": 0.14567444941618224, "learning_rate": 2.552455210237398e-05, "loss": 2.2633, "step": 1915 }, { "epoch": 1.6642098417515716, "grad_norm": 0.13883443962415717, "learning_rate": 2.4894499340411968e-05, "loss": 2.2541, "step": 1920 }, { "epoch": 1.6685454151311512, "grad_norm": 0.15369994803840648, "learning_rate": 2.427161589709337e-05, "loss": 2.2996, "step": 1925 }, { "epoch": 1.6728809885107305, "grad_norm": 0.21447874396163935, "learning_rate": 2.365593746749041e-05, "loss": 2.2679, "step": 1930 }, { "epoch": 1.6772165618903099, "grad_norm": 0.18513858979140632, "learning_rate": 2.3047499333783558e-05, "loss": 2.2658, "step": 1935 }, { "epoch": 1.6815521352698894, "grad_norm": 0.18391090938011884, "learning_rate": 2.244633636323946e-05, "loss": 2.2907, "step": 1940 }, { "epoch": 1.685887708649469, "grad_norm": 0.16891631837991977, "learning_rate": 2.1852483006212978e-05, "loss": 2.2478, "step": 1945 }, { "epoch": 1.6902232820290484, "grad_norm": 0.1518091020243819, "learning_rate": 2.126597329417293e-05, "loss": 2.2473, "step": 1950 }, { "epoch": 1.6945588554086277, "grad_norm": 0.17161647974253239, "learning_rate": 2.068684083775185e-05, "loss": 2.2537, "step": 1955 }, { "epoch": 1.6988944287882073, "grad_norm": 0.14187218785526223, "learning_rate": 2.0115118824819914e-05, "loss": 2.2616, "step": 1960 }, { "epoch": 1.7032300021677866, "grad_norm": 0.11929928350263867, "learning_rate": 1.9550840018583153e-05, "loss": 2.2694, "step": 1965 }, { "epoch": 1.707565575547366, "grad_norm": 0.1245533165549466, "learning_rate": 1.899403675570576e-05, "loss": 2.2595, "step": 1970 }, { "epoch": 1.7119011489269456, "grad_norm": 0.11773193996812116, "learning_rate": 1.844474094445705e-05, "loss": 2.2604, "step": 1975 }, { "epoch": 1.7162367223065251, "grad_norm": 0.1419568607634185, "learning_rate": 1.7902984062883053e-05, "loss": 2.2311, "step": 1980 }, { "epoch": 1.7205722956861045, "grad_norm": 0.16983875746382313, "learning_rate": 1.736879715700243e-05, "loss": 2.2403, "step": 1985 }, { "epoch": 1.7249078690656838, "grad_norm": 0.12131945112331936, "learning_rate": 1.684221083902746e-05, "loss": 2.2474, "step": 1990 }, { "epoch": 1.7292434424452634, "grad_norm": 0.44071395908424116, "learning_rate": 1.6323255285609722e-05, "loss": 2.2337, "step": 1995 }, { "epoch": 1.733579015824843, "grad_norm": 0.23334817695713936, "learning_rate": 1.5811960236110855e-05, "loss": 2.2489, "step": 2000 }, { "epoch": 1.7379145892044223, "grad_norm": 0.2111669535853138, "learning_rate": 1.530835499089821e-05, "loss": 2.2269, "step": 2005 }, { "epoch": 1.7422501625840017, "grad_norm": 0.18179903995116767, "learning_rate": 1.4812468409665884e-05, "loss": 2.2706, "step": 2010 }, { "epoch": 1.7465857359635812, "grad_norm": 0.12994276828633272, "learning_rate": 1.432432890978074e-05, "loss": 2.2688, "step": 2015 }, { "epoch": 1.7509213093431606, "grad_norm": 0.20415942563901335, "learning_rate": 1.3843964464654018e-05, "loss": 2.2725, "step": 2020 }, { "epoch": 1.75525688272274, "grad_norm": 0.13423972154083932, "learning_rate": 1.3371402602138242e-05, "loss": 2.2614, "step": 2025 }, { "epoch": 1.7595924561023195, "grad_norm": 0.21592839787669263, "learning_rate": 1.2906670402949703e-05, "loss": 2.2278, "step": 2030 }, { "epoch": 1.763928029481899, "grad_norm": 0.1195430112602107, "learning_rate": 1.2449794499116567e-05, "loss": 2.2434, "step": 2035 }, { "epoch": 1.7682636028614784, "grad_norm": 0.1200861082736604, "learning_rate": 1.200080107245278e-05, "loss": 2.2547, "step": 2040 }, { "epoch": 1.7725991762410578, "grad_norm": 0.11672129617620956, "learning_rate": 1.1559715853057516e-05, "loss": 2.2196, "step": 2045 }, { "epoch": 1.7769347496206374, "grad_norm": 0.11729537916907379, "learning_rate": 1.1126564117840819e-05, "loss": 2.2613, "step": 2050 }, { "epoch": 1.781270323000217, "grad_norm": 0.10450327625927365, "learning_rate": 1.0701370689075094e-05, "loss": 2.244, "step": 2055 }, { "epoch": 1.7856058963797963, "grad_norm": 0.13649256070362986, "learning_rate": 1.0284159932972524e-05, "loss": 2.2222, "step": 2060 }, { "epoch": 1.7899414697593756, "grad_norm": 0.11456259112458032, "learning_rate": 9.87495575828875e-06, "loss": 2.2401, "step": 2065 }, { "epoch": 1.7942770431389552, "grad_norm": 0.12074570004540981, "learning_rate": 9.473781614952918e-06, "loss": 2.2401, "step": 2070 }, { "epoch": 1.7986126165185345, "grad_norm": 0.12686179535370362, "learning_rate": 9.080660492723663e-06, "loss": 2.2295, "step": 2075 }, { "epoch": 1.802948189898114, "grad_norm": 0.1300450179470363, "learning_rate": 8.695614919871679e-06, "loss": 2.2569, "step": 2080 }, { "epoch": 1.8072837632776935, "grad_norm": 0.11937907719387332, "learning_rate": 8.31866696188887e-06, "loss": 2.2294, "step": 2085 }, { "epoch": 1.811619336657273, "grad_norm": 0.11764431526486717, "learning_rate": 7.949838220223664e-06, "loss": 2.217, "step": 2090 }, { "epoch": 1.8159549100368524, "grad_norm": 0.1234907525541496, "learning_rate": 7.589149831043212e-06, "loss": 2.217, "step": 2095 }, { "epoch": 1.8202904834164317, "grad_norm": 0.1517422441513512, "learning_rate": 7.236622464022151e-06, "loss": 2.2453, "step": 2100 }, { "epoch": 1.8246260567960113, "grad_norm": 0.33341778139897915, "learning_rate": 6.892276321158058e-06, "loss": 2.2356, "step": 2105 }, { "epoch": 1.8289616301755909, "grad_norm": 0.11642132397770094, "learning_rate": 6.556131135613818e-06, "loss": 2.2423, "step": 2110 }, { "epoch": 1.83329720355517, "grad_norm": 0.1407587899536127, "learning_rate": 6.2282061705868025e-06, "loss": 2.203, "step": 2115 }, { "epoch": 1.8376327769347496, "grad_norm": 0.13035967664280043, "learning_rate": 5.908520218204832e-06, "loss": 2.1993, "step": 2120 }, { "epoch": 1.8419683503143292, "grad_norm": 0.09453247400951749, "learning_rate": 5.597091598449438e-06, "loss": 2.228, "step": 2125 }, { "epoch": 1.8463039236939085, "grad_norm": 0.12255843625223094, "learning_rate": 5.293938158105904e-06, "loss": 2.2373, "step": 2130 }, { "epoch": 1.8506394970734878, "grad_norm": 0.10290208504992138, "learning_rate": 4.999077269740581e-06, "loss": 2.1896, "step": 2135 }, { "epoch": 1.8549750704530674, "grad_norm": 0.10229541966122209, "learning_rate": 4.712525830705338e-06, "loss": 2.2202, "step": 2140 }, { "epoch": 1.859310643832647, "grad_norm": 0.1058407353264288, "learning_rate": 4.4343002621692155e-06, "loss": 2.2105, "step": 2145 }, { "epoch": 1.8636462172122263, "grad_norm": 0.12091080320995722, "learning_rate": 4.164416508177398e-06, "loss": 2.2192, "step": 2150 }, { "epoch": 1.8679817905918057, "grad_norm": 0.11922564089206397, "learning_rate": 3.902890034737527e-06, "loss": 2.2558, "step": 2155 }, { "epoch": 1.8723173639713853, "grad_norm": 0.09787488554546948, "learning_rate": 3.649735828933409e-06, "loss": 2.1973, "step": 2160 }, { "epoch": 1.8766529373509646, "grad_norm": 0.12237194203335397, "learning_rate": 3.4049683980661214e-06, "loss": 2.2213, "step": 2165 }, { "epoch": 1.880988510730544, "grad_norm": 0.09606403673967055, "learning_rate": 3.168601768822726e-06, "loss": 2.1992, "step": 2170 }, { "epoch": 1.8853240841101235, "grad_norm": 0.09953286517322664, "learning_rate": 2.940649486472396e-06, "loss": 2.2528, "step": 2175 }, { "epoch": 1.889659657489703, "grad_norm": 0.10835068342395951, "learning_rate": 2.72112461409022e-06, "loss": 2.2531, "step": 2180 }, { "epoch": 1.8939952308692825, "grad_norm": 0.10266855095126044, "learning_rate": 2.510039731808533e-06, "loss": 2.269, "step": 2185 }, { "epoch": 1.8983308042488618, "grad_norm": 0.10341900472056524, "learning_rate": 2.3074069360961623e-06, "loss": 2.2062, "step": 2190 }, { "epoch": 1.9026663776284414, "grad_norm": 0.10842551877471102, "learning_rate": 2.1132378390650463e-06, "loss": 2.2534, "step": 2195 }, { "epoch": 1.907001951008021, "grad_norm": 0.10059763675561238, "learning_rate": 1.9275435678048845e-06, "loss": 2.2473, "step": 2200 }, { "epoch": 1.9113375243876003, "grad_norm": 0.10768116633333848, "learning_rate": 1.7503347637454479e-06, "loss": 2.2552, "step": 2205 }, { "epoch": 1.9156730977671796, "grad_norm": 0.09414809031667026, "learning_rate": 1.5816215820467992e-06, "loss": 2.2367, "step": 2210 }, { "epoch": 1.9200086711467592, "grad_norm": 0.10690488271519041, "learning_rate": 1.4214136910172925e-06, "loss": 2.2253, "step": 2215 }, { "epoch": 1.9243442445263386, "grad_norm": 0.10872884844513397, "learning_rate": 1.2697202715595822e-06, "loss": 2.2289, "step": 2220 }, { "epoch": 1.928679817905918, "grad_norm": 0.1254620474939429, "learning_rate": 1.126550016644412e-06, "loss": 2.2164, "step": 2225 }, { "epoch": 1.9330153912854975, "grad_norm": 0.1025587696285709, "learning_rate": 9.919111308125449e-07, "loss": 2.2039, "step": 2230 }, { "epoch": 1.937350964665077, "grad_norm": 0.09872117993630683, "learning_rate": 8.65811329704541e-07, "loss": 2.2492, "step": 2235 }, { "epoch": 1.9416865380446564, "grad_norm": 0.1085639607949179, "learning_rate": 7.482578396185934e-07, "loss": 2.2449, "step": 2240 }, { "epoch": 1.9460221114242358, "grad_norm": 0.09973275488637519, "learning_rate": 6.392573970964432e-07, "loss": 2.2074, "step": 2245 }, { "epoch": 1.9503576848038153, "grad_norm": 0.11503330375288269, "learning_rate": 5.388162485373548e-07, "loss": 2.2473, "step": 2250 }, { "epoch": 1.954693258183395, "grad_norm": 0.11928896227233268, "learning_rate": 4.4694014984010264e-07, "loss": 2.2128, "step": 2255 }, { "epoch": 1.9590288315629742, "grad_norm": 0.08668925207942389, "learning_rate": 3.6363436607313446e-07, "loss": 2.2183, "step": 2260 }, { "epoch": 1.9633644049425536, "grad_norm": 0.0953840596497321, "learning_rate": 2.889036711729298e-07, "loss": 2.2397, "step": 2265 }, { "epoch": 1.9676999783221332, "grad_norm": 0.10418411546023475, "learning_rate": 2.2275234767030193e-07, "loss": 2.2146, "step": 2270 }, { "epoch": 1.9720355517017125, "grad_norm": 0.09712806974011044, "learning_rate": 1.6518418644507758e-07, "loss": 2.2166, "step": 2275 }, { "epoch": 1.9763711250812919, "grad_norm": 0.12331447239619826, "learning_rate": 1.1620248650878739e-07, "loss": 2.2371, "step": 2280 }, { "epoch": 1.9807066984608714, "grad_norm": 0.09998286807181965, "learning_rate": 7.581005481566704e-08, "loss": 2.2271, "step": 2285 }, { "epoch": 1.985042271840451, "grad_norm": 0.09866409081999304, "learning_rate": 4.4009206101786043e-08, "loss": 2.2148, "step": 2290 }, { "epoch": 1.9893778452200304, "grad_norm": 0.10987887336557695, "learning_rate": 2.0801762752387097e-08, "loss": 2.2046, "step": 2295 }, { "epoch": 1.9937134185996097, "grad_norm": 0.10003495023127681, "learning_rate": 6.189054697436357e-09, "loss": 2.1954, "step": 2300 }, { "epoch": 1.9980489919791893, "grad_norm": 0.09752064885234579, "learning_rate": 1.7191933545102067e-10, "loss": 2.2409, "step": 2305 }, { "epoch": 1.998916106655105, "step": 2306, "total_flos": 1.542232840692197e+19, "train_loss": 3.6333630210094006, "train_runtime": 27233.3224, "train_samples_per_second": 2.71, "train_steps_per_second": 0.085 } ], "logging_steps": 5, "max_steps": 2306, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.542232840692197e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }