{ "best_metric": 1.07135177, "best_model_checkpoint": "/qlgy0912/llm_pretrain_output/qwen2_5-7b/v1-20240919-080450/checkpoint-162000", "epoch": 1.9909206190309083, "eval_steps": 2000, "global_step": 178000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "acc": 0.55939102, "epoch": 1.118494729792645e-05, "grad_norm": 12.875, "learning_rate": 1.1184431271669837e-09, "loss": 2.0432241, "memory(GiB)": 101.92, "step": 1, "train_speed(iter/s)": 0.0573 }, { "acc": 0.60418726, "epoch": 0.00022369894595852902, "grad_norm": 7.1875, "learning_rate": 2.2368862543339672e-08, "loss": 1.85484695, "memory(GiB)": 104.42, "step": 20, "train_speed(iter/s)": 0.315803 }, { "acc": 0.60039387, "epoch": 0.00044739789191705805, "grad_norm": 8.625, "learning_rate": 4.4737725086679345e-08, "loss": 1.8434761, "memory(GiB)": 117.5, "step": 40, "train_speed(iter/s)": 0.361835 }, { "acc": 0.61362572, "epoch": 0.000671096837875587, "grad_norm": 10.0, "learning_rate": 6.710658763001902e-08, "loss": 1.77901268, "memory(GiB)": 117.5, "step": 60, "train_speed(iter/s)": 0.377186 }, { "acc": 0.60724144, "epoch": 0.0008947957838341161, "grad_norm": 11.3125, "learning_rate": 8.947545017335869e-08, "loss": 1.80800381, "memory(GiB)": 117.5, "step": 80, "train_speed(iter/s)": 0.388564 }, { "acc": 0.60335579, "epoch": 0.001118494729792645, "grad_norm": 11.75, "learning_rate": 1.1184431271669837e-07, "loss": 1.84817848, "memory(GiB)": 117.5, "step": 100, "train_speed(iter/s)": 0.395522 }, { "acc": 0.5978817, "epoch": 0.001342193675751174, "grad_norm": 11.6875, "learning_rate": 1.3421317526003804e-07, "loss": 1.85686684, "memory(GiB)": 117.5, "step": 120, "train_speed(iter/s)": 0.400653 }, { "acc": 0.60780258, "epoch": 0.0015658926217097032, "grad_norm": 12.0, "learning_rate": 1.565820378033777e-07, "loss": 1.8127182, "memory(GiB)": 117.5, "step": 140, "train_speed(iter/s)": 0.404265 }, { "acc": 0.61364236, "epoch": 0.0017895915676682322, "grad_norm": 6.96875, "learning_rate": 1.7895090034671738e-07, "loss": 1.79347572, "memory(GiB)": 117.5, "step": 160, "train_speed(iter/s)": 0.405405 }, { "acc": 0.60892758, "epoch": 0.002013290513626761, "grad_norm": 12.375, "learning_rate": 2.0131976289005705e-07, "loss": 1.80462837, "memory(GiB)": 117.5, "step": 180, "train_speed(iter/s)": 0.410091 }, { "acc": 0.60299497, "epoch": 0.00223698945958529, "grad_norm": 10.5625, "learning_rate": 2.2368862543339674e-07, "loss": 1.83456192, "memory(GiB)": 117.5, "step": 200, "train_speed(iter/s)": 0.414988 }, { "acc": 0.60524454, "epoch": 0.002460688405543819, "grad_norm": 16.875, "learning_rate": 2.460574879767364e-07, "loss": 1.8342823, "memory(GiB)": 117.5, "step": 220, "train_speed(iter/s)": 0.419019 }, { "acc": 0.61110516, "epoch": 0.002684387351502348, "grad_norm": 11.1875, "learning_rate": 2.684263505200761e-07, "loss": 1.80001831, "memory(GiB)": 117.5, "step": 240, "train_speed(iter/s)": 0.421808 }, { "acc": 0.61141367, "epoch": 0.002908086297460877, "grad_norm": 10.875, "learning_rate": 2.9079521306341575e-07, "loss": 1.80491867, "memory(GiB)": 117.5, "step": 260, "train_speed(iter/s)": 0.420404 }, { "acc": 0.61223078, "epoch": 0.0031317852434194064, "grad_norm": 9.0, "learning_rate": 3.131640756067554e-07, "loss": 1.79923782, "memory(GiB)": 117.51, "step": 280, "train_speed(iter/s)": 0.421991 }, { "acc": 0.60246029, "epoch": 0.0033554841893779354, "grad_norm": 6.34375, "learning_rate": 3.3553293815009514e-07, "loss": 1.83584042, "memory(GiB)": 117.51, "step": 300, "train_speed(iter/s)": 0.42421 }, { "acc": 0.61073003, "epoch": 0.0035791831353364644, "grad_norm": 8.6875, "learning_rate": 3.5790180069343476e-07, "loss": 1.81241245, "memory(GiB)": 117.51, "step": 320, "train_speed(iter/s)": 0.423101 }, { "acc": 0.61106925, "epoch": 0.0038028820812949934, "grad_norm": 12.8125, "learning_rate": 3.802706632367745e-07, "loss": 1.81777, "memory(GiB)": 117.51, "step": 340, "train_speed(iter/s)": 0.423386 }, { "acc": 0.61623344, "epoch": 0.004026581027253522, "grad_norm": 9.4375, "learning_rate": 4.026395257801141e-07, "loss": 1.78481865, "memory(GiB)": 117.51, "step": 360, "train_speed(iter/s)": 0.424691 }, { "acc": 0.61750002, "epoch": 0.004250279973212051, "grad_norm": 8.125, "learning_rate": 4.2500838832345377e-07, "loss": 1.77698402, "memory(GiB)": 117.51, "step": 380, "train_speed(iter/s)": 0.425345 }, { "acc": 0.6017416, "epoch": 0.00447397891917058, "grad_norm": 10.375, "learning_rate": 4.473772508667935e-07, "loss": 1.8436636, "memory(GiB)": 117.51, "step": 400, "train_speed(iter/s)": 0.425927 }, { "acc": 0.60661745, "epoch": 0.004697677865129109, "grad_norm": 9.5625, "learning_rate": 4.697461134101331e-07, "loss": 1.83032417, "memory(GiB)": 117.51, "step": 420, "train_speed(iter/s)": 0.427797 }, { "acc": 0.60684471, "epoch": 0.004921376811087638, "grad_norm": 13.5625, "learning_rate": 4.921149759534728e-07, "loss": 1.82581825, "memory(GiB)": 117.51, "step": 440, "train_speed(iter/s)": 0.428251 }, { "acc": 0.61443667, "epoch": 0.005145075757046167, "grad_norm": 6.0625, "learning_rate": 5.144838384968125e-07, "loss": 1.78356228, "memory(GiB)": 117.51, "step": 460, "train_speed(iter/s)": 0.429317 }, { "acc": 0.60898805, "epoch": 0.005368774703004696, "grad_norm": 7.40625, "learning_rate": 5.368527010401522e-07, "loss": 1.81197281, "memory(GiB)": 117.51, "step": 480, "train_speed(iter/s)": 0.430002 }, { "acc": 0.61763306, "epoch": 0.005592473648963225, "grad_norm": 11.875, "learning_rate": 5.592215635834918e-07, "loss": 1.77186127, "memory(GiB)": 117.51, "step": 500, "train_speed(iter/s)": 0.430338 }, { "acc": 0.61783156, "epoch": 0.005816172594921754, "grad_norm": 7.78125, "learning_rate": 5.815904261268315e-07, "loss": 1.76543846, "memory(GiB)": 117.51, "step": 520, "train_speed(iter/s)": 0.430669 }, { "acc": 0.62633824, "epoch": 0.006039871540880284, "grad_norm": 8.6875, "learning_rate": 6.039592886701712e-07, "loss": 1.72854347, "memory(GiB)": 117.51, "step": 540, "train_speed(iter/s)": 0.430955 }, { "acc": 0.60760813, "epoch": 0.006263570486838813, "grad_norm": 9.3125, "learning_rate": 6.263281512135108e-07, "loss": 1.81293373, "memory(GiB)": 117.51, "step": 560, "train_speed(iter/s)": 0.431363 }, { "acc": 0.61424675, "epoch": 0.006487269432797342, "grad_norm": 8.4375, "learning_rate": 6.486970137568505e-07, "loss": 1.77616425, "memory(GiB)": 117.51, "step": 580, "train_speed(iter/s)": 0.432313 }, { "acc": 0.61362991, "epoch": 0.006710968378755871, "grad_norm": 9.6875, "learning_rate": 6.710658763001903e-07, "loss": 1.80025196, "memory(GiB)": 117.51, "step": 600, "train_speed(iter/s)": 0.432209 }, { "acc": 0.61478548, "epoch": 0.0069346673247144, "grad_norm": 9.3125, "learning_rate": 6.934347388435298e-07, "loss": 1.80164261, "memory(GiB)": 117.51, "step": 620, "train_speed(iter/s)": 0.433201 }, { "acc": 0.6201642, "epoch": 0.007158366270672929, "grad_norm": 6.03125, "learning_rate": 7.158036013868695e-07, "loss": 1.73911686, "memory(GiB)": 117.51, "step": 640, "train_speed(iter/s)": 0.433441 }, { "acc": 0.61431103, "epoch": 0.007382065216631458, "grad_norm": 12.9375, "learning_rate": 7.381724639302092e-07, "loss": 1.78418541, "memory(GiB)": 117.51, "step": 660, "train_speed(iter/s)": 0.433157 }, { "acc": 0.63160214, "epoch": 0.007605764162589987, "grad_norm": 7.53125, "learning_rate": 7.60541326473549e-07, "loss": 1.70328293, "memory(GiB)": 117.51, "step": 680, "train_speed(iter/s)": 0.433254 }, { "acc": 0.63206501, "epoch": 0.007829463108548516, "grad_norm": 8.875, "learning_rate": 7.829101890168886e-07, "loss": 1.71971512, "memory(GiB)": 117.51, "step": 700, "train_speed(iter/s)": 0.43318 }, { "acc": 0.62986641, "epoch": 0.008053162054507044, "grad_norm": 7.90625, "learning_rate": 8.052790515602282e-07, "loss": 1.72298737, "memory(GiB)": 117.51, "step": 720, "train_speed(iter/s)": 0.432476 }, { "acc": 0.62721086, "epoch": 0.008276861000465574, "grad_norm": 9.375, "learning_rate": 8.276479141035679e-07, "loss": 1.70469704, "memory(GiB)": 117.51, "step": 740, "train_speed(iter/s)": 0.431658 }, { "acc": 0.63408947, "epoch": 0.008500559946424102, "grad_norm": 9.75, "learning_rate": 8.500167766469075e-07, "loss": 1.69796562, "memory(GiB)": 117.51, "step": 760, "train_speed(iter/s)": 0.431672 }, { "acc": 0.64024544, "epoch": 0.008724258892382632, "grad_norm": 12.8125, "learning_rate": 8.723856391902473e-07, "loss": 1.68381348, "memory(GiB)": 117.51, "step": 780, "train_speed(iter/s)": 0.432058 }, { "acc": 0.63062911, "epoch": 0.00894795783834116, "grad_norm": 7.0, "learning_rate": 8.94754501733587e-07, "loss": 1.69258385, "memory(GiB)": 117.51, "step": 800, "train_speed(iter/s)": 0.432264 }, { "acc": 0.62765074, "epoch": 0.00917165678429969, "grad_norm": 7.9375, "learning_rate": 9.171233642769265e-07, "loss": 1.71087475, "memory(GiB)": 117.51, "step": 820, "train_speed(iter/s)": 0.431883 }, { "acc": 0.62814846, "epoch": 0.009395355730258218, "grad_norm": 7.46875, "learning_rate": 9.394922268202662e-07, "loss": 1.71900196, "memory(GiB)": 117.51, "step": 840, "train_speed(iter/s)": 0.431682 }, { "acc": 0.62156167, "epoch": 0.009619054676216748, "grad_norm": 6.71875, "learning_rate": 9.61861089363606e-07, "loss": 1.74591789, "memory(GiB)": 117.51, "step": 860, "train_speed(iter/s)": 0.431975 }, { "acc": 0.63189564, "epoch": 0.009842753622175276, "grad_norm": 10.4375, "learning_rate": 9.842299519069457e-07, "loss": 1.68336296, "memory(GiB)": 117.51, "step": 880, "train_speed(iter/s)": 0.432596 }, { "acc": 0.63698292, "epoch": 0.010066452568133806, "grad_norm": 9.5625, "learning_rate": 1.0065988144502852e-06, "loss": 1.66994972, "memory(GiB)": 117.51, "step": 900, "train_speed(iter/s)": 0.43312 }, { "acc": 0.63373852, "epoch": 0.010290151514092334, "grad_norm": 16.5, "learning_rate": 1.028967676993625e-06, "loss": 1.68609638, "memory(GiB)": 117.51, "step": 920, "train_speed(iter/s)": 0.433384 }, { "acc": 0.63481846, "epoch": 0.010513850460050864, "grad_norm": 7.21875, "learning_rate": 1.0513365395369646e-06, "loss": 1.68208351, "memory(GiB)": 117.51, "step": 940, "train_speed(iter/s)": 0.434051 }, { "acc": 0.64395752, "epoch": 0.010737549406009392, "grad_norm": 7.5625, "learning_rate": 1.0737054020803043e-06, "loss": 1.65161877, "memory(GiB)": 117.51, "step": 960, "train_speed(iter/s)": 0.4341 }, { "acc": 0.64460778, "epoch": 0.010961248351967922, "grad_norm": 7.125, "learning_rate": 1.0960742646236439e-06, "loss": 1.64117279, "memory(GiB)": 117.51, "step": 980, "train_speed(iter/s)": 0.433557 }, { "acc": 0.63447485, "epoch": 0.01118494729792645, "grad_norm": 6.65625, "learning_rate": 1.1184431271669837e-06, "loss": 1.67663422, "memory(GiB)": 117.51, "step": 1000, "train_speed(iter/s)": 0.433544 }, { "acc": 0.62360525, "epoch": 0.01140864624388498, "grad_norm": 7.75, "learning_rate": 1.1408119897103232e-06, "loss": 1.71965866, "memory(GiB)": 117.51, "step": 1020, "train_speed(iter/s)": 0.433722 }, { "acc": 0.6329402, "epoch": 0.011632345189843508, "grad_norm": 7.90625, "learning_rate": 1.163180852253663e-06, "loss": 1.68101196, "memory(GiB)": 117.51, "step": 1040, "train_speed(iter/s)": 0.43403 }, { "acc": 0.62964172, "epoch": 0.011856044135802038, "grad_norm": 6.84375, "learning_rate": 1.1855497147970028e-06, "loss": 1.68891068, "memory(GiB)": 117.51, "step": 1060, "train_speed(iter/s)": 0.43405 }, { "acc": 0.64755597, "epoch": 0.012079743081760567, "grad_norm": 5.625, "learning_rate": 1.2079185773403423e-06, "loss": 1.60820656, "memory(GiB)": 117.51, "step": 1080, "train_speed(iter/s)": 0.433913 }, { "acc": 0.63846464, "epoch": 0.012303442027719096, "grad_norm": 8.4375, "learning_rate": 1.230287439883682e-06, "loss": 1.65121727, "memory(GiB)": 117.51, "step": 1100, "train_speed(iter/s)": 0.434308 }, { "acc": 0.63982573, "epoch": 0.012527140973677625, "grad_norm": 9.9375, "learning_rate": 1.2526563024270217e-06, "loss": 1.62547054, "memory(GiB)": 117.51, "step": 1120, "train_speed(iter/s)": 0.434567 }, { "acc": 0.63957529, "epoch": 0.012750839919636154, "grad_norm": 10.0625, "learning_rate": 1.2750251649703612e-06, "loss": 1.63761597, "memory(GiB)": 117.51, "step": 1140, "train_speed(iter/s)": 0.434322 }, { "acc": 0.64056911, "epoch": 0.012974538865594683, "grad_norm": 7.1875, "learning_rate": 1.297394027513701e-06, "loss": 1.63461609, "memory(GiB)": 117.51, "step": 1160, "train_speed(iter/s)": 0.434553 }, { "acc": 0.643221, "epoch": 0.013198237811553212, "grad_norm": 8.375, "learning_rate": 1.3197628900570408e-06, "loss": 1.60864143, "memory(GiB)": 117.51, "step": 1180, "train_speed(iter/s)": 0.434876 }, { "acc": 0.6401619, "epoch": 0.013421936757511741, "grad_norm": 6.53125, "learning_rate": 1.3421317526003806e-06, "loss": 1.62360535, "memory(GiB)": 117.51, "step": 1200, "train_speed(iter/s)": 0.434875 }, { "acc": 0.63970594, "epoch": 0.01364563570347027, "grad_norm": 6.46875, "learning_rate": 1.3645006151437201e-06, "loss": 1.63086281, "memory(GiB)": 117.51, "step": 1220, "train_speed(iter/s)": 0.4352 }, { "acc": 0.65479774, "epoch": 0.0138693346494288, "grad_norm": 7.875, "learning_rate": 1.3868694776870597e-06, "loss": 1.5533041, "memory(GiB)": 117.51, "step": 1240, "train_speed(iter/s)": 0.435238 }, { "acc": 0.63711462, "epoch": 0.014093033595387328, "grad_norm": 6.8125, "learning_rate": 1.4092383402303995e-06, "loss": 1.6349472, "memory(GiB)": 117.51, "step": 1260, "train_speed(iter/s)": 0.435316 }, { "acc": 0.64033937, "epoch": 0.014316732541345857, "grad_norm": 7.40625, "learning_rate": 1.431607202773739e-06, "loss": 1.60095596, "memory(GiB)": 117.51, "step": 1280, "train_speed(iter/s)": 0.435552 }, { "acc": 0.63810596, "epoch": 0.014540431487304386, "grad_norm": 6.1875, "learning_rate": 1.4539760653170786e-06, "loss": 1.64833069, "memory(GiB)": 117.51, "step": 1300, "train_speed(iter/s)": 0.435669 }, { "acc": 0.64042015, "epoch": 0.014764130433262915, "grad_norm": 6.3125, "learning_rate": 1.4763449278604184e-06, "loss": 1.61314144, "memory(GiB)": 117.51, "step": 1320, "train_speed(iter/s)": 0.436028 }, { "acc": 0.63659678, "epoch": 0.014987829379221444, "grad_norm": 7.53125, "learning_rate": 1.498713790403758e-06, "loss": 1.63475456, "memory(GiB)": 117.51, "step": 1340, "train_speed(iter/s)": 0.436268 }, { "acc": 0.64772882, "epoch": 0.015211528325179973, "grad_norm": 7.84375, "learning_rate": 1.521082652947098e-06, "loss": 1.58020296, "memory(GiB)": 117.51, "step": 1360, "train_speed(iter/s)": 0.43634 }, { "acc": 0.6422905, "epoch": 0.015435227271138502, "grad_norm": 7.0, "learning_rate": 1.5434515154904375e-06, "loss": 1.60460129, "memory(GiB)": 117.51, "step": 1380, "train_speed(iter/s)": 0.436897 }, { "acc": 0.64815216, "epoch": 0.01565892621709703, "grad_norm": 6.8125, "learning_rate": 1.5658203780337773e-06, "loss": 1.5679265, "memory(GiB)": 117.51, "step": 1400, "train_speed(iter/s)": 0.436884 }, { "acc": 0.65047207, "epoch": 0.01588262516305556, "grad_norm": 6.84375, "learning_rate": 1.5881892405771168e-06, "loss": 1.55900364, "memory(GiB)": 117.51, "step": 1420, "train_speed(iter/s)": 0.436893 }, { "acc": 0.65334387, "epoch": 0.016106324109014088, "grad_norm": 6.71875, "learning_rate": 1.6105581031204564e-06, "loss": 1.56549644, "memory(GiB)": 117.51, "step": 1440, "train_speed(iter/s)": 0.436854 }, { "acc": 0.64456635, "epoch": 0.01633002305497262, "grad_norm": 8.25, "learning_rate": 1.6329269656637962e-06, "loss": 1.59217911, "memory(GiB)": 117.51, "step": 1460, "train_speed(iter/s)": 0.436414 }, { "acc": 0.65064707, "epoch": 0.016553722000931147, "grad_norm": 6.5, "learning_rate": 1.6552958282071357e-06, "loss": 1.54694901, "memory(GiB)": 117.51, "step": 1480, "train_speed(iter/s)": 0.436684 }, { "acc": 0.64552307, "epoch": 0.016777420946889676, "grad_norm": 6.0625, "learning_rate": 1.6776646907504753e-06, "loss": 1.57959805, "memory(GiB)": 117.51, "step": 1500, "train_speed(iter/s)": 0.436978 }, { "acc": 0.65676441, "epoch": 0.017001119892848204, "grad_norm": 13.5, "learning_rate": 1.700033553293815e-06, "loss": 1.54340572, "memory(GiB)": 117.51, "step": 1520, "train_speed(iter/s)": 0.436797 }, { "acc": 0.64827771, "epoch": 0.017224818838806735, "grad_norm": 9.9375, "learning_rate": 1.722402415837155e-06, "loss": 1.56464214, "memory(GiB)": 117.51, "step": 1540, "train_speed(iter/s)": 0.437104 }, { "acc": 0.65661435, "epoch": 0.017448517784765263, "grad_norm": 7.21875, "learning_rate": 1.7447712783804946e-06, "loss": 1.55876293, "memory(GiB)": 117.51, "step": 1560, "train_speed(iter/s)": 0.437078 }, { "acc": 0.65491333, "epoch": 0.01767221673072379, "grad_norm": 7.375, "learning_rate": 1.7671401409238342e-06, "loss": 1.54966459, "memory(GiB)": 117.51, "step": 1580, "train_speed(iter/s)": 0.436807 }, { "acc": 0.65466475, "epoch": 0.01789591567668232, "grad_norm": 7.375, "learning_rate": 1.789509003467174e-06, "loss": 1.54075747, "memory(GiB)": 117.51, "step": 1600, "train_speed(iter/s)": 0.436569 }, { "acc": 0.65358028, "epoch": 0.01811961462264085, "grad_norm": 7.5, "learning_rate": 1.8118778660105135e-06, "loss": 1.555791, "memory(GiB)": 117.51, "step": 1620, "train_speed(iter/s)": 0.436332 }, { "acc": 0.65893812, "epoch": 0.01834331356859938, "grad_norm": 10.0625, "learning_rate": 1.834246728553853e-06, "loss": 1.51664772, "memory(GiB)": 117.51, "step": 1640, "train_speed(iter/s)": 0.436316 }, { "acc": 0.64542007, "epoch": 0.018567012514557907, "grad_norm": 6.40625, "learning_rate": 1.8566155910971929e-06, "loss": 1.59646339, "memory(GiB)": 117.51, "step": 1660, "train_speed(iter/s)": 0.436273 }, { "acc": 0.65863366, "epoch": 0.018790711460516436, "grad_norm": 6.28125, "learning_rate": 1.8789844536405324e-06, "loss": 1.50966263, "memory(GiB)": 117.51, "step": 1680, "train_speed(iter/s)": 0.436294 }, { "acc": 0.66117029, "epoch": 0.019014410406474967, "grad_norm": 8.4375, "learning_rate": 1.9013533161838722e-06, "loss": 1.50282288, "memory(GiB)": 117.51, "step": 1700, "train_speed(iter/s)": 0.436295 }, { "acc": 0.65714197, "epoch": 0.019238109352433495, "grad_norm": 8.3125, "learning_rate": 1.923722178727212e-06, "loss": 1.54431143, "memory(GiB)": 117.51, "step": 1720, "train_speed(iter/s)": 0.436346 }, { "acc": 0.66146021, "epoch": 0.019461808298392023, "grad_norm": 6.6875, "learning_rate": 1.9460910412705515e-06, "loss": 1.51477852, "memory(GiB)": 117.51, "step": 1740, "train_speed(iter/s)": 0.436337 }, { "acc": 0.66838789, "epoch": 0.01968550724435055, "grad_norm": 7.65625, "learning_rate": 1.9684599038138913e-06, "loss": 1.47210331, "memory(GiB)": 117.51, "step": 1760, "train_speed(iter/s)": 0.436216 }, { "acc": 0.66090431, "epoch": 0.019909206190309083, "grad_norm": 5.0, "learning_rate": 1.990828766357231e-06, "loss": 1.50238838, "memory(GiB)": 117.51, "step": 1780, "train_speed(iter/s)": 0.436297 }, { "acc": 0.66466122, "epoch": 0.02013290513626761, "grad_norm": 6.34375, "learning_rate": 2.0131976289005704e-06, "loss": 1.49873047, "memory(GiB)": 117.51, "step": 1800, "train_speed(iter/s)": 0.436343 }, { "acc": 0.66193027, "epoch": 0.02035660408222614, "grad_norm": 6.21875, "learning_rate": 2.03556649144391e-06, "loss": 1.49685192, "memory(GiB)": 117.51, "step": 1820, "train_speed(iter/s)": 0.436499 }, { "acc": 0.65813069, "epoch": 0.020580303028184668, "grad_norm": 6.25, "learning_rate": 2.05793535398725e-06, "loss": 1.50766201, "memory(GiB)": 117.51, "step": 1840, "train_speed(iter/s)": 0.436323 }, { "acc": 0.65939455, "epoch": 0.0208040019741432, "grad_norm": 7.40625, "learning_rate": 2.0803042165305893e-06, "loss": 1.51444263, "memory(GiB)": 117.51, "step": 1860, "train_speed(iter/s)": 0.436596 }, { "acc": 0.65636292, "epoch": 0.021027700920101727, "grad_norm": 9.0, "learning_rate": 2.102673079073929e-06, "loss": 1.52347584, "memory(GiB)": 117.51, "step": 1880, "train_speed(iter/s)": 0.43651 }, { "acc": 0.66866693, "epoch": 0.021251399866060255, "grad_norm": 5.75, "learning_rate": 2.125041941617269e-06, "loss": 1.48403072, "memory(GiB)": 117.51, "step": 1900, "train_speed(iter/s)": 0.436656 }, { "acc": 0.6734231, "epoch": 0.021475098812018784, "grad_norm": 7.65625, "learning_rate": 2.1474108041606087e-06, "loss": 1.44137688, "memory(GiB)": 117.51, "step": 1920, "train_speed(iter/s)": 0.436576 }, { "acc": 0.66880245, "epoch": 0.021698797757977315, "grad_norm": 4.6875, "learning_rate": 2.1697796667039484e-06, "loss": 1.45661945, "memory(GiB)": 117.51, "step": 1940, "train_speed(iter/s)": 0.436599 }, { "acc": 0.65838299, "epoch": 0.021922496703935843, "grad_norm": 6.4375, "learning_rate": 2.1921485292472878e-06, "loss": 1.50841885, "memory(GiB)": 117.51, "step": 1960, "train_speed(iter/s)": 0.436662 }, { "acc": 0.67680035, "epoch": 0.02214619564989437, "grad_norm": 8.375, "learning_rate": 2.2145173917906276e-06, "loss": 1.43250866, "memory(GiB)": 117.51, "step": 1980, "train_speed(iter/s)": 0.436619 }, { "acc": 0.6644321, "epoch": 0.0223698945958529, "grad_norm": 6.59375, "learning_rate": 2.2368862543339673e-06, "loss": 1.45818567, "memory(GiB)": 117.51, "step": 2000, "train_speed(iter/s)": 0.436722 }, { "epoch": 0.0223698945958529, "eval_acc": 0.6281820004898115, "eval_loss": 1.479537010192871, "eval_runtime": 2500.4308, "eval_samples_per_second": 30.108, "eval_steps_per_second": 15.054, "step": 2000 }, { "acc": 0.65693011, "epoch": 0.02259359354181143, "grad_norm": 6.125, "learning_rate": 2.259255116877307e-06, "loss": 1.51593409, "memory(GiB)": 129.59, "step": 2020, "train_speed(iter/s)": 0.281426 }, { "acc": 0.66862507, "epoch": 0.02281729248776996, "grad_norm": 7.46875, "learning_rate": 2.2816239794206465e-06, "loss": 1.45113821, "memory(GiB)": 114.59, "step": 2040, "train_speed(iter/s)": 0.282488 }, { "acc": 0.67072186, "epoch": 0.023040991433728487, "grad_norm": 6.03125, "learning_rate": 2.3039928419639862e-06, "loss": 1.43859653, "memory(GiB)": 114.59, "step": 2060, "train_speed(iter/s)": 0.283461 }, { "acc": 0.67230501, "epoch": 0.023264690379687016, "grad_norm": 5.9375, "learning_rate": 2.326361704507326e-06, "loss": 1.44061375, "memory(GiB)": 114.97, "step": 2080, "train_speed(iter/s)": 0.284445 }, { "acc": 0.66364655, "epoch": 0.023488389325645547, "grad_norm": 6.9375, "learning_rate": 2.3487305670506658e-06, "loss": 1.47409573, "memory(GiB)": 114.97, "step": 2100, "train_speed(iter/s)": 0.285364 }, { "acc": 0.66227207, "epoch": 0.023712088271604075, "grad_norm": 6.5, "learning_rate": 2.3710994295940056e-06, "loss": 1.47512703, "memory(GiB)": 114.97, "step": 2120, "train_speed(iter/s)": 0.286231 }, { "acc": 0.6711587, "epoch": 0.023935787217562603, "grad_norm": 5.5625, "learning_rate": 2.393468292137345e-06, "loss": 1.45563765, "memory(GiB)": 114.97, "step": 2140, "train_speed(iter/s)": 0.287164 }, { "acc": 0.67259903, "epoch": 0.024159486163521135, "grad_norm": 6.5625, "learning_rate": 2.4158371546806847e-06, "loss": 1.43661156, "memory(GiB)": 114.97, "step": 2160, "train_speed(iter/s)": 0.288115 }, { "acc": 0.66470461, "epoch": 0.024383185109479663, "grad_norm": 5.9375, "learning_rate": 2.4382060172240245e-06, "loss": 1.47079268, "memory(GiB)": 114.97, "step": 2180, "train_speed(iter/s)": 0.289094 }, { "acc": 0.66900406, "epoch": 0.02460688405543819, "grad_norm": 6.78125, "learning_rate": 2.460574879767364e-06, "loss": 1.44563389, "memory(GiB)": 114.97, "step": 2200, "train_speed(iter/s)": 0.290108 }, { "acc": 0.66599646, "epoch": 0.02483058300139672, "grad_norm": 6.6875, "learning_rate": 2.4829437423107036e-06, "loss": 1.45791435, "memory(GiB)": 114.97, "step": 2220, "train_speed(iter/s)": 0.291028 }, { "acc": 0.67166967, "epoch": 0.02505428194735525, "grad_norm": 5.53125, "learning_rate": 2.5053126048540434e-06, "loss": 1.4405304, "memory(GiB)": 114.97, "step": 2240, "train_speed(iter/s)": 0.29196 }, { "acc": 0.67572761, "epoch": 0.02527798089331378, "grad_norm": 7.9375, "learning_rate": 2.527681467397383e-06, "loss": 1.39924583, "memory(GiB)": 114.97, "step": 2260, "train_speed(iter/s)": 0.292832 }, { "acc": 0.67290783, "epoch": 0.025501679839272307, "grad_norm": 6.78125, "learning_rate": 2.5500503299407225e-06, "loss": 1.43506546, "memory(GiB)": 114.97, "step": 2280, "train_speed(iter/s)": 0.29369 }, { "acc": 0.66421866, "epoch": 0.025725378785230835, "grad_norm": 7.96875, "learning_rate": 2.5724191924840623e-06, "loss": 1.46951885, "memory(GiB)": 114.97, "step": 2300, "train_speed(iter/s)": 0.294403 }, { "acc": 0.66297183, "epoch": 0.025949077731189367, "grad_norm": 6.3125, "learning_rate": 2.594788055027402e-06, "loss": 1.48046818, "memory(GiB)": 114.97, "step": 2320, "train_speed(iter/s)": 0.295274 }, { "acc": 0.67195549, "epoch": 0.026172776677147895, "grad_norm": 7.9375, "learning_rate": 2.617156917570742e-06, "loss": 1.40575848, "memory(GiB)": 114.97, "step": 2340, "train_speed(iter/s)": 0.296156 }, { "acc": 0.65746899, "epoch": 0.026396475623106423, "grad_norm": 7.0625, "learning_rate": 2.6395257801140816e-06, "loss": 1.49496899, "memory(GiB)": 114.97, "step": 2360, "train_speed(iter/s)": 0.297045 }, { "acc": 0.67152815, "epoch": 0.02662017456906495, "grad_norm": 7.53125, "learning_rate": 2.661894642657421e-06, "loss": 1.43861656, "memory(GiB)": 114.97, "step": 2380, "train_speed(iter/s)": 0.297884 }, { "acc": 0.67675433, "epoch": 0.026843873515023483, "grad_norm": 7.96875, "learning_rate": 2.684263505200761e-06, "loss": 1.39667473, "memory(GiB)": 114.97, "step": 2400, "train_speed(iter/s)": 0.298731 }, { "acc": 0.67179852, "epoch": 0.02706757246098201, "grad_norm": 9.0, "learning_rate": 2.7066323677441005e-06, "loss": 1.42342625, "memory(GiB)": 114.97, "step": 2420, "train_speed(iter/s)": 0.299456 }, { "acc": 0.66808758, "epoch": 0.02729127140694054, "grad_norm": 8.5625, "learning_rate": 2.7290012302874403e-06, "loss": 1.44369106, "memory(GiB)": 114.97, "step": 2440, "train_speed(iter/s)": 0.30016 }, { "acc": 0.67209625, "epoch": 0.027514970352899067, "grad_norm": 6.1875, "learning_rate": 2.7513700928307796e-06, "loss": 1.41720104, "memory(GiB)": 114.97, "step": 2460, "train_speed(iter/s)": 0.300912 }, { "acc": 0.66519637, "epoch": 0.0277386692988576, "grad_norm": 8.1875, "learning_rate": 2.7737389553741194e-06, "loss": 1.43917637, "memory(GiB)": 114.97, "step": 2480, "train_speed(iter/s)": 0.301667 }, { "acc": 0.65819359, "epoch": 0.027962368244816127, "grad_norm": 6.625, "learning_rate": 2.7961078179174587e-06, "loss": 1.45741749, "memory(GiB)": 114.97, "step": 2500, "train_speed(iter/s)": 0.302427 }, { "acc": 0.68644371, "epoch": 0.028186067190774655, "grad_norm": 5.90625, "learning_rate": 2.818476680460799e-06, "loss": 1.34407101, "memory(GiB)": 114.97, "step": 2520, "train_speed(iter/s)": 0.303152 }, { "acc": 0.66801076, "epoch": 0.028409766136733183, "grad_norm": 6.84375, "learning_rate": 2.8408455430041387e-06, "loss": 1.43399668, "memory(GiB)": 114.97, "step": 2540, "train_speed(iter/s)": 0.303921 }, { "acc": 0.67592659, "epoch": 0.028633465082691715, "grad_norm": 6.625, "learning_rate": 2.863214405547478e-06, "loss": 1.37781296, "memory(GiB)": 114.97, "step": 2560, "train_speed(iter/s)": 0.304664 }, { "acc": 0.67639532, "epoch": 0.028857164028650243, "grad_norm": 4.75, "learning_rate": 2.885583268090818e-06, "loss": 1.39425507, "memory(GiB)": 114.97, "step": 2580, "train_speed(iter/s)": 0.305306 }, { "acc": 0.68030286, "epoch": 0.02908086297460877, "grad_norm": 6.65625, "learning_rate": 2.907952130634157e-06, "loss": 1.35987186, "memory(GiB)": 114.97, "step": 2600, "train_speed(iter/s)": 0.306061 }, { "acc": 0.67572002, "epoch": 0.0293045619205673, "grad_norm": 5.875, "learning_rate": 2.9303209931774974e-06, "loss": 1.37614422, "memory(GiB)": 114.97, "step": 2620, "train_speed(iter/s)": 0.306781 }, { "acc": 0.67035532, "epoch": 0.02952826086652583, "grad_norm": 5.5, "learning_rate": 2.9526898557208367e-06, "loss": 1.41726513, "memory(GiB)": 114.97, "step": 2640, "train_speed(iter/s)": 0.307513 }, { "acc": 0.68166552, "epoch": 0.02975195981248436, "grad_norm": 7.84375, "learning_rate": 2.9750587182641765e-06, "loss": 1.3571909, "memory(GiB)": 114.97, "step": 2660, "train_speed(iter/s)": 0.308126 }, { "acc": 0.67040501, "epoch": 0.029975658758442887, "grad_norm": 6.9375, "learning_rate": 2.997427580807516e-06, "loss": 1.41674747, "memory(GiB)": 114.97, "step": 2680, "train_speed(iter/s)": 0.308791 }, { "acc": 0.67270966, "epoch": 0.030199357704401415, "grad_norm": 6.0, "learning_rate": 3.019796443350856e-06, "loss": 1.39088869, "memory(GiB)": 114.97, "step": 2700, "train_speed(iter/s)": 0.309437 }, { "acc": 0.67041245, "epoch": 0.030423056650359947, "grad_norm": 7.59375, "learning_rate": 3.042165305894196e-06, "loss": 1.40822163, "memory(GiB)": 114.97, "step": 2720, "train_speed(iter/s)": 0.31015 }, { "acc": 0.67529964, "epoch": 0.030646755596318475, "grad_norm": 7.65625, "learning_rate": 3.064534168437535e-06, "loss": 1.39124298, "memory(GiB)": 114.97, "step": 2740, "train_speed(iter/s)": 0.310798 }, { "acc": 0.67722235, "epoch": 0.030870454542277003, "grad_norm": 7.1875, "learning_rate": 3.086903030980875e-06, "loss": 1.37614622, "memory(GiB)": 114.97, "step": 2760, "train_speed(iter/s)": 0.311415 }, { "acc": 0.67482271, "epoch": 0.03109415348823553, "grad_norm": 7.59375, "learning_rate": 3.1092718935242143e-06, "loss": 1.39862576, "memory(GiB)": 114.97, "step": 2780, "train_speed(iter/s)": 0.312008 }, { "acc": 0.67509365, "epoch": 0.03131785243419406, "grad_norm": 6.46875, "learning_rate": 3.1316407560675545e-06, "loss": 1.37140217, "memory(GiB)": 114.97, "step": 2800, "train_speed(iter/s)": 0.312762 }, { "acc": 0.67670097, "epoch": 0.03154155138015259, "grad_norm": 5.46875, "learning_rate": 3.154009618610894e-06, "loss": 1.3738842, "memory(GiB)": 114.97, "step": 2820, "train_speed(iter/s)": 0.313421 }, { "acc": 0.68028107, "epoch": 0.03176525032611112, "grad_norm": 7.34375, "learning_rate": 3.1763784811542336e-06, "loss": 1.37406807, "memory(GiB)": 114.97, "step": 2840, "train_speed(iter/s)": 0.314044 }, { "acc": 0.66899643, "epoch": 0.03198894927206965, "grad_norm": 6.90625, "learning_rate": 3.198747343697573e-06, "loss": 1.404105, "memory(GiB)": 114.97, "step": 2860, "train_speed(iter/s)": 0.314679 }, { "acc": 0.6729393, "epoch": 0.032212648218028175, "grad_norm": 7.34375, "learning_rate": 3.2211162062409128e-06, "loss": 1.39697399, "memory(GiB)": 114.97, "step": 2880, "train_speed(iter/s)": 0.315258 }, { "acc": 0.67177229, "epoch": 0.032436347163986703, "grad_norm": 5.1875, "learning_rate": 3.243485068784253e-06, "loss": 1.38772211, "memory(GiB)": 114.97, "step": 2900, "train_speed(iter/s)": 0.315909 }, { "acc": 0.68348765, "epoch": 0.03266004610994524, "grad_norm": 8.6875, "learning_rate": 3.2658539313275923e-06, "loss": 1.33322277, "memory(GiB)": 114.97, "step": 2920, "train_speed(iter/s)": 0.316534 }, { "acc": 0.66274691, "epoch": 0.03288374505590377, "grad_norm": 5.59375, "learning_rate": 3.288222793870932e-06, "loss": 1.43326902, "memory(GiB)": 114.97, "step": 2940, "train_speed(iter/s)": 0.317113 }, { "acc": 0.66822691, "epoch": 0.033107444001862295, "grad_norm": 5.375, "learning_rate": 3.3105916564142715e-06, "loss": 1.40090199, "memory(GiB)": 114.97, "step": 2960, "train_speed(iter/s)": 0.317731 }, { "acc": 0.67742844, "epoch": 0.03333114294782082, "grad_norm": 6.65625, "learning_rate": 3.3329605189576112e-06, "loss": 1.37847929, "memory(GiB)": 114.97, "step": 2980, "train_speed(iter/s)": 0.318416 }, { "acc": 0.68049135, "epoch": 0.03355484189377935, "grad_norm": 7.375, "learning_rate": 3.3553293815009506e-06, "loss": 1.3684948, "memory(GiB)": 114.97, "step": 3000, "train_speed(iter/s)": 0.319118 }, { "acc": 0.67952251, "epoch": 0.03377854083973788, "grad_norm": 5.34375, "learning_rate": 3.3776982440442908e-06, "loss": 1.34968567, "memory(GiB)": 114.97, "step": 3020, "train_speed(iter/s)": 0.319697 }, { "acc": 0.67846813, "epoch": 0.03400223978569641, "grad_norm": 6.9375, "learning_rate": 3.40006710658763e-06, "loss": 1.35527449, "memory(GiB)": 114.97, "step": 3040, "train_speed(iter/s)": 0.320341 }, { "acc": 0.69013329, "epoch": 0.034225938731654935, "grad_norm": 6.0, "learning_rate": 3.42243596913097e-06, "loss": 1.30549431, "memory(GiB)": 114.97, "step": 3060, "train_speed(iter/s)": 0.320975 }, { "acc": 0.67372909, "epoch": 0.03444963767761347, "grad_norm": 5.59375, "learning_rate": 3.44480483167431e-06, "loss": 1.38010759, "memory(GiB)": 114.97, "step": 3080, "train_speed(iter/s)": 0.321535 }, { "acc": 0.67608218, "epoch": 0.034673336623572, "grad_norm": 7.625, "learning_rate": 3.4671736942176495e-06, "loss": 1.37006521, "memory(GiB)": 114.97, "step": 3100, "train_speed(iter/s)": 0.322155 }, { "acc": 0.67054958, "epoch": 0.03489703556953053, "grad_norm": 5.21875, "learning_rate": 3.4895425567609892e-06, "loss": 1.38922052, "memory(GiB)": 114.97, "step": 3120, "train_speed(iter/s)": 0.322725 }, { "acc": 0.68765125, "epoch": 0.035120734515489055, "grad_norm": 7.21875, "learning_rate": 3.5119114193043286e-06, "loss": 1.32398434, "memory(GiB)": 114.97, "step": 3140, "train_speed(iter/s)": 0.323277 }, { "acc": 0.68658276, "epoch": 0.03534443346144758, "grad_norm": 7.34375, "learning_rate": 3.5342802818476684e-06, "loss": 1.33047504, "memory(GiB)": 114.97, "step": 3160, "train_speed(iter/s)": 0.323824 }, { "acc": 0.68295412, "epoch": 0.03556813240740611, "grad_norm": 5.28125, "learning_rate": 3.5566491443910077e-06, "loss": 1.35201149, "memory(GiB)": 114.97, "step": 3180, "train_speed(iter/s)": 0.324451 }, { "acc": 0.67967205, "epoch": 0.03579183135336464, "grad_norm": 7.1875, "learning_rate": 3.579018006934348e-06, "loss": 1.36269779, "memory(GiB)": 114.98, "step": 3200, "train_speed(iter/s)": 0.325003 }, { "acc": 0.68866673, "epoch": 0.036015530299323174, "grad_norm": 7.3125, "learning_rate": 3.6013868694776873e-06, "loss": 1.31009865, "memory(GiB)": 114.98, "step": 3220, "train_speed(iter/s)": 0.325593 }, { "acc": 0.68523855, "epoch": 0.0362392292452817, "grad_norm": 7.03125, "learning_rate": 3.623755732021027e-06, "loss": 1.32543907, "memory(GiB)": 114.98, "step": 3240, "train_speed(iter/s)": 0.326209 }, { "acc": 0.68065739, "epoch": 0.03646292819124023, "grad_norm": 5.25, "learning_rate": 3.646124594564367e-06, "loss": 1.34603024, "memory(GiB)": 114.98, "step": 3260, "train_speed(iter/s)": 0.326705 }, { "acc": 0.68424416, "epoch": 0.03668662713719876, "grad_norm": 6.21875, "learning_rate": 3.668493457107706e-06, "loss": 1.32180481, "memory(GiB)": 114.98, "step": 3280, "train_speed(iter/s)": 0.327208 }, { "acc": 0.68162947, "epoch": 0.03691032608315729, "grad_norm": 7.59375, "learning_rate": 3.6908623196510464e-06, "loss": 1.33958511, "memory(GiB)": 114.98, "step": 3300, "train_speed(iter/s)": 0.327738 }, { "acc": 0.66827402, "epoch": 0.037134025029115815, "grad_norm": 6.0, "learning_rate": 3.7132311821943857e-06, "loss": 1.39982567, "memory(GiB)": 114.98, "step": 3320, "train_speed(iter/s)": 0.328273 }, { "acc": 0.67939053, "epoch": 0.03735772397507434, "grad_norm": 5.40625, "learning_rate": 3.7356000447377255e-06, "loss": 1.3433918, "memory(GiB)": 114.98, "step": 3340, "train_speed(iter/s)": 0.328726 }, { "acc": 0.69432817, "epoch": 0.03758142292103287, "grad_norm": 5.0625, "learning_rate": 3.757968907281065e-06, "loss": 1.27991362, "memory(GiB)": 114.98, "step": 3360, "train_speed(iter/s)": 0.329287 }, { "acc": 0.6748672, "epoch": 0.037805121866991406, "grad_norm": 4.9375, "learning_rate": 3.7803377698244046e-06, "loss": 1.36990395, "memory(GiB)": 114.98, "step": 3380, "train_speed(iter/s)": 0.329752 }, { "acc": 0.68219452, "epoch": 0.038028820812949934, "grad_norm": 7.1875, "learning_rate": 3.8027066323677444e-06, "loss": 1.32270279, "memory(GiB)": 114.98, "step": 3400, "train_speed(iter/s)": 0.330204 }, { "acc": 0.67772913, "epoch": 0.03825251975890846, "grad_norm": 6.1875, "learning_rate": 3.825075494911084e-06, "loss": 1.36045666, "memory(GiB)": 114.98, "step": 3420, "train_speed(iter/s)": 0.330709 }, { "acc": 0.67072716, "epoch": 0.03847621870486699, "grad_norm": 6.1875, "learning_rate": 3.847444357454424e-06, "loss": 1.38766232, "memory(GiB)": 114.98, "step": 3440, "train_speed(iter/s)": 0.331055 }, { "acc": 0.67270756, "epoch": 0.03869991765082552, "grad_norm": 5.71875, "learning_rate": 3.869813219997764e-06, "loss": 1.37728977, "memory(GiB)": 114.98, "step": 3460, "train_speed(iter/s)": 0.331622 }, { "acc": 0.68063779, "epoch": 0.03892361659678405, "grad_norm": 6.59375, "learning_rate": 3.892182082541103e-06, "loss": 1.33663769, "memory(GiB)": 114.98, "step": 3480, "train_speed(iter/s)": 0.332128 }, { "acc": 0.68514166, "epoch": 0.039147315542742575, "grad_norm": 5.3125, "learning_rate": 3.914550945084442e-06, "loss": 1.31047773, "memory(GiB)": 114.98, "step": 3500, "train_speed(iter/s)": 0.332556 }, { "acc": 0.68012571, "epoch": 0.0393710144887011, "grad_norm": 6.59375, "learning_rate": 3.936919807627783e-06, "loss": 1.33457985, "memory(GiB)": 114.98, "step": 3520, "train_speed(iter/s)": 0.33301 }, { "acc": 0.69243336, "epoch": 0.03959471343465964, "grad_norm": 8.0, "learning_rate": 3.959288670171122e-06, "loss": 1.30082207, "memory(GiB)": 114.98, "step": 3540, "train_speed(iter/s)": 0.333452 }, { "acc": 0.67403669, "epoch": 0.039818412380618166, "grad_norm": 6.0, "learning_rate": 3.981657532714462e-06, "loss": 1.37182341, "memory(GiB)": 114.98, "step": 3560, "train_speed(iter/s)": 0.33398 }, { "acc": 0.68651199, "epoch": 0.040042111326576695, "grad_norm": 6.25, "learning_rate": 4.0040263952578015e-06, "loss": 1.31894865, "memory(GiB)": 114.98, "step": 3580, "train_speed(iter/s)": 0.334489 }, { "acc": 0.68884406, "epoch": 0.04026581027253522, "grad_norm": 6.15625, "learning_rate": 4.026395257801141e-06, "loss": 1.30174704, "memory(GiB)": 114.98, "step": 3600, "train_speed(iter/s)": 0.334965 }, { "acc": 0.6762135, "epoch": 0.04048950921849375, "grad_norm": 8.1875, "learning_rate": 4.048764120344481e-06, "loss": 1.37039242, "memory(GiB)": 114.98, "step": 3620, "train_speed(iter/s)": 0.335319 }, { "acc": 0.6825264, "epoch": 0.04071320816445228, "grad_norm": 4.78125, "learning_rate": 4.07113298288782e-06, "loss": 1.3348259, "memory(GiB)": 114.98, "step": 3640, "train_speed(iter/s)": 0.335764 }, { "acc": 0.68302445, "epoch": 0.04093690711041081, "grad_norm": 6.3125, "learning_rate": 4.093501845431161e-06, "loss": 1.32300167, "memory(GiB)": 114.98, "step": 3660, "train_speed(iter/s)": 0.33622 }, { "acc": 0.69386587, "epoch": 0.041160606056369335, "grad_norm": 5.90625, "learning_rate": 4.1158707079745e-06, "loss": 1.28429174, "memory(GiB)": 129.21, "step": 3680, "train_speed(iter/s)": 0.336589 }, { "acc": 0.69786777, "epoch": 0.04138430500232787, "grad_norm": 5.375, "learning_rate": 4.138239570517839e-06, "loss": 1.24646311, "memory(GiB)": 129.21, "step": 3700, "train_speed(iter/s)": 0.33697 }, { "acc": 0.68189459, "epoch": 0.0416080039482864, "grad_norm": 5.1875, "learning_rate": 4.160608433061179e-06, "loss": 1.3307025, "memory(GiB)": 129.21, "step": 3720, "train_speed(iter/s)": 0.337461 }, { "acc": 0.68678041, "epoch": 0.041831702894244926, "grad_norm": 5.65625, "learning_rate": 4.182977295604519e-06, "loss": 1.31456566, "memory(GiB)": 129.21, "step": 3740, "train_speed(iter/s)": 0.337827 }, { "acc": 0.69118967, "epoch": 0.042055401840203455, "grad_norm": 7.78125, "learning_rate": 4.205346158147858e-06, "loss": 1.28702879, "memory(GiB)": 129.21, "step": 3760, "train_speed(iter/s)": 0.338234 }, { "acc": 0.68779221, "epoch": 0.04227910078616198, "grad_norm": 5.78125, "learning_rate": 4.227715020691198e-06, "loss": 1.31603966, "memory(GiB)": 129.21, "step": 3780, "train_speed(iter/s)": 0.338629 }, { "acc": 0.68182406, "epoch": 0.04250279973212051, "grad_norm": 5.5625, "learning_rate": 4.250083883234538e-06, "loss": 1.32453871, "memory(GiB)": 129.21, "step": 3800, "train_speed(iter/s)": 0.339049 }, { "acc": 0.68753395, "epoch": 0.04272649867807904, "grad_norm": 5.5, "learning_rate": 4.272452745777877e-06, "loss": 1.29200382, "memory(GiB)": 129.21, "step": 3820, "train_speed(iter/s)": 0.339462 }, { "acc": 0.6870821, "epoch": 0.04295019762403757, "grad_norm": 6.53125, "learning_rate": 4.294821608321217e-06, "loss": 1.29467373, "memory(GiB)": 129.21, "step": 3840, "train_speed(iter/s)": 0.339899 }, { "acc": 0.69365864, "epoch": 0.0431738965699961, "grad_norm": 6.625, "learning_rate": 4.317190470864557e-06, "loss": 1.29317236, "memory(GiB)": 129.21, "step": 3860, "train_speed(iter/s)": 0.340308 }, { "acc": 0.68446317, "epoch": 0.04339759551595463, "grad_norm": 6.0, "learning_rate": 4.339559333407897e-06, "loss": 1.30300236, "memory(GiB)": 129.21, "step": 3880, "train_speed(iter/s)": 0.340701 }, { "acc": 0.68437643, "epoch": 0.04362129446191316, "grad_norm": 6.0625, "learning_rate": 4.361928195951236e-06, "loss": 1.30169353, "memory(GiB)": 129.21, "step": 3900, "train_speed(iter/s)": 0.341028 }, { "acc": 0.68696518, "epoch": 0.04384499340787169, "grad_norm": 5.90625, "learning_rate": 4.3842970584945756e-06, "loss": 1.28744335, "memory(GiB)": 129.21, "step": 3920, "train_speed(iter/s)": 0.341465 }, { "acc": 0.68390627, "epoch": 0.044068692353830215, "grad_norm": 4.625, "learning_rate": 4.406665921037916e-06, "loss": 1.29739628, "memory(GiB)": 129.21, "step": 3940, "train_speed(iter/s)": 0.34188 }, { "acc": 0.68745804, "epoch": 0.04429239129978874, "grad_norm": 7.65625, "learning_rate": 4.429034783581255e-06, "loss": 1.28496971, "memory(GiB)": 129.21, "step": 3960, "train_speed(iter/s)": 0.342346 }, { "acc": 0.69196997, "epoch": 0.04451609024574727, "grad_norm": 5.375, "learning_rate": 4.451403646124595e-06, "loss": 1.27540646, "memory(GiB)": 129.21, "step": 3980, "train_speed(iter/s)": 0.34266 }, { "acc": 0.68813467, "epoch": 0.0447397891917058, "grad_norm": 6.53125, "learning_rate": 4.473772508667935e-06, "loss": 1.30012932, "memory(GiB)": 129.21, "step": 4000, "train_speed(iter/s)": 0.343022 }, { "epoch": 0.0447397891917058, "eval_acc": 0.649159008924272, "eval_loss": 1.2790635824203491, "eval_runtime": 2341.5116, "eval_samples_per_second": 32.151, "eval_steps_per_second": 16.076, "step": 4000 }, { "acc": 0.69103689, "epoch": 0.044963488137664334, "grad_norm": 7.625, "learning_rate": 4.496141371211275e-06, "loss": 1.26613665, "memory(GiB)": 133.45, "step": 4020, "train_speed(iter/s)": 0.284991 }, { "acc": 0.68383579, "epoch": 0.04518718708362286, "grad_norm": 6.0625, "learning_rate": 4.518510233754614e-06, "loss": 1.30610342, "memory(GiB)": 133.45, "step": 4040, "train_speed(iter/s)": 0.285447 }, { "acc": 0.68921595, "epoch": 0.04541088602958139, "grad_norm": 6.96875, "learning_rate": 4.5408790962979536e-06, "loss": 1.2891118, "memory(GiB)": 133.45, "step": 4060, "train_speed(iter/s)": 0.285934 }, { "acc": 0.6863728, "epoch": 0.04563458497553992, "grad_norm": 6.03125, "learning_rate": 4.563247958841293e-06, "loss": 1.28777447, "memory(GiB)": 133.45, "step": 4080, "train_speed(iter/s)": 0.286358 }, { "acc": 0.70546312, "epoch": 0.04585828392149845, "grad_norm": 6.71875, "learning_rate": 4.585616821384633e-06, "loss": 1.20147896, "memory(GiB)": 133.45, "step": 4100, "train_speed(iter/s)": 0.286846 }, { "acc": 0.68868179, "epoch": 0.046081982867456975, "grad_norm": 6.0625, "learning_rate": 4.6079856839279725e-06, "loss": 1.2790411, "memory(GiB)": 133.45, "step": 4120, "train_speed(iter/s)": 0.287306 }, { "acc": 0.69294586, "epoch": 0.0463056818134155, "grad_norm": 6.59375, "learning_rate": 4.630354546471313e-06, "loss": 1.25960064, "memory(GiB)": 133.45, "step": 4140, "train_speed(iter/s)": 0.287772 }, { "acc": 0.7031992, "epoch": 0.04652938075937403, "grad_norm": 6.15625, "learning_rate": 4.652723409014652e-06, "loss": 1.21040249, "memory(GiB)": 133.45, "step": 4160, "train_speed(iter/s)": 0.28825 }, { "acc": 0.6923121, "epoch": 0.046753079705332566, "grad_norm": 6.1875, "learning_rate": 4.675092271557991e-06, "loss": 1.26823578, "memory(GiB)": 133.45, "step": 4180, "train_speed(iter/s)": 0.288704 }, { "acc": 0.6751318, "epoch": 0.046976778651291094, "grad_norm": 6.875, "learning_rate": 4.6974611341013316e-06, "loss": 1.34024143, "memory(GiB)": 133.45, "step": 4200, "train_speed(iter/s)": 0.289179 }, { "acc": 0.69551544, "epoch": 0.04720047759724962, "grad_norm": 6.03125, "learning_rate": 4.719829996644671e-06, "loss": 1.24157152, "memory(GiB)": 133.45, "step": 4220, "train_speed(iter/s)": 0.289683 }, { "acc": 0.69326982, "epoch": 0.04742417654320815, "grad_norm": 4.84375, "learning_rate": 4.742198859188011e-06, "loss": 1.2515089, "memory(GiB)": 133.45, "step": 4240, "train_speed(iter/s)": 0.29019 }, { "acc": 0.68492374, "epoch": 0.04764787548916668, "grad_norm": 5.375, "learning_rate": 4.7645677217313505e-06, "loss": 1.30640621, "memory(GiB)": 133.45, "step": 4260, "train_speed(iter/s)": 0.29065 }, { "acc": 0.67700334, "epoch": 0.04787157443512521, "grad_norm": 7.0625, "learning_rate": 4.78693658427469e-06, "loss": 1.34648113, "memory(GiB)": 133.45, "step": 4280, "train_speed(iter/s)": 0.291074 }, { "acc": 0.69393597, "epoch": 0.048095273381083735, "grad_norm": 6.5625, "learning_rate": 4.809305446818029e-06, "loss": 1.25471048, "memory(GiB)": 133.45, "step": 4300, "train_speed(iter/s)": 0.291533 }, { "acc": 0.67776632, "epoch": 0.04831897232704227, "grad_norm": 6.0625, "learning_rate": 4.831674309361369e-06, "loss": 1.33574219, "memory(GiB)": 133.45, "step": 4320, "train_speed(iter/s)": 0.291907 }, { "acc": 0.69186521, "epoch": 0.0485426712730008, "grad_norm": 6.625, "learning_rate": 4.8540431719047096e-06, "loss": 1.27287369, "memory(GiB)": 133.45, "step": 4340, "train_speed(iter/s)": 0.292351 }, { "acc": 0.69197435, "epoch": 0.048766370218959326, "grad_norm": 6.125, "learning_rate": 4.876412034448049e-06, "loss": 1.26098194, "memory(GiB)": 133.45, "step": 4360, "train_speed(iter/s)": 0.2928 }, { "acc": 0.68544297, "epoch": 0.048990069164917854, "grad_norm": 5.5, "learning_rate": 4.898780896991388e-06, "loss": 1.29761219, "memory(GiB)": 133.45, "step": 4380, "train_speed(iter/s)": 0.293241 }, { "acc": 0.69719958, "epoch": 0.04921376811087638, "grad_norm": 7.625, "learning_rate": 4.921149759534728e-06, "loss": 1.24267311, "memory(GiB)": 133.45, "step": 4400, "train_speed(iter/s)": 0.293686 }, { "acc": 0.69865637, "epoch": 0.04943746705683491, "grad_norm": 5.75, "learning_rate": 4.943518622078068e-06, "loss": 1.23768387, "memory(GiB)": 133.45, "step": 4420, "train_speed(iter/s)": 0.294102 }, { "acc": 0.69035616, "epoch": 0.04966116600279344, "grad_norm": 6.46875, "learning_rate": 4.965887484621407e-06, "loss": 1.28515434, "memory(GiB)": 133.45, "step": 4440, "train_speed(iter/s)": 0.294517 }, { "acc": 0.68066502, "epoch": 0.04988486494875197, "grad_norm": 6.5, "learning_rate": 4.988256347164747e-06, "loss": 1.32447376, "memory(GiB)": 133.45, "step": 4460, "train_speed(iter/s)": 0.294958 }, { "acc": 0.69577322, "epoch": 0.0501085638947105, "grad_norm": 6.65625, "learning_rate": 5.010625209708087e-06, "loss": 1.24761715, "memory(GiB)": 133.45, "step": 4480, "train_speed(iter/s)": 0.29542 }, { "acc": 0.68202438, "epoch": 0.05033226284066903, "grad_norm": 4.78125, "learning_rate": 5.032994072251426e-06, "loss": 1.30786476, "memory(GiB)": 133.45, "step": 4500, "train_speed(iter/s)": 0.295803 }, { "acc": 0.69883218, "epoch": 0.05055596178662756, "grad_norm": 6.8125, "learning_rate": 5.055362934794766e-06, "loss": 1.23517113, "memory(GiB)": 133.45, "step": 4520, "train_speed(iter/s)": 0.296207 }, { "acc": 0.69884467, "epoch": 0.050779660732586086, "grad_norm": 4.90625, "learning_rate": 5.077731797338106e-06, "loss": 1.24491005, "memory(GiB)": 133.45, "step": 4540, "train_speed(iter/s)": 0.296602 }, { "acc": 0.70561714, "epoch": 0.051003359678544614, "grad_norm": 7.21875, "learning_rate": 5.100100659881445e-06, "loss": 1.19972248, "memory(GiB)": 133.45, "step": 4560, "train_speed(iter/s)": 0.297023 }, { "acc": 0.69628196, "epoch": 0.05122705862450314, "grad_norm": 5.625, "learning_rate": 5.122469522424785e-06, "loss": 1.25253067, "memory(GiB)": 133.45, "step": 4580, "train_speed(iter/s)": 0.29741 }, { "acc": 0.70811706, "epoch": 0.05145075757046167, "grad_norm": 4.96875, "learning_rate": 5.1448383849681245e-06, "loss": 1.19783573, "memory(GiB)": 133.45, "step": 4600, "train_speed(iter/s)": 0.297845 }, { "acc": 0.69282184, "epoch": 0.0516744565164202, "grad_norm": 6.0625, "learning_rate": 5.167207247511464e-06, "loss": 1.26014996, "memory(GiB)": 133.45, "step": 4620, "train_speed(iter/s)": 0.298263 }, { "acc": 0.68996286, "epoch": 0.051898155462378734, "grad_norm": 6.65625, "learning_rate": 5.189576110054804e-06, "loss": 1.28391495, "memory(GiB)": 133.45, "step": 4640, "train_speed(iter/s)": 0.29865 }, { "acc": 0.69696751, "epoch": 0.05212185440833726, "grad_norm": 5.5625, "learning_rate": 5.211944972598144e-06, "loss": 1.23137131, "memory(GiB)": 133.45, "step": 4660, "train_speed(iter/s)": 0.299062 }, { "acc": 0.69023733, "epoch": 0.05234555335429579, "grad_norm": 6.5, "learning_rate": 5.234313835141484e-06, "loss": 1.27331352, "memory(GiB)": 133.45, "step": 4680, "train_speed(iter/s)": 0.299491 }, { "acc": 0.69248624, "epoch": 0.05256925230025432, "grad_norm": 5.5, "learning_rate": 5.256682697684823e-06, "loss": 1.26060791, "memory(GiB)": 133.45, "step": 4700, "train_speed(iter/s)": 0.299952 }, { "acc": 0.70318213, "epoch": 0.052792951246212846, "grad_norm": 6.875, "learning_rate": 5.279051560228163e-06, "loss": 1.19434862, "memory(GiB)": 133.45, "step": 4720, "train_speed(iter/s)": 0.300247 }, { "acc": 0.69159913, "epoch": 0.053016650192171375, "grad_norm": 6.125, "learning_rate": 5.3014204227715025e-06, "loss": 1.27116547, "memory(GiB)": 133.45, "step": 4740, "train_speed(iter/s)": 0.300641 }, { "acc": 0.71052713, "epoch": 0.0532403491381299, "grad_norm": 5.3125, "learning_rate": 5.323789285314842e-06, "loss": 1.16488724, "memory(GiB)": 133.45, "step": 4760, "train_speed(iter/s)": 0.301006 }, { "acc": 0.69370956, "epoch": 0.05346404808408843, "grad_norm": 6.84375, "learning_rate": 5.346158147858181e-06, "loss": 1.26267853, "memory(GiB)": 133.45, "step": 4780, "train_speed(iter/s)": 0.301401 }, { "acc": 0.68243151, "epoch": 0.053687747030046966, "grad_norm": 5.9375, "learning_rate": 5.368527010401522e-06, "loss": 1.31234283, "memory(GiB)": 133.45, "step": 4800, "train_speed(iter/s)": 0.301776 }, { "acc": 0.69505973, "epoch": 0.053911445976005494, "grad_norm": 6.59375, "learning_rate": 5.390895872944862e-06, "loss": 1.25161095, "memory(GiB)": 133.45, "step": 4820, "train_speed(iter/s)": 0.302146 }, { "acc": 0.69417834, "epoch": 0.05413514492196402, "grad_norm": 5.5625, "learning_rate": 5.413264735488201e-06, "loss": 1.26022472, "memory(GiB)": 133.45, "step": 4840, "train_speed(iter/s)": 0.302497 }, { "acc": 0.69661956, "epoch": 0.05435884386792255, "grad_norm": 5.03125, "learning_rate": 5.43563359803154e-06, "loss": 1.25330467, "memory(GiB)": 133.45, "step": 4860, "train_speed(iter/s)": 0.302916 }, { "acc": 0.69831314, "epoch": 0.05458254281388108, "grad_norm": 6.40625, "learning_rate": 5.4580024605748805e-06, "loss": 1.23917685, "memory(GiB)": 133.45, "step": 4880, "train_speed(iter/s)": 0.303324 }, { "acc": 0.69536729, "epoch": 0.054806241759839606, "grad_norm": 6.90625, "learning_rate": 5.48037132311822e-06, "loss": 1.26727858, "memory(GiB)": 133.45, "step": 4900, "train_speed(iter/s)": 0.303646 }, { "acc": 0.69382505, "epoch": 0.055029940705798135, "grad_norm": 7.8125, "learning_rate": 5.502740185661559e-06, "loss": 1.26651802, "memory(GiB)": 133.45, "step": 4920, "train_speed(iter/s)": 0.304018 }, { "acc": 0.68677721, "epoch": 0.05525363965175666, "grad_norm": 5.875, "learning_rate": 5.5251090482048994e-06, "loss": 1.28950205, "memory(GiB)": 133.45, "step": 4940, "train_speed(iter/s)": 0.304424 }, { "acc": 0.69914627, "epoch": 0.0554773385977152, "grad_norm": 6.03125, "learning_rate": 5.547477910748239e-06, "loss": 1.22549477, "memory(GiB)": 133.45, "step": 4960, "train_speed(iter/s)": 0.304792 }, { "acc": 0.69428, "epoch": 0.055701037543673726, "grad_norm": 6.9375, "learning_rate": 5.569846773291578e-06, "loss": 1.24709053, "memory(GiB)": 133.45, "step": 4980, "train_speed(iter/s)": 0.305193 }, { "acc": 0.7001369, "epoch": 0.055924736489632254, "grad_norm": 5.6875, "learning_rate": 5.5922156358349175e-06, "loss": 1.23490391, "memory(GiB)": 133.45, "step": 5000, "train_speed(iter/s)": 0.305587 }, { "acc": 0.69532223, "epoch": 0.05614843543559078, "grad_norm": 6.5, "learning_rate": 5.6145844983782585e-06, "loss": 1.23750687, "memory(GiB)": 133.45, "step": 5020, "train_speed(iter/s)": 0.305899 }, { "acc": 0.6877533, "epoch": 0.05637213438154931, "grad_norm": 5.875, "learning_rate": 5.636953360921598e-06, "loss": 1.27757187, "memory(GiB)": 133.45, "step": 5040, "train_speed(iter/s)": 0.306257 }, { "acc": 0.70067587, "epoch": 0.05659583332750784, "grad_norm": 5.65625, "learning_rate": 5.659322223464937e-06, "loss": 1.22467022, "memory(GiB)": 133.45, "step": 5060, "train_speed(iter/s)": 0.306599 }, { "acc": 0.70154467, "epoch": 0.05681953227346637, "grad_norm": 7.3125, "learning_rate": 5.6816910860082774e-06, "loss": 1.20408783, "memory(GiB)": 133.45, "step": 5080, "train_speed(iter/s)": 0.306964 }, { "acc": 0.70329523, "epoch": 0.057043231219424895, "grad_norm": 7.4375, "learning_rate": 5.704059948551617e-06, "loss": 1.21169291, "memory(GiB)": 133.45, "step": 5100, "train_speed(iter/s)": 0.307313 }, { "acc": 0.7008709, "epoch": 0.05726693016538343, "grad_norm": 5.875, "learning_rate": 5.726428811094956e-06, "loss": 1.2094698, "memory(GiB)": 133.45, "step": 5120, "train_speed(iter/s)": 0.30765 }, { "acc": 0.70213466, "epoch": 0.05749062911134196, "grad_norm": 5.125, "learning_rate": 5.7487976736382955e-06, "loss": 1.22087383, "memory(GiB)": 133.45, "step": 5140, "train_speed(iter/s)": 0.307983 }, { "acc": 0.69402914, "epoch": 0.057714328057300486, "grad_norm": 5.34375, "learning_rate": 5.771166536181636e-06, "loss": 1.25521259, "memory(GiB)": 133.45, "step": 5160, "train_speed(iter/s)": 0.308294 }, { "acc": 0.6989604, "epoch": 0.057938027003259014, "grad_norm": 6.5, "learning_rate": 5.793535398724975e-06, "loss": 1.2438303, "memory(GiB)": 133.45, "step": 5180, "train_speed(iter/s)": 0.308614 }, { "acc": 0.69609046, "epoch": 0.05816172594921754, "grad_norm": 6.15625, "learning_rate": 5.815904261268314e-06, "loss": 1.23883305, "memory(GiB)": 133.45, "step": 5200, "train_speed(iter/s)": 0.308943 }, { "acc": 0.69925718, "epoch": 0.05838542489517607, "grad_norm": 5.8125, "learning_rate": 5.838273123811654e-06, "loss": 1.23892498, "memory(GiB)": 133.45, "step": 5220, "train_speed(iter/s)": 0.309277 }, { "acc": 0.70374265, "epoch": 0.0586091238411346, "grad_norm": 6.59375, "learning_rate": 5.860641986354995e-06, "loss": 1.20503559, "memory(GiB)": 133.45, "step": 5240, "train_speed(iter/s)": 0.309621 }, { "acc": 0.70274935, "epoch": 0.05883282278709313, "grad_norm": 5.65625, "learning_rate": 5.883010848898334e-06, "loss": 1.20656013, "memory(GiB)": 133.45, "step": 5260, "train_speed(iter/s)": 0.309962 }, { "acc": 0.69992151, "epoch": 0.05905652173305166, "grad_norm": 5.53125, "learning_rate": 5.9053797114416735e-06, "loss": 1.22478895, "memory(GiB)": 133.45, "step": 5280, "train_speed(iter/s)": 0.310269 }, { "acc": 0.69704351, "epoch": 0.05928022067901019, "grad_norm": 5.59375, "learning_rate": 5.927748573985014e-06, "loss": 1.24116449, "memory(GiB)": 133.45, "step": 5300, "train_speed(iter/s)": 0.310611 }, { "acc": 0.70375757, "epoch": 0.05950391962496872, "grad_norm": 6.71875, "learning_rate": 5.950117436528353e-06, "loss": 1.20123787, "memory(GiB)": 133.45, "step": 5320, "train_speed(iter/s)": 0.310985 }, { "acc": 0.69785461, "epoch": 0.059727618570927246, "grad_norm": 6.75, "learning_rate": 5.972486299071692e-06, "loss": 1.22692823, "memory(GiB)": 133.45, "step": 5340, "train_speed(iter/s)": 0.311294 }, { "acc": 0.70049429, "epoch": 0.059951317516885774, "grad_norm": 6.3125, "learning_rate": 5.994855161615032e-06, "loss": 1.22286911, "memory(GiB)": 133.45, "step": 5360, "train_speed(iter/s)": 0.311606 }, { "acc": 0.69053574, "epoch": 0.0601750164628443, "grad_norm": 7.125, "learning_rate": 6.017224024158373e-06, "loss": 1.2630909, "memory(GiB)": 133.45, "step": 5380, "train_speed(iter/s)": 0.31191 }, { "acc": 0.69888229, "epoch": 0.06039871540880283, "grad_norm": 6.0, "learning_rate": 6.039592886701712e-06, "loss": 1.22248001, "memory(GiB)": 133.45, "step": 5400, "train_speed(iter/s)": 0.312247 }, { "acc": 0.696982, "epoch": 0.060622414354761366, "grad_norm": 5.46875, "learning_rate": 6.0619617492450515e-06, "loss": 1.23654976, "memory(GiB)": 133.45, "step": 5420, "train_speed(iter/s)": 0.312611 }, { "acc": 0.69293008, "epoch": 0.060846113300719894, "grad_norm": 5.21875, "learning_rate": 6.084330611788392e-06, "loss": 1.25534382, "memory(GiB)": 133.45, "step": 5440, "train_speed(iter/s)": 0.312947 }, { "acc": 0.69057498, "epoch": 0.06106981224667842, "grad_norm": 5.8125, "learning_rate": 6.106699474331731e-06, "loss": 1.26758986, "memory(GiB)": 133.45, "step": 5460, "train_speed(iter/s)": 0.313254 }, { "acc": 0.70574884, "epoch": 0.06129351119263695, "grad_norm": 7.125, "learning_rate": 6.12906833687507e-06, "loss": 1.21527214, "memory(GiB)": 133.45, "step": 5480, "train_speed(iter/s)": 0.313564 }, { "acc": 0.69957018, "epoch": 0.06151721013859548, "grad_norm": 5.71875, "learning_rate": 6.15143719941841e-06, "loss": 1.23458023, "memory(GiB)": 133.45, "step": 5500, "train_speed(iter/s)": 0.313851 }, { "acc": 0.69003592, "epoch": 0.061740909084554006, "grad_norm": 5.65625, "learning_rate": 6.17380606196175e-06, "loss": 1.2569293, "memory(GiB)": 133.45, "step": 5520, "train_speed(iter/s)": 0.314187 }, { "acc": 0.69123306, "epoch": 0.061964608030512534, "grad_norm": 7.34375, "learning_rate": 6.196174924505089e-06, "loss": 1.26172123, "memory(GiB)": 133.45, "step": 5540, "train_speed(iter/s)": 0.314518 }, { "acc": 0.70689983, "epoch": 0.06218830697647106, "grad_norm": 7.5, "learning_rate": 6.218543787048429e-06, "loss": 1.19322987, "memory(GiB)": 133.45, "step": 5560, "train_speed(iter/s)": 0.314923 }, { "acc": 0.71134896, "epoch": 0.0624120059224296, "grad_norm": 7.125, "learning_rate": 6.240912649591768e-06, "loss": 1.16900234, "memory(GiB)": 133.45, "step": 5580, "train_speed(iter/s)": 0.315287 }, { "acc": 0.68601809, "epoch": 0.06263570486838813, "grad_norm": 4.75, "learning_rate": 6.263281512135109e-06, "loss": 1.28316269, "memory(GiB)": 133.45, "step": 5600, "train_speed(iter/s)": 0.315593 }, { "acc": 0.69121027, "epoch": 0.06285940381434665, "grad_norm": 5.09375, "learning_rate": 6.285650374678448e-06, "loss": 1.26617737, "memory(GiB)": 133.45, "step": 5620, "train_speed(iter/s)": 0.315946 }, { "acc": 0.69801164, "epoch": 0.06308310276030518, "grad_norm": 6.03125, "learning_rate": 6.308019237221788e-06, "loss": 1.22264614, "memory(GiB)": 133.45, "step": 5640, "train_speed(iter/s)": 0.316264 }, { "acc": 0.70052648, "epoch": 0.06330680170626371, "grad_norm": 5.59375, "learning_rate": 6.330388099765128e-06, "loss": 1.21718693, "memory(GiB)": 133.45, "step": 5660, "train_speed(iter/s)": 0.316545 }, { "acc": 0.69361901, "epoch": 0.06353050065222224, "grad_norm": 5.40625, "learning_rate": 6.352756962308467e-06, "loss": 1.25445347, "memory(GiB)": 133.45, "step": 5680, "train_speed(iter/s)": 0.316787 }, { "acc": 0.69566984, "epoch": 0.06375419959818077, "grad_norm": 6.03125, "learning_rate": 6.375125824851807e-06, "loss": 1.24361992, "memory(GiB)": 133.45, "step": 5700, "train_speed(iter/s)": 0.317076 }, { "acc": 0.70255241, "epoch": 0.0639778985441393, "grad_norm": 6.5, "learning_rate": 6.397494687395146e-06, "loss": 1.20544815, "memory(GiB)": 133.45, "step": 5720, "train_speed(iter/s)": 0.317362 }, { "acc": 0.69254045, "epoch": 0.06420159749009782, "grad_norm": 6.0, "learning_rate": 6.419863549938486e-06, "loss": 1.2656971, "memory(GiB)": 133.45, "step": 5740, "train_speed(iter/s)": 0.317694 }, { "acc": 0.71046743, "epoch": 0.06442529643605635, "grad_norm": 6.21875, "learning_rate": 6.4422324124818256e-06, "loss": 1.16857672, "memory(GiB)": 133.45, "step": 5760, "train_speed(iter/s)": 0.31796 }, { "acc": 0.69353886, "epoch": 0.06464899538201488, "grad_norm": 5.5, "learning_rate": 6.464601275025165e-06, "loss": 1.24300137, "memory(GiB)": 133.45, "step": 5780, "train_speed(iter/s)": 0.318296 }, { "acc": 0.69829516, "epoch": 0.06487269432797341, "grad_norm": 6.71875, "learning_rate": 6.486970137568506e-06, "loss": 1.23822641, "memory(GiB)": 133.45, "step": 5800, "train_speed(iter/s)": 0.318597 }, { "acc": 0.68270926, "epoch": 0.06509639327393195, "grad_norm": 5.21875, "learning_rate": 6.509339000111845e-06, "loss": 1.30639105, "memory(GiB)": 133.45, "step": 5820, "train_speed(iter/s)": 0.318839 }, { "acc": 0.70102606, "epoch": 0.06532009221989048, "grad_norm": 5.46875, "learning_rate": 6.531707862655185e-06, "loss": 1.21457815, "memory(GiB)": 133.45, "step": 5840, "train_speed(iter/s)": 0.319106 }, { "acc": 0.6978858, "epoch": 0.065543791165849, "grad_norm": 6.9375, "learning_rate": 6.554076725198524e-06, "loss": 1.24867229, "memory(GiB)": 133.45, "step": 5860, "train_speed(iter/s)": 0.319386 }, { "acc": 0.69225955, "epoch": 0.06576749011180753, "grad_norm": 5.46875, "learning_rate": 6.576445587741864e-06, "loss": 1.24705372, "memory(GiB)": 133.45, "step": 5880, "train_speed(iter/s)": 0.31966 }, { "acc": 0.70117831, "epoch": 0.06599118905776606, "grad_norm": 5.8125, "learning_rate": 6.5988144502852036e-06, "loss": 1.20732498, "memory(GiB)": 133.45, "step": 5900, "train_speed(iter/s)": 0.319903 }, { "acc": 0.70282288, "epoch": 0.06621488800372459, "grad_norm": 6.40625, "learning_rate": 6.621183312828543e-06, "loss": 1.21517181, "memory(GiB)": 133.45, "step": 5920, "train_speed(iter/s)": 0.320161 }, { "acc": 0.70139155, "epoch": 0.06643858694968312, "grad_norm": 5.3125, "learning_rate": 6.643552175371882e-06, "loss": 1.21250744, "memory(GiB)": 133.45, "step": 5940, "train_speed(iter/s)": 0.320468 }, { "acc": 0.6953651, "epoch": 0.06666228589564165, "grad_norm": 6.5, "learning_rate": 6.6659210379152225e-06, "loss": 1.24624758, "memory(GiB)": 133.45, "step": 5960, "train_speed(iter/s)": 0.320762 }, { "acc": 0.69793282, "epoch": 0.06688598484160017, "grad_norm": 5.25, "learning_rate": 6.688289900458562e-06, "loss": 1.23534298, "memory(GiB)": 133.45, "step": 5980, "train_speed(iter/s)": 0.321017 }, { "acc": 0.69921069, "epoch": 0.0671096837875587, "grad_norm": 5.84375, "learning_rate": 6.710658763001901e-06, "loss": 1.20851421, "memory(GiB)": 133.45, "step": 6000, "train_speed(iter/s)": 0.321292 }, { "epoch": 0.0671096837875587, "eval_acc": 0.6593597192693526, "eval_loss": 1.2240450382232666, "eval_runtime": 2341.5113, "eval_samples_per_second": 32.151, "eval_steps_per_second": 16.076, "step": 6000 }, { "acc": 0.70199685, "epoch": 0.06733338273351723, "grad_norm": 6.5625, "learning_rate": 6.733027625545242e-06, "loss": 1.21566038, "memory(GiB)": 133.45, "step": 6020, "train_speed(iter/s)": 0.285116 }, { "acc": 0.69188595, "epoch": 0.06755708167947576, "grad_norm": 5.0625, "learning_rate": 6.7553964880885816e-06, "loss": 1.26050692, "memory(GiB)": 133.45, "step": 6040, "train_speed(iter/s)": 0.285453 }, { "acc": 0.70384083, "epoch": 0.06778078062543429, "grad_norm": 4.65625, "learning_rate": 6.777765350631921e-06, "loss": 1.22676849, "memory(GiB)": 133.45, "step": 6060, "train_speed(iter/s)": 0.285743 }, { "acc": 0.70306997, "epoch": 0.06800447957139281, "grad_norm": 5.84375, "learning_rate": 6.80013421317526e-06, "loss": 1.23013554, "memory(GiB)": 133.45, "step": 6080, "train_speed(iter/s)": 0.28605 }, { "acc": 0.70838552, "epoch": 0.06822817851735134, "grad_norm": 5.15625, "learning_rate": 6.8225030757186005e-06, "loss": 1.1880888, "memory(GiB)": 133.45, "step": 6100, "train_speed(iter/s)": 0.286321 }, { "acc": 0.70353384, "epoch": 0.06845187746330987, "grad_norm": 6.125, "learning_rate": 6.84487193826194e-06, "loss": 1.20521049, "memory(GiB)": 133.45, "step": 6120, "train_speed(iter/s)": 0.286627 }, { "acc": 0.70525389, "epoch": 0.06867557640926841, "grad_norm": 6.09375, "learning_rate": 6.867240800805279e-06, "loss": 1.19622355, "memory(GiB)": 133.45, "step": 6140, "train_speed(iter/s)": 0.286936 }, { "acc": 0.6941411, "epoch": 0.06889927535522694, "grad_norm": 5.34375, "learning_rate": 6.88960966334862e-06, "loss": 1.24113121, "memory(GiB)": 133.45, "step": 6160, "train_speed(iter/s)": 0.287276 }, { "acc": 0.7027422, "epoch": 0.06912297430118547, "grad_norm": 8.0, "learning_rate": 6.9119785258919596e-06, "loss": 1.20871944, "memory(GiB)": 133.45, "step": 6180, "train_speed(iter/s)": 0.287597 }, { "acc": 0.70053539, "epoch": 0.069346673247144, "grad_norm": 5.125, "learning_rate": 6.934347388435299e-06, "loss": 1.24607544, "memory(GiB)": 133.45, "step": 6200, "train_speed(iter/s)": 0.287898 }, { "acc": 0.69063435, "epoch": 0.06957037219310253, "grad_norm": 5.65625, "learning_rate": 6.956716250978638e-06, "loss": 1.26272335, "memory(GiB)": 133.45, "step": 6220, "train_speed(iter/s)": 0.288151 }, { "acc": 0.69800582, "epoch": 0.06979407113906105, "grad_norm": 5.59375, "learning_rate": 6.9790851135219785e-06, "loss": 1.24821606, "memory(GiB)": 133.45, "step": 6240, "train_speed(iter/s)": 0.288471 }, { "acc": 0.69935875, "epoch": 0.07001777008501958, "grad_norm": 6.15625, "learning_rate": 7.001453976065318e-06, "loss": 1.23807983, "memory(GiB)": 133.45, "step": 6260, "train_speed(iter/s)": 0.288793 }, { "acc": 0.68859825, "epoch": 0.07024146903097811, "grad_norm": 6.15625, "learning_rate": 7.023822838608657e-06, "loss": 1.26098995, "memory(GiB)": 133.45, "step": 6280, "train_speed(iter/s)": 0.28907 }, { "acc": 0.70226541, "epoch": 0.07046516797693664, "grad_norm": 6.75, "learning_rate": 7.0461917011519965e-06, "loss": 1.20739527, "memory(GiB)": 133.45, "step": 6300, "train_speed(iter/s)": 0.289397 }, { "acc": 0.69758282, "epoch": 0.07068886692289517, "grad_norm": 5.375, "learning_rate": 7.068560563695337e-06, "loss": 1.23729324, "memory(GiB)": 133.45, "step": 6320, "train_speed(iter/s)": 0.289718 }, { "acc": 0.69300604, "epoch": 0.0709125658688537, "grad_norm": 5.6875, "learning_rate": 7.090929426238676e-06, "loss": 1.24756088, "memory(GiB)": 133.45, "step": 6340, "train_speed(iter/s)": 0.290048 }, { "acc": 0.68892736, "epoch": 0.07113626481481222, "grad_norm": 5.5, "learning_rate": 7.113298288782015e-06, "loss": 1.26708412, "memory(GiB)": 133.45, "step": 6360, "train_speed(iter/s)": 0.290363 }, { "acc": 0.68795767, "epoch": 0.07135996376077075, "grad_norm": 5.28125, "learning_rate": 7.1356671513253565e-06, "loss": 1.28620567, "memory(GiB)": 133.45, "step": 6380, "train_speed(iter/s)": 0.290635 }, { "acc": 0.70575356, "epoch": 0.07158366270672928, "grad_norm": 6.46875, "learning_rate": 7.158036013868696e-06, "loss": 1.18254738, "memory(GiB)": 133.45, "step": 6400, "train_speed(iter/s)": 0.290902 }, { "acc": 0.69450798, "epoch": 0.0718073616526878, "grad_norm": 6.28125, "learning_rate": 7.180404876412035e-06, "loss": 1.24663162, "memory(GiB)": 142.32, "step": 6420, "train_speed(iter/s)": 0.291171 }, { "acc": 0.69761395, "epoch": 0.07203106059864635, "grad_norm": 7.1875, "learning_rate": 7.2027737389553745e-06, "loss": 1.22834682, "memory(GiB)": 142.32, "step": 6440, "train_speed(iter/s)": 0.291448 }, { "acc": 0.70433836, "epoch": 0.07225475954460488, "grad_norm": 5.1875, "learning_rate": 7.225142601498715e-06, "loss": 1.19324188, "memory(GiB)": 142.32, "step": 6460, "train_speed(iter/s)": 0.291743 }, { "acc": 0.70374322, "epoch": 0.0724784584905634, "grad_norm": 5.96875, "learning_rate": 7.247511464042054e-06, "loss": 1.19148579, "memory(GiB)": 142.32, "step": 6480, "train_speed(iter/s)": 0.292037 }, { "acc": 0.71309929, "epoch": 0.07270215743652193, "grad_norm": 6.84375, "learning_rate": 7.269880326585393e-06, "loss": 1.17177467, "memory(GiB)": 142.32, "step": 6500, "train_speed(iter/s)": 0.292327 }, { "acc": 0.70018425, "epoch": 0.07292585638248046, "grad_norm": 5.09375, "learning_rate": 7.292249189128734e-06, "loss": 1.23087616, "memory(GiB)": 142.32, "step": 6520, "train_speed(iter/s)": 0.292617 }, { "acc": 0.70019808, "epoch": 0.07314955532843899, "grad_norm": 6.03125, "learning_rate": 7.314618051672073e-06, "loss": 1.21691456, "memory(GiB)": 142.32, "step": 6540, "train_speed(iter/s)": 0.292931 }, { "acc": 0.70318336, "epoch": 0.07337325427439752, "grad_norm": 5.9375, "learning_rate": 7.336986914215412e-06, "loss": 1.20008945, "memory(GiB)": 142.32, "step": 6560, "train_speed(iter/s)": 0.293171 }, { "acc": 0.70243311, "epoch": 0.07359695322035605, "grad_norm": 6.53125, "learning_rate": 7.359355776758752e-06, "loss": 1.20647907, "memory(GiB)": 142.32, "step": 6580, "train_speed(iter/s)": 0.293498 }, { "acc": 0.6989253, "epoch": 0.07382065216631457, "grad_norm": 5.84375, "learning_rate": 7.381724639302093e-06, "loss": 1.22535009, "memory(GiB)": 142.32, "step": 6600, "train_speed(iter/s)": 0.293792 }, { "acc": 0.69829907, "epoch": 0.0740443511122731, "grad_norm": 5.75, "learning_rate": 7.404093501845432e-06, "loss": 1.22627316, "memory(GiB)": 142.32, "step": 6620, "train_speed(iter/s)": 0.294086 }, { "acc": 0.69951477, "epoch": 0.07426805005823163, "grad_norm": 10.625, "learning_rate": 7.426462364388771e-06, "loss": 1.22158508, "memory(GiB)": 142.32, "step": 6640, "train_speed(iter/s)": 0.294371 }, { "acc": 0.70693874, "epoch": 0.07449174900419016, "grad_norm": 4.84375, "learning_rate": 7.448831226932111e-06, "loss": 1.19212856, "memory(GiB)": 142.32, "step": 6660, "train_speed(iter/s)": 0.294633 }, { "acc": 0.69393578, "epoch": 0.07471544795014869, "grad_norm": 5.03125, "learning_rate": 7.471200089475451e-06, "loss": 1.25232372, "memory(GiB)": 142.32, "step": 6680, "train_speed(iter/s)": 0.294923 }, { "acc": 0.70557771, "epoch": 0.07493914689610721, "grad_norm": 5.40625, "learning_rate": 7.49356895201879e-06, "loss": 1.186063, "memory(GiB)": 142.32, "step": 6700, "train_speed(iter/s)": 0.295211 }, { "acc": 0.70052938, "epoch": 0.07516284584206574, "grad_norm": 5.4375, "learning_rate": 7.51593781456213e-06, "loss": 1.2155056, "memory(GiB)": 142.32, "step": 6720, "train_speed(iter/s)": 0.295488 }, { "acc": 0.70463543, "epoch": 0.07538654478802427, "grad_norm": 5.90625, "learning_rate": 7.53830667710547e-06, "loss": 1.19870224, "memory(GiB)": 142.32, "step": 6740, "train_speed(iter/s)": 0.295775 }, { "acc": 0.69882722, "epoch": 0.07561024373398281, "grad_norm": 6.125, "learning_rate": 7.560675539648809e-06, "loss": 1.21384602, "memory(GiB)": 142.32, "step": 6760, "train_speed(iter/s)": 0.296043 }, { "acc": 0.70932379, "epoch": 0.07583394267994134, "grad_norm": 7.0, "learning_rate": 7.5830444021921486e-06, "loss": 1.17907734, "memory(GiB)": 142.32, "step": 6780, "train_speed(iter/s)": 0.2963 }, { "acc": 0.71204977, "epoch": 0.07605764162589987, "grad_norm": 6.5625, "learning_rate": 7.605413264735489e-06, "loss": 1.16864624, "memory(GiB)": 142.32, "step": 6800, "train_speed(iter/s)": 0.296585 }, { "acc": 0.70702181, "epoch": 0.0762813405718584, "grad_norm": 7.375, "learning_rate": 7.627782127278829e-06, "loss": 1.16884861, "memory(GiB)": 142.32, "step": 6820, "train_speed(iter/s)": 0.296869 }, { "acc": 0.70652041, "epoch": 0.07650503951781693, "grad_norm": 6.25, "learning_rate": 7.650150989822168e-06, "loss": 1.17975092, "memory(GiB)": 142.32, "step": 6840, "train_speed(iter/s)": 0.297135 }, { "acc": 0.69857097, "epoch": 0.07672873846377545, "grad_norm": 5.40625, "learning_rate": 7.672519852365508e-06, "loss": 1.22625837, "memory(GiB)": 142.32, "step": 6860, "train_speed(iter/s)": 0.297428 }, { "acc": 0.69809103, "epoch": 0.07695243740973398, "grad_norm": 4.84375, "learning_rate": 7.694888714908849e-06, "loss": 1.22347288, "memory(GiB)": 142.32, "step": 6880, "train_speed(iter/s)": 0.297643 }, { "acc": 0.69714222, "epoch": 0.07717613635569251, "grad_norm": 5.90625, "learning_rate": 7.717257577452188e-06, "loss": 1.24700775, "memory(GiB)": 142.32, "step": 6900, "train_speed(iter/s)": 0.297908 }, { "acc": 0.69842505, "epoch": 0.07739983530165104, "grad_norm": 7.375, "learning_rate": 7.739626439995527e-06, "loss": 1.23050394, "memory(GiB)": 142.32, "step": 6920, "train_speed(iter/s)": 0.298162 }, { "acc": 0.70380225, "epoch": 0.07762353424760957, "grad_norm": 6.125, "learning_rate": 7.761995302538867e-06, "loss": 1.19995308, "memory(GiB)": 142.32, "step": 6940, "train_speed(iter/s)": 0.298434 }, { "acc": 0.70582867, "epoch": 0.0778472331935681, "grad_norm": 6.96875, "learning_rate": 7.784364165082206e-06, "loss": 1.18005552, "memory(GiB)": 142.32, "step": 6960, "train_speed(iter/s)": 0.298708 }, { "acc": 0.70252647, "epoch": 0.07807093213952662, "grad_norm": 6.59375, "learning_rate": 7.806733027625545e-06, "loss": 1.21648521, "memory(GiB)": 142.32, "step": 6980, "train_speed(iter/s)": 0.29899 }, { "acc": 0.70555744, "epoch": 0.07829463108548515, "grad_norm": 6.8125, "learning_rate": 7.829101890168885e-06, "loss": 1.18236036, "memory(GiB)": 142.32, "step": 7000, "train_speed(iter/s)": 0.299255 }, { "acc": 0.71299152, "epoch": 0.07851833003144368, "grad_norm": 5.90625, "learning_rate": 7.851470752712224e-06, "loss": 1.15435038, "memory(GiB)": 142.32, "step": 7020, "train_speed(iter/s)": 0.29954 }, { "acc": 0.68356819, "epoch": 0.0787420289774022, "grad_norm": 5.40625, "learning_rate": 7.873839615255565e-06, "loss": 1.30435982, "memory(GiB)": 142.32, "step": 7040, "train_speed(iter/s)": 0.299782 }, { "acc": 0.70083103, "epoch": 0.07896572792336073, "grad_norm": 6.28125, "learning_rate": 7.896208477798905e-06, "loss": 1.23165998, "memory(GiB)": 142.32, "step": 7060, "train_speed(iter/s)": 0.300035 }, { "acc": 0.70451536, "epoch": 0.07918942686931928, "grad_norm": 6.5625, "learning_rate": 7.918577340342244e-06, "loss": 1.20471325, "memory(GiB)": 142.32, "step": 7080, "train_speed(iter/s)": 0.300318 }, { "acc": 0.70757871, "epoch": 0.0794131258152778, "grad_norm": 5.75, "learning_rate": 7.940946202885585e-06, "loss": 1.17942467, "memory(GiB)": 142.32, "step": 7100, "train_speed(iter/s)": 0.300594 }, { "acc": 0.70759621, "epoch": 0.07963682476123633, "grad_norm": 7.6875, "learning_rate": 7.963315065428924e-06, "loss": 1.19699278, "memory(GiB)": 142.32, "step": 7120, "train_speed(iter/s)": 0.300847 }, { "acc": 0.70549011, "epoch": 0.07986052370719486, "grad_norm": 6.125, "learning_rate": 7.985683927972264e-06, "loss": 1.2019268, "memory(GiB)": 142.32, "step": 7140, "train_speed(iter/s)": 0.301112 }, { "acc": 0.70408239, "epoch": 0.08008422265315339, "grad_norm": 5.28125, "learning_rate": 8.008052790515603e-06, "loss": 1.2117197, "memory(GiB)": 142.32, "step": 7160, "train_speed(iter/s)": 0.301383 }, { "acc": 0.71735668, "epoch": 0.08030792159911192, "grad_norm": 7.0, "learning_rate": 8.030421653058942e-06, "loss": 1.13052883, "memory(GiB)": 142.32, "step": 7180, "train_speed(iter/s)": 0.30166 }, { "acc": 0.70952826, "epoch": 0.08053162054507045, "grad_norm": 5.375, "learning_rate": 8.052790515602282e-06, "loss": 1.16078491, "memory(GiB)": 142.32, "step": 7200, "train_speed(iter/s)": 0.301906 }, { "acc": 0.72152195, "epoch": 0.08075531949102897, "grad_norm": 5.84375, "learning_rate": 8.075159378145621e-06, "loss": 1.11938763, "memory(GiB)": 142.32, "step": 7220, "train_speed(iter/s)": 0.302171 }, { "acc": 0.70349169, "epoch": 0.0809790184369875, "grad_norm": 5.5625, "learning_rate": 8.097528240688962e-06, "loss": 1.18751793, "memory(GiB)": 142.32, "step": 7240, "train_speed(iter/s)": 0.302417 }, { "acc": 0.70152841, "epoch": 0.08120271738294603, "grad_norm": 5.71875, "learning_rate": 8.119897103232301e-06, "loss": 1.23246384, "memory(GiB)": 142.32, "step": 7260, "train_speed(iter/s)": 0.302689 }, { "acc": 0.70959263, "epoch": 0.08142641632890456, "grad_norm": 6.0625, "learning_rate": 8.14226596577564e-06, "loss": 1.1726326, "memory(GiB)": 142.32, "step": 7280, "train_speed(iter/s)": 0.302962 }, { "acc": 0.71365151, "epoch": 0.08165011527486309, "grad_norm": 6.4375, "learning_rate": 8.16463482831898e-06, "loss": 1.17798195, "memory(GiB)": 142.32, "step": 7300, "train_speed(iter/s)": 0.303209 }, { "acc": 0.70779314, "epoch": 0.08187381422082161, "grad_norm": 6.90625, "learning_rate": 8.187003690862321e-06, "loss": 1.18825417, "memory(GiB)": 142.32, "step": 7320, "train_speed(iter/s)": 0.303473 }, { "acc": 0.70643072, "epoch": 0.08209751316678014, "grad_norm": 4.0, "learning_rate": 8.20937255340566e-06, "loss": 1.18747969, "memory(GiB)": 142.32, "step": 7340, "train_speed(iter/s)": 0.303721 }, { "acc": 0.70462728, "epoch": 0.08232121211273867, "grad_norm": 5.6875, "learning_rate": 8.231741415949e-06, "loss": 1.18729134, "memory(GiB)": 142.32, "step": 7360, "train_speed(iter/s)": 0.303984 }, { "acc": 0.7073349, "epoch": 0.0825449110586972, "grad_norm": 6.125, "learning_rate": 8.25411027849234e-06, "loss": 1.18803921, "memory(GiB)": 142.32, "step": 7380, "train_speed(iter/s)": 0.304224 }, { "acc": 0.71332598, "epoch": 0.08276861000465574, "grad_norm": 6.15625, "learning_rate": 8.276479141035679e-06, "loss": 1.15286427, "memory(GiB)": 142.32, "step": 7400, "train_speed(iter/s)": 0.304463 }, { "acc": 0.70740366, "epoch": 0.08299230895061427, "grad_norm": 6.15625, "learning_rate": 8.298848003579018e-06, "loss": 1.17039804, "memory(GiB)": 142.32, "step": 7420, "train_speed(iter/s)": 0.304696 }, { "acc": 0.69422755, "epoch": 0.0832160078965728, "grad_norm": 5.5, "learning_rate": 8.321216866122357e-06, "loss": 1.23050299, "memory(GiB)": 142.32, "step": 7440, "train_speed(iter/s)": 0.304933 }, { "acc": 0.71108546, "epoch": 0.08343970684253132, "grad_norm": 5.53125, "learning_rate": 8.343585728665698e-06, "loss": 1.17948914, "memory(GiB)": 142.32, "step": 7460, "train_speed(iter/s)": 0.305196 }, { "acc": 0.6945385, "epoch": 0.08366340578848985, "grad_norm": 7.125, "learning_rate": 8.365954591209038e-06, "loss": 1.2404171, "memory(GiB)": 142.32, "step": 7480, "train_speed(iter/s)": 0.305438 }, { "acc": 0.69849491, "epoch": 0.08388710473444838, "grad_norm": 4.46875, "learning_rate": 8.388323453752377e-06, "loss": 1.21730022, "memory(GiB)": 142.32, "step": 7500, "train_speed(iter/s)": 0.305702 }, { "acc": 0.69795532, "epoch": 0.08411080368040691, "grad_norm": 5.125, "learning_rate": 8.410692316295716e-06, "loss": 1.22609806, "memory(GiB)": 142.32, "step": 7520, "train_speed(iter/s)": 0.305969 }, { "acc": 0.70245867, "epoch": 0.08433450262636544, "grad_norm": 5.53125, "learning_rate": 8.433061178839057e-06, "loss": 1.20462322, "memory(GiB)": 142.32, "step": 7540, "train_speed(iter/s)": 0.3062 }, { "acc": 0.70564766, "epoch": 0.08455820157232397, "grad_norm": 7.46875, "learning_rate": 8.455430041382397e-06, "loss": 1.18598976, "memory(GiB)": 142.32, "step": 7560, "train_speed(iter/s)": 0.306457 }, { "acc": 0.69490833, "epoch": 0.0847819005182825, "grad_norm": 5.53125, "learning_rate": 8.477798903925736e-06, "loss": 1.25079308, "memory(GiB)": 142.32, "step": 7580, "train_speed(iter/s)": 0.306683 }, { "acc": 0.70890718, "epoch": 0.08500559946424102, "grad_norm": 4.84375, "learning_rate": 8.500167766469076e-06, "loss": 1.16286592, "memory(GiB)": 142.32, "step": 7600, "train_speed(iter/s)": 0.306911 }, { "acc": 0.69542456, "epoch": 0.08522929841019955, "grad_norm": 6.6875, "learning_rate": 8.522536629012415e-06, "loss": 1.24621696, "memory(GiB)": 142.32, "step": 7620, "train_speed(iter/s)": 0.307145 }, { "acc": 0.69772577, "epoch": 0.08545299735615808, "grad_norm": 5.84375, "learning_rate": 8.544905491555754e-06, "loss": 1.22447929, "memory(GiB)": 142.32, "step": 7640, "train_speed(iter/s)": 0.307387 }, { "acc": 0.70933294, "epoch": 0.0856766963021166, "grad_norm": 5.96875, "learning_rate": 8.567274354099094e-06, "loss": 1.16652279, "memory(GiB)": 142.32, "step": 7660, "train_speed(iter/s)": 0.307612 }, { "acc": 0.70104036, "epoch": 0.08590039524807513, "grad_norm": 6.46875, "learning_rate": 8.589643216642435e-06, "loss": 1.20403299, "memory(GiB)": 142.32, "step": 7680, "train_speed(iter/s)": 0.307843 }, { "acc": 0.71267929, "epoch": 0.08612409419403368, "grad_norm": 4.71875, "learning_rate": 8.612012079185774e-06, "loss": 1.156917, "memory(GiB)": 142.32, "step": 7700, "train_speed(iter/s)": 0.308073 }, { "acc": 0.70763302, "epoch": 0.0863477931399922, "grad_norm": 8.3125, "learning_rate": 8.634380941729113e-06, "loss": 1.17724628, "memory(GiB)": 142.32, "step": 7720, "train_speed(iter/s)": 0.308307 }, { "acc": 0.70271668, "epoch": 0.08657149208595073, "grad_norm": 5.375, "learning_rate": 8.656749804272453e-06, "loss": 1.18066635, "memory(GiB)": 142.32, "step": 7740, "train_speed(iter/s)": 0.308527 }, { "acc": 0.7030766, "epoch": 0.08679519103190926, "grad_norm": 6.1875, "learning_rate": 8.679118666815794e-06, "loss": 1.21223183, "memory(GiB)": 142.32, "step": 7760, "train_speed(iter/s)": 0.308749 }, { "acc": 0.70786457, "epoch": 0.08701888997786779, "grad_norm": 5.9375, "learning_rate": 8.701487529359133e-06, "loss": 1.18593588, "memory(GiB)": 142.32, "step": 7780, "train_speed(iter/s)": 0.308972 }, { "acc": 0.70131688, "epoch": 0.08724258892382632, "grad_norm": 5.5, "learning_rate": 8.723856391902472e-06, "loss": 1.21976871, "memory(GiB)": 142.32, "step": 7800, "train_speed(iter/s)": 0.309224 }, { "acc": 0.71066566, "epoch": 0.08746628786978485, "grad_norm": 4.875, "learning_rate": 8.746225254445812e-06, "loss": 1.16793079, "memory(GiB)": 142.32, "step": 7820, "train_speed(iter/s)": 0.309464 }, { "acc": 0.7136025, "epoch": 0.08768998681574337, "grad_norm": 5.65625, "learning_rate": 8.768594116989151e-06, "loss": 1.14699669, "memory(GiB)": 142.32, "step": 7840, "train_speed(iter/s)": 0.309703 }, { "acc": 0.70792227, "epoch": 0.0879136857617019, "grad_norm": 6.15625, "learning_rate": 8.79096297953249e-06, "loss": 1.18644972, "memory(GiB)": 142.32, "step": 7860, "train_speed(iter/s)": 0.309943 }, { "acc": 0.70111294, "epoch": 0.08813738470766043, "grad_norm": 5.8125, "learning_rate": 8.813331842075832e-06, "loss": 1.21792564, "memory(GiB)": 142.32, "step": 7880, "train_speed(iter/s)": 0.31014 }, { "acc": 0.70191259, "epoch": 0.08836108365361896, "grad_norm": 5.125, "learning_rate": 8.835700704619171e-06, "loss": 1.21113443, "memory(GiB)": 142.32, "step": 7900, "train_speed(iter/s)": 0.310341 }, { "acc": 0.70354929, "epoch": 0.08858478259957749, "grad_norm": 6.21875, "learning_rate": 8.85806956716251e-06, "loss": 1.19375401, "memory(GiB)": 142.32, "step": 7920, "train_speed(iter/s)": 0.310604 }, { "acc": 0.69502401, "epoch": 0.08880848154553601, "grad_norm": 6.1875, "learning_rate": 8.88043842970585e-06, "loss": 1.23271923, "memory(GiB)": 142.32, "step": 7940, "train_speed(iter/s)": 0.31082 }, { "acc": 0.70778942, "epoch": 0.08903218049149454, "grad_norm": 7.09375, "learning_rate": 8.90280729224919e-06, "loss": 1.18383989, "memory(GiB)": 142.32, "step": 7960, "train_speed(iter/s)": 0.311047 }, { "acc": 0.70043745, "epoch": 0.08925587943745307, "grad_norm": 5.59375, "learning_rate": 8.92517615479253e-06, "loss": 1.21570988, "memory(GiB)": 142.32, "step": 7980, "train_speed(iter/s)": 0.311265 }, { "acc": 0.70898509, "epoch": 0.0894795783834116, "grad_norm": 5.34375, "learning_rate": 8.94754501733587e-06, "loss": 1.18427744, "memory(GiB)": 142.32, "step": 8000, "train_speed(iter/s)": 0.311513 }, { "epoch": 0.0894795783834116, "eval_acc": 0.6667631895172438, "eval_loss": 1.191528081893921, "eval_runtime": 2343.7697, "eval_samples_per_second": 32.12, "eval_steps_per_second": 16.06, "step": 8000 }, { "acc": 0.70469398, "epoch": 0.08970327732937014, "grad_norm": 5.84375, "learning_rate": 8.969913879879209e-06, "loss": 1.20549355, "memory(GiB)": 142.32, "step": 8020, "train_speed(iter/s)": 0.285223 }, { "acc": 0.71001911, "epoch": 0.08992697627532867, "grad_norm": 5.6875, "learning_rate": 8.99228274242255e-06, "loss": 1.16869812, "memory(GiB)": 142.32, "step": 8040, "train_speed(iter/s)": 0.285467 }, { "acc": 0.70526004, "epoch": 0.0901506752212872, "grad_norm": 7.3125, "learning_rate": 9.014651604965889e-06, "loss": 1.19361982, "memory(GiB)": 142.32, "step": 8060, "train_speed(iter/s)": 0.285684 }, { "acc": 0.70351586, "epoch": 0.09037437416724572, "grad_norm": 6.75, "learning_rate": 9.037020467509228e-06, "loss": 1.20174494, "memory(GiB)": 142.32, "step": 8080, "train_speed(iter/s)": 0.285922 }, { "acc": 0.69399147, "epoch": 0.09059807311320425, "grad_norm": 5.90625, "learning_rate": 9.059389330052568e-06, "loss": 1.24701157, "memory(GiB)": 142.32, "step": 8100, "train_speed(iter/s)": 0.286155 }, { "acc": 0.70178161, "epoch": 0.09082177205916278, "grad_norm": 6.125, "learning_rate": 9.081758192595907e-06, "loss": 1.21181173, "memory(GiB)": 142.32, "step": 8120, "train_speed(iter/s)": 0.28638 }, { "acc": 0.71155872, "epoch": 0.09104547100512131, "grad_norm": 7.125, "learning_rate": 9.104127055139246e-06, "loss": 1.15877962, "memory(GiB)": 142.32, "step": 8140, "train_speed(iter/s)": 0.286627 }, { "acc": 0.69953661, "epoch": 0.09126916995107984, "grad_norm": 6.03125, "learning_rate": 9.126495917682586e-06, "loss": 1.2361331, "memory(GiB)": 142.32, "step": 8160, "train_speed(iter/s)": 0.286848 }, { "acc": 0.70068483, "epoch": 0.09149286889703837, "grad_norm": 6.46875, "learning_rate": 9.148864780225927e-06, "loss": 1.21147022, "memory(GiB)": 142.32, "step": 8180, "train_speed(iter/s)": 0.287089 }, { "acc": 0.70386438, "epoch": 0.0917165678429969, "grad_norm": 6.875, "learning_rate": 9.171233642769266e-06, "loss": 1.19344063, "memory(GiB)": 142.32, "step": 8200, "train_speed(iter/s)": 0.287325 }, { "acc": 0.70187149, "epoch": 0.09194026678895542, "grad_norm": 5.875, "learning_rate": 9.193602505312606e-06, "loss": 1.21319695, "memory(GiB)": 142.32, "step": 8220, "train_speed(iter/s)": 0.28756 }, { "acc": 0.71527872, "epoch": 0.09216396573491395, "grad_norm": 6.78125, "learning_rate": 9.215971367855945e-06, "loss": 1.14087744, "memory(GiB)": 142.32, "step": 8240, "train_speed(iter/s)": 0.287827 }, { "acc": 0.69827766, "epoch": 0.09238766468087248, "grad_norm": 5.125, "learning_rate": 9.238340230399286e-06, "loss": 1.214114, "memory(GiB)": 142.32, "step": 8260, "train_speed(iter/s)": 0.288044 }, { "acc": 0.70237141, "epoch": 0.092611363626831, "grad_norm": 5.09375, "learning_rate": 9.260709092942625e-06, "loss": 1.20425568, "memory(GiB)": 142.32, "step": 8280, "train_speed(iter/s)": 0.288252 }, { "acc": 0.7216845, "epoch": 0.09283506257278953, "grad_norm": 6.0625, "learning_rate": 9.283077955485965e-06, "loss": 1.12205677, "memory(GiB)": 142.32, "step": 8300, "train_speed(iter/s)": 0.288481 }, { "acc": 0.71330023, "epoch": 0.09305876151874806, "grad_norm": 5.8125, "learning_rate": 9.305446818029304e-06, "loss": 1.14735966, "memory(GiB)": 142.32, "step": 8320, "train_speed(iter/s)": 0.288746 }, { "acc": 0.7088428, "epoch": 0.0932824604647066, "grad_norm": 4.78125, "learning_rate": 9.327815680572643e-06, "loss": 1.18303204, "memory(GiB)": 142.32, "step": 8340, "train_speed(iter/s)": 0.288969 }, { "acc": 0.71093783, "epoch": 0.09350615941066513, "grad_norm": 5.46875, "learning_rate": 9.350184543115983e-06, "loss": 1.17008724, "memory(GiB)": 142.32, "step": 8360, "train_speed(iter/s)": 0.289217 }, { "acc": 0.71493673, "epoch": 0.09372985835662366, "grad_norm": 6.0, "learning_rate": 9.372553405659322e-06, "loss": 1.15253983, "memory(GiB)": 142.32, "step": 8380, "train_speed(iter/s)": 0.289452 }, { "acc": 0.71025352, "epoch": 0.09395355730258219, "grad_norm": 6.0, "learning_rate": 9.394922268202663e-06, "loss": 1.19499626, "memory(GiB)": 142.32, "step": 8400, "train_speed(iter/s)": 0.289659 }, { "acc": 0.71296291, "epoch": 0.09417725624854072, "grad_norm": 4.9375, "learning_rate": 9.417291130746002e-06, "loss": 1.16805983, "memory(GiB)": 142.32, "step": 8420, "train_speed(iter/s)": 0.289885 }, { "acc": 0.70052986, "epoch": 0.09440095519449924, "grad_norm": 4.90625, "learning_rate": 9.439659993289342e-06, "loss": 1.21239796, "memory(GiB)": 142.32, "step": 8440, "train_speed(iter/s)": 0.290114 }, { "acc": 0.71385355, "epoch": 0.09462465414045777, "grad_norm": 5.625, "learning_rate": 9.462028855832681e-06, "loss": 1.14698524, "memory(GiB)": 142.32, "step": 8460, "train_speed(iter/s)": 0.290332 }, { "acc": 0.71181912, "epoch": 0.0948483530864163, "grad_norm": 5.75, "learning_rate": 9.484397718376022e-06, "loss": 1.16996365, "memory(GiB)": 142.32, "step": 8480, "train_speed(iter/s)": 0.290575 }, { "acc": 0.70321679, "epoch": 0.09507205203237483, "grad_norm": 7.09375, "learning_rate": 9.506766580919362e-06, "loss": 1.19769993, "memory(GiB)": 142.32, "step": 8500, "train_speed(iter/s)": 0.290793 }, { "acc": 0.70928068, "epoch": 0.09529575097833336, "grad_norm": 5.84375, "learning_rate": 9.529135443462701e-06, "loss": 1.16647758, "memory(GiB)": 142.32, "step": 8520, "train_speed(iter/s)": 0.291005 }, { "acc": 0.70690794, "epoch": 0.09551944992429189, "grad_norm": 5.28125, "learning_rate": 9.55150430600604e-06, "loss": 1.20514889, "memory(GiB)": 142.32, "step": 8540, "train_speed(iter/s)": 0.291223 }, { "acc": 0.70903845, "epoch": 0.09574314887025041, "grad_norm": 5.65625, "learning_rate": 9.57387316854938e-06, "loss": 1.1656683, "memory(GiB)": 142.32, "step": 8560, "train_speed(iter/s)": 0.291436 }, { "acc": 0.70325193, "epoch": 0.09596684781620894, "grad_norm": 6.4375, "learning_rate": 9.596242031092719e-06, "loss": 1.18919754, "memory(GiB)": 142.32, "step": 8580, "train_speed(iter/s)": 0.291651 }, { "acc": 0.69356346, "epoch": 0.09619054676216747, "grad_norm": 6.0, "learning_rate": 9.618610893636058e-06, "loss": 1.24482841, "memory(GiB)": 142.32, "step": 8600, "train_speed(iter/s)": 0.291877 }, { "acc": 0.71222267, "epoch": 0.096414245708126, "grad_norm": 5.8125, "learning_rate": 9.6409797561794e-06, "loss": 1.16239853, "memory(GiB)": 142.32, "step": 8620, "train_speed(iter/s)": 0.292084 }, { "acc": 0.71086655, "epoch": 0.09663794465408454, "grad_norm": 6.3125, "learning_rate": 9.663348618722739e-06, "loss": 1.15147018, "memory(GiB)": 142.32, "step": 8640, "train_speed(iter/s)": 0.292326 }, { "acc": 0.70530176, "epoch": 0.09686164360004307, "grad_norm": 5.125, "learning_rate": 9.685717481266078e-06, "loss": 1.18919411, "memory(GiB)": 142.32, "step": 8660, "train_speed(iter/s)": 0.292562 }, { "acc": 0.69456134, "epoch": 0.0970853425460016, "grad_norm": 6.75, "learning_rate": 9.708086343809419e-06, "loss": 1.24686356, "memory(GiB)": 142.32, "step": 8680, "train_speed(iter/s)": 0.292786 }, { "acc": 0.70547423, "epoch": 0.09730904149196012, "grad_norm": 6.21875, "learning_rate": 9.730455206352758e-06, "loss": 1.19113703, "memory(GiB)": 142.32, "step": 8700, "train_speed(iter/s)": 0.293013 }, { "acc": 0.70404553, "epoch": 0.09753274043791865, "grad_norm": 5.625, "learning_rate": 9.752824068896098e-06, "loss": 1.19966602, "memory(GiB)": 142.32, "step": 8720, "train_speed(iter/s)": 0.293232 }, { "acc": 0.71537132, "epoch": 0.09775643938387718, "grad_norm": 5.75, "learning_rate": 9.775192931439437e-06, "loss": 1.14655943, "memory(GiB)": 142.32, "step": 8740, "train_speed(iter/s)": 0.293444 }, { "acc": 0.70586367, "epoch": 0.09798013832983571, "grad_norm": 5.84375, "learning_rate": 9.797561793982777e-06, "loss": 1.19643049, "memory(GiB)": 142.32, "step": 8760, "train_speed(iter/s)": 0.293654 }, { "acc": 0.70201631, "epoch": 0.09820383727579424, "grad_norm": 5.40625, "learning_rate": 9.819930656526116e-06, "loss": 1.20359535, "memory(GiB)": 142.32, "step": 8780, "train_speed(iter/s)": 0.293866 }, { "acc": 0.70201378, "epoch": 0.09842753622175276, "grad_norm": 5.71875, "learning_rate": 9.842299519069455e-06, "loss": 1.20647783, "memory(GiB)": 142.32, "step": 8800, "train_speed(iter/s)": 0.294086 }, { "acc": 0.70206518, "epoch": 0.0986512351677113, "grad_norm": 6.0, "learning_rate": 9.864668381612795e-06, "loss": 1.21906643, "memory(GiB)": 142.32, "step": 8820, "train_speed(iter/s)": 0.294308 }, { "acc": 0.71414471, "epoch": 0.09887493411366982, "grad_norm": 6.25, "learning_rate": 9.887037244156136e-06, "loss": 1.14139614, "memory(GiB)": 142.32, "step": 8840, "train_speed(iter/s)": 0.294514 }, { "acc": 0.70055919, "epoch": 0.09909863305962835, "grad_norm": 5.8125, "learning_rate": 9.909406106699475e-06, "loss": 1.19957695, "memory(GiB)": 142.32, "step": 8860, "train_speed(iter/s)": 0.294742 }, { "acc": 0.70771613, "epoch": 0.09932233200558688, "grad_norm": 7.75, "learning_rate": 9.931774969242814e-06, "loss": 1.20092068, "memory(GiB)": 142.32, "step": 8880, "train_speed(iter/s)": 0.294948 }, { "acc": 0.71309385, "epoch": 0.0995460309515454, "grad_norm": 6.0625, "learning_rate": 9.954143831786155e-06, "loss": 1.13813963, "memory(GiB)": 142.32, "step": 8900, "train_speed(iter/s)": 0.295176 }, { "acc": 0.7007772, "epoch": 0.09976972989750393, "grad_norm": 5.46875, "learning_rate": 9.976512694329495e-06, "loss": 1.20535517, "memory(GiB)": 142.32, "step": 8920, "train_speed(iter/s)": 0.295399 }, { "acc": 0.70415955, "epoch": 0.09999342884346246, "grad_norm": 5.53125, "learning_rate": 9.998881556872834e-06, "loss": 1.17877731, "memory(GiB)": 142.32, "step": 8940, "train_speed(iter/s)": 0.295616 }, { "acc": 0.70797024, "epoch": 0.100217127789421, "grad_norm": 6.25, "learning_rate": 9.999999691312751e-06, "loss": 1.17824936, "memory(GiB)": 142.32, "step": 8960, "train_speed(iter/s)": 0.295811 }, { "acc": 0.69366193, "epoch": 0.10044082673537953, "grad_norm": 5.65625, "learning_rate": 9.999998699409167e-06, "loss": 1.24867897, "memory(GiB)": 142.32, "step": 8980, "train_speed(iter/s)": 0.29602 }, { "acc": 0.71409893, "epoch": 0.10066452568133806, "grad_norm": 6.15625, "learning_rate": 9.999997023434294e-06, "loss": 1.14566879, "memory(GiB)": 142.32, "step": 9000, "train_speed(iter/s)": 0.296226 }, { "acc": 0.70197639, "epoch": 0.10088822462729659, "grad_norm": 6.65625, "learning_rate": 9.999994663388362e-06, "loss": 1.20212345, "memory(GiB)": 142.32, "step": 9020, "train_speed(iter/s)": 0.296408 }, { "acc": 0.71278009, "epoch": 0.10111192357325512, "grad_norm": 5.0625, "learning_rate": 9.999991619271693e-06, "loss": 1.15053425, "memory(GiB)": 142.32, "step": 9040, "train_speed(iter/s)": 0.296637 }, { "acc": 0.7084362, "epoch": 0.10133562251921364, "grad_norm": 6.28125, "learning_rate": 9.999987891084703e-06, "loss": 1.18163815, "memory(GiB)": 142.32, "step": 9060, "train_speed(iter/s)": 0.296857 }, { "acc": 0.70897808, "epoch": 0.10155932146517217, "grad_norm": 4.8125, "learning_rate": 9.999983478827906e-06, "loss": 1.16985226, "memory(GiB)": 142.32, "step": 9080, "train_speed(iter/s)": 0.297029 }, { "acc": 0.71047106, "epoch": 0.1017830204111307, "grad_norm": 7.1875, "learning_rate": 9.999978382501902e-06, "loss": 1.16759863, "memory(GiB)": 142.32, "step": 9100, "train_speed(iter/s)": 0.297239 }, { "acc": 0.71045151, "epoch": 0.10200671935708923, "grad_norm": 7.15625, "learning_rate": 9.999972602107388e-06, "loss": 1.16359386, "memory(GiB)": 142.32, "step": 9120, "train_speed(iter/s)": 0.29742 }, { "acc": 0.70980272, "epoch": 0.10223041830304776, "grad_norm": 6.875, "learning_rate": 9.999966137645157e-06, "loss": 1.16925392, "memory(GiB)": 142.32, "step": 9140, "train_speed(iter/s)": 0.297634 }, { "acc": 0.7055994, "epoch": 0.10245411724900629, "grad_norm": 7.15625, "learning_rate": 9.999958989116093e-06, "loss": 1.20327549, "memory(GiB)": 142.32, "step": 9160, "train_speed(iter/s)": 0.297807 }, { "acc": 0.71045752, "epoch": 0.10267781619496481, "grad_norm": 5.625, "learning_rate": 9.999951156521172e-06, "loss": 1.16616516, "memory(GiB)": 142.32, "step": 9180, "train_speed(iter/s)": 0.29802 }, { "acc": 0.71230145, "epoch": 0.10290151514092334, "grad_norm": 6.03125, "learning_rate": 9.999942639861467e-06, "loss": 1.15437155, "memory(GiB)": 142.32, "step": 9200, "train_speed(iter/s)": 0.298237 }, { "acc": 0.69764285, "epoch": 0.10312521408688187, "grad_norm": 5.90625, "learning_rate": 9.999933439138144e-06, "loss": 1.24267941, "memory(GiB)": 142.32, "step": 9220, "train_speed(iter/s)": 0.298458 }, { "acc": 0.71147757, "epoch": 0.1033489130328404, "grad_norm": 6.46875, "learning_rate": 9.999923554352461e-06, "loss": 1.14996052, "memory(GiB)": 142.32, "step": 9240, "train_speed(iter/s)": 0.298669 }, { "acc": 0.70743532, "epoch": 0.10357261197879893, "grad_norm": 6.8125, "learning_rate": 9.999912985505772e-06, "loss": 1.19310465, "memory(GiB)": 142.32, "step": 9260, "train_speed(iter/s)": 0.298858 }, { "acc": 0.70215344, "epoch": 0.10379631092475747, "grad_norm": 6.46875, "learning_rate": 9.999901732599518e-06, "loss": 1.19191093, "memory(GiB)": 142.32, "step": 9280, "train_speed(iter/s)": 0.29905 }, { "acc": 0.70948482, "epoch": 0.104020009870716, "grad_norm": 5.625, "learning_rate": 9.999889795635243e-06, "loss": 1.15949383, "memory(GiB)": 142.32, "step": 9300, "train_speed(iter/s)": 0.299236 }, { "acc": 0.71542883, "epoch": 0.10424370881667452, "grad_norm": 5.1875, "learning_rate": 9.99987717461458e-06, "loss": 1.15301437, "memory(GiB)": 142.32, "step": 9320, "train_speed(iter/s)": 0.299441 }, { "acc": 0.72194462, "epoch": 0.10446740776263305, "grad_norm": 6.40625, "learning_rate": 9.999863869539254e-06, "loss": 1.11825218, "memory(GiB)": 142.32, "step": 9340, "train_speed(iter/s)": 0.29964 }, { "acc": 0.70734105, "epoch": 0.10469110670859158, "grad_norm": 5.34375, "learning_rate": 9.999849880411086e-06, "loss": 1.17742844, "memory(GiB)": 142.32, "step": 9360, "train_speed(iter/s)": 0.299858 }, { "acc": 0.7210681, "epoch": 0.10491480565455011, "grad_norm": 5.875, "learning_rate": 9.99983520723199e-06, "loss": 1.11002407, "memory(GiB)": 142.32, "step": 9380, "train_speed(iter/s)": 0.300071 }, { "acc": 0.70272779, "epoch": 0.10513850460050864, "grad_norm": 5.3125, "learning_rate": 9.999819850003975e-06, "loss": 1.20182734, "memory(GiB)": 142.32, "step": 9400, "train_speed(iter/s)": 0.300284 }, { "acc": 0.72117777, "epoch": 0.10536220354646716, "grad_norm": 6.53125, "learning_rate": 9.99980380872914e-06, "loss": 1.10799618, "memory(GiB)": 142.32, "step": 9420, "train_speed(iter/s)": 0.300478 }, { "acc": 0.70815034, "epoch": 0.10558590249242569, "grad_norm": 7.1875, "learning_rate": 9.999787083409679e-06, "loss": 1.16688719, "memory(GiB)": 142.32, "step": 9440, "train_speed(iter/s)": 0.300678 }, { "acc": 0.70995636, "epoch": 0.10580960143838422, "grad_norm": 6.75, "learning_rate": 9.999769674047883e-06, "loss": 1.16126041, "memory(GiB)": 142.32, "step": 9460, "train_speed(iter/s)": 0.300875 }, { "acc": 0.71229248, "epoch": 0.10603330038434275, "grad_norm": 5.65625, "learning_rate": 9.999751580646132e-06, "loss": 1.15439053, "memory(GiB)": 142.32, "step": 9480, "train_speed(iter/s)": 0.301049 }, { "acc": 0.72020168, "epoch": 0.10625699933030128, "grad_norm": 5.875, "learning_rate": 9.999732803206901e-06, "loss": 1.12213268, "memory(GiB)": 142.32, "step": 9500, "train_speed(iter/s)": 0.301251 }, { "acc": 0.71192179, "epoch": 0.1064806982762598, "grad_norm": 5.4375, "learning_rate": 9.999713341732762e-06, "loss": 1.14813652, "memory(GiB)": 142.32, "step": 9520, "train_speed(iter/s)": 0.301425 }, { "acc": 0.71063423, "epoch": 0.10670439722221833, "grad_norm": 6.0625, "learning_rate": 9.999693196226373e-06, "loss": 1.16679955, "memory(GiB)": 142.32, "step": 9540, "train_speed(iter/s)": 0.301608 }, { "acc": 0.71690354, "epoch": 0.10692809616817686, "grad_norm": 7.25, "learning_rate": 9.999672366690494e-06, "loss": 1.13806171, "memory(GiB)": 142.32, "step": 9560, "train_speed(iter/s)": 0.301791 }, { "acc": 0.71299248, "epoch": 0.10715179511413539, "grad_norm": 6.15625, "learning_rate": 9.999650853127973e-06, "loss": 1.16410913, "memory(GiB)": 142.32, "step": 9580, "train_speed(iter/s)": 0.301995 }, { "acc": 0.70581932, "epoch": 0.10737549406009393, "grad_norm": 6.09375, "learning_rate": 9.999628655541754e-06, "loss": 1.18561125, "memory(GiB)": 142.32, "step": 9600, "train_speed(iter/s)": 0.302186 }, { "acc": 0.71731071, "epoch": 0.10759919300605246, "grad_norm": 5.5, "learning_rate": 9.999605773934873e-06, "loss": 1.15674667, "memory(GiB)": 142.32, "step": 9620, "train_speed(iter/s)": 0.302363 }, { "acc": 0.71855536, "epoch": 0.10782289195201099, "grad_norm": 7.0625, "learning_rate": 9.999582208310463e-06, "loss": 1.12177916, "memory(GiB)": 142.32, "step": 9640, "train_speed(iter/s)": 0.302547 }, { "acc": 0.70215049, "epoch": 0.10804659089796952, "grad_norm": 5.28125, "learning_rate": 9.999557958671746e-06, "loss": 1.19497652, "memory(GiB)": 142.32, "step": 9660, "train_speed(iter/s)": 0.302745 }, { "acc": 0.70828161, "epoch": 0.10827028984392804, "grad_norm": 5.125, "learning_rate": 9.99953302502204e-06, "loss": 1.1906744, "memory(GiB)": 142.32, "step": 9680, "train_speed(iter/s)": 0.302929 }, { "acc": 0.71179972, "epoch": 0.10849398878988657, "grad_norm": 7.03125, "learning_rate": 9.999507407364755e-06, "loss": 1.1526104, "memory(GiB)": 142.32, "step": 9700, "train_speed(iter/s)": 0.303122 }, { "acc": 0.70420051, "epoch": 0.1087176877358451, "grad_norm": 4.625, "learning_rate": 9.999481105703397e-06, "loss": 1.17275791, "memory(GiB)": 142.32, "step": 9720, "train_speed(iter/s)": 0.303304 }, { "acc": 0.71142197, "epoch": 0.10894138668180363, "grad_norm": 5.96875, "learning_rate": 9.999454120041567e-06, "loss": 1.14759254, "memory(GiB)": 142.32, "step": 9740, "train_speed(iter/s)": 0.303464 }, { "acc": 0.71209588, "epoch": 0.10916508562776216, "grad_norm": 6.15625, "learning_rate": 9.999426450382953e-06, "loss": 1.14452791, "memory(GiB)": 142.32, "step": 9760, "train_speed(iter/s)": 0.303646 }, { "acc": 0.71273336, "epoch": 0.10938878457372068, "grad_norm": 5.375, "learning_rate": 9.999398096731343e-06, "loss": 1.15295954, "memory(GiB)": 142.32, "step": 9780, "train_speed(iter/s)": 0.30384 }, { "acc": 0.71162853, "epoch": 0.10961248351967921, "grad_norm": 5.40625, "learning_rate": 9.999369059090616e-06, "loss": 1.15734329, "memory(GiB)": 142.32, "step": 9800, "train_speed(iter/s)": 0.304043 }, { "acc": 0.70742965, "epoch": 0.10983618246563774, "grad_norm": 6.21875, "learning_rate": 9.999339337464744e-06, "loss": 1.18014708, "memory(GiB)": 142.32, "step": 9820, "train_speed(iter/s)": 0.30422 }, { "acc": 0.70291424, "epoch": 0.11005988141159627, "grad_norm": 5.0625, "learning_rate": 9.999308931857794e-06, "loss": 1.18383598, "memory(GiB)": 142.32, "step": 9840, "train_speed(iter/s)": 0.304416 }, { "acc": 0.7010704, "epoch": 0.1102835803575548, "grad_norm": 6.46875, "learning_rate": 9.999277842273925e-06, "loss": 1.20101662, "memory(GiB)": 142.32, "step": 9860, "train_speed(iter/s)": 0.304582 }, { "acc": 0.71405029, "epoch": 0.11050727930351333, "grad_norm": 5.71875, "learning_rate": 9.99924606871739e-06, "loss": 1.15028419, "memory(GiB)": 142.32, "step": 9880, "train_speed(iter/s)": 0.304771 }, { "acc": 0.71888456, "epoch": 0.11073097824947187, "grad_norm": 6.34375, "learning_rate": 9.999213611192537e-06, "loss": 1.12012501, "memory(GiB)": 142.32, "step": 9900, "train_speed(iter/s)": 0.304966 }, { "acc": 0.72172365, "epoch": 0.1109546771954304, "grad_norm": 5.4375, "learning_rate": 9.999180469703809e-06, "loss": 1.11445961, "memory(GiB)": 142.32, "step": 9920, "train_speed(iter/s)": 0.305167 }, { "acc": 0.70755749, "epoch": 0.11117837614138892, "grad_norm": 6.59375, "learning_rate": 9.999146644255738e-06, "loss": 1.17986326, "memory(GiB)": 142.32, "step": 9940, "train_speed(iter/s)": 0.305379 }, { "acc": 0.71004071, "epoch": 0.11140207508734745, "grad_norm": 5.59375, "learning_rate": 9.99911213485295e-06, "loss": 1.16918507, "memory(GiB)": 142.32, "step": 9960, "train_speed(iter/s)": 0.305556 }, { "acc": 0.70993347, "epoch": 0.11162577403330598, "grad_norm": 5.78125, "learning_rate": 9.999076941500167e-06, "loss": 1.17604284, "memory(GiB)": 142.32, "step": 9980, "train_speed(iter/s)": 0.305708 }, { "acc": 0.72883463, "epoch": 0.11184947297926451, "grad_norm": 6.875, "learning_rate": 9.999041064202208e-06, "loss": 1.08475094, "memory(GiB)": 142.32, "step": 10000, "train_speed(iter/s)": 0.305888 }, { "epoch": 0.11184947297926451, "eval_acc": 0.6731126972802838, "eval_loss": 1.1685221195220947, "eval_runtime": 2343.676, "eval_samples_per_second": 32.122, "eval_steps_per_second": 16.061, "step": 10000 }, { "acc": 0.70915222, "epoch": 0.11207317192522304, "grad_norm": 5.5, "learning_rate": 9.999004502963978e-06, "loss": 1.17877159, "memory(GiB)": 142.32, "step": 10020, "train_speed(iter/s)": 0.285189 }, { "acc": 0.70333433, "epoch": 0.11229687087118156, "grad_norm": 6.0625, "learning_rate": 9.99896725779048e-06, "loss": 1.19849586, "memory(GiB)": 142.32, "step": 10040, "train_speed(iter/s)": 0.285379 }, { "acc": 0.70390215, "epoch": 0.11252056981714009, "grad_norm": 5.6875, "learning_rate": 9.998929328686808e-06, "loss": 1.2096199, "memory(GiB)": 142.32, "step": 10060, "train_speed(iter/s)": 0.285597 }, { "acc": 0.71100807, "epoch": 0.11274426876309862, "grad_norm": 6.53125, "learning_rate": 9.998890715658153e-06, "loss": 1.15551949, "memory(GiB)": 142.32, "step": 10080, "train_speed(iter/s)": 0.285795 }, { "acc": 0.70001907, "epoch": 0.11296796770905715, "grad_norm": 5.375, "learning_rate": 9.998851418709798e-06, "loss": 1.20353966, "memory(GiB)": 142.32, "step": 10100, "train_speed(iter/s)": 0.285979 }, { "acc": 0.71475511, "epoch": 0.11319166665501568, "grad_norm": 5.96875, "learning_rate": 9.998811437847117e-06, "loss": 1.13751049, "memory(GiB)": 142.32, "step": 10120, "train_speed(iter/s)": 0.286167 }, { "acc": 0.71315126, "epoch": 0.1134153656009742, "grad_norm": 5.40625, "learning_rate": 9.998770773075586e-06, "loss": 1.15639572, "memory(GiB)": 142.32, "step": 10140, "train_speed(iter/s)": 0.286349 }, { "acc": 0.71882048, "epoch": 0.11363906454693273, "grad_norm": 5.46875, "learning_rate": 9.998729424400761e-06, "loss": 1.111269, "memory(GiB)": 142.32, "step": 10160, "train_speed(iter/s)": 0.286544 }, { "acc": 0.69843073, "epoch": 0.11386276349289126, "grad_norm": 5.5625, "learning_rate": 9.998687391828303e-06, "loss": 1.22144012, "memory(GiB)": 142.32, "step": 10180, "train_speed(iter/s)": 0.286741 }, { "acc": 0.70821781, "epoch": 0.11408646243884979, "grad_norm": 6.125, "learning_rate": 9.998644675363961e-06, "loss": 1.174646, "memory(GiB)": 142.32, "step": 10200, "train_speed(iter/s)": 0.286938 }, { "acc": 0.70878086, "epoch": 0.11431016138480833, "grad_norm": 4.90625, "learning_rate": 9.998601275013584e-06, "loss": 1.17503853, "memory(GiB)": 142.32, "step": 10220, "train_speed(iter/s)": 0.28714 }, { "acc": 0.70147762, "epoch": 0.11453386033076686, "grad_norm": 6.0, "learning_rate": 9.998557190783104e-06, "loss": 1.20690441, "memory(GiB)": 142.32, "step": 10240, "train_speed(iter/s)": 0.28735 }, { "acc": 0.70597196, "epoch": 0.11475755927672539, "grad_norm": 5.46875, "learning_rate": 9.998512422678555e-06, "loss": 1.2059597, "memory(GiB)": 142.32, "step": 10260, "train_speed(iter/s)": 0.287559 }, { "acc": 0.71346426, "epoch": 0.11498125822268392, "grad_norm": 5.8125, "learning_rate": 9.99846697070606e-06, "loss": 1.1539753, "memory(GiB)": 142.32, "step": 10280, "train_speed(iter/s)": 0.287746 }, { "acc": 0.71055846, "epoch": 0.11520495716864244, "grad_norm": 6.03125, "learning_rate": 9.99842083487184e-06, "loss": 1.18189144, "memory(GiB)": 142.32, "step": 10300, "train_speed(iter/s)": 0.28793 }, { "acc": 0.71613669, "epoch": 0.11542865611460097, "grad_norm": 6.3125, "learning_rate": 9.998374015182205e-06, "loss": 1.1446207, "memory(GiB)": 142.32, "step": 10320, "train_speed(iter/s)": 0.288096 }, { "acc": 0.70924578, "epoch": 0.1156523550605595, "grad_norm": 5.90625, "learning_rate": 9.998326511643562e-06, "loss": 1.18253613, "memory(GiB)": 142.32, "step": 10340, "train_speed(iter/s)": 0.28828 }, { "acc": 0.70885353, "epoch": 0.11587605400651803, "grad_norm": 6.1875, "learning_rate": 9.998278324262408e-06, "loss": 1.16302156, "memory(GiB)": 142.32, "step": 10360, "train_speed(iter/s)": 0.288484 }, { "acc": 0.71889725, "epoch": 0.11609975295247656, "grad_norm": 7.8125, "learning_rate": 9.998229453045341e-06, "loss": 1.11291199, "memory(GiB)": 142.32, "step": 10380, "train_speed(iter/s)": 0.288685 }, { "acc": 0.70227261, "epoch": 0.11632345189843508, "grad_norm": 6.3125, "learning_rate": 9.998179897999041e-06, "loss": 1.1924675, "memory(GiB)": 142.32, "step": 10400, "train_speed(iter/s)": 0.288876 }, { "acc": 0.70917101, "epoch": 0.11654715084439361, "grad_norm": 5.9375, "learning_rate": 9.998129659130292e-06, "loss": 1.1660656, "memory(GiB)": 142.32, "step": 10420, "train_speed(iter/s)": 0.289053 }, { "acc": 0.7272151, "epoch": 0.11677084979035214, "grad_norm": 5.5, "learning_rate": 9.998078736445964e-06, "loss": 1.08752108, "memory(GiB)": 142.32, "step": 10440, "train_speed(iter/s)": 0.289231 }, { "acc": 0.69828677, "epoch": 0.11699454873631067, "grad_norm": 4.25, "learning_rate": 9.998027129953027e-06, "loss": 1.23933144, "memory(GiB)": 142.32, "step": 10460, "train_speed(iter/s)": 0.289423 }, { "acc": 0.70756922, "epoch": 0.1172182476822692, "grad_norm": 5.75, "learning_rate": 9.99797483965854e-06, "loss": 1.17449131, "memory(GiB)": 142.32, "step": 10480, "train_speed(iter/s)": 0.289608 }, { "acc": 0.72240982, "epoch": 0.11744194662822773, "grad_norm": 6.1875, "learning_rate": 9.997921865569657e-06, "loss": 1.10241117, "memory(GiB)": 142.32, "step": 10500, "train_speed(iter/s)": 0.289786 }, { "acc": 0.71082706, "epoch": 0.11766564557418625, "grad_norm": 7.25, "learning_rate": 9.997868207693628e-06, "loss": 1.16438026, "memory(GiB)": 142.32, "step": 10520, "train_speed(iter/s)": 0.289995 }, { "acc": 0.71361642, "epoch": 0.1178893445201448, "grad_norm": 5.8125, "learning_rate": 9.997813866037792e-06, "loss": 1.13963337, "memory(GiB)": 142.32, "step": 10540, "train_speed(iter/s)": 0.290177 }, { "acc": 0.70817747, "epoch": 0.11811304346610332, "grad_norm": 5.375, "learning_rate": 9.99775884060958e-06, "loss": 1.18392315, "memory(GiB)": 142.32, "step": 10560, "train_speed(iter/s)": 0.290358 }, { "acc": 0.70609803, "epoch": 0.11833674241206185, "grad_norm": 5.28125, "learning_rate": 9.997703131416527e-06, "loss": 1.17760248, "memory(GiB)": 142.32, "step": 10580, "train_speed(iter/s)": 0.290533 }, { "acc": 0.70757093, "epoch": 0.11856044135802038, "grad_norm": 5.75, "learning_rate": 9.997646738466254e-06, "loss": 1.16309252, "memory(GiB)": 142.32, "step": 10600, "train_speed(iter/s)": 0.290739 }, { "acc": 0.7035326, "epoch": 0.11878414030397891, "grad_norm": 4.875, "learning_rate": 9.997589661766471e-06, "loss": 1.22159443, "memory(GiB)": 142.32, "step": 10620, "train_speed(iter/s)": 0.290877 }, { "acc": 0.72657003, "epoch": 0.11900783924993744, "grad_norm": 6.34375, "learning_rate": 9.997531901324991e-06, "loss": 1.09458256, "memory(GiB)": 142.32, "step": 10640, "train_speed(iter/s)": 0.291064 }, { "acc": 0.71592703, "epoch": 0.11923153819589596, "grad_norm": 6.90625, "learning_rate": 9.997473457149717e-06, "loss": 1.12592258, "memory(GiB)": 142.32, "step": 10660, "train_speed(iter/s)": 0.291243 }, { "acc": 0.71103067, "epoch": 0.11945523714185449, "grad_norm": 5.84375, "learning_rate": 9.997414329248642e-06, "loss": 1.16763744, "memory(GiB)": 142.32, "step": 10680, "train_speed(iter/s)": 0.291408 }, { "acc": 0.71518064, "epoch": 0.11967893608781302, "grad_norm": 5.53125, "learning_rate": 9.99735451762986e-06, "loss": 1.12845669, "memory(GiB)": 142.32, "step": 10700, "train_speed(iter/s)": 0.291583 }, { "acc": 0.71259875, "epoch": 0.11990263503377155, "grad_norm": 5.53125, "learning_rate": 9.99729402230155e-06, "loss": 1.14884462, "memory(GiB)": 142.32, "step": 10720, "train_speed(iter/s)": 0.291766 }, { "acc": 0.70943604, "epoch": 0.12012633397973008, "grad_norm": 6.21875, "learning_rate": 9.99723284327199e-06, "loss": 1.16200771, "memory(GiB)": 142.32, "step": 10740, "train_speed(iter/s)": 0.291944 }, { "acc": 0.71629653, "epoch": 0.1203500329256886, "grad_norm": 5.6875, "learning_rate": 9.997170980549547e-06, "loss": 1.12600689, "memory(GiB)": 142.32, "step": 10760, "train_speed(iter/s)": 0.292143 }, { "acc": 0.7163166, "epoch": 0.12057373187164713, "grad_norm": 6.90625, "learning_rate": 9.99710843414269e-06, "loss": 1.13362751, "memory(GiB)": 142.32, "step": 10780, "train_speed(iter/s)": 0.292341 }, { "acc": 0.73084569, "epoch": 0.12079743081760566, "grad_norm": 5.375, "learning_rate": 9.997045204059977e-06, "loss": 1.0725668, "memory(GiB)": 142.32, "step": 10800, "train_speed(iter/s)": 0.292542 }, { "acc": 0.70180693, "epoch": 0.12102112976356419, "grad_norm": 6.0, "learning_rate": 9.996981290310052e-06, "loss": 1.2090353, "memory(GiB)": 142.32, "step": 10820, "train_speed(iter/s)": 0.292727 }, { "acc": 0.70386992, "epoch": 0.12124482870952273, "grad_norm": 5.9375, "learning_rate": 9.996916692901665e-06, "loss": 1.20016098, "memory(GiB)": 142.32, "step": 10840, "train_speed(iter/s)": 0.292895 }, { "acc": 0.7079052, "epoch": 0.12146852765548126, "grad_norm": 5.78125, "learning_rate": 9.996851411843652e-06, "loss": 1.17706738, "memory(GiB)": 142.32, "step": 10860, "train_speed(iter/s)": 0.293079 }, { "acc": 0.71555877, "epoch": 0.12169222660143979, "grad_norm": 6.40625, "learning_rate": 9.996785447144943e-06, "loss": 1.14053001, "memory(GiB)": 142.32, "step": 10880, "train_speed(iter/s)": 0.293245 }, { "acc": 0.70772171, "epoch": 0.12191592554739832, "grad_norm": 5.0625, "learning_rate": 9.996718798814565e-06, "loss": 1.16763115, "memory(GiB)": 142.32, "step": 10900, "train_speed(iter/s)": 0.29344 }, { "acc": 0.71064229, "epoch": 0.12213962449335684, "grad_norm": 5.90625, "learning_rate": 9.996651466861636e-06, "loss": 1.16997318, "memory(GiB)": 142.32, "step": 10920, "train_speed(iter/s)": 0.293588 }, { "acc": 0.70436888, "epoch": 0.12236332343931537, "grad_norm": 5.8125, "learning_rate": 9.996583451295368e-06, "loss": 1.18395071, "memory(GiB)": 142.32, "step": 10940, "train_speed(iter/s)": 0.293755 }, { "acc": 0.70136948, "epoch": 0.1225870223852739, "grad_norm": 6.40625, "learning_rate": 9.996514752125065e-06, "loss": 1.20225773, "memory(GiB)": 142.32, "step": 10960, "train_speed(iter/s)": 0.293931 }, { "acc": 0.71327, "epoch": 0.12281072133123243, "grad_norm": 6.9375, "learning_rate": 9.996445369360129e-06, "loss": 1.16333237, "memory(GiB)": 142.32, "step": 10980, "train_speed(iter/s)": 0.294078 }, { "acc": 0.70659051, "epoch": 0.12303442027719096, "grad_norm": 5.71875, "learning_rate": 9.996375303010051e-06, "loss": 1.17228718, "memory(GiB)": 142.32, "step": 11000, "train_speed(iter/s)": 0.29424 }, { "acc": 0.70657272, "epoch": 0.12325811922314948, "grad_norm": 5.1875, "learning_rate": 9.996304553084416e-06, "loss": 1.17294693, "memory(GiB)": 142.32, "step": 11020, "train_speed(iter/s)": 0.294417 }, { "acc": 0.716817, "epoch": 0.12348181816910801, "grad_norm": 6.1875, "learning_rate": 9.996233119592905e-06, "loss": 1.14722004, "memory(GiB)": 142.32, "step": 11040, "train_speed(iter/s)": 0.294596 }, { "acc": 0.69020462, "epoch": 0.12370551711506654, "grad_norm": 5.90625, "learning_rate": 9.996161002545288e-06, "loss": 1.25860233, "memory(GiB)": 142.32, "step": 11060, "train_speed(iter/s)": 0.294761 }, { "acc": 0.71189661, "epoch": 0.12392921606102507, "grad_norm": 5.65625, "learning_rate": 9.996088201951438e-06, "loss": 1.17361965, "memory(GiB)": 142.32, "step": 11080, "train_speed(iter/s)": 0.294929 }, { "acc": 0.71497545, "epoch": 0.1241529150069836, "grad_norm": 5.625, "learning_rate": 9.996014717821309e-06, "loss": 1.13573513, "memory(GiB)": 142.32, "step": 11100, "train_speed(iter/s)": 0.295097 }, { "acc": 0.70816555, "epoch": 0.12437661395294212, "grad_norm": 6.3125, "learning_rate": 9.995940550164958e-06, "loss": 1.16391306, "memory(GiB)": 142.32, "step": 11120, "train_speed(iter/s)": 0.295262 }, { "acc": 0.71219511, "epoch": 0.12460031289890065, "grad_norm": 6.46875, "learning_rate": 9.995865698992531e-06, "loss": 1.15258369, "memory(GiB)": 142.32, "step": 11140, "train_speed(iter/s)": 0.295409 }, { "acc": 0.71000109, "epoch": 0.1248240118448592, "grad_norm": 5.4375, "learning_rate": 9.995790164314269e-06, "loss": 1.17254839, "memory(GiB)": 142.32, "step": 11160, "train_speed(iter/s)": 0.295567 }, { "acc": 0.71080894, "epoch": 0.12504771079081772, "grad_norm": 6.09375, "learning_rate": 9.995713946140507e-06, "loss": 1.15928612, "memory(GiB)": 142.32, "step": 11180, "train_speed(iter/s)": 0.295739 }, { "acc": 0.71162839, "epoch": 0.12527140973677625, "grad_norm": 6.4375, "learning_rate": 9.99563704448167e-06, "loss": 1.153965, "memory(GiB)": 142.32, "step": 11200, "train_speed(iter/s)": 0.295894 }, { "acc": 0.7139822, "epoch": 0.12549510868273478, "grad_norm": 5.71875, "learning_rate": 9.995559459348282e-06, "loss": 1.14436455, "memory(GiB)": 142.32, "step": 11220, "train_speed(iter/s)": 0.296071 }, { "acc": 0.71490102, "epoch": 0.1257188076286933, "grad_norm": 6.28125, "learning_rate": 9.995481190750958e-06, "loss": 1.15908165, "memory(GiB)": 142.32, "step": 11240, "train_speed(iter/s)": 0.296217 }, { "acc": 0.70626278, "epoch": 0.12594250657465184, "grad_norm": 5.78125, "learning_rate": 9.995402238700406e-06, "loss": 1.18055305, "memory(GiB)": 142.32, "step": 11260, "train_speed(iter/s)": 0.296365 }, { "acc": 0.71751986, "epoch": 0.12616620552061036, "grad_norm": 5.625, "learning_rate": 9.995322603207425e-06, "loss": 1.14661522, "memory(GiB)": 142.32, "step": 11280, "train_speed(iter/s)": 0.296536 }, { "acc": 0.70193458, "epoch": 0.1263899044665689, "grad_norm": 5.40625, "learning_rate": 9.995242284282912e-06, "loss": 1.20543251, "memory(GiB)": 142.32, "step": 11300, "train_speed(iter/s)": 0.296686 }, { "acc": 0.71597652, "epoch": 0.12661360341252742, "grad_norm": 5.625, "learning_rate": 9.995161281937858e-06, "loss": 1.1449235, "memory(GiB)": 142.32, "step": 11320, "train_speed(iter/s)": 0.29685 }, { "acc": 0.72535419, "epoch": 0.12683730235848595, "grad_norm": 5.25, "learning_rate": 9.995079596183343e-06, "loss": 1.08829994, "memory(GiB)": 142.32, "step": 11340, "train_speed(iter/s)": 0.296997 }, { "acc": 0.71171288, "epoch": 0.12706100130444448, "grad_norm": 5.0, "learning_rate": 9.994997227030543e-06, "loss": 1.15628796, "memory(GiB)": 142.32, "step": 11360, "train_speed(iter/s)": 0.297155 }, { "acc": 0.7151968, "epoch": 0.127284700250403, "grad_norm": 6.0625, "learning_rate": 9.994914174490727e-06, "loss": 1.134445, "memory(GiB)": 142.32, "step": 11380, "train_speed(iter/s)": 0.297322 }, { "acc": 0.73301516, "epoch": 0.12750839919636153, "grad_norm": 6.0, "learning_rate": 9.994830438575257e-06, "loss": 1.06229324, "memory(GiB)": 142.32, "step": 11400, "train_speed(iter/s)": 0.297501 }, { "acc": 0.70445805, "epoch": 0.12773209814232006, "grad_norm": 6.6875, "learning_rate": 9.994746019295592e-06, "loss": 1.18790331, "memory(GiB)": 142.32, "step": 11420, "train_speed(iter/s)": 0.297666 }, { "acc": 0.71191864, "epoch": 0.1279557970882786, "grad_norm": 5.34375, "learning_rate": 9.994660916663279e-06, "loss": 1.14316969, "memory(GiB)": 142.32, "step": 11440, "train_speed(iter/s)": 0.297826 }, { "acc": 0.71069803, "epoch": 0.12817949603423712, "grad_norm": 5.75, "learning_rate": 9.994575130689963e-06, "loss": 1.15072489, "memory(GiB)": 142.32, "step": 11460, "train_speed(iter/s)": 0.297988 }, { "acc": 0.70617604, "epoch": 0.12840319498019565, "grad_norm": 6.71875, "learning_rate": 9.99448866138738e-06, "loss": 1.19160795, "memory(GiB)": 142.32, "step": 11480, "train_speed(iter/s)": 0.298147 }, { "acc": 0.71220961, "epoch": 0.12862689392615417, "grad_norm": 5.46875, "learning_rate": 9.994401508767361e-06, "loss": 1.1710474, "memory(GiB)": 142.32, "step": 11500, "train_speed(iter/s)": 0.298297 }, { "acc": 0.71920142, "epoch": 0.1288505928721127, "grad_norm": 6.5, "learning_rate": 9.994313672841829e-06, "loss": 1.11884937, "memory(GiB)": 142.32, "step": 11520, "train_speed(iter/s)": 0.29847 }, { "acc": 0.72247925, "epoch": 0.12907429181807123, "grad_norm": 5.875, "learning_rate": 9.994225153622801e-06, "loss": 1.10786057, "memory(GiB)": 142.32, "step": 11540, "train_speed(iter/s)": 0.298633 }, { "acc": 0.70452609, "epoch": 0.12929799076402976, "grad_norm": 6.78125, "learning_rate": 9.994135951122387e-06, "loss": 1.19987297, "memory(GiB)": 142.32, "step": 11560, "train_speed(iter/s)": 0.298809 }, { "acc": 0.71878166, "epoch": 0.12952168970998829, "grad_norm": 5.90625, "learning_rate": 9.994046065352794e-06, "loss": 1.13222942, "memory(GiB)": 142.32, "step": 11580, "train_speed(iter/s)": 0.298973 }, { "acc": 0.71108704, "epoch": 0.12974538865594681, "grad_norm": 5.59375, "learning_rate": 9.993955496326318e-06, "loss": 1.16817856, "memory(GiB)": 142.32, "step": 11600, "train_speed(iter/s)": 0.299157 }, { "acc": 0.71178703, "epoch": 0.12996908760190534, "grad_norm": 5.90625, "learning_rate": 9.99386424405535e-06, "loss": 1.14670925, "memory(GiB)": 142.32, "step": 11620, "train_speed(iter/s)": 0.299331 }, { "acc": 0.71724849, "epoch": 0.1301927865478639, "grad_norm": 5.0625, "learning_rate": 9.993772308552374e-06, "loss": 1.13690195, "memory(GiB)": 142.32, "step": 11640, "train_speed(iter/s)": 0.299495 }, { "acc": 0.70994692, "epoch": 0.13041648549382243, "grad_norm": 4.875, "learning_rate": 9.993679689829968e-06, "loss": 1.163381, "memory(GiB)": 142.32, "step": 11660, "train_speed(iter/s)": 0.299657 }, { "acc": 0.71066589, "epoch": 0.13064018443978095, "grad_norm": 6.59375, "learning_rate": 9.993586387900805e-06, "loss": 1.15636358, "memory(GiB)": 142.32, "step": 11680, "train_speed(iter/s)": 0.299831 }, { "acc": 0.71147575, "epoch": 0.13086388338573948, "grad_norm": 5.46875, "learning_rate": 9.99349240277765e-06, "loss": 1.16272945, "memory(GiB)": 142.32, "step": 11700, "train_speed(iter/s)": 0.299986 }, { "acc": 0.72026205, "epoch": 0.131087582331698, "grad_norm": 5.53125, "learning_rate": 9.99339773447336e-06, "loss": 1.13112535, "memory(GiB)": 142.32, "step": 11720, "train_speed(iter/s)": 0.30015 }, { "acc": 0.70437422, "epoch": 0.13131128127765654, "grad_norm": 5.875, "learning_rate": 9.993302383000887e-06, "loss": 1.18979454, "memory(GiB)": 142.32, "step": 11740, "train_speed(iter/s)": 0.300297 }, { "acc": 0.71116166, "epoch": 0.13153498022361507, "grad_norm": 6.125, "learning_rate": 9.993206348373278e-06, "loss": 1.17355442, "memory(GiB)": 142.32, "step": 11760, "train_speed(iter/s)": 0.300457 }, { "acc": 0.70893636, "epoch": 0.1317586791695736, "grad_norm": 5.4375, "learning_rate": 9.993109630603672e-06, "loss": 1.19028816, "memory(GiB)": 142.32, "step": 11780, "train_speed(iter/s)": 0.3006 }, { "acc": 0.70850053, "epoch": 0.13198237811553212, "grad_norm": 4.71875, "learning_rate": 9.993012229705302e-06, "loss": 1.16364212, "memory(GiB)": 142.32, "step": 11800, "train_speed(iter/s)": 0.300737 }, { "acc": 0.72104521, "epoch": 0.13220607706149065, "grad_norm": 6.21875, "learning_rate": 9.99291414569149e-06, "loss": 1.11724186, "memory(GiB)": 142.32, "step": 11820, "train_speed(iter/s)": 0.300906 }, { "acc": 0.70823512, "epoch": 0.13242977600744918, "grad_norm": 5.53125, "learning_rate": 9.992815378575658e-06, "loss": 1.15897789, "memory(GiB)": 142.32, "step": 11840, "train_speed(iter/s)": 0.30105 }, { "acc": 0.72890282, "epoch": 0.1326534749534077, "grad_norm": 5.1875, "learning_rate": 9.992715928371318e-06, "loss": 1.08113708, "memory(GiB)": 142.32, "step": 11860, "train_speed(iter/s)": 0.301177 }, { "acc": 0.71125765, "epoch": 0.13287717389936624, "grad_norm": 5.71875, "learning_rate": 9.992615795092078e-06, "loss": 1.15672064, "memory(GiB)": 142.32, "step": 11880, "train_speed(iter/s)": 0.301334 }, { "acc": 0.71526723, "epoch": 0.13310087284532476, "grad_norm": 6.71875, "learning_rate": 9.992514978751635e-06, "loss": 1.13245192, "memory(GiB)": 142.32, "step": 11900, "train_speed(iter/s)": 0.301486 }, { "acc": 0.71372962, "epoch": 0.1333245717912833, "grad_norm": 5.625, "learning_rate": 9.992413479363785e-06, "loss": 1.14040928, "memory(GiB)": 142.32, "step": 11920, "train_speed(iter/s)": 0.301617 }, { "acc": 0.7061388, "epoch": 0.13354827073724182, "grad_norm": 5.15625, "learning_rate": 9.992311296942412e-06, "loss": 1.19511681, "memory(GiB)": 142.32, "step": 11940, "train_speed(iter/s)": 0.301766 }, { "acc": 0.71287117, "epoch": 0.13377196968320035, "grad_norm": 5.875, "learning_rate": 9.992208431501495e-06, "loss": 1.16329622, "memory(GiB)": 142.32, "step": 11960, "train_speed(iter/s)": 0.301934 }, { "acc": 0.71295528, "epoch": 0.13399566862915888, "grad_norm": 5.625, "learning_rate": 9.992104883055112e-06, "loss": 1.1497282, "memory(GiB)": 142.32, "step": 11980, "train_speed(iter/s)": 0.302098 }, { "acc": 0.7044631, "epoch": 0.1342193675751174, "grad_norm": 5.96875, "learning_rate": 9.992000651617429e-06, "loss": 1.19712448, "memory(GiB)": 142.32, "step": 12000, "train_speed(iter/s)": 0.302245 }, { "epoch": 0.1342193675751174, "eval_acc": 0.6768945887653317, "eval_loss": 1.1538398265838623, "eval_runtime": 2353.6472, "eval_samples_per_second": 31.986, "eval_steps_per_second": 15.993, "step": 12000 }, { "acc": 0.71759796, "epoch": 0.13444306652107593, "grad_norm": 5.9375, "learning_rate": 9.991895737202701e-06, "loss": 1.14155254, "memory(GiB)": 142.32, "step": 12020, "train_speed(iter/s)": 0.285146 }, { "acc": 0.71182518, "epoch": 0.13466676546703446, "grad_norm": 5.46875, "learning_rate": 9.991790139825288e-06, "loss": 1.16280289, "memory(GiB)": 142.32, "step": 12040, "train_speed(iter/s)": 0.285327 }, { "acc": 0.71534071, "epoch": 0.134890464412993, "grad_norm": 5.75, "learning_rate": 9.991683859499632e-06, "loss": 1.11721725, "memory(GiB)": 142.32, "step": 12060, "train_speed(iter/s)": 0.28549 }, { "acc": 0.69832973, "epoch": 0.13511416335895152, "grad_norm": 5.8125, "learning_rate": 9.99157689624028e-06, "loss": 1.22416458, "memory(GiB)": 142.32, "step": 12080, "train_speed(iter/s)": 0.285652 }, { "acc": 0.71765385, "epoch": 0.13533786230491004, "grad_norm": 6.375, "learning_rate": 9.99146925006186e-06, "loss": 1.13321171, "memory(GiB)": 142.32, "step": 12100, "train_speed(iter/s)": 0.285815 }, { "acc": 0.70767169, "epoch": 0.13556156125086857, "grad_norm": 5.71875, "learning_rate": 9.991360920979103e-06, "loss": 1.16582289, "memory(GiB)": 142.32, "step": 12120, "train_speed(iter/s)": 0.285975 }, { "acc": 0.71115026, "epoch": 0.1357852601968271, "grad_norm": 7.40625, "learning_rate": 9.991251909006829e-06, "loss": 1.17101002, "memory(GiB)": 142.32, "step": 12140, "train_speed(iter/s)": 0.286142 }, { "acc": 0.71408691, "epoch": 0.13600895914278563, "grad_norm": 6.15625, "learning_rate": 9.991142214159953e-06, "loss": 1.14353085, "memory(GiB)": 142.32, "step": 12160, "train_speed(iter/s)": 0.286296 }, { "acc": 0.71992879, "epoch": 0.13623265808874416, "grad_norm": 6.78125, "learning_rate": 9.991031836453482e-06, "loss": 1.11883402, "memory(GiB)": 142.32, "step": 12180, "train_speed(iter/s)": 0.286461 }, { "acc": 0.71969976, "epoch": 0.13645635703470269, "grad_norm": 5.25, "learning_rate": 9.990920775902514e-06, "loss": 1.1376358, "memory(GiB)": 142.32, "step": 12200, "train_speed(iter/s)": 0.286623 }, { "acc": 0.7128624, "epoch": 0.1366800559806612, "grad_norm": 4.625, "learning_rate": 9.990809032522252e-06, "loss": 1.15772839, "memory(GiB)": 142.32, "step": 12220, "train_speed(iter/s)": 0.286775 }, { "acc": 0.73155541, "epoch": 0.13690375492661974, "grad_norm": 5.96875, "learning_rate": 9.990696606327978e-06, "loss": 1.0594986, "memory(GiB)": 142.32, "step": 12240, "train_speed(iter/s)": 0.286942 }, { "acc": 0.70980148, "epoch": 0.1371274538725783, "grad_norm": 4.8125, "learning_rate": 9.990583497335074e-06, "loss": 1.17045078, "memory(GiB)": 142.32, "step": 12260, "train_speed(iter/s)": 0.287113 }, { "acc": 0.7181706, "epoch": 0.13735115281853683, "grad_norm": 6.125, "learning_rate": 9.990469705559016e-06, "loss": 1.13897476, "memory(GiB)": 142.32, "step": 12280, "train_speed(iter/s)": 0.287269 }, { "acc": 0.7229763, "epoch": 0.13757485176449535, "grad_norm": 5.53125, "learning_rate": 9.990355231015372e-06, "loss": 1.10908127, "memory(GiB)": 142.32, "step": 12300, "train_speed(iter/s)": 0.287423 }, { "acc": 0.71920156, "epoch": 0.13779855071045388, "grad_norm": 5.3125, "learning_rate": 9.990240073719804e-06, "loss": 1.1276495, "memory(GiB)": 142.32, "step": 12320, "train_speed(iter/s)": 0.287562 }, { "acc": 0.70707803, "epoch": 0.1380222496564124, "grad_norm": 5.0625, "learning_rate": 9.990124233688066e-06, "loss": 1.17642231, "memory(GiB)": 142.32, "step": 12340, "train_speed(iter/s)": 0.287724 }, { "acc": 0.70453033, "epoch": 0.13824594860237094, "grad_norm": 7.40625, "learning_rate": 9.990007710936006e-06, "loss": 1.1890337, "memory(GiB)": 142.32, "step": 12360, "train_speed(iter/s)": 0.287881 }, { "acc": 0.71915889, "epoch": 0.13846964754832947, "grad_norm": 6.1875, "learning_rate": 9.989890505479571e-06, "loss": 1.12692909, "memory(GiB)": 142.32, "step": 12380, "train_speed(iter/s)": 0.288037 }, { "acc": 0.71553006, "epoch": 0.138693346494288, "grad_norm": 4.625, "learning_rate": 9.989772617334792e-06, "loss": 1.14073334, "memory(GiB)": 142.32, "step": 12400, "train_speed(iter/s)": 0.288197 }, { "acc": 0.70962257, "epoch": 0.13891704544024652, "grad_norm": 5.3125, "learning_rate": 9.989654046517799e-06, "loss": 1.19011669, "memory(GiB)": 142.32, "step": 12420, "train_speed(iter/s)": 0.288333 }, { "acc": 0.71390781, "epoch": 0.13914074438620505, "grad_norm": 6.03125, "learning_rate": 9.989534793044813e-06, "loss": 1.14830742, "memory(GiB)": 142.32, "step": 12440, "train_speed(iter/s)": 0.288478 }, { "acc": 0.7030643, "epoch": 0.13936444333216358, "grad_norm": 5.15625, "learning_rate": 9.98941485693215e-06, "loss": 1.18248711, "memory(GiB)": 142.32, "step": 12460, "train_speed(iter/s)": 0.288628 }, { "acc": 0.707376, "epoch": 0.1395881422781221, "grad_norm": 5.78125, "learning_rate": 9.98929423819622e-06, "loss": 1.16150818, "memory(GiB)": 142.32, "step": 12480, "train_speed(iter/s)": 0.288765 }, { "acc": 0.71577606, "epoch": 0.13981184122408064, "grad_norm": 6.71875, "learning_rate": 9.989172936853525e-06, "loss": 1.1440299, "memory(GiB)": 142.32, "step": 12500, "train_speed(iter/s)": 0.288925 }, { "acc": 0.72450151, "epoch": 0.14003554017003916, "grad_norm": 6.1875, "learning_rate": 9.98905095292066e-06, "loss": 1.09539156, "memory(GiB)": 142.32, "step": 12520, "train_speed(iter/s)": 0.289101 }, { "acc": 0.71327438, "epoch": 0.1402592391159977, "grad_norm": 6.3125, "learning_rate": 9.988928286414315e-06, "loss": 1.15162001, "memory(GiB)": 142.32, "step": 12540, "train_speed(iter/s)": 0.289259 }, { "acc": 0.72054877, "epoch": 0.14048293806195622, "grad_norm": 8.1875, "learning_rate": 9.988804937351272e-06, "loss": 1.11431551, "memory(GiB)": 142.32, "step": 12560, "train_speed(iter/s)": 0.289404 }, { "acc": 0.71539855, "epoch": 0.14070663700791475, "grad_norm": 6.21875, "learning_rate": 9.988680905748407e-06, "loss": 1.13870811, "memory(GiB)": 142.32, "step": 12580, "train_speed(iter/s)": 0.289556 }, { "acc": 0.7190558, "epoch": 0.14093033595387328, "grad_norm": 6.0, "learning_rate": 9.988556191622689e-06, "loss": 1.13464108, "memory(GiB)": 142.32, "step": 12600, "train_speed(iter/s)": 0.289718 }, { "acc": 0.70627184, "epoch": 0.1411540348998318, "grad_norm": 5.5, "learning_rate": 9.988430794991181e-06, "loss": 1.17489376, "memory(GiB)": 142.32, "step": 12620, "train_speed(iter/s)": 0.289879 }, { "acc": 0.71374612, "epoch": 0.14137773384579033, "grad_norm": 6.25, "learning_rate": 9.98830471587104e-06, "loss": 1.14529819, "memory(GiB)": 142.32, "step": 12640, "train_speed(iter/s)": 0.290044 }, { "acc": 0.71513405, "epoch": 0.14160143279174886, "grad_norm": 6.15625, "learning_rate": 9.988177954279515e-06, "loss": 1.15227051, "memory(GiB)": 142.32, "step": 12660, "train_speed(iter/s)": 0.290192 }, { "acc": 0.70807276, "epoch": 0.1418251317377074, "grad_norm": 5.21875, "learning_rate": 9.988050510233948e-06, "loss": 1.1668829, "memory(GiB)": 142.32, "step": 12680, "train_speed(iter/s)": 0.290355 }, { "acc": 0.72523584, "epoch": 0.14204883068366592, "grad_norm": 4.84375, "learning_rate": 9.987922383751777e-06, "loss": 1.08652229, "memory(GiB)": 142.32, "step": 12700, "train_speed(iter/s)": 0.290493 }, { "acc": 0.71016741, "epoch": 0.14227252962962444, "grad_norm": 5.5625, "learning_rate": 9.987793574850526e-06, "loss": 1.15162592, "memory(GiB)": 142.32, "step": 12720, "train_speed(iter/s)": 0.29063 }, { "acc": 0.70899916, "epoch": 0.14249622857558297, "grad_norm": 6.25, "learning_rate": 9.987664083547826e-06, "loss": 1.16990061, "memory(GiB)": 142.32, "step": 12740, "train_speed(iter/s)": 0.290783 }, { "acc": 0.72124233, "epoch": 0.1427199275215415, "grad_norm": 7.0, "learning_rate": 9.987533909861387e-06, "loss": 1.1097271, "memory(GiB)": 142.32, "step": 12760, "train_speed(iter/s)": 0.290916 }, { "acc": 0.71105833, "epoch": 0.14294362646750003, "grad_norm": 5.375, "learning_rate": 9.987403053809022e-06, "loss": 1.15975704, "memory(GiB)": 142.32, "step": 12780, "train_speed(iter/s)": 0.29107 }, { "acc": 0.70765591, "epoch": 0.14316732541345856, "grad_norm": 5.0, "learning_rate": 9.987271515408633e-06, "loss": 1.19925852, "memory(GiB)": 142.32, "step": 12800, "train_speed(iter/s)": 0.291225 }, { "acc": 0.70906239, "epoch": 0.14339102435941709, "grad_norm": 5.5625, "learning_rate": 9.987139294678213e-06, "loss": 1.17525387, "memory(GiB)": 142.32, "step": 12820, "train_speed(iter/s)": 0.291361 }, { "acc": 0.72373552, "epoch": 0.1436147233053756, "grad_norm": 6.09375, "learning_rate": 9.987006391635859e-06, "loss": 1.10584126, "memory(GiB)": 142.32, "step": 12840, "train_speed(iter/s)": 0.29151 }, { "acc": 0.71626635, "epoch": 0.14383842225133414, "grad_norm": 5.625, "learning_rate": 9.986872806299747e-06, "loss": 1.12644253, "memory(GiB)": 142.32, "step": 12860, "train_speed(iter/s)": 0.291664 }, { "acc": 0.70809565, "epoch": 0.1440621211972927, "grad_norm": 4.9375, "learning_rate": 9.986738538688156e-06, "loss": 1.17542238, "memory(GiB)": 142.32, "step": 12880, "train_speed(iter/s)": 0.291801 }, { "acc": 0.70267611, "epoch": 0.14428582014325123, "grad_norm": 6.0625, "learning_rate": 9.98660358881946e-06, "loss": 1.18758497, "memory(GiB)": 142.32, "step": 12900, "train_speed(iter/s)": 0.291954 }, { "acc": 0.71968675, "epoch": 0.14450951908920975, "grad_norm": 6.03125, "learning_rate": 9.986467956712114e-06, "loss": 1.12864552, "memory(GiB)": 142.32, "step": 12920, "train_speed(iter/s)": 0.292079 }, { "acc": 0.7194952, "epoch": 0.14473321803516828, "grad_norm": 6.34375, "learning_rate": 9.98633164238468e-06, "loss": 1.13224192, "memory(GiB)": 142.32, "step": 12940, "train_speed(iter/s)": 0.292234 }, { "acc": 0.71984019, "epoch": 0.1449569169811268, "grad_norm": 7.21875, "learning_rate": 9.986194645855807e-06, "loss": 1.11983089, "memory(GiB)": 142.32, "step": 12960, "train_speed(iter/s)": 0.292369 }, { "acc": 0.70672612, "epoch": 0.14518061592708534, "grad_norm": 5.90625, "learning_rate": 9.986056967144236e-06, "loss": 1.16557446, "memory(GiB)": 142.32, "step": 12980, "train_speed(iter/s)": 0.292518 }, { "acc": 0.70870047, "epoch": 0.14540431487304387, "grad_norm": 6.21875, "learning_rate": 9.985918606268805e-06, "loss": 1.1767004, "memory(GiB)": 142.32, "step": 13000, "train_speed(iter/s)": 0.292669 }, { "acc": 0.71378226, "epoch": 0.1456280138190024, "grad_norm": 6.8125, "learning_rate": 9.985779563248444e-06, "loss": 1.1520833, "memory(GiB)": 142.32, "step": 13020, "train_speed(iter/s)": 0.292822 }, { "acc": 0.70473871, "epoch": 0.14585171276496092, "grad_norm": 6.5625, "learning_rate": 9.985639838102174e-06, "loss": 1.18720989, "memory(GiB)": 142.32, "step": 13040, "train_speed(iter/s)": 0.29296 }, { "acc": 0.70102572, "epoch": 0.14607541171091945, "grad_norm": 5.5, "learning_rate": 9.985499430849114e-06, "loss": 1.22474403, "memory(GiB)": 142.32, "step": 13060, "train_speed(iter/s)": 0.293107 }, { "acc": 0.72059598, "epoch": 0.14629911065687798, "grad_norm": 5.34375, "learning_rate": 9.985358341508473e-06, "loss": 1.11577034, "memory(GiB)": 142.32, "step": 13080, "train_speed(iter/s)": 0.293268 }, { "acc": 0.70324378, "epoch": 0.1465228096028365, "grad_norm": 5.9375, "learning_rate": 9.985216570099555e-06, "loss": 1.21755524, "memory(GiB)": 142.32, "step": 13100, "train_speed(iter/s)": 0.29341 }, { "acc": 0.71225891, "epoch": 0.14674650854879503, "grad_norm": 5.84375, "learning_rate": 9.985074116641752e-06, "loss": 1.14345016, "memory(GiB)": 142.32, "step": 13120, "train_speed(iter/s)": 0.29356 }, { "acc": 0.71917367, "epoch": 0.14697020749475356, "grad_norm": 5.03125, "learning_rate": 9.984930981154558e-06, "loss": 1.13634243, "memory(GiB)": 142.32, "step": 13140, "train_speed(iter/s)": 0.293717 }, { "acc": 0.71486683, "epoch": 0.1471939064407121, "grad_norm": 5.75, "learning_rate": 9.984787163657554e-06, "loss": 1.14419165, "memory(GiB)": 142.32, "step": 13160, "train_speed(iter/s)": 0.293866 }, { "acc": 0.71705165, "epoch": 0.14741760538667062, "grad_norm": 6.25, "learning_rate": 9.984642664170419e-06, "loss": 1.13460503, "memory(GiB)": 142.32, "step": 13180, "train_speed(iter/s)": 0.294004 }, { "acc": 0.71822319, "epoch": 0.14764130433262915, "grad_norm": 5.875, "learning_rate": 9.984497482712919e-06, "loss": 1.12121391, "memory(GiB)": 142.32, "step": 13200, "train_speed(iter/s)": 0.294156 }, { "acc": 0.71337595, "epoch": 0.14786500327858768, "grad_norm": 5.5625, "learning_rate": 9.984351619304919e-06, "loss": 1.13960266, "memory(GiB)": 142.32, "step": 13220, "train_speed(iter/s)": 0.294298 }, { "acc": 0.71394629, "epoch": 0.1480887022245462, "grad_norm": 5.21875, "learning_rate": 9.984205073966375e-06, "loss": 1.15243969, "memory(GiB)": 142.32, "step": 13240, "train_speed(iter/s)": 0.294437 }, { "acc": 0.71909027, "epoch": 0.14831240117050473, "grad_norm": 5.03125, "learning_rate": 9.984057846717335e-06, "loss": 1.13075819, "memory(GiB)": 142.32, "step": 13260, "train_speed(iter/s)": 0.294574 }, { "acc": 0.72227793, "epoch": 0.14853610011646326, "grad_norm": 6.25, "learning_rate": 9.983909937577944e-06, "loss": 1.11078453, "memory(GiB)": 142.32, "step": 13280, "train_speed(iter/s)": 0.294711 }, { "acc": 0.7174952, "epoch": 0.1487597990624218, "grad_norm": 4.9375, "learning_rate": 9.983761346568437e-06, "loss": 1.13347263, "memory(GiB)": 142.32, "step": 13300, "train_speed(iter/s)": 0.294828 }, { "acc": 0.70791445, "epoch": 0.14898349800838032, "grad_norm": 5.71875, "learning_rate": 9.983612073709144e-06, "loss": 1.18386497, "memory(GiB)": 142.32, "step": 13320, "train_speed(iter/s)": 0.294957 }, { "acc": 0.71492367, "epoch": 0.14920719695433884, "grad_norm": 6.375, "learning_rate": 9.983462119020487e-06, "loss": 1.12626791, "memory(GiB)": 142.32, "step": 13340, "train_speed(iter/s)": 0.295106 }, { "acc": 0.72183471, "epoch": 0.14943089590029737, "grad_norm": 5.90625, "learning_rate": 9.983311482522979e-06, "loss": 1.11235466, "memory(GiB)": 142.32, "step": 13360, "train_speed(iter/s)": 0.295246 }, { "acc": 0.71686049, "epoch": 0.1496545948462559, "grad_norm": 6.40625, "learning_rate": 9.983160164237236e-06, "loss": 1.1334095, "memory(GiB)": 142.32, "step": 13380, "train_speed(iter/s)": 0.295386 }, { "acc": 0.7140667, "epoch": 0.14987829379221443, "grad_norm": 4.625, "learning_rate": 9.983008164183955e-06, "loss": 1.14029255, "memory(GiB)": 142.32, "step": 13400, "train_speed(iter/s)": 0.295509 }, { "acc": 0.71368122, "epoch": 0.15010199273817296, "grad_norm": 4.9375, "learning_rate": 9.982855482383934e-06, "loss": 1.13648148, "memory(GiB)": 142.32, "step": 13420, "train_speed(iter/s)": 0.295656 }, { "acc": 0.72659025, "epoch": 0.15032569168413148, "grad_norm": 6.09375, "learning_rate": 9.982702118858061e-06, "loss": 1.07119684, "memory(GiB)": 142.32, "step": 13440, "train_speed(iter/s)": 0.2958 }, { "acc": 0.71996183, "epoch": 0.15054939063009, "grad_norm": 5.65625, "learning_rate": 9.98254807362732e-06, "loss": 1.1252346, "memory(GiB)": 142.32, "step": 13460, "train_speed(iter/s)": 0.295933 }, { "acc": 0.72272182, "epoch": 0.15077308957604854, "grad_norm": 6.125, "learning_rate": 9.982393346712785e-06, "loss": 1.10921974, "memory(GiB)": 142.32, "step": 13480, "train_speed(iter/s)": 0.296061 }, { "acc": 0.71456347, "epoch": 0.15099678852200707, "grad_norm": 6.875, "learning_rate": 9.982237938135625e-06, "loss": 1.16758232, "memory(GiB)": 142.32, "step": 13500, "train_speed(iter/s)": 0.296196 }, { "acc": 0.71189604, "epoch": 0.15122048746796563, "grad_norm": 5.8125, "learning_rate": 9.982081847917102e-06, "loss": 1.1512495, "memory(GiB)": 142.32, "step": 13520, "train_speed(iter/s)": 0.296343 }, { "acc": 0.71603546, "epoch": 0.15144418641392415, "grad_norm": 6.1875, "learning_rate": 9.981925076078573e-06, "loss": 1.14133339, "memory(GiB)": 142.32, "step": 13540, "train_speed(iter/s)": 0.296479 }, { "acc": 0.71517692, "epoch": 0.15166788535988268, "grad_norm": 5.46875, "learning_rate": 9.981767622641485e-06, "loss": 1.14400673, "memory(GiB)": 142.32, "step": 13560, "train_speed(iter/s)": 0.29662 }, { "acc": 0.70540662, "epoch": 0.1518915843058412, "grad_norm": 5.71875, "learning_rate": 9.98160948762738e-06, "loss": 1.18443289, "memory(GiB)": 142.32, "step": 13580, "train_speed(iter/s)": 0.296754 }, { "acc": 0.72116575, "epoch": 0.15211528325179974, "grad_norm": 6.0, "learning_rate": 9.981450671057896e-06, "loss": 1.11082726, "memory(GiB)": 142.32, "step": 13600, "train_speed(iter/s)": 0.296891 }, { "acc": 0.71299601, "epoch": 0.15233898219775827, "grad_norm": 6.90625, "learning_rate": 9.981291172954755e-06, "loss": 1.1571209, "memory(GiB)": 142.32, "step": 13620, "train_speed(iter/s)": 0.297028 }, { "acc": 0.7209939, "epoch": 0.1525626811437168, "grad_norm": 6.5, "learning_rate": 9.981130993339785e-06, "loss": 1.09807949, "memory(GiB)": 142.32, "step": 13640, "train_speed(iter/s)": 0.297171 }, { "acc": 0.71464539, "epoch": 0.15278638008967532, "grad_norm": 6.1875, "learning_rate": 9.980970132234897e-06, "loss": 1.14631596, "memory(GiB)": 142.32, "step": 13660, "train_speed(iter/s)": 0.297299 }, { "acc": 0.70987482, "epoch": 0.15301007903563385, "grad_norm": 6.40625, "learning_rate": 9.980808589662101e-06, "loss": 1.16173487, "memory(GiB)": 142.32, "step": 13680, "train_speed(iter/s)": 0.297448 }, { "acc": 0.72035499, "epoch": 0.15323377798159238, "grad_norm": 5.75, "learning_rate": 9.980646365643498e-06, "loss": 1.11459103, "memory(GiB)": 142.32, "step": 13700, "train_speed(iter/s)": 0.297592 }, { "acc": 0.70784163, "epoch": 0.1534574769275509, "grad_norm": 6.375, "learning_rate": 9.980483460201283e-06, "loss": 1.17555742, "memory(GiB)": 142.32, "step": 13720, "train_speed(iter/s)": 0.29773 }, { "acc": 0.72093983, "epoch": 0.15368117587350943, "grad_norm": 4.5625, "learning_rate": 9.980319873357742e-06, "loss": 1.11624908, "memory(GiB)": 142.32, "step": 13740, "train_speed(iter/s)": 0.297854 }, { "acc": 0.70068669, "epoch": 0.15390487481946796, "grad_norm": 5.84375, "learning_rate": 9.980155605135257e-06, "loss": 1.2225749, "memory(GiB)": 142.32, "step": 13760, "train_speed(iter/s)": 0.297982 }, { "acc": 0.72320938, "epoch": 0.1541285737654265, "grad_norm": 5.15625, "learning_rate": 9.979990655556303e-06, "loss": 1.09012146, "memory(GiB)": 142.32, "step": 13780, "train_speed(iter/s)": 0.29812 }, { "acc": 0.71033726, "epoch": 0.15435227271138502, "grad_norm": 5.8125, "learning_rate": 9.979825024643447e-06, "loss": 1.17368002, "memory(GiB)": 142.32, "step": 13800, "train_speed(iter/s)": 0.298239 }, { "acc": 0.72164707, "epoch": 0.15457597165734355, "grad_norm": 6.625, "learning_rate": 9.97965871241935e-06, "loss": 1.10837545, "memory(GiB)": 142.32, "step": 13820, "train_speed(iter/s)": 0.298381 }, { "acc": 0.7262567, "epoch": 0.15479967060330208, "grad_norm": 5.4375, "learning_rate": 9.979491718906765e-06, "loss": 1.09337645, "memory(GiB)": 142.32, "step": 13840, "train_speed(iter/s)": 0.298516 }, { "acc": 0.71064663, "epoch": 0.1550233695492606, "grad_norm": 6.1875, "learning_rate": 9.979324044128538e-06, "loss": 1.147896, "memory(GiB)": 142.32, "step": 13860, "train_speed(iter/s)": 0.298654 }, { "acc": 0.71010866, "epoch": 0.15524706849521913, "grad_norm": 6.4375, "learning_rate": 9.97915568810761e-06, "loss": 1.156497, "memory(GiB)": 142.32, "step": 13880, "train_speed(iter/s)": 0.298797 }, { "acc": 0.71762896, "epoch": 0.15547076744117766, "grad_norm": 7.125, "learning_rate": 9.978986650867019e-06, "loss": 1.14411478, "memory(GiB)": 142.32, "step": 13900, "train_speed(iter/s)": 0.298908 }, { "acc": 0.71409235, "epoch": 0.1556944663871362, "grad_norm": 5.25, "learning_rate": 9.978816932429886e-06, "loss": 1.14395809, "memory(GiB)": 142.32, "step": 13920, "train_speed(iter/s)": 0.299029 }, { "acc": 0.71337471, "epoch": 0.15591816533309472, "grad_norm": 6.1875, "learning_rate": 9.978646532819434e-06, "loss": 1.14227734, "memory(GiB)": 142.32, "step": 13940, "train_speed(iter/s)": 0.299159 }, { "acc": 0.72025528, "epoch": 0.15614186427905324, "grad_norm": 5.71875, "learning_rate": 9.978475452058974e-06, "loss": 1.1183094, "memory(GiB)": 142.32, "step": 13960, "train_speed(iter/s)": 0.299284 }, { "acc": 0.71461902, "epoch": 0.15636556322501177, "grad_norm": 5.0625, "learning_rate": 9.978303690171912e-06, "loss": 1.14931784, "memory(GiB)": 142.32, "step": 13980, "train_speed(iter/s)": 0.299404 }, { "acc": 0.72153969, "epoch": 0.1565892621709703, "grad_norm": 5.78125, "learning_rate": 9.978131247181753e-06, "loss": 1.1214572, "memory(GiB)": 142.32, "step": 14000, "train_speed(iter/s)": 0.29953 }, { "epoch": 0.1565892621709703, "eval_acc": 0.6793296956345642, "eval_loss": 1.1440019607543945, "eval_runtime": 2346.2881, "eval_samples_per_second": 32.086, "eval_steps_per_second": 16.043, "step": 14000 }, { "acc": 0.71897836, "epoch": 0.15681296111692883, "grad_norm": 6.96875, "learning_rate": 9.977958123112082e-06, "loss": 1.12176085, "memory(GiB)": 142.32, "step": 14020, "train_speed(iter/s)": 0.285078 }, { "acc": 0.70701084, "epoch": 0.15703666006288736, "grad_norm": 5.59375, "learning_rate": 9.97778431798659e-06, "loss": 1.19625797, "memory(GiB)": 142.32, "step": 14040, "train_speed(iter/s)": 0.28521 }, { "acc": 0.70741549, "epoch": 0.15726035900884588, "grad_norm": 5.6875, "learning_rate": 9.977609831829054e-06, "loss": 1.1700737, "memory(GiB)": 142.32, "step": 14060, "train_speed(iter/s)": 0.28534 }, { "acc": 0.70672612, "epoch": 0.1574840579548044, "grad_norm": 6.125, "learning_rate": 9.977434664663345e-06, "loss": 1.18599911, "memory(GiB)": 142.32, "step": 14080, "train_speed(iter/s)": 0.285475 }, { "acc": 0.71613269, "epoch": 0.15770775690076294, "grad_norm": 6.46875, "learning_rate": 9.977258816513432e-06, "loss": 1.14055805, "memory(GiB)": 142.32, "step": 14100, "train_speed(iter/s)": 0.285631 }, { "acc": 0.71754184, "epoch": 0.15793145584672147, "grad_norm": 5.625, "learning_rate": 9.97708228740337e-06, "loss": 1.13591137, "memory(GiB)": 142.32, "step": 14120, "train_speed(iter/s)": 0.285758 }, { "acc": 0.71546679, "epoch": 0.15815515479268002, "grad_norm": 5.40625, "learning_rate": 9.976905077357315e-06, "loss": 1.13871098, "memory(GiB)": 142.32, "step": 14140, "train_speed(iter/s)": 0.285903 }, { "acc": 0.71008768, "epoch": 0.15837885373863855, "grad_norm": 6.21875, "learning_rate": 9.976727186399506e-06, "loss": 1.17877207, "memory(GiB)": 142.32, "step": 14160, "train_speed(iter/s)": 0.286042 }, { "acc": 0.72533236, "epoch": 0.15860255268459708, "grad_norm": 6.46875, "learning_rate": 9.976548614554285e-06, "loss": 1.10455608, "memory(GiB)": 142.32, "step": 14180, "train_speed(iter/s)": 0.286188 }, { "acc": 0.71417542, "epoch": 0.1588262516305556, "grad_norm": 6.21875, "learning_rate": 9.976369361846082e-06, "loss": 1.15358448, "memory(GiB)": 142.32, "step": 14200, "train_speed(iter/s)": 0.286327 }, { "acc": 0.72551813, "epoch": 0.15904995057651414, "grad_norm": 5.625, "learning_rate": 9.976189428299422e-06, "loss": 1.08834648, "memory(GiB)": 142.32, "step": 14220, "train_speed(iter/s)": 0.286455 }, { "acc": 0.72046518, "epoch": 0.15927364952247267, "grad_norm": 6.09375, "learning_rate": 9.976008813938922e-06, "loss": 1.12412281, "memory(GiB)": 142.32, "step": 14240, "train_speed(iter/s)": 0.286584 }, { "acc": 0.71472626, "epoch": 0.1594973484684312, "grad_norm": 4.6875, "learning_rate": 9.975827518789294e-06, "loss": 1.13616314, "memory(GiB)": 142.32, "step": 14260, "train_speed(iter/s)": 0.286728 }, { "acc": 0.7196063, "epoch": 0.15972104741438972, "grad_norm": 7.625, "learning_rate": 9.975645542875338e-06, "loss": 1.12508307, "memory(GiB)": 142.32, "step": 14280, "train_speed(iter/s)": 0.286869 }, { "acc": 0.71433868, "epoch": 0.15994474636034825, "grad_norm": 7.25, "learning_rate": 9.975462886221954e-06, "loss": 1.14105492, "memory(GiB)": 142.32, "step": 14300, "train_speed(iter/s)": 0.287004 }, { "acc": 0.73040953, "epoch": 0.16016844530630678, "grad_norm": 5.34375, "learning_rate": 9.975279548854133e-06, "loss": 1.07041321, "memory(GiB)": 142.32, "step": 14320, "train_speed(iter/s)": 0.287144 }, { "acc": 0.71448407, "epoch": 0.1603921442522653, "grad_norm": 5.5625, "learning_rate": 9.975095530796954e-06, "loss": 1.15384808, "memory(GiB)": 142.32, "step": 14340, "train_speed(iter/s)": 0.287277 }, { "acc": 0.70448008, "epoch": 0.16061584319822383, "grad_norm": 6.125, "learning_rate": 9.974910832075598e-06, "loss": 1.19546242, "memory(GiB)": 142.32, "step": 14360, "train_speed(iter/s)": 0.287413 }, { "acc": 0.71640453, "epoch": 0.16083954214418236, "grad_norm": 6.21875, "learning_rate": 9.974725452715332e-06, "loss": 1.14917345, "memory(GiB)": 142.32, "step": 14380, "train_speed(iter/s)": 0.287546 }, { "acc": 0.70863943, "epoch": 0.1610632410901409, "grad_norm": 6.5625, "learning_rate": 9.974539392741518e-06, "loss": 1.1799016, "memory(GiB)": 142.32, "step": 14400, "train_speed(iter/s)": 0.287667 }, { "acc": 0.71090598, "epoch": 0.16128694003609942, "grad_norm": 4.9375, "learning_rate": 9.974352652179614e-06, "loss": 1.16017437, "memory(GiB)": 142.32, "step": 14420, "train_speed(iter/s)": 0.287801 }, { "acc": 0.71328888, "epoch": 0.16151063898205795, "grad_norm": 5.90625, "learning_rate": 9.974165231055166e-06, "loss": 1.14497375, "memory(GiB)": 142.32, "step": 14440, "train_speed(iter/s)": 0.287908 }, { "acc": 0.71500874, "epoch": 0.16173433792801648, "grad_norm": 5.03125, "learning_rate": 9.973977129393817e-06, "loss": 1.15831518, "memory(GiB)": 142.32, "step": 14460, "train_speed(iter/s)": 0.288026 }, { "acc": 0.71645145, "epoch": 0.161958036873975, "grad_norm": 5.78125, "learning_rate": 9.973788347221304e-06, "loss": 1.14553356, "memory(GiB)": 142.32, "step": 14480, "train_speed(iter/s)": 0.288172 }, { "acc": 0.72188959, "epoch": 0.16218173581993353, "grad_norm": 6.28125, "learning_rate": 9.97359888456345e-06, "loss": 1.11760006, "memory(GiB)": 142.32, "step": 14500, "train_speed(iter/s)": 0.288309 }, { "acc": 0.7228837, "epoch": 0.16240543476589206, "grad_norm": 6.34375, "learning_rate": 9.973408741446183e-06, "loss": 1.1043232, "memory(GiB)": 142.32, "step": 14520, "train_speed(iter/s)": 0.288441 }, { "acc": 0.70757799, "epoch": 0.1626291337118506, "grad_norm": 5.0, "learning_rate": 9.973217917895513e-06, "loss": 1.17561674, "memory(GiB)": 142.32, "step": 14540, "train_speed(iter/s)": 0.28857 }, { "acc": 0.71028347, "epoch": 0.16285283265780912, "grad_norm": 5.28125, "learning_rate": 9.973026413937548e-06, "loss": 1.15554886, "memory(GiB)": 142.32, "step": 14560, "train_speed(iter/s)": 0.2887 }, { "acc": 0.71259532, "epoch": 0.16307653160376764, "grad_norm": 6.59375, "learning_rate": 9.972834229598487e-06, "loss": 1.16078959, "memory(GiB)": 142.32, "step": 14580, "train_speed(iter/s)": 0.288816 }, { "acc": 0.71366873, "epoch": 0.16330023054972617, "grad_norm": 7.15625, "learning_rate": 9.972641364904627e-06, "loss": 1.15124702, "memory(GiB)": 142.32, "step": 14600, "train_speed(iter/s)": 0.288951 }, { "acc": 0.70680532, "epoch": 0.1635239294956847, "grad_norm": 5.125, "learning_rate": 9.972447819882351e-06, "loss": 1.18224134, "memory(GiB)": 142.32, "step": 14620, "train_speed(iter/s)": 0.289079 }, { "acc": 0.71477795, "epoch": 0.16374762844164323, "grad_norm": 6.59375, "learning_rate": 9.972253594558142e-06, "loss": 1.1308012, "memory(GiB)": 142.32, "step": 14640, "train_speed(iter/s)": 0.289213 }, { "acc": 0.70832319, "epoch": 0.16397132738760176, "grad_norm": 5.6875, "learning_rate": 9.972058688958572e-06, "loss": 1.18060093, "memory(GiB)": 142.32, "step": 14660, "train_speed(iter/s)": 0.289358 }, { "acc": 0.71968241, "epoch": 0.16419502633356028, "grad_norm": 5.875, "learning_rate": 9.971863103110306e-06, "loss": 1.11221809, "memory(GiB)": 142.32, "step": 14680, "train_speed(iter/s)": 0.289491 }, { "acc": 0.71836209, "epoch": 0.1644187252795188, "grad_norm": 5.40625, "learning_rate": 9.971666837040102e-06, "loss": 1.11862011, "memory(GiB)": 142.32, "step": 14700, "train_speed(iter/s)": 0.289618 }, { "acc": 0.71643543, "epoch": 0.16464242422547734, "grad_norm": 6.84375, "learning_rate": 9.971469890774814e-06, "loss": 1.125214, "memory(GiB)": 142.32, "step": 14720, "train_speed(iter/s)": 0.289761 }, { "acc": 0.7222887, "epoch": 0.16486612317143587, "grad_norm": 5.8125, "learning_rate": 9.971272264341386e-06, "loss": 1.1008275, "memory(GiB)": 142.32, "step": 14740, "train_speed(iter/s)": 0.28991 }, { "acc": 0.71006308, "epoch": 0.1650898221173944, "grad_norm": 6.625, "learning_rate": 9.971073957766857e-06, "loss": 1.1744175, "memory(GiB)": 142.32, "step": 14760, "train_speed(iter/s)": 0.29002 }, { "acc": 0.71489019, "epoch": 0.16531352106335295, "grad_norm": 6.34375, "learning_rate": 9.970874971078358e-06, "loss": 1.14929085, "memory(GiB)": 142.32, "step": 14780, "train_speed(iter/s)": 0.290153 }, { "acc": 0.72386017, "epoch": 0.16553722000931148, "grad_norm": 7.03125, "learning_rate": 9.97067530430311e-06, "loss": 1.08980036, "memory(GiB)": 142.32, "step": 14800, "train_speed(iter/s)": 0.290272 }, { "acc": 0.7188385, "epoch": 0.16576091895527, "grad_norm": 5.15625, "learning_rate": 9.970474957468435e-06, "loss": 1.11529493, "memory(GiB)": 142.32, "step": 14820, "train_speed(iter/s)": 0.290405 }, { "acc": 0.71146045, "epoch": 0.16598461790122854, "grad_norm": 5.5, "learning_rate": 9.97027393060174e-06, "loss": 1.14453659, "memory(GiB)": 142.32, "step": 14840, "train_speed(iter/s)": 0.290521 }, { "acc": 0.72121706, "epoch": 0.16620831684718707, "grad_norm": 5.5625, "learning_rate": 9.970072223730532e-06, "loss": 1.10764484, "memory(GiB)": 142.32, "step": 14860, "train_speed(iter/s)": 0.290648 }, { "acc": 0.7228982, "epoch": 0.1664320157931456, "grad_norm": 5.65625, "learning_rate": 9.969869836882404e-06, "loss": 1.07720709, "memory(GiB)": 142.32, "step": 14880, "train_speed(iter/s)": 0.29077 }, { "acc": 0.72811422, "epoch": 0.16665571473910412, "grad_norm": 5.75, "learning_rate": 9.969666770085046e-06, "loss": 1.07584152, "memory(GiB)": 142.32, "step": 14900, "train_speed(iter/s)": 0.290911 }, { "acc": 0.70818977, "epoch": 0.16687941368506265, "grad_norm": 4.4375, "learning_rate": 9.969463023366241e-06, "loss": 1.16408195, "memory(GiB)": 142.32, "step": 14920, "train_speed(iter/s)": 0.291051 }, { "acc": 0.71687331, "epoch": 0.16710311263102118, "grad_norm": 6.59375, "learning_rate": 9.969258596753866e-06, "loss": 1.13887691, "memory(GiB)": 142.32, "step": 14940, "train_speed(iter/s)": 0.291196 }, { "acc": 0.7123178, "epoch": 0.1673268115769797, "grad_norm": 5.78125, "learning_rate": 9.969053490275886e-06, "loss": 1.14399223, "memory(GiB)": 142.32, "step": 14960, "train_speed(iter/s)": 0.291314 }, { "acc": 0.72010336, "epoch": 0.16755051052293823, "grad_norm": 5.5625, "learning_rate": 9.968847703960365e-06, "loss": 1.11749191, "memory(GiB)": 142.32, "step": 14980, "train_speed(iter/s)": 0.291434 }, { "acc": 0.7162919, "epoch": 0.16777420946889676, "grad_norm": 6.28125, "learning_rate": 9.968641237835458e-06, "loss": 1.13571978, "memory(GiB)": 142.32, "step": 15000, "train_speed(iter/s)": 0.291556 }, { "acc": 0.72480974, "epoch": 0.1679979084148553, "grad_norm": 6.1875, "learning_rate": 9.968434091929411e-06, "loss": 1.10135822, "memory(GiB)": 142.32, "step": 15020, "train_speed(iter/s)": 0.291705 }, { "acc": 0.71014185, "epoch": 0.16822160736081382, "grad_norm": 7.25, "learning_rate": 9.968226266270563e-06, "loss": 1.16604481, "memory(GiB)": 142.32, "step": 15040, "train_speed(iter/s)": 0.291837 }, { "acc": 0.71226678, "epoch": 0.16844530630677235, "grad_norm": 6.75, "learning_rate": 9.968017760887352e-06, "loss": 1.15985031, "memory(GiB)": 142.32, "step": 15060, "train_speed(iter/s)": 0.291987 }, { "acc": 0.7174242, "epoch": 0.16866900525273087, "grad_norm": 6.53125, "learning_rate": 9.967808575808301e-06, "loss": 1.10997305, "memory(GiB)": 142.32, "step": 15080, "train_speed(iter/s)": 0.292132 }, { "acc": 0.71210647, "epoch": 0.1688927041986894, "grad_norm": 5.34375, "learning_rate": 9.96759871106203e-06, "loss": 1.1397831, "memory(GiB)": 142.32, "step": 15100, "train_speed(iter/s)": 0.292263 }, { "acc": 0.72581544, "epoch": 0.16911640314464793, "grad_norm": 6.3125, "learning_rate": 9.967388166677252e-06, "loss": 1.10396461, "memory(GiB)": 142.32, "step": 15120, "train_speed(iter/s)": 0.292392 }, { "acc": 0.72291389, "epoch": 0.16934010209060646, "grad_norm": 5.5625, "learning_rate": 9.967176942682773e-06, "loss": 1.11848564, "memory(GiB)": 142.32, "step": 15140, "train_speed(iter/s)": 0.292505 }, { "acc": 0.71119261, "epoch": 0.169563801036565, "grad_norm": 9.6875, "learning_rate": 9.966965039107491e-06, "loss": 1.16702042, "memory(GiB)": 142.32, "step": 15160, "train_speed(iter/s)": 0.292595 }, { "acc": 0.71380548, "epoch": 0.16978749998252352, "grad_norm": 4.875, "learning_rate": 9.966752455980397e-06, "loss": 1.14559937, "memory(GiB)": 142.32, "step": 15180, "train_speed(iter/s)": 0.292714 }, { "acc": 0.71651649, "epoch": 0.17001119892848204, "grad_norm": 5.0, "learning_rate": 9.966539193330576e-06, "loss": 1.13672838, "memory(GiB)": 142.32, "step": 15200, "train_speed(iter/s)": 0.292849 }, { "acc": 0.72425575, "epoch": 0.17023489787444057, "grad_norm": 5.53125, "learning_rate": 9.966325251187205e-06, "loss": 1.09845505, "memory(GiB)": 142.32, "step": 15220, "train_speed(iter/s)": 0.292972 }, { "acc": 0.71338577, "epoch": 0.1704585968203991, "grad_norm": 5.5625, "learning_rate": 9.966110629579556e-06, "loss": 1.15311565, "memory(GiB)": 142.32, "step": 15240, "train_speed(iter/s)": 0.293085 }, { "acc": 0.71304617, "epoch": 0.17068229576635763, "grad_norm": 4.625, "learning_rate": 9.965895328536987e-06, "loss": 1.14408226, "memory(GiB)": 142.32, "step": 15260, "train_speed(iter/s)": 0.293213 }, { "acc": 0.71473513, "epoch": 0.17090599471231616, "grad_norm": 5.28125, "learning_rate": 9.965679348088962e-06, "loss": 1.14021664, "memory(GiB)": 142.32, "step": 15280, "train_speed(iter/s)": 0.293338 }, { "acc": 0.72342467, "epoch": 0.17112969365827468, "grad_norm": 6.65625, "learning_rate": 9.965462688265025e-06, "loss": 1.10565615, "memory(GiB)": 142.32, "step": 15300, "train_speed(iter/s)": 0.293459 }, { "acc": 0.72583804, "epoch": 0.1713533926042332, "grad_norm": 7.4375, "learning_rate": 9.96524534909482e-06, "loss": 1.09125729, "memory(GiB)": 142.32, "step": 15320, "train_speed(iter/s)": 0.293583 }, { "acc": 0.72047024, "epoch": 0.17157709155019174, "grad_norm": 5.90625, "learning_rate": 9.965027330608078e-06, "loss": 1.12708025, "memory(GiB)": 142.32, "step": 15340, "train_speed(iter/s)": 0.293719 }, { "acc": 0.71138372, "epoch": 0.17180079049615027, "grad_norm": 6.84375, "learning_rate": 9.964808632834634e-06, "loss": 1.16170855, "memory(GiB)": 142.32, "step": 15360, "train_speed(iter/s)": 0.293853 }, { "acc": 0.72249908, "epoch": 0.1720244894421088, "grad_norm": 6.625, "learning_rate": 9.964589255804405e-06, "loss": 1.10868607, "memory(GiB)": 142.32, "step": 15380, "train_speed(iter/s)": 0.293952 }, { "acc": 0.71217117, "epoch": 0.17224818838806735, "grad_norm": 6.46875, "learning_rate": 9.964369199547404e-06, "loss": 1.16441116, "memory(GiB)": 142.32, "step": 15400, "train_speed(iter/s)": 0.294015 }, { "acc": 0.71022205, "epoch": 0.17247188733402588, "grad_norm": 5.53125, "learning_rate": 9.96414846409374e-06, "loss": 1.16760368, "memory(GiB)": 142.32, "step": 15420, "train_speed(iter/s)": 0.294152 }, { "acc": 0.72044983, "epoch": 0.1726955862799844, "grad_norm": 6.84375, "learning_rate": 9.963927049473614e-06, "loss": 1.09839764, "memory(GiB)": 142.32, "step": 15440, "train_speed(iter/s)": 0.294283 }, { "acc": 0.72083259, "epoch": 0.17291928522594294, "grad_norm": 5.6875, "learning_rate": 9.963704955717315e-06, "loss": 1.10473194, "memory(GiB)": 142.32, "step": 15460, "train_speed(iter/s)": 0.294412 }, { "acc": 0.71648302, "epoch": 0.17314298417190147, "grad_norm": 6.96875, "learning_rate": 9.963482182855231e-06, "loss": 1.11341839, "memory(GiB)": 142.32, "step": 15480, "train_speed(iter/s)": 0.294548 }, { "acc": 0.71699543, "epoch": 0.17336668311786, "grad_norm": 5.9375, "learning_rate": 9.963258730917839e-06, "loss": 1.13691416, "memory(GiB)": 142.32, "step": 15500, "train_speed(iter/s)": 0.294677 }, { "acc": 0.71492949, "epoch": 0.17359038206381852, "grad_norm": 6.40625, "learning_rate": 9.963034599935712e-06, "loss": 1.13939629, "memory(GiB)": 142.32, "step": 15520, "train_speed(iter/s)": 0.294805 }, { "acc": 0.71222887, "epoch": 0.17381408100977705, "grad_norm": 6.53125, "learning_rate": 9.962809789939513e-06, "loss": 1.15259743, "memory(GiB)": 142.32, "step": 15540, "train_speed(iter/s)": 0.294926 }, { "acc": 0.71913624, "epoch": 0.17403777995573558, "grad_norm": 5.65625, "learning_rate": 9.962584300960001e-06, "loss": 1.13455582, "memory(GiB)": 142.32, "step": 15560, "train_speed(iter/s)": 0.295057 }, { "acc": 0.69814425, "epoch": 0.1742614789016941, "grad_norm": 6.96875, "learning_rate": 9.962358133028025e-06, "loss": 1.21573734, "memory(GiB)": 142.32, "step": 15580, "train_speed(iter/s)": 0.295181 }, { "acc": 0.72550941, "epoch": 0.17448517784765263, "grad_norm": 6.5, "learning_rate": 9.962131286174529e-06, "loss": 1.10629063, "memory(GiB)": 142.32, "step": 15600, "train_speed(iter/s)": 0.295272 }, { "acc": 0.71261663, "epoch": 0.17470887679361116, "grad_norm": 5.34375, "learning_rate": 9.961903760430544e-06, "loss": 1.14447184, "memory(GiB)": 142.32, "step": 15620, "train_speed(iter/s)": 0.295398 }, { "acc": 0.71763067, "epoch": 0.1749325757395697, "grad_norm": 4.75, "learning_rate": 9.961675555827204e-06, "loss": 1.143606, "memory(GiB)": 142.32, "step": 15640, "train_speed(iter/s)": 0.295542 }, { "acc": 0.71125708, "epoch": 0.17515627468552822, "grad_norm": 5.5, "learning_rate": 9.961446672395731e-06, "loss": 1.15758381, "memory(GiB)": 142.32, "step": 15660, "train_speed(iter/s)": 0.29567 }, { "acc": 0.72333775, "epoch": 0.17537997363148675, "grad_norm": 5.15625, "learning_rate": 9.961217110167436e-06, "loss": 1.09727917, "memory(GiB)": 142.32, "step": 15680, "train_speed(iter/s)": 0.295781 }, { "acc": 0.72149601, "epoch": 0.17560367257744527, "grad_norm": 6.65625, "learning_rate": 9.96098686917373e-06, "loss": 1.10762939, "memory(GiB)": 142.32, "step": 15700, "train_speed(iter/s)": 0.295899 }, { "acc": 0.72045383, "epoch": 0.1758273715234038, "grad_norm": 5.96875, "learning_rate": 9.96075594944611e-06, "loss": 1.09781303, "memory(GiB)": 142.32, "step": 15720, "train_speed(iter/s)": 0.296024 }, { "acc": 0.71233101, "epoch": 0.17605107046936233, "grad_norm": 6.25, "learning_rate": 9.960524351016172e-06, "loss": 1.16470251, "memory(GiB)": 142.32, "step": 15740, "train_speed(iter/s)": 0.296152 }, { "acc": 0.71237373, "epoch": 0.17627476941532086, "grad_norm": 5.625, "learning_rate": 9.9602920739156e-06, "loss": 1.15507975, "memory(GiB)": 142.32, "step": 15760, "train_speed(iter/s)": 0.296259 }, { "acc": 0.7244936, "epoch": 0.1764984683612794, "grad_norm": 5.34375, "learning_rate": 9.960059118176173e-06, "loss": 1.09307346, "memory(GiB)": 142.32, "step": 15780, "train_speed(iter/s)": 0.296383 }, { "acc": 0.69622321, "epoch": 0.17672216730723792, "grad_norm": 5.375, "learning_rate": 9.959825483829762e-06, "loss": 1.22639894, "memory(GiB)": 142.32, "step": 15800, "train_speed(iter/s)": 0.296504 }, { "acc": 0.70818157, "epoch": 0.17694586625319644, "grad_norm": 7.4375, "learning_rate": 9.959591170908334e-06, "loss": 1.17291355, "memory(GiB)": 142.32, "step": 15820, "train_speed(iter/s)": 0.296629 }, { "acc": 0.70245256, "epoch": 0.17716956519915497, "grad_norm": 5.75, "learning_rate": 9.959356179443945e-06, "loss": 1.18670673, "memory(GiB)": 142.32, "step": 15840, "train_speed(iter/s)": 0.296746 }, { "acc": 0.71940432, "epoch": 0.1773932641451135, "grad_norm": 5.71875, "learning_rate": 9.959120509468744e-06, "loss": 1.13272724, "memory(GiB)": 142.32, "step": 15860, "train_speed(iter/s)": 0.296866 }, { "acc": 0.72012873, "epoch": 0.17761696309107203, "grad_norm": 5.625, "learning_rate": 9.958884161014976e-06, "loss": 1.11189108, "memory(GiB)": 142.32, "step": 15880, "train_speed(iter/s)": 0.296983 }, { "acc": 0.7190238, "epoch": 0.17784066203703056, "grad_norm": 5.96875, "learning_rate": 9.958647134114975e-06, "loss": 1.11713486, "memory(GiB)": 142.32, "step": 15900, "train_speed(iter/s)": 0.297111 }, { "acc": 0.72074356, "epoch": 0.17806436098298908, "grad_norm": 5.75, "learning_rate": 9.958409428801172e-06, "loss": 1.10059376, "memory(GiB)": 142.32, "step": 15920, "train_speed(iter/s)": 0.297229 }, { "acc": 0.71442661, "epoch": 0.1782880599289476, "grad_norm": 6.3125, "learning_rate": 9.958171045106086e-06, "loss": 1.15512905, "memory(GiB)": 142.32, "step": 15940, "train_speed(iter/s)": 0.297355 }, { "acc": 0.72280169, "epoch": 0.17851175887490614, "grad_norm": 4.9375, "learning_rate": 9.957931983062334e-06, "loss": 1.10253696, "memory(GiB)": 142.32, "step": 15960, "train_speed(iter/s)": 0.297489 }, { "acc": 0.72039776, "epoch": 0.17873545782086467, "grad_norm": 6.5, "learning_rate": 9.957692242702621e-06, "loss": 1.10994492, "memory(GiB)": 142.32, "step": 15980, "train_speed(iter/s)": 0.29761 }, { "acc": 0.71651869, "epoch": 0.1789591567668232, "grad_norm": 4.875, "learning_rate": 9.957451824059747e-06, "loss": 1.13861847, "memory(GiB)": 142.32, "step": 16000, "train_speed(iter/s)": 0.297715 }, { "epoch": 0.1789591567668232, "eval_acc": 0.6811719609302318, "eval_loss": 1.137093186378479, "eval_runtime": 2340.6466, "eval_samples_per_second": 32.163, "eval_steps_per_second": 16.082, "step": 16000 }, { "acc": 0.72094264, "epoch": 0.17918285571278172, "grad_norm": 6.8125, "learning_rate": 9.957210727166604e-06, "loss": 1.12207279, "memory(GiB)": 142.32, "step": 16020, "train_speed(iter/s)": 0.285157 }, { "acc": 0.71134968, "epoch": 0.17940655465874028, "grad_norm": 5.875, "learning_rate": 9.95696895205618e-06, "loss": 1.14405804, "memory(GiB)": 142.32, "step": 16040, "train_speed(iter/s)": 0.285288 }, { "acc": 0.7165174, "epoch": 0.1796302536046988, "grad_norm": 5.3125, "learning_rate": 9.956726498761553e-06, "loss": 1.12733059, "memory(GiB)": 142.32, "step": 16060, "train_speed(iter/s)": 0.285404 }, { "acc": 0.71509247, "epoch": 0.17985395255065734, "grad_norm": 6.03125, "learning_rate": 9.95648336731589e-06, "loss": 1.14909744, "memory(GiB)": 142.32, "step": 16080, "train_speed(iter/s)": 0.28552 }, { "acc": 0.71940441, "epoch": 0.18007765149661586, "grad_norm": 5.4375, "learning_rate": 9.95623955775246e-06, "loss": 1.12893648, "memory(GiB)": 142.32, "step": 16100, "train_speed(iter/s)": 0.285633 }, { "acc": 0.71637049, "epoch": 0.1803013504425744, "grad_norm": 5.375, "learning_rate": 9.955995070104618e-06, "loss": 1.13198576, "memory(GiB)": 142.32, "step": 16120, "train_speed(iter/s)": 0.285763 }, { "acc": 0.7171422, "epoch": 0.18052504938853292, "grad_norm": 5.15625, "learning_rate": 9.955749904405812e-06, "loss": 1.12505093, "memory(GiB)": 142.32, "step": 16140, "train_speed(iter/s)": 0.285879 }, { "acc": 0.71725311, "epoch": 0.18074874833449145, "grad_norm": 5.875, "learning_rate": 9.955504060689584e-06, "loss": 1.11983805, "memory(GiB)": 142.32, "step": 16160, "train_speed(iter/s)": 0.285998 }, { "acc": 0.70821247, "epoch": 0.18097244728044998, "grad_norm": 5.78125, "learning_rate": 9.955257538989573e-06, "loss": 1.18608093, "memory(GiB)": 142.32, "step": 16180, "train_speed(iter/s)": 0.286108 }, { "acc": 0.72209258, "epoch": 0.1811961462264085, "grad_norm": 5.78125, "learning_rate": 9.955010339339501e-06, "loss": 1.10358448, "memory(GiB)": 142.32, "step": 16200, "train_speed(iter/s)": 0.286228 }, { "acc": 0.70633264, "epoch": 0.18141984517236703, "grad_norm": 4.96875, "learning_rate": 9.954762461773194e-06, "loss": 1.19032955, "memory(GiB)": 142.32, "step": 16220, "train_speed(iter/s)": 0.28635 }, { "acc": 0.70454006, "epoch": 0.18164354411832556, "grad_norm": 5.53125, "learning_rate": 9.954513906324559e-06, "loss": 1.18207407, "memory(GiB)": 142.32, "step": 16240, "train_speed(iter/s)": 0.286471 }, { "acc": 0.71444702, "epoch": 0.1818672430642841, "grad_norm": 5.8125, "learning_rate": 9.954264673027606e-06, "loss": 1.15624504, "memory(GiB)": 142.32, "step": 16260, "train_speed(iter/s)": 0.286594 }, { "acc": 0.71773686, "epoch": 0.18209094201024262, "grad_norm": 6.59375, "learning_rate": 9.954014761916436e-06, "loss": 1.12457476, "memory(GiB)": 142.32, "step": 16280, "train_speed(iter/s)": 0.286724 }, { "acc": 0.73013916, "epoch": 0.18231464095620115, "grad_norm": 6.125, "learning_rate": 9.953764173025234e-06, "loss": 1.06859856, "memory(GiB)": 142.32, "step": 16300, "train_speed(iter/s)": 0.286845 }, { "acc": 0.72240295, "epoch": 0.18253833990215967, "grad_norm": 5.84375, "learning_rate": 9.953512906388288e-06, "loss": 1.1209795, "memory(GiB)": 142.32, "step": 16320, "train_speed(iter/s)": 0.286958 }, { "acc": 0.72071285, "epoch": 0.1827620388481182, "grad_norm": 6.125, "learning_rate": 9.953260962039976e-06, "loss": 1.13558922, "memory(GiB)": 142.32, "step": 16340, "train_speed(iter/s)": 0.287057 }, { "acc": 0.71895957, "epoch": 0.18298573779407673, "grad_norm": 5.0, "learning_rate": 9.953008340014764e-06, "loss": 1.13105135, "memory(GiB)": 142.32, "step": 16360, "train_speed(iter/s)": 0.287179 }, { "acc": 0.71533241, "epoch": 0.18320943674003526, "grad_norm": 6.65625, "learning_rate": 9.952755040347218e-06, "loss": 1.1490756, "memory(GiB)": 142.32, "step": 16380, "train_speed(iter/s)": 0.2873 }, { "acc": 0.71374331, "epoch": 0.1834331356859938, "grad_norm": 5.875, "learning_rate": 9.95250106307199e-06, "loss": 1.16088915, "memory(GiB)": 142.32, "step": 16400, "train_speed(iter/s)": 0.287418 }, { "acc": 0.72136621, "epoch": 0.18365683463195231, "grad_norm": 6.21875, "learning_rate": 9.952246408223831e-06, "loss": 1.10981789, "memory(GiB)": 142.32, "step": 16420, "train_speed(iter/s)": 0.287537 }, { "acc": 0.72110467, "epoch": 0.18388053357791084, "grad_norm": 5.59375, "learning_rate": 9.951991075837576e-06, "loss": 1.1117651, "memory(GiB)": 142.32, "step": 16440, "train_speed(iter/s)": 0.287658 }, { "acc": 0.72686586, "epoch": 0.18410423252386937, "grad_norm": 5.53125, "learning_rate": 9.951735065948165e-06, "loss": 1.09025478, "memory(GiB)": 142.32, "step": 16460, "train_speed(iter/s)": 0.287784 }, { "acc": 0.73398633, "epoch": 0.1843279314698279, "grad_norm": 5.3125, "learning_rate": 9.95147837859062e-06, "loss": 1.04977446, "memory(GiB)": 142.32, "step": 16480, "train_speed(iter/s)": 0.287897 }, { "acc": 0.7118392, "epoch": 0.18455163041578643, "grad_norm": 5.03125, "learning_rate": 9.951221013800059e-06, "loss": 1.16075897, "memory(GiB)": 142.32, "step": 16500, "train_speed(iter/s)": 0.288013 }, { "acc": 0.71868839, "epoch": 0.18477532936174496, "grad_norm": 6.40625, "learning_rate": 9.950962971611693e-06, "loss": 1.11791248, "memory(GiB)": 142.32, "step": 16520, "train_speed(iter/s)": 0.288135 }, { "acc": 0.71800566, "epoch": 0.18499902830770348, "grad_norm": 4.90625, "learning_rate": 9.950704252060827e-06, "loss": 1.13664284, "memory(GiB)": 142.32, "step": 16540, "train_speed(iter/s)": 0.28825 }, { "acc": 0.7204978, "epoch": 0.185222727253662, "grad_norm": 5.28125, "learning_rate": 9.950444855182859e-06, "loss": 1.11944942, "memory(GiB)": 142.32, "step": 16560, "train_speed(iter/s)": 0.288367 }, { "acc": 0.71288671, "epoch": 0.18544642619962054, "grad_norm": 4.75, "learning_rate": 9.950184781013276e-06, "loss": 1.16882963, "memory(GiB)": 142.32, "step": 16580, "train_speed(iter/s)": 0.288486 }, { "acc": 0.71994839, "epoch": 0.18567012514557907, "grad_norm": 5.0, "learning_rate": 9.94992402958766e-06, "loss": 1.11672792, "memory(GiB)": 142.32, "step": 16600, "train_speed(iter/s)": 0.288622 }, { "acc": 0.7126286, "epoch": 0.1858938240915376, "grad_norm": 5.1875, "learning_rate": 9.949662600941687e-06, "loss": 1.14876518, "memory(GiB)": 142.32, "step": 16620, "train_speed(iter/s)": 0.288754 }, { "acc": 0.70750294, "epoch": 0.18611752303749612, "grad_norm": 6.1875, "learning_rate": 9.949400495111124e-06, "loss": 1.16668072, "memory(GiB)": 142.32, "step": 16640, "train_speed(iter/s)": 0.288879 }, { "acc": 0.71280742, "epoch": 0.18634122198345468, "grad_norm": 6.8125, "learning_rate": 9.949137712131828e-06, "loss": 1.149862, "memory(GiB)": 142.32, "step": 16660, "train_speed(iter/s)": 0.288995 }, { "acc": 0.71915269, "epoch": 0.1865649209294132, "grad_norm": 7.65625, "learning_rate": 9.948874252039754e-06, "loss": 1.1244585, "memory(GiB)": 142.32, "step": 16680, "train_speed(iter/s)": 0.289104 }, { "acc": 0.71672716, "epoch": 0.18678861987537174, "grad_norm": 5.09375, "learning_rate": 9.948610114870946e-06, "loss": 1.13077717, "memory(GiB)": 142.32, "step": 16700, "train_speed(iter/s)": 0.289223 }, { "acc": 0.71166492, "epoch": 0.18701231882133026, "grad_norm": 6.78125, "learning_rate": 9.948345300661543e-06, "loss": 1.15205994, "memory(GiB)": 142.32, "step": 16720, "train_speed(iter/s)": 0.289349 }, { "acc": 0.71986198, "epoch": 0.1872360177672888, "grad_norm": 6.1875, "learning_rate": 9.948079809447776e-06, "loss": 1.1378356, "memory(GiB)": 142.32, "step": 16740, "train_speed(iter/s)": 0.289454 }, { "acc": 0.72770128, "epoch": 0.18745971671324732, "grad_norm": 6.8125, "learning_rate": 9.947813641265965e-06, "loss": 1.09548321, "memory(GiB)": 142.32, "step": 16760, "train_speed(iter/s)": 0.289571 }, { "acc": 0.71488643, "epoch": 0.18768341565920585, "grad_norm": 5.65625, "learning_rate": 9.947546796152529e-06, "loss": 1.12814369, "memory(GiB)": 142.32, "step": 16780, "train_speed(iter/s)": 0.28968 }, { "acc": 0.7289907, "epoch": 0.18790711460516438, "grad_norm": 5.84375, "learning_rate": 9.947279274143973e-06, "loss": 1.06832085, "memory(GiB)": 142.32, "step": 16800, "train_speed(iter/s)": 0.289793 }, { "acc": 0.71595192, "epoch": 0.1881308135511229, "grad_norm": 4.71875, "learning_rate": 9.9470110752769e-06, "loss": 1.13656807, "memory(GiB)": 142.32, "step": 16820, "train_speed(iter/s)": 0.289908 }, { "acc": 0.72392325, "epoch": 0.18835451249708143, "grad_norm": 6.0625, "learning_rate": 9.946742199588002e-06, "loss": 1.11162338, "memory(GiB)": 142.32, "step": 16840, "train_speed(iter/s)": 0.290029 }, { "acc": 0.71799731, "epoch": 0.18857821144303996, "grad_norm": 5.59375, "learning_rate": 9.946472647114066e-06, "loss": 1.11917534, "memory(GiB)": 142.32, "step": 16860, "train_speed(iter/s)": 0.290147 }, { "acc": 0.70988407, "epoch": 0.1888019103889985, "grad_norm": 5.5625, "learning_rate": 9.946202417891972e-06, "loss": 1.16669292, "memory(GiB)": 142.32, "step": 16880, "train_speed(iter/s)": 0.290273 }, { "acc": 0.72918749, "epoch": 0.18902560933495702, "grad_norm": 5.125, "learning_rate": 9.94593151195869e-06, "loss": 1.06749706, "memory(GiB)": 142.32, "step": 16900, "train_speed(iter/s)": 0.290384 }, { "acc": 0.71984301, "epoch": 0.18924930828091555, "grad_norm": 5.28125, "learning_rate": 9.945659929351282e-06, "loss": 1.11763096, "memory(GiB)": 142.32, "step": 16920, "train_speed(iter/s)": 0.290495 }, { "acc": 0.7185605, "epoch": 0.18947300722687407, "grad_norm": 6.59375, "learning_rate": 9.945387670106905e-06, "loss": 1.12439804, "memory(GiB)": 142.32, "step": 16940, "train_speed(iter/s)": 0.290597 }, { "acc": 0.71795235, "epoch": 0.1896967061728326, "grad_norm": 5.65625, "learning_rate": 9.94511473426281e-06, "loss": 1.12703333, "memory(GiB)": 142.32, "step": 16960, "train_speed(iter/s)": 0.290709 }, { "acc": 0.695154, "epoch": 0.18992040511879113, "grad_norm": 4.90625, "learning_rate": 9.944841121856337e-06, "loss": 1.23986187, "memory(GiB)": 142.32, "step": 16980, "train_speed(iter/s)": 0.290821 }, { "acc": 0.71700811, "epoch": 0.19014410406474966, "grad_norm": 6.125, "learning_rate": 9.944566832924922e-06, "loss": 1.14218645, "memory(GiB)": 142.32, "step": 17000, "train_speed(iter/s)": 0.290933 }, { "acc": 0.71502094, "epoch": 0.1903678030107082, "grad_norm": 5.59375, "learning_rate": 9.944291867506089e-06, "loss": 1.12551641, "memory(GiB)": 142.32, "step": 17020, "train_speed(iter/s)": 0.29105 }, { "acc": 0.71705103, "epoch": 0.19059150195666671, "grad_norm": 5.5625, "learning_rate": 9.944016225637458e-06, "loss": 1.13101044, "memory(GiB)": 142.32, "step": 17040, "train_speed(iter/s)": 0.291163 }, { "acc": 0.72297602, "epoch": 0.19081520090262524, "grad_norm": 5.40625, "learning_rate": 9.943739907356743e-06, "loss": 1.11755772, "memory(GiB)": 142.32, "step": 17060, "train_speed(iter/s)": 0.291269 }, { "acc": 0.71448202, "epoch": 0.19103889984858377, "grad_norm": 5.6875, "learning_rate": 9.943462912701743e-06, "loss": 1.12903795, "memory(GiB)": 142.32, "step": 17080, "train_speed(iter/s)": 0.291378 }, { "acc": 0.71386642, "epoch": 0.1912625987945423, "grad_norm": 4.78125, "learning_rate": 9.943185241710361e-06, "loss": 1.13111553, "memory(GiB)": 142.32, "step": 17100, "train_speed(iter/s)": 0.291486 }, { "acc": 0.7138525, "epoch": 0.19148629774050083, "grad_norm": 6.1875, "learning_rate": 9.942906894420582e-06, "loss": 1.12776575, "memory(GiB)": 142.32, "step": 17120, "train_speed(iter/s)": 0.291607 }, { "acc": 0.70288296, "epoch": 0.19170999668645936, "grad_norm": 5.09375, "learning_rate": 9.94262787087049e-06, "loss": 1.19603624, "memory(GiB)": 142.32, "step": 17140, "train_speed(iter/s)": 0.291716 }, { "acc": 0.71860523, "epoch": 0.19193369563241788, "grad_norm": 5.5625, "learning_rate": 9.942348171098258e-06, "loss": 1.11899261, "memory(GiB)": 142.32, "step": 17160, "train_speed(iter/s)": 0.291835 }, { "acc": 0.72287169, "epoch": 0.1921573945783764, "grad_norm": 6.125, "learning_rate": 9.942067795142154e-06, "loss": 1.11393614, "memory(GiB)": 142.32, "step": 17180, "train_speed(iter/s)": 0.291953 }, { "acc": 0.71532083, "epoch": 0.19238109352433494, "grad_norm": 6.6875, "learning_rate": 9.941786743040537e-06, "loss": 1.12906113, "memory(GiB)": 142.32, "step": 17200, "train_speed(iter/s)": 0.292066 }, { "acc": 0.71971488, "epoch": 0.19260479247029347, "grad_norm": 6.3125, "learning_rate": 9.941505014831862e-06, "loss": 1.11252537, "memory(GiB)": 142.32, "step": 17220, "train_speed(iter/s)": 0.292185 }, { "acc": 0.707651, "epoch": 0.192828491416252, "grad_norm": 6.625, "learning_rate": 9.941222610554668e-06, "loss": 1.18706799, "memory(GiB)": 142.32, "step": 17240, "train_speed(iter/s)": 0.292296 }, { "acc": 0.72337418, "epoch": 0.19305219036221052, "grad_norm": 6.28125, "learning_rate": 9.940939530247595e-06, "loss": 1.09913845, "memory(GiB)": 142.32, "step": 17260, "train_speed(iter/s)": 0.292409 }, { "acc": 0.70588951, "epoch": 0.19327588930816908, "grad_norm": 9.375, "learning_rate": 9.940655773949372e-06, "loss": 1.18685932, "memory(GiB)": 142.32, "step": 17280, "train_speed(iter/s)": 0.292518 }, { "acc": 0.71997366, "epoch": 0.1934995882541276, "grad_norm": 6.1875, "learning_rate": 9.94037134169882e-06, "loss": 1.12568054, "memory(GiB)": 142.32, "step": 17300, "train_speed(iter/s)": 0.292621 }, { "acc": 0.72071991, "epoch": 0.19372328720008614, "grad_norm": 5.65625, "learning_rate": 9.940086233534856e-06, "loss": 1.12250557, "memory(GiB)": 142.32, "step": 17320, "train_speed(iter/s)": 0.292737 }, { "acc": 0.72173262, "epoch": 0.19394698614604466, "grad_norm": 5.96875, "learning_rate": 9.939800449496484e-06, "loss": 1.10655537, "memory(GiB)": 142.32, "step": 17340, "train_speed(iter/s)": 0.29284 }, { "acc": 0.70543208, "epoch": 0.1941706850920032, "grad_norm": 6.40625, "learning_rate": 9.939513989622805e-06, "loss": 1.17955666, "memory(GiB)": 142.32, "step": 17360, "train_speed(iter/s)": 0.292959 }, { "acc": 0.72275839, "epoch": 0.19439438403796172, "grad_norm": 5.875, "learning_rate": 9.939226853953009e-06, "loss": 1.10843201, "memory(GiB)": 142.32, "step": 17380, "train_speed(iter/s)": 0.293072 }, { "acc": 0.72885499, "epoch": 0.19461808298392025, "grad_norm": 6.21875, "learning_rate": 9.938939042526382e-06, "loss": 1.06798019, "memory(GiB)": 142.32, "step": 17400, "train_speed(iter/s)": 0.293188 }, { "acc": 0.72513409, "epoch": 0.19484178192987878, "grad_norm": 6.15625, "learning_rate": 9.9386505553823e-06, "loss": 1.10736961, "memory(GiB)": 142.32, "step": 17420, "train_speed(iter/s)": 0.293296 }, { "acc": 0.71709905, "epoch": 0.1950654808758373, "grad_norm": 5.1875, "learning_rate": 9.938361392560235e-06, "loss": 1.1325737, "memory(GiB)": 142.32, "step": 17440, "train_speed(iter/s)": 0.293413 }, { "acc": 0.71611338, "epoch": 0.19528917982179583, "grad_norm": 5.375, "learning_rate": 9.938071554099745e-06, "loss": 1.13586597, "memory(GiB)": 142.32, "step": 17460, "train_speed(iter/s)": 0.293521 }, { "acc": 0.71500378, "epoch": 0.19551287876775436, "grad_norm": 6.0, "learning_rate": 9.937781040040484e-06, "loss": 1.16113377, "memory(GiB)": 142.32, "step": 17480, "train_speed(iter/s)": 0.293626 }, { "acc": 0.72533345, "epoch": 0.1957365777137129, "grad_norm": 6.46875, "learning_rate": 9.9374898504222e-06, "loss": 1.08834019, "memory(GiB)": 142.32, "step": 17500, "train_speed(iter/s)": 0.293745 }, { "acc": 0.71723242, "epoch": 0.19596027665967142, "grad_norm": 5.75, "learning_rate": 9.937197985284732e-06, "loss": 1.12464142, "memory(GiB)": 142.32, "step": 17520, "train_speed(iter/s)": 0.293858 }, { "acc": 0.71351566, "epoch": 0.19618397560562995, "grad_norm": 5.875, "learning_rate": 9.93690544466801e-06, "loss": 1.14872456, "memory(GiB)": 142.32, "step": 17540, "train_speed(iter/s)": 0.293962 }, { "acc": 0.70843725, "epoch": 0.19640767455158847, "grad_norm": 4.625, "learning_rate": 9.936612228612058e-06, "loss": 1.18376503, "memory(GiB)": 142.32, "step": 17560, "train_speed(iter/s)": 0.294063 }, { "acc": 0.71967916, "epoch": 0.196631373497547, "grad_norm": 6.25, "learning_rate": 9.936318337156993e-06, "loss": 1.10646057, "memory(GiB)": 142.32, "step": 17580, "train_speed(iter/s)": 0.294158 }, { "acc": 0.72235212, "epoch": 0.19685507244350553, "grad_norm": 5.25, "learning_rate": 9.936023770343024e-06, "loss": 1.10716362, "memory(GiB)": 142.32, "step": 17600, "train_speed(iter/s)": 0.294269 }, { "acc": 0.71137624, "epoch": 0.19707877138946406, "grad_norm": 6.4375, "learning_rate": 9.935728528210451e-06, "loss": 1.14983988, "memory(GiB)": 142.32, "step": 17620, "train_speed(iter/s)": 0.294383 }, { "acc": 0.71520324, "epoch": 0.1973024703354226, "grad_norm": 6.03125, "learning_rate": 9.935432610799667e-06, "loss": 1.13436785, "memory(GiB)": 142.32, "step": 17640, "train_speed(iter/s)": 0.294492 }, { "acc": 0.71956034, "epoch": 0.19752616928138111, "grad_norm": 6.65625, "learning_rate": 9.93513601815116e-06, "loss": 1.13322563, "memory(GiB)": 142.32, "step": 17660, "train_speed(iter/s)": 0.294601 }, { "acc": 0.72925014, "epoch": 0.19774986822733964, "grad_norm": 5.78125, "learning_rate": 9.934838750305504e-06, "loss": 1.07291756, "memory(GiB)": 142.32, "step": 17680, "train_speed(iter/s)": 0.294716 }, { "acc": 0.71403403, "epoch": 0.19797356717329817, "grad_norm": 6.03125, "learning_rate": 9.934540807303372e-06, "loss": 1.15159054, "memory(GiB)": 142.32, "step": 17700, "train_speed(iter/s)": 0.294814 }, { "acc": 0.71297245, "epoch": 0.1981972661192567, "grad_norm": 5.5, "learning_rate": 9.934242189185527e-06, "loss": 1.14196711, "memory(GiB)": 142.32, "step": 17720, "train_speed(iter/s)": 0.294925 }, { "acc": 0.72139144, "epoch": 0.19842096506521523, "grad_norm": 6.34375, "learning_rate": 9.933942895992825e-06, "loss": 1.11067705, "memory(GiB)": 142.32, "step": 17740, "train_speed(iter/s)": 0.295028 }, { "acc": 0.71692371, "epoch": 0.19864466401117375, "grad_norm": 5.09375, "learning_rate": 9.933642927766215e-06, "loss": 1.13158398, "memory(GiB)": 142.32, "step": 17760, "train_speed(iter/s)": 0.295133 }, { "acc": 0.72457094, "epoch": 0.19886836295713228, "grad_norm": 5.75, "learning_rate": 9.93334228454673e-06, "loss": 1.08977852, "memory(GiB)": 142.32, "step": 17780, "train_speed(iter/s)": 0.295246 }, { "acc": 0.71397338, "epoch": 0.1990920619030908, "grad_norm": 6.3125, "learning_rate": 9.933040966375508e-06, "loss": 1.15205956, "memory(GiB)": 142.32, "step": 17800, "train_speed(iter/s)": 0.295346 }, { "acc": 0.716222, "epoch": 0.19931576084904934, "grad_norm": 7.5625, "learning_rate": 9.932738973293773e-06, "loss": 1.13825474, "memory(GiB)": 142.32, "step": 17820, "train_speed(iter/s)": 0.295445 }, { "acc": 0.71517167, "epoch": 0.19953945979500787, "grad_norm": 6.1875, "learning_rate": 9.932436305342842e-06, "loss": 1.141113, "memory(GiB)": 142.32, "step": 17840, "train_speed(iter/s)": 0.295554 }, { "acc": 0.71729555, "epoch": 0.1997631587409664, "grad_norm": 5.5, "learning_rate": 9.932132962564121e-06, "loss": 1.13011131, "memory(GiB)": 142.32, "step": 17860, "train_speed(iter/s)": 0.295658 }, { "acc": 0.72036619, "epoch": 0.19998685768692492, "grad_norm": 5.84375, "learning_rate": 9.931828944999116e-06, "loss": 1.10368195, "memory(GiB)": 142.32, "step": 17880, "train_speed(iter/s)": 0.295759 }, { "acc": 0.7257525, "epoch": 0.20021055663288345, "grad_norm": 6.5625, "learning_rate": 9.931524252689419e-06, "loss": 1.09310503, "memory(GiB)": 142.32, "step": 17900, "train_speed(iter/s)": 0.295877 }, { "acc": 0.70843863, "epoch": 0.200434255578842, "grad_norm": 5.03125, "learning_rate": 9.931218885676718e-06, "loss": 1.17813988, "memory(GiB)": 142.32, "step": 17920, "train_speed(iter/s)": 0.295973 }, { "acc": 0.7200263, "epoch": 0.20065795452480054, "grad_norm": 5.59375, "learning_rate": 9.93091284400279e-06, "loss": 1.12435818, "memory(GiB)": 142.32, "step": 17940, "train_speed(iter/s)": 0.296079 }, { "acc": 0.72673774, "epoch": 0.20088165347075906, "grad_norm": 5.9375, "learning_rate": 9.930606127709503e-06, "loss": 1.08067513, "memory(GiB)": 142.32, "step": 17960, "train_speed(iter/s)": 0.296189 }, { "acc": 0.72318172, "epoch": 0.2011053524167176, "grad_norm": 5.09375, "learning_rate": 9.930298736838826e-06, "loss": 1.1004982, "memory(GiB)": 142.32, "step": 17980, "train_speed(iter/s)": 0.296287 }, { "acc": 0.71453075, "epoch": 0.20132905136267612, "grad_norm": 4.9375, "learning_rate": 9.92999067143281e-06, "loss": 1.16814384, "memory(GiB)": 142.32, "step": 18000, "train_speed(iter/s)": 0.296375 }, { "epoch": 0.20132905136267612, "eval_acc": 0.6826790089715968, "eval_loss": 1.1306874752044678, "eval_runtime": 2338.786, "eval_samples_per_second": 32.189, "eval_steps_per_second": 16.095, "step": 18000 }, { "acc": 0.72036638, "epoch": 0.20155275030863465, "grad_norm": 5.28125, "learning_rate": 9.929681931533605e-06, "loss": 1.11137094, "memory(GiB)": 142.32, "step": 18020, "train_speed(iter/s)": 0.285267 }, { "acc": 0.7146687, "epoch": 0.20177644925459318, "grad_norm": 5.40625, "learning_rate": 9.92937251718345e-06, "loss": 1.14326172, "memory(GiB)": 142.32, "step": 18040, "train_speed(iter/s)": 0.285382 }, { "acc": 0.71114349, "epoch": 0.2020001482005517, "grad_norm": 6.125, "learning_rate": 9.929062428424678e-06, "loss": 1.15369873, "memory(GiB)": 142.32, "step": 18060, "train_speed(iter/s)": 0.285491 }, { "acc": 0.70784769, "epoch": 0.20222384714651023, "grad_norm": 6.0625, "learning_rate": 9.928751665299714e-06, "loss": 1.17566872, "memory(GiB)": 142.32, "step": 18080, "train_speed(iter/s)": 0.285594 }, { "acc": 0.71644135, "epoch": 0.20244754609246876, "grad_norm": 6.78125, "learning_rate": 9.928440227851072e-06, "loss": 1.13226252, "memory(GiB)": 142.32, "step": 18100, "train_speed(iter/s)": 0.285709 }, { "acc": 0.72590466, "epoch": 0.2026712450384273, "grad_norm": 5.40625, "learning_rate": 9.928128116121365e-06, "loss": 1.09662113, "memory(GiB)": 142.32, "step": 18120, "train_speed(iter/s)": 0.285815 }, { "acc": 0.71479993, "epoch": 0.20289494398438582, "grad_norm": 5.40625, "learning_rate": 9.927815330153291e-06, "loss": 1.12967186, "memory(GiB)": 142.32, "step": 18140, "train_speed(iter/s)": 0.285904 }, { "acc": 0.72452707, "epoch": 0.20311864293034435, "grad_norm": 5.65625, "learning_rate": 9.927501869989648e-06, "loss": 1.10266991, "memory(GiB)": 142.32, "step": 18160, "train_speed(iter/s)": 0.286015 }, { "acc": 0.71460848, "epoch": 0.20334234187630287, "grad_norm": 5.84375, "learning_rate": 9.927187735673315e-06, "loss": 1.12837029, "memory(GiB)": 142.32, "step": 18180, "train_speed(iter/s)": 0.286115 }, { "acc": 0.72039528, "epoch": 0.2035660408222614, "grad_norm": 7.0625, "learning_rate": 9.926872927247277e-06, "loss": 1.13664398, "memory(GiB)": 142.32, "step": 18200, "train_speed(iter/s)": 0.286223 }, { "acc": 0.7282012, "epoch": 0.20378973976821993, "grad_norm": 6.8125, "learning_rate": 9.926557444754601e-06, "loss": 1.0743042, "memory(GiB)": 142.32, "step": 18220, "train_speed(iter/s)": 0.286328 }, { "acc": 0.71085463, "epoch": 0.20401343871417846, "grad_norm": 5.6875, "learning_rate": 9.92624128823845e-06, "loss": 1.16364403, "memory(GiB)": 142.32, "step": 18240, "train_speed(iter/s)": 0.286432 }, { "acc": 0.72053137, "epoch": 0.20423713766013699, "grad_norm": 5.8125, "learning_rate": 9.925924457742078e-06, "loss": 1.10786409, "memory(GiB)": 142.32, "step": 18260, "train_speed(iter/s)": 0.286526 }, { "acc": 0.71613102, "epoch": 0.20446083660609551, "grad_norm": 5.53125, "learning_rate": 9.925606953308831e-06, "loss": 1.1271863, "memory(GiB)": 142.32, "step": 18280, "train_speed(iter/s)": 0.286637 }, { "acc": 0.7125957, "epoch": 0.20468453555205404, "grad_norm": 5.90625, "learning_rate": 9.925288774982151e-06, "loss": 1.17668362, "memory(GiB)": 142.32, "step": 18300, "train_speed(iter/s)": 0.286738 }, { "acc": 0.72749653, "epoch": 0.20490823449801257, "grad_norm": 5.71875, "learning_rate": 9.92496992280557e-06, "loss": 1.07601633, "memory(GiB)": 142.32, "step": 18320, "train_speed(iter/s)": 0.286845 }, { "acc": 0.71900129, "epoch": 0.2051319334439711, "grad_norm": 6.1875, "learning_rate": 9.924650396822706e-06, "loss": 1.1371397, "memory(GiB)": 142.32, "step": 18340, "train_speed(iter/s)": 0.286945 }, { "acc": 0.70394039, "epoch": 0.20535563238992963, "grad_norm": 5.59375, "learning_rate": 9.92433019707728e-06, "loss": 1.20161057, "memory(GiB)": 142.32, "step": 18360, "train_speed(iter/s)": 0.287038 }, { "acc": 0.7277667, "epoch": 0.20557933133588815, "grad_norm": 6.53125, "learning_rate": 9.924009323613098e-06, "loss": 1.08171721, "memory(GiB)": 142.32, "step": 18380, "train_speed(iter/s)": 0.287131 }, { "acc": 0.72643976, "epoch": 0.20580303028184668, "grad_norm": 5.65625, "learning_rate": 9.92368777647406e-06, "loss": 1.0957592, "memory(GiB)": 142.32, "step": 18400, "train_speed(iter/s)": 0.287231 }, { "acc": 0.72543926, "epoch": 0.2060267292278052, "grad_norm": 5.65625, "learning_rate": 9.923365555704159e-06, "loss": 1.09040222, "memory(GiB)": 142.32, "step": 18420, "train_speed(iter/s)": 0.287337 }, { "acc": 0.71510792, "epoch": 0.20625042817376374, "grad_norm": 5.96875, "learning_rate": 9.923042661347477e-06, "loss": 1.15655203, "memory(GiB)": 142.32, "step": 18440, "train_speed(iter/s)": 0.287419 }, { "acc": 0.71617146, "epoch": 0.20647412711972227, "grad_norm": 6.15625, "learning_rate": 9.922719093448194e-06, "loss": 1.14802256, "memory(GiB)": 142.32, "step": 18460, "train_speed(iter/s)": 0.287525 }, { "acc": 0.71382899, "epoch": 0.2066978260656808, "grad_norm": 5.53125, "learning_rate": 9.92239485205058e-06, "loss": 1.15372334, "memory(GiB)": 142.32, "step": 18480, "train_speed(iter/s)": 0.287624 }, { "acc": 0.71310949, "epoch": 0.20692152501163932, "grad_norm": 6.09375, "learning_rate": 9.922069937198987e-06, "loss": 1.1403141, "memory(GiB)": 142.32, "step": 18500, "train_speed(iter/s)": 0.287713 }, { "acc": 0.72482777, "epoch": 0.20714522395759785, "grad_norm": 6.375, "learning_rate": 9.921744348937878e-06, "loss": 1.110394, "memory(GiB)": 142.32, "step": 18520, "train_speed(iter/s)": 0.287813 }, { "acc": 0.72599764, "epoch": 0.2073689229035564, "grad_norm": 5.5625, "learning_rate": 9.921418087311794e-06, "loss": 1.10126801, "memory(GiB)": 142.32, "step": 18540, "train_speed(iter/s)": 0.287918 }, { "acc": 0.72575412, "epoch": 0.20759262184951494, "grad_norm": 5.34375, "learning_rate": 9.92109115236537e-06, "loss": 1.08532295, "memory(GiB)": 142.32, "step": 18560, "train_speed(iter/s)": 0.288028 }, { "acc": 0.71446285, "epoch": 0.20781632079547346, "grad_norm": 5.8125, "learning_rate": 9.920763544143339e-06, "loss": 1.13677664, "memory(GiB)": 142.32, "step": 18580, "train_speed(iter/s)": 0.288137 }, { "acc": 0.71196213, "epoch": 0.208040019741432, "grad_norm": 5.25, "learning_rate": 9.920435262690523e-06, "loss": 1.15620174, "memory(GiB)": 142.32, "step": 18600, "train_speed(iter/s)": 0.28824 }, { "acc": 0.72548485, "epoch": 0.20826371868739052, "grad_norm": 5.6875, "learning_rate": 9.92010630805183e-06, "loss": 1.08243637, "memory(GiB)": 142.32, "step": 18620, "train_speed(iter/s)": 0.288336 }, { "acc": 0.71401377, "epoch": 0.20848741763334905, "grad_norm": 5.9375, "learning_rate": 9.919776680272272e-06, "loss": 1.14520092, "memory(GiB)": 142.32, "step": 18640, "train_speed(iter/s)": 0.288449 }, { "acc": 0.72399044, "epoch": 0.20871111657930758, "grad_norm": 5.78125, "learning_rate": 9.919446379396946e-06, "loss": 1.1000288, "memory(GiB)": 142.32, "step": 18660, "train_speed(iter/s)": 0.288544 }, { "acc": 0.71391897, "epoch": 0.2089348155252661, "grad_norm": 6.21875, "learning_rate": 9.919115405471039e-06, "loss": 1.13682098, "memory(GiB)": 142.32, "step": 18680, "train_speed(iter/s)": 0.288653 }, { "acc": 0.72203665, "epoch": 0.20915851447122463, "grad_norm": 5.09375, "learning_rate": 9.918783758539833e-06, "loss": 1.10142117, "memory(GiB)": 142.32, "step": 18700, "train_speed(iter/s)": 0.288752 }, { "acc": 0.72346835, "epoch": 0.20938221341718316, "grad_norm": 5.0, "learning_rate": 9.918451438648705e-06, "loss": 1.12463131, "memory(GiB)": 142.32, "step": 18720, "train_speed(iter/s)": 0.288861 }, { "acc": 0.72910299, "epoch": 0.2096059123631417, "grad_norm": 5.34375, "learning_rate": 9.918118445843117e-06, "loss": 1.06907272, "memory(GiB)": 142.32, "step": 18740, "train_speed(iter/s)": 0.288961 }, { "acc": 0.71934252, "epoch": 0.20982961130910022, "grad_norm": 5.53125, "learning_rate": 9.91778478016863e-06, "loss": 1.12354565, "memory(GiB)": 142.32, "step": 18760, "train_speed(iter/s)": 0.289053 }, { "acc": 0.71963329, "epoch": 0.21005331025505874, "grad_norm": 6.09375, "learning_rate": 9.917450441670895e-06, "loss": 1.1194828, "memory(GiB)": 142.32, "step": 18780, "train_speed(iter/s)": 0.289166 }, { "acc": 0.71648965, "epoch": 0.21027700920101727, "grad_norm": 5.28125, "learning_rate": 9.917115430395651e-06, "loss": 1.13416996, "memory(GiB)": 142.32, "step": 18800, "train_speed(iter/s)": 0.289272 }, { "acc": 0.71146107, "epoch": 0.2105007081469758, "grad_norm": 7.28125, "learning_rate": 9.916779746388737e-06, "loss": 1.16177845, "memory(GiB)": 142.32, "step": 18820, "train_speed(iter/s)": 0.289377 }, { "acc": 0.72327547, "epoch": 0.21072440709293433, "grad_norm": 4.65625, "learning_rate": 9.916443389696076e-06, "loss": 1.10530024, "memory(GiB)": 142.32, "step": 18840, "train_speed(iter/s)": 0.289486 }, { "acc": 0.72563438, "epoch": 0.21094810603889286, "grad_norm": 6.5625, "learning_rate": 9.916106360363687e-06, "loss": 1.08692856, "memory(GiB)": 142.32, "step": 18860, "train_speed(iter/s)": 0.28959 }, { "acc": 0.71666012, "epoch": 0.21117180498485139, "grad_norm": 6.09375, "learning_rate": 9.915768658437678e-06, "loss": 1.13364811, "memory(GiB)": 142.32, "step": 18880, "train_speed(iter/s)": 0.289695 }, { "acc": 0.71023989, "epoch": 0.2113955039308099, "grad_norm": 5.40625, "learning_rate": 9.915430283964259e-06, "loss": 1.16582222, "memory(GiB)": 142.32, "step": 18900, "train_speed(iter/s)": 0.289796 }, { "acc": 0.7194232, "epoch": 0.21161920287676844, "grad_norm": 6.375, "learning_rate": 9.915091236989715e-06, "loss": 1.10381546, "memory(GiB)": 142.32, "step": 18920, "train_speed(iter/s)": 0.2899 }, { "acc": 0.72089081, "epoch": 0.21184290182272697, "grad_norm": 4.96875, "learning_rate": 9.914751517560439e-06, "loss": 1.1120903, "memory(GiB)": 142.32, "step": 18940, "train_speed(iter/s)": 0.290009 }, { "acc": 0.71526041, "epoch": 0.2120666007686855, "grad_norm": 7.3125, "learning_rate": 9.914411125722908e-06, "loss": 1.15144291, "memory(GiB)": 142.32, "step": 18960, "train_speed(iter/s)": 0.290125 }, { "acc": 0.72684278, "epoch": 0.21229029971464403, "grad_norm": 5.96875, "learning_rate": 9.91407006152369e-06, "loss": 1.08212795, "memory(GiB)": 142.32, "step": 18980, "train_speed(iter/s)": 0.29022 }, { "acc": 0.71249824, "epoch": 0.21251399866060255, "grad_norm": 5.5, "learning_rate": 9.91372832500945e-06, "loss": 1.13843956, "memory(GiB)": 142.32, "step": 19000, "train_speed(iter/s)": 0.290317 }, { "acc": 0.70840111, "epoch": 0.21273769760656108, "grad_norm": 6.46875, "learning_rate": 9.913385916226941e-06, "loss": 1.16124582, "memory(GiB)": 142.32, "step": 19020, "train_speed(iter/s)": 0.290414 }, { "acc": 0.70295877, "epoch": 0.2129613965525196, "grad_norm": 6.125, "learning_rate": 9.913042835223012e-06, "loss": 1.21265907, "memory(GiB)": 142.32, "step": 19040, "train_speed(iter/s)": 0.290507 }, { "acc": 0.72024412, "epoch": 0.21318509549847814, "grad_norm": 5.15625, "learning_rate": 9.912699082044599e-06, "loss": 1.0976265, "memory(GiB)": 142.32, "step": 19060, "train_speed(iter/s)": 0.290611 }, { "acc": 0.70235443, "epoch": 0.21340879444443667, "grad_norm": 5.28125, "learning_rate": 9.912354656738731e-06, "loss": 1.18040142, "memory(GiB)": 142.32, "step": 19080, "train_speed(iter/s)": 0.290727 }, { "acc": 0.71882229, "epoch": 0.2136324933903952, "grad_norm": 5.75, "learning_rate": 9.912009559352536e-06, "loss": 1.13652506, "memory(GiB)": 142.32, "step": 19100, "train_speed(iter/s)": 0.290829 }, { "acc": 0.70796375, "epoch": 0.21385619233635372, "grad_norm": 5.5625, "learning_rate": 9.911663789933222e-06, "loss": 1.17350368, "memory(GiB)": 142.32, "step": 19120, "train_speed(iter/s)": 0.290938 }, { "acc": 0.72076612, "epoch": 0.21407989128231225, "grad_norm": 6.0, "learning_rate": 9.911317348528097e-06, "loss": 1.12080345, "memory(GiB)": 142.32, "step": 19140, "train_speed(iter/s)": 0.29105 }, { "acc": 0.71636534, "epoch": 0.21430359022827078, "grad_norm": 6.21875, "learning_rate": 9.910970235184561e-06, "loss": 1.13834229, "memory(GiB)": 142.32, "step": 19160, "train_speed(iter/s)": 0.291161 }, { "acc": 0.71120591, "epoch": 0.21452728917422934, "grad_norm": 5.0, "learning_rate": 9.910622449950102e-06, "loss": 1.15858068, "memory(GiB)": 142.32, "step": 19180, "train_speed(iter/s)": 0.291268 }, { "acc": 0.72368231, "epoch": 0.21475098812018786, "grad_norm": 6.4375, "learning_rate": 9.910273992872305e-06, "loss": 1.11251774, "memory(GiB)": 142.32, "step": 19200, "train_speed(iter/s)": 0.291354 }, { "acc": 0.71772871, "epoch": 0.2149746870661464, "grad_norm": 4.65625, "learning_rate": 9.90992486399884e-06, "loss": 1.12394924, "memory(GiB)": 142.32, "step": 19220, "train_speed(iter/s)": 0.291447 }, { "acc": 0.72258053, "epoch": 0.21519838601210492, "grad_norm": 6.28125, "learning_rate": 9.909575063377474e-06, "loss": 1.08080063, "memory(GiB)": 142.32, "step": 19240, "train_speed(iter/s)": 0.291555 }, { "acc": 0.71665401, "epoch": 0.21542208495806345, "grad_norm": 6.15625, "learning_rate": 9.909224591056068e-06, "loss": 1.13205242, "memory(GiB)": 142.32, "step": 19260, "train_speed(iter/s)": 0.291653 }, { "acc": 0.70487514, "epoch": 0.21564578390402198, "grad_norm": 5.125, "learning_rate": 9.908873447082567e-06, "loss": 1.17858362, "memory(GiB)": 142.32, "step": 19280, "train_speed(iter/s)": 0.291758 }, { "acc": 0.72686815, "epoch": 0.2158694828499805, "grad_norm": 5.4375, "learning_rate": 9.908521631505015e-06, "loss": 1.07729092, "memory(GiB)": 142.32, "step": 19300, "train_speed(iter/s)": 0.291851 }, { "acc": 0.72622538, "epoch": 0.21609318179593903, "grad_norm": 6.0, "learning_rate": 9.908169144371544e-06, "loss": 1.10326805, "memory(GiB)": 142.32, "step": 19320, "train_speed(iter/s)": 0.291951 }, { "acc": 0.72409687, "epoch": 0.21631688074189756, "grad_norm": 5.0625, "learning_rate": 9.90781598573038e-06, "loss": 1.11754446, "memory(GiB)": 142.32, "step": 19340, "train_speed(iter/s)": 0.292051 }, { "acc": 0.72511311, "epoch": 0.2165405796878561, "grad_norm": 5.0, "learning_rate": 9.907462155629841e-06, "loss": 1.09477386, "memory(GiB)": 142.32, "step": 19360, "train_speed(iter/s)": 0.292144 }, { "acc": 0.72372317, "epoch": 0.21676427863381462, "grad_norm": 6.75, "learning_rate": 9.907107654118337e-06, "loss": 1.10254822, "memory(GiB)": 142.32, "step": 19380, "train_speed(iter/s)": 0.292239 }, { "acc": 0.72138839, "epoch": 0.21698797757977314, "grad_norm": 6.46875, "learning_rate": 9.906752481244366e-06, "loss": 1.09304752, "memory(GiB)": 142.32, "step": 19400, "train_speed(iter/s)": 0.292335 }, { "acc": 0.71348643, "epoch": 0.21721167652573167, "grad_norm": 6.40625, "learning_rate": 9.906396637056522e-06, "loss": 1.15465488, "memory(GiB)": 142.32, "step": 19420, "train_speed(iter/s)": 0.292432 }, { "acc": 0.71957312, "epoch": 0.2174353754716902, "grad_norm": 6.25, "learning_rate": 9.906040121603488e-06, "loss": 1.1214263, "memory(GiB)": 142.32, "step": 19440, "train_speed(iter/s)": 0.292526 }, { "acc": 0.72586975, "epoch": 0.21765907441764873, "grad_norm": 6.25, "learning_rate": 9.905682934934042e-06, "loss": 1.08869915, "memory(GiB)": 142.32, "step": 19460, "train_speed(iter/s)": 0.292627 }, { "acc": 0.71644993, "epoch": 0.21788277336360726, "grad_norm": 5.1875, "learning_rate": 9.905325077097054e-06, "loss": 1.13262711, "memory(GiB)": 142.32, "step": 19480, "train_speed(iter/s)": 0.29272 }, { "acc": 0.71383629, "epoch": 0.21810647230956579, "grad_norm": 7.625, "learning_rate": 9.904966548141481e-06, "loss": 1.13841171, "memory(GiB)": 142.32, "step": 19500, "train_speed(iter/s)": 0.292817 }, { "acc": 0.73151565, "epoch": 0.2183301712555243, "grad_norm": 5.5, "learning_rate": 9.904607348116378e-06, "loss": 1.06202564, "memory(GiB)": 142.32, "step": 19520, "train_speed(iter/s)": 0.292917 }, { "acc": 0.70316734, "epoch": 0.21855387020148284, "grad_norm": 5.375, "learning_rate": 9.904247477070883e-06, "loss": 1.21777048, "memory(GiB)": 142.32, "step": 19540, "train_speed(iter/s)": 0.293013 }, { "acc": 0.7209671, "epoch": 0.21877756914744137, "grad_norm": 5.8125, "learning_rate": 9.90388693505424e-06, "loss": 1.10363607, "memory(GiB)": 142.32, "step": 19560, "train_speed(iter/s)": 0.29311 }, { "acc": 0.71994781, "epoch": 0.2190012680933999, "grad_norm": 4.78125, "learning_rate": 9.903525722115768e-06, "loss": 1.1236989, "memory(GiB)": 142.32, "step": 19580, "train_speed(iter/s)": 0.293203 }, { "acc": 0.71377201, "epoch": 0.21922496703935843, "grad_norm": 4.78125, "learning_rate": 9.90316383830489e-06, "loss": 1.15153675, "memory(GiB)": 142.32, "step": 19600, "train_speed(iter/s)": 0.293302 }, { "acc": 0.71571164, "epoch": 0.21944866598531695, "grad_norm": 5.53125, "learning_rate": 9.902801283671118e-06, "loss": 1.12966805, "memory(GiB)": 142.32, "step": 19620, "train_speed(iter/s)": 0.293406 }, { "acc": 0.7158783, "epoch": 0.21967236493127548, "grad_norm": 6.4375, "learning_rate": 9.902438058264052e-06, "loss": 1.13695736, "memory(GiB)": 142.32, "step": 19640, "train_speed(iter/s)": 0.293504 }, { "acc": 0.71363134, "epoch": 0.219896063877234, "grad_norm": 6.625, "learning_rate": 9.902074162133389e-06, "loss": 1.1589057, "memory(GiB)": 142.32, "step": 19660, "train_speed(iter/s)": 0.293613 }, { "acc": 0.72864695, "epoch": 0.22011976282319254, "grad_norm": 5.96875, "learning_rate": 9.901709595328913e-06, "loss": 1.08230495, "memory(GiB)": 142.32, "step": 19680, "train_speed(iter/s)": 0.293708 }, { "acc": 0.7215785, "epoch": 0.22034346176915107, "grad_norm": 5.78125, "learning_rate": 9.901344357900502e-06, "loss": 1.09309883, "memory(GiB)": 142.32, "step": 19700, "train_speed(iter/s)": 0.293817 }, { "acc": 0.72029667, "epoch": 0.2205671607151096, "grad_norm": 5.78125, "learning_rate": 9.900978449898127e-06, "loss": 1.12792988, "memory(GiB)": 142.32, "step": 19720, "train_speed(iter/s)": 0.293914 }, { "acc": 0.71871548, "epoch": 0.22079085966106812, "grad_norm": 6.46875, "learning_rate": 9.900611871371848e-06, "loss": 1.12310286, "memory(GiB)": 142.32, "step": 19740, "train_speed(iter/s)": 0.294009 }, { "acc": 0.7095789, "epoch": 0.22101455860702665, "grad_norm": 4.90625, "learning_rate": 9.900244622371821e-06, "loss": 1.1623579, "memory(GiB)": 142.32, "step": 19760, "train_speed(iter/s)": 0.29412 }, { "acc": 0.72875142, "epoch": 0.22123825755298518, "grad_norm": 5.75, "learning_rate": 9.899876702948288e-06, "loss": 1.07186413, "memory(GiB)": 142.32, "step": 19780, "train_speed(iter/s)": 0.294221 }, { "acc": 0.71462569, "epoch": 0.22146195649894374, "grad_norm": 6.375, "learning_rate": 9.899508113151588e-06, "loss": 1.13728571, "memory(GiB)": 142.32, "step": 19800, "train_speed(iter/s)": 0.294309 }, { "acc": 0.71275549, "epoch": 0.22168565544490226, "grad_norm": 4.5, "learning_rate": 9.899138853032147e-06, "loss": 1.15591812, "memory(GiB)": 142.32, "step": 19820, "train_speed(iter/s)": 0.294408 }, { "acc": 0.71377831, "epoch": 0.2219093543908608, "grad_norm": 6.03125, "learning_rate": 9.898768922640485e-06, "loss": 1.12956524, "memory(GiB)": 142.32, "step": 19840, "train_speed(iter/s)": 0.294503 }, { "acc": 0.71429768, "epoch": 0.22213305333681932, "grad_norm": 4.3125, "learning_rate": 9.898398322027216e-06, "loss": 1.15213966, "memory(GiB)": 142.32, "step": 19860, "train_speed(iter/s)": 0.294598 }, { "acc": 0.72518225, "epoch": 0.22235675228277785, "grad_norm": 4.8125, "learning_rate": 9.898027051243042e-06, "loss": 1.09539528, "memory(GiB)": 142.32, "step": 19880, "train_speed(iter/s)": 0.294695 }, { "acc": 0.72005682, "epoch": 0.22258045122873638, "grad_norm": 5.8125, "learning_rate": 9.897655110338759e-06, "loss": 1.12057323, "memory(GiB)": 142.32, "step": 19900, "train_speed(iter/s)": 0.294792 }, { "acc": 0.71457739, "epoch": 0.2228041501746949, "grad_norm": 7.125, "learning_rate": 9.897282499365254e-06, "loss": 1.13391562, "memory(GiB)": 142.32, "step": 19920, "train_speed(iter/s)": 0.294889 }, { "acc": 0.72230215, "epoch": 0.22302784912065343, "grad_norm": 6.0, "learning_rate": 9.896909218373503e-06, "loss": 1.11791725, "memory(GiB)": 142.32, "step": 19940, "train_speed(iter/s)": 0.294985 }, { "acc": 0.71640015, "epoch": 0.22325154806661196, "grad_norm": 4.5, "learning_rate": 9.896535267414578e-06, "loss": 1.14133892, "memory(GiB)": 142.32, "step": 19960, "train_speed(iter/s)": 0.295083 }, { "acc": 0.71592326, "epoch": 0.2234752470125705, "grad_norm": 5.625, "learning_rate": 9.896160646539641e-06, "loss": 1.12638311, "memory(GiB)": 142.32, "step": 19980, "train_speed(iter/s)": 0.295178 }, { "acc": 0.71519861, "epoch": 0.22369894595852902, "grad_norm": 6.15625, "learning_rate": 9.895785355799947e-06, "loss": 1.14989119, "memory(GiB)": 142.32, "step": 20000, "train_speed(iter/s)": 0.295279 }, { "epoch": 0.22369894595852902, "eval_acc": 0.6838678964013047, "eval_loss": 1.1259324550628662, "eval_runtime": 2340.2288, "eval_samples_per_second": 32.169, "eval_steps_per_second": 16.085, "step": 20000 }, { "acc": 0.71748371, "epoch": 0.22392264490448754, "grad_norm": 5.53125, "learning_rate": 9.895409395246839e-06, "loss": 1.12253876, "memory(GiB)": 142.32, "step": 20020, "train_speed(iter/s)": 0.2853 }, { "acc": 0.72863622, "epoch": 0.22414634385044607, "grad_norm": 7.59375, "learning_rate": 9.895032764931753e-06, "loss": 1.074366, "memory(GiB)": 142.32, "step": 20040, "train_speed(iter/s)": 0.285391 }, { "acc": 0.71004877, "epoch": 0.2243700427964046, "grad_norm": 5.65625, "learning_rate": 9.894655464906217e-06, "loss": 1.16637564, "memory(GiB)": 142.32, "step": 20060, "train_speed(iter/s)": 0.28549 }, { "acc": 0.71859007, "epoch": 0.22459374174236313, "grad_norm": 5.46875, "learning_rate": 9.894277495221856e-06, "loss": 1.13036051, "memory(GiB)": 142.32, "step": 20080, "train_speed(iter/s)": 0.285591 }, { "acc": 0.70291839, "epoch": 0.22481744068832166, "grad_norm": 5.375, "learning_rate": 9.893898855930378e-06, "loss": 1.19425774, "memory(GiB)": 142.32, "step": 20100, "train_speed(iter/s)": 0.285679 }, { "acc": 0.71769333, "epoch": 0.22504113963428019, "grad_norm": 6.0625, "learning_rate": 9.893519547083584e-06, "loss": 1.14108467, "memory(GiB)": 142.32, "step": 20120, "train_speed(iter/s)": 0.285774 }, { "acc": 0.71734505, "epoch": 0.2252648385802387, "grad_norm": 6.75, "learning_rate": 9.893139568733374e-06, "loss": 1.14197178, "memory(GiB)": 142.32, "step": 20140, "train_speed(iter/s)": 0.285885 }, { "acc": 0.72032809, "epoch": 0.22548853752619724, "grad_norm": 5.0, "learning_rate": 9.892758920931732e-06, "loss": 1.11588039, "memory(GiB)": 142.32, "step": 20160, "train_speed(iter/s)": 0.285995 }, { "acc": 0.73007169, "epoch": 0.22571223647215577, "grad_norm": 5.65625, "learning_rate": 9.892377603730733e-06, "loss": 1.06625786, "memory(GiB)": 142.32, "step": 20180, "train_speed(iter/s)": 0.28608 }, { "acc": 0.72220049, "epoch": 0.2259359354181143, "grad_norm": 6.0625, "learning_rate": 9.891995617182552e-06, "loss": 1.11005974, "memory(GiB)": 142.32, "step": 20200, "train_speed(iter/s)": 0.286188 }, { "acc": 0.71243334, "epoch": 0.22615963436407283, "grad_norm": 6.03125, "learning_rate": 9.891612961339447e-06, "loss": 1.1572793, "memory(GiB)": 142.32, "step": 20220, "train_speed(iter/s)": 0.286287 }, { "acc": 0.72462826, "epoch": 0.22638333331003135, "grad_norm": 5.5, "learning_rate": 9.891229636253773e-06, "loss": 1.09376869, "memory(GiB)": 142.32, "step": 20240, "train_speed(iter/s)": 0.286372 }, { "acc": 0.71618929, "epoch": 0.22660703225598988, "grad_norm": 5.125, "learning_rate": 9.890845641977972e-06, "loss": 1.14083796, "memory(GiB)": 142.32, "step": 20260, "train_speed(iter/s)": 0.286469 }, { "acc": 0.72190924, "epoch": 0.2268307312019484, "grad_norm": 5.125, "learning_rate": 9.89046097856458e-06, "loss": 1.11089935, "memory(GiB)": 142.32, "step": 20280, "train_speed(iter/s)": 0.286566 }, { "acc": 0.71626816, "epoch": 0.22705443014790694, "grad_norm": 5.65625, "learning_rate": 9.890075646066226e-06, "loss": 1.13753052, "memory(GiB)": 142.32, "step": 20300, "train_speed(iter/s)": 0.286661 }, { "acc": 0.71400313, "epoch": 0.22727812909386547, "grad_norm": 5.875, "learning_rate": 9.88968964453563e-06, "loss": 1.15436954, "memory(GiB)": 142.32, "step": 20320, "train_speed(iter/s)": 0.286759 }, { "acc": 0.71924515, "epoch": 0.227501828039824, "grad_norm": 6.03125, "learning_rate": 9.8893029740256e-06, "loss": 1.13502045, "memory(GiB)": 142.32, "step": 20340, "train_speed(iter/s)": 0.286858 }, { "acc": 0.71181331, "epoch": 0.22772552698578252, "grad_norm": 5.15625, "learning_rate": 9.888915634589036e-06, "loss": 1.16728897, "memory(GiB)": 142.32, "step": 20360, "train_speed(iter/s)": 0.286952 }, { "acc": 0.72981205, "epoch": 0.22794922593174105, "grad_norm": 5.46875, "learning_rate": 9.888527626278937e-06, "loss": 1.08076878, "memory(GiB)": 142.32, "step": 20380, "train_speed(iter/s)": 0.28705 }, { "acc": 0.72048774, "epoch": 0.22817292487769958, "grad_norm": 5.625, "learning_rate": 9.888138949148387e-06, "loss": 1.10999756, "memory(GiB)": 142.32, "step": 20400, "train_speed(iter/s)": 0.287158 }, { "acc": 0.72615986, "epoch": 0.22839662382365813, "grad_norm": 5.6875, "learning_rate": 9.887749603250559e-06, "loss": 1.09102802, "memory(GiB)": 142.32, "step": 20420, "train_speed(iter/s)": 0.287248 }, { "acc": 0.72266254, "epoch": 0.22862032276961666, "grad_norm": 5.125, "learning_rate": 9.887359588638724e-06, "loss": 1.10133266, "memory(GiB)": 142.32, "step": 20440, "train_speed(iter/s)": 0.287341 }, { "acc": 0.7208807, "epoch": 0.2288440217155752, "grad_norm": 6.125, "learning_rate": 9.886968905366239e-06, "loss": 1.12572651, "memory(GiB)": 142.32, "step": 20460, "train_speed(iter/s)": 0.287431 }, { "acc": 0.7099802, "epoch": 0.22906772066153372, "grad_norm": 4.90625, "learning_rate": 9.886577553486557e-06, "loss": 1.15840969, "memory(GiB)": 142.32, "step": 20480, "train_speed(iter/s)": 0.287531 }, { "acc": 0.71682625, "epoch": 0.22929141960749225, "grad_norm": 5.375, "learning_rate": 9.886185533053224e-06, "loss": 1.1317688, "memory(GiB)": 142.32, "step": 20500, "train_speed(iter/s)": 0.287623 }, { "acc": 0.72152185, "epoch": 0.22951511855345078, "grad_norm": 5.09375, "learning_rate": 9.885792844119868e-06, "loss": 1.12688618, "memory(GiB)": 142.32, "step": 20520, "train_speed(iter/s)": 0.287709 }, { "acc": 0.71863346, "epoch": 0.2297388174994093, "grad_norm": 5.28125, "learning_rate": 9.885399486740216e-06, "loss": 1.12598057, "memory(GiB)": 142.32, "step": 20540, "train_speed(iter/s)": 0.287792 }, { "acc": 0.72422466, "epoch": 0.22996251644536783, "grad_norm": 5.28125, "learning_rate": 9.885005460968088e-06, "loss": 1.10466309, "memory(GiB)": 142.32, "step": 20560, "train_speed(iter/s)": 0.287877 }, { "acc": 0.71618438, "epoch": 0.23018621539132636, "grad_norm": 4.875, "learning_rate": 9.884610766857388e-06, "loss": 1.13212309, "memory(GiB)": 142.32, "step": 20580, "train_speed(iter/s)": 0.287973 }, { "acc": 0.71639299, "epoch": 0.2304099143372849, "grad_norm": 5.78125, "learning_rate": 9.884215404462119e-06, "loss": 1.12646961, "memory(GiB)": 142.32, "step": 20600, "train_speed(iter/s)": 0.288064 }, { "acc": 0.73113289, "epoch": 0.23063361328324342, "grad_norm": 5.4375, "learning_rate": 9.883819373836372e-06, "loss": 1.04771309, "memory(GiB)": 142.32, "step": 20620, "train_speed(iter/s)": 0.288159 }, { "acc": 0.70863061, "epoch": 0.23085731222920194, "grad_norm": 6.03125, "learning_rate": 9.883422675034328e-06, "loss": 1.16397724, "memory(GiB)": 142.32, "step": 20640, "train_speed(iter/s)": 0.288253 }, { "acc": 0.71706424, "epoch": 0.23108101117516047, "grad_norm": 6.25, "learning_rate": 9.88302530811026e-06, "loss": 1.13151436, "memory(GiB)": 142.32, "step": 20660, "train_speed(iter/s)": 0.288352 }, { "acc": 0.71974363, "epoch": 0.231304710121119, "grad_norm": 5.9375, "learning_rate": 9.882627273118538e-06, "loss": 1.11439638, "memory(GiB)": 142.32, "step": 20680, "train_speed(iter/s)": 0.288447 }, { "acc": 0.71063156, "epoch": 0.23152840906707753, "grad_norm": 5.59375, "learning_rate": 9.882228570113616e-06, "loss": 1.17172489, "memory(GiB)": 142.32, "step": 20700, "train_speed(iter/s)": 0.288539 }, { "acc": 0.72626891, "epoch": 0.23175210801303606, "grad_norm": 6.1875, "learning_rate": 9.881829199150041e-06, "loss": 1.08560238, "memory(GiB)": 142.32, "step": 20720, "train_speed(iter/s)": 0.288634 }, { "acc": 0.71828928, "epoch": 0.23197580695899458, "grad_norm": 5.8125, "learning_rate": 9.881429160282455e-06, "loss": 1.12477779, "memory(GiB)": 142.32, "step": 20740, "train_speed(iter/s)": 0.288731 }, { "acc": 0.70992703, "epoch": 0.2321995059049531, "grad_norm": 4.65625, "learning_rate": 9.881028453565588e-06, "loss": 1.1815815, "memory(GiB)": 142.32, "step": 20760, "train_speed(iter/s)": 0.288831 }, { "acc": 0.71795917, "epoch": 0.23242320485091164, "grad_norm": 5.125, "learning_rate": 9.880627079054263e-06, "loss": 1.11279659, "memory(GiB)": 142.32, "step": 20780, "train_speed(iter/s)": 0.288926 }, { "acc": 0.71557007, "epoch": 0.23264690379687017, "grad_norm": 5.4375, "learning_rate": 9.880225036803393e-06, "loss": 1.15643711, "memory(GiB)": 142.32, "step": 20800, "train_speed(iter/s)": 0.289025 }, { "acc": 0.71886706, "epoch": 0.2328706027428287, "grad_norm": 5.71875, "learning_rate": 9.879822326867983e-06, "loss": 1.12493172, "memory(GiB)": 142.32, "step": 20820, "train_speed(iter/s)": 0.289118 }, { "acc": 0.72510037, "epoch": 0.23309430168878723, "grad_norm": 6.0, "learning_rate": 9.879418949303131e-06, "loss": 1.09705868, "memory(GiB)": 142.32, "step": 20840, "train_speed(iter/s)": 0.289209 }, { "acc": 0.72309151, "epoch": 0.23331800063474575, "grad_norm": 6.3125, "learning_rate": 9.879014904164023e-06, "loss": 1.0815074, "memory(GiB)": 142.32, "step": 20860, "train_speed(iter/s)": 0.289304 }, { "acc": 0.70941629, "epoch": 0.23354169958070428, "grad_norm": 4.46875, "learning_rate": 9.878610191505938e-06, "loss": 1.17095346, "memory(GiB)": 142.32, "step": 20880, "train_speed(iter/s)": 0.289394 }, { "acc": 0.72035866, "epoch": 0.2337653985266628, "grad_norm": 5.625, "learning_rate": 9.878204811384248e-06, "loss": 1.12049217, "memory(GiB)": 142.32, "step": 20900, "train_speed(iter/s)": 0.289491 }, { "acc": 0.71767673, "epoch": 0.23398909747262134, "grad_norm": 4.21875, "learning_rate": 9.877798763854415e-06, "loss": 1.14507236, "memory(GiB)": 142.32, "step": 20920, "train_speed(iter/s)": 0.289588 }, { "acc": 0.71706495, "epoch": 0.23421279641857987, "grad_norm": 6.4375, "learning_rate": 9.877392048971992e-06, "loss": 1.14005089, "memory(GiB)": 142.32, "step": 20940, "train_speed(iter/s)": 0.289674 }, { "acc": 0.7233345, "epoch": 0.2344364953645384, "grad_norm": 6.09375, "learning_rate": 9.876984666792622e-06, "loss": 1.10067482, "memory(GiB)": 142.32, "step": 20960, "train_speed(iter/s)": 0.289761 }, { "acc": 0.71011038, "epoch": 0.23466019431049692, "grad_norm": 6.75, "learning_rate": 9.87657661737204e-06, "loss": 1.16821308, "memory(GiB)": 142.32, "step": 20980, "train_speed(iter/s)": 0.289858 }, { "acc": 0.71945777, "epoch": 0.23488389325645545, "grad_norm": 5.75, "learning_rate": 9.876167900766077e-06, "loss": 1.12668409, "memory(GiB)": 142.32, "step": 21000, "train_speed(iter/s)": 0.289959 }, { "acc": 0.72964535, "epoch": 0.23510759220241398, "grad_norm": 5.5625, "learning_rate": 9.875758517030647e-06, "loss": 1.0827877, "memory(GiB)": 142.32, "step": 21020, "train_speed(iter/s)": 0.290046 }, { "acc": 0.7226388, "epoch": 0.2353312911483725, "grad_norm": 5.375, "learning_rate": 9.875348466221762e-06, "loss": 1.12712259, "memory(GiB)": 142.32, "step": 21040, "train_speed(iter/s)": 0.290138 }, { "acc": 0.72336874, "epoch": 0.23555499009433106, "grad_norm": 6.8125, "learning_rate": 9.87493774839552e-06, "loss": 1.11145525, "memory(GiB)": 142.32, "step": 21060, "train_speed(iter/s)": 0.290227 }, { "acc": 0.72416162, "epoch": 0.2357786890402896, "grad_norm": 4.53125, "learning_rate": 9.874526363608116e-06, "loss": 1.10392332, "memory(GiB)": 142.32, "step": 21080, "train_speed(iter/s)": 0.290315 }, { "acc": 0.73060169, "epoch": 0.23600238798624812, "grad_norm": 5.84375, "learning_rate": 9.874114311915833e-06, "loss": 1.06407833, "memory(GiB)": 142.32, "step": 21100, "train_speed(iter/s)": 0.290404 }, { "acc": 0.72345886, "epoch": 0.23622608693220665, "grad_norm": 5.625, "learning_rate": 9.873701593375044e-06, "loss": 1.11577587, "memory(GiB)": 142.32, "step": 21120, "train_speed(iter/s)": 0.290504 }, { "acc": 0.72161579, "epoch": 0.23644978587816518, "grad_norm": 5.96875, "learning_rate": 9.873288208042218e-06, "loss": 1.12584496, "memory(GiB)": 142.32, "step": 21140, "train_speed(iter/s)": 0.290605 }, { "acc": 0.72585869, "epoch": 0.2366734848241237, "grad_norm": 6.25, "learning_rate": 9.872874155973908e-06, "loss": 1.09496765, "memory(GiB)": 142.32, "step": 21160, "train_speed(iter/s)": 0.290698 }, { "acc": 0.71191397, "epoch": 0.23689718377008223, "grad_norm": 5.9375, "learning_rate": 9.872459437226764e-06, "loss": 1.15485868, "memory(GiB)": 142.32, "step": 21180, "train_speed(iter/s)": 0.29079 }, { "acc": 0.71499224, "epoch": 0.23712088271604076, "grad_norm": 5.4375, "learning_rate": 9.872044051857527e-06, "loss": 1.13189354, "memory(GiB)": 142.32, "step": 21200, "train_speed(iter/s)": 0.29089 }, { "acc": 0.72844563, "epoch": 0.2373445816619993, "grad_norm": 5.84375, "learning_rate": 9.871627999923025e-06, "loss": 1.08618851, "memory(GiB)": 142.32, "step": 21220, "train_speed(iter/s)": 0.29098 }, { "acc": 0.72078996, "epoch": 0.23756828060795782, "grad_norm": 5.875, "learning_rate": 9.871211281480181e-06, "loss": 1.11685972, "memory(GiB)": 142.32, "step": 21240, "train_speed(iter/s)": 0.291076 }, { "acc": 0.72282014, "epoch": 0.23779197955391634, "grad_norm": 6.25, "learning_rate": 9.870793896586009e-06, "loss": 1.10193558, "memory(GiB)": 142.32, "step": 21260, "train_speed(iter/s)": 0.291167 }, { "acc": 0.72188616, "epoch": 0.23801567849987487, "grad_norm": 5.8125, "learning_rate": 9.87037584529761e-06, "loss": 1.10157089, "memory(GiB)": 142.32, "step": 21280, "train_speed(iter/s)": 0.291251 }, { "acc": 0.70798688, "epoch": 0.2382393774458334, "grad_norm": 5.125, "learning_rate": 9.869957127672185e-06, "loss": 1.16774368, "memory(GiB)": 142.32, "step": 21300, "train_speed(iter/s)": 0.291342 }, { "acc": 0.72009649, "epoch": 0.23846307639179193, "grad_norm": 5.90625, "learning_rate": 9.869537743767014e-06, "loss": 1.13825102, "memory(GiB)": 142.32, "step": 21320, "train_speed(iter/s)": 0.291432 }, { "acc": 0.7344409, "epoch": 0.23868677533775046, "grad_norm": 5.90625, "learning_rate": 9.86911769363948e-06, "loss": 1.05485783, "memory(GiB)": 142.32, "step": 21340, "train_speed(iter/s)": 0.291525 }, { "acc": 0.71545544, "epoch": 0.23891047428370898, "grad_norm": 6.875, "learning_rate": 9.86869697734705e-06, "loss": 1.15206909, "memory(GiB)": 142.32, "step": 21360, "train_speed(iter/s)": 0.291613 }, { "acc": 0.71792436, "epoch": 0.2391341732296675, "grad_norm": 5.875, "learning_rate": 9.868275594947282e-06, "loss": 1.1234642, "memory(GiB)": 142.32, "step": 21380, "train_speed(iter/s)": 0.29171 }, { "acc": 0.72240572, "epoch": 0.23935787217562604, "grad_norm": 6.75, "learning_rate": 9.86785354649783e-06, "loss": 1.10542555, "memory(GiB)": 142.32, "step": 21400, "train_speed(iter/s)": 0.291799 }, { "acc": 0.71845398, "epoch": 0.23958157112158457, "grad_norm": 4.75, "learning_rate": 9.867430832056434e-06, "loss": 1.10996819, "memory(GiB)": 142.32, "step": 21420, "train_speed(iter/s)": 0.291894 }, { "acc": 0.72262936, "epoch": 0.2398052700675431, "grad_norm": 4.5, "learning_rate": 9.86700745168093e-06, "loss": 1.10576992, "memory(GiB)": 142.32, "step": 21440, "train_speed(iter/s)": 0.29198 }, { "acc": 0.7181304, "epoch": 0.24002896901350163, "grad_norm": 6.1875, "learning_rate": 9.86658340542924e-06, "loss": 1.13635101, "memory(GiB)": 142.32, "step": 21460, "train_speed(iter/s)": 0.292072 }, { "acc": 0.71913099, "epoch": 0.24025266795946015, "grad_norm": 6.1875, "learning_rate": 9.866158693359382e-06, "loss": 1.12941618, "memory(GiB)": 142.32, "step": 21480, "train_speed(iter/s)": 0.292165 }, { "acc": 0.7148653, "epoch": 0.24047636690541868, "grad_norm": 5.9375, "learning_rate": 9.86573331552946e-06, "loss": 1.16001501, "memory(GiB)": 142.32, "step": 21500, "train_speed(iter/s)": 0.292257 }, { "acc": 0.72478609, "epoch": 0.2407000658513772, "grad_norm": 5.0625, "learning_rate": 9.865307271997674e-06, "loss": 1.08729715, "memory(GiB)": 142.32, "step": 21520, "train_speed(iter/s)": 0.292346 }, { "acc": 0.72062454, "epoch": 0.24092376479733574, "grad_norm": 6.96875, "learning_rate": 9.864880562822312e-06, "loss": 1.1317152, "memory(GiB)": 142.32, "step": 21540, "train_speed(iter/s)": 0.292428 }, { "acc": 0.72344184, "epoch": 0.24114746374329427, "grad_norm": 5.375, "learning_rate": 9.864453188061753e-06, "loss": 1.12062006, "memory(GiB)": 142.32, "step": 21560, "train_speed(iter/s)": 0.29251 }, { "acc": 0.71483417, "epoch": 0.2413711626892528, "grad_norm": 6.09375, "learning_rate": 9.86402514777447e-06, "loss": 1.13395386, "memory(GiB)": 142.32, "step": 21580, "train_speed(iter/s)": 0.292598 }, { "acc": 0.71835413, "epoch": 0.24159486163521132, "grad_norm": 4.59375, "learning_rate": 9.863596442019023e-06, "loss": 1.13481998, "memory(GiB)": 142.32, "step": 21600, "train_speed(iter/s)": 0.292677 }, { "acc": 0.7319262, "epoch": 0.24181856058116985, "grad_norm": 5.21875, "learning_rate": 9.863167070854064e-06, "loss": 1.0613843, "memory(GiB)": 142.32, "step": 21620, "train_speed(iter/s)": 0.29276 }, { "acc": 0.71762342, "epoch": 0.24204225952712838, "grad_norm": 5.46875, "learning_rate": 9.862737034338342e-06, "loss": 1.13368101, "memory(GiB)": 142.32, "step": 21640, "train_speed(iter/s)": 0.292841 }, { "acc": 0.72141838, "epoch": 0.2422659584730869, "grad_norm": 5.46875, "learning_rate": 9.862306332530688e-06, "loss": 1.11632042, "memory(GiB)": 142.32, "step": 21660, "train_speed(iter/s)": 0.292926 }, { "acc": 0.73339252, "epoch": 0.24248965741904546, "grad_norm": 6.4375, "learning_rate": 9.86187496549003e-06, "loss": 1.06810904, "memory(GiB)": 142.32, "step": 21680, "train_speed(iter/s)": 0.293015 }, { "acc": 0.71420083, "epoch": 0.242713356365004, "grad_norm": 5.65625, "learning_rate": 9.861442933275384e-06, "loss": 1.14370155, "memory(GiB)": 142.32, "step": 21700, "train_speed(iter/s)": 0.293097 }, { "acc": 0.72342949, "epoch": 0.24293705531096252, "grad_norm": 4.71875, "learning_rate": 9.861010235945859e-06, "loss": 1.08746166, "memory(GiB)": 142.32, "step": 21720, "train_speed(iter/s)": 0.293176 }, { "acc": 0.71388507, "epoch": 0.24316075425692105, "grad_norm": 5.5625, "learning_rate": 9.860576873560651e-06, "loss": 1.1349617, "memory(GiB)": 142.32, "step": 21740, "train_speed(iter/s)": 0.293264 }, { "acc": 0.71278162, "epoch": 0.24338445320287957, "grad_norm": 5.59375, "learning_rate": 9.860142846179057e-06, "loss": 1.15829792, "memory(GiB)": 142.32, "step": 21760, "train_speed(iter/s)": 0.293345 }, { "acc": 0.71439071, "epoch": 0.2436081521488381, "grad_norm": 5.96875, "learning_rate": 9.859708153860453e-06, "loss": 1.14778605, "memory(GiB)": 142.32, "step": 21780, "train_speed(iter/s)": 0.293423 }, { "acc": 0.72878137, "epoch": 0.24383185109479663, "grad_norm": 5.5625, "learning_rate": 9.859272796664312e-06, "loss": 1.08174343, "memory(GiB)": 142.32, "step": 21800, "train_speed(iter/s)": 0.293505 }, { "acc": 0.72983446, "epoch": 0.24405555004075516, "grad_norm": 5.3125, "learning_rate": 9.858836774650197e-06, "loss": 1.07470798, "memory(GiB)": 142.32, "step": 21820, "train_speed(iter/s)": 0.293583 }, { "acc": 0.72225857, "epoch": 0.2442792489867137, "grad_norm": 5.8125, "learning_rate": 9.858400087877764e-06, "loss": 1.09368572, "memory(GiB)": 142.32, "step": 21840, "train_speed(iter/s)": 0.293672 }, { "acc": 0.72542825, "epoch": 0.24450294793267222, "grad_norm": 5.34375, "learning_rate": 9.857962736406755e-06, "loss": 1.0929987, "memory(GiB)": 142.32, "step": 21860, "train_speed(iter/s)": 0.293767 }, { "acc": 0.72206483, "epoch": 0.24472664687863074, "grad_norm": 5.90625, "learning_rate": 9.857524720297009e-06, "loss": 1.10315762, "memory(GiB)": 142.32, "step": 21880, "train_speed(iter/s)": 0.29385 }, { "acc": 0.7180192, "epoch": 0.24495034582458927, "grad_norm": 5.03125, "learning_rate": 9.857086039608449e-06, "loss": 1.11809902, "memory(GiB)": 142.32, "step": 21900, "train_speed(iter/s)": 0.293935 }, { "acc": 0.71218219, "epoch": 0.2451740447705478, "grad_norm": 5.84375, "learning_rate": 9.856646694401097e-06, "loss": 1.15959072, "memory(GiB)": 142.32, "step": 21920, "train_speed(iter/s)": 0.294011 }, { "acc": 0.71676435, "epoch": 0.24539774371650633, "grad_norm": 5.375, "learning_rate": 9.856206684735058e-06, "loss": 1.1279604, "memory(GiB)": 142.32, "step": 21940, "train_speed(iter/s)": 0.294085 }, { "acc": 0.72043257, "epoch": 0.24562144266246486, "grad_norm": 6.75, "learning_rate": 9.855766010670533e-06, "loss": 1.14377966, "memory(GiB)": 142.32, "step": 21960, "train_speed(iter/s)": 0.294175 }, { "acc": 0.71161628, "epoch": 0.24584514160842338, "grad_norm": 5.34375, "learning_rate": 9.855324672267815e-06, "loss": 1.16566181, "memory(GiB)": 142.32, "step": 21980, "train_speed(iter/s)": 0.294272 }, { "acc": 0.71903477, "epoch": 0.2460688405543819, "grad_norm": 6.71875, "learning_rate": 9.854882669587282e-06, "loss": 1.12294636, "memory(GiB)": 142.32, "step": 22000, "train_speed(iter/s)": 0.294357 }, { "epoch": 0.2460688405543819, "eval_acc": 0.6849212180296403, "eval_loss": 1.1215161085128784, "eval_runtime": 2341.9579, "eval_samples_per_second": 32.145, "eval_steps_per_second": 16.073, "step": 22000 }, { "acc": 0.71151853, "epoch": 0.24629253950034044, "grad_norm": 5.21875, "learning_rate": 9.854440002689409e-06, "loss": 1.16178474, "memory(GiB)": 142.32, "step": 22020, "train_speed(iter/s)": 0.285311 }, { "acc": 0.72773142, "epoch": 0.24651623844629897, "grad_norm": 5.09375, "learning_rate": 9.853996671634755e-06, "loss": 1.08778553, "memory(GiB)": 142.32, "step": 22040, "train_speed(iter/s)": 0.2854 }, { "acc": 0.71462994, "epoch": 0.2467399373922575, "grad_norm": 5.96875, "learning_rate": 9.85355267648398e-06, "loss": 1.14643965, "memory(GiB)": 142.32, "step": 22060, "train_speed(iter/s)": 0.285493 }, { "acc": 0.71294165, "epoch": 0.24696363633821602, "grad_norm": 6.15625, "learning_rate": 9.853108017297823e-06, "loss": 1.15869141, "memory(GiB)": 142.32, "step": 22080, "train_speed(iter/s)": 0.285571 }, { "acc": 0.7274538, "epoch": 0.24718733528417455, "grad_norm": 6.53125, "learning_rate": 9.852662694137123e-06, "loss": 1.07896614, "memory(GiB)": 142.32, "step": 22100, "train_speed(iter/s)": 0.28566 }, { "acc": 0.71819134, "epoch": 0.24741103423013308, "grad_norm": 6.09375, "learning_rate": 9.852216707062805e-06, "loss": 1.12102137, "memory(GiB)": 142.32, "step": 22120, "train_speed(iter/s)": 0.285745 }, { "acc": 0.71652527, "epoch": 0.2476347331760916, "grad_norm": 5.96875, "learning_rate": 9.85177005613589e-06, "loss": 1.12262459, "memory(GiB)": 142.32, "step": 22140, "train_speed(iter/s)": 0.285833 }, { "acc": 0.71094737, "epoch": 0.24785843212205014, "grad_norm": 5.6875, "learning_rate": 9.851322741417482e-06, "loss": 1.15481873, "memory(GiB)": 142.32, "step": 22160, "train_speed(iter/s)": 0.285919 }, { "acc": 0.72327404, "epoch": 0.24808213106800867, "grad_norm": 5.46875, "learning_rate": 9.850874762968781e-06, "loss": 1.10636654, "memory(GiB)": 142.32, "step": 22180, "train_speed(iter/s)": 0.286009 }, { "acc": 0.71969509, "epoch": 0.2483058300139672, "grad_norm": 4.625, "learning_rate": 9.850426120851077e-06, "loss": 1.12382946, "memory(GiB)": 142.32, "step": 22200, "train_speed(iter/s)": 0.286099 }, { "acc": 0.71073437, "epoch": 0.24852952895992572, "grad_norm": 4.15625, "learning_rate": 9.849976815125753e-06, "loss": 1.17088261, "memory(GiB)": 142.32, "step": 22220, "train_speed(iter/s)": 0.286185 }, { "acc": 0.71481695, "epoch": 0.24875322790588425, "grad_norm": 6.4375, "learning_rate": 9.849526845854278e-06, "loss": 1.13798828, "memory(GiB)": 142.32, "step": 22240, "train_speed(iter/s)": 0.286267 }, { "acc": 0.71451244, "epoch": 0.24897692685184278, "grad_norm": 5.625, "learning_rate": 9.849076213098214e-06, "loss": 1.1363883, "memory(GiB)": 142.32, "step": 22260, "train_speed(iter/s)": 0.286362 }, { "acc": 0.71123981, "epoch": 0.2492006257978013, "grad_norm": 5.03125, "learning_rate": 9.848624916919213e-06, "loss": 1.16005211, "memory(GiB)": 142.32, "step": 22280, "train_speed(iter/s)": 0.286447 }, { "acc": 0.71764436, "epoch": 0.24942432474375983, "grad_norm": 4.96875, "learning_rate": 9.848172957379024e-06, "loss": 1.12637644, "memory(GiB)": 142.32, "step": 22300, "train_speed(iter/s)": 0.286532 }, { "acc": 0.72456698, "epoch": 0.2496480236897184, "grad_norm": 5.625, "learning_rate": 9.847720334539476e-06, "loss": 1.08616638, "memory(GiB)": 142.32, "step": 22320, "train_speed(iter/s)": 0.286618 }, { "acc": 0.71453524, "epoch": 0.24987172263567692, "grad_norm": 6.34375, "learning_rate": 9.847267048462498e-06, "loss": 1.14009762, "memory(GiB)": 142.32, "step": 22340, "train_speed(iter/s)": 0.286706 }, { "acc": 0.71757355, "epoch": 0.25009542158163545, "grad_norm": 6.125, "learning_rate": 9.846813099210104e-06, "loss": 1.12312374, "memory(GiB)": 142.32, "step": 22360, "train_speed(iter/s)": 0.286791 }, { "acc": 0.73858938, "epoch": 0.250319120527594, "grad_norm": 6.53125, "learning_rate": 9.8463584868444e-06, "loss": 1.02202072, "memory(GiB)": 142.32, "step": 22380, "train_speed(iter/s)": 0.286874 }, { "acc": 0.7326478, "epoch": 0.2505428194735525, "grad_norm": 5.96875, "learning_rate": 9.845903211427586e-06, "loss": 1.06344452, "memory(GiB)": 142.32, "step": 22400, "train_speed(iter/s)": 0.286949 }, { "acc": 0.71365943, "epoch": 0.25076651841951103, "grad_norm": 5.71875, "learning_rate": 9.845447273021947e-06, "loss": 1.14613466, "memory(GiB)": 142.32, "step": 22420, "train_speed(iter/s)": 0.287028 }, { "acc": 0.70847626, "epoch": 0.25099021736546956, "grad_norm": 4.96875, "learning_rate": 9.844990671689865e-06, "loss": 1.16136351, "memory(GiB)": 142.32, "step": 22440, "train_speed(iter/s)": 0.287119 }, { "acc": 0.72637892, "epoch": 0.2512139163114281, "grad_norm": 5.15625, "learning_rate": 9.844533407493808e-06, "loss": 1.08790178, "memory(GiB)": 142.32, "step": 22460, "train_speed(iter/s)": 0.287204 }, { "acc": 0.71342173, "epoch": 0.2514376152573866, "grad_norm": 6.78125, "learning_rate": 9.844075480496335e-06, "loss": 1.12439966, "memory(GiB)": 142.32, "step": 22480, "train_speed(iter/s)": 0.28729 }, { "acc": 0.72551222, "epoch": 0.25166131420334514, "grad_norm": 5.78125, "learning_rate": 9.843616890760102e-06, "loss": 1.10248804, "memory(GiB)": 142.32, "step": 22500, "train_speed(iter/s)": 0.287373 }, { "acc": 0.71374483, "epoch": 0.25188501314930367, "grad_norm": 5.9375, "learning_rate": 9.843157638347844e-06, "loss": 1.14640303, "memory(GiB)": 142.32, "step": 22520, "train_speed(iter/s)": 0.287455 }, { "acc": 0.72035255, "epoch": 0.2521087120952622, "grad_norm": 4.84375, "learning_rate": 9.842697723322396e-06, "loss": 1.11269588, "memory(GiB)": 142.32, "step": 22540, "train_speed(iter/s)": 0.28754 }, { "acc": 0.71899452, "epoch": 0.2523324110412207, "grad_norm": 6.0, "learning_rate": 9.842237145746684e-06, "loss": 1.12152557, "memory(GiB)": 142.32, "step": 22560, "train_speed(iter/s)": 0.287625 }, { "acc": 0.72929935, "epoch": 0.25255610998717926, "grad_norm": 5.84375, "learning_rate": 9.841775905683717e-06, "loss": 1.06249428, "memory(GiB)": 142.32, "step": 22580, "train_speed(iter/s)": 0.287714 }, { "acc": 0.71827583, "epoch": 0.2527798089331378, "grad_norm": 5.09375, "learning_rate": 9.841314003196602e-06, "loss": 1.12549324, "memory(GiB)": 142.32, "step": 22600, "train_speed(iter/s)": 0.287805 }, { "acc": 0.71971588, "epoch": 0.2530035078790963, "grad_norm": 5.6875, "learning_rate": 9.840851438348532e-06, "loss": 1.12069778, "memory(GiB)": 142.32, "step": 22620, "train_speed(iter/s)": 0.2879 }, { "acc": 0.7266922, "epoch": 0.25322720682505484, "grad_norm": 5.625, "learning_rate": 9.840388211202795e-06, "loss": 1.08420553, "memory(GiB)": 142.32, "step": 22640, "train_speed(iter/s)": 0.287984 }, { "acc": 0.74121838, "epoch": 0.25345090577101337, "grad_norm": 6.5625, "learning_rate": 9.839924321822765e-06, "loss": 1.01627359, "memory(GiB)": 142.32, "step": 22660, "train_speed(iter/s)": 0.288077 }, { "acc": 0.73363681, "epoch": 0.2536746047169719, "grad_norm": 6.25, "learning_rate": 9.83945977027191e-06, "loss": 1.05827818, "memory(GiB)": 142.32, "step": 22680, "train_speed(iter/s)": 0.288168 }, { "acc": 0.71600871, "epoch": 0.2538983036629304, "grad_norm": 6.625, "learning_rate": 9.838994556613785e-06, "loss": 1.12344275, "memory(GiB)": 142.32, "step": 22700, "train_speed(iter/s)": 0.288256 }, { "acc": 0.71542597, "epoch": 0.25412200260888895, "grad_norm": 4.5, "learning_rate": 9.83852868091204e-06, "loss": 1.1293498, "memory(GiB)": 142.32, "step": 22720, "train_speed(iter/s)": 0.288343 }, { "acc": 0.70847969, "epoch": 0.2543457015548475, "grad_norm": 5.28125, "learning_rate": 9.838062143230413e-06, "loss": 1.17640915, "memory(GiB)": 142.32, "step": 22740, "train_speed(iter/s)": 0.288435 }, { "acc": 0.71915898, "epoch": 0.254569400500806, "grad_norm": 5.0, "learning_rate": 9.837594943632734e-06, "loss": 1.11117516, "memory(GiB)": 142.32, "step": 22760, "train_speed(iter/s)": 0.288523 }, { "acc": 0.72479286, "epoch": 0.25479309944676454, "grad_norm": 6.03125, "learning_rate": 9.837127082182921e-06, "loss": 1.09583321, "memory(GiB)": 142.32, "step": 22780, "train_speed(iter/s)": 0.288614 }, { "acc": 0.72423739, "epoch": 0.25501679839272307, "grad_norm": 7.5625, "learning_rate": 9.836658558944986e-06, "loss": 1.09898415, "memory(GiB)": 142.32, "step": 22800, "train_speed(iter/s)": 0.288707 }, { "acc": 0.71274338, "epoch": 0.2552404973386816, "grad_norm": 5.5625, "learning_rate": 9.836189373983026e-06, "loss": 1.16834526, "memory(GiB)": 142.32, "step": 22820, "train_speed(iter/s)": 0.288794 }, { "acc": 0.72742119, "epoch": 0.2554641962846401, "grad_norm": 5.75, "learning_rate": 9.835719527361236e-06, "loss": 1.08743649, "memory(GiB)": 142.32, "step": 22840, "train_speed(iter/s)": 0.288878 }, { "acc": 0.71917114, "epoch": 0.25568789523059865, "grad_norm": 5.5, "learning_rate": 9.835249019143896e-06, "loss": 1.11573353, "memory(GiB)": 142.32, "step": 22860, "train_speed(iter/s)": 0.288965 }, { "acc": 0.71723766, "epoch": 0.2559115941765572, "grad_norm": 5.0, "learning_rate": 9.834777849395378e-06, "loss": 1.14660969, "memory(GiB)": 142.32, "step": 22880, "train_speed(iter/s)": 0.289049 }, { "acc": 0.72637167, "epoch": 0.2561352931225157, "grad_norm": 6.75, "learning_rate": 9.834306018180144e-06, "loss": 1.07420177, "memory(GiB)": 142.32, "step": 22900, "train_speed(iter/s)": 0.289126 }, { "acc": 0.73024855, "epoch": 0.25635899206847423, "grad_norm": 6.375, "learning_rate": 9.83383352556275e-06, "loss": 1.08109808, "memory(GiB)": 142.32, "step": 22920, "train_speed(iter/s)": 0.289206 }, { "acc": 0.71407919, "epoch": 0.25658269101443276, "grad_norm": 5.125, "learning_rate": 9.83336037160784e-06, "loss": 1.14132423, "memory(GiB)": 142.32, "step": 22940, "train_speed(iter/s)": 0.289293 }, { "acc": 0.71740351, "epoch": 0.2568063899603913, "grad_norm": 5.8125, "learning_rate": 9.832886556380144e-06, "loss": 1.12549734, "memory(GiB)": 142.32, "step": 22960, "train_speed(iter/s)": 0.289377 }, { "acc": 0.70416932, "epoch": 0.2570300889063498, "grad_norm": 5.21875, "learning_rate": 9.832412079944491e-06, "loss": 1.21660652, "memory(GiB)": 142.32, "step": 22980, "train_speed(iter/s)": 0.289463 }, { "acc": 0.72193856, "epoch": 0.25725378785230835, "grad_norm": 6.3125, "learning_rate": 9.831936942365794e-06, "loss": 1.09873409, "memory(GiB)": 142.32, "step": 23000, "train_speed(iter/s)": 0.289544 }, { "acc": 0.72310305, "epoch": 0.2574774867982669, "grad_norm": 6.03125, "learning_rate": 9.831461143709057e-06, "loss": 1.08893843, "memory(GiB)": 142.32, "step": 23020, "train_speed(iter/s)": 0.289631 }, { "acc": 0.73206406, "epoch": 0.2577011857442254, "grad_norm": 6.6875, "learning_rate": 9.83098468403938e-06, "loss": 1.06428738, "memory(GiB)": 142.32, "step": 23040, "train_speed(iter/s)": 0.28971 }, { "acc": 0.74026299, "epoch": 0.25792488469018393, "grad_norm": 5.71875, "learning_rate": 9.830507563421947e-06, "loss": 1.03068733, "memory(GiB)": 142.32, "step": 23060, "train_speed(iter/s)": 0.289793 }, { "acc": 0.73186893, "epoch": 0.25814858363614246, "grad_norm": 7.4375, "learning_rate": 9.830029781922036e-06, "loss": 1.06551056, "memory(GiB)": 142.32, "step": 23080, "train_speed(iter/s)": 0.289876 }, { "acc": 0.71561799, "epoch": 0.258372282582101, "grad_norm": 4.84375, "learning_rate": 9.829551339605015e-06, "loss": 1.13698273, "memory(GiB)": 142.32, "step": 23100, "train_speed(iter/s)": 0.289958 }, { "acc": 0.72996836, "epoch": 0.2585959815280595, "grad_norm": 5.0625, "learning_rate": 9.829072236536338e-06, "loss": 1.08272934, "memory(GiB)": 142.32, "step": 23120, "train_speed(iter/s)": 0.290046 }, { "acc": 0.72769456, "epoch": 0.25881968047401804, "grad_norm": 5.09375, "learning_rate": 9.828592472781556e-06, "loss": 1.10306368, "memory(GiB)": 142.32, "step": 23140, "train_speed(iter/s)": 0.290131 }, { "acc": 0.71702108, "epoch": 0.25904337941997657, "grad_norm": 5.15625, "learning_rate": 9.828112048406308e-06, "loss": 1.14113197, "memory(GiB)": 142.32, "step": 23160, "train_speed(iter/s)": 0.290223 }, { "acc": 0.70982294, "epoch": 0.2592670783659351, "grad_norm": 5.1875, "learning_rate": 9.827630963476323e-06, "loss": 1.16623087, "memory(GiB)": 142.32, "step": 23180, "train_speed(iter/s)": 0.290313 }, { "acc": 0.72698631, "epoch": 0.25949077731189363, "grad_norm": 5.28125, "learning_rate": 9.827149218057418e-06, "loss": 1.10200796, "memory(GiB)": 142.32, "step": 23200, "train_speed(iter/s)": 0.290403 }, { "acc": 0.71697106, "epoch": 0.25971447625785216, "grad_norm": 5.6875, "learning_rate": 9.826666812215504e-06, "loss": 1.12425833, "memory(GiB)": 142.32, "step": 23220, "train_speed(iter/s)": 0.290492 }, { "acc": 0.73246021, "epoch": 0.2599381752038107, "grad_norm": 5.875, "learning_rate": 9.826183746016582e-06, "loss": 1.05626011, "memory(GiB)": 142.32, "step": 23240, "train_speed(iter/s)": 0.290575 }, { "acc": 0.72406816, "epoch": 0.26016187414976927, "grad_norm": 6.25, "learning_rate": 9.825700019526742e-06, "loss": 1.09930744, "memory(GiB)": 142.32, "step": 23260, "train_speed(iter/s)": 0.29065 }, { "acc": 0.71708145, "epoch": 0.2603855730957278, "grad_norm": 6.28125, "learning_rate": 9.825215632812163e-06, "loss": 1.13456545, "memory(GiB)": 142.32, "step": 23280, "train_speed(iter/s)": 0.290731 }, { "acc": 0.72637625, "epoch": 0.2606092720416863, "grad_norm": 6.71875, "learning_rate": 9.824730585939117e-06, "loss": 1.09588432, "memory(GiB)": 142.32, "step": 23300, "train_speed(iter/s)": 0.29081 }, { "acc": 0.71632509, "epoch": 0.26083297098764485, "grad_norm": 6.28125, "learning_rate": 9.824244878973967e-06, "loss": 1.13044281, "memory(GiB)": 142.32, "step": 23320, "train_speed(iter/s)": 0.290881 }, { "acc": 0.72306156, "epoch": 0.2610566699336034, "grad_norm": 5.125, "learning_rate": 9.823758511983162e-06, "loss": 1.10234375, "memory(GiB)": 142.32, "step": 23340, "train_speed(iter/s)": 0.290966 }, { "acc": 0.70850315, "epoch": 0.2612803688795619, "grad_norm": 6.0, "learning_rate": 9.823271485033246e-06, "loss": 1.17696371, "memory(GiB)": 142.32, "step": 23360, "train_speed(iter/s)": 0.291037 }, { "acc": 0.72842174, "epoch": 0.26150406782552044, "grad_norm": 5.90625, "learning_rate": 9.82278379819085e-06, "loss": 1.07459259, "memory(GiB)": 142.32, "step": 23380, "train_speed(iter/s)": 0.291125 }, { "acc": 0.71432877, "epoch": 0.26172776677147896, "grad_norm": 6.1875, "learning_rate": 9.822295451522697e-06, "loss": 1.14003916, "memory(GiB)": 142.32, "step": 23400, "train_speed(iter/s)": 0.291206 }, { "acc": 0.72894726, "epoch": 0.2619514657174375, "grad_norm": 6.21875, "learning_rate": 9.821806445095598e-06, "loss": 1.0813467, "memory(GiB)": 142.32, "step": 23420, "train_speed(iter/s)": 0.291287 }, { "acc": 0.73352566, "epoch": 0.262175164663396, "grad_norm": 5.46875, "learning_rate": 9.821316778976461e-06, "loss": 1.05734348, "memory(GiB)": 142.32, "step": 23440, "train_speed(iter/s)": 0.291368 }, { "acc": 0.72696233, "epoch": 0.26239886360935455, "grad_norm": 5.40625, "learning_rate": 9.820826453232275e-06, "loss": 1.08002396, "memory(GiB)": 142.32, "step": 23460, "train_speed(iter/s)": 0.291444 }, { "acc": 0.71992788, "epoch": 0.2626225625553131, "grad_norm": 6.75, "learning_rate": 9.820335467930125e-06, "loss": 1.12796135, "memory(GiB)": 142.32, "step": 23480, "train_speed(iter/s)": 0.291525 }, { "acc": 0.71901846, "epoch": 0.2628462615012716, "grad_norm": 5.46875, "learning_rate": 9.819843823137184e-06, "loss": 1.11366787, "memory(GiB)": 142.32, "step": 23500, "train_speed(iter/s)": 0.291598 }, { "acc": 0.72569599, "epoch": 0.26306996044723013, "grad_norm": 6.71875, "learning_rate": 9.819351518920714e-06, "loss": 1.08938408, "memory(GiB)": 142.32, "step": 23520, "train_speed(iter/s)": 0.291678 }, { "acc": 0.71923218, "epoch": 0.26329365939318866, "grad_norm": 8.4375, "learning_rate": 9.818858555348075e-06, "loss": 1.13003674, "memory(GiB)": 142.32, "step": 23540, "train_speed(iter/s)": 0.291754 }, { "acc": 0.73083277, "epoch": 0.2635173583391472, "grad_norm": 5.8125, "learning_rate": 9.818364932486709e-06, "loss": 1.06851454, "memory(GiB)": 142.32, "step": 23560, "train_speed(iter/s)": 0.291835 }, { "acc": 0.72447491, "epoch": 0.2637410572851057, "grad_norm": 6.96875, "learning_rate": 9.817870650404146e-06, "loss": 1.09534283, "memory(GiB)": 142.32, "step": 23580, "train_speed(iter/s)": 0.291902 }, { "acc": 0.71942062, "epoch": 0.26396475623106425, "grad_norm": 7.34375, "learning_rate": 9.817375709168018e-06, "loss": 1.1260004, "memory(GiB)": 142.32, "step": 23600, "train_speed(iter/s)": 0.291982 }, { "acc": 0.73951216, "epoch": 0.2641884551770228, "grad_norm": 6.5625, "learning_rate": 9.816880108846037e-06, "loss": 1.04460831, "memory(GiB)": 142.32, "step": 23620, "train_speed(iter/s)": 0.292064 }, { "acc": 0.71607542, "epoch": 0.2644121541229813, "grad_norm": 6.25, "learning_rate": 9.816383849506006e-06, "loss": 1.13025723, "memory(GiB)": 142.32, "step": 23640, "train_speed(iter/s)": 0.292148 }, { "acc": 0.72138443, "epoch": 0.26463585306893983, "grad_norm": 9.5, "learning_rate": 9.815886931215824e-06, "loss": 1.10330639, "memory(GiB)": 142.32, "step": 23660, "train_speed(iter/s)": 0.29223 }, { "acc": 0.71418304, "epoch": 0.26485955201489836, "grad_norm": 5.03125, "learning_rate": 9.815389354043474e-06, "loss": 1.13684692, "memory(GiB)": 142.32, "step": 23680, "train_speed(iter/s)": 0.292308 }, { "acc": 0.72989006, "epoch": 0.2650832509608569, "grad_norm": 6.375, "learning_rate": 9.814891118057033e-06, "loss": 1.06268806, "memory(GiB)": 142.32, "step": 23700, "train_speed(iter/s)": 0.292385 }, { "acc": 0.72601976, "epoch": 0.2653069499068154, "grad_norm": 5.96875, "learning_rate": 9.814392223324667e-06, "loss": 1.09257154, "memory(GiB)": 142.32, "step": 23720, "train_speed(iter/s)": 0.292467 }, { "acc": 0.72285719, "epoch": 0.26553064885277394, "grad_norm": 6.3125, "learning_rate": 9.81389266991463e-06, "loss": 1.10633087, "memory(GiB)": 142.32, "step": 23740, "train_speed(iter/s)": 0.292542 }, { "acc": 0.72345362, "epoch": 0.26575434779873247, "grad_norm": 4.40625, "learning_rate": 9.81339245789527e-06, "loss": 1.1014637, "memory(GiB)": 142.32, "step": 23760, "train_speed(iter/s)": 0.292631 }, { "acc": 0.72860813, "epoch": 0.265978046744691, "grad_norm": 5.84375, "learning_rate": 9.812891587335023e-06, "loss": 1.07758484, "memory(GiB)": 142.32, "step": 23780, "train_speed(iter/s)": 0.292716 }, { "acc": 0.71475959, "epoch": 0.2662017456906495, "grad_norm": 6.875, "learning_rate": 9.812390058302415e-06, "loss": 1.13192749, "memory(GiB)": 142.32, "step": 23800, "train_speed(iter/s)": 0.2928 }, { "acc": 0.72877865, "epoch": 0.26642544463660806, "grad_norm": 5.9375, "learning_rate": 9.811887870866062e-06, "loss": 1.06678801, "memory(GiB)": 142.32, "step": 23820, "train_speed(iter/s)": 0.292879 }, { "acc": 0.72192755, "epoch": 0.2666491435825666, "grad_norm": 5.125, "learning_rate": 9.811385025094669e-06, "loss": 1.11638222, "memory(GiB)": 142.32, "step": 23840, "train_speed(iter/s)": 0.292958 }, { "acc": 0.72832689, "epoch": 0.2668728425285251, "grad_norm": 6.53125, "learning_rate": 9.810881521057035e-06, "loss": 1.07658291, "memory(GiB)": 142.32, "step": 23860, "train_speed(iter/s)": 0.293041 }, { "acc": 0.72293973, "epoch": 0.26709654147448364, "grad_norm": 5.3125, "learning_rate": 9.810377358822046e-06, "loss": 1.09435654, "memory(GiB)": 142.32, "step": 23880, "train_speed(iter/s)": 0.293122 }, { "acc": 0.72246437, "epoch": 0.26732024042044217, "grad_norm": 5.28125, "learning_rate": 9.809872538458678e-06, "loss": 1.11028757, "memory(GiB)": 142.32, "step": 23900, "train_speed(iter/s)": 0.293205 }, { "acc": 0.72609787, "epoch": 0.2675439393664007, "grad_norm": 5.03125, "learning_rate": 9.809367060035997e-06, "loss": 1.08507805, "memory(GiB)": 142.32, "step": 23920, "train_speed(iter/s)": 0.293278 }, { "acc": 0.71706657, "epoch": 0.2677676383123592, "grad_norm": 5.75, "learning_rate": 9.80886092362316e-06, "loss": 1.1245841, "memory(GiB)": 142.32, "step": 23940, "train_speed(iter/s)": 0.293362 }, { "acc": 0.7104495, "epoch": 0.26799133725831775, "grad_norm": 5.09375, "learning_rate": 9.808354129289417e-06, "loss": 1.16606112, "memory(GiB)": 142.32, "step": 23960, "train_speed(iter/s)": 0.293442 }, { "acc": 0.71400909, "epoch": 0.2682150362042763, "grad_norm": 6.03125, "learning_rate": 9.8078466771041e-06, "loss": 1.14920216, "memory(GiB)": 142.32, "step": 23980, "train_speed(iter/s)": 0.293519 }, { "acc": 0.71243467, "epoch": 0.2684387351502348, "grad_norm": 7.15625, "learning_rate": 9.807338567136637e-06, "loss": 1.13077106, "memory(GiB)": 142.32, "step": 24000, "train_speed(iter/s)": 0.293604 }, { "epoch": 0.2684387351502348, "eval_acc": 0.6857878532253224, "eval_loss": 1.1177531480789185, "eval_runtime": 2342.9256, "eval_samples_per_second": 32.132, "eval_steps_per_second": 16.066, "step": 24000 }, { "acc": 0.72340102, "epoch": 0.26866243409619334, "grad_norm": 6.53125, "learning_rate": 9.806829799456547e-06, "loss": 1.12509613, "memory(GiB)": 142.32, "step": 24020, "train_speed(iter/s)": 0.285333 }, { "acc": 0.72757387, "epoch": 0.26888613304215186, "grad_norm": 6.53125, "learning_rate": 9.806320374133434e-06, "loss": 1.07848978, "memory(GiB)": 142.32, "step": 24040, "train_speed(iter/s)": 0.285407 }, { "acc": 0.71878853, "epoch": 0.2691098319881104, "grad_norm": 7.125, "learning_rate": 9.805810291236996e-06, "loss": 1.12013912, "memory(GiB)": 142.32, "step": 24060, "train_speed(iter/s)": 0.285485 }, { "acc": 0.72890005, "epoch": 0.2693335309340689, "grad_norm": 5.5625, "learning_rate": 9.805299550837018e-06, "loss": 1.08585796, "memory(GiB)": 142.32, "step": 24080, "train_speed(iter/s)": 0.285564 }, { "acc": 0.72726727, "epoch": 0.26955722988002745, "grad_norm": 5.15625, "learning_rate": 9.80478815300338e-06, "loss": 1.08262959, "memory(GiB)": 142.32, "step": 24100, "train_speed(iter/s)": 0.285642 }, { "acc": 0.72678647, "epoch": 0.269780928825986, "grad_norm": 6.125, "learning_rate": 9.804276097806045e-06, "loss": 1.07684441, "memory(GiB)": 142.32, "step": 24120, "train_speed(iter/s)": 0.285723 }, { "acc": 0.71715555, "epoch": 0.2700046277719445, "grad_norm": 6.5625, "learning_rate": 9.803763385315072e-06, "loss": 1.1466547, "memory(GiB)": 142.32, "step": 24140, "train_speed(iter/s)": 0.285801 }, { "acc": 0.72404499, "epoch": 0.27022832671790303, "grad_norm": 6.5625, "learning_rate": 9.803250015600605e-06, "loss": 1.10275784, "memory(GiB)": 142.32, "step": 24160, "train_speed(iter/s)": 0.285886 }, { "acc": 0.71524081, "epoch": 0.27045202566386156, "grad_norm": 5.9375, "learning_rate": 9.802735988732882e-06, "loss": 1.14549179, "memory(GiB)": 142.32, "step": 24180, "train_speed(iter/s)": 0.285967 }, { "acc": 0.72841892, "epoch": 0.2706757246098201, "grad_norm": 6.8125, "learning_rate": 9.802221304782229e-06, "loss": 1.08731441, "memory(GiB)": 142.32, "step": 24200, "train_speed(iter/s)": 0.286054 }, { "acc": 0.72585459, "epoch": 0.2708994235557786, "grad_norm": 5.1875, "learning_rate": 9.801705963819063e-06, "loss": 1.0896225, "memory(GiB)": 142.32, "step": 24220, "train_speed(iter/s)": 0.286137 }, { "acc": 0.72137885, "epoch": 0.27112312250173715, "grad_norm": 5.625, "learning_rate": 9.801189965913886e-06, "loss": 1.1167738, "memory(GiB)": 142.32, "step": 24240, "train_speed(iter/s)": 0.286217 }, { "acc": 0.72158408, "epoch": 0.2713468214476957, "grad_norm": 5.65625, "learning_rate": 9.800673311137301e-06, "loss": 1.13554792, "memory(GiB)": 142.32, "step": 24260, "train_speed(iter/s)": 0.286303 }, { "acc": 0.71962023, "epoch": 0.2715705203936542, "grad_norm": 6.03125, "learning_rate": 9.800155999559986e-06, "loss": 1.1214344, "memory(GiB)": 142.32, "step": 24280, "train_speed(iter/s)": 0.286391 }, { "acc": 0.72673531, "epoch": 0.27179421933961273, "grad_norm": 5.5, "learning_rate": 9.799638031252723e-06, "loss": 1.09160166, "memory(GiB)": 142.32, "step": 24300, "train_speed(iter/s)": 0.28647 }, { "acc": 0.71800623, "epoch": 0.27201791828557126, "grad_norm": 4.71875, "learning_rate": 9.799119406286373e-06, "loss": 1.14215488, "memory(GiB)": 142.32, "step": 24320, "train_speed(iter/s)": 0.286535 }, { "acc": 0.7212184, "epoch": 0.2722416172315298, "grad_norm": 6.4375, "learning_rate": 9.798600124731893e-06, "loss": 1.10550327, "memory(GiB)": 142.32, "step": 24340, "train_speed(iter/s)": 0.286615 }, { "acc": 0.73727856, "epoch": 0.2724653161774883, "grad_norm": 6.5625, "learning_rate": 9.798080186660328e-06, "loss": 1.04985218, "memory(GiB)": 142.32, "step": 24360, "train_speed(iter/s)": 0.286696 }, { "acc": 0.72503376, "epoch": 0.27268901512344684, "grad_norm": 6.71875, "learning_rate": 9.797559592142814e-06, "loss": 1.09478703, "memory(GiB)": 142.32, "step": 24380, "train_speed(iter/s)": 0.286771 }, { "acc": 0.73038397, "epoch": 0.27291271406940537, "grad_norm": 5.6875, "learning_rate": 9.797038341250574e-06, "loss": 1.06689358, "memory(GiB)": 142.32, "step": 24400, "train_speed(iter/s)": 0.286851 }, { "acc": 0.72507443, "epoch": 0.2731364130153639, "grad_norm": 6.125, "learning_rate": 9.796516434054923e-06, "loss": 1.0929636, "memory(GiB)": 142.32, "step": 24420, "train_speed(iter/s)": 0.286933 }, { "acc": 0.71597786, "epoch": 0.2733601119613224, "grad_norm": 4.875, "learning_rate": 9.795993870627267e-06, "loss": 1.14805021, "memory(GiB)": 142.32, "step": 24440, "train_speed(iter/s)": 0.287013 }, { "acc": 0.73132482, "epoch": 0.27358381090728096, "grad_norm": 6.78125, "learning_rate": 9.795470651039099e-06, "loss": 1.06813431, "memory(GiB)": 142.32, "step": 24460, "train_speed(iter/s)": 0.2871 }, { "acc": 0.72178822, "epoch": 0.2738075098532395, "grad_norm": 5.6875, "learning_rate": 9.794946775362002e-06, "loss": 1.10530739, "memory(GiB)": 142.32, "step": 24480, "train_speed(iter/s)": 0.287183 }, { "acc": 0.72365713, "epoch": 0.274031208799198, "grad_norm": 5.75, "learning_rate": 9.794422243667651e-06, "loss": 1.08401651, "memory(GiB)": 142.32, "step": 24500, "train_speed(iter/s)": 0.28727 }, { "acc": 0.7243237, "epoch": 0.2742549077451566, "grad_norm": 5.3125, "learning_rate": 9.79389705602781e-06, "loss": 1.08907681, "memory(GiB)": 142.32, "step": 24520, "train_speed(iter/s)": 0.287342 }, { "acc": 0.71824369, "epoch": 0.2744786066911151, "grad_norm": 5.6875, "learning_rate": 9.79337121251433e-06, "loss": 1.11943636, "memory(GiB)": 142.32, "step": 24540, "train_speed(iter/s)": 0.287418 }, { "acc": 0.72072783, "epoch": 0.27470230563707365, "grad_norm": 4.875, "learning_rate": 9.792844713199156e-06, "loss": 1.12114296, "memory(GiB)": 142.32, "step": 24560, "train_speed(iter/s)": 0.28749 }, { "acc": 0.71360126, "epoch": 0.2749260045830322, "grad_norm": 6.875, "learning_rate": 9.792317558154318e-06, "loss": 1.1558157, "memory(GiB)": 142.32, "step": 24580, "train_speed(iter/s)": 0.287571 }, { "acc": 0.72224674, "epoch": 0.2751497035289907, "grad_norm": 6.78125, "learning_rate": 9.79178974745194e-06, "loss": 1.11086788, "memory(GiB)": 142.32, "step": 24600, "train_speed(iter/s)": 0.28765 }, { "acc": 0.72418871, "epoch": 0.27537340247494924, "grad_norm": 6.71875, "learning_rate": 9.791261281164236e-06, "loss": 1.1058939, "memory(GiB)": 142.32, "step": 24620, "train_speed(iter/s)": 0.287726 }, { "acc": 0.72967625, "epoch": 0.27559710142090776, "grad_norm": 5.4375, "learning_rate": 9.790732159363505e-06, "loss": 1.0529314, "memory(GiB)": 142.32, "step": 24640, "train_speed(iter/s)": 0.287808 }, { "acc": 0.72588511, "epoch": 0.2758208003668663, "grad_norm": 7.125, "learning_rate": 9.79020238212214e-06, "loss": 1.08301334, "memory(GiB)": 142.32, "step": 24660, "train_speed(iter/s)": 0.287883 }, { "acc": 0.71663404, "epoch": 0.2760444993128248, "grad_norm": 6.875, "learning_rate": 9.78967194951262e-06, "loss": 1.14859753, "memory(GiB)": 142.32, "step": 24680, "train_speed(iter/s)": 0.287966 }, { "acc": 0.7221601, "epoch": 0.27626819825878335, "grad_norm": 5.375, "learning_rate": 9.789140861607518e-06, "loss": 1.11344118, "memory(GiB)": 142.32, "step": 24700, "train_speed(iter/s)": 0.288044 }, { "acc": 0.73307314, "epoch": 0.2764918972047419, "grad_norm": 6.40625, "learning_rate": 9.788609118479494e-06, "loss": 1.07904224, "memory(GiB)": 142.32, "step": 24720, "train_speed(iter/s)": 0.288118 }, { "acc": 0.72280369, "epoch": 0.2767155961507004, "grad_norm": 5.46875, "learning_rate": 9.788076720201296e-06, "loss": 1.10647354, "memory(GiB)": 142.32, "step": 24740, "train_speed(iter/s)": 0.288195 }, { "acc": 0.72586489, "epoch": 0.27693929509665893, "grad_norm": 5.75, "learning_rate": 9.787543666845766e-06, "loss": 1.08056202, "memory(GiB)": 142.32, "step": 24760, "train_speed(iter/s)": 0.288275 }, { "acc": 0.73060117, "epoch": 0.27716299404261746, "grad_norm": 6.4375, "learning_rate": 9.787009958485831e-06, "loss": 1.07633181, "memory(GiB)": 142.32, "step": 24780, "train_speed(iter/s)": 0.288357 }, { "acc": 0.72803726, "epoch": 0.277386692988576, "grad_norm": 4.46875, "learning_rate": 9.786475595194514e-06, "loss": 1.08972187, "memory(GiB)": 142.32, "step": 24800, "train_speed(iter/s)": 0.288439 }, { "acc": 0.71779842, "epoch": 0.2776103919345345, "grad_norm": 5.53125, "learning_rate": 9.78594057704492e-06, "loss": 1.12787743, "memory(GiB)": 142.32, "step": 24820, "train_speed(iter/s)": 0.288524 }, { "acc": 0.73576684, "epoch": 0.27783409088049305, "grad_norm": 6.625, "learning_rate": 9.78540490411025e-06, "loss": 1.05447979, "memory(GiB)": 142.32, "step": 24840, "train_speed(iter/s)": 0.288595 }, { "acc": 0.73101606, "epoch": 0.2780577898264516, "grad_norm": 5.34375, "learning_rate": 9.784868576463787e-06, "loss": 1.07836123, "memory(GiB)": 142.32, "step": 24860, "train_speed(iter/s)": 0.288669 }, { "acc": 0.70662513, "epoch": 0.2782814887724101, "grad_norm": 6.84375, "learning_rate": 9.784331594178913e-06, "loss": 1.17955055, "memory(GiB)": 142.32, "step": 24880, "train_speed(iter/s)": 0.288742 }, { "acc": 0.72187986, "epoch": 0.27850518771836863, "grad_norm": 6.40625, "learning_rate": 9.783793957329094e-06, "loss": 1.10037813, "memory(GiB)": 142.32, "step": 24900, "train_speed(iter/s)": 0.288819 }, { "acc": 0.72303782, "epoch": 0.27872888666432716, "grad_norm": 5.875, "learning_rate": 9.783255665987883e-06, "loss": 1.10248852, "memory(GiB)": 142.32, "step": 24920, "train_speed(iter/s)": 0.288897 }, { "acc": 0.7182848, "epoch": 0.2789525856102857, "grad_norm": 6.375, "learning_rate": 9.78271672022893e-06, "loss": 1.13012438, "memory(GiB)": 142.32, "step": 24940, "train_speed(iter/s)": 0.288977 }, { "acc": 0.71810904, "epoch": 0.2791762845562442, "grad_norm": 5.71875, "learning_rate": 9.782177120125968e-06, "loss": 1.12914352, "memory(GiB)": 142.32, "step": 24960, "train_speed(iter/s)": 0.289052 }, { "acc": 0.71054773, "epoch": 0.27939998350220274, "grad_norm": 5.90625, "learning_rate": 9.781636865752824e-06, "loss": 1.15440197, "memory(GiB)": 142.32, "step": 24980, "train_speed(iter/s)": 0.289137 }, { "acc": 0.73293447, "epoch": 0.27962368244816127, "grad_norm": 6.59375, "learning_rate": 9.78109595718341e-06, "loss": 1.05614929, "memory(GiB)": 142.32, "step": 25000, "train_speed(iter/s)": 0.289218 }, { "acc": 0.73269768, "epoch": 0.2798473813941198, "grad_norm": 5.875, "learning_rate": 9.780554394491733e-06, "loss": 1.05349674, "memory(GiB)": 142.32, "step": 25020, "train_speed(iter/s)": 0.289297 }, { "acc": 0.7129014, "epoch": 0.2800710803400783, "grad_norm": 6.4375, "learning_rate": 9.780012177751882e-06, "loss": 1.15646782, "memory(GiB)": 142.32, "step": 25040, "train_speed(iter/s)": 0.289373 }, { "acc": 0.72060423, "epoch": 0.28029477928603685, "grad_norm": 5.96875, "learning_rate": 9.779469307038048e-06, "loss": 1.12878399, "memory(GiB)": 142.32, "step": 25060, "train_speed(iter/s)": 0.289446 }, { "acc": 0.73070421, "epoch": 0.2805184782319954, "grad_norm": 6.5, "learning_rate": 9.778925782424495e-06, "loss": 1.05725822, "memory(GiB)": 142.32, "step": 25080, "train_speed(iter/s)": 0.289524 }, { "acc": 0.73108678, "epoch": 0.2807421771779539, "grad_norm": 5.3125, "learning_rate": 9.778381603985589e-06, "loss": 1.04698334, "memory(GiB)": 142.32, "step": 25100, "train_speed(iter/s)": 0.289605 }, { "acc": 0.71966476, "epoch": 0.28096587612391244, "grad_norm": 6.0625, "learning_rate": 9.777836771795781e-06, "loss": 1.12306671, "memory(GiB)": 142.32, "step": 25120, "train_speed(iter/s)": 0.289654 }, { "acc": 0.72153463, "epoch": 0.28118957506987097, "grad_norm": 6.46875, "learning_rate": 9.777291285929611e-06, "loss": 1.09856787, "memory(GiB)": 142.32, "step": 25140, "train_speed(iter/s)": 0.289735 }, { "acc": 0.71385832, "epoch": 0.2814132740158295, "grad_norm": 5.40625, "learning_rate": 9.776745146461711e-06, "loss": 1.13563995, "memory(GiB)": 142.32, "step": 25160, "train_speed(iter/s)": 0.289803 }, { "acc": 0.71797423, "epoch": 0.281636972961788, "grad_norm": 5.84375, "learning_rate": 9.776198353466799e-06, "loss": 1.12445831, "memory(GiB)": 142.32, "step": 25180, "train_speed(iter/s)": 0.28988 }, { "acc": 0.73167324, "epoch": 0.28186067190774655, "grad_norm": 5.71875, "learning_rate": 9.775650907019682e-06, "loss": 1.06919918, "memory(GiB)": 142.32, "step": 25200, "train_speed(iter/s)": 0.289966 }, { "acc": 0.72429247, "epoch": 0.2820843708537051, "grad_norm": 5.90625, "learning_rate": 9.775102807195264e-06, "loss": 1.11423187, "memory(GiB)": 142.32, "step": 25220, "train_speed(iter/s)": 0.290049 }, { "acc": 0.71911979, "epoch": 0.2823080697996636, "grad_norm": 6.5625, "learning_rate": 9.774554054068531e-06, "loss": 1.1009901, "memory(GiB)": 142.32, "step": 25240, "train_speed(iter/s)": 0.290125 }, { "acc": 0.72598181, "epoch": 0.28253176874562214, "grad_norm": 4.3125, "learning_rate": 9.774004647714557e-06, "loss": 1.09828205, "memory(GiB)": 142.32, "step": 25260, "train_speed(iter/s)": 0.290189 }, { "acc": 0.72649651, "epoch": 0.28275546769158066, "grad_norm": 5.40625, "learning_rate": 9.773454588208513e-06, "loss": 1.08588276, "memory(GiB)": 142.32, "step": 25280, "train_speed(iter/s)": 0.290274 }, { "acc": 0.73328195, "epoch": 0.2829791666375392, "grad_norm": 6.125, "learning_rate": 9.772903875625651e-06, "loss": 1.06585979, "memory(GiB)": 142.32, "step": 25300, "train_speed(iter/s)": 0.290345 }, { "acc": 0.73621535, "epoch": 0.2832028655834977, "grad_norm": 5.65625, "learning_rate": 9.772352510041318e-06, "loss": 1.07474041, "memory(GiB)": 142.32, "step": 25320, "train_speed(iter/s)": 0.290419 }, { "acc": 0.72254829, "epoch": 0.28342656452945625, "grad_norm": 5.46875, "learning_rate": 9.771800491530951e-06, "loss": 1.098493, "memory(GiB)": 142.32, "step": 25340, "train_speed(iter/s)": 0.290493 }, { "acc": 0.71016521, "epoch": 0.2836502634754148, "grad_norm": 4.6875, "learning_rate": 9.77124782017007e-06, "loss": 1.18175945, "memory(GiB)": 142.32, "step": 25360, "train_speed(iter/s)": 0.290567 }, { "acc": 0.72691259, "epoch": 0.2838739624213733, "grad_norm": 5.1875, "learning_rate": 9.770694496034293e-06, "loss": 1.08650656, "memory(GiB)": 142.32, "step": 25380, "train_speed(iter/s)": 0.290637 }, { "acc": 0.70954261, "epoch": 0.28409766136733183, "grad_norm": 5.59375, "learning_rate": 9.770140519199319e-06, "loss": 1.17277775, "memory(GiB)": 142.32, "step": 25400, "train_speed(iter/s)": 0.29071 }, { "acc": 0.72148342, "epoch": 0.28432136031329036, "grad_norm": 7.09375, "learning_rate": 9.76958588974094e-06, "loss": 1.13305187, "memory(GiB)": 142.32, "step": 25420, "train_speed(iter/s)": 0.290786 }, { "acc": 0.71309643, "epoch": 0.2845450592592489, "grad_norm": 5.15625, "learning_rate": 9.769030607735038e-06, "loss": 1.14933519, "memory(GiB)": 142.32, "step": 25440, "train_speed(iter/s)": 0.290865 }, { "acc": 0.71774626, "epoch": 0.2847687582052074, "grad_norm": 5.0625, "learning_rate": 9.768474673257584e-06, "loss": 1.12035542, "memory(GiB)": 142.32, "step": 25460, "train_speed(iter/s)": 0.290942 }, { "acc": 0.72522812, "epoch": 0.28499245715116595, "grad_norm": 6.53125, "learning_rate": 9.767918086384638e-06, "loss": 1.0752491, "memory(GiB)": 142.32, "step": 25480, "train_speed(iter/s)": 0.291023 }, { "acc": 0.72710896, "epoch": 0.2852161560971245, "grad_norm": 4.65625, "learning_rate": 9.767360847192348e-06, "loss": 1.10058708, "memory(GiB)": 142.32, "step": 25500, "train_speed(iter/s)": 0.291096 }, { "acc": 0.7304245, "epoch": 0.285439855043083, "grad_norm": 5.71875, "learning_rate": 9.766802955756953e-06, "loss": 1.07844105, "memory(GiB)": 142.32, "step": 25520, "train_speed(iter/s)": 0.291165 }, { "acc": 0.72476678, "epoch": 0.28566355398904153, "grad_norm": 6.03125, "learning_rate": 9.766244412154782e-06, "loss": 1.08928661, "memory(GiB)": 142.32, "step": 25540, "train_speed(iter/s)": 0.291243 }, { "acc": 0.7220356, "epoch": 0.28588725293500006, "grad_norm": 4.65625, "learning_rate": 9.765685216462249e-06, "loss": 1.10337029, "memory(GiB)": 142.32, "step": 25560, "train_speed(iter/s)": 0.291317 }, { "acc": 0.71569128, "epoch": 0.2861109518809586, "grad_norm": 5.8125, "learning_rate": 9.765125368755859e-06, "loss": 1.13140736, "memory(GiB)": 142.32, "step": 25580, "train_speed(iter/s)": 0.291387 }, { "acc": 0.71643534, "epoch": 0.2863346508269171, "grad_norm": 5.84375, "learning_rate": 9.764564869112212e-06, "loss": 1.13639317, "memory(GiB)": 142.32, "step": 25600, "train_speed(iter/s)": 0.291451 }, { "acc": 0.71734776, "epoch": 0.28655834977287564, "grad_norm": 6.25, "learning_rate": 9.764003717607988e-06, "loss": 1.14292221, "memory(GiB)": 142.32, "step": 25620, "train_speed(iter/s)": 0.291515 }, { "acc": 0.72400265, "epoch": 0.28678204871883417, "grad_norm": 5.625, "learning_rate": 9.763441914319961e-06, "loss": 1.10858364, "memory(GiB)": 142.32, "step": 25640, "train_speed(iter/s)": 0.29159 }, { "acc": 0.72180591, "epoch": 0.2870057476647927, "grad_norm": 4.5, "learning_rate": 9.762879459324998e-06, "loss": 1.10875664, "memory(GiB)": 142.32, "step": 25660, "train_speed(iter/s)": 0.291668 }, { "acc": 0.7198843, "epoch": 0.2872294466107512, "grad_norm": 5.375, "learning_rate": 9.762316352700045e-06, "loss": 1.12436142, "memory(GiB)": 142.32, "step": 25680, "train_speed(iter/s)": 0.291744 }, { "acc": 0.72478075, "epoch": 0.28745314555670975, "grad_norm": 5.875, "learning_rate": 9.761752594522147e-06, "loss": 1.09992504, "memory(GiB)": 142.32, "step": 25700, "train_speed(iter/s)": 0.291832 }, { "acc": 0.74242573, "epoch": 0.2876768445026683, "grad_norm": 5.9375, "learning_rate": 9.761188184868433e-06, "loss": 1.02601929, "memory(GiB)": 142.32, "step": 25720, "train_speed(iter/s)": 0.291913 }, { "acc": 0.72282472, "epoch": 0.2879005434486268, "grad_norm": 5.71875, "learning_rate": 9.760623123816122e-06, "loss": 1.11701298, "memory(GiB)": 142.32, "step": 25740, "train_speed(iter/s)": 0.291987 }, { "acc": 0.72597351, "epoch": 0.2881242423945854, "grad_norm": 6.5625, "learning_rate": 9.760057411442523e-06, "loss": 1.08891945, "memory(GiB)": 142.32, "step": 25760, "train_speed(iter/s)": 0.292059 }, { "acc": 0.71617556, "epoch": 0.2883479413405439, "grad_norm": 5.21875, "learning_rate": 9.759491047825034e-06, "loss": 1.14252167, "memory(GiB)": 142.32, "step": 25780, "train_speed(iter/s)": 0.292123 }, { "acc": 0.73053932, "epoch": 0.28857164028650245, "grad_norm": 6.78125, "learning_rate": 9.758924033041139e-06, "loss": 1.06741791, "memory(GiB)": 142.32, "step": 25800, "train_speed(iter/s)": 0.292197 }, { "acc": 0.73872929, "epoch": 0.288795339232461, "grad_norm": 4.875, "learning_rate": 9.758356367168416e-06, "loss": 1.0521719, "memory(GiB)": 142.32, "step": 25820, "train_speed(iter/s)": 0.292273 }, { "acc": 0.72442398, "epoch": 0.2890190381784195, "grad_norm": 5.21875, "learning_rate": 9.75778805028453e-06, "loss": 1.07799988, "memory(GiB)": 142.32, "step": 25840, "train_speed(iter/s)": 0.292344 }, { "acc": 0.7341393, "epoch": 0.28924273712437804, "grad_norm": 5.0625, "learning_rate": 9.757219082467233e-06, "loss": 1.05511408, "memory(GiB)": 142.32, "step": 25860, "train_speed(iter/s)": 0.292415 }, { "acc": 0.73839941, "epoch": 0.28946643607033656, "grad_norm": 6.5625, "learning_rate": 9.756649463794372e-06, "loss": 1.02913799, "memory(GiB)": 142.32, "step": 25880, "train_speed(iter/s)": 0.292489 }, { "acc": 0.72082891, "epoch": 0.2896901350162951, "grad_norm": 6.625, "learning_rate": 9.756079194343875e-06, "loss": 1.11652279, "memory(GiB)": 142.32, "step": 25900, "train_speed(iter/s)": 0.292559 }, { "acc": 0.72885299, "epoch": 0.2899138339622536, "grad_norm": 5.1875, "learning_rate": 9.755508274193764e-06, "loss": 1.09521408, "memory(GiB)": 142.32, "step": 25920, "train_speed(iter/s)": 0.29263 }, { "acc": 0.72872639, "epoch": 0.29013753290821215, "grad_norm": 5.78125, "learning_rate": 9.754936703422147e-06, "loss": 1.08381691, "memory(GiB)": 142.32, "step": 25940, "train_speed(iter/s)": 0.292707 }, { "acc": 0.72674346, "epoch": 0.2903612318541707, "grad_norm": 6.90625, "learning_rate": 9.754364482107227e-06, "loss": 1.10006075, "memory(GiB)": 142.32, "step": 25960, "train_speed(iter/s)": 0.292773 }, { "acc": 0.72488031, "epoch": 0.2905849308001292, "grad_norm": 6.125, "learning_rate": 9.753791610327291e-06, "loss": 1.08584061, "memory(GiB)": 142.32, "step": 25980, "train_speed(iter/s)": 0.292834 }, { "acc": 0.73402185, "epoch": 0.29080862974608773, "grad_norm": 5.4375, "learning_rate": 9.753218088160715e-06, "loss": 1.05246658, "memory(GiB)": 142.32, "step": 26000, "train_speed(iter/s)": 0.292901 }, { "epoch": 0.29080862974608773, "eval_acc": 0.686574134873282, "eval_loss": 1.114898920059204, "eval_runtime": 2341.5894, "eval_samples_per_second": 32.15, "eval_steps_per_second": 16.075, "step": 26000 }, { "acc": 0.70869236, "epoch": 0.29103232869204626, "grad_norm": 5.59375, "learning_rate": 9.752643915685963e-06, "loss": 1.18457365, "memory(GiB)": 142.32, "step": 26020, "train_speed(iter/s)": 0.285296 }, { "acc": 0.71389742, "epoch": 0.2912560276380048, "grad_norm": 4.59375, "learning_rate": 9.752069092981596e-06, "loss": 1.15102682, "memory(GiB)": 142.32, "step": 26040, "train_speed(iter/s)": 0.285365 }, { "acc": 0.7143168, "epoch": 0.2914797265839633, "grad_norm": 5.09375, "learning_rate": 9.751493620126254e-06, "loss": 1.15994463, "memory(GiB)": 142.32, "step": 26060, "train_speed(iter/s)": 0.285433 }, { "acc": 0.71820898, "epoch": 0.29170342552992184, "grad_norm": 6.1875, "learning_rate": 9.750917497198669e-06, "loss": 1.13074474, "memory(GiB)": 142.32, "step": 26080, "train_speed(iter/s)": 0.285503 }, { "acc": 0.71201382, "epoch": 0.2919271244758804, "grad_norm": 5.875, "learning_rate": 9.750340724277665e-06, "loss": 1.15038738, "memory(GiB)": 142.32, "step": 26100, "train_speed(iter/s)": 0.285573 }, { "acc": 0.72754011, "epoch": 0.2921508234218389, "grad_norm": 5.15625, "learning_rate": 9.74976330144215e-06, "loss": 1.06941586, "memory(GiB)": 142.32, "step": 26120, "train_speed(iter/s)": 0.285648 }, { "acc": 0.7281621, "epoch": 0.29237452236779743, "grad_norm": 5.8125, "learning_rate": 9.749185228771128e-06, "loss": 1.07384796, "memory(GiB)": 142.32, "step": 26140, "train_speed(iter/s)": 0.285725 }, { "acc": 0.73459654, "epoch": 0.29259822131375596, "grad_norm": 5.3125, "learning_rate": 9.748606506343683e-06, "loss": 1.04992256, "memory(GiB)": 142.32, "step": 26160, "train_speed(iter/s)": 0.285793 }, { "acc": 0.72343364, "epoch": 0.2928219202597145, "grad_norm": 5.40625, "learning_rate": 9.748027134238995e-06, "loss": 1.09361496, "memory(GiB)": 142.32, "step": 26180, "train_speed(iter/s)": 0.285869 }, { "acc": 0.72095027, "epoch": 0.293045619205673, "grad_norm": 6.21875, "learning_rate": 9.747447112536333e-06, "loss": 1.11448326, "memory(GiB)": 142.32, "step": 26200, "train_speed(iter/s)": 0.285942 }, { "acc": 0.72910051, "epoch": 0.29326931815163154, "grad_norm": 6.6875, "learning_rate": 9.746866441315047e-06, "loss": 1.07398911, "memory(GiB)": 142.32, "step": 26220, "train_speed(iter/s)": 0.286018 }, { "acc": 0.7156251, "epoch": 0.29349301709759007, "grad_norm": 7.375, "learning_rate": 9.746285120654582e-06, "loss": 1.12916136, "memory(GiB)": 142.32, "step": 26240, "train_speed(iter/s)": 0.286093 }, { "acc": 0.72272749, "epoch": 0.2937167160435486, "grad_norm": 5.875, "learning_rate": 9.745703150634475e-06, "loss": 1.10458508, "memory(GiB)": 142.32, "step": 26260, "train_speed(iter/s)": 0.286176 }, { "acc": 0.73004446, "epoch": 0.2939404149895071, "grad_norm": 6.8125, "learning_rate": 9.745120531334344e-06, "loss": 1.07468147, "memory(GiB)": 142.32, "step": 26280, "train_speed(iter/s)": 0.286248 }, { "acc": 0.73104897, "epoch": 0.29416411393546565, "grad_norm": 5.5625, "learning_rate": 9.744537262833903e-06, "loss": 1.06186466, "memory(GiB)": 142.32, "step": 26300, "train_speed(iter/s)": 0.286318 }, { "acc": 0.72535911, "epoch": 0.2943878128814242, "grad_norm": 5.125, "learning_rate": 9.743953345212946e-06, "loss": 1.0935648, "memory(GiB)": 142.32, "step": 26320, "train_speed(iter/s)": 0.286389 }, { "acc": 0.72524776, "epoch": 0.2946115118273827, "grad_norm": 5.46875, "learning_rate": 9.743368778551367e-06, "loss": 1.09786119, "memory(GiB)": 142.32, "step": 26340, "train_speed(iter/s)": 0.286455 }, { "acc": 0.72736053, "epoch": 0.29483521077334124, "grad_norm": 5.09375, "learning_rate": 9.74278356292914e-06, "loss": 1.06996918, "memory(GiB)": 142.32, "step": 26360, "train_speed(iter/s)": 0.286536 }, { "acc": 0.72490768, "epoch": 0.29505890971929977, "grad_norm": 5.125, "learning_rate": 9.74219769842633e-06, "loss": 1.1039257, "memory(GiB)": 142.32, "step": 26380, "train_speed(iter/s)": 0.286612 }, { "acc": 0.71966381, "epoch": 0.2952826086652583, "grad_norm": 5.8125, "learning_rate": 9.741611185123096e-06, "loss": 1.1454237, "memory(GiB)": 142.32, "step": 26400, "train_speed(iter/s)": 0.286692 }, { "acc": 0.724966, "epoch": 0.2955063076112168, "grad_norm": 5.09375, "learning_rate": 9.741024023099677e-06, "loss": 1.08728333, "memory(GiB)": 142.32, "step": 26420, "train_speed(iter/s)": 0.286762 }, { "acc": 0.72924881, "epoch": 0.29573000655717535, "grad_norm": 5.59375, "learning_rate": 9.740436212436408e-06, "loss": 1.09177914, "memory(GiB)": 142.32, "step": 26440, "train_speed(iter/s)": 0.286836 }, { "acc": 0.72088432, "epoch": 0.2959537055031339, "grad_norm": 5.25, "learning_rate": 9.739847753213707e-06, "loss": 1.13698654, "memory(GiB)": 142.32, "step": 26460, "train_speed(iter/s)": 0.286908 }, { "acc": 0.72093296, "epoch": 0.2961774044490924, "grad_norm": 6.09375, "learning_rate": 9.739258645512088e-06, "loss": 1.10803719, "memory(GiB)": 142.32, "step": 26480, "train_speed(iter/s)": 0.286988 }, { "acc": 0.73442888, "epoch": 0.29640110339505094, "grad_norm": 6.125, "learning_rate": 9.738668889412145e-06, "loss": 1.05057869, "memory(GiB)": 142.32, "step": 26500, "train_speed(iter/s)": 0.287059 }, { "acc": 0.72601967, "epoch": 0.29662480234100946, "grad_norm": 4.96875, "learning_rate": 9.738078484994566e-06, "loss": 1.08941889, "memory(GiB)": 142.32, "step": 26520, "train_speed(iter/s)": 0.287134 }, { "acc": 0.72759089, "epoch": 0.296848501286968, "grad_norm": 5.875, "learning_rate": 9.73748743234013e-06, "loss": 1.07765884, "memory(GiB)": 142.32, "step": 26540, "train_speed(iter/s)": 0.28721 }, { "acc": 0.71700201, "epoch": 0.2970722002329265, "grad_norm": 5.8125, "learning_rate": 9.736895731529696e-06, "loss": 1.12037745, "memory(GiB)": 142.32, "step": 26560, "train_speed(iter/s)": 0.287287 }, { "acc": 0.71706276, "epoch": 0.29729589917888505, "grad_norm": 6.9375, "learning_rate": 9.73630338264422e-06, "loss": 1.130404, "memory(GiB)": 142.32, "step": 26580, "train_speed(iter/s)": 0.287363 }, { "acc": 0.71498232, "epoch": 0.2975195981248436, "grad_norm": 6.25, "learning_rate": 9.735710385764747e-06, "loss": 1.14289961, "memory(GiB)": 142.32, "step": 26600, "train_speed(iter/s)": 0.287441 }, { "acc": 0.71518574, "epoch": 0.2977432970708021, "grad_norm": 7.0625, "learning_rate": 9.735116740972401e-06, "loss": 1.13849411, "memory(GiB)": 142.32, "step": 26620, "train_speed(iter/s)": 0.287519 }, { "acc": 0.73222008, "epoch": 0.29796699601676063, "grad_norm": 7.1875, "learning_rate": 9.734522448348407e-06, "loss": 1.0567277, "memory(GiB)": 142.32, "step": 26640, "train_speed(iter/s)": 0.28759 }, { "acc": 0.71299939, "epoch": 0.29819069496271916, "grad_norm": 5.1875, "learning_rate": 9.733927507974068e-06, "loss": 1.14235802, "memory(GiB)": 142.32, "step": 26660, "train_speed(iter/s)": 0.287656 }, { "acc": 0.72601547, "epoch": 0.2984143939086777, "grad_norm": 5.625, "learning_rate": 9.733331919930785e-06, "loss": 1.09323483, "memory(GiB)": 142.32, "step": 26680, "train_speed(iter/s)": 0.287726 }, { "acc": 0.71396275, "epoch": 0.2986380928546362, "grad_norm": 5.28125, "learning_rate": 9.732735684300039e-06, "loss": 1.14725075, "memory(GiB)": 142.32, "step": 26700, "train_speed(iter/s)": 0.287796 }, { "acc": 0.73204727, "epoch": 0.29886179180059474, "grad_norm": 5.1875, "learning_rate": 9.732138801163405e-06, "loss": 1.06788445, "memory(GiB)": 142.32, "step": 26720, "train_speed(iter/s)": 0.287871 }, { "acc": 0.71282115, "epoch": 0.2990854907465533, "grad_norm": 5.71875, "learning_rate": 9.731541270602544e-06, "loss": 1.13803959, "memory(GiB)": 142.32, "step": 26740, "train_speed(iter/s)": 0.287944 }, { "acc": 0.7340127, "epoch": 0.2993091896925118, "grad_norm": 4.84375, "learning_rate": 9.730943092699209e-06, "loss": 1.04732141, "memory(GiB)": 142.32, "step": 26760, "train_speed(iter/s)": 0.288014 }, { "acc": 0.71236124, "epoch": 0.29953288863847033, "grad_norm": 5.375, "learning_rate": 9.730344267535239e-06, "loss": 1.16262875, "memory(GiB)": 142.32, "step": 26780, "train_speed(iter/s)": 0.288077 }, { "acc": 0.71753216, "epoch": 0.29975658758442886, "grad_norm": 5.9375, "learning_rate": 9.72974479519256e-06, "loss": 1.12951336, "memory(GiB)": 142.32, "step": 26800, "train_speed(iter/s)": 0.288149 }, { "acc": 0.70149508, "epoch": 0.2999802865303874, "grad_norm": 5.03125, "learning_rate": 9.72914467575319e-06, "loss": 1.20781498, "memory(GiB)": 142.32, "step": 26820, "train_speed(iter/s)": 0.288215 }, { "acc": 0.72017632, "epoch": 0.3002039854763459, "grad_norm": 6.0, "learning_rate": 9.728543909299233e-06, "loss": 1.13040094, "memory(GiB)": 142.32, "step": 26840, "train_speed(iter/s)": 0.288289 }, { "acc": 0.72361908, "epoch": 0.30042768442230444, "grad_norm": 5.53125, "learning_rate": 9.727942495912883e-06, "loss": 1.09486599, "memory(GiB)": 142.32, "step": 26860, "train_speed(iter/s)": 0.288368 }, { "acc": 0.73525467, "epoch": 0.30065138336826297, "grad_norm": 6.5, "learning_rate": 9.72734043567642e-06, "loss": 1.04029274, "memory(GiB)": 142.32, "step": 26880, "train_speed(iter/s)": 0.288441 }, { "acc": 0.71901512, "epoch": 0.3008750823142215, "grad_norm": 5.59375, "learning_rate": 9.726737728672218e-06, "loss": 1.11530361, "memory(GiB)": 142.32, "step": 26900, "train_speed(iter/s)": 0.288516 }, { "acc": 0.72338514, "epoch": 0.30109878126018, "grad_norm": 5.90625, "learning_rate": 9.726134374982734e-06, "loss": 1.08873129, "memory(GiB)": 142.32, "step": 26920, "train_speed(iter/s)": 0.288587 }, { "acc": 0.72509294, "epoch": 0.30132248020613855, "grad_norm": 5.5625, "learning_rate": 9.725530374690515e-06, "loss": 1.09692421, "memory(GiB)": 142.32, "step": 26940, "train_speed(iter/s)": 0.28866 }, { "acc": 0.72650957, "epoch": 0.3015461791520971, "grad_norm": 5.46875, "learning_rate": 9.724925727878198e-06, "loss": 1.08854675, "memory(GiB)": 142.32, "step": 26960, "train_speed(iter/s)": 0.288728 }, { "acc": 0.72591219, "epoch": 0.3017698780980556, "grad_norm": 5.40625, "learning_rate": 9.724320434628505e-06, "loss": 1.10576763, "memory(GiB)": 142.32, "step": 26980, "train_speed(iter/s)": 0.288795 }, { "acc": 0.72337151, "epoch": 0.30199357704401414, "grad_norm": 4.25, "learning_rate": 9.723714495024252e-06, "loss": 1.10039101, "memory(GiB)": 142.32, "step": 27000, "train_speed(iter/s)": 0.288866 }, { "acc": 0.7216094, "epoch": 0.3022172759899727, "grad_norm": 5.90625, "learning_rate": 9.723107909148337e-06, "loss": 1.08917999, "memory(GiB)": 142.32, "step": 27020, "train_speed(iter/s)": 0.288939 }, { "acc": 0.72356467, "epoch": 0.30244097493593125, "grad_norm": 5.78125, "learning_rate": 9.722500677083754e-06, "loss": 1.11153936, "memory(GiB)": 142.32, "step": 27040, "train_speed(iter/s)": 0.289015 }, { "acc": 0.72865829, "epoch": 0.3026646738818898, "grad_norm": 5.28125, "learning_rate": 9.721892798913577e-06, "loss": 1.08312788, "memory(GiB)": 142.32, "step": 27060, "train_speed(iter/s)": 0.289086 }, { "acc": 0.72696433, "epoch": 0.3028883728278483, "grad_norm": 6.40625, "learning_rate": 9.721284274720973e-06, "loss": 1.09108524, "memory(GiB)": 142.32, "step": 27080, "train_speed(iter/s)": 0.289159 }, { "acc": 0.72137375, "epoch": 0.30311207177380683, "grad_norm": 5.65625, "learning_rate": 9.720675104589197e-06, "loss": 1.09596901, "memory(GiB)": 142.32, "step": 27100, "train_speed(iter/s)": 0.289231 }, { "acc": 0.71889782, "epoch": 0.30333577071976536, "grad_norm": 6.34375, "learning_rate": 9.720065288601594e-06, "loss": 1.12629051, "memory(GiB)": 142.32, "step": 27120, "train_speed(iter/s)": 0.289305 }, { "acc": 0.7231945, "epoch": 0.3035594696657239, "grad_norm": 5.375, "learning_rate": 9.719454826841594e-06, "loss": 1.09121799, "memory(GiB)": 142.32, "step": 27140, "train_speed(iter/s)": 0.289382 }, { "acc": 0.73045936, "epoch": 0.3037831686116824, "grad_norm": 5.0625, "learning_rate": 9.718843719392716e-06, "loss": 1.07543602, "memory(GiB)": 142.32, "step": 27160, "train_speed(iter/s)": 0.289457 }, { "acc": 0.71459494, "epoch": 0.30400686755764095, "grad_norm": 7.0625, "learning_rate": 9.718231966338572e-06, "loss": 1.14802399, "memory(GiB)": 142.32, "step": 27180, "train_speed(iter/s)": 0.289533 }, { "acc": 0.71169615, "epoch": 0.3042305665035995, "grad_norm": 6.65625, "learning_rate": 9.717619567762854e-06, "loss": 1.16105404, "memory(GiB)": 142.32, "step": 27200, "train_speed(iter/s)": 0.289597 }, { "acc": 0.71756296, "epoch": 0.304454265449558, "grad_norm": 4.625, "learning_rate": 9.71700652374935e-06, "loss": 1.1329464, "memory(GiB)": 142.32, "step": 27220, "train_speed(iter/s)": 0.289664 }, { "acc": 0.73208327, "epoch": 0.30467796439551653, "grad_norm": 5.5625, "learning_rate": 9.71639283438193e-06, "loss": 1.04889107, "memory(GiB)": 142.32, "step": 27240, "train_speed(iter/s)": 0.289731 }, { "acc": 0.71801395, "epoch": 0.30490166334147506, "grad_norm": 5.75, "learning_rate": 9.71577849974456e-06, "loss": 1.13249836, "memory(GiB)": 142.32, "step": 27260, "train_speed(iter/s)": 0.2898 }, { "acc": 0.72649541, "epoch": 0.3051253622874336, "grad_norm": 5.1875, "learning_rate": 9.715163519921285e-06, "loss": 1.09720078, "memory(GiB)": 142.32, "step": 27280, "train_speed(iter/s)": 0.289874 }, { "acc": 0.71063643, "epoch": 0.3053490612333921, "grad_norm": 6.40625, "learning_rate": 9.714547894996246e-06, "loss": 1.16077442, "memory(GiB)": 142.32, "step": 27300, "train_speed(iter/s)": 0.289951 }, { "acc": 0.71497259, "epoch": 0.30557276017935064, "grad_norm": 6.46875, "learning_rate": 9.713931625053667e-06, "loss": 1.1482336, "memory(GiB)": 142.32, "step": 27320, "train_speed(iter/s)": 0.290033 }, { "acc": 0.73053713, "epoch": 0.3057964591253092, "grad_norm": 6.34375, "learning_rate": 9.713314710177867e-06, "loss": 1.0783947, "memory(GiB)": 142.32, "step": 27340, "train_speed(iter/s)": 0.290092 }, { "acc": 0.72965508, "epoch": 0.3060201580712677, "grad_norm": 5.34375, "learning_rate": 9.712697150453246e-06, "loss": 1.07354126, "memory(GiB)": 142.32, "step": 27360, "train_speed(iter/s)": 0.290168 }, { "acc": 0.70698552, "epoch": 0.30624385701722623, "grad_norm": 4.75, "learning_rate": 9.712078945964291e-06, "loss": 1.18058233, "memory(GiB)": 142.32, "step": 27380, "train_speed(iter/s)": 0.290236 }, { "acc": 0.72069697, "epoch": 0.30646755596318476, "grad_norm": 6.125, "learning_rate": 9.711460096795589e-06, "loss": 1.12561092, "memory(GiB)": 142.32, "step": 27400, "train_speed(iter/s)": 0.290304 }, { "acc": 0.710254, "epoch": 0.3066912549091433, "grad_norm": 5.90625, "learning_rate": 9.710840603031801e-06, "loss": 1.16905708, "memory(GiB)": 142.32, "step": 27420, "train_speed(iter/s)": 0.290374 }, { "acc": 0.71752253, "epoch": 0.3069149538551018, "grad_norm": 5.25, "learning_rate": 9.710220464757687e-06, "loss": 1.1221961, "memory(GiB)": 142.32, "step": 27440, "train_speed(iter/s)": 0.290451 }, { "acc": 0.72367334, "epoch": 0.30713865280106034, "grad_norm": 5.09375, "learning_rate": 9.709599682058087e-06, "loss": 1.11286621, "memory(GiB)": 142.32, "step": 27460, "train_speed(iter/s)": 0.290522 }, { "acc": 0.72879782, "epoch": 0.30736235174701887, "grad_norm": 4.46875, "learning_rate": 9.708978255017935e-06, "loss": 1.07658825, "memory(GiB)": 142.32, "step": 27480, "train_speed(iter/s)": 0.290587 }, { "acc": 0.72223959, "epoch": 0.3075860506929774, "grad_norm": 7.75, "learning_rate": 9.708356183722252e-06, "loss": 1.09028282, "memory(GiB)": 142.32, "step": 27500, "train_speed(iter/s)": 0.29066 }, { "acc": 0.72207971, "epoch": 0.3078097496389359, "grad_norm": 5.9375, "learning_rate": 9.707733468256145e-06, "loss": 1.10465908, "memory(GiB)": 142.32, "step": 27520, "train_speed(iter/s)": 0.290731 }, { "acc": 0.70817409, "epoch": 0.30803344858489445, "grad_norm": 4.8125, "learning_rate": 9.707110108704811e-06, "loss": 1.16259995, "memory(GiB)": 142.32, "step": 27540, "train_speed(iter/s)": 0.2908 }, { "acc": 0.70989814, "epoch": 0.308257147530853, "grad_norm": 6.34375, "learning_rate": 9.706486105153532e-06, "loss": 1.16620474, "memory(GiB)": 142.32, "step": 27560, "train_speed(iter/s)": 0.29087 }, { "acc": 0.72837014, "epoch": 0.3084808464768115, "grad_norm": 4.53125, "learning_rate": 9.705861457687685e-06, "loss": 1.08026934, "memory(GiB)": 142.32, "step": 27580, "train_speed(iter/s)": 0.290942 }, { "acc": 0.72361956, "epoch": 0.30870454542277004, "grad_norm": 5.90625, "learning_rate": 9.705236166392728e-06, "loss": 1.11449699, "memory(GiB)": 142.32, "step": 27600, "train_speed(iter/s)": 0.291011 }, { "acc": 0.72499399, "epoch": 0.30892824436872857, "grad_norm": 5.0625, "learning_rate": 9.704610231354208e-06, "loss": 1.11103706, "memory(GiB)": 142.32, "step": 27620, "train_speed(iter/s)": 0.291072 }, { "acc": 0.72160954, "epoch": 0.3091519433146871, "grad_norm": 5.53125, "learning_rate": 9.703983652657767e-06, "loss": 1.09590778, "memory(GiB)": 142.32, "step": 27640, "train_speed(iter/s)": 0.29115 }, { "acc": 0.71030269, "epoch": 0.3093756422606456, "grad_norm": 6.875, "learning_rate": 9.703356430389123e-06, "loss": 1.15918331, "memory(GiB)": 142.32, "step": 27660, "train_speed(iter/s)": 0.291227 }, { "acc": 0.7219635, "epoch": 0.30959934120660415, "grad_norm": 5.25, "learning_rate": 9.702728564634097e-06, "loss": 1.11697912, "memory(GiB)": 142.32, "step": 27680, "train_speed(iter/s)": 0.291295 }, { "acc": 0.71404743, "epoch": 0.3098230401525627, "grad_norm": 5.75, "learning_rate": 9.702100055478583e-06, "loss": 1.14508801, "memory(GiB)": 142.32, "step": 27700, "train_speed(iter/s)": 0.29137 }, { "acc": 0.72917347, "epoch": 0.3100467390985212, "grad_norm": 6.96875, "learning_rate": 9.701470903008574e-06, "loss": 1.06286945, "memory(GiB)": 142.32, "step": 27720, "train_speed(iter/s)": 0.291443 }, { "acc": 0.71785846, "epoch": 0.31027043804447973, "grad_norm": 5.78125, "learning_rate": 9.700841107310146e-06, "loss": 1.12982807, "memory(GiB)": 142.32, "step": 27740, "train_speed(iter/s)": 0.291514 }, { "acc": 0.72137537, "epoch": 0.31049413699043826, "grad_norm": 5.625, "learning_rate": 9.700210668469464e-06, "loss": 1.1100626, "memory(GiB)": 142.32, "step": 27760, "train_speed(iter/s)": 0.291586 }, { "acc": 0.71060257, "epoch": 0.3107178359363968, "grad_norm": 5.78125, "learning_rate": 9.699579586572781e-06, "loss": 1.1625555, "memory(GiB)": 142.32, "step": 27780, "train_speed(iter/s)": 0.291653 }, { "acc": 0.71348944, "epoch": 0.3109415348823553, "grad_norm": 4.75, "learning_rate": 9.698947861706438e-06, "loss": 1.1597435, "memory(GiB)": 142.32, "step": 27800, "train_speed(iter/s)": 0.291723 }, { "acc": 0.72825708, "epoch": 0.31116523382831385, "grad_norm": 5.25, "learning_rate": 9.698315493956864e-06, "loss": 1.0739665, "memory(GiB)": 142.32, "step": 27820, "train_speed(iter/s)": 0.291791 }, { "acc": 0.73105335, "epoch": 0.3113889327742724, "grad_norm": 6.96875, "learning_rate": 9.697682483410576e-06, "loss": 1.05869637, "memory(GiB)": 142.32, "step": 27840, "train_speed(iter/s)": 0.291864 }, { "acc": 0.71681619, "epoch": 0.3116126317202309, "grad_norm": 5.3125, "learning_rate": 9.69704883015418e-06, "loss": 1.13195362, "memory(GiB)": 142.32, "step": 27860, "train_speed(iter/s)": 0.291926 }, { "acc": 0.72676029, "epoch": 0.31183633066618943, "grad_norm": 5.71875, "learning_rate": 9.696414534274367e-06, "loss": 1.07763138, "memory(GiB)": 142.32, "step": 27880, "train_speed(iter/s)": 0.291988 }, { "acc": 0.72053351, "epoch": 0.31206002961214796, "grad_norm": 6.21875, "learning_rate": 9.695779595857918e-06, "loss": 1.12018814, "memory(GiB)": 142.32, "step": 27900, "train_speed(iter/s)": 0.292052 }, { "acc": 0.72914782, "epoch": 0.3122837285581065, "grad_norm": 5.84375, "learning_rate": 9.695144014991702e-06, "loss": 1.08818731, "memory(GiB)": 142.32, "step": 27920, "train_speed(iter/s)": 0.292119 }, { "acc": 0.72360649, "epoch": 0.312507427504065, "grad_norm": 4.90625, "learning_rate": 9.694507791762676e-06, "loss": 1.09600105, "memory(GiB)": 142.32, "step": 27940, "train_speed(iter/s)": 0.292182 }, { "acc": 0.73141556, "epoch": 0.31273112645002354, "grad_norm": 6.34375, "learning_rate": 9.693870926257884e-06, "loss": 1.06574249, "memory(GiB)": 142.32, "step": 27960, "train_speed(iter/s)": 0.292241 }, { "acc": 0.72247591, "epoch": 0.3129548253959821, "grad_norm": 6.6875, "learning_rate": 9.693233418564459e-06, "loss": 1.11367664, "memory(GiB)": 142.32, "step": 27980, "train_speed(iter/s)": 0.292309 }, { "acc": 0.72779121, "epoch": 0.3131785243419406, "grad_norm": 5.0, "learning_rate": 9.69259526876962e-06, "loss": 1.07607441, "memory(GiB)": 142.32, "step": 28000, "train_speed(iter/s)": 0.292374 }, { "epoch": 0.3131785243419406, "eval_acc": 0.6873802337765694, "eval_loss": 1.1113524436950684, "eval_runtime": 2343.6784, "eval_samples_per_second": 32.122, "eval_steps_per_second": 16.061, "step": 28000 }, { "acc": 0.72556181, "epoch": 0.31340222328789913, "grad_norm": 6.1875, "learning_rate": 9.691956476960676e-06, "loss": 1.08604612, "memory(GiB)": 142.32, "step": 28020, "train_speed(iter/s)": 0.285311 }, { "acc": 0.72514296, "epoch": 0.31362592223385766, "grad_norm": 5.875, "learning_rate": 9.691317043225023e-06, "loss": 1.11482496, "memory(GiB)": 142.32, "step": 28040, "train_speed(iter/s)": 0.285384 }, { "acc": 0.72585697, "epoch": 0.3138496211798162, "grad_norm": 6.21875, "learning_rate": 9.690676967650144e-06, "loss": 1.09217491, "memory(GiB)": 142.32, "step": 28060, "train_speed(iter/s)": 0.285452 }, { "acc": 0.73350282, "epoch": 0.3140733201257747, "grad_norm": 6.84375, "learning_rate": 9.690036250323608e-06, "loss": 1.05969734, "memory(GiB)": 142.32, "step": 28080, "train_speed(iter/s)": 0.285517 }, { "acc": 0.70307302, "epoch": 0.31429701907173324, "grad_norm": 5.21875, "learning_rate": 9.68939489133308e-06, "loss": 1.17825222, "memory(GiB)": 142.32, "step": 28100, "train_speed(iter/s)": 0.285593 }, { "acc": 0.73279657, "epoch": 0.31452071801769177, "grad_norm": 5.625, "learning_rate": 9.688752890766302e-06, "loss": 1.04402924, "memory(GiB)": 142.32, "step": 28120, "train_speed(iter/s)": 0.285665 }, { "acc": 0.73132558, "epoch": 0.3147444169636503, "grad_norm": 5.1875, "learning_rate": 9.688110248711112e-06, "loss": 1.05848494, "memory(GiB)": 142.32, "step": 28140, "train_speed(iter/s)": 0.285733 }, { "acc": 0.72355223, "epoch": 0.3149681159096088, "grad_norm": 3.984375, "learning_rate": 9.687466965255432e-06, "loss": 1.09675426, "memory(GiB)": 142.32, "step": 28160, "train_speed(iter/s)": 0.285803 }, { "acc": 0.72321882, "epoch": 0.31519181485556735, "grad_norm": 6.34375, "learning_rate": 9.68682304048727e-06, "loss": 1.11360254, "memory(GiB)": 142.32, "step": 28180, "train_speed(iter/s)": 0.285873 }, { "acc": 0.71919708, "epoch": 0.3154155138015259, "grad_norm": 6.46875, "learning_rate": 9.686178474494727e-06, "loss": 1.12527075, "memory(GiB)": 142.32, "step": 28200, "train_speed(iter/s)": 0.285938 }, { "acc": 0.7215744, "epoch": 0.3156392127474844, "grad_norm": 5.4375, "learning_rate": 9.685533267365988e-06, "loss": 1.11588364, "memory(GiB)": 142.32, "step": 28220, "train_speed(iter/s)": 0.286006 }, { "acc": 0.73459444, "epoch": 0.31586291169344294, "grad_norm": 5.3125, "learning_rate": 9.684887419189327e-06, "loss": 1.04293957, "memory(GiB)": 142.32, "step": 28240, "train_speed(iter/s)": 0.286082 }, { "acc": 0.72886128, "epoch": 0.31608661063940147, "grad_norm": 5.625, "learning_rate": 9.684240930053102e-06, "loss": 1.08332615, "memory(GiB)": 142.32, "step": 28260, "train_speed(iter/s)": 0.286158 }, { "acc": 0.72367811, "epoch": 0.31631030958536005, "grad_norm": 6.1875, "learning_rate": 9.683593800045765e-06, "loss": 1.11871824, "memory(GiB)": 142.32, "step": 28280, "train_speed(iter/s)": 0.286221 }, { "acc": 0.71683731, "epoch": 0.3165340085313186, "grad_norm": 7.0625, "learning_rate": 9.682946029255855e-06, "loss": 1.1318552, "memory(GiB)": 142.32, "step": 28300, "train_speed(iter/s)": 0.286296 }, { "acc": 0.7143239, "epoch": 0.3167577074772771, "grad_norm": 5.4375, "learning_rate": 9.682297617771992e-06, "loss": 1.13971262, "memory(GiB)": 142.32, "step": 28320, "train_speed(iter/s)": 0.286356 }, { "acc": 0.73192406, "epoch": 0.31698140642323563, "grad_norm": 5.21875, "learning_rate": 9.681648565682889e-06, "loss": 1.07932301, "memory(GiB)": 142.32, "step": 28340, "train_speed(iter/s)": 0.286428 }, { "acc": 0.72894926, "epoch": 0.31720510536919416, "grad_norm": 5.71875, "learning_rate": 9.680998873077346e-06, "loss": 1.07124195, "memory(GiB)": 142.32, "step": 28360, "train_speed(iter/s)": 0.286497 }, { "acc": 0.72537575, "epoch": 0.3174288043151527, "grad_norm": 6.03125, "learning_rate": 9.680348540044249e-06, "loss": 1.07287703, "memory(GiB)": 142.32, "step": 28380, "train_speed(iter/s)": 0.286568 }, { "acc": 0.72337742, "epoch": 0.3176525032611112, "grad_norm": 5.5625, "learning_rate": 9.679697566672577e-06, "loss": 1.11108017, "memory(GiB)": 142.32, "step": 28400, "train_speed(iter/s)": 0.286631 }, { "acc": 0.72321, "epoch": 0.31787620220706975, "grad_norm": 6.71875, "learning_rate": 9.679045953051387e-06, "loss": 1.08574486, "memory(GiB)": 142.32, "step": 28420, "train_speed(iter/s)": 0.286697 }, { "acc": 0.73136668, "epoch": 0.3180999011530283, "grad_norm": 6.59375, "learning_rate": 9.678393699269833e-06, "loss": 1.06405821, "memory(GiB)": 142.32, "step": 28440, "train_speed(iter/s)": 0.28677 }, { "acc": 0.7429728, "epoch": 0.3183236000989868, "grad_norm": 6.40625, "learning_rate": 9.677740805417151e-06, "loss": 1.01294804, "memory(GiB)": 142.32, "step": 28460, "train_speed(iter/s)": 0.286837 }, { "acc": 0.72008529, "epoch": 0.31854729904494533, "grad_norm": 5.90625, "learning_rate": 9.677087271582666e-06, "loss": 1.12695236, "memory(GiB)": 142.32, "step": 28480, "train_speed(iter/s)": 0.286905 }, { "acc": 0.72655935, "epoch": 0.31877099799090386, "grad_norm": 5.9375, "learning_rate": 9.676433097855793e-06, "loss": 1.09945116, "memory(GiB)": 142.32, "step": 28500, "train_speed(iter/s)": 0.286974 }, { "acc": 0.71879568, "epoch": 0.3189946969368624, "grad_norm": 5.6875, "learning_rate": 9.675778284326029e-06, "loss": 1.12985048, "memory(GiB)": 142.32, "step": 28520, "train_speed(iter/s)": 0.287044 }, { "acc": 0.72865181, "epoch": 0.3192183958828209, "grad_norm": 6.78125, "learning_rate": 9.675122831082963e-06, "loss": 1.08525229, "memory(GiB)": 142.32, "step": 28540, "train_speed(iter/s)": 0.287108 }, { "acc": 0.73854389, "epoch": 0.31944209482877944, "grad_norm": 7.3125, "learning_rate": 9.674466738216273e-06, "loss": 1.03881721, "memory(GiB)": 142.32, "step": 28560, "train_speed(iter/s)": 0.287176 }, { "acc": 0.72482171, "epoch": 0.31966579377473797, "grad_norm": 4.9375, "learning_rate": 9.673810005815718e-06, "loss": 1.09664478, "memory(GiB)": 142.32, "step": 28580, "train_speed(iter/s)": 0.287241 }, { "acc": 0.72171674, "epoch": 0.3198894927206965, "grad_norm": 6.0, "learning_rate": 9.67315263397115e-06, "loss": 1.12433109, "memory(GiB)": 142.32, "step": 28600, "train_speed(iter/s)": 0.287312 }, { "acc": 0.73427095, "epoch": 0.32011319166665503, "grad_norm": 5.78125, "learning_rate": 9.672494622772509e-06, "loss": 1.05565004, "memory(GiB)": 142.32, "step": 28620, "train_speed(iter/s)": 0.287381 }, { "acc": 0.72548356, "epoch": 0.32033689061261356, "grad_norm": 4.84375, "learning_rate": 9.671835972309815e-06, "loss": 1.11163578, "memory(GiB)": 142.32, "step": 28640, "train_speed(iter/s)": 0.28745 }, { "acc": 0.72204981, "epoch": 0.3205605895585721, "grad_norm": 6.75, "learning_rate": 9.671176682673186e-06, "loss": 1.10366869, "memory(GiB)": 142.32, "step": 28660, "train_speed(iter/s)": 0.287522 }, { "acc": 0.72178917, "epoch": 0.3207842885045306, "grad_norm": 6.5, "learning_rate": 9.67051675395282e-06, "loss": 1.12298965, "memory(GiB)": 142.32, "step": 28680, "train_speed(iter/s)": 0.287591 }, { "acc": 0.7216774, "epoch": 0.32100798745048914, "grad_norm": 6.09375, "learning_rate": 9.669856186239004e-06, "loss": 1.09504547, "memory(GiB)": 142.32, "step": 28700, "train_speed(iter/s)": 0.287659 }, { "acc": 0.72123852, "epoch": 0.32123168639644767, "grad_norm": 4.625, "learning_rate": 9.669194979622117e-06, "loss": 1.11635666, "memory(GiB)": 142.32, "step": 28720, "train_speed(iter/s)": 0.287726 }, { "acc": 0.72130246, "epoch": 0.3214553853424062, "grad_norm": 6.6875, "learning_rate": 9.668533134192615e-06, "loss": 1.12194309, "memory(GiB)": 142.32, "step": 28740, "train_speed(iter/s)": 0.287796 }, { "acc": 0.72747107, "epoch": 0.3216790842883647, "grad_norm": 5.09375, "learning_rate": 9.667870650041053e-06, "loss": 1.07676868, "memory(GiB)": 142.32, "step": 28760, "train_speed(iter/s)": 0.287863 }, { "acc": 0.71936283, "epoch": 0.32190278323432325, "grad_norm": 4.75, "learning_rate": 9.667207527258067e-06, "loss": 1.13144245, "memory(GiB)": 142.32, "step": 28780, "train_speed(iter/s)": 0.287923 }, { "acc": 0.71921978, "epoch": 0.3221264821802818, "grad_norm": 7.5, "learning_rate": 9.666543765934381e-06, "loss": 1.12148609, "memory(GiB)": 142.32, "step": 28800, "train_speed(iter/s)": 0.287991 }, { "acc": 0.71725216, "epoch": 0.3223501811262403, "grad_norm": 6.0625, "learning_rate": 9.66587936616081e-06, "loss": 1.13467216, "memory(GiB)": 142.32, "step": 28820, "train_speed(iter/s)": 0.28806 }, { "acc": 0.72277546, "epoch": 0.32257388007219884, "grad_norm": 6.53125, "learning_rate": 9.665214328028249e-06, "loss": 1.10974083, "memory(GiB)": 142.32, "step": 28840, "train_speed(iter/s)": 0.288124 }, { "acc": 0.71496353, "epoch": 0.32279757901815737, "grad_norm": 6.09375, "learning_rate": 9.664548651627686e-06, "loss": 1.13731632, "memory(GiB)": 142.32, "step": 28860, "train_speed(iter/s)": 0.288182 }, { "acc": 0.72364297, "epoch": 0.3230212779641159, "grad_norm": 6.1875, "learning_rate": 9.663882337050197e-06, "loss": 1.10752544, "memory(GiB)": 142.32, "step": 28880, "train_speed(iter/s)": 0.288253 }, { "acc": 0.73199339, "epoch": 0.3232449769100744, "grad_norm": 5.5625, "learning_rate": 9.663215384386942e-06, "loss": 1.05715513, "memory(GiB)": 142.32, "step": 28900, "train_speed(iter/s)": 0.288318 }, { "acc": 0.72839336, "epoch": 0.32346867585603295, "grad_norm": 5.625, "learning_rate": 9.662547793729169e-06, "loss": 1.09684057, "memory(GiB)": 142.32, "step": 28920, "train_speed(iter/s)": 0.288378 }, { "acc": 0.71245227, "epoch": 0.3236923748019915, "grad_norm": 6.5, "learning_rate": 9.661879565168213e-06, "loss": 1.15688744, "memory(GiB)": 142.32, "step": 28940, "train_speed(iter/s)": 0.288446 }, { "acc": 0.72356448, "epoch": 0.32391607374795, "grad_norm": 6.21875, "learning_rate": 9.661210698795502e-06, "loss": 1.10541706, "memory(GiB)": 142.32, "step": 28960, "train_speed(iter/s)": 0.288513 }, { "acc": 0.72588873, "epoch": 0.32413977269390853, "grad_norm": 5.53125, "learning_rate": 9.660541194702541e-06, "loss": 1.10613041, "memory(GiB)": 142.32, "step": 28980, "train_speed(iter/s)": 0.288573 }, { "acc": 0.73287711, "epoch": 0.32436347163986706, "grad_norm": 5.4375, "learning_rate": 9.659871052980931e-06, "loss": 1.0582221, "memory(GiB)": 142.32, "step": 29000, "train_speed(iter/s)": 0.28864 }, { "acc": 0.72238474, "epoch": 0.3245871705858256, "grad_norm": 5.75, "learning_rate": 9.659200273722358e-06, "loss": 1.10359554, "memory(GiB)": 142.32, "step": 29020, "train_speed(iter/s)": 0.288711 }, { "acc": 0.73051691, "epoch": 0.3248108695317841, "grad_norm": 5.375, "learning_rate": 9.65852885701859e-06, "loss": 1.0762352, "memory(GiB)": 142.32, "step": 29040, "train_speed(iter/s)": 0.288779 }, { "acc": 0.72898536, "epoch": 0.32503456847774265, "grad_norm": 6.1875, "learning_rate": 9.657856802961488e-06, "loss": 1.07891626, "memory(GiB)": 142.32, "step": 29060, "train_speed(iter/s)": 0.288847 }, { "acc": 0.74192848, "epoch": 0.3252582674237012, "grad_norm": 5.28125, "learning_rate": 9.657184111643e-06, "loss": 1.03757553, "memory(GiB)": 142.32, "step": 29080, "train_speed(iter/s)": 0.288913 }, { "acc": 0.71790733, "epoch": 0.3254819663696597, "grad_norm": 5.09375, "learning_rate": 9.656510783155159e-06, "loss": 1.11717348, "memory(GiB)": 142.32, "step": 29100, "train_speed(iter/s)": 0.288972 }, { "acc": 0.72328649, "epoch": 0.32570566531561823, "grad_norm": 7.0, "learning_rate": 9.655836817590087e-06, "loss": 1.1053503, "memory(GiB)": 142.32, "step": 29120, "train_speed(iter/s)": 0.289045 }, { "acc": 0.7154604, "epoch": 0.32592936426157676, "grad_norm": 5.34375, "learning_rate": 9.655162215039991e-06, "loss": 1.14168816, "memory(GiB)": 142.32, "step": 29140, "train_speed(iter/s)": 0.289111 }, { "acc": 0.7195219, "epoch": 0.3261530632075353, "grad_norm": 6.75, "learning_rate": 9.654486975597165e-06, "loss": 1.11781445, "memory(GiB)": 142.32, "step": 29160, "train_speed(iter/s)": 0.289172 }, { "acc": 0.72079253, "epoch": 0.3263767621534938, "grad_norm": 5.78125, "learning_rate": 9.653811099353994e-06, "loss": 1.10343924, "memory(GiB)": 142.32, "step": 29180, "train_speed(iter/s)": 0.289237 }, { "acc": 0.70469046, "epoch": 0.32660046109945234, "grad_norm": 4.25, "learning_rate": 9.653134586402946e-06, "loss": 1.19317684, "memory(GiB)": 142.32, "step": 29200, "train_speed(iter/s)": 0.289311 }, { "acc": 0.72837105, "epoch": 0.32682416004541087, "grad_norm": 5.875, "learning_rate": 9.652457436836577e-06, "loss": 1.08089695, "memory(GiB)": 142.32, "step": 29220, "train_speed(iter/s)": 0.289379 }, { "acc": 0.72272615, "epoch": 0.3270478589913694, "grad_norm": 4.96875, "learning_rate": 9.651779650747533e-06, "loss": 1.11703491, "memory(GiB)": 142.32, "step": 29240, "train_speed(iter/s)": 0.289446 }, { "acc": 0.72864513, "epoch": 0.32727155793732793, "grad_norm": 5.8125, "learning_rate": 9.651101228228543e-06, "loss": 1.0868536, "memory(GiB)": 142.32, "step": 29260, "train_speed(iter/s)": 0.289512 }, { "acc": 0.71433687, "epoch": 0.32749525688328646, "grad_norm": 6.46875, "learning_rate": 9.650422169372427e-06, "loss": 1.13509665, "memory(GiB)": 142.32, "step": 29280, "train_speed(iter/s)": 0.289572 }, { "acc": 0.73062973, "epoch": 0.327718955829245, "grad_norm": 5.4375, "learning_rate": 9.649742474272085e-06, "loss": 1.06921082, "memory(GiB)": 142.32, "step": 29300, "train_speed(iter/s)": 0.289642 }, { "acc": 0.72895336, "epoch": 0.3279426547752035, "grad_norm": 5.78125, "learning_rate": 9.649062143020515e-06, "loss": 1.07992249, "memory(GiB)": 142.32, "step": 29320, "train_speed(iter/s)": 0.289711 }, { "acc": 0.72193022, "epoch": 0.32816635372116204, "grad_norm": 6.1875, "learning_rate": 9.648381175710792e-06, "loss": 1.10138407, "memory(GiB)": 142.32, "step": 29340, "train_speed(iter/s)": 0.289777 }, { "acc": 0.72660017, "epoch": 0.32839005266712057, "grad_norm": 5.03125, "learning_rate": 9.647699572436085e-06, "loss": 1.09435329, "memory(GiB)": 142.32, "step": 29360, "train_speed(iter/s)": 0.289845 }, { "acc": 0.71633415, "epoch": 0.3286137516130791, "grad_norm": 6.25, "learning_rate": 9.647017333289646e-06, "loss": 1.12351799, "memory(GiB)": 142.32, "step": 29380, "train_speed(iter/s)": 0.289914 }, { "acc": 0.73515377, "epoch": 0.3288374505590376, "grad_norm": 5.96875, "learning_rate": 9.646334458364813e-06, "loss": 1.06730118, "memory(GiB)": 142.32, "step": 29400, "train_speed(iter/s)": 0.289979 }, { "acc": 0.72719316, "epoch": 0.32906114950499615, "grad_norm": 5.25, "learning_rate": 9.645650947755014e-06, "loss": 1.08357925, "memory(GiB)": 142.32, "step": 29420, "train_speed(iter/s)": 0.290048 }, { "acc": 0.7332242, "epoch": 0.3292848484509547, "grad_norm": 5.96875, "learning_rate": 9.644966801553765e-06, "loss": 1.04449902, "memory(GiB)": 142.32, "step": 29440, "train_speed(iter/s)": 0.290113 }, { "acc": 0.72229376, "epoch": 0.3295085473969132, "grad_norm": 5.4375, "learning_rate": 9.644282019854665e-06, "loss": 1.10989952, "memory(GiB)": 142.32, "step": 29460, "train_speed(iter/s)": 0.290173 }, { "acc": 0.73620424, "epoch": 0.32973224634287174, "grad_norm": 5.90625, "learning_rate": 9.643596602751404e-06, "loss": 1.05234661, "memory(GiB)": 142.32, "step": 29480, "train_speed(iter/s)": 0.29024 }, { "acc": 0.71863031, "epoch": 0.32995594528883027, "grad_norm": 5.65625, "learning_rate": 9.642910550337754e-06, "loss": 1.11611557, "memory(GiB)": 142.32, "step": 29500, "train_speed(iter/s)": 0.290307 }, { "acc": 0.7143486, "epoch": 0.3301796442347888, "grad_norm": 7.40625, "learning_rate": 9.642223862707578e-06, "loss": 1.1593317, "memory(GiB)": 142.32, "step": 29520, "train_speed(iter/s)": 0.290377 }, { "acc": 0.72168441, "epoch": 0.3304033431807474, "grad_norm": 4.71875, "learning_rate": 9.641536539954826e-06, "loss": 1.10484047, "memory(GiB)": 142.32, "step": 29540, "train_speed(iter/s)": 0.290435 }, { "acc": 0.73135805, "epoch": 0.3306270421267059, "grad_norm": 5.0625, "learning_rate": 9.640848582173533e-06, "loss": 1.06057911, "memory(GiB)": 142.32, "step": 29560, "train_speed(iter/s)": 0.290499 }, { "acc": 0.71725349, "epoch": 0.33085074107266443, "grad_norm": 4.8125, "learning_rate": 9.64015998945782e-06, "loss": 1.14220018, "memory(GiB)": 142.32, "step": 29580, "train_speed(iter/s)": 0.290567 }, { "acc": 0.7238328, "epoch": 0.33107444001862296, "grad_norm": 5.5625, "learning_rate": 9.639470761901897e-06, "loss": 1.10490379, "memory(GiB)": 142.32, "step": 29600, "train_speed(iter/s)": 0.29064 }, { "acc": 0.72582083, "epoch": 0.3312981389645815, "grad_norm": 5.53125, "learning_rate": 9.63878089960006e-06, "loss": 1.09853668, "memory(GiB)": 142.32, "step": 29620, "train_speed(iter/s)": 0.29071 }, { "acc": 0.72612958, "epoch": 0.33152183791054, "grad_norm": 4.5, "learning_rate": 9.638090402646694e-06, "loss": 1.09031878, "memory(GiB)": 142.32, "step": 29640, "train_speed(iter/s)": 0.290777 }, { "acc": 0.73296223, "epoch": 0.33174553685649855, "grad_norm": 5.25, "learning_rate": 9.637399271136267e-06, "loss": 1.05174179, "memory(GiB)": 142.32, "step": 29660, "train_speed(iter/s)": 0.29084 }, { "acc": 0.72135744, "epoch": 0.3319692358024571, "grad_norm": 4.5625, "learning_rate": 9.636707505163334e-06, "loss": 1.11571198, "memory(GiB)": 142.32, "step": 29680, "train_speed(iter/s)": 0.290911 }, { "acc": 0.71916685, "epoch": 0.3321929347484156, "grad_norm": 5.875, "learning_rate": 9.636015104822543e-06, "loss": 1.14428577, "memory(GiB)": 142.32, "step": 29700, "train_speed(iter/s)": 0.290976 }, { "acc": 0.72759714, "epoch": 0.33241663369437413, "grad_norm": 5.9375, "learning_rate": 9.63532207020862e-06, "loss": 1.08019543, "memory(GiB)": 142.32, "step": 29720, "train_speed(iter/s)": 0.291045 }, { "acc": 0.73287468, "epoch": 0.33264033264033266, "grad_norm": 6.28125, "learning_rate": 9.634628401416385e-06, "loss": 1.06826019, "memory(GiB)": 142.32, "step": 29740, "train_speed(iter/s)": 0.291111 }, { "acc": 0.71877804, "epoch": 0.3328640315862912, "grad_norm": 5.25, "learning_rate": 9.63393409854074e-06, "loss": 1.13182907, "memory(GiB)": 142.32, "step": 29760, "train_speed(iter/s)": 0.291178 }, { "acc": 0.72471504, "epoch": 0.3330877305322497, "grad_norm": 6.59375, "learning_rate": 9.633239161676678e-06, "loss": 1.09464054, "memory(GiB)": 142.32, "step": 29780, "train_speed(iter/s)": 0.291241 }, { "acc": 0.71883059, "epoch": 0.33331142947820824, "grad_norm": 5.8125, "learning_rate": 9.632543590919272e-06, "loss": 1.13544521, "memory(GiB)": 142.32, "step": 29800, "train_speed(iter/s)": 0.291303 }, { "acc": 0.71815042, "epoch": 0.33353512842416677, "grad_norm": 5.71875, "learning_rate": 9.63184738636369e-06, "loss": 1.15722027, "memory(GiB)": 142.32, "step": 29820, "train_speed(iter/s)": 0.291367 }, { "acc": 0.73104076, "epoch": 0.3337588273701253, "grad_norm": 7.03125, "learning_rate": 9.63115054810518e-06, "loss": 1.06554165, "memory(GiB)": 142.32, "step": 29840, "train_speed(iter/s)": 0.291433 }, { "acc": 0.73806534, "epoch": 0.3339825263160838, "grad_norm": 6.125, "learning_rate": 9.63045307623908e-06, "loss": 1.03037033, "memory(GiB)": 142.32, "step": 29860, "train_speed(iter/s)": 0.291496 }, { "acc": 0.71902008, "epoch": 0.33420622526204236, "grad_norm": 5.34375, "learning_rate": 9.629754970860815e-06, "loss": 1.12265282, "memory(GiB)": 142.32, "step": 29880, "train_speed(iter/s)": 0.291558 }, { "acc": 0.72811184, "epoch": 0.3344299242080009, "grad_norm": 6.46875, "learning_rate": 9.629056232065896e-06, "loss": 1.08260365, "memory(GiB)": 142.32, "step": 29900, "train_speed(iter/s)": 0.291617 }, { "acc": 0.71313143, "epoch": 0.3346536231539594, "grad_norm": 6.6875, "learning_rate": 9.62835685994992e-06, "loss": 1.15808439, "memory(GiB)": 142.32, "step": 29920, "train_speed(iter/s)": 0.291687 }, { "acc": 0.71804237, "epoch": 0.33487732209991794, "grad_norm": 5.03125, "learning_rate": 9.627656854608572e-06, "loss": 1.14188366, "memory(GiB)": 142.32, "step": 29940, "train_speed(iter/s)": 0.291743 }, { "acc": 0.72348871, "epoch": 0.33510102104587647, "grad_norm": 6.59375, "learning_rate": 9.626956216137622e-06, "loss": 1.09849892, "memory(GiB)": 142.32, "step": 29960, "train_speed(iter/s)": 0.291804 }, { "acc": 0.72533169, "epoch": 0.335324719991835, "grad_norm": 6.21875, "learning_rate": 9.626254944632927e-06, "loss": 1.08840523, "memory(GiB)": 142.32, "step": 29980, "train_speed(iter/s)": 0.291866 }, { "acc": 0.73538208, "epoch": 0.3355484189377935, "grad_norm": 5.15625, "learning_rate": 9.625553040190429e-06, "loss": 1.05540142, "memory(GiB)": 142.32, "step": 30000, "train_speed(iter/s)": 0.291934 }, { "epoch": 0.3355484189377935, "eval_acc": 0.6879131306176003, "eval_loss": 1.1091336011886597, "eval_runtime": 2340.3348, "eval_samples_per_second": 32.168, "eval_steps_per_second": 16.084, "step": 30000 }, { "acc": 0.72220373, "epoch": 0.33577211788375205, "grad_norm": 5.53125, "learning_rate": 9.624850502906163e-06, "loss": 1.13053741, "memory(GiB)": 142.32, "step": 30020, "train_speed(iter/s)": 0.285367 }, { "acc": 0.72249889, "epoch": 0.3359958168297106, "grad_norm": 5.75, "learning_rate": 9.624147332876244e-06, "loss": 1.11202164, "memory(GiB)": 142.32, "step": 30040, "train_speed(iter/s)": 0.285427 }, { "acc": 0.72921376, "epoch": 0.3362195157756691, "grad_norm": 6.03125, "learning_rate": 9.623443530196874e-06, "loss": 1.07874184, "memory(GiB)": 142.32, "step": 30060, "train_speed(iter/s)": 0.285494 }, { "acc": 0.72159991, "epoch": 0.33644321472162764, "grad_norm": 6.78125, "learning_rate": 9.622739094964347e-06, "loss": 1.0994751, "memory(GiB)": 142.32, "step": 30080, "train_speed(iter/s)": 0.285563 }, { "acc": 0.71949496, "epoch": 0.33666691366758617, "grad_norm": 5.4375, "learning_rate": 9.622034027275035e-06, "loss": 1.11913929, "memory(GiB)": 142.32, "step": 30100, "train_speed(iter/s)": 0.285632 }, { "acc": 0.71680484, "epoch": 0.3368906126135447, "grad_norm": 6.25, "learning_rate": 9.621328327225406e-06, "loss": 1.14429522, "memory(GiB)": 142.32, "step": 30120, "train_speed(iter/s)": 0.285697 }, { "acc": 0.72293639, "epoch": 0.3371143115595032, "grad_norm": 6.15625, "learning_rate": 9.620621994912004e-06, "loss": 1.10632763, "memory(GiB)": 142.32, "step": 30140, "train_speed(iter/s)": 0.285759 }, { "acc": 0.72262306, "epoch": 0.33733801050546175, "grad_norm": 6.09375, "learning_rate": 9.619915030431475e-06, "loss": 1.09946384, "memory(GiB)": 142.32, "step": 30160, "train_speed(iter/s)": 0.285819 }, { "acc": 0.7205225, "epoch": 0.3375617094514203, "grad_norm": 6.5625, "learning_rate": 9.619207433880532e-06, "loss": 1.11538677, "memory(GiB)": 142.32, "step": 30180, "train_speed(iter/s)": 0.285878 }, { "acc": 0.72187681, "epoch": 0.3377854083973788, "grad_norm": 5.125, "learning_rate": 9.61849920535599e-06, "loss": 1.10732918, "memory(GiB)": 142.32, "step": 30200, "train_speed(iter/s)": 0.285943 }, { "acc": 0.72645159, "epoch": 0.33800910734333733, "grad_norm": 6.1875, "learning_rate": 9.617790344954743e-06, "loss": 1.08374481, "memory(GiB)": 142.32, "step": 30220, "train_speed(iter/s)": 0.286002 }, { "acc": 0.73251762, "epoch": 0.33823280628929586, "grad_norm": 5.8125, "learning_rate": 9.617080852773772e-06, "loss": 1.08018475, "memory(GiB)": 142.32, "step": 30240, "train_speed(iter/s)": 0.286062 }, { "acc": 0.72579851, "epoch": 0.3384565052352544, "grad_norm": 5.8125, "learning_rate": 9.61637072891015e-06, "loss": 1.09574547, "memory(GiB)": 142.32, "step": 30260, "train_speed(iter/s)": 0.286126 }, { "acc": 0.73162708, "epoch": 0.3386802041812129, "grad_norm": 5.4375, "learning_rate": 9.615659973461027e-06, "loss": 1.07321529, "memory(GiB)": 142.32, "step": 30280, "train_speed(iter/s)": 0.286194 }, { "acc": 0.72176061, "epoch": 0.33890390312717145, "grad_norm": 6.5, "learning_rate": 9.614948586523646e-06, "loss": 1.11275635, "memory(GiB)": 142.32, "step": 30300, "train_speed(iter/s)": 0.286257 }, { "acc": 0.72002573, "epoch": 0.33912760207313, "grad_norm": 5.40625, "learning_rate": 9.614236568195336e-06, "loss": 1.12063675, "memory(GiB)": 142.32, "step": 30320, "train_speed(iter/s)": 0.286324 }, { "acc": 0.72599106, "epoch": 0.3393513010190885, "grad_norm": 5.34375, "learning_rate": 9.613523918573513e-06, "loss": 1.08287086, "memory(GiB)": 142.32, "step": 30340, "train_speed(iter/s)": 0.286386 }, { "acc": 0.71740322, "epoch": 0.33957499996504703, "grad_norm": 5.84375, "learning_rate": 9.612810637755671e-06, "loss": 1.14335728, "memory(GiB)": 142.32, "step": 30360, "train_speed(iter/s)": 0.286454 }, { "acc": 0.72317553, "epoch": 0.33979869891100556, "grad_norm": 5.59375, "learning_rate": 9.612096725839407e-06, "loss": 1.09551125, "memory(GiB)": 142.32, "step": 30380, "train_speed(iter/s)": 0.286514 }, { "acc": 0.7354414, "epoch": 0.3400223978569641, "grad_norm": 5.65625, "learning_rate": 9.611382182922386e-06, "loss": 1.03898039, "memory(GiB)": 142.32, "step": 30400, "train_speed(iter/s)": 0.28658 }, { "acc": 0.73913836, "epoch": 0.3402460968029226, "grad_norm": 5.53125, "learning_rate": 9.610667009102371e-06, "loss": 1.02687244, "memory(GiB)": 142.32, "step": 30420, "train_speed(iter/s)": 0.286642 }, { "acc": 0.72606978, "epoch": 0.34046979574888114, "grad_norm": 7.0625, "learning_rate": 9.609951204477206e-06, "loss": 1.08612347, "memory(GiB)": 142.32, "step": 30440, "train_speed(iter/s)": 0.28671 }, { "acc": 0.71191692, "epoch": 0.34069349469483967, "grad_norm": 5.84375, "learning_rate": 9.609234769144826e-06, "loss": 1.15298519, "memory(GiB)": 142.32, "step": 30460, "train_speed(iter/s)": 0.286776 }, { "acc": 0.73176742, "epoch": 0.3409171936407982, "grad_norm": 5.53125, "learning_rate": 9.608517703203249e-06, "loss": 1.06834984, "memory(GiB)": 142.32, "step": 30480, "train_speed(iter/s)": 0.286842 }, { "acc": 0.71919546, "epoch": 0.3411408925867567, "grad_norm": 5.4375, "learning_rate": 9.607800006750578e-06, "loss": 1.13402596, "memory(GiB)": 142.32, "step": 30500, "train_speed(iter/s)": 0.286909 }, { "acc": 0.72235193, "epoch": 0.34136459153271526, "grad_norm": 5.25, "learning_rate": 9.607081679885006e-06, "loss": 1.12738514, "memory(GiB)": 142.32, "step": 30520, "train_speed(iter/s)": 0.286964 }, { "acc": 0.71759481, "epoch": 0.3415882904786738, "grad_norm": 4.96875, "learning_rate": 9.60636272270481e-06, "loss": 1.13491163, "memory(GiB)": 142.32, "step": 30540, "train_speed(iter/s)": 0.28703 }, { "acc": 0.74193234, "epoch": 0.3418119894246323, "grad_norm": 6.25, "learning_rate": 9.605643135308354e-06, "loss": 1.02335281, "memory(GiB)": 142.32, "step": 30560, "train_speed(iter/s)": 0.287095 }, { "acc": 0.73178825, "epoch": 0.34203568837059084, "grad_norm": 5.59375, "learning_rate": 9.604922917794087e-06, "loss": 1.07539482, "memory(GiB)": 142.32, "step": 30580, "train_speed(iter/s)": 0.287159 }, { "acc": 0.72497845, "epoch": 0.34225938731654937, "grad_norm": 5.5625, "learning_rate": 9.604202070260545e-06, "loss": 1.10140867, "memory(GiB)": 142.32, "step": 30600, "train_speed(iter/s)": 0.287222 }, { "acc": 0.72168117, "epoch": 0.3424830862625079, "grad_norm": 5.96875, "learning_rate": 9.603480592806351e-06, "loss": 1.12388401, "memory(GiB)": 142.32, "step": 30620, "train_speed(iter/s)": 0.287293 }, { "acc": 0.71843123, "epoch": 0.3427067852084664, "grad_norm": 5.71875, "learning_rate": 9.602758485530213e-06, "loss": 1.13366394, "memory(GiB)": 142.32, "step": 30640, "train_speed(iter/s)": 0.287357 }, { "acc": 0.7380549, "epoch": 0.34293048415442495, "grad_norm": 6.0, "learning_rate": 9.602035748530925e-06, "loss": 1.04064178, "memory(GiB)": 142.32, "step": 30660, "train_speed(iter/s)": 0.287419 }, { "acc": 0.72977552, "epoch": 0.3431541831003835, "grad_norm": 6.125, "learning_rate": 9.601312381907368e-06, "loss": 1.07716389, "memory(GiB)": 142.32, "step": 30680, "train_speed(iter/s)": 0.287484 }, { "acc": 0.73568177, "epoch": 0.343377882046342, "grad_norm": 4.8125, "learning_rate": 9.600588385758511e-06, "loss": 1.03424644, "memory(GiB)": 142.32, "step": 30700, "train_speed(iter/s)": 0.287545 }, { "acc": 0.72702861, "epoch": 0.34360158099230054, "grad_norm": 5.125, "learning_rate": 9.599863760183403e-06, "loss": 1.09414539, "memory(GiB)": 142.32, "step": 30720, "train_speed(iter/s)": 0.28761 }, { "acc": 0.72800074, "epoch": 0.34382527993825907, "grad_norm": 5.59375, "learning_rate": 9.599138505281187e-06, "loss": 1.09050417, "memory(GiB)": 142.32, "step": 30740, "train_speed(iter/s)": 0.287671 }, { "acc": 0.72380691, "epoch": 0.3440489788842176, "grad_norm": 6.125, "learning_rate": 9.598412621151087e-06, "loss": 1.10786877, "memory(GiB)": 142.32, "step": 30760, "train_speed(iter/s)": 0.287734 }, { "acc": 0.72187147, "epoch": 0.3442726778301761, "grad_norm": 5.84375, "learning_rate": 9.597686107892412e-06, "loss": 1.12210398, "memory(GiB)": 142.32, "step": 30780, "train_speed(iter/s)": 0.287796 }, { "acc": 0.71686125, "epoch": 0.3444963767761347, "grad_norm": 6.65625, "learning_rate": 9.596958965604563e-06, "loss": 1.12514515, "memory(GiB)": 142.32, "step": 30800, "train_speed(iter/s)": 0.287838 }, { "acc": 0.70849862, "epoch": 0.34472007572209323, "grad_norm": 6.40625, "learning_rate": 9.596231194387022e-06, "loss": 1.1778635, "memory(GiB)": 142.32, "step": 30820, "train_speed(iter/s)": 0.287899 }, { "acc": 0.71594148, "epoch": 0.34494377466805176, "grad_norm": 5.9375, "learning_rate": 9.595502794339358e-06, "loss": 1.15256243, "memory(GiB)": 142.32, "step": 30840, "train_speed(iter/s)": 0.287963 }, { "acc": 0.71905642, "epoch": 0.3451674736140103, "grad_norm": 5.8125, "learning_rate": 9.594773765561227e-06, "loss": 1.12231045, "memory(GiB)": 142.32, "step": 30860, "train_speed(iter/s)": 0.288027 }, { "acc": 0.72016163, "epoch": 0.3453911725599688, "grad_norm": 6.875, "learning_rate": 9.594044108152369e-06, "loss": 1.10148945, "memory(GiB)": 142.32, "step": 30880, "train_speed(iter/s)": 0.288084 }, { "acc": 0.71874714, "epoch": 0.34561487150592735, "grad_norm": 5.6875, "learning_rate": 9.593313822212614e-06, "loss": 1.13311424, "memory(GiB)": 142.32, "step": 30900, "train_speed(iter/s)": 0.288148 }, { "acc": 0.72550297, "epoch": 0.3458385704518859, "grad_norm": 5.65625, "learning_rate": 9.592582907841874e-06, "loss": 1.09466591, "memory(GiB)": 142.32, "step": 30920, "train_speed(iter/s)": 0.288215 }, { "acc": 0.73452034, "epoch": 0.3460622693978444, "grad_norm": 5.25, "learning_rate": 9.59185136514015e-06, "loss": 1.05196075, "memory(GiB)": 142.32, "step": 30940, "train_speed(iter/s)": 0.288282 }, { "acc": 0.73371468, "epoch": 0.34628596834380293, "grad_norm": 5.9375, "learning_rate": 9.591119194207527e-06, "loss": 1.04276657, "memory(GiB)": 142.32, "step": 30960, "train_speed(iter/s)": 0.288342 }, { "acc": 0.7282856, "epoch": 0.34650966728976146, "grad_norm": 6.25, "learning_rate": 9.590386395144174e-06, "loss": 1.08058796, "memory(GiB)": 142.32, "step": 30980, "train_speed(iter/s)": 0.288397 }, { "acc": 0.72460451, "epoch": 0.34673336623572, "grad_norm": 6.25, "learning_rate": 9.589652968050353e-06, "loss": 1.11203213, "memory(GiB)": 142.32, "step": 31000, "train_speed(iter/s)": 0.288462 }, { "acc": 0.70785847, "epoch": 0.3469570651816785, "grad_norm": 5.78125, "learning_rate": 9.588918913026402e-06, "loss": 1.16982441, "memory(GiB)": 142.32, "step": 31020, "train_speed(iter/s)": 0.288522 }, { "acc": 0.72708044, "epoch": 0.34718076412763704, "grad_norm": 5.53125, "learning_rate": 9.588184230172754e-06, "loss": 1.08690224, "memory(GiB)": 142.32, "step": 31040, "train_speed(iter/s)": 0.288586 }, { "acc": 0.74190855, "epoch": 0.34740446307359557, "grad_norm": 5.6875, "learning_rate": 9.587448919589924e-06, "loss": 1.015065, "memory(GiB)": 142.32, "step": 31060, "train_speed(iter/s)": 0.288641 }, { "acc": 0.72046089, "epoch": 0.3476281620195541, "grad_norm": 5.46875, "learning_rate": 9.586712981378512e-06, "loss": 1.11853714, "memory(GiB)": 142.32, "step": 31080, "train_speed(iter/s)": 0.2887 }, { "acc": 0.70894418, "epoch": 0.3478518609655126, "grad_norm": 4.4375, "learning_rate": 9.585976415639205e-06, "loss": 1.15362844, "memory(GiB)": 142.32, "step": 31100, "train_speed(iter/s)": 0.28876 }, { "acc": 0.71765985, "epoch": 0.34807555991147116, "grad_norm": 4.71875, "learning_rate": 9.585239222472773e-06, "loss": 1.11888866, "memory(GiB)": 142.32, "step": 31120, "train_speed(iter/s)": 0.288825 }, { "acc": 0.71343117, "epoch": 0.3482992588574297, "grad_norm": 5.15625, "learning_rate": 9.58450140198008e-06, "loss": 1.12894258, "memory(GiB)": 142.32, "step": 31140, "train_speed(iter/s)": 0.288887 }, { "acc": 0.71901941, "epoch": 0.3485229578033882, "grad_norm": 5.3125, "learning_rate": 9.583762954262066e-06, "loss": 1.13566952, "memory(GiB)": 142.32, "step": 31160, "train_speed(iter/s)": 0.288938 }, { "acc": 0.72057505, "epoch": 0.34874665674934674, "grad_norm": 6.15625, "learning_rate": 9.583023879419764e-06, "loss": 1.11756611, "memory(GiB)": 142.32, "step": 31180, "train_speed(iter/s)": 0.289 }, { "acc": 0.72594619, "epoch": 0.34897035569530527, "grad_norm": 4.78125, "learning_rate": 9.582284177554288e-06, "loss": 1.08199177, "memory(GiB)": 142.32, "step": 31200, "train_speed(iter/s)": 0.289038 }, { "acc": 0.73263531, "epoch": 0.3491940546412638, "grad_norm": 5.84375, "learning_rate": 9.581543848766841e-06, "loss": 1.06405792, "memory(GiB)": 142.32, "step": 31220, "train_speed(iter/s)": 0.289103 }, { "acc": 0.72632399, "epoch": 0.3494177535872223, "grad_norm": 5.25, "learning_rate": 9.58080289315871e-06, "loss": 1.08376598, "memory(GiB)": 142.32, "step": 31240, "train_speed(iter/s)": 0.289165 }, { "acc": 0.7278306, "epoch": 0.34964145253318085, "grad_norm": 5.78125, "learning_rate": 9.580061310831268e-06, "loss": 1.07060089, "memory(GiB)": 142.32, "step": 31260, "train_speed(iter/s)": 0.289232 }, { "acc": 0.72906833, "epoch": 0.3498651514791394, "grad_norm": 5.25, "learning_rate": 9.579319101885975e-06, "loss": 1.06050653, "memory(GiB)": 142.32, "step": 31280, "train_speed(iter/s)": 0.289303 }, { "acc": 0.73514347, "epoch": 0.3500888504250979, "grad_norm": 6.3125, "learning_rate": 9.578576266424376e-06, "loss": 1.04768486, "memory(GiB)": 142.32, "step": 31300, "train_speed(iter/s)": 0.289366 }, { "acc": 0.7325316, "epoch": 0.35031254937105644, "grad_norm": 8.9375, "learning_rate": 9.5778328045481e-06, "loss": 1.06283503, "memory(GiB)": 142.32, "step": 31320, "train_speed(iter/s)": 0.289422 }, { "acc": 0.72698689, "epoch": 0.35053624831701496, "grad_norm": 6.21875, "learning_rate": 9.577088716358864e-06, "loss": 1.08292942, "memory(GiB)": 142.32, "step": 31340, "train_speed(iter/s)": 0.289485 }, { "acc": 0.73039036, "epoch": 0.3507599472629735, "grad_norm": 5.125, "learning_rate": 9.57634400195847e-06, "loss": 1.06183071, "memory(GiB)": 142.32, "step": 31360, "train_speed(iter/s)": 0.289536 }, { "acc": 0.70996876, "epoch": 0.350983646208932, "grad_norm": 4.6875, "learning_rate": 9.575598661448804e-06, "loss": 1.15825787, "memory(GiB)": 142.32, "step": 31380, "train_speed(iter/s)": 0.289598 }, { "acc": 0.72079482, "epoch": 0.35120734515489055, "grad_norm": 5.5625, "learning_rate": 9.574852694931843e-06, "loss": 1.12112427, "memory(GiB)": 142.32, "step": 31400, "train_speed(iter/s)": 0.289653 }, { "acc": 0.72373333, "epoch": 0.3514310441008491, "grad_norm": 6.875, "learning_rate": 9.574106102509643e-06, "loss": 1.10302067, "memory(GiB)": 142.32, "step": 31420, "train_speed(iter/s)": 0.289713 }, { "acc": 0.73190908, "epoch": 0.3516547430468076, "grad_norm": 5.75, "learning_rate": 9.573358884284349e-06, "loss": 1.06283484, "memory(GiB)": 142.32, "step": 31440, "train_speed(iter/s)": 0.289775 }, { "acc": 0.73770046, "epoch": 0.35187844199276613, "grad_norm": 6.15625, "learning_rate": 9.572611040358191e-06, "loss": 1.0336071, "memory(GiB)": 142.32, "step": 31460, "train_speed(iter/s)": 0.289837 }, { "acc": 0.73003311, "epoch": 0.35210214093872466, "grad_norm": 6.84375, "learning_rate": 9.571862570833486e-06, "loss": 1.07831802, "memory(GiB)": 142.32, "step": 31480, "train_speed(iter/s)": 0.289891 }, { "acc": 0.72908645, "epoch": 0.3523258398846832, "grad_norm": 5.96875, "learning_rate": 9.571113475812635e-06, "loss": 1.07855167, "memory(GiB)": 142.32, "step": 31500, "train_speed(iter/s)": 0.28996 }, { "acc": 0.72165589, "epoch": 0.3525495388306417, "grad_norm": 5.21875, "learning_rate": 9.570363755398122e-06, "loss": 1.10787487, "memory(GiB)": 142.32, "step": 31520, "train_speed(iter/s)": 0.290022 }, { "acc": 0.72392116, "epoch": 0.35277323777660025, "grad_norm": 6.3125, "learning_rate": 9.569613409692523e-06, "loss": 1.10180264, "memory(GiB)": 142.32, "step": 31540, "train_speed(iter/s)": 0.290081 }, { "acc": 0.71266165, "epoch": 0.3529969367225588, "grad_norm": 6.96875, "learning_rate": 9.568862438798495e-06, "loss": 1.15664349, "memory(GiB)": 142.32, "step": 31560, "train_speed(iter/s)": 0.290141 }, { "acc": 0.72451005, "epoch": 0.3532206356685173, "grad_norm": 6.4375, "learning_rate": 9.568110842818779e-06, "loss": 1.0976593, "memory(GiB)": 142.32, "step": 31580, "train_speed(iter/s)": 0.290192 }, { "acc": 0.72010341, "epoch": 0.35344433461447583, "grad_norm": 6.34375, "learning_rate": 9.567358621856209e-06, "loss": 1.11350365, "memory(GiB)": 142.32, "step": 31600, "train_speed(iter/s)": 0.290249 }, { "acc": 0.72494879, "epoch": 0.35366803356043436, "grad_norm": 6.46875, "learning_rate": 9.566605776013695e-06, "loss": 1.10493431, "memory(GiB)": 142.32, "step": 31620, "train_speed(iter/s)": 0.290296 }, { "acc": 0.72164817, "epoch": 0.3538917325063929, "grad_norm": 6.1875, "learning_rate": 9.565852305394239e-06, "loss": 1.10060644, "memory(GiB)": 142.32, "step": 31640, "train_speed(iter/s)": 0.290347 }, { "acc": 0.72276125, "epoch": 0.3541154314523514, "grad_norm": 6.03125, "learning_rate": 9.565098210100928e-06, "loss": 1.10572939, "memory(GiB)": 142.32, "step": 31660, "train_speed(iter/s)": 0.290407 }, { "acc": 0.73339119, "epoch": 0.35433913039830994, "grad_norm": 5.9375, "learning_rate": 9.564343490236932e-06, "loss": 1.05649672, "memory(GiB)": 142.32, "step": 31680, "train_speed(iter/s)": 0.290469 }, { "acc": 0.72178006, "epoch": 0.35456282934426847, "grad_norm": 5.78125, "learning_rate": 9.563588145905504e-06, "loss": 1.1204092, "memory(GiB)": 142.32, "step": 31700, "train_speed(iter/s)": 0.290534 }, { "acc": 0.72785559, "epoch": 0.354786528290227, "grad_norm": 5.03125, "learning_rate": 9.562832177209992e-06, "loss": 1.07454748, "memory(GiB)": 142.32, "step": 31720, "train_speed(iter/s)": 0.290594 }, { "acc": 0.72680254, "epoch": 0.3550102272361855, "grad_norm": 4.65625, "learning_rate": 9.562075584253821e-06, "loss": 1.0834609, "memory(GiB)": 142.32, "step": 31740, "train_speed(iter/s)": 0.290655 }, { "acc": 0.72672324, "epoch": 0.35523392618214406, "grad_norm": 6.0625, "learning_rate": 9.5613183671405e-06, "loss": 1.09430485, "memory(GiB)": 142.32, "step": 31760, "train_speed(iter/s)": 0.290719 }, { "acc": 0.72941957, "epoch": 0.3554576251281026, "grad_norm": 6.0625, "learning_rate": 9.560560525973632e-06, "loss": 1.06701698, "memory(GiB)": 142.32, "step": 31780, "train_speed(iter/s)": 0.290781 }, { "acc": 0.73345346, "epoch": 0.3556813240740611, "grad_norm": 5.21875, "learning_rate": 9.559802060856898e-06, "loss": 1.08035583, "memory(GiB)": 142.32, "step": 31800, "train_speed(iter/s)": 0.290839 }, { "acc": 0.72825193, "epoch": 0.35590502302001964, "grad_norm": 6.90625, "learning_rate": 9.559042971894067e-06, "loss": 1.07772026, "memory(GiB)": 142.32, "step": 31820, "train_speed(iter/s)": 0.290897 }, { "acc": 0.72234011, "epoch": 0.35612872196597817, "grad_norm": 5.46875, "learning_rate": 9.558283259188993e-06, "loss": 1.11797495, "memory(GiB)": 142.32, "step": 31840, "train_speed(iter/s)": 0.290959 }, { "acc": 0.73753986, "epoch": 0.3563524209119367, "grad_norm": 7.9375, "learning_rate": 9.55752292284562e-06, "loss": 1.03516979, "memory(GiB)": 142.32, "step": 31860, "train_speed(iter/s)": 0.291019 }, { "acc": 0.72857199, "epoch": 0.3565761198578952, "grad_norm": 6.84375, "learning_rate": 9.556761962967964e-06, "loss": 1.06687431, "memory(GiB)": 142.32, "step": 31880, "train_speed(iter/s)": 0.291084 }, { "acc": 0.71797009, "epoch": 0.35679981880385375, "grad_norm": 4.9375, "learning_rate": 9.556000379660145e-06, "loss": 1.13689976, "memory(GiB)": 142.32, "step": 31900, "train_speed(iter/s)": 0.291136 }, { "acc": 0.72958317, "epoch": 0.3570235177498123, "grad_norm": 6.625, "learning_rate": 9.555238173026351e-06, "loss": 1.08602982, "memory(GiB)": 142.32, "step": 31920, "train_speed(iter/s)": 0.291191 }, { "acc": 0.72631168, "epoch": 0.3572472166957708, "grad_norm": 6.4375, "learning_rate": 9.554475343170867e-06, "loss": 1.08270512, "memory(GiB)": 142.32, "step": 31940, "train_speed(iter/s)": 0.291253 }, { "acc": 0.72631407, "epoch": 0.35747091564172934, "grad_norm": 6.84375, "learning_rate": 9.553711890198056e-06, "loss": 1.09381599, "memory(GiB)": 142.32, "step": 31960, "train_speed(iter/s)": 0.291307 }, { "acc": 0.73791242, "epoch": 0.35769461458768786, "grad_norm": 6.03125, "learning_rate": 9.55294781421237e-06, "loss": 1.05046768, "memory(GiB)": 142.32, "step": 31980, "train_speed(iter/s)": 0.291369 }, { "acc": 0.72483969, "epoch": 0.3579183135336464, "grad_norm": 6.65625, "learning_rate": 9.55218311531835e-06, "loss": 1.0981575, "memory(GiB)": 142.32, "step": 32000, "train_speed(iter/s)": 0.291429 }, { "epoch": 0.3579183135336464, "eval_acc": 0.6884365625008627, "eval_loss": 1.1067428588867188, "eval_runtime": 2343.2082, "eval_samples_per_second": 32.128, "eval_steps_per_second": 16.064, "step": 32000 }, { "acc": 0.72122221, "epoch": 0.3581420124796049, "grad_norm": 6.03125, "learning_rate": 9.551417793620613e-06, "loss": 1.11808758, "memory(GiB)": 142.32, "step": 32020, "train_speed(iter/s)": 0.285273 }, { "acc": 0.71150513, "epoch": 0.35836571142556345, "grad_norm": 6.15625, "learning_rate": 9.550651849223865e-06, "loss": 1.1746891, "memory(GiB)": 142.32, "step": 32040, "train_speed(iter/s)": 0.285333 }, { "acc": 0.73105531, "epoch": 0.35858941037152203, "grad_norm": 5.9375, "learning_rate": 9.549885282232903e-06, "loss": 1.07604132, "memory(GiB)": 142.32, "step": 32060, "train_speed(iter/s)": 0.285395 }, { "acc": 0.73566256, "epoch": 0.35881310931748056, "grad_norm": 6.40625, "learning_rate": 9.549118092752599e-06, "loss": 1.04928875, "memory(GiB)": 142.32, "step": 32080, "train_speed(iter/s)": 0.285458 }, { "acc": 0.71767759, "epoch": 0.3590368082634391, "grad_norm": 5.25, "learning_rate": 9.54835028088792e-06, "loss": 1.11853428, "memory(GiB)": 142.32, "step": 32100, "train_speed(iter/s)": 0.28552 }, { "acc": 0.7191278, "epoch": 0.3592605072093976, "grad_norm": 6.40625, "learning_rate": 9.54758184674391e-06, "loss": 1.10960712, "memory(GiB)": 142.32, "step": 32120, "train_speed(iter/s)": 0.28558 }, { "acc": 0.71934662, "epoch": 0.35948420615535615, "grad_norm": 5.3125, "learning_rate": 9.546812790425704e-06, "loss": 1.1169857, "memory(GiB)": 142.32, "step": 32140, "train_speed(iter/s)": 0.285646 }, { "acc": 0.71963921, "epoch": 0.3597079051013147, "grad_norm": 6.4375, "learning_rate": 9.54604311203852e-06, "loss": 1.11634178, "memory(GiB)": 142.32, "step": 32160, "train_speed(iter/s)": 0.285706 }, { "acc": 0.72046051, "epoch": 0.3599316040472732, "grad_norm": 5.75, "learning_rate": 9.54527281168766e-06, "loss": 1.11443176, "memory(GiB)": 142.32, "step": 32180, "train_speed(iter/s)": 0.28576 }, { "acc": 0.7233037, "epoch": 0.36015530299323173, "grad_norm": 5.65625, "learning_rate": 9.544501889478513e-06, "loss": 1.09735851, "memory(GiB)": 142.32, "step": 32200, "train_speed(iter/s)": 0.285815 }, { "acc": 0.73115797, "epoch": 0.36037900193919026, "grad_norm": 5.90625, "learning_rate": 9.54373034551655e-06, "loss": 1.07354679, "memory(GiB)": 142.32, "step": 32220, "train_speed(iter/s)": 0.285879 }, { "acc": 0.72972393, "epoch": 0.3606027008851488, "grad_norm": 5.1875, "learning_rate": 9.542958179907331e-06, "loss": 1.0724184, "memory(GiB)": 142.32, "step": 32240, "train_speed(iter/s)": 0.285939 }, { "acc": 0.71656513, "epoch": 0.3608263998311073, "grad_norm": 6.0625, "learning_rate": 9.542185392756501e-06, "loss": 1.14080124, "memory(GiB)": 142.32, "step": 32260, "train_speed(iter/s)": 0.285998 }, { "acc": 0.70983639, "epoch": 0.36105009877706584, "grad_norm": 5.46875, "learning_rate": 9.541411984169785e-06, "loss": 1.16655674, "memory(GiB)": 142.32, "step": 32280, "train_speed(iter/s)": 0.286054 }, { "acc": 0.72052059, "epoch": 0.36127379772302437, "grad_norm": 4.9375, "learning_rate": 9.540637954253e-06, "loss": 1.13207359, "memory(GiB)": 142.32, "step": 32300, "train_speed(iter/s)": 0.286111 }, { "acc": 0.73008666, "epoch": 0.3614974966689829, "grad_norm": 6.71875, "learning_rate": 9.53986330311204e-06, "loss": 1.07138004, "memory(GiB)": 142.32, "step": 32320, "train_speed(iter/s)": 0.286175 }, { "acc": 0.72729759, "epoch": 0.3617211956149414, "grad_norm": 8.1875, "learning_rate": 9.539088030852891e-06, "loss": 1.08792009, "memory(GiB)": 142.32, "step": 32340, "train_speed(iter/s)": 0.286233 }, { "acc": 0.72695627, "epoch": 0.36194489456089995, "grad_norm": 6.28125, "learning_rate": 9.538312137581621e-06, "loss": 1.07694626, "memory(GiB)": 142.32, "step": 32360, "train_speed(iter/s)": 0.286291 }, { "acc": 0.73531189, "epoch": 0.3621685935068585, "grad_norm": 5.8125, "learning_rate": 9.537535623404384e-06, "loss": 1.05380306, "memory(GiB)": 142.32, "step": 32380, "train_speed(iter/s)": 0.286355 }, { "acc": 0.70597649, "epoch": 0.362392292452817, "grad_norm": 6.25, "learning_rate": 9.536758488427415e-06, "loss": 1.1915802, "memory(GiB)": 142.32, "step": 32400, "train_speed(iter/s)": 0.286407 }, { "acc": 0.71383686, "epoch": 0.36261599139877554, "grad_norm": 5.90625, "learning_rate": 9.535980732757042e-06, "loss": 1.15251999, "memory(GiB)": 142.32, "step": 32420, "train_speed(iter/s)": 0.286462 }, { "acc": 0.72757902, "epoch": 0.36283969034473407, "grad_norm": 5.71875, "learning_rate": 9.53520235649967e-06, "loss": 1.07916489, "memory(GiB)": 142.32, "step": 32440, "train_speed(iter/s)": 0.286525 }, { "acc": 0.71600308, "epoch": 0.3630633892906926, "grad_norm": 5.84375, "learning_rate": 9.534423359761792e-06, "loss": 1.14061375, "memory(GiB)": 142.32, "step": 32460, "train_speed(iter/s)": 0.286586 }, { "acc": 0.72177477, "epoch": 0.3632870882366511, "grad_norm": 5.5, "learning_rate": 9.533643742649988e-06, "loss": 1.10968924, "memory(GiB)": 142.32, "step": 32480, "train_speed(iter/s)": 0.28664 }, { "acc": 0.71368985, "epoch": 0.36351078718260965, "grad_norm": 6.0, "learning_rate": 9.532863505270917e-06, "loss": 1.14535866, "memory(GiB)": 142.32, "step": 32500, "train_speed(iter/s)": 0.286704 }, { "acc": 0.73375721, "epoch": 0.3637344861285682, "grad_norm": 5.8125, "learning_rate": 9.532082647731332e-06, "loss": 1.05677452, "memory(GiB)": 142.32, "step": 32520, "train_speed(iter/s)": 0.286767 }, { "acc": 0.71897984, "epoch": 0.3639581850745267, "grad_norm": 6.46875, "learning_rate": 9.531301170138059e-06, "loss": 1.11860075, "memory(GiB)": 142.32, "step": 32540, "train_speed(iter/s)": 0.286826 }, { "acc": 0.72721591, "epoch": 0.36418188402048524, "grad_norm": 5.90625, "learning_rate": 9.53051907259802e-06, "loss": 1.07306767, "memory(GiB)": 142.32, "step": 32560, "train_speed(iter/s)": 0.286888 }, { "acc": 0.72291536, "epoch": 0.36440558296644376, "grad_norm": 5.0, "learning_rate": 9.529736355218215e-06, "loss": 1.09554157, "memory(GiB)": 142.32, "step": 32580, "train_speed(iter/s)": 0.286951 }, { "acc": 0.71702237, "epoch": 0.3646292819124023, "grad_norm": 6.21875, "learning_rate": 9.528953018105734e-06, "loss": 1.12149391, "memory(GiB)": 142.32, "step": 32600, "train_speed(iter/s)": 0.287013 }, { "acc": 0.72929363, "epoch": 0.3648529808583608, "grad_norm": 7.0625, "learning_rate": 9.528169061367745e-06, "loss": 1.07664909, "memory(GiB)": 142.32, "step": 32620, "train_speed(iter/s)": 0.287079 }, { "acc": 0.72824259, "epoch": 0.36507667980431935, "grad_norm": 6.625, "learning_rate": 9.527384485111506e-06, "loss": 1.07761593, "memory(GiB)": 142.32, "step": 32640, "train_speed(iter/s)": 0.28714 }, { "acc": 0.71323099, "epoch": 0.3653003787502779, "grad_norm": 5.875, "learning_rate": 9.52659928944436e-06, "loss": 1.16023178, "memory(GiB)": 142.32, "step": 32660, "train_speed(iter/s)": 0.287198 }, { "acc": 0.72268634, "epoch": 0.3655240776962364, "grad_norm": 5.5625, "learning_rate": 9.525813474473728e-06, "loss": 1.11253033, "memory(GiB)": 142.32, "step": 32680, "train_speed(iter/s)": 0.287265 }, { "acc": 0.73109202, "epoch": 0.36574777664219493, "grad_norm": 5.78125, "learning_rate": 9.525027040307127e-06, "loss": 1.06836872, "memory(GiB)": 142.32, "step": 32700, "train_speed(iter/s)": 0.287325 }, { "acc": 0.71536198, "epoch": 0.36597147558815346, "grad_norm": 4.71875, "learning_rate": 9.524239987052148e-06, "loss": 1.13392963, "memory(GiB)": 142.32, "step": 32720, "train_speed(iter/s)": 0.28738 }, { "acc": 0.72096519, "epoch": 0.366195174534112, "grad_norm": 4.8125, "learning_rate": 9.523452314816473e-06, "loss": 1.12138004, "memory(GiB)": 142.32, "step": 32740, "train_speed(iter/s)": 0.287441 }, { "acc": 0.71441841, "epoch": 0.3664188734800705, "grad_norm": 5.28125, "learning_rate": 9.522664023707864e-06, "loss": 1.12512875, "memory(GiB)": 142.32, "step": 32760, "train_speed(iter/s)": 0.287507 }, { "acc": 0.72289104, "epoch": 0.36664257242602905, "grad_norm": 6.09375, "learning_rate": 9.521875113834175e-06, "loss": 1.12030315, "memory(GiB)": 142.32, "step": 32780, "train_speed(iter/s)": 0.287563 }, { "acc": 0.72486906, "epoch": 0.3668662713719876, "grad_norm": 5.625, "learning_rate": 9.521085585303338e-06, "loss": 1.10755596, "memory(GiB)": 142.32, "step": 32800, "train_speed(iter/s)": 0.28762 }, { "acc": 0.73115177, "epoch": 0.3670899703179461, "grad_norm": 5.0625, "learning_rate": 9.52029543822337e-06, "loss": 1.06555586, "memory(GiB)": 142.32, "step": 32820, "train_speed(iter/s)": 0.287682 }, { "acc": 0.72354698, "epoch": 0.36731366926390463, "grad_norm": 6.15625, "learning_rate": 9.519504672702378e-06, "loss": 1.12097855, "memory(GiB)": 142.32, "step": 32840, "train_speed(iter/s)": 0.287739 }, { "acc": 0.71685009, "epoch": 0.36753736820986316, "grad_norm": 6.0625, "learning_rate": 9.518713288848547e-06, "loss": 1.12887955, "memory(GiB)": 142.32, "step": 32860, "train_speed(iter/s)": 0.287799 }, { "acc": 0.72363815, "epoch": 0.3677610671558217, "grad_norm": 6.3125, "learning_rate": 9.517921286770151e-06, "loss": 1.09454994, "memory(GiB)": 142.32, "step": 32880, "train_speed(iter/s)": 0.287856 }, { "acc": 0.72867575, "epoch": 0.3679847661017802, "grad_norm": 5.5625, "learning_rate": 9.517128666575548e-06, "loss": 1.0844842, "memory(GiB)": 142.32, "step": 32900, "train_speed(iter/s)": 0.287917 }, { "acc": 0.7214386, "epoch": 0.36820846504773874, "grad_norm": 5.125, "learning_rate": 9.516335428373177e-06, "loss": 1.10192013, "memory(GiB)": 142.32, "step": 32920, "train_speed(iter/s)": 0.287977 }, { "acc": 0.72728696, "epoch": 0.36843216399369727, "grad_norm": 5.78125, "learning_rate": 9.515541572271567e-06, "loss": 1.07279987, "memory(GiB)": 142.32, "step": 32940, "train_speed(iter/s)": 0.288039 }, { "acc": 0.7263782, "epoch": 0.3686558629396558, "grad_norm": 6.46875, "learning_rate": 9.514747098379329e-06, "loss": 1.078125, "memory(GiB)": 142.32, "step": 32960, "train_speed(iter/s)": 0.288099 }, { "acc": 0.72154131, "epoch": 0.3688795618856143, "grad_norm": 6.1875, "learning_rate": 9.513952006805157e-06, "loss": 1.10596581, "memory(GiB)": 142.32, "step": 32980, "train_speed(iter/s)": 0.28816 }, { "acc": 0.73835697, "epoch": 0.36910326083157285, "grad_norm": 6.71875, "learning_rate": 9.51315629765783e-06, "loss": 1.02358885, "memory(GiB)": 142.32, "step": 33000, "train_speed(iter/s)": 0.288218 }, { "acc": 0.72609782, "epoch": 0.3693269597775314, "grad_norm": 6.59375, "learning_rate": 9.512359971046214e-06, "loss": 1.09337978, "memory(GiB)": 142.32, "step": 33020, "train_speed(iter/s)": 0.288269 }, { "acc": 0.73726606, "epoch": 0.3695506587234899, "grad_norm": 6.28125, "learning_rate": 9.511563027079258e-06, "loss": 1.03827906, "memory(GiB)": 142.32, "step": 33040, "train_speed(iter/s)": 0.288331 }, { "acc": 0.70957756, "epoch": 0.36977435766944844, "grad_norm": 6.6875, "learning_rate": 9.510765465865995e-06, "loss": 1.16226578, "memory(GiB)": 142.32, "step": 33060, "train_speed(iter/s)": 0.288387 }, { "acc": 0.72704792, "epoch": 0.36999805661540697, "grad_norm": 5.96875, "learning_rate": 9.509967287515542e-06, "loss": 1.08836193, "memory(GiB)": 142.32, "step": 33080, "train_speed(iter/s)": 0.288435 }, { "acc": 0.71273785, "epoch": 0.3702217555613655, "grad_norm": 6.03125, "learning_rate": 9.509168492137102e-06, "loss": 1.15075397, "memory(GiB)": 142.32, "step": 33100, "train_speed(iter/s)": 0.288492 }, { "acc": 0.72645736, "epoch": 0.370445454507324, "grad_norm": 6.1875, "learning_rate": 9.50836907983996e-06, "loss": 1.08838024, "memory(GiB)": 142.32, "step": 33120, "train_speed(iter/s)": 0.288542 }, { "acc": 0.72196503, "epoch": 0.37066915345328255, "grad_norm": 5.78125, "learning_rate": 9.507569050733491e-06, "loss": 1.09921741, "memory(GiB)": 142.32, "step": 33140, "train_speed(iter/s)": 0.288604 }, { "acc": 0.71836057, "epoch": 0.3708928523992411, "grad_norm": 8.125, "learning_rate": 9.506768404927147e-06, "loss": 1.11777706, "memory(GiB)": 142.32, "step": 33160, "train_speed(iter/s)": 0.288661 }, { "acc": 0.72978716, "epoch": 0.3711165513451996, "grad_norm": 6.28125, "learning_rate": 9.505967142530468e-06, "loss": 1.07164383, "memory(GiB)": 142.32, "step": 33180, "train_speed(iter/s)": 0.288718 }, { "acc": 0.71497674, "epoch": 0.37134025029115814, "grad_norm": 5.28125, "learning_rate": 9.505165263653078e-06, "loss": 1.13324661, "memory(GiB)": 142.32, "step": 33200, "train_speed(iter/s)": 0.288779 }, { "acc": 0.72711239, "epoch": 0.37156394923711666, "grad_norm": 5.78125, "learning_rate": 9.504362768404689e-06, "loss": 1.08629265, "memory(GiB)": 142.32, "step": 33220, "train_speed(iter/s)": 0.288839 }, { "acc": 0.71999049, "epoch": 0.3717876481830752, "grad_norm": 5.25, "learning_rate": 9.503559656895089e-06, "loss": 1.12443352, "memory(GiB)": 142.32, "step": 33240, "train_speed(iter/s)": 0.288898 }, { "acc": 0.72267447, "epoch": 0.3720113471290337, "grad_norm": 7.21875, "learning_rate": 9.502755929234158e-06, "loss": 1.11919899, "memory(GiB)": 142.32, "step": 33260, "train_speed(iter/s)": 0.288954 }, { "acc": 0.72198477, "epoch": 0.37223504607499225, "grad_norm": 4.6875, "learning_rate": 9.501951585531856e-06, "loss": 1.09969978, "memory(GiB)": 142.32, "step": 33280, "train_speed(iter/s)": 0.289014 }, { "acc": 0.72212782, "epoch": 0.37245874502095083, "grad_norm": 6.78125, "learning_rate": 9.50114662589823e-06, "loss": 1.10900097, "memory(GiB)": 142.32, "step": 33300, "train_speed(iter/s)": 0.289066 }, { "acc": 0.72938509, "epoch": 0.37268244396690936, "grad_norm": 5.25, "learning_rate": 9.500341050443409e-06, "loss": 1.08675461, "memory(GiB)": 142.32, "step": 33320, "train_speed(iter/s)": 0.289122 }, { "acc": 0.73523169, "epoch": 0.3729061429128679, "grad_norm": 5.25, "learning_rate": 9.499534859277607e-06, "loss": 1.05675154, "memory(GiB)": 142.32, "step": 33340, "train_speed(iter/s)": 0.289175 }, { "acc": 0.71917639, "epoch": 0.3731298418588264, "grad_norm": 5.84375, "learning_rate": 9.498728052511124e-06, "loss": 1.10511723, "memory(GiB)": 142.32, "step": 33360, "train_speed(iter/s)": 0.289231 }, { "acc": 0.72192731, "epoch": 0.37335354080478494, "grad_norm": 5.875, "learning_rate": 9.497920630254342e-06, "loss": 1.10244026, "memory(GiB)": 142.32, "step": 33380, "train_speed(iter/s)": 0.289285 }, { "acc": 0.71724939, "epoch": 0.3735772397507435, "grad_norm": 7.1875, "learning_rate": 9.497112592617727e-06, "loss": 1.13026257, "memory(GiB)": 142.32, "step": 33400, "train_speed(iter/s)": 0.289341 }, { "acc": 0.73067636, "epoch": 0.373800938696702, "grad_norm": 5.5, "learning_rate": 9.49630393971183e-06, "loss": 1.06399288, "memory(GiB)": 142.32, "step": 33420, "train_speed(iter/s)": 0.289401 }, { "acc": 0.72956924, "epoch": 0.37402463764266053, "grad_norm": 5.4375, "learning_rate": 9.495494671647289e-06, "loss": 1.07155924, "memory(GiB)": 142.32, "step": 33440, "train_speed(iter/s)": 0.289448 }, { "acc": 0.73504372, "epoch": 0.37424833658861906, "grad_norm": 7.09375, "learning_rate": 9.494684788534821e-06, "loss": 1.0504343, "memory(GiB)": 142.32, "step": 33460, "train_speed(iter/s)": 0.289508 }, { "acc": 0.7258604, "epoch": 0.3744720355345776, "grad_norm": 5.3125, "learning_rate": 9.493874290485229e-06, "loss": 1.09279633, "memory(GiB)": 142.32, "step": 33480, "train_speed(iter/s)": 0.289567 }, { "acc": 0.71426711, "epoch": 0.3746957344805361, "grad_norm": 5.28125, "learning_rate": 9.493063177609403e-06, "loss": 1.13984241, "memory(GiB)": 142.32, "step": 33500, "train_speed(iter/s)": 0.289624 }, { "acc": 0.7181572, "epoch": 0.37491943342649464, "grad_norm": 5.28125, "learning_rate": 9.492251450018313e-06, "loss": 1.120998, "memory(GiB)": 142.32, "step": 33520, "train_speed(iter/s)": 0.289676 }, { "acc": 0.72998743, "epoch": 0.37514313237245317, "grad_norm": 6.25, "learning_rate": 9.491439107823015e-06, "loss": 1.0723465, "memory(GiB)": 142.32, "step": 33540, "train_speed(iter/s)": 0.28973 }, { "acc": 0.73069544, "epoch": 0.3753668313184117, "grad_norm": 6.03125, "learning_rate": 9.49062615113465e-06, "loss": 1.07675304, "memory(GiB)": 142.32, "step": 33560, "train_speed(iter/s)": 0.289794 }, { "acc": 0.72302871, "epoch": 0.3755905302643702, "grad_norm": 6.65625, "learning_rate": 9.489812580064442e-06, "loss": 1.09705429, "memory(GiB)": 142.32, "step": 33580, "train_speed(iter/s)": 0.289855 }, { "acc": 0.72838016, "epoch": 0.37581422921032875, "grad_norm": 5.5625, "learning_rate": 9.488998394723699e-06, "loss": 1.08716412, "memory(GiB)": 142.32, "step": 33600, "train_speed(iter/s)": 0.28992 }, { "acc": 0.71798005, "epoch": 0.3760379281562873, "grad_norm": 6.40625, "learning_rate": 9.488183595223811e-06, "loss": 1.13043509, "memory(GiB)": 142.32, "step": 33620, "train_speed(iter/s)": 0.289978 }, { "acc": 0.72591429, "epoch": 0.3762616271022458, "grad_norm": 5.6875, "learning_rate": 9.487368181676259e-06, "loss": 1.10042677, "memory(GiB)": 142.32, "step": 33640, "train_speed(iter/s)": 0.290034 }, { "acc": 0.71635633, "epoch": 0.37648532604820434, "grad_norm": 5.40625, "learning_rate": 9.4865521541926e-06, "loss": 1.11467867, "memory(GiB)": 142.32, "step": 33660, "train_speed(iter/s)": 0.290091 }, { "acc": 0.73269572, "epoch": 0.37670902499416287, "grad_norm": 5.625, "learning_rate": 9.48573551288448e-06, "loss": 1.06122322, "memory(GiB)": 142.32, "step": 33680, "train_speed(iter/s)": 0.290146 }, { "acc": 0.72635899, "epoch": 0.3769327239401214, "grad_norm": 6.0625, "learning_rate": 9.484918257863623e-06, "loss": 1.09310989, "memory(GiB)": 142.32, "step": 33700, "train_speed(iter/s)": 0.290205 }, { "acc": 0.72838984, "epoch": 0.3771564228860799, "grad_norm": 5.59375, "learning_rate": 9.484100389241844e-06, "loss": 1.07717476, "memory(GiB)": 142.32, "step": 33720, "train_speed(iter/s)": 0.290264 }, { "acc": 0.71250949, "epoch": 0.37738012183203845, "grad_norm": 6.75, "learning_rate": 9.483281907131042e-06, "loss": 1.16187735, "memory(GiB)": 142.32, "step": 33740, "train_speed(iter/s)": 0.290324 }, { "acc": 0.723277, "epoch": 0.377603820777997, "grad_norm": 5.96875, "learning_rate": 9.482462811643191e-06, "loss": 1.10924263, "memory(GiB)": 142.32, "step": 33760, "train_speed(iter/s)": 0.290377 }, { "acc": 0.72426238, "epoch": 0.3778275197239555, "grad_norm": 5.09375, "learning_rate": 9.481643102890361e-06, "loss": 1.08672037, "memory(GiB)": 142.32, "step": 33780, "train_speed(iter/s)": 0.290436 }, { "acc": 0.72769022, "epoch": 0.37805121866991404, "grad_norm": 5.96875, "learning_rate": 9.480822780984695e-06, "loss": 1.07868843, "memory(GiB)": 142.32, "step": 33800, "train_speed(iter/s)": 0.290489 }, { "acc": 0.72797842, "epoch": 0.37827491761587256, "grad_norm": 5.15625, "learning_rate": 9.480001846038429e-06, "loss": 1.08782425, "memory(GiB)": 142.32, "step": 33820, "train_speed(iter/s)": 0.290544 }, { "acc": 0.72397919, "epoch": 0.3784986165618311, "grad_norm": 6.0, "learning_rate": 9.479180298163876e-06, "loss": 1.09168472, "memory(GiB)": 142.32, "step": 33840, "train_speed(iter/s)": 0.290598 }, { "acc": 0.72438326, "epoch": 0.3787223155077896, "grad_norm": 5.4375, "learning_rate": 9.478358137473433e-06, "loss": 1.09196758, "memory(GiB)": 142.32, "step": 33860, "train_speed(iter/s)": 0.290653 }, { "acc": 0.71951051, "epoch": 0.37894601445374815, "grad_norm": 5.90625, "learning_rate": 9.477535364079588e-06, "loss": 1.12965126, "memory(GiB)": 142.32, "step": 33880, "train_speed(iter/s)": 0.290698 }, { "acc": 0.72388344, "epoch": 0.3791697133997067, "grad_norm": 5.53125, "learning_rate": 9.476711978094908e-06, "loss": 1.09680004, "memory(GiB)": 142.32, "step": 33900, "train_speed(iter/s)": 0.290754 }, { "acc": 0.72415309, "epoch": 0.3793934123456652, "grad_norm": 5.40625, "learning_rate": 9.475887979632041e-06, "loss": 1.10026588, "memory(GiB)": 142.32, "step": 33920, "train_speed(iter/s)": 0.290815 }, { "acc": 0.73317165, "epoch": 0.37961711129162373, "grad_norm": 7.21875, "learning_rate": 9.475063368803724e-06, "loss": 1.05416431, "memory(GiB)": 142.32, "step": 33940, "train_speed(iter/s)": 0.290872 }, { "acc": 0.7276237, "epoch": 0.37984081023758226, "grad_norm": 5.71875, "learning_rate": 9.474238145722775e-06, "loss": 1.08870125, "memory(GiB)": 142.32, "step": 33960, "train_speed(iter/s)": 0.290932 }, { "acc": 0.73644285, "epoch": 0.3800645091835408, "grad_norm": 6.25, "learning_rate": 9.473412310502095e-06, "loss": 1.05072212, "memory(GiB)": 142.32, "step": 33980, "train_speed(iter/s)": 0.290985 }, { "acc": 0.73197761, "epoch": 0.3802882081294993, "grad_norm": 5.5, "learning_rate": 9.472585863254672e-06, "loss": 1.07634563, "memory(GiB)": 142.32, "step": 34000, "train_speed(iter/s)": 0.29104 }, { "epoch": 0.3802882081294993, "eval_acc": 0.688996473908858, "eval_loss": 1.104271411895752, "eval_runtime": 2339.805, "eval_samples_per_second": 32.175, "eval_steps_per_second": 16.088, "step": 34000 }, { "acc": 0.72987428, "epoch": 0.38051190707545784, "grad_norm": 5.78125, "learning_rate": 9.471758804093574e-06, "loss": 1.07305613, "memory(GiB)": 142.32, "step": 34020, "train_speed(iter/s)": 0.285261 }, { "acc": 0.72225442, "epoch": 0.3807356060214164, "grad_norm": 5.28125, "learning_rate": 9.470931133131957e-06, "loss": 1.10984535, "memory(GiB)": 142.32, "step": 34040, "train_speed(iter/s)": 0.28532 }, { "acc": 0.71315751, "epoch": 0.3809593049673749, "grad_norm": 6.34375, "learning_rate": 9.470102850483055e-06, "loss": 1.14806328, "memory(GiB)": 142.32, "step": 34060, "train_speed(iter/s)": 0.285374 }, { "acc": 0.72122021, "epoch": 0.38118300391333343, "grad_norm": 5.40625, "learning_rate": 9.46927395626019e-06, "loss": 1.12596388, "memory(GiB)": 142.32, "step": 34080, "train_speed(iter/s)": 0.285426 }, { "acc": 0.71953678, "epoch": 0.38140670285929196, "grad_norm": 6.28125, "learning_rate": 9.468444450576768e-06, "loss": 1.12493811, "memory(GiB)": 142.32, "step": 34100, "train_speed(iter/s)": 0.285479 }, { "acc": 0.72377772, "epoch": 0.3816304018052505, "grad_norm": 5.71875, "learning_rate": 9.467614333546278e-06, "loss": 1.10800056, "memory(GiB)": 142.32, "step": 34120, "train_speed(iter/s)": 0.285531 }, { "acc": 0.73155093, "epoch": 0.381854100751209, "grad_norm": 4.6875, "learning_rate": 9.46678360528229e-06, "loss": 1.06787672, "memory(GiB)": 142.32, "step": 34140, "train_speed(iter/s)": 0.285585 }, { "acc": 0.73928895, "epoch": 0.38207779969716754, "grad_norm": 5.875, "learning_rate": 9.465952265898458e-06, "loss": 1.03297119, "memory(GiB)": 142.32, "step": 34160, "train_speed(iter/s)": 0.285633 }, { "acc": 0.7333549, "epoch": 0.38230149864312607, "grad_norm": 6.375, "learning_rate": 9.465120315508522e-06, "loss": 1.07701664, "memory(GiB)": 142.32, "step": 34180, "train_speed(iter/s)": 0.285687 }, { "acc": 0.73142195, "epoch": 0.3825251975890846, "grad_norm": 5.875, "learning_rate": 9.464287754226308e-06, "loss": 1.06240292, "memory(GiB)": 142.32, "step": 34200, "train_speed(iter/s)": 0.285738 }, { "acc": 0.72390018, "epoch": 0.3827488965350431, "grad_norm": 5.625, "learning_rate": 9.463454582165719e-06, "loss": 1.10820618, "memory(GiB)": 142.32, "step": 34220, "train_speed(iter/s)": 0.285796 }, { "acc": 0.72988105, "epoch": 0.38297259548100165, "grad_norm": 5.5, "learning_rate": 9.462620799440746e-06, "loss": 1.0649209, "memory(GiB)": 142.32, "step": 34240, "train_speed(iter/s)": 0.285857 }, { "acc": 0.72546539, "epoch": 0.3831962944269602, "grad_norm": 5.1875, "learning_rate": 9.461786406165463e-06, "loss": 1.09756804, "memory(GiB)": 142.32, "step": 34260, "train_speed(iter/s)": 0.285908 }, { "acc": 0.71897454, "epoch": 0.3834199933729187, "grad_norm": 5.8125, "learning_rate": 9.460951402454024e-06, "loss": 1.1155201, "memory(GiB)": 142.32, "step": 34280, "train_speed(iter/s)": 0.28597 }, { "acc": 0.72726231, "epoch": 0.38364369231887724, "grad_norm": 6.59375, "learning_rate": 9.460115788420672e-06, "loss": 1.0964777, "memory(GiB)": 142.32, "step": 34300, "train_speed(iter/s)": 0.286022 }, { "acc": 0.72031641, "epoch": 0.38386739126483577, "grad_norm": 6.8125, "learning_rate": 9.45927956417973e-06, "loss": 1.11308231, "memory(GiB)": 142.32, "step": 34320, "train_speed(iter/s)": 0.286082 }, { "acc": 0.72288656, "epoch": 0.3840910902107943, "grad_norm": 6.21875, "learning_rate": 9.458442729845608e-06, "loss": 1.11631393, "memory(GiB)": 142.32, "step": 34340, "train_speed(iter/s)": 0.28614 }, { "acc": 0.71966524, "epoch": 0.3843147891567528, "grad_norm": 5.25, "learning_rate": 9.457605285532792e-06, "loss": 1.12377281, "memory(GiB)": 142.32, "step": 34360, "train_speed(iter/s)": 0.286199 }, { "acc": 0.7298213, "epoch": 0.38453848810271135, "grad_norm": 6.125, "learning_rate": 9.45676723135586e-06, "loss": 1.0882431, "memory(GiB)": 142.32, "step": 34380, "train_speed(iter/s)": 0.286257 }, { "acc": 0.7309844, "epoch": 0.3847621870486699, "grad_norm": 5.21875, "learning_rate": 9.455928567429469e-06, "loss": 1.06770954, "memory(GiB)": 142.32, "step": 34400, "train_speed(iter/s)": 0.286316 }, { "acc": 0.7163465, "epoch": 0.3849858859946284, "grad_norm": 6.21875, "learning_rate": 9.45508929386836e-06, "loss": 1.1247633, "memory(GiB)": 142.32, "step": 34420, "train_speed(iter/s)": 0.286372 }, { "acc": 0.72556372, "epoch": 0.38520958494058694, "grad_norm": 5.25, "learning_rate": 9.454249410787358e-06, "loss": 1.10523281, "memory(GiB)": 142.32, "step": 34440, "train_speed(iter/s)": 0.286434 }, { "acc": 0.71375713, "epoch": 0.38543328388654546, "grad_norm": 7.28125, "learning_rate": 9.45340891830137e-06, "loss": 1.16831055, "memory(GiB)": 142.32, "step": 34460, "train_speed(iter/s)": 0.286489 }, { "acc": 0.72008128, "epoch": 0.385656982832504, "grad_norm": 6.75, "learning_rate": 9.452567816525388e-06, "loss": 1.11214638, "memory(GiB)": 142.32, "step": 34480, "train_speed(iter/s)": 0.286547 }, { "acc": 0.72170343, "epoch": 0.3858806817784625, "grad_norm": 6.5625, "learning_rate": 9.451726105574489e-06, "loss": 1.10821838, "memory(GiB)": 142.32, "step": 34500, "train_speed(iter/s)": 0.286612 }, { "acc": 0.72931852, "epoch": 0.38610438072442105, "grad_norm": 5.0, "learning_rate": 9.450883785563827e-06, "loss": 1.07045383, "memory(GiB)": 142.32, "step": 34520, "train_speed(iter/s)": 0.286683 }, { "acc": 0.72679172, "epoch": 0.3863280796703796, "grad_norm": 5.0625, "learning_rate": 9.450040856608647e-06, "loss": 1.0775898, "memory(GiB)": 142.32, "step": 34540, "train_speed(iter/s)": 0.286734 }, { "acc": 0.7271121, "epoch": 0.38655177861633816, "grad_norm": 5.09375, "learning_rate": 9.44919731882427e-06, "loss": 1.09557915, "memory(GiB)": 142.32, "step": 34560, "train_speed(iter/s)": 0.286785 }, { "acc": 0.72594681, "epoch": 0.3867754775622967, "grad_norm": 6.15625, "learning_rate": 9.448353172326106e-06, "loss": 1.07147694, "memory(GiB)": 142.32, "step": 34580, "train_speed(iter/s)": 0.286837 }, { "acc": 0.72291842, "epoch": 0.3869991765082552, "grad_norm": 14.625, "learning_rate": 9.447508417229649e-06, "loss": 1.11273632, "memory(GiB)": 142.32, "step": 34600, "train_speed(iter/s)": 0.286895 }, { "acc": 0.71062064, "epoch": 0.38722287545421374, "grad_norm": 5.53125, "learning_rate": 9.446663053650468e-06, "loss": 1.15384903, "memory(GiB)": 142.32, "step": 34620, "train_speed(iter/s)": 0.28695 }, { "acc": 0.73032527, "epoch": 0.38744657440017227, "grad_norm": 5.75, "learning_rate": 9.445817081704226e-06, "loss": 1.08178635, "memory(GiB)": 142.32, "step": 34640, "train_speed(iter/s)": 0.287004 }, { "acc": 0.72397242, "epoch": 0.3876702733461308, "grad_norm": 5.625, "learning_rate": 9.444970501506661e-06, "loss": 1.1085124, "memory(GiB)": 142.32, "step": 34660, "train_speed(iter/s)": 0.287063 }, { "acc": 0.72564754, "epoch": 0.38789397229208933, "grad_norm": 6.78125, "learning_rate": 9.4441233131736e-06, "loss": 1.10024147, "memory(GiB)": 142.32, "step": 34680, "train_speed(iter/s)": 0.28712 }, { "acc": 0.72388458, "epoch": 0.38811767123804786, "grad_norm": 5.21875, "learning_rate": 9.443275516820944e-06, "loss": 1.11855021, "memory(GiB)": 142.32, "step": 34700, "train_speed(iter/s)": 0.287177 }, { "acc": 0.72921343, "epoch": 0.3883413701840064, "grad_norm": 5.3125, "learning_rate": 9.442427112564692e-06, "loss": 1.06514397, "memory(GiB)": 142.32, "step": 34720, "train_speed(iter/s)": 0.28724 }, { "acc": 0.72644854, "epoch": 0.3885650691299649, "grad_norm": 6.0625, "learning_rate": 9.441578100520914e-06, "loss": 1.11092758, "memory(GiB)": 142.32, "step": 34740, "train_speed(iter/s)": 0.287296 }, { "acc": 0.71686239, "epoch": 0.38878876807592344, "grad_norm": 6.0625, "learning_rate": 9.440728480805765e-06, "loss": 1.14020882, "memory(GiB)": 142.32, "step": 34760, "train_speed(iter/s)": 0.287354 }, { "acc": 0.73696718, "epoch": 0.38901246702188197, "grad_norm": 5.5625, "learning_rate": 9.439878253535488e-06, "loss": 1.04834194, "memory(GiB)": 142.32, "step": 34780, "train_speed(iter/s)": 0.287408 }, { "acc": 0.72170048, "epoch": 0.3892361659678405, "grad_norm": 7.8125, "learning_rate": 9.439027418826406e-06, "loss": 1.10646009, "memory(GiB)": 142.32, "step": 34800, "train_speed(iter/s)": 0.287467 }, { "acc": 0.730756, "epoch": 0.389459864913799, "grad_norm": 5.4375, "learning_rate": 9.438175976794926e-06, "loss": 1.0577116, "memory(GiB)": 142.32, "step": 34820, "train_speed(iter/s)": 0.287526 }, { "acc": 0.72413473, "epoch": 0.38968356385975755, "grad_norm": 6.625, "learning_rate": 9.437323927557534e-06, "loss": 1.10498228, "memory(GiB)": 142.32, "step": 34840, "train_speed(iter/s)": 0.287585 }, { "acc": 0.735182, "epoch": 0.3899072628057161, "grad_norm": 5.3125, "learning_rate": 9.436471271230804e-06, "loss": 1.05407619, "memory(GiB)": 142.32, "step": 34860, "train_speed(iter/s)": 0.287642 }, { "acc": 0.72126431, "epoch": 0.3901309617516746, "grad_norm": 5.90625, "learning_rate": 9.435618007931395e-06, "loss": 1.10594158, "memory(GiB)": 142.32, "step": 34880, "train_speed(iter/s)": 0.287697 }, { "acc": 0.71339102, "epoch": 0.39035466069763314, "grad_norm": 6.0625, "learning_rate": 9.434764137776043e-06, "loss": 1.15343151, "memory(GiB)": 142.32, "step": 34900, "train_speed(iter/s)": 0.287751 }, { "acc": 0.71217432, "epoch": 0.39057835964359167, "grad_norm": 5.6875, "learning_rate": 9.433909660881568e-06, "loss": 1.15828581, "memory(GiB)": 142.32, "step": 34920, "train_speed(iter/s)": 0.287807 }, { "acc": 0.7156724, "epoch": 0.3908020585895502, "grad_norm": 6.03125, "learning_rate": 9.433054577364876e-06, "loss": 1.13620567, "memory(GiB)": 142.32, "step": 34940, "train_speed(iter/s)": 0.287863 }, { "acc": 0.72678342, "epoch": 0.3910257575355087, "grad_norm": 5.34375, "learning_rate": 9.432198887342956e-06, "loss": 1.10133381, "memory(GiB)": 142.32, "step": 34960, "train_speed(iter/s)": 0.287916 }, { "acc": 0.71502686, "epoch": 0.39124945648146725, "grad_norm": 6.65625, "learning_rate": 9.431342590932877e-06, "loss": 1.14870625, "memory(GiB)": 142.32, "step": 34980, "train_speed(iter/s)": 0.287961 }, { "acc": 0.72867002, "epoch": 0.3914731554274258, "grad_norm": 5.59375, "learning_rate": 9.430485688251793e-06, "loss": 1.0884244, "memory(GiB)": 142.32, "step": 35000, "train_speed(iter/s)": 0.288017 }, { "acc": 0.72446299, "epoch": 0.3916968543733843, "grad_norm": 4.90625, "learning_rate": 9.42962817941694e-06, "loss": 1.07557592, "memory(GiB)": 142.32, "step": 35020, "train_speed(iter/s)": 0.288071 }, { "acc": 0.72431183, "epoch": 0.39192055331934283, "grad_norm": 6.1875, "learning_rate": 9.428770064545638e-06, "loss": 1.10023212, "memory(GiB)": 142.32, "step": 35040, "train_speed(iter/s)": 0.288127 }, { "acc": 0.72984853, "epoch": 0.39214425226530136, "grad_norm": 5.375, "learning_rate": 9.427911343755291e-06, "loss": 1.09565802, "memory(GiB)": 142.32, "step": 35060, "train_speed(iter/s)": 0.288172 }, { "acc": 0.7286027, "epoch": 0.3923679512112599, "grad_norm": 6.28125, "learning_rate": 9.427052017163381e-06, "loss": 1.07537022, "memory(GiB)": 142.32, "step": 35080, "train_speed(iter/s)": 0.288223 }, { "acc": 0.7147953, "epoch": 0.3925916501572184, "grad_norm": 5.71875, "learning_rate": 9.42619208488748e-06, "loss": 1.13815947, "memory(GiB)": 142.32, "step": 35100, "train_speed(iter/s)": 0.288278 }, { "acc": 0.73376255, "epoch": 0.39281534910317695, "grad_norm": 6.375, "learning_rate": 9.425331547045235e-06, "loss": 1.05667839, "memory(GiB)": 142.32, "step": 35120, "train_speed(iter/s)": 0.288326 }, { "acc": 0.72120781, "epoch": 0.3930390480491355, "grad_norm": 6.8125, "learning_rate": 9.424470403754382e-06, "loss": 1.11221085, "memory(GiB)": 142.32, "step": 35140, "train_speed(iter/s)": 0.28838 }, { "acc": 0.73231645, "epoch": 0.393262746995094, "grad_norm": 5.65625, "learning_rate": 9.423608655132738e-06, "loss": 1.05428772, "memory(GiB)": 142.32, "step": 35160, "train_speed(iter/s)": 0.288434 }, { "acc": 0.72915926, "epoch": 0.39348644594105253, "grad_norm": 5.21875, "learning_rate": 9.422746301298203e-06, "loss": 1.07844944, "memory(GiB)": 142.32, "step": 35180, "train_speed(iter/s)": 0.28849 }, { "acc": 0.71939363, "epoch": 0.39371014488701106, "grad_norm": 5.53125, "learning_rate": 9.421883342368758e-06, "loss": 1.1396575, "memory(GiB)": 142.32, "step": 35200, "train_speed(iter/s)": 0.288544 }, { "acc": 0.72417259, "epoch": 0.3939338438329696, "grad_norm": 7.3125, "learning_rate": 9.421019778462468e-06, "loss": 1.09198999, "memory(GiB)": 142.32, "step": 35220, "train_speed(iter/s)": 0.288604 }, { "acc": 0.72989044, "epoch": 0.3941575427789281, "grad_norm": 5.9375, "learning_rate": 9.420155609697482e-06, "loss": 1.08040962, "memory(GiB)": 142.32, "step": 35240, "train_speed(iter/s)": 0.288656 }, { "acc": 0.72717552, "epoch": 0.39438124172488664, "grad_norm": 5.90625, "learning_rate": 9.419290836192027e-06, "loss": 1.10033207, "memory(GiB)": 142.32, "step": 35260, "train_speed(iter/s)": 0.288709 }, { "acc": 0.73760176, "epoch": 0.3946049406708452, "grad_norm": 6.3125, "learning_rate": 9.418425458064423e-06, "loss": 1.0383112, "memory(GiB)": 142.32, "step": 35280, "train_speed(iter/s)": 0.288762 }, { "acc": 0.72125058, "epoch": 0.3948286396168037, "grad_norm": 6.34375, "learning_rate": 9.41755947543306e-06, "loss": 1.10149117, "memory(GiB)": 142.32, "step": 35300, "train_speed(iter/s)": 0.288816 }, { "acc": 0.72424421, "epoch": 0.39505233856276223, "grad_norm": 6.03125, "learning_rate": 9.416692888416421e-06, "loss": 1.10548534, "memory(GiB)": 142.32, "step": 35320, "train_speed(iter/s)": 0.288868 }, { "acc": 0.73079224, "epoch": 0.39527603750872076, "grad_norm": 5.96875, "learning_rate": 9.415825697133065e-06, "loss": 1.06849823, "memory(GiB)": 142.32, "step": 35340, "train_speed(iter/s)": 0.288923 }, { "acc": 0.73492994, "epoch": 0.3954997364546793, "grad_norm": 5.40625, "learning_rate": 9.414957901701637e-06, "loss": 1.06794949, "memory(GiB)": 142.32, "step": 35360, "train_speed(iter/s)": 0.288978 }, { "acc": 0.72174244, "epoch": 0.3957234354006378, "grad_norm": 5.46875, "learning_rate": 9.414089502240864e-06, "loss": 1.11812572, "memory(GiB)": 142.32, "step": 35380, "train_speed(iter/s)": 0.289032 }, { "acc": 0.73014631, "epoch": 0.39594713434659634, "grad_norm": 6.40625, "learning_rate": 9.413220498869556e-06, "loss": 1.07656384, "memory(GiB)": 142.32, "step": 35400, "train_speed(iter/s)": 0.289084 }, { "acc": 0.72020364, "epoch": 0.39617083329255487, "grad_norm": 4.5, "learning_rate": 9.412350891706603e-06, "loss": 1.10391827, "memory(GiB)": 142.32, "step": 35420, "train_speed(iter/s)": 0.289138 }, { "acc": 0.73034072, "epoch": 0.3963945322385134, "grad_norm": 5.9375, "learning_rate": 9.411480680870982e-06, "loss": 1.06912155, "memory(GiB)": 142.32, "step": 35440, "train_speed(iter/s)": 0.289188 }, { "acc": 0.71130733, "epoch": 0.3966182311844719, "grad_norm": 6.25, "learning_rate": 9.410609866481748e-06, "loss": 1.16316547, "memory(GiB)": 142.32, "step": 35460, "train_speed(iter/s)": 0.289247 }, { "acc": 0.72365971, "epoch": 0.39684193013043045, "grad_norm": 6.15625, "learning_rate": 9.409738448658044e-06, "loss": 1.10776014, "memory(GiB)": 142.32, "step": 35480, "train_speed(iter/s)": 0.289298 }, { "acc": 0.73032694, "epoch": 0.397065629076389, "grad_norm": 6.46875, "learning_rate": 9.408866427519088e-06, "loss": 1.07469473, "memory(GiB)": 142.32, "step": 35500, "train_speed(iter/s)": 0.289355 }, { "acc": 0.72787619, "epoch": 0.3972893280223475, "grad_norm": 5.25, "learning_rate": 9.40799380318419e-06, "loss": 1.08002625, "memory(GiB)": 142.32, "step": 35520, "train_speed(iter/s)": 0.289409 }, { "acc": 0.72407637, "epoch": 0.39751302696830604, "grad_norm": 5.90625, "learning_rate": 9.407120575772733e-06, "loss": 1.1128149, "memory(GiB)": 142.32, "step": 35540, "train_speed(iter/s)": 0.289458 }, { "acc": 0.72689638, "epoch": 0.39773672591426457, "grad_norm": 5.34375, "learning_rate": 9.40624674540419e-06, "loss": 1.08772678, "memory(GiB)": 142.32, "step": 35560, "train_speed(iter/s)": 0.28951 }, { "acc": 0.7374732, "epoch": 0.3979604248602231, "grad_norm": 5.75, "learning_rate": 9.405372312198113e-06, "loss": 1.05327015, "memory(GiB)": 142.32, "step": 35580, "train_speed(iter/s)": 0.289556 }, { "acc": 0.72360654, "epoch": 0.3981841238061816, "grad_norm": 6.8125, "learning_rate": 9.404497276274136e-06, "loss": 1.1028348, "memory(GiB)": 142.32, "step": 35600, "train_speed(iter/s)": 0.289617 }, { "acc": 0.73521404, "epoch": 0.39840782275214015, "grad_norm": 5.59375, "learning_rate": 9.403621637751977e-06, "loss": 1.04416008, "memory(GiB)": 142.32, "step": 35620, "train_speed(iter/s)": 0.289665 }, { "acc": 0.72089167, "epoch": 0.3986315216980987, "grad_norm": 7.875, "learning_rate": 9.402745396751434e-06, "loss": 1.10647249, "memory(GiB)": 142.32, "step": 35640, "train_speed(iter/s)": 0.289719 }, { "acc": 0.73375807, "epoch": 0.3988552206440572, "grad_norm": 5.0, "learning_rate": 9.401868553392393e-06, "loss": 1.0787817, "memory(GiB)": 142.32, "step": 35660, "train_speed(iter/s)": 0.289774 }, { "acc": 0.72102661, "epoch": 0.39907891959001573, "grad_norm": 6.9375, "learning_rate": 9.400991107794816e-06, "loss": 1.1257597, "memory(GiB)": 142.32, "step": 35680, "train_speed(iter/s)": 0.289828 }, { "acc": 0.7171083, "epoch": 0.39930261853597426, "grad_norm": 5.9375, "learning_rate": 9.40011306007875e-06, "loss": 1.13127203, "memory(GiB)": 142.32, "step": 35700, "train_speed(iter/s)": 0.289885 }, { "acc": 0.71796074, "epoch": 0.3995263174819328, "grad_norm": 5.25, "learning_rate": 9.399234410364326e-06, "loss": 1.13021879, "memory(GiB)": 142.32, "step": 35720, "train_speed(iter/s)": 0.289936 }, { "acc": 0.73108716, "epoch": 0.3997500164278913, "grad_norm": 5.28125, "learning_rate": 9.398355158771755e-06, "loss": 1.05613623, "memory(GiB)": 142.32, "step": 35740, "train_speed(iter/s)": 0.289993 }, { "acc": 0.72046413, "epoch": 0.39997371537384985, "grad_norm": 5.90625, "learning_rate": 9.397475305421332e-06, "loss": 1.13118992, "memory(GiB)": 142.32, "step": 35760, "train_speed(iter/s)": 0.290047 }, { "acc": 0.72245941, "epoch": 0.4001974143198084, "grad_norm": 6.15625, "learning_rate": 9.396594850433432e-06, "loss": 1.09596863, "memory(GiB)": 142.32, "step": 35780, "train_speed(iter/s)": 0.290103 }, { "acc": 0.72954807, "epoch": 0.4004211132657669, "grad_norm": 6.1875, "learning_rate": 9.395713793928514e-06, "loss": 1.06858654, "memory(GiB)": 142.32, "step": 35800, "train_speed(iter/s)": 0.290157 }, { "acc": 0.74215097, "epoch": 0.4006448122117255, "grad_norm": 6.40625, "learning_rate": 9.394832136027121e-06, "loss": 1.0239152, "memory(GiB)": 142.32, "step": 35820, "train_speed(iter/s)": 0.290208 }, { "acc": 0.72717776, "epoch": 0.400868511157684, "grad_norm": 6.0625, "learning_rate": 9.393949876849875e-06, "loss": 1.07933083, "memory(GiB)": 142.32, "step": 35840, "train_speed(iter/s)": 0.290267 }, { "acc": 0.72859087, "epoch": 0.40109221010364254, "grad_norm": 5.75, "learning_rate": 9.393067016517483e-06, "loss": 1.08756466, "memory(GiB)": 142.32, "step": 35860, "train_speed(iter/s)": 0.290324 }, { "acc": 0.72298079, "epoch": 0.40131590904960107, "grad_norm": 6.9375, "learning_rate": 9.39218355515073e-06, "loss": 1.09690647, "memory(GiB)": 142.32, "step": 35880, "train_speed(iter/s)": 0.29038 }, { "acc": 0.71454329, "epoch": 0.4015396079955596, "grad_norm": 5.53125, "learning_rate": 9.391299492870488e-06, "loss": 1.16188192, "memory(GiB)": 142.32, "step": 35900, "train_speed(iter/s)": 0.290426 }, { "acc": 0.71776056, "epoch": 0.40176330694151813, "grad_norm": 8.625, "learning_rate": 9.39041482979771e-06, "loss": 1.14981232, "memory(GiB)": 142.32, "step": 35920, "train_speed(iter/s)": 0.290479 }, { "acc": 0.71070213, "epoch": 0.40198700588747666, "grad_norm": 6.28125, "learning_rate": 9.389529566053428e-06, "loss": 1.16256199, "memory(GiB)": 142.32, "step": 35940, "train_speed(iter/s)": 0.290531 }, { "acc": 0.72625046, "epoch": 0.4022107048334352, "grad_norm": 4.65625, "learning_rate": 9.388643701758761e-06, "loss": 1.10511379, "memory(GiB)": 142.32, "step": 35960, "train_speed(iter/s)": 0.290582 }, { "acc": 0.72648516, "epoch": 0.4024344037793937, "grad_norm": 5.21875, "learning_rate": 9.387757237034909e-06, "loss": 1.08186703, "memory(GiB)": 142.32, "step": 35980, "train_speed(iter/s)": 0.290633 }, { "acc": 0.72727489, "epoch": 0.40265810272535224, "grad_norm": 4.90625, "learning_rate": 9.386870172003151e-06, "loss": 1.08419819, "memory(GiB)": 142.32, "step": 36000, "train_speed(iter/s)": 0.290683 }, { "epoch": 0.40265810272535224, "eval_acc": 0.6894506439917828, "eval_loss": 1.1020585298538208, "eval_runtime": 2340.1934, "eval_samples_per_second": 32.17, "eval_steps_per_second": 16.085, "step": 36000 }, { "acc": 0.73147922, "epoch": 0.40288180167131077, "grad_norm": 5.03125, "learning_rate": 9.385982506784851e-06, "loss": 1.06479282, "memory(GiB)": 142.32, "step": 36020, "train_speed(iter/s)": 0.285239 }, { "acc": 0.73432159, "epoch": 0.4031055006172693, "grad_norm": 5.53125, "learning_rate": 9.385094241501453e-06, "loss": 1.05174179, "memory(GiB)": 142.32, "step": 36040, "train_speed(iter/s)": 0.285296 }, { "acc": 0.72089319, "epoch": 0.4033291995632278, "grad_norm": 6.5625, "learning_rate": 9.384205376274486e-06, "loss": 1.12299061, "memory(GiB)": 142.32, "step": 36060, "train_speed(iter/s)": 0.28535 }, { "acc": 0.7224225, "epoch": 0.40355289850918635, "grad_norm": 4.53125, "learning_rate": 9.383315911225557e-06, "loss": 1.1069519, "memory(GiB)": 142.32, "step": 36080, "train_speed(iter/s)": 0.2854 }, { "acc": 0.72940245, "epoch": 0.4037765974551449, "grad_norm": 6.53125, "learning_rate": 9.382425846476362e-06, "loss": 1.07703295, "memory(GiB)": 142.32, "step": 36100, "train_speed(iter/s)": 0.285456 }, { "acc": 0.72971969, "epoch": 0.4040002964011034, "grad_norm": 5.03125, "learning_rate": 9.381535182148671e-06, "loss": 1.07802315, "memory(GiB)": 142.32, "step": 36120, "train_speed(iter/s)": 0.285509 }, { "acc": 0.72953768, "epoch": 0.40422399534706194, "grad_norm": 6.34375, "learning_rate": 9.38064391836434e-06, "loss": 1.0829443, "memory(GiB)": 142.32, "step": 36140, "train_speed(iter/s)": 0.285563 }, { "acc": 0.73552351, "epoch": 0.40444769429302047, "grad_norm": 6.3125, "learning_rate": 9.379752055245306e-06, "loss": 1.04990129, "memory(GiB)": 142.32, "step": 36160, "train_speed(iter/s)": 0.28561 }, { "acc": 0.72309971, "epoch": 0.404671393238979, "grad_norm": 4.96875, "learning_rate": 9.378859592913592e-06, "loss": 1.10824413, "memory(GiB)": 142.32, "step": 36180, "train_speed(iter/s)": 0.285666 }, { "acc": 0.72813673, "epoch": 0.4048950921849375, "grad_norm": 5.03125, "learning_rate": 9.377966531491297e-06, "loss": 1.07527122, "memory(GiB)": 142.32, "step": 36200, "train_speed(iter/s)": 0.28572 }, { "acc": 0.7273797, "epoch": 0.40511879113089605, "grad_norm": 5.5625, "learning_rate": 9.377072871100603e-06, "loss": 1.09897232, "memory(GiB)": 142.32, "step": 36220, "train_speed(iter/s)": 0.285768 }, { "acc": 0.73940401, "epoch": 0.4053424900768546, "grad_norm": 5.625, "learning_rate": 9.37617861186378e-06, "loss": 1.0240345, "memory(GiB)": 142.32, "step": 36240, "train_speed(iter/s)": 0.285815 }, { "acc": 0.73135767, "epoch": 0.4055661890228131, "grad_norm": 4.6875, "learning_rate": 9.37528375390317e-06, "loss": 1.04902287, "memory(GiB)": 142.32, "step": 36260, "train_speed(iter/s)": 0.285869 }, { "acc": 0.71448364, "epoch": 0.40578988796877163, "grad_norm": 5.75, "learning_rate": 9.374388297341208e-06, "loss": 1.15117264, "memory(GiB)": 142.32, "step": 36280, "train_speed(iter/s)": 0.285924 }, { "acc": 0.72548075, "epoch": 0.40601358691473016, "grad_norm": 4.8125, "learning_rate": 9.3734922423004e-06, "loss": 1.08414078, "memory(GiB)": 142.32, "step": 36300, "train_speed(iter/s)": 0.285975 }, { "acc": 0.72431259, "epoch": 0.4062372858606887, "grad_norm": 6.09375, "learning_rate": 9.372595588903345e-06, "loss": 1.09260798, "memory(GiB)": 142.32, "step": 36320, "train_speed(iter/s)": 0.286028 }, { "acc": 0.73149185, "epoch": 0.4064609848066472, "grad_norm": 6.5, "learning_rate": 9.371698337272712e-06, "loss": 1.06885357, "memory(GiB)": 142.32, "step": 36340, "train_speed(iter/s)": 0.286081 }, { "acc": 0.72845068, "epoch": 0.40668468375260575, "grad_norm": 4.8125, "learning_rate": 9.370800487531261e-06, "loss": 1.08215046, "memory(GiB)": 142.32, "step": 36360, "train_speed(iter/s)": 0.286134 }, { "acc": 0.7296031, "epoch": 0.4069083826985643, "grad_norm": 5.875, "learning_rate": 9.369902039801831e-06, "loss": 1.08751173, "memory(GiB)": 142.32, "step": 36380, "train_speed(iter/s)": 0.286188 }, { "acc": 0.71500435, "epoch": 0.4071320816445228, "grad_norm": 4.875, "learning_rate": 9.369002994207341e-06, "loss": 1.14150791, "memory(GiB)": 142.32, "step": 36400, "train_speed(iter/s)": 0.286246 }, { "acc": 0.72463322, "epoch": 0.40735578059048133, "grad_norm": 6.8125, "learning_rate": 9.368103350870794e-06, "loss": 1.09567509, "memory(GiB)": 142.32, "step": 36420, "train_speed(iter/s)": 0.286295 }, { "acc": 0.71997328, "epoch": 0.40757947953643986, "grad_norm": 8.1875, "learning_rate": 9.367203109915275e-06, "loss": 1.11256552, "memory(GiB)": 142.32, "step": 36440, "train_speed(iter/s)": 0.286349 }, { "acc": 0.73439322, "epoch": 0.4078031784823984, "grad_norm": 4.5625, "learning_rate": 9.366302271463947e-06, "loss": 1.05972176, "memory(GiB)": 142.32, "step": 36460, "train_speed(iter/s)": 0.286406 }, { "acc": 0.73071938, "epoch": 0.4080268774283569, "grad_norm": 5.90625, "learning_rate": 9.365400835640061e-06, "loss": 1.07924862, "memory(GiB)": 142.32, "step": 36480, "train_speed(iter/s)": 0.286464 }, { "acc": 0.72526903, "epoch": 0.40825057637431544, "grad_norm": 5.1875, "learning_rate": 9.364498802566944e-06, "loss": 1.09014788, "memory(GiB)": 142.32, "step": 36500, "train_speed(iter/s)": 0.286519 }, { "acc": 0.7365489, "epoch": 0.40847427532027397, "grad_norm": 5.71875, "learning_rate": 9.363596172368008e-06, "loss": 1.05533829, "memory(GiB)": 142.32, "step": 36520, "train_speed(iter/s)": 0.286576 }, { "acc": 0.72553577, "epoch": 0.4086979742662325, "grad_norm": 4.96875, "learning_rate": 9.362692945166745e-06, "loss": 1.10116072, "memory(GiB)": 142.32, "step": 36540, "train_speed(iter/s)": 0.28663 }, { "acc": 0.7182518, "epoch": 0.40892167321219103, "grad_norm": 6.3125, "learning_rate": 9.36178912108673e-06, "loss": 1.15086517, "memory(GiB)": 142.32, "step": 36560, "train_speed(iter/s)": 0.286684 }, { "acc": 0.73061681, "epoch": 0.40914537215814956, "grad_norm": 5.4375, "learning_rate": 9.36088470025162e-06, "loss": 1.08154545, "memory(GiB)": 142.32, "step": 36580, "train_speed(iter/s)": 0.28674 }, { "acc": 0.72835484, "epoch": 0.4093690711041081, "grad_norm": 6.75, "learning_rate": 9.35997968278515e-06, "loss": 1.07610741, "memory(GiB)": 142.32, "step": 36600, "train_speed(iter/s)": 0.286787 }, { "acc": 0.73539467, "epoch": 0.4095927700500666, "grad_norm": 6.40625, "learning_rate": 9.359074068811141e-06, "loss": 1.04385424, "memory(GiB)": 142.32, "step": 36620, "train_speed(iter/s)": 0.286833 }, { "acc": 0.73267307, "epoch": 0.40981646899602514, "grad_norm": 5.625, "learning_rate": 9.358167858453495e-06, "loss": 1.06703777, "memory(GiB)": 142.32, "step": 36640, "train_speed(iter/s)": 0.286888 }, { "acc": 0.72489691, "epoch": 0.41004016794198367, "grad_norm": 5.375, "learning_rate": 9.357261051836193e-06, "loss": 1.12349339, "memory(GiB)": 142.32, "step": 36660, "train_speed(iter/s)": 0.286939 }, { "acc": 0.72609396, "epoch": 0.4102638668879422, "grad_norm": 5.375, "learning_rate": 9.356353649083298e-06, "loss": 1.0815958, "memory(GiB)": 142.32, "step": 36680, "train_speed(iter/s)": 0.286993 }, { "acc": 0.72595377, "epoch": 0.4104875658339007, "grad_norm": 6.5, "learning_rate": 9.35544565031896e-06, "loss": 1.09071884, "memory(GiB)": 142.32, "step": 36700, "train_speed(iter/s)": 0.28705 }, { "acc": 0.72563081, "epoch": 0.41071126477985925, "grad_norm": 5.46875, "learning_rate": 9.354537055667401e-06, "loss": 1.10460262, "memory(GiB)": 142.32, "step": 36720, "train_speed(iter/s)": 0.287094 }, { "acc": 0.73151784, "epoch": 0.4109349637258178, "grad_norm": 4.28125, "learning_rate": 9.353627865252933e-06, "loss": 1.07556753, "memory(GiB)": 142.32, "step": 36740, "train_speed(iter/s)": 0.287146 }, { "acc": 0.72234416, "epoch": 0.4111586626717763, "grad_norm": 5.6875, "learning_rate": 9.352718079199946e-06, "loss": 1.11554585, "memory(GiB)": 142.32, "step": 36760, "train_speed(iter/s)": 0.287198 }, { "acc": 0.73255043, "epoch": 0.41138236161773484, "grad_norm": 6.0, "learning_rate": 9.35180769763291e-06, "loss": 1.06137104, "memory(GiB)": 142.32, "step": 36780, "train_speed(iter/s)": 0.287251 }, { "acc": 0.72193384, "epoch": 0.41160606056369337, "grad_norm": 6.03125, "learning_rate": 9.350896720676378e-06, "loss": 1.10146847, "memory(GiB)": 142.32, "step": 36800, "train_speed(iter/s)": 0.287302 }, { "acc": 0.71729116, "epoch": 0.4118297595096519, "grad_norm": 5.4375, "learning_rate": 9.34998514845499e-06, "loss": 1.11483669, "memory(GiB)": 142.32, "step": 36820, "train_speed(iter/s)": 0.287356 }, { "acc": 0.72975602, "epoch": 0.4120534584556104, "grad_norm": 5.5, "learning_rate": 9.349072981093455e-06, "loss": 1.06993999, "memory(GiB)": 142.32, "step": 36840, "train_speed(iter/s)": 0.287411 }, { "acc": 0.73202538, "epoch": 0.41227715740156895, "grad_norm": 5.40625, "learning_rate": 9.348160218716574e-06, "loss": 1.05370245, "memory(GiB)": 142.32, "step": 36860, "train_speed(iter/s)": 0.287464 }, { "acc": 0.73014617, "epoch": 0.4125008563475275, "grad_norm": 6.125, "learning_rate": 9.347246861449226e-06, "loss": 1.06623554, "memory(GiB)": 142.32, "step": 36880, "train_speed(iter/s)": 0.287518 }, { "acc": 0.73665762, "epoch": 0.412724555293486, "grad_norm": 6.21875, "learning_rate": 9.346332909416371e-06, "loss": 1.04675846, "memory(GiB)": 142.32, "step": 36900, "train_speed(iter/s)": 0.28757 }, { "acc": 0.73723555, "epoch": 0.41294825423944453, "grad_norm": 5.03125, "learning_rate": 9.34541836274305e-06, "loss": 1.03106022, "memory(GiB)": 142.32, "step": 36920, "train_speed(iter/s)": 0.287623 }, { "acc": 0.71578517, "epoch": 0.41317195318540306, "grad_norm": 6.03125, "learning_rate": 9.344503221554386e-06, "loss": 1.1484869, "memory(GiB)": 142.32, "step": 36940, "train_speed(iter/s)": 0.287678 }, { "acc": 0.72374601, "epoch": 0.4133956521313616, "grad_norm": 5.75, "learning_rate": 9.343587485975586e-06, "loss": 1.09880581, "memory(GiB)": 142.32, "step": 36960, "train_speed(iter/s)": 0.287727 }, { "acc": 0.72757363, "epoch": 0.4136193510773201, "grad_norm": 5.75, "learning_rate": 9.342671156131933e-06, "loss": 1.07566986, "memory(GiB)": 142.32, "step": 36980, "train_speed(iter/s)": 0.287779 }, { "acc": 0.72559004, "epoch": 0.41384305002327865, "grad_norm": 5.0625, "learning_rate": 9.341754232148795e-06, "loss": 1.09129534, "memory(GiB)": 142.32, "step": 37000, "train_speed(iter/s)": 0.287832 }, { "acc": 0.71244011, "epoch": 0.4140667489692372, "grad_norm": 5.15625, "learning_rate": 9.340836714151618e-06, "loss": 1.15463524, "memory(GiB)": 142.32, "step": 37020, "train_speed(iter/s)": 0.287887 }, { "acc": 0.73376093, "epoch": 0.4142904479151957, "grad_norm": 6.375, "learning_rate": 9.339918602265936e-06, "loss": 1.06693726, "memory(GiB)": 142.32, "step": 37040, "train_speed(iter/s)": 0.287941 }, { "acc": 0.72900028, "epoch": 0.41451414686115423, "grad_norm": 6.1875, "learning_rate": 9.338999896617357e-06, "loss": 1.09747124, "memory(GiB)": 142.32, "step": 37060, "train_speed(iter/s)": 0.287998 }, { "acc": 0.72983742, "epoch": 0.4147378458071128, "grad_norm": 6.03125, "learning_rate": 9.338080597331573e-06, "loss": 1.07663116, "memory(GiB)": 142.32, "step": 37080, "train_speed(iter/s)": 0.288047 }, { "acc": 0.71658115, "epoch": 0.41496154475307134, "grad_norm": 6.5, "learning_rate": 9.337160704534358e-06, "loss": 1.11724968, "memory(GiB)": 142.32, "step": 37100, "train_speed(iter/s)": 0.288099 }, { "acc": 0.7346806, "epoch": 0.41518524369902987, "grad_norm": 7.1875, "learning_rate": 9.336240218351567e-06, "loss": 1.05873051, "memory(GiB)": 142.32, "step": 37120, "train_speed(iter/s)": 0.28815 }, { "acc": 0.72657213, "epoch": 0.4154089426449884, "grad_norm": 5.875, "learning_rate": 9.335319138909133e-06, "loss": 1.09765472, "memory(GiB)": 142.32, "step": 37140, "train_speed(iter/s)": 0.288197 }, { "acc": 0.72369995, "epoch": 0.4156326415909469, "grad_norm": 4.6875, "learning_rate": 9.334397466333078e-06, "loss": 1.09993649, "memory(GiB)": 142.32, "step": 37160, "train_speed(iter/s)": 0.28825 }, { "acc": 0.7318615, "epoch": 0.41585634053690546, "grad_norm": 5.84375, "learning_rate": 9.333475200749495e-06, "loss": 1.07442789, "memory(GiB)": 142.32, "step": 37180, "train_speed(iter/s)": 0.288303 }, { "acc": 0.72054563, "epoch": 0.416080039482864, "grad_norm": 5.75, "learning_rate": 9.332552342284564e-06, "loss": 1.09872093, "memory(GiB)": 142.32, "step": 37200, "train_speed(iter/s)": 0.288357 }, { "acc": 0.72770748, "epoch": 0.4163037384288225, "grad_norm": 6.59375, "learning_rate": 9.331628891064548e-06, "loss": 1.09356918, "memory(GiB)": 142.32, "step": 37220, "train_speed(iter/s)": 0.288408 }, { "acc": 0.71997261, "epoch": 0.41652743737478104, "grad_norm": 5.8125, "learning_rate": 9.330704847215784e-06, "loss": 1.11710377, "memory(GiB)": 142.32, "step": 37240, "train_speed(iter/s)": 0.288462 }, { "acc": 0.71670141, "epoch": 0.41675113632073957, "grad_norm": 5.21875, "learning_rate": 9.329780210864699e-06, "loss": 1.15114317, "memory(GiB)": 142.32, "step": 37260, "train_speed(iter/s)": 0.288512 }, { "acc": 0.727771, "epoch": 0.4169748352666981, "grad_norm": 5.65625, "learning_rate": 9.328854982137795e-06, "loss": 1.07771978, "memory(GiB)": 142.32, "step": 37280, "train_speed(iter/s)": 0.288568 }, { "acc": 0.72018318, "epoch": 0.4171985342126566, "grad_norm": 7.71875, "learning_rate": 9.327929161161652e-06, "loss": 1.12904263, "memory(GiB)": 142.32, "step": 37300, "train_speed(iter/s)": 0.28862 }, { "acc": 0.7217205, "epoch": 0.41742223315861515, "grad_norm": 5.5, "learning_rate": 9.327002748062944e-06, "loss": 1.10811729, "memory(GiB)": 142.32, "step": 37320, "train_speed(iter/s)": 0.288673 }, { "acc": 0.72897592, "epoch": 0.4176459321045737, "grad_norm": 5.4375, "learning_rate": 9.326075742968411e-06, "loss": 1.05756721, "memory(GiB)": 142.32, "step": 37340, "train_speed(iter/s)": 0.288721 }, { "acc": 0.72983332, "epoch": 0.4178696310505322, "grad_norm": 6.0625, "learning_rate": 9.325148146004881e-06, "loss": 1.09071674, "memory(GiB)": 142.32, "step": 37360, "train_speed(iter/s)": 0.288773 }, { "acc": 0.71739411, "epoch": 0.41809332999649074, "grad_norm": 5.5, "learning_rate": 9.324219957299266e-06, "loss": 1.13499784, "memory(GiB)": 142.32, "step": 37380, "train_speed(iter/s)": 0.288821 }, { "acc": 0.73045416, "epoch": 0.41831702894244926, "grad_norm": 6.21875, "learning_rate": 9.323291176978552e-06, "loss": 1.07738199, "memory(GiB)": 142.32, "step": 37400, "train_speed(iter/s)": 0.288871 }, { "acc": 0.7382575, "epoch": 0.4185407278884078, "grad_norm": 8.875, "learning_rate": 9.322361805169813e-06, "loss": 1.03465109, "memory(GiB)": 142.32, "step": 37420, "train_speed(iter/s)": 0.288916 }, { "acc": 0.7230505, "epoch": 0.4187644268343663, "grad_norm": 5.59375, "learning_rate": 9.321431842000198e-06, "loss": 1.09410286, "memory(GiB)": 142.32, "step": 37440, "train_speed(iter/s)": 0.28896 }, { "acc": 0.71881208, "epoch": 0.41898812578032485, "grad_norm": 6.125, "learning_rate": 9.320501287596936e-06, "loss": 1.12154922, "memory(GiB)": 142.32, "step": 37460, "train_speed(iter/s)": 0.289014 }, { "acc": 0.72573214, "epoch": 0.4192118247262834, "grad_norm": 6.75, "learning_rate": 9.319570142087349e-06, "loss": 1.10598183, "memory(GiB)": 142.32, "step": 37480, "train_speed(iter/s)": 0.289058 }, { "acc": 0.7304183, "epoch": 0.4194355236722419, "grad_norm": 5.25, "learning_rate": 9.318638405598821e-06, "loss": 1.06416473, "memory(GiB)": 142.32, "step": 37500, "train_speed(iter/s)": 0.289115 }, { "acc": 0.7291419, "epoch": 0.41965922261820043, "grad_norm": 6.375, "learning_rate": 9.317706078258835e-06, "loss": 1.08294315, "memory(GiB)": 142.32, "step": 37520, "train_speed(iter/s)": 0.289168 }, { "acc": 0.72401967, "epoch": 0.41988292156415896, "grad_norm": 4.84375, "learning_rate": 9.31677316019494e-06, "loss": 1.10828915, "memory(GiB)": 142.32, "step": 37540, "train_speed(iter/s)": 0.289214 }, { "acc": 0.73316469, "epoch": 0.4201066205101175, "grad_norm": 6.09375, "learning_rate": 9.315839651534778e-06, "loss": 1.0598259, "memory(GiB)": 142.32, "step": 37560, "train_speed(iter/s)": 0.289264 }, { "acc": 0.72883606, "epoch": 0.420330319456076, "grad_norm": 5.5, "learning_rate": 9.314905552406064e-06, "loss": 1.07410631, "memory(GiB)": 142.32, "step": 37580, "train_speed(iter/s)": 0.289312 }, { "acc": 0.71795754, "epoch": 0.42055401840203455, "grad_norm": 6.3125, "learning_rate": 9.313970862936596e-06, "loss": 1.11056986, "memory(GiB)": 142.32, "step": 37600, "train_speed(iter/s)": 0.289364 }, { "acc": 0.72980433, "epoch": 0.4207777173479931, "grad_norm": 5.75, "learning_rate": 9.313035583254253e-06, "loss": 1.07478237, "memory(GiB)": 142.32, "step": 37620, "train_speed(iter/s)": 0.289417 }, { "acc": 0.73462176, "epoch": 0.4210014162939516, "grad_norm": 5.25, "learning_rate": 9.312099713486994e-06, "loss": 1.0475173, "memory(GiB)": 142.32, "step": 37640, "train_speed(iter/s)": 0.289463 }, { "acc": 0.72409143, "epoch": 0.42122511523991013, "grad_norm": 6.03125, "learning_rate": 9.311163253762862e-06, "loss": 1.08583403, "memory(GiB)": 142.32, "step": 37660, "train_speed(iter/s)": 0.289518 }, { "acc": 0.73245449, "epoch": 0.42144881418586866, "grad_norm": 5.09375, "learning_rate": 9.310226204209974e-06, "loss": 1.0644825, "memory(GiB)": 142.32, "step": 37680, "train_speed(iter/s)": 0.289561 }, { "acc": 0.71402607, "epoch": 0.4216725131318272, "grad_norm": 6.78125, "learning_rate": 9.309288564956535e-06, "loss": 1.13497524, "memory(GiB)": 142.32, "step": 37700, "train_speed(iter/s)": 0.289608 }, { "acc": 0.72275505, "epoch": 0.4218962120777857, "grad_norm": 5.15625, "learning_rate": 9.308350336130828e-06, "loss": 1.12658863, "memory(GiB)": 142.32, "step": 37720, "train_speed(iter/s)": 0.289655 }, { "acc": 0.72594585, "epoch": 0.42211991102374424, "grad_norm": 4.71875, "learning_rate": 9.30741151786121e-06, "loss": 1.09183865, "memory(GiB)": 142.32, "step": 37740, "train_speed(iter/s)": 0.28971 }, { "acc": 0.73472567, "epoch": 0.42234360996970277, "grad_norm": 5.625, "learning_rate": 9.306472110276132e-06, "loss": 1.04269972, "memory(GiB)": 142.32, "step": 37760, "train_speed(iter/s)": 0.289765 }, { "acc": 0.73557196, "epoch": 0.4225673089156613, "grad_norm": 5.75, "learning_rate": 9.305532113504116e-06, "loss": 1.04469461, "memory(GiB)": 142.32, "step": 37780, "train_speed(iter/s)": 0.289812 }, { "acc": 0.7317317, "epoch": 0.4227910078616198, "grad_norm": 5.34375, "learning_rate": 9.304591527673768e-06, "loss": 1.0701601, "memory(GiB)": 142.32, "step": 37800, "train_speed(iter/s)": 0.289864 }, { "acc": 0.7185976, "epoch": 0.42301470680757836, "grad_norm": 5.78125, "learning_rate": 9.303650352913769e-06, "loss": 1.11798534, "memory(GiB)": 142.32, "step": 37820, "train_speed(iter/s)": 0.289921 }, { "acc": 0.72389994, "epoch": 0.4232384057535369, "grad_norm": 5.71875, "learning_rate": 9.30270858935289e-06, "loss": 1.10074196, "memory(GiB)": 142.32, "step": 37840, "train_speed(iter/s)": 0.289969 }, { "acc": 0.71129198, "epoch": 0.4234621046994954, "grad_norm": 5.15625, "learning_rate": 9.301766237119975e-06, "loss": 1.14601698, "memory(GiB)": 142.32, "step": 37860, "train_speed(iter/s)": 0.290024 }, { "acc": 0.7215992, "epoch": 0.42368580364545394, "grad_norm": 5.40625, "learning_rate": 9.300823296343955e-06, "loss": 1.09329319, "memory(GiB)": 142.32, "step": 37880, "train_speed(iter/s)": 0.290074 }, { "acc": 0.71979098, "epoch": 0.42390950259141247, "grad_norm": 5.40625, "learning_rate": 9.299879767153834e-06, "loss": 1.12390261, "memory(GiB)": 142.32, "step": 37900, "train_speed(iter/s)": 0.290127 }, { "acc": 0.72600894, "epoch": 0.424133201537371, "grad_norm": 5.25, "learning_rate": 9.2989356496787e-06, "loss": 1.08617153, "memory(GiB)": 142.32, "step": 37920, "train_speed(iter/s)": 0.290181 }, { "acc": 0.73247972, "epoch": 0.4243569004833295, "grad_norm": 5.65625, "learning_rate": 9.297990944047724e-06, "loss": 1.06594563, "memory(GiB)": 142.32, "step": 37940, "train_speed(iter/s)": 0.290228 }, { "acc": 0.72398682, "epoch": 0.42458059942928805, "grad_norm": 5.65625, "learning_rate": 9.297045650390155e-06, "loss": 1.09478664, "memory(GiB)": 142.32, "step": 37960, "train_speed(iter/s)": 0.290278 }, { "acc": 0.73321228, "epoch": 0.4248042983752466, "grad_norm": 5.65625, "learning_rate": 9.29609976883532e-06, "loss": 1.05148869, "memory(GiB)": 142.32, "step": 37980, "train_speed(iter/s)": 0.29033 }, { "acc": 0.72654324, "epoch": 0.4250279973212051, "grad_norm": 5.96875, "learning_rate": 9.295153299512634e-06, "loss": 1.08133869, "memory(GiB)": 142.32, "step": 38000, "train_speed(iter/s)": 0.290377 }, { "epoch": 0.4250279973212051, "eval_acc": 0.68980212914225, "eval_loss": 1.1001887321472168, "eval_runtime": 2339.4031, "eval_samples_per_second": 32.18, "eval_steps_per_second": 16.09, "step": 38000 }, { "acc": 0.73616886, "epoch": 0.42525169626716364, "grad_norm": 6.59375, "learning_rate": 9.294206242551584e-06, "loss": 1.05515594, "memory(GiB)": 142.32, "step": 38020, "train_speed(iter/s)": 0.285221 }, { "acc": 0.71702876, "epoch": 0.42547539521312217, "grad_norm": 5.96875, "learning_rate": 9.29325859808174e-06, "loss": 1.1205102, "memory(GiB)": 142.32, "step": 38040, "train_speed(iter/s)": 0.285272 }, { "acc": 0.73096542, "epoch": 0.4256990941590807, "grad_norm": 5.1875, "learning_rate": 9.292310366232757e-06, "loss": 1.07062397, "memory(GiB)": 142.32, "step": 38060, "train_speed(iter/s)": 0.28532 }, { "acc": 0.72069683, "epoch": 0.4259227931050392, "grad_norm": 5.0, "learning_rate": 9.291361547134365e-06, "loss": 1.11074352, "memory(GiB)": 142.32, "step": 38080, "train_speed(iter/s)": 0.285369 }, { "acc": 0.72915564, "epoch": 0.42614649205099775, "grad_norm": 5.5625, "learning_rate": 9.290412140916373e-06, "loss": 1.09655914, "memory(GiB)": 142.32, "step": 38100, "train_speed(iter/s)": 0.28542 }, { "acc": 0.73687449, "epoch": 0.4263701909969563, "grad_norm": 7.59375, "learning_rate": 9.28946214770868e-06, "loss": 1.04010639, "memory(GiB)": 142.32, "step": 38120, "train_speed(iter/s)": 0.285471 }, { "acc": 0.73802967, "epoch": 0.4265938899429148, "grad_norm": 6.59375, "learning_rate": 9.28851156764125e-06, "loss": 1.04133911, "memory(GiB)": 142.32, "step": 38140, "train_speed(iter/s)": 0.285525 }, { "acc": 0.72711887, "epoch": 0.42681758888887333, "grad_norm": 6.1875, "learning_rate": 9.287560400844142e-06, "loss": 1.08975487, "memory(GiB)": 142.32, "step": 38160, "train_speed(iter/s)": 0.285574 }, { "acc": 0.73483968, "epoch": 0.42704128783483186, "grad_norm": 6.96875, "learning_rate": 9.286608647447489e-06, "loss": 1.05453186, "memory(GiB)": 142.32, "step": 38180, "train_speed(iter/s)": 0.285624 }, { "acc": 0.71887836, "epoch": 0.4272649867807904, "grad_norm": 4.9375, "learning_rate": 9.285656307581503e-06, "loss": 1.1264286, "memory(GiB)": 142.32, "step": 38200, "train_speed(iter/s)": 0.285675 }, { "acc": 0.7178421, "epoch": 0.4274886857267489, "grad_norm": 5.5, "learning_rate": 9.284703381376479e-06, "loss": 1.12518625, "memory(GiB)": 142.32, "step": 38220, "train_speed(iter/s)": 0.285724 }, { "acc": 0.72809119, "epoch": 0.42771238467270745, "grad_norm": 6.21875, "learning_rate": 9.283749868962787e-06, "loss": 1.08753757, "memory(GiB)": 142.32, "step": 38240, "train_speed(iter/s)": 0.285777 }, { "acc": 0.72880421, "epoch": 0.427936083618666, "grad_norm": 5.75, "learning_rate": 9.282795770470888e-06, "loss": 1.06875858, "memory(GiB)": 142.32, "step": 38260, "train_speed(iter/s)": 0.285834 }, { "acc": 0.72579722, "epoch": 0.4281597825646245, "grad_norm": 4.34375, "learning_rate": 9.281841086031309e-06, "loss": 1.10752163, "memory(GiB)": 142.32, "step": 38280, "train_speed(iter/s)": 0.285887 }, { "acc": 0.72639403, "epoch": 0.42838348151058303, "grad_norm": 6.125, "learning_rate": 9.280885815774669e-06, "loss": 1.09786644, "memory(GiB)": 142.32, "step": 38300, "train_speed(iter/s)": 0.285941 }, { "acc": 0.73034782, "epoch": 0.42860718045654156, "grad_norm": 5.375, "learning_rate": 9.279929959831662e-06, "loss": 1.08092613, "memory(GiB)": 142.32, "step": 38320, "train_speed(iter/s)": 0.28599 }, { "acc": 0.72321086, "epoch": 0.42883087940250014, "grad_norm": 6.9375, "learning_rate": 9.27897351833306e-06, "loss": 1.0948946, "memory(GiB)": 142.32, "step": 38340, "train_speed(iter/s)": 0.286035 }, { "acc": 0.72430449, "epoch": 0.42905457834845867, "grad_norm": 5.375, "learning_rate": 9.278016491409722e-06, "loss": 1.10089312, "memory(GiB)": 142.32, "step": 38360, "train_speed(iter/s)": 0.286088 }, { "acc": 0.72550135, "epoch": 0.4292782772944172, "grad_norm": 5.96875, "learning_rate": 9.27705887919258e-06, "loss": 1.09257488, "memory(GiB)": 142.32, "step": 38380, "train_speed(iter/s)": 0.286139 }, { "acc": 0.73094711, "epoch": 0.4295019762403757, "grad_norm": 5.21875, "learning_rate": 9.276100681812651e-06, "loss": 1.06191349, "memory(GiB)": 142.32, "step": 38400, "train_speed(iter/s)": 0.286184 }, { "acc": 0.72062626, "epoch": 0.42972567518633425, "grad_norm": 5.90625, "learning_rate": 9.27514189940103e-06, "loss": 1.11911488, "memory(GiB)": 142.32, "step": 38420, "train_speed(iter/s)": 0.286234 }, { "acc": 0.73695488, "epoch": 0.4299493741322928, "grad_norm": 5.125, "learning_rate": 9.274182532088888e-06, "loss": 1.05072784, "memory(GiB)": 142.32, "step": 38440, "train_speed(iter/s)": 0.286286 }, { "acc": 0.72684336, "epoch": 0.4301730730782513, "grad_norm": 5.46875, "learning_rate": 9.273222580007488e-06, "loss": 1.09098577, "memory(GiB)": 142.32, "step": 38460, "train_speed(iter/s)": 0.286344 }, { "acc": 0.72898483, "epoch": 0.43039677202420984, "grad_norm": 4.875, "learning_rate": 9.272262043288158e-06, "loss": 1.08487358, "memory(GiB)": 142.32, "step": 38480, "train_speed(iter/s)": 0.286402 }, { "acc": 0.71409063, "epoch": 0.43062047097016837, "grad_norm": 5.875, "learning_rate": 9.271300922062315e-06, "loss": 1.16041317, "memory(GiB)": 142.32, "step": 38500, "train_speed(iter/s)": 0.286457 }, { "acc": 0.73344212, "epoch": 0.4308441699161269, "grad_norm": 5.28125, "learning_rate": 9.270339216461457e-06, "loss": 1.05142813, "memory(GiB)": 142.32, "step": 38520, "train_speed(iter/s)": 0.286513 }, { "acc": 0.73079953, "epoch": 0.4310678688620854, "grad_norm": 6.59375, "learning_rate": 9.269376926617155e-06, "loss": 1.07924328, "memory(GiB)": 142.32, "step": 38540, "train_speed(iter/s)": 0.286564 }, { "acc": 0.73324022, "epoch": 0.43129156780804395, "grad_norm": 7.21875, "learning_rate": 9.268414052661068e-06, "loss": 1.05300064, "memory(GiB)": 142.32, "step": 38560, "train_speed(iter/s)": 0.286619 }, { "acc": 0.72942314, "epoch": 0.4315152667540025, "grad_norm": 5.90625, "learning_rate": 9.267450594724926e-06, "loss": 1.09067383, "memory(GiB)": 142.32, "step": 38580, "train_speed(iter/s)": 0.286673 }, { "acc": 0.73158402, "epoch": 0.431738965699961, "grad_norm": 5.5, "learning_rate": 9.26648655294055e-06, "loss": 1.05509424, "memory(GiB)": 142.32, "step": 38600, "train_speed(iter/s)": 0.286727 }, { "acc": 0.71055336, "epoch": 0.43196266464591954, "grad_norm": 6.46875, "learning_rate": 9.265521927439829e-06, "loss": 1.15967293, "memory(GiB)": 142.32, "step": 38620, "train_speed(iter/s)": 0.286773 }, { "acc": 0.73829532, "epoch": 0.43218636359187806, "grad_norm": 5.53125, "learning_rate": 9.264556718354742e-06, "loss": 1.03955116, "memory(GiB)": 142.32, "step": 38640, "train_speed(iter/s)": 0.28682 }, { "acc": 0.73925829, "epoch": 0.4324100625378366, "grad_norm": 6.15625, "learning_rate": 9.26359092581734e-06, "loss": 1.01863842, "memory(GiB)": 142.32, "step": 38660, "train_speed(iter/s)": 0.286867 }, { "acc": 0.72018814, "epoch": 0.4326337614837951, "grad_norm": 5.4375, "learning_rate": 9.262624549959759e-06, "loss": 1.11036587, "memory(GiB)": 142.32, "step": 38680, "train_speed(iter/s)": 0.286919 }, { "acc": 0.73482623, "epoch": 0.43285746042975365, "grad_norm": 5.9375, "learning_rate": 9.261657590914213e-06, "loss": 1.04505472, "memory(GiB)": 142.32, "step": 38700, "train_speed(iter/s)": 0.286973 }, { "acc": 0.72892962, "epoch": 0.4330811593757122, "grad_norm": 6.75, "learning_rate": 9.260690048812995e-06, "loss": 1.08883877, "memory(GiB)": 142.32, "step": 38720, "train_speed(iter/s)": 0.287022 }, { "acc": 0.7424633, "epoch": 0.4333048583216707, "grad_norm": 6.65625, "learning_rate": 9.259721923788479e-06, "loss": 1.01220741, "memory(GiB)": 142.32, "step": 38740, "train_speed(iter/s)": 0.287073 }, { "acc": 0.72840004, "epoch": 0.43352855726762923, "grad_norm": 5.4375, "learning_rate": 9.258753215973117e-06, "loss": 1.06791534, "memory(GiB)": 142.32, "step": 38760, "train_speed(iter/s)": 0.287126 }, { "acc": 0.71601663, "epoch": 0.43375225621358776, "grad_norm": 5.46875, "learning_rate": 9.257783925499447e-06, "loss": 1.12799854, "memory(GiB)": 142.32, "step": 38780, "train_speed(iter/s)": 0.287178 }, { "acc": 0.70720854, "epoch": 0.4339759551595463, "grad_norm": 4.5, "learning_rate": 9.256814052500074e-06, "loss": 1.18709259, "memory(GiB)": 142.32, "step": 38800, "train_speed(iter/s)": 0.287223 }, { "acc": 0.72359552, "epoch": 0.4341996541055048, "grad_norm": 5.5625, "learning_rate": 9.255843597107697e-06, "loss": 1.09205647, "memory(GiB)": 142.32, "step": 38820, "train_speed(iter/s)": 0.287273 }, { "acc": 0.72920361, "epoch": 0.43442335305146335, "grad_norm": 6.03125, "learning_rate": 9.254872559455086e-06, "loss": 1.07119827, "memory(GiB)": 142.32, "step": 38840, "train_speed(iter/s)": 0.287322 }, { "acc": 0.71916094, "epoch": 0.4346470519974219, "grad_norm": 5.125, "learning_rate": 9.253900939675092e-06, "loss": 1.11269226, "memory(GiB)": 142.32, "step": 38860, "train_speed(iter/s)": 0.287367 }, { "acc": 0.73101745, "epoch": 0.4348707509433804, "grad_norm": 7.53125, "learning_rate": 9.252928737900649e-06, "loss": 1.0645195, "memory(GiB)": 142.32, "step": 38880, "train_speed(iter/s)": 0.287418 }, { "acc": 0.72491713, "epoch": 0.43509444988933893, "grad_norm": 5.1875, "learning_rate": 9.251955954264764e-06, "loss": 1.09190922, "memory(GiB)": 142.32, "step": 38900, "train_speed(iter/s)": 0.287471 }, { "acc": 0.73911457, "epoch": 0.43531814883529746, "grad_norm": 6.34375, "learning_rate": 9.25098258890053e-06, "loss": 1.04598093, "memory(GiB)": 142.32, "step": 38920, "train_speed(iter/s)": 0.28752 }, { "acc": 0.72542076, "epoch": 0.435541847781256, "grad_norm": 6.71875, "learning_rate": 9.250008641941119e-06, "loss": 1.08698072, "memory(GiB)": 142.32, "step": 38940, "train_speed(iter/s)": 0.287568 }, { "acc": 0.73488855, "epoch": 0.4357655467272145, "grad_norm": 7.03125, "learning_rate": 9.249034113519778e-06, "loss": 1.04912701, "memory(GiB)": 142.32, "step": 38960, "train_speed(iter/s)": 0.287623 }, { "acc": 0.72411556, "epoch": 0.43598924567317304, "grad_norm": 5.9375, "learning_rate": 9.248059003769839e-06, "loss": 1.10492897, "memory(GiB)": 142.32, "step": 38980, "train_speed(iter/s)": 0.287674 }, { "acc": 0.73592863, "epoch": 0.43621294461913157, "grad_norm": 5.75, "learning_rate": 9.247083312824707e-06, "loss": 1.0477603, "memory(GiB)": 142.32, "step": 39000, "train_speed(iter/s)": 0.287731 }, { "acc": 0.74055686, "epoch": 0.4364366435650901, "grad_norm": 6.0, "learning_rate": 9.246107040817876e-06, "loss": 1.02820921, "memory(GiB)": 142.32, "step": 39020, "train_speed(iter/s)": 0.287786 }, { "acc": 0.73811655, "epoch": 0.4366603425110486, "grad_norm": 5.125, "learning_rate": 9.24513018788291e-06, "loss": 1.03811712, "memory(GiB)": 142.32, "step": 39040, "train_speed(iter/s)": 0.287833 }, { "acc": 0.72553182, "epoch": 0.43688404145700716, "grad_norm": 6.53125, "learning_rate": 9.244152754153454e-06, "loss": 1.0871067, "memory(GiB)": 142.32, "step": 39060, "train_speed(iter/s)": 0.28788 }, { "acc": 0.71749907, "epoch": 0.4371077404029657, "grad_norm": 6.1875, "learning_rate": 9.243174739763242e-06, "loss": 1.13051643, "memory(GiB)": 142.32, "step": 39080, "train_speed(iter/s)": 0.287928 }, { "acc": 0.7200386, "epoch": 0.4373314393489242, "grad_norm": 5.3125, "learning_rate": 9.242196144846076e-06, "loss": 1.11853199, "memory(GiB)": 142.32, "step": 39100, "train_speed(iter/s)": 0.287976 }, { "acc": 0.7188962, "epoch": 0.43755513829488274, "grad_norm": 5.3125, "learning_rate": 9.241216969535842e-06, "loss": 1.12420244, "memory(GiB)": 142.32, "step": 39120, "train_speed(iter/s)": 0.288023 }, { "acc": 0.7242085, "epoch": 0.43777883724084127, "grad_norm": 5.21875, "learning_rate": 9.240237213966507e-06, "loss": 1.10318604, "memory(GiB)": 142.32, "step": 39140, "train_speed(iter/s)": 0.288076 }, { "acc": 0.7482213, "epoch": 0.4380025361867998, "grad_norm": 5.78125, "learning_rate": 9.239256878272113e-06, "loss": 0.97703609, "memory(GiB)": 142.32, "step": 39160, "train_speed(iter/s)": 0.288127 }, { "acc": 0.72020984, "epoch": 0.4382262351327583, "grad_norm": 5.46875, "learning_rate": 9.238275962586785e-06, "loss": 1.10767136, "memory(GiB)": 142.32, "step": 39180, "train_speed(iter/s)": 0.288175 }, { "acc": 0.73627834, "epoch": 0.43844993407871685, "grad_norm": 6.1875, "learning_rate": 9.237294467044727e-06, "loss": 1.03847389, "memory(GiB)": 142.32, "step": 39200, "train_speed(iter/s)": 0.288224 }, { "acc": 0.72318869, "epoch": 0.4386736330246754, "grad_norm": 6.84375, "learning_rate": 9.23631239178022e-06, "loss": 1.1055254, "memory(GiB)": 142.32, "step": 39220, "train_speed(iter/s)": 0.288271 }, { "acc": 0.72367191, "epoch": 0.4388973319706339, "grad_norm": 6.28125, "learning_rate": 9.235329736927628e-06, "loss": 1.08985624, "memory(GiB)": 142.32, "step": 39240, "train_speed(iter/s)": 0.288321 }, { "acc": 0.71087093, "epoch": 0.43912103091659244, "grad_norm": 5.59375, "learning_rate": 9.23434650262139e-06, "loss": 1.16661377, "memory(GiB)": 142.32, "step": 39260, "train_speed(iter/s)": 0.288374 }, { "acc": 0.7308876, "epoch": 0.43934472986255096, "grad_norm": 6.1875, "learning_rate": 9.233362688996028e-06, "loss": 1.05905895, "memory(GiB)": 142.32, "step": 39280, "train_speed(iter/s)": 0.28843 }, { "acc": 0.72634244, "epoch": 0.4395684288085095, "grad_norm": 6.65625, "learning_rate": 9.232378296186142e-06, "loss": 1.09104805, "memory(GiB)": 142.32, "step": 39300, "train_speed(iter/s)": 0.288482 }, { "acc": 0.7177371, "epoch": 0.439792127754468, "grad_norm": 5.0625, "learning_rate": 9.23139332432641e-06, "loss": 1.14809933, "memory(GiB)": 142.32, "step": 39320, "train_speed(iter/s)": 0.288535 }, { "acc": 0.73585167, "epoch": 0.44001582670042655, "grad_norm": 6.1875, "learning_rate": 9.23040777355159e-06, "loss": 1.0697876, "memory(GiB)": 142.32, "step": 39340, "train_speed(iter/s)": 0.288585 }, { "acc": 0.73277545, "epoch": 0.4402395256463851, "grad_norm": 6.09375, "learning_rate": 9.229421643996521e-06, "loss": 1.05731268, "memory(GiB)": 142.32, "step": 39360, "train_speed(iter/s)": 0.288641 }, { "acc": 0.71812048, "epoch": 0.4404632245923436, "grad_norm": 5.53125, "learning_rate": 9.22843493579612e-06, "loss": 1.13159304, "memory(GiB)": 142.32, "step": 39380, "train_speed(iter/s)": 0.288687 }, { "acc": 0.72144613, "epoch": 0.44068692353830213, "grad_norm": 6.125, "learning_rate": 9.227447649085379e-06, "loss": 1.10237646, "memory(GiB)": 142.32, "step": 39400, "train_speed(iter/s)": 0.288736 }, { "acc": 0.71440916, "epoch": 0.44091062248426066, "grad_norm": 4.78125, "learning_rate": 9.226459783999378e-06, "loss": 1.14234905, "memory(GiB)": 142.32, "step": 39420, "train_speed(iter/s)": 0.288787 }, { "acc": 0.72471399, "epoch": 0.4411343214302192, "grad_norm": 7.09375, "learning_rate": 9.225471340673267e-06, "loss": 1.08650112, "memory(GiB)": 142.32, "step": 39440, "train_speed(iter/s)": 0.288836 }, { "acc": 0.72303691, "epoch": 0.4413580203761777, "grad_norm": 6.09375, "learning_rate": 9.224482319242281e-06, "loss": 1.10178452, "memory(GiB)": 142.32, "step": 39460, "train_speed(iter/s)": 0.288887 }, { "acc": 0.71746278, "epoch": 0.44158171932213625, "grad_norm": 5.0625, "learning_rate": 9.223492719841732e-06, "loss": 1.12750835, "memory(GiB)": 142.32, "step": 39480, "train_speed(iter/s)": 0.288937 }, { "acc": 0.73363695, "epoch": 0.4418054182680948, "grad_norm": 5.65625, "learning_rate": 9.22250254260701e-06, "loss": 1.06433191, "memory(GiB)": 142.32, "step": 39500, "train_speed(iter/s)": 0.288987 }, { "acc": 0.73390551, "epoch": 0.4420291172140533, "grad_norm": 5.6875, "learning_rate": 9.22151178767359e-06, "loss": 1.07475204, "memory(GiB)": 142.32, "step": 39520, "train_speed(iter/s)": 0.289039 }, { "acc": 0.73263378, "epoch": 0.44225281616001183, "grad_norm": 5.21875, "learning_rate": 9.220520455177016e-06, "loss": 1.06649761, "memory(GiB)": 142.32, "step": 39540, "train_speed(iter/s)": 0.289084 }, { "acc": 0.73161721, "epoch": 0.44247651510597036, "grad_norm": 5.15625, "learning_rate": 9.219528545252918e-06, "loss": 1.06901941, "memory(GiB)": 142.32, "step": 39560, "train_speed(iter/s)": 0.289132 }, { "acc": 0.73560133, "epoch": 0.4427002140519289, "grad_norm": 5.40625, "learning_rate": 9.218536058037004e-06, "loss": 1.03800049, "memory(GiB)": 142.32, "step": 39580, "train_speed(iter/s)": 0.289178 }, { "acc": 0.73626671, "epoch": 0.44292391299788747, "grad_norm": 5.46875, "learning_rate": 9.217542993665061e-06, "loss": 1.04262047, "memory(GiB)": 142.32, "step": 39600, "train_speed(iter/s)": 0.289225 }, { "acc": 0.72801857, "epoch": 0.443147611943846, "grad_norm": 6.75, "learning_rate": 9.216549352272954e-06, "loss": 1.09207401, "memory(GiB)": 142.32, "step": 39620, "train_speed(iter/s)": 0.289274 }, { "acc": 0.73232317, "epoch": 0.4433713108898045, "grad_norm": 5.15625, "learning_rate": 9.215555133996628e-06, "loss": 1.05764675, "memory(GiB)": 142.32, "step": 39640, "train_speed(iter/s)": 0.289323 }, { "acc": 0.72828636, "epoch": 0.44359500983576305, "grad_norm": 6.40625, "learning_rate": 9.214560338972105e-06, "loss": 1.07642355, "memory(GiB)": 142.32, "step": 39660, "train_speed(iter/s)": 0.289371 }, { "acc": 0.74190102, "epoch": 0.4438187087817216, "grad_norm": 6.0, "learning_rate": 9.213564967335488e-06, "loss": 1.02939854, "memory(GiB)": 142.32, "step": 39680, "train_speed(iter/s)": 0.289419 }, { "acc": 0.72914619, "epoch": 0.4440424077276801, "grad_norm": 6.78125, "learning_rate": 9.212569019222956e-06, "loss": 1.06743355, "memory(GiB)": 142.32, "step": 39700, "train_speed(iter/s)": 0.289472 }, { "acc": 0.73705873, "epoch": 0.44426610667363864, "grad_norm": 6.1875, "learning_rate": 9.211572494770772e-06, "loss": 1.03830776, "memory(GiB)": 142.32, "step": 39720, "train_speed(iter/s)": 0.28952 }, { "acc": 0.72826295, "epoch": 0.44448980561959717, "grad_norm": 5.9375, "learning_rate": 9.210575394115273e-06, "loss": 1.09521828, "memory(GiB)": 142.32, "step": 39740, "train_speed(iter/s)": 0.289571 }, { "acc": 0.72882786, "epoch": 0.4447135045655557, "grad_norm": 5.78125, "learning_rate": 9.209577717392879e-06, "loss": 1.09214191, "memory(GiB)": 142.32, "step": 39760, "train_speed(iter/s)": 0.289614 }, { "acc": 0.72064695, "epoch": 0.4449372035115142, "grad_norm": 5.59375, "learning_rate": 9.208579464740083e-06, "loss": 1.11881695, "memory(GiB)": 142.32, "step": 39780, "train_speed(iter/s)": 0.289662 }, { "acc": 0.72803659, "epoch": 0.44516090245747275, "grad_norm": 5.9375, "learning_rate": 9.207580636293462e-06, "loss": 1.07512932, "memory(GiB)": 142.32, "step": 39800, "train_speed(iter/s)": 0.289713 }, { "acc": 0.72680588, "epoch": 0.4453846014034313, "grad_norm": 5.71875, "learning_rate": 9.206581232189668e-06, "loss": 1.09458351, "memory(GiB)": 142.32, "step": 39820, "train_speed(iter/s)": 0.289759 }, { "acc": 0.72546473, "epoch": 0.4456083003493898, "grad_norm": 5.6875, "learning_rate": 9.205581252565438e-06, "loss": 1.0863081, "memory(GiB)": 142.32, "step": 39840, "train_speed(iter/s)": 0.289809 }, { "acc": 0.7245717, "epoch": 0.44583199929534834, "grad_norm": 6.125, "learning_rate": 9.20458069755758e-06, "loss": 1.08079166, "memory(GiB)": 142.32, "step": 39860, "train_speed(iter/s)": 0.289864 }, { "acc": 0.72414198, "epoch": 0.44605569824130686, "grad_norm": 5.8125, "learning_rate": 9.203579567302987e-06, "loss": 1.07913284, "memory(GiB)": 142.32, "step": 39880, "train_speed(iter/s)": 0.289916 }, { "acc": 0.73297319, "epoch": 0.4462793971872654, "grad_norm": 5.96875, "learning_rate": 9.202577861938624e-06, "loss": 1.06533852, "memory(GiB)": 142.32, "step": 39900, "train_speed(iter/s)": 0.289958 }, { "acc": 0.7317317, "epoch": 0.4465030961332239, "grad_norm": 6.4375, "learning_rate": 9.201575581601541e-06, "loss": 1.07656765, "memory(GiB)": 142.32, "step": 39920, "train_speed(iter/s)": 0.290008 }, { "acc": 0.72937098, "epoch": 0.44672679507918245, "grad_norm": 4.21875, "learning_rate": 9.200572726428865e-06, "loss": 1.09605694, "memory(GiB)": 142.32, "step": 39940, "train_speed(iter/s)": 0.290055 }, { "acc": 0.72217655, "epoch": 0.446950494025141, "grad_norm": 5.46875, "learning_rate": 9.1995692965578e-06, "loss": 1.12257175, "memory(GiB)": 142.32, "step": 39960, "train_speed(iter/s)": 0.290106 }, { "acc": 0.71821346, "epoch": 0.4471741929710995, "grad_norm": 5.96875, "learning_rate": 9.198565292125627e-06, "loss": 1.1154294, "memory(GiB)": 142.32, "step": 39980, "train_speed(iter/s)": 0.290157 }, { "acc": 0.72367597, "epoch": 0.44739789191705803, "grad_norm": 6.46875, "learning_rate": 9.197560713269713e-06, "loss": 1.1192625, "memory(GiB)": 142.32, "step": 40000, "train_speed(iter/s)": 0.290201 }, { "epoch": 0.44739789191705803, "eval_acc": 0.6902824757490035, "eval_loss": 1.0984110832214355, "eval_runtime": 2339.8094, "eval_samples_per_second": 32.175, "eval_steps_per_second": 16.088, "step": 40000 }, { "acc": 0.72801828, "epoch": 0.44762159086301656, "grad_norm": 5.125, "learning_rate": 9.196555560127499e-06, "loss": 1.09010963, "memory(GiB)": 142.32, "step": 40020, "train_speed(iter/s)": 0.285296 }, { "acc": 0.73455534, "epoch": 0.4478452898089751, "grad_norm": 6.34375, "learning_rate": 9.195549832836497e-06, "loss": 1.04834843, "memory(GiB)": 142.32, "step": 40040, "train_speed(iter/s)": 0.285344 }, { "acc": 0.73208008, "epoch": 0.4480689887549336, "grad_norm": 5.8125, "learning_rate": 9.194543531534312e-06, "loss": 1.06319046, "memory(GiB)": 142.32, "step": 40060, "train_speed(iter/s)": 0.285392 }, { "acc": 0.72498188, "epoch": 0.44829268770089215, "grad_norm": 5.34375, "learning_rate": 9.193536656358617e-06, "loss": 1.11246243, "memory(GiB)": 142.32, "step": 40080, "train_speed(iter/s)": 0.285443 }, { "acc": 0.71330824, "epoch": 0.4485163866468507, "grad_norm": 6.21875, "learning_rate": 9.19252920744717e-06, "loss": 1.1417366, "memory(GiB)": 142.32, "step": 40100, "train_speed(iter/s)": 0.285482 }, { "acc": 0.72760496, "epoch": 0.4487400855928092, "grad_norm": 4.875, "learning_rate": 9.1915211849378e-06, "loss": 1.08367786, "memory(GiB)": 142.32, "step": 40120, "train_speed(iter/s)": 0.285528 }, { "acc": 0.73653917, "epoch": 0.44896378453876773, "grad_norm": 6.09375, "learning_rate": 9.190512588968423e-06, "loss": 1.04510832, "memory(GiB)": 142.32, "step": 40140, "train_speed(iter/s)": 0.285579 }, { "acc": 0.72732897, "epoch": 0.44918748348472626, "grad_norm": 6.53125, "learning_rate": 9.189503419677026e-06, "loss": 1.07637186, "memory(GiB)": 142.32, "step": 40160, "train_speed(iter/s)": 0.285626 }, { "acc": 0.73866606, "epoch": 0.4494111824306848, "grad_norm": 4.90625, "learning_rate": 9.18849367720168e-06, "loss": 1.03247948, "memory(GiB)": 142.32, "step": 40180, "train_speed(iter/s)": 0.285674 }, { "acc": 0.72124243, "epoch": 0.4496348813766433, "grad_norm": 5.03125, "learning_rate": 9.187483361680534e-06, "loss": 1.13660984, "memory(GiB)": 142.32, "step": 40200, "train_speed(iter/s)": 0.285716 }, { "acc": 0.7319768, "epoch": 0.44985858032260184, "grad_norm": 6.46875, "learning_rate": 9.186472473251808e-06, "loss": 1.07041264, "memory(GiB)": 142.32, "step": 40220, "train_speed(iter/s)": 0.285757 }, { "acc": 0.71378622, "epoch": 0.45008227926856037, "grad_norm": 5.53125, "learning_rate": 9.18546101205381e-06, "loss": 1.14845457, "memory(GiB)": 142.32, "step": 40240, "train_speed(iter/s)": 0.285807 }, { "acc": 0.73625388, "epoch": 0.4503059782145189, "grad_norm": 5.21875, "learning_rate": 9.184448978224923e-06, "loss": 1.05296459, "memory(GiB)": 142.32, "step": 40260, "train_speed(iter/s)": 0.285858 }, { "acc": 0.72179966, "epoch": 0.4505296771604774, "grad_norm": 8.0, "learning_rate": 9.183436371903605e-06, "loss": 1.09720516, "memory(GiB)": 142.32, "step": 40280, "train_speed(iter/s)": 0.28591 }, { "acc": 0.73630419, "epoch": 0.45075337610643595, "grad_norm": 5.84375, "learning_rate": 9.182423193228397e-06, "loss": 1.04377708, "memory(GiB)": 142.32, "step": 40300, "train_speed(iter/s)": 0.285957 }, { "acc": 0.71939278, "epoch": 0.4509770750523945, "grad_norm": 4.84375, "learning_rate": 9.181409442337913e-06, "loss": 1.13190508, "memory(GiB)": 142.32, "step": 40320, "train_speed(iter/s)": 0.286002 }, { "acc": 0.73389864, "epoch": 0.451200773998353, "grad_norm": 7.03125, "learning_rate": 9.180395119370853e-06, "loss": 1.05431309, "memory(GiB)": 142.32, "step": 40340, "train_speed(iter/s)": 0.286055 }, { "acc": 0.72132015, "epoch": 0.45142447294431154, "grad_norm": 5.75, "learning_rate": 9.179380224465988e-06, "loss": 1.10632343, "memory(GiB)": 142.32, "step": 40360, "train_speed(iter/s)": 0.286104 }, { "acc": 0.71599369, "epoch": 0.45164817189027007, "grad_norm": 5.46875, "learning_rate": 9.178364757762173e-06, "loss": 1.14168596, "memory(GiB)": 142.32, "step": 40380, "train_speed(iter/s)": 0.286156 }, { "acc": 0.72465057, "epoch": 0.4518718708362286, "grad_norm": 6.71875, "learning_rate": 9.177348719398335e-06, "loss": 1.09423332, "memory(GiB)": 142.32, "step": 40400, "train_speed(iter/s)": 0.286205 }, { "acc": 0.72619452, "epoch": 0.4520955697821871, "grad_norm": 6.625, "learning_rate": 9.176332109513486e-06, "loss": 1.08133945, "memory(GiB)": 142.32, "step": 40420, "train_speed(iter/s)": 0.286255 }, { "acc": 0.74534998, "epoch": 0.45231926872814565, "grad_norm": 6.46875, "learning_rate": 9.17531492824671e-06, "loss": 1.01109562, "memory(GiB)": 142.32, "step": 40440, "train_speed(iter/s)": 0.286298 }, { "acc": 0.73588772, "epoch": 0.4525429676741042, "grad_norm": 6.09375, "learning_rate": 9.174297175737173e-06, "loss": 1.04307833, "memory(GiB)": 142.32, "step": 40460, "train_speed(iter/s)": 0.286349 }, { "acc": 0.72883248, "epoch": 0.4527666666200627, "grad_norm": 5.84375, "learning_rate": 9.173278852124117e-06, "loss": 1.0898756, "memory(GiB)": 142.32, "step": 40480, "train_speed(iter/s)": 0.286397 }, { "acc": 0.72263956, "epoch": 0.45299036556602124, "grad_norm": 5.21875, "learning_rate": 9.172259957546865e-06, "loss": 1.10513573, "memory(GiB)": 142.32, "step": 40500, "train_speed(iter/s)": 0.286437 }, { "acc": 0.71830535, "epoch": 0.45321406451197976, "grad_norm": 5.65625, "learning_rate": 9.171240492144815e-06, "loss": 1.10703278, "memory(GiB)": 142.32, "step": 40520, "train_speed(iter/s)": 0.286484 }, { "acc": 0.72394857, "epoch": 0.4534377634579383, "grad_norm": 5.0625, "learning_rate": 9.170220456057444e-06, "loss": 1.11889658, "memory(GiB)": 142.32, "step": 40540, "train_speed(iter/s)": 0.286533 }, { "acc": 0.72464414, "epoch": 0.4536614624038968, "grad_norm": 5.625, "learning_rate": 9.16919984942431e-06, "loss": 1.09021053, "memory(GiB)": 142.32, "step": 40560, "train_speed(iter/s)": 0.286578 }, { "acc": 0.72125955, "epoch": 0.45388516134985535, "grad_norm": 6.46875, "learning_rate": 9.168178672385045e-06, "loss": 1.1166769, "memory(GiB)": 142.32, "step": 40580, "train_speed(iter/s)": 0.28663 }, { "acc": 0.73048077, "epoch": 0.4541088602958139, "grad_norm": 5.40625, "learning_rate": 9.16715692507936e-06, "loss": 1.06510773, "memory(GiB)": 142.32, "step": 40600, "train_speed(iter/s)": 0.286678 }, { "acc": 0.72387118, "epoch": 0.4543325592417724, "grad_norm": 5.65625, "learning_rate": 9.166134607647045e-06, "loss": 1.09173489, "memory(GiB)": 142.32, "step": 40620, "train_speed(iter/s)": 0.286728 }, { "acc": 0.7245194, "epoch": 0.45455625818773093, "grad_norm": 6.78125, "learning_rate": 9.165111720227968e-06, "loss": 1.10409622, "memory(GiB)": 142.32, "step": 40640, "train_speed(iter/s)": 0.286773 }, { "acc": 0.72350202, "epoch": 0.45477995713368946, "grad_norm": 5.25, "learning_rate": 9.164088262962074e-06, "loss": 1.10102749, "memory(GiB)": 142.32, "step": 40660, "train_speed(iter/s)": 0.286821 }, { "acc": 0.72954483, "epoch": 0.455003656079648, "grad_norm": 5.78125, "learning_rate": 9.163064235989388e-06, "loss": 1.08176327, "memory(GiB)": 142.32, "step": 40680, "train_speed(iter/s)": 0.286866 }, { "acc": 0.73474865, "epoch": 0.4552273550256065, "grad_norm": 5.53125, "learning_rate": 9.162039639450012e-06, "loss": 1.05310688, "memory(GiB)": 142.32, "step": 40700, "train_speed(iter/s)": 0.286916 }, { "acc": 0.71687555, "epoch": 0.45545105397156505, "grad_norm": 6.53125, "learning_rate": 9.161014473484122e-06, "loss": 1.14009399, "memory(GiB)": 142.32, "step": 40720, "train_speed(iter/s)": 0.286961 }, { "acc": 0.73234434, "epoch": 0.4556747529175236, "grad_norm": 5.9375, "learning_rate": 9.159988738231978e-06, "loss": 1.06671619, "memory(GiB)": 142.32, "step": 40740, "train_speed(iter/s)": 0.287003 }, { "acc": 0.7444201, "epoch": 0.4558984518634821, "grad_norm": 4.71875, "learning_rate": 9.158962433833914e-06, "loss": 1.01792717, "memory(GiB)": 142.32, "step": 40760, "train_speed(iter/s)": 0.287051 }, { "acc": 0.71076946, "epoch": 0.45612215080944063, "grad_norm": 6.21875, "learning_rate": 9.157935560430344e-06, "loss": 1.1577858, "memory(GiB)": 142.32, "step": 40780, "train_speed(iter/s)": 0.2871 }, { "acc": 0.7190155, "epoch": 0.45634584975539916, "grad_norm": 6.25, "learning_rate": 9.156908118161759e-06, "loss": 1.13471317, "memory(GiB)": 142.32, "step": 40800, "train_speed(iter/s)": 0.287146 }, { "acc": 0.72073116, "epoch": 0.4565695487013577, "grad_norm": 5.90625, "learning_rate": 9.155880107168728e-06, "loss": 1.10749989, "memory(GiB)": 142.32, "step": 40820, "train_speed(iter/s)": 0.287199 }, { "acc": 0.7192935, "epoch": 0.45679324764731627, "grad_norm": 6.5, "learning_rate": 9.154851527591897e-06, "loss": 1.12565117, "memory(GiB)": 142.32, "step": 40840, "train_speed(iter/s)": 0.287242 }, { "acc": 0.72649174, "epoch": 0.4570169465932748, "grad_norm": 5.15625, "learning_rate": 9.15382237957199e-06, "loss": 1.08095951, "memory(GiB)": 142.32, "step": 40860, "train_speed(iter/s)": 0.287292 }, { "acc": 0.72576447, "epoch": 0.4572406455392333, "grad_norm": 5.84375, "learning_rate": 9.15279266324981e-06, "loss": 1.08933029, "memory(GiB)": 142.32, "step": 40880, "train_speed(iter/s)": 0.287334 }, { "acc": 0.726404, "epoch": 0.45746434448519185, "grad_norm": 6.28125, "learning_rate": 9.151762378766236e-06, "loss": 1.0868413, "memory(GiB)": 142.32, "step": 40900, "train_speed(iter/s)": 0.287388 }, { "acc": 0.72486057, "epoch": 0.4576880434311504, "grad_norm": 6.0625, "learning_rate": 9.150731526262226e-06, "loss": 1.10790834, "memory(GiB)": 142.32, "step": 40920, "train_speed(iter/s)": 0.28743 }, { "acc": 0.72776508, "epoch": 0.4579117423771089, "grad_norm": 6.0625, "learning_rate": 9.149700105878818e-06, "loss": 1.08456841, "memory(GiB)": 142.32, "step": 40940, "train_speed(iter/s)": 0.287481 }, { "acc": 0.7245553, "epoch": 0.45813544132306744, "grad_norm": 6.84375, "learning_rate": 9.148668117757121e-06, "loss": 1.10955849, "memory(GiB)": 142.32, "step": 40960, "train_speed(iter/s)": 0.287529 }, { "acc": 0.7272902, "epoch": 0.45835914026902597, "grad_norm": 6.5625, "learning_rate": 9.147635562038327e-06, "loss": 1.07642136, "memory(GiB)": 142.32, "step": 40980, "train_speed(iter/s)": 0.287578 }, { "acc": 0.73241177, "epoch": 0.4585828392149845, "grad_norm": 5.25, "learning_rate": 9.146602438863705e-06, "loss": 1.06776638, "memory(GiB)": 142.32, "step": 41000, "train_speed(iter/s)": 0.287625 }, { "acc": 0.72022686, "epoch": 0.458806538160943, "grad_norm": 5.9375, "learning_rate": 9.1455687483746e-06, "loss": 1.11362133, "memory(GiB)": 142.32, "step": 41020, "train_speed(iter/s)": 0.287669 }, { "acc": 0.73097353, "epoch": 0.45903023710690155, "grad_norm": 6.0625, "learning_rate": 9.144534490712438e-06, "loss": 1.05675735, "memory(GiB)": 142.32, "step": 41040, "train_speed(iter/s)": 0.287713 }, { "acc": 0.72650237, "epoch": 0.4592539360528601, "grad_norm": 5.3125, "learning_rate": 9.143499666018719e-06, "loss": 1.09141712, "memory(GiB)": 142.32, "step": 41060, "train_speed(iter/s)": 0.287763 }, { "acc": 0.7295023, "epoch": 0.4594776349988186, "grad_norm": 6.15625, "learning_rate": 9.142464274435018e-06, "loss": 1.0803463, "memory(GiB)": 142.32, "step": 41080, "train_speed(iter/s)": 0.287818 }, { "acc": 0.73641214, "epoch": 0.45970133394477714, "grad_norm": 5.3125, "learning_rate": 9.141428316102998e-06, "loss": 1.04762735, "memory(GiB)": 142.32, "step": 41100, "train_speed(iter/s)": 0.287868 }, { "acc": 0.72519016, "epoch": 0.45992503289073566, "grad_norm": 4.84375, "learning_rate": 9.140391791164389e-06, "loss": 1.08684826, "memory(GiB)": 142.32, "step": 41120, "train_speed(iter/s)": 0.287917 }, { "acc": 0.72822437, "epoch": 0.4601487318366942, "grad_norm": 7.59375, "learning_rate": 9.139354699761003e-06, "loss": 1.08556137, "memory(GiB)": 142.32, "step": 41140, "train_speed(iter/s)": 0.28796 }, { "acc": 0.72643127, "epoch": 0.4603724307826527, "grad_norm": 6.46875, "learning_rate": 9.138317042034728e-06, "loss": 1.08679714, "memory(GiB)": 142.32, "step": 41160, "train_speed(iter/s)": 0.288007 }, { "acc": 0.73178492, "epoch": 0.46059612972861125, "grad_norm": 6.34375, "learning_rate": 9.137278818127532e-06, "loss": 1.0671196, "memory(GiB)": 142.32, "step": 41180, "train_speed(iter/s)": 0.288053 }, { "acc": 0.72921848, "epoch": 0.4608198286745698, "grad_norm": 4.96875, "learning_rate": 9.13624002818146e-06, "loss": 1.06736431, "memory(GiB)": 142.32, "step": 41200, "train_speed(iter/s)": 0.288102 }, { "acc": 0.72338562, "epoch": 0.4610435276205283, "grad_norm": 8.4375, "learning_rate": 9.135200672338631e-06, "loss": 1.08737793, "memory(GiB)": 142.32, "step": 41220, "train_speed(iter/s)": 0.288151 }, { "acc": 0.72546406, "epoch": 0.46126722656648683, "grad_norm": 5.78125, "learning_rate": 9.134160750741243e-06, "loss": 1.09497356, "memory(GiB)": 142.32, "step": 41240, "train_speed(iter/s)": 0.288201 }, { "acc": 0.725949, "epoch": 0.46149092551244536, "grad_norm": 6.4375, "learning_rate": 9.133120263531576e-06, "loss": 1.06923275, "memory(GiB)": 142.32, "step": 41260, "train_speed(iter/s)": 0.28824 }, { "acc": 0.72133584, "epoch": 0.4617146244584039, "grad_norm": 6.78125, "learning_rate": 9.132079210851979e-06, "loss": 1.11577568, "memory(GiB)": 142.32, "step": 41280, "train_speed(iter/s)": 0.288288 }, { "acc": 0.72373104, "epoch": 0.4619383234043624, "grad_norm": 5.46875, "learning_rate": 9.131037592844884e-06, "loss": 1.10898085, "memory(GiB)": 142.32, "step": 41300, "train_speed(iter/s)": 0.288339 }, { "acc": 0.72902203, "epoch": 0.46216202235032094, "grad_norm": 5.8125, "learning_rate": 9.129995409652803e-06, "loss": 1.07640171, "memory(GiB)": 142.32, "step": 41320, "train_speed(iter/s)": 0.288388 }, { "acc": 0.72964878, "epoch": 0.4623857212962795, "grad_norm": 5.59375, "learning_rate": 9.128952661418317e-06, "loss": 1.06004009, "memory(GiB)": 142.32, "step": 41340, "train_speed(iter/s)": 0.288431 }, { "acc": 0.72865558, "epoch": 0.462609420242238, "grad_norm": 5.46875, "learning_rate": 9.12790934828409e-06, "loss": 1.08392982, "memory(GiB)": 142.32, "step": 41360, "train_speed(iter/s)": 0.288477 }, { "acc": 0.74296479, "epoch": 0.46283311918819653, "grad_norm": 5.375, "learning_rate": 9.126865470392864e-06, "loss": 1.01516924, "memory(GiB)": 142.32, "step": 41380, "train_speed(iter/s)": 0.288525 }, { "acc": 0.72494459, "epoch": 0.46305681813415506, "grad_norm": 6.3125, "learning_rate": 9.125821027887454e-06, "loss": 1.0742919, "memory(GiB)": 142.32, "step": 41400, "train_speed(iter/s)": 0.288573 }, { "acc": 0.73049793, "epoch": 0.4632805170801136, "grad_norm": 5.71875, "learning_rate": 9.124776020910757e-06, "loss": 1.07011852, "memory(GiB)": 142.32, "step": 41420, "train_speed(iter/s)": 0.28862 }, { "acc": 0.73379774, "epoch": 0.4635042160260721, "grad_norm": 5.34375, "learning_rate": 9.123730449605743e-06, "loss": 1.06094179, "memory(GiB)": 142.32, "step": 41440, "train_speed(iter/s)": 0.28867 }, { "acc": 0.71675119, "epoch": 0.46372791497203064, "grad_norm": 6.0, "learning_rate": 9.122684314115461e-06, "loss": 1.14712591, "memory(GiB)": 142.32, "step": 41460, "train_speed(iter/s)": 0.288718 }, { "acc": 0.71887817, "epoch": 0.46395161391798917, "grad_norm": 6.0, "learning_rate": 9.121637614583041e-06, "loss": 1.12220821, "memory(GiB)": 142.32, "step": 41480, "train_speed(iter/s)": 0.288761 }, { "acc": 0.73482513, "epoch": 0.4641753128639477, "grad_norm": 5.34375, "learning_rate": 9.12059035115168e-06, "loss": 1.05715847, "memory(GiB)": 142.32, "step": 41500, "train_speed(iter/s)": 0.288806 }, { "acc": 0.72504053, "epoch": 0.4643990118099062, "grad_norm": 5.34375, "learning_rate": 9.119542523964665e-06, "loss": 1.09752979, "memory(GiB)": 142.32, "step": 41520, "train_speed(iter/s)": 0.288851 }, { "acc": 0.73238249, "epoch": 0.46462271075586475, "grad_norm": 5.5, "learning_rate": 9.118494133165349e-06, "loss": 1.06176872, "memory(GiB)": 142.32, "step": 41540, "train_speed(iter/s)": 0.288897 }, { "acc": 0.73652864, "epoch": 0.4648464097018233, "grad_norm": 7.15625, "learning_rate": 9.11744517889717e-06, "loss": 1.02689981, "memory(GiB)": 142.32, "step": 41560, "train_speed(iter/s)": 0.288942 }, { "acc": 0.73877263, "epoch": 0.4650701086477818, "grad_norm": 6.0625, "learning_rate": 9.11639566130364e-06, "loss": 1.04334736, "memory(GiB)": 142.32, "step": 41580, "train_speed(iter/s)": 0.288988 }, { "acc": 0.72364707, "epoch": 0.46529380759374034, "grad_norm": 6.65625, "learning_rate": 9.115345580528342e-06, "loss": 1.08694458, "memory(GiB)": 142.32, "step": 41600, "train_speed(iter/s)": 0.289032 }, { "acc": 0.71361876, "epoch": 0.46551750653969887, "grad_norm": 6.84375, "learning_rate": 9.114294936714951e-06, "loss": 1.16274166, "memory(GiB)": 142.32, "step": 41620, "train_speed(iter/s)": 0.289078 }, { "acc": 0.72981172, "epoch": 0.4657412054856574, "grad_norm": 6.71875, "learning_rate": 9.113243730007204e-06, "loss": 1.07733831, "memory(GiB)": 142.32, "step": 41640, "train_speed(iter/s)": 0.289127 }, { "acc": 0.72684174, "epoch": 0.4659649044316159, "grad_norm": 5.65625, "learning_rate": 9.112191960548924e-06, "loss": 1.08554497, "memory(GiB)": 142.32, "step": 41660, "train_speed(iter/s)": 0.289175 }, { "acc": 0.72705793, "epoch": 0.46618860337757445, "grad_norm": 5.4375, "learning_rate": 9.111139628484005e-06, "loss": 1.09583235, "memory(GiB)": 142.32, "step": 41680, "train_speed(iter/s)": 0.289222 }, { "acc": 0.7126997, "epoch": 0.466412302323533, "grad_norm": 5.625, "learning_rate": 9.110086733956425e-06, "loss": 1.1349782, "memory(GiB)": 142.32, "step": 41700, "train_speed(iter/s)": 0.289272 }, { "acc": 0.72039218, "epoch": 0.4666360012694915, "grad_norm": 6.28125, "learning_rate": 9.109033277110233e-06, "loss": 1.13993635, "memory(GiB)": 142.32, "step": 41720, "train_speed(iter/s)": 0.289318 }, { "acc": 0.73481693, "epoch": 0.46685970021545004, "grad_norm": 5.71875, "learning_rate": 9.107979258089556e-06, "loss": 1.05293112, "memory(GiB)": 142.32, "step": 41740, "train_speed(iter/s)": 0.289364 }, { "acc": 0.73879447, "epoch": 0.46708339916140856, "grad_norm": 6.15625, "learning_rate": 9.106924677038601e-06, "loss": 1.0236021, "memory(GiB)": 142.32, "step": 41760, "train_speed(iter/s)": 0.28941 }, { "acc": 0.7362905, "epoch": 0.4673070981073671, "grad_norm": 6.15625, "learning_rate": 9.105869534101648e-06, "loss": 1.03922405, "memory(GiB)": 142.32, "step": 41780, "train_speed(iter/s)": 0.289453 }, { "acc": 0.72647657, "epoch": 0.4675307970533256, "grad_norm": 6.78125, "learning_rate": 9.104813829423056e-06, "loss": 1.09434385, "memory(GiB)": 142.32, "step": 41800, "train_speed(iter/s)": 0.289495 }, { "acc": 0.73434191, "epoch": 0.46775449599928415, "grad_norm": 6.03125, "learning_rate": 9.103757563147261e-06, "loss": 1.05660391, "memory(GiB)": 142.32, "step": 41820, "train_speed(iter/s)": 0.289539 }, { "acc": 0.73747792, "epoch": 0.4679781949452427, "grad_norm": 6.625, "learning_rate": 9.102700735418777e-06, "loss": 1.01585121, "memory(GiB)": 142.32, "step": 41840, "train_speed(iter/s)": 0.289586 }, { "acc": 0.74288073, "epoch": 0.4682018938912012, "grad_norm": 5.09375, "learning_rate": 9.10164334638219e-06, "loss": 1.00241528, "memory(GiB)": 142.32, "step": 41860, "train_speed(iter/s)": 0.289638 }, { "acc": 0.7267777, "epoch": 0.46842559283715973, "grad_norm": 5.65625, "learning_rate": 9.100585396182166e-06, "loss": 1.08577557, "memory(GiB)": 142.32, "step": 41880, "train_speed(iter/s)": 0.289687 }, { "acc": 0.73129053, "epoch": 0.46864929178311826, "grad_norm": 6.1875, "learning_rate": 9.099526884963451e-06, "loss": 1.07344866, "memory(GiB)": 142.32, "step": 41900, "train_speed(iter/s)": 0.28973 }, { "acc": 0.71920209, "epoch": 0.4688729907290768, "grad_norm": 6.15625, "learning_rate": 9.09846781287086e-06, "loss": 1.11858463, "memory(GiB)": 142.32, "step": 41920, "train_speed(iter/s)": 0.289776 }, { "acc": 0.72196264, "epoch": 0.4690966896750353, "grad_norm": 5.9375, "learning_rate": 9.097408180049295e-06, "loss": 1.10837498, "memory(GiB)": 142.32, "step": 41940, "train_speed(iter/s)": 0.289819 }, { "acc": 0.73088636, "epoch": 0.46932038862099384, "grad_norm": 6.21875, "learning_rate": 9.096347986643723e-06, "loss": 1.06489363, "memory(GiB)": 142.32, "step": 41960, "train_speed(iter/s)": 0.289864 }, { "acc": 0.72146578, "epoch": 0.4695440875669524, "grad_norm": 5.65625, "learning_rate": 9.095287232799196e-06, "loss": 1.12943201, "memory(GiB)": 142.32, "step": 41980, "train_speed(iter/s)": 0.289904 }, { "acc": 0.73004603, "epoch": 0.4697677865129109, "grad_norm": 6.03125, "learning_rate": 9.094225918660842e-06, "loss": 1.06626301, "memory(GiB)": 142.32, "step": 42000, "train_speed(iter/s)": 0.28995 }, { "epoch": 0.4697677865129109, "eval_acc": 0.690654221824694, "eval_loss": 1.0966755151748657, "eval_runtime": 2342.3709, "eval_samples_per_second": 32.14, "eval_steps_per_second": 16.07, "step": 42000 }, { "acc": 0.72902904, "epoch": 0.46999148545886943, "grad_norm": 6.28125, "learning_rate": 9.093164044373862e-06, "loss": 1.07666435, "memory(GiB)": 142.32, "step": 42020, "train_speed(iter/s)": 0.285278 }, { "acc": 0.7277225, "epoch": 0.47021518440482796, "grad_norm": 6.125, "learning_rate": 9.092101610083534e-06, "loss": 1.08685188, "memory(GiB)": 142.32, "step": 42040, "train_speed(iter/s)": 0.285327 }, { "acc": 0.72469473, "epoch": 0.4704388833507865, "grad_norm": 5.71875, "learning_rate": 9.091038615935217e-06, "loss": 1.10484304, "memory(GiB)": 142.32, "step": 42060, "train_speed(iter/s)": 0.285374 }, { "acc": 0.72640066, "epoch": 0.470662582296745, "grad_norm": 5.53125, "learning_rate": 9.089975062074345e-06, "loss": 1.07764893, "memory(GiB)": 142.32, "step": 42080, "train_speed(iter/s)": 0.285419 }, { "acc": 0.73375187, "epoch": 0.4708862812427036, "grad_norm": 6.5625, "learning_rate": 9.088910948646424e-06, "loss": 1.05677662, "memory(GiB)": 142.32, "step": 42100, "train_speed(iter/s)": 0.285463 }, { "acc": 0.72910728, "epoch": 0.4711099801886621, "grad_norm": 5.5, "learning_rate": 9.08784627579704e-06, "loss": 1.08137245, "memory(GiB)": 142.32, "step": 42120, "train_speed(iter/s)": 0.285511 }, { "acc": 0.72156439, "epoch": 0.47133367913462065, "grad_norm": 4.84375, "learning_rate": 9.086781043671857e-06, "loss": 1.11243839, "memory(GiB)": 142.32, "step": 42140, "train_speed(iter/s)": 0.285559 }, { "acc": 0.72290711, "epoch": 0.4715573780805792, "grad_norm": 7.3125, "learning_rate": 9.085715252416616e-06, "loss": 1.10032768, "memory(GiB)": 142.32, "step": 42160, "train_speed(iter/s)": 0.285607 }, { "acc": 0.72646961, "epoch": 0.4717810770265377, "grad_norm": 5.375, "learning_rate": 9.084648902177127e-06, "loss": 1.08179541, "memory(GiB)": 142.32, "step": 42180, "train_speed(iter/s)": 0.285651 }, { "acc": 0.74980831, "epoch": 0.47200477597249624, "grad_norm": 6.375, "learning_rate": 9.083581993099287e-06, "loss": 0.9952569, "memory(GiB)": 142.32, "step": 42200, "train_speed(iter/s)": 0.285696 }, { "acc": 0.73066888, "epoch": 0.47222847491845477, "grad_norm": 6.28125, "learning_rate": 9.082514525329063e-06, "loss": 1.08281879, "memory(GiB)": 142.32, "step": 42220, "train_speed(iter/s)": 0.285742 }, { "acc": 0.72373128, "epoch": 0.4724521738644133, "grad_norm": 6.3125, "learning_rate": 9.081446499012498e-06, "loss": 1.09813404, "memory(GiB)": 142.32, "step": 42240, "train_speed(iter/s)": 0.285793 }, { "acc": 0.72331676, "epoch": 0.4726758728103718, "grad_norm": 6.0625, "learning_rate": 9.080377914295714e-06, "loss": 1.10165291, "memory(GiB)": 142.32, "step": 42260, "train_speed(iter/s)": 0.285841 }, { "acc": 0.73441515, "epoch": 0.47289957175633035, "grad_norm": 6.96875, "learning_rate": 9.07930877132491e-06, "loss": 1.04635773, "memory(GiB)": 142.32, "step": 42280, "train_speed(iter/s)": 0.285889 }, { "acc": 0.71761961, "epoch": 0.4731232707022889, "grad_norm": 5.4375, "learning_rate": 9.07823907024636e-06, "loss": 1.1253828, "memory(GiB)": 142.32, "step": 42300, "train_speed(iter/s)": 0.285936 }, { "acc": 0.71872759, "epoch": 0.4733469696482474, "grad_norm": 6.25, "learning_rate": 9.077168811206414e-06, "loss": 1.1315712, "memory(GiB)": 142.32, "step": 42320, "train_speed(iter/s)": 0.285982 }, { "acc": 0.72133226, "epoch": 0.47357066859420593, "grad_norm": 5.625, "learning_rate": 9.076097994351499e-06, "loss": 1.10716181, "memory(GiB)": 142.32, "step": 42340, "train_speed(iter/s)": 0.286033 }, { "acc": 0.7484457, "epoch": 0.47379436754016446, "grad_norm": 5.53125, "learning_rate": 9.075026619828116e-06, "loss": 0.98991461, "memory(GiB)": 142.32, "step": 42360, "train_speed(iter/s)": 0.286083 }, { "acc": 0.72576785, "epoch": 0.474018066486123, "grad_norm": 6.21875, "learning_rate": 9.073954687782846e-06, "loss": 1.0912509, "memory(GiB)": 142.32, "step": 42380, "train_speed(iter/s)": 0.286127 }, { "acc": 0.728052, "epoch": 0.4742417654320815, "grad_norm": 6.3125, "learning_rate": 9.072882198362345e-06, "loss": 1.08849983, "memory(GiB)": 142.32, "step": 42400, "train_speed(iter/s)": 0.286171 }, { "acc": 0.72122889, "epoch": 0.47446546437804005, "grad_norm": 7.0, "learning_rate": 9.071809151713341e-06, "loss": 1.11457062, "memory(GiB)": 142.32, "step": 42420, "train_speed(iter/s)": 0.286218 }, { "acc": 0.73565784, "epoch": 0.4746891633239986, "grad_norm": 6.40625, "learning_rate": 9.070735547982651e-06, "loss": 1.05528631, "memory(GiB)": 142.32, "step": 42440, "train_speed(iter/s)": 0.286264 }, { "acc": 0.72850323, "epoch": 0.4749128622699571, "grad_norm": 5.0, "learning_rate": 9.06966138731715e-06, "loss": 1.06855125, "memory(GiB)": 142.32, "step": 42460, "train_speed(iter/s)": 0.286308 }, { "acc": 0.73415117, "epoch": 0.47513656121591563, "grad_norm": 5.65625, "learning_rate": 9.068586669863804e-06, "loss": 1.05648928, "memory(GiB)": 142.32, "step": 42480, "train_speed(iter/s)": 0.286351 }, { "acc": 0.72820749, "epoch": 0.47536026016187416, "grad_norm": 7.4375, "learning_rate": 9.067511395769649e-06, "loss": 1.10137482, "memory(GiB)": 142.32, "step": 42500, "train_speed(iter/s)": 0.286399 }, { "acc": 0.72668428, "epoch": 0.4755839591078327, "grad_norm": 6.40625, "learning_rate": 9.066435565181795e-06, "loss": 1.08039112, "memory(GiB)": 142.32, "step": 42520, "train_speed(iter/s)": 0.286444 }, { "acc": 0.72093468, "epoch": 0.4758076580537912, "grad_norm": 6.53125, "learning_rate": 9.065359178247434e-06, "loss": 1.10813103, "memory(GiB)": 142.32, "step": 42540, "train_speed(iter/s)": 0.286489 }, { "acc": 0.7190547, "epoch": 0.47603135699974974, "grad_norm": 5.78125, "learning_rate": 9.06428223511383e-06, "loss": 1.13410988, "memory(GiB)": 142.32, "step": 42560, "train_speed(iter/s)": 0.286531 }, { "acc": 0.72705755, "epoch": 0.47625505594570827, "grad_norm": 6.28125, "learning_rate": 9.063204735928323e-06, "loss": 1.07814665, "memory(GiB)": 142.32, "step": 42580, "train_speed(iter/s)": 0.286576 }, { "acc": 0.74249306, "epoch": 0.4764787548916668, "grad_norm": 4.90625, "learning_rate": 9.062126680838332e-06, "loss": 1.02562904, "memory(GiB)": 142.32, "step": 42600, "train_speed(iter/s)": 0.286614 }, { "acc": 0.73426533, "epoch": 0.47670245383762533, "grad_norm": 6.1875, "learning_rate": 9.06104806999135e-06, "loss": 1.05779362, "memory(GiB)": 142.32, "step": 42620, "train_speed(iter/s)": 0.286661 }, { "acc": 0.72506289, "epoch": 0.47692615278358386, "grad_norm": 6.09375, "learning_rate": 9.059968903534948e-06, "loss": 1.08834629, "memory(GiB)": 142.32, "step": 42640, "train_speed(iter/s)": 0.286703 }, { "acc": 0.72509418, "epoch": 0.4771498517295424, "grad_norm": 5.875, "learning_rate": 9.058889181616768e-06, "loss": 1.08467808, "memory(GiB)": 142.32, "step": 42660, "train_speed(iter/s)": 0.286749 }, { "acc": 0.7306901, "epoch": 0.4773735506755009, "grad_norm": 5.84375, "learning_rate": 9.057808904384534e-06, "loss": 1.07183819, "memory(GiB)": 142.32, "step": 42680, "train_speed(iter/s)": 0.286794 }, { "acc": 0.72434707, "epoch": 0.47759724962145944, "grad_norm": 5.9375, "learning_rate": 9.056728071986041e-06, "loss": 1.08737431, "memory(GiB)": 142.32, "step": 42700, "train_speed(iter/s)": 0.28684 }, { "acc": 0.72171888, "epoch": 0.47782094856741797, "grad_norm": 6.09375, "learning_rate": 9.055646684569164e-06, "loss": 1.13042984, "memory(GiB)": 142.32, "step": 42720, "train_speed(iter/s)": 0.286891 }, { "acc": 0.71498303, "epoch": 0.4780446475133765, "grad_norm": 6.34375, "learning_rate": 9.054564742281853e-06, "loss": 1.14102726, "memory(GiB)": 142.32, "step": 42740, "train_speed(iter/s)": 0.286934 }, { "acc": 0.71437798, "epoch": 0.478268346459335, "grad_norm": 5.625, "learning_rate": 9.053482245272132e-06, "loss": 1.14898815, "memory(GiB)": 142.32, "step": 42760, "train_speed(iter/s)": 0.286978 }, { "acc": 0.72540808, "epoch": 0.47849204540529355, "grad_norm": 5.03125, "learning_rate": 9.052399193688102e-06, "loss": 1.10335026, "memory(GiB)": 142.32, "step": 42780, "train_speed(iter/s)": 0.287022 }, { "acc": 0.71770697, "epoch": 0.4787157443512521, "grad_norm": 6.3125, "learning_rate": 9.05131558767794e-06, "loss": 1.12847557, "memory(GiB)": 142.32, "step": 42800, "train_speed(iter/s)": 0.287067 }, { "acc": 0.7293541, "epoch": 0.4789394432972106, "grad_norm": 5.15625, "learning_rate": 9.0502314273899e-06, "loss": 1.0638876, "memory(GiB)": 142.32, "step": 42820, "train_speed(iter/s)": 0.287113 }, { "acc": 0.73410778, "epoch": 0.47916314224316914, "grad_norm": 5.71875, "learning_rate": 9.049146712972308e-06, "loss": 1.06269798, "memory(GiB)": 142.32, "step": 42840, "train_speed(iter/s)": 0.28716 }, { "acc": 0.71710138, "epoch": 0.47938684118912767, "grad_norm": 5.90625, "learning_rate": 9.048061444573571e-06, "loss": 1.12508888, "memory(GiB)": 142.32, "step": 42860, "train_speed(iter/s)": 0.287202 }, { "acc": 0.74020185, "epoch": 0.4796105401350862, "grad_norm": 6.375, "learning_rate": 9.046975622342167e-06, "loss": 1.03090992, "memory(GiB)": 142.32, "step": 42880, "train_speed(iter/s)": 0.287252 }, { "acc": 0.73025208, "epoch": 0.4798342390810447, "grad_norm": 5.78125, "learning_rate": 9.045889246426654e-06, "loss": 1.07489414, "memory(GiB)": 142.32, "step": 42900, "train_speed(iter/s)": 0.287303 }, { "acc": 0.73334084, "epoch": 0.48005793802700325, "grad_norm": 4.96875, "learning_rate": 9.044802316975662e-06, "loss": 1.04893589, "memory(GiB)": 142.32, "step": 42920, "train_speed(iter/s)": 0.28735 }, { "acc": 0.71150417, "epoch": 0.4802816369729618, "grad_norm": 5.96875, "learning_rate": 9.043714834137902e-06, "loss": 1.16475544, "memory(GiB)": 142.32, "step": 42940, "train_speed(iter/s)": 0.287397 }, { "acc": 0.72533083, "epoch": 0.4805053359189203, "grad_norm": 6.1875, "learning_rate": 9.042626798062152e-06, "loss": 1.10219421, "memory(GiB)": 142.32, "step": 42960, "train_speed(iter/s)": 0.287445 }, { "acc": 0.7299613, "epoch": 0.48072903486487883, "grad_norm": 6.09375, "learning_rate": 9.041538208897277e-06, "loss": 1.07406864, "memory(GiB)": 142.32, "step": 42980, "train_speed(iter/s)": 0.287491 }, { "acc": 0.71653805, "epoch": 0.48095273381083736, "grad_norm": 5.09375, "learning_rate": 9.040449066792205e-06, "loss": 1.1358469, "memory(GiB)": 142.32, "step": 43000, "train_speed(iter/s)": 0.287538 }, { "acc": 0.71976385, "epoch": 0.4811764327567959, "grad_norm": 6.0, "learning_rate": 9.039359371895951e-06, "loss": 1.13205433, "memory(GiB)": 142.32, "step": 43020, "train_speed(iter/s)": 0.28759 }, { "acc": 0.72337899, "epoch": 0.4814001317027544, "grad_norm": 5.96875, "learning_rate": 9.038269124357598e-06, "loss": 1.08737354, "memory(GiB)": 142.32, "step": 43040, "train_speed(iter/s)": 0.287635 }, { "acc": 0.71807394, "epoch": 0.48162383064871295, "grad_norm": 5.40625, "learning_rate": 9.03717832432631e-06, "loss": 1.14954538, "memory(GiB)": 142.32, "step": 43060, "train_speed(iter/s)": 0.287681 }, { "acc": 0.72172027, "epoch": 0.4818475295946715, "grad_norm": 6.125, "learning_rate": 9.036086971951321e-06, "loss": 1.11969299, "memory(GiB)": 142.32, "step": 43080, "train_speed(iter/s)": 0.287724 }, { "acc": 0.73267794, "epoch": 0.48207122854063, "grad_norm": 6.1875, "learning_rate": 9.034995067381946e-06, "loss": 1.05760126, "memory(GiB)": 142.32, "step": 43100, "train_speed(iter/s)": 0.287766 }, { "acc": 0.72078047, "epoch": 0.48229492748658853, "grad_norm": 6.8125, "learning_rate": 9.033902610767573e-06, "loss": 1.12163868, "memory(GiB)": 142.32, "step": 43120, "train_speed(iter/s)": 0.28781 }, { "acc": 0.70916519, "epoch": 0.48251862643254706, "grad_norm": 4.96875, "learning_rate": 9.032809602257663e-06, "loss": 1.17779217, "memory(GiB)": 142.32, "step": 43140, "train_speed(iter/s)": 0.28785 }, { "acc": 0.73182731, "epoch": 0.4827423253785056, "grad_norm": 5.65625, "learning_rate": 9.03171604200176e-06, "loss": 1.05873909, "memory(GiB)": 142.32, "step": 43160, "train_speed(iter/s)": 0.287896 }, { "acc": 0.73029141, "epoch": 0.4829660243244641, "grad_norm": 6.9375, "learning_rate": 9.030621930149475e-06, "loss": 1.08537464, "memory(GiB)": 142.32, "step": 43180, "train_speed(iter/s)": 0.287939 }, { "acc": 0.72706132, "epoch": 0.48318972327042264, "grad_norm": 5.71875, "learning_rate": 9.029527266850499e-06, "loss": 1.10501375, "memory(GiB)": 142.32, "step": 43200, "train_speed(iter/s)": 0.287987 }, { "acc": 0.72319851, "epoch": 0.4834134222163812, "grad_norm": 4.71875, "learning_rate": 9.028432052254598e-06, "loss": 1.11081371, "memory(GiB)": 142.32, "step": 43220, "train_speed(iter/s)": 0.28803 }, { "acc": 0.72726297, "epoch": 0.4836371211623397, "grad_norm": 5.53125, "learning_rate": 9.027336286511613e-06, "loss": 1.08422613, "memory(GiB)": 142.32, "step": 43240, "train_speed(iter/s)": 0.288072 }, { "acc": 0.72853236, "epoch": 0.48386082010829823, "grad_norm": 6.53125, "learning_rate": 9.026239969771459e-06, "loss": 1.06464682, "memory(GiB)": 142.32, "step": 43260, "train_speed(iter/s)": 0.28812 }, { "acc": 0.72070537, "epoch": 0.48408451905425676, "grad_norm": 5.6875, "learning_rate": 9.025143102184129e-06, "loss": 1.11420403, "memory(GiB)": 142.32, "step": 43280, "train_speed(iter/s)": 0.288167 }, { "acc": 0.72763214, "epoch": 0.4843082180002153, "grad_norm": 5.53125, "learning_rate": 9.024045683899692e-06, "loss": 1.08408337, "memory(GiB)": 142.32, "step": 43300, "train_speed(iter/s)": 0.288214 }, { "acc": 0.72053661, "epoch": 0.4845319169461738, "grad_norm": 6.25, "learning_rate": 9.022947715068287e-06, "loss": 1.12555256, "memory(GiB)": 142.32, "step": 43320, "train_speed(iter/s)": 0.288255 }, { "acc": 0.7185215, "epoch": 0.48475561589213234, "grad_norm": 6.1875, "learning_rate": 9.021849195840133e-06, "loss": 1.12779655, "memory(GiB)": 142.32, "step": 43340, "train_speed(iter/s)": 0.288298 }, { "acc": 0.73049555, "epoch": 0.4849793148380909, "grad_norm": 4.65625, "learning_rate": 9.020750126365523e-06, "loss": 1.07252836, "memory(GiB)": 142.32, "step": 43360, "train_speed(iter/s)": 0.288336 }, { "acc": 0.72362866, "epoch": 0.48520301378404945, "grad_norm": 5.59375, "learning_rate": 9.019650506794828e-06, "loss": 1.08862543, "memory(GiB)": 142.32, "step": 43380, "train_speed(iter/s)": 0.288381 }, { "acc": 0.72285128, "epoch": 0.485426712730008, "grad_norm": 8.4375, "learning_rate": 9.018550337278486e-06, "loss": 1.1130209, "memory(GiB)": 142.32, "step": 43400, "train_speed(iter/s)": 0.288425 }, { "acc": 0.73591108, "epoch": 0.4856504116759665, "grad_norm": 5.71875, "learning_rate": 9.017449617967024e-06, "loss": 1.05150356, "memory(GiB)": 142.32, "step": 43420, "train_speed(iter/s)": 0.288474 }, { "acc": 0.72880769, "epoch": 0.48587411062192504, "grad_norm": 6.40625, "learning_rate": 9.016348349011029e-06, "loss": 1.09006176, "memory(GiB)": 142.32, "step": 43440, "train_speed(iter/s)": 0.288524 }, { "acc": 0.72443199, "epoch": 0.48609780956788357, "grad_norm": 4.875, "learning_rate": 9.015246530561174e-06, "loss": 1.08612356, "memory(GiB)": 142.32, "step": 43460, "train_speed(iter/s)": 0.288572 }, { "acc": 0.73321905, "epoch": 0.4863215085138421, "grad_norm": 5.3125, "learning_rate": 9.014144162768202e-06, "loss": 1.06447048, "memory(GiB)": 142.32, "step": 43480, "train_speed(iter/s)": 0.288618 }, { "acc": 0.7299654, "epoch": 0.4865452074598006, "grad_norm": 6.0625, "learning_rate": 9.013041245782934e-06, "loss": 1.0899044, "memory(GiB)": 142.32, "step": 43500, "train_speed(iter/s)": 0.288668 }, { "acc": 0.70930343, "epoch": 0.48676890640575915, "grad_norm": 6.28125, "learning_rate": 9.011937779756263e-06, "loss": 1.14435549, "memory(GiB)": 142.32, "step": 43520, "train_speed(iter/s)": 0.288713 }, { "acc": 0.72000113, "epoch": 0.4869926053517177, "grad_norm": 5.40625, "learning_rate": 9.01083376483916e-06, "loss": 1.13081684, "memory(GiB)": 142.32, "step": 43540, "train_speed(iter/s)": 0.288754 }, { "acc": 0.72262411, "epoch": 0.4872163042976762, "grad_norm": 6.8125, "learning_rate": 9.00972920118267e-06, "loss": 1.10304947, "memory(GiB)": 142.32, "step": 43560, "train_speed(iter/s)": 0.288791 }, { "acc": 0.7192112, "epoch": 0.48744000324363473, "grad_norm": 5.71875, "learning_rate": 9.008624088937913e-06, "loss": 1.11419849, "memory(GiB)": 142.32, "step": 43580, "train_speed(iter/s)": 0.288832 }, { "acc": 0.71644135, "epoch": 0.48766370218959326, "grad_norm": 5.4375, "learning_rate": 9.007518428256086e-06, "loss": 1.13937588, "memory(GiB)": 142.32, "step": 43600, "train_speed(iter/s)": 0.288876 }, { "acc": 0.74255595, "epoch": 0.4878874011355518, "grad_norm": 6.28125, "learning_rate": 9.006412219288456e-06, "loss": 1.02432327, "memory(GiB)": 142.32, "step": 43620, "train_speed(iter/s)": 0.288916 }, { "acc": 0.7228591, "epoch": 0.4881111000815103, "grad_norm": 5.71875, "learning_rate": 9.005305462186369e-06, "loss": 1.10279217, "memory(GiB)": 142.32, "step": 43640, "train_speed(iter/s)": 0.288958 }, { "acc": 0.72625303, "epoch": 0.48833479902746885, "grad_norm": 7.59375, "learning_rate": 9.004198157101248e-06, "loss": 1.07269173, "memory(GiB)": 142.32, "step": 43660, "train_speed(iter/s)": 0.289003 }, { "acc": 0.73545375, "epoch": 0.4885584979734274, "grad_norm": 5.28125, "learning_rate": 9.003090304184583e-06, "loss": 1.04722471, "memory(GiB)": 142.32, "step": 43680, "train_speed(iter/s)": 0.289044 }, { "acc": 0.7212697, "epoch": 0.4887821969193859, "grad_norm": 5.78125, "learning_rate": 9.001981903587949e-06, "loss": 1.10552073, "memory(GiB)": 142.32, "step": 43700, "train_speed(iter/s)": 0.289084 }, { "acc": 0.72777038, "epoch": 0.48900589586534443, "grad_norm": 5.65625, "learning_rate": 9.000872955462987e-06, "loss": 1.07767162, "memory(GiB)": 142.32, "step": 43720, "train_speed(iter/s)": 0.289123 }, { "acc": 0.72252426, "epoch": 0.48922959481130296, "grad_norm": 6.0625, "learning_rate": 8.999763459961422e-06, "loss": 1.10711937, "memory(GiB)": 142.32, "step": 43740, "train_speed(iter/s)": 0.289166 }, { "acc": 0.73310051, "epoch": 0.4894532937572615, "grad_norm": 5.21875, "learning_rate": 8.998653417235044e-06, "loss": 1.06180782, "memory(GiB)": 142.32, "step": 43760, "train_speed(iter/s)": 0.289212 }, { "acc": 0.72308826, "epoch": 0.48967699270322, "grad_norm": 5.53125, "learning_rate": 8.997542827435723e-06, "loss": 1.11681175, "memory(GiB)": 142.32, "step": 43780, "train_speed(iter/s)": 0.289254 }, { "acc": 0.73087521, "epoch": 0.48990069164917854, "grad_norm": 5.34375, "learning_rate": 8.996431690715408e-06, "loss": 1.06887321, "memory(GiB)": 142.32, "step": 43800, "train_speed(iter/s)": 0.289294 }, { "acc": 0.72949114, "epoch": 0.49012439059513707, "grad_norm": 5.3125, "learning_rate": 8.995320007226114e-06, "loss": 1.07408552, "memory(GiB)": 142.32, "step": 43820, "train_speed(iter/s)": 0.289342 }, { "acc": 0.71005716, "epoch": 0.4903480895410956, "grad_norm": 5.09375, "learning_rate": 8.994207777119937e-06, "loss": 1.16257286, "memory(GiB)": 142.32, "step": 43840, "train_speed(iter/s)": 0.289393 }, { "acc": 0.72071505, "epoch": 0.49057178848705413, "grad_norm": 6.5, "learning_rate": 8.993095000549047e-06, "loss": 1.12014027, "memory(GiB)": 142.32, "step": 43860, "train_speed(iter/s)": 0.289436 }, { "acc": 0.72026663, "epoch": 0.49079548743301266, "grad_norm": 5.4375, "learning_rate": 8.991981677665685e-06, "loss": 1.10157032, "memory(GiB)": 142.32, "step": 43880, "train_speed(iter/s)": 0.289481 }, { "acc": 0.73304787, "epoch": 0.4910191863789712, "grad_norm": 5.625, "learning_rate": 8.990867808622172e-06, "loss": 1.07836323, "memory(GiB)": 142.32, "step": 43900, "train_speed(iter/s)": 0.289522 }, { "acc": 0.73002033, "epoch": 0.4912428853249297, "grad_norm": 5.8125, "learning_rate": 8.989753393570899e-06, "loss": 1.0631218, "memory(GiB)": 142.32, "step": 43920, "train_speed(iter/s)": 0.289563 }, { "acc": 0.73139496, "epoch": 0.49146658427088824, "grad_norm": 5.96875, "learning_rate": 8.988638432664336e-06, "loss": 1.07720594, "memory(GiB)": 142.32, "step": 43940, "train_speed(iter/s)": 0.289606 }, { "acc": 0.7329298, "epoch": 0.49169028321684677, "grad_norm": 6.6875, "learning_rate": 8.987522926055023e-06, "loss": 1.04735718, "memory(GiB)": 142.32, "step": 43960, "train_speed(iter/s)": 0.289654 }, { "acc": 0.71917758, "epoch": 0.4919139821628053, "grad_norm": 5.53125, "learning_rate": 8.986406873895581e-06, "loss": 1.11778889, "memory(GiB)": 142.32, "step": 43980, "train_speed(iter/s)": 0.289702 }, { "acc": 0.73063836, "epoch": 0.4921376811087638, "grad_norm": 5.21875, "learning_rate": 8.985290276338698e-06, "loss": 1.07434216, "memory(GiB)": 142.32, "step": 44000, "train_speed(iter/s)": 0.289744 }, { "epoch": 0.4921376811087638, "eval_acc": 0.691037700504285, "eval_loss": 1.0949907302856445, "eval_runtime": 2340.7123, "eval_samples_per_second": 32.162, "eval_steps_per_second": 16.081, "step": 44000 }, { "acc": 0.7391201, "epoch": 0.49236138005472235, "grad_norm": 7.09375, "learning_rate": 8.984173133537144e-06, "loss": 1.02427845, "memory(GiB)": 142.32, "step": 44020, "train_speed(iter/s)": 0.285289 }, { "acc": 0.7237215, "epoch": 0.4925850790006809, "grad_norm": 6.15625, "learning_rate": 8.983055445643758e-06, "loss": 1.1116518, "memory(GiB)": 142.32, "step": 44040, "train_speed(iter/s)": 0.285335 }, { "acc": 0.71927509, "epoch": 0.4928087779466394, "grad_norm": 6.40625, "learning_rate": 8.981937212811455e-06, "loss": 1.12097702, "memory(GiB)": 142.32, "step": 44060, "train_speed(iter/s)": 0.285378 }, { "acc": 0.73210158, "epoch": 0.49303247689259794, "grad_norm": 5.4375, "learning_rate": 8.980818435193226e-06, "loss": 1.05410175, "memory(GiB)": 142.32, "step": 44080, "train_speed(iter/s)": 0.285422 }, { "acc": 0.72950201, "epoch": 0.49325617583855647, "grad_norm": 5.3125, "learning_rate": 8.979699112942137e-06, "loss": 1.08751984, "memory(GiB)": 142.32, "step": 44100, "train_speed(iter/s)": 0.285465 }, { "acc": 0.72455177, "epoch": 0.493479874784515, "grad_norm": 6.375, "learning_rate": 8.978579246211327e-06, "loss": 1.08650646, "memory(GiB)": 142.32, "step": 44120, "train_speed(iter/s)": 0.285511 }, { "acc": 0.72620583, "epoch": 0.4937035737304735, "grad_norm": 7.21875, "learning_rate": 8.977458835154008e-06, "loss": 1.09643612, "memory(GiB)": 142.32, "step": 44140, "train_speed(iter/s)": 0.285556 }, { "acc": 0.73181076, "epoch": 0.49392727267643205, "grad_norm": 4.96875, "learning_rate": 8.97633787992347e-06, "loss": 1.05914135, "memory(GiB)": 142.32, "step": 44160, "train_speed(iter/s)": 0.285599 }, { "acc": 0.73100834, "epoch": 0.4941509716223906, "grad_norm": 5.96875, "learning_rate": 8.975216380673075e-06, "loss": 1.05063276, "memory(GiB)": 142.32, "step": 44180, "train_speed(iter/s)": 0.285643 }, { "acc": 0.73007488, "epoch": 0.4943746705683491, "grad_norm": 6.34375, "learning_rate": 8.974094337556261e-06, "loss": 1.07706833, "memory(GiB)": 142.32, "step": 44200, "train_speed(iter/s)": 0.285686 }, { "acc": 0.72950506, "epoch": 0.49459836951430763, "grad_norm": 5.6875, "learning_rate": 8.972971750726537e-06, "loss": 1.07999611, "memory(GiB)": 142.32, "step": 44220, "train_speed(iter/s)": 0.285729 }, { "acc": 0.72310414, "epoch": 0.49482206846026616, "grad_norm": 6.625, "learning_rate": 8.971848620337492e-06, "loss": 1.09195862, "memory(GiB)": 142.32, "step": 44240, "train_speed(iter/s)": 0.285772 }, { "acc": 0.72942286, "epoch": 0.4950457674062247, "grad_norm": 6.0625, "learning_rate": 8.970724946542784e-06, "loss": 1.09501457, "memory(GiB)": 142.32, "step": 44260, "train_speed(iter/s)": 0.285818 }, { "acc": 0.73623209, "epoch": 0.4952694663521832, "grad_norm": 4.5625, "learning_rate": 8.969600729496148e-06, "loss": 1.045438, "memory(GiB)": 142.32, "step": 44280, "train_speed(iter/s)": 0.285863 }, { "acc": 0.73613691, "epoch": 0.49549316529814175, "grad_norm": 6.0625, "learning_rate": 8.968475969351395e-06, "loss": 1.05057411, "memory(GiB)": 142.32, "step": 44300, "train_speed(iter/s)": 0.285909 }, { "acc": 0.72657433, "epoch": 0.4957168642441003, "grad_norm": 6.09375, "learning_rate": 8.967350666262406e-06, "loss": 1.08701611, "memory(GiB)": 142.32, "step": 44320, "train_speed(iter/s)": 0.285954 }, { "acc": 0.73208475, "epoch": 0.4959405631900588, "grad_norm": 5.0625, "learning_rate": 8.966224820383139e-06, "loss": 1.0628397, "memory(GiB)": 142.32, "step": 44340, "train_speed(iter/s)": 0.286 }, { "acc": 0.72831798, "epoch": 0.49616426213601733, "grad_norm": 5.59375, "learning_rate": 8.965098431867627e-06, "loss": 1.09335051, "memory(GiB)": 142.32, "step": 44360, "train_speed(iter/s)": 0.286043 }, { "acc": 0.71974201, "epoch": 0.49638796108197586, "grad_norm": 4.65625, "learning_rate": 8.963971500869975e-06, "loss": 1.12002487, "memory(GiB)": 142.32, "step": 44380, "train_speed(iter/s)": 0.286086 }, { "acc": 0.72006493, "epoch": 0.4966116600279344, "grad_norm": 5.8125, "learning_rate": 8.962844027544363e-06, "loss": 1.12032652, "memory(GiB)": 142.32, "step": 44400, "train_speed(iter/s)": 0.286128 }, { "acc": 0.73054705, "epoch": 0.4968353589738929, "grad_norm": 5.15625, "learning_rate": 8.961716012045047e-06, "loss": 1.06427555, "memory(GiB)": 142.32, "step": 44420, "train_speed(iter/s)": 0.286174 }, { "acc": 0.71724653, "epoch": 0.49705905791985144, "grad_norm": 5.46875, "learning_rate": 8.960587454526353e-06, "loss": 1.14145069, "memory(GiB)": 142.32, "step": 44440, "train_speed(iter/s)": 0.286222 }, { "acc": 0.73827429, "epoch": 0.49728275686580997, "grad_norm": 6.84375, "learning_rate": 8.959458355142688e-06, "loss": 1.03544006, "memory(GiB)": 142.32, "step": 44460, "train_speed(iter/s)": 0.286267 }, { "acc": 0.72953882, "epoch": 0.4975064558117685, "grad_norm": 5.8125, "learning_rate": 8.958328714048522e-06, "loss": 1.08962727, "memory(GiB)": 142.32, "step": 44480, "train_speed(iter/s)": 0.286314 }, { "acc": 0.72388544, "epoch": 0.49773015475772703, "grad_norm": 5.40625, "learning_rate": 8.957198531398414e-06, "loss": 1.08908348, "memory(GiB)": 142.32, "step": 44500, "train_speed(iter/s)": 0.286354 }, { "acc": 0.72565041, "epoch": 0.49795385370368556, "grad_norm": 7.40625, "learning_rate": 8.956067807346984e-06, "loss": 1.09020576, "memory(GiB)": 142.32, "step": 44520, "train_speed(iter/s)": 0.286397 }, { "acc": 0.73283992, "epoch": 0.4981775526496441, "grad_norm": 5.625, "learning_rate": 8.954936542048934e-06, "loss": 1.05334587, "memory(GiB)": 142.32, "step": 44540, "train_speed(iter/s)": 0.28644 }, { "acc": 0.72664728, "epoch": 0.4984012515956026, "grad_norm": 6.21875, "learning_rate": 8.953804735659034e-06, "loss": 1.07588549, "memory(GiB)": 142.32, "step": 44560, "train_speed(iter/s)": 0.286475 }, { "acc": 0.71936913, "epoch": 0.49862495054156114, "grad_norm": 5.84375, "learning_rate": 8.952672388332136e-06, "loss": 1.12946148, "memory(GiB)": 142.32, "step": 44580, "train_speed(iter/s)": 0.286516 }, { "acc": 0.72758102, "epoch": 0.49884864948751967, "grad_norm": 5.46875, "learning_rate": 8.951539500223156e-06, "loss": 1.08895855, "memory(GiB)": 142.32, "step": 44600, "train_speed(iter/s)": 0.286563 }, { "acc": 0.72937632, "epoch": 0.49907234843347825, "grad_norm": 4.90625, "learning_rate": 8.950406071487095e-06, "loss": 1.07519588, "memory(GiB)": 142.32, "step": 44620, "train_speed(iter/s)": 0.286606 }, { "acc": 0.71728678, "epoch": 0.4992960473794368, "grad_norm": 5.0625, "learning_rate": 8.949272102279016e-06, "loss": 1.14027472, "memory(GiB)": 142.32, "step": 44640, "train_speed(iter/s)": 0.286647 }, { "acc": 0.72237682, "epoch": 0.4995197463253953, "grad_norm": 5.8125, "learning_rate": 8.948137592754064e-06, "loss": 1.12316284, "memory(GiB)": 142.32, "step": 44660, "train_speed(iter/s)": 0.28669 }, { "acc": 0.72258902, "epoch": 0.49974344527135384, "grad_norm": 5.28125, "learning_rate": 8.947002543067462e-06, "loss": 1.11322479, "memory(GiB)": 142.32, "step": 44680, "train_speed(iter/s)": 0.286734 }, { "acc": 0.73478699, "epoch": 0.49996714421731236, "grad_norm": 5.6875, "learning_rate": 8.945866953374494e-06, "loss": 1.07692642, "memory(GiB)": 142.32, "step": 44700, "train_speed(iter/s)": 0.286776 }, { "acc": 0.72694769, "epoch": 0.5001908431632709, "grad_norm": 5.8125, "learning_rate": 8.944730823830527e-06, "loss": 1.08823433, "memory(GiB)": 142.32, "step": 44720, "train_speed(iter/s)": 0.286818 }, { "acc": 0.72955933, "epoch": 0.5004145421092294, "grad_norm": 5.21875, "learning_rate": 8.943594154591e-06, "loss": 1.07987633, "memory(GiB)": 142.32, "step": 44740, "train_speed(iter/s)": 0.28686 }, { "acc": 0.72001691, "epoch": 0.500638241055188, "grad_norm": 5.34375, "learning_rate": 8.942456945811427e-06, "loss": 1.14422607, "memory(GiB)": 142.32, "step": 44760, "train_speed(iter/s)": 0.286903 }, { "acc": 0.73504496, "epoch": 0.5008619400011465, "grad_norm": 5.375, "learning_rate": 8.941319197647394e-06, "loss": 1.05524578, "memory(GiB)": 142.32, "step": 44780, "train_speed(iter/s)": 0.286945 }, { "acc": 0.73678751, "epoch": 0.501085638947105, "grad_norm": 6.21875, "learning_rate": 8.940180910254556e-06, "loss": 1.0313447, "memory(GiB)": 142.32, "step": 44800, "train_speed(iter/s)": 0.286987 }, { "acc": 0.7293251, "epoch": 0.5013093378930635, "grad_norm": 6.90625, "learning_rate": 8.939042083788655e-06, "loss": 1.0748867, "memory(GiB)": 142.32, "step": 44820, "train_speed(iter/s)": 0.287029 }, { "acc": 0.73535709, "epoch": 0.5015330368390221, "grad_norm": 5.15625, "learning_rate": 8.937902718405495e-06, "loss": 1.06134453, "memory(GiB)": 142.32, "step": 44840, "train_speed(iter/s)": 0.287068 }, { "acc": 0.73564386, "epoch": 0.5017567357849806, "grad_norm": 5.90625, "learning_rate": 8.936762814260954e-06, "loss": 1.05147724, "memory(GiB)": 142.32, "step": 44860, "train_speed(iter/s)": 0.287111 }, { "acc": 0.7289731, "epoch": 0.5019804347309391, "grad_norm": 6.1875, "learning_rate": 8.935622371510995e-06, "loss": 1.0844985, "memory(GiB)": 142.32, "step": 44880, "train_speed(iter/s)": 0.287148 }, { "acc": 0.73015871, "epoch": 0.5022041336768976, "grad_norm": 5.28125, "learning_rate": 8.93448139031164e-06, "loss": 1.06037378, "memory(GiB)": 142.32, "step": 44900, "train_speed(iter/s)": 0.287195 }, { "acc": 0.72163515, "epoch": 0.5024278326228562, "grad_norm": 5.8125, "learning_rate": 8.933339870818996e-06, "loss": 1.12490959, "memory(GiB)": 142.32, "step": 44920, "train_speed(iter/s)": 0.287238 }, { "acc": 0.72849674, "epoch": 0.5026515315688147, "grad_norm": 6.25, "learning_rate": 8.932197813189237e-06, "loss": 1.08424034, "memory(GiB)": 142.32, "step": 44940, "train_speed(iter/s)": 0.287278 }, { "acc": 0.73408909, "epoch": 0.5028752305147732, "grad_norm": 4.8125, "learning_rate": 8.931055217578612e-06, "loss": 1.0692544, "memory(GiB)": 142.32, "step": 44960, "train_speed(iter/s)": 0.287323 }, { "acc": 0.72910109, "epoch": 0.5030989294607318, "grad_norm": 6.46875, "learning_rate": 8.929912084143447e-06, "loss": 1.08732643, "memory(GiB)": 142.32, "step": 44980, "train_speed(iter/s)": 0.287365 }, { "acc": 0.74094591, "epoch": 0.5033226284066903, "grad_norm": 5.6875, "learning_rate": 8.928768413040135e-06, "loss": 1.02733688, "memory(GiB)": 142.32, "step": 45000, "train_speed(iter/s)": 0.28741 }, { "acc": 0.73295193, "epoch": 0.5035463273526488, "grad_norm": 6.84375, "learning_rate": 8.927624204425152e-06, "loss": 1.05243692, "memory(GiB)": 142.32, "step": 45020, "train_speed(iter/s)": 0.287459 }, { "acc": 0.72130575, "epoch": 0.5037700262986073, "grad_norm": 6.65625, "learning_rate": 8.926479458455037e-06, "loss": 1.12601957, "memory(GiB)": 142.32, "step": 45040, "train_speed(iter/s)": 0.287509 }, { "acc": 0.72931767, "epoch": 0.5039937252445659, "grad_norm": 6.03125, "learning_rate": 8.925334175286411e-06, "loss": 1.07654285, "memory(GiB)": 142.32, "step": 45060, "train_speed(iter/s)": 0.287555 }, { "acc": 0.73251891, "epoch": 0.5042174241905244, "grad_norm": 5.25, "learning_rate": 8.924188355075963e-06, "loss": 1.0681324, "memory(GiB)": 142.32, "step": 45080, "train_speed(iter/s)": 0.287597 }, { "acc": 0.72655773, "epoch": 0.5044411231364829, "grad_norm": 5.6875, "learning_rate": 8.923041997980459e-06, "loss": 1.08685598, "memory(GiB)": 142.32, "step": 45100, "train_speed(iter/s)": 0.287638 }, { "acc": 0.72833214, "epoch": 0.5046648220824415, "grad_norm": 5.1875, "learning_rate": 8.921895104156734e-06, "loss": 1.08191004, "memory(GiB)": 142.32, "step": 45120, "train_speed(iter/s)": 0.287681 }, { "acc": 0.72214069, "epoch": 0.5048885210284, "grad_norm": 6.09375, "learning_rate": 8.920747673761705e-06, "loss": 1.11840935, "memory(GiB)": 142.32, "step": 45140, "train_speed(iter/s)": 0.28772 }, { "acc": 0.73651752, "epoch": 0.5051122199743585, "grad_norm": 6.5625, "learning_rate": 8.919599706952354e-06, "loss": 1.03828869, "memory(GiB)": 142.32, "step": 45160, "train_speed(iter/s)": 0.287765 }, { "acc": 0.72821321, "epoch": 0.505335918920317, "grad_norm": 6.25, "learning_rate": 8.918451203885737e-06, "loss": 1.08570099, "memory(GiB)": 142.32, "step": 45180, "train_speed(iter/s)": 0.287809 }, { "acc": 0.7266912, "epoch": 0.5055596178662756, "grad_norm": 4.75, "learning_rate": 8.91730216471899e-06, "loss": 1.08963614, "memory(GiB)": 142.32, "step": 45200, "train_speed(iter/s)": 0.287852 }, { "acc": 0.72444553, "epoch": 0.5057833168122341, "grad_norm": 6.03125, "learning_rate": 8.916152589609314e-06, "loss": 1.11165943, "memory(GiB)": 142.32, "step": 45220, "train_speed(iter/s)": 0.287896 }, { "acc": 0.73293653, "epoch": 0.5060070157581926, "grad_norm": 5.71875, "learning_rate": 8.91500247871399e-06, "loss": 1.06286507, "memory(GiB)": 142.32, "step": 45240, "train_speed(iter/s)": 0.287942 }, { "acc": 0.72909889, "epoch": 0.5062307147041512, "grad_norm": 6.46875, "learning_rate": 8.913851832190367e-06, "loss": 1.07686558, "memory(GiB)": 142.32, "step": 45260, "train_speed(iter/s)": 0.287981 }, { "acc": 0.73495646, "epoch": 0.5064544136501097, "grad_norm": 5.21875, "learning_rate": 8.912700650195874e-06, "loss": 1.05459442, "memory(GiB)": 142.32, "step": 45280, "train_speed(iter/s)": 0.28802 }, { "acc": 0.72531872, "epoch": 0.5066781125960682, "grad_norm": 6.3125, "learning_rate": 8.911548932888004e-06, "loss": 1.10502281, "memory(GiB)": 142.32, "step": 45300, "train_speed(iter/s)": 0.288062 }, { "acc": 0.74365602, "epoch": 0.5069018115420267, "grad_norm": 5.875, "learning_rate": 8.910396680424334e-06, "loss": 1.00448675, "memory(GiB)": 142.32, "step": 45320, "train_speed(iter/s)": 0.288106 }, { "acc": 0.73103504, "epoch": 0.5071255104879853, "grad_norm": 5.71875, "learning_rate": 8.909243892962503e-06, "loss": 1.05892344, "memory(GiB)": 142.32, "step": 45340, "train_speed(iter/s)": 0.288154 }, { "acc": 0.72673931, "epoch": 0.5073492094339438, "grad_norm": 6.46875, "learning_rate": 8.908090570660233e-06, "loss": 1.09249077, "memory(GiB)": 142.32, "step": 45360, "train_speed(iter/s)": 0.288197 }, { "acc": 0.73125629, "epoch": 0.5075729083799023, "grad_norm": 6.0, "learning_rate": 8.906936713675314e-06, "loss": 1.06508532, "memory(GiB)": 142.32, "step": 45380, "train_speed(iter/s)": 0.28824 }, { "acc": 0.72888012, "epoch": 0.5077966073258608, "grad_norm": 5.96875, "learning_rate": 8.905782322165608e-06, "loss": 1.07825212, "memory(GiB)": 142.32, "step": 45400, "train_speed(iter/s)": 0.288284 }, { "acc": 0.71876554, "epoch": 0.5080203062718194, "grad_norm": 4.71875, "learning_rate": 8.904627396289053e-06, "loss": 1.13991089, "memory(GiB)": 142.32, "step": 45420, "train_speed(iter/s)": 0.288326 }, { "acc": 0.72020874, "epoch": 0.5082440052177779, "grad_norm": 6.375, "learning_rate": 8.903471936203663e-06, "loss": 1.12742205, "memory(GiB)": 142.32, "step": 45440, "train_speed(iter/s)": 0.288371 }, { "acc": 0.7321579, "epoch": 0.5084677041637364, "grad_norm": 5.15625, "learning_rate": 8.902315942067517e-06, "loss": 1.06580811, "memory(GiB)": 142.32, "step": 45460, "train_speed(iter/s)": 0.288415 }, { "acc": 0.7337841, "epoch": 0.508691403109695, "grad_norm": 7.46875, "learning_rate": 8.901159414038773e-06, "loss": 1.05019321, "memory(GiB)": 142.32, "step": 45480, "train_speed(iter/s)": 0.28846 }, { "acc": 0.72524567, "epoch": 0.5089151020556535, "grad_norm": 5.375, "learning_rate": 8.900002352275661e-06, "loss": 1.09781971, "memory(GiB)": 142.32, "step": 45500, "train_speed(iter/s)": 0.288502 }, { "acc": 0.73604126, "epoch": 0.509138801001612, "grad_norm": 6.9375, "learning_rate": 8.898844756936484e-06, "loss": 1.03688297, "memory(GiB)": 142.32, "step": 45520, "train_speed(iter/s)": 0.288548 }, { "acc": 0.72484136, "epoch": 0.5093624999475705, "grad_norm": 5.71875, "learning_rate": 8.897686628179616e-06, "loss": 1.09272699, "memory(GiB)": 142.32, "step": 45540, "train_speed(iter/s)": 0.288589 }, { "acc": 0.71787901, "epoch": 0.5095861988935291, "grad_norm": 5.78125, "learning_rate": 8.896527966163509e-06, "loss": 1.12094879, "memory(GiB)": 142.32, "step": 45560, "train_speed(iter/s)": 0.288631 }, { "acc": 0.73139658, "epoch": 0.5098098978394876, "grad_norm": 5.78125, "learning_rate": 8.895368771046679e-06, "loss": 1.06915455, "memory(GiB)": 142.32, "step": 45580, "train_speed(iter/s)": 0.288671 }, { "acc": 0.72774305, "epoch": 0.5100335967854461, "grad_norm": 5.78125, "learning_rate": 8.894209042987725e-06, "loss": 1.06757603, "memory(GiB)": 142.32, "step": 45600, "train_speed(iter/s)": 0.288716 }, { "acc": 0.72926946, "epoch": 0.5102572957314047, "grad_norm": 5.75, "learning_rate": 8.893048782145311e-06, "loss": 1.08863029, "memory(GiB)": 142.32, "step": 45620, "train_speed(iter/s)": 0.288759 }, { "acc": 0.73419309, "epoch": 0.5104809946773632, "grad_norm": 5.75, "learning_rate": 8.89188798867818e-06, "loss": 1.05755119, "memory(GiB)": 142.32, "step": 45640, "train_speed(iter/s)": 0.288806 }, { "acc": 0.73492455, "epoch": 0.5107046936233217, "grad_norm": 6.4375, "learning_rate": 8.890726662745147e-06, "loss": 1.06788445, "memory(GiB)": 142.32, "step": 45660, "train_speed(iter/s)": 0.288847 }, { "acc": 0.73009415, "epoch": 0.5109283925692802, "grad_norm": 5.03125, "learning_rate": 8.889564804505092e-06, "loss": 1.08389282, "memory(GiB)": 142.32, "step": 45680, "train_speed(iter/s)": 0.288891 }, { "acc": 0.71997986, "epoch": 0.5111520915152388, "grad_norm": 5.53125, "learning_rate": 8.888402414116978e-06, "loss": 1.12575226, "memory(GiB)": 142.32, "step": 45700, "train_speed(iter/s)": 0.28893 }, { "acc": 0.73407946, "epoch": 0.5113757904611973, "grad_norm": 5.65625, "learning_rate": 8.887239491739835e-06, "loss": 1.04050846, "memory(GiB)": 142.32, "step": 45720, "train_speed(iter/s)": 0.288977 }, { "acc": 0.72717428, "epoch": 0.5115994894071558, "grad_norm": 5.625, "learning_rate": 8.886076037532769e-06, "loss": 1.08578453, "memory(GiB)": 142.32, "step": 45740, "train_speed(iter/s)": 0.289021 }, { "acc": 0.73171124, "epoch": 0.5118231883531144, "grad_norm": 5.71875, "learning_rate": 8.884912051654956e-06, "loss": 1.07157288, "memory(GiB)": 142.32, "step": 45760, "train_speed(iter/s)": 0.289059 }, { "acc": 0.72896891, "epoch": 0.5120468872990729, "grad_norm": 5.5, "learning_rate": 8.883747534265645e-06, "loss": 1.07795277, "memory(GiB)": 142.32, "step": 45780, "train_speed(iter/s)": 0.289096 }, { "acc": 0.73308778, "epoch": 0.5122705862450314, "grad_norm": 5.625, "learning_rate": 8.882582485524162e-06, "loss": 1.07185154, "memory(GiB)": 142.32, "step": 45800, "train_speed(iter/s)": 0.289134 }, { "acc": 0.73332119, "epoch": 0.5124942851909899, "grad_norm": 8.125, "learning_rate": 8.881416905589898e-06, "loss": 1.05976639, "memory(GiB)": 142.32, "step": 45820, "train_speed(iter/s)": 0.289174 }, { "acc": 0.73623424, "epoch": 0.5127179841369485, "grad_norm": 5.6875, "learning_rate": 8.880250794622325e-06, "loss": 1.04100952, "memory(GiB)": 142.32, "step": 45840, "train_speed(iter/s)": 0.289218 }, { "acc": 0.73011913, "epoch": 0.512941683082907, "grad_norm": 7.21875, "learning_rate": 8.879084152780982e-06, "loss": 1.0655302, "memory(GiB)": 142.32, "step": 45860, "train_speed(iter/s)": 0.289263 }, { "acc": 0.72816968, "epoch": 0.5131653820288655, "grad_norm": 5.375, "learning_rate": 8.877916980225479e-06, "loss": 1.08776751, "memory(GiB)": 142.32, "step": 45880, "train_speed(iter/s)": 0.289305 }, { "acc": 0.73612404, "epoch": 0.513389080974824, "grad_norm": 5.3125, "learning_rate": 8.876749277115506e-06, "loss": 1.03683491, "memory(GiB)": 142.32, "step": 45900, "train_speed(iter/s)": 0.289349 }, { "acc": 0.72509513, "epoch": 0.5136127799207826, "grad_norm": 5.75, "learning_rate": 8.875581043610823e-06, "loss": 1.094876, "memory(GiB)": 142.32, "step": 45920, "train_speed(iter/s)": 0.28939 }, { "acc": 0.73219318, "epoch": 0.5138364788667411, "grad_norm": 5.53125, "learning_rate": 8.874412279871257e-06, "loss": 1.04327335, "memory(GiB)": 142.32, "step": 45940, "train_speed(iter/s)": 0.289426 }, { "acc": 0.7295723, "epoch": 0.5140601778126996, "grad_norm": 7.15625, "learning_rate": 8.873242986056712e-06, "loss": 1.08852835, "memory(GiB)": 142.32, "step": 45960, "train_speed(iter/s)": 0.289464 }, { "acc": 0.73633499, "epoch": 0.5142838767586582, "grad_norm": 6.6875, "learning_rate": 8.872073162327165e-06, "loss": 1.03716211, "memory(GiB)": 142.32, "step": 45980, "train_speed(iter/s)": 0.289502 }, { "acc": 0.72658925, "epoch": 0.5145075757046167, "grad_norm": 6.75, "learning_rate": 8.870902808842665e-06, "loss": 1.08753929, "memory(GiB)": 142.32, "step": 46000, "train_speed(iter/s)": 0.289544 }, { "epoch": 0.5145075757046167, "eval_acc": 0.691392143454055, "eval_loss": 1.0935724973678589, "eval_runtime": 2340.5103, "eval_samples_per_second": 32.165, "eval_steps_per_second": 16.083, "step": 46000 }, { "acc": 0.74147043, "epoch": 0.5147312746505752, "grad_norm": 6.40625, "learning_rate": 8.869731925763332e-06, "loss": 1.01459312, "memory(GiB)": 142.32, "step": 46020, "train_speed(iter/s)": 0.285288 }, { "acc": 0.72648954, "epoch": 0.5149549735965337, "grad_norm": 5.53125, "learning_rate": 8.868560513249363e-06, "loss": 1.09331055, "memory(GiB)": 142.32, "step": 46040, "train_speed(iter/s)": 0.28533 }, { "acc": 0.73071136, "epoch": 0.5151786725424923, "grad_norm": 6.03125, "learning_rate": 8.86738857146102e-06, "loss": 1.05866871, "memory(GiB)": 142.32, "step": 46060, "train_speed(iter/s)": 0.285368 }, { "acc": 0.72694263, "epoch": 0.5154023714884508, "grad_norm": 6.125, "learning_rate": 8.866216100558642e-06, "loss": 1.08966637, "memory(GiB)": 142.32, "step": 46080, "train_speed(iter/s)": 0.285408 }, { "acc": 0.71502304, "epoch": 0.5156260704344093, "grad_norm": 5.875, "learning_rate": 8.86504310070264e-06, "loss": 1.14856167, "memory(GiB)": 142.32, "step": 46100, "train_speed(iter/s)": 0.285448 }, { "acc": 0.73298922, "epoch": 0.5158497693803679, "grad_norm": 6.4375, "learning_rate": 8.8638695720535e-06, "loss": 1.06032286, "memory(GiB)": 142.32, "step": 46120, "train_speed(iter/s)": 0.28549 }, { "acc": 0.73871965, "epoch": 0.5160734683263264, "grad_norm": 7.0, "learning_rate": 8.862695514771774e-06, "loss": 1.05906734, "memory(GiB)": 142.32, "step": 46140, "train_speed(iter/s)": 0.285531 }, { "acc": 0.73483424, "epoch": 0.5162971672722849, "grad_norm": 6.0625, "learning_rate": 8.86152092901809e-06, "loss": 1.03954268, "memory(GiB)": 142.32, "step": 46160, "train_speed(iter/s)": 0.285575 }, { "acc": 0.72068615, "epoch": 0.5165208662182434, "grad_norm": 6.1875, "learning_rate": 8.86034581495315e-06, "loss": 1.11787262, "memory(GiB)": 142.32, "step": 46180, "train_speed(iter/s)": 0.285616 }, { "acc": 0.72803984, "epoch": 0.516744565164202, "grad_norm": 5.5625, "learning_rate": 8.859170172737724e-06, "loss": 1.07607841, "memory(GiB)": 142.32, "step": 46200, "train_speed(iter/s)": 0.285659 }, { "acc": 0.73457479, "epoch": 0.5169682641101605, "grad_norm": 4.9375, "learning_rate": 8.85799400253266e-06, "loss": 1.05297241, "memory(GiB)": 142.32, "step": 46220, "train_speed(iter/s)": 0.28569 }, { "acc": 0.73148823, "epoch": 0.517191963056119, "grad_norm": 6.96875, "learning_rate": 8.856817304498872e-06, "loss": 1.06845722, "memory(GiB)": 142.32, "step": 46240, "train_speed(iter/s)": 0.285733 }, { "acc": 0.71665249, "epoch": 0.5174156620020776, "grad_norm": 6.46875, "learning_rate": 8.85564007879735e-06, "loss": 1.12713394, "memory(GiB)": 142.32, "step": 46260, "train_speed(iter/s)": 0.285776 }, { "acc": 0.73476801, "epoch": 0.5176393609480361, "grad_norm": 5.78125, "learning_rate": 8.854462325589157e-06, "loss": 1.06185474, "memory(GiB)": 142.32, "step": 46280, "train_speed(iter/s)": 0.28582 }, { "acc": 0.72756243, "epoch": 0.5178630598939946, "grad_norm": 6.5625, "learning_rate": 8.853284045035424e-06, "loss": 1.0785779, "memory(GiB)": 142.32, "step": 46300, "train_speed(iter/s)": 0.285865 }, { "acc": 0.7302618, "epoch": 0.5180867588399531, "grad_norm": 6.0, "learning_rate": 8.852105237297357e-06, "loss": 1.07433405, "memory(GiB)": 142.32, "step": 46320, "train_speed(iter/s)": 0.285908 }, { "acc": 0.73460526, "epoch": 0.5183104577859117, "grad_norm": 6.0625, "learning_rate": 8.850925902536233e-06, "loss": 1.04479256, "memory(GiB)": 142.32, "step": 46340, "train_speed(iter/s)": 0.285947 }, { "acc": 0.72238483, "epoch": 0.5185341567318702, "grad_norm": 5.09375, "learning_rate": 8.849746040913404e-06, "loss": 1.12362843, "memory(GiB)": 142.32, "step": 46360, "train_speed(iter/s)": 0.28599 }, { "acc": 0.74144096, "epoch": 0.5187578556778287, "grad_norm": 5.59375, "learning_rate": 8.848565652590293e-06, "loss": 1.02699261, "memory(GiB)": 142.32, "step": 46380, "train_speed(iter/s)": 0.286033 }, { "acc": 0.73372045, "epoch": 0.5189815546237873, "grad_norm": 7.46875, "learning_rate": 8.847384737728391e-06, "loss": 1.05042706, "memory(GiB)": 142.32, "step": 46400, "train_speed(iter/s)": 0.286077 }, { "acc": 0.73399315, "epoch": 0.5192052535697458, "grad_norm": 5.46875, "learning_rate": 8.846203296489265e-06, "loss": 1.06386528, "memory(GiB)": 142.32, "step": 46420, "train_speed(iter/s)": 0.286121 }, { "acc": 0.73080349, "epoch": 0.5194289525157043, "grad_norm": 5.90625, "learning_rate": 8.845021329034553e-06, "loss": 1.07538376, "memory(GiB)": 142.32, "step": 46440, "train_speed(iter/s)": 0.286161 }, { "acc": 0.73942528, "epoch": 0.5196526514616628, "grad_norm": 6.5625, "learning_rate": 8.843838835525965e-06, "loss": 1.02766705, "memory(GiB)": 142.32, "step": 46460, "train_speed(iter/s)": 0.286206 }, { "acc": 0.73236647, "epoch": 0.5198763504076214, "grad_norm": 6.15625, "learning_rate": 8.842655816125284e-06, "loss": 1.06328964, "memory(GiB)": 142.32, "step": 46480, "train_speed(iter/s)": 0.286247 }, { "acc": 0.72873287, "epoch": 0.52010004935358, "grad_norm": 5.53125, "learning_rate": 8.841472270994363e-06, "loss": 1.06785927, "memory(GiB)": 142.32, "step": 46500, "train_speed(iter/s)": 0.286286 }, { "acc": 0.73521309, "epoch": 0.5203237482995385, "grad_norm": 5.90625, "learning_rate": 8.840288200295126e-06, "loss": 1.04257336, "memory(GiB)": 142.32, "step": 46520, "train_speed(iter/s)": 0.286332 }, { "acc": 0.72959394, "epoch": 0.5205474472454971, "grad_norm": 5.4375, "learning_rate": 8.839103604189575e-06, "loss": 1.07950077, "memory(GiB)": 142.32, "step": 46540, "train_speed(iter/s)": 0.286375 }, { "acc": 0.74175701, "epoch": 0.5207711461914556, "grad_norm": 5.96875, "learning_rate": 8.837918482839776e-06, "loss": 1.01073704, "memory(GiB)": 142.32, "step": 46560, "train_speed(iter/s)": 0.286416 }, { "acc": 0.72267699, "epoch": 0.5209948451374141, "grad_norm": 5.28125, "learning_rate": 8.836732836407873e-06, "loss": 1.09682531, "memory(GiB)": 142.32, "step": 46580, "train_speed(iter/s)": 0.28646 }, { "acc": 0.72946301, "epoch": 0.5212185440833726, "grad_norm": 6.28125, "learning_rate": 8.835546665056078e-06, "loss": 1.07851238, "memory(GiB)": 142.32, "step": 46600, "train_speed(iter/s)": 0.286501 }, { "acc": 0.72575021, "epoch": 0.5214422430293312, "grad_norm": 5.21875, "learning_rate": 8.834359968946678e-06, "loss": 1.10645523, "memory(GiB)": 142.32, "step": 46620, "train_speed(iter/s)": 0.286546 }, { "acc": 0.72646494, "epoch": 0.5216659419752897, "grad_norm": 5.21875, "learning_rate": 8.833172748242026e-06, "loss": 1.10361805, "memory(GiB)": 142.32, "step": 46640, "train_speed(iter/s)": 0.286587 }, { "acc": 0.73110533, "epoch": 0.5218896409212482, "grad_norm": 5.6875, "learning_rate": 8.831985003104557e-06, "loss": 1.06071672, "memory(GiB)": 142.32, "step": 46660, "train_speed(iter/s)": 0.286631 }, { "acc": 0.73311763, "epoch": 0.5221133398672068, "grad_norm": 6.5625, "learning_rate": 8.830796733696765e-06, "loss": 1.04603481, "memory(GiB)": 142.32, "step": 46680, "train_speed(iter/s)": 0.28667 }, { "acc": 0.72353601, "epoch": 0.5223370388131653, "grad_norm": 5.625, "learning_rate": 8.829607940181227e-06, "loss": 1.10320568, "memory(GiB)": 142.32, "step": 46700, "train_speed(iter/s)": 0.286708 }, { "acc": 0.71912413, "epoch": 0.5225607377591238, "grad_norm": 6.28125, "learning_rate": 8.828418622720582e-06, "loss": 1.12138557, "memory(GiB)": 142.32, "step": 46720, "train_speed(iter/s)": 0.286748 }, { "acc": 0.72437134, "epoch": 0.5227844367050823, "grad_norm": 5.375, "learning_rate": 8.827228781477553e-06, "loss": 1.08435917, "memory(GiB)": 142.32, "step": 46740, "train_speed(iter/s)": 0.286793 }, { "acc": 0.71916928, "epoch": 0.5230081356510409, "grad_norm": 5.96875, "learning_rate": 8.826038416614919e-06, "loss": 1.13843908, "memory(GiB)": 142.32, "step": 46760, "train_speed(iter/s)": 0.286839 }, { "acc": 0.72534509, "epoch": 0.5232318345969994, "grad_norm": 5.6875, "learning_rate": 8.824847528295546e-06, "loss": 1.09481392, "memory(GiB)": 142.32, "step": 46780, "train_speed(iter/s)": 0.286879 }, { "acc": 0.72560487, "epoch": 0.5234555335429579, "grad_norm": 6.1875, "learning_rate": 8.823656116682359e-06, "loss": 1.10632372, "memory(GiB)": 142.32, "step": 46800, "train_speed(iter/s)": 0.286921 }, { "acc": 0.72924771, "epoch": 0.5236792324889165, "grad_norm": 5.09375, "learning_rate": 8.822464181938364e-06, "loss": 1.06187878, "memory(GiB)": 142.32, "step": 46820, "train_speed(iter/s)": 0.286967 }, { "acc": 0.73113461, "epoch": 0.523902931434875, "grad_norm": 5.53125, "learning_rate": 8.821271724226633e-06, "loss": 1.06440792, "memory(GiB)": 142.32, "step": 46840, "train_speed(iter/s)": 0.286994 }, { "acc": 0.73188467, "epoch": 0.5241266303808335, "grad_norm": 6.1875, "learning_rate": 8.820078743710312e-06, "loss": 1.05761242, "memory(GiB)": 142.32, "step": 46860, "train_speed(iter/s)": 0.287036 }, { "acc": 0.72726212, "epoch": 0.524350329326792, "grad_norm": 5.125, "learning_rate": 8.818885240552617e-06, "loss": 1.09406338, "memory(GiB)": 142.32, "step": 46880, "train_speed(iter/s)": 0.287074 }, { "acc": 0.73971472, "epoch": 0.5245740282727506, "grad_norm": 6.0, "learning_rate": 8.817691214916837e-06, "loss": 1.0349062, "memory(GiB)": 142.32, "step": 46900, "train_speed(iter/s)": 0.287119 }, { "acc": 0.72666883, "epoch": 0.5247977272187091, "grad_norm": 6.34375, "learning_rate": 8.81649666696633e-06, "loss": 1.07115726, "memory(GiB)": 142.32, "step": 46920, "train_speed(iter/s)": 0.287161 }, { "acc": 0.72698851, "epoch": 0.5250214261646676, "grad_norm": 5.78125, "learning_rate": 8.815301596864529e-06, "loss": 1.0894372, "memory(GiB)": 142.32, "step": 46940, "train_speed(iter/s)": 0.287205 }, { "acc": 0.73218617, "epoch": 0.5252451251106262, "grad_norm": 5.15625, "learning_rate": 8.814106004774939e-06, "loss": 1.0521121, "memory(GiB)": 142.32, "step": 46960, "train_speed(iter/s)": 0.287247 }, { "acc": 0.7179718, "epoch": 0.5254688240565847, "grad_norm": 5.375, "learning_rate": 8.812909890861128e-06, "loss": 1.15562601, "memory(GiB)": 142.32, "step": 46980, "train_speed(iter/s)": 0.287288 }, { "acc": 0.73553982, "epoch": 0.5256925230025432, "grad_norm": 5.375, "learning_rate": 8.811713255286746e-06, "loss": 1.04723644, "memory(GiB)": 142.32, "step": 47000, "train_speed(iter/s)": 0.287329 }, { "acc": 0.72303877, "epoch": 0.5259162219485017, "grad_norm": 6.21875, "learning_rate": 8.810516098215508e-06, "loss": 1.11079683, "memory(GiB)": 142.32, "step": 47020, "train_speed(iter/s)": 0.287373 }, { "acc": 0.73246727, "epoch": 0.5261399208944603, "grad_norm": 4.96875, "learning_rate": 8.809318419811206e-06, "loss": 1.06076727, "memory(GiB)": 142.32, "step": 47040, "train_speed(iter/s)": 0.28741 }, { "acc": 0.72764568, "epoch": 0.5263636198404188, "grad_norm": 7.15625, "learning_rate": 8.808120220237693e-06, "loss": 1.09183121, "memory(GiB)": 142.32, "step": 47060, "train_speed(iter/s)": 0.287456 }, { "acc": 0.72192831, "epoch": 0.5265873187863773, "grad_norm": 5.40625, "learning_rate": 8.806921499658906e-06, "loss": 1.13336754, "memory(GiB)": 142.32, "step": 47080, "train_speed(iter/s)": 0.2875 }, { "acc": 0.71954713, "epoch": 0.5268110177323359, "grad_norm": 6.03125, "learning_rate": 8.805722258238842e-06, "loss": 1.12319746, "memory(GiB)": 142.32, "step": 47100, "train_speed(iter/s)": 0.28754 }, { "acc": 0.72826042, "epoch": 0.5270347166782944, "grad_norm": 4.875, "learning_rate": 8.804522496141579e-06, "loss": 1.08739948, "memory(GiB)": 142.32, "step": 47120, "train_speed(iter/s)": 0.28758 }, { "acc": 0.73723269, "epoch": 0.5272584156242529, "grad_norm": 4.9375, "learning_rate": 8.803322213531257e-06, "loss": 1.03039093, "memory(GiB)": 142.32, "step": 47140, "train_speed(iter/s)": 0.287623 }, { "acc": 0.73216, "epoch": 0.5274821145702114, "grad_norm": 5.1875, "learning_rate": 8.802121410572097e-06, "loss": 1.05000811, "memory(GiB)": 142.32, "step": 47160, "train_speed(iter/s)": 0.287661 }, { "acc": 0.72245808, "epoch": 0.52770581351617, "grad_norm": 7.15625, "learning_rate": 8.800920087428381e-06, "loss": 1.09722347, "memory(GiB)": 142.32, "step": 47180, "train_speed(iter/s)": 0.287703 }, { "acc": 0.73194895, "epoch": 0.5279295124621285, "grad_norm": 7.0625, "learning_rate": 8.79971824426447e-06, "loss": 1.06868153, "memory(GiB)": 142.32, "step": 47200, "train_speed(iter/s)": 0.28774 }, { "acc": 0.71606898, "epoch": 0.528153211408087, "grad_norm": 5.84375, "learning_rate": 8.798515881244794e-06, "loss": 1.16286144, "memory(GiB)": 142.32, "step": 47220, "train_speed(iter/s)": 0.287779 }, { "acc": 0.73386745, "epoch": 0.5283769103540455, "grad_norm": 5.0625, "learning_rate": 8.79731299853385e-06, "loss": 1.06698437, "memory(GiB)": 142.32, "step": 47240, "train_speed(iter/s)": 0.287819 }, { "acc": 0.71598949, "epoch": 0.5286006093000041, "grad_norm": 6.21875, "learning_rate": 8.796109596296213e-06, "loss": 1.13054304, "memory(GiB)": 142.32, "step": 47260, "train_speed(iter/s)": 0.287859 }, { "acc": 0.73650703, "epoch": 0.5288243082459626, "grad_norm": 6.3125, "learning_rate": 8.794905674696523e-06, "loss": 1.04760513, "memory(GiB)": 142.32, "step": 47280, "train_speed(iter/s)": 0.287897 }, { "acc": 0.72331495, "epoch": 0.5290480071919211, "grad_norm": 7.25, "learning_rate": 8.793701233899496e-06, "loss": 1.11260014, "memory(GiB)": 142.32, "step": 47300, "train_speed(iter/s)": 0.28794 }, { "acc": 0.74012423, "epoch": 0.5292717061378797, "grad_norm": 6.125, "learning_rate": 8.792496274069916e-06, "loss": 1.03120117, "memory(GiB)": 142.32, "step": 47320, "train_speed(iter/s)": 0.287981 }, { "acc": 0.72615776, "epoch": 0.5294954050838382, "grad_norm": 6.375, "learning_rate": 8.791290795372638e-06, "loss": 1.10926094, "memory(GiB)": 142.32, "step": 47340, "train_speed(iter/s)": 0.288021 }, { "acc": 0.7296278, "epoch": 0.5297191040297967, "grad_norm": 5.59375, "learning_rate": 8.79008479797259e-06, "loss": 1.07173691, "memory(GiB)": 142.32, "step": 47360, "train_speed(iter/s)": 0.288053 }, { "acc": 0.74295778, "epoch": 0.5299428029757552, "grad_norm": 6.1875, "learning_rate": 8.788878282034768e-06, "loss": 0.99534817, "memory(GiB)": 142.32, "step": 47380, "train_speed(iter/s)": 0.288095 }, { "acc": 0.7307889, "epoch": 0.5301665019217138, "grad_norm": 6.0, "learning_rate": 8.787671247724241e-06, "loss": 1.077458, "memory(GiB)": 142.32, "step": 47400, "train_speed(iter/s)": 0.288134 }, { "acc": 0.73647928, "epoch": 0.5303902008676723, "grad_norm": 5.5625, "learning_rate": 8.786463695206149e-06, "loss": 1.04235783, "memory(GiB)": 142.32, "step": 47420, "train_speed(iter/s)": 0.288175 }, { "acc": 0.72841988, "epoch": 0.5306138998136308, "grad_norm": 5.9375, "learning_rate": 8.785255624645703e-06, "loss": 1.06927681, "memory(GiB)": 142.32, "step": 47440, "train_speed(iter/s)": 0.288219 }, { "acc": 0.73125982, "epoch": 0.5308375987595894, "grad_norm": 6.125, "learning_rate": 8.784047036208183e-06, "loss": 1.07972355, "memory(GiB)": 142.32, "step": 47460, "train_speed(iter/s)": 0.288261 }, { "acc": 0.72592273, "epoch": 0.5310612977055479, "grad_norm": 6.28125, "learning_rate": 8.782837930058943e-06, "loss": 1.07158432, "memory(GiB)": 142.32, "step": 47480, "train_speed(iter/s)": 0.288299 }, { "acc": 0.72201195, "epoch": 0.5312849966515064, "grad_norm": 5.9375, "learning_rate": 8.781628306363405e-06, "loss": 1.11221333, "memory(GiB)": 142.32, "step": 47500, "train_speed(iter/s)": 0.288343 }, { "acc": 0.73106718, "epoch": 0.5315086955974649, "grad_norm": 5.59375, "learning_rate": 8.780418165287062e-06, "loss": 1.08225374, "memory(GiB)": 142.32, "step": 47520, "train_speed(iter/s)": 0.288383 }, { "acc": 0.72846756, "epoch": 0.5317323945434235, "grad_norm": 5.46875, "learning_rate": 8.77920750699548e-06, "loss": 1.07659264, "memory(GiB)": 142.32, "step": 47540, "train_speed(iter/s)": 0.288421 }, { "acc": 0.73201113, "epoch": 0.531956093489382, "grad_norm": 6.375, "learning_rate": 8.777996331654294e-06, "loss": 1.07037363, "memory(GiB)": 142.32, "step": 47560, "train_speed(iter/s)": 0.288456 }, { "acc": 0.73612599, "epoch": 0.5321797924353405, "grad_norm": 4.5, "learning_rate": 8.77678463942921e-06, "loss": 1.03310375, "memory(GiB)": 142.32, "step": 47580, "train_speed(iter/s)": 0.288492 }, { "acc": 0.72699318, "epoch": 0.532403491381299, "grad_norm": 5.53125, "learning_rate": 8.775572430486004e-06, "loss": 1.0788723, "memory(GiB)": 142.32, "step": 47600, "train_speed(iter/s)": 0.288532 }, { "acc": 0.73212061, "epoch": 0.5326271903272576, "grad_norm": 6.09375, "learning_rate": 8.774359704990523e-06, "loss": 1.0648078, "memory(GiB)": 142.32, "step": 47620, "train_speed(iter/s)": 0.288573 }, { "acc": 0.73264217, "epoch": 0.5328508892732161, "grad_norm": 6.4375, "learning_rate": 8.773146463108687e-06, "loss": 1.05936069, "memory(GiB)": 142.32, "step": 47640, "train_speed(iter/s)": 0.288612 }, { "acc": 0.72578983, "epoch": 0.5330745882191746, "grad_norm": 5.15625, "learning_rate": 8.771932705006485e-06, "loss": 1.11090727, "memory(GiB)": 142.32, "step": 47660, "train_speed(iter/s)": 0.288654 }, { "acc": 0.74255443, "epoch": 0.5332982871651332, "grad_norm": 6.6875, "learning_rate": 8.770718430849976e-06, "loss": 1.02146091, "memory(GiB)": 142.32, "step": 47680, "train_speed(iter/s)": 0.288694 }, { "acc": 0.73523302, "epoch": 0.5335219861110917, "grad_norm": 6.15625, "learning_rate": 8.769503640805288e-06, "loss": 1.04525433, "memory(GiB)": 142.32, "step": 47700, "train_speed(iter/s)": 0.288732 }, { "acc": 0.72551255, "epoch": 0.5337456850570502, "grad_norm": 4.96875, "learning_rate": 8.768288335038625e-06, "loss": 1.06614676, "memory(GiB)": 142.32, "step": 47720, "train_speed(iter/s)": 0.288777 }, { "acc": 0.72696462, "epoch": 0.5339693840030088, "grad_norm": 6.21875, "learning_rate": 8.767072513716254e-06, "loss": 1.08147755, "memory(GiB)": 142.32, "step": 47740, "train_speed(iter/s)": 0.288816 }, { "acc": 0.72745228, "epoch": 0.5341930829489673, "grad_norm": 6.53125, "learning_rate": 8.765856177004522e-06, "loss": 1.08161688, "memory(GiB)": 142.32, "step": 47760, "train_speed(iter/s)": 0.288859 }, { "acc": 0.71689329, "epoch": 0.5344167818949258, "grad_norm": 6.0, "learning_rate": 8.764639325069838e-06, "loss": 1.13833256, "memory(GiB)": 142.32, "step": 47780, "train_speed(iter/s)": 0.288899 }, { "acc": 0.7368021, "epoch": 0.5346404808408843, "grad_norm": 5.84375, "learning_rate": 8.763421958078684e-06, "loss": 1.04313269, "memory(GiB)": 142.32, "step": 47800, "train_speed(iter/s)": 0.28894 }, { "acc": 0.73324075, "epoch": 0.5348641797868429, "grad_norm": 6.5, "learning_rate": 8.762204076197615e-06, "loss": 1.06585388, "memory(GiB)": 142.32, "step": 47820, "train_speed(iter/s)": 0.288989 }, { "acc": 0.73455372, "epoch": 0.5350878787328014, "grad_norm": 7.21875, "learning_rate": 8.760985679593255e-06, "loss": 1.04854794, "memory(GiB)": 142.32, "step": 47840, "train_speed(iter/s)": 0.289035 }, { "acc": 0.723383, "epoch": 0.5353115776787599, "grad_norm": 6.0, "learning_rate": 8.759766768432297e-06, "loss": 1.09941177, "memory(GiB)": 142.32, "step": 47860, "train_speed(iter/s)": 0.289073 }, { "acc": 0.73582668, "epoch": 0.5355352766247184, "grad_norm": 5.78125, "learning_rate": 8.758547342881505e-06, "loss": 1.04574995, "memory(GiB)": 142.32, "step": 47880, "train_speed(iter/s)": 0.289111 }, { "acc": 0.7254385, "epoch": 0.535758975570677, "grad_norm": 6.28125, "learning_rate": 8.757327403107713e-06, "loss": 1.10422783, "memory(GiB)": 142.32, "step": 47900, "train_speed(iter/s)": 0.289149 }, { "acc": 0.72993183, "epoch": 0.5359826745166355, "grad_norm": 5.5625, "learning_rate": 8.756106949277829e-06, "loss": 1.07077055, "memory(GiB)": 142.32, "step": 47920, "train_speed(iter/s)": 0.289189 }, { "acc": 0.72931271, "epoch": 0.536206373462594, "grad_norm": 6.53125, "learning_rate": 8.754885981558829e-06, "loss": 1.07584295, "memory(GiB)": 142.32, "step": 47940, "train_speed(iter/s)": 0.289231 }, { "acc": 0.72826462, "epoch": 0.5364300724085526, "grad_norm": 6.34375, "learning_rate": 8.753664500117756e-06, "loss": 1.07671633, "memory(GiB)": 142.32, "step": 47960, "train_speed(iter/s)": 0.289277 }, { "acc": 0.72739401, "epoch": 0.5366537713545111, "grad_norm": 4.75, "learning_rate": 8.752442505121726e-06, "loss": 1.08134861, "memory(GiB)": 142.32, "step": 47980, "train_speed(iter/s)": 0.289322 }, { "acc": 0.73457661, "epoch": 0.5368774703004696, "grad_norm": 6.71875, "learning_rate": 8.751219996737927e-06, "loss": 1.03362913, "memory(GiB)": 142.32, "step": 48000, "train_speed(iter/s)": 0.289362 }, { "epoch": 0.5368774703004696, "eval_acc": 0.6916031824343004, "eval_loss": 1.092326283454895, "eval_runtime": 2342.0789, "eval_samples_per_second": 32.144, "eval_steps_per_second": 16.072, "step": 48000 }, { "acc": 0.72966738, "epoch": 0.5371011692464281, "grad_norm": 6.71875, "learning_rate": 8.749996975133614e-06, "loss": 1.06003056, "memory(GiB)": 142.32, "step": 48020, "train_speed(iter/s)": 0.28529 }, { "acc": 0.72906761, "epoch": 0.5373248681923867, "grad_norm": 6.9375, "learning_rate": 8.748773440476117e-06, "loss": 1.0800478, "memory(GiB)": 142.32, "step": 48040, "train_speed(iter/s)": 0.285333 }, { "acc": 0.72687306, "epoch": 0.5375485671383452, "grad_norm": 5.78125, "learning_rate": 8.74754939293283e-06, "loss": 1.09505787, "memory(GiB)": 142.32, "step": 48060, "train_speed(iter/s)": 0.285371 }, { "acc": 0.7258503, "epoch": 0.5377722660843037, "grad_norm": 6.0625, "learning_rate": 8.746324832671223e-06, "loss": 1.08405228, "memory(GiB)": 142.32, "step": 48080, "train_speed(iter/s)": 0.285414 }, { "acc": 0.73255663, "epoch": 0.5379959650302623, "grad_norm": 7.28125, "learning_rate": 8.745099759858828e-06, "loss": 1.06792431, "memory(GiB)": 142.32, "step": 48100, "train_speed(iter/s)": 0.285455 }, { "acc": 0.7351697, "epoch": 0.5382196639762208, "grad_norm": 6.1875, "learning_rate": 8.743874174663259e-06, "loss": 1.0477396, "memory(GiB)": 142.32, "step": 48120, "train_speed(iter/s)": 0.285495 }, { "acc": 0.73326893, "epoch": 0.5384433629221793, "grad_norm": 4.5625, "learning_rate": 8.74264807725219e-06, "loss": 1.06328144, "memory(GiB)": 142.32, "step": 48140, "train_speed(iter/s)": 0.285535 }, { "acc": 0.72695665, "epoch": 0.5386670618681378, "grad_norm": 6.96875, "learning_rate": 8.741421467793369e-06, "loss": 1.07851162, "memory(GiB)": 142.32, "step": 48160, "train_speed(iter/s)": 0.285578 }, { "acc": 0.72960653, "epoch": 0.5388907608140964, "grad_norm": 5.03125, "learning_rate": 8.740194346454614e-06, "loss": 1.07511063, "memory(GiB)": 142.32, "step": 48180, "train_speed(iter/s)": 0.285617 }, { "acc": 0.72862573, "epoch": 0.5391144597600549, "grad_norm": 6.0625, "learning_rate": 8.738966713403812e-06, "loss": 1.07715302, "memory(GiB)": 142.32, "step": 48200, "train_speed(iter/s)": 0.285651 }, { "acc": 0.72058144, "epoch": 0.5393381587060134, "grad_norm": 6.375, "learning_rate": 8.737738568808923e-06, "loss": 1.1171875, "memory(GiB)": 142.32, "step": 48220, "train_speed(iter/s)": 0.28569 }, { "acc": 0.7384964, "epoch": 0.539561857651972, "grad_norm": 6.25, "learning_rate": 8.736509912837971e-06, "loss": 1.03646908, "memory(GiB)": 142.32, "step": 48240, "train_speed(iter/s)": 0.285728 }, { "acc": 0.72448664, "epoch": 0.5397855565979305, "grad_norm": 5.0625, "learning_rate": 8.735280745659058e-06, "loss": 1.11488371, "memory(GiB)": 142.32, "step": 48260, "train_speed(iter/s)": 0.285768 }, { "acc": 0.7263916, "epoch": 0.540009255543889, "grad_norm": 6.3125, "learning_rate": 8.734051067440349e-06, "loss": 1.08325405, "memory(GiB)": 142.32, "step": 48280, "train_speed(iter/s)": 0.285807 }, { "acc": 0.72685461, "epoch": 0.5402329544898475, "grad_norm": 6.46875, "learning_rate": 8.732820878350081e-06, "loss": 1.10673828, "memory(GiB)": 142.32, "step": 48300, "train_speed(iter/s)": 0.285846 }, { "acc": 0.71915741, "epoch": 0.5404566534358061, "grad_norm": 5.71875, "learning_rate": 8.731590178556563e-06, "loss": 1.14482079, "memory(GiB)": 142.32, "step": 48320, "train_speed(iter/s)": 0.285884 }, { "acc": 0.72841558, "epoch": 0.5406803523817646, "grad_norm": 4.96875, "learning_rate": 8.730358968228173e-06, "loss": 1.05835152, "memory(GiB)": 142.32, "step": 48340, "train_speed(iter/s)": 0.285925 }, { "acc": 0.7276763, "epoch": 0.5409040513277231, "grad_norm": 6.8125, "learning_rate": 8.729127247533357e-06, "loss": 1.10305309, "memory(GiB)": 142.32, "step": 48360, "train_speed(iter/s)": 0.285966 }, { "acc": 0.72233658, "epoch": 0.5411277502736817, "grad_norm": 6.84375, "learning_rate": 8.727895016640631e-06, "loss": 1.12336321, "memory(GiB)": 142.32, "step": 48380, "train_speed(iter/s)": 0.286001 }, { "acc": 0.73809261, "epoch": 0.5413514492196402, "grad_norm": 5.9375, "learning_rate": 8.726662275718582e-06, "loss": 1.04633541, "memory(GiB)": 142.32, "step": 48400, "train_speed(iter/s)": 0.286041 }, { "acc": 0.72660084, "epoch": 0.5415751481655987, "grad_norm": 6.9375, "learning_rate": 8.72542902493587e-06, "loss": 1.09690237, "memory(GiB)": 142.32, "step": 48420, "train_speed(iter/s)": 0.286078 }, { "acc": 0.7362051, "epoch": 0.5417988471115572, "grad_norm": 6.15625, "learning_rate": 8.724195264461218e-06, "loss": 1.04346943, "memory(GiB)": 142.32, "step": 48440, "train_speed(iter/s)": 0.286117 }, { "acc": 0.73025799, "epoch": 0.5420225460575158, "grad_norm": 4.71875, "learning_rate": 8.722960994463421e-06, "loss": 1.07510557, "memory(GiB)": 142.32, "step": 48460, "train_speed(iter/s)": 0.286158 }, { "acc": 0.72802391, "epoch": 0.5422462450034743, "grad_norm": 7.15625, "learning_rate": 8.721726215111348e-06, "loss": 1.0753993, "memory(GiB)": 142.32, "step": 48480, "train_speed(iter/s)": 0.2862 }, { "acc": 0.72039738, "epoch": 0.5424699439494328, "grad_norm": 5.0, "learning_rate": 8.720490926573932e-06, "loss": 1.12330933, "memory(GiB)": 142.32, "step": 48500, "train_speed(iter/s)": 0.286242 }, { "acc": 0.71593318, "epoch": 0.5426936428953913, "grad_norm": 6.21875, "learning_rate": 8.71925512902018e-06, "loss": 1.14166183, "memory(GiB)": 142.32, "step": 48520, "train_speed(iter/s)": 0.286279 }, { "acc": 0.73742509, "epoch": 0.5429173418413499, "grad_norm": 5.90625, "learning_rate": 8.718018822619167e-06, "loss": 1.04203911, "memory(GiB)": 142.32, "step": 48540, "train_speed(iter/s)": 0.28632 }, { "acc": 0.72928429, "epoch": 0.5431410407873084, "grad_norm": 4.65625, "learning_rate": 8.716782007540035e-06, "loss": 1.05935173, "memory(GiB)": 142.32, "step": 48560, "train_speed(iter/s)": 0.286362 }, { "acc": 0.72593536, "epoch": 0.5433647397332669, "grad_norm": 5.90625, "learning_rate": 8.715544683952e-06, "loss": 1.08286839, "memory(GiB)": 142.32, "step": 48580, "train_speed(iter/s)": 0.286405 }, { "acc": 0.73738647, "epoch": 0.5435884386792255, "grad_norm": 6.28125, "learning_rate": 8.714306852024343e-06, "loss": 1.02792187, "memory(GiB)": 142.32, "step": 48600, "train_speed(iter/s)": 0.286446 }, { "acc": 0.74091544, "epoch": 0.543812137625184, "grad_norm": 6.5, "learning_rate": 8.71306851192642e-06, "loss": 1.03298225, "memory(GiB)": 142.32, "step": 48620, "train_speed(iter/s)": 0.286484 }, { "acc": 0.72300057, "epoch": 0.5440358365711425, "grad_norm": 6.53125, "learning_rate": 8.711829663827654e-06, "loss": 1.10357494, "memory(GiB)": 142.32, "step": 48640, "train_speed(iter/s)": 0.286524 }, { "acc": 0.72266665, "epoch": 0.544259535517101, "grad_norm": 6.15625, "learning_rate": 8.710590307897534e-06, "loss": 1.11513176, "memory(GiB)": 142.32, "step": 48660, "train_speed(iter/s)": 0.286566 }, { "acc": 0.73091559, "epoch": 0.5444832344630596, "grad_norm": 5.9375, "learning_rate": 8.709350444305625e-06, "loss": 1.06959438, "memory(GiB)": 142.32, "step": 48680, "train_speed(iter/s)": 0.286605 }, { "acc": 0.72469053, "epoch": 0.5447069334090181, "grad_norm": 4.96875, "learning_rate": 8.708110073221554e-06, "loss": 1.09954662, "memory(GiB)": 142.32, "step": 48700, "train_speed(iter/s)": 0.286646 }, { "acc": 0.73467579, "epoch": 0.5449306323549766, "grad_norm": 5.625, "learning_rate": 8.706869194815025e-06, "loss": 1.04874096, "memory(GiB)": 142.32, "step": 48720, "train_speed(iter/s)": 0.286682 }, { "acc": 0.72231255, "epoch": 0.5451543313009352, "grad_norm": 5.15625, "learning_rate": 8.705627809255807e-06, "loss": 1.09503002, "memory(GiB)": 142.32, "step": 48740, "train_speed(iter/s)": 0.286722 }, { "acc": 0.72267418, "epoch": 0.5453780302468937, "grad_norm": 5.125, "learning_rate": 8.70438591671374e-06, "loss": 1.10563622, "memory(GiB)": 142.32, "step": 48760, "train_speed(iter/s)": 0.286762 }, { "acc": 0.73699026, "epoch": 0.5456017291928522, "grad_norm": 6.6875, "learning_rate": 8.70314351735873e-06, "loss": 1.0588747, "memory(GiB)": 142.32, "step": 48780, "train_speed(iter/s)": 0.286799 }, { "acc": 0.73295279, "epoch": 0.5458254281388107, "grad_norm": 5.71875, "learning_rate": 8.701900611360758e-06, "loss": 1.06458588, "memory(GiB)": 142.32, "step": 48800, "train_speed(iter/s)": 0.286837 }, { "acc": 0.73913646, "epoch": 0.5460491270847693, "grad_norm": 5.53125, "learning_rate": 8.700657198889869e-06, "loss": 1.03516731, "memory(GiB)": 142.32, "step": 48820, "train_speed(iter/s)": 0.286871 }, { "acc": 0.72106562, "epoch": 0.5462728260307278, "grad_norm": 6.25, "learning_rate": 8.699413280116182e-06, "loss": 1.11934147, "memory(GiB)": 142.32, "step": 48840, "train_speed(iter/s)": 0.286913 }, { "acc": 0.72565632, "epoch": 0.5464965249766863, "grad_norm": 5.75, "learning_rate": 8.69816885520988e-06, "loss": 1.10538549, "memory(GiB)": 142.32, "step": 48860, "train_speed(iter/s)": 0.28695 }, { "acc": 0.73018427, "epoch": 0.5467202239226449, "grad_norm": 6.84375, "learning_rate": 8.69692392434122e-06, "loss": 1.0752367, "memory(GiB)": 142.32, "step": 48880, "train_speed(iter/s)": 0.286989 }, { "acc": 0.74260597, "epoch": 0.5469439228686034, "grad_norm": 5.84375, "learning_rate": 8.695678487680526e-06, "loss": 1.00976162, "memory(GiB)": 142.32, "step": 48900, "train_speed(iter/s)": 0.287029 }, { "acc": 0.74118462, "epoch": 0.5471676218145619, "grad_norm": 4.59375, "learning_rate": 8.694432545398193e-06, "loss": 1.0338274, "memory(GiB)": 142.32, "step": 48920, "train_speed(iter/s)": 0.287066 }, { "acc": 0.72657151, "epoch": 0.5473913207605204, "grad_norm": 5.90625, "learning_rate": 8.69318609766468e-06, "loss": 1.0742281, "memory(GiB)": 142.32, "step": 48940, "train_speed(iter/s)": 0.287108 }, { "acc": 0.73192348, "epoch": 0.547615019706479, "grad_norm": 7.78125, "learning_rate": 8.69193914465052e-06, "loss": 1.06940527, "memory(GiB)": 142.32, "step": 48960, "train_speed(iter/s)": 0.287144 }, { "acc": 0.72631817, "epoch": 0.5478387186524375, "grad_norm": 6.1875, "learning_rate": 8.690691686526318e-06, "loss": 1.08075266, "memory(GiB)": 142.32, "step": 48980, "train_speed(iter/s)": 0.287186 }, { "acc": 0.7228786, "epoch": 0.548062417598396, "grad_norm": 7.1875, "learning_rate": 8.68944372346274e-06, "loss": 1.11051769, "memory(GiB)": 142.32, "step": 49000, "train_speed(iter/s)": 0.287227 }, { "acc": 0.72819047, "epoch": 0.5482861165443547, "grad_norm": 5.25, "learning_rate": 8.688195255630527e-06, "loss": 1.07562675, "memory(GiB)": 142.32, "step": 49020, "train_speed(iter/s)": 0.287265 }, { "acc": 0.72900352, "epoch": 0.5485098154903132, "grad_norm": 5.3125, "learning_rate": 8.686946283200486e-06, "loss": 1.07464275, "memory(GiB)": 142.32, "step": 49040, "train_speed(iter/s)": 0.287303 }, { "acc": 0.74168558, "epoch": 0.5487335144362717, "grad_norm": 5.125, "learning_rate": 8.685696806343495e-06, "loss": 1.00025311, "memory(GiB)": 142.32, "step": 49060, "train_speed(iter/s)": 0.287344 }, { "acc": 0.72683563, "epoch": 0.5489572133822302, "grad_norm": 6.75, "learning_rate": 8.684446825230499e-06, "loss": 1.08211594, "memory(GiB)": 142.32, "step": 49080, "train_speed(iter/s)": 0.287385 }, { "acc": 0.72888317, "epoch": 0.5491809123281888, "grad_norm": 5.4375, "learning_rate": 8.683196340032516e-06, "loss": 1.07045841, "memory(GiB)": 142.32, "step": 49100, "train_speed(iter/s)": 0.287423 }, { "acc": 0.73998556, "epoch": 0.5494046112741473, "grad_norm": 6.5, "learning_rate": 8.681945350920628e-06, "loss": 1.0253828, "memory(GiB)": 142.32, "step": 49120, "train_speed(iter/s)": 0.28746 }, { "acc": 0.7331615, "epoch": 0.5496283102201058, "grad_norm": 5.5, "learning_rate": 8.680693858065989e-06, "loss": 1.05146294, "memory(GiB)": 142.32, "step": 49140, "train_speed(iter/s)": 0.287502 }, { "acc": 0.7283577, "epoch": 0.5498520091660644, "grad_norm": 6.96875, "learning_rate": 8.67944186163982e-06, "loss": 1.08795719, "memory(GiB)": 142.32, "step": 49160, "train_speed(iter/s)": 0.287542 }, { "acc": 0.72872915, "epoch": 0.5500757081120229, "grad_norm": 5.78125, "learning_rate": 8.678189361813414e-06, "loss": 1.06973171, "memory(GiB)": 142.32, "step": 49180, "train_speed(iter/s)": 0.287581 }, { "acc": 0.72670665, "epoch": 0.5502994070579814, "grad_norm": 6.6875, "learning_rate": 8.67693635875813e-06, "loss": 1.10022087, "memory(GiB)": 142.32, "step": 49200, "train_speed(iter/s)": 0.287617 }, { "acc": 0.7415659, "epoch": 0.5505231060039399, "grad_norm": 7.3125, "learning_rate": 8.675682852645396e-06, "loss": 1.01818275, "memory(GiB)": 142.32, "step": 49220, "train_speed(iter/s)": 0.287653 }, { "acc": 0.71972361, "epoch": 0.5507468049498985, "grad_norm": 5.90625, "learning_rate": 8.67442884364671e-06, "loss": 1.12617893, "memory(GiB)": 142.32, "step": 49240, "train_speed(iter/s)": 0.287694 }, { "acc": 0.73451014, "epoch": 0.550970503895857, "grad_norm": 6.59375, "learning_rate": 8.673174331933639e-06, "loss": 1.05506763, "memory(GiB)": 142.32, "step": 49260, "train_speed(iter/s)": 0.287732 }, { "acc": 0.72705908, "epoch": 0.5511942028418155, "grad_norm": 7.25, "learning_rate": 8.671919317677819e-06, "loss": 1.09216995, "memory(GiB)": 142.32, "step": 49280, "train_speed(iter/s)": 0.287772 }, { "acc": 0.72705564, "epoch": 0.5514179017877741, "grad_norm": 5.3125, "learning_rate": 8.67066380105095e-06, "loss": 1.08779964, "memory(GiB)": 142.32, "step": 49300, "train_speed(iter/s)": 0.287813 }, { "acc": 0.71698399, "epoch": 0.5516416007337326, "grad_norm": 6.5625, "learning_rate": 8.669407782224808e-06, "loss": 1.13588161, "memory(GiB)": 142.32, "step": 49320, "train_speed(iter/s)": 0.287852 }, { "acc": 0.72978177, "epoch": 0.5518652996796911, "grad_norm": 5.8125, "learning_rate": 8.668151261371234e-06, "loss": 1.07582235, "memory(GiB)": 142.32, "step": 49340, "train_speed(iter/s)": 0.287894 }, { "acc": 0.73328791, "epoch": 0.5520889986256496, "grad_norm": 6.78125, "learning_rate": 8.666894238662136e-06, "loss": 1.07176399, "memory(GiB)": 142.32, "step": 49360, "train_speed(iter/s)": 0.287937 }, { "acc": 0.73366733, "epoch": 0.5523126975716082, "grad_norm": 8.25, "learning_rate": 8.665636714269497e-06, "loss": 1.06264257, "memory(GiB)": 142.32, "step": 49380, "train_speed(iter/s)": 0.28798 }, { "acc": 0.72281513, "epoch": 0.5525363965175667, "grad_norm": 6.71875, "learning_rate": 8.66437868836536e-06, "loss": 1.11307106, "memory(GiB)": 142.32, "step": 49400, "train_speed(iter/s)": 0.28802 }, { "acc": 0.73043532, "epoch": 0.5527600954635252, "grad_norm": 7.53125, "learning_rate": 8.663120161121841e-06, "loss": 1.07567511, "memory(GiB)": 142.32, "step": 49420, "train_speed(iter/s)": 0.28806 }, { "acc": 0.73737373, "epoch": 0.5529837944094838, "grad_norm": 6.15625, "learning_rate": 8.661861132711127e-06, "loss": 1.0433054, "memory(GiB)": 142.32, "step": 49440, "train_speed(iter/s)": 0.288101 }, { "acc": 0.72255735, "epoch": 0.5532074933554423, "grad_norm": 6.09375, "learning_rate": 8.66060160330547e-06, "loss": 1.142377, "memory(GiB)": 142.32, "step": 49460, "train_speed(iter/s)": 0.288145 }, { "acc": 0.72676392, "epoch": 0.5534311923014008, "grad_norm": 5.78125, "learning_rate": 8.659341573077192e-06, "loss": 1.10272856, "memory(GiB)": 142.32, "step": 49480, "train_speed(iter/s)": 0.288186 }, { "acc": 0.72303534, "epoch": 0.5536548912473593, "grad_norm": 5.875, "learning_rate": 8.658081042198682e-06, "loss": 1.09783154, "memory(GiB)": 142.32, "step": 49500, "train_speed(iter/s)": 0.288227 }, { "acc": 0.74058313, "epoch": 0.5538785901933179, "grad_norm": 6.15625, "learning_rate": 8.6568200108424e-06, "loss": 1.03449917, "memory(GiB)": 142.32, "step": 49520, "train_speed(iter/s)": 0.288269 }, { "acc": 0.73460412, "epoch": 0.5541022891392764, "grad_norm": 6.40625, "learning_rate": 8.655558479180874e-06, "loss": 1.04652348, "memory(GiB)": 142.32, "step": 49540, "train_speed(iter/s)": 0.288309 }, { "acc": 0.72862992, "epoch": 0.5543259880852349, "grad_norm": 4.46875, "learning_rate": 8.654296447386696e-06, "loss": 1.08824501, "memory(GiB)": 142.32, "step": 49560, "train_speed(iter/s)": 0.288349 }, { "acc": 0.7198781, "epoch": 0.5545496870311935, "grad_norm": 5.375, "learning_rate": 8.653033915632531e-06, "loss": 1.12300587, "memory(GiB)": 142.32, "step": 49580, "train_speed(iter/s)": 0.288392 }, { "acc": 0.72411213, "epoch": 0.554773385977152, "grad_norm": 5.6875, "learning_rate": 8.651770884091115e-06, "loss": 1.11189365, "memory(GiB)": 142.32, "step": 49600, "train_speed(iter/s)": 0.288436 }, { "acc": 0.74882622, "epoch": 0.5549970849231105, "grad_norm": 6.53125, "learning_rate": 8.650507352935245e-06, "loss": 0.98424702, "memory(GiB)": 142.32, "step": 49620, "train_speed(iter/s)": 0.288479 }, { "acc": 0.72459497, "epoch": 0.555220783869069, "grad_norm": 5.8125, "learning_rate": 8.649243322337793e-06, "loss": 1.10925579, "memory(GiB)": 142.32, "step": 49640, "train_speed(iter/s)": 0.288518 }, { "acc": 0.72480264, "epoch": 0.5554444828150276, "grad_norm": 5.0625, "learning_rate": 8.647978792471692e-06, "loss": 1.07807693, "memory(GiB)": 142.32, "step": 49660, "train_speed(iter/s)": 0.288558 }, { "acc": 0.73975654, "epoch": 0.5556681817609861, "grad_norm": 6.75, "learning_rate": 8.646713763509953e-06, "loss": 1.01868267, "memory(GiB)": 142.32, "step": 49680, "train_speed(iter/s)": 0.288595 }, { "acc": 0.73040409, "epoch": 0.5558918807069446, "grad_norm": 5.8125, "learning_rate": 8.645448235625646e-06, "loss": 1.08203001, "memory(GiB)": 142.32, "step": 49700, "train_speed(iter/s)": 0.288632 }, { "acc": 0.72247, "epoch": 0.5561155796529031, "grad_norm": 5.25, "learning_rate": 8.644182208991915e-06, "loss": 1.10945206, "memory(GiB)": 142.32, "step": 49720, "train_speed(iter/s)": 0.288669 }, { "acc": 0.71122055, "epoch": 0.5563392785988617, "grad_norm": 5.59375, "learning_rate": 8.642915683781972e-06, "loss": 1.16899376, "memory(GiB)": 142.32, "step": 49740, "train_speed(iter/s)": 0.288709 }, { "acc": 0.72495852, "epoch": 0.5565629775448202, "grad_norm": 6.0, "learning_rate": 8.641648660169092e-06, "loss": 1.10414982, "memory(GiB)": 142.32, "step": 49760, "train_speed(iter/s)": 0.288748 }, { "acc": 0.731106, "epoch": 0.5567866764907787, "grad_norm": 5.125, "learning_rate": 8.640381138326626e-06, "loss": 1.07601585, "memory(GiB)": 142.32, "step": 49780, "train_speed(iter/s)": 0.288782 }, { "acc": 0.73571458, "epoch": 0.5570103754367373, "grad_norm": 7.0, "learning_rate": 8.639113118427987e-06, "loss": 1.04735699, "memory(GiB)": 142.32, "step": 49800, "train_speed(iter/s)": 0.288823 }, { "acc": 0.73124995, "epoch": 0.5572340743826958, "grad_norm": 5.375, "learning_rate": 8.637844600646656e-06, "loss": 1.06761074, "memory(GiB)": 142.32, "step": 49820, "train_speed(iter/s)": 0.28886 }, { "acc": 0.72876768, "epoch": 0.5574577733286543, "grad_norm": 5.5625, "learning_rate": 8.636575585156189e-06, "loss": 1.08828602, "memory(GiB)": 142.32, "step": 49840, "train_speed(iter/s)": 0.288894 }, { "acc": 0.72802172, "epoch": 0.5576814722746128, "grad_norm": 8.0625, "learning_rate": 8.635306072130204e-06, "loss": 1.08031673, "memory(GiB)": 142.32, "step": 49860, "train_speed(iter/s)": 0.288933 }, { "acc": 0.73782339, "epoch": 0.5579051712205714, "grad_norm": 5.65625, "learning_rate": 8.634036061742386e-06, "loss": 1.03755398, "memory(GiB)": 142.32, "step": 49880, "train_speed(iter/s)": 0.288974 }, { "acc": 0.72735081, "epoch": 0.5581288701665299, "grad_norm": 6.28125, "learning_rate": 8.632765554166494e-06, "loss": 1.08355904, "memory(GiB)": 142.32, "step": 49900, "train_speed(iter/s)": 0.289013 }, { "acc": 0.71836309, "epoch": 0.5583525691124884, "grad_norm": 7.09375, "learning_rate": 8.631494549576349e-06, "loss": 1.1195631, "memory(GiB)": 142.32, "step": 49920, "train_speed(iter/s)": 0.28905 }, { "acc": 0.7348856, "epoch": 0.558576268058447, "grad_norm": 5.40625, "learning_rate": 8.630223048145844e-06, "loss": 1.04897461, "memory(GiB)": 142.32, "step": 49940, "train_speed(iter/s)": 0.289088 }, { "acc": 0.73028326, "epoch": 0.5587999670044055, "grad_norm": 5.21875, "learning_rate": 8.628951050048938e-06, "loss": 1.07098026, "memory(GiB)": 142.32, "step": 49960, "train_speed(iter/s)": 0.289129 }, { "acc": 0.73619862, "epoch": 0.559023665950364, "grad_norm": 6.84375, "learning_rate": 8.627678555459658e-06, "loss": 1.04668417, "memory(GiB)": 142.32, "step": 49980, "train_speed(iter/s)": 0.289167 }, { "acc": 0.72924166, "epoch": 0.5592473648963225, "grad_norm": 5.3125, "learning_rate": 8.626405564552102e-06, "loss": 1.08479271, "memory(GiB)": 142.32, "step": 50000, "train_speed(iter/s)": 0.289204 }, { "epoch": 0.5592473648963225, "eval_acc": 0.6920009078471993, "eval_loss": 1.0911203622817993, "eval_runtime": 2341.012, "eval_samples_per_second": 32.158, "eval_steps_per_second": 16.079, "step": 50000 }, { "acc": 0.71667252, "epoch": 0.5594710638422811, "grad_norm": 6.3125, "learning_rate": 8.62513207750043e-06, "loss": 1.12387857, "memory(GiB)": 142.32, "step": 50020, "train_speed(iter/s)": 0.285294 }, { "acc": 0.72621651, "epoch": 0.5596947627882396, "grad_norm": 6.0, "learning_rate": 8.623858094478876e-06, "loss": 1.10345535, "memory(GiB)": 142.32, "step": 50040, "train_speed(iter/s)": 0.285334 }, { "acc": 0.72172389, "epoch": 0.5599184617341981, "grad_norm": 4.53125, "learning_rate": 8.622583615661737e-06, "loss": 1.12463379, "memory(GiB)": 142.32, "step": 50060, "train_speed(iter/s)": 0.285374 }, { "acc": 0.72589364, "epoch": 0.5601421606801567, "grad_norm": 6.28125, "learning_rate": 8.62130864122338e-06, "loss": 1.09007854, "memory(GiB)": 142.32, "step": 50080, "train_speed(iter/s)": 0.285413 }, { "acc": 0.73106432, "epoch": 0.5603658596261152, "grad_norm": 6.59375, "learning_rate": 8.620033171338242e-06, "loss": 1.07800713, "memory(GiB)": 142.32, "step": 50100, "train_speed(iter/s)": 0.285448 }, { "acc": 0.72467375, "epoch": 0.5605895585720737, "grad_norm": 5.65625, "learning_rate": 8.618757206180822e-06, "loss": 1.11004314, "memory(GiB)": 142.32, "step": 50120, "train_speed(iter/s)": 0.285487 }, { "acc": 0.73051758, "epoch": 0.5608132575180322, "grad_norm": 4.90625, "learning_rate": 8.617480745925694e-06, "loss": 1.06559629, "memory(GiB)": 142.32, "step": 50140, "train_speed(iter/s)": 0.285523 }, { "acc": 0.72045794, "epoch": 0.5610369564639908, "grad_norm": 6.0625, "learning_rate": 8.616203790747493e-06, "loss": 1.12892361, "memory(GiB)": 142.32, "step": 50160, "train_speed(iter/s)": 0.285562 }, { "acc": 0.72924604, "epoch": 0.5612606554099493, "grad_norm": 5.75, "learning_rate": 8.614926340820925e-06, "loss": 1.07790184, "memory(GiB)": 142.32, "step": 50180, "train_speed(iter/s)": 0.285599 }, { "acc": 0.7314311, "epoch": 0.5614843543559078, "grad_norm": 5.625, "learning_rate": 8.613648396320768e-06, "loss": 1.05868721, "memory(GiB)": 142.32, "step": 50200, "train_speed(iter/s)": 0.28564 }, { "acc": 0.73542137, "epoch": 0.5617080533018664, "grad_norm": 5.59375, "learning_rate": 8.612369957421858e-06, "loss": 1.05132408, "memory(GiB)": 142.32, "step": 50220, "train_speed(iter/s)": 0.28568 }, { "acc": 0.72731123, "epoch": 0.5619317522478249, "grad_norm": 6.8125, "learning_rate": 8.611091024299103e-06, "loss": 1.07106819, "memory(GiB)": 142.32, "step": 50240, "train_speed(iter/s)": 0.285718 }, { "acc": 0.72058325, "epoch": 0.5621554511937834, "grad_norm": 5.40625, "learning_rate": 8.609811597127484e-06, "loss": 1.11332455, "memory(GiB)": 142.32, "step": 50260, "train_speed(iter/s)": 0.28576 }, { "acc": 0.72685714, "epoch": 0.5623791501397419, "grad_norm": 5.28125, "learning_rate": 8.608531676082041e-06, "loss": 1.07847071, "memory(GiB)": 142.32, "step": 50280, "train_speed(iter/s)": 0.2858 }, { "acc": 0.72858524, "epoch": 0.5626028490857005, "grad_norm": 6.625, "learning_rate": 8.607251261337888e-06, "loss": 1.08023806, "memory(GiB)": 142.32, "step": 50300, "train_speed(iter/s)": 0.285843 }, { "acc": 0.72433395, "epoch": 0.562826548031659, "grad_norm": 5.625, "learning_rate": 8.6059703530702e-06, "loss": 1.09620266, "memory(GiB)": 142.32, "step": 50320, "train_speed(iter/s)": 0.285883 }, { "acc": 0.72987709, "epoch": 0.5630502469776175, "grad_norm": 5.0625, "learning_rate": 8.60468895145423e-06, "loss": 1.07691345, "memory(GiB)": 142.32, "step": 50340, "train_speed(iter/s)": 0.285927 }, { "acc": 0.7314075, "epoch": 0.563273945923576, "grad_norm": 5.4375, "learning_rate": 8.603407056665287e-06, "loss": 1.05444622, "memory(GiB)": 142.32, "step": 50360, "train_speed(iter/s)": 0.285965 }, { "acc": 0.72908673, "epoch": 0.5634976448695346, "grad_norm": 5.4375, "learning_rate": 8.602124668878755e-06, "loss": 1.0938488, "memory(GiB)": 142.32, "step": 50380, "train_speed(iter/s)": 0.286 }, { "acc": 0.70534635, "epoch": 0.5637213438154931, "grad_norm": 5.59375, "learning_rate": 8.600841788270082e-06, "loss": 1.18296318, "memory(GiB)": 142.32, "step": 50400, "train_speed(iter/s)": 0.286033 }, { "acc": 0.72394724, "epoch": 0.5639450427614516, "grad_norm": 6.15625, "learning_rate": 8.599558415014784e-06, "loss": 1.10232954, "memory(GiB)": 142.32, "step": 50420, "train_speed(iter/s)": 0.286072 }, { "acc": 0.72131329, "epoch": 0.5641687417074102, "grad_norm": 5.40625, "learning_rate": 8.598274549288446e-06, "loss": 1.12268467, "memory(GiB)": 142.32, "step": 50440, "train_speed(iter/s)": 0.286112 }, { "acc": 0.73420897, "epoch": 0.5643924406533687, "grad_norm": 5.15625, "learning_rate": 8.596990191266716e-06, "loss": 1.04207211, "memory(GiB)": 142.32, "step": 50460, "train_speed(iter/s)": 0.286154 }, { "acc": 0.73433332, "epoch": 0.5646161395993272, "grad_norm": 5.59375, "learning_rate": 8.595705341125318e-06, "loss": 1.05209522, "memory(GiB)": 142.32, "step": 50480, "train_speed(iter/s)": 0.286191 }, { "acc": 0.72665863, "epoch": 0.5648398385452857, "grad_norm": 5.5, "learning_rate": 8.594419999040034e-06, "loss": 1.07966852, "memory(GiB)": 142.32, "step": 50500, "train_speed(iter/s)": 0.286224 }, { "acc": 0.72986326, "epoch": 0.5650635374912443, "grad_norm": 4.8125, "learning_rate": 8.593134165186718e-06, "loss": 1.07768717, "memory(GiB)": 142.32, "step": 50520, "train_speed(iter/s)": 0.28626 }, { "acc": 0.73205357, "epoch": 0.5652872364372028, "grad_norm": 5.21875, "learning_rate": 8.59184783974129e-06, "loss": 1.07772293, "memory(GiB)": 142.32, "step": 50540, "train_speed(iter/s)": 0.286302 }, { "acc": 0.73202105, "epoch": 0.5655109353831613, "grad_norm": 5.0, "learning_rate": 8.590561022879738e-06, "loss": 1.05936012, "memory(GiB)": 142.32, "step": 50560, "train_speed(iter/s)": 0.286338 }, { "acc": 0.7178637, "epoch": 0.5657346343291199, "grad_norm": 8.0, "learning_rate": 8.589273714778118e-06, "loss": 1.14294004, "memory(GiB)": 142.32, "step": 50580, "train_speed(iter/s)": 0.286377 }, { "acc": 0.7387517, "epoch": 0.5659583332750784, "grad_norm": 6.15625, "learning_rate": 8.587985915612548e-06, "loss": 1.03472328, "memory(GiB)": 142.32, "step": 50600, "train_speed(iter/s)": 0.286414 }, { "acc": 0.72481861, "epoch": 0.5661820322210369, "grad_norm": 6.28125, "learning_rate": 8.586697625559224e-06, "loss": 1.11070709, "memory(GiB)": 142.32, "step": 50620, "train_speed(iter/s)": 0.286454 }, { "acc": 0.72241001, "epoch": 0.5664057311669954, "grad_norm": 6.5625, "learning_rate": 8.5854088447944e-06, "loss": 1.0996418, "memory(GiB)": 142.32, "step": 50640, "train_speed(iter/s)": 0.286496 }, { "acc": 0.73075142, "epoch": 0.566629430112954, "grad_norm": 5.125, "learning_rate": 8.584119573494396e-06, "loss": 1.09268322, "memory(GiB)": 142.32, "step": 50660, "train_speed(iter/s)": 0.286534 }, { "acc": 0.72548585, "epoch": 0.5668531290589125, "grad_norm": 5.75, "learning_rate": 8.582829811835607e-06, "loss": 1.09956732, "memory(GiB)": 142.32, "step": 50680, "train_speed(iter/s)": 0.286571 }, { "acc": 0.72947879, "epoch": 0.567076828004871, "grad_norm": 6.28125, "learning_rate": 8.58153955999449e-06, "loss": 1.08462286, "memory(GiB)": 142.32, "step": 50700, "train_speed(iter/s)": 0.286611 }, { "acc": 0.74085569, "epoch": 0.5673005269508296, "grad_norm": 5.8125, "learning_rate": 8.580248818147568e-06, "loss": 1.02437401, "memory(GiB)": 142.32, "step": 50720, "train_speed(iter/s)": 0.286652 }, { "acc": 0.73709812, "epoch": 0.5675242258967881, "grad_norm": 5.625, "learning_rate": 8.578957586471434e-06, "loss": 1.04956627, "memory(GiB)": 142.32, "step": 50740, "train_speed(iter/s)": 0.28669 }, { "acc": 0.73811264, "epoch": 0.5677479248427466, "grad_norm": 5.1875, "learning_rate": 8.577665865142747e-06, "loss": 1.04953785, "memory(GiB)": 142.32, "step": 50760, "train_speed(iter/s)": 0.286727 }, { "acc": 0.72380486, "epoch": 0.5679716237887051, "grad_norm": 5.1875, "learning_rate": 8.576373654338233e-06, "loss": 1.09422092, "memory(GiB)": 142.32, "step": 50780, "train_speed(iter/s)": 0.286768 }, { "acc": 0.72475243, "epoch": 0.5681953227346637, "grad_norm": 4.375, "learning_rate": 8.575080954234686e-06, "loss": 1.11019001, "memory(GiB)": 142.32, "step": 50800, "train_speed(iter/s)": 0.286803 }, { "acc": 0.73540936, "epoch": 0.5684190216806222, "grad_norm": 5.1875, "learning_rate": 8.573787765008964e-06, "loss": 1.05141068, "memory(GiB)": 142.32, "step": 50820, "train_speed(iter/s)": 0.28684 }, { "acc": 0.72774448, "epoch": 0.5686427206265807, "grad_norm": 5.65625, "learning_rate": 8.572494086837994e-06, "loss": 1.09176979, "memory(GiB)": 142.32, "step": 50840, "train_speed(iter/s)": 0.286882 }, { "acc": 0.72861557, "epoch": 0.5688664195725393, "grad_norm": 5.875, "learning_rate": 8.571199919898771e-06, "loss": 1.0724741, "memory(GiB)": 142.32, "step": 50860, "train_speed(iter/s)": 0.286923 }, { "acc": 0.72684588, "epoch": 0.5690901185184978, "grad_norm": 5.46875, "learning_rate": 8.569905264368354e-06, "loss": 1.08784466, "memory(GiB)": 142.32, "step": 50880, "train_speed(iter/s)": 0.286962 }, { "acc": 0.73554821, "epoch": 0.5693138174644563, "grad_norm": 4.84375, "learning_rate": 8.568610120423872e-06, "loss": 1.02950249, "memory(GiB)": 142.32, "step": 50900, "train_speed(iter/s)": 0.287002 }, { "acc": 0.73139391, "epoch": 0.5695375164104148, "grad_norm": 5.8125, "learning_rate": 8.567314488242518e-06, "loss": 1.07581434, "memory(GiB)": 142.32, "step": 50920, "train_speed(iter/s)": 0.287039 }, { "acc": 0.73165965, "epoch": 0.5697612153563734, "grad_norm": 7.34375, "learning_rate": 8.566018368001555e-06, "loss": 1.06906624, "memory(GiB)": 142.32, "step": 50940, "train_speed(iter/s)": 0.287075 }, { "acc": 0.73042746, "epoch": 0.5699849143023319, "grad_norm": 5.84375, "learning_rate": 8.564721759878306e-06, "loss": 1.08269577, "memory(GiB)": 142.32, "step": 50960, "train_speed(iter/s)": 0.287115 }, { "acc": 0.73827305, "epoch": 0.5702086132482904, "grad_norm": 5.40625, "learning_rate": 8.56342466405017e-06, "loss": 1.03158474, "memory(GiB)": 142.32, "step": 50980, "train_speed(iter/s)": 0.287153 }, { "acc": 0.73838997, "epoch": 0.570432312194249, "grad_norm": 6.21875, "learning_rate": 8.562127080694607e-06, "loss": 1.02375126, "memory(GiB)": 142.32, "step": 51000, "train_speed(iter/s)": 0.287191 }, { "acc": 0.72490282, "epoch": 0.5706560111402075, "grad_norm": 5.84375, "learning_rate": 8.560829009989146e-06, "loss": 1.09859333, "memory(GiB)": 142.32, "step": 51020, "train_speed(iter/s)": 0.287232 }, { "acc": 0.71828856, "epoch": 0.570879710086166, "grad_norm": 5.09375, "learning_rate": 8.55953045211138e-06, "loss": 1.1361269, "memory(GiB)": 142.32, "step": 51040, "train_speed(iter/s)": 0.287273 }, { "acc": 0.72318091, "epoch": 0.5711034090321245, "grad_norm": 6.3125, "learning_rate": 8.558231407238969e-06, "loss": 1.10751505, "memory(GiB)": 142.32, "step": 51060, "train_speed(iter/s)": 0.287315 }, { "acc": 0.72753639, "epoch": 0.5713271079780831, "grad_norm": 5.90625, "learning_rate": 8.556931875549644e-06, "loss": 1.10024462, "memory(GiB)": 142.32, "step": 51080, "train_speed(iter/s)": 0.287354 }, { "acc": 0.72731366, "epoch": 0.5715508069240416, "grad_norm": 6.4375, "learning_rate": 8.555631857221198e-06, "loss": 1.08217659, "memory(GiB)": 142.32, "step": 51100, "train_speed(iter/s)": 0.287394 }, { "acc": 0.72414045, "epoch": 0.5717745058700001, "grad_norm": 5.5, "learning_rate": 8.55433135243149e-06, "loss": 1.11824837, "memory(GiB)": 142.32, "step": 51120, "train_speed(iter/s)": 0.287435 }, { "acc": 0.72752156, "epoch": 0.5719982048159586, "grad_norm": 5.125, "learning_rate": 8.553030361358455e-06, "loss": 1.0892561, "memory(GiB)": 142.32, "step": 51140, "train_speed(iter/s)": 0.287475 }, { "acc": 0.73324156, "epoch": 0.5722219037619172, "grad_norm": 5.9375, "learning_rate": 8.551728884180077e-06, "loss": 1.0575737, "memory(GiB)": 142.32, "step": 51160, "train_speed(iter/s)": 0.287513 }, { "acc": 0.73094282, "epoch": 0.5724456027078757, "grad_norm": 6.125, "learning_rate": 8.550426921074425e-06, "loss": 1.06832981, "memory(GiB)": 142.32, "step": 51180, "train_speed(iter/s)": 0.287545 }, { "acc": 0.73364458, "epoch": 0.5726693016538342, "grad_norm": 6.15625, "learning_rate": 8.549124472219621e-06, "loss": 1.03251429, "memory(GiB)": 142.32, "step": 51200, "train_speed(iter/s)": 0.287583 }, { "acc": 0.73051505, "epoch": 0.5728930005997928, "grad_norm": 7.1875, "learning_rate": 8.547821537793862e-06, "loss": 1.06836348, "memory(GiB)": 142.32, "step": 51220, "train_speed(iter/s)": 0.28762 }, { "acc": 0.72348833, "epoch": 0.5731166995457513, "grad_norm": 4.84375, "learning_rate": 8.546518117975406e-06, "loss": 1.10468159, "memory(GiB)": 142.32, "step": 51240, "train_speed(iter/s)": 0.287657 }, { "acc": 0.74008961, "epoch": 0.5733403984917098, "grad_norm": 5.4375, "learning_rate": 8.54521421294258e-06, "loss": 1.03286514, "memory(GiB)": 142.32, "step": 51260, "train_speed(iter/s)": 0.287696 }, { "acc": 0.72452726, "epoch": 0.5735640974376683, "grad_norm": 4.65625, "learning_rate": 8.543909822873776e-06, "loss": 1.09524841, "memory(GiB)": 142.32, "step": 51280, "train_speed(iter/s)": 0.287735 }, { "acc": 0.72708225, "epoch": 0.5737877963836269, "grad_norm": 5.34375, "learning_rate": 8.542604947947454e-06, "loss": 1.08413963, "memory(GiB)": 142.32, "step": 51300, "train_speed(iter/s)": 0.287769 }, { "acc": 0.73122158, "epoch": 0.5740114953295854, "grad_norm": 5.46875, "learning_rate": 8.54129958834214e-06, "loss": 1.06138744, "memory(GiB)": 142.32, "step": 51320, "train_speed(iter/s)": 0.287808 }, { "acc": 0.73691397, "epoch": 0.5742351942755439, "grad_norm": 7.1875, "learning_rate": 8.539993744236426e-06, "loss": 1.04553738, "memory(GiB)": 142.32, "step": 51340, "train_speed(iter/s)": 0.287847 }, { "acc": 0.72854595, "epoch": 0.5744588932215025, "grad_norm": 5.3125, "learning_rate": 8.538687415808971e-06, "loss": 1.07977724, "memory(GiB)": 142.32, "step": 51360, "train_speed(iter/s)": 0.287883 }, { "acc": 0.72572265, "epoch": 0.574682592167461, "grad_norm": 5.96875, "learning_rate": 8.537380603238497e-06, "loss": 1.11077766, "memory(GiB)": 142.32, "step": 51380, "train_speed(iter/s)": 0.287921 }, { "acc": 0.72542753, "epoch": 0.5749062911134195, "grad_norm": 4.90625, "learning_rate": 8.536073306703794e-06, "loss": 1.07470684, "memory(GiB)": 142.32, "step": 51400, "train_speed(iter/s)": 0.287958 }, { "acc": 0.71910462, "epoch": 0.575129990059378, "grad_norm": 6.1875, "learning_rate": 8.534765526383722e-06, "loss": 1.13443127, "memory(GiB)": 142.32, "step": 51420, "train_speed(iter/s)": 0.287996 }, { "acc": 0.72145042, "epoch": 0.5753536890053366, "grad_norm": 6.0625, "learning_rate": 8.533457262457202e-06, "loss": 1.12154751, "memory(GiB)": 142.32, "step": 51440, "train_speed(iter/s)": 0.288032 }, { "acc": 0.73103261, "epoch": 0.5755773879512951, "grad_norm": 5.09375, "learning_rate": 8.532148515103224e-06, "loss": 1.066436, "memory(GiB)": 142.32, "step": 51460, "train_speed(iter/s)": 0.288073 }, { "acc": 0.72486229, "epoch": 0.5758010868972536, "grad_norm": 5.15625, "learning_rate": 8.530839284500843e-06, "loss": 1.09110909, "memory(GiB)": 142.32, "step": 51480, "train_speed(iter/s)": 0.28811 }, { "acc": 0.73241653, "epoch": 0.5760247858432122, "grad_norm": 5.40625, "learning_rate": 8.52952957082918e-06, "loss": 1.04742718, "memory(GiB)": 142.32, "step": 51500, "train_speed(iter/s)": 0.288141 }, { "acc": 0.736028, "epoch": 0.5762484847891708, "grad_norm": 5.84375, "learning_rate": 8.528219374267425e-06, "loss": 1.07065945, "memory(GiB)": 142.32, "step": 51520, "train_speed(iter/s)": 0.288181 }, { "acc": 0.73345637, "epoch": 0.5764721837351293, "grad_norm": 6.25, "learning_rate": 8.52690869499483e-06, "loss": 1.07093401, "memory(GiB)": 142.32, "step": 51540, "train_speed(iter/s)": 0.288217 }, { "acc": 0.73397923, "epoch": 0.5766958826810878, "grad_norm": 6.28125, "learning_rate": 8.52559753319071e-06, "loss": 1.05818462, "memory(GiB)": 142.32, "step": 51560, "train_speed(iter/s)": 0.288254 }, { "acc": 0.73059931, "epoch": 0.5769195816270464, "grad_norm": 6.46875, "learning_rate": 8.524285889034458e-06, "loss": 1.05684814, "memory(GiB)": 142.32, "step": 51580, "train_speed(iter/s)": 0.288292 }, { "acc": 0.73476515, "epoch": 0.5771432805730049, "grad_norm": 6.4375, "learning_rate": 8.522973762705524e-06, "loss": 1.04529982, "memory(GiB)": 142.32, "step": 51600, "train_speed(iter/s)": 0.288327 }, { "acc": 0.72276511, "epoch": 0.5773669795189634, "grad_norm": 5.5, "learning_rate": 8.521661154383423e-06, "loss": 1.10647831, "memory(GiB)": 142.32, "step": 51620, "train_speed(iter/s)": 0.288364 }, { "acc": 0.7263339, "epoch": 0.577590678464922, "grad_norm": 5.625, "learning_rate": 8.520348064247739e-06, "loss": 1.08701611, "memory(GiB)": 142.32, "step": 51640, "train_speed(iter/s)": 0.288405 }, { "acc": 0.72372308, "epoch": 0.5778143774108805, "grad_norm": 6.09375, "learning_rate": 8.519034492478124e-06, "loss": 1.11091003, "memory(GiB)": 142.32, "step": 51660, "train_speed(iter/s)": 0.288437 }, { "acc": 0.72475662, "epoch": 0.578038076356839, "grad_norm": 5.8125, "learning_rate": 8.517720439254291e-06, "loss": 1.08752947, "memory(GiB)": 142.32, "step": 51680, "train_speed(iter/s)": 0.288478 }, { "acc": 0.72808156, "epoch": 0.5782617753027975, "grad_norm": 6.53125, "learning_rate": 8.516405904756022e-06, "loss": 1.09497185, "memory(GiB)": 142.32, "step": 51700, "train_speed(iter/s)": 0.288516 }, { "acc": 0.73620687, "epoch": 0.5784854742487561, "grad_norm": 5.9375, "learning_rate": 8.515090889163165e-06, "loss": 1.04716759, "memory(GiB)": 142.32, "step": 51720, "train_speed(iter/s)": 0.288551 }, { "acc": 0.72280669, "epoch": 0.5787091731947146, "grad_norm": 5.28125, "learning_rate": 8.513775392655633e-06, "loss": 1.11770391, "memory(GiB)": 142.32, "step": 51740, "train_speed(iter/s)": 0.288587 }, { "acc": 0.72805853, "epoch": 0.5789328721406731, "grad_norm": 6.21875, "learning_rate": 8.512459415413402e-06, "loss": 1.0818718, "memory(GiB)": 142.32, "step": 51760, "train_speed(iter/s)": 0.288622 }, { "acc": 0.72647572, "epoch": 0.5791565710866317, "grad_norm": 5.3125, "learning_rate": 8.511142957616518e-06, "loss": 1.09492702, "memory(GiB)": 142.32, "step": 51780, "train_speed(iter/s)": 0.288659 }, { "acc": 0.72918344, "epoch": 0.5793802700325902, "grad_norm": 5.875, "learning_rate": 8.509826019445094e-06, "loss": 1.06592789, "memory(GiB)": 142.32, "step": 51800, "train_speed(iter/s)": 0.288698 }, { "acc": 0.73603039, "epoch": 0.5796039689785487, "grad_norm": 6.40625, "learning_rate": 8.508508601079301e-06, "loss": 1.05593767, "memory(GiB)": 142.32, "step": 51820, "train_speed(iter/s)": 0.288734 }, { "acc": 0.71422806, "epoch": 0.5798276679245072, "grad_norm": 5.75, "learning_rate": 8.507190702699385e-06, "loss": 1.13618355, "memory(GiB)": 142.32, "step": 51840, "train_speed(iter/s)": 0.288773 }, { "acc": 0.73472013, "epoch": 0.5800513668704658, "grad_norm": 6.53125, "learning_rate": 8.505872324485652e-06, "loss": 1.04098234, "memory(GiB)": 142.32, "step": 51860, "train_speed(iter/s)": 0.288813 }, { "acc": 0.72446876, "epoch": 0.5802750658164243, "grad_norm": 5.59375, "learning_rate": 8.504553466618473e-06, "loss": 1.1019556, "memory(GiB)": 142.32, "step": 51880, "train_speed(iter/s)": 0.288852 }, { "acc": 0.73850632, "epoch": 0.5804987647623828, "grad_norm": 5.28125, "learning_rate": 8.503234129278288e-06, "loss": 1.03244505, "memory(GiB)": 142.32, "step": 51900, "train_speed(iter/s)": 0.288888 }, { "acc": 0.72827916, "epoch": 0.5807224637083414, "grad_norm": 5.90625, "learning_rate": 8.501914312645601e-06, "loss": 1.06688938, "memory(GiB)": 142.32, "step": 51920, "train_speed(iter/s)": 0.288926 }, { "acc": 0.71093082, "epoch": 0.5809461626542999, "grad_norm": 6.03125, "learning_rate": 8.500594016900984e-06, "loss": 1.15574837, "memory(GiB)": 142.32, "step": 51940, "train_speed(iter/s)": 0.288967 }, { "acc": 0.72739763, "epoch": 0.5811698616002584, "grad_norm": 7.15625, "learning_rate": 8.49927324222507e-06, "loss": 1.07072544, "memory(GiB)": 142.32, "step": 51960, "train_speed(iter/s)": 0.289002 }, { "acc": 0.73229175, "epoch": 0.5813935605462169, "grad_norm": 6.0625, "learning_rate": 8.49795198879856e-06, "loss": 1.06376209, "memory(GiB)": 142.32, "step": 51980, "train_speed(iter/s)": 0.289037 }, { "acc": 0.73642292, "epoch": 0.5816172594921755, "grad_norm": 4.65625, "learning_rate": 8.49663025680222e-06, "loss": 1.0354866, "memory(GiB)": 142.32, "step": 52000, "train_speed(iter/s)": 0.289072 }, { "epoch": 0.5816172594921755, "eval_acc": 0.6923674777741101, "eval_loss": 1.089379072189331, "eval_runtime": 2339.1497, "eval_samples_per_second": 32.184, "eval_steps_per_second": 16.092, "step": 52000 }, { "acc": 0.73168507, "epoch": 0.581840958438134, "grad_norm": 5.5, "learning_rate": 8.495308046416884e-06, "loss": 1.0776247, "memory(GiB)": 142.32, "step": 52020, "train_speed(iter/s)": 0.285316 }, { "acc": 0.72845955, "epoch": 0.5820646573840925, "grad_norm": 5.5, "learning_rate": 8.493985357823447e-06, "loss": 1.07222843, "memory(GiB)": 142.32, "step": 52040, "train_speed(iter/s)": 0.285349 }, { "acc": 0.72509584, "epoch": 0.582288356330051, "grad_norm": 6.125, "learning_rate": 8.492662191202872e-06, "loss": 1.09730816, "memory(GiB)": 142.32, "step": 52060, "train_speed(iter/s)": 0.285391 }, { "acc": 0.72601328, "epoch": 0.5825120552760096, "grad_norm": 5.46875, "learning_rate": 8.491338546736188e-06, "loss": 1.07897739, "memory(GiB)": 142.32, "step": 52080, "train_speed(iter/s)": 0.285429 }, { "acc": 0.73461981, "epoch": 0.5827357542219681, "grad_norm": 5.03125, "learning_rate": 8.490014424604487e-06, "loss": 1.0528266, "memory(GiB)": 142.32, "step": 52100, "train_speed(iter/s)": 0.285465 }, { "acc": 0.73106413, "epoch": 0.5829594531679266, "grad_norm": 6.28125, "learning_rate": 8.488689824988929e-06, "loss": 1.07808151, "memory(GiB)": 142.32, "step": 52120, "train_speed(iter/s)": 0.2855 }, { "acc": 0.72935424, "epoch": 0.5831831521138852, "grad_norm": 6.8125, "learning_rate": 8.48736474807074e-06, "loss": 1.08401337, "memory(GiB)": 142.32, "step": 52140, "train_speed(iter/s)": 0.285532 }, { "acc": 0.73091621, "epoch": 0.5834068510598437, "grad_norm": 5.53125, "learning_rate": 8.486039194031206e-06, "loss": 1.08439083, "memory(GiB)": 142.32, "step": 52160, "train_speed(iter/s)": 0.285572 }, { "acc": 0.72075267, "epoch": 0.5836305500058022, "grad_norm": 6.1875, "learning_rate": 8.484713163051685e-06, "loss": 1.12856064, "memory(GiB)": 142.32, "step": 52180, "train_speed(iter/s)": 0.285612 }, { "acc": 0.7242979, "epoch": 0.5838542489517607, "grad_norm": 6.15625, "learning_rate": 8.483386655313593e-06, "loss": 1.10543079, "memory(GiB)": 142.32, "step": 52200, "train_speed(iter/s)": 0.285649 }, { "acc": 0.73189392, "epoch": 0.5840779478977193, "grad_norm": 6.46875, "learning_rate": 8.482059670998419e-06, "loss": 1.06086569, "memory(GiB)": 142.32, "step": 52220, "train_speed(iter/s)": 0.285688 }, { "acc": 0.72755013, "epoch": 0.5843016468436778, "grad_norm": 6.78125, "learning_rate": 8.480732210287712e-06, "loss": 1.09524431, "memory(GiB)": 142.32, "step": 52240, "train_speed(iter/s)": 0.285726 }, { "acc": 0.72767534, "epoch": 0.5845253457896363, "grad_norm": 7.3125, "learning_rate": 8.479404273363087e-06, "loss": 1.08144083, "memory(GiB)": 142.32, "step": 52260, "train_speed(iter/s)": 0.285765 }, { "acc": 0.71946588, "epoch": 0.5847490447355949, "grad_norm": 5.96875, "learning_rate": 8.478075860406225e-06, "loss": 1.11061516, "memory(GiB)": 142.32, "step": 52280, "train_speed(iter/s)": 0.2858 }, { "acc": 0.73231764, "epoch": 0.5849727436815534, "grad_norm": 6.78125, "learning_rate": 8.476746971598873e-06, "loss": 1.06613293, "memory(GiB)": 142.32, "step": 52300, "train_speed(iter/s)": 0.28584 }, { "acc": 0.73490906, "epoch": 0.5851964426275119, "grad_norm": 5.40625, "learning_rate": 8.47541760712284e-06, "loss": 1.06201973, "memory(GiB)": 142.32, "step": 52320, "train_speed(iter/s)": 0.285877 }, { "acc": 0.73054552, "epoch": 0.5854201415734704, "grad_norm": 6.40625, "learning_rate": 8.474087767160004e-06, "loss": 1.08811855, "memory(GiB)": 142.32, "step": 52340, "train_speed(iter/s)": 0.285911 }, { "acc": 0.71331515, "epoch": 0.585643840519429, "grad_norm": 5.65625, "learning_rate": 8.472757451892305e-06, "loss": 1.1459795, "memory(GiB)": 142.32, "step": 52360, "train_speed(iter/s)": 0.28595 }, { "acc": 0.72211275, "epoch": 0.5858675394653875, "grad_norm": 5.53125, "learning_rate": 8.47142666150175e-06, "loss": 1.12084103, "memory(GiB)": 142.32, "step": 52380, "train_speed(iter/s)": 0.285987 }, { "acc": 0.73027201, "epoch": 0.586091238411346, "grad_norm": 5.625, "learning_rate": 8.470095396170408e-06, "loss": 1.07388, "memory(GiB)": 142.32, "step": 52400, "train_speed(iter/s)": 0.286023 }, { "acc": 0.72044158, "epoch": 0.5863149373573046, "grad_norm": 4.78125, "learning_rate": 8.46876365608042e-06, "loss": 1.10862923, "memory(GiB)": 142.32, "step": 52420, "train_speed(iter/s)": 0.286062 }, { "acc": 0.72670865, "epoch": 0.5865386363032631, "grad_norm": 6.59375, "learning_rate": 8.467431441413981e-06, "loss": 1.09284878, "memory(GiB)": 142.32, "step": 52440, "train_speed(iter/s)": 0.286098 }, { "acc": 0.73495102, "epoch": 0.5867623352492216, "grad_norm": 6.3125, "learning_rate": 8.466098752353359e-06, "loss": 1.05564346, "memory(GiB)": 142.32, "step": 52460, "train_speed(iter/s)": 0.286136 }, { "acc": 0.73193207, "epoch": 0.5869860341951801, "grad_norm": 5.625, "learning_rate": 8.464765589080888e-06, "loss": 1.04824104, "memory(GiB)": 142.32, "step": 52480, "train_speed(iter/s)": 0.286171 }, { "acc": 0.72224445, "epoch": 0.5872097331411387, "grad_norm": 6.40625, "learning_rate": 8.46343195177896e-06, "loss": 1.10920353, "memory(GiB)": 142.32, "step": 52500, "train_speed(iter/s)": 0.286207 }, { "acc": 0.72303143, "epoch": 0.5874334320870972, "grad_norm": 6.40625, "learning_rate": 8.462097840630037e-06, "loss": 1.10921364, "memory(GiB)": 142.32, "step": 52520, "train_speed(iter/s)": 0.286246 }, { "acc": 0.73473449, "epoch": 0.5876571310330557, "grad_norm": 6.8125, "learning_rate": 8.460763255816645e-06, "loss": 1.04415398, "memory(GiB)": 142.32, "step": 52540, "train_speed(iter/s)": 0.286282 }, { "acc": 0.72918377, "epoch": 0.5878808299790143, "grad_norm": 5.6875, "learning_rate": 8.459428197521375e-06, "loss": 1.07396202, "memory(GiB)": 142.32, "step": 52560, "train_speed(iter/s)": 0.28632 }, { "acc": 0.72460055, "epoch": 0.5881045289249728, "grad_norm": 5.34375, "learning_rate": 8.45809266592688e-06, "loss": 1.10049038, "memory(GiB)": 142.32, "step": 52580, "train_speed(iter/s)": 0.286354 }, { "acc": 0.73217173, "epoch": 0.5883282278709313, "grad_norm": 4.75, "learning_rate": 8.456756661215882e-06, "loss": 1.06227331, "memory(GiB)": 142.32, "step": 52600, "train_speed(iter/s)": 0.28639 }, { "acc": 0.73248925, "epoch": 0.5885519268168898, "grad_norm": 4.78125, "learning_rate": 8.45542018357116e-06, "loss": 1.054105, "memory(GiB)": 142.32, "step": 52620, "train_speed(iter/s)": 0.286429 }, { "acc": 0.73457661, "epoch": 0.5887756257628484, "grad_norm": 5.8125, "learning_rate": 8.454083233175573e-06, "loss": 1.05318108, "memory(GiB)": 142.32, "step": 52640, "train_speed(iter/s)": 0.286465 }, { "acc": 0.73339581, "epoch": 0.5889993247088069, "grad_norm": 6.1875, "learning_rate": 8.452745810212028e-06, "loss": 1.06214848, "memory(GiB)": 142.32, "step": 52660, "train_speed(iter/s)": 0.286497 }, { "acc": 0.7185462, "epoch": 0.5892230236547654, "grad_norm": 6.59375, "learning_rate": 8.451407914863502e-06, "loss": 1.14630604, "memory(GiB)": 142.32, "step": 52680, "train_speed(iter/s)": 0.286527 }, { "acc": 0.73140388, "epoch": 0.589446722600724, "grad_norm": 5.5625, "learning_rate": 8.450069547313045e-06, "loss": 1.07040815, "memory(GiB)": 142.32, "step": 52700, "train_speed(iter/s)": 0.286561 }, { "acc": 0.71941271, "epoch": 0.5896704215466825, "grad_norm": 6.53125, "learning_rate": 8.448730707743759e-06, "loss": 1.12078686, "memory(GiB)": 142.32, "step": 52720, "train_speed(iter/s)": 0.286598 }, { "acc": 0.72095575, "epoch": 0.589894120492641, "grad_norm": 6.71875, "learning_rate": 8.44739139633882e-06, "loss": 1.12259874, "memory(GiB)": 142.32, "step": 52740, "train_speed(iter/s)": 0.286635 }, { "acc": 0.73573027, "epoch": 0.5901178194385995, "grad_norm": 5.40625, "learning_rate": 8.446051613281462e-06, "loss": 1.04133282, "memory(GiB)": 142.32, "step": 52760, "train_speed(iter/s)": 0.286673 }, { "acc": 0.73888559, "epoch": 0.5903415183845581, "grad_norm": 6.84375, "learning_rate": 8.444711358754988e-06, "loss": 1.04133644, "memory(GiB)": 142.32, "step": 52780, "train_speed(iter/s)": 0.286714 }, { "acc": 0.7352561, "epoch": 0.5905652173305166, "grad_norm": 5.09375, "learning_rate": 8.443370632942765e-06, "loss": 1.0487009, "memory(GiB)": 142.32, "step": 52800, "train_speed(iter/s)": 0.286754 }, { "acc": 0.7354701, "epoch": 0.5907889162764751, "grad_norm": 5.90625, "learning_rate": 8.442029436028222e-06, "loss": 1.04139395, "memory(GiB)": 142.32, "step": 52820, "train_speed(iter/s)": 0.286795 }, { "acc": 0.72029786, "epoch": 0.5910126152224336, "grad_norm": 5.125, "learning_rate": 8.440687768194852e-06, "loss": 1.10979252, "memory(GiB)": 142.32, "step": 52840, "train_speed(iter/s)": 0.286831 }, { "acc": 0.72389631, "epoch": 0.5912363141683922, "grad_norm": 5.40625, "learning_rate": 8.439345629626219e-06, "loss": 1.11179209, "memory(GiB)": 142.32, "step": 52860, "train_speed(iter/s)": 0.286866 }, { "acc": 0.73932562, "epoch": 0.5914600131143507, "grad_norm": 4.8125, "learning_rate": 8.438003020505945e-06, "loss": 1.02730465, "memory(GiB)": 142.32, "step": 52880, "train_speed(iter/s)": 0.286895 }, { "acc": 0.73770828, "epoch": 0.5916837120603092, "grad_norm": 6.3125, "learning_rate": 8.436659941017715e-06, "loss": 1.0395834, "memory(GiB)": 142.32, "step": 52900, "train_speed(iter/s)": 0.286932 }, { "acc": 0.74005313, "epoch": 0.5919074110062678, "grad_norm": 5.375, "learning_rate": 8.435316391345286e-06, "loss": 1.02898283, "memory(GiB)": 142.32, "step": 52920, "train_speed(iter/s)": 0.28697 }, { "acc": 0.72205544, "epoch": 0.5921311099522263, "grad_norm": 6.78125, "learning_rate": 8.433972371672471e-06, "loss": 1.12105169, "memory(GiB)": 142.32, "step": 52940, "train_speed(iter/s)": 0.287006 }, { "acc": 0.72977009, "epoch": 0.5923548088981848, "grad_norm": 7.0, "learning_rate": 8.432627882183153e-06, "loss": 1.07771797, "memory(GiB)": 142.32, "step": 52960, "train_speed(iter/s)": 0.287041 }, { "acc": 0.7401587, "epoch": 0.5925785078441433, "grad_norm": 5.15625, "learning_rate": 8.431282923061279e-06, "loss": 1.01860981, "memory(GiB)": 142.32, "step": 52980, "train_speed(iter/s)": 0.287081 }, { "acc": 0.72596316, "epoch": 0.5928022067901019, "grad_norm": 5.875, "learning_rate": 8.429937494490853e-06, "loss": 1.07516632, "memory(GiB)": 142.32, "step": 53000, "train_speed(iter/s)": 0.28712 }, { "acc": 0.74746199, "epoch": 0.5930259057360604, "grad_norm": 5.75, "learning_rate": 8.428591596655957e-06, "loss": 0.99623966, "memory(GiB)": 142.32, "step": 53020, "train_speed(iter/s)": 0.287154 }, { "acc": 0.73540134, "epoch": 0.5932496046820189, "grad_norm": 5.75, "learning_rate": 8.427245229740722e-06, "loss": 1.04546804, "memory(GiB)": 142.32, "step": 53040, "train_speed(iter/s)": 0.287188 }, { "acc": 0.72859182, "epoch": 0.5934733036279775, "grad_norm": 6.375, "learning_rate": 8.425898393929353e-06, "loss": 1.07423458, "memory(GiB)": 142.32, "step": 53060, "train_speed(iter/s)": 0.287225 }, { "acc": 0.73163481, "epoch": 0.593697002573936, "grad_norm": 5.9375, "learning_rate": 8.424551089406118e-06, "loss": 1.06299706, "memory(GiB)": 142.32, "step": 53080, "train_speed(iter/s)": 0.287261 }, { "acc": 0.72993512, "epoch": 0.5939207015198945, "grad_norm": 5.25, "learning_rate": 8.423203316355345e-06, "loss": 1.08411865, "memory(GiB)": 142.32, "step": 53100, "train_speed(iter/s)": 0.287301 }, { "acc": 0.72208042, "epoch": 0.594144400465853, "grad_norm": 6.1875, "learning_rate": 8.42185507496143e-06, "loss": 1.11657257, "memory(GiB)": 142.32, "step": 53120, "train_speed(iter/s)": 0.287341 }, { "acc": 0.71306772, "epoch": 0.5943680994118116, "grad_norm": 4.9375, "learning_rate": 8.420506365408829e-06, "loss": 1.14121971, "memory(GiB)": 142.32, "step": 53140, "train_speed(iter/s)": 0.287376 }, { "acc": 0.72288828, "epoch": 0.5945917983577701, "grad_norm": 6.3125, "learning_rate": 8.419157187882068e-06, "loss": 1.09643478, "memory(GiB)": 142.32, "step": 53160, "train_speed(iter/s)": 0.287413 }, { "acc": 0.73108292, "epoch": 0.5948154973037286, "grad_norm": 6.34375, "learning_rate": 8.417807542565735e-06, "loss": 1.09746685, "memory(GiB)": 142.32, "step": 53180, "train_speed(iter/s)": 0.287445 }, { "acc": 0.74115601, "epoch": 0.5950391962496872, "grad_norm": 4.1875, "learning_rate": 8.416457429644476e-06, "loss": 1.03865623, "memory(GiB)": 142.32, "step": 53200, "train_speed(iter/s)": 0.28748 }, { "acc": 0.73708973, "epoch": 0.5952628951956457, "grad_norm": 5.34375, "learning_rate": 8.415106849303007e-06, "loss": 1.0427906, "memory(GiB)": 142.32, "step": 53220, "train_speed(iter/s)": 0.287519 }, { "acc": 0.72879214, "epoch": 0.5954865941416042, "grad_norm": 4.6875, "learning_rate": 8.413755801726111e-06, "loss": 1.06971245, "memory(GiB)": 142.32, "step": 53240, "train_speed(iter/s)": 0.287553 }, { "acc": 0.72328033, "epoch": 0.5957102930875627, "grad_norm": 5.96875, "learning_rate": 8.412404287098626e-06, "loss": 1.10668945, "memory(GiB)": 142.32, "step": 53260, "train_speed(iter/s)": 0.287584 }, { "acc": 0.73020077, "epoch": 0.5959339920335213, "grad_norm": 6.46875, "learning_rate": 8.41105230560546e-06, "loss": 1.07061834, "memory(GiB)": 142.32, "step": 53280, "train_speed(iter/s)": 0.287624 }, { "acc": 0.73204274, "epoch": 0.5961576909794798, "grad_norm": 5.8125, "learning_rate": 8.409699857431584e-06, "loss": 1.0555829, "memory(GiB)": 142.32, "step": 53300, "train_speed(iter/s)": 0.28766 }, { "acc": 0.73124685, "epoch": 0.5963813899254383, "grad_norm": 5.9375, "learning_rate": 8.40834694276203e-06, "loss": 1.07763691, "memory(GiB)": 142.32, "step": 53320, "train_speed(iter/s)": 0.287695 }, { "acc": 0.73590894, "epoch": 0.5966050888713968, "grad_norm": 5.78125, "learning_rate": 8.4069935617819e-06, "loss": 1.04824791, "memory(GiB)": 142.32, "step": 53340, "train_speed(iter/s)": 0.287734 }, { "acc": 0.73524218, "epoch": 0.5968287878173554, "grad_norm": 5.75, "learning_rate": 8.405639714676353e-06, "loss": 1.05404787, "memory(GiB)": 142.32, "step": 53360, "train_speed(iter/s)": 0.287775 }, { "acc": 0.71606159, "epoch": 0.5970524867633139, "grad_norm": 6.15625, "learning_rate": 8.404285401630614e-06, "loss": 1.15005913, "memory(GiB)": 142.32, "step": 53380, "train_speed(iter/s)": 0.287812 }, { "acc": 0.73051095, "epoch": 0.5972761857092724, "grad_norm": 5.84375, "learning_rate": 8.402930622829975e-06, "loss": 1.06615849, "memory(GiB)": 142.32, "step": 53400, "train_speed(iter/s)": 0.287848 }, { "acc": 0.73048024, "epoch": 0.597499884655231, "grad_norm": 6.4375, "learning_rate": 8.401575378459785e-06, "loss": 1.06867065, "memory(GiB)": 142.32, "step": 53420, "train_speed(iter/s)": 0.287885 }, { "acc": 0.72720194, "epoch": 0.5977235836011895, "grad_norm": 6.28125, "learning_rate": 8.400219668705468e-06, "loss": 1.08233423, "memory(GiB)": 142.32, "step": 53440, "train_speed(iter/s)": 0.287925 }, { "acc": 0.73626957, "epoch": 0.597947282547148, "grad_norm": 5.5625, "learning_rate": 8.398863493752495e-06, "loss": 1.04828644, "memory(GiB)": 142.32, "step": 53460, "train_speed(iter/s)": 0.287961 }, { "acc": 0.72068214, "epoch": 0.5981709814931065, "grad_norm": 5.78125, "learning_rate": 8.397506853786419e-06, "loss": 1.11461926, "memory(GiB)": 142.32, "step": 53480, "train_speed(iter/s)": 0.287995 }, { "acc": 0.73253922, "epoch": 0.5983946804390651, "grad_norm": 6.28125, "learning_rate": 8.396149748992844e-06, "loss": 1.07484493, "memory(GiB)": 142.32, "step": 53500, "train_speed(iter/s)": 0.288029 }, { "acc": 0.74480867, "epoch": 0.5986183793850236, "grad_norm": 6.03125, "learning_rate": 8.394792179557438e-06, "loss": 0.9990839, "memory(GiB)": 142.32, "step": 53520, "train_speed(iter/s)": 0.288065 }, { "acc": 0.72303438, "epoch": 0.5988420783309821, "grad_norm": 5.40625, "learning_rate": 8.393434145665941e-06, "loss": 1.10785484, "memory(GiB)": 142.32, "step": 53540, "train_speed(iter/s)": 0.288099 }, { "acc": 0.72398148, "epoch": 0.5990657772769407, "grad_norm": 7.125, "learning_rate": 8.39207564750415e-06, "loss": 1.0911396, "memory(GiB)": 142.32, "step": 53560, "train_speed(iter/s)": 0.288136 }, { "acc": 0.73016963, "epoch": 0.5992894762228992, "grad_norm": 7.125, "learning_rate": 8.390716685257924e-06, "loss": 1.06636038, "memory(GiB)": 142.32, "step": 53580, "train_speed(iter/s)": 0.288177 }, { "acc": 0.73593159, "epoch": 0.5995131751688577, "grad_norm": 5.84375, "learning_rate": 8.389357259113195e-06, "loss": 1.04827881, "memory(GiB)": 142.32, "step": 53600, "train_speed(iter/s)": 0.288216 }, { "acc": 0.73192902, "epoch": 0.5997368741148162, "grad_norm": 5.65625, "learning_rate": 8.387997369255945e-06, "loss": 1.06637049, "memory(GiB)": 142.32, "step": 53620, "train_speed(iter/s)": 0.288254 }, { "acc": 0.72962828, "epoch": 0.5999605730607748, "grad_norm": 6.46875, "learning_rate": 8.38663701587223e-06, "loss": 1.06912947, "memory(GiB)": 142.32, "step": 53640, "train_speed(iter/s)": 0.288287 }, { "acc": 0.72894993, "epoch": 0.6001842720067333, "grad_norm": 6.0, "learning_rate": 8.385276199148164e-06, "loss": 1.08423824, "memory(GiB)": 142.32, "step": 53660, "train_speed(iter/s)": 0.288323 }, { "acc": 0.72831945, "epoch": 0.6004079709526918, "grad_norm": 6.0625, "learning_rate": 8.383914919269929e-06, "loss": 1.0650136, "memory(GiB)": 142.32, "step": 53680, "train_speed(iter/s)": 0.288362 }, { "acc": 0.72675753, "epoch": 0.6006316698986504, "grad_norm": 6.71875, "learning_rate": 8.382553176423764e-06, "loss": 1.09378281, "memory(GiB)": 142.32, "step": 53700, "train_speed(iter/s)": 0.2884 }, { "acc": 0.72891769, "epoch": 0.6008553688446089, "grad_norm": 5.1875, "learning_rate": 8.381190970795978e-06, "loss": 1.08600521, "memory(GiB)": 142.32, "step": 53720, "train_speed(iter/s)": 0.288433 }, { "acc": 0.72631416, "epoch": 0.6010790677905674, "grad_norm": 5.96875, "learning_rate": 8.37982830257294e-06, "loss": 1.09181185, "memory(GiB)": 142.32, "step": 53740, "train_speed(iter/s)": 0.288469 }, { "acc": 0.73856182, "epoch": 0.6013027667365259, "grad_norm": 6.5625, "learning_rate": 8.378465171941078e-06, "loss": 1.02763157, "memory(GiB)": 142.32, "step": 53760, "train_speed(iter/s)": 0.288504 }, { "acc": 0.73393402, "epoch": 0.6015264656824845, "grad_norm": 7.0, "learning_rate": 8.377101579086893e-06, "loss": 1.05344896, "memory(GiB)": 142.32, "step": 53780, "train_speed(iter/s)": 0.288542 }, { "acc": 0.72841449, "epoch": 0.601750164628443, "grad_norm": 5.5, "learning_rate": 8.375737524196942e-06, "loss": 1.09230585, "memory(GiB)": 142.32, "step": 53800, "train_speed(iter/s)": 0.288578 }, { "acc": 0.73428116, "epoch": 0.6019738635744015, "grad_norm": 6.125, "learning_rate": 8.374373007457847e-06, "loss": 1.04487209, "memory(GiB)": 142.32, "step": 53820, "train_speed(iter/s)": 0.288614 }, { "acc": 0.73263407, "epoch": 0.60219756252036, "grad_norm": 6.125, "learning_rate": 8.373008029056292e-06, "loss": 1.06056938, "memory(GiB)": 142.32, "step": 53840, "train_speed(iter/s)": 0.288648 }, { "acc": 0.72461543, "epoch": 0.6024212614663186, "grad_norm": 5.34375, "learning_rate": 8.37164258917903e-06, "loss": 1.09969807, "memory(GiB)": 142.32, "step": 53860, "train_speed(iter/s)": 0.288687 }, { "acc": 0.73913145, "epoch": 0.6026449604122771, "grad_norm": 5.5, "learning_rate": 8.37027668801287e-06, "loss": 1.03094292, "memory(GiB)": 142.32, "step": 53880, "train_speed(iter/s)": 0.288721 }, { "acc": 0.72873917, "epoch": 0.6028686593582356, "grad_norm": 6.6875, "learning_rate": 8.368910325744686e-06, "loss": 1.07290649, "memory(GiB)": 142.32, "step": 53900, "train_speed(iter/s)": 0.288755 }, { "acc": 0.73326178, "epoch": 0.6030923583041942, "grad_norm": 6.46875, "learning_rate": 8.367543502561416e-06, "loss": 1.05375938, "memory(GiB)": 142.32, "step": 53920, "train_speed(iter/s)": 0.28879 }, { "acc": 0.72709956, "epoch": 0.6033160572501527, "grad_norm": 6.09375, "learning_rate": 8.36617621865006e-06, "loss": 1.09407005, "memory(GiB)": 142.32, "step": 53940, "train_speed(iter/s)": 0.288825 }, { "acc": 0.71564217, "epoch": 0.6035397561961112, "grad_norm": 5.3125, "learning_rate": 8.364808474197687e-06, "loss": 1.15237751, "memory(GiB)": 142.32, "step": 53960, "train_speed(iter/s)": 0.288858 }, { "acc": 0.73472967, "epoch": 0.6037634551420697, "grad_norm": 6.46875, "learning_rate": 8.363440269391419e-06, "loss": 1.05409393, "memory(GiB)": 142.32, "step": 53980, "train_speed(iter/s)": 0.288894 }, { "acc": 0.73144007, "epoch": 0.6039871540880283, "grad_norm": 6.40625, "learning_rate": 8.362071604418447e-06, "loss": 1.07285824, "memory(GiB)": 142.32, "step": 54000, "train_speed(iter/s)": 0.288927 }, { "epoch": 0.6039871540880283, "eval_acc": 0.6926513279138563, "eval_loss": 1.087906002998352, "eval_runtime": 2342.0248, "eval_samples_per_second": 32.144, "eval_steps_per_second": 16.072, "step": 54000 }, { "acc": 0.73749104, "epoch": 0.6042108530339868, "grad_norm": 6.96875, "learning_rate": 8.360702479466025e-06, "loss": 1.03431568, "memory(GiB)": 142.32, "step": 54020, "train_speed(iter/s)": 0.285314 }, { "acc": 0.73281207, "epoch": 0.6044345519799454, "grad_norm": 6.5625, "learning_rate": 8.359332894721469e-06, "loss": 1.06332607, "memory(GiB)": 142.32, "step": 54040, "train_speed(iter/s)": 0.285348 }, { "acc": 0.74556594, "epoch": 0.604658250925904, "grad_norm": 6.25, "learning_rate": 8.357962850372154e-06, "loss": 1.00615892, "memory(GiB)": 142.32, "step": 54060, "train_speed(iter/s)": 0.285377 }, { "acc": 0.73845539, "epoch": 0.6048819498718625, "grad_norm": 6.09375, "learning_rate": 8.356592346605528e-06, "loss": 1.04800673, "memory(GiB)": 142.32, "step": 54080, "train_speed(iter/s)": 0.285418 }, { "acc": 0.7256639, "epoch": 0.605105648817821, "grad_norm": 5.53125, "learning_rate": 8.35522138360909e-06, "loss": 1.08113613, "memory(GiB)": 142.32, "step": 54100, "train_speed(iter/s)": 0.285456 }, { "acc": 0.72438917, "epoch": 0.6053293477637796, "grad_norm": 5.15625, "learning_rate": 8.353849961570413e-06, "loss": 1.11518345, "memory(GiB)": 142.32, "step": 54120, "train_speed(iter/s)": 0.285491 }, { "acc": 0.72378507, "epoch": 0.6055530467097381, "grad_norm": 6.46875, "learning_rate": 8.352478080677122e-06, "loss": 1.09716473, "memory(GiB)": 142.32, "step": 54140, "train_speed(iter/s)": 0.285528 }, { "acc": 0.73081598, "epoch": 0.6057767456556966, "grad_norm": 5.59375, "learning_rate": 8.351105741116909e-06, "loss": 1.07807407, "memory(GiB)": 142.32, "step": 54160, "train_speed(iter/s)": 0.28556 }, { "acc": 0.72982321, "epoch": 0.6060004446016551, "grad_norm": 6.53125, "learning_rate": 8.349732943077535e-06, "loss": 1.07337036, "memory(GiB)": 142.32, "step": 54180, "train_speed(iter/s)": 0.285593 }, { "acc": 0.72595644, "epoch": 0.6062241435476137, "grad_norm": 6.0625, "learning_rate": 8.348359686746815e-06, "loss": 1.09512682, "memory(GiB)": 142.32, "step": 54200, "train_speed(iter/s)": 0.285627 }, { "acc": 0.73072972, "epoch": 0.6064478424935722, "grad_norm": 6.4375, "learning_rate": 8.34698597231263e-06, "loss": 1.06863613, "memory(GiB)": 142.32, "step": 54220, "train_speed(iter/s)": 0.285662 }, { "acc": 0.7292316, "epoch": 0.6066715414395307, "grad_norm": 6.21875, "learning_rate": 8.345611799962927e-06, "loss": 1.07012663, "memory(GiB)": 142.32, "step": 54240, "train_speed(iter/s)": 0.285695 }, { "acc": 0.72842131, "epoch": 0.6068952403854893, "grad_norm": 5.21875, "learning_rate": 8.344237169885707e-06, "loss": 1.0883688, "memory(GiB)": 142.32, "step": 54260, "train_speed(iter/s)": 0.285734 }, { "acc": 0.74275131, "epoch": 0.6071189393314478, "grad_norm": 5.71875, "learning_rate": 8.342862082269043e-06, "loss": 1.02213917, "memory(GiB)": 142.32, "step": 54280, "train_speed(iter/s)": 0.285772 }, { "acc": 0.73579779, "epoch": 0.6073426382774063, "grad_norm": 6.03125, "learning_rate": 8.341486537301067e-06, "loss": 1.05935678, "memory(GiB)": 142.32, "step": 54300, "train_speed(iter/s)": 0.28581 }, { "acc": 0.72742815, "epoch": 0.6075663372233648, "grad_norm": 6.1875, "learning_rate": 8.34011053516997e-06, "loss": 1.0841938, "memory(GiB)": 142.32, "step": 54320, "train_speed(iter/s)": 0.285844 }, { "acc": 0.73692894, "epoch": 0.6077900361693234, "grad_norm": 6.6875, "learning_rate": 8.338734076064013e-06, "loss": 1.03851833, "memory(GiB)": 142.32, "step": 54340, "train_speed(iter/s)": 0.285879 }, { "acc": 0.7425518, "epoch": 0.6080137351152819, "grad_norm": 6.34375, "learning_rate": 8.33735716017151e-06, "loss": 1.01596375, "memory(GiB)": 142.32, "step": 54360, "train_speed(iter/s)": 0.285916 }, { "acc": 0.7354516, "epoch": 0.6082374340612404, "grad_norm": 6.5625, "learning_rate": 8.335979787680848e-06, "loss": 1.05915604, "memory(GiB)": 142.32, "step": 54380, "train_speed(iter/s)": 0.285953 }, { "acc": 0.73546734, "epoch": 0.608461133007199, "grad_norm": 7.125, "learning_rate": 8.334601958780467e-06, "loss": 1.06721849, "memory(GiB)": 142.32, "step": 54400, "train_speed(iter/s)": 0.28599 }, { "acc": 0.74380312, "epoch": 0.6086848319531575, "grad_norm": 5.09375, "learning_rate": 8.333223673658877e-06, "loss": 1.02653542, "memory(GiB)": 142.32, "step": 54420, "train_speed(iter/s)": 0.286025 }, { "acc": 0.72413321, "epoch": 0.608908530899116, "grad_norm": 5.4375, "learning_rate": 8.331844932504644e-06, "loss": 1.10153208, "memory(GiB)": 142.32, "step": 54440, "train_speed(iter/s)": 0.286057 }, { "acc": 0.72127314, "epoch": 0.6091322298450745, "grad_norm": 6.09375, "learning_rate": 8.330465735506403e-06, "loss": 1.1003397, "memory(GiB)": 142.32, "step": 54460, "train_speed(iter/s)": 0.286093 }, { "acc": 0.72584538, "epoch": 0.6093559287910331, "grad_norm": 5.59375, "learning_rate": 8.329086082852844e-06, "loss": 1.10110893, "memory(GiB)": 142.32, "step": 54480, "train_speed(iter/s)": 0.286129 }, { "acc": 0.72631369, "epoch": 0.6095796277369916, "grad_norm": 4.875, "learning_rate": 8.327705974732727e-06, "loss": 1.10239792, "memory(GiB)": 142.32, "step": 54500, "train_speed(iter/s)": 0.286161 }, { "acc": 0.73147678, "epoch": 0.6098033266829501, "grad_norm": 6.0625, "learning_rate": 8.326325411334868e-06, "loss": 1.06691074, "memory(GiB)": 142.32, "step": 54520, "train_speed(iter/s)": 0.286197 }, { "acc": 0.73361282, "epoch": 0.6100270256289086, "grad_norm": 8.0, "learning_rate": 8.324944392848149e-06, "loss": 1.0443964, "memory(GiB)": 142.32, "step": 54540, "train_speed(iter/s)": 0.286235 }, { "acc": 0.73224168, "epoch": 0.6102507245748672, "grad_norm": 5.96875, "learning_rate": 8.323562919461514e-06, "loss": 1.06370831, "memory(GiB)": 142.32, "step": 54560, "train_speed(iter/s)": 0.286276 }, { "acc": 0.75167332, "epoch": 0.6104744235208257, "grad_norm": 5.9375, "learning_rate": 8.322180991363965e-06, "loss": 0.9685997, "memory(GiB)": 142.32, "step": 54580, "train_speed(iter/s)": 0.286311 }, { "acc": 0.7400691, "epoch": 0.6106981224667842, "grad_norm": 6.15625, "learning_rate": 8.32079860874457e-06, "loss": 1.03840685, "memory(GiB)": 142.32, "step": 54600, "train_speed(iter/s)": 0.286346 }, { "acc": 0.72659636, "epoch": 0.6109218214127428, "grad_norm": 6.34375, "learning_rate": 8.319415771792464e-06, "loss": 1.07698107, "memory(GiB)": 142.32, "step": 54620, "train_speed(iter/s)": 0.28638 }, { "acc": 0.73045311, "epoch": 0.6111455203587013, "grad_norm": 6.21875, "learning_rate": 8.318032480696834e-06, "loss": 1.0815649, "memory(GiB)": 142.32, "step": 54640, "train_speed(iter/s)": 0.286416 }, { "acc": 0.72680221, "epoch": 0.6113692193046598, "grad_norm": 5.28125, "learning_rate": 8.316648735646933e-06, "loss": 1.08759403, "memory(GiB)": 142.32, "step": 54660, "train_speed(iter/s)": 0.286448 }, { "acc": 0.74056368, "epoch": 0.6115929182506183, "grad_norm": 5.75, "learning_rate": 8.315264536832082e-06, "loss": 1.02851086, "memory(GiB)": 142.32, "step": 54680, "train_speed(iter/s)": 0.286489 }, { "acc": 0.72786531, "epoch": 0.6118166171965769, "grad_norm": 5.96875, "learning_rate": 8.313879884441655e-06, "loss": 1.08382692, "memory(GiB)": 142.32, "step": 54700, "train_speed(iter/s)": 0.286526 }, { "acc": 0.72013817, "epoch": 0.6120403161425354, "grad_norm": 5.40625, "learning_rate": 8.312494778665092e-06, "loss": 1.12553196, "memory(GiB)": 142.32, "step": 54720, "train_speed(iter/s)": 0.286562 }, { "acc": 0.73435221, "epoch": 0.6122640150884939, "grad_norm": 6.0, "learning_rate": 8.311109219691898e-06, "loss": 1.04945021, "memory(GiB)": 142.32, "step": 54740, "train_speed(iter/s)": 0.286595 }, { "acc": 0.72395196, "epoch": 0.6124877140344525, "grad_norm": 6.15625, "learning_rate": 8.309723207711638e-06, "loss": 1.10844021, "memory(GiB)": 142.32, "step": 54760, "train_speed(iter/s)": 0.286632 }, { "acc": 0.725669, "epoch": 0.612711412980411, "grad_norm": 7.75, "learning_rate": 8.308336742913934e-06, "loss": 1.08647852, "memory(GiB)": 142.32, "step": 54780, "train_speed(iter/s)": 0.286666 }, { "acc": 0.73524084, "epoch": 0.6129351119263695, "grad_norm": 5.59375, "learning_rate": 8.306949825488477e-06, "loss": 1.0626936, "memory(GiB)": 142.32, "step": 54800, "train_speed(iter/s)": 0.286706 }, { "acc": 0.73970871, "epoch": 0.613158810872328, "grad_norm": 5.15625, "learning_rate": 8.305562455625016e-06, "loss": 1.02249813, "memory(GiB)": 142.32, "step": 54820, "train_speed(iter/s)": 0.286742 }, { "acc": 0.73116093, "epoch": 0.6133825098182866, "grad_norm": 5.625, "learning_rate": 8.304174633513364e-06, "loss": 1.07436199, "memory(GiB)": 142.32, "step": 54840, "train_speed(iter/s)": 0.286776 }, { "acc": 0.73641653, "epoch": 0.6136062087642451, "grad_norm": 6.34375, "learning_rate": 8.302786359343395e-06, "loss": 1.0428318, "memory(GiB)": 142.32, "step": 54860, "train_speed(iter/s)": 0.286813 }, { "acc": 0.72595081, "epoch": 0.6138299077102036, "grad_norm": 5.65625, "learning_rate": 8.301397633305045e-06, "loss": 1.08919191, "memory(GiB)": 142.32, "step": 54880, "train_speed(iter/s)": 0.28685 }, { "acc": 0.73917103, "epoch": 0.6140536066561622, "grad_norm": 6.65625, "learning_rate": 8.300008455588311e-06, "loss": 1.03030663, "memory(GiB)": 142.32, "step": 54900, "train_speed(iter/s)": 0.286885 }, { "acc": 0.72313223, "epoch": 0.6142773056021207, "grad_norm": 6.96875, "learning_rate": 8.298618826383251e-06, "loss": 1.10812674, "memory(GiB)": 142.32, "step": 54920, "train_speed(iter/s)": 0.286924 }, { "acc": 0.71198521, "epoch": 0.6145010045480792, "grad_norm": 4.8125, "learning_rate": 8.297228745879989e-06, "loss": 1.14518433, "memory(GiB)": 142.32, "step": 54940, "train_speed(iter/s)": 0.28696 }, { "acc": 0.7220026, "epoch": 0.6147247034940377, "grad_norm": 6.4375, "learning_rate": 8.295838214268704e-06, "loss": 1.11102152, "memory(GiB)": 142.32, "step": 54960, "train_speed(iter/s)": 0.286997 }, { "acc": 0.72954655, "epoch": 0.6149484024399963, "grad_norm": 6.46875, "learning_rate": 8.294447231739644e-06, "loss": 1.07191143, "memory(GiB)": 142.32, "step": 54980, "train_speed(iter/s)": 0.287034 }, { "acc": 0.72774353, "epoch": 0.6151721013859548, "grad_norm": 4.9375, "learning_rate": 8.293055798483116e-06, "loss": 1.0911911, "memory(GiB)": 142.32, "step": 55000, "train_speed(iter/s)": 0.287068 }, { "acc": 0.72530804, "epoch": 0.6153958003319133, "grad_norm": 5.1875, "learning_rate": 8.291663914689485e-06, "loss": 1.09562035, "memory(GiB)": 142.32, "step": 55020, "train_speed(iter/s)": 0.287102 }, { "acc": 0.71661649, "epoch": 0.6156194992778719, "grad_norm": 4.75, "learning_rate": 8.29027158054918e-06, "loss": 1.12195435, "memory(GiB)": 142.32, "step": 55040, "train_speed(iter/s)": 0.287134 }, { "acc": 0.71440511, "epoch": 0.6158431982238304, "grad_norm": 4.96875, "learning_rate": 8.288878796252695e-06, "loss": 1.14503326, "memory(GiB)": 142.32, "step": 55060, "train_speed(iter/s)": 0.287166 }, { "acc": 0.74669752, "epoch": 0.6160668971697889, "grad_norm": 5.09375, "learning_rate": 8.287485561990582e-06, "loss": 1.02145653, "memory(GiB)": 142.32, "step": 55080, "train_speed(iter/s)": 0.287203 }, { "acc": 0.72209759, "epoch": 0.6162905961157474, "grad_norm": 7.0, "learning_rate": 8.286091877953455e-06, "loss": 1.10813389, "memory(GiB)": 142.32, "step": 55100, "train_speed(iter/s)": 0.287237 }, { "acc": 0.72771311, "epoch": 0.616514295061706, "grad_norm": 5.34375, "learning_rate": 8.28469774433199e-06, "loss": 1.09104309, "memory(GiB)": 142.32, "step": 55120, "train_speed(iter/s)": 0.287274 }, { "acc": 0.73108912, "epoch": 0.6167379940076645, "grad_norm": 6.28125, "learning_rate": 8.283303161316924e-06, "loss": 1.08739471, "memory(GiB)": 142.32, "step": 55140, "train_speed(iter/s)": 0.287305 }, { "acc": 0.73832707, "epoch": 0.616961692953623, "grad_norm": 5.65625, "learning_rate": 8.28190812909906e-06, "loss": 1.0455657, "memory(GiB)": 142.32, "step": 55160, "train_speed(iter/s)": 0.287343 }, { "acc": 0.72558308, "epoch": 0.6171853918995815, "grad_norm": 5.78125, "learning_rate": 8.28051264786925e-06, "loss": 1.0868825, "memory(GiB)": 142.32, "step": 55180, "train_speed(iter/s)": 0.287377 }, { "acc": 0.72837162, "epoch": 0.6174090908455401, "grad_norm": 5.8125, "learning_rate": 8.279116717818422e-06, "loss": 1.08304348, "memory(GiB)": 142.32, "step": 55200, "train_speed(iter/s)": 0.287412 }, { "acc": 0.73275189, "epoch": 0.6176327897914986, "grad_norm": 5.65625, "learning_rate": 8.277720339137559e-06, "loss": 1.05595589, "memory(GiB)": 142.32, "step": 55220, "train_speed(iter/s)": 0.287445 }, { "acc": 0.73036752, "epoch": 0.6178564887374571, "grad_norm": 5.1875, "learning_rate": 8.276323512017702e-06, "loss": 1.07017317, "memory(GiB)": 142.32, "step": 55240, "train_speed(iter/s)": 0.287482 }, { "acc": 0.72052488, "epoch": 0.6180801876834157, "grad_norm": 6.125, "learning_rate": 8.27492623664996e-06, "loss": 1.09950733, "memory(GiB)": 142.32, "step": 55260, "train_speed(iter/s)": 0.287521 }, { "acc": 0.73960261, "epoch": 0.6183038866293742, "grad_norm": 6.21875, "learning_rate": 8.273528513225499e-06, "loss": 1.02861118, "memory(GiB)": 142.32, "step": 55280, "train_speed(iter/s)": 0.287557 }, { "acc": 0.73886833, "epoch": 0.6185275855753327, "grad_norm": 5.28125, "learning_rate": 8.27213034193555e-06, "loss": 1.0428853, "memory(GiB)": 142.32, "step": 55300, "train_speed(iter/s)": 0.287592 }, { "acc": 0.73724861, "epoch": 0.6187512845212912, "grad_norm": 5.125, "learning_rate": 8.270731722971398e-06, "loss": 1.05126905, "memory(GiB)": 142.32, "step": 55320, "train_speed(iter/s)": 0.287623 }, { "acc": 0.72194099, "epoch": 0.6189749834672498, "grad_norm": 5.96875, "learning_rate": 8.269332656524399e-06, "loss": 1.0983654, "memory(GiB)": 142.32, "step": 55340, "train_speed(iter/s)": 0.28766 }, { "acc": 0.73065557, "epoch": 0.6191986824132083, "grad_norm": 5.46875, "learning_rate": 8.26793314278596e-06, "loss": 1.06910524, "memory(GiB)": 142.32, "step": 55360, "train_speed(iter/s)": 0.287697 }, { "acc": 0.74162641, "epoch": 0.6194223813591668, "grad_norm": 5.71875, "learning_rate": 8.266533181947561e-06, "loss": 0.99884453, "memory(GiB)": 142.32, "step": 55380, "train_speed(iter/s)": 0.28773 }, { "acc": 0.72086248, "epoch": 0.6196460803051254, "grad_norm": 6.375, "learning_rate": 8.26513277420073e-06, "loss": 1.1080719, "memory(GiB)": 142.32, "step": 55400, "train_speed(iter/s)": 0.287766 }, { "acc": 0.73089514, "epoch": 0.6198697792510839, "grad_norm": 5.0, "learning_rate": 8.263731919737068e-06, "loss": 1.07030125, "memory(GiB)": 142.32, "step": 55420, "train_speed(iter/s)": 0.2878 }, { "acc": 0.72988615, "epoch": 0.6200934781970424, "grad_norm": 5.625, "learning_rate": 8.26233061874823e-06, "loss": 1.07116985, "memory(GiB)": 142.32, "step": 55440, "train_speed(iter/s)": 0.287836 }, { "acc": 0.73310499, "epoch": 0.6203171771430009, "grad_norm": 7.5625, "learning_rate": 8.260928871425932e-06, "loss": 1.07554493, "memory(GiB)": 142.32, "step": 55460, "train_speed(iter/s)": 0.287871 }, { "acc": 0.74988766, "epoch": 0.6205408760889595, "grad_norm": 6.40625, "learning_rate": 8.259526677961956e-06, "loss": 0.97216377, "memory(GiB)": 142.32, "step": 55480, "train_speed(iter/s)": 0.287906 }, { "acc": 0.72010908, "epoch": 0.620764575034918, "grad_norm": 5.65625, "learning_rate": 8.258124038548141e-06, "loss": 1.11909628, "memory(GiB)": 142.32, "step": 55500, "train_speed(iter/s)": 0.287938 }, { "acc": 0.71896381, "epoch": 0.6209882739808765, "grad_norm": 4.40625, "learning_rate": 8.256720953376389e-06, "loss": 1.12725945, "memory(GiB)": 142.32, "step": 55520, "train_speed(iter/s)": 0.287972 }, { "acc": 0.7184411, "epoch": 0.621211972926835, "grad_norm": 5.28125, "learning_rate": 8.25531742263866e-06, "loss": 1.12960014, "memory(GiB)": 142.32, "step": 55540, "train_speed(iter/s)": 0.288009 }, { "acc": 0.7116354, "epoch": 0.6214356718727936, "grad_norm": 5.96875, "learning_rate": 8.25391344652698e-06, "loss": 1.16444883, "memory(GiB)": 142.32, "step": 55560, "train_speed(iter/s)": 0.288041 }, { "acc": 0.72448559, "epoch": 0.6216593708187521, "grad_norm": 4.5, "learning_rate": 8.25250902523343e-06, "loss": 1.09262152, "memory(GiB)": 142.32, "step": 55580, "train_speed(iter/s)": 0.288076 }, { "acc": 0.74191232, "epoch": 0.6218830697647106, "grad_norm": 6.21875, "learning_rate": 8.251104158950158e-06, "loss": 1.01595049, "memory(GiB)": 142.32, "step": 55600, "train_speed(iter/s)": 0.288109 }, { "acc": 0.72258139, "epoch": 0.6221067687106692, "grad_norm": 6.3125, "learning_rate": 8.249698847869368e-06, "loss": 1.10460968, "memory(GiB)": 142.32, "step": 55620, "train_speed(iter/s)": 0.288147 }, { "acc": 0.72884884, "epoch": 0.6223304676566277, "grad_norm": 6.0, "learning_rate": 8.248293092183324e-06, "loss": 1.0812418, "memory(GiB)": 142.32, "step": 55640, "train_speed(iter/s)": 0.288182 }, { "acc": 0.73558741, "epoch": 0.6225541666025862, "grad_norm": 7.53125, "learning_rate": 8.246886892084359e-06, "loss": 1.05366936, "memory(GiB)": 142.32, "step": 55660, "train_speed(iter/s)": 0.288218 }, { "acc": 0.72804728, "epoch": 0.6227778655485448, "grad_norm": 6.03125, "learning_rate": 8.245480247764856e-06, "loss": 1.07242537, "memory(GiB)": 142.32, "step": 55680, "train_speed(iter/s)": 0.288253 }, { "acc": 0.72502871, "epoch": 0.6230015644945033, "grad_norm": 5.8125, "learning_rate": 8.244073159417268e-06, "loss": 1.10487709, "memory(GiB)": 142.32, "step": 55700, "train_speed(iter/s)": 0.288285 }, { "acc": 0.73410482, "epoch": 0.6232252634404618, "grad_norm": 6.28125, "learning_rate": 8.242665627234104e-06, "loss": 1.05649815, "memory(GiB)": 142.32, "step": 55720, "train_speed(iter/s)": 0.288319 }, { "acc": 0.72716856, "epoch": 0.6234489623864203, "grad_norm": 5.78125, "learning_rate": 8.241257651407933e-06, "loss": 1.07471714, "memory(GiB)": 142.32, "step": 55740, "train_speed(iter/s)": 0.288356 }, { "acc": 0.72480831, "epoch": 0.6236726613323789, "grad_norm": 5.5625, "learning_rate": 8.239849232131386e-06, "loss": 1.10200348, "memory(GiB)": 142.32, "step": 55760, "train_speed(iter/s)": 0.28839 }, { "acc": 0.72841311, "epoch": 0.6238963602783374, "grad_norm": 6.15625, "learning_rate": 8.238440369597157e-06, "loss": 1.07652969, "memory(GiB)": 142.32, "step": 55780, "train_speed(iter/s)": 0.288423 }, { "acc": 0.73516188, "epoch": 0.6241200592242959, "grad_norm": 5.84375, "learning_rate": 8.237031063997999e-06, "loss": 1.05161161, "memory(GiB)": 142.32, "step": 55800, "train_speed(iter/s)": 0.288458 }, { "acc": 0.72712011, "epoch": 0.6243437581702544, "grad_norm": 5.84375, "learning_rate": 8.23562131552672e-06, "loss": 1.09144421, "memory(GiB)": 142.32, "step": 55820, "train_speed(iter/s)": 0.288492 }, { "acc": 0.71950111, "epoch": 0.624567457116213, "grad_norm": 6.34375, "learning_rate": 8.234211124376199e-06, "loss": 1.12806988, "memory(GiB)": 142.32, "step": 55840, "train_speed(iter/s)": 0.288532 }, { "acc": 0.73350306, "epoch": 0.6247911560621715, "grad_norm": 6.90625, "learning_rate": 8.23280049073937e-06, "loss": 1.05255795, "memory(GiB)": 142.32, "step": 55860, "train_speed(iter/s)": 0.288562 }, { "acc": 0.72607918, "epoch": 0.62501485500813, "grad_norm": 5.59375, "learning_rate": 8.231389414809226e-06, "loss": 1.10049, "memory(GiB)": 142.32, "step": 55880, "train_speed(iter/s)": 0.288598 }, { "acc": 0.72750645, "epoch": 0.6252385539540886, "grad_norm": 5.375, "learning_rate": 8.229977896778822e-06, "loss": 1.07709742, "memory(GiB)": 142.32, "step": 55900, "train_speed(iter/s)": 0.288636 }, { "acc": 0.73150091, "epoch": 0.6254622529000471, "grad_norm": 5.5, "learning_rate": 8.228565936841274e-06, "loss": 1.07352524, "memory(GiB)": 142.32, "step": 55920, "train_speed(iter/s)": 0.288671 }, { "acc": 0.73545847, "epoch": 0.6256859518460056, "grad_norm": 6.96875, "learning_rate": 8.22715353518976e-06, "loss": 1.03929234, "memory(GiB)": 142.32, "step": 55940, "train_speed(iter/s)": 0.288706 }, { "acc": 0.71674843, "epoch": 0.6259096507919641, "grad_norm": 6.34375, "learning_rate": 8.225740692017516e-06, "loss": 1.12908888, "memory(GiB)": 142.32, "step": 55960, "train_speed(iter/s)": 0.288744 }, { "acc": 0.73342419, "epoch": 0.6261333497379227, "grad_norm": 5.34375, "learning_rate": 8.22432740751784e-06, "loss": 1.05184345, "memory(GiB)": 142.32, "step": 55980, "train_speed(iter/s)": 0.288777 }, { "acc": 0.7312645, "epoch": 0.6263570486838812, "grad_norm": 6.34375, "learning_rate": 8.222913681884085e-06, "loss": 1.05965719, "memory(GiB)": 142.32, "step": 56000, "train_speed(iter/s)": 0.288815 }, { "epoch": 0.6263570486838812, "eval_acc": 0.6929462698009874, "eval_loss": 1.0865025520324707, "eval_runtime": 2341.455, "eval_samples_per_second": 32.152, "eval_steps_per_second": 16.076, "step": 56000 }, { "acc": 0.72419395, "epoch": 0.6265807476298397, "grad_norm": 6.0625, "learning_rate": 8.221499515309676e-06, "loss": 1.10225468, "memory(GiB)": 142.32, "step": 56020, "train_speed(iter/s)": 0.285332 }, { "acc": 0.72751083, "epoch": 0.6268044465757983, "grad_norm": 5.5625, "learning_rate": 8.220084907988085e-06, "loss": 1.09514256, "memory(GiB)": 142.32, "step": 56040, "train_speed(iter/s)": 0.285369 }, { "acc": 0.73587203, "epoch": 0.6270281455217568, "grad_norm": 5.875, "learning_rate": 8.218669860112854e-06, "loss": 1.03929577, "memory(GiB)": 142.32, "step": 56060, "train_speed(iter/s)": 0.285404 }, { "acc": 0.72612209, "epoch": 0.6272518444677153, "grad_norm": 6.4375, "learning_rate": 8.21725437187758e-06, "loss": 1.07877226, "memory(GiB)": 142.32, "step": 56080, "train_speed(iter/s)": 0.285438 }, { "acc": 0.73814306, "epoch": 0.6274755434136738, "grad_norm": 5.09375, "learning_rate": 8.215838443475925e-06, "loss": 1.04229174, "memory(GiB)": 142.32, "step": 56100, "train_speed(iter/s)": 0.285473 }, { "acc": 0.73856697, "epoch": 0.6276992423596324, "grad_norm": 5.875, "learning_rate": 8.214422075101603e-06, "loss": 1.03267689, "memory(GiB)": 142.32, "step": 56120, "train_speed(iter/s)": 0.28551 }, { "acc": 0.72486768, "epoch": 0.6279229413055909, "grad_norm": 6.125, "learning_rate": 8.213005266948398e-06, "loss": 1.09701595, "memory(GiB)": 142.32, "step": 56140, "train_speed(iter/s)": 0.285544 }, { "acc": 0.73668799, "epoch": 0.6281466402515494, "grad_norm": 5.75, "learning_rate": 8.211588019210148e-06, "loss": 1.05909634, "memory(GiB)": 142.32, "step": 56160, "train_speed(iter/s)": 0.285579 }, { "acc": 0.73684301, "epoch": 0.628370339197508, "grad_norm": 5.5, "learning_rate": 8.210170332080752e-06, "loss": 1.04753351, "memory(GiB)": 142.32, "step": 56180, "train_speed(iter/s)": 0.285614 }, { "acc": 0.72761965, "epoch": 0.6285940381434665, "grad_norm": 5.09375, "learning_rate": 8.208752205754171e-06, "loss": 1.08614349, "memory(GiB)": 142.32, "step": 56200, "train_speed(iter/s)": 0.285652 }, { "acc": 0.72629433, "epoch": 0.628817737089425, "grad_norm": 5.90625, "learning_rate": 8.207333640424426e-06, "loss": 1.08425999, "memory(GiB)": 142.32, "step": 56220, "train_speed(iter/s)": 0.285687 }, { "acc": 0.73381662, "epoch": 0.6290414360353835, "grad_norm": 5.1875, "learning_rate": 8.205914636285594e-06, "loss": 1.05579472, "memory(GiB)": 142.32, "step": 56240, "train_speed(iter/s)": 0.28572 }, { "acc": 0.73649416, "epoch": 0.6292651349813421, "grad_norm": 5.4375, "learning_rate": 8.204495193531816e-06, "loss": 1.050317, "memory(GiB)": 142.32, "step": 56260, "train_speed(iter/s)": 0.285752 }, { "acc": 0.72687836, "epoch": 0.6294888339273006, "grad_norm": 6.125, "learning_rate": 8.203075312357295e-06, "loss": 1.08325672, "memory(GiB)": 142.32, "step": 56280, "train_speed(iter/s)": 0.285789 }, { "acc": 0.7336338, "epoch": 0.6297125328732591, "grad_norm": 5.71875, "learning_rate": 8.201654992956287e-06, "loss": 1.05979919, "memory(GiB)": 142.32, "step": 56300, "train_speed(iter/s)": 0.285815 }, { "acc": 0.73713746, "epoch": 0.6299362318192177, "grad_norm": 5.9375, "learning_rate": 8.200234235523114e-06, "loss": 1.03069181, "memory(GiB)": 142.32, "step": 56320, "train_speed(iter/s)": 0.285844 }, { "acc": 0.7199604, "epoch": 0.6301599307651762, "grad_norm": 5.6875, "learning_rate": 8.198813040252157e-06, "loss": 1.12359734, "memory(GiB)": 142.32, "step": 56340, "train_speed(iter/s)": 0.285878 }, { "acc": 0.73703685, "epoch": 0.6303836297111347, "grad_norm": 6.75, "learning_rate": 8.197391407337854e-06, "loss": 1.05757122, "memory(GiB)": 142.32, "step": 56360, "train_speed(iter/s)": 0.285911 }, { "acc": 0.72680864, "epoch": 0.6306073286570932, "grad_norm": 5.15625, "learning_rate": 8.195969336974705e-06, "loss": 1.09722614, "memory(GiB)": 142.32, "step": 56380, "train_speed(iter/s)": 0.285947 }, { "acc": 0.74431648, "epoch": 0.6308310276030518, "grad_norm": 6.375, "learning_rate": 8.194546829357269e-06, "loss": 1.00836506, "memory(GiB)": 142.32, "step": 56400, "train_speed(iter/s)": 0.285981 }, { "acc": 0.73088546, "epoch": 0.6310547265490103, "grad_norm": 5.71875, "learning_rate": 8.193123884680168e-06, "loss": 1.0685667, "memory(GiB)": 142.32, "step": 56420, "train_speed(iter/s)": 0.286016 }, { "acc": 0.74421854, "epoch": 0.6312784254949688, "grad_norm": 6.125, "learning_rate": 8.191700503138077e-06, "loss": 1.00197363, "memory(GiB)": 142.32, "step": 56440, "train_speed(iter/s)": 0.286047 }, { "acc": 0.7314857, "epoch": 0.6315021244409273, "grad_norm": 6.5625, "learning_rate": 8.190276684925738e-06, "loss": 1.07741041, "memory(GiB)": 142.32, "step": 56460, "train_speed(iter/s)": 0.286079 }, { "acc": 0.72948322, "epoch": 0.6317258233868859, "grad_norm": 6.15625, "learning_rate": 8.18885243023795e-06, "loss": 1.07697248, "memory(GiB)": 142.32, "step": 56480, "train_speed(iter/s)": 0.286115 }, { "acc": 0.7326582, "epoch": 0.6319495223328444, "grad_norm": 6.65625, "learning_rate": 8.18742773926957e-06, "loss": 1.05325842, "memory(GiB)": 142.32, "step": 56500, "train_speed(iter/s)": 0.286152 }, { "acc": 0.73009701, "epoch": 0.6321732212788029, "grad_norm": 6.1875, "learning_rate": 8.186002612215515e-06, "loss": 1.0795104, "memory(GiB)": 142.32, "step": 56520, "train_speed(iter/s)": 0.286185 }, { "acc": 0.72651911, "epoch": 0.6323969202247615, "grad_norm": 7.65625, "learning_rate": 8.184577049270765e-06, "loss": 1.09343681, "memory(GiB)": 142.32, "step": 56540, "train_speed(iter/s)": 0.286215 }, { "acc": 0.73664527, "epoch": 0.6326206191707201, "grad_norm": 6.8125, "learning_rate": 8.183151050630358e-06, "loss": 1.03727474, "memory(GiB)": 142.32, "step": 56560, "train_speed(iter/s)": 0.286251 }, { "acc": 0.72229266, "epoch": 0.6328443181166786, "grad_norm": 5.0, "learning_rate": 8.181724616489389e-06, "loss": 1.1102766, "memory(GiB)": 142.32, "step": 56580, "train_speed(iter/s)": 0.286282 }, { "acc": 0.71998396, "epoch": 0.6330680170626372, "grad_norm": 5.90625, "learning_rate": 8.180297747043014e-06, "loss": 1.1402957, "memory(GiB)": 142.32, "step": 56600, "train_speed(iter/s)": 0.286315 }, { "acc": 0.73845606, "epoch": 0.6332917160085957, "grad_norm": 5.40625, "learning_rate": 8.178870442486451e-06, "loss": 1.04072094, "memory(GiB)": 142.32, "step": 56620, "train_speed(iter/s)": 0.286349 }, { "acc": 0.72545648, "epoch": 0.6335154149545542, "grad_norm": 5.65625, "learning_rate": 8.177442703014975e-06, "loss": 1.07707996, "memory(GiB)": 142.32, "step": 56640, "train_speed(iter/s)": 0.286382 }, { "acc": 0.72117519, "epoch": 0.6337391139005127, "grad_norm": 6.375, "learning_rate": 8.17601452882392e-06, "loss": 1.11494541, "memory(GiB)": 142.32, "step": 56660, "train_speed(iter/s)": 0.286408 }, { "acc": 0.7216218, "epoch": 0.6339628128464713, "grad_norm": 6.53125, "learning_rate": 8.174585920108682e-06, "loss": 1.11443367, "memory(GiB)": 142.32, "step": 56680, "train_speed(iter/s)": 0.286443 }, { "acc": 0.72700109, "epoch": 0.6341865117924298, "grad_norm": 6.9375, "learning_rate": 8.173156877064717e-06, "loss": 1.08423882, "memory(GiB)": 142.32, "step": 56700, "train_speed(iter/s)": 0.286476 }, { "acc": 0.72149572, "epoch": 0.6344102107383883, "grad_norm": 5.34375, "learning_rate": 8.171727399887535e-06, "loss": 1.12121925, "memory(GiB)": 142.32, "step": 56720, "train_speed(iter/s)": 0.28651 }, { "acc": 0.73119841, "epoch": 0.6346339096843469, "grad_norm": 4.9375, "learning_rate": 8.170297488772709e-06, "loss": 1.07161694, "memory(GiB)": 142.32, "step": 56740, "train_speed(iter/s)": 0.286543 }, { "acc": 0.74097676, "epoch": 0.6348576086303054, "grad_norm": 5.9375, "learning_rate": 8.168867143915874e-06, "loss": 1.03699245, "memory(GiB)": 142.32, "step": 56760, "train_speed(iter/s)": 0.286576 }, { "acc": 0.72893338, "epoch": 0.6350813075762639, "grad_norm": 6.96875, "learning_rate": 8.16743636551272e-06, "loss": 1.07231922, "memory(GiB)": 142.32, "step": 56780, "train_speed(iter/s)": 0.286611 }, { "acc": 0.72358007, "epoch": 0.6353050065222224, "grad_norm": 6.5, "learning_rate": 8.166005153758997e-06, "loss": 1.11255665, "memory(GiB)": 142.32, "step": 56800, "train_speed(iter/s)": 0.286646 }, { "acc": 0.73170056, "epoch": 0.635528705468181, "grad_norm": 6.40625, "learning_rate": 8.164573508850517e-06, "loss": 1.0764267, "memory(GiB)": 142.32, "step": 56820, "train_speed(iter/s)": 0.286679 }, { "acc": 0.73725929, "epoch": 0.6357524044141395, "grad_norm": 5.5625, "learning_rate": 8.16314143098315e-06, "loss": 1.05359859, "memory(GiB)": 142.32, "step": 56840, "train_speed(iter/s)": 0.286713 }, { "acc": 0.74001799, "epoch": 0.635976103360098, "grad_norm": 5.5625, "learning_rate": 8.161708920352823e-06, "loss": 1.0247695, "memory(GiB)": 142.32, "step": 56860, "train_speed(iter/s)": 0.28674 }, { "acc": 0.72559204, "epoch": 0.6361998023060565, "grad_norm": 5.5625, "learning_rate": 8.160275977155523e-06, "loss": 1.1000639, "memory(GiB)": 142.32, "step": 56880, "train_speed(iter/s)": 0.286775 }, { "acc": 0.71968775, "epoch": 0.6364235012520151, "grad_norm": 5.0625, "learning_rate": 8.158842601587301e-06, "loss": 1.13303795, "memory(GiB)": 142.32, "step": 56900, "train_speed(iter/s)": 0.286813 }, { "acc": 0.73312016, "epoch": 0.6366472001979736, "grad_norm": 5.875, "learning_rate": 8.157408793844258e-06, "loss": 1.05816307, "memory(GiB)": 142.32, "step": 56920, "train_speed(iter/s)": 0.28685 }, { "acc": 0.73968101, "epoch": 0.6368708991439321, "grad_norm": 5.90625, "learning_rate": 8.155974554122562e-06, "loss": 1.03246975, "memory(GiB)": 142.32, "step": 56940, "train_speed(iter/s)": 0.286886 }, { "acc": 0.72521834, "epoch": 0.6370945980898907, "grad_norm": 5.125, "learning_rate": 8.15453988261844e-06, "loss": 1.11717205, "memory(GiB)": 142.32, "step": 56960, "train_speed(iter/s)": 0.286921 }, { "acc": 0.73432512, "epoch": 0.6373182970358492, "grad_norm": 5.5625, "learning_rate": 8.153104779528173e-06, "loss": 1.04418144, "memory(GiB)": 142.32, "step": 56980, "train_speed(iter/s)": 0.286956 }, { "acc": 0.72435036, "epoch": 0.6375419959818077, "grad_norm": 6.15625, "learning_rate": 8.151669245048104e-06, "loss": 1.09721851, "memory(GiB)": 142.32, "step": 57000, "train_speed(iter/s)": 0.286993 }, { "acc": 0.7259943, "epoch": 0.6377656949277662, "grad_norm": 6.65625, "learning_rate": 8.150233279374635e-06, "loss": 1.09556122, "memory(GiB)": 142.32, "step": 57020, "train_speed(iter/s)": 0.287023 }, { "acc": 0.71877623, "epoch": 0.6379893938737248, "grad_norm": 5.0, "learning_rate": 8.148796882704223e-06, "loss": 1.13559895, "memory(GiB)": 142.32, "step": 57040, "train_speed(iter/s)": 0.287055 }, { "acc": 0.73289995, "epoch": 0.6382130928196833, "grad_norm": 5.90625, "learning_rate": 8.147360055233395e-06, "loss": 1.07872362, "memory(GiB)": 142.32, "step": 57060, "train_speed(iter/s)": 0.287086 }, { "acc": 0.73330951, "epoch": 0.6384367917656418, "grad_norm": 5.09375, "learning_rate": 8.145922797158724e-06, "loss": 1.06369858, "memory(GiB)": 142.32, "step": 57080, "train_speed(iter/s)": 0.287119 }, { "acc": 0.72988224, "epoch": 0.6386604907116004, "grad_norm": 5.84375, "learning_rate": 8.144485108676847e-06, "loss": 1.0849781, "memory(GiB)": 142.32, "step": 57100, "train_speed(iter/s)": 0.287155 }, { "acc": 0.73898907, "epoch": 0.6388841896575589, "grad_norm": 5.875, "learning_rate": 8.143046989984464e-06, "loss": 1.02838554, "memory(GiB)": 142.32, "step": 57120, "train_speed(iter/s)": 0.287187 }, { "acc": 0.72588515, "epoch": 0.6391078886035174, "grad_norm": 5.53125, "learning_rate": 8.141608441278328e-06, "loss": 1.08301411, "memory(GiB)": 142.32, "step": 57140, "train_speed(iter/s)": 0.287219 }, { "acc": 0.73205762, "epoch": 0.6393315875494759, "grad_norm": 5.6875, "learning_rate": 8.140169462755252e-06, "loss": 1.08207254, "memory(GiB)": 142.32, "step": 57160, "train_speed(iter/s)": 0.287252 }, { "acc": 0.72125645, "epoch": 0.6395552864954345, "grad_norm": 5.90625, "learning_rate": 8.138730054612111e-06, "loss": 1.11573982, "memory(GiB)": 142.32, "step": 57180, "train_speed(iter/s)": 0.287283 }, { "acc": 0.74789171, "epoch": 0.639778985441393, "grad_norm": 6.75, "learning_rate": 8.137290217045837e-06, "loss": 0.98755531, "memory(GiB)": 142.32, "step": 57200, "train_speed(iter/s)": 0.287319 }, { "acc": 0.71690407, "epoch": 0.6400026843873515, "grad_norm": 5.75, "learning_rate": 8.135849950253416e-06, "loss": 1.13106804, "memory(GiB)": 142.32, "step": 57220, "train_speed(iter/s)": 0.287352 }, { "acc": 0.72123971, "epoch": 0.6402263833333101, "grad_norm": 5.5, "learning_rate": 8.134409254431903e-06, "loss": 1.12792921, "memory(GiB)": 142.32, "step": 57240, "train_speed(iter/s)": 0.287385 }, { "acc": 0.72866497, "epoch": 0.6404500822792686, "grad_norm": 5.5625, "learning_rate": 8.132968129778401e-06, "loss": 1.089468, "memory(GiB)": 142.32, "step": 57260, "train_speed(iter/s)": 0.28742 }, { "acc": 0.73457251, "epoch": 0.6406737812252271, "grad_norm": 5.96875, "learning_rate": 8.13152657649008e-06, "loss": 1.07094517, "memory(GiB)": 142.32, "step": 57280, "train_speed(iter/s)": 0.287455 }, { "acc": 0.72809896, "epoch": 0.6408974801711856, "grad_norm": 4.8125, "learning_rate": 8.130084594764162e-06, "loss": 1.08832378, "memory(GiB)": 142.32, "step": 57300, "train_speed(iter/s)": 0.287485 }, { "acc": 0.73699312, "epoch": 0.6411211791171442, "grad_norm": 6.0625, "learning_rate": 8.128642184797934e-06, "loss": 1.04012222, "memory(GiB)": 142.32, "step": 57320, "train_speed(iter/s)": 0.287519 }, { "acc": 0.72676392, "epoch": 0.6413448780631027, "grad_norm": 6.65625, "learning_rate": 8.127199346788734e-06, "loss": 1.07810326, "memory(GiB)": 142.32, "step": 57340, "train_speed(iter/s)": 0.287551 }, { "acc": 0.74454074, "epoch": 0.6415685770090612, "grad_norm": 6.21875, "learning_rate": 8.125756080933968e-06, "loss": 1.00113487, "memory(GiB)": 142.32, "step": 57360, "train_speed(iter/s)": 0.287585 }, { "acc": 0.73374486, "epoch": 0.6417922759550198, "grad_norm": 6.65625, "learning_rate": 8.124312387431092e-06, "loss": 1.06238766, "memory(GiB)": 142.32, "step": 57380, "train_speed(iter/s)": 0.287618 }, { "acc": 0.73818545, "epoch": 0.6420159749009783, "grad_norm": 6.75, "learning_rate": 8.122868266477623e-06, "loss": 1.0407156, "memory(GiB)": 142.32, "step": 57400, "train_speed(iter/s)": 0.287654 }, { "acc": 0.74085264, "epoch": 0.6422396738469368, "grad_norm": 6.625, "learning_rate": 8.121423718271142e-06, "loss": 1.02524567, "memory(GiB)": 142.32, "step": 57420, "train_speed(iter/s)": 0.287689 }, { "acc": 0.72064552, "epoch": 0.6424633727928953, "grad_norm": 5.96875, "learning_rate": 8.119978743009278e-06, "loss": 1.10769348, "memory(GiB)": 142.32, "step": 57440, "train_speed(iter/s)": 0.287715 }, { "acc": 0.73657475, "epoch": 0.6426870717388539, "grad_norm": 9.9375, "learning_rate": 8.11853334088973e-06, "loss": 1.04265099, "memory(GiB)": 142.32, "step": 57460, "train_speed(iter/s)": 0.287749 }, { "acc": 0.72027035, "epoch": 0.6429107706848124, "grad_norm": 5.6875, "learning_rate": 8.117087512110245e-06, "loss": 1.12084866, "memory(GiB)": 142.32, "step": 57480, "train_speed(iter/s)": 0.287781 }, { "acc": 0.716572, "epoch": 0.6431344696307709, "grad_norm": 6.9375, "learning_rate": 8.115641256868636e-06, "loss": 1.14285488, "memory(GiB)": 142.32, "step": 57500, "train_speed(iter/s)": 0.287817 }, { "acc": 0.72535701, "epoch": 0.6433581685767295, "grad_norm": 4.8125, "learning_rate": 8.114194575362769e-06, "loss": 1.08964052, "memory(GiB)": 142.32, "step": 57520, "train_speed(iter/s)": 0.287853 }, { "acc": 0.7275054, "epoch": 0.643581867522688, "grad_norm": 6.34375, "learning_rate": 8.112747467790572e-06, "loss": 1.09823513, "memory(GiB)": 142.32, "step": 57540, "train_speed(iter/s)": 0.287885 }, { "acc": 0.72541647, "epoch": 0.6438055664686465, "grad_norm": 6.40625, "learning_rate": 8.11129993435003e-06, "loss": 1.09960232, "memory(GiB)": 142.32, "step": 57560, "train_speed(iter/s)": 0.287917 }, { "acc": 0.74090214, "epoch": 0.644029265414605, "grad_norm": 5.375, "learning_rate": 8.109851975239188e-06, "loss": 1.02292919, "memory(GiB)": 142.32, "step": 57580, "train_speed(iter/s)": 0.287948 }, { "acc": 0.73257952, "epoch": 0.6442529643605636, "grad_norm": 5.78125, "learning_rate": 8.108403590656144e-06, "loss": 1.04574537, "memory(GiB)": 142.32, "step": 57600, "train_speed(iter/s)": 0.287981 }, { "acc": 0.72143693, "epoch": 0.6444766633065221, "grad_norm": 5.5, "learning_rate": 8.106954780799062e-06, "loss": 1.11753368, "memory(GiB)": 142.32, "step": 57620, "train_speed(iter/s)": 0.288016 }, { "acc": 0.73176279, "epoch": 0.6447003622524806, "grad_norm": 5.34375, "learning_rate": 8.105505545866155e-06, "loss": 1.0707818, "memory(GiB)": 142.32, "step": 57640, "train_speed(iter/s)": 0.288051 }, { "acc": 0.7347743, "epoch": 0.6449240611984391, "grad_norm": 5.5, "learning_rate": 8.104055886055702e-06, "loss": 1.04565582, "memory(GiB)": 142.32, "step": 57660, "train_speed(iter/s)": 0.288088 }, { "acc": 0.72795982, "epoch": 0.6451477601443977, "grad_norm": 5.78125, "learning_rate": 8.102605801566038e-06, "loss": 1.09575958, "memory(GiB)": 142.32, "step": 57680, "train_speed(iter/s)": 0.28812 }, { "acc": 0.73177872, "epoch": 0.6453714590903562, "grad_norm": 5.65625, "learning_rate": 8.101155292595551e-06, "loss": 1.06885948, "memory(GiB)": 142.32, "step": 57700, "train_speed(iter/s)": 0.288151 }, { "acc": 0.7343935, "epoch": 0.6455951580363147, "grad_norm": 6.0625, "learning_rate": 8.099704359342695e-06, "loss": 1.05584164, "memory(GiB)": 142.32, "step": 57720, "train_speed(iter/s)": 0.288182 }, { "acc": 0.73102698, "epoch": 0.6458188569822733, "grad_norm": 5.15625, "learning_rate": 8.098253002005979e-06, "loss": 1.06997414, "memory(GiB)": 142.32, "step": 57740, "train_speed(iter/s)": 0.288219 }, { "acc": 0.719347, "epoch": 0.6460425559282318, "grad_norm": 6.75, "learning_rate": 8.096801220783967e-06, "loss": 1.11572895, "memory(GiB)": 142.32, "step": 57760, "train_speed(iter/s)": 0.288256 }, { "acc": 0.7365921, "epoch": 0.6462662548741903, "grad_norm": 6.09375, "learning_rate": 8.095349015875284e-06, "loss": 1.05035772, "memory(GiB)": 142.32, "step": 57780, "train_speed(iter/s)": 0.28829 }, { "acc": 0.7199954, "epoch": 0.6464899538201488, "grad_norm": 6.03125, "learning_rate": 8.093896387478615e-06, "loss": 1.12482548, "memory(GiB)": 142.32, "step": 57800, "train_speed(iter/s)": 0.288326 }, { "acc": 0.72889891, "epoch": 0.6467136527661074, "grad_norm": 5.71875, "learning_rate": 8.092443335792697e-06, "loss": 1.07364178, "memory(GiB)": 142.32, "step": 57820, "train_speed(iter/s)": 0.288362 }, { "acc": 0.73356247, "epoch": 0.6469373517120659, "grad_norm": 6.0, "learning_rate": 8.090989861016329e-06, "loss": 1.07267189, "memory(GiB)": 142.32, "step": 57840, "train_speed(iter/s)": 0.2884 }, { "acc": 0.73950768, "epoch": 0.6471610506580244, "grad_norm": 6.40625, "learning_rate": 8.089535963348367e-06, "loss": 1.01521902, "memory(GiB)": 142.32, "step": 57860, "train_speed(iter/s)": 0.288434 }, { "acc": 0.74000483, "epoch": 0.647384749603983, "grad_norm": 6.5, "learning_rate": 8.08808164298773e-06, "loss": 1.03083935, "memory(GiB)": 142.32, "step": 57880, "train_speed(iter/s)": 0.288466 }, { "acc": 0.7177062, "epoch": 0.6476084485499415, "grad_norm": 5.9375, "learning_rate": 8.08662690013338e-06, "loss": 1.13087654, "memory(GiB)": 142.32, "step": 57900, "train_speed(iter/s)": 0.288501 }, { "acc": 0.72548594, "epoch": 0.6478321474959, "grad_norm": 4.65625, "learning_rate": 8.085171734984353e-06, "loss": 1.07808542, "memory(GiB)": 142.32, "step": 57920, "train_speed(iter/s)": 0.288535 }, { "acc": 0.73375039, "epoch": 0.6480558464418585, "grad_norm": 6.9375, "learning_rate": 8.083716147739738e-06, "loss": 1.05113659, "memory(GiB)": 142.32, "step": 57940, "train_speed(iter/s)": 0.288567 }, { "acc": 0.73363028, "epoch": 0.6482795453878171, "grad_norm": 6.375, "learning_rate": 8.082260138598674e-06, "loss": 1.04244881, "memory(GiB)": 142.32, "step": 57960, "train_speed(iter/s)": 0.288596 }, { "acc": 0.72063174, "epoch": 0.6485032443337756, "grad_norm": 7.0, "learning_rate": 8.08080370776037e-06, "loss": 1.11167221, "memory(GiB)": 142.32, "step": 57980, "train_speed(iter/s)": 0.288632 }, { "acc": 0.73305435, "epoch": 0.6487269432797341, "grad_norm": 6.1875, "learning_rate": 8.079346855424084e-06, "loss": 1.06221647, "memory(GiB)": 142.32, "step": 58000, "train_speed(iter/s)": 0.288669 }, { "epoch": 0.6487269432797341, "eval_acc": 0.693221739509376, "eval_loss": 1.085546612739563, "eval_runtime": 2342.3576, "eval_samples_per_second": 32.14, "eval_steps_per_second": 16.07, "step": 58000 }, { "acc": 0.72273016, "epoch": 0.6489506422256927, "grad_norm": 5.34375, "learning_rate": 8.077889581789133e-06, "loss": 1.10906744, "memory(GiB)": 142.32, "step": 58020, "train_speed(iter/s)": 0.285305 }, { "acc": 0.7219924, "epoch": 0.6491743411716512, "grad_norm": 5.5, "learning_rate": 8.076431887054894e-06, "loss": 1.09910851, "memory(GiB)": 142.32, "step": 58040, "train_speed(iter/s)": 0.285338 }, { "acc": 0.73285871, "epoch": 0.6493980401176097, "grad_norm": 5.78125, "learning_rate": 8.0749737714208e-06, "loss": 1.06998882, "memory(GiB)": 142.32, "step": 58060, "train_speed(iter/s)": 0.28537 }, { "acc": 0.74168262, "epoch": 0.6496217390635682, "grad_norm": 5.875, "learning_rate": 8.073515235086345e-06, "loss": 1.01526432, "memory(GiB)": 142.32, "step": 58080, "train_speed(iter/s)": 0.285403 }, { "acc": 0.72311325, "epoch": 0.6498454380095268, "grad_norm": 5.40625, "learning_rate": 8.072056278251073e-06, "loss": 1.09987946, "memory(GiB)": 142.32, "step": 58100, "train_speed(iter/s)": 0.285435 }, { "acc": 0.72309427, "epoch": 0.6500691369554853, "grad_norm": 5.59375, "learning_rate": 8.070596901114594e-06, "loss": 1.10679159, "memory(GiB)": 142.32, "step": 58120, "train_speed(iter/s)": 0.28547 }, { "acc": 0.73333168, "epoch": 0.6502928359014438, "grad_norm": 5.5, "learning_rate": 8.069137103876568e-06, "loss": 1.06378231, "memory(GiB)": 142.32, "step": 58140, "train_speed(iter/s)": 0.285504 }, { "acc": 0.7268465, "epoch": 0.6505165348474024, "grad_norm": 6.25, "learning_rate": 8.067676886736719e-06, "loss": 1.06471004, "memory(GiB)": 142.32, "step": 58160, "train_speed(iter/s)": 0.285536 }, { "acc": 0.72476978, "epoch": 0.6507402337933609, "grad_norm": 5.625, "learning_rate": 8.066216249894824e-06, "loss": 1.1144084, "memory(GiB)": 142.32, "step": 58180, "train_speed(iter/s)": 0.285568 }, { "acc": 0.73060765, "epoch": 0.6509639327393194, "grad_norm": 5.90625, "learning_rate": 8.064755193550721e-06, "loss": 1.07957916, "memory(GiB)": 142.32, "step": 58200, "train_speed(iter/s)": 0.285602 }, { "acc": 0.73408527, "epoch": 0.6511876316852779, "grad_norm": 5.90625, "learning_rate": 8.063293717904303e-06, "loss": 1.05168343, "memory(GiB)": 142.32, "step": 58220, "train_speed(iter/s)": 0.285635 }, { "acc": 0.73271403, "epoch": 0.6514113306312365, "grad_norm": 5.96875, "learning_rate": 8.06183182315552e-06, "loss": 1.06055775, "memory(GiB)": 142.32, "step": 58240, "train_speed(iter/s)": 0.285662 }, { "acc": 0.72936964, "epoch": 0.651635029577195, "grad_norm": 6.40625, "learning_rate": 8.060369509504377e-06, "loss": 1.09493418, "memory(GiB)": 142.32, "step": 58260, "train_speed(iter/s)": 0.285701 }, { "acc": 0.72866592, "epoch": 0.6518587285231535, "grad_norm": 5.53125, "learning_rate": 8.058906777150943e-06, "loss": 1.07949829, "memory(GiB)": 142.32, "step": 58280, "train_speed(iter/s)": 0.285731 }, { "acc": 0.72474279, "epoch": 0.652082427469112, "grad_norm": 5.40625, "learning_rate": 8.057443626295342e-06, "loss": 1.09980907, "memory(GiB)": 142.32, "step": 58300, "train_speed(iter/s)": 0.285765 }, { "acc": 0.73048553, "epoch": 0.6523061264150706, "grad_norm": 4.9375, "learning_rate": 8.055980057137752e-06, "loss": 1.07217216, "memory(GiB)": 142.32, "step": 58320, "train_speed(iter/s)": 0.285801 }, { "acc": 0.73559184, "epoch": 0.6525298253610291, "grad_norm": 5.59375, "learning_rate": 8.054516069878408e-06, "loss": 1.06064157, "memory(GiB)": 142.32, "step": 58340, "train_speed(iter/s)": 0.285839 }, { "acc": 0.72745466, "epoch": 0.6527535243069876, "grad_norm": 7.46875, "learning_rate": 8.053051664717606e-06, "loss": 1.08897305, "memory(GiB)": 142.32, "step": 58360, "train_speed(iter/s)": 0.285873 }, { "acc": 0.73072901, "epoch": 0.6529772232529462, "grad_norm": 5.5, "learning_rate": 8.051586841855702e-06, "loss": 1.07225189, "memory(GiB)": 142.32, "step": 58380, "train_speed(iter/s)": 0.285907 }, { "acc": 0.72316647, "epoch": 0.6532009221989047, "grad_norm": 5.84375, "learning_rate": 8.050121601493097e-06, "loss": 1.10751133, "memory(GiB)": 142.32, "step": 58400, "train_speed(iter/s)": 0.285944 }, { "acc": 0.73139524, "epoch": 0.6534246211448632, "grad_norm": 6.40625, "learning_rate": 8.048655943830261e-06, "loss": 1.05891304, "memory(GiB)": 142.32, "step": 58420, "train_speed(iter/s)": 0.285972 }, { "acc": 0.72505522, "epoch": 0.6536483200908217, "grad_norm": 5.96875, "learning_rate": 8.047189869067718e-06, "loss": 1.11095505, "memory(GiB)": 142.32, "step": 58440, "train_speed(iter/s)": 0.286003 }, { "acc": 0.72913809, "epoch": 0.6538720190367803, "grad_norm": 5.84375, "learning_rate": 8.045723377406046e-06, "loss": 1.07296076, "memory(GiB)": 142.32, "step": 58460, "train_speed(iter/s)": 0.286039 }, { "acc": 0.74126997, "epoch": 0.6540957179827388, "grad_norm": 7.09375, "learning_rate": 8.044256469045882e-06, "loss": 1.04221077, "memory(GiB)": 142.32, "step": 58480, "train_speed(iter/s)": 0.286076 }, { "acc": 0.74375277, "epoch": 0.6543194169286973, "grad_norm": 5.28125, "learning_rate": 8.042789144187922e-06, "loss": 1.01634674, "memory(GiB)": 142.32, "step": 58500, "train_speed(iter/s)": 0.286111 }, { "acc": 0.73354578, "epoch": 0.6545431158746559, "grad_norm": 6.4375, "learning_rate": 8.041321403032914e-06, "loss": 1.06776237, "memory(GiB)": 142.32, "step": 58520, "train_speed(iter/s)": 0.286142 }, { "acc": 0.73410163, "epoch": 0.6547668148206144, "grad_norm": 5.375, "learning_rate": 8.039853245781669e-06, "loss": 1.05394382, "memory(GiB)": 142.32, "step": 58540, "train_speed(iter/s)": 0.286179 }, { "acc": 0.73316669, "epoch": 0.6549905137665729, "grad_norm": 5.75, "learning_rate": 8.03838467263505e-06, "loss": 1.04783783, "memory(GiB)": 142.32, "step": 58560, "train_speed(iter/s)": 0.286214 }, { "acc": 0.73904848, "epoch": 0.6552142127125314, "grad_norm": 5.1875, "learning_rate": 8.03691568379398e-06, "loss": 1.04578686, "memory(GiB)": 142.32, "step": 58580, "train_speed(iter/s)": 0.286248 }, { "acc": 0.73869019, "epoch": 0.65543791165849, "grad_norm": 5.6875, "learning_rate": 8.035446279459436e-06, "loss": 1.04269485, "memory(GiB)": 142.32, "step": 58600, "train_speed(iter/s)": 0.28628 }, { "acc": 0.72486644, "epoch": 0.6556616106044485, "grad_norm": 5.375, "learning_rate": 8.033976459832453e-06, "loss": 1.09799995, "memory(GiB)": 142.32, "step": 58620, "train_speed(iter/s)": 0.286308 }, { "acc": 0.72920055, "epoch": 0.655885309550407, "grad_norm": 5.625, "learning_rate": 8.032506225114126e-06, "loss": 1.07049828, "memory(GiB)": 142.32, "step": 58640, "train_speed(iter/s)": 0.28634 }, { "acc": 0.73171983, "epoch": 0.6561090084963656, "grad_norm": 5.8125, "learning_rate": 8.031035575505603e-06, "loss": 1.06406679, "memory(GiB)": 142.32, "step": 58660, "train_speed(iter/s)": 0.286373 }, { "acc": 0.74043727, "epoch": 0.6563327074423241, "grad_norm": 5.34375, "learning_rate": 8.02956451120809e-06, "loss": 1.05259523, "memory(GiB)": 142.32, "step": 58680, "train_speed(iter/s)": 0.286402 }, { "acc": 0.73217902, "epoch": 0.6565564063882826, "grad_norm": 5.78125, "learning_rate": 8.02809303242285e-06, "loss": 1.0732131, "memory(GiB)": 142.32, "step": 58700, "train_speed(iter/s)": 0.286437 }, { "acc": 0.74238019, "epoch": 0.6567801053342411, "grad_norm": 6.375, "learning_rate": 8.0266211393512e-06, "loss": 1.0228734, "memory(GiB)": 142.32, "step": 58720, "train_speed(iter/s)": 0.286466 }, { "acc": 0.74021015, "epoch": 0.6570038042801997, "grad_norm": 5.90625, "learning_rate": 8.02514883219452e-06, "loss": 1.02410469, "memory(GiB)": 142.32, "step": 58740, "train_speed(iter/s)": 0.2865 }, { "acc": 0.72166457, "epoch": 0.6572275032261582, "grad_norm": 6.25, "learning_rate": 8.02367611115424e-06, "loss": 1.1070488, "memory(GiB)": 142.32, "step": 58760, "train_speed(iter/s)": 0.286533 }, { "acc": 0.72889395, "epoch": 0.6574512021721167, "grad_norm": 5.0625, "learning_rate": 8.022202976431848e-06, "loss": 1.0760067, "memory(GiB)": 142.32, "step": 58780, "train_speed(iter/s)": 0.286569 }, { "acc": 0.73168001, "epoch": 0.6576749011180753, "grad_norm": 6.28125, "learning_rate": 8.020729428228893e-06, "loss": 1.07796078, "memory(GiB)": 142.32, "step": 58800, "train_speed(iter/s)": 0.2866 }, { "acc": 0.72738729, "epoch": 0.6578986000640338, "grad_norm": 4.96875, "learning_rate": 8.019255466746975e-06, "loss": 1.08421593, "memory(GiB)": 142.32, "step": 58820, "train_speed(iter/s)": 0.286631 }, { "acc": 0.72875681, "epoch": 0.6581222990099923, "grad_norm": 5.875, "learning_rate": 8.017781092187755e-06, "loss": 1.05491123, "memory(GiB)": 142.32, "step": 58840, "train_speed(iter/s)": 0.286664 }, { "acc": 0.7280448, "epoch": 0.6583459979559508, "grad_norm": 6.1875, "learning_rate": 8.016306304752947e-06, "loss": 1.08786335, "memory(GiB)": 142.32, "step": 58860, "train_speed(iter/s)": 0.286698 }, { "acc": 0.74347444, "epoch": 0.6585696969019094, "grad_norm": 6.34375, "learning_rate": 8.014831104644325e-06, "loss": 1.00934868, "memory(GiB)": 142.32, "step": 58880, "train_speed(iter/s)": 0.286736 }, { "acc": 0.7318778, "epoch": 0.6587933958478679, "grad_norm": 5.5625, "learning_rate": 8.013355492063715e-06, "loss": 1.05558748, "memory(GiB)": 142.32, "step": 58900, "train_speed(iter/s)": 0.286764 }, { "acc": 0.73460922, "epoch": 0.6590170947938264, "grad_norm": 5.40625, "learning_rate": 8.011879467213002e-06, "loss": 1.04557514, "memory(GiB)": 142.32, "step": 58920, "train_speed(iter/s)": 0.286798 }, { "acc": 0.72687693, "epoch": 0.659240793739785, "grad_norm": 6.09375, "learning_rate": 8.010403030294129e-06, "loss": 1.088974, "memory(GiB)": 142.32, "step": 58940, "train_speed(iter/s)": 0.28683 }, { "acc": 0.72883577, "epoch": 0.6594644926857435, "grad_norm": 6.1875, "learning_rate": 8.008926181509093e-06, "loss": 1.06582823, "memory(GiB)": 142.32, "step": 58960, "train_speed(iter/s)": 0.286866 }, { "acc": 0.73407154, "epoch": 0.659688191631702, "grad_norm": 5.0625, "learning_rate": 8.007448921059948e-06, "loss": 1.05571556, "memory(GiB)": 142.32, "step": 58980, "train_speed(iter/s)": 0.2869 }, { "acc": 0.72272654, "epoch": 0.6599118905776605, "grad_norm": 5.53125, "learning_rate": 8.005971249148804e-06, "loss": 1.10552464, "memory(GiB)": 142.32, "step": 59000, "train_speed(iter/s)": 0.286936 }, { "acc": 0.72171292, "epoch": 0.6601355895236191, "grad_norm": 6.125, "learning_rate": 8.004493165977827e-06, "loss": 1.11082401, "memory(GiB)": 142.32, "step": 59020, "train_speed(iter/s)": 0.286968 }, { "acc": 0.72354269, "epoch": 0.6603592884695776, "grad_norm": 5.0625, "learning_rate": 8.003014671749241e-06, "loss": 1.09596863, "memory(GiB)": 142.32, "step": 59040, "train_speed(iter/s)": 0.287001 }, { "acc": 0.73971243, "epoch": 0.6605829874155362, "grad_norm": 5.21875, "learning_rate": 8.001535766665326e-06, "loss": 1.02527218, "memory(GiB)": 142.32, "step": 59060, "train_speed(iter/s)": 0.287034 }, { "acc": 0.72363596, "epoch": 0.6608066863614948, "grad_norm": 5.6875, "learning_rate": 8.000056450928418e-06, "loss": 1.10216656, "memory(GiB)": 142.32, "step": 59080, "train_speed(iter/s)": 0.287066 }, { "acc": 0.72672815, "epoch": 0.6610303853074533, "grad_norm": 7.40625, "learning_rate": 7.998576724740903e-06, "loss": 1.08182735, "memory(GiB)": 142.32, "step": 59100, "train_speed(iter/s)": 0.287098 }, { "acc": 0.73364315, "epoch": 0.6612540842534118, "grad_norm": 5.03125, "learning_rate": 7.997096588305235e-06, "loss": 1.05699806, "memory(GiB)": 142.32, "step": 59120, "train_speed(iter/s)": 0.287132 }, { "acc": 0.73118963, "epoch": 0.6614777831993703, "grad_norm": 7.21875, "learning_rate": 7.995616041823914e-06, "loss": 1.07818813, "memory(GiB)": 142.32, "step": 59140, "train_speed(iter/s)": 0.287165 }, { "acc": 0.73020763, "epoch": 0.6617014821453289, "grad_norm": 5.6875, "learning_rate": 7.994135085499502e-06, "loss": 1.06297922, "memory(GiB)": 142.32, "step": 59160, "train_speed(iter/s)": 0.287195 }, { "acc": 0.73230286, "epoch": 0.6619251810912874, "grad_norm": 5.8125, "learning_rate": 7.992653719534613e-06, "loss": 1.06617842, "memory(GiB)": 142.32, "step": 59180, "train_speed(iter/s)": 0.287226 }, { "acc": 0.73568096, "epoch": 0.6621488800372459, "grad_norm": 5.96875, "learning_rate": 7.991171944131922e-06, "loss": 1.03531361, "memory(GiB)": 142.32, "step": 59200, "train_speed(iter/s)": 0.287258 }, { "acc": 0.73371301, "epoch": 0.6623725789832045, "grad_norm": 5.1875, "learning_rate": 7.989689759494155e-06, "loss": 1.06438351, "memory(GiB)": 142.32, "step": 59220, "train_speed(iter/s)": 0.287289 }, { "acc": 0.73646297, "epoch": 0.662596277929163, "grad_norm": 5.5, "learning_rate": 7.988207165824096e-06, "loss": 1.02534056, "memory(GiB)": 142.32, "step": 59240, "train_speed(iter/s)": 0.287322 }, { "acc": 0.73413782, "epoch": 0.6628199768751215, "grad_norm": 4.78125, "learning_rate": 7.986724163324585e-06, "loss": 1.0578042, "memory(GiB)": 142.32, "step": 59260, "train_speed(iter/s)": 0.287352 }, { "acc": 0.73672819, "epoch": 0.66304367582108, "grad_norm": 5.21875, "learning_rate": 7.98524075219852e-06, "loss": 1.03988323, "memory(GiB)": 142.32, "step": 59280, "train_speed(iter/s)": 0.28738 }, { "acc": 0.72706523, "epoch": 0.6632673747670386, "grad_norm": 6.25, "learning_rate": 7.98375693264885e-06, "loss": 1.09127197, "memory(GiB)": 142.32, "step": 59300, "train_speed(iter/s)": 0.287415 }, { "acc": 0.7174314, "epoch": 0.6634910737129971, "grad_norm": 4.90625, "learning_rate": 7.982272704878582e-06, "loss": 1.13235798, "memory(GiB)": 142.32, "step": 59320, "train_speed(iter/s)": 0.287444 }, { "acc": 0.73350544, "epoch": 0.6637147726589556, "grad_norm": 6.34375, "learning_rate": 7.980788069090784e-06, "loss": 1.04301243, "memory(GiB)": 142.32, "step": 59340, "train_speed(iter/s)": 0.287478 }, { "acc": 0.73195686, "epoch": 0.6639384716049141, "grad_norm": 6.78125, "learning_rate": 7.979303025488571e-06, "loss": 1.0757081, "memory(GiB)": 142.32, "step": 59360, "train_speed(iter/s)": 0.28751 }, { "acc": 0.72687678, "epoch": 0.6641621705508727, "grad_norm": 4.75, "learning_rate": 7.977817574275123e-06, "loss": 1.08489628, "memory(GiB)": 142.32, "step": 59380, "train_speed(iter/s)": 0.287541 }, { "acc": 0.72161179, "epoch": 0.6643858694968312, "grad_norm": 4.96875, "learning_rate": 7.976331715653666e-06, "loss": 1.10041962, "memory(GiB)": 142.32, "step": 59400, "train_speed(iter/s)": 0.287573 }, { "acc": 0.73395681, "epoch": 0.6646095684427897, "grad_norm": 5.71875, "learning_rate": 7.974845449827489e-06, "loss": 1.05320053, "memory(GiB)": 142.32, "step": 59420, "train_speed(iter/s)": 0.287603 }, { "acc": 0.73034239, "epoch": 0.6648332673887483, "grad_norm": 6.8125, "learning_rate": 7.973358776999935e-06, "loss": 1.07500525, "memory(GiB)": 142.32, "step": 59440, "train_speed(iter/s)": 0.287637 }, { "acc": 0.71964779, "epoch": 0.6650569663347068, "grad_norm": 5.3125, "learning_rate": 7.9718716973744e-06, "loss": 1.10324707, "memory(GiB)": 142.32, "step": 59460, "train_speed(iter/s)": 0.287669 }, { "acc": 0.72438755, "epoch": 0.6652806652806653, "grad_norm": 5.78125, "learning_rate": 7.97038421115434e-06, "loss": 1.10326118, "memory(GiB)": 142.32, "step": 59480, "train_speed(iter/s)": 0.287701 }, { "acc": 0.7310853, "epoch": 0.6655043642266238, "grad_norm": 6.21875, "learning_rate": 7.968896318543262e-06, "loss": 1.07086468, "memory(GiB)": 142.32, "step": 59500, "train_speed(iter/s)": 0.287732 }, { "acc": 0.72000389, "epoch": 0.6657280631725824, "grad_norm": 7.40625, "learning_rate": 7.967408019744734e-06, "loss": 1.11547508, "memory(GiB)": 142.32, "step": 59520, "train_speed(iter/s)": 0.287762 }, { "acc": 0.73315477, "epoch": 0.6659517621185409, "grad_norm": 7.34375, "learning_rate": 7.965919314962374e-06, "loss": 1.05407143, "memory(GiB)": 142.32, "step": 59540, "train_speed(iter/s)": 0.287793 }, { "acc": 0.72453547, "epoch": 0.6661754610644994, "grad_norm": 5.8125, "learning_rate": 7.964430204399858e-06, "loss": 1.10954466, "memory(GiB)": 142.32, "step": 59560, "train_speed(iter/s)": 0.287825 }, { "acc": 0.72714825, "epoch": 0.666399160010458, "grad_norm": 6.8125, "learning_rate": 7.962940688260918e-06, "loss": 1.07330837, "memory(GiB)": 142.32, "step": 59580, "train_speed(iter/s)": 0.287857 }, { "acc": 0.72877998, "epoch": 0.6666228589564165, "grad_norm": 5.21875, "learning_rate": 7.961450766749343e-06, "loss": 1.08899918, "memory(GiB)": 142.32, "step": 59600, "train_speed(iter/s)": 0.287888 }, { "acc": 0.74224911, "epoch": 0.666846557902375, "grad_norm": 6.21875, "learning_rate": 7.959960440068975e-06, "loss": 1.02404785, "memory(GiB)": 142.32, "step": 59620, "train_speed(iter/s)": 0.287917 }, { "acc": 0.73043628, "epoch": 0.6670702568483335, "grad_norm": 5.65625, "learning_rate": 7.95846970842371e-06, "loss": 1.08373919, "memory(GiB)": 142.32, "step": 59640, "train_speed(iter/s)": 0.287952 }, { "acc": 0.73299012, "epoch": 0.6672939557942921, "grad_norm": 5.78125, "learning_rate": 7.956978572017504e-06, "loss": 1.0735939, "memory(GiB)": 142.32, "step": 59660, "train_speed(iter/s)": 0.287988 }, { "acc": 0.73113251, "epoch": 0.6675176547402506, "grad_norm": 7.46875, "learning_rate": 7.955487031054364e-06, "loss": 1.08203316, "memory(GiB)": 142.32, "step": 59680, "train_speed(iter/s)": 0.288019 }, { "acc": 0.74425449, "epoch": 0.6677413536862091, "grad_norm": 6.03125, "learning_rate": 7.953995085738354e-06, "loss": 1.00541487, "memory(GiB)": 142.32, "step": 59700, "train_speed(iter/s)": 0.288056 }, { "acc": 0.73074856, "epoch": 0.6679650526321677, "grad_norm": 5.96875, "learning_rate": 7.952502736273594e-06, "loss": 1.06637754, "memory(GiB)": 142.32, "step": 59720, "train_speed(iter/s)": 0.288093 }, { "acc": 0.72339115, "epoch": 0.6681887515781262, "grad_norm": 6.71875, "learning_rate": 7.951009982864257e-06, "loss": 1.1034049, "memory(GiB)": 142.32, "step": 59740, "train_speed(iter/s)": 0.288129 }, { "acc": 0.73548112, "epoch": 0.6684124505240847, "grad_norm": 5.90625, "learning_rate": 7.949516825714578e-06, "loss": 1.03694305, "memory(GiB)": 142.32, "step": 59760, "train_speed(iter/s)": 0.288157 }, { "acc": 0.72679863, "epoch": 0.6686361494700432, "grad_norm": 5.78125, "learning_rate": 7.948023265028837e-06, "loss": 1.09010239, "memory(GiB)": 142.32, "step": 59780, "train_speed(iter/s)": 0.28819 }, { "acc": 0.72848072, "epoch": 0.6688598484160018, "grad_norm": 5.65625, "learning_rate": 7.946529301011376e-06, "loss": 1.08814144, "memory(GiB)": 142.32, "step": 59800, "train_speed(iter/s)": 0.288222 }, { "acc": 0.72911124, "epoch": 0.6690835473619603, "grad_norm": 6.46875, "learning_rate": 7.945034933866592e-06, "loss": 1.07557888, "memory(GiB)": 142.32, "step": 59820, "train_speed(iter/s)": 0.288258 }, { "acc": 0.71658497, "epoch": 0.6693072463079188, "grad_norm": 5.96875, "learning_rate": 7.943540163798934e-06, "loss": 1.11083879, "memory(GiB)": 142.32, "step": 59840, "train_speed(iter/s)": 0.28829 }, { "acc": 0.73605862, "epoch": 0.6695309452538774, "grad_norm": 5.875, "learning_rate": 7.942044991012909e-06, "loss": 1.04782982, "memory(GiB)": 142.32, "step": 59860, "train_speed(iter/s)": 0.288321 }, { "acc": 0.73008213, "epoch": 0.6697546441998359, "grad_norm": 6.53125, "learning_rate": 7.940549415713078e-06, "loss": 1.05196934, "memory(GiB)": 142.32, "step": 59880, "train_speed(iter/s)": 0.288351 }, { "acc": 0.74260163, "epoch": 0.6699783431457944, "grad_norm": 6.34375, "learning_rate": 7.939053438104056e-06, "loss": 1.03374405, "memory(GiB)": 142.32, "step": 59900, "train_speed(iter/s)": 0.288382 }, { "acc": 0.73369665, "epoch": 0.6702020420917529, "grad_norm": 6.25, "learning_rate": 7.937557058390515e-06, "loss": 1.06145992, "memory(GiB)": 142.32, "step": 59920, "train_speed(iter/s)": 0.28841 }, { "acc": 0.73136826, "epoch": 0.6704257410377115, "grad_norm": 5.28125, "learning_rate": 7.936060276777183e-06, "loss": 1.07084923, "memory(GiB)": 142.32, "step": 59940, "train_speed(iter/s)": 0.288445 }, { "acc": 0.73143415, "epoch": 0.67064943998367, "grad_norm": 6.5, "learning_rate": 7.934563093468838e-06, "loss": 1.06446819, "memory(GiB)": 142.32, "step": 59960, "train_speed(iter/s)": 0.288478 }, { "acc": 0.73052034, "epoch": 0.6708731389296285, "grad_norm": 6.3125, "learning_rate": 7.933065508670317e-06, "loss": 1.06450195, "memory(GiB)": 142.32, "step": 59980, "train_speed(iter/s)": 0.288506 }, { "acc": 0.72718897, "epoch": 0.671096837875587, "grad_norm": 6.4375, "learning_rate": 7.931567522586511e-06, "loss": 1.08856602, "memory(GiB)": 142.32, "step": 60000, "train_speed(iter/s)": 0.288541 }, { "epoch": 0.671096837875587, "eval_acc": 0.6934233135318529, "eval_loss": 1.08458411693573, "eval_runtime": 2341.3042, "eval_samples_per_second": 32.154, "eval_steps_per_second": 16.077, "step": 60000 }, { "acc": 0.73321114, "epoch": 0.6713205368215456, "grad_norm": 5.65625, "learning_rate": 7.930069135422366e-06, "loss": 1.06863003, "memory(GiB)": 142.32, "step": 60020, "train_speed(iter/s)": 0.28529 }, { "acc": 0.73185239, "epoch": 0.6715442357675041, "grad_norm": 6.9375, "learning_rate": 7.928570347382884e-06, "loss": 1.06575012, "memory(GiB)": 142.32, "step": 60040, "train_speed(iter/s)": 0.285324 }, { "acc": 0.73011713, "epoch": 0.6717679347134626, "grad_norm": 5.71875, "learning_rate": 7.927071158673118e-06, "loss": 1.06788235, "memory(GiB)": 142.32, "step": 60060, "train_speed(iter/s)": 0.28536 }, { "acc": 0.73288493, "epoch": 0.6719916336594212, "grad_norm": 6.125, "learning_rate": 7.925571569498182e-06, "loss": 1.07231426, "memory(GiB)": 142.32, "step": 60080, "train_speed(iter/s)": 0.285389 }, { "acc": 0.74239945, "epoch": 0.6722153326053797, "grad_norm": 5.28125, "learning_rate": 7.924071580063238e-06, "loss": 1.02775097, "memory(GiB)": 142.32, "step": 60100, "train_speed(iter/s)": 0.285419 }, { "acc": 0.71811743, "epoch": 0.6724390315513382, "grad_norm": 5.6875, "learning_rate": 7.922571190573507e-06, "loss": 1.11935024, "memory(GiB)": 142.32, "step": 60120, "train_speed(iter/s)": 0.28545 }, { "acc": 0.73691864, "epoch": 0.6726627304972967, "grad_norm": 5.6875, "learning_rate": 7.921070401234265e-06, "loss": 1.03655186, "memory(GiB)": 142.32, "step": 60140, "train_speed(iter/s)": 0.285482 }, { "acc": 0.72756438, "epoch": 0.6728864294432553, "grad_norm": 5.28125, "learning_rate": 7.919569212250839e-06, "loss": 1.09361191, "memory(GiB)": 142.32, "step": 60160, "train_speed(iter/s)": 0.285515 }, { "acc": 0.72457094, "epoch": 0.6731101283892138, "grad_norm": 5.78125, "learning_rate": 7.918067623828616e-06, "loss": 1.10805511, "memory(GiB)": 142.32, "step": 60180, "train_speed(iter/s)": 0.285547 }, { "acc": 0.74289083, "epoch": 0.6733338273351723, "grad_norm": 7.375, "learning_rate": 7.916565636173032e-06, "loss": 1.01521692, "memory(GiB)": 142.32, "step": 60200, "train_speed(iter/s)": 0.285578 }, { "acc": 0.72129736, "epoch": 0.6735575262811309, "grad_norm": 6.96875, "learning_rate": 7.915063249489582e-06, "loss": 1.10611105, "memory(GiB)": 142.32, "step": 60220, "train_speed(iter/s)": 0.28561 }, { "acc": 0.73259211, "epoch": 0.6737812252270894, "grad_norm": 7.34375, "learning_rate": 7.913560463983815e-06, "loss": 1.06652565, "memory(GiB)": 142.32, "step": 60240, "train_speed(iter/s)": 0.28564 }, { "acc": 0.72788043, "epoch": 0.6740049241730479, "grad_norm": 6.21875, "learning_rate": 7.91205727986133e-06, "loss": 1.08536854, "memory(GiB)": 142.32, "step": 60260, "train_speed(iter/s)": 0.28567 }, { "acc": 0.73142748, "epoch": 0.6742286231190064, "grad_norm": 5.0, "learning_rate": 7.910553697327787e-06, "loss": 1.06963272, "memory(GiB)": 142.32, "step": 60280, "train_speed(iter/s)": 0.285703 }, { "acc": 0.72437, "epoch": 0.674452322064965, "grad_norm": 6.5625, "learning_rate": 7.909049716588898e-06, "loss": 1.08578491, "memory(GiB)": 142.32, "step": 60300, "train_speed(iter/s)": 0.285735 }, { "acc": 0.73579092, "epoch": 0.6746760210109235, "grad_norm": 6.75, "learning_rate": 7.907545337850426e-06, "loss": 1.05839291, "memory(GiB)": 142.32, "step": 60320, "train_speed(iter/s)": 0.285764 }, { "acc": 0.72021036, "epoch": 0.674899719956882, "grad_norm": 6.21875, "learning_rate": 7.906040561318195e-06, "loss": 1.1178463, "memory(GiB)": 142.32, "step": 60340, "train_speed(iter/s)": 0.285797 }, { "acc": 0.72884779, "epoch": 0.6751234189028406, "grad_norm": 6.09375, "learning_rate": 7.904535387198079e-06, "loss": 1.08857212, "memory(GiB)": 142.32, "step": 60360, "train_speed(iter/s)": 0.28583 }, { "acc": 0.72347088, "epoch": 0.6753471178487991, "grad_norm": 6.46875, "learning_rate": 7.903029815696004e-06, "loss": 1.11912231, "memory(GiB)": 142.32, "step": 60380, "train_speed(iter/s)": 0.285863 }, { "acc": 0.75485616, "epoch": 0.6755708167947576, "grad_norm": 6.375, "learning_rate": 7.901523847017958e-06, "loss": 0.94635611, "memory(GiB)": 142.32, "step": 60400, "train_speed(iter/s)": 0.285897 }, { "acc": 0.72788849, "epoch": 0.6757945157407161, "grad_norm": 6.15625, "learning_rate": 7.900017481369976e-06, "loss": 1.09161749, "memory(GiB)": 142.32, "step": 60420, "train_speed(iter/s)": 0.285927 }, { "acc": 0.73855429, "epoch": 0.6760182146866747, "grad_norm": 5.78125, "learning_rate": 7.898510718958152e-06, "loss": 1.01451521, "memory(GiB)": 142.32, "step": 60440, "train_speed(iter/s)": 0.28596 }, { "acc": 0.74927406, "epoch": 0.6762419136326332, "grad_norm": 5.46875, "learning_rate": 7.897003559988634e-06, "loss": 0.9892643, "memory(GiB)": 142.32, "step": 60460, "train_speed(iter/s)": 0.285991 }, { "acc": 0.73297276, "epoch": 0.6764656125785917, "grad_norm": 6.75, "learning_rate": 7.89549600466762e-06, "loss": 1.0496706, "memory(GiB)": 142.32, "step": 60480, "train_speed(iter/s)": 0.286022 }, { "acc": 0.73171558, "epoch": 0.6766893115245503, "grad_norm": 7.5, "learning_rate": 7.893988053201367e-06, "loss": 1.06493893, "memory(GiB)": 142.32, "step": 60500, "train_speed(iter/s)": 0.286051 }, { "acc": 0.72637935, "epoch": 0.6769130104705088, "grad_norm": 6.90625, "learning_rate": 7.892479705796184e-06, "loss": 1.09741879, "memory(GiB)": 142.32, "step": 60520, "train_speed(iter/s)": 0.286083 }, { "acc": 0.73741598, "epoch": 0.6771367094164673, "grad_norm": 6.96875, "learning_rate": 7.890970962658432e-06, "loss": 1.02897472, "memory(GiB)": 142.32, "step": 60540, "train_speed(iter/s)": 0.286114 }, { "acc": 0.73348455, "epoch": 0.6773604083624258, "grad_norm": 5.84375, "learning_rate": 7.889461823994533e-06, "loss": 1.07009888, "memory(GiB)": 142.32, "step": 60560, "train_speed(iter/s)": 0.286147 }, { "acc": 0.72461863, "epoch": 0.6775841073083844, "grad_norm": 5.34375, "learning_rate": 7.887952290010956e-06, "loss": 1.10166492, "memory(GiB)": 142.32, "step": 60580, "train_speed(iter/s)": 0.28618 }, { "acc": 0.73665838, "epoch": 0.6778078062543429, "grad_norm": 5.5, "learning_rate": 7.886442360914228e-06, "loss": 1.04358253, "memory(GiB)": 142.32, "step": 60600, "train_speed(iter/s)": 0.28621 }, { "acc": 0.73534603, "epoch": 0.6780315052003014, "grad_norm": 5.71875, "learning_rate": 7.884932036910928e-06, "loss": 1.06178112, "memory(GiB)": 142.32, "step": 60620, "train_speed(iter/s)": 0.28624 }, { "acc": 0.73869147, "epoch": 0.67825520414626, "grad_norm": 5.46875, "learning_rate": 7.88342131820769e-06, "loss": 1.0463439, "memory(GiB)": 142.32, "step": 60640, "train_speed(iter/s)": 0.28627 }, { "acc": 0.72698622, "epoch": 0.6784789030922185, "grad_norm": 6.1875, "learning_rate": 7.881910205011203e-06, "loss": 1.10052204, "memory(GiB)": 142.32, "step": 60660, "train_speed(iter/s)": 0.286302 }, { "acc": 0.73141975, "epoch": 0.678702602038177, "grad_norm": 6.40625, "learning_rate": 7.880398697528206e-06, "loss": 1.0781538, "memory(GiB)": 142.32, "step": 60680, "train_speed(iter/s)": 0.286335 }, { "acc": 0.73651333, "epoch": 0.6789263009841355, "grad_norm": 5.3125, "learning_rate": 7.878886795965497e-06, "loss": 1.05262184, "memory(GiB)": 142.32, "step": 60700, "train_speed(iter/s)": 0.286366 }, { "acc": 0.7390131, "epoch": 0.6791499999300941, "grad_norm": 6.71875, "learning_rate": 7.877374500529926e-06, "loss": 1.0311079, "memory(GiB)": 142.32, "step": 60720, "train_speed(iter/s)": 0.286399 }, { "acc": 0.72790861, "epoch": 0.6793736988760526, "grad_norm": 5.65625, "learning_rate": 7.875861811428399e-06, "loss": 1.07635145, "memory(GiB)": 142.32, "step": 60740, "train_speed(iter/s)": 0.286432 }, { "acc": 0.71603327, "epoch": 0.6795973978220111, "grad_norm": 5.21875, "learning_rate": 7.874348728867866e-06, "loss": 1.12754879, "memory(GiB)": 142.32, "step": 60760, "train_speed(iter/s)": 0.286469 }, { "acc": 0.7460393, "epoch": 0.6798210967679696, "grad_norm": 5.90625, "learning_rate": 7.872835253055344e-06, "loss": 0.99305, "memory(GiB)": 142.32, "step": 60780, "train_speed(iter/s)": 0.286502 }, { "acc": 0.73985791, "epoch": 0.6800447957139282, "grad_norm": 6.9375, "learning_rate": 7.871321384197898e-06, "loss": 1.03305483, "memory(GiB)": 142.32, "step": 60800, "train_speed(iter/s)": 0.286538 }, { "acc": 0.7441514, "epoch": 0.6802684946598867, "grad_norm": 5.8125, "learning_rate": 7.869807122502648e-06, "loss": 1.02651539, "memory(GiB)": 142.32, "step": 60820, "train_speed(iter/s)": 0.28657 }, { "acc": 0.74268064, "epoch": 0.6804921936058452, "grad_norm": 5.75, "learning_rate": 7.868292468176762e-06, "loss": 1.01546555, "memory(GiB)": 142.32, "step": 60840, "train_speed(iter/s)": 0.286604 }, { "acc": 0.73044662, "epoch": 0.6807158925518038, "grad_norm": 6.5, "learning_rate": 7.86677742142747e-06, "loss": 1.05681105, "memory(GiB)": 142.32, "step": 60860, "train_speed(iter/s)": 0.286633 }, { "acc": 0.74555979, "epoch": 0.6809395914977623, "grad_norm": 9.25, "learning_rate": 7.86526198246205e-06, "loss": 1.00619926, "memory(GiB)": 142.32, "step": 60880, "train_speed(iter/s)": 0.286666 }, { "acc": 0.73301258, "epoch": 0.6811632904437208, "grad_norm": 6.28125, "learning_rate": 7.86374615148784e-06, "loss": 1.07138014, "memory(GiB)": 142.32, "step": 60900, "train_speed(iter/s)": 0.286696 }, { "acc": 0.72649879, "epoch": 0.6813869893896793, "grad_norm": 5.25, "learning_rate": 7.86222992871222e-06, "loss": 1.10710163, "memory(GiB)": 142.32, "step": 60920, "train_speed(iter/s)": 0.286723 }, { "acc": 0.73642573, "epoch": 0.6816106883356379, "grad_norm": 5.84375, "learning_rate": 7.860713314342636e-06, "loss": 1.04439182, "memory(GiB)": 142.32, "step": 60940, "train_speed(iter/s)": 0.286755 }, { "acc": 0.72840586, "epoch": 0.6818343872815964, "grad_norm": 5.9375, "learning_rate": 7.859196308586583e-06, "loss": 1.06829967, "memory(GiB)": 142.32, "step": 60960, "train_speed(iter/s)": 0.286788 }, { "acc": 0.73034134, "epoch": 0.6820580862275549, "grad_norm": 6.6875, "learning_rate": 7.857678911651608e-06, "loss": 1.08014212, "memory(GiB)": 142.32, "step": 60980, "train_speed(iter/s)": 0.286818 }, { "acc": 0.72428069, "epoch": 0.6822817851735135, "grad_norm": 5.03125, "learning_rate": 7.856161123745311e-06, "loss": 1.10278378, "memory(GiB)": 142.32, "step": 61000, "train_speed(iter/s)": 0.286849 }, { "acc": 0.7323638, "epoch": 0.682505484119472, "grad_norm": 7.1875, "learning_rate": 7.854642945075348e-06, "loss": 1.05536346, "memory(GiB)": 142.32, "step": 61020, "train_speed(iter/s)": 0.286879 }, { "acc": 0.72688284, "epoch": 0.6827291830654305, "grad_norm": 5.53125, "learning_rate": 7.853124375849429e-06, "loss": 1.08990784, "memory(GiB)": 142.32, "step": 61040, "train_speed(iter/s)": 0.28691 }, { "acc": 0.72822185, "epoch": 0.682952882011389, "grad_norm": 5.03125, "learning_rate": 7.851605416275314e-06, "loss": 1.08366108, "memory(GiB)": 142.32, "step": 61060, "train_speed(iter/s)": 0.286943 }, { "acc": 0.73286428, "epoch": 0.6831765809573476, "grad_norm": 6.375, "learning_rate": 7.85008606656082e-06, "loss": 1.06742048, "memory(GiB)": 142.32, "step": 61080, "train_speed(iter/s)": 0.286975 }, { "acc": 0.74753127, "epoch": 0.6834002799033061, "grad_norm": 6.21875, "learning_rate": 7.848566326913813e-06, "loss": 0.98745613, "memory(GiB)": 142.32, "step": 61100, "train_speed(iter/s)": 0.287008 }, { "acc": 0.74200125, "epoch": 0.6836239788492646, "grad_norm": 5.59375, "learning_rate": 7.847046197542219e-06, "loss": 1.01070967, "memory(GiB)": 142.32, "step": 61120, "train_speed(iter/s)": 0.287042 }, { "acc": 0.73930674, "epoch": 0.6838476777952232, "grad_norm": 6.25, "learning_rate": 7.845525678654012e-06, "loss": 1.03336887, "memory(GiB)": 142.32, "step": 61140, "train_speed(iter/s)": 0.287071 }, { "acc": 0.73004761, "epoch": 0.6840713767411817, "grad_norm": 5.4375, "learning_rate": 7.844004770457219e-06, "loss": 1.06920166, "memory(GiB)": 142.32, "step": 61160, "train_speed(iter/s)": 0.287103 }, { "acc": 0.7335238, "epoch": 0.6842950756871402, "grad_norm": 5.21875, "learning_rate": 7.842483473159923e-06, "loss": 1.06545696, "memory(GiB)": 142.32, "step": 61180, "train_speed(iter/s)": 0.287134 }, { "acc": 0.74452634, "epoch": 0.6845187746330987, "grad_norm": 5.59375, "learning_rate": 7.840961786970261e-06, "loss": 1.01687984, "memory(GiB)": 142.32, "step": 61200, "train_speed(iter/s)": 0.287162 }, { "acc": 0.7390976, "epoch": 0.6847424735790573, "grad_norm": 5.0, "learning_rate": 7.839439712096418e-06, "loss": 1.0312089, "memory(GiB)": 142.32, "step": 61220, "train_speed(iter/s)": 0.287194 }, { "acc": 0.72723398, "epoch": 0.6849661725250158, "grad_norm": 6.625, "learning_rate": 7.837917248746637e-06, "loss": 1.0797267, "memory(GiB)": 142.32, "step": 61240, "train_speed(iter/s)": 0.287228 }, { "acc": 0.73004341, "epoch": 0.6851898714709743, "grad_norm": 5.46875, "learning_rate": 7.836394397129216e-06, "loss": 1.05399971, "memory(GiB)": 142.32, "step": 61260, "train_speed(iter/s)": 0.287258 }, { "acc": 0.74181347, "epoch": 0.6854135704169328, "grad_norm": 5.9375, "learning_rate": 7.834871157452499e-06, "loss": 1.03816147, "memory(GiB)": 142.32, "step": 61280, "train_speed(iter/s)": 0.287292 }, { "acc": 0.73156567, "epoch": 0.6856372693628914, "grad_norm": 7.125, "learning_rate": 7.833347529924886e-06, "loss": 1.05451765, "memory(GiB)": 142.32, "step": 61300, "train_speed(iter/s)": 0.287324 }, { "acc": 0.73300252, "epoch": 0.6858609683088499, "grad_norm": 6.25, "learning_rate": 7.831823514754836e-06, "loss": 1.08029823, "memory(GiB)": 142.32, "step": 61320, "train_speed(iter/s)": 0.287356 }, { "acc": 0.72697911, "epoch": 0.6860846672548084, "grad_norm": 5.625, "learning_rate": 7.830299112150851e-06, "loss": 1.0979723, "memory(GiB)": 142.32, "step": 61340, "train_speed(iter/s)": 0.287388 }, { "acc": 0.73211279, "epoch": 0.686308366200767, "grad_norm": 6.09375, "learning_rate": 7.828774322321492e-06, "loss": 1.06221714, "memory(GiB)": 142.32, "step": 61360, "train_speed(iter/s)": 0.287415 }, { "acc": 0.72625518, "epoch": 0.6865320651467255, "grad_norm": 6.375, "learning_rate": 7.827249145475377e-06, "loss": 1.08222752, "memory(GiB)": 142.32, "step": 61380, "train_speed(iter/s)": 0.287444 }, { "acc": 0.72613511, "epoch": 0.686755764092684, "grad_norm": 6.1875, "learning_rate": 7.825723581821165e-06, "loss": 1.08171329, "memory(GiB)": 142.32, "step": 61400, "train_speed(iter/s)": 0.287478 }, { "acc": 0.7397119, "epoch": 0.6869794630386425, "grad_norm": 5.375, "learning_rate": 7.82419763156758e-06, "loss": 1.03612194, "memory(GiB)": 142.32, "step": 61420, "train_speed(iter/s)": 0.287509 }, { "acc": 0.73786516, "epoch": 0.6872031619846011, "grad_norm": 4.71875, "learning_rate": 7.822671294923392e-06, "loss": 1.03899012, "memory(GiB)": 142.32, "step": 61440, "train_speed(iter/s)": 0.287537 }, { "acc": 0.73073354, "epoch": 0.6874268609305596, "grad_norm": 4.75, "learning_rate": 7.821144572097424e-06, "loss": 1.07980003, "memory(GiB)": 142.32, "step": 61460, "train_speed(iter/s)": 0.287565 }, { "acc": 0.74175186, "epoch": 0.6876505598765181, "grad_norm": 5.96875, "learning_rate": 7.819617463298557e-06, "loss": 1.01495895, "memory(GiB)": 142.32, "step": 61480, "train_speed(iter/s)": 0.287599 }, { "acc": 0.73077521, "epoch": 0.6878742588224767, "grad_norm": 5.53125, "learning_rate": 7.818089968735717e-06, "loss": 1.08324928, "memory(GiB)": 142.32, "step": 61500, "train_speed(iter/s)": 0.287631 }, { "acc": 0.72926531, "epoch": 0.6880979577684352, "grad_norm": 5.125, "learning_rate": 7.816562088617891e-06, "loss": 1.0688036, "memory(GiB)": 142.32, "step": 61520, "train_speed(iter/s)": 0.287664 }, { "acc": 0.74679623, "epoch": 0.6883216567143937, "grad_norm": 4.78125, "learning_rate": 7.815033823154112e-06, "loss": 0.9900568, "memory(GiB)": 142.32, "step": 61540, "train_speed(iter/s)": 0.287695 }, { "acc": 0.73369803, "epoch": 0.6885453556603522, "grad_norm": 6.21875, "learning_rate": 7.813505172553472e-06, "loss": 1.05543537, "memory(GiB)": 142.32, "step": 61560, "train_speed(iter/s)": 0.287728 }, { "acc": 0.73533583, "epoch": 0.6887690546063109, "grad_norm": 5.75, "learning_rate": 7.81197613702511e-06, "loss": 1.05952997, "memory(GiB)": 142.32, "step": 61580, "train_speed(iter/s)": 0.287761 }, { "acc": 0.72856708, "epoch": 0.6889927535522694, "grad_norm": 6.0, "learning_rate": 7.810446716778218e-06, "loss": 1.08749905, "memory(GiB)": 142.32, "step": 61600, "train_speed(iter/s)": 0.28779 }, { "acc": 0.73985806, "epoch": 0.6892164524982279, "grad_norm": 7.21875, "learning_rate": 7.808916912022046e-06, "loss": 1.04223289, "memory(GiB)": 142.32, "step": 61620, "train_speed(iter/s)": 0.287814 }, { "acc": 0.72718163, "epoch": 0.6894401514441865, "grad_norm": 5.28125, "learning_rate": 7.807386722965891e-06, "loss": 1.09387589, "memory(GiB)": 142.32, "step": 61640, "train_speed(iter/s)": 0.287844 }, { "acc": 0.72968607, "epoch": 0.689663850390145, "grad_norm": 5.1875, "learning_rate": 7.805856149819107e-06, "loss": 1.10106707, "memory(GiB)": 142.32, "step": 61660, "train_speed(iter/s)": 0.287877 }, { "acc": 0.72149405, "epoch": 0.6898875493361035, "grad_norm": 5.1875, "learning_rate": 7.804325192791096e-06, "loss": 1.1119318, "memory(GiB)": 142.32, "step": 61680, "train_speed(iter/s)": 0.287914 }, { "acc": 0.73841791, "epoch": 0.690111248282062, "grad_norm": 5.75, "learning_rate": 7.802793852091315e-06, "loss": 1.03513832, "memory(GiB)": 142.32, "step": 61700, "train_speed(iter/s)": 0.287946 }, { "acc": 0.73152266, "epoch": 0.6903349472280206, "grad_norm": 5.96875, "learning_rate": 7.801262127929274e-06, "loss": 1.05399456, "memory(GiB)": 142.32, "step": 61720, "train_speed(iter/s)": 0.287978 }, { "acc": 0.72870026, "epoch": 0.6905586461739791, "grad_norm": 5.21875, "learning_rate": 7.799730020514536e-06, "loss": 1.07352028, "memory(GiB)": 142.32, "step": 61740, "train_speed(iter/s)": 0.288009 }, { "acc": 0.7356719, "epoch": 0.6907823451199376, "grad_norm": 6.125, "learning_rate": 7.79819753005671e-06, "loss": 1.05593977, "memory(GiB)": 142.32, "step": 61760, "train_speed(iter/s)": 0.288039 }, { "acc": 0.73616009, "epoch": 0.6910060440658962, "grad_norm": 5.125, "learning_rate": 7.796664656765472e-06, "loss": 1.04284849, "memory(GiB)": 142.32, "step": 61780, "train_speed(iter/s)": 0.288072 }, { "acc": 0.73250551, "epoch": 0.6912297430118547, "grad_norm": 4.8125, "learning_rate": 7.795131400850533e-06, "loss": 1.07768192, "memory(GiB)": 142.32, "step": 61800, "train_speed(iter/s)": 0.2881 }, { "acc": 0.73215895, "epoch": 0.6914534419578132, "grad_norm": 5.375, "learning_rate": 7.793597762521666e-06, "loss": 1.07047262, "memory(GiB)": 142.32, "step": 61820, "train_speed(iter/s)": 0.28813 }, { "acc": 0.72840104, "epoch": 0.6916771409037717, "grad_norm": 4.21875, "learning_rate": 7.792063741988695e-06, "loss": 1.07592659, "memory(GiB)": 142.32, "step": 61840, "train_speed(iter/s)": 0.288163 }, { "acc": 0.72784939, "epoch": 0.6919008398497303, "grad_norm": 6.9375, "learning_rate": 7.790529339461497e-06, "loss": 1.10452442, "memory(GiB)": 142.32, "step": 61860, "train_speed(iter/s)": 0.288193 }, { "acc": 0.71786404, "epoch": 0.6921245387956888, "grad_norm": 5.46875, "learning_rate": 7.78899455515e-06, "loss": 1.12208443, "memory(GiB)": 142.32, "step": 61880, "train_speed(iter/s)": 0.288224 }, { "acc": 0.72843676, "epoch": 0.6923482377416473, "grad_norm": 7.15625, "learning_rate": 7.787459389264183e-06, "loss": 1.10359163, "memory(GiB)": 142.32, "step": 61900, "train_speed(iter/s)": 0.288257 }, { "acc": 0.72621369, "epoch": 0.6925719366876059, "grad_norm": 5.90625, "learning_rate": 7.78592384201408e-06, "loss": 1.08909988, "memory(GiB)": 142.32, "step": 61920, "train_speed(iter/s)": 0.288288 }, { "acc": 0.738521, "epoch": 0.6927956356335644, "grad_norm": 4.5, "learning_rate": 7.784387913609775e-06, "loss": 1.02699919, "memory(GiB)": 142.32, "step": 61940, "train_speed(iter/s)": 0.28832 }, { "acc": 0.73537865, "epoch": 0.6930193345795229, "grad_norm": 5.6875, "learning_rate": 7.782851604261406e-06, "loss": 1.04758396, "memory(GiB)": 142.32, "step": 61960, "train_speed(iter/s)": 0.288351 }, { "acc": 0.72847118, "epoch": 0.6932430335254814, "grad_norm": 4.59375, "learning_rate": 7.781314914179161e-06, "loss": 1.09282904, "memory(GiB)": 142.32, "step": 61980, "train_speed(iter/s)": 0.288379 }, { "acc": 0.72785473, "epoch": 0.69346673247144, "grad_norm": 5.125, "learning_rate": 7.779777843573282e-06, "loss": 1.07812557, "memory(GiB)": 142.32, "step": 62000, "train_speed(iter/s)": 0.288413 }, { "epoch": 0.69346673247144, "eval_acc": 0.6936785716116733, "eval_loss": 1.0832748413085938, "eval_runtime": 2340.5481, "eval_samples_per_second": 32.165, "eval_steps_per_second": 16.083, "step": 62000 }, { "acc": 0.73929453, "epoch": 0.6936904314173985, "grad_norm": 8.5, "learning_rate": 7.778240392654061e-06, "loss": 1.03069582, "memory(GiB)": 142.32, "step": 62020, "train_speed(iter/s)": 0.28527 }, { "acc": 0.72799292, "epoch": 0.693914130363357, "grad_norm": 5.25, "learning_rate": 7.776702561631847e-06, "loss": 1.08944397, "memory(GiB)": 142.32, "step": 62040, "train_speed(iter/s)": 0.285298 }, { "acc": 0.72260342, "epoch": 0.6941378293093156, "grad_norm": 7.15625, "learning_rate": 7.77516435071703e-06, "loss": 1.10347691, "memory(GiB)": 142.32, "step": 62060, "train_speed(iter/s)": 0.285329 }, { "acc": 0.73444881, "epoch": 0.6943615282552741, "grad_norm": 6.53125, "learning_rate": 7.773625760120067e-06, "loss": 1.05033646, "memory(GiB)": 142.32, "step": 62080, "train_speed(iter/s)": 0.285361 }, { "acc": 0.72803364, "epoch": 0.6945852272012326, "grad_norm": 5.28125, "learning_rate": 7.772086790051453e-06, "loss": 1.06826496, "memory(GiB)": 142.32, "step": 62100, "train_speed(iter/s)": 0.285393 }, { "acc": 0.73461676, "epoch": 0.6948089261471911, "grad_norm": 7.4375, "learning_rate": 7.770547440721745e-06, "loss": 1.06146011, "memory(GiB)": 142.32, "step": 62120, "train_speed(iter/s)": 0.285423 }, { "acc": 0.73329544, "epoch": 0.6950326250931497, "grad_norm": 6.59375, "learning_rate": 7.769007712341548e-06, "loss": 1.04950037, "memory(GiB)": 142.32, "step": 62140, "train_speed(iter/s)": 0.285455 }, { "acc": 0.72563782, "epoch": 0.6952563240391082, "grad_norm": 6.4375, "learning_rate": 7.767467605121518e-06, "loss": 1.10412102, "memory(GiB)": 142.32, "step": 62160, "train_speed(iter/s)": 0.285483 }, { "acc": 0.71640577, "epoch": 0.6954800229850667, "grad_norm": 5.21875, "learning_rate": 7.765927119272361e-06, "loss": 1.13150349, "memory(GiB)": 142.32, "step": 62180, "train_speed(iter/s)": 0.285512 }, { "acc": 0.74084511, "epoch": 0.6957037219310253, "grad_norm": 6.375, "learning_rate": 7.76438625500484e-06, "loss": 1.01704664, "memory(GiB)": 142.32, "step": 62200, "train_speed(iter/s)": 0.285539 }, { "acc": 0.74399681, "epoch": 0.6959274208769838, "grad_norm": 5.1875, "learning_rate": 7.76284501252977e-06, "loss": 1.00036879, "memory(GiB)": 142.32, "step": 62220, "train_speed(iter/s)": 0.285573 }, { "acc": 0.7380353, "epoch": 0.6961511198229423, "grad_norm": 5.75, "learning_rate": 7.76130339205801e-06, "loss": 1.04989948, "memory(GiB)": 142.32, "step": 62240, "train_speed(iter/s)": 0.285607 }, { "acc": 0.72796874, "epoch": 0.6963748187689008, "grad_norm": 5.28125, "learning_rate": 7.759761393800477e-06, "loss": 1.08090153, "memory(GiB)": 142.32, "step": 62260, "train_speed(iter/s)": 0.28564 }, { "acc": 0.7388504, "epoch": 0.6965985177148594, "grad_norm": 4.9375, "learning_rate": 7.75821901796814e-06, "loss": 1.02886658, "memory(GiB)": 142.32, "step": 62280, "train_speed(iter/s)": 0.285672 }, { "acc": 0.73318405, "epoch": 0.6968222166608179, "grad_norm": 5.3125, "learning_rate": 7.756676264772019e-06, "loss": 1.04689941, "memory(GiB)": 142.32, "step": 62300, "train_speed(iter/s)": 0.285704 }, { "acc": 0.72775636, "epoch": 0.6970459156067764, "grad_norm": 5.9375, "learning_rate": 7.75513313442318e-06, "loss": 1.11058197, "memory(GiB)": 142.32, "step": 62320, "train_speed(iter/s)": 0.285732 }, { "acc": 0.74075289, "epoch": 0.697269614552735, "grad_norm": 5.3125, "learning_rate": 7.753589627132752e-06, "loss": 1.03530502, "memory(GiB)": 142.32, "step": 62340, "train_speed(iter/s)": 0.285762 }, { "acc": 0.72890115, "epoch": 0.6974933134986935, "grad_norm": 6.0, "learning_rate": 7.752045743111902e-06, "loss": 1.07152424, "memory(GiB)": 142.32, "step": 62360, "train_speed(iter/s)": 0.285794 }, { "acc": 0.72918148, "epoch": 0.697717012444652, "grad_norm": 5.625, "learning_rate": 7.750501482571859e-06, "loss": 1.06937084, "memory(GiB)": 142.32, "step": 62380, "train_speed(iter/s)": 0.285824 }, { "acc": 0.74540091, "epoch": 0.6979407113906105, "grad_norm": 7.125, "learning_rate": 7.748956845723901e-06, "loss": 1.00470486, "memory(GiB)": 142.32, "step": 62400, "train_speed(iter/s)": 0.285857 }, { "acc": 0.74077396, "epoch": 0.6981644103365691, "grad_norm": 6.34375, "learning_rate": 7.747411832779354e-06, "loss": 1.02910309, "memory(GiB)": 142.32, "step": 62420, "train_speed(iter/s)": 0.285887 }, { "acc": 0.73190193, "epoch": 0.6983881092825276, "grad_norm": 5.53125, "learning_rate": 7.745866443949599e-06, "loss": 1.04816465, "memory(GiB)": 142.32, "step": 62440, "train_speed(iter/s)": 0.285922 }, { "acc": 0.7365571, "epoch": 0.6986118082284861, "grad_norm": 6.21875, "learning_rate": 7.744320679446067e-06, "loss": 1.04776363, "memory(GiB)": 142.32, "step": 62460, "train_speed(iter/s)": 0.285942 }, { "acc": 0.74224048, "epoch": 0.6988355071744446, "grad_norm": 5.3125, "learning_rate": 7.74277453948024e-06, "loss": 1.03162842, "memory(GiB)": 142.32, "step": 62480, "train_speed(iter/s)": 0.285973 }, { "acc": 0.72257071, "epoch": 0.6990592061204032, "grad_norm": 6.875, "learning_rate": 7.741228024263653e-06, "loss": 1.11352329, "memory(GiB)": 142.32, "step": 62500, "train_speed(iter/s)": 0.286004 }, { "acc": 0.73075047, "epoch": 0.6992829050663617, "grad_norm": 6.03125, "learning_rate": 7.739681134007893e-06, "loss": 1.06863441, "memory(GiB)": 142.32, "step": 62520, "train_speed(iter/s)": 0.286036 }, { "acc": 0.73698483, "epoch": 0.6995066040123202, "grad_norm": 4.96875, "learning_rate": 7.738133868924592e-06, "loss": 1.05978117, "memory(GiB)": 142.32, "step": 62540, "train_speed(iter/s)": 0.286066 }, { "acc": 0.73665981, "epoch": 0.6997303029582788, "grad_norm": 5.9375, "learning_rate": 7.736586229225442e-06, "loss": 1.04799271, "memory(GiB)": 142.32, "step": 62560, "train_speed(iter/s)": 0.286098 }, { "acc": 0.73244038, "epoch": 0.6999540019042373, "grad_norm": 4.40625, "learning_rate": 7.735038215122181e-06, "loss": 1.05983391, "memory(GiB)": 142.32, "step": 62580, "train_speed(iter/s)": 0.286128 }, { "acc": 0.72948656, "epoch": 0.7001777008501958, "grad_norm": 5.4375, "learning_rate": 7.733489826826598e-06, "loss": 1.09261169, "memory(GiB)": 142.32, "step": 62600, "train_speed(iter/s)": 0.286158 }, { "acc": 0.74304824, "epoch": 0.7004013997961543, "grad_norm": 5.40625, "learning_rate": 7.73194106455054e-06, "loss": 1.01821299, "memory(GiB)": 142.32, "step": 62620, "train_speed(iter/s)": 0.286193 }, { "acc": 0.72532473, "epoch": 0.7006250987421129, "grad_norm": 5.8125, "learning_rate": 7.730391928505892e-06, "loss": 1.09120159, "memory(GiB)": 142.32, "step": 62640, "train_speed(iter/s)": 0.286227 }, { "acc": 0.72760234, "epoch": 0.7008487976880714, "grad_norm": 5.84375, "learning_rate": 7.728842418904602e-06, "loss": 1.07554035, "memory(GiB)": 142.32, "step": 62660, "train_speed(iter/s)": 0.28626 }, { "acc": 0.73423228, "epoch": 0.7010724966340299, "grad_norm": 7.03125, "learning_rate": 7.727292535958667e-06, "loss": 1.0548357, "memory(GiB)": 142.32, "step": 62680, "train_speed(iter/s)": 0.28629 }, { "acc": 0.73102326, "epoch": 0.7012961955799885, "grad_norm": 5.375, "learning_rate": 7.725742279880131e-06, "loss": 1.06198692, "memory(GiB)": 142.32, "step": 62700, "train_speed(iter/s)": 0.286321 }, { "acc": 0.73471479, "epoch": 0.701519894525947, "grad_norm": 4.75, "learning_rate": 7.72419165088109e-06, "loss": 1.06128044, "memory(GiB)": 142.32, "step": 62720, "train_speed(iter/s)": 0.286351 }, { "acc": 0.73049583, "epoch": 0.7017435934719055, "grad_norm": 6.53125, "learning_rate": 7.722640649173693e-06, "loss": 1.07277298, "memory(GiB)": 142.32, "step": 62740, "train_speed(iter/s)": 0.286382 }, { "acc": 0.73310628, "epoch": 0.701967292417864, "grad_norm": 5.0625, "learning_rate": 7.721089274970142e-06, "loss": 1.07228966, "memory(GiB)": 142.32, "step": 62760, "train_speed(iter/s)": 0.286407 }, { "acc": 0.73092594, "epoch": 0.7021909913638226, "grad_norm": 6.3125, "learning_rate": 7.719537528482683e-06, "loss": 1.07813644, "memory(GiB)": 142.32, "step": 62780, "train_speed(iter/s)": 0.286436 }, { "acc": 0.73067727, "epoch": 0.7024146903097811, "grad_norm": 5.90625, "learning_rate": 7.71798540992362e-06, "loss": 1.06341553, "memory(GiB)": 142.32, "step": 62800, "train_speed(iter/s)": 0.286467 }, { "acc": 0.73690248, "epoch": 0.7026383892557396, "grad_norm": 6.53125, "learning_rate": 7.716432919505303e-06, "loss": 1.04829063, "memory(GiB)": 142.32, "step": 62820, "train_speed(iter/s)": 0.286499 }, { "acc": 0.72677841, "epoch": 0.7028620882016982, "grad_norm": 7.0, "learning_rate": 7.714880057440137e-06, "loss": 1.10781479, "memory(GiB)": 142.32, "step": 62840, "train_speed(iter/s)": 0.286532 }, { "acc": 0.72981062, "epoch": 0.7030857871476567, "grad_norm": 6.3125, "learning_rate": 7.713326823940573e-06, "loss": 1.07309513, "memory(GiB)": 142.32, "step": 62860, "train_speed(iter/s)": 0.286565 }, { "acc": 0.73301654, "epoch": 0.7033094860936152, "grad_norm": 6.375, "learning_rate": 7.711773219219119e-06, "loss": 1.07216034, "memory(GiB)": 142.32, "step": 62880, "train_speed(iter/s)": 0.286599 }, { "acc": 0.74393454, "epoch": 0.7035331850395737, "grad_norm": 6.15625, "learning_rate": 7.710219243488326e-06, "loss": 1.02408257, "memory(GiB)": 142.32, "step": 62900, "train_speed(iter/s)": 0.286633 }, { "acc": 0.73388977, "epoch": 0.7037568839855323, "grad_norm": 6.34375, "learning_rate": 7.708664896960804e-06, "loss": 1.05374212, "memory(GiB)": 142.32, "step": 62920, "train_speed(iter/s)": 0.286668 }, { "acc": 0.73354478, "epoch": 0.7039805829314908, "grad_norm": 5.5, "learning_rate": 7.707110179849208e-06, "loss": 1.07741842, "memory(GiB)": 142.32, "step": 62940, "train_speed(iter/s)": 0.286699 }, { "acc": 0.7380877, "epoch": 0.7042042818774493, "grad_norm": 7.15625, "learning_rate": 7.705555092366247e-06, "loss": 1.02862711, "memory(GiB)": 142.32, "step": 62960, "train_speed(iter/s)": 0.286733 }, { "acc": 0.74226532, "epoch": 0.7044279808234079, "grad_norm": 6.875, "learning_rate": 7.703999634724678e-06, "loss": 1.0180645, "memory(GiB)": 142.32, "step": 62980, "train_speed(iter/s)": 0.286763 }, { "acc": 0.73207626, "epoch": 0.7046516797693664, "grad_norm": 7.4375, "learning_rate": 7.70244380713731e-06, "loss": 1.06610966, "memory(GiB)": 142.32, "step": 63000, "train_speed(iter/s)": 0.286794 }, { "acc": 0.71125088, "epoch": 0.7048753787153249, "grad_norm": 6.96875, "learning_rate": 7.700887609817e-06, "loss": 1.17194195, "memory(GiB)": 142.32, "step": 63020, "train_speed(iter/s)": 0.286831 }, { "acc": 0.73333135, "epoch": 0.7050990776612834, "grad_norm": 5.9375, "learning_rate": 7.699331042976664e-06, "loss": 1.04490595, "memory(GiB)": 142.32, "step": 63040, "train_speed(iter/s)": 0.28686 }, { "acc": 0.73085189, "epoch": 0.705322776607242, "grad_norm": 5.9375, "learning_rate": 7.697774106829257e-06, "loss": 1.0689167, "memory(GiB)": 142.32, "step": 63060, "train_speed(iter/s)": 0.28689 }, { "acc": 0.72132983, "epoch": 0.7055464755532005, "grad_norm": 5.34375, "learning_rate": 7.696216801587791e-06, "loss": 1.11958666, "memory(GiB)": 142.32, "step": 63080, "train_speed(iter/s)": 0.28692 }, { "acc": 0.73820639, "epoch": 0.705770174499159, "grad_norm": 6.59375, "learning_rate": 7.69465912746533e-06, "loss": 1.03433113, "memory(GiB)": 142.32, "step": 63100, "train_speed(iter/s)": 0.286955 }, { "acc": 0.72999854, "epoch": 0.7059938734451175, "grad_norm": 5.40625, "learning_rate": 7.693101084674984e-06, "loss": 1.07348413, "memory(GiB)": 142.32, "step": 63120, "train_speed(iter/s)": 0.286987 }, { "acc": 0.72809005, "epoch": 0.7062175723910761, "grad_norm": 5.5, "learning_rate": 7.691542673429917e-06, "loss": 1.08773766, "memory(GiB)": 142.32, "step": 63140, "train_speed(iter/s)": 0.287019 }, { "acc": 0.73125644, "epoch": 0.7064412713370346, "grad_norm": 7.0, "learning_rate": 7.689983893943342e-06, "loss": 1.07182713, "memory(GiB)": 142.32, "step": 63160, "train_speed(iter/s)": 0.287048 }, { "acc": 0.72092729, "epoch": 0.7066649702829931, "grad_norm": 6.1875, "learning_rate": 7.68842474642852e-06, "loss": 1.11397209, "memory(GiB)": 142.32, "step": 63180, "train_speed(iter/s)": 0.287076 }, { "acc": 0.72755389, "epoch": 0.7068886692289517, "grad_norm": 6.21875, "learning_rate": 7.686865231098767e-06, "loss": 1.08752337, "memory(GiB)": 142.32, "step": 63200, "train_speed(iter/s)": 0.287108 }, { "acc": 0.74289117, "epoch": 0.7071123681749102, "grad_norm": 6.71875, "learning_rate": 7.685305348167446e-06, "loss": 1.0148138, "memory(GiB)": 142.32, "step": 63220, "train_speed(iter/s)": 0.287139 }, { "acc": 0.73552771, "epoch": 0.7073360671208687, "grad_norm": 6.0, "learning_rate": 7.683745097847973e-06, "loss": 1.04591417, "memory(GiB)": 142.32, "step": 63240, "train_speed(iter/s)": 0.287168 }, { "acc": 0.73754783, "epoch": 0.7075597660668272, "grad_norm": 6.53125, "learning_rate": 7.68218448035381e-06, "loss": 1.05179138, "memory(GiB)": 142.32, "step": 63260, "train_speed(iter/s)": 0.2872 }, { "acc": 0.72803202, "epoch": 0.7077834650127858, "grad_norm": 6.03125, "learning_rate": 7.680623495898472e-06, "loss": 1.07035313, "memory(GiB)": 142.32, "step": 63280, "train_speed(iter/s)": 0.287231 }, { "acc": 0.74582748, "epoch": 0.7080071639587443, "grad_norm": 7.34375, "learning_rate": 7.679062144695525e-06, "loss": 1.01518583, "memory(GiB)": 142.32, "step": 63300, "train_speed(iter/s)": 0.287258 }, { "acc": 0.73779216, "epoch": 0.7082308629047028, "grad_norm": 7.53125, "learning_rate": 7.677500426958584e-06, "loss": 1.03268671, "memory(GiB)": 142.32, "step": 63320, "train_speed(iter/s)": 0.28729 }, { "acc": 0.73919334, "epoch": 0.7084545618506614, "grad_norm": 5.46875, "learning_rate": 7.675938342901315e-06, "loss": 1.03795271, "memory(GiB)": 142.32, "step": 63340, "train_speed(iter/s)": 0.287321 }, { "acc": 0.7323473, "epoch": 0.7086782607966199, "grad_norm": 6.28125, "learning_rate": 7.674375892737433e-06, "loss": 1.05829258, "memory(GiB)": 142.32, "step": 63360, "train_speed(iter/s)": 0.287353 }, { "acc": 0.72328453, "epoch": 0.7089019597425784, "grad_norm": 4.8125, "learning_rate": 7.672813076680703e-06, "loss": 1.11350975, "memory(GiB)": 142.32, "step": 63380, "train_speed(iter/s)": 0.287384 }, { "acc": 0.72279749, "epoch": 0.7091256586885369, "grad_norm": 7.625, "learning_rate": 7.67124989494494e-06, "loss": 1.0945879, "memory(GiB)": 142.32, "step": 63400, "train_speed(iter/s)": 0.287414 }, { "acc": 0.73268328, "epoch": 0.7093493576344955, "grad_norm": 6.875, "learning_rate": 7.66968634774401e-06, "loss": 1.05251541, "memory(GiB)": 142.32, "step": 63420, "train_speed(iter/s)": 0.287446 }, { "acc": 0.72120714, "epoch": 0.709573056580454, "grad_norm": 4.375, "learning_rate": 7.66812243529183e-06, "loss": 1.11797886, "memory(GiB)": 142.32, "step": 63440, "train_speed(iter/s)": 0.28747 }, { "acc": 0.72247648, "epoch": 0.7097967555264125, "grad_norm": 6.15625, "learning_rate": 7.666558157802364e-06, "loss": 1.11237755, "memory(GiB)": 142.32, "step": 63460, "train_speed(iter/s)": 0.287501 }, { "acc": 0.73917193, "epoch": 0.710020454472371, "grad_norm": 6.4375, "learning_rate": 7.66499351548963e-06, "loss": 1.03059378, "memory(GiB)": 142.32, "step": 63480, "train_speed(iter/s)": 0.287532 }, { "acc": 0.73593426, "epoch": 0.7102441534183296, "grad_norm": 6.8125, "learning_rate": 7.663428508567689e-06, "loss": 1.04502039, "memory(GiB)": 142.32, "step": 63500, "train_speed(iter/s)": 0.287565 }, { "acc": 0.72677503, "epoch": 0.7104678523642881, "grad_norm": 5.8125, "learning_rate": 7.66186313725066e-06, "loss": 1.11762924, "memory(GiB)": 142.32, "step": 63520, "train_speed(iter/s)": 0.287597 }, { "acc": 0.74357481, "epoch": 0.7106915513102466, "grad_norm": 5.875, "learning_rate": 7.660297401752708e-06, "loss": 1.00943146, "memory(GiB)": 142.32, "step": 63540, "train_speed(iter/s)": 0.287629 }, { "acc": 0.7290657, "epoch": 0.7109152502562052, "grad_norm": 7.6875, "learning_rate": 7.658731302288046e-06, "loss": 1.0730978, "memory(GiB)": 142.32, "step": 63560, "train_speed(iter/s)": 0.28766 }, { "acc": 0.73333397, "epoch": 0.7111389492021637, "grad_norm": 5.46875, "learning_rate": 7.657164839070941e-06, "loss": 1.07996025, "memory(GiB)": 142.32, "step": 63580, "train_speed(iter/s)": 0.287693 }, { "acc": 0.73457093, "epoch": 0.7113626481481222, "grad_norm": 6.28125, "learning_rate": 7.655598012315706e-06, "loss": 1.0618082, "memory(GiB)": 142.32, "step": 63600, "train_speed(iter/s)": 0.287721 }, { "acc": 0.72360506, "epoch": 0.7115863470940808, "grad_norm": 5.46875, "learning_rate": 7.654030822236705e-06, "loss": 1.11551762, "memory(GiB)": 142.32, "step": 63620, "train_speed(iter/s)": 0.287749 }, { "acc": 0.72526999, "epoch": 0.7118100460400393, "grad_norm": 4.5625, "learning_rate": 7.65246326904835e-06, "loss": 1.08272142, "memory(GiB)": 142.32, "step": 63640, "train_speed(iter/s)": 0.28778 }, { "acc": 0.72631798, "epoch": 0.7120337449859978, "grad_norm": 7.09375, "learning_rate": 7.65089535296511e-06, "loss": 1.11359758, "memory(GiB)": 142.32, "step": 63660, "train_speed(iter/s)": 0.287806 }, { "acc": 0.73082728, "epoch": 0.7122574439319563, "grad_norm": 5.75, "learning_rate": 7.649327074201498e-06, "loss": 1.07872143, "memory(GiB)": 142.32, "step": 63680, "train_speed(iter/s)": 0.287838 }, { "acc": 0.73066368, "epoch": 0.7124811428779149, "grad_norm": 5.875, "learning_rate": 7.647758432972072e-06, "loss": 1.07550039, "memory(GiB)": 142.32, "step": 63700, "train_speed(iter/s)": 0.28787 }, { "acc": 0.73656816, "epoch": 0.7127048418238734, "grad_norm": 5.78125, "learning_rate": 7.646189429491449e-06, "loss": 1.03994751, "memory(GiB)": 142.32, "step": 63720, "train_speed(iter/s)": 0.287901 }, { "acc": 0.73519435, "epoch": 0.7129285407698319, "grad_norm": 6.25, "learning_rate": 7.644620063974287e-06, "loss": 1.05485926, "memory(GiB)": 142.32, "step": 63740, "train_speed(iter/s)": 0.287934 }, { "acc": 0.73237953, "epoch": 0.7131522397157904, "grad_norm": 5.65625, "learning_rate": 7.643050336635301e-06, "loss": 1.06668911, "memory(GiB)": 142.32, "step": 63760, "train_speed(iter/s)": 0.287963 }, { "acc": 0.73900766, "epoch": 0.713375938661749, "grad_norm": 5.53125, "learning_rate": 7.64148024768925e-06, "loss": 1.01936073, "memory(GiB)": 142.32, "step": 63780, "train_speed(iter/s)": 0.28799 }, { "acc": 0.73657846, "epoch": 0.7135996376077075, "grad_norm": 4.84375, "learning_rate": 7.639909797350945e-06, "loss": 1.05463066, "memory(GiB)": 142.32, "step": 63800, "train_speed(iter/s)": 0.288019 }, { "acc": 0.74813118, "epoch": 0.713823336553666, "grad_norm": 6.46875, "learning_rate": 7.63833898583525e-06, "loss": 0.98542566, "memory(GiB)": 142.32, "step": 63820, "train_speed(iter/s)": 0.288052 }, { "acc": 0.73848319, "epoch": 0.7140470354996246, "grad_norm": 5.5625, "learning_rate": 7.63676781335707e-06, "loss": 1.03219376, "memory(GiB)": 142.32, "step": 63840, "train_speed(iter/s)": 0.288078 }, { "acc": 0.74900308, "epoch": 0.7142707344455831, "grad_norm": 4.71875, "learning_rate": 7.635196280131363e-06, "loss": 0.99439793, "memory(GiB)": 142.32, "step": 63860, "train_speed(iter/s)": 0.288109 }, { "acc": 0.73563499, "epoch": 0.7144944333915416, "grad_norm": 6.65625, "learning_rate": 7.63362438637314e-06, "loss": 1.0615056, "memory(GiB)": 142.32, "step": 63880, "train_speed(iter/s)": 0.288142 }, { "acc": 0.72719688, "epoch": 0.7147181323375001, "grad_norm": 5.59375, "learning_rate": 7.632052132297459e-06, "loss": 1.08047581, "memory(GiB)": 142.32, "step": 63900, "train_speed(iter/s)": 0.288172 }, { "acc": 0.72842569, "epoch": 0.7149418312834587, "grad_norm": 6.34375, "learning_rate": 7.630479518119425e-06, "loss": 1.08149967, "memory(GiB)": 142.32, "step": 63920, "train_speed(iter/s)": 0.288204 }, { "acc": 0.72965326, "epoch": 0.7151655302294172, "grad_norm": 7.25, "learning_rate": 7.628906544054196e-06, "loss": 1.08919525, "memory(GiB)": 142.32, "step": 63940, "train_speed(iter/s)": 0.288236 }, { "acc": 0.73200793, "epoch": 0.7153892291753757, "grad_norm": 5.96875, "learning_rate": 7.627333210316974e-06, "loss": 1.06691074, "memory(GiB)": 142.32, "step": 63960, "train_speed(iter/s)": 0.288268 }, { "acc": 0.73693027, "epoch": 0.7156129281213343, "grad_norm": 5.875, "learning_rate": 7.625759517123016e-06, "loss": 1.05450602, "memory(GiB)": 142.32, "step": 63980, "train_speed(iter/s)": 0.2883 }, { "acc": 0.73867607, "epoch": 0.7158366270672928, "grad_norm": 5.25, "learning_rate": 7.624185464687626e-06, "loss": 1.03522243, "memory(GiB)": 142.32, "step": 64000, "train_speed(iter/s)": 0.288331 }, { "epoch": 0.7158366270672928, "eval_acc": 0.6938616100918534, "eval_loss": 1.0821846723556519, "eval_runtime": 2340.7874, "eval_samples_per_second": 32.161, "eval_steps_per_second": 16.081, "step": 64000 }, { "acc": 0.73983665, "epoch": 0.7160603260132513, "grad_norm": 6.125, "learning_rate": 7.622611053226157e-06, "loss": 1.02031765, "memory(GiB)": 142.32, "step": 64020, "train_speed(iter/s)": 0.285289 }, { "acc": 0.72564797, "epoch": 0.7162840249592098, "grad_norm": 5.90625, "learning_rate": 7.621036282954008e-06, "loss": 1.09961491, "memory(GiB)": 142.32, "step": 64040, "train_speed(iter/s)": 0.285319 }, { "acc": 0.73315382, "epoch": 0.7165077239051684, "grad_norm": 5.65625, "learning_rate": 7.619461154086633e-06, "loss": 1.07610893, "memory(GiB)": 142.32, "step": 64060, "train_speed(iter/s)": 0.285348 }, { "acc": 0.73381991, "epoch": 0.7167314228511269, "grad_norm": 6.09375, "learning_rate": 7.617885666839531e-06, "loss": 1.06056633, "memory(GiB)": 142.32, "step": 64080, "train_speed(iter/s)": 0.285377 }, { "acc": 0.73072267, "epoch": 0.7169551217970855, "grad_norm": 5.90625, "learning_rate": 7.616309821428254e-06, "loss": 1.07407894, "memory(GiB)": 142.32, "step": 64100, "train_speed(iter/s)": 0.28541 }, { "acc": 0.72063909, "epoch": 0.7171788207430441, "grad_norm": 6.15625, "learning_rate": 7.614733618068395e-06, "loss": 1.12680721, "memory(GiB)": 142.32, "step": 64120, "train_speed(iter/s)": 0.285441 }, { "acc": 0.73857069, "epoch": 0.7174025196890026, "grad_norm": 7.6875, "learning_rate": 7.613157056975604e-06, "loss": 1.03302841, "memory(GiB)": 142.32, "step": 64140, "train_speed(iter/s)": 0.28547 }, { "acc": 0.74253335, "epoch": 0.7176262186349611, "grad_norm": 4.5625, "learning_rate": 7.6115801383655776e-06, "loss": 1.01796799, "memory(GiB)": 142.32, "step": 64160, "train_speed(iter/s)": 0.2855 }, { "acc": 0.7227705, "epoch": 0.7178499175809196, "grad_norm": 6.625, "learning_rate": 7.610002862454063e-06, "loss": 1.11724024, "memory(GiB)": 142.32, "step": 64180, "train_speed(iter/s)": 0.285531 }, { "acc": 0.73782444, "epoch": 0.7180736165268782, "grad_norm": 6.6875, "learning_rate": 7.608425229456847e-06, "loss": 1.04500866, "memory(GiB)": 142.32, "step": 64200, "train_speed(iter/s)": 0.285558 }, { "acc": 0.72224531, "epoch": 0.7182973154728367, "grad_norm": 6.625, "learning_rate": 7.606847239589779e-06, "loss": 1.11598721, "memory(GiB)": 142.32, "step": 64220, "train_speed(iter/s)": 0.285588 }, { "acc": 0.7288497, "epoch": 0.7185210144187952, "grad_norm": 5.78125, "learning_rate": 7.605268893068748e-06, "loss": 1.0927887, "memory(GiB)": 142.32, "step": 64240, "train_speed(iter/s)": 0.285619 }, { "acc": 0.73337297, "epoch": 0.7187447133647538, "grad_norm": 6.0, "learning_rate": 7.603690190109694e-06, "loss": 1.06784725, "memory(GiB)": 142.32, "step": 64260, "train_speed(iter/s)": 0.285649 }, { "acc": 0.72726321, "epoch": 0.7189684123107123, "grad_norm": 6.09375, "learning_rate": 7.602111130928606e-06, "loss": 1.10557995, "memory(GiB)": 142.32, "step": 64280, "train_speed(iter/s)": 0.285682 }, { "acc": 0.73416486, "epoch": 0.7191921112566708, "grad_norm": 6.96875, "learning_rate": 7.600531715741523e-06, "loss": 1.0559269, "memory(GiB)": 142.32, "step": 64300, "train_speed(iter/s)": 0.285713 }, { "acc": 0.72223988, "epoch": 0.7194158102026293, "grad_norm": 4.5, "learning_rate": 7.5989519447645325e-06, "loss": 1.11116171, "memory(GiB)": 142.32, "step": 64320, "train_speed(iter/s)": 0.285742 }, { "acc": 0.73376088, "epoch": 0.7196395091485879, "grad_norm": 5.6875, "learning_rate": 7.597371818213768e-06, "loss": 1.05350933, "memory(GiB)": 142.32, "step": 64340, "train_speed(iter/s)": 0.285772 }, { "acc": 0.73697367, "epoch": 0.7198632080945464, "grad_norm": 4.8125, "learning_rate": 7.595791336305411e-06, "loss": 1.04340658, "memory(GiB)": 142.32, "step": 64360, "train_speed(iter/s)": 0.285801 }, { "acc": 0.7182972, "epoch": 0.7200869070405049, "grad_norm": 6.96875, "learning_rate": 7.5942104992557e-06, "loss": 1.11860657, "memory(GiB)": 142.32, "step": 64380, "train_speed(iter/s)": 0.285832 }, { "acc": 0.73595152, "epoch": 0.7203106059864635, "grad_norm": 6.78125, "learning_rate": 7.592629307280912e-06, "loss": 1.03045664, "memory(GiB)": 142.32, "step": 64400, "train_speed(iter/s)": 0.285865 }, { "acc": 0.73628092, "epoch": 0.720534304932422, "grad_norm": 7.34375, "learning_rate": 7.591047760597378e-06, "loss": 1.05331192, "memory(GiB)": 142.32, "step": 64420, "train_speed(iter/s)": 0.285891 }, { "acc": 0.73273087, "epoch": 0.7207580038783805, "grad_norm": 6.53125, "learning_rate": 7.589465859421474e-06, "loss": 1.06035728, "memory(GiB)": 142.32, "step": 64440, "train_speed(iter/s)": 0.285925 }, { "acc": 0.7370748, "epoch": 0.720981702824339, "grad_norm": 7.1875, "learning_rate": 7.5878836039696305e-06, "loss": 1.04803467, "memory(GiB)": 142.32, "step": 64460, "train_speed(iter/s)": 0.285957 }, { "acc": 0.7326314, "epoch": 0.7212054017702976, "grad_norm": 6.09375, "learning_rate": 7.586300994458319e-06, "loss": 1.06984625, "memory(GiB)": 142.32, "step": 64480, "train_speed(iter/s)": 0.285987 }, { "acc": 0.72055321, "epoch": 0.7214291007162561, "grad_norm": 6.90625, "learning_rate": 7.584718031104065e-06, "loss": 1.12167988, "memory(GiB)": 142.32, "step": 64500, "train_speed(iter/s)": 0.286012 }, { "acc": 0.73246694, "epoch": 0.7216527996622146, "grad_norm": 6.5625, "learning_rate": 7.583134714123441e-06, "loss": 1.05613079, "memory(GiB)": 142.32, "step": 64520, "train_speed(iter/s)": 0.28604 }, { "acc": 0.73657832, "epoch": 0.7218764986081732, "grad_norm": 6.09375, "learning_rate": 7.581551043733066e-06, "loss": 1.02833424, "memory(GiB)": 142.32, "step": 64540, "train_speed(iter/s)": 0.286067 }, { "acc": 0.72950649, "epoch": 0.7221001975541317, "grad_norm": 5.25, "learning_rate": 7.5799670201496085e-06, "loss": 1.07038708, "memory(GiB)": 142.32, "step": 64560, "train_speed(iter/s)": 0.286096 }, { "acc": 0.73352365, "epoch": 0.7223238965000902, "grad_norm": 6.65625, "learning_rate": 7.578382643589788e-06, "loss": 1.05987282, "memory(GiB)": 142.32, "step": 64580, "train_speed(iter/s)": 0.286121 }, { "acc": 0.73357711, "epoch": 0.7225475954460487, "grad_norm": 5.15625, "learning_rate": 7.576797914270368e-06, "loss": 1.04830952, "memory(GiB)": 142.32, "step": 64600, "train_speed(iter/s)": 0.286153 }, { "acc": 0.7235877, "epoch": 0.7227712943920073, "grad_norm": 7.53125, "learning_rate": 7.575212832408162e-06, "loss": 1.11127033, "memory(GiB)": 142.32, "step": 64620, "train_speed(iter/s)": 0.286182 }, { "acc": 0.7311554, "epoch": 0.7229949933379658, "grad_norm": 5.15625, "learning_rate": 7.5736273982200315e-06, "loss": 1.08932877, "memory(GiB)": 142.32, "step": 64640, "train_speed(iter/s)": 0.286209 }, { "acc": 0.73220348, "epoch": 0.7232186922839243, "grad_norm": 6.03125, "learning_rate": 7.572041611922889e-06, "loss": 1.05931959, "memory(GiB)": 142.32, "step": 64660, "train_speed(iter/s)": 0.286241 }, { "acc": 0.73471508, "epoch": 0.7234423912298829, "grad_norm": 6.9375, "learning_rate": 7.57045547373369e-06, "loss": 1.05893364, "memory(GiB)": 142.32, "step": 64680, "train_speed(iter/s)": 0.286272 }, { "acc": 0.73274422, "epoch": 0.7236660901758414, "grad_norm": 5.875, "learning_rate": 7.5688689838694415e-06, "loss": 1.05561771, "memory(GiB)": 142.32, "step": 64700, "train_speed(iter/s)": 0.286304 }, { "acc": 0.72680731, "epoch": 0.7238897891217999, "grad_norm": 6.21875, "learning_rate": 7.5672821425471996e-06, "loss": 1.09460163, "memory(GiB)": 142.32, "step": 64720, "train_speed(iter/s)": 0.286336 }, { "acc": 0.72404666, "epoch": 0.7241134880677584, "grad_norm": 5.21875, "learning_rate": 7.5656949499840656e-06, "loss": 1.08319998, "memory(GiB)": 142.32, "step": 64740, "train_speed(iter/s)": 0.286364 }, { "acc": 0.73747263, "epoch": 0.724337187013717, "grad_norm": 5.15625, "learning_rate": 7.56410740639719e-06, "loss": 1.03989716, "memory(GiB)": 142.32, "step": 64760, "train_speed(iter/s)": 0.286396 }, { "acc": 0.74148617, "epoch": 0.7245608859596755, "grad_norm": 6.40625, "learning_rate": 7.562519512003771e-06, "loss": 1.02079964, "memory(GiB)": 142.32, "step": 64780, "train_speed(iter/s)": 0.286424 }, { "acc": 0.73229275, "epoch": 0.724784584905634, "grad_norm": 6.375, "learning_rate": 7.560931267021056e-06, "loss": 1.06348076, "memory(GiB)": 142.32, "step": 64800, "train_speed(iter/s)": 0.286455 }, { "acc": 0.72550607, "epoch": 0.7250082838515925, "grad_norm": 5.90625, "learning_rate": 7.55934267166634e-06, "loss": 1.07950916, "memory(GiB)": 142.32, "step": 64820, "train_speed(iter/s)": 0.28648 }, { "acc": 0.73202047, "epoch": 0.7252319827975511, "grad_norm": 6.1875, "learning_rate": 7.557753726156965e-06, "loss": 1.06020947, "memory(GiB)": 142.32, "step": 64840, "train_speed(iter/s)": 0.28651 }, { "acc": 0.73049898, "epoch": 0.7254556817435096, "grad_norm": 6.09375, "learning_rate": 7.556164430710322e-06, "loss": 1.05774403, "memory(GiB)": 142.32, "step": 64860, "train_speed(iter/s)": 0.28654 }, { "acc": 0.74068184, "epoch": 0.7256793806894681, "grad_norm": 6.3125, "learning_rate": 7.554574785543848e-06, "loss": 1.03502769, "memory(GiB)": 142.32, "step": 64880, "train_speed(iter/s)": 0.28657 }, { "acc": 0.73545623, "epoch": 0.7259030796354267, "grad_norm": 5.375, "learning_rate": 7.5529847908750295e-06, "loss": 1.0439023, "memory(GiB)": 142.32, "step": 64900, "train_speed(iter/s)": 0.286601 }, { "acc": 0.7172708, "epoch": 0.7261267785813852, "grad_norm": 5.0, "learning_rate": 7.551394446921403e-06, "loss": 1.13566217, "memory(GiB)": 142.32, "step": 64920, "train_speed(iter/s)": 0.286629 }, { "acc": 0.73842244, "epoch": 0.7263504775273437, "grad_norm": 6.40625, "learning_rate": 7.5498037539005464e-06, "loss": 1.01918755, "memory(GiB)": 142.32, "step": 64940, "train_speed(iter/s)": 0.286659 }, { "acc": 0.71936617, "epoch": 0.7265741764733022, "grad_norm": 5.34375, "learning_rate": 7.548212712030092e-06, "loss": 1.13106003, "memory(GiB)": 142.32, "step": 64960, "train_speed(iter/s)": 0.286692 }, { "acc": 0.71770487, "epoch": 0.7267978754192608, "grad_norm": 5.71875, "learning_rate": 7.546621321527716e-06, "loss": 1.14303169, "memory(GiB)": 142.32, "step": 64980, "train_speed(iter/s)": 0.286725 }, { "acc": 0.73559837, "epoch": 0.7270215743652193, "grad_norm": 6.1875, "learning_rate": 7.545029582611144e-06, "loss": 1.04326859, "memory(GiB)": 142.32, "step": 65000, "train_speed(iter/s)": 0.286753 }, { "acc": 0.74467478, "epoch": 0.7272452733111778, "grad_norm": 5.625, "learning_rate": 7.543437495498148e-06, "loss": 1.01544266, "memory(GiB)": 142.32, "step": 65020, "train_speed(iter/s)": 0.286781 }, { "acc": 0.73182044, "epoch": 0.7274689722571364, "grad_norm": 4.8125, "learning_rate": 7.5418450604065495e-06, "loss": 1.06264973, "memory(GiB)": 142.32, "step": 65040, "train_speed(iter/s)": 0.286809 }, { "acc": 0.72468863, "epoch": 0.7276926712030949, "grad_norm": 5.03125, "learning_rate": 7.5402522775542145e-06, "loss": 1.10160694, "memory(GiB)": 142.32, "step": 65060, "train_speed(iter/s)": 0.286841 }, { "acc": 0.72681408, "epoch": 0.7279163701490534, "grad_norm": 5.46875, "learning_rate": 7.53865914715906e-06, "loss": 1.09572582, "memory(GiB)": 142.32, "step": 65080, "train_speed(iter/s)": 0.286869 }, { "acc": 0.72784147, "epoch": 0.7281400690950119, "grad_norm": 4.5625, "learning_rate": 7.537065669439046e-06, "loss": 1.08102436, "memory(GiB)": 142.32, "step": 65100, "train_speed(iter/s)": 0.286898 }, { "acc": 0.72196112, "epoch": 0.7283637680409705, "grad_norm": 6.53125, "learning_rate": 7.535471844612188e-06, "loss": 1.11776485, "memory(GiB)": 142.32, "step": 65120, "train_speed(iter/s)": 0.286927 }, { "acc": 0.73383231, "epoch": 0.728587466986929, "grad_norm": 7.84375, "learning_rate": 7.5338776728965415e-06, "loss": 1.07111473, "memory(GiB)": 142.32, "step": 65140, "train_speed(iter/s)": 0.286957 }, { "acc": 0.73150182, "epoch": 0.7288111659328875, "grad_norm": 5.0625, "learning_rate": 7.532283154510209e-06, "loss": 1.07139187, "memory(GiB)": 142.32, "step": 65160, "train_speed(iter/s)": 0.286987 }, { "acc": 0.72723637, "epoch": 0.7290348648788461, "grad_norm": 5.59375, "learning_rate": 7.530688289671348e-06, "loss": 1.07897491, "memory(GiB)": 142.32, "step": 65180, "train_speed(iter/s)": 0.287017 }, { "acc": 0.74753084, "epoch": 0.7292585638248046, "grad_norm": 5.53125, "learning_rate": 7.529093078598158e-06, "loss": 1.00823841, "memory(GiB)": 142.32, "step": 65200, "train_speed(iter/s)": 0.287048 }, { "acc": 0.72122021, "epoch": 0.7294822627707631, "grad_norm": 6.125, "learning_rate": 7.527497521508885e-06, "loss": 1.1141036, "memory(GiB)": 142.32, "step": 65220, "train_speed(iter/s)": 0.28708 }, { "acc": 0.7346241, "epoch": 0.7297059617167216, "grad_norm": 6.0625, "learning_rate": 7.5259016186218255e-06, "loss": 1.0665966, "memory(GiB)": 142.32, "step": 65240, "train_speed(iter/s)": 0.287108 }, { "acc": 0.73801036, "epoch": 0.7299296606626802, "grad_norm": 5.875, "learning_rate": 7.52430537015532e-06, "loss": 1.03604212, "memory(GiB)": 142.32, "step": 65260, "train_speed(iter/s)": 0.287136 }, { "acc": 0.7297174, "epoch": 0.7301533596086387, "grad_norm": 6.90625, "learning_rate": 7.522708776327761e-06, "loss": 1.07784691, "memory(GiB)": 142.32, "step": 65280, "train_speed(iter/s)": 0.287163 }, { "acc": 0.74084215, "epoch": 0.7303770585545972, "grad_norm": 5.96875, "learning_rate": 7.521111837357582e-06, "loss": 1.03228941, "memory(GiB)": 142.32, "step": 65300, "train_speed(iter/s)": 0.28719 }, { "acc": 0.73527155, "epoch": 0.7306007575005558, "grad_norm": 5.8125, "learning_rate": 7.519514553463267e-06, "loss": 1.04433765, "memory(GiB)": 142.32, "step": 65320, "train_speed(iter/s)": 0.287217 }, { "acc": 0.73628101, "epoch": 0.7308244564465143, "grad_norm": 6.78125, "learning_rate": 7.517916924863353e-06, "loss": 1.04577227, "memory(GiB)": 142.32, "step": 65340, "train_speed(iter/s)": 0.287246 }, { "acc": 0.73299184, "epoch": 0.7310481553924728, "grad_norm": 6.375, "learning_rate": 7.5163189517764134e-06, "loss": 1.0702961, "memory(GiB)": 142.32, "step": 65360, "train_speed(iter/s)": 0.287276 }, { "acc": 0.72290792, "epoch": 0.7312718543384313, "grad_norm": 6.15625, "learning_rate": 7.514720634421073e-06, "loss": 1.12158375, "memory(GiB)": 142.32, "step": 65380, "train_speed(iter/s)": 0.287309 }, { "acc": 0.73183064, "epoch": 0.7314955532843899, "grad_norm": 5.4375, "learning_rate": 7.5131219730160065e-06, "loss": 1.06789179, "memory(GiB)": 142.32, "step": 65400, "train_speed(iter/s)": 0.287341 }, { "acc": 0.7296936, "epoch": 0.7317192522303484, "grad_norm": 5.15625, "learning_rate": 7.511522967779934e-06, "loss": 1.09948769, "memory(GiB)": 142.32, "step": 65420, "train_speed(iter/s)": 0.287367 }, { "acc": 0.73731422, "epoch": 0.7319429511763069, "grad_norm": 5.84375, "learning_rate": 7.509923618931621e-06, "loss": 1.02651997, "memory(GiB)": 142.32, "step": 65440, "train_speed(iter/s)": 0.287396 }, { "acc": 0.73774185, "epoch": 0.7321666501222654, "grad_norm": 7.1875, "learning_rate": 7.5083239266898824e-06, "loss": 1.05737944, "memory(GiB)": 142.32, "step": 65460, "train_speed(iter/s)": 0.287424 }, { "acc": 0.7359478, "epoch": 0.732390349068224, "grad_norm": 6.5, "learning_rate": 7.506723891273577e-06, "loss": 1.05335293, "memory(GiB)": 142.32, "step": 65480, "train_speed(iter/s)": 0.287456 }, { "acc": 0.73014288, "epoch": 0.7326140480141825, "grad_norm": 5.21875, "learning_rate": 7.505123512901615e-06, "loss": 1.06641273, "memory(GiB)": 142.32, "step": 65500, "train_speed(iter/s)": 0.287489 }, { "acc": 0.73942719, "epoch": 0.732837746960141, "grad_norm": 6.0, "learning_rate": 7.5035227917929495e-06, "loss": 1.03987904, "memory(GiB)": 142.32, "step": 65520, "train_speed(iter/s)": 0.287514 }, { "acc": 0.72290306, "epoch": 0.7330614459060996, "grad_norm": 5.375, "learning_rate": 7.501921728166584e-06, "loss": 1.11135607, "memory(GiB)": 142.32, "step": 65540, "train_speed(iter/s)": 0.287541 }, { "acc": 0.72862244, "epoch": 0.7332851448520581, "grad_norm": 6.96875, "learning_rate": 7.500320322241564e-06, "loss": 1.06527929, "memory(GiB)": 142.32, "step": 65560, "train_speed(iter/s)": 0.287567 }, { "acc": 0.73738909, "epoch": 0.7335088437980166, "grad_norm": 6.1875, "learning_rate": 7.498718574236986e-06, "loss": 1.03651466, "memory(GiB)": 142.32, "step": 65580, "train_speed(iter/s)": 0.287598 }, { "acc": 0.72458272, "epoch": 0.7337325427439751, "grad_norm": 6.1875, "learning_rate": 7.497116484371992e-06, "loss": 1.1057888, "memory(GiB)": 142.32, "step": 65600, "train_speed(iter/s)": 0.287626 }, { "acc": 0.7286623, "epoch": 0.7339562416899337, "grad_norm": 4.6875, "learning_rate": 7.495514052865772e-06, "loss": 1.08106623, "memory(GiB)": 142.32, "step": 65620, "train_speed(iter/s)": 0.287657 }, { "acc": 0.73112211, "epoch": 0.7341799406358922, "grad_norm": 7.4375, "learning_rate": 7.49391127993756e-06, "loss": 1.04656029, "memory(GiB)": 142.32, "step": 65640, "train_speed(iter/s)": 0.287687 }, { "acc": 0.73033838, "epoch": 0.7344036395818507, "grad_norm": 5.75, "learning_rate": 7.492308165806639e-06, "loss": 1.08460369, "memory(GiB)": 142.32, "step": 65660, "train_speed(iter/s)": 0.287716 }, { "acc": 0.7276454, "epoch": 0.7346273385278093, "grad_norm": 5.15625, "learning_rate": 7.490704710692337e-06, "loss": 1.0953743, "memory(GiB)": 142.32, "step": 65680, "train_speed(iter/s)": 0.287741 }, { "acc": 0.73995509, "epoch": 0.7348510374737678, "grad_norm": 4.875, "learning_rate": 7.4891009148140306e-06, "loss": 1.02985096, "memory(GiB)": 142.32, "step": 65700, "train_speed(iter/s)": 0.287771 }, { "acc": 0.72951431, "epoch": 0.7350747364197263, "grad_norm": 4.46875, "learning_rate": 7.487496778391141e-06, "loss": 1.06665993, "memory(GiB)": 142.32, "step": 65720, "train_speed(iter/s)": 0.287802 }, { "acc": 0.72946901, "epoch": 0.7352984353656848, "grad_norm": 7.34375, "learning_rate": 7.485892301643137e-06, "loss": 1.07390308, "memory(GiB)": 142.32, "step": 65740, "train_speed(iter/s)": 0.287832 }, { "acc": 0.73040171, "epoch": 0.7355221343116434, "grad_norm": 5.65625, "learning_rate": 7.484287484789537e-06, "loss": 1.07618322, "memory(GiB)": 142.32, "step": 65760, "train_speed(iter/s)": 0.28786 }, { "acc": 0.7257782, "epoch": 0.7357458332576019, "grad_norm": 7.125, "learning_rate": 7.482682328049899e-06, "loss": 1.09457884, "memory(GiB)": 142.32, "step": 65780, "train_speed(iter/s)": 0.287891 }, { "acc": 0.72710991, "epoch": 0.7359695322035604, "grad_norm": 5.65625, "learning_rate": 7.481076831643832e-06, "loss": 1.0782589, "memory(GiB)": 142.32, "step": 65800, "train_speed(iter/s)": 0.28792 }, { "acc": 0.72229552, "epoch": 0.736193231149519, "grad_norm": 5.53125, "learning_rate": 7.4794709957909925e-06, "loss": 1.09557428, "memory(GiB)": 142.32, "step": 65820, "train_speed(iter/s)": 0.287948 }, { "acc": 0.73879757, "epoch": 0.7364169300954775, "grad_norm": 7.9375, "learning_rate": 7.477864820711081e-06, "loss": 1.0575614, "memory(GiB)": 142.32, "step": 65840, "train_speed(iter/s)": 0.287977 }, { "acc": 0.73458862, "epoch": 0.736640629041436, "grad_norm": 6.125, "learning_rate": 7.476258306623846e-06, "loss": 1.06092806, "memory(GiB)": 142.32, "step": 65860, "train_speed(iter/s)": 0.288008 }, { "acc": 0.72792635, "epoch": 0.7368643279873945, "grad_norm": 6.5, "learning_rate": 7.47465145374908e-06, "loss": 1.10102253, "memory(GiB)": 142.32, "step": 65880, "train_speed(iter/s)": 0.288038 }, { "acc": 0.72760992, "epoch": 0.7370880269333531, "grad_norm": 6.375, "learning_rate": 7.4730442623066235e-06, "loss": 1.08214769, "memory(GiB)": 142.32, "step": 65900, "train_speed(iter/s)": 0.28807 }, { "acc": 0.73884158, "epoch": 0.7373117258793116, "grad_norm": 5.21875, "learning_rate": 7.471436732516364e-06, "loss": 1.01941748, "memory(GiB)": 142.32, "step": 65920, "train_speed(iter/s)": 0.2881 }, { "acc": 0.74025068, "epoch": 0.7375354248252701, "grad_norm": 6.625, "learning_rate": 7.469828864598236e-06, "loss": 1.03182793, "memory(GiB)": 142.32, "step": 65940, "train_speed(iter/s)": 0.28813 }, { "acc": 0.73141289, "epoch": 0.7377591237712287, "grad_norm": 5.71875, "learning_rate": 7.468220658772216e-06, "loss": 1.05399189, "memory(GiB)": 142.32, "step": 65960, "train_speed(iter/s)": 0.28816 }, { "acc": 0.74826298, "epoch": 0.7379828227171872, "grad_norm": 5.46875, "learning_rate": 7.466612115258331e-06, "loss": 0.99596119, "memory(GiB)": 142.32, "step": 65980, "train_speed(iter/s)": 0.288188 }, { "acc": 0.73777981, "epoch": 0.7382065216631457, "grad_norm": 5.96875, "learning_rate": 7.465003234276655e-06, "loss": 1.03968115, "memory(GiB)": 142.32, "step": 66000, "train_speed(iter/s)": 0.288218 }, { "epoch": 0.7382065216631457, "eval_acc": 0.694163650697311, "eval_loss": 1.0811753273010254, "eval_runtime": 2341.3316, "eval_samples_per_second": 32.154, "eval_steps_per_second": 16.077, "step": 66000 }, { "acc": 0.72903767, "epoch": 0.7384302206091042, "grad_norm": 7.1875, "learning_rate": 7.463394016047301e-06, "loss": 1.07558994, "memory(GiB)": 142.32, "step": 66020, "train_speed(iter/s)": 0.285268 }, { "acc": 0.71777096, "epoch": 0.7386539195550628, "grad_norm": 6.75, "learning_rate": 7.461784460790435e-06, "loss": 1.13520584, "memory(GiB)": 142.32, "step": 66040, "train_speed(iter/s)": 0.285296 }, { "acc": 0.72394314, "epoch": 0.7388776185010213, "grad_norm": 5.96875, "learning_rate": 7.460174568726269e-06, "loss": 1.11088791, "memory(GiB)": 142.32, "step": 66060, "train_speed(iter/s)": 0.285327 }, { "acc": 0.74271507, "epoch": 0.7391013174469798, "grad_norm": 5.625, "learning_rate": 7.458564340075057e-06, "loss": 0.99797192, "memory(GiB)": 142.32, "step": 66080, "train_speed(iter/s)": 0.285357 }, { "acc": 0.74515896, "epoch": 0.7393250163929384, "grad_norm": 7.4375, "learning_rate": 7.456953775057105e-06, "loss": 1.00441608, "memory(GiB)": 142.32, "step": 66100, "train_speed(iter/s)": 0.285387 }, { "acc": 0.74691086, "epoch": 0.7395487153388969, "grad_norm": 6.875, "learning_rate": 7.455342873892756e-06, "loss": 1.00334358, "memory(GiB)": 142.32, "step": 66120, "train_speed(iter/s)": 0.285417 }, { "acc": 0.73286638, "epoch": 0.7397724142848554, "grad_norm": 6.25, "learning_rate": 7.453731636802408e-06, "loss": 1.0732667, "memory(GiB)": 142.32, "step": 66140, "train_speed(iter/s)": 0.285449 }, { "acc": 0.72910852, "epoch": 0.7399961132308139, "grad_norm": 6.59375, "learning_rate": 7.452120064006499e-06, "loss": 1.08784399, "memory(GiB)": 142.32, "step": 66160, "train_speed(iter/s)": 0.285479 }, { "acc": 0.74576025, "epoch": 0.7402198121767725, "grad_norm": 6.1875, "learning_rate": 7.450508155725518e-06, "loss": 1.00611916, "memory(GiB)": 142.32, "step": 66180, "train_speed(iter/s)": 0.28551 }, { "acc": 0.72855253, "epoch": 0.740443511122731, "grad_norm": 5.71875, "learning_rate": 7.448895912179994e-06, "loss": 1.07302351, "memory(GiB)": 142.32, "step": 66200, "train_speed(iter/s)": 0.28554 }, { "acc": 0.73031569, "epoch": 0.7406672100686895, "grad_norm": 6.28125, "learning_rate": 7.447283333590507e-06, "loss": 1.06925831, "memory(GiB)": 142.32, "step": 66220, "train_speed(iter/s)": 0.285572 }, { "acc": 0.74058361, "epoch": 0.740890909014648, "grad_norm": 5.0, "learning_rate": 7.445670420177681e-06, "loss": 1.02459946, "memory(GiB)": 142.32, "step": 66240, "train_speed(iter/s)": 0.285598 }, { "acc": 0.72645316, "epoch": 0.7411146079606066, "grad_norm": 5.78125, "learning_rate": 7.444057172162184e-06, "loss": 1.09289322, "memory(GiB)": 142.32, "step": 66260, "train_speed(iter/s)": 0.285625 }, { "acc": 0.73900805, "epoch": 0.7413383069065651, "grad_norm": 5.71875, "learning_rate": 7.4424435897647316e-06, "loss": 1.02434464, "memory(GiB)": 142.32, "step": 66280, "train_speed(iter/s)": 0.285653 }, { "acc": 0.72713537, "epoch": 0.7415620058525236, "grad_norm": 5.78125, "learning_rate": 7.440829673206087e-06, "loss": 1.07823143, "memory(GiB)": 142.32, "step": 66300, "train_speed(iter/s)": 0.285684 }, { "acc": 0.74171996, "epoch": 0.7417857047984822, "grad_norm": 6.03125, "learning_rate": 7.439215422707056e-06, "loss": 1.02107153, "memory(GiB)": 142.32, "step": 66320, "train_speed(iter/s)": 0.285714 }, { "acc": 0.73277178, "epoch": 0.7420094037444407, "grad_norm": 6.0, "learning_rate": 7.437600838488488e-06, "loss": 1.06140881, "memory(GiB)": 142.32, "step": 66340, "train_speed(iter/s)": 0.285743 }, { "acc": 0.74056101, "epoch": 0.7422331026903992, "grad_norm": 6.09375, "learning_rate": 7.4359859207712855e-06, "loss": 1.04141846, "memory(GiB)": 142.32, "step": 66360, "train_speed(iter/s)": 0.285772 }, { "acc": 0.72879415, "epoch": 0.7424568016363577, "grad_norm": 5.375, "learning_rate": 7.434370669776392e-06, "loss": 1.0867054, "memory(GiB)": 142.32, "step": 66380, "train_speed(iter/s)": 0.285802 }, { "acc": 0.73028989, "epoch": 0.7426805005823163, "grad_norm": 5.96875, "learning_rate": 7.432755085724794e-06, "loss": 1.08314829, "memory(GiB)": 142.32, "step": 66400, "train_speed(iter/s)": 0.285829 }, { "acc": 0.72599611, "epoch": 0.7429041995282748, "grad_norm": 5.25, "learning_rate": 7.431139168837529e-06, "loss": 1.08864174, "memory(GiB)": 142.32, "step": 66420, "train_speed(iter/s)": 0.285856 }, { "acc": 0.73179116, "epoch": 0.7431278984742333, "grad_norm": 7.125, "learning_rate": 7.429522919335676e-06, "loss": 1.05448084, "memory(GiB)": 142.32, "step": 66440, "train_speed(iter/s)": 0.285887 }, { "acc": 0.72971354, "epoch": 0.7433515974201919, "grad_norm": 6.25, "learning_rate": 7.427906337440362e-06, "loss": 1.07335939, "memory(GiB)": 142.32, "step": 66460, "train_speed(iter/s)": 0.285914 }, { "acc": 0.72828989, "epoch": 0.7435752963661504, "grad_norm": 7.03125, "learning_rate": 7.426289423372759e-06, "loss": 1.08980322, "memory(GiB)": 142.32, "step": 66480, "train_speed(iter/s)": 0.285941 }, { "acc": 0.74211974, "epoch": 0.7437989953121089, "grad_norm": 5.4375, "learning_rate": 7.424672177354084e-06, "loss": 1.02565269, "memory(GiB)": 142.32, "step": 66500, "train_speed(iter/s)": 0.285967 }, { "acc": 0.71132069, "epoch": 0.7440226942580674, "grad_norm": 6.8125, "learning_rate": 7.423054599605597e-06, "loss": 1.16982861, "memory(GiB)": 142.32, "step": 66520, "train_speed(iter/s)": 0.285999 }, { "acc": 0.72481804, "epoch": 0.744246393204026, "grad_norm": 5.5, "learning_rate": 7.421436690348608e-06, "loss": 1.10241737, "memory(GiB)": 142.32, "step": 66540, "train_speed(iter/s)": 0.286027 }, { "acc": 0.71863189, "epoch": 0.7444700921499845, "grad_norm": 6.21875, "learning_rate": 7.419818449804469e-06, "loss": 1.11260567, "memory(GiB)": 142.32, "step": 66560, "train_speed(iter/s)": 0.286058 }, { "acc": 0.73607416, "epoch": 0.744693791095943, "grad_norm": 6.59375, "learning_rate": 7.418199878194579e-06, "loss": 1.04315119, "memory(GiB)": 142.32, "step": 66580, "train_speed(iter/s)": 0.286088 }, { "acc": 0.72966194, "epoch": 0.7449174900419017, "grad_norm": 5.90625, "learning_rate": 7.416580975740382e-06, "loss": 1.06424122, "memory(GiB)": 142.32, "step": 66600, "train_speed(iter/s)": 0.286118 }, { "acc": 0.7414629, "epoch": 0.7451411889878602, "grad_norm": 4.78125, "learning_rate": 7.414961742663367e-06, "loss": 1.03117828, "memory(GiB)": 142.32, "step": 66620, "train_speed(iter/s)": 0.286142 }, { "acc": 0.72668958, "epoch": 0.7453648879338187, "grad_norm": 6.0, "learning_rate": 7.413342179185065e-06, "loss": 1.08776426, "memory(GiB)": 142.32, "step": 66640, "train_speed(iter/s)": 0.286168 }, { "acc": 0.74506016, "epoch": 0.7455885868797772, "grad_norm": 5.75, "learning_rate": 7.411722285527061e-06, "loss": 1.00664082, "memory(GiB)": 142.32, "step": 66660, "train_speed(iter/s)": 0.286196 }, { "acc": 0.72157793, "epoch": 0.7458122858257358, "grad_norm": 4.875, "learning_rate": 7.4101020619109765e-06, "loss": 1.12377605, "memory(GiB)": 142.32, "step": 66680, "train_speed(iter/s)": 0.286222 }, { "acc": 0.72632713, "epoch": 0.7460359847716943, "grad_norm": 5.78125, "learning_rate": 7.4084815085584816e-06, "loss": 1.08784847, "memory(GiB)": 142.32, "step": 66700, "train_speed(iter/s)": 0.286252 }, { "acc": 0.72278919, "epoch": 0.7462596837176528, "grad_norm": 4.96875, "learning_rate": 7.406860625691292e-06, "loss": 1.10963926, "memory(GiB)": 142.32, "step": 66720, "train_speed(iter/s)": 0.286285 }, { "acc": 0.74629774, "epoch": 0.7464833826636114, "grad_norm": 6.53125, "learning_rate": 7.4052394135311655e-06, "loss": 0.99096317, "memory(GiB)": 142.32, "step": 66740, "train_speed(iter/s)": 0.286316 }, { "acc": 0.72582283, "epoch": 0.7467070816095699, "grad_norm": 6.15625, "learning_rate": 7.403617872299908e-06, "loss": 1.09309139, "memory(GiB)": 142.32, "step": 66760, "train_speed(iter/s)": 0.286339 }, { "acc": 0.71898193, "epoch": 0.7469307805555284, "grad_norm": 5.21875, "learning_rate": 7.4019960022193715e-06, "loss": 1.10891914, "memory(GiB)": 142.32, "step": 66780, "train_speed(iter/s)": 0.286369 }, { "acc": 0.7307148, "epoch": 0.747154479501487, "grad_norm": 7.0, "learning_rate": 7.400373803511448e-06, "loss": 1.07674141, "memory(GiB)": 142.32, "step": 66800, "train_speed(iter/s)": 0.286399 }, { "acc": 0.72873936, "epoch": 0.7473781784474455, "grad_norm": 6.0625, "learning_rate": 7.398751276398081e-06, "loss": 1.10129604, "memory(GiB)": 142.32, "step": 66820, "train_speed(iter/s)": 0.286429 }, { "acc": 0.71776657, "epoch": 0.747601877393404, "grad_norm": 5.5, "learning_rate": 7.397128421101252e-06, "loss": 1.12613258, "memory(GiB)": 142.32, "step": 66840, "train_speed(iter/s)": 0.286458 }, { "acc": 0.73208885, "epoch": 0.7478255763393625, "grad_norm": 5.53125, "learning_rate": 7.39550523784299e-06, "loss": 1.06375256, "memory(GiB)": 142.32, "step": 66860, "train_speed(iter/s)": 0.286488 }, { "acc": 0.72778168, "epoch": 0.7480492752853211, "grad_norm": 6.34375, "learning_rate": 7.393881726845374e-06, "loss": 1.09492111, "memory(GiB)": 142.32, "step": 66880, "train_speed(iter/s)": 0.286516 }, { "acc": 0.73042307, "epoch": 0.7482729742312796, "grad_norm": 5.90625, "learning_rate": 7.392257888330522e-06, "loss": 1.07863512, "memory(GiB)": 142.32, "step": 66900, "train_speed(iter/s)": 0.286546 }, { "acc": 0.73010569, "epoch": 0.7484966731772381, "grad_norm": 5.90625, "learning_rate": 7.390633722520597e-06, "loss": 1.07588215, "memory(GiB)": 142.32, "step": 66920, "train_speed(iter/s)": 0.286573 }, { "acc": 0.73701868, "epoch": 0.7487203721231966, "grad_norm": 5.6875, "learning_rate": 7.389009229637809e-06, "loss": 1.06171513, "memory(GiB)": 142.32, "step": 66940, "train_speed(iter/s)": 0.286601 }, { "acc": 0.73591423, "epoch": 0.7489440710691552, "grad_norm": 5.65625, "learning_rate": 7.387384409904411e-06, "loss": 1.04453526, "memory(GiB)": 142.32, "step": 66960, "train_speed(iter/s)": 0.286631 }, { "acc": 0.73632135, "epoch": 0.7491677700151137, "grad_norm": 6.3125, "learning_rate": 7.385759263542702e-06, "loss": 1.04428244, "memory(GiB)": 142.32, "step": 66980, "train_speed(iter/s)": 0.286662 }, { "acc": 0.7210485, "epoch": 0.7493914689610722, "grad_norm": 5.625, "learning_rate": 7.384133790775025e-06, "loss": 1.11781511, "memory(GiB)": 142.32, "step": 67000, "train_speed(iter/s)": 0.286689 }, { "acc": 0.73726568, "epoch": 0.7496151679070308, "grad_norm": 5.625, "learning_rate": 7.382507991823771e-06, "loss": 1.04833698, "memory(GiB)": 142.32, "step": 67020, "train_speed(iter/s)": 0.286717 }, { "acc": 0.73950987, "epoch": 0.7498388668529893, "grad_norm": 5.3125, "learning_rate": 7.380881866911367e-06, "loss": 1.01850128, "memory(GiB)": 142.32, "step": 67040, "train_speed(iter/s)": 0.286746 }, { "acc": 0.73424468, "epoch": 0.7500625657989478, "grad_norm": 6.5625, "learning_rate": 7.379255416260294e-06, "loss": 1.05991135, "memory(GiB)": 142.32, "step": 67060, "train_speed(iter/s)": 0.286775 }, { "acc": 0.72619257, "epoch": 0.7502862647449063, "grad_norm": 5.53125, "learning_rate": 7.377628640093072e-06, "loss": 1.08789673, "memory(GiB)": 142.32, "step": 67080, "train_speed(iter/s)": 0.286803 }, { "acc": 0.73146968, "epoch": 0.7505099636908649, "grad_norm": 4.4375, "learning_rate": 7.376001538632268e-06, "loss": 1.06648331, "memory(GiB)": 142.32, "step": 67100, "train_speed(iter/s)": 0.286833 }, { "acc": 0.73050451, "epoch": 0.7507336626368234, "grad_norm": 6.0625, "learning_rate": 7.374374112100493e-06, "loss": 1.07496319, "memory(GiB)": 142.32, "step": 67120, "train_speed(iter/s)": 0.286865 }, { "acc": 0.7342495, "epoch": 0.7509573615827819, "grad_norm": 5.90625, "learning_rate": 7.372746360720403e-06, "loss": 1.06117077, "memory(GiB)": 142.32, "step": 67140, "train_speed(iter/s)": 0.286894 }, { "acc": 0.73334002, "epoch": 0.7511810605287405, "grad_norm": 7.125, "learning_rate": 7.371118284714695e-06, "loss": 1.06627178, "memory(GiB)": 142.32, "step": 67160, "train_speed(iter/s)": 0.286926 }, { "acc": 0.73318748, "epoch": 0.751404759474699, "grad_norm": 4.625, "learning_rate": 7.369489884306115e-06, "loss": 1.05887566, "memory(GiB)": 142.32, "step": 67180, "train_speed(iter/s)": 0.286956 }, { "acc": 0.72488499, "epoch": 0.7516284584206575, "grad_norm": 6.09375, "learning_rate": 7.367861159717451e-06, "loss": 1.087216, "memory(GiB)": 142.32, "step": 67200, "train_speed(iter/s)": 0.286984 }, { "acc": 0.72837753, "epoch": 0.751852157366616, "grad_norm": 6.125, "learning_rate": 7.366232111171535e-06, "loss": 1.06866341, "memory(GiB)": 142.32, "step": 67220, "train_speed(iter/s)": 0.287009 }, { "acc": 0.73233409, "epoch": 0.7520758563125746, "grad_norm": 7.03125, "learning_rate": 7.3646027388912465e-06, "loss": 1.06343918, "memory(GiB)": 142.32, "step": 67240, "train_speed(iter/s)": 0.287039 }, { "acc": 0.73595328, "epoch": 0.7522995552585331, "grad_norm": 6.03125, "learning_rate": 7.362973043099504e-06, "loss": 1.03521585, "memory(GiB)": 142.32, "step": 67260, "train_speed(iter/s)": 0.287063 }, { "acc": 0.74047489, "epoch": 0.7525232542044916, "grad_norm": 6.59375, "learning_rate": 7.3613430240192754e-06, "loss": 1.03746738, "memory(GiB)": 142.32, "step": 67280, "train_speed(iter/s)": 0.28709 }, { "acc": 0.73444505, "epoch": 0.7527469531504501, "grad_norm": 4.96875, "learning_rate": 7.3597126818735686e-06, "loss": 1.06553822, "memory(GiB)": 142.32, "step": 67300, "train_speed(iter/s)": 0.28712 }, { "acc": 0.7254117, "epoch": 0.7529706520964087, "grad_norm": 4.78125, "learning_rate": 7.35808201688544e-06, "loss": 1.10255852, "memory(GiB)": 142.32, "step": 67320, "train_speed(iter/s)": 0.287148 }, { "acc": 0.71616449, "epoch": 0.7531943510423672, "grad_norm": 5.59375, "learning_rate": 7.356451029277987e-06, "loss": 1.14217453, "memory(GiB)": 142.32, "step": 67340, "train_speed(iter/s)": 0.287177 }, { "acc": 0.73399878, "epoch": 0.7534180499883257, "grad_norm": 6.5, "learning_rate": 7.354819719274351e-06, "loss": 1.04994879, "memory(GiB)": 142.32, "step": 67360, "train_speed(iter/s)": 0.287204 }, { "acc": 0.72940197, "epoch": 0.7536417489342843, "grad_norm": 5.8125, "learning_rate": 7.353188087097719e-06, "loss": 1.07630558, "memory(GiB)": 142.32, "step": 67380, "train_speed(iter/s)": 0.287234 }, { "acc": 0.7355969, "epoch": 0.7538654478802428, "grad_norm": 7.25, "learning_rate": 7.351556132971323e-06, "loss": 1.0505784, "memory(GiB)": 142.32, "step": 67400, "train_speed(iter/s)": 0.287263 }, { "acc": 0.72539721, "epoch": 0.7540891468262013, "grad_norm": 7.0625, "learning_rate": 7.349923857118435e-06, "loss": 1.09489441, "memory(GiB)": 142.32, "step": 67420, "train_speed(iter/s)": 0.287288 }, { "acc": 0.72193565, "epoch": 0.7543128457721598, "grad_norm": 5.59375, "learning_rate": 7.348291259762376e-06, "loss": 1.10214863, "memory(GiB)": 142.32, "step": 67440, "train_speed(iter/s)": 0.287319 }, { "acc": 0.7370935, "epoch": 0.7545365447181184, "grad_norm": 6.28125, "learning_rate": 7.346658341126508e-06, "loss": 1.04884558, "memory(GiB)": 142.32, "step": 67460, "train_speed(iter/s)": 0.287346 }, { "acc": 0.73222151, "epoch": 0.7547602436640769, "grad_norm": 5.4375, "learning_rate": 7.345025101434238e-06, "loss": 1.06008339, "memory(GiB)": 142.32, "step": 67480, "train_speed(iter/s)": 0.287376 }, { "acc": 0.7258462, "epoch": 0.7549839426100354, "grad_norm": 5.1875, "learning_rate": 7.343391540909014e-06, "loss": 1.0925354, "memory(GiB)": 142.32, "step": 67500, "train_speed(iter/s)": 0.287403 }, { "acc": 0.72975674, "epoch": 0.755207641555994, "grad_norm": 6.8125, "learning_rate": 7.341757659774333e-06, "loss": 1.07702198, "memory(GiB)": 142.32, "step": 67520, "train_speed(iter/s)": 0.287433 }, { "acc": 0.73313675, "epoch": 0.7554313405019525, "grad_norm": 6.84375, "learning_rate": 7.340123458253735e-06, "loss": 1.07786064, "memory(GiB)": 142.32, "step": 67540, "train_speed(iter/s)": 0.287459 }, { "acc": 0.73248019, "epoch": 0.755655039447911, "grad_norm": 4.8125, "learning_rate": 7.3384889365707975e-06, "loss": 1.06199799, "memory(GiB)": 142.32, "step": 67560, "train_speed(iter/s)": 0.287487 }, { "acc": 0.72726512, "epoch": 0.7558787383938695, "grad_norm": 5.40625, "learning_rate": 7.336854094949149e-06, "loss": 1.08198738, "memory(GiB)": 142.32, "step": 67580, "train_speed(iter/s)": 0.287516 }, { "acc": 0.72582564, "epoch": 0.7561024373398281, "grad_norm": 4.9375, "learning_rate": 7.33521893361246e-06, "loss": 1.10522537, "memory(GiB)": 142.32, "step": 67600, "train_speed(iter/s)": 0.287546 }, { "acc": 0.73083038, "epoch": 0.7563261362857866, "grad_norm": 5.5, "learning_rate": 7.333583452784443e-06, "loss": 1.07703295, "memory(GiB)": 142.32, "step": 67620, "train_speed(iter/s)": 0.287576 }, { "acc": 0.73799524, "epoch": 0.7565498352317451, "grad_norm": 6.75, "learning_rate": 7.331947652688854e-06, "loss": 1.04542618, "memory(GiB)": 142.32, "step": 67640, "train_speed(iter/s)": 0.287606 }, { "acc": 0.7339736, "epoch": 0.7567735341777037, "grad_norm": 5.28125, "learning_rate": 7.330311533549496e-06, "loss": 1.05626411, "memory(GiB)": 142.32, "step": 67660, "train_speed(iter/s)": 0.287634 }, { "acc": 0.72360663, "epoch": 0.7569972331236622, "grad_norm": 5.6875, "learning_rate": 7.328675095590212e-06, "loss": 1.10453424, "memory(GiB)": 142.32, "step": 67680, "train_speed(iter/s)": 0.287663 }, { "acc": 0.74418516, "epoch": 0.7572209320696207, "grad_norm": 5.4375, "learning_rate": 7.327038339034889e-06, "loss": 0.99773884, "memory(GiB)": 142.32, "step": 67700, "train_speed(iter/s)": 0.287694 }, { "acc": 0.71795502, "epoch": 0.7574446310155792, "grad_norm": 6.15625, "learning_rate": 7.325401264107462e-06, "loss": 1.12779579, "memory(GiB)": 142.32, "step": 67720, "train_speed(iter/s)": 0.287721 }, { "acc": 0.73810654, "epoch": 0.7576683299615378, "grad_norm": 5.75, "learning_rate": 7.3237638710319035e-06, "loss": 1.03692989, "memory(GiB)": 142.32, "step": 67740, "train_speed(iter/s)": 0.287751 }, { "acc": 0.7214437, "epoch": 0.7578920289074963, "grad_norm": 5.71875, "learning_rate": 7.3221261600322345e-06, "loss": 1.11423445, "memory(GiB)": 142.32, "step": 67760, "train_speed(iter/s)": 0.287781 }, { "acc": 0.73573027, "epoch": 0.7581157278534548, "grad_norm": 6.6875, "learning_rate": 7.3204881313325145e-06, "loss": 1.03415527, "memory(GiB)": 142.32, "step": 67780, "train_speed(iter/s)": 0.28781 }, { "acc": 0.73271637, "epoch": 0.7583394267994134, "grad_norm": 5.46875, "learning_rate": 7.318849785156852e-06, "loss": 1.0625329, "memory(GiB)": 142.32, "step": 67800, "train_speed(iter/s)": 0.287834 }, { "acc": 0.72678699, "epoch": 0.7585631257453719, "grad_norm": 5.625, "learning_rate": 7.317211121729394e-06, "loss": 1.08960876, "memory(GiB)": 142.32, "step": 67820, "train_speed(iter/s)": 0.287863 }, { "acc": 0.72238503, "epoch": 0.7587868246913304, "grad_norm": 4.78125, "learning_rate": 7.315572141274334e-06, "loss": 1.12291946, "memory(GiB)": 142.32, "step": 67840, "train_speed(iter/s)": 0.28789 }, { "acc": 0.72493591, "epoch": 0.7590105236372889, "grad_norm": 5.8125, "learning_rate": 7.313932844015909e-06, "loss": 1.09262486, "memory(GiB)": 142.32, "step": 67860, "train_speed(iter/s)": 0.287921 }, { "acc": 0.72891798, "epoch": 0.7592342225832475, "grad_norm": 5.53125, "learning_rate": 7.312293230178396e-06, "loss": 1.07275791, "memory(GiB)": 142.32, "step": 67880, "train_speed(iter/s)": 0.28795 }, { "acc": 0.73715305, "epoch": 0.759457921529206, "grad_norm": 6.0625, "learning_rate": 7.310653299986119e-06, "loss": 1.03663349, "memory(GiB)": 142.32, "step": 67900, "train_speed(iter/s)": 0.287979 }, { "acc": 0.72978415, "epoch": 0.7596816204751645, "grad_norm": 5.0625, "learning_rate": 7.309013053663443e-06, "loss": 1.072048, "memory(GiB)": 142.32, "step": 67920, "train_speed(iter/s)": 0.288008 }, { "acc": 0.73411627, "epoch": 0.759905319421123, "grad_norm": 7.40625, "learning_rate": 7.307372491434779e-06, "loss": 1.0486908, "memory(GiB)": 142.32, "step": 67940, "train_speed(iter/s)": 0.288032 }, { "acc": 0.73407607, "epoch": 0.7601290183670816, "grad_norm": 4.625, "learning_rate": 7.305731613524578e-06, "loss": 1.06868, "memory(GiB)": 142.32, "step": 67960, "train_speed(iter/s)": 0.288061 }, { "acc": 0.72573099, "epoch": 0.7603527173130401, "grad_norm": 5.59375, "learning_rate": 7.304090420157336e-06, "loss": 1.11657925, "memory(GiB)": 142.32, "step": 67980, "train_speed(iter/s)": 0.288091 }, { "acc": 0.73945265, "epoch": 0.7605764162589986, "grad_norm": 6.59375, "learning_rate": 7.302448911557591e-06, "loss": 1.02585392, "memory(GiB)": 142.32, "step": 68000, "train_speed(iter/s)": 0.288119 }, { "epoch": 0.7605764162589986, "eval_acc": 0.6943219422566589, "eval_loss": 1.0803848505020142, "eval_runtime": 2343.4473, "eval_samples_per_second": 32.125, "eval_steps_per_second": 16.063, "step": 68000 }, { "acc": 0.72507305, "epoch": 0.7608001152049572, "grad_norm": 7.21875, "learning_rate": 7.300807087949925e-06, "loss": 1.08959551, "memory(GiB)": 142.32, "step": 68020, "train_speed(iter/s)": 0.285251 }, { "acc": 0.72733197, "epoch": 0.7610238141509157, "grad_norm": 5.4375, "learning_rate": 7.299164949558963e-06, "loss": 1.07750492, "memory(GiB)": 142.32, "step": 68040, "train_speed(iter/s)": 0.28528 }, { "acc": 0.71985149, "epoch": 0.7612475130968742, "grad_norm": 5.9375, "learning_rate": 7.297522496609375e-06, "loss": 1.12556868, "memory(GiB)": 142.32, "step": 68060, "train_speed(iter/s)": 0.28531 }, { "acc": 0.74494705, "epoch": 0.7614712120428327, "grad_norm": 6.6875, "learning_rate": 7.295879729325868e-06, "loss": 1.00259457, "memory(GiB)": 142.32, "step": 68080, "train_speed(iter/s)": 0.285338 }, { "acc": 0.73200474, "epoch": 0.7616949109887913, "grad_norm": 4.65625, "learning_rate": 7.294236647933201e-06, "loss": 1.07492771, "memory(GiB)": 142.32, "step": 68100, "train_speed(iter/s)": 0.285365 }, { "acc": 0.73687654, "epoch": 0.7619186099347498, "grad_norm": 6.03125, "learning_rate": 7.292593252656166e-06, "loss": 1.04268398, "memory(GiB)": 142.32, "step": 68120, "train_speed(iter/s)": 0.285394 }, { "acc": 0.73845177, "epoch": 0.7621423088807083, "grad_norm": 5.875, "learning_rate": 7.290949543719607e-06, "loss": 1.03042698, "memory(GiB)": 142.32, "step": 68140, "train_speed(iter/s)": 0.285425 }, { "acc": 0.73046007, "epoch": 0.7623660078266669, "grad_norm": 5.28125, "learning_rate": 7.289305521348404e-06, "loss": 1.08375187, "memory(GiB)": 142.32, "step": 68160, "train_speed(iter/s)": 0.285455 }, { "acc": 0.73298545, "epoch": 0.7625897067726254, "grad_norm": 5.9375, "learning_rate": 7.287661185767485e-06, "loss": 1.05467415, "memory(GiB)": 142.32, "step": 68180, "train_speed(iter/s)": 0.285484 }, { "acc": 0.74136806, "epoch": 0.7628134057185839, "grad_norm": 5.6875, "learning_rate": 7.286016537201817e-06, "loss": 1.01413927, "memory(GiB)": 142.32, "step": 68200, "train_speed(iter/s)": 0.285512 }, { "acc": 0.73204756, "epoch": 0.7630371046645424, "grad_norm": 4.8125, "learning_rate": 7.284371575876412e-06, "loss": 1.07409515, "memory(GiB)": 142.32, "step": 68220, "train_speed(iter/s)": 0.285542 }, { "acc": 0.73696308, "epoch": 0.763260803610501, "grad_norm": 6.28125, "learning_rate": 7.2827263020163245e-06, "loss": 1.06415358, "memory(GiB)": 142.32, "step": 68240, "train_speed(iter/s)": 0.285568 }, { "acc": 0.73480062, "epoch": 0.7634845025564595, "grad_norm": 5.59375, "learning_rate": 7.281080715846651e-06, "loss": 1.03804255, "memory(GiB)": 142.32, "step": 68260, "train_speed(iter/s)": 0.285599 }, { "acc": 0.74269066, "epoch": 0.763708201502418, "grad_norm": 6.5, "learning_rate": 7.2794348175925314e-06, "loss": 1.01920147, "memory(GiB)": 142.32, "step": 68280, "train_speed(iter/s)": 0.285628 }, { "acc": 0.73364935, "epoch": 0.7639319004483766, "grad_norm": 6.875, "learning_rate": 7.277788607479148e-06, "loss": 1.06540375, "memory(GiB)": 142.32, "step": 68300, "train_speed(iter/s)": 0.285655 }, { "acc": 0.74062576, "epoch": 0.7641555993943351, "grad_norm": 6.65625, "learning_rate": 7.276142085731727e-06, "loss": 1.02292442, "memory(GiB)": 142.32, "step": 68320, "train_speed(iter/s)": 0.285679 }, { "acc": 0.73527613, "epoch": 0.7643792983402936, "grad_norm": 5.46875, "learning_rate": 7.274495252575533e-06, "loss": 1.04375105, "memory(GiB)": 142.32, "step": 68340, "train_speed(iter/s)": 0.285705 }, { "acc": 0.73453913, "epoch": 0.7646029972862521, "grad_norm": 6.25, "learning_rate": 7.2728481082358805e-06, "loss": 1.04612932, "memory(GiB)": 142.32, "step": 68360, "train_speed(iter/s)": 0.285732 }, { "acc": 0.71940718, "epoch": 0.7648266962322107, "grad_norm": 6.53125, "learning_rate": 7.27120065293812e-06, "loss": 1.13220692, "memory(GiB)": 142.32, "step": 68380, "train_speed(iter/s)": 0.285759 }, { "acc": 0.72885113, "epoch": 0.7650503951781692, "grad_norm": 5.1875, "learning_rate": 7.269552886907647e-06, "loss": 1.07338552, "memory(GiB)": 142.32, "step": 68400, "train_speed(iter/s)": 0.285787 }, { "acc": 0.72529888, "epoch": 0.7652740941241277, "grad_norm": 6.1875, "learning_rate": 7.267904810369899e-06, "loss": 1.09075565, "memory(GiB)": 142.32, "step": 68420, "train_speed(iter/s)": 0.285814 }, { "acc": 0.729533, "epoch": 0.7654977930700863, "grad_norm": 5.09375, "learning_rate": 7.266256423550357e-06, "loss": 1.06271248, "memory(GiB)": 142.32, "step": 68440, "train_speed(iter/s)": 0.285841 }, { "acc": 0.73648548, "epoch": 0.7657214920160448, "grad_norm": 6.375, "learning_rate": 7.264607726674544e-06, "loss": 1.05722961, "memory(GiB)": 142.32, "step": 68460, "train_speed(iter/s)": 0.285868 }, { "acc": 0.7417069, "epoch": 0.7659451909620033, "grad_norm": 6.03125, "learning_rate": 7.262958719968026e-06, "loss": 1.03022032, "memory(GiB)": 142.32, "step": 68480, "train_speed(iter/s)": 0.285899 }, { "acc": 0.73683496, "epoch": 0.7661688899079618, "grad_norm": 6.28125, "learning_rate": 7.2613094036564105e-06, "loss": 1.03661814, "memory(GiB)": 142.32, "step": 68500, "train_speed(iter/s)": 0.285923 }, { "acc": 0.72982554, "epoch": 0.7663925888539204, "grad_norm": 6.8125, "learning_rate": 7.259659777965346e-06, "loss": 1.0857214, "memory(GiB)": 142.32, "step": 68520, "train_speed(iter/s)": 0.28595 }, { "acc": 0.72708993, "epoch": 0.7666162877998789, "grad_norm": 6.59375, "learning_rate": 7.258009843120526e-06, "loss": 1.08035889, "memory(GiB)": 142.32, "step": 68540, "train_speed(iter/s)": 0.285977 }, { "acc": 0.72660284, "epoch": 0.7668399867458374, "grad_norm": 6.875, "learning_rate": 7.256359599347684e-06, "loss": 1.07212753, "memory(GiB)": 142.32, "step": 68560, "train_speed(iter/s)": 0.286003 }, { "acc": 0.73127646, "epoch": 0.767063685691796, "grad_norm": 5.25, "learning_rate": 7.254709046872601e-06, "loss": 1.05938625, "memory(GiB)": 142.32, "step": 68580, "train_speed(iter/s)": 0.286032 }, { "acc": 0.72628379, "epoch": 0.7672873846377545, "grad_norm": 5.71875, "learning_rate": 7.253058185921091e-06, "loss": 1.10092239, "memory(GiB)": 142.32, "step": 68600, "train_speed(iter/s)": 0.286059 }, { "acc": 0.7230195, "epoch": 0.767511083583713, "grad_norm": 5.96875, "learning_rate": 7.251407016719017e-06, "loss": 1.09797363, "memory(GiB)": 142.32, "step": 68620, "train_speed(iter/s)": 0.286088 }, { "acc": 0.73138847, "epoch": 0.7677347825296715, "grad_norm": 7.03125, "learning_rate": 7.249755539492285e-06, "loss": 1.06003714, "memory(GiB)": 142.32, "step": 68640, "train_speed(iter/s)": 0.286117 }, { "acc": 0.74477491, "epoch": 0.7679584814756301, "grad_norm": 6.375, "learning_rate": 7.248103754466838e-06, "loss": 1.02800627, "memory(GiB)": 142.32, "step": 68660, "train_speed(iter/s)": 0.286145 }, { "acc": 0.73322225, "epoch": 0.7681821804215886, "grad_norm": 4.9375, "learning_rate": 7.246451661868664e-06, "loss": 1.06918087, "memory(GiB)": 142.32, "step": 68680, "train_speed(iter/s)": 0.286177 }, { "acc": 0.74363737, "epoch": 0.7684058793675471, "grad_norm": 6.1875, "learning_rate": 7.244799261923794e-06, "loss": 1.00355244, "memory(GiB)": 142.32, "step": 68700, "train_speed(iter/s)": 0.286205 }, { "acc": 0.73805785, "epoch": 0.7686295783135056, "grad_norm": 4.875, "learning_rate": 7.243146554858299e-06, "loss": 1.03634319, "memory(GiB)": 142.32, "step": 68720, "train_speed(iter/s)": 0.286233 }, { "acc": 0.72180767, "epoch": 0.7688532772594642, "grad_norm": 4.8125, "learning_rate": 7.241493540898294e-06, "loss": 1.11473656, "memory(GiB)": 142.32, "step": 68740, "train_speed(iter/s)": 0.286264 }, { "acc": 0.72533078, "epoch": 0.7690769762054227, "grad_norm": 6.875, "learning_rate": 7.239840220269934e-06, "loss": 1.09088736, "memory(GiB)": 142.32, "step": 68760, "train_speed(iter/s)": 0.286294 }, { "acc": 0.73213234, "epoch": 0.7693006751513812, "grad_norm": 6.90625, "learning_rate": 7.2381865931994165e-06, "loss": 1.05481567, "memory(GiB)": 142.32, "step": 68780, "train_speed(iter/s)": 0.286325 }, { "acc": 0.73424997, "epoch": 0.7695243740973398, "grad_norm": 4.78125, "learning_rate": 7.236532659912983e-06, "loss": 1.04562845, "memory(GiB)": 142.32, "step": 68800, "train_speed(iter/s)": 0.286352 }, { "acc": 0.73806596, "epoch": 0.7697480730432983, "grad_norm": 5.1875, "learning_rate": 7.234878420636913e-06, "loss": 1.02496109, "memory(GiB)": 142.32, "step": 68820, "train_speed(iter/s)": 0.286382 }, { "acc": 0.7318254, "epoch": 0.7699717719892568, "grad_norm": 5.875, "learning_rate": 7.2332238755975326e-06, "loss": 1.07369137, "memory(GiB)": 142.32, "step": 68840, "train_speed(iter/s)": 0.286408 }, { "acc": 0.7264986, "epoch": 0.7701954709352153, "grad_norm": 6.59375, "learning_rate": 7.231569025021205e-06, "loss": 1.0852747, "memory(GiB)": 142.32, "step": 68860, "train_speed(iter/s)": 0.286437 }, { "acc": 0.72290177, "epoch": 0.7704191698811739, "grad_norm": 7.40625, "learning_rate": 7.229913869134339e-06, "loss": 1.10378628, "memory(GiB)": 142.32, "step": 68880, "train_speed(iter/s)": 0.286466 }, { "acc": 0.72803459, "epoch": 0.7706428688271324, "grad_norm": 5.625, "learning_rate": 7.228258408163382e-06, "loss": 1.09191294, "memory(GiB)": 142.32, "step": 68900, "train_speed(iter/s)": 0.286498 }, { "acc": 0.74922838, "epoch": 0.7708665677730909, "grad_norm": 8.75, "learning_rate": 7.2266026423348275e-06, "loss": 1.01314297, "memory(GiB)": 142.32, "step": 68920, "train_speed(iter/s)": 0.286523 }, { "acc": 0.73482494, "epoch": 0.7710902667190495, "grad_norm": 5.78125, "learning_rate": 7.224946571875204e-06, "loss": 1.04266605, "memory(GiB)": 142.32, "step": 68940, "train_speed(iter/s)": 0.286548 }, { "acc": 0.73799944, "epoch": 0.771313965665008, "grad_norm": 6.21875, "learning_rate": 7.223290197011088e-06, "loss": 1.03508015, "memory(GiB)": 142.32, "step": 68960, "train_speed(iter/s)": 0.286576 }, { "acc": 0.74124136, "epoch": 0.7715376646109665, "grad_norm": 6.0625, "learning_rate": 7.2216335179690954e-06, "loss": 1.02084408, "memory(GiB)": 142.32, "step": 68980, "train_speed(iter/s)": 0.286603 }, { "acc": 0.73689752, "epoch": 0.771761363556925, "grad_norm": 5.125, "learning_rate": 7.219976534975883e-06, "loss": 1.02536182, "memory(GiB)": 142.32, "step": 69000, "train_speed(iter/s)": 0.286631 }, { "acc": 0.73351445, "epoch": 0.7719850625028836, "grad_norm": 6.5, "learning_rate": 7.21831924825815e-06, "loss": 1.05281944, "memory(GiB)": 142.32, "step": 69020, "train_speed(iter/s)": 0.286659 }, { "acc": 0.72271857, "epoch": 0.7722087614488421, "grad_norm": 6.28125, "learning_rate": 7.216661658042637e-06, "loss": 1.11391563, "memory(GiB)": 142.32, "step": 69040, "train_speed(iter/s)": 0.286688 }, { "acc": 0.74101906, "epoch": 0.7724324603948006, "grad_norm": 6.21875, "learning_rate": 7.2150037645561255e-06, "loss": 1.02478218, "memory(GiB)": 142.32, "step": 69060, "train_speed(iter/s)": 0.286718 }, { "acc": 0.73333874, "epoch": 0.7726561593407592, "grad_norm": 6.71875, "learning_rate": 7.213345568025438e-06, "loss": 1.05769424, "memory(GiB)": 142.32, "step": 69080, "train_speed(iter/s)": 0.286747 }, { "acc": 0.72960548, "epoch": 0.7728798582867177, "grad_norm": 7.15625, "learning_rate": 7.211687068677442e-06, "loss": 1.0650032, "memory(GiB)": 142.32, "step": 69100, "train_speed(iter/s)": 0.286777 }, { "acc": 0.72250247, "epoch": 0.7731035572326763, "grad_norm": 5.59375, "learning_rate": 7.210028266739043e-06, "loss": 1.11904659, "memory(GiB)": 142.32, "step": 69120, "train_speed(iter/s)": 0.286805 }, { "acc": 0.73484325, "epoch": 0.7733272561786348, "grad_norm": 5.34375, "learning_rate": 7.2083691624371885e-06, "loss": 1.06754761, "memory(GiB)": 142.32, "step": 69140, "train_speed(iter/s)": 0.286835 }, { "acc": 0.73610635, "epoch": 0.7735509551245934, "grad_norm": 4.90625, "learning_rate": 7.206709755998866e-06, "loss": 1.04639177, "memory(GiB)": 142.32, "step": 69160, "train_speed(iter/s)": 0.286865 }, { "acc": 0.72786942, "epoch": 0.7737746540705519, "grad_norm": 5.78125, "learning_rate": 7.20505004765111e-06, "loss": 1.07188625, "memory(GiB)": 142.32, "step": 69180, "train_speed(iter/s)": 0.286893 }, { "acc": 0.73415499, "epoch": 0.7739983530165104, "grad_norm": 5.59375, "learning_rate": 7.203390037620988e-06, "loss": 1.06319904, "memory(GiB)": 142.32, "step": 69200, "train_speed(iter/s)": 0.286923 }, { "acc": 0.73236327, "epoch": 0.774222051962469, "grad_norm": 5.5625, "learning_rate": 7.201729726135618e-06, "loss": 1.06996174, "memory(GiB)": 142.32, "step": 69220, "train_speed(iter/s)": 0.286952 }, { "acc": 0.73455267, "epoch": 0.7744457509084275, "grad_norm": 5.96875, "learning_rate": 7.20006911342215e-06, "loss": 1.05868368, "memory(GiB)": 142.32, "step": 69240, "train_speed(iter/s)": 0.286979 }, { "acc": 0.73902802, "epoch": 0.774669449854386, "grad_norm": 6.0625, "learning_rate": 7.19840819970778e-06, "loss": 1.0395093, "memory(GiB)": 142.32, "step": 69260, "train_speed(iter/s)": 0.287006 }, { "acc": 0.74043331, "epoch": 0.7748931488003445, "grad_norm": 6.4375, "learning_rate": 7.196746985219747e-06, "loss": 1.02849979, "memory(GiB)": 142.32, "step": 69280, "train_speed(iter/s)": 0.287036 }, { "acc": 0.73759451, "epoch": 0.7751168477463031, "grad_norm": 6.0625, "learning_rate": 7.1950854701853265e-06, "loss": 1.04446983, "memory(GiB)": 142.32, "step": 69300, "train_speed(iter/s)": 0.287066 }, { "acc": 0.73038406, "epoch": 0.7753405466922616, "grad_norm": 5.15625, "learning_rate": 7.193423654831841e-06, "loss": 1.07772303, "memory(GiB)": 142.32, "step": 69320, "train_speed(iter/s)": 0.287096 }, { "acc": 0.7362433, "epoch": 0.7755642456382201, "grad_norm": 5.96875, "learning_rate": 7.191761539386646e-06, "loss": 1.04826813, "memory(GiB)": 142.32, "step": 69340, "train_speed(iter/s)": 0.287127 }, { "acc": 0.72079597, "epoch": 0.7757879445841787, "grad_norm": 5.03125, "learning_rate": 7.190099124077146e-06, "loss": 1.10978289, "memory(GiB)": 142.32, "step": 69360, "train_speed(iter/s)": 0.287157 }, { "acc": 0.7285059, "epoch": 0.7760116435301372, "grad_norm": 6.78125, "learning_rate": 7.188436409130781e-06, "loss": 1.09153271, "memory(GiB)": 142.32, "step": 69380, "train_speed(iter/s)": 0.287183 }, { "acc": 0.73406744, "epoch": 0.7762353424760957, "grad_norm": 5.34375, "learning_rate": 7.186773394775036e-06, "loss": 1.06376076, "memory(GiB)": 142.32, "step": 69400, "train_speed(iter/s)": 0.287212 }, { "acc": 0.72893848, "epoch": 0.7764590414220542, "grad_norm": 6.0625, "learning_rate": 7.185110081237435e-06, "loss": 1.08603926, "memory(GiB)": 142.32, "step": 69420, "train_speed(iter/s)": 0.287242 }, { "acc": 0.72351933, "epoch": 0.7766827403680128, "grad_norm": 6.28125, "learning_rate": 7.183446468745542e-06, "loss": 1.11799889, "memory(GiB)": 142.32, "step": 69440, "train_speed(iter/s)": 0.28727 }, { "acc": 0.73027792, "epoch": 0.7769064393139713, "grad_norm": 5.4375, "learning_rate": 7.181782557526963e-06, "loss": 1.06833782, "memory(GiB)": 142.32, "step": 69460, "train_speed(iter/s)": 0.2873 }, { "acc": 0.72152109, "epoch": 0.7771301382599298, "grad_norm": 5.375, "learning_rate": 7.180118347809345e-06, "loss": 1.12754049, "memory(GiB)": 142.32, "step": 69480, "train_speed(iter/s)": 0.287327 }, { "acc": 0.7376627, "epoch": 0.7773538372058884, "grad_norm": 6.46875, "learning_rate": 7.178453839820378e-06, "loss": 1.05075855, "memory(GiB)": 142.32, "step": 69500, "train_speed(iter/s)": 0.287354 }, { "acc": 0.72844324, "epoch": 0.7775775361518469, "grad_norm": 5.4375, "learning_rate": 7.176789033787786e-06, "loss": 1.08226194, "memory(GiB)": 142.32, "step": 69520, "train_speed(iter/s)": 0.287383 }, { "acc": 0.73629217, "epoch": 0.7778012350978054, "grad_norm": 5.09375, "learning_rate": 7.175123929939343e-06, "loss": 1.03830433, "memory(GiB)": 142.32, "step": 69540, "train_speed(iter/s)": 0.287408 }, { "acc": 0.73515549, "epoch": 0.7780249340437639, "grad_norm": 7.5, "learning_rate": 7.173458528502855e-06, "loss": 1.04188614, "memory(GiB)": 142.32, "step": 69560, "train_speed(iter/s)": 0.287434 }, { "acc": 0.72091732, "epoch": 0.7782486329897225, "grad_norm": 6.25, "learning_rate": 7.1717928297061746e-06, "loss": 1.10812778, "memory(GiB)": 142.32, "step": 69580, "train_speed(iter/s)": 0.287464 }, { "acc": 0.73412242, "epoch": 0.778472331935681, "grad_norm": 5.375, "learning_rate": 7.170126833777194e-06, "loss": 1.05078783, "memory(GiB)": 142.32, "step": 69600, "train_speed(iter/s)": 0.287493 }, { "acc": 0.72713699, "epoch": 0.7786960308816395, "grad_norm": 6.75, "learning_rate": 7.1684605409438425e-06, "loss": 1.07169476, "memory(GiB)": 142.32, "step": 69620, "train_speed(iter/s)": 0.28752 }, { "acc": 0.72586942, "epoch": 0.778919729827598, "grad_norm": 5.0, "learning_rate": 7.166793951434097e-06, "loss": 1.08528938, "memory(GiB)": 142.32, "step": 69640, "train_speed(iter/s)": 0.287547 }, { "acc": 0.7420445, "epoch": 0.7791434287735566, "grad_norm": 5.40625, "learning_rate": 7.165127065475966e-06, "loss": 1.00892277, "memory(GiB)": 142.32, "step": 69660, "train_speed(iter/s)": 0.287577 }, { "acc": 0.74049697, "epoch": 0.7793671277195151, "grad_norm": 6.375, "learning_rate": 7.163459883297506e-06, "loss": 1.02821178, "memory(GiB)": 142.32, "step": 69680, "train_speed(iter/s)": 0.287604 }, { "acc": 0.72710614, "epoch": 0.7795908266654736, "grad_norm": 5.65625, "learning_rate": 7.161792405126812e-06, "loss": 1.0941823, "memory(GiB)": 142.32, "step": 69700, "train_speed(iter/s)": 0.287635 }, { "acc": 0.73360415, "epoch": 0.7798145256114322, "grad_norm": 8.0625, "learning_rate": 7.160124631192017e-06, "loss": 1.0717906, "memory(GiB)": 142.32, "step": 69720, "train_speed(iter/s)": 0.287663 }, { "acc": 0.72974977, "epoch": 0.7800382245573907, "grad_norm": 6.0, "learning_rate": 7.158456561721299e-06, "loss": 1.06352253, "memory(GiB)": 142.32, "step": 69740, "train_speed(iter/s)": 0.28769 }, { "acc": 0.73805423, "epoch": 0.7802619235033492, "grad_norm": 5.15625, "learning_rate": 7.15678819694287e-06, "loss": 1.03146648, "memory(GiB)": 142.32, "step": 69760, "train_speed(iter/s)": 0.287716 }, { "acc": 0.73034148, "epoch": 0.7804856224493077, "grad_norm": 5.28125, "learning_rate": 7.155119537084988e-06, "loss": 1.07375507, "memory(GiB)": 142.32, "step": 69780, "train_speed(iter/s)": 0.287741 }, { "acc": 0.74020195, "epoch": 0.7807093213952663, "grad_norm": 5.46875, "learning_rate": 7.1534505823759495e-06, "loss": 1.02720871, "memory(GiB)": 142.32, "step": 69800, "train_speed(iter/s)": 0.287767 }, { "acc": 0.7384357, "epoch": 0.7809330203412248, "grad_norm": 5.75, "learning_rate": 7.151781333044092e-06, "loss": 1.03900108, "memory(GiB)": 142.32, "step": 69820, "train_speed(iter/s)": 0.287798 }, { "acc": 0.72402768, "epoch": 0.7811567192871833, "grad_norm": 5.8125, "learning_rate": 7.150111789317793e-06, "loss": 1.10581913, "memory(GiB)": 142.32, "step": 69840, "train_speed(iter/s)": 0.287827 }, { "acc": 0.73473425, "epoch": 0.7813804182331419, "grad_norm": 5.0, "learning_rate": 7.1484419514254675e-06, "loss": 1.07616425, "memory(GiB)": 142.32, "step": 69860, "train_speed(iter/s)": 0.287856 }, { "acc": 0.74474611, "epoch": 0.7816041171791004, "grad_norm": 5.28125, "learning_rate": 7.1467718195955746e-06, "loss": 1.02240934, "memory(GiB)": 142.32, "step": 69880, "train_speed(iter/s)": 0.287883 }, { "acc": 0.73476076, "epoch": 0.7818278161250589, "grad_norm": 5.4375, "learning_rate": 7.145101394056614e-06, "loss": 1.05084038, "memory(GiB)": 142.32, "step": 69900, "train_speed(iter/s)": 0.287909 }, { "acc": 0.73958764, "epoch": 0.7820515150710174, "grad_norm": 6.75, "learning_rate": 7.143430675037121e-06, "loss": 1.04205456, "memory(GiB)": 142.32, "step": 69920, "train_speed(iter/s)": 0.287934 }, { "acc": 0.72909746, "epoch": 0.782275214016976, "grad_norm": 5.90625, "learning_rate": 7.141759662765676e-06, "loss": 1.08119259, "memory(GiB)": 142.32, "step": 69940, "train_speed(iter/s)": 0.287961 }, { "acc": 0.73090706, "epoch": 0.7824989129629345, "grad_norm": 6.1875, "learning_rate": 7.140088357470895e-06, "loss": 1.06875877, "memory(GiB)": 142.32, "step": 69960, "train_speed(iter/s)": 0.287987 }, { "acc": 0.73103704, "epoch": 0.782722611908893, "grad_norm": 6.0, "learning_rate": 7.138416759381438e-06, "loss": 1.0708065, "memory(GiB)": 142.32, "step": 69980, "train_speed(iter/s)": 0.288013 }, { "acc": 0.74700661, "epoch": 0.7829463108548516, "grad_norm": 5.34375, "learning_rate": 7.136744868726003e-06, "loss": 1.00342445, "memory(GiB)": 142.32, "step": 70000, "train_speed(iter/s)": 0.288038 }, { "epoch": 0.7829463108548516, "eval_acc": 0.694489402993845, "eval_loss": 1.079437255859375, "eval_runtime": 2338.6345, "eval_samples_per_second": 32.191, "eval_steps_per_second": 16.096, "step": 70000 }, { "acc": 0.73007007, "epoch": 0.7831700098008101, "grad_norm": 5.8125, "learning_rate": 7.135072685733329e-06, "loss": 1.07881222, "memory(GiB)": 142.32, "step": 70020, "train_speed(iter/s)": 0.285259 }, { "acc": 0.74295416, "epoch": 0.7833937087467686, "grad_norm": 5.625, "learning_rate": 7.1334002106321965e-06, "loss": 1.01159763, "memory(GiB)": 142.32, "step": 70040, "train_speed(iter/s)": 0.285287 }, { "acc": 0.73208113, "epoch": 0.7836174076927271, "grad_norm": 5.625, "learning_rate": 7.1317274436514195e-06, "loss": 1.04385357, "memory(GiB)": 142.32, "step": 70060, "train_speed(iter/s)": 0.285315 }, { "acc": 0.74044399, "epoch": 0.7838411066386857, "grad_norm": 5.34375, "learning_rate": 7.13005438501986e-06, "loss": 1.02259998, "memory(GiB)": 142.32, "step": 70080, "train_speed(iter/s)": 0.285346 }, { "acc": 0.73401423, "epoch": 0.7840648055846442, "grad_norm": 6.09375, "learning_rate": 7.128381034966415e-06, "loss": 1.06603003, "memory(GiB)": 142.32, "step": 70100, "train_speed(iter/s)": 0.285376 }, { "acc": 0.73158092, "epoch": 0.7842885045306027, "grad_norm": 6.375, "learning_rate": 7.126707393720023e-06, "loss": 1.08689613, "memory(GiB)": 142.32, "step": 70120, "train_speed(iter/s)": 0.285404 }, { "acc": 0.72150736, "epoch": 0.7845122034765613, "grad_norm": 6.03125, "learning_rate": 7.12503346150966e-06, "loss": 1.12691994, "memory(GiB)": 142.32, "step": 70140, "train_speed(iter/s)": 0.285434 }, { "acc": 0.7313098, "epoch": 0.7847359024225198, "grad_norm": 6.03125, "learning_rate": 7.123359238564349e-06, "loss": 1.0890295, "memory(GiB)": 142.32, "step": 70160, "train_speed(iter/s)": 0.285461 }, { "acc": 0.7244935, "epoch": 0.7849596013684783, "grad_norm": 6.5625, "learning_rate": 7.121684725113142e-06, "loss": 1.11090374, "memory(GiB)": 142.32, "step": 70180, "train_speed(iter/s)": 0.285488 }, { "acc": 0.73289814, "epoch": 0.7851833003144368, "grad_norm": 6.09375, "learning_rate": 7.120009921385138e-06, "loss": 1.05402088, "memory(GiB)": 142.32, "step": 70200, "train_speed(iter/s)": 0.285513 }, { "acc": 0.73317595, "epoch": 0.7854069992603954, "grad_norm": 5.53125, "learning_rate": 7.118334827609477e-06, "loss": 1.06439495, "memory(GiB)": 142.32, "step": 70220, "train_speed(iter/s)": 0.285542 }, { "acc": 0.73055964, "epoch": 0.7856306982063539, "grad_norm": 4.9375, "learning_rate": 7.116659444015333e-06, "loss": 1.06785069, "memory(GiB)": 142.32, "step": 70240, "train_speed(iter/s)": 0.285566 }, { "acc": 0.73101234, "epoch": 0.7858543971523124, "grad_norm": 4.75, "learning_rate": 7.1149837708319226e-06, "loss": 1.06672955, "memory(GiB)": 142.32, "step": 70260, "train_speed(iter/s)": 0.285595 }, { "acc": 0.73028297, "epoch": 0.786078096098271, "grad_norm": 5.5, "learning_rate": 7.1133078082885025e-06, "loss": 1.09285316, "memory(GiB)": 142.32, "step": 70280, "train_speed(iter/s)": 0.285625 }, { "acc": 0.73165216, "epoch": 0.7863017950442295, "grad_norm": 6.40625, "learning_rate": 7.111631556614367e-06, "loss": 1.06696739, "memory(GiB)": 142.32, "step": 70300, "train_speed(iter/s)": 0.285655 }, { "acc": 0.73335743, "epoch": 0.786525493990188, "grad_norm": 7.0, "learning_rate": 7.109955016038854e-06, "loss": 1.05462608, "memory(GiB)": 142.32, "step": 70320, "train_speed(iter/s)": 0.285685 }, { "acc": 0.72637835, "epoch": 0.7867491929361465, "grad_norm": 6.25, "learning_rate": 7.108278186791335e-06, "loss": 1.09469643, "memory(GiB)": 142.32, "step": 70340, "train_speed(iter/s)": 0.285714 }, { "acc": 0.73197107, "epoch": 0.7869728918821051, "grad_norm": 5.40625, "learning_rate": 7.1066010691012275e-06, "loss": 1.06624203, "memory(GiB)": 142.32, "step": 70360, "train_speed(iter/s)": 0.285741 }, { "acc": 0.74766245, "epoch": 0.7871965908280636, "grad_norm": 5.84375, "learning_rate": 7.1049236631979824e-06, "loss": 0.98742657, "memory(GiB)": 142.32, "step": 70380, "train_speed(iter/s)": 0.285772 }, { "acc": 0.74027662, "epoch": 0.7874202897740221, "grad_norm": 7.0, "learning_rate": 7.103245969311094e-06, "loss": 1.02620354, "memory(GiB)": 142.32, "step": 70400, "train_speed(iter/s)": 0.2858 }, { "acc": 0.73311782, "epoch": 0.7876439887199806, "grad_norm": 6.25, "learning_rate": 7.101567987670095e-06, "loss": 1.07898073, "memory(GiB)": 142.32, "step": 70420, "train_speed(iter/s)": 0.285831 }, { "acc": 0.72837906, "epoch": 0.7878676876659392, "grad_norm": 6.625, "learning_rate": 7.099889718504557e-06, "loss": 1.07209654, "memory(GiB)": 142.32, "step": 70440, "train_speed(iter/s)": 0.285854 }, { "acc": 0.7336278, "epoch": 0.7880913866118977, "grad_norm": 6.09375, "learning_rate": 7.098211162044092e-06, "loss": 1.05791817, "memory(GiB)": 142.32, "step": 70460, "train_speed(iter/s)": 0.28588 }, { "acc": 0.73184566, "epoch": 0.7883150855578562, "grad_norm": 6.09375, "learning_rate": 7.096532318518348e-06, "loss": 1.07457924, "memory(GiB)": 142.32, "step": 70480, "train_speed(iter/s)": 0.285909 }, { "acc": 0.72789202, "epoch": 0.7885387845038148, "grad_norm": 4.09375, "learning_rate": 7.094853188157017e-06, "loss": 1.09213305, "memory(GiB)": 142.32, "step": 70500, "train_speed(iter/s)": 0.28594 }, { "acc": 0.72153459, "epoch": 0.7887624834497733, "grad_norm": 4.84375, "learning_rate": 7.093173771189828e-06, "loss": 1.11157799, "memory(GiB)": 142.32, "step": 70520, "train_speed(iter/s)": 0.28597 }, { "acc": 0.73512778, "epoch": 0.7889861823957318, "grad_norm": 6.65625, "learning_rate": 7.091494067846547e-06, "loss": 1.05609379, "memory(GiB)": 142.32, "step": 70540, "train_speed(iter/s)": 0.285997 }, { "acc": 0.74763536, "epoch": 0.7892098813416903, "grad_norm": 5.53125, "learning_rate": 7.089814078356986e-06, "loss": 0.99615011, "memory(GiB)": 142.32, "step": 70560, "train_speed(iter/s)": 0.286021 }, { "acc": 0.74297762, "epoch": 0.7894335802876489, "grad_norm": 6.15625, "learning_rate": 7.088133802950987e-06, "loss": 1.02180557, "memory(GiB)": 142.32, "step": 70580, "train_speed(iter/s)": 0.286048 }, { "acc": 0.72500644, "epoch": 0.7896572792336074, "grad_norm": 4.46875, "learning_rate": 7.086453241858437e-06, "loss": 1.09156837, "memory(GiB)": 142.32, "step": 70600, "train_speed(iter/s)": 0.286075 }, { "acc": 0.73581018, "epoch": 0.7898809781795659, "grad_norm": 5.59375, "learning_rate": 7.084772395309263e-06, "loss": 1.03373451, "memory(GiB)": 142.32, "step": 70620, "train_speed(iter/s)": 0.286102 }, { "acc": 0.72000446, "epoch": 0.7901046771255245, "grad_norm": 6.1875, "learning_rate": 7.083091263533426e-06, "loss": 1.11763086, "memory(GiB)": 142.32, "step": 70640, "train_speed(iter/s)": 0.286127 }, { "acc": 0.73606949, "epoch": 0.790328376071483, "grad_norm": 7.03125, "learning_rate": 7.08140984676093e-06, "loss": 1.05073948, "memory(GiB)": 142.32, "step": 70660, "train_speed(iter/s)": 0.286155 }, { "acc": 0.7249248, "epoch": 0.7905520750174415, "grad_norm": 6.3125, "learning_rate": 7.079728145221818e-06, "loss": 1.09759388, "memory(GiB)": 142.32, "step": 70680, "train_speed(iter/s)": 0.286182 }, { "acc": 0.73266878, "epoch": 0.7907757739634, "grad_norm": 6.03125, "learning_rate": 7.078046159146168e-06, "loss": 1.07052088, "memory(GiB)": 142.32, "step": 70700, "train_speed(iter/s)": 0.28621 }, { "acc": 0.72647424, "epoch": 0.7909994729093586, "grad_norm": 6.34375, "learning_rate": 7.076363888764102e-06, "loss": 1.08676815, "memory(GiB)": 142.32, "step": 70720, "train_speed(iter/s)": 0.286239 }, { "acc": 0.72689033, "epoch": 0.7912231718553171, "grad_norm": 5.8125, "learning_rate": 7.074681334305778e-06, "loss": 1.09071445, "memory(GiB)": 142.32, "step": 70740, "train_speed(iter/s)": 0.286267 }, { "acc": 0.73967814, "epoch": 0.7914468708012756, "grad_norm": 6.0, "learning_rate": 7.072998496001392e-06, "loss": 1.03037853, "memory(GiB)": 142.32, "step": 70760, "train_speed(iter/s)": 0.286294 }, { "acc": 0.73815603, "epoch": 0.7916705697472342, "grad_norm": 6.84375, "learning_rate": 7.0713153740811835e-06, "loss": 1.03959255, "memory(GiB)": 142.32, "step": 70780, "train_speed(iter/s)": 0.286321 }, { "acc": 0.72671537, "epoch": 0.7918942686931927, "grad_norm": 7.21875, "learning_rate": 7.069631968775426e-06, "loss": 1.09239674, "memory(GiB)": 142.32, "step": 70800, "train_speed(iter/s)": 0.286348 }, { "acc": 0.73260365, "epoch": 0.7921179676391512, "grad_norm": 4.78125, "learning_rate": 7.067948280314432e-06, "loss": 1.06273994, "memory(GiB)": 142.32, "step": 70820, "train_speed(iter/s)": 0.286372 }, { "acc": 0.72529993, "epoch": 0.7923416665851097, "grad_norm": 7.1875, "learning_rate": 7.066264308928556e-06, "loss": 1.11260662, "memory(GiB)": 142.32, "step": 70840, "train_speed(iter/s)": 0.286401 }, { "acc": 0.72613425, "epoch": 0.7925653655310683, "grad_norm": 6.59375, "learning_rate": 7.064580054848188e-06, "loss": 1.10970516, "memory(GiB)": 142.32, "step": 70860, "train_speed(iter/s)": 0.286429 }, { "acc": 0.73021202, "epoch": 0.7927890644770268, "grad_norm": 7.53125, "learning_rate": 7.06289551830376e-06, "loss": 1.07956505, "memory(GiB)": 142.32, "step": 70880, "train_speed(iter/s)": 0.28646 }, { "acc": 0.73510313, "epoch": 0.7930127634229853, "grad_norm": 5.96875, "learning_rate": 7.061210699525739e-06, "loss": 1.05068874, "memory(GiB)": 142.32, "step": 70900, "train_speed(iter/s)": 0.286489 }, { "acc": 0.73303103, "epoch": 0.7932364623689439, "grad_norm": 6.625, "learning_rate": 7.059525598744633e-06, "loss": 1.05608864, "memory(GiB)": 142.32, "step": 70920, "train_speed(iter/s)": 0.286517 }, { "acc": 0.73837905, "epoch": 0.7934601613149024, "grad_norm": 5.75, "learning_rate": 7.057840216190988e-06, "loss": 1.02381077, "memory(GiB)": 142.32, "step": 70940, "train_speed(iter/s)": 0.286547 }, { "acc": 0.72126884, "epoch": 0.7936838602608609, "grad_norm": 5.6875, "learning_rate": 7.056154552095387e-06, "loss": 1.11996794, "memory(GiB)": 142.32, "step": 70960, "train_speed(iter/s)": 0.286573 }, { "acc": 0.73968143, "epoch": 0.7939075592068194, "grad_norm": 5.84375, "learning_rate": 7.054468606688456e-06, "loss": 1.0284214, "memory(GiB)": 142.32, "step": 70980, "train_speed(iter/s)": 0.286601 }, { "acc": 0.72998428, "epoch": 0.794131258152778, "grad_norm": 6.375, "learning_rate": 7.052782380200853e-06, "loss": 1.06957111, "memory(GiB)": 142.32, "step": 71000, "train_speed(iter/s)": 0.286626 }, { "acc": 0.73403759, "epoch": 0.7943549570987365, "grad_norm": 6.15625, "learning_rate": 7.0510958728632794e-06, "loss": 1.06380148, "memory(GiB)": 142.32, "step": 71020, "train_speed(iter/s)": 0.286653 }, { "acc": 0.72002859, "epoch": 0.794578656044695, "grad_norm": 6.34375, "learning_rate": 7.049409084906474e-06, "loss": 1.11610975, "memory(GiB)": 142.32, "step": 71040, "train_speed(iter/s)": 0.286683 }, { "acc": 0.72679286, "epoch": 0.7948023549906535, "grad_norm": 4.90625, "learning_rate": 7.0477220165612115e-06, "loss": 1.06906872, "memory(GiB)": 142.32, "step": 71060, "train_speed(iter/s)": 0.286711 }, { "acc": 0.73032598, "epoch": 0.7950260539366121, "grad_norm": 6.15625, "learning_rate": 7.0460346680583105e-06, "loss": 1.07245846, "memory(GiB)": 142.32, "step": 71080, "train_speed(iter/s)": 0.286739 }, { "acc": 0.73448601, "epoch": 0.7952497528825706, "grad_norm": 7.375, "learning_rate": 7.044347039628622e-06, "loss": 1.04832973, "memory(GiB)": 142.32, "step": 71100, "train_speed(iter/s)": 0.286767 }, { "acc": 0.75055399, "epoch": 0.7954734518285291, "grad_norm": 5.78125, "learning_rate": 7.042659131503037e-06, "loss": 0.97720652, "memory(GiB)": 142.32, "step": 71120, "train_speed(iter/s)": 0.286794 }, { "acc": 0.73539934, "epoch": 0.7956971507744877, "grad_norm": 5.84375, "learning_rate": 7.040970943912486e-06, "loss": 1.05036564, "memory(GiB)": 142.32, "step": 71140, "train_speed(iter/s)": 0.286825 }, { "acc": 0.73083649, "epoch": 0.7959208497204462, "grad_norm": 5.90625, "learning_rate": 7.03928247708794e-06, "loss": 1.06846027, "memory(GiB)": 142.32, "step": 71160, "train_speed(iter/s)": 0.286854 }, { "acc": 0.72500505, "epoch": 0.7961445486664047, "grad_norm": 5.625, "learning_rate": 7.037593731260401e-06, "loss": 1.10899982, "memory(GiB)": 142.32, "step": 71180, "train_speed(iter/s)": 0.286882 }, { "acc": 0.73924317, "epoch": 0.7963682476123632, "grad_norm": 6.8125, "learning_rate": 7.035904706660917e-06, "loss": 1.01885204, "memory(GiB)": 142.32, "step": 71200, "train_speed(iter/s)": 0.286909 }, { "acc": 0.73481278, "epoch": 0.7965919465583218, "grad_norm": 4.9375, "learning_rate": 7.034215403520569e-06, "loss": 1.05641289, "memory(GiB)": 142.32, "step": 71220, "train_speed(iter/s)": 0.286938 }, { "acc": 0.729142, "epoch": 0.7968156455042803, "grad_norm": 6.09375, "learning_rate": 7.032525822070477e-06, "loss": 1.07919168, "memory(GiB)": 142.32, "step": 71240, "train_speed(iter/s)": 0.286966 }, { "acc": 0.71841698, "epoch": 0.7970393444502388, "grad_norm": 5.5625, "learning_rate": 7.030835962541802e-06, "loss": 1.10867815, "memory(GiB)": 142.32, "step": 71260, "train_speed(iter/s)": 0.286993 }, { "acc": 0.73445587, "epoch": 0.7972630433961974, "grad_norm": 5.6875, "learning_rate": 7.0291458251657405e-06, "loss": 1.06020002, "memory(GiB)": 142.32, "step": 71280, "train_speed(iter/s)": 0.287019 }, { "acc": 0.72909937, "epoch": 0.7974867423421559, "grad_norm": 5.03125, "learning_rate": 7.027455410173528e-06, "loss": 1.07290316, "memory(GiB)": 142.32, "step": 71300, "train_speed(iter/s)": 0.28705 }, { "acc": 0.73222589, "epoch": 0.7977104412881144, "grad_norm": 5.375, "learning_rate": 7.025764717796435e-06, "loss": 1.06953735, "memory(GiB)": 142.32, "step": 71320, "train_speed(iter/s)": 0.287079 }, { "acc": 0.71878786, "epoch": 0.7979341402340729, "grad_norm": 6.8125, "learning_rate": 7.024073748265773e-06, "loss": 1.12018757, "memory(GiB)": 142.32, "step": 71340, "train_speed(iter/s)": 0.287109 }, { "acc": 0.74127021, "epoch": 0.7981578391800315, "grad_norm": 5.34375, "learning_rate": 7.022382501812892e-06, "loss": 1.02158165, "memory(GiB)": 142.32, "step": 71360, "train_speed(iter/s)": 0.287137 }, { "acc": 0.73722811, "epoch": 0.79838153812599, "grad_norm": 6.25, "learning_rate": 7.020690978669178e-06, "loss": 1.0421442, "memory(GiB)": 142.32, "step": 71380, "train_speed(iter/s)": 0.287162 }, { "acc": 0.73641768, "epoch": 0.7986052370719485, "grad_norm": 6.5, "learning_rate": 7.018999179066055e-06, "loss": 1.03757324, "memory(GiB)": 142.32, "step": 71400, "train_speed(iter/s)": 0.287189 }, { "acc": 0.72950888, "epoch": 0.798828936017907, "grad_norm": 5.5, "learning_rate": 7.0173071032349896e-06, "loss": 1.06968689, "memory(GiB)": 142.32, "step": 71420, "train_speed(iter/s)": 0.287217 }, { "acc": 0.71536312, "epoch": 0.7990526349638656, "grad_norm": 7.0, "learning_rate": 7.015614751407475e-06, "loss": 1.14144211, "memory(GiB)": 142.32, "step": 71440, "train_speed(iter/s)": 0.287244 }, { "acc": 0.7334168, "epoch": 0.7992763339098241, "grad_norm": 5.875, "learning_rate": 7.013922123815054e-06, "loss": 1.05394688, "memory(GiB)": 142.32, "step": 71460, "train_speed(iter/s)": 0.287267 }, { "acc": 0.73724194, "epoch": 0.7995000328557826, "grad_norm": 5.71875, "learning_rate": 7.0122292206893e-06, "loss": 1.03330708, "memory(GiB)": 142.32, "step": 71480, "train_speed(iter/s)": 0.287295 }, { "acc": 0.7342566, "epoch": 0.7997237318017412, "grad_norm": 5.34375, "learning_rate": 7.010536042261828e-06, "loss": 1.05785618, "memory(GiB)": 142.32, "step": 71500, "train_speed(iter/s)": 0.28732 }, { "acc": 0.73860044, "epoch": 0.7999474307476997, "grad_norm": 6.34375, "learning_rate": 7.0088425887642885e-06, "loss": 1.0279089, "memory(GiB)": 142.32, "step": 71520, "train_speed(iter/s)": 0.287348 }, { "acc": 0.74455242, "epoch": 0.8001711296936582, "grad_norm": 5.84375, "learning_rate": 7.00714886042837e-06, "loss": 1.00313606, "memory(GiB)": 142.32, "step": 71540, "train_speed(iter/s)": 0.287375 }, { "acc": 0.72871938, "epoch": 0.8003948286396168, "grad_norm": 7.96875, "learning_rate": 7.005454857485798e-06, "loss": 1.07126665, "memory(GiB)": 142.32, "step": 71560, "train_speed(iter/s)": 0.287401 }, { "acc": 0.72866545, "epoch": 0.8006185275855753, "grad_norm": 7.15625, "learning_rate": 7.003760580168337e-06, "loss": 1.073806, "memory(GiB)": 142.32, "step": 71580, "train_speed(iter/s)": 0.287428 }, { "acc": 0.72218332, "epoch": 0.8008422265315338, "grad_norm": 5.625, "learning_rate": 7.002066028707788e-06, "loss": 1.11639481, "memory(GiB)": 142.32, "step": 71600, "train_speed(iter/s)": 0.287455 }, { "acc": 0.73865843, "epoch": 0.8010659254774923, "grad_norm": 6.0625, "learning_rate": 7.0003712033359915e-06, "loss": 1.03824883, "memory(GiB)": 142.32, "step": 71620, "train_speed(iter/s)": 0.287483 }, { "acc": 0.73534737, "epoch": 0.801289624423451, "grad_norm": 4.4375, "learning_rate": 6.998676104284822e-06, "loss": 1.06105785, "memory(GiB)": 142.32, "step": 71640, "train_speed(iter/s)": 0.287509 }, { "acc": 0.72982244, "epoch": 0.8015133233694095, "grad_norm": 6.40625, "learning_rate": 6.996980731786193e-06, "loss": 1.09162388, "memory(GiB)": 142.32, "step": 71660, "train_speed(iter/s)": 0.287533 }, { "acc": 0.73547974, "epoch": 0.801737022315368, "grad_norm": 5.71875, "learning_rate": 6.995285086072056e-06, "loss": 1.04476843, "memory(GiB)": 142.32, "step": 71680, "train_speed(iter/s)": 0.287561 }, { "acc": 0.72292061, "epoch": 0.8019607212613266, "grad_norm": 6.34375, "learning_rate": 6.993589167374401e-06, "loss": 1.10741997, "memory(GiB)": 142.32, "step": 71700, "train_speed(iter/s)": 0.287588 }, { "acc": 0.74091129, "epoch": 0.8021844202072851, "grad_norm": 5.34375, "learning_rate": 6.991892975925253e-06, "loss": 1.01878452, "memory(GiB)": 142.32, "step": 71720, "train_speed(iter/s)": 0.287616 }, { "acc": 0.73047504, "epoch": 0.8024081191532436, "grad_norm": 5.78125, "learning_rate": 6.990196511956675e-06, "loss": 1.08770466, "memory(GiB)": 142.32, "step": 71740, "train_speed(iter/s)": 0.287643 }, { "acc": 0.74704547, "epoch": 0.8026318180992021, "grad_norm": 5.53125, "learning_rate": 6.988499775700768e-06, "loss": 1.0045166, "memory(GiB)": 142.32, "step": 71760, "train_speed(iter/s)": 0.287671 }, { "acc": 0.72560081, "epoch": 0.8028555170451607, "grad_norm": 5.875, "learning_rate": 6.986802767389669e-06, "loss": 1.09760418, "memory(GiB)": 142.32, "step": 71780, "train_speed(iter/s)": 0.287698 }, { "acc": 0.72150946, "epoch": 0.8030792159911192, "grad_norm": 6.71875, "learning_rate": 6.985105487255553e-06, "loss": 1.10927677, "memory(GiB)": 142.32, "step": 71800, "train_speed(iter/s)": 0.287724 }, { "acc": 0.72375937, "epoch": 0.8033029149370777, "grad_norm": 5.75, "learning_rate": 6.9834079355306335e-06, "loss": 1.12432518, "memory(GiB)": 142.32, "step": 71820, "train_speed(iter/s)": 0.287753 }, { "acc": 0.73898907, "epoch": 0.8035266138830363, "grad_norm": 7.5625, "learning_rate": 6.981710112447159e-06, "loss": 1.0326046, "memory(GiB)": 142.32, "step": 71840, "train_speed(iter/s)": 0.287778 }, { "acc": 0.73555059, "epoch": 0.8037503128289948, "grad_norm": 5.125, "learning_rate": 6.980012018237415e-06, "loss": 1.04214773, "memory(GiB)": 142.32, "step": 71860, "train_speed(iter/s)": 0.287807 }, { "acc": 0.73430939, "epoch": 0.8039740117749533, "grad_norm": 6.46875, "learning_rate": 6.978313653133728e-06, "loss": 1.04410343, "memory(GiB)": 142.32, "step": 71880, "train_speed(iter/s)": 0.28783 }, { "acc": 0.73726368, "epoch": 0.8041977107209118, "grad_norm": 6.375, "learning_rate": 6.976615017368455e-06, "loss": 1.04414444, "memory(GiB)": 142.32, "step": 71900, "train_speed(iter/s)": 0.287856 }, { "acc": 0.72568049, "epoch": 0.8044214096668704, "grad_norm": 5.375, "learning_rate": 6.9749161111739946e-06, "loss": 1.08730984, "memory(GiB)": 142.32, "step": 71920, "train_speed(iter/s)": 0.287883 }, { "acc": 0.73081026, "epoch": 0.8046451086128289, "grad_norm": 5.3125, "learning_rate": 6.973216934782785e-06, "loss": 1.07700233, "memory(GiB)": 142.32, "step": 71940, "train_speed(iter/s)": 0.287906 }, { "acc": 0.7263607, "epoch": 0.8048688075587874, "grad_norm": 6.53125, "learning_rate": 6.9715174884272925e-06, "loss": 1.0914959, "memory(GiB)": 142.32, "step": 71960, "train_speed(iter/s)": 0.287933 }, { "acc": 0.74224825, "epoch": 0.805092506504746, "grad_norm": 5.03125, "learning_rate": 6.969817772340028e-06, "loss": 1.02117615, "memory(GiB)": 142.32, "step": 71980, "train_speed(iter/s)": 0.287958 }, { "acc": 0.73352613, "epoch": 0.8053162054507045, "grad_norm": 5.59375, "learning_rate": 6.9681177867535385e-06, "loss": 1.06491566, "memory(GiB)": 142.32, "step": 72000, "train_speed(iter/s)": 0.287984 }, { "epoch": 0.8053162054507045, "eval_acc": 0.6946961038684466, "eval_loss": 1.0787394046783447, "eval_runtime": 2341.243, "eval_samples_per_second": 32.155, "eval_steps_per_second": 16.078, "step": 72000 }, { "acc": 0.73413644, "epoch": 0.805539904396663, "grad_norm": 5.78125, "learning_rate": 6.966417531900405e-06, "loss": 1.06720886, "memory(GiB)": 142.32, "step": 72020, "train_speed(iter/s)": 0.28528 }, { "acc": 0.73498621, "epoch": 0.8057636033426215, "grad_norm": 6.3125, "learning_rate": 6.964717008013245e-06, "loss": 1.05800209, "memory(GiB)": 142.32, "step": 72040, "train_speed(iter/s)": 0.285306 }, { "acc": 0.73836265, "epoch": 0.8059873022885801, "grad_norm": 6.03125, "learning_rate": 6.963016215324717e-06, "loss": 1.02204189, "memory(GiB)": 142.32, "step": 72060, "train_speed(iter/s)": 0.285337 }, { "acc": 0.73514338, "epoch": 0.8062110012345386, "grad_norm": 6.03125, "learning_rate": 6.961315154067513e-06, "loss": 1.04063416, "memory(GiB)": 142.32, "step": 72080, "train_speed(iter/s)": 0.285366 }, { "acc": 0.73046846, "epoch": 0.8064347001804971, "grad_norm": 5.375, "learning_rate": 6.959613824474361e-06, "loss": 1.07310352, "memory(GiB)": 142.32, "step": 72100, "train_speed(iter/s)": 0.285391 }, { "acc": 0.71925983, "epoch": 0.8066583991264556, "grad_norm": 5.21875, "learning_rate": 6.957912226778029e-06, "loss": 1.12459583, "memory(GiB)": 142.32, "step": 72120, "train_speed(iter/s)": 0.285418 }, { "acc": 0.74092026, "epoch": 0.8068820980724142, "grad_norm": 6.59375, "learning_rate": 6.9562103612113205e-06, "loss": 1.04268847, "memory(GiB)": 142.32, "step": 72140, "train_speed(iter/s)": 0.285445 }, { "acc": 0.7388361, "epoch": 0.8071057970183727, "grad_norm": 5.84375, "learning_rate": 6.9545082280070734e-06, "loss": 1.0469347, "memory(GiB)": 142.32, "step": 72160, "train_speed(iter/s)": 0.285468 }, { "acc": 0.73047447, "epoch": 0.8073294959643312, "grad_norm": 5.65625, "learning_rate": 6.952805827398164e-06, "loss": 1.07706604, "memory(GiB)": 142.32, "step": 72180, "train_speed(iter/s)": 0.285495 }, { "acc": 0.73261995, "epoch": 0.8075531949102898, "grad_norm": 6.28125, "learning_rate": 6.951103159617505e-06, "loss": 1.06434784, "memory(GiB)": 142.32, "step": 72200, "train_speed(iter/s)": 0.285521 }, { "acc": 0.72926836, "epoch": 0.8077768938562483, "grad_norm": 6.28125, "learning_rate": 6.949400224898045e-06, "loss": 1.07997169, "memory(GiB)": 142.32, "step": 72220, "train_speed(iter/s)": 0.285548 }, { "acc": 0.74062576, "epoch": 0.8080005928022068, "grad_norm": 6.03125, "learning_rate": 6.9476970234727734e-06, "loss": 1.03502531, "memory(GiB)": 142.32, "step": 72240, "train_speed(iter/s)": 0.285578 }, { "acc": 0.73061275, "epoch": 0.8082242917481653, "grad_norm": 6.34375, "learning_rate": 6.945993555574709e-06, "loss": 1.06091413, "memory(GiB)": 142.32, "step": 72260, "train_speed(iter/s)": 0.285603 }, { "acc": 0.73489728, "epoch": 0.8084479906941239, "grad_norm": 6.75, "learning_rate": 6.9442898214369114e-06, "loss": 1.05137119, "memory(GiB)": 142.32, "step": 72280, "train_speed(iter/s)": 0.285627 }, { "acc": 0.72740993, "epoch": 0.8086716896400824, "grad_norm": 5.5, "learning_rate": 6.942585821292476e-06, "loss": 1.08710499, "memory(GiB)": 142.32, "step": 72300, "train_speed(iter/s)": 0.285657 }, { "acc": 0.72196717, "epoch": 0.8088953885860409, "grad_norm": 5.15625, "learning_rate": 6.940881555374533e-06, "loss": 1.12206678, "memory(GiB)": 142.32, "step": 72320, "train_speed(iter/s)": 0.285684 }, { "acc": 0.73607607, "epoch": 0.8091190875319995, "grad_norm": 6.0625, "learning_rate": 6.939177023916255e-06, "loss": 1.05482864, "memory(GiB)": 142.32, "step": 72340, "train_speed(iter/s)": 0.285712 }, { "acc": 0.72257328, "epoch": 0.809342786477958, "grad_norm": 4.84375, "learning_rate": 6.93747222715084e-06, "loss": 1.09845104, "memory(GiB)": 142.32, "step": 72360, "train_speed(iter/s)": 0.285741 }, { "acc": 0.73458376, "epoch": 0.8095664854239165, "grad_norm": 5.46875, "learning_rate": 6.935767165311532e-06, "loss": 1.06300583, "memory(GiB)": 142.32, "step": 72380, "train_speed(iter/s)": 0.285765 }, { "acc": 0.72758064, "epoch": 0.809790184369875, "grad_norm": 5.375, "learning_rate": 6.934061838631607e-06, "loss": 1.08203163, "memory(GiB)": 142.32, "step": 72400, "train_speed(iter/s)": 0.285792 }, { "acc": 0.73317065, "epoch": 0.8100138833158336, "grad_norm": 6.375, "learning_rate": 6.932356247344379e-06, "loss": 1.07387619, "memory(GiB)": 142.32, "step": 72420, "train_speed(iter/s)": 0.285821 }, { "acc": 0.73113213, "epoch": 0.8102375822617921, "grad_norm": 6.4375, "learning_rate": 6.930650391683198e-06, "loss": 1.06983051, "memory(GiB)": 142.32, "step": 72440, "train_speed(iter/s)": 0.285848 }, { "acc": 0.74379969, "epoch": 0.8104612812077506, "grad_norm": 6.75, "learning_rate": 6.928944271881447e-06, "loss": 1.00472107, "memory(GiB)": 142.32, "step": 72460, "train_speed(iter/s)": 0.285875 }, { "acc": 0.74362278, "epoch": 0.8106849801537092, "grad_norm": 6.21875, "learning_rate": 6.927237888172549e-06, "loss": 1.01214218, "memory(GiB)": 142.32, "step": 72480, "train_speed(iter/s)": 0.285905 }, { "acc": 0.73686481, "epoch": 0.8109086790996677, "grad_norm": 5.09375, "learning_rate": 6.92553124078996e-06, "loss": 1.05687151, "memory(GiB)": 142.32, "step": 72500, "train_speed(iter/s)": 0.285934 }, { "acc": 0.73087769, "epoch": 0.8111323780456262, "grad_norm": 6.75, "learning_rate": 6.9238243299671746e-06, "loss": 1.08109188, "memory(GiB)": 142.32, "step": 72520, "train_speed(iter/s)": 0.285958 }, { "acc": 0.73493233, "epoch": 0.8113560769915847, "grad_norm": 6.125, "learning_rate": 6.922117155937725e-06, "loss": 1.05385303, "memory(GiB)": 142.32, "step": 72540, "train_speed(iter/s)": 0.285984 }, { "acc": 0.72951241, "epoch": 0.8115797759375433, "grad_norm": 6.8125, "learning_rate": 6.920409718935175e-06, "loss": 1.09483356, "memory(GiB)": 142.32, "step": 72560, "train_speed(iter/s)": 0.286011 }, { "acc": 0.73781199, "epoch": 0.8118034748835018, "grad_norm": 6.53125, "learning_rate": 6.918702019193125e-06, "loss": 1.04890633, "memory(GiB)": 142.32, "step": 72580, "train_speed(iter/s)": 0.28604 }, { "acc": 0.73397551, "epoch": 0.8120271738294603, "grad_norm": 5.625, "learning_rate": 6.916994056945215e-06, "loss": 1.06870909, "memory(GiB)": 142.32, "step": 72600, "train_speed(iter/s)": 0.286065 }, { "acc": 0.74049711, "epoch": 0.8122508727754189, "grad_norm": 5.84375, "learning_rate": 6.915285832425117e-06, "loss": 1.01188011, "memory(GiB)": 142.32, "step": 72620, "train_speed(iter/s)": 0.286093 }, { "acc": 0.73931522, "epoch": 0.8124745717213774, "grad_norm": 6.375, "learning_rate": 6.913577345866542e-06, "loss": 1.02189064, "memory(GiB)": 142.32, "step": 72640, "train_speed(iter/s)": 0.28612 }, { "acc": 0.73585873, "epoch": 0.8126982706673359, "grad_norm": 5.59375, "learning_rate": 6.911868597503236e-06, "loss": 1.05028982, "memory(GiB)": 142.32, "step": 72660, "train_speed(iter/s)": 0.286147 }, { "acc": 0.73129692, "epoch": 0.8129219696132944, "grad_norm": 5.34375, "learning_rate": 6.910159587568978e-06, "loss": 1.06277714, "memory(GiB)": 142.32, "step": 72680, "train_speed(iter/s)": 0.286173 }, { "acc": 0.73464079, "epoch": 0.813145668559253, "grad_norm": 7.21875, "learning_rate": 6.908450316297586e-06, "loss": 1.06854534, "memory(GiB)": 142.32, "step": 72700, "train_speed(iter/s)": 0.286202 }, { "acc": 0.72350841, "epoch": 0.8133693675052115, "grad_norm": 4.90625, "learning_rate": 6.9067407839229115e-06, "loss": 1.09556456, "memory(GiB)": 142.32, "step": 72720, "train_speed(iter/s)": 0.286231 }, { "acc": 0.74871588, "epoch": 0.81359306645117, "grad_norm": 6.46875, "learning_rate": 6.905030990678845e-06, "loss": 0.99172668, "memory(GiB)": 142.32, "step": 72740, "train_speed(iter/s)": 0.286258 }, { "acc": 0.73892469, "epoch": 0.8138167653971285, "grad_norm": 9.0, "learning_rate": 6.9033209367993104e-06, "loss": 1.04251785, "memory(GiB)": 142.32, "step": 72760, "train_speed(iter/s)": 0.286286 }, { "acc": 0.7245513, "epoch": 0.8140404643430871, "grad_norm": 7.40625, "learning_rate": 6.901610622518266e-06, "loss": 1.1002346, "memory(GiB)": 142.32, "step": 72780, "train_speed(iter/s)": 0.286316 }, { "acc": 0.739604, "epoch": 0.8142641632890456, "grad_norm": 8.4375, "learning_rate": 6.899900048069709e-06, "loss": 1.03309984, "memory(GiB)": 142.32, "step": 72800, "train_speed(iter/s)": 0.286342 }, { "acc": 0.73306112, "epoch": 0.8144878622350041, "grad_norm": 6.09375, "learning_rate": 6.89818921368767e-06, "loss": 1.06551437, "memory(GiB)": 142.32, "step": 72820, "train_speed(iter/s)": 0.286367 }, { "acc": 0.74118567, "epoch": 0.8147115611809627, "grad_norm": 6.28125, "learning_rate": 6.896478119606214e-06, "loss": 1.01554403, "memory(GiB)": 142.32, "step": 72840, "train_speed(iter/s)": 0.286394 }, { "acc": 0.73224859, "epoch": 0.8149352601269212, "grad_norm": 5.8125, "learning_rate": 6.894766766059444e-06, "loss": 1.06031456, "memory(GiB)": 142.32, "step": 72860, "train_speed(iter/s)": 0.28642 }, { "acc": 0.72504234, "epoch": 0.8151589590728797, "grad_norm": 5.9375, "learning_rate": 6.893055153281499e-06, "loss": 1.098456, "memory(GiB)": 142.32, "step": 72880, "train_speed(iter/s)": 0.286445 }, { "acc": 0.73863893, "epoch": 0.8153826580188382, "grad_norm": 8.125, "learning_rate": 6.8913432815065504e-06, "loss": 1.04181013, "memory(GiB)": 142.32, "step": 72900, "train_speed(iter/s)": 0.286472 }, { "acc": 0.74078836, "epoch": 0.8156063569647968, "grad_norm": 5.4375, "learning_rate": 6.889631150968807e-06, "loss": 1.01254282, "memory(GiB)": 142.32, "step": 72920, "train_speed(iter/s)": 0.286498 }, { "acc": 0.74297614, "epoch": 0.8158300559107553, "grad_norm": 5.3125, "learning_rate": 6.887918761902515e-06, "loss": 1.01734867, "memory(GiB)": 142.32, "step": 72940, "train_speed(iter/s)": 0.286525 }, { "acc": 0.74781208, "epoch": 0.8160537548567138, "grad_norm": 5.5, "learning_rate": 6.886206114541951e-06, "loss": 0.99385777, "memory(GiB)": 142.32, "step": 72960, "train_speed(iter/s)": 0.28655 }, { "acc": 0.72527018, "epoch": 0.8162774538026724, "grad_norm": 5.5625, "learning_rate": 6.88449320912143e-06, "loss": 1.08561325, "memory(GiB)": 142.32, "step": 72980, "train_speed(iter/s)": 0.286575 }, { "acc": 0.71810007, "epoch": 0.8165011527486309, "grad_norm": 5.875, "learning_rate": 6.882780045875302e-06, "loss": 1.14072819, "memory(GiB)": 142.32, "step": 73000, "train_speed(iter/s)": 0.286601 }, { "acc": 0.73585081, "epoch": 0.8167248516945894, "grad_norm": 4.71875, "learning_rate": 6.8810666250379534e-06, "loss": 1.06123524, "memory(GiB)": 142.32, "step": 73020, "train_speed(iter/s)": 0.286625 }, { "acc": 0.72510986, "epoch": 0.8169485506405479, "grad_norm": 7.1875, "learning_rate": 6.879352946843802e-06, "loss": 1.10109711, "memory(GiB)": 142.32, "step": 73040, "train_speed(iter/s)": 0.286652 }, { "acc": 0.73126659, "epoch": 0.8171722495865065, "grad_norm": 5.40625, "learning_rate": 6.877639011527309e-06, "loss": 1.06606407, "memory(GiB)": 142.32, "step": 73060, "train_speed(iter/s)": 0.286679 }, { "acc": 0.75064926, "epoch": 0.817395948532465, "grad_norm": 6.125, "learning_rate": 6.8759248193229584e-06, "loss": 0.97869883, "memory(GiB)": 142.32, "step": 73080, "train_speed(iter/s)": 0.286707 }, { "acc": 0.73227386, "epoch": 0.8176196474784235, "grad_norm": 6.5, "learning_rate": 6.874210370465281e-06, "loss": 1.06969509, "memory(GiB)": 142.32, "step": 73100, "train_speed(iter/s)": 0.286733 }, { "acc": 0.73119745, "epoch": 0.8178433464243821, "grad_norm": 5.65625, "learning_rate": 6.8724956651888355e-06, "loss": 1.06474361, "memory(GiB)": 142.32, "step": 73120, "train_speed(iter/s)": 0.28676 }, { "acc": 0.73431287, "epoch": 0.8180670453703406, "grad_norm": 5.21875, "learning_rate": 6.870780703728219e-06, "loss": 1.05463352, "memory(GiB)": 142.32, "step": 73140, "train_speed(iter/s)": 0.286787 }, { "acc": 0.72895489, "epoch": 0.8182907443162991, "grad_norm": 6.4375, "learning_rate": 6.869065486318063e-06, "loss": 1.07180805, "memory(GiB)": 142.32, "step": 73160, "train_speed(iter/s)": 0.286812 }, { "acc": 0.73464823, "epoch": 0.8185144432622576, "grad_norm": 5.875, "learning_rate": 6.867350013193032e-06, "loss": 1.05731192, "memory(GiB)": 142.32, "step": 73180, "train_speed(iter/s)": 0.286838 }, { "acc": 0.73078947, "epoch": 0.8187381422082162, "grad_norm": 8.375, "learning_rate": 6.86563428458783e-06, "loss": 1.07115326, "memory(GiB)": 142.32, "step": 73200, "train_speed(iter/s)": 0.286866 }, { "acc": 0.73291245, "epoch": 0.8189618411541747, "grad_norm": 5.0, "learning_rate": 6.863918300737191e-06, "loss": 1.06864967, "memory(GiB)": 142.32, "step": 73220, "train_speed(iter/s)": 0.286892 }, { "acc": 0.73914175, "epoch": 0.8191855401001332, "grad_norm": 7.25, "learning_rate": 6.862202061875888e-06, "loss": 1.02911949, "memory(GiB)": 142.32, "step": 73240, "train_speed(iter/s)": 0.286917 }, { "acc": 0.72974062, "epoch": 0.8194092390460918, "grad_norm": 6.65625, "learning_rate": 6.860485568238725e-06, "loss": 1.0602684, "memory(GiB)": 142.32, "step": 73260, "train_speed(iter/s)": 0.286945 }, { "acc": 0.72914791, "epoch": 0.8196329379920503, "grad_norm": 6.40625, "learning_rate": 6.858768820060544e-06, "loss": 1.06656799, "memory(GiB)": 142.32, "step": 73280, "train_speed(iter/s)": 0.286971 }, { "acc": 0.73033266, "epoch": 0.8198566369380088, "grad_norm": 6.78125, "learning_rate": 6.857051817576221e-06, "loss": 1.07750587, "memory(GiB)": 142.32, "step": 73300, "train_speed(iter/s)": 0.286997 }, { "acc": 0.73390694, "epoch": 0.8200803358839673, "grad_norm": 6.21875, "learning_rate": 6.855334561020666e-06, "loss": 1.06834974, "memory(GiB)": 142.32, "step": 73320, "train_speed(iter/s)": 0.287027 }, { "acc": 0.73704338, "epoch": 0.8203040348299259, "grad_norm": 6.40625, "learning_rate": 6.8536170506288226e-06, "loss": 1.02900391, "memory(GiB)": 142.32, "step": 73340, "train_speed(iter/s)": 0.287053 }, { "acc": 0.73088856, "epoch": 0.8205277337758844, "grad_norm": 7.28125, "learning_rate": 6.851899286635673e-06, "loss": 1.07193604, "memory(GiB)": 142.32, "step": 73360, "train_speed(iter/s)": 0.287078 }, { "acc": 0.7419054, "epoch": 0.8207514327218429, "grad_norm": 5.75, "learning_rate": 6.8501812692762325e-06, "loss": 1.03110504, "memory(GiB)": 142.32, "step": 73380, "train_speed(iter/s)": 0.287105 }, { "acc": 0.7340848, "epoch": 0.8209751316678014, "grad_norm": 6.875, "learning_rate": 6.848462998785549e-06, "loss": 1.07441711, "memory(GiB)": 142.32, "step": 73400, "train_speed(iter/s)": 0.287133 }, { "acc": 0.73428106, "epoch": 0.82119883061376, "grad_norm": 5.5, "learning_rate": 6.846744475398706e-06, "loss": 1.05353794, "memory(GiB)": 142.32, "step": 73420, "train_speed(iter/s)": 0.287157 }, { "acc": 0.72560101, "epoch": 0.8214225295597185, "grad_norm": 7.65625, "learning_rate": 6.845025699350822e-06, "loss": 1.09640446, "memory(GiB)": 142.32, "step": 73440, "train_speed(iter/s)": 0.287184 }, { "acc": 0.73420353, "epoch": 0.821646228505677, "grad_norm": 4.5625, "learning_rate": 6.843306670877053e-06, "loss": 1.0613884, "memory(GiB)": 142.32, "step": 73460, "train_speed(iter/s)": 0.28721 }, { "acc": 0.73837509, "epoch": 0.8218699274516356, "grad_norm": 6.15625, "learning_rate": 6.841587390212583e-06, "loss": 1.03519993, "memory(GiB)": 142.32, "step": 73480, "train_speed(iter/s)": 0.287238 }, { "acc": 0.72748013, "epoch": 0.8220936263975941, "grad_norm": 6.46875, "learning_rate": 6.839867857592634e-06, "loss": 1.10700474, "memory(GiB)": 142.32, "step": 73500, "train_speed(iter/s)": 0.287263 }, { "acc": 0.7332675, "epoch": 0.8223173253435526, "grad_norm": 5.21875, "learning_rate": 6.8381480732524675e-06, "loss": 1.05994167, "memory(GiB)": 142.32, "step": 73520, "train_speed(iter/s)": 0.287291 }, { "acc": 0.72860327, "epoch": 0.8225410242895111, "grad_norm": 6.375, "learning_rate": 6.83642803742737e-06, "loss": 1.07096901, "memory(GiB)": 142.32, "step": 73540, "train_speed(iter/s)": 0.287317 }, { "acc": 0.72992296, "epoch": 0.8227647232354697, "grad_norm": 6.90625, "learning_rate": 6.834707750352667e-06, "loss": 1.07790966, "memory(GiB)": 142.32, "step": 73560, "train_speed(iter/s)": 0.287343 }, { "acc": 0.73884945, "epoch": 0.8229884221814282, "grad_norm": 6.78125, "learning_rate": 6.832987212263722e-06, "loss": 1.0338829, "memory(GiB)": 142.32, "step": 73580, "train_speed(iter/s)": 0.287368 }, { "acc": 0.73596492, "epoch": 0.8232121211273867, "grad_norm": 6.25, "learning_rate": 6.831266423395926e-06, "loss": 1.04718914, "memory(GiB)": 142.32, "step": 73600, "train_speed(iter/s)": 0.287396 }, { "acc": 0.73808379, "epoch": 0.8234358200733453, "grad_norm": 5.84375, "learning_rate": 6.829545383984708e-06, "loss": 1.0327033, "memory(GiB)": 142.32, "step": 73620, "train_speed(iter/s)": 0.287424 }, { "acc": 0.73516283, "epoch": 0.8236595190193038, "grad_norm": 6.25, "learning_rate": 6.827824094265532e-06, "loss": 1.04815102, "memory(GiB)": 142.32, "step": 73640, "train_speed(iter/s)": 0.287452 }, { "acc": 0.7291615, "epoch": 0.8238832179652623, "grad_norm": 5.65625, "learning_rate": 6.826102554473895e-06, "loss": 1.06743813, "memory(GiB)": 142.32, "step": 73660, "train_speed(iter/s)": 0.287479 }, { "acc": 0.74090414, "epoch": 0.8241069169112208, "grad_norm": 5.8125, "learning_rate": 6.8243807648453265e-06, "loss": 1.03112774, "memory(GiB)": 142.32, "step": 73680, "train_speed(iter/s)": 0.287506 }, { "acc": 0.74123545, "epoch": 0.8243306158571794, "grad_norm": 5.96875, "learning_rate": 6.822658725615394e-06, "loss": 1.02602711, "memory(GiB)": 142.32, "step": 73700, "train_speed(iter/s)": 0.28753 }, { "acc": 0.73810291, "epoch": 0.8245543148031379, "grad_norm": 6.75, "learning_rate": 6.820936437019694e-06, "loss": 1.03986177, "memory(GiB)": 142.32, "step": 73720, "train_speed(iter/s)": 0.287557 }, { "acc": 0.72792091, "epoch": 0.8247780137490964, "grad_norm": 6.90625, "learning_rate": 6.819213899293864e-06, "loss": 1.08198452, "memory(GiB)": 142.32, "step": 73740, "train_speed(iter/s)": 0.287584 }, { "acc": 0.73658533, "epoch": 0.825001712695055, "grad_norm": 6.40625, "learning_rate": 6.8174911126735685e-06, "loss": 1.04704151, "memory(GiB)": 142.32, "step": 73760, "train_speed(iter/s)": 0.287605 }, { "acc": 0.73712511, "epoch": 0.8252254116410135, "grad_norm": 6.125, "learning_rate": 6.815768077394511e-06, "loss": 1.0585393, "memory(GiB)": 142.32, "step": 73780, "train_speed(iter/s)": 0.287634 }, { "acc": 0.73274727, "epoch": 0.825449110586972, "grad_norm": 7.21875, "learning_rate": 6.81404479369243e-06, "loss": 1.04634361, "memory(GiB)": 142.32, "step": 73800, "train_speed(iter/s)": 0.287663 }, { "acc": 0.73217402, "epoch": 0.8256728095329305, "grad_norm": 6.34375, "learning_rate": 6.81232126180309e-06, "loss": 1.06409731, "memory(GiB)": 142.32, "step": 73820, "train_speed(iter/s)": 0.287686 }, { "acc": 0.73444166, "epoch": 0.8258965084788891, "grad_norm": 6.125, "learning_rate": 6.8105974819622965e-06, "loss": 1.06789188, "memory(GiB)": 142.32, "step": 73840, "train_speed(iter/s)": 0.287711 }, { "acc": 0.7308085, "epoch": 0.8261202074248476, "grad_norm": 6.0625, "learning_rate": 6.8088734544058895e-06, "loss": 1.06118526, "memory(GiB)": 142.32, "step": 73860, "train_speed(iter/s)": 0.287734 }, { "acc": 0.73725557, "epoch": 0.8263439063708061, "grad_norm": 5.59375, "learning_rate": 6.8071491793697386e-06, "loss": 1.03748722, "memory(GiB)": 142.32, "step": 73880, "train_speed(iter/s)": 0.287759 }, { "acc": 0.73705645, "epoch": 0.8265676053167647, "grad_norm": 6.875, "learning_rate": 6.805424657089752e-06, "loss": 1.03756351, "memory(GiB)": 142.32, "step": 73900, "train_speed(iter/s)": 0.287787 }, { "acc": 0.73905864, "epoch": 0.8267913042627232, "grad_norm": 5.875, "learning_rate": 6.803699887801865e-06, "loss": 1.03441448, "memory(GiB)": 142.32, "step": 73920, "train_speed(iter/s)": 0.287811 }, { "acc": 0.74053497, "epoch": 0.8270150032086817, "grad_norm": 5.4375, "learning_rate": 6.801974871742052e-06, "loss": 1.04373264, "memory(GiB)": 142.32, "step": 73940, "train_speed(iter/s)": 0.287836 }, { "acc": 0.72399492, "epoch": 0.8272387021546402, "grad_norm": 7.0, "learning_rate": 6.800249609146321e-06, "loss": 1.1024065, "memory(GiB)": 142.32, "step": 73960, "train_speed(iter/s)": 0.287863 }, { "acc": 0.73611631, "epoch": 0.8274624011005988, "grad_norm": 6.8125, "learning_rate": 6.7985241002507116e-06, "loss": 1.03520451, "memory(GiB)": 142.32, "step": 73980, "train_speed(iter/s)": 0.287887 }, { "acc": 0.72891626, "epoch": 0.8276861000465573, "grad_norm": 5.40625, "learning_rate": 6.7967983452913e-06, "loss": 1.08526449, "memory(GiB)": 142.32, "step": 74000, "train_speed(iter/s)": 0.287906 }, { "epoch": 0.8276861000465573, "eval_acc": 0.694910396427925, "eval_loss": 1.0776705741882324, "eval_runtime": 2340.3192, "eval_samples_per_second": 32.168, "eval_steps_per_second": 16.084, "step": 74000 }, { "acc": 0.74405394, "epoch": 0.8279097989925158, "grad_norm": 5.375, "learning_rate": 6.79507234450419e-06, "loss": 1.02242126, "memory(GiB)": 142.32, "step": 74020, "train_speed(iter/s)": 0.285269 }, { "acc": 0.73230152, "epoch": 0.8281334979384743, "grad_norm": 6.5, "learning_rate": 6.793346098125527e-06, "loss": 1.0418314, "memory(GiB)": 142.32, "step": 74040, "train_speed(iter/s)": 0.285294 }, { "acc": 0.73530579, "epoch": 0.8283571968844329, "grad_norm": 7.0, "learning_rate": 6.791619606391486e-06, "loss": 1.06433172, "memory(GiB)": 142.32, "step": 74060, "train_speed(iter/s)": 0.285314 }, { "acc": 0.72018261, "epoch": 0.8285808958303914, "grad_norm": 5.46875, "learning_rate": 6.789892869538273e-06, "loss": 1.12133217, "memory(GiB)": 142.32, "step": 74080, "train_speed(iter/s)": 0.28534 }, { "acc": 0.73092737, "epoch": 0.8288045947763499, "grad_norm": 5.9375, "learning_rate": 6.7881658878021335e-06, "loss": 1.05674229, "memory(GiB)": 142.32, "step": 74100, "train_speed(iter/s)": 0.285364 }, { "acc": 0.73215771, "epoch": 0.8290282937223085, "grad_norm": 6.28125, "learning_rate": 6.786438661419341e-06, "loss": 1.07395535, "memory(GiB)": 142.32, "step": 74120, "train_speed(iter/s)": 0.28539 }, { "acc": 0.73465366, "epoch": 0.8292519926682671, "grad_norm": 6.0, "learning_rate": 6.784711190626205e-06, "loss": 1.0593235, "memory(GiB)": 142.32, "step": 74140, "train_speed(iter/s)": 0.285416 }, { "acc": 0.74901385, "epoch": 0.8294756916142256, "grad_norm": 6.34375, "learning_rate": 6.78298347565907e-06, "loss": 0.97727127, "memory(GiB)": 142.32, "step": 74160, "train_speed(iter/s)": 0.285442 }, { "acc": 0.74204292, "epoch": 0.8296993905601842, "grad_norm": 6.1875, "learning_rate": 6.7812555167543106e-06, "loss": 1.0319006, "memory(GiB)": 142.32, "step": 74180, "train_speed(iter/s)": 0.285466 }, { "acc": 0.71262722, "epoch": 0.8299230895061427, "grad_norm": 5.71875, "learning_rate": 6.7795273141483365e-06, "loss": 1.14443531, "memory(GiB)": 142.32, "step": 74200, "train_speed(iter/s)": 0.285493 }, { "acc": 0.73260517, "epoch": 0.8301467884521012, "grad_norm": 6.15625, "learning_rate": 6.777798868077589e-06, "loss": 1.06582069, "memory(GiB)": 142.32, "step": 74220, "train_speed(iter/s)": 0.285516 }, { "acc": 0.73064814, "epoch": 0.8303704873980597, "grad_norm": 6.75, "learning_rate": 6.776070178778549e-06, "loss": 1.0773035, "memory(GiB)": 142.32, "step": 74240, "train_speed(iter/s)": 0.285544 }, { "acc": 0.72779198, "epoch": 0.8305941863440183, "grad_norm": 5.6875, "learning_rate": 6.774341246487719e-06, "loss": 1.0787447, "memory(GiB)": 142.32, "step": 74260, "train_speed(iter/s)": 0.285572 }, { "acc": 0.73100967, "epoch": 0.8308178852899768, "grad_norm": 6.40625, "learning_rate": 6.772612071441647e-06, "loss": 1.09018211, "memory(GiB)": 142.32, "step": 74280, "train_speed(iter/s)": 0.285599 }, { "acc": 0.73942494, "epoch": 0.8310415842359353, "grad_norm": 6.0625, "learning_rate": 6.7708826538769064e-06, "loss": 1.03715534, "memory(GiB)": 142.32, "step": 74300, "train_speed(iter/s)": 0.285627 }, { "acc": 0.71978302, "epoch": 0.8312652831818939, "grad_norm": 6.84375, "learning_rate": 6.7691529940301085e-06, "loss": 1.11504364, "memory(GiB)": 142.32, "step": 74320, "train_speed(iter/s)": 0.285652 }, { "acc": 0.72316084, "epoch": 0.8314889821278524, "grad_norm": 6.09375, "learning_rate": 6.767423092137894e-06, "loss": 1.12018127, "memory(GiB)": 142.32, "step": 74340, "train_speed(iter/s)": 0.285678 }, { "acc": 0.72883301, "epoch": 0.8317126810738109, "grad_norm": 6.4375, "learning_rate": 6.765692948436936e-06, "loss": 1.07146301, "memory(GiB)": 142.32, "step": 74360, "train_speed(iter/s)": 0.285705 }, { "acc": 0.72998571, "epoch": 0.8319363800197694, "grad_norm": 5.96875, "learning_rate": 6.763962563163946e-06, "loss": 1.07730103, "memory(GiB)": 142.32, "step": 74380, "train_speed(iter/s)": 0.285731 }, { "acc": 0.72903214, "epoch": 0.832160078965728, "grad_norm": 5.75, "learning_rate": 6.7622319365556655e-06, "loss": 1.07698355, "memory(GiB)": 142.32, "step": 74400, "train_speed(iter/s)": 0.285755 }, { "acc": 0.72669721, "epoch": 0.8323837779116865, "grad_norm": 5.625, "learning_rate": 6.760501068848867e-06, "loss": 1.0919055, "memory(GiB)": 142.32, "step": 74420, "train_speed(iter/s)": 0.285782 }, { "acc": 0.73369427, "epoch": 0.832607476857645, "grad_norm": 6.34375, "learning_rate": 6.75876996028036e-06, "loss": 1.06190338, "memory(GiB)": 142.32, "step": 74440, "train_speed(iter/s)": 0.285809 }, { "acc": 0.72702074, "epoch": 0.8328311758036036, "grad_norm": 5.0, "learning_rate": 6.757038611086984e-06, "loss": 1.08769255, "memory(GiB)": 142.32, "step": 74460, "train_speed(iter/s)": 0.285836 }, { "acc": 0.72949476, "epoch": 0.8330548747495621, "grad_norm": 5.9375, "learning_rate": 6.75530702150561e-06, "loss": 1.08515034, "memory(GiB)": 142.32, "step": 74480, "train_speed(iter/s)": 0.285861 }, { "acc": 0.72560415, "epoch": 0.8332785736955206, "grad_norm": 7.125, "learning_rate": 6.7535751917731474e-06, "loss": 1.08934793, "memory(GiB)": 142.32, "step": 74500, "train_speed(iter/s)": 0.285887 }, { "acc": 0.73089666, "epoch": 0.8335022726414791, "grad_norm": 5.21875, "learning_rate": 6.751843122126534e-06, "loss": 1.07585907, "memory(GiB)": 142.32, "step": 74520, "train_speed(iter/s)": 0.285916 }, { "acc": 0.71578279, "epoch": 0.8337259715874377, "grad_norm": 6.5625, "learning_rate": 6.750110812802744e-06, "loss": 1.13828745, "memory(GiB)": 142.32, "step": 74540, "train_speed(iter/s)": 0.285942 }, { "acc": 0.73769112, "epoch": 0.8339496705333962, "grad_norm": 5.8125, "learning_rate": 6.7483782640387776e-06, "loss": 1.0469038, "memory(GiB)": 142.32, "step": 74560, "train_speed(iter/s)": 0.285969 }, { "acc": 0.73080835, "epoch": 0.8341733694793547, "grad_norm": 7.1875, "learning_rate": 6.746645476071675e-06, "loss": 1.07409935, "memory(GiB)": 142.32, "step": 74580, "train_speed(iter/s)": 0.285996 }, { "acc": 0.7494801, "epoch": 0.8343970684253132, "grad_norm": 6.125, "learning_rate": 6.744912449138505e-06, "loss": 1.00779343, "memory(GiB)": 142.32, "step": 74600, "train_speed(iter/s)": 0.286024 }, { "acc": 0.72444334, "epoch": 0.8346207673712718, "grad_norm": 6.25, "learning_rate": 6.743179183476373e-06, "loss": 1.11154633, "memory(GiB)": 142.32, "step": 74620, "train_speed(iter/s)": 0.286051 }, { "acc": 0.73588028, "epoch": 0.8348444663172303, "grad_norm": 5.3125, "learning_rate": 6.7414456793224135e-06, "loss": 1.05302391, "memory(GiB)": 142.32, "step": 74640, "train_speed(iter/s)": 0.286077 }, { "acc": 0.73054237, "epoch": 0.8350681652631888, "grad_norm": 5.1875, "learning_rate": 6.739711936913793e-06, "loss": 1.07638836, "memory(GiB)": 142.32, "step": 74660, "train_speed(iter/s)": 0.286105 }, { "acc": 0.73943992, "epoch": 0.8352918642091474, "grad_norm": 7.84375, "learning_rate": 6.737977956487714e-06, "loss": 1.04084225, "memory(GiB)": 142.32, "step": 74680, "train_speed(iter/s)": 0.28613 }, { "acc": 0.73351398, "epoch": 0.8355155631551059, "grad_norm": 5.96875, "learning_rate": 6.736243738281407e-06, "loss": 1.05837631, "memory(GiB)": 142.32, "step": 74700, "train_speed(iter/s)": 0.286157 }, { "acc": 0.73451314, "epoch": 0.8357392621010644, "grad_norm": 6.5, "learning_rate": 6.734509282532141e-06, "loss": 1.06355, "memory(GiB)": 142.32, "step": 74720, "train_speed(iter/s)": 0.286185 }, { "acc": 0.73008366, "epoch": 0.835962961047023, "grad_norm": 6.8125, "learning_rate": 6.732774589477216e-06, "loss": 1.08070354, "memory(GiB)": 142.32, "step": 74740, "train_speed(iter/s)": 0.286212 }, { "acc": 0.73545971, "epoch": 0.8361866599929815, "grad_norm": 6.96875, "learning_rate": 6.731039659353958e-06, "loss": 1.03734217, "memory(GiB)": 142.32, "step": 74760, "train_speed(iter/s)": 0.286239 }, { "acc": 0.73927612, "epoch": 0.83641035893894, "grad_norm": 7.53125, "learning_rate": 6.729304492399731e-06, "loss": 1.02766333, "memory(GiB)": 142.32, "step": 74780, "train_speed(iter/s)": 0.286268 }, { "acc": 0.73246651, "epoch": 0.8366340578848985, "grad_norm": 5.875, "learning_rate": 6.727569088851933e-06, "loss": 1.07262535, "memory(GiB)": 142.32, "step": 74800, "train_speed(iter/s)": 0.286296 }, { "acc": 0.72621422, "epoch": 0.8368577568308571, "grad_norm": 5.875, "learning_rate": 6.725833448947992e-06, "loss": 1.09189825, "memory(GiB)": 142.32, "step": 74820, "train_speed(iter/s)": 0.286325 }, { "acc": 0.72871122, "epoch": 0.8370814557768156, "grad_norm": 5.6875, "learning_rate": 6.724097572925366e-06, "loss": 1.0863554, "memory(GiB)": 142.32, "step": 74840, "train_speed(iter/s)": 0.286352 }, { "acc": 0.7336463, "epoch": 0.8373051547227741, "grad_norm": 6.03125, "learning_rate": 6.72236146102155e-06, "loss": 1.0554903, "memory(GiB)": 142.32, "step": 74860, "train_speed(iter/s)": 0.286378 }, { "acc": 0.72609348, "epoch": 0.8375288536687326, "grad_norm": 5.78125, "learning_rate": 6.720625113474069e-06, "loss": 1.08584003, "memory(GiB)": 142.32, "step": 74880, "train_speed(iter/s)": 0.286404 }, { "acc": 0.72719374, "epoch": 0.8377525526146912, "grad_norm": 5.21875, "learning_rate": 6.718888530520476e-06, "loss": 1.08308535, "memory(GiB)": 142.32, "step": 74900, "train_speed(iter/s)": 0.286424 }, { "acc": 0.73619547, "epoch": 0.8379762515606497, "grad_norm": 5.59375, "learning_rate": 6.7171517123983655e-06, "loss": 1.03234005, "memory(GiB)": 142.32, "step": 74920, "train_speed(iter/s)": 0.286451 }, { "acc": 0.72711616, "epoch": 0.8381999505066082, "grad_norm": 5.3125, "learning_rate": 6.7154146593453565e-06, "loss": 1.09422245, "memory(GiB)": 142.32, "step": 74940, "train_speed(iter/s)": 0.286476 }, { "acc": 0.73207731, "epoch": 0.8384236494525668, "grad_norm": 6.8125, "learning_rate": 6.713677371599103e-06, "loss": 1.05531912, "memory(GiB)": 142.32, "step": 74960, "train_speed(iter/s)": 0.286505 }, { "acc": 0.72870412, "epoch": 0.8386473483985253, "grad_norm": 6.96875, "learning_rate": 6.711939849397291e-06, "loss": 1.0653635, "memory(GiB)": 142.32, "step": 74980, "train_speed(iter/s)": 0.286534 }, { "acc": 0.73635492, "epoch": 0.8388710473444838, "grad_norm": 6.53125, "learning_rate": 6.710202092977638e-06, "loss": 1.06019354, "memory(GiB)": 142.32, "step": 75000, "train_speed(iter/s)": 0.28656 }, { "acc": 0.72733793, "epoch": 0.8390947462904423, "grad_norm": 6.84375, "learning_rate": 6.708464102577895e-06, "loss": 1.08917313, "memory(GiB)": 142.32, "step": 75020, "train_speed(iter/s)": 0.286588 }, { "acc": 0.72689238, "epoch": 0.8393184452364009, "grad_norm": 5.96875, "learning_rate": 6.706725878435842e-06, "loss": 1.09857864, "memory(GiB)": 142.32, "step": 75040, "train_speed(iter/s)": 0.286616 }, { "acc": 0.73113375, "epoch": 0.8395421441823594, "grad_norm": 6.15625, "learning_rate": 6.7049874207892965e-06, "loss": 1.08225651, "memory(GiB)": 142.32, "step": 75060, "train_speed(iter/s)": 0.286641 }, { "acc": 0.72050638, "epoch": 0.8397658431283179, "grad_norm": 5.9375, "learning_rate": 6.7032487298761e-06, "loss": 1.1154356, "memory(GiB)": 142.32, "step": 75080, "train_speed(iter/s)": 0.286666 }, { "acc": 0.73112111, "epoch": 0.8399895420742765, "grad_norm": 5.625, "learning_rate": 6.7015098059341325e-06, "loss": 1.07465515, "memory(GiB)": 142.32, "step": 75100, "train_speed(iter/s)": 0.286693 }, { "acc": 0.73602276, "epoch": 0.840213241020235, "grad_norm": 5.875, "learning_rate": 6.699770649201304e-06, "loss": 1.06005859, "memory(GiB)": 142.32, "step": 75120, "train_speed(iter/s)": 0.286716 }, { "acc": 0.73276396, "epoch": 0.8404369399661935, "grad_norm": 6.84375, "learning_rate": 6.698031259915554e-06, "loss": 1.06381721, "memory(GiB)": 142.32, "step": 75140, "train_speed(iter/s)": 0.286743 }, { "acc": 0.74994283, "epoch": 0.840660638912152, "grad_norm": 5.90625, "learning_rate": 6.696291638314859e-06, "loss": 0.97585945, "memory(GiB)": 142.32, "step": 75160, "train_speed(iter/s)": 0.286771 }, { "acc": 0.72341862, "epoch": 0.8408843378581106, "grad_norm": 6.0625, "learning_rate": 6.694551784637222e-06, "loss": 1.11284637, "memory(GiB)": 142.32, "step": 75180, "train_speed(iter/s)": 0.286796 }, { "acc": 0.74044862, "epoch": 0.8411080368040691, "grad_norm": 4.96875, "learning_rate": 6.692811699120678e-06, "loss": 1.03215513, "memory(GiB)": 142.32, "step": 75200, "train_speed(iter/s)": 0.286822 }, { "acc": 0.73101215, "epoch": 0.8413317357500276, "grad_norm": 5.9375, "learning_rate": 6.6910713820033e-06, "loss": 1.04803171, "memory(GiB)": 142.32, "step": 75220, "train_speed(iter/s)": 0.28685 }, { "acc": 0.73398743, "epoch": 0.8415554346959861, "grad_norm": 5.40625, "learning_rate": 6.689330833523184e-06, "loss": 1.08466549, "memory(GiB)": 142.32, "step": 75240, "train_speed(iter/s)": 0.286875 }, { "acc": 0.73935442, "epoch": 0.8417791336419447, "grad_norm": 6.15625, "learning_rate": 6.687590053918467e-06, "loss": 1.0120554, "memory(GiB)": 142.32, "step": 75260, "train_speed(iter/s)": 0.286902 }, { "acc": 0.72321205, "epoch": 0.8420028325879032, "grad_norm": 6.09375, "learning_rate": 6.6858490434273075e-06, "loss": 1.10466232, "memory(GiB)": 142.32, "step": 75280, "train_speed(iter/s)": 0.286928 }, { "acc": 0.7293313, "epoch": 0.8422265315338617, "grad_norm": 5.875, "learning_rate": 6.6841078022879025e-06, "loss": 1.0781311, "memory(GiB)": 142.32, "step": 75300, "train_speed(iter/s)": 0.286953 }, { "acc": 0.73281002, "epoch": 0.8424502304798203, "grad_norm": 6.1875, "learning_rate": 6.6823663307384774e-06, "loss": 1.07354488, "memory(GiB)": 142.32, "step": 75320, "train_speed(iter/s)": 0.286976 }, { "acc": 0.74047298, "epoch": 0.8426739294257788, "grad_norm": 7.0, "learning_rate": 6.680624629017294e-06, "loss": 1.02313938, "memory(GiB)": 142.32, "step": 75340, "train_speed(iter/s)": 0.287002 }, { "acc": 0.74018116, "epoch": 0.8428976283717373, "grad_norm": 6.53125, "learning_rate": 6.6788826973626385e-06, "loss": 1.0347559, "memory(GiB)": 142.32, "step": 75360, "train_speed(iter/s)": 0.287029 }, { "acc": 0.74410729, "epoch": 0.8431213273176958, "grad_norm": 6.375, "learning_rate": 6.677140536012834e-06, "loss": 1.01891451, "memory(GiB)": 142.32, "step": 75380, "train_speed(iter/s)": 0.287052 }, { "acc": 0.72972765, "epoch": 0.8433450262636544, "grad_norm": 5.65625, "learning_rate": 6.675398145206231e-06, "loss": 1.08615818, "memory(GiB)": 142.32, "step": 75400, "train_speed(iter/s)": 0.287079 }, { "acc": 0.73233223, "epoch": 0.8435687252096129, "grad_norm": 5.78125, "learning_rate": 6.6736555251812164e-06, "loss": 1.08496513, "memory(GiB)": 142.32, "step": 75420, "train_speed(iter/s)": 0.287103 }, { "acc": 0.73387532, "epoch": 0.8437924241555714, "grad_norm": 5.90625, "learning_rate": 6.671912676176202e-06, "loss": 1.05036106, "memory(GiB)": 142.32, "step": 75440, "train_speed(iter/s)": 0.28713 }, { "acc": 0.72227135, "epoch": 0.84401612310153, "grad_norm": 6.96875, "learning_rate": 6.670169598429638e-06, "loss": 1.11233692, "memory(GiB)": 142.32, "step": 75460, "train_speed(iter/s)": 0.287154 }, { "acc": 0.73328562, "epoch": 0.8442398220474885, "grad_norm": 6.0, "learning_rate": 6.668426292180002e-06, "loss": 1.06036968, "memory(GiB)": 142.32, "step": 75480, "train_speed(iter/s)": 0.287181 }, { "acc": 0.73270025, "epoch": 0.844463520993447, "grad_norm": 8.5625, "learning_rate": 6.6666827576657985e-06, "loss": 1.06720886, "memory(GiB)": 142.32, "step": 75500, "train_speed(iter/s)": 0.287205 }, { "acc": 0.72916784, "epoch": 0.8446872199394055, "grad_norm": 7.3125, "learning_rate": 6.664938995125573e-06, "loss": 1.08526707, "memory(GiB)": 142.32, "step": 75520, "train_speed(iter/s)": 0.287232 }, { "acc": 0.73427486, "epoch": 0.8449109188853641, "grad_norm": 6.15625, "learning_rate": 6.663195004797896e-06, "loss": 1.06247005, "memory(GiB)": 142.32, "step": 75540, "train_speed(iter/s)": 0.287259 }, { "acc": 0.73353405, "epoch": 0.8451346178313226, "grad_norm": 7.53125, "learning_rate": 6.661450786921368e-06, "loss": 1.06088276, "memory(GiB)": 142.32, "step": 75560, "train_speed(iter/s)": 0.287286 }, { "acc": 0.73854551, "epoch": 0.8453583167772811, "grad_norm": 6.0625, "learning_rate": 6.6597063417346266e-06, "loss": 1.02307549, "memory(GiB)": 142.32, "step": 75580, "train_speed(iter/s)": 0.287316 }, { "acc": 0.73129349, "epoch": 0.8455820157232397, "grad_norm": 6.1875, "learning_rate": 6.6579616694763334e-06, "loss": 1.06637516, "memory(GiB)": 142.32, "step": 75600, "train_speed(iter/s)": 0.287341 }, { "acc": 0.72577238, "epoch": 0.8458057146691982, "grad_norm": 6.3125, "learning_rate": 6.656216770385188e-06, "loss": 1.11564178, "memory(GiB)": 142.32, "step": 75620, "train_speed(iter/s)": 0.287367 }, { "acc": 0.72927742, "epoch": 0.8460294136151567, "grad_norm": 5.9375, "learning_rate": 6.654471644699914e-06, "loss": 1.09193039, "memory(GiB)": 142.32, "step": 75640, "train_speed(iter/s)": 0.287388 }, { "acc": 0.74207969, "epoch": 0.8462531125611152, "grad_norm": 6.8125, "learning_rate": 6.652726292659272e-06, "loss": 1.03470249, "memory(GiB)": 142.32, "step": 75660, "train_speed(iter/s)": 0.287413 }, { "acc": 0.7200335, "epoch": 0.8464768115070738, "grad_norm": 6.53125, "learning_rate": 6.650980714502051e-06, "loss": 1.12093029, "memory(GiB)": 142.32, "step": 75680, "train_speed(iter/s)": 0.287437 }, { "acc": 0.73344288, "epoch": 0.8467005104530323, "grad_norm": 7.46875, "learning_rate": 6.649234910467068e-06, "loss": 1.04115906, "memory(GiB)": 142.32, "step": 75700, "train_speed(iter/s)": 0.287462 }, { "acc": 0.73510604, "epoch": 0.8469242093989908, "grad_norm": 5.6875, "learning_rate": 6.647488880793178e-06, "loss": 1.06869965, "memory(GiB)": 142.32, "step": 75720, "train_speed(iter/s)": 0.287489 }, { "acc": 0.73774586, "epoch": 0.8471479083449494, "grad_norm": 6.65625, "learning_rate": 6.64574262571926e-06, "loss": 1.04792843, "memory(GiB)": 142.32, "step": 75740, "train_speed(iter/s)": 0.287515 }, { "acc": 0.72091436, "epoch": 0.8473716072909079, "grad_norm": 5.625, "learning_rate": 6.6439961454842285e-06, "loss": 1.11553059, "memory(GiB)": 142.32, "step": 75760, "train_speed(iter/s)": 0.287542 }, { "acc": 0.73580685, "epoch": 0.8475953062368664, "grad_norm": 5.5625, "learning_rate": 6.642249440327026e-06, "loss": 1.05445824, "memory(GiB)": 142.32, "step": 75780, "train_speed(iter/s)": 0.287569 }, { "acc": 0.72618723, "epoch": 0.8478190051828249, "grad_norm": 6.53125, "learning_rate": 6.640502510486628e-06, "loss": 1.09784002, "memory(GiB)": 142.32, "step": 75800, "train_speed(iter/s)": 0.287594 }, { "acc": 0.74025974, "epoch": 0.8480427041287835, "grad_norm": 6.125, "learning_rate": 6.638755356202037e-06, "loss": 1.04073944, "memory(GiB)": 142.32, "step": 75820, "train_speed(iter/s)": 0.287621 }, { "acc": 0.73903146, "epoch": 0.848266403074742, "grad_norm": 5.375, "learning_rate": 6.637007977712291e-06, "loss": 1.04003696, "memory(GiB)": 142.32, "step": 75840, "train_speed(iter/s)": 0.287646 }, { "acc": 0.73316092, "epoch": 0.8484901020207005, "grad_norm": 6.84375, "learning_rate": 6.635260375256453e-06, "loss": 1.05622082, "memory(GiB)": 142.32, "step": 75860, "train_speed(iter/s)": 0.28767 }, { "acc": 0.73079295, "epoch": 0.848713800966659, "grad_norm": 6.03125, "learning_rate": 6.633512549073626e-06, "loss": 1.07281017, "memory(GiB)": 142.32, "step": 75880, "train_speed(iter/s)": 0.287695 }, { "acc": 0.73162212, "epoch": 0.8489374999126176, "grad_norm": 5.65625, "learning_rate": 6.631764499402932e-06, "loss": 1.07347355, "memory(GiB)": 142.32, "step": 75900, "train_speed(iter/s)": 0.287717 }, { "acc": 0.72791853, "epoch": 0.8491611988585761, "grad_norm": 7.4375, "learning_rate": 6.630016226483531e-06, "loss": 1.07747383, "memory(GiB)": 142.32, "step": 75920, "train_speed(iter/s)": 0.287745 }, { "acc": 0.73241243, "epoch": 0.8493848978045346, "grad_norm": 6.03125, "learning_rate": 6.628267730554613e-06, "loss": 1.06993685, "memory(GiB)": 142.32, "step": 75940, "train_speed(iter/s)": 0.287771 }, { "acc": 0.73457098, "epoch": 0.8496085967504932, "grad_norm": 6.34375, "learning_rate": 6.6265190118553945e-06, "loss": 1.05782108, "memory(GiB)": 142.32, "step": 75960, "train_speed(iter/s)": 0.287793 }, { "acc": 0.74031363, "epoch": 0.8498322956964517, "grad_norm": 6.5, "learning_rate": 6.624770070625129e-06, "loss": 1.03087072, "memory(GiB)": 142.32, "step": 75980, "train_speed(iter/s)": 0.287819 }, { "acc": 0.7308095, "epoch": 0.8500559946424102, "grad_norm": 6.03125, "learning_rate": 6.623020907103093e-06, "loss": 1.07363501, "memory(GiB)": 142.32, "step": 76000, "train_speed(iter/s)": 0.287845 }, { "epoch": 0.8500559946424102, "eval_acc": 0.6950122433172469, "eval_loss": 1.0770922899246216, "eval_runtime": 2338.6487, "eval_samples_per_second": 32.191, "eval_steps_per_second": 16.096, "step": 76000 }, { "acc": 0.74063148, "epoch": 0.8502796935883687, "grad_norm": 6.1875, "learning_rate": 6.6212715215286e-06, "loss": 1.01140461, "memory(GiB)": 142.32, "step": 76020, "train_speed(iter/s)": 0.285289 }, { "acc": 0.73697853, "epoch": 0.8505033925343273, "grad_norm": 6.40625, "learning_rate": 6.619521914140988e-06, "loss": 1.03542662, "memory(GiB)": 142.32, "step": 76040, "train_speed(iter/s)": 0.285315 }, { "acc": 0.73571634, "epoch": 0.8507270914802858, "grad_norm": 6.4375, "learning_rate": 6.61777208517963e-06, "loss": 1.04974174, "memory(GiB)": 142.32, "step": 76060, "train_speed(iter/s)": 0.285341 }, { "acc": 0.73235159, "epoch": 0.8509507904262443, "grad_norm": 5.21875, "learning_rate": 6.616022034883928e-06, "loss": 1.06917362, "memory(GiB)": 142.32, "step": 76080, "train_speed(iter/s)": 0.285367 }, { "acc": 0.73692412, "epoch": 0.8511744893722029, "grad_norm": 6.65625, "learning_rate": 6.614271763493314e-06, "loss": 1.0494915, "memory(GiB)": 142.32, "step": 76100, "train_speed(iter/s)": 0.28539 }, { "acc": 0.72857552, "epoch": 0.8513981883181614, "grad_norm": 6.84375, "learning_rate": 6.6125212712472485e-06, "loss": 1.07597542, "memory(GiB)": 142.32, "step": 76120, "train_speed(iter/s)": 0.285417 }, { "acc": 0.75047064, "epoch": 0.8516218872641199, "grad_norm": 5.75, "learning_rate": 6.610770558385224e-06, "loss": 0.97787704, "memory(GiB)": 142.32, "step": 76140, "train_speed(iter/s)": 0.285441 }, { "acc": 0.7263833, "epoch": 0.8518455862100784, "grad_norm": 5.5, "learning_rate": 6.6090196251467655e-06, "loss": 1.0814167, "memory(GiB)": 142.32, "step": 76160, "train_speed(iter/s)": 0.285468 }, { "acc": 0.73880873, "epoch": 0.852069285156037, "grad_norm": 6.75, "learning_rate": 6.607268471771424e-06, "loss": 1.03187094, "memory(GiB)": 142.32, "step": 76180, "train_speed(iter/s)": 0.285496 }, { "acc": 0.72683372, "epoch": 0.8522929841019955, "grad_norm": 6.09375, "learning_rate": 6.605517098498783e-06, "loss": 1.10871944, "memory(GiB)": 142.32, "step": 76200, "train_speed(iter/s)": 0.285519 }, { "acc": 0.73125105, "epoch": 0.852516683047954, "grad_norm": 5.46875, "learning_rate": 6.603765505568452e-06, "loss": 1.06195345, "memory(GiB)": 142.32, "step": 76220, "train_speed(iter/s)": 0.285547 }, { "acc": 0.73080988, "epoch": 0.8527403819939126, "grad_norm": 6.5625, "learning_rate": 6.6020136932200796e-06, "loss": 1.07555542, "memory(GiB)": 142.32, "step": 76240, "train_speed(iter/s)": 0.285571 }, { "acc": 0.73065491, "epoch": 0.8529640809398711, "grad_norm": 6.75, "learning_rate": 6.6002616616933345e-06, "loss": 1.06573038, "memory(GiB)": 142.32, "step": 76260, "train_speed(iter/s)": 0.285599 }, { "acc": 0.72015886, "epoch": 0.8531877798858296, "grad_norm": 6.9375, "learning_rate": 6.5985094112279204e-06, "loss": 1.11562939, "memory(GiB)": 142.32, "step": 76280, "train_speed(iter/s)": 0.285627 }, { "acc": 0.72559032, "epoch": 0.8534114788317881, "grad_norm": 5.53125, "learning_rate": 6.596756942063573e-06, "loss": 1.11067619, "memory(GiB)": 142.32, "step": 76300, "train_speed(iter/s)": 0.285653 }, { "acc": 0.73144002, "epoch": 0.8536351777777467, "grad_norm": 6.4375, "learning_rate": 6.595004254440051e-06, "loss": 1.07360239, "memory(GiB)": 142.32, "step": 76320, "train_speed(iter/s)": 0.285678 }, { "acc": 0.73974295, "epoch": 0.8538588767237052, "grad_norm": 5.75, "learning_rate": 6.593251348597151e-06, "loss": 1.03628635, "memory(GiB)": 142.32, "step": 76340, "train_speed(iter/s)": 0.285704 }, { "acc": 0.73792038, "epoch": 0.8540825756696637, "grad_norm": 4.96875, "learning_rate": 6.591498224774692e-06, "loss": 1.04471674, "memory(GiB)": 142.32, "step": 76360, "train_speed(iter/s)": 0.285732 }, { "acc": 0.7340518, "epoch": 0.8543062746156223, "grad_norm": 6.34375, "learning_rate": 6.589744883212529e-06, "loss": 1.06209812, "memory(GiB)": 142.32, "step": 76380, "train_speed(iter/s)": 0.285759 }, { "acc": 0.73555427, "epoch": 0.8545299735615808, "grad_norm": 5.59375, "learning_rate": 6.587991324150544e-06, "loss": 1.04006491, "memory(GiB)": 142.32, "step": 76400, "train_speed(iter/s)": 0.285786 }, { "acc": 0.72969275, "epoch": 0.8547536725075393, "grad_norm": 5.625, "learning_rate": 6.586237547828647e-06, "loss": 1.06884775, "memory(GiB)": 142.32, "step": 76420, "train_speed(iter/s)": 0.285811 }, { "acc": 0.72094421, "epoch": 0.8549773714534978, "grad_norm": 5.125, "learning_rate": 6.58448355448678e-06, "loss": 1.11306772, "memory(GiB)": 142.32, "step": 76440, "train_speed(iter/s)": 0.285836 }, { "acc": 0.72699184, "epoch": 0.8552010703994564, "grad_norm": 6.3125, "learning_rate": 6.5827293443649164e-06, "loss": 1.09580173, "memory(GiB)": 142.32, "step": 76460, "train_speed(iter/s)": 0.285861 }, { "acc": 0.72671127, "epoch": 0.8554247693454149, "grad_norm": 5.21875, "learning_rate": 6.580974917703056e-06, "loss": 1.09335957, "memory(GiB)": 142.32, "step": 76480, "train_speed(iter/s)": 0.285885 }, { "acc": 0.72763042, "epoch": 0.8556484682913734, "grad_norm": 6.4375, "learning_rate": 6.57922027474123e-06, "loss": 1.09283142, "memory(GiB)": 142.32, "step": 76500, "train_speed(iter/s)": 0.285908 }, { "acc": 0.73148394, "epoch": 0.855872167237332, "grad_norm": 5.0625, "learning_rate": 6.577465415719498e-06, "loss": 1.07872658, "memory(GiB)": 142.32, "step": 76520, "train_speed(iter/s)": 0.285932 }, { "acc": 0.73813229, "epoch": 0.8560958661832905, "grad_norm": 5.625, "learning_rate": 6.57571034087795e-06, "loss": 1.03010578, "memory(GiB)": 142.32, "step": 76540, "train_speed(iter/s)": 0.285957 }, { "acc": 0.72977085, "epoch": 0.856319565129249, "grad_norm": 5.1875, "learning_rate": 6.573955050456704e-06, "loss": 1.09408474, "memory(GiB)": 142.32, "step": 76560, "train_speed(iter/s)": 0.285984 }, { "acc": 0.72836595, "epoch": 0.8565432640752075, "grad_norm": 6.21875, "learning_rate": 6.572199544695912e-06, "loss": 1.09851618, "memory(GiB)": 142.32, "step": 76580, "train_speed(iter/s)": 0.286012 }, { "acc": 0.73403888, "epoch": 0.8567669630211661, "grad_norm": 6.9375, "learning_rate": 6.5704438238357505e-06, "loss": 1.05210009, "memory(GiB)": 142.32, "step": 76600, "train_speed(iter/s)": 0.286038 }, { "acc": 0.73553591, "epoch": 0.8569906619671246, "grad_norm": 5.46875, "learning_rate": 6.568687888116426e-06, "loss": 1.04325008, "memory(GiB)": 142.32, "step": 76620, "train_speed(iter/s)": 0.286063 }, { "acc": 0.74404693, "epoch": 0.8572143609130831, "grad_norm": 7.0625, "learning_rate": 6.566931737778177e-06, "loss": 1.02418041, "memory(GiB)": 142.32, "step": 76640, "train_speed(iter/s)": 0.286087 }, { "acc": 0.73170261, "epoch": 0.8574380598590418, "grad_norm": 4.65625, "learning_rate": 6.565175373061269e-06, "loss": 1.08061733, "memory(GiB)": 142.32, "step": 76660, "train_speed(iter/s)": 0.286113 }, { "acc": 0.73380179, "epoch": 0.8576617588050003, "grad_norm": 6.15625, "learning_rate": 6.563418794205999e-06, "loss": 1.047295, "memory(GiB)": 142.32, "step": 76680, "train_speed(iter/s)": 0.286143 }, { "acc": 0.73421497, "epoch": 0.8578854577509588, "grad_norm": 6.65625, "learning_rate": 6.561662001452691e-06, "loss": 1.06640091, "memory(GiB)": 142.32, "step": 76700, "train_speed(iter/s)": 0.286167 }, { "acc": 0.73599958, "epoch": 0.8581091566969173, "grad_norm": 6.90625, "learning_rate": 6.559904995041701e-06, "loss": 1.03517399, "memory(GiB)": 142.32, "step": 76720, "train_speed(iter/s)": 0.286193 }, { "acc": 0.7457221, "epoch": 0.8583328556428759, "grad_norm": 5.71875, "learning_rate": 6.55814777521341e-06, "loss": 1.01129723, "memory(GiB)": 142.32, "step": 76740, "train_speed(iter/s)": 0.286217 }, { "acc": 0.73262258, "epoch": 0.8585565545888344, "grad_norm": 6.8125, "learning_rate": 6.556390342208234e-06, "loss": 1.07116632, "memory(GiB)": 142.32, "step": 76760, "train_speed(iter/s)": 0.286242 }, { "acc": 0.72925963, "epoch": 0.8587802535347929, "grad_norm": 6.90625, "learning_rate": 6.554632696266612e-06, "loss": 1.08898525, "memory(GiB)": 142.32, "step": 76780, "train_speed(iter/s)": 0.286265 }, { "acc": 0.74646168, "epoch": 0.8590039524807515, "grad_norm": 5.34375, "learning_rate": 6.5528748376290165e-06, "loss": 0.98841724, "memory(GiB)": 142.32, "step": 76800, "train_speed(iter/s)": 0.286292 }, { "acc": 0.7157939, "epoch": 0.85922765142671, "grad_norm": 7.21875, "learning_rate": 6.551116766535949e-06, "loss": 1.1385438, "memory(GiB)": 142.32, "step": 76820, "train_speed(iter/s)": 0.286317 }, { "acc": 0.74270287, "epoch": 0.8594513503726685, "grad_norm": 6.375, "learning_rate": 6.5493584832279355e-06, "loss": 1.02803497, "memory(GiB)": 142.32, "step": 76840, "train_speed(iter/s)": 0.286344 }, { "acc": 0.73810234, "epoch": 0.859675049318627, "grad_norm": 5.96875, "learning_rate": 6.547599987945537e-06, "loss": 1.04027004, "memory(GiB)": 142.32, "step": 76860, "train_speed(iter/s)": 0.286369 }, { "acc": 0.73665667, "epoch": 0.8598987482645856, "grad_norm": 6.09375, "learning_rate": 6.545841280929338e-06, "loss": 1.05279408, "memory(GiB)": 142.32, "step": 76880, "train_speed(iter/s)": 0.286397 }, { "acc": 0.73044605, "epoch": 0.8601224472105441, "grad_norm": 6.3125, "learning_rate": 6.544082362419958e-06, "loss": 1.05817986, "memory(GiB)": 142.32, "step": 76900, "train_speed(iter/s)": 0.286423 }, { "acc": 0.73114672, "epoch": 0.8603461461565026, "grad_norm": 5.90625, "learning_rate": 6.542323232658041e-06, "loss": 1.07049131, "memory(GiB)": 142.32, "step": 76920, "train_speed(iter/s)": 0.286448 }, { "acc": 0.71870708, "epoch": 0.8605698451024612, "grad_norm": 6.96875, "learning_rate": 6.540563891884262e-06, "loss": 1.10660677, "memory(GiB)": 142.32, "step": 76940, "train_speed(iter/s)": 0.286474 }, { "acc": 0.73730044, "epoch": 0.8607935440484197, "grad_norm": 7.09375, "learning_rate": 6.538804340339321e-06, "loss": 1.04630184, "memory(GiB)": 142.32, "step": 76960, "train_speed(iter/s)": 0.286501 }, { "acc": 0.73410015, "epoch": 0.8610172429943782, "grad_norm": 6.3125, "learning_rate": 6.5370445782639515e-06, "loss": 1.05154648, "memory(GiB)": 142.32, "step": 76980, "train_speed(iter/s)": 0.286519 }, { "acc": 0.72780981, "epoch": 0.8612409419403367, "grad_norm": 5.03125, "learning_rate": 6.535284605898915e-06, "loss": 1.08862848, "memory(GiB)": 142.32, "step": 77000, "train_speed(iter/s)": 0.28654 }, { "acc": 0.73912897, "epoch": 0.8614646408862953, "grad_norm": 5.75, "learning_rate": 6.5335244234850005e-06, "loss": 1.01988087, "memory(GiB)": 142.32, "step": 77020, "train_speed(iter/s)": 0.286565 }, { "acc": 0.72247024, "epoch": 0.8616883398322538, "grad_norm": 5.53125, "learning_rate": 6.531764031263026e-06, "loss": 1.10940075, "memory(GiB)": 142.32, "step": 77040, "train_speed(iter/s)": 0.286589 }, { "acc": 0.73784332, "epoch": 0.8619120387782123, "grad_norm": 5.65625, "learning_rate": 6.530003429473837e-06, "loss": 1.0308176, "memory(GiB)": 142.32, "step": 77060, "train_speed(iter/s)": 0.286612 }, { "acc": 0.73743029, "epoch": 0.8621357377241708, "grad_norm": 6.5, "learning_rate": 6.52824261835831e-06, "loss": 1.05958996, "memory(GiB)": 142.32, "step": 77080, "train_speed(iter/s)": 0.28664 }, { "acc": 0.72657075, "epoch": 0.8623594366701294, "grad_norm": 6.96875, "learning_rate": 6.52648159815735e-06, "loss": 1.090522, "memory(GiB)": 142.32, "step": 77100, "train_speed(iter/s)": 0.286668 }, { "acc": 0.73359728, "epoch": 0.8625831356160879, "grad_norm": 5.75, "learning_rate": 6.524720369111888e-06, "loss": 1.04764729, "memory(GiB)": 142.32, "step": 77120, "train_speed(iter/s)": 0.286695 }, { "acc": 0.72700291, "epoch": 0.8628068345620464, "grad_norm": 5.375, "learning_rate": 6.5229589314628885e-06, "loss": 1.09226494, "memory(GiB)": 142.32, "step": 77140, "train_speed(iter/s)": 0.28672 }, { "acc": 0.72558603, "epoch": 0.863030533508005, "grad_norm": 5.15625, "learning_rate": 6.521197285451337e-06, "loss": 1.1089426, "memory(GiB)": 142.32, "step": 77160, "train_speed(iter/s)": 0.286742 }, { "acc": 0.73375368, "epoch": 0.8632542324539635, "grad_norm": 5.625, "learning_rate": 6.519435431318254e-06, "loss": 1.05792961, "memory(GiB)": 142.32, "step": 77180, "train_speed(iter/s)": 0.28677 }, { "acc": 0.73703289, "epoch": 0.863477931399922, "grad_norm": 5.46875, "learning_rate": 6.517673369304687e-06, "loss": 1.05497932, "memory(GiB)": 142.32, "step": 77200, "train_speed(iter/s)": 0.286799 }, { "acc": 0.7372057, "epoch": 0.8637016303458805, "grad_norm": 6.1875, "learning_rate": 6.515911099651711e-06, "loss": 1.04839191, "memory(GiB)": 142.32, "step": 77220, "train_speed(iter/s)": 0.286824 }, { "acc": 0.74456768, "epoch": 0.8639253292918391, "grad_norm": 5.59375, "learning_rate": 6.5141486226004265e-06, "loss": 1.0129138, "memory(GiB)": 142.32, "step": 77240, "train_speed(iter/s)": 0.286851 }, { "acc": 0.72715931, "epoch": 0.8641490282377976, "grad_norm": 6.5625, "learning_rate": 6.512385938391972e-06, "loss": 1.0953125, "memory(GiB)": 142.32, "step": 77260, "train_speed(iter/s)": 0.286876 }, { "acc": 0.73252363, "epoch": 0.8643727271837561, "grad_norm": 5.3125, "learning_rate": 6.510623047267502e-06, "loss": 1.06010761, "memory(GiB)": 142.32, "step": 77280, "train_speed(iter/s)": 0.286904 }, { "acc": 0.74319057, "epoch": 0.8645964261297147, "grad_norm": 6.96875, "learning_rate": 6.508859949468207e-06, "loss": 1.01416035, "memory(GiB)": 142.32, "step": 77300, "train_speed(iter/s)": 0.286929 }, { "acc": 0.72273226, "epoch": 0.8648201250756732, "grad_norm": 5.5, "learning_rate": 6.507096645235304e-06, "loss": 1.1180645, "memory(GiB)": 142.32, "step": 77320, "train_speed(iter/s)": 0.286954 }, { "acc": 0.73286963, "epoch": 0.8650438240216317, "grad_norm": 6.375, "learning_rate": 6.50533313481004e-06, "loss": 1.07794876, "memory(GiB)": 142.32, "step": 77340, "train_speed(iter/s)": 0.286981 }, { "acc": 0.73401775, "epoch": 0.8652675229675902, "grad_norm": 6.5, "learning_rate": 6.503569418433687e-06, "loss": 1.04673548, "memory(GiB)": 142.32, "step": 77360, "train_speed(iter/s)": 0.287008 }, { "acc": 0.74728107, "epoch": 0.8654912219135488, "grad_norm": 5.09375, "learning_rate": 6.501805496347547e-06, "loss": 0.99854898, "memory(GiB)": 142.32, "step": 77380, "train_speed(iter/s)": 0.287036 }, { "acc": 0.73799767, "epoch": 0.8657149208595073, "grad_norm": 5.125, "learning_rate": 6.500041368792948e-06, "loss": 1.05014553, "memory(GiB)": 142.32, "step": 77400, "train_speed(iter/s)": 0.287059 }, { "acc": 0.73327646, "epoch": 0.8659386198054658, "grad_norm": 6.28125, "learning_rate": 6.498277036011249e-06, "loss": 1.0585392, "memory(GiB)": 142.32, "step": 77420, "train_speed(iter/s)": 0.287084 }, { "acc": 0.74883204, "epoch": 0.8661623187514244, "grad_norm": 6.84375, "learning_rate": 6.496512498243837e-06, "loss": 0.98372116, "memory(GiB)": 142.32, "step": 77440, "train_speed(iter/s)": 0.287108 }, { "acc": 0.72901888, "epoch": 0.8663860176973829, "grad_norm": 5.375, "learning_rate": 6.494747755732126e-06, "loss": 1.09147568, "memory(GiB)": 142.32, "step": 77460, "train_speed(iter/s)": 0.287134 }, { "acc": 0.72495122, "epoch": 0.8666097166433414, "grad_norm": 6.34375, "learning_rate": 6.492982808717556e-06, "loss": 1.11478691, "memory(GiB)": 142.32, "step": 77480, "train_speed(iter/s)": 0.287157 }, { "acc": 0.71728964, "epoch": 0.8668334155892999, "grad_norm": 4.9375, "learning_rate": 6.491217657441598e-06, "loss": 1.11952305, "memory(GiB)": 142.32, "step": 77500, "train_speed(iter/s)": 0.287183 }, { "acc": 0.74595499, "epoch": 0.8670571145352585, "grad_norm": 6.375, "learning_rate": 6.48945230214575e-06, "loss": 1.00900345, "memory(GiB)": 142.32, "step": 77520, "train_speed(iter/s)": 0.287208 }, { "acc": 0.7279686, "epoch": 0.867280813481217, "grad_norm": 5.84375, "learning_rate": 6.4876867430715375e-06, "loss": 1.07728872, "memory(GiB)": 142.32, "step": 77540, "train_speed(iter/s)": 0.287235 }, { "acc": 0.73822026, "epoch": 0.8675045124271755, "grad_norm": 5.34375, "learning_rate": 6.485920980460516e-06, "loss": 1.0387392, "memory(GiB)": 142.32, "step": 77560, "train_speed(iter/s)": 0.28726 }, { "acc": 0.73361883, "epoch": 0.867728211373134, "grad_norm": 5.53125, "learning_rate": 6.4841550145542655e-06, "loss": 1.05379715, "memory(GiB)": 142.32, "step": 77580, "train_speed(iter/s)": 0.287284 }, { "acc": 0.73290539, "epoch": 0.8679519103190926, "grad_norm": 6.0625, "learning_rate": 6.4823888455943936e-06, "loss": 1.06963654, "memory(GiB)": 142.32, "step": 77600, "train_speed(iter/s)": 0.287307 }, { "acc": 0.73749328, "epoch": 0.8681756092650511, "grad_norm": 6.40625, "learning_rate": 6.480622473822541e-06, "loss": 1.03455, "memory(GiB)": 142.32, "step": 77620, "train_speed(iter/s)": 0.28733 }, { "acc": 0.73901153, "epoch": 0.8683993082110096, "grad_norm": 5.84375, "learning_rate": 6.478855899480371e-06, "loss": 1.03101301, "memory(GiB)": 142.32, "step": 77640, "train_speed(iter/s)": 0.287352 }, { "acc": 0.73329811, "epoch": 0.8686230071569682, "grad_norm": 6.9375, "learning_rate": 6.477089122809577e-06, "loss": 1.05970621, "memory(GiB)": 142.32, "step": 77660, "train_speed(iter/s)": 0.287378 }, { "acc": 0.74365153, "epoch": 0.8688467061029267, "grad_norm": 6.5, "learning_rate": 6.475322144051877e-06, "loss": 1.02177315, "memory(GiB)": 142.32, "step": 77680, "train_speed(iter/s)": 0.287404 }, { "acc": 0.72119174, "epoch": 0.8690704050488852, "grad_norm": 5.625, "learning_rate": 6.473554963449021e-06, "loss": 1.10849876, "memory(GiB)": 142.32, "step": 77700, "train_speed(iter/s)": 0.287429 }, { "acc": 0.75176392, "epoch": 0.8692941039948437, "grad_norm": 6.46875, "learning_rate": 6.471787581242784e-06, "loss": 0.98206739, "memory(GiB)": 142.32, "step": 77720, "train_speed(iter/s)": 0.287456 }, { "acc": 0.73017359, "epoch": 0.8695178029408023, "grad_norm": 6.1875, "learning_rate": 6.470019997674969e-06, "loss": 1.05829449, "memory(GiB)": 142.32, "step": 77740, "train_speed(iter/s)": 0.28748 }, { "acc": 0.7223825, "epoch": 0.8697415018867608, "grad_norm": 5.84375, "learning_rate": 6.468252212987408e-06, "loss": 1.10151825, "memory(GiB)": 142.32, "step": 77760, "train_speed(iter/s)": 0.287506 }, { "acc": 0.71961823, "epoch": 0.8699652008327193, "grad_norm": 5.1875, "learning_rate": 6.466484227421957e-06, "loss": 1.13004704, "memory(GiB)": 142.32, "step": 77780, "train_speed(iter/s)": 0.287531 }, { "acc": 0.7360239, "epoch": 0.8701888997786779, "grad_norm": 6.5625, "learning_rate": 6.464716041220505e-06, "loss": 1.04284801, "memory(GiB)": 142.32, "step": 77800, "train_speed(iter/s)": 0.287556 }, { "acc": 0.73703942, "epoch": 0.8704125987246364, "grad_norm": 4.5625, "learning_rate": 6.46294765462496e-06, "loss": 1.0413619, "memory(GiB)": 142.32, "step": 77820, "train_speed(iter/s)": 0.28758 }, { "acc": 0.73443389, "epoch": 0.8706362976705949, "grad_norm": 5.40625, "learning_rate": 6.461179067877266e-06, "loss": 1.0472868, "memory(GiB)": 142.32, "step": 77840, "train_speed(iter/s)": 0.287602 }, { "acc": 0.73350711, "epoch": 0.8708599966165534, "grad_norm": 5.21875, "learning_rate": 6.4594102812193916e-06, "loss": 1.05133648, "memory(GiB)": 142.32, "step": 77860, "train_speed(iter/s)": 0.287624 }, { "acc": 0.73196993, "epoch": 0.871083695562512, "grad_norm": 5.71875, "learning_rate": 6.457641294893331e-06, "loss": 1.06568699, "memory(GiB)": 142.32, "step": 77880, "train_speed(iter/s)": 0.28765 }, { "acc": 0.73956895, "epoch": 0.8713073945084705, "grad_norm": 5.71875, "learning_rate": 6.455872109141106e-06, "loss": 1.04718571, "memory(GiB)": 142.32, "step": 77900, "train_speed(iter/s)": 0.287675 }, { "acc": 0.7350276, "epoch": 0.871531093454429, "grad_norm": 6.03125, "learning_rate": 6.454102724204767e-06, "loss": 1.03130035, "memory(GiB)": 142.32, "step": 77920, "train_speed(iter/s)": 0.287704 }, { "acc": 0.73525496, "epoch": 0.8717547924003876, "grad_norm": 6.96875, "learning_rate": 6.452333140326391e-06, "loss": 1.05687342, "memory(GiB)": 142.32, "step": 77940, "train_speed(iter/s)": 0.287729 }, { "acc": 0.73511105, "epoch": 0.8719784913463461, "grad_norm": 5.65625, "learning_rate": 6.450563357748084e-06, "loss": 1.05461321, "memory(GiB)": 142.32, "step": 77960, "train_speed(iter/s)": 0.287757 }, { "acc": 0.72978563, "epoch": 0.8722021902923046, "grad_norm": 5.6875, "learning_rate": 6.448793376711977e-06, "loss": 1.08021507, "memory(GiB)": 142.32, "step": 77980, "train_speed(iter/s)": 0.287781 }, { "acc": 0.74053974, "epoch": 0.8724258892382631, "grad_norm": 6.03125, "learning_rate": 6.447023197460226e-06, "loss": 1.03973293, "memory(GiB)": 142.32, "step": 78000, "train_speed(iter/s)": 0.287805 }, { "epoch": 0.8724258892382631, "eval_acc": 0.6951593445358996, "eval_loss": 1.0763410329818726, "eval_runtime": 2341.2927, "eval_samples_per_second": 32.154, "eval_steps_per_second": 16.077, "step": 78000 }, { "acc": 0.72552958, "epoch": 0.8726495881842217, "grad_norm": 5.0, "learning_rate": 6.44525282023502e-06, "loss": 1.09432592, "memory(GiB)": 142.32, "step": 78020, "train_speed(iter/s)": 0.285311 }, { "acc": 0.75123749, "epoch": 0.8728732871301802, "grad_norm": 6.375, "learning_rate": 6.443482245278571e-06, "loss": 0.96928787, "memory(GiB)": 142.32, "step": 78040, "train_speed(iter/s)": 0.285338 }, { "acc": 0.74012003, "epoch": 0.8730969860761387, "grad_norm": 5.53125, "learning_rate": 6.441711472833118e-06, "loss": 1.02548771, "memory(GiB)": 142.32, "step": 78060, "train_speed(iter/s)": 0.285364 }, { "acc": 0.74170752, "epoch": 0.8733206850220973, "grad_norm": 5.5, "learning_rate": 6.439940503140929e-06, "loss": 1.01726475, "memory(GiB)": 142.32, "step": 78080, "train_speed(iter/s)": 0.28539 }, { "acc": 0.73275371, "epoch": 0.8735443839680558, "grad_norm": 7.0625, "learning_rate": 6.438169336444298e-06, "loss": 1.04752369, "memory(GiB)": 142.32, "step": 78100, "train_speed(iter/s)": 0.285412 }, { "acc": 0.73618574, "epoch": 0.8737680829140143, "grad_norm": 4.875, "learning_rate": 6.436397972985544e-06, "loss": 1.05947037, "memory(GiB)": 142.32, "step": 78120, "train_speed(iter/s)": 0.285432 }, { "acc": 0.73250055, "epoch": 0.8739917818599728, "grad_norm": 5.40625, "learning_rate": 6.434626413007018e-06, "loss": 1.07366886, "memory(GiB)": 142.32, "step": 78140, "train_speed(iter/s)": 0.285456 }, { "acc": 0.73900709, "epoch": 0.8742154808059314, "grad_norm": 5.75, "learning_rate": 6.432854656751093e-06, "loss": 1.04215221, "memory(GiB)": 142.32, "step": 78160, "train_speed(iter/s)": 0.285482 }, { "acc": 0.74086213, "epoch": 0.8744391797518899, "grad_norm": 6.65625, "learning_rate": 6.431082704460172e-06, "loss": 1.02996445, "memory(GiB)": 142.32, "step": 78180, "train_speed(iter/s)": 0.28551 }, { "acc": 0.7324892, "epoch": 0.8746628786978484, "grad_norm": 5.71875, "learning_rate": 6.42931055637668e-06, "loss": 1.06716156, "memory(GiB)": 142.32, "step": 78200, "train_speed(iter/s)": 0.285537 }, { "acc": 0.73216372, "epoch": 0.874886577643807, "grad_norm": 5.40625, "learning_rate": 6.427538212743075e-06, "loss": 1.06205359, "memory(GiB)": 142.32, "step": 78220, "train_speed(iter/s)": 0.285565 }, { "acc": 0.73806524, "epoch": 0.8751102765897655, "grad_norm": 6.0, "learning_rate": 6.4257656738018385e-06, "loss": 1.02061901, "memory(GiB)": 142.32, "step": 78240, "train_speed(iter/s)": 0.285592 }, { "acc": 0.72738938, "epoch": 0.875333975535724, "grad_norm": 5.5, "learning_rate": 6.423992939795478e-06, "loss": 1.09613609, "memory(GiB)": 142.32, "step": 78260, "train_speed(iter/s)": 0.285619 }, { "acc": 0.73699408, "epoch": 0.8755576744816825, "grad_norm": 4.40625, "learning_rate": 6.422220010966531e-06, "loss": 1.05124435, "memory(GiB)": 142.32, "step": 78280, "train_speed(iter/s)": 0.285647 }, { "acc": 0.72990742, "epoch": 0.8757813734276411, "grad_norm": 5.90625, "learning_rate": 6.4204468875575585e-06, "loss": 1.06410637, "memory(GiB)": 142.32, "step": 78300, "train_speed(iter/s)": 0.285677 }, { "acc": 0.74882326, "epoch": 0.8760050723735996, "grad_norm": 5.09375, "learning_rate": 6.418673569811148e-06, "loss": 0.99422569, "memory(GiB)": 142.32, "step": 78320, "train_speed(iter/s)": 0.285701 }, { "acc": 0.7383152, "epoch": 0.8762287713195581, "grad_norm": 6.40625, "learning_rate": 6.416900057969916e-06, "loss": 1.04391432, "memory(GiB)": 142.32, "step": 78340, "train_speed(iter/s)": 0.285725 }, { "acc": 0.73868761, "epoch": 0.8764524702655166, "grad_norm": 6.0, "learning_rate": 6.415126352276504e-06, "loss": 1.03664923, "memory(GiB)": 142.32, "step": 78360, "train_speed(iter/s)": 0.285753 }, { "acc": 0.72342339, "epoch": 0.8766761692114752, "grad_norm": 6.6875, "learning_rate": 6.41335245297358e-06, "loss": 1.09814186, "memory(GiB)": 142.32, "step": 78380, "train_speed(iter/s)": 0.28578 }, { "acc": 0.74062452, "epoch": 0.8768998681574337, "grad_norm": 6.65625, "learning_rate": 6.411578360303841e-06, "loss": 1.02817936, "memory(GiB)": 142.32, "step": 78400, "train_speed(iter/s)": 0.285802 }, { "acc": 0.73378963, "epoch": 0.8771235671033922, "grad_norm": 5.65625, "learning_rate": 6.409804074510003e-06, "loss": 1.05012865, "memory(GiB)": 142.32, "step": 78420, "train_speed(iter/s)": 0.285826 }, { "acc": 0.72088041, "epoch": 0.8773472660493508, "grad_norm": 5.4375, "learning_rate": 6.408029595834818e-06, "loss": 1.12384377, "memory(GiB)": 142.32, "step": 78440, "train_speed(iter/s)": 0.285849 }, { "acc": 0.73671331, "epoch": 0.8775709649953093, "grad_norm": 5.5625, "learning_rate": 6.4062549245210595e-06, "loss": 1.05180359, "memory(GiB)": 142.32, "step": 78460, "train_speed(iter/s)": 0.285873 }, { "acc": 0.73120832, "epoch": 0.8777946639412678, "grad_norm": 7.09375, "learning_rate": 6.4044800608115265e-06, "loss": 1.06865005, "memory(GiB)": 142.32, "step": 78480, "train_speed(iter/s)": 0.285898 }, { "acc": 0.73807359, "epoch": 0.8780183628872263, "grad_norm": 4.65625, "learning_rate": 6.402705004949047e-06, "loss": 1.04894924, "memory(GiB)": 142.32, "step": 78500, "train_speed(iter/s)": 0.285923 }, { "acc": 0.74765778, "epoch": 0.8782420618331849, "grad_norm": 7.0625, "learning_rate": 6.400929757176473e-06, "loss": 0.98328667, "memory(GiB)": 142.32, "step": 78520, "train_speed(iter/s)": 0.285948 }, { "acc": 0.73984451, "epoch": 0.8784657607791434, "grad_norm": 6.8125, "learning_rate": 6.399154317736685e-06, "loss": 1.02293673, "memory(GiB)": 142.32, "step": 78540, "train_speed(iter/s)": 0.285971 }, { "acc": 0.73871355, "epoch": 0.8786894597251019, "grad_norm": 5.625, "learning_rate": 6.397378686872587e-06, "loss": 1.02820702, "memory(GiB)": 142.32, "step": 78560, "train_speed(iter/s)": 0.285996 }, { "acc": 0.72491317, "epoch": 0.8789131586710605, "grad_norm": 6.15625, "learning_rate": 6.395602864827112e-06, "loss": 1.11740475, "memory(GiB)": 142.32, "step": 78580, "train_speed(iter/s)": 0.286021 }, { "acc": 0.73347492, "epoch": 0.879136857617019, "grad_norm": 7.46875, "learning_rate": 6.393826851843218e-06, "loss": 1.05999966, "memory(GiB)": 142.32, "step": 78600, "train_speed(iter/s)": 0.286043 }, { "acc": 0.73258233, "epoch": 0.8793605565629775, "grad_norm": 6.125, "learning_rate": 6.392050648163888e-06, "loss": 1.05109282, "memory(GiB)": 142.32, "step": 78620, "train_speed(iter/s)": 0.28607 }, { "acc": 0.73858213, "epoch": 0.879584255508936, "grad_norm": 5.84375, "learning_rate": 6.390274254032132e-06, "loss": 1.02522812, "memory(GiB)": 142.32, "step": 78640, "train_speed(iter/s)": 0.286094 }, { "acc": 0.73349848, "epoch": 0.8798079544548946, "grad_norm": 5.4375, "learning_rate": 6.388497669690985e-06, "loss": 1.05690689, "memory(GiB)": 142.32, "step": 78660, "train_speed(iter/s)": 0.286118 }, { "acc": 0.74393387, "epoch": 0.8800316534008531, "grad_norm": 6.53125, "learning_rate": 6.386720895383512e-06, "loss": 1.00810566, "memory(GiB)": 142.32, "step": 78680, "train_speed(iter/s)": 0.286142 }, { "acc": 0.7185286, "epoch": 0.8802553523468116, "grad_norm": 5.9375, "learning_rate": 6.384943931352801e-06, "loss": 1.1303854, "memory(GiB)": 142.32, "step": 78700, "train_speed(iter/s)": 0.286162 }, { "acc": 0.7339282, "epoch": 0.8804790512927702, "grad_norm": 5.0, "learning_rate": 6.383166777841963e-06, "loss": 1.06684256, "memory(GiB)": 142.32, "step": 78720, "train_speed(iter/s)": 0.286189 }, { "acc": 0.72931747, "epoch": 0.8807027502387287, "grad_norm": 7.25, "learning_rate": 6.38138943509414e-06, "loss": 1.08133831, "memory(GiB)": 142.32, "step": 78740, "train_speed(iter/s)": 0.286214 }, { "acc": 0.73370028, "epoch": 0.8809264491846872, "grad_norm": 7.0625, "learning_rate": 6.379611903352498e-06, "loss": 1.06083469, "memory(GiB)": 142.32, "step": 78760, "train_speed(iter/s)": 0.286236 }, { "acc": 0.73131247, "epoch": 0.8811501481306457, "grad_norm": 5.1875, "learning_rate": 6.377834182860229e-06, "loss": 1.07391605, "memory(GiB)": 142.32, "step": 78780, "train_speed(iter/s)": 0.286261 }, { "acc": 0.73323507, "epoch": 0.8813738470766043, "grad_norm": 5.75, "learning_rate": 6.376056273860549e-06, "loss": 1.0676981, "memory(GiB)": 142.32, "step": 78800, "train_speed(iter/s)": 0.286285 }, { "acc": 0.7297123, "epoch": 0.8815975460225628, "grad_norm": 6.40625, "learning_rate": 6.374278176596703e-06, "loss": 1.08357563, "memory(GiB)": 142.32, "step": 78820, "train_speed(iter/s)": 0.28631 }, { "acc": 0.73237085, "epoch": 0.8818212449685213, "grad_norm": 5.40625, "learning_rate": 6.372499891311958e-06, "loss": 1.05401268, "memory(GiB)": 142.32, "step": 78840, "train_speed(iter/s)": 0.286335 }, { "acc": 0.74562731, "epoch": 0.8820449439144799, "grad_norm": 7.25, "learning_rate": 6.370721418249612e-06, "loss": 1.0081852, "memory(GiB)": 142.32, "step": 78860, "train_speed(iter/s)": 0.28636 }, { "acc": 0.74055548, "epoch": 0.8822686428604384, "grad_norm": 5.8125, "learning_rate": 6.368942757652984e-06, "loss": 1.02395144, "memory(GiB)": 142.32, "step": 78880, "train_speed(iter/s)": 0.286384 }, { "acc": 0.73090134, "epoch": 0.8824923418063969, "grad_norm": 5.625, "learning_rate": 6.367163909765419e-06, "loss": 1.09459991, "memory(GiB)": 142.32, "step": 78900, "train_speed(iter/s)": 0.286409 }, { "acc": 0.72590809, "epoch": 0.8827160407523554, "grad_norm": 5.53125, "learning_rate": 6.365384874830291e-06, "loss": 1.0887394, "memory(GiB)": 142.32, "step": 78920, "train_speed(iter/s)": 0.286435 }, { "acc": 0.72616143, "epoch": 0.882939739698314, "grad_norm": 6.03125, "learning_rate": 6.3636056530909955e-06, "loss": 1.07448673, "memory(GiB)": 142.32, "step": 78940, "train_speed(iter/s)": 0.286459 }, { "acc": 0.73118014, "epoch": 0.8831634386442725, "grad_norm": 5.9375, "learning_rate": 6.3618262447909565e-06, "loss": 1.08493347, "memory(GiB)": 142.32, "step": 78960, "train_speed(iter/s)": 0.286481 }, { "acc": 0.73660831, "epoch": 0.883387137590231, "grad_norm": 7.90625, "learning_rate": 6.360046650173623e-06, "loss": 1.05323505, "memory(GiB)": 142.32, "step": 78980, "train_speed(iter/s)": 0.286507 }, { "acc": 0.72148867, "epoch": 0.8836108365361895, "grad_norm": 6.4375, "learning_rate": 6.358266869482466e-06, "loss": 1.10681076, "memory(GiB)": 142.32, "step": 79000, "train_speed(iter/s)": 0.286531 }, { "acc": 0.72925253, "epoch": 0.8838345354821481, "grad_norm": 6.84375, "learning_rate": 6.3564869029609895e-06, "loss": 1.08293447, "memory(GiB)": 142.32, "step": 79020, "train_speed(iter/s)": 0.286554 }, { "acc": 0.72107615, "epoch": 0.8840582344281066, "grad_norm": 5.0625, "learning_rate": 6.354706750852715e-06, "loss": 1.12039394, "memory(GiB)": 142.32, "step": 79040, "train_speed(iter/s)": 0.286581 }, { "acc": 0.74792776, "epoch": 0.8842819333740651, "grad_norm": 4.40625, "learning_rate": 6.3529264134011935e-06, "loss": 0.9934268, "memory(GiB)": 142.32, "step": 79060, "train_speed(iter/s)": 0.286604 }, { "acc": 0.73298831, "epoch": 0.8845056323200237, "grad_norm": 4.46875, "learning_rate": 6.351145890850001e-06, "loss": 1.0547924, "memory(GiB)": 142.32, "step": 79080, "train_speed(iter/s)": 0.286631 }, { "acc": 0.74409418, "epoch": 0.8847293312659822, "grad_norm": 5.375, "learning_rate": 6.349365183442738e-06, "loss": 1.01072464, "memory(GiB)": 142.32, "step": 79100, "train_speed(iter/s)": 0.286654 }, { "acc": 0.73436708, "epoch": 0.8849530302119407, "grad_norm": 7.03125, "learning_rate": 6.347584291423033e-06, "loss": 1.04139624, "memory(GiB)": 142.32, "step": 79120, "train_speed(iter/s)": 0.286679 }, { "acc": 0.71983471, "epoch": 0.8851767291578992, "grad_norm": 6.78125, "learning_rate": 6.3458032150345325e-06, "loss": 1.12317419, "memory(GiB)": 142.32, "step": 79140, "train_speed(iter/s)": 0.286702 }, { "acc": 0.72923365, "epoch": 0.8854004281038578, "grad_norm": 5.0, "learning_rate": 6.344021954520918e-06, "loss": 1.09150381, "memory(GiB)": 142.32, "step": 79160, "train_speed(iter/s)": 0.286726 }, { "acc": 0.74110479, "epoch": 0.8856241270498164, "grad_norm": 5.8125, "learning_rate": 6.342240510125889e-06, "loss": 1.02708569, "memory(GiB)": 142.32, "step": 79180, "train_speed(iter/s)": 0.286751 }, { "acc": 0.73416004, "epoch": 0.8858478259957749, "grad_norm": 5.59375, "learning_rate": 6.340458882093173e-06, "loss": 1.05588036, "memory(GiB)": 142.32, "step": 79200, "train_speed(iter/s)": 0.286776 }, { "acc": 0.71813073, "epoch": 0.8860715249417335, "grad_norm": 6.46875, "learning_rate": 6.3386770706665235e-06, "loss": 1.11665344, "memory(GiB)": 142.32, "step": 79220, "train_speed(iter/s)": 0.286801 }, { "acc": 0.73188462, "epoch": 0.886295223887692, "grad_norm": 5.34375, "learning_rate": 6.336895076089717e-06, "loss": 1.0754982, "memory(GiB)": 142.32, "step": 79240, "train_speed(iter/s)": 0.286825 }, { "acc": 0.73915482, "epoch": 0.8865189228336505, "grad_norm": 6.46875, "learning_rate": 6.335112898606553e-06, "loss": 1.03558559, "memory(GiB)": 142.32, "step": 79260, "train_speed(iter/s)": 0.286851 }, { "acc": 0.73646374, "epoch": 0.886742621779609, "grad_norm": 6.40625, "learning_rate": 6.333330538460863e-06, "loss": 1.04195948, "memory(GiB)": 142.32, "step": 79280, "train_speed(iter/s)": 0.286875 }, { "acc": 0.73554616, "epoch": 0.8869663207255676, "grad_norm": 6.65625, "learning_rate": 6.331547995896496e-06, "loss": 1.04476795, "memory(GiB)": 142.32, "step": 79300, "train_speed(iter/s)": 0.2869 }, { "acc": 0.73343821, "epoch": 0.8871900196715261, "grad_norm": 5.09375, "learning_rate": 6.3297652711573345e-06, "loss": 1.04451313, "memory(GiB)": 142.32, "step": 79320, "train_speed(iter/s)": 0.286925 }, { "acc": 0.73463864, "epoch": 0.8874137186174846, "grad_norm": 6.71875, "learning_rate": 6.327982364487275e-06, "loss": 1.06496277, "memory(GiB)": 142.32, "step": 79340, "train_speed(iter/s)": 0.286949 }, { "acc": 0.73824954, "epoch": 0.8876374175634432, "grad_norm": 5.84375, "learning_rate": 6.326199276130246e-06, "loss": 1.03265734, "memory(GiB)": 142.32, "step": 79360, "train_speed(iter/s)": 0.286972 }, { "acc": 0.73427796, "epoch": 0.8878611165094017, "grad_norm": 6.6875, "learning_rate": 6.3244160063302e-06, "loss": 1.0509903, "memory(GiB)": 142.32, "step": 79380, "train_speed(iter/s)": 0.286996 }, { "acc": 0.7425106, "epoch": 0.8880848154553602, "grad_norm": 7.34375, "learning_rate": 6.322632555331116e-06, "loss": 1.02420349, "memory(GiB)": 142.32, "step": 79400, "train_speed(iter/s)": 0.28702 }, { "acc": 0.74080811, "epoch": 0.8883085144013187, "grad_norm": 6.28125, "learning_rate": 6.320848923376993e-06, "loss": 1.0244318, "memory(GiB)": 142.32, "step": 79420, "train_speed(iter/s)": 0.287046 }, { "acc": 0.73256378, "epoch": 0.8885322133472773, "grad_norm": 6.3125, "learning_rate": 6.319065110711858e-06, "loss": 1.05839977, "memory(GiB)": 142.32, "step": 79440, "train_speed(iter/s)": 0.287069 }, { "acc": 0.732336, "epoch": 0.8887559122932358, "grad_norm": 6.4375, "learning_rate": 6.317281117579761e-06, "loss": 1.07601366, "memory(GiB)": 142.32, "step": 79460, "train_speed(iter/s)": 0.287089 }, { "acc": 0.73196316, "epoch": 0.8889796112391943, "grad_norm": 5.53125, "learning_rate": 6.31549694422478e-06, "loss": 1.08905048, "memory(GiB)": 142.32, "step": 79480, "train_speed(iter/s)": 0.287114 }, { "acc": 0.74127283, "epoch": 0.8892033101851529, "grad_norm": 6.40625, "learning_rate": 6.313712590891014e-06, "loss": 1.03670406, "memory(GiB)": 142.32, "step": 79500, "train_speed(iter/s)": 0.287135 }, { "acc": 0.73418307, "epoch": 0.8894270091311114, "grad_norm": 9.5, "learning_rate": 6.311928057822589e-06, "loss": 1.06730776, "memory(GiB)": 142.32, "step": 79520, "train_speed(iter/s)": 0.287159 }, { "acc": 0.73229833, "epoch": 0.8896507080770699, "grad_norm": 6.4375, "learning_rate": 6.3101433452636525e-06, "loss": 1.07572918, "memory(GiB)": 142.32, "step": 79540, "train_speed(iter/s)": 0.287186 }, { "acc": 0.72769184, "epoch": 0.8898744070230284, "grad_norm": 5.625, "learning_rate": 6.308358453458381e-06, "loss": 1.08403893, "memory(GiB)": 142.32, "step": 79560, "train_speed(iter/s)": 0.287213 }, { "acc": 0.73544931, "epoch": 0.890098105968987, "grad_norm": 6.8125, "learning_rate": 6.306573382650974e-06, "loss": 1.06643486, "memory(GiB)": 142.32, "step": 79580, "train_speed(iter/s)": 0.287238 }, { "acc": 0.73128891, "epoch": 0.8903218049149455, "grad_norm": 5.9375, "learning_rate": 6.30478813308565e-06, "loss": 1.06029587, "memory(GiB)": 142.32, "step": 79600, "train_speed(iter/s)": 0.287265 }, { "acc": 0.74038839, "epoch": 0.890545503860904, "grad_norm": 6.4375, "learning_rate": 6.30300270500666e-06, "loss": 1.03116894, "memory(GiB)": 142.32, "step": 79620, "train_speed(iter/s)": 0.28729 }, { "acc": 0.74096537, "epoch": 0.8907692028068626, "grad_norm": 5.90625, "learning_rate": 6.301217098658277e-06, "loss": 1.03518677, "memory(GiB)": 142.32, "step": 79640, "train_speed(iter/s)": 0.287315 }, { "acc": 0.74394417, "epoch": 0.8909929017528211, "grad_norm": 6.3125, "learning_rate": 6.299431314284796e-06, "loss": 1.01730328, "memory(GiB)": 142.32, "step": 79660, "train_speed(iter/s)": 0.287337 }, { "acc": 0.73051972, "epoch": 0.8912166006987796, "grad_norm": 4.90625, "learning_rate": 6.297645352130538e-06, "loss": 1.05777111, "memory(GiB)": 142.32, "step": 79680, "train_speed(iter/s)": 0.28736 }, { "acc": 0.730375, "epoch": 0.8914402996447381, "grad_norm": 7.5625, "learning_rate": 6.295859212439847e-06, "loss": 1.08282146, "memory(GiB)": 142.32, "step": 79700, "train_speed(iter/s)": 0.287385 }, { "acc": 0.74330764, "epoch": 0.8916639985906967, "grad_norm": 6.15625, "learning_rate": 6.2940728954570955e-06, "loss": 1.01567974, "memory(GiB)": 142.32, "step": 79720, "train_speed(iter/s)": 0.28741 }, { "acc": 0.74160843, "epoch": 0.8918876975366552, "grad_norm": 5.71875, "learning_rate": 6.292286401426674e-06, "loss": 1.01399708, "memory(GiB)": 142.32, "step": 79740, "train_speed(iter/s)": 0.287434 }, { "acc": 0.73199234, "epoch": 0.8921113964826137, "grad_norm": 6.25, "learning_rate": 6.2904997305930025e-06, "loss": 1.05949116, "memory(GiB)": 142.32, "step": 79760, "train_speed(iter/s)": 0.287457 }, { "acc": 0.73562317, "epoch": 0.8923350954285723, "grad_norm": 6.09375, "learning_rate": 6.288712883200521e-06, "loss": 1.04569607, "memory(GiB)": 142.32, "step": 79780, "train_speed(iter/s)": 0.287479 }, { "acc": 0.73037939, "epoch": 0.8925587943745308, "grad_norm": 6.09375, "learning_rate": 6.286925859493699e-06, "loss": 1.06927471, "memory(GiB)": 142.32, "step": 79800, "train_speed(iter/s)": 0.287506 }, { "acc": 0.7335, "epoch": 0.8927824933204893, "grad_norm": 6.1875, "learning_rate": 6.2851386597170235e-06, "loss": 1.05066576, "memory(GiB)": 142.32, "step": 79820, "train_speed(iter/s)": 0.287531 }, { "acc": 0.72655859, "epoch": 0.8930061922664478, "grad_norm": 5.46875, "learning_rate": 6.2833512841150116e-06, "loss": 1.10696735, "memory(GiB)": 142.32, "step": 79840, "train_speed(iter/s)": 0.287552 }, { "acc": 0.7457015, "epoch": 0.8932298912124064, "grad_norm": 5.03125, "learning_rate": 6.281563732932201e-06, "loss": 1.00775986, "memory(GiB)": 142.32, "step": 79860, "train_speed(iter/s)": 0.287578 }, { "acc": 0.74193268, "epoch": 0.8934535901583649, "grad_norm": 6.90625, "learning_rate": 6.279776006413153e-06, "loss": 1.01825104, "memory(GiB)": 142.32, "step": 79880, "train_speed(iter/s)": 0.287604 }, { "acc": 0.7393075, "epoch": 0.8936772891043234, "grad_norm": 5.5625, "learning_rate": 6.277988104802455e-06, "loss": 1.01709366, "memory(GiB)": 142.32, "step": 79900, "train_speed(iter/s)": 0.287633 }, { "acc": 0.74034138, "epoch": 0.893900988050282, "grad_norm": 5.75, "learning_rate": 6.2762000283447185e-06, "loss": 1.02618275, "memory(GiB)": 142.32, "step": 79920, "train_speed(iter/s)": 0.287658 }, { "acc": 0.73445415, "epoch": 0.8941246869962405, "grad_norm": 5.875, "learning_rate": 6.274411777284576e-06, "loss": 1.07603703, "memory(GiB)": 142.32, "step": 79940, "train_speed(iter/s)": 0.287684 }, { "acc": 0.74041576, "epoch": 0.894348385942199, "grad_norm": 6.09375, "learning_rate": 6.272623351866688e-06, "loss": 1.05510139, "memory(GiB)": 142.32, "step": 79960, "train_speed(iter/s)": 0.287706 }, { "acc": 0.7320056, "epoch": 0.8945720848881575, "grad_norm": 5.59375, "learning_rate": 6.270834752335735e-06, "loss": 1.06741352, "memory(GiB)": 142.32, "step": 79980, "train_speed(iter/s)": 0.287731 }, { "acc": 0.73357763, "epoch": 0.8947957838341161, "grad_norm": 7.21875, "learning_rate": 6.269045978936423e-06, "loss": 1.07236137, "memory(GiB)": 142.32, "step": 80000, "train_speed(iter/s)": 0.287756 }, { "epoch": 0.8947957838341161, "eval_acc": 0.6953155656357356, "eval_loss": 1.0756518840789795, "eval_runtime": 2339.7147, "eval_samples_per_second": 32.176, "eval_steps_per_second": 16.088, "step": 80000 }, { "acc": 0.74511957, "epoch": 0.8950194827800746, "grad_norm": 6.53125, "learning_rate": 6.267257031913483e-06, "loss": 1.01783123, "memory(GiB)": 142.32, "step": 80020, "train_speed(iter/s)": 0.28533 }, { "acc": 0.73971853, "epoch": 0.8952431817260331, "grad_norm": 7.28125, "learning_rate": 6.265467911511667e-06, "loss": 1.04817181, "memory(GiB)": 142.32, "step": 80040, "train_speed(iter/s)": 0.285355 }, { "acc": 0.72985206, "epoch": 0.8954668806719916, "grad_norm": 8.125, "learning_rate": 6.263678617975754e-06, "loss": 1.0849267, "memory(GiB)": 142.32, "step": 80060, "train_speed(iter/s)": 0.285378 }, { "acc": 0.73750167, "epoch": 0.8956905796179502, "grad_norm": 6.5625, "learning_rate": 6.261889151550542e-06, "loss": 1.05154648, "memory(GiB)": 142.32, "step": 80080, "train_speed(iter/s)": 0.285402 }, { "acc": 0.7381247, "epoch": 0.8959142785639087, "grad_norm": 6.84375, "learning_rate": 6.260099512480859e-06, "loss": 1.0389513, "memory(GiB)": 142.32, "step": 80100, "train_speed(iter/s)": 0.285424 }, { "acc": 0.74452381, "epoch": 0.8961379775098672, "grad_norm": 6.59375, "learning_rate": 6.258309701011551e-06, "loss": 1.00875988, "memory(GiB)": 142.32, "step": 80120, "train_speed(iter/s)": 0.285444 }, { "acc": 0.74785223, "epoch": 0.8963616764558258, "grad_norm": 5.46875, "learning_rate": 6.256519717387492e-06, "loss": 1.00594149, "memory(GiB)": 142.32, "step": 80140, "train_speed(iter/s)": 0.285469 }, { "acc": 0.73046446, "epoch": 0.8965853754017843, "grad_norm": 6.15625, "learning_rate": 6.254729561853575e-06, "loss": 1.08680687, "memory(GiB)": 142.32, "step": 80160, "train_speed(iter/s)": 0.285494 }, { "acc": 0.74258194, "epoch": 0.8968090743477428, "grad_norm": 5.3125, "learning_rate": 6.252939234654721e-06, "loss": 1.00957832, "memory(GiB)": 142.32, "step": 80180, "train_speed(iter/s)": 0.28552 }, { "acc": 0.74087372, "epoch": 0.8970327732937013, "grad_norm": 8.1875, "learning_rate": 6.251148736035869e-06, "loss": 1.04946384, "memory(GiB)": 142.32, "step": 80200, "train_speed(iter/s)": 0.285545 }, { "acc": 0.71871967, "epoch": 0.8972564722396599, "grad_norm": 6.75, "learning_rate": 6.249358066241987e-06, "loss": 1.1343648, "memory(GiB)": 142.32, "step": 80220, "train_speed(iter/s)": 0.285569 }, { "acc": 0.7354702, "epoch": 0.8974801711856184, "grad_norm": 6.9375, "learning_rate": 6.247567225518064e-06, "loss": 1.04211035, "memory(GiB)": 142.32, "step": 80240, "train_speed(iter/s)": 0.285593 }, { "acc": 0.72266903, "epoch": 0.8977038701315769, "grad_norm": 6.1875, "learning_rate": 6.245776214109114e-06, "loss": 1.10350542, "memory(GiB)": 142.32, "step": 80260, "train_speed(iter/s)": 0.285619 }, { "acc": 0.73712616, "epoch": 0.8979275690775355, "grad_norm": 6.4375, "learning_rate": 6.243985032260171e-06, "loss": 1.0536974, "memory(GiB)": 142.32, "step": 80280, "train_speed(iter/s)": 0.285643 }, { "acc": 0.7331008, "epoch": 0.898151268023494, "grad_norm": 6.59375, "learning_rate": 6.242193680216295e-06, "loss": 1.05299129, "memory(GiB)": 142.32, "step": 80300, "train_speed(iter/s)": 0.285666 }, { "acc": 0.73350425, "epoch": 0.8983749669694525, "grad_norm": 6.3125, "learning_rate": 6.240402158222568e-06, "loss": 1.08812065, "memory(GiB)": 142.32, "step": 80320, "train_speed(iter/s)": 0.285689 }, { "acc": 0.73905611, "epoch": 0.898598665915411, "grad_norm": 6.3125, "learning_rate": 6.238610466524097e-06, "loss": 1.03998451, "memory(GiB)": 142.32, "step": 80340, "train_speed(iter/s)": 0.285712 }, { "acc": 0.73690834, "epoch": 0.8988223648613696, "grad_norm": 6.6875, "learning_rate": 6.2368186053660095e-06, "loss": 1.03571463, "memory(GiB)": 142.32, "step": 80360, "train_speed(iter/s)": 0.285737 }, { "acc": 0.7385169, "epoch": 0.8990460638073281, "grad_norm": 7.375, "learning_rate": 6.23502657499346e-06, "loss": 1.03411388, "memory(GiB)": 142.32, "step": 80380, "train_speed(iter/s)": 0.285761 }, { "acc": 0.7444859, "epoch": 0.8992697627532866, "grad_norm": 6.34375, "learning_rate": 6.233234375651621e-06, "loss": 0.99241619, "memory(GiB)": 142.32, "step": 80400, "train_speed(iter/s)": 0.285786 }, { "acc": 0.72246609, "epoch": 0.8994934616992452, "grad_norm": 6.90625, "learning_rate": 6.2314420075856926e-06, "loss": 1.1040184, "memory(GiB)": 142.32, "step": 80420, "train_speed(iter/s)": 0.285812 }, { "acc": 0.7350317, "epoch": 0.8997171606452037, "grad_norm": 5.375, "learning_rate": 6.229649471040897e-06, "loss": 1.05602608, "memory(GiB)": 142.32, "step": 80440, "train_speed(iter/s)": 0.285836 }, { "acc": 0.72345161, "epoch": 0.8999408595911622, "grad_norm": 6.53125, "learning_rate": 6.227856766262478e-06, "loss": 1.11334953, "memory(GiB)": 142.32, "step": 80460, "train_speed(iter/s)": 0.285859 }, { "acc": 0.73188481, "epoch": 0.9001645585371207, "grad_norm": 6.125, "learning_rate": 6.226063893495704e-06, "loss": 1.05529995, "memory(GiB)": 142.32, "step": 80480, "train_speed(iter/s)": 0.285885 }, { "acc": 0.7170579, "epoch": 0.9003882574830793, "grad_norm": 5.6875, "learning_rate": 6.224270852985863e-06, "loss": 1.14281425, "memory(GiB)": 142.32, "step": 80500, "train_speed(iter/s)": 0.285908 }, { "acc": 0.74890943, "epoch": 0.9006119564290378, "grad_norm": 4.78125, "learning_rate": 6.2224776449782705e-06, "loss": 0.99548893, "memory(GiB)": 142.32, "step": 80520, "train_speed(iter/s)": 0.285933 }, { "acc": 0.7257565, "epoch": 0.9008356553749963, "grad_norm": 7.21875, "learning_rate": 6.2206842697182645e-06, "loss": 1.10937405, "memory(GiB)": 142.32, "step": 80540, "train_speed(iter/s)": 0.285957 }, { "acc": 0.73804445, "epoch": 0.9010593543209549, "grad_norm": 4.875, "learning_rate": 6.2188907274512015e-06, "loss": 1.03760834, "memory(GiB)": 142.32, "step": 80560, "train_speed(iter/s)": 0.285984 }, { "acc": 0.73118801, "epoch": 0.9012830532669134, "grad_norm": 4.53125, "learning_rate": 6.217097018422466e-06, "loss": 1.06671066, "memory(GiB)": 142.32, "step": 80580, "train_speed(iter/s)": 0.286011 }, { "acc": 0.72721586, "epoch": 0.9015067522128719, "grad_norm": 6.59375, "learning_rate": 6.215303142877461e-06, "loss": 1.07886295, "memory(GiB)": 142.32, "step": 80600, "train_speed(iter/s)": 0.286038 }, { "acc": 0.73175912, "epoch": 0.9017304511588304, "grad_norm": 5.6875, "learning_rate": 6.213509101061616e-06, "loss": 1.05738258, "memory(GiB)": 142.32, "step": 80620, "train_speed(iter/s)": 0.286062 }, { "acc": 0.73584294, "epoch": 0.901954150104789, "grad_norm": 5.34375, "learning_rate": 6.211714893220381e-06, "loss": 1.06720486, "memory(GiB)": 142.32, "step": 80640, "train_speed(iter/s)": 0.286088 }, { "acc": 0.7246479, "epoch": 0.9021778490507475, "grad_norm": 6.0625, "learning_rate": 6.209920519599228e-06, "loss": 1.09566784, "memory(GiB)": 142.32, "step": 80660, "train_speed(iter/s)": 0.286114 }, { "acc": 0.73166447, "epoch": 0.902401547996706, "grad_norm": 7.21875, "learning_rate": 6.208125980443657e-06, "loss": 1.0708746, "memory(GiB)": 142.32, "step": 80680, "train_speed(iter/s)": 0.286139 }, { "acc": 0.73997245, "epoch": 0.9026252469426645, "grad_norm": 5.0625, "learning_rate": 6.206331275999182e-06, "loss": 1.00922127, "memory(GiB)": 142.32, "step": 80700, "train_speed(iter/s)": 0.286163 }, { "acc": 0.72301559, "epoch": 0.9028489458886231, "grad_norm": 6.5, "learning_rate": 6.204536406511346e-06, "loss": 1.11805363, "memory(GiB)": 142.32, "step": 80720, "train_speed(iter/s)": 0.286188 }, { "acc": 0.72912855, "epoch": 0.9030726448345816, "grad_norm": 7.125, "learning_rate": 6.202741372225713e-06, "loss": 1.06683064, "memory(GiB)": 142.32, "step": 80740, "train_speed(iter/s)": 0.286211 }, { "acc": 0.73066258, "epoch": 0.9032963437805401, "grad_norm": 5.9375, "learning_rate": 6.20094617338787e-06, "loss": 1.07139435, "memory(GiB)": 142.32, "step": 80760, "train_speed(iter/s)": 0.286235 }, { "acc": 0.72671404, "epoch": 0.9035200427264987, "grad_norm": 6.875, "learning_rate": 6.199150810243423e-06, "loss": 1.09304371, "memory(GiB)": 142.32, "step": 80780, "train_speed(iter/s)": 0.286259 }, { "acc": 0.73057261, "epoch": 0.9037437416724572, "grad_norm": 5.0, "learning_rate": 6.197355283038007e-06, "loss": 1.07620296, "memory(GiB)": 142.32, "step": 80800, "train_speed(iter/s)": 0.286284 }, { "acc": 0.73477106, "epoch": 0.9039674406184157, "grad_norm": 5.5625, "learning_rate": 6.195559592017273e-06, "loss": 1.05277815, "memory(GiB)": 142.32, "step": 80820, "train_speed(iter/s)": 0.286306 }, { "acc": 0.73241024, "epoch": 0.9041911395643742, "grad_norm": 6.09375, "learning_rate": 6.193763737426899e-06, "loss": 1.05457268, "memory(GiB)": 142.32, "step": 80840, "train_speed(iter/s)": 0.286328 }, { "acc": 0.72986937, "epoch": 0.9044148385103328, "grad_norm": 5.1875, "learning_rate": 6.1919677195125825e-06, "loss": 1.08573246, "memory(GiB)": 142.32, "step": 80860, "train_speed(iter/s)": 0.286354 }, { "acc": 0.74447412, "epoch": 0.9046385374562913, "grad_norm": 6.65625, "learning_rate": 6.190171538520045e-06, "loss": 1.01391211, "memory(GiB)": 142.32, "step": 80880, "train_speed(iter/s)": 0.286377 }, { "acc": 0.73721714, "epoch": 0.9048622364022498, "grad_norm": 6.46875, "learning_rate": 6.18837519469503e-06, "loss": 1.04231968, "memory(GiB)": 142.32, "step": 80900, "train_speed(iter/s)": 0.286401 }, { "acc": 0.72669935, "epoch": 0.9050859353482084, "grad_norm": 6.4375, "learning_rate": 6.186578688283302e-06, "loss": 1.10301018, "memory(GiB)": 142.32, "step": 80920, "train_speed(iter/s)": 0.286421 }, { "acc": 0.73366652, "epoch": 0.9053096342941669, "grad_norm": 5.6875, "learning_rate": 6.18478201953065e-06, "loss": 1.05574389, "memory(GiB)": 142.32, "step": 80940, "train_speed(iter/s)": 0.286444 }, { "acc": 0.71935496, "epoch": 0.9055333332401254, "grad_norm": 8.0, "learning_rate": 6.182985188682882e-06, "loss": 1.13466396, "memory(GiB)": 142.32, "step": 80960, "train_speed(iter/s)": 0.28647 }, { "acc": 0.73687224, "epoch": 0.9057570321860839, "grad_norm": 5.53125, "learning_rate": 6.181188195985832e-06, "loss": 1.04006062, "memory(GiB)": 142.32, "step": 80980, "train_speed(iter/s)": 0.286495 }, { "acc": 0.73205366, "epoch": 0.9059807311320425, "grad_norm": 6.21875, "learning_rate": 6.179391041685354e-06, "loss": 1.07612648, "memory(GiB)": 142.32, "step": 81000, "train_speed(iter/s)": 0.286519 }, { "acc": 0.747756, "epoch": 0.906204430078001, "grad_norm": 4.8125, "learning_rate": 6.177593726027325e-06, "loss": 0.9921545, "memory(GiB)": 142.32, "step": 81020, "train_speed(iter/s)": 0.286544 }, { "acc": 0.74408178, "epoch": 0.9064281290239595, "grad_norm": 6.6875, "learning_rate": 6.175796249257641e-06, "loss": 0.99937649, "memory(GiB)": 142.32, "step": 81040, "train_speed(iter/s)": 0.286568 }, { "acc": 0.73392334, "epoch": 0.9066518279699181, "grad_norm": 5.46875, "learning_rate": 6.173998611622224e-06, "loss": 1.0622961, "memory(GiB)": 142.32, "step": 81060, "train_speed(iter/s)": 0.286592 }, { "acc": 0.73823361, "epoch": 0.9068755269158766, "grad_norm": 5.9375, "learning_rate": 6.172200813367017e-06, "loss": 1.03381023, "memory(GiB)": 142.32, "step": 81080, "train_speed(iter/s)": 0.286616 }, { "acc": 0.72979851, "epoch": 0.9070992258618351, "grad_norm": 6.28125, "learning_rate": 6.170402854737986e-06, "loss": 1.07440338, "memory(GiB)": 142.32, "step": 81100, "train_speed(iter/s)": 0.286642 }, { "acc": 0.74129457, "epoch": 0.9073229248077936, "grad_norm": 4.875, "learning_rate": 6.1686047359811145e-06, "loss": 1.01613159, "memory(GiB)": 142.32, "step": 81120, "train_speed(iter/s)": 0.286666 }, { "acc": 0.72507496, "epoch": 0.9075466237537522, "grad_norm": 6.65625, "learning_rate": 6.1668064573424105e-06, "loss": 1.08635855, "memory(GiB)": 142.32, "step": 81140, "train_speed(iter/s)": 0.286691 }, { "acc": 0.73724627, "epoch": 0.9077703226997107, "grad_norm": 6.0, "learning_rate": 6.1650080190679064e-06, "loss": 1.03735676, "memory(GiB)": 142.32, "step": 81160, "train_speed(iter/s)": 0.286716 }, { "acc": 0.73810072, "epoch": 0.9079940216456692, "grad_norm": 6.375, "learning_rate": 6.1632094214036534e-06, "loss": 1.01923466, "memory(GiB)": 142.32, "step": 81180, "train_speed(iter/s)": 0.286734 }, { "acc": 0.7445128, "epoch": 0.9082177205916278, "grad_norm": 5.53125, "learning_rate": 6.1614106645957265e-06, "loss": 1.01868486, "memory(GiB)": 142.32, "step": 81200, "train_speed(iter/s)": 0.286757 }, { "acc": 0.72432356, "epoch": 0.9084414195375863, "grad_norm": 6.6875, "learning_rate": 6.15961174889022e-06, "loss": 1.09940023, "memory(GiB)": 142.32, "step": 81220, "train_speed(iter/s)": 0.286781 }, { "acc": 0.73649664, "epoch": 0.9086651184835448, "grad_norm": 6.40625, "learning_rate": 6.15781267453325e-06, "loss": 1.05766001, "memory(GiB)": 142.32, "step": 81240, "train_speed(iter/s)": 0.286806 }, { "acc": 0.73670568, "epoch": 0.9088888174295033, "grad_norm": 5.40625, "learning_rate": 6.156013441770958e-06, "loss": 1.05249157, "memory(GiB)": 142.32, "step": 81260, "train_speed(iter/s)": 0.286827 }, { "acc": 0.73233919, "epoch": 0.9091125163754619, "grad_norm": 6.46875, "learning_rate": 6.154214050849504e-06, "loss": 1.07130222, "memory(GiB)": 142.32, "step": 81280, "train_speed(iter/s)": 0.286852 }, { "acc": 0.73023958, "epoch": 0.9093362153214204, "grad_norm": 6.125, "learning_rate": 6.152414502015071e-06, "loss": 1.07131443, "memory(GiB)": 142.32, "step": 81300, "train_speed(iter/s)": 0.286875 }, { "acc": 0.73504519, "epoch": 0.9095599142673789, "grad_norm": 5.9375, "learning_rate": 6.1506147955138615e-06, "loss": 1.06448555, "memory(GiB)": 142.32, "step": 81320, "train_speed(iter/s)": 0.286898 }, { "acc": 0.72804642, "epoch": 0.9097836132133374, "grad_norm": 6.53125, "learning_rate": 6.148814931592102e-06, "loss": 1.08576612, "memory(GiB)": 142.32, "step": 81340, "train_speed(iter/s)": 0.286922 }, { "acc": 0.73683853, "epoch": 0.910007312159296, "grad_norm": 7.78125, "learning_rate": 6.147014910496041e-06, "loss": 1.0620306, "memory(GiB)": 142.32, "step": 81360, "train_speed(iter/s)": 0.286944 }, { "acc": 0.72893591, "epoch": 0.9102310111052545, "grad_norm": 5.71875, "learning_rate": 6.1452147324719444e-06, "loss": 1.08866711, "memory(GiB)": 142.32, "step": 81380, "train_speed(iter/s)": 0.286966 }, { "acc": 0.73581648, "epoch": 0.910454710051213, "grad_norm": 6.3125, "learning_rate": 6.143414397766103e-06, "loss": 1.0446475, "memory(GiB)": 142.32, "step": 81400, "train_speed(iter/s)": 0.286989 }, { "acc": 0.73689022, "epoch": 0.9106784089971716, "grad_norm": 5.53125, "learning_rate": 6.14161390662483e-06, "loss": 1.04821758, "memory(GiB)": 142.32, "step": 81420, "train_speed(iter/s)": 0.287013 }, { "acc": 0.74222822, "epoch": 0.9109021079431301, "grad_norm": 7.5, "learning_rate": 6.139813259294456e-06, "loss": 1.01851501, "memory(GiB)": 142.32, "step": 81440, "train_speed(iter/s)": 0.287038 }, { "acc": 0.73650246, "epoch": 0.9111258068890886, "grad_norm": 5.875, "learning_rate": 6.138012456021337e-06, "loss": 1.05385742, "memory(GiB)": 142.32, "step": 81460, "train_speed(iter/s)": 0.287062 }, { "acc": 0.72560301, "epoch": 0.9113495058350471, "grad_norm": 6.03125, "learning_rate": 6.136211497051848e-06, "loss": 1.09177504, "memory(GiB)": 142.32, "step": 81480, "train_speed(iter/s)": 0.287087 }, { "acc": 0.7324892, "epoch": 0.9115732047810057, "grad_norm": 7.0, "learning_rate": 6.134410382632385e-06, "loss": 1.05530987, "memory(GiB)": 142.32, "step": 81500, "train_speed(iter/s)": 0.28711 }, { "acc": 0.73014221, "epoch": 0.9117969037269642, "grad_norm": 5.9375, "learning_rate": 6.13260911300937e-06, "loss": 1.0653553, "memory(GiB)": 142.32, "step": 81520, "train_speed(iter/s)": 0.287135 }, { "acc": 0.74189339, "epoch": 0.9120206026729227, "grad_norm": 6.0625, "learning_rate": 6.130807688429237e-06, "loss": 1.0241518, "memory(GiB)": 142.32, "step": 81540, "train_speed(iter/s)": 0.287159 }, { "acc": 0.74264631, "epoch": 0.9122443016188813, "grad_norm": 6.71875, "learning_rate": 6.12900610913845e-06, "loss": 1.02220497, "memory(GiB)": 142.32, "step": 81560, "train_speed(iter/s)": 0.287184 }, { "acc": 0.73408375, "epoch": 0.9124680005648398, "grad_norm": 5.96875, "learning_rate": 6.12720437538349e-06, "loss": 1.05405979, "memory(GiB)": 142.32, "step": 81580, "train_speed(iter/s)": 0.287207 }, { "acc": 0.73651333, "epoch": 0.9126916995107983, "grad_norm": 5.5, "learning_rate": 6.125402487410859e-06, "loss": 1.05003357, "memory(GiB)": 142.32, "step": 81600, "train_speed(iter/s)": 0.287233 }, { "acc": 0.72871313, "epoch": 0.9129153984567568, "grad_norm": 5.46875, "learning_rate": 6.123600445467085e-06, "loss": 1.07747498, "memory(GiB)": 142.32, "step": 81620, "train_speed(iter/s)": 0.287257 }, { "acc": 0.72343807, "epoch": 0.9131390974027154, "grad_norm": 5.96875, "learning_rate": 6.1217982497987075e-06, "loss": 1.1003067, "memory(GiB)": 142.32, "step": 81640, "train_speed(iter/s)": 0.287282 }, { "acc": 0.73905735, "epoch": 0.9133627963486739, "grad_norm": 6.46875, "learning_rate": 6.119995900652296e-06, "loss": 1.02139034, "memory(GiB)": 142.32, "step": 81660, "train_speed(iter/s)": 0.287305 }, { "acc": 0.72233553, "epoch": 0.9135864952946325, "grad_norm": 5.625, "learning_rate": 6.118193398274437e-06, "loss": 1.11057873, "memory(GiB)": 142.32, "step": 81680, "train_speed(iter/s)": 0.287328 }, { "acc": 0.72525196, "epoch": 0.9138101942405911, "grad_norm": 5.59375, "learning_rate": 6.116390742911738e-06, "loss": 1.09611092, "memory(GiB)": 142.32, "step": 81700, "train_speed(iter/s)": 0.287351 }, { "acc": 0.72847366, "epoch": 0.9140338931865496, "grad_norm": 5.65625, "learning_rate": 6.114587934810829e-06, "loss": 1.08922348, "memory(GiB)": 142.32, "step": 81720, "train_speed(iter/s)": 0.287376 }, { "acc": 0.73627863, "epoch": 0.9142575921325081, "grad_norm": 6.4375, "learning_rate": 6.112784974218358e-06, "loss": 1.05892124, "memory(GiB)": 142.32, "step": 81740, "train_speed(iter/s)": 0.2874 }, { "acc": 0.72872524, "epoch": 0.9144812910784667, "grad_norm": 5.90625, "learning_rate": 6.110981861380999e-06, "loss": 1.08314095, "memory(GiB)": 142.32, "step": 81760, "train_speed(iter/s)": 0.287422 }, { "acc": 0.72831488, "epoch": 0.9147049900244252, "grad_norm": 6.6875, "learning_rate": 6.109178596545441e-06, "loss": 1.09715652, "memory(GiB)": 142.32, "step": 81780, "train_speed(iter/s)": 0.287445 }, { "acc": 0.74042072, "epoch": 0.9149286889703837, "grad_norm": 6.78125, "learning_rate": 6.107375179958397e-06, "loss": 1.02840929, "memory(GiB)": 142.32, "step": 81800, "train_speed(iter/s)": 0.287469 }, { "acc": 0.73667831, "epoch": 0.9151523879163422, "grad_norm": 5.59375, "learning_rate": 6.105571611866601e-06, "loss": 1.03899174, "memory(GiB)": 142.32, "step": 81820, "train_speed(iter/s)": 0.287492 }, { "acc": 0.73214178, "epoch": 0.9153760868623008, "grad_norm": 5.84375, "learning_rate": 6.103767892516806e-06, "loss": 1.06888857, "memory(GiB)": 142.32, "step": 81840, "train_speed(iter/s)": 0.287514 }, { "acc": 0.726443, "epoch": 0.9155997858082593, "grad_norm": 5.6875, "learning_rate": 6.101964022155787e-06, "loss": 1.08794327, "memory(GiB)": 142.32, "step": 81860, "train_speed(iter/s)": 0.287537 }, { "acc": 0.74110155, "epoch": 0.9158234847542178, "grad_norm": 5.96875, "learning_rate": 6.100160001030337e-06, "loss": 1.03051262, "memory(GiB)": 142.32, "step": 81880, "train_speed(iter/s)": 0.28756 }, { "acc": 0.72359357, "epoch": 0.9160471837001763, "grad_norm": 5.65625, "learning_rate": 6.098355829387277e-06, "loss": 1.11386127, "memory(GiB)": 142.32, "step": 81900, "train_speed(iter/s)": 0.287584 }, { "acc": 0.73908195, "epoch": 0.9162708826461349, "grad_norm": 5.78125, "learning_rate": 6.0965515074734395e-06, "loss": 1.0551157, "memory(GiB)": 142.32, "step": 81920, "train_speed(iter/s)": 0.287605 }, { "acc": 0.73511572, "epoch": 0.9164945815920934, "grad_norm": 5.96875, "learning_rate": 6.094747035535683e-06, "loss": 1.06301403, "memory(GiB)": 142.32, "step": 81940, "train_speed(iter/s)": 0.287628 }, { "acc": 0.73559418, "epoch": 0.9167182805380519, "grad_norm": 5.34375, "learning_rate": 6.092942413820883e-06, "loss": 1.05869484, "memory(GiB)": 142.32, "step": 81960, "train_speed(iter/s)": 0.287652 }, { "acc": 0.73875885, "epoch": 0.9169419794840105, "grad_norm": 6.625, "learning_rate": 6.091137642575939e-06, "loss": 1.04168835, "memory(GiB)": 142.32, "step": 81980, "train_speed(iter/s)": 0.287675 }, { "acc": 0.74430943, "epoch": 0.917165678429969, "grad_norm": 5.125, "learning_rate": 6.08933272204777e-06, "loss": 1.01078224, "memory(GiB)": 142.32, "step": 82000, "train_speed(iter/s)": 0.287699 }, { "epoch": 0.917165678429969, "eval_acc": 0.695429145128958, "eval_loss": 1.0753705501556396, "eval_runtime": 2339.6344, "eval_samples_per_second": 32.177, "eval_steps_per_second": 16.089, "step": 82000 }, { "acc": 0.72764816, "epoch": 0.9173893773759275, "grad_norm": 5.375, "learning_rate": 6.087527652483315e-06, "loss": 1.09435587, "memory(GiB)": 142.32, "step": 82020, "train_speed(iter/s)": 0.28533 }, { "acc": 0.72228546, "epoch": 0.917613076321886, "grad_norm": 6.0, "learning_rate": 6.085722434129533e-06, "loss": 1.11963825, "memory(GiB)": 142.32, "step": 82040, "train_speed(iter/s)": 0.285352 }, { "acc": 0.73979092, "epoch": 0.9178367752678446, "grad_norm": 6.625, "learning_rate": 6.083917067233402e-06, "loss": 1.03493443, "memory(GiB)": 142.32, "step": 82060, "train_speed(iter/s)": 0.285375 }, { "acc": 0.73679695, "epoch": 0.9180604742138031, "grad_norm": 5.8125, "learning_rate": 6.082111552041925e-06, "loss": 1.04868584, "memory(GiB)": 142.32, "step": 82080, "train_speed(iter/s)": 0.2854 }, { "acc": 0.72247705, "epoch": 0.9182841731597616, "grad_norm": 5.46875, "learning_rate": 6.080305888802119e-06, "loss": 1.12427311, "memory(GiB)": 142.32, "step": 82100, "train_speed(iter/s)": 0.285424 }, { "acc": 0.72789493, "epoch": 0.9185078721057202, "grad_norm": 5.15625, "learning_rate": 6.078500077761027e-06, "loss": 1.1005106, "memory(GiB)": 142.32, "step": 82120, "train_speed(iter/s)": 0.285449 }, { "acc": 0.73676395, "epoch": 0.9187315710516787, "grad_norm": 5.65625, "learning_rate": 6.07669411916571e-06, "loss": 1.04824009, "memory(GiB)": 142.32, "step": 82140, "train_speed(iter/s)": 0.285471 }, { "acc": 0.724051, "epoch": 0.9189552699976372, "grad_norm": 7.65625, "learning_rate": 6.074888013263247e-06, "loss": 1.10841999, "memory(GiB)": 142.32, "step": 82160, "train_speed(iter/s)": 0.285495 }, { "acc": 0.73378959, "epoch": 0.9191789689435957, "grad_norm": 5.75, "learning_rate": 6.073081760300741e-06, "loss": 1.06656685, "memory(GiB)": 142.32, "step": 82180, "train_speed(iter/s)": 0.285518 }, { "acc": 0.73762283, "epoch": 0.9194026678895543, "grad_norm": 6.625, "learning_rate": 6.071275360525311e-06, "loss": 1.06003323, "memory(GiB)": 142.32, "step": 82200, "train_speed(iter/s)": 0.285542 }, { "acc": 0.7343132, "epoch": 0.9196263668355128, "grad_norm": 5.9375, "learning_rate": 6.069468814184101e-06, "loss": 1.05451765, "memory(GiB)": 142.32, "step": 82220, "train_speed(iter/s)": 0.285566 }, { "acc": 0.73395987, "epoch": 0.9198500657814713, "grad_norm": 5.84375, "learning_rate": 6.067662121524271e-06, "loss": 1.07895565, "memory(GiB)": 142.32, "step": 82240, "train_speed(iter/s)": 0.285589 }, { "acc": 0.73753138, "epoch": 0.9200737647274299, "grad_norm": 5.4375, "learning_rate": 6.0658552827930016e-06, "loss": 1.04250202, "memory(GiB)": 142.32, "step": 82260, "train_speed(iter/s)": 0.285614 }, { "acc": 0.73361378, "epoch": 0.9202974636733884, "grad_norm": 5.0, "learning_rate": 6.064048298237495e-06, "loss": 1.04944735, "memory(GiB)": 142.32, "step": 82280, "train_speed(iter/s)": 0.285637 }, { "acc": 0.74181604, "epoch": 0.9205211626193469, "grad_norm": 5.5, "learning_rate": 6.062241168104972e-06, "loss": 1.01510935, "memory(GiB)": 142.32, "step": 82300, "train_speed(iter/s)": 0.285661 }, { "acc": 0.73323536, "epoch": 0.9207448615653054, "grad_norm": 6.96875, "learning_rate": 6.0604338926426745e-06, "loss": 1.04944983, "memory(GiB)": 142.32, "step": 82320, "train_speed(iter/s)": 0.285684 }, { "acc": 0.72353597, "epoch": 0.920968560511264, "grad_norm": 6.0625, "learning_rate": 6.058626472097865e-06, "loss": 1.11207695, "memory(GiB)": 142.32, "step": 82340, "train_speed(iter/s)": 0.285709 }, { "acc": 0.72705951, "epoch": 0.9211922594572225, "grad_norm": 5.65625, "learning_rate": 6.0568189067178206e-06, "loss": 1.0799305, "memory(GiB)": 142.32, "step": 82360, "train_speed(iter/s)": 0.285731 }, { "acc": 0.74691486, "epoch": 0.921415958403181, "grad_norm": 5.40625, "learning_rate": 6.055011196749845e-06, "loss": 0.99538097, "memory(GiB)": 142.32, "step": 82380, "train_speed(iter/s)": 0.285753 }, { "acc": 0.73355665, "epoch": 0.9216396573491396, "grad_norm": 6.1875, "learning_rate": 6.053203342441259e-06, "loss": 1.05975056, "memory(GiB)": 142.32, "step": 82400, "train_speed(iter/s)": 0.285778 }, { "acc": 0.75111742, "epoch": 0.9218633562950981, "grad_norm": 6.34375, "learning_rate": 6.0513953440394e-06, "loss": 0.98526249, "memory(GiB)": 142.32, "step": 82420, "train_speed(iter/s)": 0.285803 }, { "acc": 0.74881945, "epoch": 0.9220870552410566, "grad_norm": 6.03125, "learning_rate": 6.049587201791631e-06, "loss": 1.00782423, "memory(GiB)": 142.32, "step": 82440, "train_speed(iter/s)": 0.285824 }, { "acc": 0.72783017, "epoch": 0.9223107541870151, "grad_norm": 7.09375, "learning_rate": 6.047778915945333e-06, "loss": 1.08921661, "memory(GiB)": 142.32, "step": 82460, "train_speed(iter/s)": 0.285849 }, { "acc": 0.72813253, "epoch": 0.9225344531329737, "grad_norm": 7.65625, "learning_rate": 6.0459704867479005e-06, "loss": 1.08805752, "memory(GiB)": 142.32, "step": 82480, "train_speed(iter/s)": 0.285874 }, { "acc": 0.73337164, "epoch": 0.9227581520789322, "grad_norm": 6.5, "learning_rate": 6.044161914446756e-06, "loss": 1.0706953, "memory(GiB)": 142.32, "step": 82500, "train_speed(iter/s)": 0.285899 }, { "acc": 0.73425202, "epoch": 0.9229818510248907, "grad_norm": 5.4375, "learning_rate": 6.042353199289337e-06, "loss": 1.06200533, "memory(GiB)": 142.32, "step": 82520, "train_speed(iter/s)": 0.285922 }, { "acc": 0.7299614, "epoch": 0.9232055499708492, "grad_norm": 6.4375, "learning_rate": 6.040544341523103e-06, "loss": 1.07276859, "memory(GiB)": 142.32, "step": 82540, "train_speed(iter/s)": 0.285946 }, { "acc": 0.72474689, "epoch": 0.9234292489168078, "grad_norm": 5.84375, "learning_rate": 6.038735341395528e-06, "loss": 1.09673538, "memory(GiB)": 142.32, "step": 82560, "train_speed(iter/s)": 0.285967 }, { "acc": 0.73213387, "epoch": 0.9236529478627663, "grad_norm": 5.90625, "learning_rate": 6.036926199154113e-06, "loss": 1.0689312, "memory(GiB)": 142.32, "step": 82580, "train_speed(iter/s)": 0.285992 }, { "acc": 0.72844787, "epoch": 0.9238766468087248, "grad_norm": 6.59375, "learning_rate": 6.035116915046372e-06, "loss": 1.10496864, "memory(GiB)": 142.32, "step": 82600, "train_speed(iter/s)": 0.286016 }, { "acc": 0.74156208, "epoch": 0.9241003457546834, "grad_norm": 6.03125, "learning_rate": 6.033307489319842e-06, "loss": 1.02534561, "memory(GiB)": 142.32, "step": 82620, "train_speed(iter/s)": 0.286041 }, { "acc": 0.73665457, "epoch": 0.9243240447006419, "grad_norm": 7.1875, "learning_rate": 6.031497922222077e-06, "loss": 1.05032921, "memory(GiB)": 142.32, "step": 82640, "train_speed(iter/s)": 0.286067 }, { "acc": 0.73425131, "epoch": 0.9245477436466004, "grad_norm": 6.28125, "learning_rate": 6.029688214000653e-06, "loss": 1.05131073, "memory(GiB)": 142.32, "step": 82660, "train_speed(iter/s)": 0.286092 }, { "acc": 0.72982578, "epoch": 0.924771442592559, "grad_norm": 4.53125, "learning_rate": 6.027878364903166e-06, "loss": 1.08630886, "memory(GiB)": 142.32, "step": 82680, "train_speed(iter/s)": 0.286116 }, { "acc": 0.73516884, "epoch": 0.9249951415385175, "grad_norm": 5.8125, "learning_rate": 6.0260683751772255e-06, "loss": 1.03784599, "memory(GiB)": 142.32, "step": 82700, "train_speed(iter/s)": 0.286137 }, { "acc": 0.72434287, "epoch": 0.925218840484476, "grad_norm": 5.1875, "learning_rate": 6.024258245070465e-06, "loss": 1.08707628, "memory(GiB)": 142.32, "step": 82720, "train_speed(iter/s)": 0.286159 }, { "acc": 0.73668213, "epoch": 0.9254425394304345, "grad_norm": 5.28125, "learning_rate": 6.022447974830535e-06, "loss": 1.04918594, "memory(GiB)": 142.32, "step": 82740, "train_speed(iter/s)": 0.286184 }, { "acc": 0.73171673, "epoch": 0.9256662383763931, "grad_norm": 4.78125, "learning_rate": 6.02063756470511e-06, "loss": 1.06108627, "memory(GiB)": 142.32, "step": 82760, "train_speed(iter/s)": 0.286206 }, { "acc": 0.7366262, "epoch": 0.9258899373223516, "grad_norm": 6.5625, "learning_rate": 6.0188270149418784e-06, "loss": 1.04317722, "memory(GiB)": 142.32, "step": 82780, "train_speed(iter/s)": 0.286229 }, { "acc": 0.72847099, "epoch": 0.9261136362683101, "grad_norm": 6.21875, "learning_rate": 6.017016325788547e-06, "loss": 1.07292719, "memory(GiB)": 142.32, "step": 82800, "train_speed(iter/s)": 0.286252 }, { "acc": 0.73607712, "epoch": 0.9263373352142686, "grad_norm": 5.90625, "learning_rate": 6.0152054974928465e-06, "loss": 1.04030361, "memory(GiB)": 142.32, "step": 82820, "train_speed(iter/s)": 0.286278 }, { "acc": 0.74017181, "epoch": 0.9265610341602272, "grad_norm": 6.1875, "learning_rate": 6.013394530302523e-06, "loss": 1.03637104, "memory(GiB)": 142.32, "step": 82840, "train_speed(iter/s)": 0.2863 }, { "acc": 0.71740847, "epoch": 0.9267847331061857, "grad_norm": 5.6875, "learning_rate": 6.011583424465344e-06, "loss": 1.14612789, "memory(GiB)": 142.32, "step": 82860, "train_speed(iter/s)": 0.286322 }, { "acc": 0.73120079, "epoch": 0.9270084320521442, "grad_norm": 7.5625, "learning_rate": 6.009772180229094e-06, "loss": 1.08174057, "memory(GiB)": 142.32, "step": 82880, "train_speed(iter/s)": 0.286346 }, { "acc": 0.7296803, "epoch": 0.9272321309981028, "grad_norm": 6.34375, "learning_rate": 6.007960797841575e-06, "loss": 1.08579826, "memory(GiB)": 142.32, "step": 82900, "train_speed(iter/s)": 0.286371 }, { "acc": 0.72563267, "epoch": 0.9274558299440613, "grad_norm": 5.75, "learning_rate": 6.006149277550613e-06, "loss": 1.09200821, "memory(GiB)": 142.32, "step": 82920, "train_speed(iter/s)": 0.286397 }, { "acc": 0.73837371, "epoch": 0.9276795288900198, "grad_norm": 6.0625, "learning_rate": 6.0043376196040485e-06, "loss": 1.0349287, "memory(GiB)": 142.32, "step": 82940, "train_speed(iter/s)": 0.28642 }, { "acc": 0.73154016, "epoch": 0.9279032278359783, "grad_norm": 6.125, "learning_rate": 6.002525824249741e-06, "loss": 1.07730141, "memory(GiB)": 142.32, "step": 82960, "train_speed(iter/s)": 0.286445 }, { "acc": 0.73835382, "epoch": 0.9281269267819369, "grad_norm": 6.59375, "learning_rate": 6.000713891735573e-06, "loss": 1.03153877, "memory(GiB)": 142.32, "step": 82980, "train_speed(iter/s)": 0.286472 }, { "acc": 0.74514103, "epoch": 0.9283506257278954, "grad_norm": 6.5625, "learning_rate": 5.998901822309441e-06, "loss": 1.00335884, "memory(GiB)": 142.32, "step": 83000, "train_speed(iter/s)": 0.286495 }, { "acc": 0.75031247, "epoch": 0.9285743246738539, "grad_norm": 5.8125, "learning_rate": 5.9970896162192614e-06, "loss": 0.98203945, "memory(GiB)": 142.32, "step": 83020, "train_speed(iter/s)": 0.286518 }, { "acc": 0.74524288, "epoch": 0.9287980236198125, "grad_norm": 5.25, "learning_rate": 5.9952772737129706e-06, "loss": 1.02909298, "memory(GiB)": 142.32, "step": 83040, "train_speed(iter/s)": 0.286539 }, { "acc": 0.73841624, "epoch": 0.929021722565771, "grad_norm": 6.5, "learning_rate": 5.993464795038523e-06, "loss": 1.03752575, "memory(GiB)": 142.32, "step": 83060, "train_speed(iter/s)": 0.286564 }, { "acc": 0.72943878, "epoch": 0.9292454215117295, "grad_norm": 5.4375, "learning_rate": 5.991652180443893e-06, "loss": 1.08648024, "memory(GiB)": 142.32, "step": 83080, "train_speed(iter/s)": 0.286587 }, { "acc": 0.72402325, "epoch": 0.929469120457688, "grad_norm": 6.21875, "learning_rate": 5.989839430177069e-06, "loss": 1.07764511, "memory(GiB)": 142.32, "step": 83100, "train_speed(iter/s)": 0.28661 }, { "acc": 0.74668365, "epoch": 0.9296928194036466, "grad_norm": 6.65625, "learning_rate": 5.988026544486063e-06, "loss": 1.00489225, "memory(GiB)": 142.32, "step": 83120, "train_speed(iter/s)": 0.286632 }, { "acc": 0.73422275, "epoch": 0.9299165183496051, "grad_norm": 5.65625, "learning_rate": 5.9862135236189045e-06, "loss": 1.05374746, "memory(GiB)": 142.32, "step": 83140, "train_speed(iter/s)": 0.286656 }, { "acc": 0.7315383, "epoch": 0.9301402172955636, "grad_norm": 5.5, "learning_rate": 5.98440036782364e-06, "loss": 1.08485231, "memory(GiB)": 142.32, "step": 83160, "train_speed(iter/s)": 0.286679 }, { "acc": 0.73895054, "epoch": 0.9303639162415221, "grad_norm": 6.96875, "learning_rate": 5.982587077348333e-06, "loss": 1.0327342, "memory(GiB)": 142.32, "step": 83180, "train_speed(iter/s)": 0.286701 }, { "acc": 0.73085623, "epoch": 0.9305876151874807, "grad_norm": 7.5, "learning_rate": 5.980773652441072e-06, "loss": 1.06790857, "memory(GiB)": 142.32, "step": 83200, "train_speed(iter/s)": 0.286727 }, { "acc": 0.73591442, "epoch": 0.9308113141334392, "grad_norm": 5.65625, "learning_rate": 5.978960093349955e-06, "loss": 1.06112194, "memory(GiB)": 142.32, "step": 83220, "train_speed(iter/s)": 0.286749 }, { "acc": 0.75285163, "epoch": 0.9310350130793977, "grad_norm": 6.8125, "learning_rate": 5.977146400323105e-06, "loss": 0.97879715, "memory(GiB)": 142.32, "step": 83240, "train_speed(iter/s)": 0.286772 }, { "acc": 0.74766526, "epoch": 0.9312587120253563, "grad_norm": 5.6875, "learning_rate": 5.975332573608661e-06, "loss": 0.98692865, "memory(GiB)": 142.32, "step": 83260, "train_speed(iter/s)": 0.286795 }, { "acc": 0.72305698, "epoch": 0.9314824109713148, "grad_norm": 7.875, "learning_rate": 5.97351861345478e-06, "loss": 1.10554562, "memory(GiB)": 142.32, "step": 83280, "train_speed(iter/s)": 0.28682 }, { "acc": 0.73402557, "epoch": 0.9317061099172733, "grad_norm": 6.25, "learning_rate": 5.971704520109638e-06, "loss": 1.05435457, "memory(GiB)": 142.32, "step": 83300, "train_speed(iter/s)": 0.286844 }, { "acc": 0.74252558, "epoch": 0.9319298088632318, "grad_norm": 5.21875, "learning_rate": 5.9698902938214285e-06, "loss": 1.0274951, "memory(GiB)": 142.32, "step": 83320, "train_speed(iter/s)": 0.286867 }, { "acc": 0.73644514, "epoch": 0.9321535078091904, "grad_norm": 4.78125, "learning_rate": 5.968075934838364e-06, "loss": 1.06314468, "memory(GiB)": 142.32, "step": 83340, "train_speed(iter/s)": 0.286889 }, { "acc": 0.73208728, "epoch": 0.9323772067551489, "grad_norm": 6.125, "learning_rate": 5.966261443408674e-06, "loss": 1.08766594, "memory(GiB)": 142.32, "step": 83360, "train_speed(iter/s)": 0.286913 }, { "acc": 0.73342781, "epoch": 0.9326009057011074, "grad_norm": 7.90625, "learning_rate": 5.964446819780608e-06, "loss": 1.07254467, "memory(GiB)": 142.32, "step": 83380, "train_speed(iter/s)": 0.286938 }, { "acc": 0.73717432, "epoch": 0.932824604647066, "grad_norm": 6.5, "learning_rate": 5.962632064202434e-06, "loss": 1.04901142, "memory(GiB)": 142.32, "step": 83400, "train_speed(iter/s)": 0.28696 }, { "acc": 0.7514472, "epoch": 0.9330483035930245, "grad_norm": 6.03125, "learning_rate": 5.960817176922432e-06, "loss": 0.9905653, "memory(GiB)": 142.32, "step": 83420, "train_speed(iter/s)": 0.286984 }, { "acc": 0.73608398, "epoch": 0.933272002538983, "grad_norm": 5.84375, "learning_rate": 5.959002158188907e-06, "loss": 1.04207344, "memory(GiB)": 142.32, "step": 83440, "train_speed(iter/s)": 0.28701 }, { "acc": 0.73381491, "epoch": 0.9334957014849415, "grad_norm": 6.25, "learning_rate": 5.9571870082501794e-06, "loss": 1.06779919, "memory(GiB)": 142.32, "step": 83460, "train_speed(iter/s)": 0.287035 }, { "acc": 0.74261703, "epoch": 0.9337194004309001, "grad_norm": 5.96875, "learning_rate": 5.9553717273545885e-06, "loss": 1.01840401, "memory(GiB)": 142.32, "step": 83480, "train_speed(iter/s)": 0.287058 }, { "acc": 0.72454352, "epoch": 0.9339430993768586, "grad_norm": 6.28125, "learning_rate": 5.953556315750491e-06, "loss": 1.0966877, "memory(GiB)": 142.32, "step": 83500, "train_speed(iter/s)": 0.287083 }, { "acc": 0.74033432, "epoch": 0.9341667983228171, "grad_norm": 7.0625, "learning_rate": 5.951740773686257e-06, "loss": 1.03944902, "memory(GiB)": 142.32, "step": 83520, "train_speed(iter/s)": 0.287107 }, { "acc": 0.73126421, "epoch": 0.9343904972687757, "grad_norm": 5.65625, "learning_rate": 5.949925101410284e-06, "loss": 1.09293785, "memory(GiB)": 142.32, "step": 83540, "train_speed(iter/s)": 0.287131 }, { "acc": 0.74443741, "epoch": 0.9346141962147342, "grad_norm": 5.8125, "learning_rate": 5.9481092991709785e-06, "loss": 1.0215704, "memory(GiB)": 142.32, "step": 83560, "train_speed(iter/s)": 0.287154 }, { "acc": 0.74285645, "epoch": 0.9348378951606927, "grad_norm": 6.6875, "learning_rate": 5.94629336721677e-06, "loss": 1.02720776, "memory(GiB)": 142.32, "step": 83580, "train_speed(iter/s)": 0.287177 }, { "acc": 0.73553114, "epoch": 0.9350615941066512, "grad_norm": 5.78125, "learning_rate": 5.944477305796104e-06, "loss": 1.07007761, "memory(GiB)": 142.32, "step": 83600, "train_speed(iter/s)": 0.2872 }, { "acc": 0.73159695, "epoch": 0.9352852930526098, "grad_norm": 7.5, "learning_rate": 5.942661115157441e-06, "loss": 1.05000648, "memory(GiB)": 142.32, "step": 83620, "train_speed(iter/s)": 0.28722 }, { "acc": 0.73894567, "epoch": 0.9355089919985683, "grad_norm": 6.28125, "learning_rate": 5.940844795549264e-06, "loss": 1.03461647, "memory(GiB)": 142.32, "step": 83640, "train_speed(iter/s)": 0.287241 }, { "acc": 0.72875738, "epoch": 0.9357326909445268, "grad_norm": 4.625, "learning_rate": 5.939028347220072e-06, "loss": 1.0815237, "memory(GiB)": 142.32, "step": 83660, "train_speed(iter/s)": 0.287264 }, { "acc": 0.73526754, "epoch": 0.9359563898904854, "grad_norm": 6.96875, "learning_rate": 5.93721177041838e-06, "loss": 1.05357075, "memory(GiB)": 142.32, "step": 83680, "train_speed(iter/s)": 0.287287 }, { "acc": 0.73681951, "epoch": 0.9361800888364439, "grad_norm": 7.25, "learning_rate": 5.935395065392723e-06, "loss": 1.04092064, "memory(GiB)": 142.32, "step": 83700, "train_speed(iter/s)": 0.287311 }, { "acc": 0.733881, "epoch": 0.9364037877824024, "grad_norm": 4.875, "learning_rate": 5.93357823239165e-06, "loss": 1.05761662, "memory(GiB)": 142.32, "step": 83720, "train_speed(iter/s)": 0.287335 }, { "acc": 0.72836723, "epoch": 0.9366274867283609, "grad_norm": 6.96875, "learning_rate": 5.931761271663732e-06, "loss": 1.07965994, "memory(GiB)": 142.32, "step": 83740, "train_speed(iter/s)": 0.287356 }, { "acc": 0.72978725, "epoch": 0.9368511856743195, "grad_norm": 5.625, "learning_rate": 5.929944183457552e-06, "loss": 1.07868433, "memory(GiB)": 142.32, "step": 83760, "train_speed(iter/s)": 0.287379 }, { "acc": 0.72361612, "epoch": 0.937074884620278, "grad_norm": 5.625, "learning_rate": 5.928126968021717e-06, "loss": 1.10943184, "memory(GiB)": 142.32, "step": 83780, "train_speed(iter/s)": 0.287404 }, { "acc": 0.73846459, "epoch": 0.9372985835662365, "grad_norm": 6.25, "learning_rate": 5.926309625604847e-06, "loss": 1.04078426, "memory(GiB)": 142.32, "step": 83800, "train_speed(iter/s)": 0.287425 }, { "acc": 0.74252825, "epoch": 0.937522282512195, "grad_norm": 5.9375, "learning_rate": 5.924492156455581e-06, "loss": 1.01415443, "memory(GiB)": 142.32, "step": 83820, "train_speed(iter/s)": 0.287447 }, { "acc": 0.73684673, "epoch": 0.9377459814581536, "grad_norm": 5.28125, "learning_rate": 5.9226745608225724e-06, "loss": 1.04342442, "memory(GiB)": 142.32, "step": 83840, "train_speed(iter/s)": 0.287473 }, { "acc": 0.7392025, "epoch": 0.9379696804041121, "grad_norm": 5.125, "learning_rate": 5.920856838954496e-06, "loss": 1.0382019, "memory(GiB)": 142.32, "step": 83860, "train_speed(iter/s)": 0.287496 }, { "acc": 0.74575224, "epoch": 0.9381933793500706, "grad_norm": 7.1875, "learning_rate": 5.9190389911000415e-06, "loss": 1.00934544, "memory(GiB)": 142.32, "step": 83880, "train_speed(iter/s)": 0.287521 }, { "acc": 0.72932968, "epoch": 0.9384170782960292, "grad_norm": 5.25, "learning_rate": 5.917221017507917e-06, "loss": 1.07642803, "memory(GiB)": 142.32, "step": 83900, "train_speed(iter/s)": 0.287545 }, { "acc": 0.73546114, "epoch": 0.9386407772419877, "grad_norm": 5.0, "learning_rate": 5.9154029184268495e-06, "loss": 1.06421089, "memory(GiB)": 142.32, "step": 83920, "train_speed(iter/s)": 0.287569 }, { "acc": 0.73323212, "epoch": 0.9388644761879462, "grad_norm": 6.65625, "learning_rate": 5.913584694105576e-06, "loss": 1.06552429, "memory(GiB)": 142.32, "step": 83940, "train_speed(iter/s)": 0.287589 }, { "acc": 0.74489694, "epoch": 0.9390881751339047, "grad_norm": 5.375, "learning_rate": 5.91176634479286e-06, "loss": 0.9947998, "memory(GiB)": 142.32, "step": 83960, "train_speed(iter/s)": 0.287613 }, { "acc": 0.72549162, "epoch": 0.9393118740798633, "grad_norm": 6.96875, "learning_rate": 5.9099478707374745e-06, "loss": 1.09869061, "memory(GiB)": 142.32, "step": 83980, "train_speed(iter/s)": 0.287633 }, { "acc": 0.72784061, "epoch": 0.9395355730258218, "grad_norm": 6.15625, "learning_rate": 5.908129272188215e-06, "loss": 1.0952323, "memory(GiB)": 142.32, "step": 84000, "train_speed(iter/s)": 0.287654 }, { "epoch": 0.9395355730258218, "eval_acc": 0.6954874630718757, "eval_loss": 1.0747733116149902, "eval_runtime": 2339.4641, "eval_samples_per_second": 32.18, "eval_steps_per_second": 16.09, "step": 84000 }, { "acc": 0.74580703, "epoch": 0.9397592719717803, "grad_norm": 6.34375, "learning_rate": 5.906310549393891e-06, "loss": 0.99190054, "memory(GiB)": 142.32, "step": 84020, "train_speed(iter/s)": 0.285341 }, { "acc": 0.74312243, "epoch": 0.9399829709177389, "grad_norm": 6.71875, "learning_rate": 5.904491702603329e-06, "loss": 1.02847118, "memory(GiB)": 142.32, "step": 84040, "train_speed(iter/s)": 0.285362 }, { "acc": 0.73436508, "epoch": 0.9402066698636974, "grad_norm": 7.0625, "learning_rate": 5.902672732065374e-06, "loss": 1.05332994, "memory(GiB)": 142.32, "step": 84060, "train_speed(iter/s)": 0.285383 }, { "acc": 0.74007483, "epoch": 0.9404303688096559, "grad_norm": 6.28125, "learning_rate": 5.9008536380288875e-06, "loss": 1.0374382, "memory(GiB)": 142.32, "step": 84080, "train_speed(iter/s)": 0.285406 }, { "acc": 0.74558487, "epoch": 0.9406540677556144, "grad_norm": 5.6875, "learning_rate": 5.899034420742746e-06, "loss": 1.00343189, "memory(GiB)": 142.32, "step": 84100, "train_speed(iter/s)": 0.285431 }, { "acc": 0.74655881, "epoch": 0.940877766701573, "grad_norm": 6.21875, "learning_rate": 5.897215080455848e-06, "loss": 0.99877796, "memory(GiB)": 142.32, "step": 84120, "train_speed(iter/s)": 0.285453 }, { "acc": 0.73305655, "epoch": 0.9411014656475315, "grad_norm": 5.6875, "learning_rate": 5.895395617417101e-06, "loss": 1.06526575, "memory(GiB)": 142.32, "step": 84140, "train_speed(iter/s)": 0.285478 }, { "acc": 0.73882294, "epoch": 0.94132516459349, "grad_norm": 5.25, "learning_rate": 5.893576031875435e-06, "loss": 1.0346323, "memory(GiB)": 142.32, "step": 84160, "train_speed(iter/s)": 0.285498 }, { "acc": 0.72113309, "epoch": 0.9415488635394486, "grad_norm": 6.25, "learning_rate": 5.891756324079797e-06, "loss": 1.10936785, "memory(GiB)": 142.32, "step": 84180, "train_speed(iter/s)": 0.285523 }, { "acc": 0.74503288, "epoch": 0.9417725624854072, "grad_norm": 6.375, "learning_rate": 5.889936494279147e-06, "loss": 0.99875956, "memory(GiB)": 142.32, "step": 84200, "train_speed(iter/s)": 0.285546 }, { "acc": 0.74434481, "epoch": 0.9419962614313657, "grad_norm": 6.3125, "learning_rate": 5.888116542722465e-06, "loss": 0.99784117, "memory(GiB)": 142.32, "step": 84220, "train_speed(iter/s)": 0.28557 }, { "acc": 0.71854644, "epoch": 0.9422199603773243, "grad_norm": 6.28125, "learning_rate": 5.886296469658746e-06, "loss": 1.14234495, "memory(GiB)": 142.32, "step": 84240, "train_speed(iter/s)": 0.285591 }, { "acc": 0.73171844, "epoch": 0.9424436593232828, "grad_norm": 6.53125, "learning_rate": 5.884476275337e-06, "loss": 1.07380753, "memory(GiB)": 142.32, "step": 84260, "train_speed(iter/s)": 0.285615 }, { "acc": 0.73109751, "epoch": 0.9426673582692413, "grad_norm": 5.75, "learning_rate": 5.8826559600062595e-06, "loss": 1.07419605, "memory(GiB)": 142.32, "step": 84280, "train_speed(iter/s)": 0.285638 }, { "acc": 0.72663574, "epoch": 0.9428910572151998, "grad_norm": 5.0625, "learning_rate": 5.880835523915565e-06, "loss": 1.08396664, "memory(GiB)": 142.32, "step": 84300, "train_speed(iter/s)": 0.285661 }, { "acc": 0.73224373, "epoch": 0.9431147561611584, "grad_norm": 5.6875, "learning_rate": 5.8790149673139855e-06, "loss": 1.07037888, "memory(GiB)": 142.32, "step": 84320, "train_speed(iter/s)": 0.285684 }, { "acc": 0.73177834, "epoch": 0.9433384551071169, "grad_norm": 4.75, "learning_rate": 5.8771942904505915e-06, "loss": 1.06749821, "memory(GiB)": 142.32, "step": 84340, "train_speed(iter/s)": 0.285708 }, { "acc": 0.72823458, "epoch": 0.9435621540530754, "grad_norm": 5.875, "learning_rate": 5.8753734935744814e-06, "loss": 1.07895012, "memory(GiB)": 142.32, "step": 84360, "train_speed(iter/s)": 0.285732 }, { "acc": 0.73341312, "epoch": 0.943785852999034, "grad_norm": 6.5625, "learning_rate": 5.8735525769347634e-06, "loss": 1.05647831, "memory(GiB)": 142.32, "step": 84380, "train_speed(iter/s)": 0.285756 }, { "acc": 0.73007488, "epoch": 0.9440095519449925, "grad_norm": 5.34375, "learning_rate": 5.8717315407805685e-06, "loss": 1.08354053, "memory(GiB)": 142.32, "step": 84400, "train_speed(iter/s)": 0.285782 }, { "acc": 0.73015227, "epoch": 0.944233250890951, "grad_norm": 6.53125, "learning_rate": 5.869910385361039e-06, "loss": 1.08813896, "memory(GiB)": 142.32, "step": 84420, "train_speed(iter/s)": 0.285806 }, { "acc": 0.73263159, "epoch": 0.9444569498369095, "grad_norm": 7.3125, "learning_rate": 5.868089110925335e-06, "loss": 1.04790001, "memory(GiB)": 142.32, "step": 84440, "train_speed(iter/s)": 0.285827 }, { "acc": 0.72645612, "epoch": 0.9446806487828681, "grad_norm": 5.8125, "learning_rate": 5.866267717722632e-06, "loss": 1.09278231, "memory(GiB)": 142.32, "step": 84460, "train_speed(iter/s)": 0.285851 }, { "acc": 0.73061018, "epoch": 0.9449043477288266, "grad_norm": 6.40625, "learning_rate": 5.864446206002124e-06, "loss": 1.08678665, "memory(GiB)": 142.32, "step": 84480, "train_speed(iter/s)": 0.285874 }, { "acc": 0.73205233, "epoch": 0.9451280466747851, "grad_norm": 6.375, "learning_rate": 5.862624576013019e-06, "loss": 1.06236534, "memory(GiB)": 142.32, "step": 84500, "train_speed(iter/s)": 0.285894 }, { "acc": 0.74182453, "epoch": 0.9453517456207436, "grad_norm": 6.4375, "learning_rate": 5.860802828004541e-06, "loss": 1.02648849, "memory(GiB)": 142.32, "step": 84520, "train_speed(iter/s)": 0.285918 }, { "acc": 0.72690821, "epoch": 0.9455754445667022, "grad_norm": 6.46875, "learning_rate": 5.858980962225935e-06, "loss": 1.09578562, "memory(GiB)": 142.32, "step": 84540, "train_speed(iter/s)": 0.285941 }, { "acc": 0.73035917, "epoch": 0.9457991435126607, "grad_norm": 6.15625, "learning_rate": 5.857158978926454e-06, "loss": 1.085709, "memory(GiB)": 142.32, "step": 84560, "train_speed(iter/s)": 0.285965 }, { "acc": 0.75124884, "epoch": 0.9460228424586192, "grad_norm": 6.5, "learning_rate": 5.855336878355373e-06, "loss": 0.97617702, "memory(GiB)": 142.32, "step": 84580, "train_speed(iter/s)": 0.285988 }, { "acc": 0.72505198, "epoch": 0.9462465414045778, "grad_norm": 7.75, "learning_rate": 5.853514660761982e-06, "loss": 1.10309238, "memory(GiB)": 142.32, "step": 84600, "train_speed(iter/s)": 0.28601 }, { "acc": 0.72051411, "epoch": 0.9464702403505363, "grad_norm": 5.21875, "learning_rate": 5.851692326395585e-06, "loss": 1.11009216, "memory(GiB)": 142.32, "step": 84620, "train_speed(iter/s)": 0.286033 }, { "acc": 0.73895884, "epoch": 0.9466939392964948, "grad_norm": 5.78125, "learning_rate": 5.8498698755055065e-06, "loss": 1.03309708, "memory(GiB)": 142.32, "step": 84640, "train_speed(iter/s)": 0.286057 }, { "acc": 0.74470792, "epoch": 0.9469176382424533, "grad_norm": 6.4375, "learning_rate": 5.84804730834108e-06, "loss": 1.01639442, "memory(GiB)": 142.32, "step": 84660, "train_speed(iter/s)": 0.28608 }, { "acc": 0.73437061, "epoch": 0.9471413371884119, "grad_norm": 6.59375, "learning_rate": 5.8462246251516594e-06, "loss": 1.04669075, "memory(GiB)": 142.32, "step": 84680, "train_speed(iter/s)": 0.286103 }, { "acc": 0.74485035, "epoch": 0.9473650361343704, "grad_norm": 5.25, "learning_rate": 5.844401826186616e-06, "loss": 1.00105877, "memory(GiB)": 142.32, "step": 84700, "train_speed(iter/s)": 0.286129 }, { "acc": 0.73195095, "epoch": 0.9475887350803289, "grad_norm": 6.65625, "learning_rate": 5.842578911695333e-06, "loss": 1.05416269, "memory(GiB)": 142.32, "step": 84720, "train_speed(iter/s)": 0.286154 }, { "acc": 0.74755397, "epoch": 0.9478124340262875, "grad_norm": 6.9375, "learning_rate": 5.840755881927213e-06, "loss": 0.99111891, "memory(GiB)": 142.32, "step": 84740, "train_speed(iter/s)": 0.286178 }, { "acc": 0.73359737, "epoch": 0.948036132972246, "grad_norm": 5.96875, "learning_rate": 5.838932737131669e-06, "loss": 1.05228844, "memory(GiB)": 142.32, "step": 84760, "train_speed(iter/s)": 0.2862 }, { "acc": 0.74313583, "epoch": 0.9482598319182045, "grad_norm": 5.3125, "learning_rate": 5.837109477558137e-06, "loss": 1.034618, "memory(GiB)": 142.32, "step": 84780, "train_speed(iter/s)": 0.286223 }, { "acc": 0.73655095, "epoch": 0.948483530864163, "grad_norm": 4.1875, "learning_rate": 5.835286103456063e-06, "loss": 1.05016193, "memory(GiB)": 142.32, "step": 84800, "train_speed(iter/s)": 0.286246 }, { "acc": 0.73615618, "epoch": 0.9487072298101216, "grad_norm": 6.5625, "learning_rate": 5.83346261507491e-06, "loss": 1.03629837, "memory(GiB)": 142.32, "step": 84820, "train_speed(iter/s)": 0.286268 }, { "acc": 0.73717756, "epoch": 0.9489309287560801, "grad_norm": 5.4375, "learning_rate": 5.831639012664161e-06, "loss": 1.0485075, "memory(GiB)": 142.32, "step": 84840, "train_speed(iter/s)": 0.28629 }, { "acc": 0.72358408, "epoch": 0.9491546277020386, "grad_norm": 6.59375, "learning_rate": 5.829815296473306e-06, "loss": 1.10402641, "memory(GiB)": 142.32, "step": 84860, "train_speed(iter/s)": 0.28631 }, { "acc": 0.73907471, "epoch": 0.9493783266479972, "grad_norm": 6.625, "learning_rate": 5.827991466751858e-06, "loss": 1.042449, "memory(GiB)": 142.32, "step": 84880, "train_speed(iter/s)": 0.286329 }, { "acc": 0.73913736, "epoch": 0.9496020255939557, "grad_norm": 7.03125, "learning_rate": 5.826167523749343e-06, "loss": 1.04258652, "memory(GiB)": 142.32, "step": 84900, "train_speed(iter/s)": 0.286353 }, { "acc": 0.74205894, "epoch": 0.9498257245399142, "grad_norm": 6.9375, "learning_rate": 5.824343467715302e-06, "loss": 1.02202148, "memory(GiB)": 142.32, "step": 84920, "train_speed(iter/s)": 0.286374 }, { "acc": 0.74256611, "epoch": 0.9500494234858727, "grad_norm": 7.375, "learning_rate": 5.8225192988992916e-06, "loss": 1.01763296, "memory(GiB)": 142.32, "step": 84940, "train_speed(iter/s)": 0.286397 }, { "acc": 0.73417473, "epoch": 0.9502731224318313, "grad_norm": 6.9375, "learning_rate": 5.820695017550886e-06, "loss": 1.06056709, "memory(GiB)": 142.32, "step": 84960, "train_speed(iter/s)": 0.286421 }, { "acc": 0.74567165, "epoch": 0.9504968213777898, "grad_norm": 6.78125, "learning_rate": 5.81887062391967e-06, "loss": 1.00369377, "memory(GiB)": 142.32, "step": 84980, "train_speed(iter/s)": 0.286445 }, { "acc": 0.73138523, "epoch": 0.9507205203237483, "grad_norm": 4.8125, "learning_rate": 5.817046118255249e-06, "loss": 1.0761076, "memory(GiB)": 142.32, "step": 85000, "train_speed(iter/s)": 0.28647 }, { "acc": 0.75673518, "epoch": 0.9509442192697068, "grad_norm": 5.1875, "learning_rate": 5.81522150080724e-06, "loss": 0.95677719, "memory(GiB)": 142.32, "step": 85020, "train_speed(iter/s)": 0.286493 }, { "acc": 0.73346844, "epoch": 0.9511679182156654, "grad_norm": 7.6875, "learning_rate": 5.813396771825278e-06, "loss": 1.06591072, "memory(GiB)": 142.32, "step": 85040, "train_speed(iter/s)": 0.286517 }, { "acc": 0.7447443, "epoch": 0.9513916171616239, "grad_norm": 6.46875, "learning_rate": 5.811571931559012e-06, "loss": 1.00538635, "memory(GiB)": 142.32, "step": 85060, "train_speed(iter/s)": 0.286539 }, { "acc": 0.72419715, "epoch": 0.9516153161075824, "grad_norm": 5.9375, "learning_rate": 5.8097469802581055e-06, "loss": 1.1287859, "memory(GiB)": 142.32, "step": 85080, "train_speed(iter/s)": 0.286562 }, { "acc": 0.73538966, "epoch": 0.951839015053541, "grad_norm": 6.9375, "learning_rate": 5.807921918172238e-06, "loss": 1.05636616, "memory(GiB)": 142.32, "step": 85100, "train_speed(iter/s)": 0.286585 }, { "acc": 0.73053741, "epoch": 0.9520627139994995, "grad_norm": 5.375, "learning_rate": 5.806096745551104e-06, "loss": 1.08943825, "memory(GiB)": 142.32, "step": 85120, "train_speed(iter/s)": 0.28661 }, { "acc": 0.72610226, "epoch": 0.952286412945458, "grad_norm": 6.28125, "learning_rate": 5.804271462644413e-06, "loss": 1.1072628, "memory(GiB)": 142.32, "step": 85140, "train_speed(iter/s)": 0.286633 }, { "acc": 0.73852081, "epoch": 0.9525101118914165, "grad_norm": 5.875, "learning_rate": 5.80244606970189e-06, "loss": 1.0439497, "memory(GiB)": 142.32, "step": 85160, "train_speed(iter/s)": 0.286657 }, { "acc": 0.73320036, "epoch": 0.9527338108373751, "grad_norm": 7.03125, "learning_rate": 5.8006205669732775e-06, "loss": 1.06005621, "memory(GiB)": 142.32, "step": 85180, "train_speed(iter/s)": 0.286682 }, { "acc": 0.73029394, "epoch": 0.9529575097833336, "grad_norm": 5.84375, "learning_rate": 5.798794954708326e-06, "loss": 1.07217321, "memory(GiB)": 142.32, "step": 85200, "train_speed(iter/s)": 0.286706 }, { "acc": 0.74644432, "epoch": 0.9531812087292921, "grad_norm": 6.0, "learning_rate": 5.796969233156807e-06, "loss": 1.01574249, "memory(GiB)": 142.32, "step": 85220, "train_speed(iter/s)": 0.286728 }, { "acc": 0.74946079, "epoch": 0.9534049076752507, "grad_norm": 7.75, "learning_rate": 5.795143402568506e-06, "loss": 0.98784294, "memory(GiB)": 142.32, "step": 85240, "train_speed(iter/s)": 0.286754 }, { "acc": 0.73294258, "epoch": 0.9536286066212092, "grad_norm": 5.59375, "learning_rate": 5.793317463193222e-06, "loss": 1.06426239, "memory(GiB)": 142.32, "step": 85260, "train_speed(iter/s)": 0.286778 }, { "acc": 0.74261169, "epoch": 0.9538523055671677, "grad_norm": 6.75, "learning_rate": 5.791491415280772e-06, "loss": 1.02816133, "memory(GiB)": 142.32, "step": 85280, "train_speed(iter/s)": 0.286801 }, { "acc": 0.72012076, "epoch": 0.9540760045131262, "grad_norm": 6.15625, "learning_rate": 5.789665259080981e-06, "loss": 1.12572412, "memory(GiB)": 142.32, "step": 85300, "train_speed(iter/s)": 0.286824 }, { "acc": 0.73537421, "epoch": 0.9542997034590848, "grad_norm": 6.875, "learning_rate": 5.787838994843696e-06, "loss": 1.04381943, "memory(GiB)": 142.32, "step": 85320, "train_speed(iter/s)": 0.286849 }, { "acc": 0.74703264, "epoch": 0.9545234024050433, "grad_norm": 6.53125, "learning_rate": 5.786012622818776e-06, "loss": 0.99208183, "memory(GiB)": 142.32, "step": 85340, "train_speed(iter/s)": 0.286873 }, { "acc": 0.7338233, "epoch": 0.9547471013510018, "grad_norm": 5.75, "learning_rate": 5.784186143256094e-06, "loss": 1.05211716, "memory(GiB)": 142.32, "step": 85360, "train_speed(iter/s)": 0.286896 }, { "acc": 0.74106112, "epoch": 0.9549708002969604, "grad_norm": 7.34375, "learning_rate": 5.782359556405541e-06, "loss": 1.01320496, "memory(GiB)": 142.32, "step": 85380, "train_speed(iter/s)": 0.286917 }, { "acc": 0.72415891, "epoch": 0.9551944992429189, "grad_norm": 5.90625, "learning_rate": 5.780532862517016e-06, "loss": 1.10383348, "memory(GiB)": 142.32, "step": 85400, "train_speed(iter/s)": 0.286939 }, { "acc": 0.73267126, "epoch": 0.9554181981888774, "grad_norm": 5.71875, "learning_rate": 5.77870606184044e-06, "loss": 1.07505531, "memory(GiB)": 142.32, "step": 85420, "train_speed(iter/s)": 0.286962 }, { "acc": 0.72995672, "epoch": 0.9556418971348359, "grad_norm": 6.59375, "learning_rate": 5.776879154625744e-06, "loss": 1.07616339, "memory(GiB)": 142.32, "step": 85440, "train_speed(iter/s)": 0.286988 }, { "acc": 0.73697839, "epoch": 0.9558655960807945, "grad_norm": 5.25, "learning_rate": 5.775052141122876e-06, "loss": 1.05089531, "memory(GiB)": 142.32, "step": 85460, "train_speed(iter/s)": 0.287012 }, { "acc": 0.72207565, "epoch": 0.956089295026753, "grad_norm": 6.15625, "learning_rate": 5.773225021581797e-06, "loss": 1.11765032, "memory(GiB)": 142.32, "step": 85480, "train_speed(iter/s)": 0.287034 }, { "acc": 0.71872349, "epoch": 0.9563129939727115, "grad_norm": 6.0625, "learning_rate": 5.771397796252485e-06, "loss": 1.11946335, "memory(GiB)": 142.32, "step": 85500, "train_speed(iter/s)": 0.287059 }, { "acc": 0.73244276, "epoch": 0.95653669291867, "grad_norm": 5.59375, "learning_rate": 5.769570465384926e-06, "loss": 1.04238873, "memory(GiB)": 142.32, "step": 85520, "train_speed(iter/s)": 0.287082 }, { "acc": 0.73497257, "epoch": 0.9567603918646286, "grad_norm": 5.71875, "learning_rate": 5.767743029229128e-06, "loss": 1.05362749, "memory(GiB)": 142.32, "step": 85540, "train_speed(iter/s)": 0.287103 }, { "acc": 0.7399168, "epoch": 0.9569840908105871, "grad_norm": 5.4375, "learning_rate": 5.76591548803511e-06, "loss": 1.04139118, "memory(GiB)": 142.32, "step": 85560, "train_speed(iter/s)": 0.287127 }, { "acc": 0.73834114, "epoch": 0.9572077897565456, "grad_norm": 5.0, "learning_rate": 5.764087842052906e-06, "loss": 1.03759823, "memory(GiB)": 142.32, "step": 85580, "train_speed(iter/s)": 0.287149 }, { "acc": 0.73298573, "epoch": 0.9574314887025042, "grad_norm": 5.75, "learning_rate": 5.762260091532564e-06, "loss": 1.04599342, "memory(GiB)": 142.32, "step": 85600, "train_speed(iter/s)": 0.287173 }, { "acc": 0.7295022, "epoch": 0.9576551876484627, "grad_norm": 7.1875, "learning_rate": 5.760432236724146e-06, "loss": 1.08535109, "memory(GiB)": 142.32, "step": 85620, "train_speed(iter/s)": 0.287196 }, { "acc": 0.72883978, "epoch": 0.9578788865944212, "grad_norm": 6.53125, "learning_rate": 5.75860427787773e-06, "loss": 1.0767868, "memory(GiB)": 142.32, "step": 85640, "train_speed(iter/s)": 0.287218 }, { "acc": 0.73369298, "epoch": 0.9581025855403797, "grad_norm": 6.40625, "learning_rate": 5.756776215243404e-06, "loss": 1.05984554, "memory(GiB)": 142.32, "step": 85660, "train_speed(iter/s)": 0.287242 }, { "acc": 0.73775287, "epoch": 0.9583262844863383, "grad_norm": 6.21875, "learning_rate": 5.754948049071276e-06, "loss": 1.03797798, "memory(GiB)": 142.32, "step": 85680, "train_speed(iter/s)": 0.287266 }, { "acc": 0.73833513, "epoch": 0.9585499834322968, "grad_norm": 5.34375, "learning_rate": 5.7531197796114645e-06, "loss": 1.02758856, "memory(GiB)": 142.32, "step": 85700, "train_speed(iter/s)": 0.287289 }, { "acc": 0.73070116, "epoch": 0.9587736823782553, "grad_norm": 6.5625, "learning_rate": 5.7512914071141014e-06, "loss": 1.05968285, "memory(GiB)": 142.32, "step": 85720, "train_speed(iter/s)": 0.287313 }, { "acc": 0.73861475, "epoch": 0.9589973813242139, "grad_norm": 5.4375, "learning_rate": 5.749462931829336e-06, "loss": 1.03448992, "memory(GiB)": 142.32, "step": 85740, "train_speed(iter/s)": 0.287334 }, { "acc": 0.73513393, "epoch": 0.9592210802701724, "grad_norm": 5.0625, "learning_rate": 5.74763435400733e-06, "loss": 1.02937775, "memory(GiB)": 142.32, "step": 85760, "train_speed(iter/s)": 0.28736 }, { "acc": 0.74027281, "epoch": 0.9594447792161309, "grad_norm": 5.71875, "learning_rate": 5.745805673898257e-06, "loss": 1.03834391, "memory(GiB)": 142.32, "step": 85780, "train_speed(iter/s)": 0.287385 }, { "acc": 0.71890249, "epoch": 0.9596684781620894, "grad_norm": 5.25, "learning_rate": 5.743976891752309e-06, "loss": 1.13980646, "memory(GiB)": 142.32, "step": 85800, "train_speed(iter/s)": 0.287406 }, { "acc": 0.72533712, "epoch": 0.959892177108048, "grad_norm": 5.84375, "learning_rate": 5.742148007819688e-06, "loss": 1.09633007, "memory(GiB)": 142.32, "step": 85820, "train_speed(iter/s)": 0.28743 }, { "acc": 0.72963662, "epoch": 0.9601158760540065, "grad_norm": 5.9375, "learning_rate": 5.740319022350611e-06, "loss": 1.07483082, "memory(GiB)": 142.32, "step": 85840, "train_speed(iter/s)": 0.287452 }, { "acc": 0.73084078, "epoch": 0.960339574999965, "grad_norm": 5.0625, "learning_rate": 5.738489935595311e-06, "loss": 1.07359867, "memory(GiB)": 142.32, "step": 85860, "train_speed(iter/s)": 0.287475 }, { "acc": 0.74329863, "epoch": 0.9605632739459236, "grad_norm": 6.46875, "learning_rate": 5.7366607478040304e-06, "loss": 1.02501431, "memory(GiB)": 142.32, "step": 85880, "train_speed(iter/s)": 0.287497 }, { "acc": 0.73773799, "epoch": 0.9607869728918821, "grad_norm": 4.75, "learning_rate": 5.734831459227032e-06, "loss": 1.04910984, "memory(GiB)": 142.32, "step": 85900, "train_speed(iter/s)": 0.287519 }, { "acc": 0.73961315, "epoch": 0.9610106718378406, "grad_norm": 6.65625, "learning_rate": 5.7330020701145876e-06, "loss": 1.03946037, "memory(GiB)": 142.32, "step": 85920, "train_speed(iter/s)": 0.287546 }, { "acc": 0.74746809, "epoch": 0.9612343707837991, "grad_norm": 6.59375, "learning_rate": 5.7311725807169815e-06, "loss": 0.99747028, "memory(GiB)": 142.32, "step": 85940, "train_speed(iter/s)": 0.287567 }, { "acc": 0.73272529, "epoch": 0.9614580697297577, "grad_norm": 7.15625, "learning_rate": 5.729342991284516e-06, "loss": 1.06955509, "memory(GiB)": 142.32, "step": 85960, "train_speed(iter/s)": 0.287591 }, { "acc": 0.74757185, "epoch": 0.9616817686757162, "grad_norm": 6.4375, "learning_rate": 5.727513302067504e-06, "loss": 0.98941698, "memory(GiB)": 142.32, "step": 85980, "train_speed(iter/s)": 0.287616 }, { "acc": 0.73193893, "epoch": 0.9619054676216747, "grad_norm": 5.3125, "learning_rate": 5.725683513316276e-06, "loss": 1.0736187, "memory(GiB)": 142.32, "step": 86000, "train_speed(iter/s)": 0.287637 }, { "epoch": 0.9619054676216747, "eval_acc": 0.6955924156504654, "eval_loss": 1.0742322206497192, "eval_runtime": 2339.4772, "eval_samples_per_second": 32.179, "eval_steps_per_second": 16.09, "step": 86000 }, { "acc": 0.73613052, "epoch": 0.9621291665676333, "grad_norm": 6.84375, "learning_rate": 5.7238536252811685e-06, "loss": 1.04393187, "memory(GiB)": 142.32, "step": 86020, "train_speed(iter/s)": 0.28538 }, { "acc": 0.73334541, "epoch": 0.9623528655135918, "grad_norm": 5.59375, "learning_rate": 5.722023638212539e-06, "loss": 1.06102676, "memory(GiB)": 142.32, "step": 86040, "train_speed(iter/s)": 0.285401 }, { "acc": 0.74451857, "epoch": 0.9625765644595503, "grad_norm": 7.1875, "learning_rate": 5.720193552360757e-06, "loss": 1.00924873, "memory(GiB)": 142.32, "step": 86060, "train_speed(iter/s)": 0.285425 }, { "acc": 0.73122854, "epoch": 0.9628002634055088, "grad_norm": 5.84375, "learning_rate": 5.718363367976202e-06, "loss": 1.08051128, "memory(GiB)": 142.32, "step": 86080, "train_speed(iter/s)": 0.285448 }, { "acc": 0.73462782, "epoch": 0.9630239623514674, "grad_norm": 6.0625, "learning_rate": 5.716533085309272e-06, "loss": 1.06419735, "memory(GiB)": 142.32, "step": 86100, "train_speed(iter/s)": 0.285468 }, { "acc": 0.73127203, "epoch": 0.9632476612974259, "grad_norm": 5.5625, "learning_rate": 5.714702704610373e-06, "loss": 1.0721796, "memory(GiB)": 142.32, "step": 86120, "train_speed(iter/s)": 0.285492 }, { "acc": 0.74271317, "epoch": 0.9634713602433844, "grad_norm": 6.75, "learning_rate": 5.712872226129929e-06, "loss": 1.02482681, "memory(GiB)": 142.32, "step": 86140, "train_speed(iter/s)": 0.285516 }, { "acc": 0.7244235, "epoch": 0.963695059189343, "grad_norm": 6.15625, "learning_rate": 5.711041650118374e-06, "loss": 1.09954433, "memory(GiB)": 142.32, "step": 86160, "train_speed(iter/s)": 0.285539 }, { "acc": 0.7391818, "epoch": 0.9639187581353015, "grad_norm": 5.3125, "learning_rate": 5.70921097682616e-06, "loss": 1.01973858, "memory(GiB)": 142.32, "step": 86180, "train_speed(iter/s)": 0.285562 }, { "acc": 0.74114132, "epoch": 0.96414245708126, "grad_norm": 5.96875, "learning_rate": 5.707380206503745e-06, "loss": 1.00899296, "memory(GiB)": 142.32, "step": 86200, "train_speed(iter/s)": 0.285585 }, { "acc": 0.72735505, "epoch": 0.9643661560272185, "grad_norm": 6.03125, "learning_rate": 5.705549339401609e-06, "loss": 1.08250637, "memory(GiB)": 142.32, "step": 86220, "train_speed(iter/s)": 0.285609 }, { "acc": 0.7365262, "epoch": 0.9645898549731771, "grad_norm": 8.4375, "learning_rate": 5.703718375770239e-06, "loss": 1.03432369, "memory(GiB)": 142.32, "step": 86240, "train_speed(iter/s)": 0.285633 }, { "acc": 0.73746662, "epoch": 0.9648135539191356, "grad_norm": 6.875, "learning_rate": 5.701887315860135e-06, "loss": 1.059972, "memory(GiB)": 142.32, "step": 86260, "train_speed(iter/s)": 0.285651 }, { "acc": 0.74163256, "epoch": 0.9650372528650941, "grad_norm": 6.1875, "learning_rate": 5.7000561599218155e-06, "loss": 1.022822, "memory(GiB)": 142.32, "step": 86280, "train_speed(iter/s)": 0.285671 }, { "acc": 0.73672905, "epoch": 0.9652609518110526, "grad_norm": 6.25, "learning_rate": 5.698224908205805e-06, "loss": 1.03625946, "memory(GiB)": 142.32, "step": 86300, "train_speed(iter/s)": 0.285695 }, { "acc": 0.73008509, "epoch": 0.9654846507570112, "grad_norm": 5.71875, "learning_rate": 5.69639356096265e-06, "loss": 1.10170193, "memory(GiB)": 142.32, "step": 86320, "train_speed(iter/s)": 0.285719 }, { "acc": 0.73890715, "epoch": 0.9657083497029697, "grad_norm": 6.4375, "learning_rate": 5.6945621184429005e-06, "loss": 1.03049774, "memory(GiB)": 142.32, "step": 86340, "train_speed(iter/s)": 0.285743 }, { "acc": 0.7269927, "epoch": 0.9659320486489282, "grad_norm": 6.5625, "learning_rate": 5.692730580897126e-06, "loss": 1.1144536, "memory(GiB)": 142.32, "step": 86360, "train_speed(iter/s)": 0.285765 }, { "acc": 0.74164209, "epoch": 0.9661557475948868, "grad_norm": 6.59375, "learning_rate": 5.690898948575906e-06, "loss": 1.03247452, "memory(GiB)": 142.32, "step": 86380, "train_speed(iter/s)": 0.285788 }, { "acc": 0.72603922, "epoch": 0.9663794465408453, "grad_norm": 6.09375, "learning_rate": 5.689067221729835e-06, "loss": 1.09433184, "memory(GiB)": 142.32, "step": 86400, "train_speed(iter/s)": 0.285811 }, { "acc": 0.73951955, "epoch": 0.9666031454868038, "grad_norm": 7.28125, "learning_rate": 5.68723540060952e-06, "loss": 1.01726093, "memory(GiB)": 142.32, "step": 86420, "train_speed(iter/s)": 0.285835 }, { "acc": 0.72388864, "epoch": 0.9668268444327623, "grad_norm": 6.53125, "learning_rate": 5.685403485465578e-06, "loss": 1.0925993, "memory(GiB)": 142.32, "step": 86440, "train_speed(iter/s)": 0.285859 }, { "acc": 0.73453293, "epoch": 0.9670505433787209, "grad_norm": 4.90625, "learning_rate": 5.683571476548643e-06, "loss": 1.06933393, "memory(GiB)": 142.32, "step": 86460, "train_speed(iter/s)": 0.285881 }, { "acc": 0.73840113, "epoch": 0.9672742423246794, "grad_norm": 5.625, "learning_rate": 5.681739374109359e-06, "loss": 1.04099598, "memory(GiB)": 142.32, "step": 86480, "train_speed(iter/s)": 0.285904 }, { "acc": 0.7445632, "epoch": 0.9674979412706379, "grad_norm": 6.1875, "learning_rate": 5.679907178398385e-06, "loss": 1.01796227, "memory(GiB)": 142.32, "step": 86500, "train_speed(iter/s)": 0.285925 }, { "acc": 0.73408737, "epoch": 0.9677216402165965, "grad_norm": 5.875, "learning_rate": 5.67807488966639e-06, "loss": 1.05422916, "memory(GiB)": 142.32, "step": 86520, "train_speed(iter/s)": 0.285946 }, { "acc": 0.73511391, "epoch": 0.967945339162555, "grad_norm": 5.6875, "learning_rate": 5.67624250816406e-06, "loss": 1.04886723, "memory(GiB)": 142.32, "step": 86540, "train_speed(iter/s)": 0.285969 }, { "acc": 0.74295979, "epoch": 0.9681690381085135, "grad_norm": 7.875, "learning_rate": 5.674410034142087e-06, "loss": 1.02083321, "memory(GiB)": 142.32, "step": 86560, "train_speed(iter/s)": 0.285992 }, { "acc": 0.7316514, "epoch": 0.968392737054472, "grad_norm": 5.03125, "learning_rate": 5.672577467851184e-06, "loss": 1.06554651, "memory(GiB)": 142.32, "step": 86580, "train_speed(iter/s)": 0.286015 }, { "acc": 0.74638453, "epoch": 0.9686164360004306, "grad_norm": 6.40625, "learning_rate": 5.670744809542068e-06, "loss": 1.00442724, "memory(GiB)": 142.32, "step": 86600, "train_speed(iter/s)": 0.286039 }, { "acc": 0.7280529, "epoch": 0.9688401349463891, "grad_norm": 5.625, "learning_rate": 5.668912059465477e-06, "loss": 1.08360548, "memory(GiB)": 142.32, "step": 86620, "train_speed(iter/s)": 0.286061 }, { "acc": 0.74208083, "epoch": 0.9690638338923476, "grad_norm": 7.34375, "learning_rate": 5.667079217872153e-06, "loss": 1.01623116, "memory(GiB)": 142.32, "step": 86640, "train_speed(iter/s)": 0.286083 }, { "acc": 0.73366747, "epoch": 0.9692875328383062, "grad_norm": 5.75, "learning_rate": 5.665246285012858e-06, "loss": 1.06010084, "memory(GiB)": 142.32, "step": 86660, "train_speed(iter/s)": 0.286104 }, { "acc": 0.74333124, "epoch": 0.9695112317842647, "grad_norm": 5.65625, "learning_rate": 5.663413261138364e-06, "loss": 1.01681099, "memory(GiB)": 142.32, "step": 86680, "train_speed(iter/s)": 0.286127 }, { "acc": 0.73931799, "epoch": 0.9697349307302232, "grad_norm": 5.375, "learning_rate": 5.661580146499452e-06, "loss": 1.03775711, "memory(GiB)": 142.32, "step": 86700, "train_speed(iter/s)": 0.286148 }, { "acc": 0.72579937, "epoch": 0.9699586296761818, "grad_norm": 5.46875, "learning_rate": 5.659746941346919e-06, "loss": 1.08972054, "memory(GiB)": 142.32, "step": 86720, "train_speed(iter/s)": 0.28617 }, { "acc": 0.72495537, "epoch": 0.9701823286221404, "grad_norm": 6.28125, "learning_rate": 5.657913645931578e-06, "loss": 1.09788332, "memory(GiB)": 142.32, "step": 86740, "train_speed(iter/s)": 0.286194 }, { "acc": 0.73714008, "epoch": 0.9704060275680989, "grad_norm": 6.84375, "learning_rate": 5.6560802605042445e-06, "loss": 1.0533124, "memory(GiB)": 142.32, "step": 86760, "train_speed(iter/s)": 0.286218 }, { "acc": 0.74234772, "epoch": 0.9706297265140574, "grad_norm": 6.78125, "learning_rate": 5.6542467853157525e-06, "loss": 1.01791048, "memory(GiB)": 142.32, "step": 86780, "train_speed(iter/s)": 0.28624 }, { "acc": 0.73797789, "epoch": 0.970853425460016, "grad_norm": 5.5625, "learning_rate": 5.65241322061695e-06, "loss": 1.04677534, "memory(GiB)": 142.32, "step": 86800, "train_speed(iter/s)": 0.286263 }, { "acc": 0.73389273, "epoch": 0.9710771244059745, "grad_norm": 6.34375, "learning_rate": 5.650579566658694e-06, "loss": 1.0658596, "memory(GiB)": 142.32, "step": 86820, "train_speed(iter/s)": 0.286285 }, { "acc": 0.73155441, "epoch": 0.971300823351933, "grad_norm": 7.3125, "learning_rate": 5.6487458236918545e-06, "loss": 1.06561003, "memory(GiB)": 142.32, "step": 86840, "train_speed(iter/s)": 0.286309 }, { "acc": 0.73169136, "epoch": 0.9715245222978915, "grad_norm": 6.0625, "learning_rate": 5.646911991967313e-06, "loss": 1.05882568, "memory(GiB)": 142.32, "step": 86860, "train_speed(iter/s)": 0.286333 }, { "acc": 0.73232803, "epoch": 0.9717482212438501, "grad_norm": 5.21875, "learning_rate": 5.645078071735964e-06, "loss": 1.0737484, "memory(GiB)": 142.32, "step": 86880, "train_speed(iter/s)": 0.286353 }, { "acc": 0.73510017, "epoch": 0.9719719201898086, "grad_norm": 5.875, "learning_rate": 5.643244063248715e-06, "loss": 1.05874462, "memory(GiB)": 142.32, "step": 86900, "train_speed(iter/s)": 0.286377 }, { "acc": 0.72932572, "epoch": 0.9721956191357671, "grad_norm": 6.03125, "learning_rate": 5.641409966756483e-06, "loss": 1.07492599, "memory(GiB)": 142.32, "step": 86920, "train_speed(iter/s)": 0.286399 }, { "acc": 0.72933559, "epoch": 0.9724193180817257, "grad_norm": 6.09375, "learning_rate": 5.6395757825102025e-06, "loss": 1.08277702, "memory(GiB)": 142.32, "step": 86940, "train_speed(iter/s)": 0.286422 }, { "acc": 0.73519936, "epoch": 0.9726430170276842, "grad_norm": 5.125, "learning_rate": 5.637741510760812e-06, "loss": 1.06581573, "memory(GiB)": 142.32, "step": 86960, "train_speed(iter/s)": 0.286443 }, { "acc": 0.74279966, "epoch": 0.9728667159736427, "grad_norm": 6.15625, "learning_rate": 5.635907151759267e-06, "loss": 1.02661495, "memory(GiB)": 142.32, "step": 86980, "train_speed(iter/s)": 0.286466 }, { "acc": 0.74015427, "epoch": 0.9730904149196012, "grad_norm": 6.5, "learning_rate": 5.634072705756535e-06, "loss": 1.0217638, "memory(GiB)": 142.32, "step": 87000, "train_speed(iter/s)": 0.28649 }, { "acc": 0.7402864, "epoch": 0.9733141138655598, "grad_norm": 5.3125, "learning_rate": 5.632238173003593e-06, "loss": 1.03535633, "memory(GiB)": 142.32, "step": 87020, "train_speed(iter/s)": 0.286511 }, { "acc": 0.73762875, "epoch": 0.9735378128115183, "grad_norm": 4.875, "learning_rate": 5.630403553751433e-06, "loss": 1.02674332, "memory(GiB)": 142.32, "step": 87040, "train_speed(iter/s)": 0.286535 }, { "acc": 0.73614745, "epoch": 0.9737615117574768, "grad_norm": 8.375, "learning_rate": 5.628568848251056e-06, "loss": 1.05290394, "memory(GiB)": 142.32, "step": 87060, "train_speed(iter/s)": 0.28656 }, { "acc": 0.72193699, "epoch": 0.9739852107034354, "grad_norm": 7.125, "learning_rate": 5.626734056753475e-06, "loss": 1.12528172, "memory(GiB)": 142.32, "step": 87080, "train_speed(iter/s)": 0.286583 }, { "acc": 0.72970047, "epoch": 0.9742089096493939, "grad_norm": 5.65625, "learning_rate": 5.624899179509719e-06, "loss": 1.08460388, "memory(GiB)": 142.32, "step": 87100, "train_speed(iter/s)": 0.286605 }, { "acc": 0.7417346, "epoch": 0.9744326085953524, "grad_norm": 6.15625, "learning_rate": 5.623064216770821e-06, "loss": 1.02411785, "memory(GiB)": 142.32, "step": 87120, "train_speed(iter/s)": 0.286628 }, { "acc": 0.73721681, "epoch": 0.9746563075413109, "grad_norm": 6.40625, "learning_rate": 5.621229168787836e-06, "loss": 1.04248533, "memory(GiB)": 142.32, "step": 87140, "train_speed(iter/s)": 0.286648 }, { "acc": 0.734589, "epoch": 0.9748800064872695, "grad_norm": 6.03125, "learning_rate": 5.61939403581182e-06, "loss": 1.06361103, "memory(GiB)": 142.32, "step": 87160, "train_speed(iter/s)": 0.286667 }, { "acc": 0.73971272, "epoch": 0.975103705433228, "grad_norm": 4.75, "learning_rate": 5.617558818093844e-06, "loss": 1.02718611, "memory(GiB)": 142.32, "step": 87180, "train_speed(iter/s)": 0.28669 }, { "acc": 0.72864714, "epoch": 0.9753274043791865, "grad_norm": 5.40625, "learning_rate": 5.615723515884998e-06, "loss": 1.08616371, "memory(GiB)": 142.32, "step": 87200, "train_speed(iter/s)": 0.286712 }, { "acc": 0.72952976, "epoch": 0.975551103325145, "grad_norm": 5.6875, "learning_rate": 5.613888129436372e-06, "loss": 1.08455572, "memory(GiB)": 142.32, "step": 87220, "train_speed(iter/s)": 0.286736 }, { "acc": 0.73187289, "epoch": 0.9757748022711036, "grad_norm": 5.96875, "learning_rate": 5.612052658999078e-06, "loss": 1.07333069, "memory(GiB)": 142.32, "step": 87240, "train_speed(iter/s)": 0.286757 }, { "acc": 0.72793736, "epoch": 0.9759985012170621, "grad_norm": 4.5, "learning_rate": 5.6102171048242294e-06, "loss": 1.09559708, "memory(GiB)": 142.32, "step": 87260, "train_speed(iter/s)": 0.286779 }, { "acc": 0.74349136, "epoch": 0.9762222001630206, "grad_norm": 6.625, "learning_rate": 5.608381467162961e-06, "loss": 1.00889835, "memory(GiB)": 142.32, "step": 87280, "train_speed(iter/s)": 0.286802 }, { "acc": 0.73002386, "epoch": 0.9764458991089792, "grad_norm": 6.65625, "learning_rate": 5.606545746266411e-06, "loss": 1.10554848, "memory(GiB)": 142.32, "step": 87300, "train_speed(iter/s)": 0.286825 }, { "acc": 0.73514099, "epoch": 0.9766695980549377, "grad_norm": 7.03125, "learning_rate": 5.6047099423857335e-06, "loss": 1.04348278, "memory(GiB)": 142.32, "step": 87320, "train_speed(iter/s)": 0.286848 }, { "acc": 0.73755188, "epoch": 0.9768932970008962, "grad_norm": 6.0625, "learning_rate": 5.6028740557720915e-06, "loss": 1.03702259, "memory(GiB)": 142.32, "step": 87340, "train_speed(iter/s)": 0.28687 }, { "acc": 0.74659905, "epoch": 0.9771169959468547, "grad_norm": 6.0625, "learning_rate": 5.601038086676663e-06, "loss": 1.015271, "memory(GiB)": 142.32, "step": 87360, "train_speed(iter/s)": 0.286896 }, { "acc": 0.74208107, "epoch": 0.9773406948928133, "grad_norm": 6.125, "learning_rate": 5.599202035350634e-06, "loss": 1.02267437, "memory(GiB)": 142.32, "step": 87380, "train_speed(iter/s)": 0.286918 }, { "acc": 0.72952356, "epoch": 0.9775643938387718, "grad_norm": 6.46875, "learning_rate": 5.5973659020451995e-06, "loss": 1.0734436, "memory(GiB)": 142.32, "step": 87400, "train_speed(iter/s)": 0.286942 }, { "acc": 0.73136797, "epoch": 0.9777880927847303, "grad_norm": 6.15625, "learning_rate": 5.595529687011574e-06, "loss": 1.07468433, "memory(GiB)": 142.32, "step": 87420, "train_speed(iter/s)": 0.286964 }, { "acc": 0.73943028, "epoch": 0.9780117917306889, "grad_norm": 6.65625, "learning_rate": 5.593693390500973e-06, "loss": 1.02942772, "memory(GiB)": 142.32, "step": 87440, "train_speed(iter/s)": 0.286988 }, { "acc": 0.74586601, "epoch": 0.9782354906766474, "grad_norm": 6.46875, "learning_rate": 5.591857012764632e-06, "loss": 0.99717503, "memory(GiB)": 142.32, "step": 87460, "train_speed(iter/s)": 0.287014 }, { "acc": 0.7383709, "epoch": 0.9784591896226059, "grad_norm": 5.65625, "learning_rate": 5.590020554053792e-06, "loss": 1.03497686, "memory(GiB)": 142.32, "step": 87480, "train_speed(iter/s)": 0.287038 }, { "acc": 0.73756323, "epoch": 0.9786828885685644, "grad_norm": 5.65625, "learning_rate": 5.588184014619705e-06, "loss": 1.01585827, "memory(GiB)": 142.32, "step": 87500, "train_speed(iter/s)": 0.287058 }, { "acc": 0.74908481, "epoch": 0.978906587514523, "grad_norm": 5.09375, "learning_rate": 5.58634739471364e-06, "loss": 0.9881918, "memory(GiB)": 142.32, "step": 87520, "train_speed(iter/s)": 0.287078 }, { "acc": 0.74174032, "epoch": 0.9791302864604815, "grad_norm": 6.5625, "learning_rate": 5.584510694586869e-06, "loss": 1.01762543, "memory(GiB)": 142.32, "step": 87540, "train_speed(iter/s)": 0.287098 }, { "acc": 0.72657948, "epoch": 0.97935398540644, "grad_norm": 6.09375, "learning_rate": 5.582673914490682e-06, "loss": 1.11165524, "memory(GiB)": 142.32, "step": 87560, "train_speed(iter/s)": 0.287119 }, { "acc": 0.72644696, "epoch": 0.9795776843523986, "grad_norm": 5.8125, "learning_rate": 5.5808370546763735e-06, "loss": 1.08321047, "memory(GiB)": 142.32, "step": 87580, "train_speed(iter/s)": 0.287143 }, { "acc": 0.73648729, "epoch": 0.9798013832983571, "grad_norm": 4.90625, "learning_rate": 5.579000115395254e-06, "loss": 1.05169888, "memory(GiB)": 142.32, "step": 87600, "train_speed(iter/s)": 0.287165 }, { "acc": 0.74134927, "epoch": 0.9800250822443156, "grad_norm": 5.875, "learning_rate": 5.577163096898643e-06, "loss": 1.0259119, "memory(GiB)": 142.32, "step": 87620, "train_speed(iter/s)": 0.287187 }, { "acc": 0.73677683, "epoch": 0.9802487811902741, "grad_norm": 5.625, "learning_rate": 5.575325999437872e-06, "loss": 1.03742094, "memory(GiB)": 142.32, "step": 87640, "train_speed(iter/s)": 0.28721 }, { "acc": 0.73735018, "epoch": 0.9804724801362327, "grad_norm": 5.71875, "learning_rate": 5.57348882326428e-06, "loss": 1.04528561, "memory(GiB)": 142.32, "step": 87660, "train_speed(iter/s)": 0.287231 }, { "acc": 0.73666916, "epoch": 0.9806961790821912, "grad_norm": 6.6875, "learning_rate": 5.57165156862922e-06, "loss": 1.03996315, "memory(GiB)": 142.32, "step": 87680, "train_speed(iter/s)": 0.287254 }, { "acc": 0.73709974, "epoch": 0.9809198780281497, "grad_norm": 6.59375, "learning_rate": 5.569814235784056e-06, "loss": 1.03948936, "memory(GiB)": 142.32, "step": 87700, "train_speed(iter/s)": 0.287279 }, { "acc": 0.73932695, "epoch": 0.9811435769741083, "grad_norm": 6.375, "learning_rate": 5.567976824980158e-06, "loss": 1.03209267, "memory(GiB)": 142.32, "step": 87720, "train_speed(iter/s)": 0.287302 }, { "acc": 0.71953812, "epoch": 0.9813672759200668, "grad_norm": 5.78125, "learning_rate": 5.566139336468912e-06, "loss": 1.12506962, "memory(GiB)": 142.32, "step": 87740, "train_speed(iter/s)": 0.287323 }, { "acc": 0.73461018, "epoch": 0.9815909748660253, "grad_norm": 5.15625, "learning_rate": 5.564301770501714e-06, "loss": 1.05254631, "memory(GiB)": 142.32, "step": 87760, "train_speed(iter/s)": 0.287343 }, { "acc": 0.74099236, "epoch": 0.9818146738119838, "grad_norm": 5.71875, "learning_rate": 5.562464127329968e-06, "loss": 1.02514763, "memory(GiB)": 142.32, "step": 87780, "train_speed(iter/s)": 0.287367 }, { "acc": 0.73325739, "epoch": 0.9820383727579424, "grad_norm": 5.8125, "learning_rate": 5.56062640720509e-06, "loss": 1.06002769, "memory(GiB)": 142.32, "step": 87800, "train_speed(iter/s)": 0.287386 }, { "acc": 0.73277435, "epoch": 0.9822620717039009, "grad_norm": 5.28125, "learning_rate": 5.558788610378505e-06, "loss": 1.06270981, "memory(GiB)": 142.32, "step": 87820, "train_speed(iter/s)": 0.287406 }, { "acc": 0.73811398, "epoch": 0.9824857706498594, "grad_norm": 6.65625, "learning_rate": 5.556950737101651e-06, "loss": 1.04579887, "memory(GiB)": 142.32, "step": 87840, "train_speed(iter/s)": 0.287427 }, { "acc": 0.73186474, "epoch": 0.982709469595818, "grad_norm": 5.0625, "learning_rate": 5.555112787625977e-06, "loss": 1.07522135, "memory(GiB)": 142.32, "step": 87860, "train_speed(iter/s)": 0.287451 }, { "acc": 0.74195986, "epoch": 0.9829331685417765, "grad_norm": 5.84375, "learning_rate": 5.55327476220294e-06, "loss": 1.02217808, "memory(GiB)": 142.32, "step": 87880, "train_speed(iter/s)": 0.287475 }, { "acc": 0.73104877, "epoch": 0.983156867487735, "grad_norm": 7.625, "learning_rate": 5.551436661084008e-06, "loss": 1.06874075, "memory(GiB)": 142.32, "step": 87900, "train_speed(iter/s)": 0.2875 }, { "acc": 0.73809118, "epoch": 0.9833805664336935, "grad_norm": 5.96875, "learning_rate": 5.549598484520656e-06, "loss": 1.02486992, "memory(GiB)": 142.32, "step": 87920, "train_speed(iter/s)": 0.287524 }, { "acc": 0.75428262, "epoch": 0.9836042653796521, "grad_norm": 4.6875, "learning_rate": 5.547760232764376e-06, "loss": 0.97397089, "memory(GiB)": 142.32, "step": 87940, "train_speed(iter/s)": 0.287546 }, { "acc": 0.72627811, "epoch": 0.9838279643256106, "grad_norm": 5.75, "learning_rate": 5.545921906066668e-06, "loss": 1.08017159, "memory(GiB)": 142.32, "step": 87960, "train_speed(iter/s)": 0.287571 }, { "acc": 0.74200277, "epoch": 0.9840516632715691, "grad_norm": 5.90625, "learning_rate": 5.5440835046790395e-06, "loss": 1.02382603, "memory(GiB)": 142.32, "step": 87980, "train_speed(iter/s)": 0.287594 }, { "acc": 0.73305883, "epoch": 0.9842753622175276, "grad_norm": 7.03125, "learning_rate": 5.5422450288530125e-06, "loss": 1.06075191, "memory(GiB)": 142.32, "step": 88000, "train_speed(iter/s)": 0.287615 }, { "epoch": 0.9842753622175276, "eval_acc": 0.6957125515988088, "eval_loss": 1.0738524198532104, "eval_runtime": 2339.2029, "eval_samples_per_second": 32.183, "eval_steps_per_second": 16.092, "step": 88000 }, { "acc": 0.73877354, "epoch": 0.9844990611634862, "grad_norm": 6.1875, "learning_rate": 5.540406478840114e-06, "loss": 1.04170971, "memory(GiB)": 142.32, "step": 88020, "train_speed(iter/s)": 0.285408 }, { "acc": 0.73515224, "epoch": 0.9847227601094447, "grad_norm": 7.0625, "learning_rate": 5.5385678548918845e-06, "loss": 1.06258774, "memory(GiB)": 142.32, "step": 88040, "train_speed(iter/s)": 0.285428 }, { "acc": 0.73673205, "epoch": 0.9849464590554032, "grad_norm": 6.1875, "learning_rate": 5.5367291572598744e-06, "loss": 1.03631315, "memory(GiB)": 142.32, "step": 88060, "train_speed(iter/s)": 0.285449 }, { "acc": 0.73607035, "epoch": 0.9851701580013618, "grad_norm": 5.75, "learning_rate": 5.534890386195645e-06, "loss": 1.03575745, "memory(GiB)": 142.32, "step": 88080, "train_speed(iter/s)": 0.285472 }, { "acc": 0.74013119, "epoch": 0.9853938569473203, "grad_norm": 6.65625, "learning_rate": 5.5330515419507656e-06, "loss": 1.02779312, "memory(GiB)": 142.32, "step": 88100, "train_speed(iter/s)": 0.285495 }, { "acc": 0.7439652, "epoch": 0.9856175558932788, "grad_norm": 6.21875, "learning_rate": 5.531212624776815e-06, "loss": 0.99355078, "memory(GiB)": 142.32, "step": 88120, "train_speed(iter/s)": 0.28552 }, { "acc": 0.74150963, "epoch": 0.9858412548392373, "grad_norm": 6.9375, "learning_rate": 5.529373634925385e-06, "loss": 1.01741924, "memory(GiB)": 142.32, "step": 88140, "train_speed(iter/s)": 0.285547 }, { "acc": 0.71960044, "epoch": 0.9860649537851959, "grad_norm": 4.59375, "learning_rate": 5.5275345726480756e-06, "loss": 1.1080265, "memory(GiB)": 142.32, "step": 88160, "train_speed(iter/s)": 0.285569 }, { "acc": 0.74506245, "epoch": 0.9862886527311544, "grad_norm": 7.0, "learning_rate": 5.525695438196496e-06, "loss": 0.99926548, "memory(GiB)": 142.32, "step": 88180, "train_speed(iter/s)": 0.285591 }, { "acc": 0.73333864, "epoch": 0.9865123516771129, "grad_norm": 6.125, "learning_rate": 5.5238562318222665e-06, "loss": 1.07232141, "memory(GiB)": 142.32, "step": 88200, "train_speed(iter/s)": 0.285614 }, { "acc": 0.73820086, "epoch": 0.9867360506230715, "grad_norm": 6.25, "learning_rate": 5.522016953777017e-06, "loss": 1.04052858, "memory(GiB)": 142.32, "step": 88220, "train_speed(iter/s)": 0.285636 }, { "acc": 0.73098459, "epoch": 0.98695974956903, "grad_norm": 5.59375, "learning_rate": 5.520177604312386e-06, "loss": 1.06705971, "memory(GiB)": 142.32, "step": 88240, "train_speed(iter/s)": 0.285659 }, { "acc": 0.73909998, "epoch": 0.9871834485149885, "grad_norm": 6.09375, "learning_rate": 5.5183381836800255e-06, "loss": 1.04903412, "memory(GiB)": 142.32, "step": 88260, "train_speed(iter/s)": 0.285682 }, { "acc": 0.74121637, "epoch": 0.987407147460947, "grad_norm": 6.875, "learning_rate": 5.516498692131592e-06, "loss": 1.02857037, "memory(GiB)": 142.32, "step": 88280, "train_speed(iter/s)": 0.285705 }, { "acc": 0.74008856, "epoch": 0.9876308464069056, "grad_norm": 6.5625, "learning_rate": 5.514659129918756e-06, "loss": 1.03387928, "memory(GiB)": 142.32, "step": 88300, "train_speed(iter/s)": 0.285726 }, { "acc": 0.72896175, "epoch": 0.9878545453528641, "grad_norm": 6.40625, "learning_rate": 5.512819497293193e-06, "loss": 1.08303089, "memory(GiB)": 142.32, "step": 88320, "train_speed(iter/s)": 0.285749 }, { "acc": 0.74265442, "epoch": 0.9880782442988226, "grad_norm": 5.25, "learning_rate": 5.510979794506593e-06, "loss": 1.0224741, "memory(GiB)": 142.32, "step": 88340, "train_speed(iter/s)": 0.285774 }, { "acc": 0.7332798, "epoch": 0.9883019432447812, "grad_norm": 5.8125, "learning_rate": 5.509140021810654e-06, "loss": 1.06263218, "memory(GiB)": 142.32, "step": 88360, "train_speed(iter/s)": 0.285798 }, { "acc": 0.74973764, "epoch": 0.9885256421907397, "grad_norm": 7.15625, "learning_rate": 5.507300179457082e-06, "loss": 0.9927186, "memory(GiB)": 142.32, "step": 88380, "train_speed(iter/s)": 0.285823 }, { "acc": 0.72046299, "epoch": 0.9887493411366982, "grad_norm": 5.84375, "learning_rate": 5.505460267697597e-06, "loss": 1.11028156, "memory(GiB)": 142.32, "step": 88400, "train_speed(iter/s)": 0.285844 }, { "acc": 0.74336748, "epoch": 0.9889730400826567, "grad_norm": 6.28125, "learning_rate": 5.503620286783921e-06, "loss": 1.01278534, "memory(GiB)": 142.32, "step": 88420, "train_speed(iter/s)": 0.285865 }, { "acc": 0.72015643, "epoch": 0.9891967390286153, "grad_norm": 5.25, "learning_rate": 5.5017802369677905e-06, "loss": 1.14907799, "memory(GiB)": 142.32, "step": 88440, "train_speed(iter/s)": 0.285885 }, { "acc": 0.72143483, "epoch": 0.9894204379745738, "grad_norm": 6.375, "learning_rate": 5.499940118500953e-06, "loss": 1.12230549, "memory(GiB)": 142.32, "step": 88460, "train_speed(iter/s)": 0.285909 }, { "acc": 0.73093586, "epoch": 0.9896441369205323, "grad_norm": 7.03125, "learning_rate": 5.49809993163516e-06, "loss": 1.06909237, "memory(GiB)": 142.32, "step": 88480, "train_speed(iter/s)": 0.28593 }, { "acc": 0.74068809, "epoch": 0.9898678358664909, "grad_norm": 6.15625, "learning_rate": 5.496259676622178e-06, "loss": 1.03439713, "memory(GiB)": 142.32, "step": 88500, "train_speed(iter/s)": 0.285956 }, { "acc": 0.72674508, "epoch": 0.9900915348124494, "grad_norm": 4.90625, "learning_rate": 5.49441935371378e-06, "loss": 1.07896662, "memory(GiB)": 142.32, "step": 88520, "train_speed(iter/s)": 0.28598 }, { "acc": 0.73701849, "epoch": 0.9903152337584079, "grad_norm": 5.0625, "learning_rate": 5.492578963161746e-06, "loss": 1.04620504, "memory(GiB)": 142.32, "step": 88540, "train_speed(iter/s)": 0.286001 }, { "acc": 0.73881187, "epoch": 0.9905389327043664, "grad_norm": 5.875, "learning_rate": 5.490738505217869e-06, "loss": 1.04271259, "memory(GiB)": 142.32, "step": 88560, "train_speed(iter/s)": 0.286023 }, { "acc": 0.74236026, "epoch": 0.990762631650325, "grad_norm": 7.28125, "learning_rate": 5.488897980133951e-06, "loss": 1.03176384, "memory(GiB)": 142.32, "step": 88580, "train_speed(iter/s)": 0.286046 }, { "acc": 0.72736731, "epoch": 0.9909863305962835, "grad_norm": 6.5625, "learning_rate": 5.487057388161801e-06, "loss": 1.10344028, "memory(GiB)": 142.32, "step": 88600, "train_speed(iter/s)": 0.286068 }, { "acc": 0.73703976, "epoch": 0.991210029542242, "grad_norm": 5.40625, "learning_rate": 5.485216729553239e-06, "loss": 1.04805489, "memory(GiB)": 142.32, "step": 88620, "train_speed(iter/s)": 0.28609 }, { "acc": 0.73158917, "epoch": 0.9914337284882005, "grad_norm": 6.3125, "learning_rate": 5.4833760045600926e-06, "loss": 1.05273418, "memory(GiB)": 142.32, "step": 88640, "train_speed(iter/s)": 0.28611 }, { "acc": 0.7365077, "epoch": 0.9916574274341591, "grad_norm": 6.65625, "learning_rate": 5.481535213434199e-06, "loss": 1.05399055, "memory(GiB)": 142.32, "step": 88660, "train_speed(iter/s)": 0.28613 }, { "acc": 0.72691908, "epoch": 0.9918811263801176, "grad_norm": 7.4375, "learning_rate": 5.479694356427407e-06, "loss": 1.08837547, "memory(GiB)": 142.32, "step": 88680, "train_speed(iter/s)": 0.286153 }, { "acc": 0.73511305, "epoch": 0.9921048253260761, "grad_norm": 6.875, "learning_rate": 5.47785343379157e-06, "loss": 1.06386175, "memory(GiB)": 142.32, "step": 88700, "train_speed(iter/s)": 0.286178 }, { "acc": 0.72946396, "epoch": 0.9923285242720347, "grad_norm": 5.6875, "learning_rate": 5.476012445778554e-06, "loss": 1.08778496, "memory(GiB)": 142.32, "step": 88720, "train_speed(iter/s)": 0.286199 }, { "acc": 0.7405601, "epoch": 0.9925522232179932, "grad_norm": 5.5, "learning_rate": 5.47417139264023e-06, "loss": 1.02907906, "memory(GiB)": 142.32, "step": 88740, "train_speed(iter/s)": 0.28622 }, { "acc": 0.74194078, "epoch": 0.9927759221639517, "grad_norm": 5.75, "learning_rate": 5.472330274628484e-06, "loss": 1.02324228, "memory(GiB)": 142.32, "step": 88760, "train_speed(iter/s)": 0.286241 }, { "acc": 0.73925509, "epoch": 0.9929996211099102, "grad_norm": 6.46875, "learning_rate": 5.470489091995203e-06, "loss": 1.03104668, "memory(GiB)": 142.32, "step": 88780, "train_speed(iter/s)": 0.286265 }, { "acc": 0.72374668, "epoch": 0.9932233200558688, "grad_norm": 5.09375, "learning_rate": 5.46864784499229e-06, "loss": 1.10955114, "memory(GiB)": 142.32, "step": 88800, "train_speed(iter/s)": 0.286289 }, { "acc": 0.7346859, "epoch": 0.9934470190018273, "grad_norm": 7.40625, "learning_rate": 5.466806533871655e-06, "loss": 1.03159027, "memory(GiB)": 142.32, "step": 88820, "train_speed(iter/s)": 0.286311 }, { "acc": 0.73925972, "epoch": 0.9936707179477858, "grad_norm": 6.625, "learning_rate": 5.464965158885212e-06, "loss": 1.03810253, "memory(GiB)": 142.32, "step": 88840, "train_speed(iter/s)": 0.286335 }, { "acc": 0.74331551, "epoch": 0.9938944168937444, "grad_norm": 6.15625, "learning_rate": 5.463123720284889e-06, "loss": 1.00941372, "memory(GiB)": 142.32, "step": 88860, "train_speed(iter/s)": 0.286354 }, { "acc": 0.72383747, "epoch": 0.9941181158397029, "grad_norm": 6.71875, "learning_rate": 5.461282218322623e-06, "loss": 1.11203642, "memory(GiB)": 142.32, "step": 88880, "train_speed(iter/s)": 0.286374 }, { "acc": 0.72648144, "epoch": 0.9943418147856614, "grad_norm": 6.4375, "learning_rate": 5.4594406532503564e-06, "loss": 1.10691385, "memory(GiB)": 142.32, "step": 88900, "train_speed(iter/s)": 0.286395 }, { "acc": 0.73074045, "epoch": 0.9945655137316199, "grad_norm": 6.15625, "learning_rate": 5.4575990253200415e-06, "loss": 1.06934814, "memory(GiB)": 142.32, "step": 88920, "train_speed(iter/s)": 0.286418 }, { "acc": 0.72825255, "epoch": 0.9947892126775785, "grad_norm": 5.78125, "learning_rate": 5.455757334783639e-06, "loss": 1.06555958, "memory(GiB)": 142.32, "step": 88940, "train_speed(iter/s)": 0.286439 }, { "acc": 0.72205696, "epoch": 0.995012911623537, "grad_norm": 6.5, "learning_rate": 5.453915581893119e-06, "loss": 1.10411072, "memory(GiB)": 142.32, "step": 88960, "train_speed(iter/s)": 0.286462 }, { "acc": 0.74251213, "epoch": 0.9952366105694955, "grad_norm": 6.53125, "learning_rate": 5.4520737669004585e-06, "loss": 1.03739672, "memory(GiB)": 142.32, "step": 88980, "train_speed(iter/s)": 0.286479 }, { "acc": 0.74705639, "epoch": 0.9954603095154541, "grad_norm": 6.28125, "learning_rate": 5.450231890057646e-06, "loss": 1.00618658, "memory(GiB)": 142.32, "step": 89000, "train_speed(iter/s)": 0.286503 }, { "acc": 0.72735863, "epoch": 0.9956840084614126, "grad_norm": 5.3125, "learning_rate": 5.448389951616675e-06, "loss": 1.093046, "memory(GiB)": 142.32, "step": 89020, "train_speed(iter/s)": 0.286525 }, { "acc": 0.74285851, "epoch": 0.9959077074073711, "grad_norm": 6.09375, "learning_rate": 5.4465479518295505e-06, "loss": 1.01604595, "memory(GiB)": 142.32, "step": 89040, "train_speed(iter/s)": 0.286546 }, { "acc": 0.73239603, "epoch": 0.9961314063533296, "grad_norm": 6.15625, "learning_rate": 5.4447058909482844e-06, "loss": 1.05246735, "memory(GiB)": 142.32, "step": 89060, "train_speed(iter/s)": 0.286568 }, { "acc": 0.72563109, "epoch": 0.9963551052992882, "grad_norm": 6.03125, "learning_rate": 5.442863769224894e-06, "loss": 1.08409786, "memory(GiB)": 142.32, "step": 89080, "train_speed(iter/s)": 0.28659 }, { "acc": 0.74861841, "epoch": 0.9965788042452467, "grad_norm": 5.6875, "learning_rate": 5.44102158691141e-06, "loss": 1.01310616, "memory(GiB)": 142.32, "step": 89100, "train_speed(iter/s)": 0.286607 }, { "acc": 0.72886081, "epoch": 0.9968025031912052, "grad_norm": 5.34375, "learning_rate": 5.4391793442598705e-06, "loss": 1.07795572, "memory(GiB)": 142.32, "step": 89120, "train_speed(iter/s)": 0.286632 }, { "acc": 0.74093122, "epoch": 0.9970262021371638, "grad_norm": 6.75, "learning_rate": 5.437337041522319e-06, "loss": 1.0203805, "memory(GiB)": 142.32, "step": 89140, "train_speed(iter/s)": 0.286652 }, { "acc": 0.73638973, "epoch": 0.9972499010831223, "grad_norm": 5.0, "learning_rate": 5.435494678950809e-06, "loss": 1.04748573, "memory(GiB)": 142.32, "step": 89160, "train_speed(iter/s)": 0.286675 }, { "acc": 0.74248405, "epoch": 0.9974736000290808, "grad_norm": 6.0, "learning_rate": 5.4336522567974025e-06, "loss": 1.015522, "memory(GiB)": 142.32, "step": 89180, "train_speed(iter/s)": 0.286695 }, { "acc": 0.75423617, "epoch": 0.9976972989750393, "grad_norm": 5.375, "learning_rate": 5.4318097753141686e-06, "loss": 0.97458153, "memory(GiB)": 142.32, "step": 89200, "train_speed(iter/s)": 0.286714 }, { "acc": 0.7369091, "epoch": 0.997920997920998, "grad_norm": 4.78125, "learning_rate": 5.429967234753185e-06, "loss": 1.03833427, "memory(GiB)": 142.32, "step": 89220, "train_speed(iter/s)": 0.286736 }, { "acc": 0.73051281, "epoch": 0.9981446968669565, "grad_norm": 5.21875, "learning_rate": 5.428124635366539e-06, "loss": 1.06650372, "memory(GiB)": 142.32, "step": 89240, "train_speed(iter/s)": 0.28676 }, { "acc": 0.73399935, "epoch": 0.998368395812915, "grad_norm": 5.90625, "learning_rate": 5.4262819774063244e-06, "loss": 1.06163864, "memory(GiB)": 142.32, "step": 89260, "train_speed(iter/s)": 0.286782 }, { "acc": 0.73317032, "epoch": 0.9985920947588736, "grad_norm": 4.96875, "learning_rate": 5.424439261124641e-06, "loss": 1.07748089, "memory(GiB)": 142.32, "step": 89280, "train_speed(iter/s)": 0.286802 }, { "acc": 0.74223685, "epoch": 0.9988157937048321, "grad_norm": 5.65625, "learning_rate": 5.422596486773599e-06, "loss": 1.02199259, "memory(GiB)": 142.32, "step": 89300, "train_speed(iter/s)": 0.286824 }, { "acc": 0.72863092, "epoch": 0.9990394926507906, "grad_norm": 6.21875, "learning_rate": 5.42075365460532e-06, "loss": 1.08159819, "memory(GiB)": 142.32, "step": 89320, "train_speed(iter/s)": 0.286847 }, { "acc": 0.73454685, "epoch": 0.9992631915967491, "grad_norm": 6.03125, "learning_rate": 5.418910764871925e-06, "loss": 1.06095314, "memory(GiB)": 142.32, "step": 89340, "train_speed(iter/s)": 0.286865 }, { "acc": 0.73572865, "epoch": 0.9994868905427077, "grad_norm": 6.59375, "learning_rate": 5.417067817825551e-06, "loss": 1.04677715, "memory(GiB)": 142.32, "step": 89360, "train_speed(iter/s)": 0.286887 }, { "acc": 0.72772923, "epoch": 0.9997105894886662, "grad_norm": 5.5625, "learning_rate": 5.415224813718337e-06, "loss": 1.08632164, "memory(GiB)": 142.32, "step": 89380, "train_speed(iter/s)": 0.286906 }, { "acc": 0.74296975, "epoch": 0.9999342884346247, "grad_norm": 6.0, "learning_rate": 5.4133817528024345e-06, "loss": 1.01088181, "memory(GiB)": 142.32, "step": 89400, "train_speed(iter/s)": 0.286927 }, { "acc": 0.7264163, "epoch": 1.0001579873805833, "grad_norm": 5.375, "learning_rate": 5.411538635329999e-06, "loss": 1.09706964, "memory(GiB)": 142.32, "step": 89420, "train_speed(iter/s)": 0.286948 }, { "acc": 0.73477831, "epoch": 1.0003816863265418, "grad_norm": 5.5, "learning_rate": 5.409695461553197e-06, "loss": 1.04975586, "memory(GiB)": 142.32, "step": 89440, "train_speed(iter/s)": 0.286968 }, { "acc": 0.74086509, "epoch": 1.0006053852725003, "grad_norm": 4.90625, "learning_rate": 5.407852231724199e-06, "loss": 1.03376904, "memory(GiB)": 142.32, "step": 89460, "train_speed(iter/s)": 0.286991 }, { "acc": 0.74705238, "epoch": 1.0008290842184588, "grad_norm": 5.46875, "learning_rate": 5.406008946095186e-06, "loss": 0.99530029, "memory(GiB)": 142.32, "step": 89480, "train_speed(iter/s)": 0.287014 }, { "acc": 0.730582, "epoch": 1.0010527831644174, "grad_norm": 5.75, "learning_rate": 5.404165604918346e-06, "loss": 1.07215881, "memory(GiB)": 142.32, "step": 89500, "train_speed(iter/s)": 0.287037 }, { "acc": 0.73855944, "epoch": 1.001276482110376, "grad_norm": 6.59375, "learning_rate": 5.402322208445875e-06, "loss": 1.05437355, "memory(GiB)": 142.32, "step": 89520, "train_speed(iter/s)": 0.28706 }, { "acc": 0.72573972, "epoch": 1.0015001810563344, "grad_norm": 5.3125, "learning_rate": 5.400478756929977e-06, "loss": 1.07901659, "memory(GiB)": 142.32, "step": 89540, "train_speed(iter/s)": 0.287081 }, { "acc": 0.73240609, "epoch": 1.001723880002293, "grad_norm": 5.625, "learning_rate": 5.398635250622858e-06, "loss": 1.07250481, "memory(GiB)": 142.32, "step": 89560, "train_speed(iter/s)": 0.287102 }, { "acc": 0.75513639, "epoch": 1.0019475789482515, "grad_norm": 4.625, "learning_rate": 5.396791689776739e-06, "loss": 0.97395496, "memory(GiB)": 142.32, "step": 89580, "train_speed(iter/s)": 0.287122 }, { "acc": 0.73480291, "epoch": 1.00217127789421, "grad_norm": 5.28125, "learning_rate": 5.394948074643846e-06, "loss": 1.05480976, "memory(GiB)": 142.32, "step": 89600, "train_speed(iter/s)": 0.287146 }, { "acc": 0.73755789, "epoch": 1.0023949768401685, "grad_norm": 6.09375, "learning_rate": 5.393104405476413e-06, "loss": 1.03567047, "memory(GiB)": 142.32, "step": 89620, "train_speed(iter/s)": 0.287168 }, { "acc": 0.71858029, "epoch": 1.002618675786127, "grad_norm": 5.40625, "learning_rate": 5.3912606825266765e-06, "loss": 1.13159323, "memory(GiB)": 142.32, "step": 89640, "train_speed(iter/s)": 0.287191 }, { "acc": 0.74390059, "epoch": 1.0028423747320856, "grad_norm": 6.03125, "learning_rate": 5.389416906046888e-06, "loss": 1.01572857, "memory(GiB)": 142.32, "step": 89660, "train_speed(iter/s)": 0.287213 }, { "acc": 0.74569988, "epoch": 1.0030660736780441, "grad_norm": 6.625, "learning_rate": 5.3875730762893e-06, "loss": 1.01852016, "memory(GiB)": 142.32, "step": 89680, "train_speed(iter/s)": 0.287236 }, { "acc": 0.72035427, "epoch": 1.0032897726240027, "grad_norm": 6.875, "learning_rate": 5.385729193506175e-06, "loss": 1.12458973, "memory(GiB)": 142.32, "step": 89700, "train_speed(iter/s)": 0.287257 }, { "acc": 0.73821211, "epoch": 1.0035134715699612, "grad_norm": 5.5625, "learning_rate": 5.383885257949783e-06, "loss": 1.06478233, "memory(GiB)": 142.32, "step": 89720, "train_speed(iter/s)": 0.287279 }, { "acc": 0.74014025, "epoch": 1.0037371705159197, "grad_norm": 5.53125, "learning_rate": 5.3820412698724e-06, "loss": 1.0236599, "memory(GiB)": 142.32, "step": 89740, "train_speed(iter/s)": 0.287302 }, { "acc": 0.73550401, "epoch": 1.0039608694618782, "grad_norm": 5.9375, "learning_rate": 5.380197229526313e-06, "loss": 1.03237343, "memory(GiB)": 142.32, "step": 89760, "train_speed(iter/s)": 0.287325 }, { "acc": 0.73709154, "epoch": 1.0041845684078368, "grad_norm": 7.15625, "learning_rate": 5.378353137163808e-06, "loss": 1.05312452, "memory(GiB)": 142.32, "step": 89780, "train_speed(iter/s)": 0.287345 }, { "acc": 0.75118856, "epoch": 1.0044082673537953, "grad_norm": 6.28125, "learning_rate": 5.376508993037187e-06, "loss": 0.97435436, "memory(GiB)": 142.32, "step": 89800, "train_speed(iter/s)": 0.287365 }, { "acc": 0.74249315, "epoch": 1.0046319662997538, "grad_norm": 6.46875, "learning_rate": 5.374664797398754e-06, "loss": 1.00682888, "memory(GiB)": 142.32, "step": 89820, "train_speed(iter/s)": 0.287386 }, { "acc": 0.74367428, "epoch": 1.0048556652457123, "grad_norm": 7.40625, "learning_rate": 5.372820550500822e-06, "loss": 1.03014469, "memory(GiB)": 142.32, "step": 89840, "train_speed(iter/s)": 0.287406 }, { "acc": 0.73590307, "epoch": 1.0050793641916709, "grad_norm": 6.71875, "learning_rate": 5.3709762525957095e-06, "loss": 1.06065731, "memory(GiB)": 142.32, "step": 89860, "train_speed(iter/s)": 0.287426 }, { "acc": 0.73317499, "epoch": 1.0053030631376294, "grad_norm": 6.09375, "learning_rate": 5.369131903935744e-06, "loss": 1.05601168, "memory(GiB)": 142.32, "step": 89880, "train_speed(iter/s)": 0.287449 }, { "acc": 0.74721098, "epoch": 1.005526762083588, "grad_norm": 6.375, "learning_rate": 5.367287504773256e-06, "loss": 0.98835583, "memory(GiB)": 142.32, "step": 89900, "train_speed(iter/s)": 0.28747 }, { "acc": 0.72789106, "epoch": 1.0057504610295465, "grad_norm": 5.53125, "learning_rate": 5.36544305536059e-06, "loss": 1.07470512, "memory(GiB)": 142.32, "step": 89920, "train_speed(iter/s)": 0.287492 }, { "acc": 0.74121537, "epoch": 1.005974159975505, "grad_norm": 5.375, "learning_rate": 5.3635985559500895e-06, "loss": 1.03270941, "memory(GiB)": 142.32, "step": 89940, "train_speed(iter/s)": 0.287514 }, { "acc": 0.71936407, "epoch": 1.0061978589214635, "grad_norm": 5.28125, "learning_rate": 5.36175400679411e-06, "loss": 1.12365074, "memory(GiB)": 142.32, "step": 89960, "train_speed(iter/s)": 0.287535 }, { "acc": 0.73643007, "epoch": 1.006421557867422, "grad_norm": 5.84375, "learning_rate": 5.359909408145011e-06, "loss": 1.05002575, "memory(GiB)": 142.32, "step": 89980, "train_speed(iter/s)": 0.287556 }, { "acc": 0.74048839, "epoch": 1.0066452568133806, "grad_norm": 5.78125, "learning_rate": 5.358064760255161e-06, "loss": 1.02223988, "memory(GiB)": 142.32, "step": 90000, "train_speed(iter/s)": 0.287578 }, { "epoch": 1.0066452568133806, "eval_acc": 0.6957797429396345, "eval_loss": 1.0738943815231323, "eval_runtime": 2346.3402, "eval_samples_per_second": 32.085, "eval_steps_per_second": 16.043, "step": 90000 }, { "acc": 0.73186722, "epoch": 1.006868955759339, "grad_norm": 5.6875, "learning_rate": 5.356220063376933e-06, "loss": 1.08596716, "memory(GiB)": 142.32, "step": 90020, "train_speed(iter/s)": 0.285414 }, { "acc": 0.72718954, "epoch": 1.0070926547052976, "grad_norm": 6.5625, "learning_rate": 5.35437531776271e-06, "loss": 1.1100317, "memory(GiB)": 142.32, "step": 90040, "train_speed(iter/s)": 0.285436 }, { "acc": 0.74058247, "epoch": 1.0073163536512562, "grad_norm": 6.875, "learning_rate": 5.352530523664878e-06, "loss": 1.0176487, "memory(GiB)": 142.32, "step": 90060, "train_speed(iter/s)": 0.285457 }, { "acc": 0.7310894, "epoch": 1.0075400525972147, "grad_norm": 5.40625, "learning_rate": 5.350685681335831e-06, "loss": 1.05683079, "memory(GiB)": 142.32, "step": 90080, "train_speed(iter/s)": 0.285479 }, { "acc": 0.74630995, "epoch": 1.0077637515431732, "grad_norm": 6.53125, "learning_rate": 5.348840791027971e-06, "loss": 0.9915081, "memory(GiB)": 142.32, "step": 90100, "train_speed(iter/s)": 0.285504 }, { "acc": 0.73484898, "epoch": 1.0079874504891317, "grad_norm": 6.125, "learning_rate": 5.346995852993704e-06, "loss": 1.06108389, "memory(GiB)": 142.32, "step": 90120, "train_speed(iter/s)": 0.285526 }, { "acc": 0.72476563, "epoch": 1.0082111494350903, "grad_norm": 6.875, "learning_rate": 5.345150867485445e-06, "loss": 1.10466805, "memory(GiB)": 142.32, "step": 90140, "train_speed(iter/s)": 0.285547 }, { "acc": 0.73096461, "epoch": 1.0084348483810488, "grad_norm": 5.96875, "learning_rate": 5.343305834755615e-06, "loss": 1.07910767, "memory(GiB)": 142.32, "step": 90160, "train_speed(iter/s)": 0.285569 }, { "acc": 0.74565964, "epoch": 1.0086585473270073, "grad_norm": 7.46875, "learning_rate": 5.341460755056639e-06, "loss": 1.00541191, "memory(GiB)": 142.32, "step": 90180, "train_speed(iter/s)": 0.285592 }, { "acc": 0.73252163, "epoch": 1.0088822462729659, "grad_norm": 6.71875, "learning_rate": 5.339615628640951e-06, "loss": 1.05042076, "memory(GiB)": 142.32, "step": 90200, "train_speed(iter/s)": 0.285614 }, { "acc": 0.74339237, "epoch": 1.0091059452189244, "grad_norm": 6.9375, "learning_rate": 5.33777045576099e-06, "loss": 1.01472082, "memory(GiB)": 142.32, "step": 90220, "train_speed(iter/s)": 0.285635 }, { "acc": 0.72672195, "epoch": 1.009329644164883, "grad_norm": 5.21875, "learning_rate": 5.335925236669205e-06, "loss": 1.08936653, "memory(GiB)": 142.32, "step": 90240, "train_speed(iter/s)": 0.285655 }, { "acc": 0.73075495, "epoch": 1.0095533431108414, "grad_norm": 4.75, "learning_rate": 5.334079971618045e-06, "loss": 1.0765769, "memory(GiB)": 142.32, "step": 90260, "train_speed(iter/s)": 0.285676 }, { "acc": 0.73055706, "epoch": 1.0097770420568, "grad_norm": 5.78125, "learning_rate": 5.332234660859969e-06, "loss": 1.08147802, "memory(GiB)": 142.32, "step": 90280, "train_speed(iter/s)": 0.285697 }, { "acc": 0.75229607, "epoch": 1.0100007410027585, "grad_norm": 6.84375, "learning_rate": 5.330389304647443e-06, "loss": 0.96925335, "memory(GiB)": 142.32, "step": 90300, "train_speed(iter/s)": 0.285718 }, { "acc": 0.72484431, "epoch": 1.010224439948717, "grad_norm": 4.96875, "learning_rate": 5.328543903232939e-06, "loss": 1.105021, "memory(GiB)": 142.32, "step": 90320, "train_speed(iter/s)": 0.285738 }, { "acc": 0.72977257, "epoch": 1.0104481388946756, "grad_norm": 6.46875, "learning_rate": 5.326698456868931e-06, "loss": 1.07772675, "memory(GiB)": 142.32, "step": 90340, "train_speed(iter/s)": 0.285759 }, { "acc": 0.74067926, "epoch": 1.010671837840634, "grad_norm": 6.4375, "learning_rate": 5.324852965807905e-06, "loss": 1.02709875, "memory(GiB)": 142.32, "step": 90360, "train_speed(iter/s)": 0.285781 }, { "acc": 0.7348134, "epoch": 1.0108955367865926, "grad_norm": 6.96875, "learning_rate": 5.3230074303023515e-06, "loss": 1.04367485, "memory(GiB)": 142.32, "step": 90380, "train_speed(iter/s)": 0.285801 }, { "acc": 0.72542753, "epoch": 1.0111192357325511, "grad_norm": 6.03125, "learning_rate": 5.321161850604763e-06, "loss": 1.10831871, "memory(GiB)": 142.32, "step": 90400, "train_speed(iter/s)": 0.285821 }, { "acc": 0.73058786, "epoch": 1.0113429346785097, "grad_norm": 6.78125, "learning_rate": 5.319316226967645e-06, "loss": 1.08633423, "memory(GiB)": 142.32, "step": 90420, "train_speed(iter/s)": 0.285844 }, { "acc": 0.73103752, "epoch": 1.0115666336244682, "grad_norm": 7.4375, "learning_rate": 5.3174705596435e-06, "loss": 1.09539595, "memory(GiB)": 142.32, "step": 90440, "train_speed(iter/s)": 0.285865 }, { "acc": 0.73613729, "epoch": 1.0117903325704267, "grad_norm": 4.75, "learning_rate": 5.315624848884847e-06, "loss": 1.0448267, "memory(GiB)": 142.32, "step": 90460, "train_speed(iter/s)": 0.285884 }, { "acc": 0.73562222, "epoch": 1.0120140315163852, "grad_norm": 6.5625, "learning_rate": 5.3137790949442025e-06, "loss": 1.0459549, "memory(GiB)": 142.32, "step": 90480, "train_speed(iter/s)": 0.285904 }, { "acc": 0.73260031, "epoch": 1.0122377304623438, "grad_norm": 5.0625, "learning_rate": 5.311933298074094e-06, "loss": 1.07926159, "memory(GiB)": 142.32, "step": 90500, "train_speed(iter/s)": 0.285924 }, { "acc": 0.72866573, "epoch": 1.0124614294083023, "grad_norm": 5.6875, "learning_rate": 5.310087458527051e-06, "loss": 1.08836288, "memory(GiB)": 142.32, "step": 90520, "train_speed(iter/s)": 0.285944 }, { "acc": 0.72755656, "epoch": 1.0126851283542608, "grad_norm": 6.1875, "learning_rate": 5.308241576555612e-06, "loss": 1.073283, "memory(GiB)": 142.32, "step": 90540, "train_speed(iter/s)": 0.285967 }, { "acc": 0.73399868, "epoch": 1.0129088273002194, "grad_norm": 5.625, "learning_rate": 5.306395652412318e-06, "loss": 1.06080217, "memory(GiB)": 142.32, "step": 90560, "train_speed(iter/s)": 0.285985 }, { "acc": 0.73185978, "epoch": 1.013132526246178, "grad_norm": 6.40625, "learning_rate": 5.30454968634972e-06, "loss": 1.07331781, "memory(GiB)": 142.32, "step": 90580, "train_speed(iter/s)": 0.286004 }, { "acc": 0.73621221, "epoch": 1.0133562251921364, "grad_norm": 5.125, "learning_rate": 5.302703678620374e-06, "loss": 1.04663811, "memory(GiB)": 142.32, "step": 90600, "train_speed(iter/s)": 0.286027 }, { "acc": 0.73849535, "epoch": 1.013579924138095, "grad_norm": 5.71875, "learning_rate": 5.300857629476835e-06, "loss": 1.04809561, "memory(GiB)": 142.32, "step": 90620, "train_speed(iter/s)": 0.286046 }, { "acc": 0.73929377, "epoch": 1.0138036230840535, "grad_norm": 6.3125, "learning_rate": 5.299011539171673e-06, "loss": 1.0349617, "memory(GiB)": 142.32, "step": 90640, "train_speed(iter/s)": 0.286067 }, { "acc": 0.7354733, "epoch": 1.014027322030012, "grad_norm": 7.34375, "learning_rate": 5.29716540795746e-06, "loss": 1.04551105, "memory(GiB)": 142.32, "step": 90660, "train_speed(iter/s)": 0.286088 }, { "acc": 0.73557649, "epoch": 1.0142510209759705, "grad_norm": 5.34375, "learning_rate": 5.29531923608677e-06, "loss": 1.03489838, "memory(GiB)": 142.32, "step": 90680, "train_speed(iter/s)": 0.28611 }, { "acc": 0.74485035, "epoch": 1.014474719921929, "grad_norm": 6.125, "learning_rate": 5.293473023812189e-06, "loss": 1.01575174, "memory(GiB)": 142.32, "step": 90700, "train_speed(iter/s)": 0.286128 }, { "acc": 0.73836889, "epoch": 1.0146984188678876, "grad_norm": 5.90625, "learning_rate": 5.291626771386302e-06, "loss": 1.0399292, "memory(GiB)": 142.32, "step": 90720, "train_speed(iter/s)": 0.286148 }, { "acc": 0.73553305, "epoch": 1.0149221178138461, "grad_norm": 5.03125, "learning_rate": 5.289780479061706e-06, "loss": 1.05438251, "memory(GiB)": 142.32, "step": 90740, "train_speed(iter/s)": 0.286169 }, { "acc": 0.74330654, "epoch": 1.0151458167598046, "grad_norm": 7.5625, "learning_rate": 5.287934147090997e-06, "loss": 1.00906487, "memory(GiB)": 142.32, "step": 90760, "train_speed(iter/s)": 0.286189 }, { "acc": 0.73612585, "epoch": 1.0153695157057632, "grad_norm": 7.15625, "learning_rate": 5.286087775726782e-06, "loss": 1.06451492, "memory(GiB)": 142.32, "step": 90780, "train_speed(iter/s)": 0.286212 }, { "acc": 0.73000979, "epoch": 1.0155932146517217, "grad_norm": 5.5625, "learning_rate": 5.28424136522167e-06, "loss": 1.08470268, "memory(GiB)": 142.32, "step": 90800, "train_speed(iter/s)": 0.286234 }, { "acc": 0.72934103, "epoch": 1.0158169135976802, "grad_norm": 6.375, "learning_rate": 5.282394915828277e-06, "loss": 1.0809988, "memory(GiB)": 142.32, "step": 90820, "train_speed(iter/s)": 0.286254 }, { "acc": 0.73967671, "epoch": 1.0160406125436388, "grad_norm": 5.59375, "learning_rate": 5.280548427799224e-06, "loss": 1.04168186, "memory(GiB)": 142.32, "step": 90840, "train_speed(iter/s)": 0.286275 }, { "acc": 0.74195204, "epoch": 1.0162643114895973, "grad_norm": 5.28125, "learning_rate": 5.278701901387135e-06, "loss": 1.02838278, "memory(GiB)": 142.32, "step": 90860, "train_speed(iter/s)": 0.286297 }, { "acc": 0.73948107, "epoch": 1.0164880104355558, "grad_norm": 5.59375, "learning_rate": 5.276855336844641e-06, "loss": 1.03240356, "memory(GiB)": 142.32, "step": 90880, "train_speed(iter/s)": 0.286318 }, { "acc": 0.73934112, "epoch": 1.0167117093815143, "grad_norm": 4.84375, "learning_rate": 5.2750087344243805e-06, "loss": 1.02598934, "memory(GiB)": 142.32, "step": 90900, "train_speed(iter/s)": 0.286337 }, { "acc": 0.7497952, "epoch": 1.0169354083274729, "grad_norm": 5.875, "learning_rate": 5.273162094378995e-06, "loss": 0.98028288, "memory(GiB)": 142.32, "step": 90920, "train_speed(iter/s)": 0.286359 }, { "acc": 0.72853432, "epoch": 1.0171591072734314, "grad_norm": 5.75, "learning_rate": 5.271315416961131e-06, "loss": 1.09197197, "memory(GiB)": 142.32, "step": 90940, "train_speed(iter/s)": 0.286379 }, { "acc": 0.7298728, "epoch": 1.01738280621939, "grad_norm": 6.5, "learning_rate": 5.269468702423438e-06, "loss": 1.07305241, "memory(GiB)": 142.32, "step": 90960, "train_speed(iter/s)": 0.286401 }, { "acc": 0.73242159, "epoch": 1.0176065051653485, "grad_norm": 5.09375, "learning_rate": 5.267621951018577e-06, "loss": 1.05706253, "memory(GiB)": 142.32, "step": 90980, "train_speed(iter/s)": 0.286422 }, { "acc": 0.73513408, "epoch": 1.017830204111307, "grad_norm": 5.84375, "learning_rate": 5.265775162999206e-06, "loss": 1.0330719, "memory(GiB)": 142.32, "step": 91000, "train_speed(iter/s)": 0.286441 }, { "acc": 0.73715029, "epoch": 1.0180539030572655, "grad_norm": 6.46875, "learning_rate": 5.263928338617996e-06, "loss": 1.05116978, "memory(GiB)": 142.32, "step": 91020, "train_speed(iter/s)": 0.286462 }, { "acc": 0.73023076, "epoch": 1.018277602003224, "grad_norm": 6.625, "learning_rate": 5.262081478127616e-06, "loss": 1.07362833, "memory(GiB)": 142.32, "step": 91040, "train_speed(iter/s)": 0.286482 }, { "acc": 0.74325366, "epoch": 1.0185013009491826, "grad_norm": 6.28125, "learning_rate": 5.260234581780743e-06, "loss": 1.02142973, "memory(GiB)": 142.32, "step": 91060, "train_speed(iter/s)": 0.286503 }, { "acc": 0.72024183, "epoch": 1.018724999895141, "grad_norm": 5.46875, "learning_rate": 5.25838764983006e-06, "loss": 1.11694937, "memory(GiB)": 142.32, "step": 91080, "train_speed(iter/s)": 0.286523 }, { "acc": 0.74128847, "epoch": 1.0189486988410996, "grad_norm": 4.96875, "learning_rate": 5.256540682528254e-06, "loss": 1.02737379, "memory(GiB)": 142.32, "step": 91100, "train_speed(iter/s)": 0.286543 }, { "acc": 0.73642392, "epoch": 1.0191723977870581, "grad_norm": 6.59375, "learning_rate": 5.254693680128016e-06, "loss": 1.03419485, "memory(GiB)": 142.32, "step": 91120, "train_speed(iter/s)": 0.286563 }, { "acc": 0.74242215, "epoch": 1.0193960967330167, "grad_norm": 5.65625, "learning_rate": 5.252846642882041e-06, "loss": 1.02542248, "memory(GiB)": 142.32, "step": 91140, "train_speed(iter/s)": 0.286584 }, { "acc": 0.73342175, "epoch": 1.0196197956789752, "grad_norm": 6.0625, "learning_rate": 5.250999571043031e-06, "loss": 1.05381947, "memory(GiB)": 142.32, "step": 91160, "train_speed(iter/s)": 0.286607 }, { "acc": 0.73053112, "epoch": 1.0198434946249337, "grad_norm": 5.9375, "learning_rate": 5.249152464863692e-06, "loss": 1.07834101, "memory(GiB)": 142.32, "step": 91180, "train_speed(iter/s)": 0.286629 }, { "acc": 0.72846088, "epoch": 1.0200671935708923, "grad_norm": 8.3125, "learning_rate": 5.247305324596736e-06, "loss": 1.08958683, "memory(GiB)": 142.32, "step": 91200, "train_speed(iter/s)": 0.28665 }, { "acc": 0.73594389, "epoch": 1.0202908925168508, "grad_norm": 5.75, "learning_rate": 5.245458150494877e-06, "loss": 1.05307198, "memory(GiB)": 142.32, "step": 91220, "train_speed(iter/s)": 0.286672 }, { "acc": 0.73330441, "epoch": 1.0205145914628093, "grad_norm": 5.5, "learning_rate": 5.243610942810834e-06, "loss": 1.06641006, "memory(GiB)": 142.32, "step": 91240, "train_speed(iter/s)": 0.286693 }, { "acc": 0.74587965, "epoch": 1.0207382904087678, "grad_norm": 5.90625, "learning_rate": 5.2417637017973315e-06, "loss": 1.01674519, "memory(GiB)": 142.32, "step": 91260, "train_speed(iter/s)": 0.286714 }, { "acc": 0.74196501, "epoch": 1.0209619893547264, "grad_norm": 4.8125, "learning_rate": 5.239916427707099e-06, "loss": 1.02092695, "memory(GiB)": 142.32, "step": 91280, "train_speed(iter/s)": 0.286734 }, { "acc": 0.73706379, "epoch": 1.021185688300685, "grad_norm": 6.28125, "learning_rate": 5.23806912079287e-06, "loss": 1.04235411, "memory(GiB)": 142.32, "step": 91300, "train_speed(iter/s)": 0.286754 }, { "acc": 0.74155083, "epoch": 1.0214093872466434, "grad_norm": 5.25, "learning_rate": 5.236221781307383e-06, "loss": 1.04593983, "memory(GiB)": 142.32, "step": 91320, "train_speed(iter/s)": 0.286775 }, { "acc": 0.74905748, "epoch": 1.021633086192602, "grad_norm": 5.25, "learning_rate": 5.23437440950338e-06, "loss": 0.99312582, "memory(GiB)": 142.32, "step": 91340, "train_speed(iter/s)": 0.286796 }, { "acc": 0.75303755, "epoch": 1.0218567851385605, "grad_norm": 5.3125, "learning_rate": 5.232527005633608e-06, "loss": 0.97205582, "memory(GiB)": 142.32, "step": 91360, "train_speed(iter/s)": 0.286817 }, { "acc": 0.7338872, "epoch": 1.022080484084519, "grad_norm": 6.84375, "learning_rate": 5.230679569950817e-06, "loss": 1.05959206, "memory(GiB)": 142.32, "step": 91380, "train_speed(iter/s)": 0.286837 }, { "acc": 0.7487299, "epoch": 1.0223041830304775, "grad_norm": 6.65625, "learning_rate": 5.228832102707763e-06, "loss": 1.00263233, "memory(GiB)": 142.32, "step": 91400, "train_speed(iter/s)": 0.286857 }, { "acc": 0.74452257, "epoch": 1.022527881976436, "grad_norm": 5.25, "learning_rate": 5.226984604157209e-06, "loss": 1.0078104, "memory(GiB)": 142.32, "step": 91420, "train_speed(iter/s)": 0.286877 }, { "acc": 0.74570904, "epoch": 1.0227515809223946, "grad_norm": 5.53125, "learning_rate": 5.225137074551917e-06, "loss": 1.01136303, "memory(GiB)": 142.32, "step": 91440, "train_speed(iter/s)": 0.286898 }, { "acc": 0.72973328, "epoch": 1.0229752798683531, "grad_norm": 5.90625, "learning_rate": 5.223289514144654e-06, "loss": 1.07371149, "memory(GiB)": 142.32, "step": 91460, "train_speed(iter/s)": 0.286919 }, { "acc": 0.74202824, "epoch": 1.0231989788143117, "grad_norm": 5.90625, "learning_rate": 5.221441923188193e-06, "loss": 1.01636276, "memory(GiB)": 142.32, "step": 91480, "train_speed(iter/s)": 0.286942 }, { "acc": 0.74216452, "epoch": 1.0234226777602702, "grad_norm": 6.15625, "learning_rate": 5.219594301935313e-06, "loss": 1.03943319, "memory(GiB)": 142.32, "step": 91500, "train_speed(iter/s)": 0.286964 }, { "acc": 0.7319046, "epoch": 1.0236463767062287, "grad_norm": 6.46875, "learning_rate": 5.217746650638793e-06, "loss": 1.06953659, "memory(GiB)": 142.32, "step": 91520, "train_speed(iter/s)": 0.286984 }, { "acc": 0.7360898, "epoch": 1.0238700756521872, "grad_norm": 5.65625, "learning_rate": 5.21589896955142e-06, "loss": 1.0522974, "memory(GiB)": 142.32, "step": 91540, "train_speed(iter/s)": 0.287006 }, { "acc": 0.72084594, "epoch": 1.0240937745981458, "grad_norm": 7.25, "learning_rate": 5.2140512589259804e-06, "loss": 1.12860699, "memory(GiB)": 142.32, "step": 91560, "train_speed(iter/s)": 0.287028 }, { "acc": 0.74776955, "epoch": 1.0243174735441043, "grad_norm": 7.0, "learning_rate": 5.21220351901527e-06, "loss": 1.02306566, "memory(GiB)": 142.32, "step": 91580, "train_speed(iter/s)": 0.287047 }, { "acc": 0.75009422, "epoch": 1.0245411724900628, "grad_norm": 5.65625, "learning_rate": 5.210355750072085e-06, "loss": 0.97756529, "memory(GiB)": 142.32, "step": 91600, "train_speed(iter/s)": 0.287069 }, { "acc": 0.74382763, "epoch": 1.0247648714360214, "grad_norm": 6.03125, "learning_rate": 5.208507952349227e-06, "loss": 1.01517754, "memory(GiB)": 142.32, "step": 91620, "train_speed(iter/s)": 0.28709 }, { "acc": 0.74078779, "epoch": 1.0249885703819799, "grad_norm": 5.8125, "learning_rate": 5.206660126099501e-06, "loss": 1.03900509, "memory(GiB)": 142.32, "step": 91640, "train_speed(iter/s)": 0.287111 }, { "acc": 0.7400003, "epoch": 1.0252122693279384, "grad_norm": 5.59375, "learning_rate": 5.2048122715757154e-06, "loss": 1.04062176, "memory(GiB)": 142.32, "step": 91660, "train_speed(iter/s)": 0.287129 }, { "acc": 0.7288435, "epoch": 1.025435968273897, "grad_norm": 4.65625, "learning_rate": 5.202964389030683e-06, "loss": 1.07621498, "memory(GiB)": 142.32, "step": 91680, "train_speed(iter/s)": 0.287152 }, { "acc": 0.73801146, "epoch": 1.0256596672198555, "grad_norm": 6.4375, "learning_rate": 5.201116478717222e-06, "loss": 1.02972527, "memory(GiB)": 142.32, "step": 91700, "train_speed(iter/s)": 0.287172 }, { "acc": 0.72341733, "epoch": 1.025883366165814, "grad_norm": 6.5625, "learning_rate": 5.1992685408881515e-06, "loss": 1.10233059, "memory(GiB)": 142.32, "step": 91720, "train_speed(iter/s)": 0.287194 }, { "acc": 0.73947868, "epoch": 1.0261070651117725, "grad_norm": 6.0625, "learning_rate": 5.197420575796298e-06, "loss": 1.02798452, "memory(GiB)": 142.32, "step": 91740, "train_speed(iter/s)": 0.287217 }, { "acc": 0.74151144, "epoch": 1.026330764057731, "grad_norm": 5.28125, "learning_rate": 5.1955725836944874e-06, "loss": 1.03055077, "memory(GiB)": 142.32, "step": 91760, "train_speed(iter/s)": 0.287238 }, { "acc": 0.73416471, "epoch": 1.0265544630036896, "grad_norm": 5.875, "learning_rate": 5.19372456483555e-06, "loss": 1.05826941, "memory(GiB)": 142.32, "step": 91780, "train_speed(iter/s)": 0.287257 }, { "acc": 0.74270506, "epoch": 1.026778161949648, "grad_norm": 6.375, "learning_rate": 5.191876519472325e-06, "loss": 1.01449594, "memory(GiB)": 142.32, "step": 91800, "train_speed(iter/s)": 0.287277 }, { "acc": 0.73530164, "epoch": 1.0270018608956066, "grad_norm": 5.40625, "learning_rate": 5.190028447857649e-06, "loss": 1.04853363, "memory(GiB)": 142.32, "step": 91820, "train_speed(iter/s)": 0.287299 }, { "acc": 0.72725701, "epoch": 1.0272255598415652, "grad_norm": 6.78125, "learning_rate": 5.188180350244366e-06, "loss": 1.09259348, "memory(GiB)": 142.32, "step": 91840, "train_speed(iter/s)": 0.28732 }, { "acc": 0.73477354, "epoch": 1.0274492587875237, "grad_norm": 7.34375, "learning_rate": 5.18633222688532e-06, "loss": 1.0446703, "memory(GiB)": 142.32, "step": 91860, "train_speed(iter/s)": 0.287343 }, { "acc": 0.73248692, "epoch": 1.0276729577334822, "grad_norm": 6.5625, "learning_rate": 5.184484078033363e-06, "loss": 1.07155361, "memory(GiB)": 142.32, "step": 91880, "train_speed(iter/s)": 0.287361 }, { "acc": 0.73739014, "epoch": 1.0278966566794407, "grad_norm": 6.96875, "learning_rate": 5.182635903941346e-06, "loss": 1.0386879, "memory(GiB)": 142.32, "step": 91900, "train_speed(iter/s)": 0.287383 }, { "acc": 0.72421775, "epoch": 1.0281203556253993, "grad_norm": 5.71875, "learning_rate": 5.180787704862128e-06, "loss": 1.09925766, "memory(GiB)": 142.32, "step": 91920, "train_speed(iter/s)": 0.287401 }, { "acc": 0.73143578, "epoch": 1.0283440545713578, "grad_norm": 5.125, "learning_rate": 5.17893948104857e-06, "loss": 1.06808605, "memory(GiB)": 142.32, "step": 91940, "train_speed(iter/s)": 0.287424 }, { "acc": 0.73403187, "epoch": 1.0285677535173163, "grad_norm": 6.125, "learning_rate": 5.17709123275353e-06, "loss": 1.04419289, "memory(GiB)": 142.32, "step": 91960, "train_speed(iter/s)": 0.287447 }, { "acc": 0.72020063, "epoch": 1.0287914524632749, "grad_norm": 7.59375, "learning_rate": 5.17524296022988e-06, "loss": 1.11948795, "memory(GiB)": 142.32, "step": 91980, "train_speed(iter/s)": 0.287469 }, { "acc": 0.73109999, "epoch": 1.0290151514092334, "grad_norm": 5.4375, "learning_rate": 5.173394663730486e-06, "loss": 1.07149448, "memory(GiB)": 142.32, "step": 92000, "train_speed(iter/s)": 0.287491 }, { "epoch": 1.0290151514092334, "eval_acc": 0.6958500892663829, "eval_loss": 1.0735591650009155, "eval_runtime": 2341.351, "eval_samples_per_second": 32.154, "eval_steps_per_second": 16.077, "step": 92000 }, { "acc": 0.73240767, "epoch": 1.029238850355192, "grad_norm": 5.875, "learning_rate": 5.171546343508227e-06, "loss": 1.07178888, "memory(GiB)": 142.32, "step": 92020, "train_speed(iter/s)": 0.285379 }, { "acc": 0.73561964, "epoch": 1.0294625493011504, "grad_norm": 5.9375, "learning_rate": 5.169697999815974e-06, "loss": 1.06108208, "memory(GiB)": 142.32, "step": 92040, "train_speed(iter/s)": 0.285401 }, { "acc": 0.72976518, "epoch": 1.029686248247109, "grad_norm": 6.625, "learning_rate": 5.167849632906609e-06, "loss": 1.07649431, "memory(GiB)": 142.32, "step": 92060, "train_speed(iter/s)": 0.285422 }, { "acc": 0.73523417, "epoch": 1.0299099471930675, "grad_norm": 7.0, "learning_rate": 5.166001243033016e-06, "loss": 1.04543724, "memory(GiB)": 142.32, "step": 92080, "train_speed(iter/s)": 0.285441 }, { "acc": 0.72411308, "epoch": 1.030133646139026, "grad_norm": 5.875, "learning_rate": 5.16415283044808e-06, "loss": 1.09284143, "memory(GiB)": 142.32, "step": 92100, "train_speed(iter/s)": 0.285463 }, { "acc": 0.74285288, "epoch": 1.0303573450849846, "grad_norm": 6.96875, "learning_rate": 5.16230439540469e-06, "loss": 1.01269798, "memory(GiB)": 142.32, "step": 92120, "train_speed(iter/s)": 0.285487 }, { "acc": 0.74530249, "epoch": 1.030581044030943, "grad_norm": 6.5625, "learning_rate": 5.16045593815574e-06, "loss": 1.00427036, "memory(GiB)": 142.32, "step": 92140, "train_speed(iter/s)": 0.285508 }, { "acc": 0.73570261, "epoch": 1.0308047429769016, "grad_norm": 5.0625, "learning_rate": 5.158607458954123e-06, "loss": 1.04545021, "memory(GiB)": 142.32, "step": 92160, "train_speed(iter/s)": 0.285531 }, { "acc": 0.73405018, "epoch": 1.0310284419228601, "grad_norm": 5.8125, "learning_rate": 5.156758958052739e-06, "loss": 1.06300659, "memory(GiB)": 142.32, "step": 92180, "train_speed(iter/s)": 0.28555 }, { "acc": 0.73735819, "epoch": 1.0312521408688187, "grad_norm": 6.90625, "learning_rate": 5.1549104357044886e-06, "loss": 1.04382629, "memory(GiB)": 142.32, "step": 92200, "train_speed(iter/s)": 0.285574 }, { "acc": 0.74059534, "epoch": 1.0314758398147772, "grad_norm": 6.71875, "learning_rate": 5.153061892162276e-06, "loss": 1.03557034, "memory(GiB)": 142.32, "step": 92220, "train_speed(iter/s)": 0.285597 }, { "acc": 0.73285761, "epoch": 1.0316995387607357, "grad_norm": 5.90625, "learning_rate": 5.15121332767901e-06, "loss": 1.06951742, "memory(GiB)": 142.32, "step": 92240, "train_speed(iter/s)": 0.28562 }, { "acc": 0.727001, "epoch": 1.0319232377066943, "grad_norm": 6.15625, "learning_rate": 5.1493647425076e-06, "loss": 1.08517189, "memory(GiB)": 142.32, "step": 92260, "train_speed(iter/s)": 0.285642 }, { "acc": 0.74318061, "epoch": 1.0321469366526528, "grad_norm": 7.09375, "learning_rate": 5.147516136900957e-06, "loss": 1.01356277, "memory(GiB)": 142.32, "step": 92280, "train_speed(iter/s)": 0.285665 }, { "acc": 0.7358099, "epoch": 1.0323706355986113, "grad_norm": 5.65625, "learning_rate": 5.145667511111998e-06, "loss": 1.03963871, "memory(GiB)": 142.32, "step": 92300, "train_speed(iter/s)": 0.285687 }, { "acc": 0.7489254, "epoch": 1.0325943345445698, "grad_norm": 7.0625, "learning_rate": 5.1438188653936415e-06, "loss": 0.99336748, "memory(GiB)": 142.32, "step": 92320, "train_speed(iter/s)": 0.285703 }, { "acc": 0.72841253, "epoch": 1.0328180334905284, "grad_norm": 5.5, "learning_rate": 5.141970199998808e-06, "loss": 1.09143763, "memory(GiB)": 142.32, "step": 92340, "train_speed(iter/s)": 0.285725 }, { "acc": 0.7520802, "epoch": 1.033041732436487, "grad_norm": 5.3125, "learning_rate": 5.140121515180424e-06, "loss": 0.97668476, "memory(GiB)": 142.32, "step": 92360, "train_speed(iter/s)": 0.285741 }, { "acc": 0.74848518, "epoch": 1.0332654313824454, "grad_norm": 5.625, "learning_rate": 5.138272811191413e-06, "loss": 1.00253983, "memory(GiB)": 142.32, "step": 92380, "train_speed(iter/s)": 0.285766 }, { "acc": 0.72819242, "epoch": 1.033489130328404, "grad_norm": 6.0625, "learning_rate": 5.136424088284704e-06, "loss": 1.10396557, "memory(GiB)": 142.32, "step": 92400, "train_speed(iter/s)": 0.285786 }, { "acc": 0.73189583, "epoch": 1.0337128292743625, "grad_norm": 7.09375, "learning_rate": 5.13457534671323e-06, "loss": 1.07007904, "memory(GiB)": 142.32, "step": 92420, "train_speed(iter/s)": 0.28581 }, { "acc": 0.74340057, "epoch": 1.033936528220321, "grad_norm": 5.625, "learning_rate": 5.132726586729926e-06, "loss": 1.02266808, "memory(GiB)": 142.32, "step": 92440, "train_speed(iter/s)": 0.285831 }, { "acc": 0.74336696, "epoch": 1.0341602271662795, "grad_norm": 5.03125, "learning_rate": 5.130877808587728e-06, "loss": 1.01823978, "memory(GiB)": 142.32, "step": 92460, "train_speed(iter/s)": 0.285852 }, { "acc": 0.75268087, "epoch": 1.034383926112238, "grad_norm": 7.28125, "learning_rate": 5.129029012539574e-06, "loss": 0.98010998, "memory(GiB)": 142.32, "step": 92480, "train_speed(iter/s)": 0.285875 }, { "acc": 0.7291851, "epoch": 1.0346076250581966, "grad_norm": 6.375, "learning_rate": 5.127180198838407e-06, "loss": 1.086376, "memory(GiB)": 142.32, "step": 92500, "train_speed(iter/s)": 0.285899 }, { "acc": 0.74054861, "epoch": 1.0348313240041551, "grad_norm": 6.34375, "learning_rate": 5.125331367737171e-06, "loss": 1.03683243, "memory(GiB)": 142.32, "step": 92520, "train_speed(iter/s)": 0.285921 }, { "acc": 0.72609825, "epoch": 1.0350550229501136, "grad_norm": 6.0625, "learning_rate": 5.1234825194888125e-06, "loss": 1.06185074, "memory(GiB)": 142.32, "step": 92540, "train_speed(iter/s)": 0.285943 }, { "acc": 0.72929459, "epoch": 1.0352787218960722, "grad_norm": 6.3125, "learning_rate": 5.121633654346282e-06, "loss": 1.08074179, "memory(GiB)": 142.32, "step": 92560, "train_speed(iter/s)": 0.285967 }, { "acc": 0.74520588, "epoch": 1.0355024208420307, "grad_norm": 5.8125, "learning_rate": 5.119784772562527e-06, "loss": 1.00800438, "memory(GiB)": 142.32, "step": 92580, "train_speed(iter/s)": 0.285989 }, { "acc": 0.73246765, "epoch": 1.0357261197879892, "grad_norm": 7.0, "learning_rate": 5.117935874390503e-06, "loss": 1.04603367, "memory(GiB)": 142.32, "step": 92600, "train_speed(iter/s)": 0.286012 }, { "acc": 0.73491755, "epoch": 1.0359498187339478, "grad_norm": 6.34375, "learning_rate": 5.116086960083168e-06, "loss": 1.04275513, "memory(GiB)": 142.32, "step": 92620, "train_speed(iter/s)": 0.286033 }, { "acc": 0.72558928, "epoch": 1.0361735176799063, "grad_norm": 6.25, "learning_rate": 5.114238029893475e-06, "loss": 1.0923398, "memory(GiB)": 142.32, "step": 92640, "train_speed(iter/s)": 0.286053 }, { "acc": 0.73847771, "epoch": 1.0363972166258648, "grad_norm": 7.75, "learning_rate": 5.1123890840743875e-06, "loss": 1.04862795, "memory(GiB)": 142.32, "step": 92660, "train_speed(iter/s)": 0.286073 }, { "acc": 0.74915013, "epoch": 1.0366209155718233, "grad_norm": 6.0, "learning_rate": 5.110540122878868e-06, "loss": 0.99511852, "memory(GiB)": 142.32, "step": 92680, "train_speed(iter/s)": 0.286088 }, { "acc": 0.72025747, "epoch": 1.0368446145177819, "grad_norm": 4.875, "learning_rate": 5.108691146559878e-06, "loss": 1.12773876, "memory(GiB)": 142.32, "step": 92700, "train_speed(iter/s)": 0.286109 }, { "acc": 0.74657125, "epoch": 1.0370683134637404, "grad_norm": 6.28125, "learning_rate": 5.106842155370386e-06, "loss": 0.97061386, "memory(GiB)": 142.32, "step": 92720, "train_speed(iter/s)": 0.28613 }, { "acc": 0.73926067, "epoch": 1.037292012409699, "grad_norm": 6.09375, "learning_rate": 5.10499314956336e-06, "loss": 1.03638811, "memory(GiB)": 142.32, "step": 92740, "train_speed(iter/s)": 0.286152 }, { "acc": 0.74926872, "epoch": 1.0375157113556575, "grad_norm": 6.71875, "learning_rate": 5.10314412939177e-06, "loss": 0.98369465, "memory(GiB)": 142.32, "step": 92760, "train_speed(iter/s)": 0.286173 }, { "acc": 0.73244529, "epoch": 1.037739410301616, "grad_norm": 6.1875, "learning_rate": 5.101295095108592e-06, "loss": 1.07452393, "memory(GiB)": 142.32, "step": 92780, "train_speed(iter/s)": 0.286193 }, { "acc": 0.74757013, "epoch": 1.0379631092475745, "grad_norm": 5.28125, "learning_rate": 5.099446046966794e-06, "loss": 1.0101799, "memory(GiB)": 142.32, "step": 92800, "train_speed(iter/s)": 0.286213 }, { "acc": 0.75273294, "epoch": 1.038186808193533, "grad_norm": 6.125, "learning_rate": 5.097596985219355e-06, "loss": 0.97679129, "memory(GiB)": 142.32, "step": 92820, "train_speed(iter/s)": 0.286234 }, { "acc": 0.74578829, "epoch": 1.0384105071394916, "grad_norm": 6.375, "learning_rate": 5.095747910119255e-06, "loss": 1.00221224, "memory(GiB)": 142.32, "step": 92840, "train_speed(iter/s)": 0.286255 }, { "acc": 0.73781834, "epoch": 1.03863420608545, "grad_norm": 5.78125, "learning_rate": 5.0938988219194715e-06, "loss": 1.05532055, "memory(GiB)": 142.32, "step": 92860, "train_speed(iter/s)": 0.286273 }, { "acc": 0.73354492, "epoch": 1.0388579050314086, "grad_norm": 6.5, "learning_rate": 5.092049720872988e-06, "loss": 1.05989151, "memory(GiB)": 142.32, "step": 92880, "train_speed(iter/s)": 0.286296 }, { "acc": 0.72352734, "epoch": 1.0390816039773672, "grad_norm": 6.03125, "learning_rate": 5.090200607232787e-06, "loss": 1.10922508, "memory(GiB)": 142.32, "step": 92900, "train_speed(iter/s)": 0.286317 }, { "acc": 0.73580685, "epoch": 1.0393053029233257, "grad_norm": 5.1875, "learning_rate": 5.088351481251852e-06, "loss": 1.05729647, "memory(GiB)": 142.32, "step": 92920, "train_speed(iter/s)": 0.286338 }, { "acc": 0.7321629, "epoch": 1.0395290018692842, "grad_norm": 5.65625, "learning_rate": 5.086502343183173e-06, "loss": 1.07755852, "memory(GiB)": 142.32, "step": 92940, "train_speed(iter/s)": 0.286359 }, { "acc": 0.73322134, "epoch": 1.0397527008152427, "grad_norm": 7.46875, "learning_rate": 5.084653193279736e-06, "loss": 1.06543541, "memory(GiB)": 142.32, "step": 92960, "train_speed(iter/s)": 0.286382 }, { "acc": 0.73336506, "epoch": 1.0399763997612013, "grad_norm": 5.8125, "learning_rate": 5.082804031794534e-06, "loss": 1.03841076, "memory(GiB)": 142.32, "step": 92980, "train_speed(iter/s)": 0.286404 }, { "acc": 0.74815702, "epoch": 1.0402000987071598, "grad_norm": 5.9375, "learning_rate": 5.0809548589805555e-06, "loss": 1.00157509, "memory(GiB)": 142.32, "step": 93000, "train_speed(iter/s)": 0.286425 }, { "acc": 0.72984104, "epoch": 1.0404237976531183, "grad_norm": 6.03125, "learning_rate": 5.079105675090795e-06, "loss": 1.07822056, "memory(GiB)": 142.32, "step": 93020, "train_speed(iter/s)": 0.286447 }, { "acc": 0.73501768, "epoch": 1.0406474965990768, "grad_norm": 5.125, "learning_rate": 5.077256480378248e-06, "loss": 1.05206175, "memory(GiB)": 142.32, "step": 93040, "train_speed(iter/s)": 0.28647 }, { "acc": 0.74076762, "epoch": 1.0408711955450354, "grad_norm": 5.6875, "learning_rate": 5.0754072750959095e-06, "loss": 1.01036911, "memory(GiB)": 142.32, "step": 93060, "train_speed(iter/s)": 0.286491 }, { "acc": 0.73464665, "epoch": 1.0410948944909941, "grad_norm": 6.96875, "learning_rate": 5.073558059496779e-06, "loss": 1.05620823, "memory(GiB)": 142.32, "step": 93080, "train_speed(iter/s)": 0.286512 }, { "acc": 0.7307641, "epoch": 1.0413185934369527, "grad_norm": 6.875, "learning_rate": 5.071708833833855e-06, "loss": 1.06156502, "memory(GiB)": 142.32, "step": 93100, "train_speed(iter/s)": 0.286533 }, { "acc": 0.73359919, "epoch": 1.0415422923829112, "grad_norm": 6.78125, "learning_rate": 5.069859598360136e-06, "loss": 1.06187668, "memory(GiB)": 142.32, "step": 93120, "train_speed(iter/s)": 0.286553 }, { "acc": 0.73710065, "epoch": 1.0417659913288697, "grad_norm": 5.625, "learning_rate": 5.068010353328626e-06, "loss": 1.04067707, "memory(GiB)": 142.32, "step": 93140, "train_speed(iter/s)": 0.286573 }, { "acc": 0.73736773, "epoch": 1.0419896902748282, "grad_norm": 6.84375, "learning_rate": 5.066161098992327e-06, "loss": 1.05919018, "memory(GiB)": 142.32, "step": 93160, "train_speed(iter/s)": 0.286596 }, { "acc": 0.72570744, "epoch": 1.0422133892207868, "grad_norm": 6.03125, "learning_rate": 5.064311835604245e-06, "loss": 1.11103926, "memory(GiB)": 142.32, "step": 93180, "train_speed(iter/s)": 0.286617 }, { "acc": 0.74231319, "epoch": 1.0424370881667453, "grad_norm": 5.46875, "learning_rate": 5.062462563417385e-06, "loss": 1.03465176, "memory(GiB)": 142.32, "step": 93200, "train_speed(iter/s)": 0.286637 }, { "acc": 0.73764391, "epoch": 1.0426607871127038, "grad_norm": 7.09375, "learning_rate": 5.060613282684754e-06, "loss": 1.03866138, "memory(GiB)": 142.32, "step": 93220, "train_speed(iter/s)": 0.286655 }, { "acc": 0.74021063, "epoch": 1.0428844860586624, "grad_norm": 6.65625, "learning_rate": 5.058763993659358e-06, "loss": 1.0363925, "memory(GiB)": 142.32, "step": 93240, "train_speed(iter/s)": 0.286677 }, { "acc": 0.74662395, "epoch": 1.0431081850046209, "grad_norm": 6.625, "learning_rate": 5.056914696594209e-06, "loss": 1.00676451, "memory(GiB)": 142.32, "step": 93260, "train_speed(iter/s)": 0.286695 }, { "acc": 0.74407949, "epoch": 1.0433318839505794, "grad_norm": 5.75, "learning_rate": 5.055065391742314e-06, "loss": 1.00465164, "memory(GiB)": 142.32, "step": 93280, "train_speed(iter/s)": 0.286714 }, { "acc": 0.72166591, "epoch": 1.043555582896538, "grad_norm": 6.03125, "learning_rate": 5.053216079356688e-06, "loss": 1.11039286, "memory(GiB)": 142.32, "step": 93300, "train_speed(iter/s)": 0.286734 }, { "acc": 0.73987951, "epoch": 1.0437792818424965, "grad_norm": 6.21875, "learning_rate": 5.051366759690342e-06, "loss": 1.03328972, "memory(GiB)": 142.32, "step": 93320, "train_speed(iter/s)": 0.286754 }, { "acc": 0.73740015, "epoch": 1.044002980788455, "grad_norm": 6.59375, "learning_rate": 5.049517432996287e-06, "loss": 1.0367321, "memory(GiB)": 142.32, "step": 93340, "train_speed(iter/s)": 0.286775 }, { "acc": 0.75071926, "epoch": 1.0442266797344135, "grad_norm": 5.3125, "learning_rate": 5.047668099527541e-06, "loss": 0.97380428, "memory(GiB)": 142.32, "step": 93360, "train_speed(iter/s)": 0.286795 }, { "acc": 0.72896552, "epoch": 1.044450378680372, "grad_norm": 6.0, "learning_rate": 5.045818759537116e-06, "loss": 1.07577515, "memory(GiB)": 142.32, "step": 93380, "train_speed(iter/s)": 0.286817 }, { "acc": 0.74438252, "epoch": 1.0446740776263306, "grad_norm": 5.3125, "learning_rate": 5.043969413278033e-06, "loss": 1.01116581, "memory(GiB)": 142.32, "step": 93400, "train_speed(iter/s)": 0.286837 }, { "acc": 0.73038273, "epoch": 1.044897776572289, "grad_norm": 5.78125, "learning_rate": 5.042120061003304e-06, "loss": 1.07605629, "memory(GiB)": 142.32, "step": 93420, "train_speed(iter/s)": 0.286857 }, { "acc": 0.72531018, "epoch": 1.0451214755182476, "grad_norm": 5.625, "learning_rate": 5.040270702965948e-06, "loss": 1.09512367, "memory(GiB)": 142.32, "step": 93440, "train_speed(iter/s)": 0.286878 }, { "acc": 0.73709784, "epoch": 1.0453451744642062, "grad_norm": 6.21875, "learning_rate": 5.038421339418985e-06, "loss": 1.04078121, "memory(GiB)": 142.32, "step": 93460, "train_speed(iter/s)": 0.2869 }, { "acc": 0.7314249, "epoch": 1.0455688734101647, "grad_norm": 4.96875, "learning_rate": 5.036571970615434e-06, "loss": 1.0642663, "memory(GiB)": 142.32, "step": 93480, "train_speed(iter/s)": 0.286921 }, { "acc": 0.74509096, "epoch": 1.0457925723561232, "grad_norm": 6.1875, "learning_rate": 5.034722596808314e-06, "loss": 0.99989271, "memory(GiB)": 142.32, "step": 93500, "train_speed(iter/s)": 0.286943 }, { "acc": 0.73083496, "epoch": 1.0460162713020817, "grad_norm": 4.5, "learning_rate": 5.032873218250647e-06, "loss": 1.07526903, "memory(GiB)": 142.32, "step": 93520, "train_speed(iter/s)": 0.286964 }, { "acc": 0.73139315, "epoch": 1.0462399702480403, "grad_norm": 5.90625, "learning_rate": 5.031023835195454e-06, "loss": 1.07052097, "memory(GiB)": 142.32, "step": 93540, "train_speed(iter/s)": 0.286986 }, { "acc": 0.73265266, "epoch": 1.0464636691939988, "grad_norm": 6.875, "learning_rate": 5.0291744478957545e-06, "loss": 1.07041721, "memory(GiB)": 142.32, "step": 93560, "train_speed(iter/s)": 0.287007 }, { "acc": 0.74048576, "epoch": 1.0466873681399573, "grad_norm": 5.625, "learning_rate": 5.027325056604575e-06, "loss": 1.04252567, "memory(GiB)": 142.32, "step": 93580, "train_speed(iter/s)": 0.287029 }, { "acc": 0.74754167, "epoch": 1.0469110670859159, "grad_norm": 6.625, "learning_rate": 5.025475661574938e-06, "loss": 0.99138336, "memory(GiB)": 142.32, "step": 93600, "train_speed(iter/s)": 0.28705 }, { "acc": 0.72694368, "epoch": 1.0471347660318744, "grad_norm": 5.84375, "learning_rate": 5.023626263059866e-06, "loss": 1.08991814, "memory(GiB)": 142.32, "step": 93620, "train_speed(iter/s)": 0.28707 }, { "acc": 0.74419212, "epoch": 1.047358464977833, "grad_norm": 6.5625, "learning_rate": 5.021776861312384e-06, "loss": 1.01474934, "memory(GiB)": 142.32, "step": 93640, "train_speed(iter/s)": 0.287089 }, { "acc": 0.74058428, "epoch": 1.0475821639237914, "grad_norm": 6.0625, "learning_rate": 5.0199274565855146e-06, "loss": 1.0315897, "memory(GiB)": 142.32, "step": 93660, "train_speed(iter/s)": 0.287106 }, { "acc": 0.73831682, "epoch": 1.04780586286975, "grad_norm": 4.9375, "learning_rate": 5.018078049132286e-06, "loss": 1.04729872, "memory(GiB)": 142.32, "step": 93680, "train_speed(iter/s)": 0.287124 }, { "acc": 0.74140348, "epoch": 1.0480295618157085, "grad_norm": 5.34375, "learning_rate": 5.01622863920572e-06, "loss": 1.02404165, "memory(GiB)": 142.32, "step": 93700, "train_speed(iter/s)": 0.287145 }, { "acc": 0.74805508, "epoch": 1.048253260761667, "grad_norm": 5.875, "learning_rate": 5.014379227058847e-06, "loss": 0.99061842, "memory(GiB)": 142.32, "step": 93720, "train_speed(iter/s)": 0.287166 }, { "acc": 0.7477951, "epoch": 1.0484769597076256, "grad_norm": 7.15625, "learning_rate": 5.012529812944688e-06, "loss": 0.98598518, "memory(GiB)": 142.32, "step": 93740, "train_speed(iter/s)": 0.287187 }, { "acc": 0.72632933, "epoch": 1.048700658653584, "grad_norm": 5.3125, "learning_rate": 5.010680397116272e-06, "loss": 1.08719902, "memory(GiB)": 142.32, "step": 93760, "train_speed(iter/s)": 0.287209 }, { "acc": 0.74527798, "epoch": 1.0489243575995426, "grad_norm": 8.25, "learning_rate": 5.008830979826625e-06, "loss": 1.00260105, "memory(GiB)": 142.32, "step": 93780, "train_speed(iter/s)": 0.28723 }, { "acc": 0.74185448, "epoch": 1.0491480565455011, "grad_norm": 6.71875, "learning_rate": 5.006981561328774e-06, "loss": 1.02469349, "memory(GiB)": 142.32, "step": 93800, "train_speed(iter/s)": 0.287244 }, { "acc": 0.73145242, "epoch": 1.0493717554914597, "grad_norm": 6.09375, "learning_rate": 5.005132141875746e-06, "loss": 1.08069878, "memory(GiB)": 142.32, "step": 93820, "train_speed(iter/s)": 0.287266 }, { "acc": 0.73624487, "epoch": 1.0495954544374182, "grad_norm": 4.21875, "learning_rate": 5.003282721720568e-06, "loss": 1.0575532, "memory(GiB)": 142.32, "step": 93840, "train_speed(iter/s)": 0.287289 }, { "acc": 0.7220387, "epoch": 1.0498191533833767, "grad_norm": 5.125, "learning_rate": 5.001433301116265e-06, "loss": 1.12754726, "memory(GiB)": 142.32, "step": 93860, "train_speed(iter/s)": 0.28731 }, { "acc": 0.73146915, "epoch": 1.0500428523293353, "grad_norm": 5.5625, "learning_rate": 4.9995838803158666e-06, "loss": 1.05854807, "memory(GiB)": 142.32, "step": 93880, "train_speed(iter/s)": 0.287334 }, { "acc": 0.74455428, "epoch": 1.0502665512752938, "grad_norm": 6.25, "learning_rate": 4.9977344595724e-06, "loss": 0.99511185, "memory(GiB)": 142.32, "step": 93900, "train_speed(iter/s)": 0.287355 }, { "acc": 0.73598719, "epoch": 1.0504902502212523, "grad_norm": 5.25, "learning_rate": 4.99588503913889e-06, "loss": 1.05489111, "memory(GiB)": 142.32, "step": 93920, "train_speed(iter/s)": 0.287376 }, { "acc": 0.73575554, "epoch": 1.0507139491672108, "grad_norm": 5.6875, "learning_rate": 4.9940356192683685e-06, "loss": 1.03660097, "memory(GiB)": 142.32, "step": 93940, "train_speed(iter/s)": 0.287396 }, { "acc": 0.73830919, "epoch": 1.0509376481131694, "grad_norm": 6.375, "learning_rate": 4.992186200213857e-06, "loss": 1.02699776, "memory(GiB)": 142.32, "step": 93960, "train_speed(iter/s)": 0.287417 }, { "acc": 0.73814039, "epoch": 1.051161347059128, "grad_norm": 4.28125, "learning_rate": 4.990336782228386e-06, "loss": 1.04388733, "memory(GiB)": 142.32, "step": 93980, "train_speed(iter/s)": 0.287437 }, { "acc": 0.73358779, "epoch": 1.0513850460050864, "grad_norm": 5.3125, "learning_rate": 4.98848736556498e-06, "loss": 1.05316105, "memory(GiB)": 142.32, "step": 94000, "train_speed(iter/s)": 0.287453 }, { "epoch": 1.0513850460050864, "eval_acc": 0.6958871110543213, "eval_loss": 1.0735259056091309, "eval_runtime": 2343.3377, "eval_samples_per_second": 32.126, "eval_steps_per_second": 16.063, "step": 94000 }, { "acc": 0.74143724, "epoch": 1.051608744951045, "grad_norm": 6.21875, "learning_rate": 4.9866379504766674e-06, "loss": 1.00937557, "memory(GiB)": 142.32, "step": 94020, "train_speed(iter/s)": 0.285386 }, { "acc": 0.73532715, "epoch": 1.0518324438970035, "grad_norm": 6.0, "learning_rate": 4.9847885372164766e-06, "loss": 1.05401783, "memory(GiB)": 142.32, "step": 94040, "train_speed(iter/s)": 0.285406 }, { "acc": 0.72938528, "epoch": 1.052056142842962, "grad_norm": 6.40625, "learning_rate": 4.982939126037429e-06, "loss": 1.07867975, "memory(GiB)": 142.32, "step": 94060, "train_speed(iter/s)": 0.285426 }, { "acc": 0.75054979, "epoch": 1.0522798417889205, "grad_norm": 5.5625, "learning_rate": 4.981089717192553e-06, "loss": 0.98939199, "memory(GiB)": 142.32, "step": 94080, "train_speed(iter/s)": 0.285448 }, { "acc": 0.74039879, "epoch": 1.052503540734879, "grad_norm": 6.46875, "learning_rate": 4.979240310934873e-06, "loss": 1.02187109, "memory(GiB)": 142.32, "step": 94100, "train_speed(iter/s)": 0.28547 }, { "acc": 0.7413765, "epoch": 1.0527272396808376, "grad_norm": 5.0, "learning_rate": 4.977390907517416e-06, "loss": 1.02962885, "memory(GiB)": 142.32, "step": 94120, "train_speed(iter/s)": 0.285488 }, { "acc": 0.73642817, "epoch": 1.0529509386267961, "grad_norm": 7.03125, "learning_rate": 4.975541507193208e-06, "loss": 1.04298096, "memory(GiB)": 142.32, "step": 94140, "train_speed(iter/s)": 0.285507 }, { "acc": 0.73906159, "epoch": 1.0531746375727546, "grad_norm": 6.15625, "learning_rate": 4.97369211021527e-06, "loss": 1.03648453, "memory(GiB)": 142.32, "step": 94160, "train_speed(iter/s)": 0.285529 }, { "acc": 0.75677962, "epoch": 1.0533983365187132, "grad_norm": 6.59375, "learning_rate": 4.971842716836627e-06, "loss": 0.95592508, "memory(GiB)": 142.32, "step": 94180, "train_speed(iter/s)": 0.28555 }, { "acc": 0.74220347, "epoch": 1.0536220354646717, "grad_norm": 6.90625, "learning_rate": 4.969993327310303e-06, "loss": 1.01326122, "memory(GiB)": 142.32, "step": 94200, "train_speed(iter/s)": 0.285573 }, { "acc": 0.73961105, "epoch": 1.0538457344106302, "grad_norm": 5.53125, "learning_rate": 4.968143941889319e-06, "loss": 1.04060726, "memory(GiB)": 142.32, "step": 94220, "train_speed(iter/s)": 0.285594 }, { "acc": 0.74754338, "epoch": 1.0540694333565888, "grad_norm": 5.84375, "learning_rate": 4.966294560826702e-06, "loss": 0.99403801, "memory(GiB)": 142.32, "step": 94240, "train_speed(iter/s)": 0.285614 }, { "acc": 0.74201784, "epoch": 1.0542931323025473, "grad_norm": 6.59375, "learning_rate": 4.96444518437547e-06, "loss": 1.01372023, "memory(GiB)": 142.32, "step": 94260, "train_speed(iter/s)": 0.285635 }, { "acc": 0.74182043, "epoch": 1.0545168312485058, "grad_norm": 5.6875, "learning_rate": 4.962595812788645e-06, "loss": 1.01487341, "memory(GiB)": 142.32, "step": 94280, "train_speed(iter/s)": 0.285656 }, { "acc": 0.74089918, "epoch": 1.0547405301944643, "grad_norm": 6.125, "learning_rate": 4.960746446319246e-06, "loss": 1.02072983, "memory(GiB)": 142.32, "step": 94300, "train_speed(iter/s)": 0.285678 }, { "acc": 0.73533888, "epoch": 1.0549642291404229, "grad_norm": 5.28125, "learning_rate": 4.958897085220295e-06, "loss": 1.05247593, "memory(GiB)": 142.32, "step": 94320, "train_speed(iter/s)": 0.285697 }, { "acc": 0.73850441, "epoch": 1.0551879280863814, "grad_norm": 6.875, "learning_rate": 4.957047729744811e-06, "loss": 1.02923031, "memory(GiB)": 142.32, "step": 94340, "train_speed(iter/s)": 0.285717 }, { "acc": 0.74716277, "epoch": 1.05541162703234, "grad_norm": 6.3125, "learning_rate": 4.955198380145811e-06, "loss": 1.01154547, "memory(GiB)": 142.32, "step": 94360, "train_speed(iter/s)": 0.285738 }, { "acc": 0.74618034, "epoch": 1.0556353259782985, "grad_norm": 6.4375, "learning_rate": 4.953349036676313e-06, "loss": 1.00372906, "memory(GiB)": 142.32, "step": 94380, "train_speed(iter/s)": 0.285759 }, { "acc": 0.73658447, "epoch": 1.055859024924257, "grad_norm": 6.5, "learning_rate": 4.951499699589333e-06, "loss": 1.05527887, "memory(GiB)": 142.32, "step": 94400, "train_speed(iter/s)": 0.28578 }, { "acc": 0.74626513, "epoch": 1.0560827238702155, "grad_norm": 6.375, "learning_rate": 4.949650369137888e-06, "loss": 1.0005846, "memory(GiB)": 142.32, "step": 94420, "train_speed(iter/s)": 0.2858 }, { "acc": 0.72920032, "epoch": 1.056306422816174, "grad_norm": 5.96875, "learning_rate": 4.947801045574993e-06, "loss": 1.08364639, "memory(GiB)": 142.32, "step": 94440, "train_speed(iter/s)": 0.28582 }, { "acc": 0.72748089, "epoch": 1.0565301217621326, "grad_norm": 7.46875, "learning_rate": 4.945951729153659e-06, "loss": 1.08949022, "memory(GiB)": 142.32, "step": 94460, "train_speed(iter/s)": 0.285838 }, { "acc": 0.74836154, "epoch": 1.056753820708091, "grad_norm": 5.90625, "learning_rate": 4.944102420126902e-06, "loss": 0.99411583, "memory(GiB)": 142.32, "step": 94480, "train_speed(iter/s)": 0.285859 }, { "acc": 0.7415843, "epoch": 1.0569775196540496, "grad_norm": 6.875, "learning_rate": 4.942253118747733e-06, "loss": 1.0394268, "memory(GiB)": 142.32, "step": 94500, "train_speed(iter/s)": 0.285878 }, { "acc": 0.73485804, "epoch": 1.0572012186000082, "grad_norm": 7.03125, "learning_rate": 4.9404038252691625e-06, "loss": 1.04504423, "memory(GiB)": 142.32, "step": 94520, "train_speed(iter/s)": 0.285899 }, { "acc": 0.72448015, "epoch": 1.0574249175459667, "grad_norm": 5.46875, "learning_rate": 4.938554539944201e-06, "loss": 1.08716726, "memory(GiB)": 142.32, "step": 94540, "train_speed(iter/s)": 0.285921 }, { "acc": 0.73474722, "epoch": 1.0576486164919252, "grad_norm": 6.09375, "learning_rate": 4.936705263025856e-06, "loss": 1.06306953, "memory(GiB)": 142.32, "step": 94560, "train_speed(iter/s)": 0.28594 }, { "acc": 0.74450293, "epoch": 1.0578723154378837, "grad_norm": 5.78125, "learning_rate": 4.934855994767136e-06, "loss": 1.00573549, "memory(GiB)": 142.32, "step": 94580, "train_speed(iter/s)": 0.28596 }, { "acc": 0.74054022, "epoch": 1.0580960143838423, "grad_norm": 6.0625, "learning_rate": 4.933006735421047e-06, "loss": 1.01261272, "memory(GiB)": 142.32, "step": 94600, "train_speed(iter/s)": 0.285981 }, { "acc": 0.7382174, "epoch": 1.0583197133298008, "grad_norm": 6.375, "learning_rate": 4.931157485240594e-06, "loss": 1.03544998, "memory(GiB)": 142.32, "step": 94620, "train_speed(iter/s)": 0.286003 }, { "acc": 0.73651962, "epoch": 1.0585434122757593, "grad_norm": 5.96875, "learning_rate": 4.929308244478782e-06, "loss": 1.04476871, "memory(GiB)": 142.32, "step": 94640, "train_speed(iter/s)": 0.286023 }, { "acc": 0.73541889, "epoch": 1.0587671112217178, "grad_norm": 5.78125, "learning_rate": 4.927459013388612e-06, "loss": 1.06485882, "memory(GiB)": 142.32, "step": 94660, "train_speed(iter/s)": 0.28604 }, { "acc": 0.74147501, "epoch": 1.0589908101676764, "grad_norm": 7.09375, "learning_rate": 4.925609792223088e-06, "loss": 1.00975437, "memory(GiB)": 142.32, "step": 94680, "train_speed(iter/s)": 0.286062 }, { "acc": 0.73133755, "epoch": 1.059214509113635, "grad_norm": 6.75, "learning_rate": 4.923760581235204e-06, "loss": 1.0771596, "memory(GiB)": 142.32, "step": 94700, "train_speed(iter/s)": 0.286084 }, { "acc": 0.73758078, "epoch": 1.0594382080595934, "grad_norm": 6.53125, "learning_rate": 4.921911380677964e-06, "loss": 1.03765345, "memory(GiB)": 142.32, "step": 94720, "train_speed(iter/s)": 0.286104 }, { "acc": 0.73682537, "epoch": 1.059661907005552, "grad_norm": 7.625, "learning_rate": 4.920062190804363e-06, "loss": 1.04775887, "memory(GiB)": 142.32, "step": 94740, "train_speed(iter/s)": 0.286123 }, { "acc": 0.72918653, "epoch": 1.0598856059515105, "grad_norm": 6.59375, "learning_rate": 4.918213011867396e-06, "loss": 1.07698612, "memory(GiB)": 142.32, "step": 94760, "train_speed(iter/s)": 0.286142 }, { "acc": 0.72827148, "epoch": 1.060109304897469, "grad_norm": 5.90625, "learning_rate": 4.91636384412006e-06, "loss": 1.08089809, "memory(GiB)": 142.32, "step": 94780, "train_speed(iter/s)": 0.286163 }, { "acc": 0.73681688, "epoch": 1.0603330038434275, "grad_norm": 7.5, "learning_rate": 4.9145146878153435e-06, "loss": 1.05882063, "memory(GiB)": 142.32, "step": 94800, "train_speed(iter/s)": 0.286181 }, { "acc": 0.73327484, "epoch": 1.060556702789386, "grad_norm": 5.0, "learning_rate": 4.91266554320624e-06, "loss": 1.05064564, "memory(GiB)": 142.32, "step": 94820, "train_speed(iter/s)": 0.286202 }, { "acc": 0.72493539, "epoch": 1.0607804017353446, "grad_norm": 6.28125, "learning_rate": 4.910816410545739e-06, "loss": 1.09374752, "memory(GiB)": 142.32, "step": 94840, "train_speed(iter/s)": 0.286222 }, { "acc": 0.73769035, "epoch": 1.0610041006813031, "grad_norm": 5.125, "learning_rate": 4.908967290086827e-06, "loss": 1.04427357, "memory(GiB)": 142.32, "step": 94860, "train_speed(iter/s)": 0.286245 }, { "acc": 0.73394742, "epoch": 1.0612277996272617, "grad_norm": 5.90625, "learning_rate": 4.907118182082493e-06, "loss": 1.03877335, "memory(GiB)": 142.32, "step": 94880, "train_speed(iter/s)": 0.286269 }, { "acc": 0.74454961, "epoch": 1.0614514985732202, "grad_norm": 7.09375, "learning_rate": 4.905269086785717e-06, "loss": 1.01793118, "memory(GiB)": 142.32, "step": 94900, "train_speed(iter/s)": 0.286288 }, { "acc": 0.7357049, "epoch": 1.0616751975191787, "grad_norm": 6.8125, "learning_rate": 4.9034200044494845e-06, "loss": 1.0473485, "memory(GiB)": 142.32, "step": 94920, "train_speed(iter/s)": 0.286308 }, { "acc": 0.73749976, "epoch": 1.0618988964651372, "grad_norm": 5.875, "learning_rate": 4.901570935326776e-06, "loss": 1.04097567, "memory(GiB)": 142.32, "step": 94940, "train_speed(iter/s)": 0.286329 }, { "acc": 0.72928019, "epoch": 1.0621225954110958, "grad_norm": 5.3125, "learning_rate": 4.899721879670571e-06, "loss": 1.08908091, "memory(GiB)": 142.32, "step": 94960, "train_speed(iter/s)": 0.28635 }, { "acc": 0.73040771, "epoch": 1.0623462943570543, "grad_norm": 5.25, "learning_rate": 4.897872837733845e-06, "loss": 1.07320461, "memory(GiB)": 142.32, "step": 94980, "train_speed(iter/s)": 0.28637 }, { "acc": 0.73656425, "epoch": 1.0625699933030128, "grad_norm": 6.25, "learning_rate": 4.896023809769576e-06, "loss": 1.04554844, "memory(GiB)": 142.32, "step": 95000, "train_speed(iter/s)": 0.286389 }, { "acc": 0.73466663, "epoch": 1.0627936922489714, "grad_norm": 5.0625, "learning_rate": 4.894174796030735e-06, "loss": 1.07052841, "memory(GiB)": 142.32, "step": 95020, "train_speed(iter/s)": 0.286408 }, { "acc": 0.74337273, "epoch": 1.0630173911949299, "grad_norm": 5.34375, "learning_rate": 4.892325796770294e-06, "loss": 1.00803814, "memory(GiB)": 142.32, "step": 95040, "train_speed(iter/s)": 0.28643 }, { "acc": 0.74272647, "epoch": 1.0632410901408884, "grad_norm": 6.625, "learning_rate": 4.890476812241223e-06, "loss": 1.01511116, "memory(GiB)": 142.32, "step": 95060, "train_speed(iter/s)": 0.286451 }, { "acc": 0.72845697, "epoch": 1.063464789086847, "grad_norm": 5.25, "learning_rate": 4.8886278426964916e-06, "loss": 1.08337927, "memory(GiB)": 142.32, "step": 95080, "train_speed(iter/s)": 0.286472 }, { "acc": 0.74063234, "epoch": 1.0636884880328055, "grad_norm": 4.75, "learning_rate": 4.886778888389061e-06, "loss": 1.02775555, "memory(GiB)": 142.32, "step": 95100, "train_speed(iter/s)": 0.286492 }, { "acc": 0.72948627, "epoch": 1.063912186978764, "grad_norm": 7.75, "learning_rate": 4.884929949571898e-06, "loss": 1.0674696, "memory(GiB)": 142.32, "step": 95120, "train_speed(iter/s)": 0.286512 }, { "acc": 0.73695736, "epoch": 1.0641358859247225, "grad_norm": 6.4375, "learning_rate": 4.883081026497962e-06, "loss": 1.06440144, "memory(GiB)": 142.32, "step": 95140, "train_speed(iter/s)": 0.286532 }, { "acc": 0.7280705, "epoch": 1.064359584870681, "grad_norm": 7.3125, "learning_rate": 4.881232119420212e-06, "loss": 1.10048294, "memory(GiB)": 142.32, "step": 95160, "train_speed(iter/s)": 0.286553 }, { "acc": 0.73351126, "epoch": 1.0645832838166396, "grad_norm": 5.6875, "learning_rate": 4.879383228591608e-06, "loss": 1.06917667, "memory(GiB)": 142.32, "step": 95180, "train_speed(iter/s)": 0.28657 }, { "acc": 0.7350647, "epoch": 1.064806982762598, "grad_norm": 6.75, "learning_rate": 4.8775343542651e-06, "loss": 1.06164455, "memory(GiB)": 142.32, "step": 95200, "train_speed(iter/s)": 0.286588 }, { "acc": 0.73416939, "epoch": 1.0650306817085566, "grad_norm": 5.6875, "learning_rate": 4.875685496693643e-06, "loss": 1.07127571, "memory(GiB)": 142.32, "step": 95220, "train_speed(iter/s)": 0.286607 }, { "acc": 0.73242421, "epoch": 1.0652543806545152, "grad_norm": 5.9375, "learning_rate": 4.873836656130188e-06, "loss": 1.06227894, "memory(GiB)": 142.32, "step": 95240, "train_speed(iter/s)": 0.286626 }, { "acc": 0.73984022, "epoch": 1.0654780796004737, "grad_norm": 5.1875, "learning_rate": 4.871987832827681e-06, "loss": 1.02639332, "memory(GiB)": 142.32, "step": 95260, "train_speed(iter/s)": 0.286646 }, { "acc": 0.73564167, "epoch": 1.0657017785464322, "grad_norm": 5.84375, "learning_rate": 4.87013902703907e-06, "loss": 1.07578897, "memory(GiB)": 142.32, "step": 95280, "train_speed(iter/s)": 0.286664 }, { "acc": 0.74487839, "epoch": 1.0659254774923907, "grad_norm": 5.78125, "learning_rate": 4.868290239017293e-06, "loss": 1.01640358, "memory(GiB)": 142.32, "step": 95300, "train_speed(iter/s)": 0.286686 }, { "acc": 0.74430046, "epoch": 1.0661491764383493, "grad_norm": 5.34375, "learning_rate": 4.866441469015296e-06, "loss": 1.0140934, "memory(GiB)": 142.32, "step": 95320, "train_speed(iter/s)": 0.286705 }, { "acc": 0.73156829, "epoch": 1.0663728753843078, "grad_norm": 6.34375, "learning_rate": 4.864592717286015e-06, "loss": 1.07348309, "memory(GiB)": 142.32, "step": 95340, "train_speed(iter/s)": 0.286724 }, { "acc": 0.7316823, "epoch": 1.0665965743302663, "grad_norm": 6.375, "learning_rate": 4.8627439840823845e-06, "loss": 1.07203512, "memory(GiB)": 142.32, "step": 95360, "train_speed(iter/s)": 0.286744 }, { "acc": 0.73138351, "epoch": 1.0668202732762249, "grad_norm": 7.125, "learning_rate": 4.860895269657341e-06, "loss": 1.07760649, "memory(GiB)": 142.32, "step": 95380, "train_speed(iter/s)": 0.286764 }, { "acc": 0.72844734, "epoch": 1.0670439722221834, "grad_norm": 6.3125, "learning_rate": 4.859046574263811e-06, "loss": 1.08124132, "memory(GiB)": 142.32, "step": 95400, "train_speed(iter/s)": 0.286786 }, { "acc": 0.73446932, "epoch": 1.067267671168142, "grad_norm": 5.6875, "learning_rate": 4.857197898154725e-06, "loss": 1.05164719, "memory(GiB)": 142.32, "step": 95420, "train_speed(iter/s)": 0.286808 }, { "acc": 0.73284473, "epoch": 1.0674913701141004, "grad_norm": 5.0, "learning_rate": 4.855349241583007e-06, "loss": 1.06890907, "memory(GiB)": 142.32, "step": 95440, "train_speed(iter/s)": 0.286829 }, { "acc": 0.7235054, "epoch": 1.067715069060059, "grad_norm": 6.15625, "learning_rate": 4.853500604801581e-06, "loss": 1.11579514, "memory(GiB)": 142.32, "step": 95460, "train_speed(iter/s)": 0.286849 }, { "acc": 0.73806419, "epoch": 1.0679387680060175, "grad_norm": 6.15625, "learning_rate": 4.851651988063367e-06, "loss": 1.03257742, "memory(GiB)": 142.32, "step": 95480, "train_speed(iter/s)": 0.286869 }, { "acc": 0.73069172, "epoch": 1.068162466951976, "grad_norm": 5.53125, "learning_rate": 4.849803391621279e-06, "loss": 1.07647905, "memory(GiB)": 142.32, "step": 95500, "train_speed(iter/s)": 0.286888 }, { "acc": 0.73829384, "epoch": 1.0683861658979346, "grad_norm": 6.25, "learning_rate": 4.847954815728236e-06, "loss": 1.03944454, "memory(GiB)": 142.32, "step": 95520, "train_speed(iter/s)": 0.286906 }, { "acc": 0.73839989, "epoch": 1.068609864843893, "grad_norm": 4.96875, "learning_rate": 4.846106260637146e-06, "loss": 1.03847046, "memory(GiB)": 142.32, "step": 95540, "train_speed(iter/s)": 0.286926 }, { "acc": 0.75004535, "epoch": 1.0688335637898516, "grad_norm": 6.5, "learning_rate": 4.84425772660092e-06, "loss": 0.98294125, "memory(GiB)": 142.32, "step": 95560, "train_speed(iter/s)": 0.286946 }, { "acc": 0.7369688, "epoch": 1.0690572627358101, "grad_norm": 6.96875, "learning_rate": 4.842409213872464e-06, "loss": 1.04074497, "memory(GiB)": 142.32, "step": 95580, "train_speed(iter/s)": 0.286967 }, { "acc": 0.7500349, "epoch": 1.0692809616817687, "grad_norm": 6.5625, "learning_rate": 4.840560722704678e-06, "loss": 0.99185944, "memory(GiB)": 142.32, "step": 95600, "train_speed(iter/s)": 0.286989 }, { "acc": 0.72582741, "epoch": 1.0695046606277272, "grad_norm": 4.78125, "learning_rate": 4.838712253350465e-06, "loss": 1.08184986, "memory(GiB)": 142.32, "step": 95620, "train_speed(iter/s)": 0.28701 }, { "acc": 0.72679515, "epoch": 1.0697283595736857, "grad_norm": 6.25, "learning_rate": 4.836863806062721e-06, "loss": 1.08563271, "memory(GiB)": 142.32, "step": 95640, "train_speed(iter/s)": 0.28703 }, { "acc": 0.73284984, "epoch": 1.0699520585196443, "grad_norm": 6.65625, "learning_rate": 4.83501538109434e-06, "loss": 1.06356859, "memory(GiB)": 142.32, "step": 95660, "train_speed(iter/s)": 0.287047 }, { "acc": 0.73654013, "epoch": 1.0701757574656028, "grad_norm": 5.59375, "learning_rate": 4.8331669786982135e-06, "loss": 1.03881283, "memory(GiB)": 142.32, "step": 95680, "train_speed(iter/s)": 0.287067 }, { "acc": 0.73387384, "epoch": 1.0703994564115613, "grad_norm": 5.5, "learning_rate": 4.831318599127229e-06, "loss": 1.05341663, "memory(GiB)": 142.32, "step": 95700, "train_speed(iter/s)": 0.287088 }, { "acc": 0.7395752, "epoch": 1.0706231553575198, "grad_norm": 5.4375, "learning_rate": 4.8294702426342705e-06, "loss": 1.04249926, "memory(GiB)": 142.32, "step": 95720, "train_speed(iter/s)": 0.28711 }, { "acc": 0.74158134, "epoch": 1.0708468543034784, "grad_norm": 6.25, "learning_rate": 4.827621909472221e-06, "loss": 1.01806507, "memory(GiB)": 142.32, "step": 95740, "train_speed(iter/s)": 0.287131 }, { "acc": 0.73595982, "epoch": 1.071070553249437, "grad_norm": 5.96875, "learning_rate": 4.825773599893956e-06, "loss": 1.02975636, "memory(GiB)": 142.32, "step": 95760, "train_speed(iter/s)": 0.287152 }, { "acc": 0.73389831, "epoch": 1.0712942521953954, "grad_norm": 6.9375, "learning_rate": 4.8239253141523565e-06, "loss": 1.06278381, "memory(GiB)": 142.32, "step": 95780, "train_speed(iter/s)": 0.287173 }, { "acc": 0.74360552, "epoch": 1.071517951141354, "grad_norm": 6.78125, "learning_rate": 4.822077052500288e-06, "loss": 1.03064108, "memory(GiB)": 142.32, "step": 95800, "train_speed(iter/s)": 0.287193 }, { "acc": 0.73565521, "epoch": 1.0717416500873125, "grad_norm": 6.09375, "learning_rate": 4.820228815190622e-06, "loss": 1.04209633, "memory(GiB)": 142.32, "step": 95820, "train_speed(iter/s)": 0.287214 }, { "acc": 0.74582715, "epoch": 1.071965349033271, "grad_norm": 5.8125, "learning_rate": 4.818380602476224e-06, "loss": 1.01230373, "memory(GiB)": 142.32, "step": 95840, "train_speed(iter/s)": 0.287235 }, { "acc": 0.74107428, "epoch": 1.0721890479792295, "grad_norm": 5.53125, "learning_rate": 4.816532414609956e-06, "loss": 1.03293705, "memory(GiB)": 142.32, "step": 95860, "train_speed(iter/s)": 0.287257 }, { "acc": 0.73612528, "epoch": 1.072412746925188, "grad_norm": 8.4375, "learning_rate": 4.814684251844678e-06, "loss": 1.0404705, "memory(GiB)": 142.32, "step": 95880, "train_speed(iter/s)": 0.287278 }, { "acc": 0.72681923, "epoch": 1.0726364458711466, "grad_norm": 6.875, "learning_rate": 4.81283611443324e-06, "loss": 1.0763113, "memory(GiB)": 142.32, "step": 95900, "train_speed(iter/s)": 0.287295 }, { "acc": 0.72233148, "epoch": 1.0728601448171051, "grad_norm": 7.03125, "learning_rate": 4.810988002628497e-06, "loss": 1.11680183, "memory(GiB)": 142.32, "step": 95920, "train_speed(iter/s)": 0.287317 }, { "acc": 0.74733315, "epoch": 1.0730838437630636, "grad_norm": 6.75, "learning_rate": 4.809139916683298e-06, "loss": 0.99286938, "memory(GiB)": 142.32, "step": 95940, "train_speed(iter/s)": 0.287337 }, { "acc": 0.74476957, "epoch": 1.0733075427090222, "grad_norm": 6.09375, "learning_rate": 4.807291856850485e-06, "loss": 1.00967503, "memory(GiB)": 142.32, "step": 95960, "train_speed(iter/s)": 0.287355 }, { "acc": 0.738519, "epoch": 1.0735312416549807, "grad_norm": 6.1875, "learning_rate": 4.805443823382901e-06, "loss": 1.02184601, "memory(GiB)": 142.32, "step": 95980, "train_speed(iter/s)": 0.287375 }, { "acc": 0.73593788, "epoch": 1.0737549406009392, "grad_norm": 5.78125, "learning_rate": 4.8035958165333835e-06, "loss": 1.04638195, "memory(GiB)": 142.32, "step": 96000, "train_speed(iter/s)": 0.287395 }, { "epoch": 1.0737549406009392, "eval_acc": 0.6959351752929898, "eval_loss": 1.0736260414123535, "eval_runtime": 2340.9208, "eval_samples_per_second": 32.16, "eval_steps_per_second": 16.08, "step": 96000 }, { "acc": 0.73708272, "epoch": 1.0739786395468978, "grad_norm": 6.03125, "learning_rate": 4.801747836554765e-06, "loss": 1.03715, "memory(GiB)": 142.32, "step": 96020, "train_speed(iter/s)": 0.28537 }, { "acc": 0.72311945, "epoch": 1.0742023384928563, "grad_norm": 5.65625, "learning_rate": 4.799899883699876e-06, "loss": 1.11556311, "memory(GiB)": 142.32, "step": 96040, "train_speed(iter/s)": 0.285392 }, { "acc": 0.74255209, "epoch": 1.0744260374388148, "grad_norm": 7.1875, "learning_rate": 4.798051958221544e-06, "loss": 1.02667179, "memory(GiB)": 142.32, "step": 96060, "train_speed(iter/s)": 0.285411 }, { "acc": 0.73045149, "epoch": 1.0746497363847733, "grad_norm": 5.5, "learning_rate": 4.796204060372589e-06, "loss": 1.08709698, "memory(GiB)": 142.32, "step": 96080, "train_speed(iter/s)": 0.28543 }, { "acc": 0.74154701, "epoch": 1.0748734353307319, "grad_norm": 7.40625, "learning_rate": 4.794356190405832e-06, "loss": 1.02306862, "memory(GiB)": 142.32, "step": 96100, "train_speed(iter/s)": 0.285449 }, { "acc": 0.74569559, "epoch": 1.0750971342766904, "grad_norm": 7.8125, "learning_rate": 4.792508348574088e-06, "loss": 1.0125823, "memory(GiB)": 142.32, "step": 96120, "train_speed(iter/s)": 0.285467 }, { "acc": 0.7354104, "epoch": 1.075320833222649, "grad_norm": 7.125, "learning_rate": 4.790660535130168e-06, "loss": 1.05624313, "memory(GiB)": 142.32, "step": 96140, "train_speed(iter/s)": 0.285488 }, { "acc": 0.74626713, "epoch": 1.0755445321686075, "grad_norm": 5.1875, "learning_rate": 4.788812750326878e-06, "loss": 1.0201643, "memory(GiB)": 142.32, "step": 96160, "train_speed(iter/s)": 0.285508 }, { "acc": 0.7275281, "epoch": 1.075768231114566, "grad_norm": 6.4375, "learning_rate": 4.786964994417023e-06, "loss": 1.07175426, "memory(GiB)": 142.32, "step": 96180, "train_speed(iter/s)": 0.28553 }, { "acc": 0.73009186, "epoch": 1.0759919300605245, "grad_norm": 6.09375, "learning_rate": 4.7851172676534006e-06, "loss": 1.07974167, "memory(GiB)": 142.32, "step": 96200, "train_speed(iter/s)": 0.285551 }, { "acc": 0.73915181, "epoch": 1.076215629006483, "grad_norm": 6.03125, "learning_rate": 4.7832695702888085e-06, "loss": 1.04522333, "memory(GiB)": 142.32, "step": 96220, "train_speed(iter/s)": 0.28557 }, { "acc": 0.7362009, "epoch": 1.0764393279524416, "grad_norm": 5.59375, "learning_rate": 4.781421902576037e-06, "loss": 1.05121431, "memory(GiB)": 142.32, "step": 96240, "train_speed(iter/s)": 0.285592 }, { "acc": 0.73460464, "epoch": 1.0766630268984, "grad_norm": 5.96875, "learning_rate": 4.779574264767873e-06, "loss": 1.0677393, "memory(GiB)": 142.32, "step": 96260, "train_speed(iter/s)": 0.285613 }, { "acc": 0.73105931, "epoch": 1.0768867258443586, "grad_norm": 6.78125, "learning_rate": 4.7777266571171e-06, "loss": 1.08585558, "memory(GiB)": 142.32, "step": 96280, "train_speed(iter/s)": 0.285634 }, { "acc": 0.7396265, "epoch": 1.0771104247903172, "grad_norm": 7.4375, "learning_rate": 4.775879079876497e-06, "loss": 1.03605022, "memory(GiB)": 142.32, "step": 96300, "train_speed(iter/s)": 0.285655 }, { "acc": 0.73899212, "epoch": 1.0773341237362757, "grad_norm": 7.3125, "learning_rate": 4.77403153329884e-06, "loss": 1.04006624, "memory(GiB)": 142.32, "step": 96320, "train_speed(iter/s)": 0.285672 }, { "acc": 0.75077162, "epoch": 1.0775578226822342, "grad_norm": 7.28125, "learning_rate": 4.7721840176369e-06, "loss": 0.97326298, "memory(GiB)": 142.32, "step": 96340, "train_speed(iter/s)": 0.285694 }, { "acc": 0.73707576, "epoch": 1.0777815216281927, "grad_norm": 6.875, "learning_rate": 4.770336533143442e-06, "loss": 1.06238728, "memory(GiB)": 142.32, "step": 96360, "train_speed(iter/s)": 0.285715 }, { "acc": 0.73021164, "epoch": 1.0780052205741513, "grad_norm": 5.59375, "learning_rate": 4.768489080071227e-06, "loss": 1.07188015, "memory(GiB)": 142.32, "step": 96380, "train_speed(iter/s)": 0.285733 }, { "acc": 0.73843966, "epoch": 1.0782289195201098, "grad_norm": 6.25, "learning_rate": 4.766641658673017e-06, "loss": 1.03632116, "memory(GiB)": 142.32, "step": 96400, "train_speed(iter/s)": 0.285753 }, { "acc": 0.7365026, "epoch": 1.0784526184660683, "grad_norm": 6.59375, "learning_rate": 4.7647942692015625e-06, "loss": 1.0526783, "memory(GiB)": 142.32, "step": 96420, "train_speed(iter/s)": 0.285772 }, { "acc": 0.73550758, "epoch": 1.0786763174120269, "grad_norm": 6.28125, "learning_rate": 4.762946911909615e-06, "loss": 1.06353893, "memory(GiB)": 142.32, "step": 96440, "train_speed(iter/s)": 0.285794 }, { "acc": 0.73276653, "epoch": 1.0789000163579854, "grad_norm": 6.0625, "learning_rate": 4.761099587049918e-06, "loss": 1.07552185, "memory(GiB)": 142.32, "step": 96460, "train_speed(iter/s)": 0.285813 }, { "acc": 0.73711572, "epoch": 1.079123715303944, "grad_norm": 6.90625, "learning_rate": 4.7592522948752115e-06, "loss": 1.03956642, "memory(GiB)": 142.32, "step": 96480, "train_speed(iter/s)": 0.285836 }, { "acc": 0.72950377, "epoch": 1.0793474142499024, "grad_norm": 6.625, "learning_rate": 4.757405035638232e-06, "loss": 1.10110836, "memory(GiB)": 142.32, "step": 96500, "train_speed(iter/s)": 0.285857 }, { "acc": 0.74743891, "epoch": 1.079571113195861, "grad_norm": 6.1875, "learning_rate": 4.755557809591711e-06, "loss": 0.99717159, "memory(GiB)": 142.32, "step": 96520, "train_speed(iter/s)": 0.285876 }, { "acc": 0.7310585, "epoch": 1.0797948121418195, "grad_norm": 6.0625, "learning_rate": 4.753710616988377e-06, "loss": 1.07249508, "memory(GiB)": 142.32, "step": 96540, "train_speed(iter/s)": 0.285895 }, { "acc": 0.73773279, "epoch": 1.080018511087778, "grad_norm": 5.5625, "learning_rate": 4.751863458080949e-06, "loss": 1.05089111, "memory(GiB)": 142.32, "step": 96560, "train_speed(iter/s)": 0.285916 }, { "acc": 0.73005915, "epoch": 1.0802422100337365, "grad_norm": 7.34375, "learning_rate": 4.750016333122147e-06, "loss": 1.06112614, "memory(GiB)": 142.32, "step": 96580, "train_speed(iter/s)": 0.285937 }, { "acc": 0.72605515, "epoch": 1.080465908979695, "grad_norm": 5.875, "learning_rate": 4.748169242364684e-06, "loss": 1.09638453, "memory(GiB)": 142.32, "step": 96600, "train_speed(iter/s)": 0.285958 }, { "acc": 0.73699532, "epoch": 1.0806896079256536, "grad_norm": 5.25, "learning_rate": 4.746322186061269e-06, "loss": 1.03443203, "memory(GiB)": 142.32, "step": 96620, "train_speed(iter/s)": 0.28598 }, { "acc": 0.74685922, "epoch": 1.0809133068716121, "grad_norm": 5.625, "learning_rate": 4.7444751644646045e-06, "loss": 0.99803696, "memory(GiB)": 142.32, "step": 96640, "train_speed(iter/s)": 0.285999 }, { "acc": 0.73614402, "epoch": 1.0811370058175707, "grad_norm": 6.15625, "learning_rate": 4.7426281778273896e-06, "loss": 1.04889736, "memory(GiB)": 142.32, "step": 96660, "train_speed(iter/s)": 0.286018 }, { "acc": 0.73497047, "epoch": 1.0813607047635292, "grad_norm": 5.8125, "learning_rate": 4.740781226402318e-06, "loss": 1.03181038, "memory(GiB)": 142.32, "step": 96680, "train_speed(iter/s)": 0.286037 }, { "acc": 0.74568329, "epoch": 1.0815844037094877, "grad_norm": 5.25, "learning_rate": 4.73893431044208e-06, "loss": 1.00656958, "memory(GiB)": 142.32, "step": 96700, "train_speed(iter/s)": 0.286058 }, { "acc": 0.73956027, "epoch": 1.0818081026554462, "grad_norm": 5.28125, "learning_rate": 4.73708743019936e-06, "loss": 1.02853928, "memory(GiB)": 142.32, "step": 96720, "train_speed(iter/s)": 0.286079 }, { "acc": 0.74986658, "epoch": 1.0820318016014048, "grad_norm": 6.46875, "learning_rate": 4.735240585926838e-06, "loss": 0.98019886, "memory(GiB)": 142.32, "step": 96740, "train_speed(iter/s)": 0.286097 }, { "acc": 0.73899145, "epoch": 1.0822555005473633, "grad_norm": 6.90625, "learning_rate": 4.733393777877187e-06, "loss": 1.0394146, "memory(GiB)": 142.32, "step": 96760, "train_speed(iter/s)": 0.28612 }, { "acc": 0.73279562, "epoch": 1.0824791994933218, "grad_norm": 5.15625, "learning_rate": 4.7315470063030785e-06, "loss": 1.06747122, "memory(GiB)": 142.32, "step": 96780, "train_speed(iter/s)": 0.28614 }, { "acc": 0.73028693, "epoch": 1.0827028984392804, "grad_norm": 6.5625, "learning_rate": 4.729700271457176e-06, "loss": 1.07729397, "memory(GiB)": 142.32, "step": 96800, "train_speed(iter/s)": 0.28616 }, { "acc": 0.72579675, "epoch": 1.0829265973852389, "grad_norm": 6.1875, "learning_rate": 4.7278535735921405e-06, "loss": 1.10428028, "memory(GiB)": 142.32, "step": 96820, "train_speed(iter/s)": 0.286178 }, { "acc": 0.72973127, "epoch": 1.0831502963311974, "grad_norm": 6.6875, "learning_rate": 4.7260069129606275e-06, "loss": 1.0794363, "memory(GiB)": 142.32, "step": 96840, "train_speed(iter/s)": 0.286199 }, { "acc": 0.7428978, "epoch": 1.083373995277156, "grad_norm": 5.9375, "learning_rate": 4.724160289815283e-06, "loss": 1.01972189, "memory(GiB)": 142.32, "step": 96860, "train_speed(iter/s)": 0.286219 }, { "acc": 0.72988534, "epoch": 1.0835976942231145, "grad_norm": 5.96875, "learning_rate": 4.722313704408754e-06, "loss": 1.07956476, "memory(GiB)": 142.32, "step": 96880, "train_speed(iter/s)": 0.286241 }, { "acc": 0.7358614, "epoch": 1.083821393169073, "grad_norm": 5.375, "learning_rate": 4.720467156993679e-06, "loss": 1.03474913, "memory(GiB)": 142.32, "step": 96900, "train_speed(iter/s)": 0.286262 }, { "acc": 0.73298044, "epoch": 1.0840450921150315, "grad_norm": 5.3125, "learning_rate": 4.718620647822692e-06, "loss": 1.0620821, "memory(GiB)": 142.32, "step": 96920, "train_speed(iter/s)": 0.286283 }, { "acc": 0.7402647, "epoch": 1.08426879106099, "grad_norm": 5.4375, "learning_rate": 4.716774177148424e-06, "loss": 1.05583401, "memory(GiB)": 142.32, "step": 96940, "train_speed(iter/s)": 0.286304 }, { "acc": 0.73145304, "epoch": 1.0844924900069486, "grad_norm": 6.15625, "learning_rate": 4.714927745223495e-06, "loss": 1.07038193, "memory(GiB)": 142.32, "step": 96960, "train_speed(iter/s)": 0.286326 }, { "acc": 0.74049687, "epoch": 1.0847161889529071, "grad_norm": 7.1875, "learning_rate": 4.7130813523005255e-06, "loss": 1.03442955, "memory(GiB)": 142.32, "step": 96980, "train_speed(iter/s)": 0.286348 }, { "acc": 0.7492692, "epoch": 1.0849398878988656, "grad_norm": 5.5, "learning_rate": 4.711234998632128e-06, "loss": 0.98503294, "memory(GiB)": 142.32, "step": 97000, "train_speed(iter/s)": 0.286369 }, { "acc": 0.74296041, "epoch": 1.0851635868448242, "grad_norm": 6.28125, "learning_rate": 4.709388684470911e-06, "loss": 1.01463108, "memory(GiB)": 142.32, "step": 97020, "train_speed(iter/s)": 0.286386 }, { "acc": 0.73620782, "epoch": 1.0853872857907827, "grad_norm": 5.1875, "learning_rate": 4.707542410069476e-06, "loss": 1.04570847, "memory(GiB)": 142.32, "step": 97040, "train_speed(iter/s)": 0.286408 }, { "acc": 0.7344285, "epoch": 1.0856109847367412, "grad_norm": 5.40625, "learning_rate": 4.705696175680419e-06, "loss": 1.04086151, "memory(GiB)": 142.32, "step": 97060, "train_speed(iter/s)": 0.286428 }, { "acc": 0.74534874, "epoch": 1.0858346836826998, "grad_norm": 6.125, "learning_rate": 4.703849981556332e-06, "loss": 1.01035824, "memory(GiB)": 142.32, "step": 97080, "train_speed(iter/s)": 0.286446 }, { "acc": 0.73494911, "epoch": 1.0860583826286583, "grad_norm": 5.9375, "learning_rate": 4.7020038279498e-06, "loss": 1.06505775, "memory(GiB)": 142.32, "step": 97100, "train_speed(iter/s)": 0.286466 }, { "acc": 0.72854056, "epoch": 1.0862820815746168, "grad_norm": 6.84375, "learning_rate": 4.700157715113403e-06, "loss": 1.08126545, "memory(GiB)": 142.32, "step": 97120, "train_speed(iter/s)": 0.286487 }, { "acc": 0.73702812, "epoch": 1.0865057805205753, "grad_norm": 6.5625, "learning_rate": 4.698311643299717e-06, "loss": 1.05792255, "memory(GiB)": 142.32, "step": 97140, "train_speed(iter/s)": 0.286509 }, { "acc": 0.74150696, "epoch": 1.0867294794665339, "grad_norm": 6.8125, "learning_rate": 4.69646561276131e-06, "loss": 1.02344732, "memory(GiB)": 142.32, "step": 97160, "train_speed(iter/s)": 0.286531 }, { "acc": 0.72845178, "epoch": 1.0869531784124924, "grad_norm": 6.5625, "learning_rate": 4.694619623750746e-06, "loss": 1.09133902, "memory(GiB)": 142.32, "step": 97180, "train_speed(iter/s)": 0.286551 }, { "acc": 0.72571936, "epoch": 1.087176877358451, "grad_norm": 5.5625, "learning_rate": 4.692773676520582e-06, "loss": 1.09040766, "memory(GiB)": 142.32, "step": 97200, "train_speed(iter/s)": 0.286571 }, { "acc": 0.73305159, "epoch": 1.0874005763044094, "grad_norm": 6.25, "learning_rate": 4.69092777132337e-06, "loss": 1.06313362, "memory(GiB)": 142.32, "step": 97220, "train_speed(iter/s)": 0.286589 }, { "acc": 0.7392663, "epoch": 1.087624275250368, "grad_norm": 5.875, "learning_rate": 4.689081908411658e-06, "loss": 1.02626972, "memory(GiB)": 142.32, "step": 97240, "train_speed(iter/s)": 0.286608 }, { "acc": 0.7300292, "epoch": 1.0878479741963265, "grad_norm": 6.25, "learning_rate": 4.687236088037983e-06, "loss": 1.08530273, "memory(GiB)": 142.32, "step": 97260, "train_speed(iter/s)": 0.286626 }, { "acc": 0.75258102, "epoch": 1.088071673142285, "grad_norm": 6.75, "learning_rate": 4.685390310454884e-06, "loss": 0.98924685, "memory(GiB)": 142.32, "step": 97280, "train_speed(iter/s)": 0.286646 }, { "acc": 0.74116211, "epoch": 1.0882953720882436, "grad_norm": 5.09375, "learning_rate": 4.683544575914886e-06, "loss": 1.04175835, "memory(GiB)": 142.32, "step": 97300, "train_speed(iter/s)": 0.286666 }, { "acc": 0.74544415, "epoch": 1.088519071034202, "grad_norm": 5.53125, "learning_rate": 4.681698884670512e-06, "loss": 1.00857239, "memory(GiB)": 142.32, "step": 97320, "train_speed(iter/s)": 0.286687 }, { "acc": 0.73240576, "epoch": 1.0887427699801606, "grad_norm": 6.65625, "learning_rate": 4.679853236974281e-06, "loss": 1.05771837, "memory(GiB)": 142.32, "step": 97340, "train_speed(iter/s)": 0.286707 }, { "acc": 0.75137477, "epoch": 1.0889664689261191, "grad_norm": 5.875, "learning_rate": 4.678007633078703e-06, "loss": 0.98444576, "memory(GiB)": 142.32, "step": 97360, "train_speed(iter/s)": 0.286726 }, { "acc": 0.73577299, "epoch": 1.0891901678720777, "grad_norm": 6.53125, "learning_rate": 4.676162073236285e-06, "loss": 1.05068521, "memory(GiB)": 142.32, "step": 97380, "train_speed(iter/s)": 0.286747 }, { "acc": 0.73369455, "epoch": 1.0894138668180362, "grad_norm": 6.8125, "learning_rate": 4.674316557699522e-06, "loss": 1.06429882, "memory(GiB)": 142.32, "step": 97400, "train_speed(iter/s)": 0.286766 }, { "acc": 0.7340641, "epoch": 1.0896375657639947, "grad_norm": 7.4375, "learning_rate": 4.67247108672091e-06, "loss": 1.0682869, "memory(GiB)": 142.32, "step": 97420, "train_speed(iter/s)": 0.286784 }, { "acc": 0.7358984, "epoch": 1.0898612647099533, "grad_norm": 7.84375, "learning_rate": 4.670625660552934e-06, "loss": 1.02382927, "memory(GiB)": 142.32, "step": 97440, "train_speed(iter/s)": 0.286803 }, { "acc": 0.73613038, "epoch": 1.0900849636559118, "grad_norm": 5.84375, "learning_rate": 4.668780279448076e-06, "loss": 1.04729424, "memory(GiB)": 142.32, "step": 97460, "train_speed(iter/s)": 0.286824 }, { "acc": 0.73451185, "epoch": 1.0903086626018703, "grad_norm": 4.78125, "learning_rate": 4.666934943658811e-06, "loss": 1.06147766, "memory(GiB)": 142.32, "step": 97480, "train_speed(iter/s)": 0.286843 }, { "acc": 0.7427526, "epoch": 1.0905323615478288, "grad_norm": 6.40625, "learning_rate": 4.665089653437604e-06, "loss": 1.00920277, "memory(GiB)": 142.32, "step": 97500, "train_speed(iter/s)": 0.286864 }, { "acc": 0.75473714, "epoch": 1.0907560604937874, "grad_norm": 4.84375, "learning_rate": 4.6632444090369215e-06, "loss": 0.98309393, "memory(GiB)": 142.32, "step": 97520, "train_speed(iter/s)": 0.286882 }, { "acc": 0.74151192, "epoch": 1.090979759439746, "grad_norm": 5.9375, "learning_rate": 4.661399210709215e-06, "loss": 1.01848917, "memory(GiB)": 142.32, "step": 97540, "train_speed(iter/s)": 0.286902 }, { "acc": 0.7417326, "epoch": 1.0912034583857044, "grad_norm": 6.28125, "learning_rate": 4.659554058706937e-06, "loss": 1.03433733, "memory(GiB)": 142.32, "step": 97560, "train_speed(iter/s)": 0.286925 }, { "acc": 0.73238077, "epoch": 1.091427157331663, "grad_norm": 5.09375, "learning_rate": 4.657708953282532e-06, "loss": 1.06867447, "memory(GiB)": 142.32, "step": 97580, "train_speed(iter/s)": 0.286946 }, { "acc": 0.74586039, "epoch": 1.0916508562776215, "grad_norm": 6.84375, "learning_rate": 4.655863894688433e-06, "loss": 1.02042542, "memory(GiB)": 142.32, "step": 97600, "train_speed(iter/s)": 0.286966 }, { "acc": 0.7397604, "epoch": 1.09187455522358, "grad_norm": 4.875, "learning_rate": 4.654018883177071e-06, "loss": 1.03688812, "memory(GiB)": 142.32, "step": 97620, "train_speed(iter/s)": 0.286987 }, { "acc": 0.73203182, "epoch": 1.0920982541695385, "grad_norm": 5.15625, "learning_rate": 4.6521739190008725e-06, "loss": 1.07512131, "memory(GiB)": 142.32, "step": 97640, "train_speed(iter/s)": 0.287006 }, { "acc": 0.73217344, "epoch": 1.092321953115497, "grad_norm": 6.59375, "learning_rate": 4.650329002412253e-06, "loss": 1.05449734, "memory(GiB)": 142.32, "step": 97660, "train_speed(iter/s)": 0.287025 }, { "acc": 0.733951, "epoch": 1.0925456520614556, "grad_norm": 6.46875, "learning_rate": 4.6484841336636245e-06, "loss": 1.06962118, "memory(GiB)": 142.32, "step": 97680, "train_speed(iter/s)": 0.287045 }, { "acc": 0.72898264, "epoch": 1.0927693510074141, "grad_norm": 6.09375, "learning_rate": 4.64663931300739e-06, "loss": 1.09442768, "memory(GiB)": 142.32, "step": 97700, "train_speed(iter/s)": 0.287064 }, { "acc": 0.74469457, "epoch": 1.0929930499533727, "grad_norm": 6.5, "learning_rate": 4.644794540695949e-06, "loss": 1.00440092, "memory(GiB)": 142.32, "step": 97720, "train_speed(iter/s)": 0.287085 }, { "acc": 0.73994322, "epoch": 1.0932167488993312, "grad_norm": 5.53125, "learning_rate": 4.64294981698169e-06, "loss": 1.03811264, "memory(GiB)": 142.32, "step": 97740, "train_speed(iter/s)": 0.287105 }, { "acc": 0.73665004, "epoch": 1.0934404478452897, "grad_norm": 6.4375, "learning_rate": 4.641105142117e-06, "loss": 1.02938032, "memory(GiB)": 142.32, "step": 97760, "train_speed(iter/s)": 0.287126 }, { "acc": 0.7406281, "epoch": 1.0936641467912482, "grad_norm": 6.9375, "learning_rate": 4.639260516354259e-06, "loss": 1.02064285, "memory(GiB)": 142.32, "step": 97780, "train_speed(iter/s)": 0.287146 }, { "acc": 0.72709236, "epoch": 1.0938878457372068, "grad_norm": 6.59375, "learning_rate": 4.637415939945833e-06, "loss": 1.09332561, "memory(GiB)": 142.32, "step": 97800, "train_speed(iter/s)": 0.287166 }, { "acc": 0.7412354, "epoch": 1.0941115446831653, "grad_norm": 7.03125, "learning_rate": 4.63557141314409e-06, "loss": 1.03191471, "memory(GiB)": 142.32, "step": 97820, "train_speed(iter/s)": 0.287186 }, { "acc": 0.7341784, "epoch": 1.0943352436291238, "grad_norm": 5.53125, "learning_rate": 4.633726936201385e-06, "loss": 1.05117359, "memory(GiB)": 142.32, "step": 97840, "train_speed(iter/s)": 0.287206 }, { "acc": 0.73870258, "epoch": 1.0945589425750826, "grad_norm": 6.375, "learning_rate": 4.631882509370072e-06, "loss": 1.04591961, "memory(GiB)": 142.32, "step": 97860, "train_speed(iter/s)": 0.287224 }, { "acc": 0.74277229, "epoch": 1.094782641521041, "grad_norm": 5.375, "learning_rate": 4.630038132902494e-06, "loss": 1.02133713, "memory(GiB)": 142.32, "step": 97880, "train_speed(iter/s)": 0.287245 }, { "acc": 0.74534316, "epoch": 1.0950063404669996, "grad_norm": 6.625, "learning_rate": 4.6281938070509855e-06, "loss": 1.00702705, "memory(GiB)": 142.32, "step": 97900, "train_speed(iter/s)": 0.287265 }, { "acc": 0.75318365, "epoch": 1.0952300394129582, "grad_norm": 5.78125, "learning_rate": 4.626349532067879e-06, "loss": 0.97144976, "memory(GiB)": 142.32, "step": 97920, "train_speed(iter/s)": 0.287284 }, { "acc": 0.7291832, "epoch": 1.0954537383589167, "grad_norm": 5.40625, "learning_rate": 4.6245053082054975e-06, "loss": 1.0842947, "memory(GiB)": 142.32, "step": 97940, "train_speed(iter/s)": 0.287304 }, { "acc": 0.73856936, "epoch": 1.0956774373048752, "grad_norm": 6.59375, "learning_rate": 4.622661135716157e-06, "loss": 1.03246679, "memory(GiB)": 142.32, "step": 97960, "train_speed(iter/s)": 0.287325 }, { "acc": 0.72870388, "epoch": 1.0959011362508337, "grad_norm": 6.09375, "learning_rate": 4.620817014852167e-06, "loss": 1.06723557, "memory(GiB)": 142.32, "step": 97980, "train_speed(iter/s)": 0.287346 }, { "acc": 0.73167477, "epoch": 1.0961248351967923, "grad_norm": 6.1875, "learning_rate": 4.618972945865828e-06, "loss": 1.07375412, "memory(GiB)": 142.32, "step": 98000, "train_speed(iter/s)": 0.287369 }, { "epoch": 1.0961248351967923, "eval_acc": 0.6959605137736826, "eval_loss": 1.0732790231704712, "eval_runtime": 2338.4189, "eval_samples_per_second": 32.194, "eval_steps_per_second": 16.097, "step": 98000 }, { "acc": 0.73828545, "epoch": 1.0963485341427508, "grad_norm": 5.09375, "learning_rate": 4.617128929009436e-06, "loss": 1.03024778, "memory(GiB)": 142.32, "step": 98020, "train_speed(iter/s)": 0.285389 }, { "acc": 0.72745471, "epoch": 1.0965722330887093, "grad_norm": 6.1875, "learning_rate": 4.61528496453528e-06, "loss": 1.08625736, "memory(GiB)": 142.32, "step": 98040, "train_speed(iter/s)": 0.28541 }, { "acc": 0.74246159, "epoch": 1.0967959320346679, "grad_norm": 6.75, "learning_rate": 4.613441052695639e-06, "loss": 1.01722488, "memory(GiB)": 142.32, "step": 98060, "train_speed(iter/s)": 0.285431 }, { "acc": 0.73664293, "epoch": 1.0970196309806264, "grad_norm": 5.40625, "learning_rate": 4.611597193742789e-06, "loss": 1.04499531, "memory(GiB)": 142.32, "step": 98080, "train_speed(iter/s)": 0.285449 }, { "acc": 0.72793832, "epoch": 1.097243329926585, "grad_norm": 6.09375, "learning_rate": 4.609753387928993e-06, "loss": 1.08297148, "memory(GiB)": 142.32, "step": 98100, "train_speed(iter/s)": 0.285469 }, { "acc": 0.74559536, "epoch": 1.0974670288725434, "grad_norm": 6.96875, "learning_rate": 4.60790963550651e-06, "loss": 1.00716743, "memory(GiB)": 142.32, "step": 98120, "train_speed(iter/s)": 0.285488 }, { "acc": 0.73138895, "epoch": 1.097690727818502, "grad_norm": 5.9375, "learning_rate": 4.606065936727595e-06, "loss": 1.08230534, "memory(GiB)": 142.32, "step": 98140, "train_speed(iter/s)": 0.285508 }, { "acc": 0.74389343, "epoch": 1.0979144267644605, "grad_norm": 5.71875, "learning_rate": 4.60422229184449e-06, "loss": 1.00967178, "memory(GiB)": 142.32, "step": 98160, "train_speed(iter/s)": 0.285529 }, { "acc": 0.74040461, "epoch": 1.098138125710419, "grad_norm": 7.78125, "learning_rate": 4.602378701109433e-06, "loss": 1.02727432, "memory(GiB)": 142.32, "step": 98180, "train_speed(iter/s)": 0.285548 }, { "acc": 0.73772402, "epoch": 1.0983618246563776, "grad_norm": 5.28125, "learning_rate": 4.600535164774653e-06, "loss": 1.03246784, "memory(GiB)": 142.32, "step": 98200, "train_speed(iter/s)": 0.285567 }, { "acc": 0.73617001, "epoch": 1.098585523602336, "grad_norm": 5.6875, "learning_rate": 4.598691683092371e-06, "loss": 1.05814695, "memory(GiB)": 142.32, "step": 98220, "train_speed(iter/s)": 0.285587 }, { "acc": 0.73697624, "epoch": 1.0988092225482946, "grad_norm": 5.875, "learning_rate": 4.596848256314805e-06, "loss": 1.03632803, "memory(GiB)": 142.32, "step": 98240, "train_speed(iter/s)": 0.285606 }, { "acc": 0.73774652, "epoch": 1.0990329214942531, "grad_norm": 5.4375, "learning_rate": 4.595004884694158e-06, "loss": 1.04104948, "memory(GiB)": 142.32, "step": 98260, "train_speed(iter/s)": 0.285628 }, { "acc": 0.73536663, "epoch": 1.0992566204402117, "grad_norm": 5.65625, "learning_rate": 4.5931615684826324e-06, "loss": 1.06016045, "memory(GiB)": 142.32, "step": 98280, "train_speed(iter/s)": 0.285648 }, { "acc": 0.74187756, "epoch": 1.0994803193861702, "grad_norm": 5.96875, "learning_rate": 4.591318307932418e-06, "loss": 1.02491493, "memory(GiB)": 142.32, "step": 98300, "train_speed(iter/s)": 0.285668 }, { "acc": 0.72831583, "epoch": 1.0997040183321287, "grad_norm": 6.6875, "learning_rate": 4.5894751032957024e-06, "loss": 1.08798075, "memory(GiB)": 142.32, "step": 98320, "train_speed(iter/s)": 0.285686 }, { "acc": 0.73540173, "epoch": 1.0999277172780872, "grad_norm": 5.28125, "learning_rate": 4.587631954824659e-06, "loss": 1.04941006, "memory(GiB)": 142.32, "step": 98340, "train_speed(iter/s)": 0.285704 }, { "acc": 0.74042912, "epoch": 1.1001514162240458, "grad_norm": 5.59375, "learning_rate": 4.585788862771458e-06, "loss": 1.01703625, "memory(GiB)": 142.32, "step": 98360, "train_speed(iter/s)": 0.285724 }, { "acc": 0.7187232, "epoch": 1.1003751151700043, "grad_norm": 6.125, "learning_rate": 4.583945827388261e-06, "loss": 1.13603916, "memory(GiB)": 142.32, "step": 98380, "train_speed(iter/s)": 0.285744 }, { "acc": 0.73340268, "epoch": 1.1005988141159628, "grad_norm": 7.4375, "learning_rate": 4.582102848927222e-06, "loss": 1.06910553, "memory(GiB)": 142.32, "step": 98400, "train_speed(iter/s)": 0.28576 }, { "acc": 0.72534428, "epoch": 1.1008225130619214, "grad_norm": 6.625, "learning_rate": 4.580259927640488e-06, "loss": 1.11745186, "memory(GiB)": 142.32, "step": 98420, "train_speed(iter/s)": 0.285782 }, { "acc": 0.72963438, "epoch": 1.1010462120078799, "grad_norm": 6.125, "learning_rate": 4.578417063780193e-06, "loss": 1.07721281, "memory(GiB)": 142.32, "step": 98440, "train_speed(iter/s)": 0.285802 }, { "acc": 0.73878303, "epoch": 1.1012699109538384, "grad_norm": 5.3125, "learning_rate": 4.576574257598471e-06, "loss": 1.03489246, "memory(GiB)": 142.32, "step": 98460, "train_speed(iter/s)": 0.285823 }, { "acc": 0.72992859, "epoch": 1.101493609899797, "grad_norm": 6.4375, "learning_rate": 4.574731509347441e-06, "loss": 1.07944651, "memory(GiB)": 142.32, "step": 98480, "train_speed(iter/s)": 0.285839 }, { "acc": 0.7309227, "epoch": 1.1017173088457555, "grad_norm": 5.28125, "learning_rate": 4.57288881927922e-06, "loss": 1.06824684, "memory(GiB)": 142.32, "step": 98500, "train_speed(iter/s)": 0.285859 }, { "acc": 0.7309515, "epoch": 1.101941007791714, "grad_norm": 7.03125, "learning_rate": 4.571046187645914e-06, "loss": 1.0540123, "memory(GiB)": 142.32, "step": 98520, "train_speed(iter/s)": 0.285879 }, { "acc": 0.74002767, "epoch": 1.1021647067376725, "grad_norm": 6.5, "learning_rate": 4.56920361469962e-06, "loss": 1.03536472, "memory(GiB)": 142.32, "step": 98540, "train_speed(iter/s)": 0.285899 }, { "acc": 0.73772125, "epoch": 1.102388405683631, "grad_norm": 5.96875, "learning_rate": 4.567361100692429e-06, "loss": 1.04344635, "memory(GiB)": 142.32, "step": 98560, "train_speed(iter/s)": 0.28592 }, { "acc": 0.73278069, "epoch": 1.1026121046295896, "grad_norm": 7.90625, "learning_rate": 4.565518645876424e-06, "loss": 1.06123514, "memory(GiB)": 142.32, "step": 98580, "train_speed(iter/s)": 0.28594 }, { "acc": 0.73906989, "epoch": 1.1028358035755481, "grad_norm": 6.625, "learning_rate": 4.563676250503677e-06, "loss": 1.02483711, "memory(GiB)": 142.32, "step": 98600, "train_speed(iter/s)": 0.28596 }, { "acc": 0.73577542, "epoch": 1.1030595025215066, "grad_norm": 6.4375, "learning_rate": 4.561833914826256e-06, "loss": 1.06077785, "memory(GiB)": 142.32, "step": 98620, "train_speed(iter/s)": 0.285979 }, { "acc": 0.74161539, "epoch": 1.1032832014674652, "grad_norm": 6.625, "learning_rate": 4.55999163909622e-06, "loss": 1.03011761, "memory(GiB)": 142.32, "step": 98640, "train_speed(iter/s)": 0.285999 }, { "acc": 0.72625728, "epoch": 1.1035069004134237, "grad_norm": 6.40625, "learning_rate": 4.5581494235656146e-06, "loss": 1.09136038, "memory(GiB)": 142.32, "step": 98660, "train_speed(iter/s)": 0.286019 }, { "acc": 0.73063459, "epoch": 1.1037305993593822, "grad_norm": 5.96875, "learning_rate": 4.556307268486484e-06, "loss": 1.08079376, "memory(GiB)": 142.32, "step": 98680, "train_speed(iter/s)": 0.286037 }, { "acc": 0.74250922, "epoch": 1.1039542983053408, "grad_norm": 6.9375, "learning_rate": 4.554465174110862e-06, "loss": 1.03942738, "memory(GiB)": 142.32, "step": 98700, "train_speed(iter/s)": 0.286057 }, { "acc": 0.74380589, "epoch": 1.1041779972512993, "grad_norm": 7.0625, "learning_rate": 4.5526231406907705e-06, "loss": 1.00855627, "memory(GiB)": 142.32, "step": 98720, "train_speed(iter/s)": 0.286078 }, { "acc": 0.73658195, "epoch": 1.1044016961972578, "grad_norm": 5.90625, "learning_rate": 4.55078116847823e-06, "loss": 1.04463425, "memory(GiB)": 142.32, "step": 98740, "train_speed(iter/s)": 0.2861 }, { "acc": 0.74531231, "epoch": 1.1046253951432163, "grad_norm": 5.1875, "learning_rate": 4.548939257725245e-06, "loss": 1.00717354, "memory(GiB)": 142.32, "step": 98760, "train_speed(iter/s)": 0.28612 }, { "acc": 0.73743505, "epoch": 1.1048490940891749, "grad_norm": 6.6875, "learning_rate": 4.547097408683817e-06, "loss": 1.04144135, "memory(GiB)": 142.32, "step": 98780, "train_speed(iter/s)": 0.28614 }, { "acc": 0.72471437, "epoch": 1.1050727930351334, "grad_norm": 6.5625, "learning_rate": 4.545255621605937e-06, "loss": 1.11249218, "memory(GiB)": 142.32, "step": 98800, "train_speed(iter/s)": 0.286157 }, { "acc": 0.75080786, "epoch": 1.105296491981092, "grad_norm": 7.125, "learning_rate": 4.543413896743587e-06, "loss": 0.99058619, "memory(GiB)": 142.32, "step": 98820, "train_speed(iter/s)": 0.286177 }, { "acc": 0.74826679, "epoch": 1.1055201909270505, "grad_norm": 6.90625, "learning_rate": 4.541572234348744e-06, "loss": 0.98768816, "memory(GiB)": 142.32, "step": 98840, "train_speed(iter/s)": 0.286199 }, { "acc": 0.73368649, "epoch": 1.105743889873009, "grad_norm": 7.375, "learning_rate": 4.539730634673371e-06, "loss": 1.06021204, "memory(GiB)": 142.32, "step": 98860, "train_speed(iter/s)": 0.28622 }, { "acc": 0.73385801, "epoch": 1.1059675888189675, "grad_norm": 5.59375, "learning_rate": 4.537889097969425e-06, "loss": 1.04783211, "memory(GiB)": 142.32, "step": 98880, "train_speed(iter/s)": 0.28624 }, { "acc": 0.74461937, "epoch": 1.106191287764926, "grad_norm": 6.125, "learning_rate": 4.536047624488856e-06, "loss": 1.01747704, "memory(GiB)": 142.32, "step": 98900, "train_speed(iter/s)": 0.286261 }, { "acc": 0.73863373, "epoch": 1.1064149867108846, "grad_norm": 5.8125, "learning_rate": 4.534206214483604e-06, "loss": 1.05351505, "memory(GiB)": 142.32, "step": 98920, "train_speed(iter/s)": 0.286281 }, { "acc": 0.73525562, "epoch": 1.106638685656843, "grad_norm": 6.09375, "learning_rate": 4.5323648682055995e-06, "loss": 1.05867653, "memory(GiB)": 142.32, "step": 98940, "train_speed(iter/s)": 0.286298 }, { "acc": 0.73440552, "epoch": 1.1068623846028016, "grad_norm": 5.78125, "learning_rate": 4.530523585906764e-06, "loss": 1.05034752, "memory(GiB)": 142.32, "step": 98960, "train_speed(iter/s)": 0.286317 }, { "acc": 0.7369792, "epoch": 1.1070860835487601, "grad_norm": 6.15625, "learning_rate": 4.528682367839013e-06, "loss": 1.04293308, "memory(GiB)": 142.32, "step": 98980, "train_speed(iter/s)": 0.286334 }, { "acc": 0.74139795, "epoch": 1.1073097824947187, "grad_norm": 6.3125, "learning_rate": 4.526841214254251e-06, "loss": 1.02357674, "memory(GiB)": 142.32, "step": 99000, "train_speed(iter/s)": 0.286351 }, { "acc": 0.73624907, "epoch": 1.1075334814406772, "grad_norm": 6.625, "learning_rate": 4.525000125404373e-06, "loss": 1.05067301, "memory(GiB)": 142.32, "step": 99020, "train_speed(iter/s)": 0.286369 }, { "acc": 0.7390883, "epoch": 1.1077571803866357, "grad_norm": 6.15625, "learning_rate": 4.523159101541268e-06, "loss": 1.03370342, "memory(GiB)": 142.32, "step": 99040, "train_speed(iter/s)": 0.286387 }, { "acc": 0.73849125, "epoch": 1.1079808793325943, "grad_norm": 5.875, "learning_rate": 4.521318142916813e-06, "loss": 1.029883, "memory(GiB)": 142.32, "step": 99060, "train_speed(iter/s)": 0.286408 }, { "acc": 0.73061762, "epoch": 1.1082045782785528, "grad_norm": 4.71875, "learning_rate": 4.519477249782878e-06, "loss": 1.07413063, "memory(GiB)": 142.32, "step": 99080, "train_speed(iter/s)": 0.286426 }, { "acc": 0.73059368, "epoch": 1.1084282772245113, "grad_norm": 5.25, "learning_rate": 4.517636422391324e-06, "loss": 1.07824459, "memory(GiB)": 142.32, "step": 99100, "train_speed(iter/s)": 0.286442 }, { "acc": 0.73267946, "epoch": 1.1086519761704698, "grad_norm": 7.75, "learning_rate": 4.515795660994002e-06, "loss": 1.06925211, "memory(GiB)": 142.32, "step": 99120, "train_speed(iter/s)": 0.286462 }, { "acc": 0.74329309, "epoch": 1.1088756751164284, "grad_norm": 5.75, "learning_rate": 4.513954965842755e-06, "loss": 0.99951229, "memory(GiB)": 142.32, "step": 99140, "train_speed(iter/s)": 0.286481 }, { "acc": 0.73119283, "epoch": 1.109099374062387, "grad_norm": 6.375, "learning_rate": 4.5121143371894146e-06, "loss": 1.07511101, "memory(GiB)": 142.32, "step": 99160, "train_speed(iter/s)": 0.2865 }, { "acc": 0.7431077, "epoch": 1.1093230730083454, "grad_norm": 6.125, "learning_rate": 4.510273775285807e-06, "loss": 1.0105505, "memory(GiB)": 142.32, "step": 99180, "train_speed(iter/s)": 0.286519 }, { "acc": 0.72530546, "epoch": 1.109546771954304, "grad_norm": 7.03125, "learning_rate": 4.508433280383746e-06, "loss": 1.09427137, "memory(GiB)": 142.32, "step": 99200, "train_speed(iter/s)": 0.28654 }, { "acc": 0.73069181, "epoch": 1.1097704709002625, "grad_norm": 8.6875, "learning_rate": 4.506592852735039e-06, "loss": 1.0782383, "memory(GiB)": 142.32, "step": 99220, "train_speed(iter/s)": 0.28656 }, { "acc": 0.74567661, "epoch": 1.109994169846221, "grad_norm": 5.1875, "learning_rate": 4.504752492591483e-06, "loss": 1.01988859, "memory(GiB)": 142.32, "step": 99240, "train_speed(iter/s)": 0.286582 }, { "acc": 0.72665062, "epoch": 1.1102178687921795, "grad_norm": 5.40625, "learning_rate": 4.502912200204863e-06, "loss": 1.0864254, "memory(GiB)": 142.32, "step": 99260, "train_speed(iter/s)": 0.286604 }, { "acc": 0.73897285, "epoch": 1.110441567738138, "grad_norm": 6.5625, "learning_rate": 4.50107197582696e-06, "loss": 1.03764925, "memory(GiB)": 142.32, "step": 99280, "train_speed(iter/s)": 0.286625 }, { "acc": 0.73151922, "epoch": 1.1106652666840966, "grad_norm": 5.90625, "learning_rate": 4.499231819709542e-06, "loss": 1.06060467, "memory(GiB)": 142.32, "step": 99300, "train_speed(iter/s)": 0.286645 }, { "acc": 0.73186293, "epoch": 1.1108889656300551, "grad_norm": 6.0625, "learning_rate": 4.4973917321043684e-06, "loss": 1.07474241, "memory(GiB)": 142.32, "step": 99320, "train_speed(iter/s)": 0.286666 }, { "acc": 0.74191046, "epoch": 1.1111126645760137, "grad_norm": 5.21875, "learning_rate": 4.49555171326319e-06, "loss": 1.0343195, "memory(GiB)": 142.32, "step": 99340, "train_speed(iter/s)": 0.286684 }, { "acc": 0.74374537, "epoch": 1.1113363635219722, "grad_norm": 6.1875, "learning_rate": 4.493711763437748e-06, "loss": 1.00204163, "memory(GiB)": 142.32, "step": 99360, "train_speed(iter/s)": 0.286703 }, { "acc": 0.72921753, "epoch": 1.1115600624679307, "grad_norm": 4.875, "learning_rate": 4.491871882879772e-06, "loss": 1.08480225, "memory(GiB)": 142.32, "step": 99380, "train_speed(iter/s)": 0.286723 }, { "acc": 0.73947906, "epoch": 1.1117837614138892, "grad_norm": 5.8125, "learning_rate": 4.490032071840985e-06, "loss": 1.04545488, "memory(GiB)": 142.32, "step": 99400, "train_speed(iter/s)": 0.286743 }, { "acc": 0.74151993, "epoch": 1.1120074603598478, "grad_norm": 6.34375, "learning_rate": 4.488192330573104e-06, "loss": 1.02152023, "memory(GiB)": 142.32, "step": 99420, "train_speed(iter/s)": 0.286763 }, { "acc": 0.74748154, "epoch": 1.1122311593058063, "grad_norm": 6.5625, "learning_rate": 4.486352659327823e-06, "loss": 0.99979973, "memory(GiB)": 142.32, "step": 99440, "train_speed(iter/s)": 0.286782 }, { "acc": 0.73483448, "epoch": 1.1124548582517648, "grad_norm": 6.21875, "learning_rate": 4.484513058356841e-06, "loss": 1.06251087, "memory(GiB)": 142.32, "step": 99460, "train_speed(iter/s)": 0.286799 }, { "acc": 0.72680254, "epoch": 1.1126785571977234, "grad_norm": 6.40625, "learning_rate": 4.4826735279118425e-06, "loss": 1.10899134, "memory(GiB)": 142.32, "step": 99480, "train_speed(iter/s)": 0.28682 }, { "acc": 0.73779635, "epoch": 1.1129022561436819, "grad_norm": 5.25, "learning_rate": 4.480834068244498e-06, "loss": 1.04494667, "memory(GiB)": 142.32, "step": 99500, "train_speed(iter/s)": 0.286837 }, { "acc": 0.74824457, "epoch": 1.1131259550896404, "grad_norm": 6.96875, "learning_rate": 4.478994679606473e-06, "loss": 1.01127033, "memory(GiB)": 142.32, "step": 99520, "train_speed(iter/s)": 0.286858 }, { "acc": 0.72830567, "epoch": 1.113349654035599, "grad_norm": 5.90625, "learning_rate": 4.477155362249422e-06, "loss": 1.08491983, "memory(GiB)": 142.32, "step": 99540, "train_speed(iter/s)": 0.286879 }, { "acc": 0.74737921, "epoch": 1.1135733529815575, "grad_norm": 6.75, "learning_rate": 4.475316116424992e-06, "loss": 0.99911442, "memory(GiB)": 142.32, "step": 99560, "train_speed(iter/s)": 0.2869 }, { "acc": 0.72601562, "epoch": 1.113797051927516, "grad_norm": 6.0625, "learning_rate": 4.473476942384817e-06, "loss": 1.1075264, "memory(GiB)": 142.32, "step": 99580, "train_speed(iter/s)": 0.286921 }, { "acc": 0.73792944, "epoch": 1.1140207508734745, "grad_norm": 6.15625, "learning_rate": 4.471637840380522e-06, "loss": 1.04996767, "memory(GiB)": 142.32, "step": 99600, "train_speed(iter/s)": 0.28694 }, { "acc": 0.7431973, "epoch": 1.114244449819433, "grad_norm": 5.21875, "learning_rate": 4.469798810663722e-06, "loss": 1.00055065, "memory(GiB)": 142.32, "step": 99620, "train_speed(iter/s)": 0.28696 }, { "acc": 0.74340153, "epoch": 1.1144681487653916, "grad_norm": 4.8125, "learning_rate": 4.467959853486023e-06, "loss": 1.01911039, "memory(GiB)": 142.32, "step": 99640, "train_speed(iter/s)": 0.28698 }, { "acc": 0.72816172, "epoch": 1.11469184771135, "grad_norm": 5.96875, "learning_rate": 4.4661209690990195e-06, "loss": 1.09810104, "memory(GiB)": 142.32, "step": 99660, "train_speed(iter/s)": 0.287002 }, { "acc": 0.72666836, "epoch": 1.1149155466573086, "grad_norm": 5.375, "learning_rate": 4.464282157754301e-06, "loss": 1.10737877, "memory(GiB)": 142.32, "step": 99680, "train_speed(iter/s)": 0.287023 }, { "acc": 0.74000144, "epoch": 1.1151392456032672, "grad_norm": 6.28125, "learning_rate": 4.462443419703439e-06, "loss": 1.03784332, "memory(GiB)": 142.32, "step": 99700, "train_speed(iter/s)": 0.287044 }, { "acc": 0.74049196, "epoch": 1.1153629445492257, "grad_norm": 5.84375, "learning_rate": 4.460604755198e-06, "loss": 1.02042294, "memory(GiB)": 142.32, "step": 99720, "train_speed(iter/s)": 0.287063 }, { "acc": 0.74525928, "epoch": 1.1155866434951842, "grad_norm": 6.28125, "learning_rate": 4.458766164489541e-06, "loss": 1.01486721, "memory(GiB)": 142.32, "step": 99740, "train_speed(iter/s)": 0.287083 }, { "acc": 0.74056458, "epoch": 1.1158103424411427, "grad_norm": 4.84375, "learning_rate": 4.456927647829607e-06, "loss": 1.03578215, "memory(GiB)": 142.32, "step": 99760, "train_speed(iter/s)": 0.287103 }, { "acc": 0.72888117, "epoch": 1.1160340413871013, "grad_norm": 6.65625, "learning_rate": 4.455089205469733e-06, "loss": 1.07509899, "memory(GiB)": 142.32, "step": 99780, "train_speed(iter/s)": 0.287125 }, { "acc": 0.73412085, "epoch": 1.1162577403330598, "grad_norm": 5.34375, "learning_rate": 4.4532508376614434e-06, "loss": 1.03473148, "memory(GiB)": 142.32, "step": 99800, "train_speed(iter/s)": 0.287145 }, { "acc": 0.73481121, "epoch": 1.1164814392790183, "grad_norm": 6.0, "learning_rate": 4.451412544656255e-06, "loss": 1.05871372, "memory(GiB)": 142.32, "step": 99820, "train_speed(iter/s)": 0.287165 }, { "acc": 0.73139763, "epoch": 1.1167051382249769, "grad_norm": 7.1875, "learning_rate": 4.449574326705671e-06, "loss": 1.07995358, "memory(GiB)": 142.32, "step": 99840, "train_speed(iter/s)": 0.287184 }, { "acc": 0.73864861, "epoch": 1.1169288371709354, "grad_norm": 4.6875, "learning_rate": 4.447736184061186e-06, "loss": 1.03147869, "memory(GiB)": 142.32, "step": 99860, "train_speed(iter/s)": 0.287203 }, { "acc": 0.75291281, "epoch": 1.117152536116894, "grad_norm": 6.25, "learning_rate": 4.4458981169742865e-06, "loss": 0.98348026, "memory(GiB)": 142.32, "step": 99880, "train_speed(iter/s)": 0.287219 }, { "acc": 0.74584699, "epoch": 1.1173762350628524, "grad_norm": 7.75, "learning_rate": 4.444060125696444e-06, "loss": 0.9895359, "memory(GiB)": 142.32, "step": 99900, "train_speed(iter/s)": 0.28724 }, { "acc": 0.72859297, "epoch": 1.117599934008811, "grad_norm": 7.0, "learning_rate": 4.442222210479121e-06, "loss": 1.09733849, "memory(GiB)": 142.32, "step": 99920, "train_speed(iter/s)": 0.287259 }, { "acc": 0.73602452, "epoch": 1.1178236329547695, "grad_norm": 5.21875, "learning_rate": 4.4403843715737725e-06, "loss": 1.03891945, "memory(GiB)": 142.32, "step": 99940, "train_speed(iter/s)": 0.287278 }, { "acc": 0.73377271, "epoch": 1.118047331900728, "grad_norm": 5.71875, "learning_rate": 4.438546609231841e-06, "loss": 1.06771355, "memory(GiB)": 142.32, "step": 99960, "train_speed(iter/s)": 0.287298 }, { "acc": 0.73473802, "epoch": 1.1182710308466866, "grad_norm": 5.96875, "learning_rate": 4.43670892370476e-06, "loss": 1.06549664, "memory(GiB)": 142.32, "step": 99980, "train_speed(iter/s)": 0.287319 }, { "acc": 0.73239827, "epoch": 1.118494729792645, "grad_norm": 4.3125, "learning_rate": 4.434871315243948e-06, "loss": 1.05878859, "memory(GiB)": 142.32, "step": 100000, "train_speed(iter/s)": 0.287339 }, { "epoch": 1.118494729792645, "eval_acc": 0.6960527971119259, "eval_loss": 1.073075771331787, "eval_runtime": 2347.0168, "eval_samples_per_second": 32.076, "eval_steps_per_second": 16.038, "step": 100000 }, { "acc": 0.74312305, "epoch": 1.1187184287386036, "grad_norm": 7.40625, "learning_rate": 4.433033784100817e-06, "loss": 1.0155117, "memory(GiB)": 142.32, "step": 100020, "train_speed(iter/s)": 0.285394 }, { "acc": 0.72500238, "epoch": 1.1189421276845621, "grad_norm": 5.15625, "learning_rate": 4.431196330526769e-06, "loss": 1.11273298, "memory(GiB)": 142.32, "step": 100040, "train_speed(iter/s)": 0.285413 }, { "acc": 0.74051332, "epoch": 1.1191658266305207, "grad_norm": 6.09375, "learning_rate": 4.429358954773192e-06, "loss": 1.02420092, "memory(GiB)": 142.32, "step": 100060, "train_speed(iter/s)": 0.285432 }, { "acc": 0.72822785, "epoch": 1.1193895255764792, "grad_norm": 6.375, "learning_rate": 4.427521657091469e-06, "loss": 1.088342, "memory(GiB)": 142.32, "step": 100080, "train_speed(iter/s)": 0.285455 }, { "acc": 0.74226112, "epoch": 1.1196132245224377, "grad_norm": 5.875, "learning_rate": 4.425684437732964e-06, "loss": 1.01656952, "memory(GiB)": 142.32, "step": 100100, "train_speed(iter/s)": 0.285475 }, { "acc": 0.73791347, "epoch": 1.1198369234683963, "grad_norm": 6.1875, "learning_rate": 4.423847296949036e-06, "loss": 1.04714451, "memory(GiB)": 142.32, "step": 100120, "train_speed(iter/s)": 0.285496 }, { "acc": 0.75252786, "epoch": 1.1200606224143548, "grad_norm": 5.3125, "learning_rate": 4.422010234991034e-06, "loss": 0.97056904, "memory(GiB)": 142.32, "step": 100140, "train_speed(iter/s)": 0.285518 }, { "acc": 0.73937149, "epoch": 1.1202843213603133, "grad_norm": 8.6875, "learning_rate": 4.4201732521102934e-06, "loss": 1.03574667, "memory(GiB)": 142.32, "step": 100160, "train_speed(iter/s)": 0.285537 }, { "acc": 0.72089691, "epoch": 1.1205080203062718, "grad_norm": 5.15625, "learning_rate": 4.4183363485581395e-06, "loss": 1.12152138, "memory(GiB)": 142.32, "step": 100180, "train_speed(iter/s)": 0.28556 }, { "acc": 0.72844877, "epoch": 1.1207317192522304, "grad_norm": 6.0625, "learning_rate": 4.416499524585887e-06, "loss": 1.07536888, "memory(GiB)": 142.32, "step": 100200, "train_speed(iter/s)": 0.285581 }, { "acc": 0.73340139, "epoch": 1.120955418198189, "grad_norm": 6.21875, "learning_rate": 4.414662780444839e-06, "loss": 1.05261097, "memory(GiB)": 142.32, "step": 100220, "train_speed(iter/s)": 0.285603 }, { "acc": 0.743609, "epoch": 1.1211791171441474, "grad_norm": 4.78125, "learning_rate": 4.412826116386289e-06, "loss": 1.02994385, "memory(GiB)": 142.32, "step": 100240, "train_speed(iter/s)": 0.285623 }, { "acc": 0.72341328, "epoch": 1.121402816090106, "grad_norm": 6.0625, "learning_rate": 4.4109895326615195e-06, "loss": 1.11420755, "memory(GiB)": 142.32, "step": 100260, "train_speed(iter/s)": 0.285644 }, { "acc": 0.74002562, "epoch": 1.1216265150360645, "grad_norm": 6.4375, "learning_rate": 4.409153029521802e-06, "loss": 1.02137547, "memory(GiB)": 142.32, "step": 100280, "train_speed(iter/s)": 0.285664 }, { "acc": 0.7367753, "epoch": 1.121850213982023, "grad_norm": 6.3125, "learning_rate": 4.407316607218394e-06, "loss": 1.04257278, "memory(GiB)": 142.32, "step": 100300, "train_speed(iter/s)": 0.285684 }, { "acc": 0.74092607, "epoch": 1.1220739129279815, "grad_norm": 4.84375, "learning_rate": 4.405480266002545e-06, "loss": 1.03002758, "memory(GiB)": 142.32, "step": 100320, "train_speed(iter/s)": 0.285703 }, { "acc": 0.73579144, "epoch": 1.12229761187394, "grad_norm": 7.28125, "learning_rate": 4.403644006125494e-06, "loss": 1.04688759, "memory(GiB)": 142.32, "step": 100340, "train_speed(iter/s)": 0.285723 }, { "acc": 0.73343372, "epoch": 1.1225213108198986, "grad_norm": 6.09375, "learning_rate": 4.401807827838466e-06, "loss": 1.05706511, "memory(GiB)": 142.32, "step": 100360, "train_speed(iter/s)": 0.285743 }, { "acc": 0.73732443, "epoch": 1.1227450097658571, "grad_norm": 4.65625, "learning_rate": 4.399971731392679e-06, "loss": 1.05405674, "memory(GiB)": 142.32, "step": 100380, "train_speed(iter/s)": 0.285764 }, { "acc": 0.74914937, "epoch": 1.1229687087118156, "grad_norm": 6.84375, "learning_rate": 4.398135717039334e-06, "loss": 0.98792515, "memory(GiB)": 142.32, "step": 100400, "train_speed(iter/s)": 0.285783 }, { "acc": 0.73692188, "epoch": 1.1231924076577742, "grad_norm": 6.28125, "learning_rate": 4.396299785029626e-06, "loss": 1.0463726, "memory(GiB)": 142.32, "step": 100420, "train_speed(iter/s)": 0.285805 }, { "acc": 0.73430681, "epoch": 1.1234161066037327, "grad_norm": 6.6875, "learning_rate": 4.394463935614736e-06, "loss": 1.07678356, "memory(GiB)": 142.32, "step": 100440, "train_speed(iter/s)": 0.285822 }, { "acc": 0.7395875, "epoch": 1.1236398055496912, "grad_norm": 5.3125, "learning_rate": 4.392628169045835e-06, "loss": 1.02813358, "memory(GiB)": 142.32, "step": 100460, "train_speed(iter/s)": 0.285841 }, { "acc": 0.74758019, "epoch": 1.1238635044956498, "grad_norm": 6.03125, "learning_rate": 4.390792485574082e-06, "loss": 0.99063663, "memory(GiB)": 142.32, "step": 100480, "train_speed(iter/s)": 0.285862 }, { "acc": 0.73230019, "epoch": 1.1240872034416083, "grad_norm": 6.53125, "learning_rate": 4.388956885450624e-06, "loss": 1.06527195, "memory(GiB)": 142.32, "step": 100500, "train_speed(iter/s)": 0.285882 }, { "acc": 0.73416052, "epoch": 1.1243109023875668, "grad_norm": 6.3125, "learning_rate": 4.387121368926598e-06, "loss": 1.0501339, "memory(GiB)": 142.32, "step": 100520, "train_speed(iter/s)": 0.285901 }, { "acc": 0.74190331, "epoch": 1.1245346013335253, "grad_norm": 6.96875, "learning_rate": 4.385285936253129e-06, "loss": 1.03718185, "memory(GiB)": 142.32, "step": 100540, "train_speed(iter/s)": 0.285923 }, { "acc": 0.74487643, "epoch": 1.1247583002794839, "grad_norm": 5.3125, "learning_rate": 4.38345058768133e-06, "loss": 1.02374706, "memory(GiB)": 142.32, "step": 100560, "train_speed(iter/s)": 0.285941 }, { "acc": 0.73194032, "epoch": 1.1249819992254424, "grad_norm": 6.5625, "learning_rate": 4.381615323462304e-06, "loss": 1.0730423, "memory(GiB)": 142.32, "step": 100580, "train_speed(iter/s)": 0.28596 }, { "acc": 0.73110356, "epoch": 1.125205698171401, "grad_norm": 6.375, "learning_rate": 4.37978014384714e-06, "loss": 1.08620853, "memory(GiB)": 142.32, "step": 100600, "train_speed(iter/s)": 0.285977 }, { "acc": 0.73278294, "epoch": 1.1254293971173595, "grad_norm": 6.59375, "learning_rate": 4.3779450490869194e-06, "loss": 1.05332108, "memory(GiB)": 142.32, "step": 100620, "train_speed(iter/s)": 0.285996 }, { "acc": 0.73741064, "epoch": 1.125653096063318, "grad_norm": 7.375, "learning_rate": 4.376110039432704e-06, "loss": 1.04361992, "memory(GiB)": 142.32, "step": 100640, "train_speed(iter/s)": 0.286016 }, { "acc": 0.73143034, "epoch": 1.1258767950092765, "grad_norm": 5.46875, "learning_rate": 4.3742751151355535e-06, "loss": 1.0794405, "memory(GiB)": 142.32, "step": 100660, "train_speed(iter/s)": 0.286035 }, { "acc": 0.73948197, "epoch": 1.126100493955235, "grad_norm": 6.15625, "learning_rate": 4.3724402764465116e-06, "loss": 1.02608013, "memory(GiB)": 142.32, "step": 100680, "train_speed(iter/s)": 0.286055 }, { "acc": 0.73174291, "epoch": 1.1263241929011936, "grad_norm": 7.09375, "learning_rate": 4.370605523616609e-06, "loss": 1.0628149, "memory(GiB)": 142.32, "step": 100700, "train_speed(iter/s)": 0.286073 }, { "acc": 0.74164181, "epoch": 1.126547891847152, "grad_norm": 5.46875, "learning_rate": 4.368770856896868e-06, "loss": 1.01921387, "memory(GiB)": 142.32, "step": 100720, "train_speed(iter/s)": 0.286092 }, { "acc": 0.74071026, "epoch": 1.1267715907931106, "grad_norm": 6.0, "learning_rate": 4.366936276538295e-06, "loss": 1.01716213, "memory(GiB)": 142.32, "step": 100740, "train_speed(iter/s)": 0.286111 }, { "acc": 0.74735284, "epoch": 1.1269952897390692, "grad_norm": 6.625, "learning_rate": 4.3651017827918875e-06, "loss": 0.99498739, "memory(GiB)": 142.32, "step": 100760, "train_speed(iter/s)": 0.28613 }, { "acc": 0.74466062, "epoch": 1.1272189886850277, "grad_norm": 5.25, "learning_rate": 4.363267375908631e-06, "loss": 1.00582361, "memory(GiB)": 142.32, "step": 100780, "train_speed(iter/s)": 0.286149 }, { "acc": 0.73495626, "epoch": 1.1274426876309862, "grad_norm": 5.71875, "learning_rate": 4.3614330561394995e-06, "loss": 1.05035839, "memory(GiB)": 142.32, "step": 100800, "train_speed(iter/s)": 0.286168 }, { "acc": 0.72525635, "epoch": 1.1276663865769447, "grad_norm": 7.15625, "learning_rate": 4.3595988237354535e-06, "loss": 1.104177, "memory(GiB)": 142.32, "step": 100820, "train_speed(iter/s)": 0.286187 }, { "acc": 0.72487245, "epoch": 1.1278900855229033, "grad_norm": 7.0, "learning_rate": 4.357764678947441e-06, "loss": 1.10748959, "memory(GiB)": 142.32, "step": 100840, "train_speed(iter/s)": 0.286204 }, { "acc": 0.73910217, "epoch": 1.1281137844688618, "grad_norm": 7.4375, "learning_rate": 4.3559306220264e-06, "loss": 1.02221107, "memory(GiB)": 142.32, "step": 100860, "train_speed(iter/s)": 0.286225 }, { "acc": 0.74391794, "epoch": 1.1283374834148203, "grad_norm": 6.0625, "learning_rate": 4.354096653223255e-06, "loss": 1.03077965, "memory(GiB)": 142.32, "step": 100880, "train_speed(iter/s)": 0.286248 }, { "acc": 0.73465538, "epoch": 1.1285611823607788, "grad_norm": 5.9375, "learning_rate": 4.352262772788921e-06, "loss": 1.04778328, "memory(GiB)": 142.32, "step": 100900, "train_speed(iter/s)": 0.286269 }, { "acc": 0.73870935, "epoch": 1.1287848813067374, "grad_norm": 6.40625, "learning_rate": 4.350428980974299e-06, "loss": 1.05027008, "memory(GiB)": 142.32, "step": 100920, "train_speed(iter/s)": 0.286286 }, { "acc": 0.74493885, "epoch": 1.129008580252696, "grad_norm": 5.9375, "learning_rate": 4.348595278030276e-06, "loss": 1.01516972, "memory(GiB)": 142.32, "step": 100940, "train_speed(iter/s)": 0.286304 }, { "acc": 0.74994173, "epoch": 1.1292322791986544, "grad_norm": 5.6875, "learning_rate": 4.346761664207728e-06, "loss": 0.97588711, "memory(GiB)": 142.32, "step": 100960, "train_speed(iter/s)": 0.286324 }, { "acc": 0.73177724, "epoch": 1.129455978144613, "grad_norm": 6.28125, "learning_rate": 4.344928139757523e-06, "loss": 1.06090546, "memory(GiB)": 142.32, "step": 100980, "train_speed(iter/s)": 0.286344 }, { "acc": 0.74249926, "epoch": 1.1296796770905715, "grad_norm": 6.46875, "learning_rate": 4.343094704930512e-06, "loss": 1.01999111, "memory(GiB)": 142.32, "step": 101000, "train_speed(iter/s)": 0.286362 }, { "acc": 0.73859057, "epoch": 1.12990337603653, "grad_norm": 5.46875, "learning_rate": 4.341261359977534e-06, "loss": 1.04349556, "memory(GiB)": 142.32, "step": 101020, "train_speed(iter/s)": 0.28638 }, { "acc": 0.73843775, "epoch": 1.1301270749824885, "grad_norm": 5.9375, "learning_rate": 4.339428105149418e-06, "loss": 1.02415047, "memory(GiB)": 142.32, "step": 101040, "train_speed(iter/s)": 0.2864 }, { "acc": 0.73238401, "epoch": 1.130350773928447, "grad_norm": 5.5625, "learning_rate": 4.337594940696978e-06, "loss": 1.06011305, "memory(GiB)": 142.32, "step": 101060, "train_speed(iter/s)": 0.286415 }, { "acc": 0.73763671, "epoch": 1.1305744728744056, "grad_norm": 8.0625, "learning_rate": 4.335761866871018e-06, "loss": 1.04327297, "memory(GiB)": 142.32, "step": 101080, "train_speed(iter/s)": 0.286435 }, { "acc": 0.7406549, "epoch": 1.1307981718203641, "grad_norm": 6.03125, "learning_rate": 4.333928883922329e-06, "loss": 1.02863216, "memory(GiB)": 142.32, "step": 101100, "train_speed(iter/s)": 0.286455 }, { "acc": 0.73350739, "epoch": 1.1310218707663227, "grad_norm": 6.71875, "learning_rate": 4.332095992101691e-06, "loss": 1.05632677, "memory(GiB)": 142.32, "step": 101120, "train_speed(iter/s)": 0.286473 }, { "acc": 0.74588327, "epoch": 1.1312455697122812, "grad_norm": 6.78125, "learning_rate": 4.330263191659866e-06, "loss": 1.01816568, "memory(GiB)": 142.32, "step": 101140, "train_speed(iter/s)": 0.286491 }, { "acc": 0.72781239, "epoch": 1.1314692686582397, "grad_norm": 6.34375, "learning_rate": 4.328430482847609e-06, "loss": 1.08313017, "memory(GiB)": 142.32, "step": 101160, "train_speed(iter/s)": 0.28651 }, { "acc": 0.72316837, "epoch": 1.1316929676041982, "grad_norm": 5.09375, "learning_rate": 4.326597865915661e-06, "loss": 1.09457636, "memory(GiB)": 142.32, "step": 101180, "train_speed(iter/s)": 0.286526 }, { "acc": 0.75008736, "epoch": 1.1319166665501568, "grad_norm": 7.09375, "learning_rate": 4.32476534111475e-06, "loss": 1.00070496, "memory(GiB)": 142.32, "step": 101200, "train_speed(iter/s)": 0.286544 }, { "acc": 0.73451519, "epoch": 1.1321403654961153, "grad_norm": 5.625, "learning_rate": 4.322932908695593e-06, "loss": 1.0580842, "memory(GiB)": 142.32, "step": 101220, "train_speed(iter/s)": 0.286562 }, { "acc": 0.73476696, "epoch": 1.1323640644420738, "grad_norm": 6.3125, "learning_rate": 4.3211005689088904e-06, "loss": 1.07426014, "memory(GiB)": 142.32, "step": 101240, "train_speed(iter/s)": 0.28658 }, { "acc": 0.73507204, "epoch": 1.1325877633880324, "grad_norm": 6.46875, "learning_rate": 4.319268322005333e-06, "loss": 1.05809727, "memory(GiB)": 142.32, "step": 101260, "train_speed(iter/s)": 0.2866 }, { "acc": 0.74049687, "epoch": 1.1328114623339909, "grad_norm": 5.0, "learning_rate": 4.3174361682356e-06, "loss": 1.02438793, "memory(GiB)": 142.32, "step": 101280, "train_speed(iter/s)": 0.286617 }, { "acc": 0.74065156, "epoch": 1.1330351612799494, "grad_norm": 5.75, "learning_rate": 4.315604107850355e-06, "loss": 1.04017982, "memory(GiB)": 142.32, "step": 101300, "train_speed(iter/s)": 0.286634 }, { "acc": 0.72723522, "epoch": 1.133258860225908, "grad_norm": 7.125, "learning_rate": 4.313772141100251e-06, "loss": 1.10418644, "memory(GiB)": 142.32, "step": 101320, "train_speed(iter/s)": 0.286649 }, { "acc": 0.72887058, "epoch": 1.1334825591718665, "grad_norm": 5.4375, "learning_rate": 4.311940268235926e-06, "loss": 1.06538525, "memory(GiB)": 142.32, "step": 101340, "train_speed(iter/s)": 0.28667 }, { "acc": 0.7388011, "epoch": 1.133706258117825, "grad_norm": 5.46875, "learning_rate": 4.310108489508007e-06, "loss": 1.01708822, "memory(GiB)": 142.32, "step": 101360, "train_speed(iter/s)": 0.286688 }, { "acc": 0.74518108, "epoch": 1.1339299570637835, "grad_norm": 6.25, "learning_rate": 4.308276805167107e-06, "loss": 0.99718914, "memory(GiB)": 142.32, "step": 101380, "train_speed(iter/s)": 0.286705 }, { "acc": 0.75761957, "epoch": 1.134153656009742, "grad_norm": 7.25, "learning_rate": 4.306445215463827e-06, "loss": 0.94890022, "memory(GiB)": 142.32, "step": 101400, "train_speed(iter/s)": 0.286725 }, { "acc": 0.73829632, "epoch": 1.1343773549557006, "grad_norm": 7.25, "learning_rate": 4.304613720648756e-06, "loss": 1.04036999, "memory(GiB)": 142.32, "step": 101420, "train_speed(iter/s)": 0.286742 }, { "acc": 0.73078842, "epoch": 1.134601053901659, "grad_norm": 6.125, "learning_rate": 4.302782320972467e-06, "loss": 1.07857933, "memory(GiB)": 142.32, "step": 101440, "train_speed(iter/s)": 0.286759 }, { "acc": 0.73571959, "epoch": 1.1348247528476176, "grad_norm": 6.78125, "learning_rate": 4.300951016685521e-06, "loss": 1.03943577, "memory(GiB)": 142.32, "step": 101460, "train_speed(iter/s)": 0.286778 }, { "acc": 0.72381039, "epoch": 1.1350484517935762, "grad_norm": 4.625, "learning_rate": 4.299119808038468e-06, "loss": 1.10733166, "memory(GiB)": 142.32, "step": 101480, "train_speed(iter/s)": 0.286798 }, { "acc": 0.7268712, "epoch": 1.1352721507395347, "grad_norm": 4.21875, "learning_rate": 4.297288695281843e-06, "loss": 1.09420872, "memory(GiB)": 142.32, "step": 101500, "train_speed(iter/s)": 0.286814 }, { "acc": 0.74109941, "epoch": 1.1354958496854932, "grad_norm": 5.5625, "learning_rate": 4.295457678666169e-06, "loss": 1.02895584, "memory(GiB)": 142.32, "step": 101520, "train_speed(iter/s)": 0.286834 }, { "acc": 0.73671265, "epoch": 1.1357195486314517, "grad_norm": 6.4375, "learning_rate": 4.293626758441955e-06, "loss": 1.03623629, "memory(GiB)": 142.32, "step": 101540, "train_speed(iter/s)": 0.286853 }, { "acc": 0.73536353, "epoch": 1.1359432475774103, "grad_norm": 4.8125, "learning_rate": 4.291795934859697e-06, "loss": 1.03749142, "memory(GiB)": 142.32, "step": 101560, "train_speed(iter/s)": 0.286874 }, { "acc": 0.73244476, "epoch": 1.1361669465233688, "grad_norm": 6.5, "learning_rate": 4.289965208169877e-06, "loss": 1.07100887, "memory(GiB)": 142.32, "step": 101580, "train_speed(iter/s)": 0.286893 }, { "acc": 0.75258627, "epoch": 1.1363906454693273, "grad_norm": 7.875, "learning_rate": 4.288134578622965e-06, "loss": 0.96878023, "memory(GiB)": 142.32, "step": 101600, "train_speed(iter/s)": 0.286912 }, { "acc": 0.7317956, "epoch": 1.1366143444152859, "grad_norm": 6.3125, "learning_rate": 4.286304046469418e-06, "loss": 1.0684536, "memory(GiB)": 142.32, "step": 101620, "train_speed(iter/s)": 0.286932 }, { "acc": 0.73838992, "epoch": 1.1368380433612444, "grad_norm": 5.4375, "learning_rate": 4.284473611959679e-06, "loss": 1.04123764, "memory(GiB)": 142.32, "step": 101640, "train_speed(iter/s)": 0.286951 }, { "acc": 0.74206858, "epoch": 1.137061742307203, "grad_norm": 5.0625, "learning_rate": 4.2826432753441764e-06, "loss": 1.01513901, "memory(GiB)": 142.32, "step": 101660, "train_speed(iter/s)": 0.28697 }, { "acc": 0.74097219, "epoch": 1.1372854412531614, "grad_norm": 7.25, "learning_rate": 4.280813036873327e-06, "loss": 1.03417091, "memory(GiB)": 142.32, "step": 101680, "train_speed(iter/s)": 0.286987 }, { "acc": 0.7445775, "epoch": 1.13750914019912, "grad_norm": 6.90625, "learning_rate": 4.278982896797535e-06, "loss": 1.00959835, "memory(GiB)": 142.32, "step": 101700, "train_speed(iter/s)": 0.287008 }, { "acc": 0.72884436, "epoch": 1.1377328391450785, "grad_norm": 5.0625, "learning_rate": 4.277152855367186e-06, "loss": 1.1002346, "memory(GiB)": 142.32, "step": 101720, "train_speed(iter/s)": 0.287024 }, { "acc": 0.7392312, "epoch": 1.137956538091037, "grad_norm": 6.6875, "learning_rate": 4.275322912832661e-06, "loss": 1.03588886, "memory(GiB)": 142.32, "step": 101740, "train_speed(iter/s)": 0.287045 }, { "acc": 0.73844957, "epoch": 1.1381802370369956, "grad_norm": 6.46875, "learning_rate": 4.273493069444318e-06, "loss": 1.02298298, "memory(GiB)": 142.32, "step": 101760, "train_speed(iter/s)": 0.287061 }, { "acc": 0.73129125, "epoch": 1.138403935982954, "grad_norm": 5.25, "learning_rate": 4.271663325452508e-06, "loss": 1.07662382, "memory(GiB)": 142.32, "step": 101780, "train_speed(iter/s)": 0.287078 }, { "acc": 0.75025959, "epoch": 1.1386276349289126, "grad_norm": 5.03125, "learning_rate": 4.269833681107567e-06, "loss": 1.0048996, "memory(GiB)": 142.32, "step": 101800, "train_speed(iter/s)": 0.287098 }, { "acc": 0.73461485, "epoch": 1.1388513338748711, "grad_norm": 6.46875, "learning_rate": 4.268004136659813e-06, "loss": 1.06005478, "memory(GiB)": 142.32, "step": 101820, "train_speed(iter/s)": 0.287117 }, { "acc": 0.72920847, "epoch": 1.1390750328208297, "grad_norm": 6.5, "learning_rate": 4.2661746923595545e-06, "loss": 1.06998787, "memory(GiB)": 142.32, "step": 101840, "train_speed(iter/s)": 0.287134 }, { "acc": 0.73643188, "epoch": 1.1392987317667882, "grad_norm": 5.625, "learning_rate": 4.2643453484570875e-06, "loss": 1.05432034, "memory(GiB)": 142.32, "step": 101860, "train_speed(iter/s)": 0.287153 }, { "acc": 0.74171553, "epoch": 1.1395224307127467, "grad_norm": 5.15625, "learning_rate": 4.262516105202694e-06, "loss": 1.02926235, "memory(GiB)": 142.32, "step": 101880, "train_speed(iter/s)": 0.28717 }, { "acc": 0.74061756, "epoch": 1.1397461296587053, "grad_norm": 5.59375, "learning_rate": 4.260686962846636e-06, "loss": 1.02213497, "memory(GiB)": 142.32, "step": 101900, "train_speed(iter/s)": 0.287191 }, { "acc": 0.72667418, "epoch": 1.1399698286046638, "grad_norm": 5.5, "learning_rate": 4.258857921639169e-06, "loss": 1.09000196, "memory(GiB)": 142.32, "step": 101920, "train_speed(iter/s)": 0.287211 }, { "acc": 0.73098874, "epoch": 1.1401935275506223, "grad_norm": 6.46875, "learning_rate": 4.257028981830532e-06, "loss": 1.08238001, "memory(GiB)": 142.32, "step": 101940, "train_speed(iter/s)": 0.28723 }, { "acc": 0.74401164, "epoch": 1.1404172264965808, "grad_norm": 5.96875, "learning_rate": 4.25520014367095e-06, "loss": 1.00059147, "memory(GiB)": 142.32, "step": 101960, "train_speed(iter/s)": 0.287249 }, { "acc": 0.73577814, "epoch": 1.1406409254425394, "grad_norm": 6.03125, "learning_rate": 4.253371407410634e-06, "loss": 1.06510048, "memory(GiB)": 142.32, "step": 101980, "train_speed(iter/s)": 0.287265 }, { "acc": 0.75252771, "epoch": 1.140864624388498, "grad_norm": 6.875, "learning_rate": 4.251542773299781e-06, "loss": 0.97847204, "memory(GiB)": 142.32, "step": 102000, "train_speed(iter/s)": 0.287281 }, { "epoch": 1.140864624388498, "eval_acc": 0.6960786285591692, "eval_loss": 1.072737216949463, "eval_runtime": 2343.9345, "eval_samples_per_second": 32.118, "eval_steps_per_second": 16.059, "step": 102000 }, { "acc": 0.72782621, "epoch": 1.1410883233344564, "grad_norm": 6.5, "learning_rate": 4.249714241588575e-06, "loss": 1.08865414, "memory(GiB)": 142.32, "step": 102020, "train_speed(iter/s)": 0.285371 }, { "acc": 0.7411839, "epoch": 1.141312022280415, "grad_norm": 7.59375, "learning_rate": 4.247885812527184e-06, "loss": 1.01915112, "memory(GiB)": 142.32, "step": 102040, "train_speed(iter/s)": 0.28539 }, { "acc": 0.74284568, "epoch": 1.1415357212263735, "grad_norm": 5.875, "learning_rate": 4.246057486365764e-06, "loss": 1.01528769, "memory(GiB)": 142.32, "step": 102060, "train_speed(iter/s)": 0.285408 }, { "acc": 0.7306284, "epoch": 1.141759420172332, "grad_norm": 5.8125, "learning_rate": 4.244229263354458e-06, "loss": 1.05784378, "memory(GiB)": 142.32, "step": 102080, "train_speed(iter/s)": 0.285428 }, { "acc": 0.73741264, "epoch": 1.1419831191182905, "grad_norm": 5.4375, "learning_rate": 4.242401143743389e-06, "loss": 1.04329586, "memory(GiB)": 142.32, "step": 102100, "train_speed(iter/s)": 0.285448 }, { "acc": 0.74707546, "epoch": 1.142206818064249, "grad_norm": 6.0, "learning_rate": 4.240573127782673e-06, "loss": 0.98313999, "memory(GiB)": 142.32, "step": 102120, "train_speed(iter/s)": 0.285466 }, { "acc": 0.73709593, "epoch": 1.1424305170102076, "grad_norm": 5.71875, "learning_rate": 4.238745215722407e-06, "loss": 1.0612155, "memory(GiB)": 142.32, "step": 102140, "train_speed(iter/s)": 0.285484 }, { "acc": 0.72649865, "epoch": 1.1426542159561661, "grad_norm": 6.96875, "learning_rate": 4.2369174078126775e-06, "loss": 1.08988266, "memory(GiB)": 142.32, "step": 102160, "train_speed(iter/s)": 0.285504 }, { "acc": 0.73213172, "epoch": 1.1428779149021246, "grad_norm": 6.0625, "learning_rate": 4.235089704303554e-06, "loss": 1.07021313, "memory(GiB)": 142.32, "step": 102180, "train_speed(iter/s)": 0.285521 }, { "acc": 0.723487, "epoch": 1.1431016138480832, "grad_norm": 5.71875, "learning_rate": 4.23326210544509e-06, "loss": 1.105867, "memory(GiB)": 142.32, "step": 102200, "train_speed(iter/s)": 0.28554 }, { "acc": 0.72520247, "epoch": 1.1433253127940417, "grad_norm": 6.1875, "learning_rate": 4.23143461148733e-06, "loss": 1.10354004, "memory(GiB)": 142.32, "step": 102220, "train_speed(iter/s)": 0.28556 }, { "acc": 0.72955742, "epoch": 1.1435490117400002, "grad_norm": 7.1875, "learning_rate": 4.2296072226803016e-06, "loss": 1.09462814, "memory(GiB)": 142.32, "step": 102240, "train_speed(iter/s)": 0.28558 }, { "acc": 0.7565587, "epoch": 1.1437727106859588, "grad_norm": 4.9375, "learning_rate": 4.227779939274016e-06, "loss": 0.97056828, "memory(GiB)": 142.32, "step": 102260, "train_speed(iter/s)": 0.285601 }, { "acc": 0.73354139, "epoch": 1.1439964096319173, "grad_norm": 5.125, "learning_rate": 4.225952761518472e-06, "loss": 1.05526428, "memory(GiB)": 142.32, "step": 102280, "train_speed(iter/s)": 0.285617 }, { "acc": 0.74339743, "epoch": 1.1442201085778758, "grad_norm": 5.40625, "learning_rate": 4.224125689663655e-06, "loss": 1.00349388, "memory(GiB)": 142.32, "step": 102300, "train_speed(iter/s)": 0.285637 }, { "acc": 0.74009762, "epoch": 1.1444438075238343, "grad_norm": 6.75, "learning_rate": 4.2222987239595316e-06, "loss": 1.02581635, "memory(GiB)": 142.32, "step": 102320, "train_speed(iter/s)": 0.285656 }, { "acc": 0.73961039, "epoch": 1.1446675064697929, "grad_norm": 5.46875, "learning_rate": 4.220471864656059e-06, "loss": 1.03614025, "memory(GiB)": 142.32, "step": 102340, "train_speed(iter/s)": 0.285675 }, { "acc": 0.73622894, "epoch": 1.1448912054157514, "grad_norm": 6.1875, "learning_rate": 4.218645112003178e-06, "loss": 1.05288963, "memory(GiB)": 142.32, "step": 102360, "train_speed(iter/s)": 0.285693 }, { "acc": 0.75176682, "epoch": 1.14511490436171, "grad_norm": 6.25, "learning_rate": 4.216818466250815e-06, "loss": 0.98076572, "memory(GiB)": 142.32, "step": 102380, "train_speed(iter/s)": 0.285714 }, { "acc": 0.74064159, "epoch": 1.1453386033076685, "grad_norm": 6.84375, "learning_rate": 4.214991927648878e-06, "loss": 1.03036127, "memory(GiB)": 142.32, "step": 102400, "train_speed(iter/s)": 0.285733 }, { "acc": 0.74453745, "epoch": 1.145562302253627, "grad_norm": 5.71875, "learning_rate": 4.213165496447267e-06, "loss": 1.01659584, "memory(GiB)": 142.32, "step": 102420, "train_speed(iter/s)": 0.285749 }, { "acc": 0.7280138, "epoch": 1.1457860011995855, "grad_norm": 5.53125, "learning_rate": 4.211339172895861e-06, "loss": 1.07934971, "memory(GiB)": 142.32, "step": 102440, "train_speed(iter/s)": 0.285768 }, { "acc": 0.73506212, "epoch": 1.146009700145544, "grad_norm": 5.46875, "learning_rate": 4.2095129572445295e-06, "loss": 1.05965796, "memory(GiB)": 142.32, "step": 102460, "train_speed(iter/s)": 0.285789 }, { "acc": 0.71725073, "epoch": 1.1462333990915026, "grad_norm": 5.84375, "learning_rate": 4.207686849743125e-06, "loss": 1.12951136, "memory(GiB)": 142.32, "step": 102480, "train_speed(iter/s)": 0.285809 }, { "acc": 0.7349649, "epoch": 1.146457098037461, "grad_norm": 5.90625, "learning_rate": 4.205860850641484e-06, "loss": 1.04714155, "memory(GiB)": 142.32, "step": 102500, "train_speed(iter/s)": 0.285829 }, { "acc": 0.73296556, "epoch": 1.1466807969834196, "grad_norm": 6.625, "learning_rate": 4.204034960189428e-06, "loss": 1.05158978, "memory(GiB)": 142.32, "step": 102520, "train_speed(iter/s)": 0.285849 }, { "acc": 0.73275075, "epoch": 1.1469044959293782, "grad_norm": 5.34375, "learning_rate": 4.202209178636768e-06, "loss": 1.07107906, "memory(GiB)": 142.32, "step": 102540, "train_speed(iter/s)": 0.285865 }, { "acc": 0.73102055, "epoch": 1.1471281948753367, "grad_norm": 5.5625, "learning_rate": 4.200383506233295e-06, "loss": 1.08072424, "memory(GiB)": 142.32, "step": 102560, "train_speed(iter/s)": 0.285883 }, { "acc": 0.73879128, "epoch": 1.1473518938212952, "grad_norm": 5.625, "learning_rate": 4.198557943228787e-06, "loss": 1.05591278, "memory(GiB)": 142.32, "step": 102580, "train_speed(iter/s)": 0.2859 }, { "acc": 0.72686863, "epoch": 1.1475755927672537, "grad_norm": 5.03125, "learning_rate": 4.1967324898730085e-06, "loss": 1.09231071, "memory(GiB)": 142.32, "step": 102600, "train_speed(iter/s)": 0.285922 }, { "acc": 0.74101992, "epoch": 1.1477992917132123, "grad_norm": 6.71875, "learning_rate": 4.194907146415706e-06, "loss": 1.01597958, "memory(GiB)": 142.32, "step": 102620, "train_speed(iter/s)": 0.285943 }, { "acc": 0.74048343, "epoch": 1.1480229906591708, "grad_norm": 5.53125, "learning_rate": 4.193081913106613e-06, "loss": 1.03969584, "memory(GiB)": 142.32, "step": 102640, "train_speed(iter/s)": 0.28596 }, { "acc": 0.73567162, "epoch": 1.1482466896051293, "grad_norm": 6.21875, "learning_rate": 4.191256790195448e-06, "loss": 1.04251223, "memory(GiB)": 142.32, "step": 102660, "train_speed(iter/s)": 0.285974 }, { "acc": 0.72203674, "epoch": 1.1484703885510879, "grad_norm": 7.9375, "learning_rate": 4.189431777931915e-06, "loss": 1.11052589, "memory(GiB)": 142.32, "step": 102680, "train_speed(iter/s)": 0.285993 }, { "acc": 0.72906942, "epoch": 1.1486940874970464, "grad_norm": 6.46875, "learning_rate": 4.1876068765657e-06, "loss": 1.10098476, "memory(GiB)": 142.32, "step": 102700, "train_speed(iter/s)": 0.286012 }, { "acc": 0.7379806, "epoch": 1.148917786443005, "grad_norm": 6.5, "learning_rate": 4.185782086346475e-06, "loss": 1.03763323, "memory(GiB)": 142.32, "step": 102720, "train_speed(iter/s)": 0.286031 }, { "acc": 0.73281784, "epoch": 1.1491414853889634, "grad_norm": 5.25, "learning_rate": 4.183957407523899e-06, "loss": 1.06094389, "memory(GiB)": 142.32, "step": 102740, "train_speed(iter/s)": 0.286049 }, { "acc": 0.73205814, "epoch": 1.149365184334922, "grad_norm": 5.78125, "learning_rate": 4.182132840347613e-06, "loss": 1.06308937, "memory(GiB)": 142.32, "step": 102760, "train_speed(iter/s)": 0.286069 }, { "acc": 0.73223572, "epoch": 1.1495888832808805, "grad_norm": 6.65625, "learning_rate": 4.180308385067246e-06, "loss": 1.05274916, "memory(GiB)": 142.32, "step": 102780, "train_speed(iter/s)": 0.286089 }, { "acc": 0.7310029, "epoch": 1.149812582226839, "grad_norm": 6.15625, "learning_rate": 4.178484041932406e-06, "loss": 1.07569237, "memory(GiB)": 142.32, "step": 102800, "train_speed(iter/s)": 0.286108 }, { "acc": 0.73732166, "epoch": 1.1500362811727975, "grad_norm": 6.46875, "learning_rate": 4.1766598111926926e-06, "loss": 1.05360222, "memory(GiB)": 142.32, "step": 102820, "train_speed(iter/s)": 0.286128 }, { "acc": 0.738307, "epoch": 1.150259980118756, "grad_norm": 6.125, "learning_rate": 4.174835693097685e-06, "loss": 1.038554, "memory(GiB)": 142.32, "step": 102840, "train_speed(iter/s)": 0.286147 }, { "acc": 0.73959894, "epoch": 1.1504836790647146, "grad_norm": 5.34375, "learning_rate": 4.173011687896949e-06, "loss": 1.03319883, "memory(GiB)": 142.32, "step": 102860, "train_speed(iter/s)": 0.286167 }, { "acc": 0.72660875, "epoch": 1.1507073780106731, "grad_norm": 6.1875, "learning_rate": 4.171187795840035e-06, "loss": 1.10218563, "memory(GiB)": 142.32, "step": 102880, "train_speed(iter/s)": 0.286185 }, { "acc": 0.74304762, "epoch": 1.1509310769566317, "grad_norm": 6.84375, "learning_rate": 4.1693640171764756e-06, "loss": 1.02116394, "memory(GiB)": 142.32, "step": 102900, "train_speed(iter/s)": 0.286205 }, { "acc": 0.74141474, "epoch": 1.1511547759025902, "grad_norm": 6.1875, "learning_rate": 4.1675403521557916e-06, "loss": 1.00887585, "memory(GiB)": 142.32, "step": 102920, "train_speed(iter/s)": 0.286224 }, { "acc": 0.74433975, "epoch": 1.1513784748485487, "grad_norm": 5.53125, "learning_rate": 4.165716801027486e-06, "loss": 1.00774622, "memory(GiB)": 142.32, "step": 102940, "train_speed(iter/s)": 0.286243 }, { "acc": 0.7314405, "epoch": 1.1516021737945072, "grad_norm": 7.34375, "learning_rate": 4.1638933640410465e-06, "loss": 1.05952396, "memory(GiB)": 142.32, "step": 102960, "train_speed(iter/s)": 0.286264 }, { "acc": 0.7392314, "epoch": 1.1518258727404658, "grad_norm": 7.15625, "learning_rate": 4.162070041445948e-06, "loss": 1.03455238, "memory(GiB)": 142.32, "step": 102980, "train_speed(iter/s)": 0.286283 }, { "acc": 0.73592739, "epoch": 1.1520495716864243, "grad_norm": 6.8125, "learning_rate": 4.160246833491642e-06, "loss": 1.05561562, "memory(GiB)": 142.32, "step": 103000, "train_speed(iter/s)": 0.286301 }, { "acc": 0.74608874, "epoch": 1.1522732706323828, "grad_norm": 6.8125, "learning_rate": 4.158423740427574e-06, "loss": 1.00737324, "memory(GiB)": 142.32, "step": 103020, "train_speed(iter/s)": 0.286319 }, { "acc": 0.73844972, "epoch": 1.1524969695783414, "grad_norm": 6.3125, "learning_rate": 4.156600762503166e-06, "loss": 1.02824001, "memory(GiB)": 142.32, "step": 103040, "train_speed(iter/s)": 0.286338 }, { "acc": 0.74128551, "epoch": 1.1527206685242999, "grad_norm": 7.28125, "learning_rate": 4.1547778999678275e-06, "loss": 1.03572292, "memory(GiB)": 142.32, "step": 103060, "train_speed(iter/s)": 0.286356 }, { "acc": 0.72470779, "epoch": 1.1529443674702584, "grad_norm": 6.03125, "learning_rate": 4.152955153070954e-06, "loss": 1.1037487, "memory(GiB)": 142.32, "step": 103080, "train_speed(iter/s)": 0.286373 }, { "acc": 0.73327808, "epoch": 1.153168066416217, "grad_norm": 5.9375, "learning_rate": 4.151132522061923e-06, "loss": 1.0714633, "memory(GiB)": 142.32, "step": 103100, "train_speed(iter/s)": 0.286392 }, { "acc": 0.7246254, "epoch": 1.1533917653621755, "grad_norm": 6.34375, "learning_rate": 4.149310007190097e-06, "loss": 1.09443626, "memory(GiB)": 142.32, "step": 103120, "train_speed(iter/s)": 0.286413 }, { "acc": 0.74299941, "epoch": 1.153615464308134, "grad_norm": 4.96875, "learning_rate": 4.14748760870482e-06, "loss": 1.01533241, "memory(GiB)": 142.32, "step": 103140, "train_speed(iter/s)": 0.286432 }, { "acc": 0.73894281, "epoch": 1.1538391632540925, "grad_norm": 6.28125, "learning_rate": 4.145665326855423e-06, "loss": 1.04393005, "memory(GiB)": 142.32, "step": 103160, "train_speed(iter/s)": 0.286451 }, { "acc": 0.75127277, "epoch": 1.154062862200051, "grad_norm": 5.75, "learning_rate": 4.14384316189122e-06, "loss": 1.00169315, "memory(GiB)": 142.32, "step": 103180, "train_speed(iter/s)": 0.286467 }, { "acc": 0.73923106, "epoch": 1.1542865611460096, "grad_norm": 6.34375, "learning_rate": 4.142021114061511e-06, "loss": 1.04021072, "memory(GiB)": 142.32, "step": 103200, "train_speed(iter/s)": 0.286486 }, { "acc": 0.73345289, "epoch": 1.154510260091968, "grad_norm": 5.9375, "learning_rate": 4.140199183615578e-06, "loss": 1.06173325, "memory(GiB)": 142.32, "step": 103220, "train_speed(iter/s)": 0.286507 }, { "acc": 0.74608955, "epoch": 1.1547339590379266, "grad_norm": 6.40625, "learning_rate": 4.138377370802684e-06, "loss": 1.01374855, "memory(GiB)": 142.32, "step": 103240, "train_speed(iter/s)": 0.286527 }, { "acc": 0.74204817, "epoch": 1.1549576579838852, "grad_norm": 6.3125, "learning_rate": 4.136555675872082e-06, "loss": 1.02217274, "memory(GiB)": 142.32, "step": 103260, "train_speed(iter/s)": 0.286547 }, { "acc": 0.74308643, "epoch": 1.1551813569298437, "grad_norm": 7.15625, "learning_rate": 4.134734099073005e-06, "loss": 1.01354446, "memory(GiB)": 142.32, "step": 103280, "train_speed(iter/s)": 0.286567 }, { "acc": 0.73233371, "epoch": 1.1554050558758022, "grad_norm": 6.4375, "learning_rate": 4.132912640654671e-06, "loss": 1.06970844, "memory(GiB)": 142.32, "step": 103300, "train_speed(iter/s)": 0.286587 }, { "acc": 0.73493395, "epoch": 1.1556287548217608, "grad_norm": 5.8125, "learning_rate": 4.131091300866281e-06, "loss": 1.04965849, "memory(GiB)": 142.32, "step": 103320, "train_speed(iter/s)": 0.286606 }, { "acc": 0.73669844, "epoch": 1.1558524537677193, "grad_norm": 6.0, "learning_rate": 4.12927007995702e-06, "loss": 1.04901333, "memory(GiB)": 142.32, "step": 103340, "train_speed(iter/s)": 0.286624 }, { "acc": 0.73986678, "epoch": 1.156076152713678, "grad_norm": 6.65625, "learning_rate": 4.127448978176058e-06, "loss": 1.03934011, "memory(GiB)": 142.32, "step": 103360, "train_speed(iter/s)": 0.286645 }, { "acc": 0.73079596, "epoch": 1.1562998516596366, "grad_norm": 5.125, "learning_rate": 4.125627995772547e-06, "loss": 1.07587004, "memory(GiB)": 142.32, "step": 103380, "train_speed(iter/s)": 0.286663 }, { "acc": 0.74034562, "epoch": 1.156523550605595, "grad_norm": 5.875, "learning_rate": 4.123807132995625e-06, "loss": 1.02684288, "memory(GiB)": 142.32, "step": 103400, "train_speed(iter/s)": 0.286679 }, { "acc": 0.75487266, "epoch": 1.1567472495515536, "grad_norm": 5.4375, "learning_rate": 4.121986390094412e-06, "loss": 0.96927843, "memory(GiB)": 142.32, "step": 103420, "train_speed(iter/s)": 0.286697 }, { "acc": 0.733741, "epoch": 1.1569709484975121, "grad_norm": 7.0625, "learning_rate": 4.1201657673180075e-06, "loss": 1.06293507, "memory(GiB)": 142.32, "step": 103440, "train_speed(iter/s)": 0.286715 }, { "acc": 0.73354216, "epoch": 1.1571946474434707, "grad_norm": 5.09375, "learning_rate": 4.118345264915503e-06, "loss": 1.06788607, "memory(GiB)": 142.32, "step": 103460, "train_speed(iter/s)": 0.286733 }, { "acc": 0.72788272, "epoch": 1.1574183463894292, "grad_norm": 5.65625, "learning_rate": 4.1165248831359675e-06, "loss": 1.09807034, "memory(GiB)": 142.32, "step": 103480, "train_speed(iter/s)": 0.286752 }, { "acc": 0.73610363, "epoch": 1.1576420453353877, "grad_norm": 5.375, "learning_rate": 4.1147046222284564e-06, "loss": 1.05245523, "memory(GiB)": 142.32, "step": 103500, "train_speed(iter/s)": 0.286771 }, { "acc": 0.73363576, "epoch": 1.1578657442813463, "grad_norm": 5.78125, "learning_rate": 4.1128844824420075e-06, "loss": 1.07615366, "memory(GiB)": 142.32, "step": 103520, "train_speed(iter/s)": 0.28679 }, { "acc": 0.74867969, "epoch": 1.1580894432273048, "grad_norm": 5.5625, "learning_rate": 4.111064464025641e-06, "loss": 0.9895421, "memory(GiB)": 142.32, "step": 103540, "train_speed(iter/s)": 0.286809 }, { "acc": 0.74332438, "epoch": 1.1583131421732633, "grad_norm": 6.65625, "learning_rate": 4.10924456722836e-06, "loss": 1.0089859, "memory(GiB)": 142.32, "step": 103560, "train_speed(iter/s)": 0.286829 }, { "acc": 0.74924345, "epoch": 1.1585368411192218, "grad_norm": 8.125, "learning_rate": 4.107424792299155e-06, "loss": 1.00406265, "memory(GiB)": 142.32, "step": 103580, "train_speed(iter/s)": 0.286846 }, { "acc": 0.73088093, "epoch": 1.1587605400651804, "grad_norm": 6.75, "learning_rate": 4.105605139486997e-06, "loss": 1.07397022, "memory(GiB)": 142.32, "step": 103600, "train_speed(iter/s)": 0.286863 }, { "acc": 0.7202045, "epoch": 1.158984239011139, "grad_norm": 7.78125, "learning_rate": 4.10378560904084e-06, "loss": 1.13257627, "memory(GiB)": 142.32, "step": 103620, "train_speed(iter/s)": 0.286881 }, { "acc": 0.73170433, "epoch": 1.1592079379570974, "grad_norm": 5.40625, "learning_rate": 4.10196620120962e-06, "loss": 1.07431717, "memory(GiB)": 142.32, "step": 103640, "train_speed(iter/s)": 0.2869 }, { "acc": 0.74160347, "epoch": 1.159431636903056, "grad_norm": 7.75, "learning_rate": 4.10014691624226e-06, "loss": 1.01551914, "memory(GiB)": 142.32, "step": 103660, "train_speed(iter/s)": 0.286918 }, { "acc": 0.73986359, "epoch": 1.1596553358490145, "grad_norm": 7.40625, "learning_rate": 4.098327754387664e-06, "loss": 1.03068333, "memory(GiB)": 142.32, "step": 103680, "train_speed(iter/s)": 0.286936 }, { "acc": 0.73688955, "epoch": 1.159879034794973, "grad_norm": 6.34375, "learning_rate": 4.096508715894718e-06, "loss": 1.05806913, "memory(GiB)": 142.32, "step": 103700, "train_speed(iter/s)": 0.286954 }, { "acc": 0.73707399, "epoch": 1.1601027337409315, "grad_norm": 6.71875, "learning_rate": 4.094689801012296e-06, "loss": 1.05686741, "memory(GiB)": 142.32, "step": 103720, "train_speed(iter/s)": 0.286971 }, { "acc": 0.73679838, "epoch": 1.16032643268689, "grad_norm": 5.375, "learning_rate": 4.092871009989247e-06, "loss": 1.03909492, "memory(GiB)": 142.32, "step": 103740, "train_speed(iter/s)": 0.286989 }, { "acc": 0.74047089, "epoch": 1.1605501316328486, "grad_norm": 6.78125, "learning_rate": 4.09105234307441e-06, "loss": 1.03947992, "memory(GiB)": 142.32, "step": 103760, "train_speed(iter/s)": 0.287008 }, { "acc": 0.73331671, "epoch": 1.1607738305788071, "grad_norm": 5.53125, "learning_rate": 4.089233800516605e-06, "loss": 1.07210989, "memory(GiB)": 142.32, "step": 103780, "train_speed(iter/s)": 0.287026 }, { "acc": 0.73569269, "epoch": 1.1609975295247656, "grad_norm": 6.4375, "learning_rate": 4.087415382564633e-06, "loss": 1.04609079, "memory(GiB)": 142.32, "step": 103800, "train_speed(iter/s)": 0.287047 }, { "acc": 0.73653526, "epoch": 1.1612212284707242, "grad_norm": 5.53125, "learning_rate": 4.085597089467283e-06, "loss": 1.05085354, "memory(GiB)": 142.32, "step": 103820, "train_speed(iter/s)": 0.287068 }, { "acc": 0.73950491, "epoch": 1.1614449274166827, "grad_norm": 4.59375, "learning_rate": 4.0837789214733185e-06, "loss": 1.02066069, "memory(GiB)": 142.32, "step": 103840, "train_speed(iter/s)": 0.287089 }, { "acc": 0.74207397, "epoch": 1.1616686263626412, "grad_norm": 6.5, "learning_rate": 4.081960878831493e-06, "loss": 1.02159376, "memory(GiB)": 142.32, "step": 103860, "train_speed(iter/s)": 0.287108 }, { "acc": 0.72786589, "epoch": 1.1618923253085998, "grad_norm": 5.0625, "learning_rate": 4.080142961790542e-06, "loss": 1.10112724, "memory(GiB)": 142.32, "step": 103880, "train_speed(iter/s)": 0.287126 }, { "acc": 0.72904558, "epoch": 1.1621160242545583, "grad_norm": 6.4375, "learning_rate": 4.078325170599182e-06, "loss": 1.09244041, "memory(GiB)": 142.32, "step": 103900, "train_speed(iter/s)": 0.287144 }, { "acc": 0.74605646, "epoch": 1.1623397232005168, "grad_norm": 7.03125, "learning_rate": 4.076507505506112e-06, "loss": 1.01472845, "memory(GiB)": 142.32, "step": 103920, "train_speed(iter/s)": 0.287161 }, { "acc": 0.74576955, "epoch": 1.1625634221464753, "grad_norm": 5.6875, "learning_rate": 4.074689966760015e-06, "loss": 0.98936586, "memory(GiB)": 142.32, "step": 103940, "train_speed(iter/s)": 0.28718 }, { "acc": 0.73886085, "epoch": 1.1627871210924339, "grad_norm": 5.75, "learning_rate": 4.072872554609556e-06, "loss": 1.04575891, "memory(GiB)": 142.32, "step": 103960, "train_speed(iter/s)": 0.287198 }, { "acc": 0.73470964, "epoch": 1.1630108200383924, "grad_norm": 6.09375, "learning_rate": 4.071055269303384e-06, "loss": 1.07200203, "memory(GiB)": 142.32, "step": 103980, "train_speed(iter/s)": 0.287218 }, { "acc": 0.74289379, "epoch": 1.163234518984351, "grad_norm": 5.75, "learning_rate": 4.069238111090128e-06, "loss": 1.01649637, "memory(GiB)": 142.32, "step": 104000, "train_speed(iter/s)": 0.287236 }, { "epoch": 1.163234518984351, "eval_acc": 0.6961165869835535, "eval_loss": 1.0726956129074097, "eval_runtime": 2342.5295, "eval_samples_per_second": 32.137, "eval_steps_per_second": 16.069, "step": 104000 }, { "acc": 0.74010248, "epoch": 1.1634582179303095, "grad_norm": 7.09375, "learning_rate": 4.067421080218404e-06, "loss": 1.01398392, "memory(GiB)": 142.32, "step": 104020, "train_speed(iter/s)": 0.285369 }, { "acc": 0.72827253, "epoch": 1.163681916876268, "grad_norm": 6.21875, "learning_rate": 4.065604176936804e-06, "loss": 1.08688202, "memory(GiB)": 142.32, "step": 104040, "train_speed(iter/s)": 0.28539 }, { "acc": 0.73505116, "epoch": 1.1639056158222265, "grad_norm": 5.1875, "learning_rate": 4.063787401493908e-06, "loss": 1.05458212, "memory(GiB)": 142.32, "step": 104060, "train_speed(iter/s)": 0.285408 }, { "acc": 0.73598776, "epoch": 1.164129314768185, "grad_norm": 6.125, "learning_rate": 4.061970754138277e-06, "loss": 1.06076641, "memory(GiB)": 142.32, "step": 104080, "train_speed(iter/s)": 0.285426 }, { "acc": 0.74031906, "epoch": 1.1643530137141436, "grad_norm": 7.03125, "learning_rate": 4.060154235118454e-06, "loss": 1.04232197, "memory(GiB)": 142.32, "step": 104100, "train_speed(iter/s)": 0.285444 }, { "acc": 0.74830399, "epoch": 1.164576712660102, "grad_norm": 5.9375, "learning_rate": 4.058337844682967e-06, "loss": 0.99426813, "memory(GiB)": 142.32, "step": 104120, "train_speed(iter/s)": 0.285461 }, { "acc": 0.73889165, "epoch": 1.1648004116060606, "grad_norm": 5.59375, "learning_rate": 4.056521583080322e-06, "loss": 1.03988571, "memory(GiB)": 142.32, "step": 104140, "train_speed(iter/s)": 0.285479 }, { "acc": 0.73220658, "epoch": 1.1650241105520192, "grad_norm": 6.4375, "learning_rate": 4.054705450559009e-06, "loss": 1.07620926, "memory(GiB)": 142.32, "step": 104160, "train_speed(iter/s)": 0.285498 }, { "acc": 0.73478508, "epoch": 1.1652478094979777, "grad_norm": 6.8125, "learning_rate": 4.052889447367503e-06, "loss": 1.04626446, "memory(GiB)": 142.32, "step": 104180, "train_speed(iter/s)": 0.285517 }, { "acc": 0.73392191, "epoch": 1.1654715084439362, "grad_norm": 6.75, "learning_rate": 4.051073573754257e-06, "loss": 1.05696487, "memory(GiB)": 142.32, "step": 104200, "train_speed(iter/s)": 0.285537 }, { "acc": 0.74503441, "epoch": 1.1656952073898947, "grad_norm": 5.5625, "learning_rate": 4.049257829967709e-06, "loss": 1.02142353, "memory(GiB)": 142.32, "step": 104220, "train_speed(iter/s)": 0.285554 }, { "acc": 0.7382597, "epoch": 1.1659189063358533, "grad_norm": 5.8125, "learning_rate": 4.0474422162562785e-06, "loss": 1.03727722, "memory(GiB)": 142.32, "step": 104240, "train_speed(iter/s)": 0.285574 }, { "acc": 0.74357882, "epoch": 1.1661426052818118, "grad_norm": 6.15625, "learning_rate": 4.045626732868369e-06, "loss": 1.017313, "memory(GiB)": 142.32, "step": 104260, "train_speed(iter/s)": 0.285595 }, { "acc": 0.73046923, "epoch": 1.1663663042277703, "grad_norm": 6.65625, "learning_rate": 4.043811380052364e-06, "loss": 1.07305937, "memory(GiB)": 142.32, "step": 104280, "train_speed(iter/s)": 0.285615 }, { "acc": 0.73447733, "epoch": 1.1665900031737289, "grad_norm": 7.125, "learning_rate": 4.0419961580566295e-06, "loss": 1.05823298, "memory(GiB)": 142.32, "step": 104300, "train_speed(iter/s)": 0.285634 }, { "acc": 0.73859959, "epoch": 1.1668137021196874, "grad_norm": 7.125, "learning_rate": 4.040181067129512e-06, "loss": 1.04865494, "memory(GiB)": 142.32, "step": 104320, "train_speed(iter/s)": 0.285652 }, { "acc": 0.73221197, "epoch": 1.167037401065646, "grad_norm": 5.6875, "learning_rate": 4.038366107519344e-06, "loss": 1.0630743, "memory(GiB)": 142.32, "step": 104340, "train_speed(iter/s)": 0.285668 }, { "acc": 0.73134127, "epoch": 1.1672611000116044, "grad_norm": 6.3125, "learning_rate": 4.036551279474438e-06, "loss": 1.09436054, "memory(GiB)": 142.32, "step": 104360, "train_speed(iter/s)": 0.285688 }, { "acc": 0.73384428, "epoch": 1.167484798957563, "grad_norm": 6.40625, "learning_rate": 4.034736583243088e-06, "loss": 1.05230789, "memory(GiB)": 142.32, "step": 104380, "train_speed(iter/s)": 0.285705 }, { "acc": 0.72900076, "epoch": 1.1677084979035215, "grad_norm": 5.71875, "learning_rate": 4.032922019073569e-06, "loss": 1.09008808, "memory(GiB)": 142.32, "step": 104400, "train_speed(iter/s)": 0.285722 }, { "acc": 0.74227409, "epoch": 1.16793219684948, "grad_norm": 6.9375, "learning_rate": 4.031107587214142e-06, "loss": 1.00012722, "memory(GiB)": 142.32, "step": 104420, "train_speed(iter/s)": 0.285741 }, { "acc": 0.72715297, "epoch": 1.1681558957954385, "grad_norm": 6.78125, "learning_rate": 4.029293287913044e-06, "loss": 1.0887989, "memory(GiB)": 142.32, "step": 104440, "train_speed(iter/s)": 0.285761 }, { "acc": 0.73352098, "epoch": 1.168379594741397, "grad_norm": 6.53125, "learning_rate": 4.0274791214185e-06, "loss": 1.0648447, "memory(GiB)": 142.32, "step": 104460, "train_speed(iter/s)": 0.285779 }, { "acc": 0.7419096, "epoch": 1.1686032936873556, "grad_norm": 6.3125, "learning_rate": 4.025665087978713e-06, "loss": 1.01059332, "memory(GiB)": 142.32, "step": 104480, "train_speed(iter/s)": 0.285798 }, { "acc": 0.73485537, "epoch": 1.1688269926333141, "grad_norm": 5.90625, "learning_rate": 4.0238511878418675e-06, "loss": 1.06330776, "memory(GiB)": 142.32, "step": 104500, "train_speed(iter/s)": 0.285817 }, { "acc": 0.74037313, "epoch": 1.1690506915792727, "grad_norm": 5.625, "learning_rate": 4.0220374212561325e-06, "loss": 1.02801781, "memory(GiB)": 142.32, "step": 104520, "train_speed(iter/s)": 0.285837 }, { "acc": 0.75098543, "epoch": 1.1692743905252312, "grad_norm": 6.0, "learning_rate": 4.020223788469656e-06, "loss": 0.97766895, "memory(GiB)": 142.32, "step": 104540, "train_speed(iter/s)": 0.285854 }, { "acc": 0.72844172, "epoch": 1.1694980894711897, "grad_norm": 5.625, "learning_rate": 4.01841028973057e-06, "loss": 1.08952389, "memory(GiB)": 142.32, "step": 104560, "train_speed(iter/s)": 0.285872 }, { "acc": 0.74121866, "epoch": 1.1697217884171482, "grad_norm": 5.125, "learning_rate": 4.016596925286987e-06, "loss": 1.04315357, "memory(GiB)": 142.32, "step": 104580, "train_speed(iter/s)": 0.285887 }, { "acc": 0.73229704, "epoch": 1.1699454873631068, "grad_norm": 7.9375, "learning_rate": 4.014783695387e-06, "loss": 1.06460638, "memory(GiB)": 142.32, "step": 104600, "train_speed(iter/s)": 0.285906 }, { "acc": 0.74179578, "epoch": 1.1701691863090653, "grad_norm": 5.59375, "learning_rate": 4.012970600278685e-06, "loss": 1.03807602, "memory(GiB)": 142.32, "step": 104620, "train_speed(iter/s)": 0.285924 }, { "acc": 0.741292, "epoch": 1.1703928852550238, "grad_norm": 5.6875, "learning_rate": 4.0111576402101e-06, "loss": 1.01590652, "memory(GiB)": 142.32, "step": 104640, "train_speed(iter/s)": 0.285944 }, { "acc": 0.74632502, "epoch": 1.1706165842009824, "grad_norm": 6.21875, "learning_rate": 4.009344815429284e-06, "loss": 0.99622345, "memory(GiB)": 142.32, "step": 104660, "train_speed(iter/s)": 0.285959 }, { "acc": 0.75766411, "epoch": 1.1708402831469409, "grad_norm": 5.8125, "learning_rate": 4.0075321261842585e-06, "loss": 0.95128098, "memory(GiB)": 142.32, "step": 104680, "train_speed(iter/s)": 0.285979 }, { "acc": 0.74607081, "epoch": 1.1710639820928994, "grad_norm": 5.15625, "learning_rate": 4.005719572723021e-06, "loss": 0.99895954, "memory(GiB)": 142.32, "step": 104700, "train_speed(iter/s)": 0.285996 }, { "acc": 0.75076928, "epoch": 1.171287681038858, "grad_norm": 6.15625, "learning_rate": 4.0039071552935585e-06, "loss": 0.99003515, "memory(GiB)": 142.32, "step": 104720, "train_speed(iter/s)": 0.286017 }, { "acc": 0.73793869, "epoch": 1.1715113799848165, "grad_norm": 7.4375, "learning_rate": 4.002094874143835e-06, "loss": 1.03507652, "memory(GiB)": 142.32, "step": 104740, "train_speed(iter/s)": 0.286037 }, { "acc": 0.73968883, "epoch": 1.171735078930775, "grad_norm": 5.3125, "learning_rate": 4.000282729521795e-06, "loss": 1.04544611, "memory(GiB)": 142.32, "step": 104760, "train_speed(iter/s)": 0.286057 }, { "acc": 0.73534818, "epoch": 1.1719587778767335, "grad_norm": 7.21875, "learning_rate": 3.998470721675369e-06, "loss": 1.05088825, "memory(GiB)": 142.32, "step": 104780, "train_speed(iter/s)": 0.286074 }, { "acc": 0.73574066, "epoch": 1.172182476822692, "grad_norm": 5.40625, "learning_rate": 3.996658850852461e-06, "loss": 1.06002636, "memory(GiB)": 142.32, "step": 104800, "train_speed(iter/s)": 0.286093 }, { "acc": 0.73052406, "epoch": 1.1724061757686506, "grad_norm": 5.625, "learning_rate": 3.994847117300965e-06, "loss": 1.07183399, "memory(GiB)": 142.32, "step": 104820, "train_speed(iter/s)": 0.286111 }, { "acc": 0.73776097, "epoch": 1.172629874714609, "grad_norm": 6.71875, "learning_rate": 3.99303552126875e-06, "loss": 1.04042816, "memory(GiB)": 142.32, "step": 104840, "train_speed(iter/s)": 0.286129 }, { "acc": 0.72299895, "epoch": 1.1728535736605676, "grad_norm": 7.3125, "learning_rate": 3.991224063003667e-06, "loss": 1.10925884, "memory(GiB)": 142.32, "step": 104860, "train_speed(iter/s)": 0.286147 }, { "acc": 0.7391799, "epoch": 1.1730772726065262, "grad_norm": 5.875, "learning_rate": 3.989412742753554e-06, "loss": 1.03914127, "memory(GiB)": 142.32, "step": 104880, "train_speed(iter/s)": 0.286166 }, { "acc": 0.73127689, "epoch": 1.1733009715524847, "grad_norm": 4.6875, "learning_rate": 3.9876015607662195e-06, "loss": 1.07489014, "memory(GiB)": 142.32, "step": 104900, "train_speed(iter/s)": 0.286182 }, { "acc": 0.73061309, "epoch": 1.1735246704984432, "grad_norm": 5.375, "learning_rate": 3.985790517289464e-06, "loss": 1.08554163, "memory(GiB)": 142.32, "step": 104920, "train_speed(iter/s)": 0.286201 }, { "acc": 0.74074473, "epoch": 1.1737483694444018, "grad_norm": 6.875, "learning_rate": 3.983979612571061e-06, "loss": 1.02537842, "memory(GiB)": 142.32, "step": 104940, "train_speed(iter/s)": 0.286217 }, { "acc": 0.72946992, "epoch": 1.1739720683903603, "grad_norm": 7.5625, "learning_rate": 3.982168846858768e-06, "loss": 1.07669086, "memory(GiB)": 142.32, "step": 104960, "train_speed(iter/s)": 0.286235 }, { "acc": 0.73504753, "epoch": 1.1741957673363188, "grad_norm": 6.0625, "learning_rate": 3.980358220400328e-06, "loss": 1.03416348, "memory(GiB)": 142.32, "step": 104980, "train_speed(iter/s)": 0.286252 }, { "acc": 0.74521108, "epoch": 1.1744194662822773, "grad_norm": 5.59375, "learning_rate": 3.978547733443455e-06, "loss": 1.00527973, "memory(GiB)": 142.32, "step": 105000, "train_speed(iter/s)": 0.28627 }, { "acc": 0.74916649, "epoch": 1.1746431652282359, "grad_norm": 7.9375, "learning_rate": 3.976737386235852e-06, "loss": 1.00154896, "memory(GiB)": 142.32, "step": 105020, "train_speed(iter/s)": 0.286289 }, { "acc": 0.72722735, "epoch": 1.1748668641741944, "grad_norm": 6.65625, "learning_rate": 3.974927179025202e-06, "loss": 1.09482403, "memory(GiB)": 142.32, "step": 105040, "train_speed(iter/s)": 0.286308 }, { "acc": 0.72848825, "epoch": 1.175090563120153, "grad_norm": 4.65625, "learning_rate": 3.973117112059165e-06, "loss": 1.08767471, "memory(GiB)": 142.32, "step": 105060, "train_speed(iter/s)": 0.286327 }, { "acc": 0.71847935, "epoch": 1.1753142620661114, "grad_norm": 6.09375, "learning_rate": 3.971307185585385e-06, "loss": 1.13880043, "memory(GiB)": 142.32, "step": 105080, "train_speed(iter/s)": 0.286345 }, { "acc": 0.72480602, "epoch": 1.17553796101207, "grad_norm": 5.625, "learning_rate": 3.969497399851484e-06, "loss": 1.09229059, "memory(GiB)": 142.32, "step": 105100, "train_speed(iter/s)": 0.286364 }, { "acc": 0.73346086, "epoch": 1.1757616599580285, "grad_norm": 6.34375, "learning_rate": 3.967687755105068e-06, "loss": 1.07964115, "memory(GiB)": 142.32, "step": 105120, "train_speed(iter/s)": 0.286382 }, { "acc": 0.73612661, "epoch": 1.175985358903987, "grad_norm": 7.28125, "learning_rate": 3.965878251593723e-06, "loss": 1.06463032, "memory(GiB)": 142.32, "step": 105140, "train_speed(iter/s)": 0.286399 }, { "acc": 0.7414257, "epoch": 1.1762090578499456, "grad_norm": 7.25, "learning_rate": 3.964068889565014e-06, "loss": 1.01924057, "memory(GiB)": 142.32, "step": 105160, "train_speed(iter/s)": 0.286417 }, { "acc": 0.73330064, "epoch": 1.176432756795904, "grad_norm": 6.53125, "learning_rate": 3.9622596692664896e-06, "loss": 1.05775824, "memory(GiB)": 142.32, "step": 105180, "train_speed(iter/s)": 0.286434 }, { "acc": 0.75125475, "epoch": 1.1766564557418626, "grad_norm": 6.59375, "learning_rate": 3.9604505909456735e-06, "loss": 0.98805809, "memory(GiB)": 142.32, "step": 105200, "train_speed(iter/s)": 0.286453 }, { "acc": 0.7289732, "epoch": 1.1768801546878211, "grad_norm": 5.59375, "learning_rate": 3.958641654850075e-06, "loss": 1.07673721, "memory(GiB)": 142.32, "step": 105220, "train_speed(iter/s)": 0.286471 }, { "acc": 0.72446175, "epoch": 1.1771038536337797, "grad_norm": 6.625, "learning_rate": 3.956832861227182e-06, "loss": 1.1024003, "memory(GiB)": 142.32, "step": 105240, "train_speed(iter/s)": 0.286488 }, { "acc": 0.73281956, "epoch": 1.1773275525797382, "grad_norm": 6.8125, "learning_rate": 3.955024210324464e-06, "loss": 1.07387142, "memory(GiB)": 142.32, "step": 105260, "train_speed(iter/s)": 0.286506 }, { "acc": 0.73934135, "epoch": 1.1775512515256967, "grad_norm": 5.21875, "learning_rate": 3.953215702389372e-06, "loss": 1.02814064, "memory(GiB)": 142.32, "step": 105280, "train_speed(iter/s)": 0.286522 }, { "acc": 0.73623309, "epoch": 1.1777749504716553, "grad_norm": 7.21875, "learning_rate": 3.951407337669332e-06, "loss": 1.04076376, "memory(GiB)": 142.32, "step": 105300, "train_speed(iter/s)": 0.286539 }, { "acc": 0.73105421, "epoch": 1.1779986494176138, "grad_norm": 5.59375, "learning_rate": 3.949599116411757e-06, "loss": 1.07755709, "memory(GiB)": 142.32, "step": 105320, "train_speed(iter/s)": 0.286558 }, { "acc": 0.74270625, "epoch": 1.1782223483635723, "grad_norm": 7.46875, "learning_rate": 3.947791038864036e-06, "loss": 1.03401413, "memory(GiB)": 142.32, "step": 105340, "train_speed(iter/s)": 0.286576 }, { "acc": 0.74522023, "epoch": 1.1784460473095308, "grad_norm": 6.53125, "learning_rate": 3.9459831052735425e-06, "loss": 1.01277428, "memory(GiB)": 142.32, "step": 105360, "train_speed(iter/s)": 0.286595 }, { "acc": 0.73342762, "epoch": 1.1786697462554894, "grad_norm": 5.28125, "learning_rate": 3.944175315887624e-06, "loss": 1.07582779, "memory(GiB)": 142.32, "step": 105380, "train_speed(iter/s)": 0.286614 }, { "acc": 0.74309688, "epoch": 1.178893445201448, "grad_norm": 6.375, "learning_rate": 3.942367670953613e-06, "loss": 1.01557198, "memory(GiB)": 142.32, "step": 105400, "train_speed(iter/s)": 0.286632 }, { "acc": 0.73422565, "epoch": 1.1791171441474064, "grad_norm": 5.46875, "learning_rate": 3.940560170718822e-06, "loss": 1.06259031, "memory(GiB)": 142.32, "step": 105420, "train_speed(iter/s)": 0.28665 }, { "acc": 0.73038707, "epoch": 1.179340843093365, "grad_norm": 6.09375, "learning_rate": 3.938752815430543e-06, "loss": 1.07334766, "memory(GiB)": 142.32, "step": 105440, "train_speed(iter/s)": 0.28667 }, { "acc": 0.73622575, "epoch": 1.1795645420393235, "grad_norm": 6.03125, "learning_rate": 3.9369456053360464e-06, "loss": 1.04107895, "memory(GiB)": 142.32, "step": 105460, "train_speed(iter/s)": 0.286688 }, { "acc": 0.73311071, "epoch": 1.179788240985282, "grad_norm": 5.8125, "learning_rate": 3.935138540682587e-06, "loss": 1.07276001, "memory(GiB)": 142.32, "step": 105480, "train_speed(iter/s)": 0.286704 }, { "acc": 0.73573093, "epoch": 1.1800119399312405, "grad_norm": 6.4375, "learning_rate": 3.933331621717394e-06, "loss": 1.06252022, "memory(GiB)": 142.32, "step": 105500, "train_speed(iter/s)": 0.286721 }, { "acc": 0.74083791, "epoch": 1.180235638877199, "grad_norm": 5.8125, "learning_rate": 3.931524848687683e-06, "loss": 1.02150841, "memory(GiB)": 142.32, "step": 105520, "train_speed(iter/s)": 0.286741 }, { "acc": 0.73565578, "epoch": 1.1804593378231576, "grad_norm": 5.3125, "learning_rate": 3.9297182218406435e-06, "loss": 1.04590893, "memory(GiB)": 142.32, "step": 105540, "train_speed(iter/s)": 0.286759 }, { "acc": 0.74556618, "epoch": 1.1806830367691161, "grad_norm": 7.1875, "learning_rate": 3.927911741423449e-06, "loss": 1.01736269, "memory(GiB)": 142.32, "step": 105560, "train_speed(iter/s)": 0.286778 }, { "acc": 0.73119335, "epoch": 1.1809067357150747, "grad_norm": 5.875, "learning_rate": 3.9261054076832526e-06, "loss": 1.08213501, "memory(GiB)": 142.32, "step": 105580, "train_speed(iter/s)": 0.286796 }, { "acc": 0.72702694, "epoch": 1.1811304346610332, "grad_norm": 6.125, "learning_rate": 3.9242992208671855e-06, "loss": 1.09148102, "memory(GiB)": 142.32, "step": 105600, "train_speed(iter/s)": 0.286813 }, { "acc": 0.7341125, "epoch": 1.1813541336069917, "grad_norm": 5.0, "learning_rate": 3.922493181222361e-06, "loss": 1.05778522, "memory(GiB)": 142.32, "step": 105620, "train_speed(iter/s)": 0.286832 }, { "acc": 0.7283175, "epoch": 1.1815778325529502, "grad_norm": 4.90625, "learning_rate": 3.92068728899587e-06, "loss": 1.09890099, "memory(GiB)": 142.32, "step": 105640, "train_speed(iter/s)": 0.286852 }, { "acc": 0.7375823, "epoch": 1.1818015314989088, "grad_norm": 6.84375, "learning_rate": 3.918881544434785e-06, "loss": 1.03736801, "memory(GiB)": 142.32, "step": 105660, "train_speed(iter/s)": 0.28687 }, { "acc": 0.74382696, "epoch": 1.1820252304448673, "grad_norm": 5.375, "learning_rate": 3.917075947786156e-06, "loss": 1.00078049, "memory(GiB)": 142.32, "step": 105680, "train_speed(iter/s)": 0.286888 }, { "acc": 0.74083266, "epoch": 1.1822489293908258, "grad_norm": 5.5625, "learning_rate": 3.9152704992970174e-06, "loss": 1.03169117, "memory(GiB)": 142.32, "step": 105700, "train_speed(iter/s)": 0.286904 }, { "acc": 0.74334679, "epoch": 1.1824726283367843, "grad_norm": 7.59375, "learning_rate": 3.913465199214379e-06, "loss": 1.01714363, "memory(GiB)": 142.32, "step": 105720, "train_speed(iter/s)": 0.286923 }, { "acc": 0.74402289, "epoch": 1.1826963272827429, "grad_norm": 5.96875, "learning_rate": 3.9116600477852315e-06, "loss": 0.99340296, "memory(GiB)": 142.32, "step": 105740, "train_speed(iter/s)": 0.286941 }, { "acc": 0.72669716, "epoch": 1.1829200262287014, "grad_norm": 5.5, "learning_rate": 3.909855045256545e-06, "loss": 1.10102901, "memory(GiB)": 142.32, "step": 105760, "train_speed(iter/s)": 0.28696 }, { "acc": 0.74499931, "epoch": 1.18314372517466, "grad_norm": 5.5625, "learning_rate": 3.90805019187527e-06, "loss": 1.00566063, "memory(GiB)": 142.32, "step": 105780, "train_speed(iter/s)": 0.286979 }, { "acc": 0.75475645, "epoch": 1.1833674241206185, "grad_norm": 7.03125, "learning_rate": 3.906245487888336e-06, "loss": 0.97383652, "memory(GiB)": 142.32, "step": 105800, "train_speed(iter/s)": 0.286998 }, { "acc": 0.73642569, "epoch": 1.183591123066577, "grad_norm": 7.375, "learning_rate": 3.904440933542654e-06, "loss": 1.03659077, "memory(GiB)": 142.32, "step": 105820, "train_speed(iter/s)": 0.287016 }, { "acc": 0.73046913, "epoch": 1.1838148220125355, "grad_norm": 6.09375, "learning_rate": 3.902636529085109e-06, "loss": 1.07840061, "memory(GiB)": 142.32, "step": 105840, "train_speed(iter/s)": 0.287035 }, { "acc": 0.74087739, "epoch": 1.184038520958494, "grad_norm": 6.0625, "learning_rate": 3.9008322747625736e-06, "loss": 1.01425991, "memory(GiB)": 142.32, "step": 105860, "train_speed(iter/s)": 0.287054 }, { "acc": 0.73541203, "epoch": 1.1842622199044526, "grad_norm": 6.375, "learning_rate": 3.899028170821894e-06, "loss": 1.05640574, "memory(GiB)": 142.32, "step": 105880, "train_speed(iter/s)": 0.287073 }, { "acc": 0.73792906, "epoch": 1.184485918850411, "grad_norm": 6.65625, "learning_rate": 3.897224217509896e-06, "loss": 1.05134277, "memory(GiB)": 142.32, "step": 105900, "train_speed(iter/s)": 0.287092 }, { "acc": 0.73843646, "epoch": 1.1847096177963696, "grad_norm": 6.0, "learning_rate": 3.895420415073389e-06, "loss": 1.03261881, "memory(GiB)": 142.32, "step": 105920, "train_speed(iter/s)": 0.28711 }, { "acc": 0.73594437, "epoch": 1.1849333167423282, "grad_norm": 4.6875, "learning_rate": 3.893616763759155e-06, "loss": 1.04914398, "memory(GiB)": 142.32, "step": 105940, "train_speed(iter/s)": 0.287127 }, { "acc": 0.72236896, "epoch": 1.1851570156882867, "grad_norm": 7.25, "learning_rate": 3.891813263813962e-06, "loss": 1.11698074, "memory(GiB)": 142.32, "step": 105960, "train_speed(iter/s)": 0.287146 }, { "acc": 0.73923912, "epoch": 1.1853807146342452, "grad_norm": 5.78125, "learning_rate": 3.890009915484556e-06, "loss": 1.04097672, "memory(GiB)": 142.32, "step": 105980, "train_speed(iter/s)": 0.287165 }, { "acc": 0.73270149, "epoch": 1.1856044135802037, "grad_norm": 6.5, "learning_rate": 3.888206719017657e-06, "loss": 1.07374201, "memory(GiB)": 142.32, "step": 106000, "train_speed(iter/s)": 0.287182 }, { "epoch": 1.1856044135802037, "eval_acc": 0.6961887079898835, "eval_loss": 1.0726717710494995, "eval_runtime": 2342.3323, "eval_samples_per_second": 32.14, "eval_steps_per_second": 16.07, "step": 106000 }, { "acc": 0.72526197, "epoch": 1.1858281125261623, "grad_norm": 5.875, "learning_rate": 3.886403674659972e-06, "loss": 1.10876637, "memory(GiB)": 142.32, "step": 106020, "train_speed(iter/s)": 0.28535 }, { "acc": 0.73773575, "epoch": 1.1860518114721208, "grad_norm": 6.3125, "learning_rate": 3.88460078265818e-06, "loss": 1.05875607, "memory(GiB)": 142.32, "step": 106040, "train_speed(iter/s)": 0.28537 }, { "acc": 0.74209776, "epoch": 1.1862755104180793, "grad_norm": 6.53125, "learning_rate": 3.882798043258943e-06, "loss": 1.02367687, "memory(GiB)": 142.32, "step": 106060, "train_speed(iter/s)": 0.285387 }, { "acc": 0.73409147, "epoch": 1.1864992093640379, "grad_norm": 7.875, "learning_rate": 3.880995456708903e-06, "loss": 1.04583931, "memory(GiB)": 142.32, "step": 106080, "train_speed(iter/s)": 0.285405 }, { "acc": 0.74056082, "epoch": 1.1867229083099964, "grad_norm": 7.53125, "learning_rate": 3.879193023254678e-06, "loss": 1.02407417, "memory(GiB)": 142.32, "step": 106100, "train_speed(iter/s)": 0.285421 }, { "acc": 0.73731041, "epoch": 1.186946607255955, "grad_norm": 6.78125, "learning_rate": 3.877390743142869e-06, "loss": 1.04179239, "memory(GiB)": 142.32, "step": 106120, "train_speed(iter/s)": 0.28544 }, { "acc": 0.73084726, "epoch": 1.1871703062019134, "grad_norm": 5.8125, "learning_rate": 3.875588616620052e-06, "loss": 1.08604641, "memory(GiB)": 142.32, "step": 106140, "train_speed(iter/s)": 0.285459 }, { "acc": 0.73211217, "epoch": 1.187394005147872, "grad_norm": 7.15625, "learning_rate": 3.873786643932782e-06, "loss": 1.06775761, "memory(GiB)": 142.32, "step": 106160, "train_speed(iter/s)": 0.285476 }, { "acc": 0.74881277, "epoch": 1.1876177040938305, "grad_norm": 6.1875, "learning_rate": 3.8719848253275975e-06, "loss": 0.99994879, "memory(GiB)": 142.32, "step": 106180, "train_speed(iter/s)": 0.285493 }, { "acc": 0.7430089, "epoch": 1.187841403039789, "grad_norm": 6.125, "learning_rate": 3.870183161051012e-06, "loss": 1.01592655, "memory(GiB)": 142.32, "step": 106200, "train_speed(iter/s)": 0.285514 }, { "acc": 0.74197912, "epoch": 1.1880651019857476, "grad_norm": 5.28125, "learning_rate": 3.86838165134952e-06, "loss": 1.02716255, "memory(GiB)": 142.32, "step": 106220, "train_speed(iter/s)": 0.285533 }, { "acc": 0.72830362, "epoch": 1.188288800931706, "grad_norm": 5.125, "learning_rate": 3.866580296469591e-06, "loss": 1.07644386, "memory(GiB)": 142.32, "step": 106240, "train_speed(iter/s)": 0.285553 }, { "acc": 0.74143991, "epoch": 1.1885124998776646, "grad_norm": 5.46875, "learning_rate": 3.864779096657678e-06, "loss": 1.03834057, "memory(GiB)": 142.32, "step": 106260, "train_speed(iter/s)": 0.285571 }, { "acc": 0.73097382, "epoch": 1.1887361988236231, "grad_norm": 6.84375, "learning_rate": 3.862978052160211e-06, "loss": 1.07721558, "memory(GiB)": 142.32, "step": 106280, "train_speed(iter/s)": 0.285589 }, { "acc": 0.72636538, "epoch": 1.1889598977695817, "grad_norm": 6.25, "learning_rate": 3.861177163223597e-06, "loss": 1.09056702, "memory(GiB)": 142.32, "step": 106300, "train_speed(iter/s)": 0.285607 }, { "acc": 0.7359046, "epoch": 1.1891835967155402, "grad_norm": 4.625, "learning_rate": 3.8593764300942274e-06, "loss": 1.05528431, "memory(GiB)": 142.32, "step": 106320, "train_speed(iter/s)": 0.285626 }, { "acc": 0.73563166, "epoch": 1.1894072956614987, "grad_norm": 6.4375, "learning_rate": 3.857575853018463e-06, "loss": 1.04838619, "memory(GiB)": 142.32, "step": 106340, "train_speed(iter/s)": 0.285646 }, { "acc": 0.73888493, "epoch": 1.1896309946074572, "grad_norm": 5.15625, "learning_rate": 3.8557754322426515e-06, "loss": 1.04836445, "memory(GiB)": 142.32, "step": 106360, "train_speed(iter/s)": 0.285666 }, { "acc": 0.74087553, "epoch": 1.1898546935534158, "grad_norm": 6.0625, "learning_rate": 3.853975168013115e-06, "loss": 1.02757263, "memory(GiB)": 142.32, "step": 106380, "train_speed(iter/s)": 0.285686 }, { "acc": 0.74125843, "epoch": 1.1900783924993743, "grad_norm": 7.3125, "learning_rate": 3.852175060576157e-06, "loss": 1.01827679, "memory(GiB)": 142.32, "step": 106400, "train_speed(iter/s)": 0.285705 }, { "acc": 0.72876406, "epoch": 1.1903020914453328, "grad_norm": 6.53125, "learning_rate": 3.8503751101780575e-06, "loss": 1.0858984, "memory(GiB)": 142.32, "step": 106420, "train_speed(iter/s)": 0.285722 }, { "acc": 0.74636345, "epoch": 1.1905257903912914, "grad_norm": 6.625, "learning_rate": 3.848575317065073e-06, "loss": 1.00152617, "memory(GiB)": 142.32, "step": 106440, "train_speed(iter/s)": 0.285742 }, { "acc": 0.75211573, "epoch": 1.1907494893372499, "grad_norm": 6.09375, "learning_rate": 3.846775681483444e-06, "loss": 0.98259277, "memory(GiB)": 142.32, "step": 106460, "train_speed(iter/s)": 0.28576 }, { "acc": 0.73198633, "epoch": 1.1909731882832084, "grad_norm": 5.34375, "learning_rate": 3.844976203679385e-06, "loss": 1.09369917, "memory(GiB)": 142.32, "step": 106480, "train_speed(iter/s)": 0.285778 }, { "acc": 0.73445492, "epoch": 1.191196887229167, "grad_norm": 6.1875, "learning_rate": 3.84317688389909e-06, "loss": 1.06516228, "memory(GiB)": 142.32, "step": 106500, "train_speed(iter/s)": 0.285795 }, { "acc": 0.72474174, "epoch": 1.1914205861751255, "grad_norm": 5.8125, "learning_rate": 3.8413777223887335e-06, "loss": 1.10715809, "memory(GiB)": 142.32, "step": 106520, "train_speed(iter/s)": 0.285812 }, { "acc": 0.74126015, "epoch": 1.191644285121084, "grad_norm": 5.25, "learning_rate": 3.839578719394464e-06, "loss": 1.03362217, "memory(GiB)": 142.32, "step": 106540, "train_speed(iter/s)": 0.28583 }, { "acc": 0.73639822, "epoch": 1.1918679840670425, "grad_norm": 5.8125, "learning_rate": 3.837779875162413e-06, "loss": 1.05639067, "memory(GiB)": 142.32, "step": 106560, "train_speed(iter/s)": 0.285848 }, { "acc": 0.73542156, "epoch": 1.192091683013001, "grad_norm": 7.15625, "learning_rate": 3.835981189938687e-06, "loss": 1.04767647, "memory(GiB)": 142.32, "step": 106580, "train_speed(iter/s)": 0.285866 }, { "acc": 0.73442154, "epoch": 1.1923153819589596, "grad_norm": 6.5, "learning_rate": 3.83418266396937e-06, "loss": 1.04839897, "memory(GiB)": 142.32, "step": 106600, "train_speed(iter/s)": 0.285886 }, { "acc": 0.71526146, "epoch": 1.1925390809049181, "grad_norm": 5.3125, "learning_rate": 3.832384297500529e-06, "loss": 1.14684315, "memory(GiB)": 142.32, "step": 106620, "train_speed(iter/s)": 0.285901 }, { "acc": 0.73531532, "epoch": 1.1927627798508766, "grad_norm": 6.21875, "learning_rate": 3.830586090778204e-06, "loss": 1.03381424, "memory(GiB)": 142.32, "step": 106640, "train_speed(iter/s)": 0.285919 }, { "acc": 0.74576969, "epoch": 1.1929864787968352, "grad_norm": 6.375, "learning_rate": 3.828788044048418e-06, "loss": 1.00862761, "memory(GiB)": 142.32, "step": 106660, "train_speed(iter/s)": 0.285938 }, { "acc": 0.74321203, "epoch": 1.1932101777427937, "grad_norm": 6.78125, "learning_rate": 3.826990157557169e-06, "loss": 1.02357483, "memory(GiB)": 142.32, "step": 106680, "train_speed(iter/s)": 0.285955 }, { "acc": 0.73580599, "epoch": 1.1934338766887522, "grad_norm": 6.96875, "learning_rate": 3.82519243155043e-06, "loss": 1.04964275, "memory(GiB)": 142.32, "step": 106700, "train_speed(iter/s)": 0.285974 }, { "acc": 0.74631815, "epoch": 1.1936575756347108, "grad_norm": 7.90625, "learning_rate": 3.8233948662741595e-06, "loss": 0.99234848, "memory(GiB)": 142.32, "step": 106720, "train_speed(iter/s)": 0.285993 }, { "acc": 0.73472862, "epoch": 1.1938812745806693, "grad_norm": 7.0, "learning_rate": 3.821597461974289e-06, "loss": 1.05445309, "memory(GiB)": 142.32, "step": 106740, "train_speed(iter/s)": 0.28601 }, { "acc": 0.74408197, "epoch": 1.1941049735266278, "grad_norm": 5.40625, "learning_rate": 3.819800218896728e-06, "loss": 1.01213551, "memory(GiB)": 142.32, "step": 106760, "train_speed(iter/s)": 0.286029 }, { "acc": 0.73035126, "epoch": 1.1943286724725863, "grad_norm": 6.5625, "learning_rate": 3.818003137287367e-06, "loss": 1.07754288, "memory(GiB)": 142.32, "step": 106780, "train_speed(iter/s)": 0.286046 }, { "acc": 0.7342937, "epoch": 1.1945523714185449, "grad_norm": 5.9375, "learning_rate": 3.816206217392072e-06, "loss": 1.06193943, "memory(GiB)": 142.32, "step": 106800, "train_speed(iter/s)": 0.286066 }, { "acc": 0.74304695, "epoch": 1.1947760703645034, "grad_norm": 5.0, "learning_rate": 3.8144094594566854e-06, "loss": 1.03040981, "memory(GiB)": 142.32, "step": 106820, "train_speed(iter/s)": 0.286082 }, { "acc": 0.72593136, "epoch": 1.194999769310462, "grad_norm": 5.28125, "learning_rate": 3.812612863727031e-06, "loss": 1.10931749, "memory(GiB)": 142.32, "step": 106840, "train_speed(iter/s)": 0.2861 }, { "acc": 0.72608199, "epoch": 1.1952234682564205, "grad_norm": 6.03125, "learning_rate": 3.8108164304489085e-06, "loss": 1.09323616, "memory(GiB)": 142.32, "step": 106860, "train_speed(iter/s)": 0.286117 }, { "acc": 0.74170113, "epoch": 1.195447167202379, "grad_norm": 5.8125, "learning_rate": 3.8090201598680972e-06, "loss": 1.02681131, "memory(GiB)": 142.32, "step": 106880, "train_speed(iter/s)": 0.286136 }, { "acc": 0.74038286, "epoch": 1.1956708661483375, "grad_norm": 7.6875, "learning_rate": 3.8072240522303495e-06, "loss": 1.03043556, "memory(GiB)": 142.32, "step": 106900, "train_speed(iter/s)": 0.286153 }, { "acc": 0.73591557, "epoch": 1.195894565094296, "grad_norm": 6.03125, "learning_rate": 3.8054281077814e-06, "loss": 1.04023819, "memory(GiB)": 142.32, "step": 106920, "train_speed(iter/s)": 0.286174 }, { "acc": 0.74999075, "epoch": 1.1961182640402546, "grad_norm": 6.5, "learning_rate": 3.8036323267669604e-06, "loss": 0.98812332, "memory(GiB)": 142.32, "step": 106940, "train_speed(iter/s)": 0.286192 }, { "acc": 0.73532553, "epoch": 1.196341962986213, "grad_norm": 5.3125, "learning_rate": 3.801836709432718e-06, "loss": 1.0297617, "memory(GiB)": 142.32, "step": 106960, "train_speed(iter/s)": 0.28621 }, { "acc": 0.74075246, "epoch": 1.1965656619321716, "grad_norm": 7.0625, "learning_rate": 3.8000412560243405e-06, "loss": 1.02629299, "memory(GiB)": 142.32, "step": 106980, "train_speed(iter/s)": 0.286229 }, { "acc": 0.73675814, "epoch": 1.1967893608781301, "grad_norm": 5.8125, "learning_rate": 3.79824596678747e-06, "loss": 1.03982964, "memory(GiB)": 142.32, "step": 107000, "train_speed(iter/s)": 0.286246 }, { "acc": 0.73013449, "epoch": 1.1970130598240887, "grad_norm": 6.5625, "learning_rate": 3.796450841967728e-06, "loss": 1.08306723, "memory(GiB)": 142.32, "step": 107020, "train_speed(iter/s)": 0.286266 }, { "acc": 0.7433145, "epoch": 1.1972367587700472, "grad_norm": 7.5625, "learning_rate": 3.7946558818107132e-06, "loss": 1.01226177, "memory(GiB)": 142.32, "step": 107040, "train_speed(iter/s)": 0.286285 }, { "acc": 0.73133888, "epoch": 1.1974604577160057, "grad_norm": 7.28125, "learning_rate": 3.7928610865620023e-06, "loss": 1.07208519, "memory(GiB)": 142.32, "step": 107060, "train_speed(iter/s)": 0.286304 }, { "acc": 0.74411697, "epoch": 1.1976841566619643, "grad_norm": 6.125, "learning_rate": 3.7910664564671496e-06, "loss": 1.02102795, "memory(GiB)": 142.32, "step": 107080, "train_speed(iter/s)": 0.286321 }, { "acc": 0.74307151, "epoch": 1.1979078556079228, "grad_norm": 5.4375, "learning_rate": 3.7892719917716847e-06, "loss": 1.03531685, "memory(GiB)": 142.32, "step": 107100, "train_speed(iter/s)": 0.286338 }, { "acc": 0.74673572, "epoch": 1.1981315545538813, "grad_norm": 6.15625, "learning_rate": 3.7874776927211165e-06, "loss": 1.00376129, "memory(GiB)": 142.32, "step": 107120, "train_speed(iter/s)": 0.286356 }, { "acc": 0.74707522, "epoch": 1.1983552534998398, "grad_norm": 6.0625, "learning_rate": 3.7856835595609304e-06, "loss": 1.00777922, "memory(GiB)": 142.32, "step": 107140, "train_speed(iter/s)": 0.286375 }, { "acc": 0.74837685, "epoch": 1.1985789524457984, "grad_norm": 7.40625, "learning_rate": 3.7838895925365905e-06, "loss": 0.99707022, "memory(GiB)": 142.32, "step": 107160, "train_speed(iter/s)": 0.286393 }, { "acc": 0.73431411, "epoch": 1.198802651391757, "grad_norm": 6.1875, "learning_rate": 3.7820957918935374e-06, "loss": 1.04795856, "memory(GiB)": 142.32, "step": 107180, "train_speed(iter/s)": 0.28641 }, { "acc": 0.73435483, "epoch": 1.1990263503377154, "grad_norm": 5.53125, "learning_rate": 3.780302157877187e-06, "loss": 1.05700054, "memory(GiB)": 142.32, "step": 107200, "train_speed(iter/s)": 0.28643 }, { "acc": 0.72773695, "epoch": 1.199250049283674, "grad_norm": 6.3125, "learning_rate": 3.7785086907329345e-06, "loss": 1.07454205, "memory(GiB)": 142.32, "step": 107220, "train_speed(iter/s)": 0.286449 }, { "acc": 0.7300395, "epoch": 1.1994737482296325, "grad_norm": 6.0, "learning_rate": 3.7767153907061522e-06, "loss": 1.07473221, "memory(GiB)": 142.32, "step": 107240, "train_speed(iter/s)": 0.286468 }, { "acc": 0.72875938, "epoch": 1.199697447175591, "grad_norm": 7.375, "learning_rate": 3.7749222580421896e-06, "loss": 1.06876774, "memory(GiB)": 142.32, "step": 107260, "train_speed(iter/s)": 0.286484 }, { "acc": 0.72959023, "epoch": 1.1999211461215495, "grad_norm": 5.5625, "learning_rate": 3.773129292986373e-06, "loss": 1.06626482, "memory(GiB)": 142.32, "step": 107280, "train_speed(iter/s)": 0.286502 }, { "acc": 0.74087253, "epoch": 1.200144845067508, "grad_norm": 6.03125, "learning_rate": 3.771336495784005e-06, "loss": 1.02138977, "memory(GiB)": 142.32, "step": 107300, "train_speed(iter/s)": 0.286521 }, { "acc": 0.72902718, "epoch": 1.2003685440134666, "grad_norm": 5.65625, "learning_rate": 3.7695438666803654e-06, "loss": 1.0776823, "memory(GiB)": 142.32, "step": 107320, "train_speed(iter/s)": 0.28654 }, { "acc": 0.73056812, "epoch": 1.2005922429594251, "grad_norm": 5.03125, "learning_rate": 3.767751405920712e-06, "loss": 1.07017059, "memory(GiB)": 142.32, "step": 107340, "train_speed(iter/s)": 0.286558 }, { "acc": 0.74271564, "epoch": 1.2008159419053837, "grad_norm": 6.25, "learning_rate": 3.765959113750279e-06, "loss": 1.02887583, "memory(GiB)": 142.32, "step": 107360, "train_speed(iter/s)": 0.286577 }, { "acc": 0.73712606, "epoch": 1.2010396408513422, "grad_norm": 5.8125, "learning_rate": 3.76416699041428e-06, "loss": 1.03972092, "memory(GiB)": 142.32, "step": 107380, "train_speed(iter/s)": 0.286596 }, { "acc": 0.74359827, "epoch": 1.2012633397973007, "grad_norm": 7.34375, "learning_rate": 3.7623750361578986e-06, "loss": 1.01062355, "memory(GiB)": 142.32, "step": 107400, "train_speed(iter/s)": 0.286615 }, { "acc": 0.72955141, "epoch": 1.2014870387432592, "grad_norm": 6.5625, "learning_rate": 3.7605832512263026e-06, "loss": 1.07882814, "memory(GiB)": 142.32, "step": 107420, "train_speed(iter/s)": 0.286632 }, { "acc": 0.74603472, "epoch": 1.2017107376892178, "grad_norm": 4.875, "learning_rate": 3.7587916358646328e-06, "loss": 0.98026333, "memory(GiB)": 142.32, "step": 107440, "train_speed(iter/s)": 0.28665 }, { "acc": 0.73592043, "epoch": 1.2019344366351763, "grad_norm": 6.90625, "learning_rate": 3.757000190318008e-06, "loss": 1.05526009, "memory(GiB)": 142.32, "step": 107460, "train_speed(iter/s)": 0.286667 }, { "acc": 0.73352876, "epoch": 1.2021581355811348, "grad_norm": 5.25, "learning_rate": 3.755208914831525e-06, "loss": 1.05775347, "memory(GiB)": 142.32, "step": 107480, "train_speed(iter/s)": 0.286685 }, { "acc": 0.7460279, "epoch": 1.2023818345270934, "grad_norm": 6.46875, "learning_rate": 3.7534178096502537e-06, "loss": 0.99857693, "memory(GiB)": 142.32, "step": 107500, "train_speed(iter/s)": 0.286702 }, { "acc": 0.74216218, "epoch": 1.2026055334730519, "grad_norm": 5.75, "learning_rate": 3.7516268750192437e-06, "loss": 1.04234667, "memory(GiB)": 142.32, "step": 107520, "train_speed(iter/s)": 0.286718 }, { "acc": 0.72999907, "epoch": 1.2028292324190104, "grad_norm": 7.34375, "learning_rate": 3.74983611118352e-06, "loss": 1.0715414, "memory(GiB)": 142.32, "step": 107540, "train_speed(iter/s)": 0.286736 }, { "acc": 0.72776194, "epoch": 1.203052931364969, "grad_norm": 5.90625, "learning_rate": 3.7480455183880865e-06, "loss": 1.09736071, "memory(GiB)": 142.32, "step": 107560, "train_speed(iter/s)": 0.286755 }, { "acc": 0.73843312, "epoch": 1.2032766303109275, "grad_norm": 6.59375, "learning_rate": 3.746255096877921e-06, "loss": 1.03214817, "memory(GiB)": 142.32, "step": 107580, "train_speed(iter/s)": 0.286772 }, { "acc": 0.74210596, "epoch": 1.203500329256886, "grad_norm": 5.96875, "learning_rate": 3.7444648468979774e-06, "loss": 1.01490383, "memory(GiB)": 142.32, "step": 107600, "train_speed(iter/s)": 0.286789 }, { "acc": 0.73886709, "epoch": 1.2037240282028445, "grad_norm": 6.46875, "learning_rate": 3.7426747686931886e-06, "loss": 1.04061222, "memory(GiB)": 142.32, "step": 107620, "train_speed(iter/s)": 0.286806 }, { "acc": 0.74032459, "epoch": 1.203947727148803, "grad_norm": 5.1875, "learning_rate": 3.7408848625084624e-06, "loss": 1.0311924, "memory(GiB)": 142.32, "step": 107640, "train_speed(iter/s)": 0.286824 }, { "acc": 0.74504786, "epoch": 1.2041714260947616, "grad_norm": 5.40625, "learning_rate": 3.7390951285886845e-06, "loss": 1.01278439, "memory(GiB)": 142.32, "step": 107660, "train_speed(iter/s)": 0.286843 }, { "acc": 0.74093065, "epoch": 1.20439512504072, "grad_norm": 7.5, "learning_rate": 3.737305567178716e-06, "loss": 1.02691631, "memory(GiB)": 142.32, "step": 107680, "train_speed(iter/s)": 0.286862 }, { "acc": 0.74952879, "epoch": 1.2046188239866786, "grad_norm": 5.78125, "learning_rate": 3.7355161785233928e-06, "loss": 0.99358768, "memory(GiB)": 142.32, "step": 107700, "train_speed(iter/s)": 0.286882 }, { "acc": 0.74613428, "epoch": 1.2048425229326372, "grad_norm": 6.34375, "learning_rate": 3.733726962867532e-06, "loss": 1.00616417, "memory(GiB)": 142.32, "step": 107720, "train_speed(iter/s)": 0.2869 }, { "acc": 0.72612085, "epoch": 1.2050662218785957, "grad_norm": 5.8125, "learning_rate": 3.7319379204559203e-06, "loss": 1.09610577, "memory(GiB)": 142.32, "step": 107740, "train_speed(iter/s)": 0.286918 }, { "acc": 0.74119835, "epoch": 1.2052899208245544, "grad_norm": 6.4375, "learning_rate": 3.730149051533326e-06, "loss": 1.01780853, "memory(GiB)": 142.32, "step": 107760, "train_speed(iter/s)": 0.286934 }, { "acc": 0.74088888, "epoch": 1.205513619770513, "grad_norm": 5.03125, "learning_rate": 3.7283603563444916e-06, "loss": 1.03045788, "memory(GiB)": 142.32, "step": 107780, "train_speed(iter/s)": 0.286952 }, { "acc": 0.73358488, "epoch": 1.2057373187164715, "grad_norm": 5.15625, "learning_rate": 3.726571835134136e-06, "loss": 1.07079554, "memory(GiB)": 142.32, "step": 107800, "train_speed(iter/s)": 0.286964 }, { "acc": 0.73089924, "epoch": 1.20596101766243, "grad_norm": 5.84375, "learning_rate": 3.724783488146957e-06, "loss": 1.07882099, "memory(GiB)": 142.32, "step": 107820, "train_speed(iter/s)": 0.286982 }, { "acc": 0.72904158, "epoch": 1.2061847166083886, "grad_norm": 5.96875, "learning_rate": 3.7229953156276216e-06, "loss": 1.108078, "memory(GiB)": 142.32, "step": 107840, "train_speed(iter/s)": 0.287 }, { "acc": 0.73679295, "epoch": 1.206408415554347, "grad_norm": 6.5, "learning_rate": 3.72120731782078e-06, "loss": 1.06524754, "memory(GiB)": 142.32, "step": 107860, "train_speed(iter/s)": 0.287017 }, { "acc": 0.73708987, "epoch": 1.2066321145003056, "grad_norm": 6.78125, "learning_rate": 3.7194194949710556e-06, "loss": 1.03790779, "memory(GiB)": 142.32, "step": 107880, "train_speed(iter/s)": 0.287034 }, { "acc": 0.7323245, "epoch": 1.2068558134462641, "grad_norm": 5.40625, "learning_rate": 3.7176318473230476e-06, "loss": 1.06454763, "memory(GiB)": 142.32, "step": 107900, "train_speed(iter/s)": 0.287052 }, { "acc": 0.73549614, "epoch": 1.2070795123922227, "grad_norm": 7.5, "learning_rate": 3.7158443751213334e-06, "loss": 1.07087116, "memory(GiB)": 142.32, "step": 107920, "train_speed(iter/s)": 0.28707 }, { "acc": 0.7279995, "epoch": 1.2073032113381812, "grad_norm": 5.375, "learning_rate": 3.714057078610463e-06, "loss": 1.07768688, "memory(GiB)": 142.32, "step": 107940, "train_speed(iter/s)": 0.287089 }, { "acc": 0.74629893, "epoch": 1.2075269102841397, "grad_norm": 6.90625, "learning_rate": 3.7122699580349643e-06, "loss": 1.00251417, "memory(GiB)": 142.32, "step": 107960, "train_speed(iter/s)": 0.287103 }, { "acc": 0.73412018, "epoch": 1.2077506092300982, "grad_norm": 6.59375, "learning_rate": 3.710483013639341e-06, "loss": 1.06939182, "memory(GiB)": 142.32, "step": 107980, "train_speed(iter/s)": 0.287122 }, { "acc": 0.71929388, "epoch": 1.2079743081760568, "grad_norm": 6.96875, "learning_rate": 3.708696245668073e-06, "loss": 1.11458368, "memory(GiB)": 142.32, "step": 108000, "train_speed(iter/s)": 0.28714 }, { "epoch": 1.2079743081760568, "eval_acc": 0.6961364535355363, "eval_loss": 1.0724867582321167, "eval_runtime": 2344.5661, "eval_samples_per_second": 32.11, "eval_steps_per_second": 16.055, "step": 108000 }, { "acc": 0.73418264, "epoch": 1.2081980071220153, "grad_norm": 5.4375, "learning_rate": 3.706909654365617e-06, "loss": 1.06659822, "memory(GiB)": 142.32, "step": 108020, "train_speed(iter/s)": 0.285339 }, { "acc": 0.72856979, "epoch": 1.2084217060679738, "grad_norm": 5.125, "learning_rate": 3.7051232399764016e-06, "loss": 1.07627945, "memory(GiB)": 142.32, "step": 108040, "train_speed(iter/s)": 0.28536 }, { "acc": 0.72208395, "epoch": 1.2086454050139324, "grad_norm": 6.96875, "learning_rate": 3.7033370027448346e-06, "loss": 1.10349579, "memory(GiB)": 142.32, "step": 108060, "train_speed(iter/s)": 0.285378 }, { "acc": 0.74380713, "epoch": 1.208869103959891, "grad_norm": 6.28125, "learning_rate": 3.701550942915299e-06, "loss": 1.01814938, "memory(GiB)": 142.32, "step": 108080, "train_speed(iter/s)": 0.285397 }, { "acc": 0.74531264, "epoch": 1.2090928029058494, "grad_norm": 6.9375, "learning_rate": 3.6997650607321545e-06, "loss": 1.00919237, "memory(GiB)": 142.32, "step": 108100, "train_speed(iter/s)": 0.285414 }, { "acc": 0.7315033, "epoch": 1.209316501851808, "grad_norm": 5.09375, "learning_rate": 3.6979793564397343e-06, "loss": 1.06307974, "memory(GiB)": 142.32, "step": 108120, "train_speed(iter/s)": 0.285433 }, { "acc": 0.7316761, "epoch": 1.2095402007977665, "grad_norm": 4.96875, "learning_rate": 3.6961938302823476e-06, "loss": 1.06928082, "memory(GiB)": 142.32, "step": 108140, "train_speed(iter/s)": 0.285453 }, { "acc": 0.7387846, "epoch": 1.209763899743725, "grad_norm": 5.53125, "learning_rate": 3.6944084825042813e-06, "loss": 1.04286213, "memory(GiB)": 142.32, "step": 108160, "train_speed(iter/s)": 0.285471 }, { "acc": 0.73339014, "epoch": 1.2099875986896835, "grad_norm": 6.84375, "learning_rate": 3.6926233133497947e-06, "loss": 1.07086439, "memory(GiB)": 142.32, "step": 108180, "train_speed(iter/s)": 0.285492 }, { "acc": 0.7409627, "epoch": 1.210211297635642, "grad_norm": 5.90625, "learning_rate": 3.690838323063126e-06, "loss": 1.01816807, "memory(GiB)": 142.32, "step": 108200, "train_speed(iter/s)": 0.285511 }, { "acc": 0.74134483, "epoch": 1.2104349965816006, "grad_norm": 6.8125, "learning_rate": 3.6890535118884884e-06, "loss": 1.02043877, "memory(GiB)": 142.32, "step": 108220, "train_speed(iter/s)": 0.285529 }, { "acc": 0.74849072, "epoch": 1.2106586955275591, "grad_norm": 6.90625, "learning_rate": 3.6872688800700674e-06, "loss": 0.99319725, "memory(GiB)": 142.32, "step": 108240, "train_speed(iter/s)": 0.285546 }, { "acc": 0.7402966, "epoch": 1.2108823944735176, "grad_norm": 6.84375, "learning_rate": 3.685484427852026e-06, "loss": 1.0229887, "memory(GiB)": 142.32, "step": 108260, "train_speed(iter/s)": 0.285566 }, { "acc": 0.74349504, "epoch": 1.2111060934194762, "grad_norm": 5.75, "learning_rate": 3.6837001554785035e-06, "loss": 1.0229496, "memory(GiB)": 142.32, "step": 108280, "train_speed(iter/s)": 0.285583 }, { "acc": 0.72792301, "epoch": 1.2113297923654347, "grad_norm": 6.125, "learning_rate": 3.6819160631936146e-06, "loss": 1.08354053, "memory(GiB)": 142.32, "step": 108300, "train_speed(iter/s)": 0.2856 }, { "acc": 0.73094792, "epoch": 1.2115534913113932, "grad_norm": 6.09375, "learning_rate": 3.680132151241449e-06, "loss": 1.08010464, "memory(GiB)": 142.32, "step": 108320, "train_speed(iter/s)": 0.285615 }, { "acc": 0.75630183, "epoch": 1.2117771902573518, "grad_norm": 6.375, "learning_rate": 3.678348419866069e-06, "loss": 0.95526476, "memory(GiB)": 142.32, "step": 108340, "train_speed(iter/s)": 0.285632 }, { "acc": 0.73449821, "epoch": 1.2120008892033103, "grad_norm": 6.625, "learning_rate": 3.676564869311516e-06, "loss": 1.06343403, "memory(GiB)": 142.32, "step": 108360, "train_speed(iter/s)": 0.28565 }, { "acc": 0.73962712, "epoch": 1.2122245881492688, "grad_norm": 4.65625, "learning_rate": 3.674781499821805e-06, "loss": 1.03632698, "memory(GiB)": 142.32, "step": 108380, "train_speed(iter/s)": 0.285668 }, { "acc": 0.74904099, "epoch": 1.2124482870952273, "grad_norm": 6.4375, "learning_rate": 3.6729983116409267e-06, "loss": 0.97768393, "memory(GiB)": 142.32, "step": 108400, "train_speed(iter/s)": 0.285687 }, { "acc": 0.75267572, "epoch": 1.2126719860411859, "grad_norm": 5.4375, "learning_rate": 3.6712153050128474e-06, "loss": 0.97795343, "memory(GiB)": 142.32, "step": 108420, "train_speed(iter/s)": 0.285706 }, { "acc": 0.74079447, "epoch": 1.2128956849871444, "grad_norm": 5.71875, "learning_rate": 3.669432480181507e-06, "loss": 1.02969933, "memory(GiB)": 142.32, "step": 108440, "train_speed(iter/s)": 0.285722 }, { "acc": 0.73739014, "epoch": 1.213119383933103, "grad_norm": 6.625, "learning_rate": 3.667649837390821e-06, "loss": 1.0491189, "memory(GiB)": 142.32, "step": 108460, "train_speed(iter/s)": 0.285742 }, { "acc": 0.73768377, "epoch": 1.2133430828790615, "grad_norm": 5.3125, "learning_rate": 3.6658673768846803e-06, "loss": 1.03660908, "memory(GiB)": 142.32, "step": 108480, "train_speed(iter/s)": 0.285761 }, { "acc": 0.74078493, "epoch": 1.21356678182502, "grad_norm": 7.4375, "learning_rate": 3.664085098906952e-06, "loss": 1.03404827, "memory(GiB)": 142.32, "step": 108500, "train_speed(iter/s)": 0.285778 }, { "acc": 0.72784414, "epoch": 1.2137904807709785, "grad_norm": 6.65625, "learning_rate": 3.662303003701478e-06, "loss": 1.10617409, "memory(GiB)": 142.32, "step": 108520, "train_speed(iter/s)": 0.285795 }, { "acc": 0.73055344, "epoch": 1.214014179716937, "grad_norm": 6.84375, "learning_rate": 3.6605210915120715e-06, "loss": 1.08302717, "memory(GiB)": 142.32, "step": 108540, "train_speed(iter/s)": 0.285815 }, { "acc": 0.74567094, "epoch": 1.2142378786628956, "grad_norm": 5.53125, "learning_rate": 3.6587393625825262e-06, "loss": 1.01386299, "memory(GiB)": 142.32, "step": 108560, "train_speed(iter/s)": 0.285834 }, { "acc": 0.72206707, "epoch": 1.214461577608854, "grad_norm": 5.8125, "learning_rate": 3.6569578171566067e-06, "loss": 1.12066078, "memory(GiB)": 142.32, "step": 108580, "train_speed(iter/s)": 0.285853 }, { "acc": 0.71991334, "epoch": 1.2146852765548126, "grad_norm": 4.96875, "learning_rate": 3.6551764554780544e-06, "loss": 1.11965055, "memory(GiB)": 142.32, "step": 108600, "train_speed(iter/s)": 0.285871 }, { "acc": 0.73635087, "epoch": 1.2149089755007711, "grad_norm": 5.71875, "learning_rate": 3.6533952777905856e-06, "loss": 1.0464962, "memory(GiB)": 142.32, "step": 108620, "train_speed(iter/s)": 0.285889 }, { "acc": 0.7417356, "epoch": 1.2151326744467297, "grad_norm": 6.8125, "learning_rate": 3.65161428433789e-06, "loss": 1.03796291, "memory(GiB)": 142.32, "step": 108640, "train_speed(iter/s)": 0.285907 }, { "acc": 0.73543167, "epoch": 1.2153563733926882, "grad_norm": 5.53125, "learning_rate": 3.6498334753636323e-06, "loss": 1.06034536, "memory(GiB)": 142.32, "step": 108660, "train_speed(iter/s)": 0.285925 }, { "acc": 0.73570685, "epoch": 1.2155800723386467, "grad_norm": 5.0, "learning_rate": 3.648052851111454e-06, "loss": 1.06138372, "memory(GiB)": 142.32, "step": 108680, "train_speed(iter/s)": 0.285943 }, { "acc": 0.73506103, "epoch": 1.2158037712846053, "grad_norm": 5.625, "learning_rate": 3.646272411824969e-06, "loss": 1.0763917, "memory(GiB)": 142.32, "step": 108700, "train_speed(iter/s)": 0.285961 }, { "acc": 0.74497499, "epoch": 1.2160274702305638, "grad_norm": 5.6875, "learning_rate": 3.6444921577477686e-06, "loss": 0.99559174, "memory(GiB)": 142.32, "step": 108720, "train_speed(iter/s)": 0.285979 }, { "acc": 0.73018074, "epoch": 1.2162511691765223, "grad_norm": 5.59375, "learning_rate": 3.642712089123415e-06, "loss": 1.07200985, "memory(GiB)": 142.32, "step": 108740, "train_speed(iter/s)": 0.285998 }, { "acc": 0.73192625, "epoch": 1.2164748681224808, "grad_norm": 5.5625, "learning_rate": 3.640932206195447e-06, "loss": 1.06935663, "memory(GiB)": 142.32, "step": 108760, "train_speed(iter/s)": 0.286016 }, { "acc": 0.72957311, "epoch": 1.2166985670684394, "grad_norm": 7.75, "learning_rate": 3.6391525092073793e-06, "loss": 1.07691956, "memory(GiB)": 142.32, "step": 108780, "train_speed(iter/s)": 0.286034 }, { "acc": 0.73925366, "epoch": 1.216922266014398, "grad_norm": 5.15625, "learning_rate": 3.637372998402699e-06, "loss": 1.04266539, "memory(GiB)": 142.32, "step": 108800, "train_speed(iter/s)": 0.286053 }, { "acc": 0.73155613, "epoch": 1.2171459649603564, "grad_norm": 7.15625, "learning_rate": 3.63559367402487e-06, "loss": 1.0753273, "memory(GiB)": 142.32, "step": 108820, "train_speed(iter/s)": 0.286071 }, { "acc": 0.73344274, "epoch": 1.217369663906315, "grad_norm": 7.09375, "learning_rate": 3.633814536317327e-06, "loss": 1.06714306, "memory(GiB)": 142.32, "step": 108840, "train_speed(iter/s)": 0.286089 }, { "acc": 0.73845882, "epoch": 1.2175933628522735, "grad_norm": 6.5625, "learning_rate": 3.6320355855234837e-06, "loss": 1.04635506, "memory(GiB)": 142.32, "step": 108860, "train_speed(iter/s)": 0.286108 }, { "acc": 0.74485712, "epoch": 1.217817061798232, "grad_norm": 6.5625, "learning_rate": 3.630256821886724e-06, "loss": 1.00538988, "memory(GiB)": 142.32, "step": 108880, "train_speed(iter/s)": 0.286126 }, { "acc": 0.7350996, "epoch": 1.2180407607441905, "grad_norm": 7.25, "learning_rate": 3.628478245650412e-06, "loss": 1.05950613, "memory(GiB)": 142.32, "step": 108900, "train_speed(iter/s)": 0.286144 }, { "acc": 0.73331137, "epoch": 1.218264459690149, "grad_norm": 5.09375, "learning_rate": 3.626699857057877e-06, "loss": 1.06160355, "memory(GiB)": 142.32, "step": 108920, "train_speed(iter/s)": 0.286161 }, { "acc": 0.73478174, "epoch": 1.2184881586361076, "grad_norm": 6.8125, "learning_rate": 3.624921656352431e-06, "loss": 1.05542564, "memory(GiB)": 142.32, "step": 108940, "train_speed(iter/s)": 0.286179 }, { "acc": 0.74285822, "epoch": 1.2187118575820661, "grad_norm": 6.3125, "learning_rate": 3.623143643777357e-06, "loss": 1.0164402, "memory(GiB)": 142.32, "step": 108960, "train_speed(iter/s)": 0.286196 }, { "acc": 0.74059753, "epoch": 1.2189355565280247, "grad_norm": 6.34375, "learning_rate": 3.621365819575912e-06, "loss": 1.03879004, "memory(GiB)": 142.32, "step": 108980, "train_speed(iter/s)": 0.286212 }, { "acc": 0.73362598, "epoch": 1.2191592554739832, "grad_norm": 6.9375, "learning_rate": 3.6195881839913285e-06, "loss": 1.06763372, "memory(GiB)": 142.32, "step": 109000, "train_speed(iter/s)": 0.286229 }, { "acc": 0.74040546, "epoch": 1.2193829544199417, "grad_norm": 6.65625, "learning_rate": 3.6178107372668113e-06, "loss": 1.05481958, "memory(GiB)": 142.32, "step": 109020, "train_speed(iter/s)": 0.286246 }, { "acc": 0.73864841, "epoch": 1.2196066533659002, "grad_norm": 6.71875, "learning_rate": 3.6160334796455414e-06, "loss": 1.04786654, "memory(GiB)": 142.32, "step": 109040, "train_speed(iter/s)": 0.286263 }, { "acc": 0.74008236, "epoch": 1.2198303523118588, "grad_norm": 6.875, "learning_rate": 3.614256411370674e-06, "loss": 1.02013674, "memory(GiB)": 142.32, "step": 109060, "train_speed(iter/s)": 0.286281 }, { "acc": 0.74983234, "epoch": 1.2200540512578173, "grad_norm": 6.25, "learning_rate": 3.6124795326853356e-06, "loss": 0.9944416, "memory(GiB)": 142.32, "step": 109080, "train_speed(iter/s)": 0.286297 }, { "acc": 0.74139013, "epoch": 1.2202777502037758, "grad_norm": 7.09375, "learning_rate": 3.610702843832629e-06, "loss": 1.02457838, "memory(GiB)": 142.32, "step": 109100, "train_speed(iter/s)": 0.286316 }, { "acc": 0.73463631, "epoch": 1.2205014491497344, "grad_norm": 6.46875, "learning_rate": 3.608926345055631e-06, "loss": 1.05588703, "memory(GiB)": 142.32, "step": 109120, "train_speed(iter/s)": 0.286333 }, { "acc": 0.74215908, "epoch": 1.2207251480956929, "grad_norm": 5.875, "learning_rate": 3.607150036597392e-06, "loss": 1.03604231, "memory(GiB)": 142.32, "step": 109140, "train_speed(iter/s)": 0.286352 }, { "acc": 0.72978611, "epoch": 1.2209488470416514, "grad_norm": 7.21875, "learning_rate": 3.605373918700938e-06, "loss": 1.06855268, "memory(GiB)": 142.32, "step": 109160, "train_speed(iter/s)": 0.28637 }, { "acc": 0.74424553, "epoch": 1.22117254598761, "grad_norm": 6.25, "learning_rate": 3.6035979916092646e-06, "loss": 1.01325941, "memory(GiB)": 142.32, "step": 109180, "train_speed(iter/s)": 0.286386 }, { "acc": 0.74027929, "epoch": 1.2213962449335685, "grad_norm": 6.3125, "learning_rate": 3.601822255565345e-06, "loss": 1.04140224, "memory(GiB)": 142.32, "step": 109200, "train_speed(iter/s)": 0.286405 }, { "acc": 0.73481264, "epoch": 1.221619943879527, "grad_norm": 6.34375, "learning_rate": 3.6000467108121247e-06, "loss": 1.05989094, "memory(GiB)": 142.32, "step": 109220, "train_speed(iter/s)": 0.286423 }, { "acc": 0.73218188, "epoch": 1.2218436428254855, "grad_norm": 5.8125, "learning_rate": 3.598271357592525e-06, "loss": 1.06356888, "memory(GiB)": 142.32, "step": 109240, "train_speed(iter/s)": 0.286441 }, { "acc": 0.73575125, "epoch": 1.222067341771444, "grad_norm": 4.5625, "learning_rate": 3.5964961961494394e-06, "loss": 1.05701466, "memory(GiB)": 142.32, "step": 109260, "train_speed(iter/s)": 0.28646 }, { "acc": 0.74116335, "epoch": 1.2222910407174026, "grad_norm": 7.40625, "learning_rate": 3.5947212267257346e-06, "loss": 1.02750568, "memory(GiB)": 142.32, "step": 109280, "train_speed(iter/s)": 0.286479 }, { "acc": 0.72869768, "epoch": 1.222514739663361, "grad_norm": 5.8125, "learning_rate": 3.592946449564251e-06, "loss": 1.08641415, "memory(GiB)": 142.32, "step": 109300, "train_speed(iter/s)": 0.286498 }, { "acc": 0.74119692, "epoch": 1.2227384386093196, "grad_norm": 6.65625, "learning_rate": 3.5911718649078055e-06, "loss": 1.02433701, "memory(GiB)": 142.32, "step": 109320, "train_speed(iter/s)": 0.286516 }, { "acc": 0.73421993, "epoch": 1.2229621375552782, "grad_norm": 4.46875, "learning_rate": 3.5893974729991855e-06, "loss": 1.06257362, "memory(GiB)": 142.32, "step": 109340, "train_speed(iter/s)": 0.286534 }, { "acc": 0.73996358, "epoch": 1.2231858365012367, "grad_norm": 5.75, "learning_rate": 3.5876232740811543e-06, "loss": 1.03735971, "memory(GiB)": 142.32, "step": 109360, "train_speed(iter/s)": 0.286553 }, { "acc": 0.74864311, "epoch": 1.2234095354471952, "grad_norm": 6.40625, "learning_rate": 3.5858492683964453e-06, "loss": 0.98992996, "memory(GiB)": 142.32, "step": 109380, "train_speed(iter/s)": 0.286569 }, { "acc": 0.72766924, "epoch": 1.2236332343931537, "grad_norm": 5.34375, "learning_rate": 3.58407545618777e-06, "loss": 1.08459148, "memory(GiB)": 142.32, "step": 109400, "train_speed(iter/s)": 0.286587 }, { "acc": 0.73330069, "epoch": 1.2238569333391123, "grad_norm": 5.78125, "learning_rate": 3.5823018376978097e-06, "loss": 1.0756361, "memory(GiB)": 142.32, "step": 109420, "train_speed(iter/s)": 0.286606 }, { "acc": 0.73227081, "epoch": 1.2240806322850708, "grad_norm": 6.59375, "learning_rate": 3.580528413169222e-06, "loss": 1.07413902, "memory(GiB)": 142.32, "step": 109440, "train_speed(iter/s)": 0.286622 }, { "acc": 0.74218268, "epoch": 1.2243043312310293, "grad_norm": 5.90625, "learning_rate": 3.5787551828446377e-06, "loss": 1.01625309, "memory(GiB)": 142.32, "step": 109460, "train_speed(iter/s)": 0.286638 }, { "acc": 0.74362812, "epoch": 1.2245280301769879, "grad_norm": 6.25, "learning_rate": 3.5769821469666565e-06, "loss": 1.02354164, "memory(GiB)": 142.32, "step": 109480, "train_speed(iter/s)": 0.286657 }, { "acc": 0.73863363, "epoch": 1.2247517291229464, "grad_norm": 5.875, "learning_rate": 3.575209305777858e-06, "loss": 1.05332079, "memory(GiB)": 142.32, "step": 109500, "train_speed(iter/s)": 0.286675 }, { "acc": 0.74279733, "epoch": 1.224975428068905, "grad_norm": 5.28125, "learning_rate": 3.5734366595207915e-06, "loss": 1.02194109, "memory(GiB)": 142.32, "step": 109520, "train_speed(iter/s)": 0.286692 }, { "acc": 0.7458715, "epoch": 1.2251991270148634, "grad_norm": 6.78125, "learning_rate": 3.5716642084379806e-06, "loss": 1.00253468, "memory(GiB)": 142.32, "step": 109540, "train_speed(iter/s)": 0.286709 }, { "acc": 0.72889638, "epoch": 1.225422825960822, "grad_norm": 6.40625, "learning_rate": 3.569891952771921e-06, "loss": 1.07637329, "memory(GiB)": 142.32, "step": 109560, "train_speed(iter/s)": 0.286728 }, { "acc": 0.74548359, "epoch": 1.2256465249067805, "grad_norm": 5.875, "learning_rate": 3.568119892765084e-06, "loss": 1.01290874, "memory(GiB)": 142.32, "step": 109580, "train_speed(iter/s)": 0.286746 }, { "acc": 0.72852325, "epoch": 1.225870223852739, "grad_norm": 5.5625, "learning_rate": 3.5663480286599117e-06, "loss": 1.08235321, "memory(GiB)": 142.32, "step": 109600, "train_speed(iter/s)": 0.28676 }, { "acc": 0.74416027, "epoch": 1.2260939227986976, "grad_norm": 6.96875, "learning_rate": 3.56457636069882e-06, "loss": 1.02342072, "memory(GiB)": 142.32, "step": 109620, "train_speed(iter/s)": 0.286777 }, { "acc": 0.74599686, "epoch": 1.226317621744656, "grad_norm": 5.96875, "learning_rate": 3.5628048891241994e-06, "loss": 1.01351147, "memory(GiB)": 142.32, "step": 109640, "train_speed(iter/s)": 0.286793 }, { "acc": 0.74064884, "epoch": 1.2265413206906146, "grad_norm": 5.625, "learning_rate": 3.561033614178412e-06, "loss": 1.0271841, "memory(GiB)": 142.32, "step": 109660, "train_speed(iter/s)": 0.286813 }, { "acc": 0.74263792, "epoch": 1.2267650196365731, "grad_norm": 5.8125, "learning_rate": 3.5592625361037946e-06, "loss": 1.01682539, "memory(GiB)": 142.32, "step": 109680, "train_speed(iter/s)": 0.286833 }, { "acc": 0.72961626, "epoch": 1.2269887185825317, "grad_norm": 5.40625, "learning_rate": 3.5574916551426553e-06, "loss": 1.07998238, "memory(GiB)": 142.32, "step": 109700, "train_speed(iter/s)": 0.28685 }, { "acc": 0.74422169, "epoch": 1.2272124175284902, "grad_norm": 8.0625, "learning_rate": 3.5557209715372743e-06, "loss": 1.01223373, "memory(GiB)": 142.32, "step": 109720, "train_speed(iter/s)": 0.286866 }, { "acc": 0.73573055, "epoch": 1.2274361164744487, "grad_norm": 7.25, "learning_rate": 3.553950485529909e-06, "loss": 1.05263691, "memory(GiB)": 142.32, "step": 109740, "train_speed(iter/s)": 0.286883 }, { "acc": 0.74729486, "epoch": 1.2276598154204073, "grad_norm": 5.96875, "learning_rate": 3.5521801973627856e-06, "loss": 1.01260462, "memory(GiB)": 142.32, "step": 109760, "train_speed(iter/s)": 0.286902 }, { "acc": 0.74006176, "epoch": 1.2278835143663658, "grad_norm": 5.59375, "learning_rate": 3.550410107278106e-06, "loss": 1.02843952, "memory(GiB)": 142.32, "step": 109780, "train_speed(iter/s)": 0.286919 }, { "acc": 0.730825, "epoch": 1.2281072133123243, "grad_norm": 5.25, "learning_rate": 3.548640215518043e-06, "loss": 1.06143436, "memory(GiB)": 142.32, "step": 109800, "train_speed(iter/s)": 0.286935 }, { "acc": 0.73091755, "epoch": 1.2283309122582828, "grad_norm": 7.4375, "learning_rate": 3.5468705223247426e-06, "loss": 1.0729332, "memory(GiB)": 142.32, "step": 109820, "train_speed(iter/s)": 0.286953 }, { "acc": 0.7442584, "epoch": 1.2285546112042414, "grad_norm": 6.03125, "learning_rate": 3.545101027940325e-06, "loss": 1.0146224, "memory(GiB)": 142.32, "step": 109840, "train_speed(iter/s)": 0.286972 }, { "acc": 0.73207021, "epoch": 1.2287783101502, "grad_norm": 5.96875, "learning_rate": 3.5433317326068817e-06, "loss": 1.07663097, "memory(GiB)": 142.32, "step": 109860, "train_speed(iter/s)": 0.28699 }, { "acc": 0.73682714, "epoch": 1.2290020090961584, "grad_norm": 5.90625, "learning_rate": 3.5415626365664792e-06, "loss": 1.04817505, "memory(GiB)": 142.32, "step": 109880, "train_speed(iter/s)": 0.287007 }, { "acc": 0.71883469, "epoch": 1.229225708042117, "grad_norm": 5.9375, "learning_rate": 3.5397937400611525e-06, "loss": 1.13189526, "memory(GiB)": 142.32, "step": 109900, "train_speed(iter/s)": 0.287025 }, { "acc": 0.73917489, "epoch": 1.2294494069880755, "grad_norm": 6.0, "learning_rate": 3.5380250433329146e-06, "loss": 1.04479771, "memory(GiB)": 142.32, "step": 109920, "train_speed(iter/s)": 0.287043 }, { "acc": 0.73661909, "epoch": 1.229673105934034, "grad_norm": 6.15625, "learning_rate": 3.536256546623746e-06, "loss": 1.04961338, "memory(GiB)": 142.32, "step": 109940, "train_speed(iter/s)": 0.287061 }, { "acc": 0.74940276, "epoch": 1.2298968048799925, "grad_norm": 6.71875, "learning_rate": 3.534488250175604e-06, "loss": 0.99451847, "memory(GiB)": 142.32, "step": 109960, "train_speed(iter/s)": 0.287079 }, { "acc": 0.73303471, "epoch": 1.230120503825951, "grad_norm": 5.0625, "learning_rate": 3.532720154230417e-06, "loss": 1.06702671, "memory(GiB)": 142.32, "step": 109980, "train_speed(iter/s)": 0.287096 }, { "acc": 0.74353685, "epoch": 1.2303442027719096, "grad_norm": 6.84375, "learning_rate": 3.5309522590300844e-06, "loss": 1.00866089, "memory(GiB)": 142.32, "step": 110000, "train_speed(iter/s)": 0.287114 }, { "epoch": 1.2303442027719096, "eval_acc": 0.6962143915471617, "eval_loss": 1.0722109079360962, "eval_runtime": 2342.1417, "eval_samples_per_second": 32.143, "eval_steps_per_second": 16.072, "step": 110000 }, { "acc": 0.72511721, "epoch": 1.2305679017178681, "grad_norm": 6.90625, "learning_rate": 3.5291845648164804e-06, "loss": 1.11176538, "memory(GiB)": 142.32, "step": 110020, "train_speed(iter/s)": 0.285349 }, { "acc": 0.74721136, "epoch": 1.2307916006638266, "grad_norm": 6.53125, "learning_rate": 3.5274170718314506e-06, "loss": 0.99002705, "memory(GiB)": 142.32, "step": 110040, "train_speed(iter/s)": 0.285366 }, { "acc": 0.73924942, "epoch": 1.2310152996097852, "grad_norm": 6.3125, "learning_rate": 3.525649780316813e-06, "loss": 1.04570189, "memory(GiB)": 142.32, "step": 110060, "train_speed(iter/s)": 0.285385 }, { "acc": 0.73068409, "epoch": 1.2312389985557437, "grad_norm": 5.75, "learning_rate": 3.5238826905143607e-06, "loss": 1.09131775, "memory(GiB)": 142.32, "step": 110080, "train_speed(iter/s)": 0.2854 }, { "acc": 0.74419279, "epoch": 1.2314626975017022, "grad_norm": 6.1875, "learning_rate": 3.5221158026658544e-06, "loss": 1.00099392, "memory(GiB)": 142.32, "step": 110100, "train_speed(iter/s)": 0.285417 }, { "acc": 0.741045, "epoch": 1.2316863964476608, "grad_norm": 5.625, "learning_rate": 3.52034911701303e-06, "loss": 1.03224239, "memory(GiB)": 142.32, "step": 110120, "train_speed(iter/s)": 0.285434 }, { "acc": 0.73303747, "epoch": 1.2319100953936193, "grad_norm": 6.78125, "learning_rate": 3.5185826337975947e-06, "loss": 1.03892965, "memory(GiB)": 142.32, "step": 110140, "train_speed(iter/s)": 0.285451 }, { "acc": 0.74112983, "epoch": 1.2321337943395778, "grad_norm": 4.75, "learning_rate": 3.51681635326123e-06, "loss": 1.01681623, "memory(GiB)": 142.32, "step": 110160, "train_speed(iter/s)": 0.285468 }, { "acc": 0.72555065, "epoch": 1.2323574932855363, "grad_norm": 6.21875, "learning_rate": 3.5150502756455862e-06, "loss": 1.11109734, "memory(GiB)": 142.32, "step": 110180, "train_speed(iter/s)": 0.285486 }, { "acc": 0.74940629, "epoch": 1.2325811922314949, "grad_norm": 7.65625, "learning_rate": 3.513284401192291e-06, "loss": 0.98548737, "memory(GiB)": 142.32, "step": 110200, "train_speed(iter/s)": 0.285504 }, { "acc": 0.74166317, "epoch": 1.2328048911774534, "grad_norm": 4.90625, "learning_rate": 3.51151873014294e-06, "loss": 1.01324272, "memory(GiB)": 142.32, "step": 110220, "train_speed(iter/s)": 0.285521 }, { "acc": 0.7467329, "epoch": 1.233028590123412, "grad_norm": 5.125, "learning_rate": 3.5097532627391014e-06, "loss": 1.01868238, "memory(GiB)": 142.32, "step": 110240, "train_speed(iter/s)": 0.285536 }, { "acc": 0.72821054, "epoch": 1.2332522890693705, "grad_norm": 6.53125, "learning_rate": 3.5079879992223164e-06, "loss": 1.07370186, "memory(GiB)": 142.32, "step": 110260, "train_speed(iter/s)": 0.285554 }, { "acc": 0.74002523, "epoch": 1.233475988015329, "grad_norm": 6.6875, "learning_rate": 3.5062229398340995e-06, "loss": 1.03334866, "memory(GiB)": 142.32, "step": 110280, "train_speed(iter/s)": 0.285573 }, { "acc": 0.7237031, "epoch": 1.2336996869612875, "grad_norm": 5.71875, "learning_rate": 3.5044580848159355e-06, "loss": 1.10640373, "memory(GiB)": 142.32, "step": 110300, "train_speed(iter/s)": 0.285592 }, { "acc": 0.73617125, "epoch": 1.233923385907246, "grad_norm": 6.34375, "learning_rate": 3.502693434409282e-06, "loss": 1.04109859, "memory(GiB)": 142.32, "step": 110320, "train_speed(iter/s)": 0.28561 }, { "acc": 0.74584169, "epoch": 1.2341470848532046, "grad_norm": 5.5625, "learning_rate": 3.5009289888555676e-06, "loss": 0.99803829, "memory(GiB)": 142.32, "step": 110340, "train_speed(iter/s)": 0.285626 }, { "acc": 0.74311171, "epoch": 1.234370783799163, "grad_norm": 5.9375, "learning_rate": 3.4991647483961945e-06, "loss": 1.00938644, "memory(GiB)": 142.32, "step": 110360, "train_speed(iter/s)": 0.285642 }, { "acc": 0.74298954, "epoch": 1.2345944827451216, "grad_norm": 5.78125, "learning_rate": 3.497400713272535e-06, "loss": 1.02031727, "memory(GiB)": 142.32, "step": 110380, "train_speed(iter/s)": 0.28566 }, { "acc": 0.74457507, "epoch": 1.2348181816910802, "grad_norm": 6.25, "learning_rate": 3.4956368837259357e-06, "loss": 1.01981421, "memory(GiB)": 142.32, "step": 110400, "train_speed(iter/s)": 0.285678 }, { "acc": 0.73953161, "epoch": 1.2350418806370387, "grad_norm": 6.6875, "learning_rate": 3.493873259997713e-06, "loss": 1.03038807, "memory(GiB)": 142.32, "step": 110420, "train_speed(iter/s)": 0.285695 }, { "acc": 0.74263687, "epoch": 1.2352655795829972, "grad_norm": 6.75, "learning_rate": 3.492109842329156e-06, "loss": 1.00886135, "memory(GiB)": 142.32, "step": 110440, "train_speed(iter/s)": 0.285713 }, { "acc": 0.73835397, "epoch": 1.2354892785289557, "grad_norm": 6.375, "learning_rate": 3.4903466309615254e-06, "loss": 1.03625116, "memory(GiB)": 142.32, "step": 110460, "train_speed(iter/s)": 0.28573 }, { "acc": 0.74104147, "epoch": 1.2357129774749143, "grad_norm": 6.53125, "learning_rate": 3.488583626136053e-06, "loss": 1.03394136, "memory(GiB)": 142.32, "step": 110480, "train_speed(iter/s)": 0.285747 }, { "acc": 0.7396193, "epoch": 1.2359366764208728, "grad_norm": 6.875, "learning_rate": 3.486820828093943e-06, "loss": 1.04031105, "memory(GiB)": 142.32, "step": 110500, "train_speed(iter/s)": 0.285765 }, { "acc": 0.73425674, "epoch": 1.2361603753668313, "grad_norm": 5.90625, "learning_rate": 3.4850582370763743e-06, "loss": 1.06258879, "memory(GiB)": 142.32, "step": 110520, "train_speed(iter/s)": 0.285781 }, { "acc": 0.72723007, "epoch": 1.2363840743127898, "grad_norm": 6.5, "learning_rate": 3.4832958533244897e-06, "loss": 1.08757601, "memory(GiB)": 142.32, "step": 110540, "train_speed(iter/s)": 0.285797 }, { "acc": 0.73339911, "epoch": 1.2366077732587484, "grad_norm": 5.9375, "learning_rate": 3.481533677079413e-06, "loss": 1.05248184, "memory(GiB)": 142.32, "step": 110560, "train_speed(iter/s)": 0.285815 }, { "acc": 0.71721535, "epoch": 1.236831472204707, "grad_norm": 5.625, "learning_rate": 3.4797717085822314e-06, "loss": 1.14870367, "memory(GiB)": 142.32, "step": 110580, "train_speed(iter/s)": 0.285834 }, { "acc": 0.73323278, "epoch": 1.2370551711506654, "grad_norm": 5.5625, "learning_rate": 3.4780099480740104e-06, "loss": 1.05096016, "memory(GiB)": 142.32, "step": 110600, "train_speed(iter/s)": 0.285851 }, { "acc": 0.73877263, "epoch": 1.237278870096624, "grad_norm": 5.46875, "learning_rate": 3.4762483957957834e-06, "loss": 1.04643955, "memory(GiB)": 142.32, "step": 110620, "train_speed(iter/s)": 0.285867 }, { "acc": 0.75018849, "epoch": 1.2375025690425825, "grad_norm": 6.21875, "learning_rate": 3.4744870519885544e-06, "loss": 0.98914089, "memory(GiB)": 142.32, "step": 110640, "train_speed(iter/s)": 0.285884 }, { "acc": 0.75031662, "epoch": 1.237726267988541, "grad_norm": 6.375, "learning_rate": 3.4727259168933002e-06, "loss": 0.9860074, "memory(GiB)": 142.32, "step": 110660, "train_speed(iter/s)": 0.2859 }, { "acc": 0.75428696, "epoch": 1.2379499669344995, "grad_norm": 7.125, "learning_rate": 3.470964990750971e-06, "loss": 0.97886639, "memory(GiB)": 142.32, "step": 110680, "train_speed(iter/s)": 0.285917 }, { "acc": 0.7325232, "epoch": 1.238173665880458, "grad_norm": 5.59375, "learning_rate": 3.4692042738024865e-06, "loss": 1.06938953, "memory(GiB)": 142.32, "step": 110700, "train_speed(iter/s)": 0.285935 }, { "acc": 0.73116064, "epoch": 1.2383973648264166, "grad_norm": 6.21875, "learning_rate": 3.4674437662887385e-06, "loss": 1.08895102, "memory(GiB)": 142.32, "step": 110720, "train_speed(iter/s)": 0.285951 }, { "acc": 0.7438858, "epoch": 1.2386210637723751, "grad_norm": 5.78125, "learning_rate": 3.465683468450587e-06, "loss": 1.01907511, "memory(GiB)": 142.32, "step": 110740, "train_speed(iter/s)": 0.285969 }, { "acc": 0.73425226, "epoch": 1.2388447627183337, "grad_norm": 6.09375, "learning_rate": 3.4639233805288676e-06, "loss": 1.07921495, "memory(GiB)": 142.32, "step": 110760, "train_speed(iter/s)": 0.285987 }, { "acc": 0.737328, "epoch": 1.2390684616642922, "grad_norm": 6.625, "learning_rate": 3.462163502764385e-06, "loss": 1.02993526, "memory(GiB)": 142.32, "step": 110780, "train_speed(iter/s)": 0.286004 }, { "acc": 0.74222021, "epoch": 1.2392921606102507, "grad_norm": 6.125, "learning_rate": 3.460403835397917e-06, "loss": 1.02256174, "memory(GiB)": 142.32, "step": 110800, "train_speed(iter/s)": 0.28602 }, { "acc": 0.73804626, "epoch": 1.2395158595562092, "grad_norm": 5.75, "learning_rate": 3.4586443786702106e-06, "loss": 1.04907942, "memory(GiB)": 142.32, "step": 110820, "train_speed(iter/s)": 0.28604 }, { "acc": 0.74384189, "epoch": 1.2397395585021678, "grad_norm": 6.40625, "learning_rate": 3.4568851328219834e-06, "loss": 1.01198978, "memory(GiB)": 142.32, "step": 110840, "train_speed(iter/s)": 0.286057 }, { "acc": 0.72400317, "epoch": 1.2399632574481263, "grad_norm": 6.21875, "learning_rate": 3.455126098093926e-06, "loss": 1.10896091, "memory(GiB)": 142.32, "step": 110860, "train_speed(iter/s)": 0.286074 }, { "acc": 0.73924332, "epoch": 1.2401869563940848, "grad_norm": 7.15625, "learning_rate": 3.4533672747267e-06, "loss": 1.04023361, "memory(GiB)": 142.32, "step": 110880, "train_speed(iter/s)": 0.28609 }, { "acc": 0.73837852, "epoch": 1.2404106553400434, "grad_norm": 4.4375, "learning_rate": 3.451608662960937e-06, "loss": 1.03252201, "memory(GiB)": 142.32, "step": 110900, "train_speed(iter/s)": 0.286107 }, { "acc": 0.73937497, "epoch": 1.2406343542860019, "grad_norm": 6.90625, "learning_rate": 3.449850263037241e-06, "loss": 1.04767895, "memory(GiB)": 142.32, "step": 110920, "train_speed(iter/s)": 0.286123 }, { "acc": 0.73621216, "epoch": 1.2408580532319604, "grad_norm": 5.15625, "learning_rate": 3.4480920751961853e-06, "loss": 1.05430059, "memory(GiB)": 142.32, "step": 110940, "train_speed(iter/s)": 0.286139 }, { "acc": 0.74603691, "epoch": 1.241081752177919, "grad_norm": 6.4375, "learning_rate": 3.4463340996783155e-06, "loss": 1.02580967, "memory(GiB)": 142.32, "step": 110960, "train_speed(iter/s)": 0.286155 }, { "acc": 0.73195882, "epoch": 1.2413054511238775, "grad_norm": 6.15625, "learning_rate": 3.4445763367241485e-06, "loss": 1.07827778, "memory(GiB)": 142.32, "step": 110980, "train_speed(iter/s)": 0.286171 }, { "acc": 0.72684474, "epoch": 1.241529150069836, "grad_norm": 6.25, "learning_rate": 3.4428187865741702e-06, "loss": 1.10467873, "memory(GiB)": 142.32, "step": 111000, "train_speed(iter/s)": 0.286188 }, { "acc": 0.73481579, "epoch": 1.2417528490157945, "grad_norm": 5.40625, "learning_rate": 3.4410614494688397e-06, "loss": 1.06188927, "memory(GiB)": 142.32, "step": 111020, "train_speed(iter/s)": 0.286204 }, { "acc": 0.73739195, "epoch": 1.241976547961753, "grad_norm": 6.125, "learning_rate": 3.439304325648585e-06, "loss": 1.05534573, "memory(GiB)": 142.32, "step": 111040, "train_speed(iter/s)": 0.286221 }, { "acc": 0.73363094, "epoch": 1.2422002469077116, "grad_norm": 5.3125, "learning_rate": 3.4375474153538064e-06, "loss": 1.08386612, "memory(GiB)": 142.32, "step": 111060, "train_speed(iter/s)": 0.286238 }, { "acc": 0.72961025, "epoch": 1.24242394585367, "grad_norm": 5.65625, "learning_rate": 3.435790718824873e-06, "loss": 1.0755249, "memory(GiB)": 142.32, "step": 111080, "train_speed(iter/s)": 0.286255 }, { "acc": 0.73575649, "epoch": 1.2426476447996286, "grad_norm": 6.40625, "learning_rate": 3.434034236302127e-06, "loss": 1.04173584, "memory(GiB)": 142.32, "step": 111100, "train_speed(iter/s)": 0.286273 }, { "acc": 0.74787245, "epoch": 1.2428713437455872, "grad_norm": 5.78125, "learning_rate": 3.4322779680258822e-06, "loss": 1.00357809, "memory(GiB)": 142.32, "step": 111120, "train_speed(iter/s)": 0.286292 }, { "acc": 0.74421535, "epoch": 1.2430950426915457, "grad_norm": 5.375, "learning_rate": 3.4305219142364176e-06, "loss": 1.01580162, "memory(GiB)": 142.32, "step": 111140, "train_speed(iter/s)": 0.28631 }, { "acc": 0.73263788, "epoch": 1.2433187416375042, "grad_norm": 6.59375, "learning_rate": 3.428766075173988e-06, "loss": 1.0754015, "memory(GiB)": 142.32, "step": 111160, "train_speed(iter/s)": 0.286328 }, { "acc": 0.73856192, "epoch": 1.2435424405834627, "grad_norm": 6.875, "learning_rate": 3.4270104510788184e-06, "loss": 1.05288982, "memory(GiB)": 142.32, "step": 111180, "train_speed(iter/s)": 0.286347 }, { "acc": 0.7457901, "epoch": 1.2437661395294213, "grad_norm": 5.5625, "learning_rate": 3.4252550421911015e-06, "loss": 1.00056324, "memory(GiB)": 142.32, "step": 111200, "train_speed(iter/s)": 0.286366 }, { "acc": 0.7392724, "epoch": 1.2439898384753798, "grad_norm": 5.9375, "learning_rate": 3.423499848751004e-06, "loss": 1.04167576, "memory(GiB)": 142.32, "step": 111220, "train_speed(iter/s)": 0.286382 }, { "acc": 0.74469719, "epoch": 1.2442135374213383, "grad_norm": 5.21875, "learning_rate": 3.42174487099866e-06, "loss": 1.03300409, "memory(GiB)": 142.32, "step": 111240, "train_speed(iter/s)": 0.286401 }, { "acc": 0.73553038, "epoch": 1.2444372363672969, "grad_norm": 6.59375, "learning_rate": 3.419990109174176e-06, "loss": 1.05742598, "memory(GiB)": 142.32, "step": 111260, "train_speed(iter/s)": 0.28642 }, { "acc": 0.74355392, "epoch": 1.2446609353132554, "grad_norm": 6.15625, "learning_rate": 3.41823556351763e-06, "loss": 1.00860558, "memory(GiB)": 142.32, "step": 111280, "train_speed(iter/s)": 0.286439 }, { "acc": 0.72328663, "epoch": 1.244884634259214, "grad_norm": 5.84375, "learning_rate": 3.416481234269066e-06, "loss": 1.10058327, "memory(GiB)": 142.32, "step": 111300, "train_speed(iter/s)": 0.286456 }, { "acc": 0.74755645, "epoch": 1.2451083332051724, "grad_norm": 6.0, "learning_rate": 3.414727121668503e-06, "loss": 1.00098495, "memory(GiB)": 142.32, "step": 111320, "train_speed(iter/s)": 0.286473 }, { "acc": 0.73976741, "epoch": 1.245332032151131, "grad_norm": 6.1875, "learning_rate": 3.412973225955929e-06, "loss": 1.06550379, "memory(GiB)": 142.32, "step": 111340, "train_speed(iter/s)": 0.286492 }, { "acc": 0.73592124, "epoch": 1.2455557310970895, "grad_norm": 7.53125, "learning_rate": 3.4112195473713015e-06, "loss": 1.05996265, "memory(GiB)": 142.32, "step": 111360, "train_speed(iter/s)": 0.28651 }, { "acc": 0.73149071, "epoch": 1.245779430043048, "grad_norm": 5.40625, "learning_rate": 3.409466086154548e-06, "loss": 1.06628971, "memory(GiB)": 142.32, "step": 111380, "train_speed(iter/s)": 0.286528 }, { "acc": 0.73980722, "epoch": 1.2460031289890066, "grad_norm": 5.53125, "learning_rate": 3.4077128425455686e-06, "loss": 1.029389, "memory(GiB)": 142.32, "step": 111400, "train_speed(iter/s)": 0.286544 }, { "acc": 0.73299146, "epoch": 1.246226827934965, "grad_norm": 5.40625, "learning_rate": 3.405959816784231e-06, "loss": 1.06685181, "memory(GiB)": 142.32, "step": 111420, "train_speed(iter/s)": 0.286561 }, { "acc": 0.72803679, "epoch": 1.2464505268809236, "grad_norm": 5.78125, "learning_rate": 3.404207009110374e-06, "loss": 1.09183779, "memory(GiB)": 142.32, "step": 111440, "train_speed(iter/s)": 0.286579 }, { "acc": 0.73968468, "epoch": 1.2466742258268821, "grad_norm": 7.125, "learning_rate": 3.4024544197638085e-06, "loss": 1.02962723, "memory(GiB)": 142.32, "step": 111460, "train_speed(iter/s)": 0.286597 }, { "acc": 0.72629185, "epoch": 1.2468979247728407, "grad_norm": 6.25, "learning_rate": 3.400702048984312e-06, "loss": 1.09218807, "memory(GiB)": 142.32, "step": 111480, "train_speed(iter/s)": 0.286614 }, { "acc": 0.73935061, "epoch": 1.2471216237187992, "grad_norm": 5.375, "learning_rate": 3.3989498970116347e-06, "loss": 1.04253111, "memory(GiB)": 142.32, "step": 111500, "train_speed(iter/s)": 0.286631 }, { "acc": 0.73874893, "epoch": 1.2473453226647577, "grad_norm": 6.53125, "learning_rate": 3.3971979640854954e-06, "loss": 1.0425499, "memory(GiB)": 142.32, "step": 111520, "train_speed(iter/s)": 0.286648 }, { "acc": 0.73375793, "epoch": 1.2475690216107163, "grad_norm": 6.59375, "learning_rate": 3.3954462504455838e-06, "loss": 1.05262585, "memory(GiB)": 142.32, "step": 111540, "train_speed(iter/s)": 0.286664 }, { "acc": 0.73733864, "epoch": 1.2477927205566748, "grad_norm": 5.9375, "learning_rate": 3.3936947563315603e-06, "loss": 1.05176811, "memory(GiB)": 142.32, "step": 111560, "train_speed(iter/s)": 0.286681 }, { "acc": 0.7424387, "epoch": 1.2480164195026333, "grad_norm": 5.90625, "learning_rate": 3.391943481983053e-06, "loss": 1.01087694, "memory(GiB)": 142.32, "step": 111580, "train_speed(iter/s)": 0.286698 }, { "acc": 0.74506855, "epoch": 1.2482401184485918, "grad_norm": 6.03125, "learning_rate": 3.3901924276396614e-06, "loss": 1.00625067, "memory(GiB)": 142.32, "step": 111600, "train_speed(iter/s)": 0.286715 }, { "acc": 0.74152908, "epoch": 1.2484638173945504, "grad_norm": 6.0, "learning_rate": 3.3884415935409555e-06, "loss": 1.02264328, "memory(GiB)": 142.32, "step": 111620, "train_speed(iter/s)": 0.286732 }, { "acc": 0.73164978, "epoch": 1.248687516340509, "grad_norm": 6.125, "learning_rate": 3.3866909799264737e-06, "loss": 1.08305759, "memory(GiB)": 142.32, "step": 111640, "train_speed(iter/s)": 0.286749 }, { "acc": 0.73607411, "epoch": 1.2489112152864674, "grad_norm": 5.40625, "learning_rate": 3.3849405870357265e-06, "loss": 1.04325256, "memory(GiB)": 142.32, "step": 111660, "train_speed(iter/s)": 0.286768 }, { "acc": 0.73666744, "epoch": 1.249134914232426, "grad_norm": 5.34375, "learning_rate": 3.383190415108191e-06, "loss": 1.04401379, "memory(GiB)": 142.32, "step": 111680, "train_speed(iter/s)": 0.286785 }, { "acc": 0.73630972, "epoch": 1.2493586131783845, "grad_norm": 5.65625, "learning_rate": 3.3814404643833156e-06, "loss": 1.03354626, "memory(GiB)": 142.32, "step": 111700, "train_speed(iter/s)": 0.286803 }, { "acc": 0.73126621, "epoch": 1.249582312124343, "grad_norm": 6.09375, "learning_rate": 3.379690735100519e-06, "loss": 1.06254654, "memory(GiB)": 142.32, "step": 111720, "train_speed(iter/s)": 0.28682 }, { "acc": 0.74740791, "epoch": 1.2498060110703015, "grad_norm": 6.65625, "learning_rate": 3.37794122749919e-06, "loss": 0.99714746, "memory(GiB)": 142.32, "step": 111740, "train_speed(iter/s)": 0.286836 }, { "acc": 0.73046808, "epoch": 1.25002971001626, "grad_norm": 5.625, "learning_rate": 3.376191941818686e-06, "loss": 1.06797485, "memory(GiB)": 142.32, "step": 111760, "train_speed(iter/s)": 0.286855 }, { "acc": 0.73097229, "epoch": 1.2502534089622186, "grad_norm": 5.9375, "learning_rate": 3.374442878298334e-06, "loss": 1.0679945, "memory(GiB)": 142.32, "step": 111780, "train_speed(iter/s)": 0.286871 }, { "acc": 0.74082308, "epoch": 1.2504771079081771, "grad_norm": 5.78125, "learning_rate": 3.37269403717743e-06, "loss": 1.03292265, "memory(GiB)": 142.32, "step": 111800, "train_speed(iter/s)": 0.28689 }, { "acc": 0.74020348, "epoch": 1.2507008068541356, "grad_norm": 5.9375, "learning_rate": 3.3709454186952417e-06, "loss": 1.0339098, "memory(GiB)": 142.32, "step": 111820, "train_speed(iter/s)": 0.286905 }, { "acc": 0.74062972, "epoch": 1.2509245058000942, "grad_norm": 5.65625, "learning_rate": 3.369197023091004e-06, "loss": 1.02876368, "memory(GiB)": 142.32, "step": 111840, "train_speed(iter/s)": 0.286923 }, { "acc": 0.75137458, "epoch": 1.2511482047460527, "grad_norm": 6.4375, "learning_rate": 3.367448850603925e-06, "loss": 0.98911934, "memory(GiB)": 142.32, "step": 111860, "train_speed(iter/s)": 0.28694 }, { "acc": 0.73949833, "epoch": 1.2513719036920112, "grad_norm": 5.78125, "learning_rate": 3.3657009014731763e-06, "loss": 1.03574171, "memory(GiB)": 142.32, "step": 111880, "train_speed(iter/s)": 0.286958 }, { "acc": 0.74009733, "epoch": 1.2515956026379698, "grad_norm": 6.65625, "learning_rate": 3.3639531759379035e-06, "loss": 1.02917099, "memory(GiB)": 142.32, "step": 111900, "train_speed(iter/s)": 0.286975 }, { "acc": 0.75117974, "epoch": 1.2518193015839283, "grad_norm": 5.8125, "learning_rate": 3.362205674237221e-06, "loss": 0.98459206, "memory(GiB)": 142.32, "step": 111920, "train_speed(iter/s)": 0.286993 }, { "acc": 0.73408551, "epoch": 1.2520430005298868, "grad_norm": 6.09375, "learning_rate": 3.3604583966102124e-06, "loss": 1.0503809, "memory(GiB)": 142.32, "step": 111940, "train_speed(iter/s)": 0.28701 }, { "acc": 0.73105831, "epoch": 1.2522666994758453, "grad_norm": 6.40625, "learning_rate": 3.3587113432959295e-06, "loss": 1.06208639, "memory(GiB)": 142.32, "step": 111960, "train_speed(iter/s)": 0.287028 }, { "acc": 0.73294163, "epoch": 1.2524903984218039, "grad_norm": 6.1875, "learning_rate": 3.356964514533394e-06, "loss": 1.05307093, "memory(GiB)": 142.32, "step": 111980, "train_speed(iter/s)": 0.287046 }, { "acc": 0.74025469, "epoch": 1.2527140973677624, "grad_norm": 5.71875, "learning_rate": 3.355217910561597e-06, "loss": 1.04375238, "memory(GiB)": 142.32, "step": 112000, "train_speed(iter/s)": 0.287064 }, { "epoch": 1.2527140973677624, "eval_acc": 0.6962215395621432, "eval_loss": 1.0720889568328857, "eval_runtime": 2344.2676, "eval_samples_per_second": 32.114, "eval_steps_per_second": 16.057, "step": 112000 }, { "acc": 0.72952185, "epoch": 1.252937796313721, "grad_norm": 6.15625, "learning_rate": 3.3534715316194986e-06, "loss": 1.08991928, "memory(GiB)": 142.32, "step": 112020, "train_speed(iter/s)": 0.285328 }, { "acc": 0.72924356, "epoch": 1.2531614952596795, "grad_norm": 5.96875, "learning_rate": 3.35172537794603e-06, "loss": 1.06964293, "memory(GiB)": 142.32, "step": 112040, "train_speed(iter/s)": 0.285346 }, { "acc": 0.74547806, "epoch": 1.253385194205638, "grad_norm": 5.34375, "learning_rate": 3.34997944978009e-06, "loss": 0.99134731, "memory(GiB)": 142.32, "step": 112060, "train_speed(iter/s)": 0.285366 }, { "acc": 0.73410721, "epoch": 1.2536088931515965, "grad_norm": 5.34375, "learning_rate": 3.3482337473605435e-06, "loss": 1.07371807, "memory(GiB)": 142.32, "step": 112080, "train_speed(iter/s)": 0.285383 }, { "acc": 0.7451591, "epoch": 1.253832592097555, "grad_norm": 4.4375, "learning_rate": 3.34648827092623e-06, "loss": 1.00341606, "memory(GiB)": 142.32, "step": 112100, "train_speed(iter/s)": 0.285399 }, { "acc": 0.73450594, "epoch": 1.2540562910435136, "grad_norm": 6.53125, "learning_rate": 3.344743020715955e-06, "loss": 1.05145435, "memory(GiB)": 142.32, "step": 112120, "train_speed(iter/s)": 0.285417 }, { "acc": 0.73440971, "epoch": 1.254279989989472, "grad_norm": 7.53125, "learning_rate": 3.3429979969684944e-06, "loss": 1.04619884, "memory(GiB)": 142.32, "step": 112140, "train_speed(iter/s)": 0.285436 }, { "acc": 0.73075762, "epoch": 1.2545036889354306, "grad_norm": 7.4375, "learning_rate": 3.3412531999225928e-06, "loss": 1.0717804, "memory(GiB)": 142.32, "step": 112160, "train_speed(iter/s)": 0.285454 }, { "acc": 0.73275433, "epoch": 1.2547273878813892, "grad_norm": 7.09375, "learning_rate": 3.339508629816961e-06, "loss": 1.06010628, "memory(GiB)": 142.32, "step": 112180, "train_speed(iter/s)": 0.285472 }, { "acc": 0.72921352, "epoch": 1.2549510868273477, "grad_norm": 5.84375, "learning_rate": 3.3377642868902827e-06, "loss": 1.08276997, "memory(GiB)": 142.32, "step": 112200, "train_speed(iter/s)": 0.28549 }, { "acc": 0.73894806, "epoch": 1.2551747857733062, "grad_norm": 6.5, "learning_rate": 3.336020171381209e-06, "loss": 1.04272957, "memory(GiB)": 142.32, "step": 112220, "train_speed(iter/s)": 0.285508 }, { "acc": 0.73905325, "epoch": 1.2553984847192647, "grad_norm": 6.28125, "learning_rate": 3.3342762835283593e-06, "loss": 1.04037991, "memory(GiB)": 142.32, "step": 112240, "train_speed(iter/s)": 0.285525 }, { "acc": 0.73266697, "epoch": 1.2556221836652233, "grad_norm": 6.0, "learning_rate": 3.3325326235703235e-06, "loss": 1.06359577, "memory(GiB)": 142.32, "step": 112260, "train_speed(iter/s)": 0.285543 }, { "acc": 0.73938508, "epoch": 1.2558458826111818, "grad_norm": 8.4375, "learning_rate": 3.3307891917456573e-06, "loss": 1.03022213, "memory(GiB)": 142.32, "step": 112280, "train_speed(iter/s)": 0.285559 }, { "acc": 0.74370427, "epoch": 1.2560695815571403, "grad_norm": 6.90625, "learning_rate": 3.329045988292889e-06, "loss": 1.00787554, "memory(GiB)": 142.32, "step": 112300, "train_speed(iter/s)": 0.285575 }, { "acc": 0.74757614, "epoch": 1.2562932805030989, "grad_norm": 5.9375, "learning_rate": 3.3273030134505124e-06, "loss": 0.99444714, "memory(GiB)": 142.32, "step": 112320, "train_speed(iter/s)": 0.285592 }, { "acc": 0.73391824, "epoch": 1.2565169794490574, "grad_norm": 7.1875, "learning_rate": 3.325560267456992e-06, "loss": 1.05678062, "memory(GiB)": 142.32, "step": 112340, "train_speed(iter/s)": 0.285608 }, { "acc": 0.74781256, "epoch": 1.256740678395016, "grad_norm": 5.6875, "learning_rate": 3.323817750550761e-06, "loss": 1.00010815, "memory(GiB)": 142.32, "step": 112360, "train_speed(iter/s)": 0.285625 }, { "acc": 0.74301338, "epoch": 1.2569643773409744, "grad_norm": 6.78125, "learning_rate": 3.322075462970219e-06, "loss": 1.00986519, "memory(GiB)": 142.32, "step": 112380, "train_speed(iter/s)": 0.285643 }, { "acc": 0.74264865, "epoch": 1.257188076286933, "grad_norm": 7.03125, "learning_rate": 3.3203334049537373e-06, "loss": 1.01612091, "memory(GiB)": 142.32, "step": 112400, "train_speed(iter/s)": 0.285662 }, { "acc": 0.75118217, "epoch": 1.2574117752328915, "grad_norm": 6.3125, "learning_rate": 3.318591576739653e-06, "loss": 0.98630466, "memory(GiB)": 142.32, "step": 112420, "train_speed(iter/s)": 0.285683 }, { "acc": 0.74855976, "epoch": 1.25763547417885, "grad_norm": 8.25, "learning_rate": 3.3168499785662745e-06, "loss": 0.98711796, "memory(GiB)": 142.32, "step": 112440, "train_speed(iter/s)": 0.285701 }, { "acc": 0.74661274, "epoch": 1.2578591731248085, "grad_norm": 7.25, "learning_rate": 3.3151086106718783e-06, "loss": 1.00467911, "memory(GiB)": 142.32, "step": 112460, "train_speed(iter/s)": 0.285719 }, { "acc": 0.75315046, "epoch": 1.258082872070767, "grad_norm": 5.625, "learning_rate": 3.313367473294705e-06, "loss": 0.97882538, "memory(GiB)": 142.32, "step": 112480, "train_speed(iter/s)": 0.285738 }, { "acc": 0.72766132, "epoch": 1.2583065710167256, "grad_norm": 5.84375, "learning_rate": 3.3116265666729687e-06, "loss": 1.09987516, "memory(GiB)": 142.32, "step": 112500, "train_speed(iter/s)": 0.285755 }, { "acc": 0.73642797, "epoch": 1.2585302699626841, "grad_norm": 6.375, "learning_rate": 3.3098858910448517e-06, "loss": 1.05080528, "memory(GiB)": 142.32, "step": 112520, "train_speed(iter/s)": 0.285773 }, { "acc": 0.73630919, "epoch": 1.2587539689086427, "grad_norm": 5.5625, "learning_rate": 3.3081454466485007e-06, "loss": 1.04615459, "memory(GiB)": 142.32, "step": 112540, "train_speed(iter/s)": 0.285791 }, { "acc": 0.73845239, "epoch": 1.2589776678546012, "grad_norm": 4.96875, "learning_rate": 3.3064052337220355e-06, "loss": 1.04756012, "memory(GiB)": 142.32, "step": 112560, "train_speed(iter/s)": 0.285807 }, { "acc": 0.73824902, "epoch": 1.2592013668005597, "grad_norm": 6.21875, "learning_rate": 3.3046652525035404e-06, "loss": 1.03928499, "memory(GiB)": 142.32, "step": 112580, "train_speed(iter/s)": 0.285823 }, { "acc": 0.74476638, "epoch": 1.2594250657465182, "grad_norm": 5.03125, "learning_rate": 3.3029255032310715e-06, "loss": 1.00402145, "memory(GiB)": 142.32, "step": 112600, "train_speed(iter/s)": 0.28584 }, { "acc": 0.74206562, "epoch": 1.2596487646924768, "grad_norm": 5.78125, "learning_rate": 3.301185986142651e-06, "loss": 1.01597509, "memory(GiB)": 142.32, "step": 112620, "train_speed(iter/s)": 0.285857 }, { "acc": 0.7372829, "epoch": 1.2598724636384353, "grad_norm": 4.8125, "learning_rate": 3.299446701476269e-06, "loss": 1.02948399, "memory(GiB)": 142.32, "step": 112640, "train_speed(iter/s)": 0.285876 }, { "acc": 0.73704557, "epoch": 1.2600961625843938, "grad_norm": 5.46875, "learning_rate": 3.297707649469884e-06, "loss": 1.03856869, "memory(GiB)": 142.32, "step": 112660, "train_speed(iter/s)": 0.285894 }, { "acc": 0.74498391, "epoch": 1.2603198615303524, "grad_norm": 7.21875, "learning_rate": 3.295968830361424e-06, "loss": 1.02227421, "memory(GiB)": 142.32, "step": 112680, "train_speed(iter/s)": 0.28591 }, { "acc": 0.73787994, "epoch": 1.2605435604763109, "grad_norm": 6.25, "learning_rate": 3.294230244388784e-06, "loss": 1.04374304, "memory(GiB)": 142.32, "step": 112700, "train_speed(iter/s)": 0.285927 }, { "acc": 0.73762088, "epoch": 1.2607672594222694, "grad_norm": 6.40625, "learning_rate": 3.2924918917898296e-06, "loss": 1.03382854, "memory(GiB)": 142.32, "step": 112720, "train_speed(iter/s)": 0.285942 }, { "acc": 0.74220085, "epoch": 1.260990958368228, "grad_norm": 6.59375, "learning_rate": 3.2907537728023887e-06, "loss": 1.00774536, "memory(GiB)": 142.32, "step": 112740, "train_speed(iter/s)": 0.28596 }, { "acc": 0.73249245, "epoch": 1.2612146573141865, "grad_norm": 6.46875, "learning_rate": 3.2890158876642618e-06, "loss": 1.08239021, "memory(GiB)": 142.32, "step": 112760, "train_speed(iter/s)": 0.285976 }, { "acc": 0.7340785, "epoch": 1.261438356260145, "grad_norm": 6.5, "learning_rate": 3.2872782366132185e-06, "loss": 1.0460125, "memory(GiB)": 142.32, "step": 112780, "train_speed(iter/s)": 0.285993 }, { "acc": 0.74276915, "epoch": 1.2616620552061035, "grad_norm": 5.78125, "learning_rate": 3.2855408198869922e-06, "loss": 1.01998644, "memory(GiB)": 142.32, "step": 112800, "train_speed(iter/s)": 0.28601 }, { "acc": 0.73702545, "epoch": 1.261885754152062, "grad_norm": 5.25, "learning_rate": 3.2838036377232875e-06, "loss": 1.05553493, "memory(GiB)": 142.32, "step": 112820, "train_speed(iter/s)": 0.286027 }, { "acc": 0.74589863, "epoch": 1.2621094530980206, "grad_norm": 6.09375, "learning_rate": 3.2820666903597747e-06, "loss": 1.01787109, "memory(GiB)": 142.32, "step": 112840, "train_speed(iter/s)": 0.286044 }, { "acc": 0.72985439, "epoch": 1.2623331520439791, "grad_norm": 6.25, "learning_rate": 3.2803299780340938e-06, "loss": 1.07388439, "memory(GiB)": 142.32, "step": 112860, "train_speed(iter/s)": 0.286061 }, { "acc": 0.73326035, "epoch": 1.2625568509899376, "grad_norm": 6.0625, "learning_rate": 3.278593500983851e-06, "loss": 1.06179695, "memory(GiB)": 142.32, "step": 112880, "train_speed(iter/s)": 0.286078 }, { "acc": 0.73512001, "epoch": 1.2627805499358962, "grad_norm": 6.625, "learning_rate": 3.2768572594466227e-06, "loss": 1.04871693, "memory(GiB)": 142.32, "step": 112900, "train_speed(iter/s)": 0.286095 }, { "acc": 0.74074526, "epoch": 1.2630042488818547, "grad_norm": 5.34375, "learning_rate": 3.275121253659951e-06, "loss": 1.0238781, "memory(GiB)": 142.32, "step": 112920, "train_speed(iter/s)": 0.286114 }, { "acc": 0.73773923, "epoch": 1.2632279478278132, "grad_norm": 5.65625, "learning_rate": 3.2733854838613455e-06, "loss": 1.05300198, "memory(GiB)": 142.32, "step": 112940, "train_speed(iter/s)": 0.286131 }, { "acc": 0.72812524, "epoch": 1.2634516467737718, "grad_norm": 6.46875, "learning_rate": 3.271649950288284e-06, "loss": 1.09114971, "memory(GiB)": 142.32, "step": 112960, "train_speed(iter/s)": 0.286151 }, { "acc": 0.74694772, "epoch": 1.2636753457197303, "grad_norm": 5.5, "learning_rate": 3.269914653178214e-06, "loss": 1.00324221, "memory(GiB)": 142.32, "step": 112980, "train_speed(iter/s)": 0.286167 }, { "acc": 0.73661952, "epoch": 1.2638990446656888, "grad_norm": 4.90625, "learning_rate": 3.2681795927685477e-06, "loss": 1.05610008, "memory(GiB)": 142.32, "step": 113000, "train_speed(iter/s)": 0.286184 }, { "acc": 0.73665981, "epoch": 1.2641227436116473, "grad_norm": 5.4375, "learning_rate": 3.266444769296667e-06, "loss": 1.05765467, "memory(GiB)": 142.32, "step": 113020, "train_speed(iter/s)": 0.286202 }, { "acc": 0.73216228, "epoch": 1.2643464425576059, "grad_norm": 7.125, "learning_rate": 3.26471018299992e-06, "loss": 1.06046724, "memory(GiB)": 142.32, "step": 113040, "train_speed(iter/s)": 0.286218 }, { "acc": 0.7460351, "epoch": 1.2645701415035644, "grad_norm": 5.0625, "learning_rate": 3.2629758341156227e-06, "loss": 1.00223541, "memory(GiB)": 142.32, "step": 113060, "train_speed(iter/s)": 0.286235 }, { "acc": 0.74720964, "epoch": 1.264793840449523, "grad_norm": 6.5625, "learning_rate": 3.261241722881059e-06, "loss": 1.00006256, "memory(GiB)": 142.32, "step": 113080, "train_speed(iter/s)": 0.286252 }, { "acc": 0.7366004, "epoch": 1.2650175393954814, "grad_norm": 6.59375, "learning_rate": 3.25950784953348e-06, "loss": 1.0464201, "memory(GiB)": 142.32, "step": 113100, "train_speed(iter/s)": 0.28627 }, { "acc": 0.72956352, "epoch": 1.26524123834144, "grad_norm": 5.4375, "learning_rate": 3.2577742143101053e-06, "loss": 1.07013645, "memory(GiB)": 142.32, "step": 113120, "train_speed(iter/s)": 0.286288 }, { "acc": 0.7387538, "epoch": 1.2654649372873985, "grad_norm": 5.65625, "learning_rate": 3.2560408174481202e-06, "loss": 1.03755398, "memory(GiB)": 142.32, "step": 113140, "train_speed(iter/s)": 0.286307 }, { "acc": 0.74083591, "epoch": 1.265688636233357, "grad_norm": 5.09375, "learning_rate": 3.254307659184678e-06, "loss": 1.05022478, "memory(GiB)": 142.32, "step": 113160, "train_speed(iter/s)": 0.286325 }, { "acc": 0.75146599, "epoch": 1.2659123351793156, "grad_norm": 7.96875, "learning_rate": 3.2525747397568984e-06, "loss": 0.97334633, "memory(GiB)": 142.32, "step": 113180, "train_speed(iter/s)": 0.286341 }, { "acc": 0.73611031, "epoch": 1.266136034125274, "grad_norm": 5.53125, "learning_rate": 3.2508420594018723e-06, "loss": 1.0468462, "memory(GiB)": 142.32, "step": 113200, "train_speed(iter/s)": 0.286359 }, { "acc": 0.73988886, "epoch": 1.2663597330712326, "grad_norm": 5.4375, "learning_rate": 3.249109618356654e-06, "loss": 1.01823912, "memory(GiB)": 142.32, "step": 113220, "train_speed(iter/s)": 0.286373 }, { "acc": 0.74347315, "epoch": 1.2665834320171911, "grad_norm": 6.375, "learning_rate": 3.247377416858265e-06, "loss": 1.02431736, "memory(GiB)": 142.32, "step": 113240, "train_speed(iter/s)": 0.286391 }, { "acc": 0.73584442, "epoch": 1.2668071309631497, "grad_norm": 5.90625, "learning_rate": 3.2456454551436967e-06, "loss": 1.049928, "memory(GiB)": 142.32, "step": 113260, "train_speed(iter/s)": 0.286408 }, { "acc": 0.72945108, "epoch": 1.2670308299091082, "grad_norm": 6.78125, "learning_rate": 3.243913733449905e-06, "loss": 1.08268223, "memory(GiB)": 142.32, "step": 113280, "train_speed(iter/s)": 0.286425 }, { "acc": 0.72974238, "epoch": 1.2672545288550667, "grad_norm": 7.34375, "learning_rate": 3.242182252013815e-06, "loss": 1.06321268, "memory(GiB)": 142.32, "step": 113300, "train_speed(iter/s)": 0.286443 }, { "acc": 0.7358325, "epoch": 1.2674782278010253, "grad_norm": 6.3125, "learning_rate": 3.2404510110723192e-06, "loss": 1.05199747, "memory(GiB)": 142.32, "step": 113320, "train_speed(iter/s)": 0.286462 }, { "acc": 0.72385821, "epoch": 1.2677019267469838, "grad_norm": 7.84375, "learning_rate": 3.2387200108622736e-06, "loss": 1.10573807, "memory(GiB)": 142.32, "step": 113340, "train_speed(iter/s)": 0.286479 }, { "acc": 0.72818174, "epoch": 1.2679256256929423, "grad_norm": 6.1875, "learning_rate": 3.2369892516205047e-06, "loss": 1.10172472, "memory(GiB)": 142.32, "step": 113360, "train_speed(iter/s)": 0.286496 }, { "acc": 0.7440239, "epoch": 1.2681493246389008, "grad_norm": 7.125, "learning_rate": 3.235258733583806e-06, "loss": 1.01817017, "memory(GiB)": 142.32, "step": 113380, "train_speed(iter/s)": 0.286514 }, { "acc": 0.73166208, "epoch": 1.2683730235848594, "grad_norm": 6.875, "learning_rate": 3.233528456988936e-06, "loss": 1.0536377, "memory(GiB)": 142.32, "step": 113400, "train_speed(iter/s)": 0.286532 }, { "acc": 0.74847927, "epoch": 1.268596722530818, "grad_norm": 5.71875, "learning_rate": 3.231798422072623e-06, "loss": 0.98565655, "memory(GiB)": 142.32, "step": 113420, "train_speed(iter/s)": 0.286549 }, { "acc": 0.7450613, "epoch": 1.2688204214767764, "grad_norm": 6.25, "learning_rate": 3.2300686290715584e-06, "loss": 1.00361042, "memory(GiB)": 142.32, "step": 113440, "train_speed(iter/s)": 0.286567 }, { "acc": 0.73950353, "epoch": 1.269044120422735, "grad_norm": 6.46875, "learning_rate": 3.2283390782224035e-06, "loss": 1.04320698, "memory(GiB)": 142.32, "step": 113460, "train_speed(iter/s)": 0.286584 }, { "acc": 0.750805, "epoch": 1.2692678193686935, "grad_norm": 6.90625, "learning_rate": 3.226609769761785e-06, "loss": 0.98981028, "memory(GiB)": 142.32, "step": 113480, "train_speed(iter/s)": 0.2866 }, { "acc": 0.73108249, "epoch": 1.269491518314652, "grad_norm": 6.53125, "learning_rate": 3.224880703926298e-06, "loss": 1.0701561, "memory(GiB)": 142.32, "step": 113500, "train_speed(iter/s)": 0.286619 }, { "acc": 0.74638186, "epoch": 1.2697152172606105, "grad_norm": 7.40625, "learning_rate": 3.223151880952504e-06, "loss": 1.00194435, "memory(GiB)": 142.32, "step": 113520, "train_speed(iter/s)": 0.286636 }, { "acc": 0.72232604, "epoch": 1.269938916206569, "grad_norm": 6.21875, "learning_rate": 3.221423301076929e-06, "loss": 1.09270287, "memory(GiB)": 142.32, "step": 113540, "train_speed(iter/s)": 0.286653 }, { "acc": 0.72690935, "epoch": 1.2701626151525276, "grad_norm": 6.0, "learning_rate": 3.2196949645360675e-06, "loss": 1.097192, "memory(GiB)": 142.32, "step": 113560, "train_speed(iter/s)": 0.286669 }, { "acc": 0.72703567, "epoch": 1.2703863140984861, "grad_norm": 6.375, "learning_rate": 3.2179668715663814e-06, "loss": 1.0922574, "memory(GiB)": 142.32, "step": 113580, "train_speed(iter/s)": 0.286687 }, { "acc": 0.74506383, "epoch": 1.2706100130444447, "grad_norm": 7.03125, "learning_rate": 3.2162390224042987e-06, "loss": 1.00225964, "memory(GiB)": 142.32, "step": 113600, "train_speed(iter/s)": 0.286703 }, { "acc": 0.73376293, "epoch": 1.2708337119904032, "grad_norm": 7.03125, "learning_rate": 3.2145114172862147e-06, "loss": 1.05579138, "memory(GiB)": 142.32, "step": 113620, "train_speed(iter/s)": 0.286719 }, { "acc": 0.73768034, "epoch": 1.2710574109363617, "grad_norm": 4.6875, "learning_rate": 3.2127840564484893e-06, "loss": 1.03813248, "memory(GiB)": 142.32, "step": 113640, "train_speed(iter/s)": 0.286736 }, { "acc": 0.73002672, "epoch": 1.2712811098823202, "grad_norm": 6.75, "learning_rate": 3.2110569401274494e-06, "loss": 1.08574715, "memory(GiB)": 142.32, "step": 113660, "train_speed(iter/s)": 0.286753 }, { "acc": 0.7310451, "epoch": 1.2715048088282788, "grad_norm": 5.5, "learning_rate": 3.2093300685593896e-06, "loss": 1.06890488, "memory(GiB)": 142.32, "step": 113680, "train_speed(iter/s)": 0.28677 }, { "acc": 0.74577351, "epoch": 1.2717285077742373, "grad_norm": 6.40625, "learning_rate": 3.207603441980571e-06, "loss": 1.00473003, "memory(GiB)": 142.32, "step": 113700, "train_speed(iter/s)": 0.286787 }, { "acc": 0.73624897, "epoch": 1.2719522067201958, "grad_norm": 4.9375, "learning_rate": 3.205877060627221e-06, "loss": 1.03495932, "memory(GiB)": 142.32, "step": 113720, "train_speed(iter/s)": 0.286803 }, { "acc": 0.74656396, "epoch": 1.2721759056661546, "grad_norm": 6.71875, "learning_rate": 3.204150924735533e-06, "loss": 1.01399517, "memory(GiB)": 142.32, "step": 113740, "train_speed(iter/s)": 0.28682 }, { "acc": 0.7285162, "epoch": 1.272399604612113, "grad_norm": 5.875, "learning_rate": 3.2024250345416674e-06, "loss": 1.09608746, "memory(GiB)": 142.32, "step": 113760, "train_speed(iter/s)": 0.286836 }, { "acc": 0.72932777, "epoch": 1.2726233035580716, "grad_norm": 6.46875, "learning_rate": 3.2006993902817497e-06, "loss": 1.07601566, "memory(GiB)": 142.32, "step": 113780, "train_speed(iter/s)": 0.286854 }, { "acc": 0.74345751, "epoch": 1.2728470025040302, "grad_norm": 5.1875, "learning_rate": 3.198973992191874e-06, "loss": 1.01169682, "memory(GiB)": 142.32, "step": 113800, "train_speed(iter/s)": 0.286872 }, { "acc": 0.72983932, "epoch": 1.2730707014499887, "grad_norm": 5.34375, "learning_rate": 3.197248840508098e-06, "loss": 1.08429756, "memory(GiB)": 142.32, "step": 113820, "train_speed(iter/s)": 0.286888 }, { "acc": 0.74347863, "epoch": 1.2732944003959472, "grad_norm": 6.0625, "learning_rate": 3.195523935466448e-06, "loss": 1.03162231, "memory(GiB)": 142.32, "step": 113840, "train_speed(iter/s)": 0.286904 }, { "acc": 0.73196802, "epoch": 1.2735180993419057, "grad_norm": 6.5, "learning_rate": 3.1937992773029164e-06, "loss": 1.06008329, "memory(GiB)": 142.32, "step": 113860, "train_speed(iter/s)": 0.286922 }, { "acc": 0.72012081, "epoch": 1.2737417982878643, "grad_norm": 7.4375, "learning_rate": 3.1920748662534594e-06, "loss": 1.10811911, "memory(GiB)": 142.32, "step": 113880, "train_speed(iter/s)": 0.28694 }, { "acc": 0.72917633, "epoch": 1.2739654972338228, "grad_norm": 7.875, "learning_rate": 3.190350702554002e-06, "loss": 1.07499466, "memory(GiB)": 142.32, "step": 113900, "train_speed(iter/s)": 0.286959 }, { "acc": 0.72792602, "epoch": 1.2741891961797813, "grad_norm": 6.375, "learning_rate": 3.188626786440434e-06, "loss": 1.09004803, "memory(GiB)": 142.32, "step": 113920, "train_speed(iter/s)": 0.286975 }, { "acc": 0.74027567, "epoch": 1.2744128951257399, "grad_norm": 7.4375, "learning_rate": 3.186903118148613e-06, "loss": 1.03482304, "memory(GiB)": 142.32, "step": 113940, "train_speed(iter/s)": 0.286994 }, { "acc": 0.73666534, "epoch": 1.2746365940716984, "grad_norm": 6.15625, "learning_rate": 3.18517969791436e-06, "loss": 1.04837456, "memory(GiB)": 142.32, "step": 113960, "train_speed(iter/s)": 0.287011 }, { "acc": 0.74373503, "epoch": 1.274860293017657, "grad_norm": 5.78125, "learning_rate": 3.1834565259734647e-06, "loss": 1.0172122, "memory(GiB)": 142.32, "step": 113980, "train_speed(iter/s)": 0.287028 }, { "acc": 0.73313618, "epoch": 1.2750839919636154, "grad_norm": 4.84375, "learning_rate": 3.1817336025616803e-06, "loss": 1.07076721, "memory(GiB)": 142.32, "step": 114000, "train_speed(iter/s)": 0.287045 }, { "epoch": 1.2750839919636154, "eval_acc": 0.6962588078333568, "eval_loss": 1.0718250274658203, "eval_runtime": 2343.3485, "eval_samples_per_second": 32.126, "eval_steps_per_second": 16.063, "step": 114000 }, { "acc": 0.7451602, "epoch": 1.275307690909574, "grad_norm": 5.15625, "learning_rate": 3.180010927914728e-06, "loss": 1.01286087, "memory(GiB)": 142.32, "step": 114020, "train_speed(iter/s)": 0.28534 }, { "acc": 0.74381638, "epoch": 1.2755313898555325, "grad_norm": 5.75, "learning_rate": 3.178288502268294e-06, "loss": 1.01385813, "memory(GiB)": 142.32, "step": 114040, "train_speed(iter/s)": 0.285356 }, { "acc": 0.74855704, "epoch": 1.275755088801491, "grad_norm": 5.0625, "learning_rate": 3.1765663258580333e-06, "loss": 0.99117432, "memory(GiB)": 142.32, "step": 114060, "train_speed(iter/s)": 0.285373 }, { "acc": 0.73715515, "epoch": 1.2759787877474495, "grad_norm": 4.96875, "learning_rate": 3.1748443989195597e-06, "loss": 1.03331203, "memory(GiB)": 142.32, "step": 114080, "train_speed(iter/s)": 0.285391 }, { "acc": 0.74660873, "epoch": 1.276202486693408, "grad_norm": 6.25, "learning_rate": 3.1731227216884606e-06, "loss": 0.9960659, "memory(GiB)": 142.32, "step": 114100, "train_speed(iter/s)": 0.285409 }, { "acc": 0.73191352, "epoch": 1.2764261856393666, "grad_norm": 6.375, "learning_rate": 3.171401294400286e-06, "loss": 1.07434826, "memory(GiB)": 142.32, "step": 114120, "train_speed(iter/s)": 0.285424 }, { "acc": 0.74454098, "epoch": 1.2766498845853251, "grad_norm": 5.21875, "learning_rate": 3.16968011729055e-06, "loss": 1.0227809, "memory(GiB)": 142.32, "step": 114140, "train_speed(iter/s)": 0.285442 }, { "acc": 0.74190912, "epoch": 1.2768735835312837, "grad_norm": 6.84375, "learning_rate": 3.1679591905947365e-06, "loss": 1.0255909, "memory(GiB)": 142.32, "step": 114160, "train_speed(iter/s)": 0.28546 }, { "acc": 0.75068731, "epoch": 1.2770972824772422, "grad_norm": 7.75, "learning_rate": 3.1662385145482912e-06, "loss": 0.99312096, "memory(GiB)": 142.32, "step": 114180, "train_speed(iter/s)": 0.285478 }, { "acc": 0.72946372, "epoch": 1.2773209814232007, "grad_norm": 6.5, "learning_rate": 3.1645180893866267e-06, "loss": 1.08799801, "memory(GiB)": 142.32, "step": 114200, "train_speed(iter/s)": 0.285495 }, { "acc": 0.73733954, "epoch": 1.2775446803691592, "grad_norm": 6.28125, "learning_rate": 3.1627979153451225e-06, "loss": 1.04063148, "memory(GiB)": 142.32, "step": 114220, "train_speed(iter/s)": 0.285512 }, { "acc": 0.7301753, "epoch": 1.2777683793151178, "grad_norm": 6.65625, "learning_rate": 3.161077992659124e-06, "loss": 1.07824469, "memory(GiB)": 142.32, "step": 114240, "train_speed(iter/s)": 0.28553 }, { "acc": 0.73823872, "epoch": 1.2779920782610763, "grad_norm": 6.125, "learning_rate": 3.159358321563941e-06, "loss": 1.03812027, "memory(GiB)": 142.32, "step": 114260, "train_speed(iter/s)": 0.285548 }, { "acc": 0.74102564, "epoch": 1.2782157772070348, "grad_norm": 5.53125, "learning_rate": 3.1576389022948474e-06, "loss": 1.02446327, "memory(GiB)": 142.32, "step": 114280, "train_speed(iter/s)": 0.285562 }, { "acc": 0.73599949, "epoch": 1.2784394761529934, "grad_norm": 6.46875, "learning_rate": 3.155919735087085e-06, "loss": 1.03785629, "memory(GiB)": 142.32, "step": 114300, "train_speed(iter/s)": 0.28558 }, { "acc": 0.74286013, "epoch": 1.2786631750989519, "grad_norm": 6.375, "learning_rate": 3.1542008201758616e-06, "loss": 1.02574902, "memory(GiB)": 142.32, "step": 114320, "train_speed(iter/s)": 0.285595 }, { "acc": 0.73468695, "epoch": 1.2788868740449104, "grad_norm": 7.21875, "learning_rate": 3.152482157796348e-06, "loss": 1.04963722, "memory(GiB)": 142.32, "step": 114340, "train_speed(iter/s)": 0.285611 }, { "acc": 0.73723106, "epoch": 1.279110572990869, "grad_norm": 6.28125, "learning_rate": 3.150763748183684e-06, "loss": 1.04109402, "memory(GiB)": 142.32, "step": 114360, "train_speed(iter/s)": 0.285628 }, { "acc": 0.74255171, "epoch": 1.2793342719368275, "grad_norm": 5.84375, "learning_rate": 3.149045591572969e-06, "loss": 1.02846546, "memory(GiB)": 142.32, "step": 114380, "train_speed(iter/s)": 0.285643 }, { "acc": 0.73915758, "epoch": 1.279557970882786, "grad_norm": 6.90625, "learning_rate": 3.1473276881992742e-06, "loss": 1.03764629, "memory(GiB)": 142.32, "step": 114400, "train_speed(iter/s)": 0.285661 }, { "acc": 0.73565865, "epoch": 1.2797816698287445, "grad_norm": 6.25, "learning_rate": 3.145610038297632e-06, "loss": 1.05798855, "memory(GiB)": 142.32, "step": 114420, "train_speed(iter/s)": 0.285678 }, { "acc": 0.73883286, "epoch": 1.280005368774703, "grad_norm": 7.9375, "learning_rate": 3.1438926421030414e-06, "loss": 1.03380308, "memory(GiB)": 142.32, "step": 114440, "train_speed(iter/s)": 0.285695 }, { "acc": 0.72480602, "epoch": 1.2802290677206616, "grad_norm": 6.875, "learning_rate": 3.142175499850469e-06, "loss": 1.09103241, "memory(GiB)": 142.32, "step": 114460, "train_speed(iter/s)": 0.285711 }, { "acc": 0.72999792, "epoch": 1.2804527666666201, "grad_norm": 6.28125, "learning_rate": 3.1404586117748413e-06, "loss": 1.06819334, "memory(GiB)": 142.32, "step": 114480, "train_speed(iter/s)": 0.285725 }, { "acc": 0.74216862, "epoch": 1.2806764656125786, "grad_norm": 5.90625, "learning_rate": 3.1387419781110546e-06, "loss": 1.02129755, "memory(GiB)": 142.32, "step": 114500, "train_speed(iter/s)": 0.285742 }, { "acc": 0.74521961, "epoch": 1.2809001645585372, "grad_norm": 7.25, "learning_rate": 3.137025599093969e-06, "loss": 0.99858913, "memory(GiB)": 142.32, "step": 114520, "train_speed(iter/s)": 0.285759 }, { "acc": 0.73379459, "epoch": 1.2811238635044957, "grad_norm": 5.90625, "learning_rate": 3.135309474958409e-06, "loss": 1.06310863, "memory(GiB)": 142.32, "step": 114540, "train_speed(iter/s)": 0.285776 }, { "acc": 0.7306077, "epoch": 1.2813475624504542, "grad_norm": 5.4375, "learning_rate": 3.1335936059391668e-06, "loss": 1.08192978, "memory(GiB)": 142.32, "step": 114560, "train_speed(iter/s)": 0.285792 }, { "acc": 0.73836112, "epoch": 1.2815712613964128, "grad_norm": 5.78125, "learning_rate": 3.1318779922709953e-06, "loss": 1.03807764, "memory(GiB)": 142.32, "step": 114580, "train_speed(iter/s)": 0.28581 }, { "acc": 0.7378109, "epoch": 1.2817949603423713, "grad_norm": 4.96875, "learning_rate": 3.130162634188616e-06, "loss": 1.02228336, "memory(GiB)": 142.32, "step": 114600, "train_speed(iter/s)": 0.285827 }, { "acc": 0.74735727, "epoch": 1.2820186592883298, "grad_norm": 8.625, "learning_rate": 3.1284475319267143e-06, "loss": 0.98597851, "memory(GiB)": 142.32, "step": 114620, "train_speed(iter/s)": 0.285845 }, { "acc": 0.74864578, "epoch": 1.2822423582342883, "grad_norm": 6.71875, "learning_rate": 3.126732685719941e-06, "loss": 0.99681396, "memory(GiB)": 142.32, "step": 114640, "train_speed(iter/s)": 0.285862 }, { "acc": 0.73869753, "epoch": 1.2824660571802469, "grad_norm": 5.3125, "learning_rate": 3.125018095802913e-06, "loss": 1.03095274, "memory(GiB)": 142.32, "step": 114660, "train_speed(iter/s)": 0.285878 }, { "acc": 0.73306088, "epoch": 1.2826897561262054, "grad_norm": 6.25, "learning_rate": 3.1233037624102067e-06, "loss": 1.04773254, "memory(GiB)": 142.32, "step": 114680, "train_speed(iter/s)": 0.285897 }, { "acc": 0.75067291, "epoch": 1.282913455072164, "grad_norm": 5.40625, "learning_rate": 3.121589685776372e-06, "loss": 0.98480377, "memory(GiB)": 142.32, "step": 114700, "train_speed(iter/s)": 0.285915 }, { "acc": 0.74130507, "epoch": 1.2831371540181224, "grad_norm": 4.8125, "learning_rate": 3.1198758661359152e-06, "loss": 1.01449108, "memory(GiB)": 142.32, "step": 114720, "train_speed(iter/s)": 0.285932 }, { "acc": 0.73833008, "epoch": 1.283360852964081, "grad_norm": 5.5625, "learning_rate": 3.118162303723314e-06, "loss": 1.041049, "memory(GiB)": 142.32, "step": 114740, "train_speed(iter/s)": 0.285948 }, { "acc": 0.73263636, "epoch": 1.2835845519100395, "grad_norm": 5.53125, "learning_rate": 3.1164489987730078e-06, "loss": 1.06572695, "memory(GiB)": 142.32, "step": 114760, "train_speed(iter/s)": 0.285963 }, { "acc": 0.73439903, "epoch": 1.283808250855998, "grad_norm": 5.5, "learning_rate": 3.1147359515194e-06, "loss": 1.0546711, "memory(GiB)": 142.32, "step": 114780, "train_speed(iter/s)": 0.28598 }, { "acc": 0.73489833, "epoch": 1.2840319498019566, "grad_norm": 6.8125, "learning_rate": 3.1130231621968602e-06, "loss": 1.0400897, "memory(GiB)": 142.32, "step": 114800, "train_speed(iter/s)": 0.285996 }, { "acc": 0.73445978, "epoch": 1.284255648747915, "grad_norm": 6.09375, "learning_rate": 3.1113106310397236e-06, "loss": 1.07391415, "memory(GiB)": 142.32, "step": 114820, "train_speed(iter/s)": 0.286012 }, { "acc": 0.74952717, "epoch": 1.2844793476938736, "grad_norm": 6.21875, "learning_rate": 3.10959835828229e-06, "loss": 0.98140745, "memory(GiB)": 142.32, "step": 114840, "train_speed(iter/s)": 0.286029 }, { "acc": 0.73434043, "epoch": 1.2847030466398321, "grad_norm": 5.25, "learning_rate": 3.107886344158819e-06, "loss": 1.03334103, "memory(GiB)": 142.32, "step": 114860, "train_speed(iter/s)": 0.286047 }, { "acc": 0.72978597, "epoch": 1.2849267455857907, "grad_norm": 8.0, "learning_rate": 3.106174588903541e-06, "loss": 1.07496967, "memory(GiB)": 142.32, "step": 114880, "train_speed(iter/s)": 0.286063 }, { "acc": 0.73635058, "epoch": 1.2851504445317492, "grad_norm": 5.78125, "learning_rate": 3.1044630927506483e-06, "loss": 1.05179234, "memory(GiB)": 142.32, "step": 114900, "train_speed(iter/s)": 0.286081 }, { "acc": 0.76163106, "epoch": 1.2853741434777077, "grad_norm": 5.96875, "learning_rate": 3.1027518559342982e-06, "loss": 0.93995733, "memory(GiB)": 142.32, "step": 114920, "train_speed(iter/s)": 0.286099 }, { "acc": 0.73391008, "epoch": 1.2855978424236663, "grad_norm": 6.1875, "learning_rate": 3.1010408786886114e-06, "loss": 1.06138182, "memory(GiB)": 142.32, "step": 114940, "train_speed(iter/s)": 0.286118 }, { "acc": 0.7400311, "epoch": 1.2858215413696248, "grad_norm": 6.0, "learning_rate": 3.0993301612476743e-06, "loss": 1.02577572, "memory(GiB)": 142.32, "step": 114960, "train_speed(iter/s)": 0.286136 }, { "acc": 0.74322691, "epoch": 1.2860452403155833, "grad_norm": 5.75, "learning_rate": 3.097619703845539e-06, "loss": 1.01385937, "memory(GiB)": 142.32, "step": 114980, "train_speed(iter/s)": 0.286155 }, { "acc": 0.74054384, "epoch": 1.2862689392615418, "grad_norm": 6.6875, "learning_rate": 3.095909506716219e-06, "loss": 1.03038578, "memory(GiB)": 142.32, "step": 115000, "train_speed(iter/s)": 0.286171 }, { "acc": 0.7287303, "epoch": 1.2864926382075004, "grad_norm": 6.4375, "learning_rate": 3.0941995700936957e-06, "loss": 1.08185787, "memory(GiB)": 142.32, "step": 115020, "train_speed(iter/s)": 0.286186 }, { "acc": 0.74284687, "epoch": 1.286716337153459, "grad_norm": 6.21875, "learning_rate": 3.09248989421191e-06, "loss": 1.01007681, "memory(GiB)": 142.32, "step": 115040, "train_speed(iter/s)": 0.286203 }, { "acc": 0.73751144, "epoch": 1.2869400360994174, "grad_norm": 5.59375, "learning_rate": 3.0907804793047715e-06, "loss": 1.04171696, "memory(GiB)": 142.32, "step": 115060, "train_speed(iter/s)": 0.286221 }, { "acc": 0.73234625, "epoch": 1.287163735045376, "grad_norm": 6.9375, "learning_rate": 3.0890713256061523e-06, "loss": 1.07391853, "memory(GiB)": 142.32, "step": 115080, "train_speed(iter/s)": 0.286237 }, { "acc": 0.7414609, "epoch": 1.2873874339913345, "grad_norm": 6.65625, "learning_rate": 3.0873624333498884e-06, "loss": 1.01974297, "memory(GiB)": 142.32, "step": 115100, "train_speed(iter/s)": 0.286253 }, { "acc": 0.72865591, "epoch": 1.287611132937293, "grad_norm": 5.71875, "learning_rate": 3.0856538027697834e-06, "loss": 1.09221954, "memory(GiB)": 142.32, "step": 115120, "train_speed(iter/s)": 0.286272 }, { "acc": 0.74136496, "epoch": 1.2878348318832515, "grad_norm": 5.375, "learning_rate": 3.0839454340996e-06, "loss": 1.03374233, "memory(GiB)": 142.32, "step": 115140, "train_speed(iter/s)": 0.286289 }, { "acc": 0.7357811, "epoch": 1.28805853082921, "grad_norm": 6.15625, "learning_rate": 3.0822373275730672e-06, "loss": 1.0498498, "memory(GiB)": 142.32, "step": 115160, "train_speed(iter/s)": 0.286305 }, { "acc": 0.72691231, "epoch": 1.2882822297751686, "grad_norm": 5.8125, "learning_rate": 3.0805294834238793e-06, "loss": 1.07207565, "memory(GiB)": 142.32, "step": 115180, "train_speed(iter/s)": 0.286322 }, { "acc": 0.74637675, "epoch": 1.2885059287211271, "grad_norm": 5.90625, "learning_rate": 3.0788219018856934e-06, "loss": 0.98879242, "memory(GiB)": 142.32, "step": 115200, "train_speed(iter/s)": 0.286338 }, { "acc": 0.73398476, "epoch": 1.2887296276670857, "grad_norm": 6.90625, "learning_rate": 3.0771145831921323e-06, "loss": 1.04489841, "memory(GiB)": 142.32, "step": 115220, "train_speed(iter/s)": 0.286356 }, { "acc": 0.7389534, "epoch": 1.2889533266130442, "grad_norm": 4.5, "learning_rate": 3.0754075275767804e-06, "loss": 1.05450277, "memory(GiB)": 142.32, "step": 115240, "train_speed(iter/s)": 0.28637 }, { "acc": 0.74296174, "epoch": 1.2891770255590027, "grad_norm": 5.90625, "learning_rate": 3.073700735273186e-06, "loss": 1.01457825, "memory(GiB)": 142.32, "step": 115260, "train_speed(iter/s)": 0.286386 }, { "acc": 0.7439693, "epoch": 1.2894007245049612, "grad_norm": 5.65625, "learning_rate": 3.0719942065148655e-06, "loss": 1.00613098, "memory(GiB)": 142.32, "step": 115280, "train_speed(iter/s)": 0.286399 }, { "acc": 0.74184971, "epoch": 1.2896244234509198, "grad_norm": 6.34375, "learning_rate": 3.070287941535295e-06, "loss": 1.03009033, "memory(GiB)": 142.32, "step": 115300, "train_speed(iter/s)": 0.286417 }, { "acc": 0.74911819, "epoch": 1.2898481223968783, "grad_norm": 7.21875, "learning_rate": 3.0685819405679164e-06, "loss": 0.98377857, "memory(GiB)": 142.32, "step": 115320, "train_speed(iter/s)": 0.286433 }, { "acc": 0.73566451, "epoch": 1.2900718213428368, "grad_norm": 6.21875, "learning_rate": 3.0668762038461342e-06, "loss": 1.05473003, "memory(GiB)": 142.32, "step": 115340, "train_speed(iter/s)": 0.28645 }, { "acc": 0.74268856, "epoch": 1.2902955202887954, "grad_norm": 5.375, "learning_rate": 3.0651707316033176e-06, "loss": 1.03190193, "memory(GiB)": 142.32, "step": 115360, "train_speed(iter/s)": 0.286466 }, { "acc": 0.73982463, "epoch": 1.2905192192347539, "grad_norm": 6.03125, "learning_rate": 3.0634655240728002e-06, "loss": 1.03040504, "memory(GiB)": 142.32, "step": 115380, "train_speed(iter/s)": 0.286484 }, { "acc": 0.74185605, "epoch": 1.2907429181807124, "grad_norm": 5.40625, "learning_rate": 3.061760581487878e-06, "loss": 1.0143074, "memory(GiB)": 142.32, "step": 115400, "train_speed(iter/s)": 0.286501 }, { "acc": 0.73790188, "epoch": 1.290966617126671, "grad_norm": 5.875, "learning_rate": 3.060055904081814e-06, "loss": 1.03433208, "memory(GiB)": 142.32, "step": 115420, "train_speed(iter/s)": 0.286519 }, { "acc": 0.73965683, "epoch": 1.2911903160726295, "grad_norm": 5.90625, "learning_rate": 3.0583514920878293e-06, "loss": 1.03103676, "memory(GiB)": 142.32, "step": 115440, "train_speed(iter/s)": 0.286536 }, { "acc": 0.7297606, "epoch": 1.291414015018588, "grad_norm": 7.25, "learning_rate": 3.0566473457391127e-06, "loss": 1.09960575, "memory(GiB)": 142.32, "step": 115460, "train_speed(iter/s)": 0.286553 }, { "acc": 0.7322804, "epoch": 1.2916377139645465, "grad_norm": 6.09375, "learning_rate": 3.054943465268816e-06, "loss": 1.06934748, "memory(GiB)": 142.32, "step": 115480, "train_speed(iter/s)": 0.286571 }, { "acc": 0.7375967, "epoch": 1.291861412910505, "grad_norm": 5.5, "learning_rate": 3.0532398509100545e-06, "loss": 1.03507595, "memory(GiB)": 142.32, "step": 115500, "train_speed(iter/s)": 0.286591 }, { "acc": 0.7341938, "epoch": 1.2920851118564636, "grad_norm": 5.625, "learning_rate": 3.051536502895909e-06, "loss": 1.05621014, "memory(GiB)": 142.32, "step": 115520, "train_speed(iter/s)": 0.286607 }, { "acc": 0.73900394, "epoch": 1.292308810802422, "grad_norm": 6.25, "learning_rate": 3.0498334214594184e-06, "loss": 1.03994436, "memory(GiB)": 142.32, "step": 115540, "train_speed(iter/s)": 0.286622 }, { "acc": 0.73524585, "epoch": 1.2925325097483806, "grad_norm": 6.5, "learning_rate": 3.048130606833589e-06, "loss": 1.048563, "memory(GiB)": 142.32, "step": 115560, "train_speed(iter/s)": 0.286637 }, { "acc": 0.73991117, "epoch": 1.2927562086943392, "grad_norm": 6.28125, "learning_rate": 3.046428059251393e-06, "loss": 1.03490067, "memory(GiB)": 142.32, "step": 115580, "train_speed(iter/s)": 0.286654 }, { "acc": 0.74299431, "epoch": 1.2929799076402977, "grad_norm": 7.53125, "learning_rate": 3.0447257789457597e-06, "loss": 1.02167654, "memory(GiB)": 142.32, "step": 115600, "train_speed(iter/s)": 0.286671 }, { "acc": 0.7401515, "epoch": 1.2932036065862562, "grad_norm": 5.65625, "learning_rate": 3.0430237661495894e-06, "loss": 1.03671484, "memory(GiB)": 142.32, "step": 115620, "train_speed(iter/s)": 0.286689 }, { "acc": 0.73370795, "epoch": 1.2934273055322147, "grad_norm": 6.03125, "learning_rate": 3.0413220210957377e-06, "loss": 1.07214718, "memory(GiB)": 142.32, "step": 115640, "train_speed(iter/s)": 0.286707 }, { "acc": 0.7382061, "epoch": 1.2936510044781733, "grad_norm": 4.78125, "learning_rate": 3.03962054401703e-06, "loss": 1.03690567, "memory(GiB)": 142.32, "step": 115660, "train_speed(iter/s)": 0.286725 }, { "acc": 0.73046331, "epoch": 1.2938747034241318, "grad_norm": 6.3125, "learning_rate": 3.037919335146252e-06, "loss": 1.07278156, "memory(GiB)": 142.32, "step": 115680, "train_speed(iter/s)": 0.286742 }, { "acc": 0.73641138, "epoch": 1.2940984023700903, "grad_norm": 5.9375, "learning_rate": 3.036218394716154e-06, "loss": 1.03481274, "memory(GiB)": 142.32, "step": 115700, "train_speed(iter/s)": 0.286759 }, { "acc": 0.74264154, "epoch": 1.2943221013160489, "grad_norm": 6.1875, "learning_rate": 3.0345177229594487e-06, "loss": 1.01963539, "memory(GiB)": 142.32, "step": 115720, "train_speed(iter/s)": 0.286777 }, { "acc": 0.73900814, "epoch": 1.2945458002620074, "grad_norm": 6.625, "learning_rate": 3.0328173201088117e-06, "loss": 1.04469318, "memory(GiB)": 142.32, "step": 115740, "train_speed(iter/s)": 0.286794 }, { "acc": 0.72442541, "epoch": 1.294769499207966, "grad_norm": 6.15625, "learning_rate": 3.0311171863968823e-06, "loss": 1.1079195, "memory(GiB)": 142.32, "step": 115760, "train_speed(iter/s)": 0.286809 }, { "acc": 0.74557219, "epoch": 1.2949931981539244, "grad_norm": 5.75, "learning_rate": 3.029417322056264e-06, "loss": 1.03243694, "memory(GiB)": 142.32, "step": 115780, "train_speed(iter/s)": 0.286826 }, { "acc": 0.73745604, "epoch": 1.295216897099883, "grad_norm": 7.0, "learning_rate": 3.0277177273195223e-06, "loss": 1.04500656, "memory(GiB)": 142.32, "step": 115800, "train_speed(iter/s)": 0.286843 }, { "acc": 0.74368348, "epoch": 1.2954405960458415, "grad_norm": 7.5625, "learning_rate": 3.0260184024191864e-06, "loss": 1.02532406, "memory(GiB)": 142.32, "step": 115820, "train_speed(iter/s)": 0.286859 }, { "acc": 0.72607002, "epoch": 1.2956642949918, "grad_norm": 6.25, "learning_rate": 3.0243193475877477e-06, "loss": 1.0964859, "memory(GiB)": 142.32, "step": 115840, "train_speed(iter/s)": 0.286875 }, { "acc": 0.74778948, "epoch": 1.2958879939377586, "grad_norm": 5.125, "learning_rate": 3.02262056305766e-06, "loss": 0.9883276, "memory(GiB)": 142.32, "step": 115860, "train_speed(iter/s)": 0.286892 }, { "acc": 0.73439651, "epoch": 1.296111692883717, "grad_norm": 5.90625, "learning_rate": 3.0209220490613434e-06, "loss": 1.04981489, "memory(GiB)": 142.32, "step": 115880, "train_speed(iter/s)": 0.286909 }, { "acc": 0.74318805, "epoch": 1.2963353918296756, "grad_norm": 6.0625, "learning_rate": 3.0192238058311774e-06, "loss": 1.02150879, "memory(GiB)": 142.32, "step": 115900, "train_speed(iter/s)": 0.286928 }, { "acc": 0.74135571, "epoch": 1.2965590907756341, "grad_norm": 5.90625, "learning_rate": 3.0175258335995082e-06, "loss": 1.02045708, "memory(GiB)": 142.32, "step": 115920, "train_speed(iter/s)": 0.286945 }, { "acc": 0.72876191, "epoch": 1.2967827897215927, "grad_norm": 6.90625, "learning_rate": 3.0158281325986392e-06, "loss": 1.08207741, "memory(GiB)": 142.32, "step": 115940, "train_speed(iter/s)": 0.286961 }, { "acc": 0.74568958, "epoch": 1.2970064886675512, "grad_norm": 6.21875, "learning_rate": 3.014130703060843e-06, "loss": 1.01217108, "memory(GiB)": 142.32, "step": 115960, "train_speed(iter/s)": 0.286978 }, { "acc": 0.74780283, "epoch": 1.2972301876135097, "grad_norm": 5.53125, "learning_rate": 3.0124335452183505e-06, "loss": 1.00259075, "memory(GiB)": 142.32, "step": 115980, "train_speed(iter/s)": 0.286994 }, { "acc": 0.73699417, "epoch": 1.2974538865594683, "grad_norm": 6.125, "learning_rate": 3.0107366593033584e-06, "loss": 1.03886328, "memory(GiB)": 142.32, "step": 116000, "train_speed(iter/s)": 0.287009 }, { "epoch": 1.2974538865594683, "eval_acc": 0.6962821251511928, "eval_loss": 1.0718010663986206, "eval_runtime": 2340.6364, "eval_samples_per_second": 32.163, "eval_steps_per_second": 16.082, "step": 116000 }, { "acc": 0.73057933, "epoch": 1.2976775855054268, "grad_norm": 5.90625, "learning_rate": 3.0090400455480263e-06, "loss": 1.07170525, "memory(GiB)": 142.32, "step": 116020, "train_speed(iter/s)": 0.285337 }, { "acc": 0.74110527, "epoch": 1.2979012844513853, "grad_norm": 6.75, "learning_rate": 3.007343704184471e-06, "loss": 1.02544422, "memory(GiB)": 142.32, "step": 116040, "train_speed(iter/s)": 0.285355 }, { "acc": 0.73582735, "epoch": 1.2981249833973438, "grad_norm": 6.71875, "learning_rate": 3.00564763544478e-06, "loss": 1.04572048, "memory(GiB)": 142.32, "step": 116060, "train_speed(iter/s)": 0.285372 }, { "acc": 0.73778782, "epoch": 1.2983486823433024, "grad_norm": 5.0, "learning_rate": 3.0039518395609974e-06, "loss": 1.03908072, "memory(GiB)": 142.32, "step": 116080, "train_speed(iter/s)": 0.28539 }, { "acc": 0.74839134, "epoch": 1.298572381289261, "grad_norm": 6.0625, "learning_rate": 3.002256316765133e-06, "loss": 0.99249935, "memory(GiB)": 142.32, "step": 116100, "train_speed(iter/s)": 0.285409 }, { "acc": 0.73138838, "epoch": 1.2987960802352194, "grad_norm": 5.96875, "learning_rate": 3.000561067289159e-06, "loss": 1.09039497, "memory(GiB)": 142.32, "step": 116120, "train_speed(iter/s)": 0.285425 }, { "acc": 0.73868275, "epoch": 1.299019779181178, "grad_norm": 5.46875, "learning_rate": 2.998866091365009e-06, "loss": 1.0281559, "memory(GiB)": 142.32, "step": 116140, "train_speed(iter/s)": 0.285443 }, { "acc": 0.74129982, "epoch": 1.2992434781271365, "grad_norm": 5.90625, "learning_rate": 2.9971713892245825e-06, "loss": 1.01353703, "memory(GiB)": 142.32, "step": 116160, "train_speed(iter/s)": 0.28546 }, { "acc": 0.75276122, "epoch": 1.299467177073095, "grad_norm": 4.9375, "learning_rate": 2.995476961099735e-06, "loss": 0.97871494, "memory(GiB)": 142.32, "step": 116180, "train_speed(iter/s)": 0.285476 }, { "acc": 0.73492851, "epoch": 1.2996908760190535, "grad_norm": 6.40625, "learning_rate": 2.9937828072222907e-06, "loss": 1.06750278, "memory(GiB)": 142.32, "step": 116200, "train_speed(iter/s)": 0.285494 }, { "acc": 0.73676538, "epoch": 1.299914574965012, "grad_norm": 5.96875, "learning_rate": 2.9920889278240338e-06, "loss": 1.05433674, "memory(GiB)": 142.32, "step": 116220, "train_speed(iter/s)": 0.285513 }, { "acc": 0.72887735, "epoch": 1.3001382739109706, "grad_norm": 5.84375, "learning_rate": 2.990395323136712e-06, "loss": 1.06905146, "memory(GiB)": 142.32, "step": 116240, "train_speed(iter/s)": 0.285532 }, { "acc": 0.72348843, "epoch": 1.3003619728569291, "grad_norm": 6.65625, "learning_rate": 2.9887019933920337e-06, "loss": 1.10014696, "memory(GiB)": 142.32, "step": 116260, "train_speed(iter/s)": 0.285549 }, { "acc": 0.73261003, "epoch": 1.3005856718028876, "grad_norm": 5.3125, "learning_rate": 2.9870089388216706e-06, "loss": 1.05783043, "memory(GiB)": 142.32, "step": 116280, "train_speed(iter/s)": 0.285564 }, { "acc": 0.74196939, "epoch": 1.3008093707488462, "grad_norm": 6.0625, "learning_rate": 2.985316159657257e-06, "loss": 1.01563511, "memory(GiB)": 142.32, "step": 116300, "train_speed(iter/s)": 0.285581 }, { "acc": 0.73364902, "epoch": 1.3010330696948047, "grad_norm": 6.90625, "learning_rate": 2.983623656130389e-06, "loss": 1.05759945, "memory(GiB)": 142.32, "step": 116320, "train_speed(iter/s)": 0.285599 }, { "acc": 0.73798156, "epoch": 1.3012567686407632, "grad_norm": 7.8125, "learning_rate": 2.981931428472625e-06, "loss": 1.03999252, "memory(GiB)": 142.32, "step": 116340, "train_speed(iter/s)": 0.285616 }, { "acc": 0.75814333, "epoch": 1.3014804675867218, "grad_norm": 8.5625, "learning_rate": 2.9802394769154875e-06, "loss": 0.94516449, "memory(GiB)": 142.32, "step": 116360, "train_speed(iter/s)": 0.285634 }, { "acc": 0.73438787, "epoch": 1.3017041665326803, "grad_norm": 6.3125, "learning_rate": 2.978547801690458e-06, "loss": 1.0502492, "memory(GiB)": 142.32, "step": 116380, "train_speed(iter/s)": 0.28565 }, { "acc": 0.72201118, "epoch": 1.3019278654786388, "grad_norm": 6.78125, "learning_rate": 2.9768564030289827e-06, "loss": 1.10695601, "memory(GiB)": 142.32, "step": 116400, "train_speed(iter/s)": 0.285669 }, { "acc": 0.74223585, "epoch": 1.3021515644245973, "grad_norm": 7.03125, "learning_rate": 2.9751652811624686e-06, "loss": 1.04327374, "memory(GiB)": 142.32, "step": 116420, "train_speed(iter/s)": 0.285685 }, { "acc": 0.73253613, "epoch": 1.3023752633705559, "grad_norm": 6.46875, "learning_rate": 2.9734744363222855e-06, "loss": 1.06540766, "memory(GiB)": 142.32, "step": 116440, "train_speed(iter/s)": 0.285701 }, { "acc": 0.73377686, "epoch": 1.3025989623165144, "grad_norm": 5.53125, "learning_rate": 2.971783868739766e-06, "loss": 1.05758514, "memory(GiB)": 142.32, "step": 116460, "train_speed(iter/s)": 0.285718 }, { "acc": 0.74274559, "epoch": 1.302822661262473, "grad_norm": 7.03125, "learning_rate": 2.9700935786462027e-06, "loss": 1.01577816, "memory(GiB)": 142.32, "step": 116480, "train_speed(iter/s)": 0.285734 }, { "acc": 0.73378592, "epoch": 1.3030463602084315, "grad_norm": 5.6875, "learning_rate": 2.9684035662728516e-06, "loss": 1.05683708, "memory(GiB)": 142.32, "step": 116500, "train_speed(iter/s)": 0.28575 }, { "acc": 0.75071964, "epoch": 1.30327005915439, "grad_norm": 5.84375, "learning_rate": 2.9667138318509304e-06, "loss": 0.98081493, "memory(GiB)": 142.32, "step": 116520, "train_speed(iter/s)": 0.285766 }, { "acc": 0.74094553, "epoch": 1.3034937581003485, "grad_norm": 6.46875, "learning_rate": 2.9650243756116196e-06, "loss": 1.03601437, "memory(GiB)": 142.32, "step": 116540, "train_speed(iter/s)": 0.285783 }, { "acc": 0.73528657, "epoch": 1.303717457046307, "grad_norm": 6.28125, "learning_rate": 2.9633351977860624e-06, "loss": 1.04338589, "memory(GiB)": 142.32, "step": 116560, "train_speed(iter/s)": 0.2858 }, { "acc": 0.7296567, "epoch": 1.3039411559922656, "grad_norm": 7.28125, "learning_rate": 2.961646298605359e-06, "loss": 1.08769541, "memory(GiB)": 142.32, "step": 116580, "train_speed(iter/s)": 0.285817 }, { "acc": 0.74258637, "epoch": 1.304164854938224, "grad_norm": 6.65625, "learning_rate": 2.959957678300577e-06, "loss": 1.01573353, "memory(GiB)": 142.32, "step": 116600, "train_speed(iter/s)": 0.285831 }, { "acc": 0.75551529, "epoch": 1.3043885538841826, "grad_norm": 6.8125, "learning_rate": 2.9582693371027436e-06, "loss": 0.97101307, "memory(GiB)": 142.32, "step": 116620, "train_speed(iter/s)": 0.285847 }, { "acc": 0.7250411, "epoch": 1.3046122528301412, "grad_norm": 6.4375, "learning_rate": 2.956581275242848e-06, "loss": 1.0827755, "memory(GiB)": 142.32, "step": 116640, "train_speed(iter/s)": 0.285864 }, { "acc": 0.73953757, "epoch": 1.3048359517760997, "grad_norm": 5.6875, "learning_rate": 2.954893492951842e-06, "loss": 1.04004307, "memory(GiB)": 142.32, "step": 116660, "train_speed(iter/s)": 0.285879 }, { "acc": 0.73284841, "epoch": 1.3050596507220582, "grad_norm": 6.1875, "learning_rate": 2.9532059904606363e-06, "loss": 1.05734901, "memory(GiB)": 142.32, "step": 116680, "train_speed(iter/s)": 0.285897 }, { "acc": 0.73722081, "epoch": 1.3052833496680167, "grad_norm": 6.75, "learning_rate": 2.9515187680001067e-06, "loss": 1.03641415, "memory(GiB)": 142.32, "step": 116700, "train_speed(iter/s)": 0.285913 }, { "acc": 0.73762236, "epoch": 1.3055070486139753, "grad_norm": 9.8125, "learning_rate": 2.9498318258010893e-06, "loss": 1.04322681, "memory(GiB)": 142.32, "step": 116720, "train_speed(iter/s)": 0.285928 }, { "acc": 0.73771758, "epoch": 1.3057307475599338, "grad_norm": 5.59375, "learning_rate": 2.9481451640943816e-06, "loss": 1.05176859, "memory(GiB)": 142.32, "step": 116740, "train_speed(iter/s)": 0.285945 }, { "acc": 0.72701025, "epoch": 1.3059544465058923, "grad_norm": 6.90625, "learning_rate": 2.9464587831107442e-06, "loss": 1.09144611, "memory(GiB)": 142.32, "step": 116760, "train_speed(iter/s)": 0.285961 }, { "acc": 0.74404855, "epoch": 1.3061781454518508, "grad_norm": 7.90625, "learning_rate": 2.9447726830808966e-06, "loss": 1.02043505, "memory(GiB)": 142.32, "step": 116780, "train_speed(iter/s)": 0.285976 }, { "acc": 0.74810863, "epoch": 1.3064018443978094, "grad_norm": 6.71875, "learning_rate": 2.9430868642355214e-06, "loss": 0.98707895, "memory(GiB)": 142.32, "step": 116800, "train_speed(iter/s)": 0.285993 }, { "acc": 0.7381494, "epoch": 1.306625543343768, "grad_norm": 9.1875, "learning_rate": 2.941401326805263e-06, "loss": 1.03656025, "memory(GiB)": 142.32, "step": 116820, "train_speed(iter/s)": 0.286008 }, { "acc": 0.72621841, "epoch": 1.3068492422897264, "grad_norm": 6.75, "learning_rate": 2.9397160710207285e-06, "loss": 1.11152744, "memory(GiB)": 142.32, "step": 116840, "train_speed(iter/s)": 0.286023 }, { "acc": 0.72782331, "epoch": 1.307072941235685, "grad_norm": 5.3125, "learning_rate": 2.9380310971124836e-06, "loss": 1.09436817, "memory(GiB)": 142.32, "step": 116860, "train_speed(iter/s)": 0.28604 }, { "acc": 0.7523242, "epoch": 1.3072966401816435, "grad_norm": 5.8125, "learning_rate": 2.9363464053110557e-06, "loss": 0.98392525, "memory(GiB)": 142.32, "step": 116880, "train_speed(iter/s)": 0.286056 }, { "acc": 0.74136953, "epoch": 1.307520339127602, "grad_norm": 7.8125, "learning_rate": 2.9346619958469367e-06, "loss": 1.01980152, "memory(GiB)": 142.32, "step": 116900, "train_speed(iter/s)": 0.286072 }, { "acc": 0.73642511, "epoch": 1.3077440380735605, "grad_norm": 6.375, "learning_rate": 2.932977868950577e-06, "loss": 1.03659248, "memory(GiB)": 142.32, "step": 116920, "train_speed(iter/s)": 0.286089 }, { "acc": 0.73545389, "epoch": 1.307967737019519, "grad_norm": 5.65625, "learning_rate": 2.9312940248523893e-06, "loss": 1.06525478, "memory(GiB)": 142.32, "step": 116940, "train_speed(iter/s)": 0.286106 }, { "acc": 0.73895607, "epoch": 1.3081914359654776, "grad_norm": 6.03125, "learning_rate": 2.929610463782749e-06, "loss": 1.04463577, "memory(GiB)": 142.32, "step": 116960, "train_speed(iter/s)": 0.286123 }, { "acc": 0.73549442, "epoch": 1.3084151349114361, "grad_norm": 7.65625, "learning_rate": 2.9279271859719883e-06, "loss": 1.06791315, "memory(GiB)": 142.32, "step": 116980, "train_speed(iter/s)": 0.28614 }, { "acc": 0.72581296, "epoch": 1.3086388338573947, "grad_norm": 5.90625, "learning_rate": 2.926244191650406e-06, "loss": 1.09743729, "memory(GiB)": 142.32, "step": 117000, "train_speed(iter/s)": 0.286156 }, { "acc": 0.72776089, "epoch": 1.3088625328033532, "grad_norm": 7.4375, "learning_rate": 2.9245614810482583e-06, "loss": 1.07396126, "memory(GiB)": 142.32, "step": 117020, "train_speed(iter/s)": 0.286173 }, { "acc": 0.74532509, "epoch": 1.3090862317493117, "grad_norm": 4.5, "learning_rate": 2.922879054395765e-06, "loss": 1.01667881, "memory(GiB)": 142.32, "step": 117040, "train_speed(iter/s)": 0.286189 }, { "acc": 0.739534, "epoch": 1.3093099306952702, "grad_norm": 6.78125, "learning_rate": 2.9211969119231075e-06, "loss": 1.04237041, "memory(GiB)": 142.32, "step": 117060, "train_speed(iter/s)": 0.286205 }, { "acc": 0.73118305, "epoch": 1.3095336296412288, "grad_norm": 7.34375, "learning_rate": 2.9195150538604237e-06, "loss": 1.07316093, "memory(GiB)": 142.32, "step": 117080, "train_speed(iter/s)": 0.286221 }, { "acc": 0.74405189, "epoch": 1.3097573285871873, "grad_norm": 6.5625, "learning_rate": 2.9178334804378184e-06, "loss": 1.02013664, "memory(GiB)": 142.32, "step": 117100, "train_speed(iter/s)": 0.286238 }, { "acc": 0.73624039, "epoch": 1.3099810275331458, "grad_norm": 6.28125, "learning_rate": 2.916152191885354e-06, "loss": 1.03725967, "memory(GiB)": 142.32, "step": 117120, "train_speed(iter/s)": 0.286255 }, { "acc": 0.74211016, "epoch": 1.3102047264791044, "grad_norm": 6.40625, "learning_rate": 2.9144711884330535e-06, "loss": 1.03208838, "memory(GiB)": 142.32, "step": 117140, "train_speed(iter/s)": 0.286272 }, { "acc": 0.74001646, "epoch": 1.3104284254250629, "grad_norm": 5.125, "learning_rate": 2.912790470310905e-06, "loss": 1.03367062, "memory(GiB)": 142.32, "step": 117160, "train_speed(iter/s)": 0.286287 }, { "acc": 0.7431428, "epoch": 1.3106521243710214, "grad_norm": 7.09375, "learning_rate": 2.9111100377488515e-06, "loss": 1.02035923, "memory(GiB)": 142.32, "step": 117180, "train_speed(iter/s)": 0.286305 }, { "acc": 0.74470415, "epoch": 1.31087582331698, "grad_norm": 5.90625, "learning_rate": 2.909429890976806e-06, "loss": 1.00897484, "memory(GiB)": 142.32, "step": 117200, "train_speed(iter/s)": 0.286322 }, { "acc": 0.72612619, "epoch": 1.3110995222629385, "grad_norm": 6.1875, "learning_rate": 2.9077500302246286e-06, "loss": 1.10399151, "memory(GiB)": 142.32, "step": 117220, "train_speed(iter/s)": 0.286339 }, { "acc": 0.74012794, "epoch": 1.311323221208897, "grad_norm": 6.09375, "learning_rate": 2.906070455722154e-06, "loss": 1.03276939, "memory(GiB)": 142.32, "step": 117240, "train_speed(iter/s)": 0.286356 }, { "acc": 0.73042669, "epoch": 1.3115469201548555, "grad_norm": 5.78125, "learning_rate": 2.9043911676991706e-06, "loss": 1.08209295, "memory(GiB)": 142.32, "step": 117260, "train_speed(iter/s)": 0.286373 }, { "acc": 0.7439805, "epoch": 1.311770619100814, "grad_norm": 8.0625, "learning_rate": 2.9027121663854263e-06, "loss": 1.00752516, "memory(GiB)": 142.32, "step": 117280, "train_speed(iter/s)": 0.286391 }, { "acc": 0.731635, "epoch": 1.3119943180467726, "grad_norm": 5.625, "learning_rate": 2.9010334520106367e-06, "loss": 1.06603355, "memory(GiB)": 142.32, "step": 117300, "train_speed(iter/s)": 0.286407 }, { "acc": 0.74715033, "epoch": 1.312218016992731, "grad_norm": 8.0625, "learning_rate": 2.89935502480447e-06, "loss": 0.99575481, "memory(GiB)": 142.32, "step": 117320, "train_speed(iter/s)": 0.286423 }, { "acc": 0.73531556, "epoch": 1.3124417159386896, "grad_norm": 5.59375, "learning_rate": 2.897676884996563e-06, "loss": 1.05077581, "memory(GiB)": 142.32, "step": 117340, "train_speed(iter/s)": 0.28644 }, { "acc": 0.7324729, "epoch": 1.3126654148846482, "grad_norm": 6.53125, "learning_rate": 2.8959990328165078e-06, "loss": 1.07276335, "memory(GiB)": 142.32, "step": 117360, "train_speed(iter/s)": 0.286455 }, { "acc": 0.74062176, "epoch": 1.3128891138306067, "grad_norm": 6.25, "learning_rate": 2.8943214684938557e-06, "loss": 1.03097019, "memory(GiB)": 142.32, "step": 117380, "train_speed(iter/s)": 0.286471 }, { "acc": 0.73272543, "epoch": 1.3131128127765652, "grad_norm": 6.3125, "learning_rate": 2.8926441922581255e-06, "loss": 1.05460567, "memory(GiB)": 142.32, "step": 117400, "train_speed(iter/s)": 0.286487 }, { "acc": 0.73976965, "epoch": 1.3133365117225237, "grad_norm": 7.0625, "learning_rate": 2.8909672043387894e-06, "loss": 1.03363209, "memory(GiB)": 142.32, "step": 117420, "train_speed(iter/s)": 0.286504 }, { "acc": 0.75342016, "epoch": 1.3135602106684823, "grad_norm": 6.125, "learning_rate": 2.8892905049652862e-06, "loss": 0.97206726, "memory(GiB)": 142.32, "step": 117440, "train_speed(iter/s)": 0.286522 }, { "acc": 0.73702812, "epoch": 1.3137839096144408, "grad_norm": 6.71875, "learning_rate": 2.887614094367011e-06, "loss": 1.05120144, "memory(GiB)": 142.32, "step": 117460, "train_speed(iter/s)": 0.286539 }, { "acc": 0.73650999, "epoch": 1.3140076085603993, "grad_norm": 6.34375, "learning_rate": 2.885937972773319e-06, "loss": 1.03983669, "memory(GiB)": 142.32, "step": 117480, "train_speed(iter/s)": 0.286556 }, { "acc": 0.73805962, "epoch": 1.314231307506358, "grad_norm": 5.75, "learning_rate": 2.8842621404135308e-06, "loss": 1.03978567, "memory(GiB)": 142.32, "step": 117500, "train_speed(iter/s)": 0.286572 }, { "acc": 0.7388032, "epoch": 1.3144550064523166, "grad_norm": 6.03125, "learning_rate": 2.882586597516921e-06, "loss": 1.02557087, "memory(GiB)": 142.32, "step": 117520, "train_speed(iter/s)": 0.286588 }, { "acc": 0.73613348, "epoch": 1.3146787053982751, "grad_norm": 5.5, "learning_rate": 2.8809113443127312e-06, "loss": 1.03622608, "memory(GiB)": 142.32, "step": 117540, "train_speed(iter/s)": 0.286606 }, { "acc": 0.72324424, "epoch": 1.3149024043442337, "grad_norm": 6.3125, "learning_rate": 2.8792363810301587e-06, "loss": 1.11049156, "memory(GiB)": 142.32, "step": 117560, "train_speed(iter/s)": 0.286624 }, { "acc": 0.73118768, "epoch": 1.3151261032901922, "grad_norm": 6.71875, "learning_rate": 2.8775617078983596e-06, "loss": 1.07351398, "memory(GiB)": 142.32, "step": 117580, "train_speed(iter/s)": 0.286642 }, { "acc": 0.74292169, "epoch": 1.3153498022361507, "grad_norm": 5.6875, "learning_rate": 2.8758873251464583e-06, "loss": 1.00857353, "memory(GiB)": 142.32, "step": 117600, "train_speed(iter/s)": 0.28666 }, { "acc": 0.74089441, "epoch": 1.3155735011821093, "grad_norm": 5.1875, "learning_rate": 2.8742132330035283e-06, "loss": 1.03209324, "memory(GiB)": 142.32, "step": 117620, "train_speed(iter/s)": 0.286676 }, { "acc": 0.73974171, "epoch": 1.3157972001280678, "grad_norm": 5.46875, "learning_rate": 2.872539431698615e-06, "loss": 1.032127, "memory(GiB)": 142.32, "step": 117640, "train_speed(iter/s)": 0.286694 }, { "acc": 0.73185787, "epoch": 1.3160208990740263, "grad_norm": 5.84375, "learning_rate": 2.870865921460716e-06, "loss": 1.06732483, "memory(GiB)": 142.32, "step": 117660, "train_speed(iter/s)": 0.286712 }, { "acc": 0.74042764, "epoch": 1.3162445980199848, "grad_norm": 6.125, "learning_rate": 2.8691927025187886e-06, "loss": 1.03428326, "memory(GiB)": 142.32, "step": 117680, "train_speed(iter/s)": 0.286727 }, { "acc": 0.74798431, "epoch": 1.3164682969659434, "grad_norm": 5.65625, "learning_rate": 2.8675197751017586e-06, "loss": 0.97938366, "memory(GiB)": 142.32, "step": 117700, "train_speed(iter/s)": 0.286744 }, { "acc": 0.74440475, "epoch": 1.316691995911902, "grad_norm": 5.71875, "learning_rate": 2.865847139438501e-06, "loss": 1.0171937, "memory(GiB)": 142.32, "step": 117720, "train_speed(iter/s)": 0.286765 }, { "acc": 0.74545374, "epoch": 1.3169156948578604, "grad_norm": 7.09375, "learning_rate": 2.8641747957578613e-06, "loss": 1.01075077, "memory(GiB)": 142.32, "step": 117740, "train_speed(iter/s)": 0.286781 }, { "acc": 0.7436635, "epoch": 1.317139393803819, "grad_norm": 4.75, "learning_rate": 2.862502744288637e-06, "loss": 1.02270508, "memory(GiB)": 142.32, "step": 117760, "train_speed(iter/s)": 0.286798 }, { "acc": 0.71829147, "epoch": 1.3173630927497775, "grad_norm": 5.4375, "learning_rate": 2.860830985259587e-06, "loss": 1.12942238, "memory(GiB)": 142.32, "step": 117780, "train_speed(iter/s)": 0.286814 }, { "acc": 0.73214893, "epoch": 1.317586791695736, "grad_norm": 5.75, "learning_rate": 2.859159518899437e-06, "loss": 1.07224121, "memory(GiB)": 142.32, "step": 117800, "train_speed(iter/s)": 0.28683 }, { "acc": 0.73046908, "epoch": 1.3178104906416945, "grad_norm": 6.28125, "learning_rate": 2.8574883454368616e-06, "loss": 1.07830925, "memory(GiB)": 142.32, "step": 117820, "train_speed(iter/s)": 0.286846 }, { "acc": 0.74557657, "epoch": 1.318034189587653, "grad_norm": 6.5, "learning_rate": 2.8558174651005068e-06, "loss": 1.02257986, "memory(GiB)": 142.32, "step": 117840, "train_speed(iter/s)": 0.286864 }, { "acc": 0.74206705, "epoch": 1.3182578885336116, "grad_norm": 5.625, "learning_rate": 2.8541468781189695e-06, "loss": 1.02235842, "memory(GiB)": 142.32, "step": 117860, "train_speed(iter/s)": 0.286882 }, { "acc": 0.73289289, "epoch": 1.3184815874795701, "grad_norm": 6.0, "learning_rate": 2.852476584720809e-06, "loss": 1.07160549, "memory(GiB)": 142.32, "step": 117880, "train_speed(iter/s)": 0.286898 }, { "acc": 0.74293575, "epoch": 1.3187052864255286, "grad_norm": 5.90625, "learning_rate": 2.8508065851345486e-06, "loss": 1.00812435, "memory(GiB)": 142.32, "step": 117900, "train_speed(iter/s)": 0.286912 }, { "acc": 0.73940954, "epoch": 1.3189289853714872, "grad_norm": 5.96875, "learning_rate": 2.849136879588664e-06, "loss": 1.04554062, "memory(GiB)": 142.32, "step": 117920, "train_speed(iter/s)": 0.286927 }, { "acc": 0.74196768, "epoch": 1.3191526843174457, "grad_norm": 6.625, "learning_rate": 2.8474674683116e-06, "loss": 1.02090683, "memory(GiB)": 142.32, "step": 117940, "train_speed(iter/s)": 0.286942 }, { "acc": 0.7298624, "epoch": 1.3193763832634042, "grad_norm": 6.4375, "learning_rate": 2.8457983515317533e-06, "loss": 1.07946377, "memory(GiB)": 142.32, "step": 117960, "train_speed(iter/s)": 0.286956 }, { "acc": 0.74822383, "epoch": 1.3196000822093628, "grad_norm": 6.78125, "learning_rate": 2.8441295294774795e-06, "loss": 1.00128212, "memory(GiB)": 142.32, "step": 117980, "train_speed(iter/s)": 0.286972 }, { "acc": 0.73616323, "epoch": 1.3198237811553213, "grad_norm": 6.65625, "learning_rate": 2.842461002377104e-06, "loss": 1.06658516, "memory(GiB)": 142.32, "step": 118000, "train_speed(iter/s)": 0.286988 }, { "epoch": 1.3198237811553213, "eval_acc": 0.6963102735412232, "eval_loss": 1.0716960430145264, "eval_runtime": 2346.7734, "eval_samples_per_second": 32.079, "eval_steps_per_second": 16.04, "step": 118000 }, { "acc": 0.73863525, "epoch": 1.3200474801012798, "grad_norm": 6.28125, "learning_rate": 2.840792770458899e-06, "loss": 1.02788239, "memory(GiB)": 142.32, "step": 118020, "train_speed(iter/s)": 0.285341 }, { "acc": 0.74477663, "epoch": 1.3202711790472383, "grad_norm": 5.8125, "learning_rate": 2.839124833951107e-06, "loss": 1.01628971, "memory(GiB)": 142.32, "step": 118040, "train_speed(iter/s)": 0.285358 }, { "acc": 0.72360907, "epoch": 1.3204948779931969, "grad_norm": 5.03125, "learning_rate": 2.8374571930819237e-06, "loss": 1.10236511, "memory(GiB)": 142.32, "step": 118060, "train_speed(iter/s)": 0.285374 }, { "acc": 0.73243799, "epoch": 1.3207185769391554, "grad_norm": 6.15625, "learning_rate": 2.8357898480795047e-06, "loss": 1.07161751, "memory(GiB)": 142.32, "step": 118080, "train_speed(iter/s)": 0.285388 }, { "acc": 0.73570447, "epoch": 1.320942275885114, "grad_norm": 6.375, "learning_rate": 2.83412279917197e-06, "loss": 1.04362907, "memory(GiB)": 142.32, "step": 118100, "train_speed(iter/s)": 0.285405 }, { "acc": 0.72929068, "epoch": 1.3211659748310725, "grad_norm": 5.90625, "learning_rate": 2.832456046587392e-06, "loss": 1.0863102, "memory(GiB)": 142.32, "step": 118120, "train_speed(iter/s)": 0.285419 }, { "acc": 0.73837757, "epoch": 1.321389673777031, "grad_norm": 4.875, "learning_rate": 2.83078959055381e-06, "loss": 1.03658762, "memory(GiB)": 142.32, "step": 118140, "train_speed(iter/s)": 0.285435 }, { "acc": 0.74572892, "epoch": 1.3216133727229895, "grad_norm": 6.03125, "learning_rate": 2.829123431299217e-06, "loss": 1.00703297, "memory(GiB)": 142.32, "step": 118160, "train_speed(iter/s)": 0.285452 }, { "acc": 0.73306179, "epoch": 1.321837071668948, "grad_norm": 5.5625, "learning_rate": 2.827457569051566e-06, "loss": 1.05424547, "memory(GiB)": 142.32, "step": 118180, "train_speed(iter/s)": 0.285469 }, { "acc": 0.74092131, "epoch": 1.3220607706149066, "grad_norm": 7.1875, "learning_rate": 2.825792004038774e-06, "loss": 1.00546894, "memory(GiB)": 142.32, "step": 118200, "train_speed(iter/s)": 0.285483 }, { "acc": 0.74285583, "epoch": 1.322284469560865, "grad_norm": 7.09375, "learning_rate": 2.8241267364887103e-06, "loss": 1.01541767, "memory(GiB)": 142.32, "step": 118220, "train_speed(iter/s)": 0.285501 }, { "acc": 0.73344021, "epoch": 1.3225081685068236, "grad_norm": 6.1875, "learning_rate": 2.822461766629212e-06, "loss": 1.06569233, "memory(GiB)": 142.32, "step": 118240, "train_speed(iter/s)": 0.285517 }, { "acc": 0.737357, "epoch": 1.3227318674527822, "grad_norm": 5.71875, "learning_rate": 2.820797094688068e-06, "loss": 1.03374538, "memory(GiB)": 142.32, "step": 118260, "train_speed(iter/s)": 0.285533 }, { "acc": 0.73880467, "epoch": 1.3229555663987407, "grad_norm": 6.28125, "learning_rate": 2.8191327208930276e-06, "loss": 1.05163717, "memory(GiB)": 142.32, "step": 118280, "train_speed(iter/s)": 0.28555 }, { "acc": 0.73012495, "epoch": 1.3231792653446992, "grad_norm": 5.34375, "learning_rate": 2.8174686454718048e-06, "loss": 1.08312044, "memory(GiB)": 142.32, "step": 118300, "train_speed(iter/s)": 0.285567 }, { "acc": 0.71980352, "epoch": 1.3234029642906577, "grad_norm": 6.71875, "learning_rate": 2.8158048686520647e-06, "loss": 1.12874393, "memory(GiB)": 142.32, "step": 118320, "train_speed(iter/s)": 0.285584 }, { "acc": 0.73926907, "epoch": 1.3236266632366163, "grad_norm": 5.71875, "learning_rate": 2.814141390661439e-06, "loss": 1.02254438, "memory(GiB)": 142.32, "step": 118340, "train_speed(iter/s)": 0.285601 }, { "acc": 0.74598899, "epoch": 1.3238503621825748, "grad_norm": 6.5625, "learning_rate": 2.812478211727515e-06, "loss": 0.99510775, "memory(GiB)": 142.32, "step": 118360, "train_speed(iter/s)": 0.285617 }, { "acc": 0.74143248, "epoch": 1.3240740611285333, "grad_norm": 6.03125, "learning_rate": 2.8108153320778385e-06, "loss": 1.04077234, "memory(GiB)": 142.32, "step": 118380, "train_speed(iter/s)": 0.285634 }, { "acc": 0.7292428, "epoch": 1.3242977600744918, "grad_norm": 6.84375, "learning_rate": 2.809152751939915e-06, "loss": 1.08979454, "memory(GiB)": 142.32, "step": 118400, "train_speed(iter/s)": 0.285649 }, { "acc": 0.73330102, "epoch": 1.3245214590204504, "grad_norm": 6.34375, "learning_rate": 2.8074904715412084e-06, "loss": 1.06016998, "memory(GiB)": 142.32, "step": 118420, "train_speed(iter/s)": 0.285665 }, { "acc": 0.73428421, "epoch": 1.324745157966409, "grad_norm": 5.1875, "learning_rate": 2.805828491109145e-06, "loss": 1.07259369, "memory(GiB)": 142.32, "step": 118440, "train_speed(iter/s)": 0.285684 }, { "acc": 0.73356657, "epoch": 1.3249688569123674, "grad_norm": 5.53125, "learning_rate": 2.804166810871103e-06, "loss": 1.05018568, "memory(GiB)": 142.32, "step": 118460, "train_speed(iter/s)": 0.285699 }, { "acc": 0.73235292, "epoch": 1.325192555858326, "grad_norm": 6.8125, "learning_rate": 2.8025054310544297e-06, "loss": 1.0781105, "memory(GiB)": 142.32, "step": 118480, "train_speed(iter/s)": 0.285715 }, { "acc": 0.72706656, "epoch": 1.3254162548042845, "grad_norm": 6.0, "learning_rate": 2.800844351886423e-06, "loss": 1.10261745, "memory(GiB)": 142.32, "step": 118500, "train_speed(iter/s)": 0.285733 }, { "acc": 0.74416533, "epoch": 1.325639953750243, "grad_norm": 6.46875, "learning_rate": 2.79918357359434e-06, "loss": 1.01653709, "memory(GiB)": 142.32, "step": 118520, "train_speed(iter/s)": 0.285748 }, { "acc": 0.73726053, "epoch": 1.3258636526962015, "grad_norm": 4.875, "learning_rate": 2.7975230964054033e-06, "loss": 1.05003815, "memory(GiB)": 142.32, "step": 118540, "train_speed(iter/s)": 0.285763 }, { "acc": 0.74457221, "epoch": 1.32608735164216, "grad_norm": 6.0625, "learning_rate": 2.795862920546785e-06, "loss": 1.00791454, "memory(GiB)": 142.32, "step": 118560, "train_speed(iter/s)": 0.285779 }, { "acc": 0.7476984, "epoch": 1.3263110505881186, "grad_norm": 6.375, "learning_rate": 2.794203046245626e-06, "loss": 0.99743843, "memory(GiB)": 142.32, "step": 118580, "train_speed(iter/s)": 0.285796 }, { "acc": 0.72415957, "epoch": 1.3265347495340771, "grad_norm": 6.71875, "learning_rate": 2.792543473729018e-06, "loss": 1.12570829, "memory(GiB)": 142.32, "step": 118600, "train_speed(iter/s)": 0.285813 }, { "acc": 0.7442584, "epoch": 1.3267584484800357, "grad_norm": 6.78125, "learning_rate": 2.7908842032240133e-06, "loss": 1.02159576, "memory(GiB)": 142.32, "step": 118620, "train_speed(iter/s)": 0.28583 }, { "acc": 0.7338233, "epoch": 1.3269821474259942, "grad_norm": 4.90625, "learning_rate": 2.7892252349576264e-06, "loss": 1.06273251, "memory(GiB)": 142.32, "step": 118640, "train_speed(iter/s)": 0.285847 }, { "acc": 0.73399553, "epoch": 1.3272058463719527, "grad_norm": 4.71875, "learning_rate": 2.7875665691568256e-06, "loss": 1.06187363, "memory(GiB)": 142.32, "step": 118660, "train_speed(iter/s)": 0.285863 }, { "acc": 0.73939962, "epoch": 1.3274295453179112, "grad_norm": 5.8125, "learning_rate": 2.785908206048542e-06, "loss": 1.02488136, "memory(GiB)": 142.32, "step": 118680, "train_speed(iter/s)": 0.285879 }, { "acc": 0.73803787, "epoch": 1.3276532442638698, "grad_norm": 6.21875, "learning_rate": 2.784250145859663e-06, "loss": 1.0400281, "memory(GiB)": 142.32, "step": 118700, "train_speed(iter/s)": 0.285895 }, { "acc": 0.73303242, "epoch": 1.3278769432098283, "grad_norm": 5.375, "learning_rate": 2.7825923888170325e-06, "loss": 1.05903625, "memory(GiB)": 142.32, "step": 118720, "train_speed(iter/s)": 0.285913 }, { "acc": 0.74877324, "epoch": 1.3281006421557868, "grad_norm": 6.75, "learning_rate": 2.7809349351474592e-06, "loss": 0.99005013, "memory(GiB)": 142.32, "step": 118740, "train_speed(iter/s)": 0.285931 }, { "acc": 0.74080181, "epoch": 1.3283243411017454, "grad_norm": 6.5625, "learning_rate": 2.7792777850777026e-06, "loss": 1.02688808, "memory(GiB)": 142.32, "step": 118760, "train_speed(iter/s)": 0.285947 }, { "acc": 0.74348941, "epoch": 1.3285480400477039, "grad_norm": 5.9375, "learning_rate": 2.777620938834488e-06, "loss": 1.01061192, "memory(GiB)": 142.32, "step": 118780, "train_speed(iter/s)": 0.285962 }, { "acc": 0.7471571, "epoch": 1.3287717389936624, "grad_norm": 7.1875, "learning_rate": 2.775964396644495e-06, "loss": 0.98062687, "memory(GiB)": 142.32, "step": 118800, "train_speed(iter/s)": 0.285977 }, { "acc": 0.7321424, "epoch": 1.328995437939621, "grad_norm": 6.125, "learning_rate": 2.774308158734358e-06, "loss": 1.07183857, "memory(GiB)": 142.32, "step": 118820, "train_speed(iter/s)": 0.285992 }, { "acc": 0.72898808, "epoch": 1.3292191368855795, "grad_norm": 6.28125, "learning_rate": 2.7726522253306804e-06, "loss": 1.08939428, "memory(GiB)": 142.32, "step": 118840, "train_speed(iter/s)": 0.286009 }, { "acc": 0.73726349, "epoch": 1.329442835831538, "grad_norm": 4.78125, "learning_rate": 2.7709965966600116e-06, "loss": 1.04537621, "memory(GiB)": 142.32, "step": 118860, "train_speed(iter/s)": 0.286025 }, { "acc": 0.73649545, "epoch": 1.3296665347774965, "grad_norm": 5.25, "learning_rate": 2.769341272948871e-06, "loss": 1.04586973, "memory(GiB)": 142.32, "step": 118880, "train_speed(iter/s)": 0.28604 }, { "acc": 0.74334612, "epoch": 1.329890233723455, "grad_norm": 5.8125, "learning_rate": 2.7676862544237275e-06, "loss": 1.02411175, "memory(GiB)": 142.32, "step": 118900, "train_speed(iter/s)": 0.286057 }, { "acc": 0.73451862, "epoch": 1.3301139326694136, "grad_norm": 5.46875, "learning_rate": 2.7660315413110096e-06, "loss": 1.05972614, "memory(GiB)": 142.32, "step": 118920, "train_speed(iter/s)": 0.286073 }, { "acc": 0.74016371, "epoch": 1.330337631615372, "grad_norm": 5.59375, "learning_rate": 2.7643771338371096e-06, "loss": 1.03666639, "memory(GiB)": 142.32, "step": 118940, "train_speed(iter/s)": 0.28609 }, { "acc": 0.7442872, "epoch": 1.3305613305613306, "grad_norm": 6.625, "learning_rate": 2.7627230322283698e-06, "loss": 1.0091753, "memory(GiB)": 142.32, "step": 118960, "train_speed(iter/s)": 0.286107 }, { "acc": 0.73187542, "epoch": 1.3307850295072892, "grad_norm": 6.5625, "learning_rate": 2.7610692367110993e-06, "loss": 1.06970692, "memory(GiB)": 142.32, "step": 118980, "train_speed(iter/s)": 0.286122 }, { "acc": 0.75019908, "epoch": 1.3310087284532477, "grad_norm": 7.5, "learning_rate": 2.759415747511559e-06, "loss": 0.99257584, "memory(GiB)": 142.32, "step": 119000, "train_speed(iter/s)": 0.286139 }, { "acc": 0.73278065, "epoch": 1.3312324273992062, "grad_norm": 5.90625, "learning_rate": 2.757762564855968e-06, "loss": 1.05773468, "memory(GiB)": 142.32, "step": 119020, "train_speed(iter/s)": 0.286152 }, { "acc": 0.74051685, "epoch": 1.3314561263451647, "grad_norm": 7.625, "learning_rate": 2.756109688970509e-06, "loss": 1.03145599, "memory(GiB)": 142.32, "step": 119040, "train_speed(iter/s)": 0.286169 }, { "acc": 0.74399586, "epoch": 1.3316798252911233, "grad_norm": 6.0625, "learning_rate": 2.754457120081315e-06, "loss": 1.00963993, "memory(GiB)": 142.32, "step": 119060, "train_speed(iter/s)": 0.286185 }, { "acc": 0.74117036, "epoch": 1.3319035242370818, "grad_norm": 6.6875, "learning_rate": 2.752804858414485e-06, "loss": 1.02651997, "memory(GiB)": 142.32, "step": 119080, "train_speed(iter/s)": 0.286202 }, { "acc": 0.73248529, "epoch": 1.3321272231830403, "grad_norm": 6.40625, "learning_rate": 2.751152904196068e-06, "loss": 1.06666527, "memory(GiB)": 142.32, "step": 119100, "train_speed(iter/s)": 0.286218 }, { "acc": 0.73213162, "epoch": 1.3323509221289989, "grad_norm": 7.0625, "learning_rate": 2.74950125765208e-06, "loss": 1.08167706, "memory(GiB)": 142.32, "step": 119120, "train_speed(iter/s)": 0.286234 }, { "acc": 0.74002962, "epoch": 1.3325746210749574, "grad_norm": 5.875, "learning_rate": 2.747849919008487e-06, "loss": 1.04145393, "memory(GiB)": 142.32, "step": 119140, "train_speed(iter/s)": 0.286248 }, { "acc": 0.73427629, "epoch": 1.332798320020916, "grad_norm": 6.40625, "learning_rate": 2.746198888491213e-06, "loss": 1.07308197, "memory(GiB)": 142.32, "step": 119160, "train_speed(iter/s)": 0.286262 }, { "acc": 0.73333502, "epoch": 1.3330220189668744, "grad_norm": 5.65625, "learning_rate": 2.7445481663261477e-06, "loss": 1.07304344, "memory(GiB)": 142.32, "step": 119180, "train_speed(iter/s)": 0.286276 }, { "acc": 0.74813719, "epoch": 1.333245717912833, "grad_norm": 6.53125, "learning_rate": 2.742897752739129e-06, "loss": 1.00163736, "memory(GiB)": 142.32, "step": 119200, "train_speed(iter/s)": 0.286292 }, { "acc": 0.72502694, "epoch": 1.3334694168587915, "grad_norm": 7.3125, "learning_rate": 2.741247647955961e-06, "loss": 1.09716206, "memory(GiB)": 142.32, "step": 119220, "train_speed(iter/s)": 0.286309 }, { "acc": 0.7320322, "epoch": 1.33369311580475, "grad_norm": 6.21875, "learning_rate": 2.7395978522023996e-06, "loss": 1.09267483, "memory(GiB)": 142.32, "step": 119240, "train_speed(iter/s)": 0.286324 }, { "acc": 0.7369154, "epoch": 1.3339168147507086, "grad_norm": 7.25, "learning_rate": 2.737948365704159e-06, "loss": 1.04848461, "memory(GiB)": 142.32, "step": 119260, "train_speed(iter/s)": 0.286339 }, { "acc": 0.73465548, "epoch": 1.334140513696667, "grad_norm": 6.96875, "learning_rate": 2.736299188686916e-06, "loss": 1.05454807, "memory(GiB)": 142.32, "step": 119280, "train_speed(iter/s)": 0.286356 }, { "acc": 0.74299855, "epoch": 1.3343642126426256, "grad_norm": 6.25, "learning_rate": 2.7346503213762977e-06, "loss": 1.01194935, "memory(GiB)": 142.32, "step": 119300, "train_speed(iter/s)": 0.286373 }, { "acc": 0.74206524, "epoch": 1.3345879115885841, "grad_norm": 7.125, "learning_rate": 2.7330017639978968e-06, "loss": 1.01176872, "memory(GiB)": 142.32, "step": 119320, "train_speed(iter/s)": 0.28639 }, { "acc": 0.74686079, "epoch": 1.3348116105345427, "grad_norm": 6.65625, "learning_rate": 2.7313535167772575e-06, "loss": 0.99256344, "memory(GiB)": 142.32, "step": 119340, "train_speed(iter/s)": 0.286407 }, { "acc": 0.74525166, "epoch": 1.3350353094805012, "grad_norm": 6.0, "learning_rate": 2.729705579939881e-06, "loss": 1.02260246, "memory(GiB)": 142.32, "step": 119360, "train_speed(iter/s)": 0.286425 }, { "acc": 0.73091865, "epoch": 1.3352590084264597, "grad_norm": 5.75, "learning_rate": 2.728057953711234e-06, "loss": 1.08453407, "memory(GiB)": 142.32, "step": 119380, "train_speed(iter/s)": 0.286442 }, { "acc": 0.74732838, "epoch": 1.3354827073724183, "grad_norm": 5.9375, "learning_rate": 2.72641063831673e-06, "loss": 0.99470768, "memory(GiB)": 142.32, "step": 119400, "train_speed(iter/s)": 0.286457 }, { "acc": 0.74405546, "epoch": 1.3357064063183768, "grad_norm": 5.625, "learning_rate": 2.7247636339817496e-06, "loss": 1.01766415, "memory(GiB)": 142.32, "step": 119420, "train_speed(iter/s)": 0.286475 }, { "acc": 0.73989334, "epoch": 1.3359301052643353, "grad_norm": 5.3125, "learning_rate": 2.723116940931625e-06, "loss": 1.02975731, "memory(GiB)": 142.32, "step": 119440, "train_speed(iter/s)": 0.286493 }, { "acc": 0.71743879, "epoch": 1.3361538042102938, "grad_norm": 6.03125, "learning_rate": 2.7214705593916453e-06, "loss": 1.12921009, "memory(GiB)": 142.32, "step": 119460, "train_speed(iter/s)": 0.286507 }, { "acc": 0.74015951, "epoch": 1.3363775031562524, "grad_norm": 6.71875, "learning_rate": 2.719824489587062e-06, "loss": 1.04849167, "memory(GiB)": 142.32, "step": 119480, "train_speed(iter/s)": 0.286525 }, { "acc": 0.73230085, "epoch": 1.336601202102211, "grad_norm": 5.6875, "learning_rate": 2.7181787317430784e-06, "loss": 1.06998749, "memory(GiB)": 142.32, "step": 119500, "train_speed(iter/s)": 0.286542 }, { "acc": 0.75043421, "epoch": 1.3368249010481694, "grad_norm": 6.5, "learning_rate": 2.716533286084861e-06, "loss": 0.98112507, "memory(GiB)": 142.32, "step": 119520, "train_speed(iter/s)": 0.286559 }, { "acc": 0.73806758, "epoch": 1.337048599994128, "grad_norm": 6.5625, "learning_rate": 2.7148881528375282e-06, "loss": 1.04199944, "memory(GiB)": 142.32, "step": 119540, "train_speed(iter/s)": 0.286575 }, { "acc": 0.7452178, "epoch": 1.3372722989400865, "grad_norm": 5.9375, "learning_rate": 2.7132433322261554e-06, "loss": 1.00355015, "memory(GiB)": 142.32, "step": 119560, "train_speed(iter/s)": 0.286591 }, { "acc": 0.74141912, "epoch": 1.337495997886045, "grad_norm": 6.0, "learning_rate": 2.7115988244757847e-06, "loss": 1.02251911, "memory(GiB)": 142.32, "step": 119580, "train_speed(iter/s)": 0.286608 }, { "acc": 0.72796898, "epoch": 1.3377196968320035, "grad_norm": 7.34375, "learning_rate": 2.7099546298113986e-06, "loss": 1.08357677, "memory(GiB)": 142.32, "step": 119600, "train_speed(iter/s)": 0.286622 }, { "acc": 0.73948293, "epoch": 1.337943395777962, "grad_norm": 7.21875, "learning_rate": 2.7083107484579547e-06, "loss": 1.03761482, "memory(GiB)": 142.32, "step": 119620, "train_speed(iter/s)": 0.286637 }, { "acc": 0.75313025, "epoch": 1.3381670947239206, "grad_norm": 6.0625, "learning_rate": 2.7066671806403533e-06, "loss": 0.98651085, "memory(GiB)": 142.32, "step": 119640, "train_speed(iter/s)": 0.286655 }, { "acc": 0.7327672, "epoch": 1.3383907936698791, "grad_norm": 5.59375, "learning_rate": 2.705023926583463e-06, "loss": 1.06842518, "memory(GiB)": 142.32, "step": 119660, "train_speed(iter/s)": 0.28667 }, { "acc": 0.73606014, "epoch": 1.3386144926158376, "grad_norm": 5.78125, "learning_rate": 2.703380986512103e-06, "loss": 1.04008207, "memory(GiB)": 142.32, "step": 119680, "train_speed(iter/s)": 0.286686 }, { "acc": 0.72670317, "epoch": 1.3388381915617962, "grad_norm": 6.46875, "learning_rate": 2.7017383606510483e-06, "loss": 1.09124336, "memory(GiB)": 142.32, "step": 119700, "train_speed(iter/s)": 0.286703 }, { "acc": 0.74052963, "epoch": 1.3390618905077547, "grad_norm": 5.375, "learning_rate": 2.7000960492250365e-06, "loss": 1.03242416, "memory(GiB)": 142.32, "step": 119720, "train_speed(iter/s)": 0.286719 }, { "acc": 0.73736162, "epoch": 1.3392855894537132, "grad_norm": 6.0625, "learning_rate": 2.6984540524587576e-06, "loss": 1.0475914, "memory(GiB)": 142.32, "step": 119740, "train_speed(iter/s)": 0.286736 }, { "acc": 0.73330603, "epoch": 1.3395092883996718, "grad_norm": 6.65625, "learning_rate": 2.6968123705768624e-06, "loss": 1.06470985, "memory(GiB)": 142.32, "step": 119760, "train_speed(iter/s)": 0.286751 }, { "acc": 0.74560194, "epoch": 1.3397329873456303, "grad_norm": 5.625, "learning_rate": 2.6951710038039545e-06, "loss": 1.02423639, "memory(GiB)": 142.32, "step": 119780, "train_speed(iter/s)": 0.286768 }, { "acc": 0.73904715, "epoch": 1.3399566862915888, "grad_norm": 7.25, "learning_rate": 2.693529952364595e-06, "loss": 1.03837652, "memory(GiB)": 142.32, "step": 119800, "train_speed(iter/s)": 0.286785 }, { "acc": 0.73359146, "epoch": 1.3401803852375473, "grad_norm": 6.46875, "learning_rate": 2.6918892164833075e-06, "loss": 1.06061249, "memory(GiB)": 142.32, "step": 119820, "train_speed(iter/s)": 0.286802 }, { "acc": 0.7319912, "epoch": 1.3404040841835059, "grad_norm": 5.6875, "learning_rate": 2.690248796384564e-06, "loss": 1.05630693, "memory(GiB)": 142.32, "step": 119840, "train_speed(iter/s)": 0.286819 }, { "acc": 0.73665514, "epoch": 1.3406277831294644, "grad_norm": 5.59375, "learning_rate": 2.6886086922928012e-06, "loss": 1.05023212, "memory(GiB)": 142.32, "step": 119860, "train_speed(iter/s)": 0.286835 }, { "acc": 0.72726998, "epoch": 1.340851482075423, "grad_norm": 5.53125, "learning_rate": 2.686968904432406e-06, "loss": 1.09915686, "memory(GiB)": 142.32, "step": 119880, "train_speed(iter/s)": 0.28685 }, { "acc": 0.73116808, "epoch": 1.3410751810213815, "grad_norm": 5.03125, "learning_rate": 2.6853294330277237e-06, "loss": 1.05509014, "memory(GiB)": 142.32, "step": 119900, "train_speed(iter/s)": 0.286867 }, { "acc": 0.73456497, "epoch": 1.34129887996734, "grad_norm": 5.5625, "learning_rate": 2.6836902783030615e-06, "loss": 1.05197525, "memory(GiB)": 142.32, "step": 119920, "train_speed(iter/s)": 0.286883 }, { "acc": 0.7398056, "epoch": 1.3415225789132985, "grad_norm": 6.46875, "learning_rate": 2.6820514404826747e-06, "loss": 1.02382336, "memory(GiB)": 142.32, "step": 119940, "train_speed(iter/s)": 0.2869 }, { "acc": 0.73344927, "epoch": 1.341746277859257, "grad_norm": 6.75, "learning_rate": 2.6804129197907833e-06, "loss": 1.06043577, "memory(GiB)": 142.32, "step": 119960, "train_speed(iter/s)": 0.286917 }, { "acc": 0.73086643, "epoch": 1.3419699768052156, "grad_norm": 6.0, "learning_rate": 2.6787747164515603e-06, "loss": 1.05949793, "memory(GiB)": 142.32, "step": 119980, "train_speed(iter/s)": 0.286934 }, { "acc": 0.73670034, "epoch": 1.342193675751174, "grad_norm": 6.75, "learning_rate": 2.6771368306891318e-06, "loss": 1.0467514, "memory(GiB)": 142.32, "step": 120000, "train_speed(iter/s)": 0.28695 }, { "epoch": 1.342193675751174, "eval_acc": 0.6963128369672855, "eval_loss": 1.0716651678085327, "eval_runtime": 2340.4047, "eval_samples_per_second": 32.167, "eval_steps_per_second": 16.084, "step": 120000 }, { "acc": 0.74020824, "epoch": 1.3424173746971326, "grad_norm": 7.125, "learning_rate": 2.675499262727588e-06, "loss": 1.03327541, "memory(GiB)": 142.32, "step": 120020, "train_speed(iter/s)": 0.285335 }, { "acc": 0.73855305, "epoch": 1.3426410736430912, "grad_norm": 6.8125, "learning_rate": 2.6738620127909676e-06, "loss": 1.04985199, "memory(GiB)": 142.32, "step": 120040, "train_speed(iter/s)": 0.285351 }, { "acc": 0.74380851, "epoch": 1.3428647725890497, "grad_norm": 6.78125, "learning_rate": 2.6722250811032735e-06, "loss": 1.02056799, "memory(GiB)": 142.32, "step": 120060, "train_speed(iter/s)": 0.285368 }, { "acc": 0.73888631, "epoch": 1.3430884715350082, "grad_norm": 6.09375, "learning_rate": 2.670588467888461e-06, "loss": 1.04616661, "memory(GiB)": 142.32, "step": 120080, "train_speed(iter/s)": 0.285384 }, { "acc": 0.73331556, "epoch": 1.3433121704809667, "grad_norm": 7.5625, "learning_rate": 2.6689521733704382e-06, "loss": 1.06477127, "memory(GiB)": 142.32, "step": 120100, "train_speed(iter/s)": 0.285401 }, { "acc": 0.73106079, "epoch": 1.3435358694269253, "grad_norm": 5.5625, "learning_rate": 2.667316197773079e-06, "loss": 1.05683918, "memory(GiB)": 142.32, "step": 120120, "train_speed(iter/s)": 0.285416 }, { "acc": 0.74085817, "epoch": 1.3437595683728838, "grad_norm": 5.90625, "learning_rate": 2.6656805413202036e-06, "loss": 1.03236198, "memory(GiB)": 142.32, "step": 120140, "train_speed(iter/s)": 0.285433 }, { "acc": 0.75065174, "epoch": 1.3439832673188423, "grad_norm": 5.96875, "learning_rate": 2.664045204235597e-06, "loss": 0.98562565, "memory(GiB)": 142.32, "step": 120160, "train_speed(iter/s)": 0.285451 }, { "acc": 0.74533997, "epoch": 1.3442069662648009, "grad_norm": 6.375, "learning_rate": 2.662410186742995e-06, "loss": 1.01126451, "memory(GiB)": 142.32, "step": 120180, "train_speed(iter/s)": 0.285468 }, { "acc": 0.73164854, "epoch": 1.3444306652107594, "grad_norm": 4.25, "learning_rate": 2.6607754890660892e-06, "loss": 1.07825432, "memory(GiB)": 142.32, "step": 120200, "train_speed(iter/s)": 0.285482 }, { "acc": 0.73079615, "epoch": 1.344654364156718, "grad_norm": 4.90625, "learning_rate": 2.6591411114285337e-06, "loss": 1.06799316, "memory(GiB)": 142.32, "step": 120220, "train_speed(iter/s)": 0.285499 }, { "acc": 0.74521208, "epoch": 1.3448780631026764, "grad_norm": 6.65625, "learning_rate": 2.657507054053931e-06, "loss": 1.00886736, "memory(GiB)": 142.32, "step": 120240, "train_speed(iter/s)": 0.285513 }, { "acc": 0.74173088, "epoch": 1.345101762048635, "grad_norm": 6.75, "learning_rate": 2.6558733171658473e-06, "loss": 1.01390018, "memory(GiB)": 142.32, "step": 120260, "train_speed(iter/s)": 0.285529 }, { "acc": 0.74740076, "epoch": 1.3453254609945935, "grad_norm": 6.15625, "learning_rate": 2.654239900987799e-06, "loss": 1.00902805, "memory(GiB)": 142.32, "step": 120280, "train_speed(iter/s)": 0.285547 }, { "acc": 0.72807589, "epoch": 1.345549159940552, "grad_norm": 5.9375, "learning_rate": 2.6526068057432585e-06, "loss": 1.07763062, "memory(GiB)": 142.32, "step": 120300, "train_speed(iter/s)": 0.285565 }, { "acc": 0.74989982, "epoch": 1.3457728588865105, "grad_norm": 6.90625, "learning_rate": 2.6509740316556616e-06, "loss": 0.99786081, "memory(GiB)": 142.32, "step": 120320, "train_speed(iter/s)": 0.28558 }, { "acc": 0.75047398, "epoch": 1.345996557832469, "grad_norm": 5.53125, "learning_rate": 2.6493415789483902e-06, "loss": 0.98140669, "memory(GiB)": 142.32, "step": 120340, "train_speed(iter/s)": 0.285596 }, { "acc": 0.75001397, "epoch": 1.3462202567784276, "grad_norm": 6.1875, "learning_rate": 2.647709447844792e-06, "loss": 0.97275324, "memory(GiB)": 142.32, "step": 120360, "train_speed(iter/s)": 0.285614 }, { "acc": 0.7376677, "epoch": 1.3464439557243861, "grad_norm": 6.1875, "learning_rate": 2.646077638568162e-06, "loss": 1.04817371, "memory(GiB)": 142.32, "step": 120380, "train_speed(iter/s)": 0.285628 }, { "acc": 0.73488407, "epoch": 1.3466676546703447, "grad_norm": 7.15625, "learning_rate": 2.644446151341755e-06, "loss": 1.05255489, "memory(GiB)": 142.32, "step": 120400, "train_speed(iter/s)": 0.285646 }, { "acc": 0.73729811, "epoch": 1.3468913536163032, "grad_norm": 5.5625, "learning_rate": 2.6428149863887854e-06, "loss": 1.05225563, "memory(GiB)": 142.32, "step": 120420, "train_speed(iter/s)": 0.285663 }, { "acc": 0.73001113, "epoch": 1.3471150525622617, "grad_norm": 6.125, "learning_rate": 2.641184143932416e-06, "loss": 1.08379841, "memory(GiB)": 142.32, "step": 120440, "train_speed(iter/s)": 0.285682 }, { "acc": 0.74820518, "epoch": 1.3473387515082202, "grad_norm": 5.53125, "learning_rate": 2.639553624195772e-06, "loss": 0.99007463, "memory(GiB)": 142.32, "step": 120460, "train_speed(iter/s)": 0.2857 }, { "acc": 0.74344244, "epoch": 1.3475624504541788, "grad_norm": 5.59375, "learning_rate": 2.6379234274019313e-06, "loss": 1.01451187, "memory(GiB)": 142.32, "step": 120480, "train_speed(iter/s)": 0.285718 }, { "acc": 0.7299499, "epoch": 1.3477861494001373, "grad_norm": 5.875, "learning_rate": 2.6362935537739254e-06, "loss": 1.07031507, "memory(GiB)": 142.32, "step": 120500, "train_speed(iter/s)": 0.285734 }, { "acc": 0.75556426, "epoch": 1.3480098483460958, "grad_norm": 5.75, "learning_rate": 2.6346640035347483e-06, "loss": 0.95681057, "memory(GiB)": 142.32, "step": 120520, "train_speed(iter/s)": 0.285753 }, { "acc": 0.73724461, "epoch": 1.3482335472920544, "grad_norm": 5.59375, "learning_rate": 2.633034776907342e-06, "loss": 1.03946114, "memory(GiB)": 142.32, "step": 120540, "train_speed(iter/s)": 0.285769 }, { "acc": 0.73939757, "epoch": 1.3484572462380129, "grad_norm": 6.5, "learning_rate": 2.631405874114612e-06, "loss": 1.0284008, "memory(GiB)": 142.32, "step": 120560, "train_speed(iter/s)": 0.285786 }, { "acc": 0.74400816, "epoch": 1.3486809451839714, "grad_norm": 5.375, "learning_rate": 2.629777295379414e-06, "loss": 1.01310387, "memory(GiB)": 142.32, "step": 120580, "train_speed(iter/s)": 0.285801 }, { "acc": 0.74215593, "epoch": 1.34890464412993, "grad_norm": 5.84375, "learning_rate": 2.628149040924558e-06, "loss": 1.02588072, "memory(GiB)": 142.32, "step": 120600, "train_speed(iter/s)": 0.285818 }, { "acc": 0.74549904, "epoch": 1.3491283430758885, "grad_norm": 5.09375, "learning_rate": 2.626521110972816e-06, "loss": 1.00265636, "memory(GiB)": 142.32, "step": 120620, "train_speed(iter/s)": 0.285836 }, { "acc": 0.7435905, "epoch": 1.349352042021847, "grad_norm": 6.59375, "learning_rate": 2.62489350574691e-06, "loss": 1.02638302, "memory(GiB)": 142.32, "step": 120640, "train_speed(iter/s)": 0.285853 }, { "acc": 0.74613161, "epoch": 1.3495757409678055, "grad_norm": 6.15625, "learning_rate": 2.623266225469522e-06, "loss": 1.01572113, "memory(GiB)": 142.32, "step": 120660, "train_speed(iter/s)": 0.285871 }, { "acc": 0.74311676, "epoch": 1.349799439913764, "grad_norm": 5.28125, "learning_rate": 2.621639270363285e-06, "loss": 1.03604031, "memory(GiB)": 142.32, "step": 120680, "train_speed(iter/s)": 0.285889 }, { "acc": 0.74519958, "epoch": 1.3500231388597226, "grad_norm": 7.5625, "learning_rate": 2.62001264065079e-06, "loss": 1.02297497, "memory(GiB)": 142.32, "step": 120700, "train_speed(iter/s)": 0.285905 }, { "acc": 0.73622465, "epoch": 1.350246837805681, "grad_norm": 5.65625, "learning_rate": 2.618386336554584e-06, "loss": 1.05297823, "memory(GiB)": 142.32, "step": 120720, "train_speed(iter/s)": 0.285922 }, { "acc": 0.72957506, "epoch": 1.3504705367516396, "grad_norm": 5.625, "learning_rate": 2.616760358297167e-06, "loss": 1.08158598, "memory(GiB)": 142.32, "step": 120740, "train_speed(iter/s)": 0.285938 }, { "acc": 0.74479294, "epoch": 1.3506942356975982, "grad_norm": 5.78125, "learning_rate": 2.615134706101001e-06, "loss": 0.98328953, "memory(GiB)": 142.32, "step": 120760, "train_speed(iter/s)": 0.285954 }, { "acc": 0.73744154, "epoch": 1.3509179346435567, "grad_norm": 6.09375, "learning_rate": 2.6135093801884913e-06, "loss": 1.04420748, "memory(GiB)": 142.32, "step": 120780, "train_speed(iter/s)": 0.285971 }, { "acc": 0.73594761, "epoch": 1.3511416335895152, "grad_norm": 5.28125, "learning_rate": 2.6118843807820118e-06, "loss": 1.05697813, "memory(GiB)": 142.32, "step": 120800, "train_speed(iter/s)": 0.28599 }, { "acc": 0.72755408, "epoch": 1.3513653325354738, "grad_norm": 6.5625, "learning_rate": 2.6102597081038816e-06, "loss": 1.09321613, "memory(GiB)": 142.32, "step": 120820, "train_speed(iter/s)": 0.286008 }, { "acc": 0.73742113, "epoch": 1.3515890314814323, "grad_norm": 6.65625, "learning_rate": 2.6086353623763796e-06, "loss": 1.03828697, "memory(GiB)": 142.32, "step": 120840, "train_speed(iter/s)": 0.286025 }, { "acc": 0.72542095, "epoch": 1.3518127304273908, "grad_norm": 5.65625, "learning_rate": 2.6070113438217413e-06, "loss": 1.09828167, "memory(GiB)": 142.32, "step": 120860, "train_speed(iter/s)": 0.286041 }, { "acc": 0.73877583, "epoch": 1.3520364293733493, "grad_norm": 6.375, "learning_rate": 2.6053876526621546e-06, "loss": 1.03411007, "memory(GiB)": 142.32, "step": 120880, "train_speed(iter/s)": 0.286057 }, { "acc": 0.74409399, "epoch": 1.3522601283193079, "grad_norm": 6.28125, "learning_rate": 2.6037642891197644e-06, "loss": 1.02322063, "memory(GiB)": 142.32, "step": 120900, "train_speed(iter/s)": 0.286073 }, { "acc": 0.73487287, "epoch": 1.3524838272652664, "grad_norm": 6.6875, "learning_rate": 2.60214125341667e-06, "loss": 1.04141445, "memory(GiB)": 142.32, "step": 120920, "train_speed(iter/s)": 0.286087 }, { "acc": 0.7353426, "epoch": 1.352707526211225, "grad_norm": 5.4375, "learning_rate": 2.600518545774924e-06, "loss": 1.05122433, "memory(GiB)": 142.32, "step": 120940, "train_speed(iter/s)": 0.286104 }, { "acc": 0.72186317, "epoch": 1.3529312251571834, "grad_norm": 6.625, "learning_rate": 2.598896166416539e-06, "loss": 1.11138554, "memory(GiB)": 142.32, "step": 120960, "train_speed(iter/s)": 0.286119 }, { "acc": 0.73628178, "epoch": 1.353154924103142, "grad_norm": 6.28125, "learning_rate": 2.5972741155634763e-06, "loss": 1.0554678, "memory(GiB)": 142.32, "step": 120980, "train_speed(iter/s)": 0.286134 }, { "acc": 0.73688135, "epoch": 1.3533786230491005, "grad_norm": 6.21875, "learning_rate": 2.595652393437659e-06, "loss": 1.03942089, "memory(GiB)": 142.32, "step": 121000, "train_speed(iter/s)": 0.286149 }, { "acc": 0.73653626, "epoch": 1.353602321995059, "grad_norm": 6.15625, "learning_rate": 2.59403100026096e-06, "loss": 1.04292526, "memory(GiB)": 142.32, "step": 121020, "train_speed(iter/s)": 0.286167 }, { "acc": 0.72635031, "epoch": 1.3538260209410176, "grad_norm": 6.34375, "learning_rate": 2.5924099362552085e-06, "loss": 1.10787697, "memory(GiB)": 142.32, "step": 121040, "train_speed(iter/s)": 0.286184 }, { "acc": 0.74273376, "epoch": 1.354049719886976, "grad_norm": 6.34375, "learning_rate": 2.590789201642192e-06, "loss": 1.02555456, "memory(GiB)": 142.32, "step": 121060, "train_speed(iter/s)": 0.2862 }, { "acc": 0.74707427, "epoch": 1.3542734188329346, "grad_norm": 7.1875, "learning_rate": 2.589168796643645e-06, "loss": 0.99853821, "memory(GiB)": 142.32, "step": 121080, "train_speed(iter/s)": 0.286218 }, { "acc": 0.73640642, "epoch": 1.3544971177788931, "grad_norm": 6.59375, "learning_rate": 2.587548721481269e-06, "loss": 1.04586678, "memory(GiB)": 142.32, "step": 121100, "train_speed(iter/s)": 0.286234 }, { "acc": 0.72685566, "epoch": 1.3547208167248517, "grad_norm": 7.46875, "learning_rate": 2.5859289763767088e-06, "loss": 1.09524193, "memory(GiB)": 142.32, "step": 121120, "train_speed(iter/s)": 0.286249 }, { "acc": 0.72711892, "epoch": 1.3549445156708102, "grad_norm": 6.78125, "learning_rate": 2.5843095615515678e-06, "loss": 1.09009418, "memory(GiB)": 142.32, "step": 121140, "train_speed(iter/s)": 0.286266 }, { "acc": 0.72875957, "epoch": 1.3551682146167687, "grad_norm": 5.53125, "learning_rate": 2.582690477227409e-06, "loss": 1.08554955, "memory(GiB)": 142.32, "step": 121160, "train_speed(iter/s)": 0.286282 }, { "acc": 0.7272068, "epoch": 1.3553919135627273, "grad_norm": 6.71875, "learning_rate": 2.581071723625742e-06, "loss": 1.11450939, "memory(GiB)": 142.32, "step": 121180, "train_speed(iter/s)": 0.286299 }, { "acc": 0.73201494, "epoch": 1.3556156125086858, "grad_norm": 6.4375, "learning_rate": 2.57945330096804e-06, "loss": 1.06770296, "memory(GiB)": 142.32, "step": 121200, "train_speed(iter/s)": 0.286314 }, { "acc": 0.74056349, "epoch": 1.3558393114546443, "grad_norm": 5.625, "learning_rate": 2.577835209475724e-06, "loss": 1.04478817, "memory(GiB)": 142.32, "step": 121220, "train_speed(iter/s)": 0.286327 }, { "acc": 0.75096035, "epoch": 1.3560630104006028, "grad_norm": 7.75, "learning_rate": 2.5762174493701696e-06, "loss": 0.99063034, "memory(GiB)": 142.32, "step": 121240, "train_speed(iter/s)": 0.286343 }, { "acc": 0.73528514, "epoch": 1.3562867093465614, "grad_norm": 6.34375, "learning_rate": 2.5746000208727145e-06, "loss": 1.06230221, "memory(GiB)": 142.32, "step": 121260, "train_speed(iter/s)": 0.28636 }, { "acc": 0.75089579, "epoch": 1.35651040829252, "grad_norm": 6.09375, "learning_rate": 2.572982924204641e-06, "loss": 0.99010715, "memory(GiB)": 142.32, "step": 121280, "train_speed(iter/s)": 0.286375 }, { "acc": 0.74866972, "epoch": 1.3567341072384784, "grad_norm": 8.8125, "learning_rate": 2.5713661595871965e-06, "loss": 0.98584518, "memory(GiB)": 142.32, "step": 121300, "train_speed(iter/s)": 0.286391 }, { "acc": 0.73765559, "epoch": 1.356957806184437, "grad_norm": 6.375, "learning_rate": 2.569749727241574e-06, "loss": 1.05255718, "memory(GiB)": 142.32, "step": 121320, "train_speed(iter/s)": 0.286404 }, { "acc": 0.74593019, "epoch": 1.3571815051303955, "grad_norm": 5.65625, "learning_rate": 2.5681336273889225e-06, "loss": 1.01326275, "memory(GiB)": 142.32, "step": 121340, "train_speed(iter/s)": 0.286421 }, { "acc": 0.72710562, "epoch": 1.357405204076354, "grad_norm": 6.40625, "learning_rate": 2.5665178602503528e-06, "loss": 1.08182735, "memory(GiB)": 142.32, "step": 121360, "train_speed(iter/s)": 0.286439 }, { "acc": 0.73792572, "epoch": 1.3576289030223125, "grad_norm": 6.625, "learning_rate": 2.56490242604692e-06, "loss": 1.05718021, "memory(GiB)": 142.32, "step": 121380, "train_speed(iter/s)": 0.286453 }, { "acc": 0.74120564, "epoch": 1.357852601968271, "grad_norm": 6.75, "learning_rate": 2.563287324999643e-06, "loss": 1.01425934, "memory(GiB)": 142.32, "step": 121400, "train_speed(iter/s)": 0.286469 }, { "acc": 0.74272542, "epoch": 1.3580763009142296, "grad_norm": 6.4375, "learning_rate": 2.561672557329489e-06, "loss": 1.0248209, "memory(GiB)": 142.32, "step": 121420, "train_speed(iter/s)": 0.286481 }, { "acc": 0.74889364, "epoch": 1.3582999998601881, "grad_norm": 5.65625, "learning_rate": 2.5600581232573782e-06, "loss": 1.00033836, "memory(GiB)": 142.32, "step": 121440, "train_speed(iter/s)": 0.286497 }, { "acc": 0.74109564, "epoch": 1.3585236988061467, "grad_norm": 6.5625, "learning_rate": 2.558444023004193e-06, "loss": 1.02817898, "memory(GiB)": 142.32, "step": 121460, "train_speed(iter/s)": 0.286514 }, { "acc": 0.74277434, "epoch": 1.3587473977521052, "grad_norm": 6.40625, "learning_rate": 2.5568302567907623e-06, "loss": 1.0179678, "memory(GiB)": 142.32, "step": 121480, "train_speed(iter/s)": 0.286531 }, { "acc": 0.73275695, "epoch": 1.3589710966980637, "grad_norm": 6.375, "learning_rate": 2.5552168248378737e-06, "loss": 1.06490784, "memory(GiB)": 142.32, "step": 121500, "train_speed(iter/s)": 0.286548 }, { "acc": 0.73885922, "epoch": 1.3591947956440222, "grad_norm": 6.25, "learning_rate": 2.5536037273662686e-06, "loss": 1.04412212, "memory(GiB)": 142.32, "step": 121520, "train_speed(iter/s)": 0.286564 }, { "acc": 0.74026985, "epoch": 1.3594184945899808, "grad_norm": 5.96875, "learning_rate": 2.551990964596639e-06, "loss": 1.04324379, "memory(GiB)": 142.32, "step": 121540, "train_speed(iter/s)": 0.28658 }, { "acc": 0.75613537, "epoch": 1.3596421935359393, "grad_norm": 5.34375, "learning_rate": 2.550378536749637e-06, "loss": 0.96166611, "memory(GiB)": 142.32, "step": 121560, "train_speed(iter/s)": 0.286596 }, { "acc": 0.7377182, "epoch": 1.3598658924818978, "grad_norm": 5.15625, "learning_rate": 2.548766444045862e-06, "loss": 1.03525896, "memory(GiB)": 142.32, "step": 121580, "train_speed(iter/s)": 0.286612 }, { "acc": 0.73570161, "epoch": 1.3600895914278563, "grad_norm": 5.75, "learning_rate": 2.5471546867058763e-06, "loss": 1.04082413, "memory(GiB)": 142.32, "step": 121600, "train_speed(iter/s)": 0.286629 }, { "acc": 0.73855639, "epoch": 1.3603132903738149, "grad_norm": 6.28125, "learning_rate": 2.5455432649501883e-06, "loss": 1.05045204, "memory(GiB)": 142.32, "step": 121620, "train_speed(iter/s)": 0.286645 }, { "acc": 0.72515373, "epoch": 1.3605369893197734, "grad_norm": 6.125, "learning_rate": 2.543932178999262e-06, "loss": 1.10796051, "memory(GiB)": 142.32, "step": 121640, "train_speed(iter/s)": 0.286659 }, { "acc": 0.73733664, "epoch": 1.360760688265732, "grad_norm": 8.625, "learning_rate": 2.542321429073521e-06, "loss": 1.06532164, "memory(GiB)": 142.32, "step": 121660, "train_speed(iter/s)": 0.286674 }, { "acc": 0.73805208, "epoch": 1.3609843872116905, "grad_norm": 6.4375, "learning_rate": 2.5407110153933345e-06, "loss": 1.03996439, "memory(GiB)": 142.32, "step": 121680, "train_speed(iter/s)": 0.28669 }, { "acc": 0.72978735, "epoch": 1.361208086157649, "grad_norm": 6.71875, "learning_rate": 2.539100938179035e-06, "loss": 1.0804512, "memory(GiB)": 142.32, "step": 121700, "train_speed(iter/s)": 0.286704 }, { "acc": 0.7315279, "epoch": 1.3614317851036075, "grad_norm": 6.21875, "learning_rate": 2.5374911976509008e-06, "loss": 1.07892103, "memory(GiB)": 142.32, "step": 121720, "train_speed(iter/s)": 0.286721 }, { "acc": 0.7310555, "epoch": 1.361655484049566, "grad_norm": 6.21875, "learning_rate": 2.5358817940291667e-06, "loss": 1.07463303, "memory(GiB)": 142.32, "step": 121740, "train_speed(iter/s)": 0.286737 }, { "acc": 0.74808002, "epoch": 1.3618791829955246, "grad_norm": 6.1875, "learning_rate": 2.5342727275340258e-06, "loss": 0.99370956, "memory(GiB)": 142.32, "step": 121760, "train_speed(iter/s)": 0.286752 }, { "acc": 0.74601488, "epoch": 1.362102881941483, "grad_norm": 6.78125, "learning_rate": 2.532663998385617e-06, "loss": 1.01301584, "memory(GiB)": 142.32, "step": 121780, "train_speed(iter/s)": 0.286769 }, { "acc": 0.73927636, "epoch": 1.3623265808874416, "grad_norm": 6.46875, "learning_rate": 2.531055606804041e-06, "loss": 1.02722149, "memory(GiB)": 142.32, "step": 121800, "train_speed(iter/s)": 0.286783 }, { "acc": 0.74131718, "epoch": 1.3625502798334002, "grad_norm": 7.3125, "learning_rate": 2.5294475530093477e-06, "loss": 1.02661533, "memory(GiB)": 142.32, "step": 121820, "train_speed(iter/s)": 0.286799 }, { "acc": 0.72581205, "epoch": 1.3627739787793587, "grad_norm": 6.8125, "learning_rate": 2.5278398372215395e-06, "loss": 1.11927023, "memory(GiB)": 142.32, "step": 121840, "train_speed(iter/s)": 0.286815 }, { "acc": 0.74064951, "epoch": 1.3629976777253172, "grad_norm": 7.4375, "learning_rate": 2.526232459660578e-06, "loss": 1.03947201, "memory(GiB)": 142.32, "step": 121860, "train_speed(iter/s)": 0.286831 }, { "acc": 0.7375865, "epoch": 1.3632213766712757, "grad_norm": 6.40625, "learning_rate": 2.5246254205463738e-06, "loss": 1.0510417, "memory(GiB)": 142.32, "step": 121880, "train_speed(iter/s)": 0.286847 }, { "acc": 0.74176693, "epoch": 1.3634450756172343, "grad_norm": 5.84375, "learning_rate": 2.5230187200987945e-06, "loss": 1.02958155, "memory(GiB)": 142.32, "step": 121900, "train_speed(iter/s)": 0.286862 }, { "acc": 0.7384944, "epoch": 1.3636687745631928, "grad_norm": 7.03125, "learning_rate": 2.5214123585376582e-06, "loss": 1.05376911, "memory(GiB)": 142.32, "step": 121920, "train_speed(iter/s)": 0.286878 }, { "acc": 0.73474689, "epoch": 1.3638924735091513, "grad_norm": 6.21875, "learning_rate": 2.519806336082739e-06, "loss": 1.05362692, "memory(GiB)": 142.32, "step": 121940, "train_speed(iter/s)": 0.286895 }, { "acc": 0.72808275, "epoch": 1.3641161724551099, "grad_norm": 5.4375, "learning_rate": 2.5182006529537626e-06, "loss": 1.089149, "memory(GiB)": 142.32, "step": 121960, "train_speed(iter/s)": 0.286913 }, { "acc": 0.72580743, "epoch": 1.3643398714010684, "grad_norm": 6.5, "learning_rate": 2.5165953093704088e-06, "loss": 1.1036725, "memory(GiB)": 142.32, "step": 121980, "train_speed(iter/s)": 0.286929 }, { "acc": 0.73680973, "epoch": 1.364563570347027, "grad_norm": 6.1875, "learning_rate": 2.5149903055523145e-06, "loss": 1.05505867, "memory(GiB)": 142.32, "step": 122000, "train_speed(iter/s)": 0.286947 }, { "epoch": 1.364563570347027, "eval_acc": 0.6963164849197588, "eval_loss": 1.071678876876831, "eval_runtime": 2340.3731, "eval_samples_per_second": 32.167, "eval_steps_per_second": 16.084, "step": 122000 }, { "acc": 0.74393864, "epoch": 1.3647872692929854, "grad_norm": 5.75, "learning_rate": 2.5133856417190635e-06, "loss": 1.0092762, "memory(GiB)": 142.32, "step": 122020, "train_speed(iter/s)": 0.28536 }, { "acc": 0.73869705, "epoch": 1.365010968238944, "grad_norm": 5.96875, "learning_rate": 2.5117813180901997e-06, "loss": 1.04328899, "memory(GiB)": 142.32, "step": 122040, "train_speed(iter/s)": 0.285376 }, { "acc": 0.73516135, "epoch": 1.3652346671849025, "grad_norm": 5.625, "learning_rate": 2.510177334885217e-06, "loss": 1.03892193, "memory(GiB)": 142.32, "step": 122060, "train_speed(iter/s)": 0.285393 }, { "acc": 0.73158541, "epoch": 1.365458366130861, "grad_norm": 6.25, "learning_rate": 2.508573692323561e-06, "loss": 1.07383928, "memory(GiB)": 142.32, "step": 122080, "train_speed(iter/s)": 0.28541 }, { "acc": 0.73854127, "epoch": 1.3656820650768196, "grad_norm": 6.125, "learning_rate": 2.5069703906246362e-06, "loss": 1.04768867, "memory(GiB)": 142.32, "step": 122100, "train_speed(iter/s)": 0.285426 }, { "acc": 0.7416811, "epoch": 1.365905764022778, "grad_norm": 6.0, "learning_rate": 2.5053674300077935e-06, "loss": 1.03334455, "memory(GiB)": 142.32, "step": 122120, "train_speed(iter/s)": 0.285443 }, { "acc": 0.73189402, "epoch": 1.3661294629687366, "grad_norm": 6.34375, "learning_rate": 2.503764810692345e-06, "loss": 1.0709795, "memory(GiB)": 142.32, "step": 122140, "train_speed(iter/s)": 0.28546 }, { "acc": 0.73481894, "epoch": 1.3663531619146951, "grad_norm": 5.75, "learning_rate": 2.5021625328975495e-06, "loss": 1.06560621, "memory(GiB)": 142.32, "step": 122160, "train_speed(iter/s)": 0.285476 }, { "acc": 0.74243593, "epoch": 1.3665768608606537, "grad_norm": 6.15625, "learning_rate": 2.5005605968426204e-06, "loss": 1.02822227, "memory(GiB)": 142.32, "step": 122180, "train_speed(iter/s)": 0.285493 }, { "acc": 0.73543186, "epoch": 1.3668005598066122, "grad_norm": 6.09375, "learning_rate": 2.498959002746729e-06, "loss": 1.05240011, "memory(GiB)": 142.32, "step": 122200, "train_speed(iter/s)": 0.285508 }, { "acc": 0.73161821, "epoch": 1.3670242587525707, "grad_norm": 4.6875, "learning_rate": 2.4973577508289914e-06, "loss": 1.06750107, "memory(GiB)": 142.32, "step": 122220, "train_speed(iter/s)": 0.285525 }, { "acc": 0.73898859, "epoch": 1.3672479576985292, "grad_norm": 6.9375, "learning_rate": 2.495756841308487e-06, "loss": 1.04869308, "memory(GiB)": 142.32, "step": 122240, "train_speed(iter/s)": 0.28554 }, { "acc": 0.7357254, "epoch": 1.3674716566444878, "grad_norm": 5.9375, "learning_rate": 2.4941562744042403e-06, "loss": 1.04163628, "memory(GiB)": 142.32, "step": 122260, "train_speed(iter/s)": 0.285556 }, { "acc": 0.73069363, "epoch": 1.3676953555904463, "grad_norm": 5.0625, "learning_rate": 2.4925560503352303e-06, "loss": 1.06738958, "memory(GiB)": 142.32, "step": 122280, "train_speed(iter/s)": 0.285572 }, { "acc": 0.74494476, "epoch": 1.3679190545364048, "grad_norm": 7.78125, "learning_rate": 2.490956169320394e-06, "loss": 1.01043835, "memory(GiB)": 142.32, "step": 122300, "train_speed(iter/s)": 0.28559 }, { "acc": 0.74161921, "epoch": 1.3681427534823634, "grad_norm": 5.46875, "learning_rate": 2.4893566315786143e-06, "loss": 1.02289677, "memory(GiB)": 142.32, "step": 122320, "train_speed(iter/s)": 0.285603 }, { "acc": 0.749577, "epoch": 1.3683664524283219, "grad_norm": 7.03125, "learning_rate": 2.487757437328735e-06, "loss": 0.98741016, "memory(GiB)": 142.32, "step": 122340, "train_speed(iter/s)": 0.285618 }, { "acc": 0.73909044, "epoch": 1.3685901513742804, "grad_norm": 6.625, "learning_rate": 2.486158586789546e-06, "loss": 1.04241486, "memory(GiB)": 142.32, "step": 122360, "train_speed(iter/s)": 0.285635 }, { "acc": 0.72282963, "epoch": 1.368813850320239, "grad_norm": 5.46875, "learning_rate": 2.484560080179792e-06, "loss": 1.11907578, "memory(GiB)": 142.32, "step": 122380, "train_speed(iter/s)": 0.28565 }, { "acc": 0.74048948, "epoch": 1.3690375492661975, "grad_norm": 7.1875, "learning_rate": 2.4829619177181747e-06, "loss": 1.04254227, "memory(GiB)": 142.32, "step": 122400, "train_speed(iter/s)": 0.285666 }, { "acc": 0.73990402, "epoch": 1.369261248212156, "grad_norm": 5.96875, "learning_rate": 2.4813640996233417e-06, "loss": 1.02893667, "memory(GiB)": 142.32, "step": 122420, "train_speed(iter/s)": 0.285683 }, { "acc": 0.74352102, "epoch": 1.3694849471581145, "grad_norm": 5.96875, "learning_rate": 2.4797666261139016e-06, "loss": 1.01185188, "memory(GiB)": 142.32, "step": 122440, "train_speed(iter/s)": 0.285699 }, { "acc": 0.73416224, "epoch": 1.369708646104073, "grad_norm": 4.5, "learning_rate": 2.4781694974084093e-06, "loss": 1.064324, "memory(GiB)": 142.32, "step": 122460, "train_speed(iter/s)": 0.285715 }, { "acc": 0.73287206, "epoch": 1.3699323450500316, "grad_norm": 6.40625, "learning_rate": 2.476572713725373e-06, "loss": 1.07198868, "memory(GiB)": 142.32, "step": 122480, "train_speed(iter/s)": 0.285731 }, { "acc": 0.7332675, "epoch": 1.3701560439959901, "grad_norm": 4.75, "learning_rate": 2.4749762752832597e-06, "loss": 1.06447334, "memory(GiB)": 142.32, "step": 122500, "train_speed(iter/s)": 0.285747 }, { "acc": 0.74151282, "epoch": 1.3703797429419486, "grad_norm": 7.0625, "learning_rate": 2.473380182300481e-06, "loss": 1.03006353, "memory(GiB)": 142.32, "step": 122520, "train_speed(iter/s)": 0.285763 }, { "acc": 0.73750086, "epoch": 1.3706034418879072, "grad_norm": 6.0625, "learning_rate": 2.471784434995409e-06, "loss": 1.03129282, "memory(GiB)": 142.32, "step": 122540, "train_speed(iter/s)": 0.285778 }, { "acc": 0.73747473, "epoch": 1.3708271408338657, "grad_norm": 5.8125, "learning_rate": 2.470189033586363e-06, "loss": 1.07285595, "memory(GiB)": 142.32, "step": 122560, "train_speed(iter/s)": 0.285795 }, { "acc": 0.7513382, "epoch": 1.3710508397798242, "grad_norm": 6.0625, "learning_rate": 2.468593978291614e-06, "loss": 0.98510952, "memory(GiB)": 142.32, "step": 122580, "train_speed(iter/s)": 0.285811 }, { "acc": 0.73517637, "epoch": 1.3712745387257828, "grad_norm": 7.34375, "learning_rate": 2.466999269329393e-06, "loss": 1.04637575, "memory(GiB)": 142.32, "step": 122600, "train_speed(iter/s)": 0.285828 }, { "acc": 0.73055496, "epoch": 1.3714982376717413, "grad_norm": 5.09375, "learning_rate": 2.4654049069178753e-06, "loss": 1.07686796, "memory(GiB)": 142.32, "step": 122620, "train_speed(iter/s)": 0.285845 }, { "acc": 0.72915697, "epoch": 1.3717219366176998, "grad_norm": 5.15625, "learning_rate": 2.4638108912751958e-06, "loss": 1.08220053, "memory(GiB)": 142.32, "step": 122640, "train_speed(iter/s)": 0.285861 }, { "acc": 0.73895845, "epoch": 1.3719456355636583, "grad_norm": 6.78125, "learning_rate": 2.462217222619437e-06, "loss": 1.03296585, "memory(GiB)": 142.32, "step": 122660, "train_speed(iter/s)": 0.285878 }, { "acc": 0.73103862, "epoch": 1.3721693345096169, "grad_norm": 6.09375, "learning_rate": 2.460623901168633e-06, "loss": 1.07488441, "memory(GiB)": 142.32, "step": 122680, "train_speed(iter/s)": 0.285895 }, { "acc": 0.7290514, "epoch": 1.3723930334555754, "grad_norm": 5.96875, "learning_rate": 2.4590309271407774e-06, "loss": 1.08745632, "memory(GiB)": 142.32, "step": 122700, "train_speed(iter/s)": 0.28591 }, { "acc": 0.74858189, "epoch": 1.372616732401534, "grad_norm": 7.0, "learning_rate": 2.4574383007538085e-06, "loss": 0.99118023, "memory(GiB)": 142.32, "step": 122720, "train_speed(iter/s)": 0.285927 }, { "acc": 0.73739376, "epoch": 1.3728404313474925, "grad_norm": 6.1875, "learning_rate": 2.455846022225623e-06, "loss": 1.05640411, "memory(GiB)": 142.32, "step": 122740, "train_speed(iter/s)": 0.285943 }, { "acc": 0.74314699, "epoch": 1.373064130293451, "grad_norm": 6.4375, "learning_rate": 2.454254091774066e-06, "loss": 1.02272911, "memory(GiB)": 142.32, "step": 122760, "train_speed(iter/s)": 0.285959 }, { "acc": 0.73875275, "epoch": 1.3732878292394095, "grad_norm": 6.21875, "learning_rate": 2.4526625096169344e-06, "loss": 1.04759998, "memory(GiB)": 142.32, "step": 122780, "train_speed(iter/s)": 0.285974 }, { "acc": 0.73585844, "epoch": 1.373511528185368, "grad_norm": 6.96875, "learning_rate": 2.4510712759719837e-06, "loss": 1.04668522, "memory(GiB)": 142.32, "step": 122800, "train_speed(iter/s)": 0.28599 }, { "acc": 0.7372704, "epoch": 1.3737352271313266, "grad_norm": 5.625, "learning_rate": 2.4494803910569127e-06, "loss": 1.04264736, "memory(GiB)": 142.32, "step": 122820, "train_speed(iter/s)": 0.286005 }, { "acc": 0.74563732, "epoch": 1.373958926077285, "grad_norm": 6.5, "learning_rate": 2.4478898550893815e-06, "loss": 0.99776669, "memory(GiB)": 142.32, "step": 122840, "train_speed(iter/s)": 0.286021 }, { "acc": 0.73498521, "epoch": 1.3741826250232436, "grad_norm": 6.25, "learning_rate": 2.446299668286996e-06, "loss": 1.05978298, "memory(GiB)": 142.32, "step": 122860, "train_speed(iter/s)": 0.286039 }, { "acc": 0.73460064, "epoch": 1.3744063239692021, "grad_norm": 6.34375, "learning_rate": 2.444709830867315e-06, "loss": 1.05720863, "memory(GiB)": 142.32, "step": 122880, "train_speed(iter/s)": 0.286055 }, { "acc": 0.73943338, "epoch": 1.3746300229151607, "grad_norm": 5.6875, "learning_rate": 2.443120343047855e-06, "loss": 1.04304619, "memory(GiB)": 142.32, "step": 122900, "train_speed(iter/s)": 0.286071 }, { "acc": 0.72369013, "epoch": 1.3748537218611192, "grad_norm": 5.21875, "learning_rate": 2.441531205046076e-06, "loss": 1.09572248, "memory(GiB)": 142.32, "step": 122920, "train_speed(iter/s)": 0.286086 }, { "acc": 0.7472743, "epoch": 1.3750774208070777, "grad_norm": 5.65625, "learning_rate": 2.439942417079399e-06, "loss": 0.9979126, "memory(GiB)": 142.32, "step": 122940, "train_speed(iter/s)": 0.286101 }, { "acc": 0.74076648, "epoch": 1.3753011197530363, "grad_norm": 6.0625, "learning_rate": 2.4383539793651905e-06, "loss": 1.02906103, "memory(GiB)": 142.32, "step": 122960, "train_speed(iter/s)": 0.286115 }, { "acc": 0.73749599, "epoch": 1.3755248186989948, "grad_norm": 6.09375, "learning_rate": 2.436765892120771e-06, "loss": 1.02965078, "memory(GiB)": 142.32, "step": 122980, "train_speed(iter/s)": 0.28613 }, { "acc": 0.7368454, "epoch": 1.3757485176449533, "grad_norm": 6.21875, "learning_rate": 2.435178155563416e-06, "loss": 1.04185524, "memory(GiB)": 142.32, "step": 123000, "train_speed(iter/s)": 0.286142 }, { "acc": 0.73122149, "epoch": 1.3759722165909118, "grad_norm": 4.59375, "learning_rate": 2.4335907699103467e-06, "loss": 1.06653862, "memory(GiB)": 142.32, "step": 123020, "train_speed(iter/s)": 0.286158 }, { "acc": 0.74308815, "epoch": 1.3761959155368704, "grad_norm": 5.5, "learning_rate": 2.432003735378745e-06, "loss": 1.03521986, "memory(GiB)": 142.32, "step": 123040, "train_speed(iter/s)": 0.286174 }, { "acc": 0.75245128, "epoch": 1.376419614482829, "grad_norm": 6.28125, "learning_rate": 2.4304170521857375e-06, "loss": 0.98027315, "memory(GiB)": 142.32, "step": 123060, "train_speed(iter/s)": 0.286188 }, { "acc": 0.73566394, "epoch": 1.3766433134287874, "grad_norm": 6.09375, "learning_rate": 2.4288307205484026e-06, "loss": 1.05593185, "memory(GiB)": 142.32, "step": 123080, "train_speed(iter/s)": 0.286203 }, { "acc": 0.74998112, "epoch": 1.376867012374746, "grad_norm": 6.96875, "learning_rate": 2.427244740683778e-06, "loss": 1.00007467, "memory(GiB)": 142.32, "step": 123100, "train_speed(iter/s)": 0.286217 }, { "acc": 0.73802466, "epoch": 1.3770907113207045, "grad_norm": 6.6875, "learning_rate": 2.425659112808846e-06, "loss": 1.02366323, "memory(GiB)": 142.32, "step": 123120, "train_speed(iter/s)": 0.286232 }, { "acc": 0.74258356, "epoch": 1.377314410266663, "grad_norm": 7.40625, "learning_rate": 2.4240738371405427e-06, "loss": 1.02577, "memory(GiB)": 142.32, "step": 123140, "train_speed(iter/s)": 0.286247 }, { "acc": 0.73093071, "epoch": 1.3775381092126215, "grad_norm": 6.625, "learning_rate": 2.422488913895755e-06, "loss": 1.06950884, "memory(GiB)": 142.32, "step": 123160, "train_speed(iter/s)": 0.286264 }, { "acc": 0.73933001, "epoch": 1.37776180815858, "grad_norm": 5.6875, "learning_rate": 2.4209043432913274e-06, "loss": 1.03577328, "memory(GiB)": 142.32, "step": 123180, "train_speed(iter/s)": 0.286281 }, { "acc": 0.74505234, "epoch": 1.3779855071045386, "grad_norm": 6.90625, "learning_rate": 2.4193201255440496e-06, "loss": 1.02109432, "memory(GiB)": 142.32, "step": 123200, "train_speed(iter/s)": 0.286297 }, { "acc": 0.74031205, "epoch": 1.3782092060504971, "grad_norm": 5.25, "learning_rate": 2.417736260870663e-06, "loss": 1.00969267, "memory(GiB)": 142.32, "step": 123220, "train_speed(iter/s)": 0.286314 }, { "acc": 0.74068899, "epoch": 1.3784329049964557, "grad_norm": 6.5, "learning_rate": 2.4161527494878663e-06, "loss": 1.02045727, "memory(GiB)": 142.32, "step": 123240, "train_speed(iter/s)": 0.286331 }, { "acc": 0.74623327, "epoch": 1.3786566039424142, "grad_norm": 6.5, "learning_rate": 2.4145695916123037e-06, "loss": 0.9941164, "memory(GiB)": 142.32, "step": 123260, "train_speed(iter/s)": 0.286346 }, { "acc": 0.73038626, "epoch": 1.3788803028883727, "grad_norm": 7.0625, "learning_rate": 2.412986787460577e-06, "loss": 1.08906784, "memory(GiB)": 142.32, "step": 123280, "train_speed(iter/s)": 0.286361 }, { "acc": 0.72943058, "epoch": 1.3791040018343312, "grad_norm": 5.8125, "learning_rate": 2.411404337249235e-06, "loss": 1.08100147, "memory(GiB)": 142.32, "step": 123300, "train_speed(iter/s)": 0.286376 }, { "acc": 0.72958202, "epoch": 1.3793277007802898, "grad_norm": 5.84375, "learning_rate": 2.409822241194777e-06, "loss": 1.08430834, "memory(GiB)": 142.32, "step": 123320, "train_speed(iter/s)": 0.286391 }, { "acc": 0.72785807, "epoch": 1.3795513997262483, "grad_norm": 6.96875, "learning_rate": 2.408240499513661e-06, "loss": 1.07274837, "memory(GiB)": 142.32, "step": 123340, "train_speed(iter/s)": 0.286406 }, { "acc": 0.72741013, "epoch": 1.3797750986722068, "grad_norm": 5.5, "learning_rate": 2.406659112422287e-06, "loss": 1.09187765, "memory(GiB)": 142.32, "step": 123360, "train_speed(iter/s)": 0.286422 }, { "acc": 0.74219255, "epoch": 1.3799987976181654, "grad_norm": 6.90625, "learning_rate": 2.4050780801370162e-06, "loss": 1.01402311, "memory(GiB)": 142.32, "step": 123380, "train_speed(iter/s)": 0.286437 }, { "acc": 0.73799996, "epoch": 1.3802224965641239, "grad_norm": 6.3125, "learning_rate": 2.4034974028741533e-06, "loss": 1.02875385, "memory(GiB)": 142.32, "step": 123400, "train_speed(iter/s)": 0.286453 }, { "acc": 0.73039966, "epoch": 1.3804461955100824, "grad_norm": 6.28125, "learning_rate": 2.401917080849957e-06, "loss": 1.06616096, "memory(GiB)": 142.32, "step": 123420, "train_speed(iter/s)": 0.286467 }, { "acc": 0.73482418, "epoch": 1.380669894456041, "grad_norm": 4.21875, "learning_rate": 2.400337114280641e-06, "loss": 1.05738144, "memory(GiB)": 142.32, "step": 123440, "train_speed(iter/s)": 0.286483 }, { "acc": 0.74005327, "epoch": 1.3808935934019995, "grad_norm": 5.21875, "learning_rate": 2.398757503382363e-06, "loss": 1.03041954, "memory(GiB)": 142.32, "step": 123460, "train_speed(iter/s)": 0.286496 }, { "acc": 0.7404747, "epoch": 1.381117292347958, "grad_norm": 5.40625, "learning_rate": 2.3971782483712414e-06, "loss": 1.04311714, "memory(GiB)": 142.32, "step": 123480, "train_speed(iter/s)": 0.286511 }, { "acc": 0.73893151, "epoch": 1.3813409912939165, "grad_norm": 6.21875, "learning_rate": 2.3955993494633385e-06, "loss": 1.03084097, "memory(GiB)": 142.32, "step": 123500, "train_speed(iter/s)": 0.286527 }, { "acc": 0.73830509, "epoch": 1.381564690239875, "grad_norm": 6.03125, "learning_rate": 2.394020806874667e-06, "loss": 1.04442968, "memory(GiB)": 142.32, "step": 123520, "train_speed(iter/s)": 0.286544 }, { "acc": 0.73568921, "epoch": 1.3817883891858336, "grad_norm": 5.28125, "learning_rate": 2.3924426208212003e-06, "loss": 1.04349318, "memory(GiB)": 142.32, "step": 123540, "train_speed(iter/s)": 0.286557 }, { "acc": 0.74415326, "epoch": 1.382012088131792, "grad_norm": 7.0, "learning_rate": 2.3908647915188514e-06, "loss": 1.02187738, "memory(GiB)": 142.32, "step": 123560, "train_speed(iter/s)": 0.286573 }, { "acc": 0.73581891, "epoch": 1.3822357870777506, "grad_norm": 6.21875, "learning_rate": 2.3892873191834936e-06, "loss": 1.050033, "memory(GiB)": 142.32, "step": 123580, "train_speed(iter/s)": 0.286591 }, { "acc": 0.73388762, "epoch": 1.3824594860237092, "grad_norm": 6.53125, "learning_rate": 2.387710204030947e-06, "loss": 1.06210451, "memory(GiB)": 142.32, "step": 123600, "train_speed(iter/s)": 0.286607 }, { "acc": 0.73889246, "epoch": 1.3826831849696677, "grad_norm": 6.0, "learning_rate": 2.38613344627698e-06, "loss": 1.04091787, "memory(GiB)": 142.32, "step": 123620, "train_speed(iter/s)": 0.286621 }, { "acc": 0.7457201, "epoch": 1.3829068839156262, "grad_norm": 6.21875, "learning_rate": 2.384557046137321e-06, "loss": 0.9973506, "memory(GiB)": 142.32, "step": 123640, "train_speed(iter/s)": 0.286635 }, { "acc": 0.74914951, "epoch": 1.3831305828615847, "grad_norm": 6.0, "learning_rate": 2.382981003827639e-06, "loss": 0.99912395, "memory(GiB)": 142.32, "step": 123660, "train_speed(iter/s)": 0.28665 }, { "acc": 0.73478565, "epoch": 1.3833542818075433, "grad_norm": 5.53125, "learning_rate": 2.3814053195635633e-06, "loss": 1.07540989, "memory(GiB)": 142.32, "step": 123680, "train_speed(iter/s)": 0.286667 }, { "acc": 0.7296648, "epoch": 1.3835779807535018, "grad_norm": 4.5, "learning_rate": 2.3798299935606684e-06, "loss": 1.07689629, "memory(GiB)": 142.32, "step": 123700, "train_speed(iter/s)": 0.286683 }, { "acc": 0.73354635, "epoch": 1.3838016796994603, "grad_norm": 5.90625, "learning_rate": 2.3782550260344796e-06, "loss": 1.06403761, "memory(GiB)": 142.32, "step": 123720, "train_speed(iter/s)": 0.286699 }, { "acc": 0.73449926, "epoch": 1.3840253786454189, "grad_norm": 6.5625, "learning_rate": 2.3766804172004784e-06, "loss": 1.06570415, "memory(GiB)": 142.32, "step": 123740, "train_speed(iter/s)": 0.286716 }, { "acc": 0.74651499, "epoch": 1.3842490775913774, "grad_norm": 6.34375, "learning_rate": 2.37510616727409e-06, "loss": 0.9973032, "memory(GiB)": 142.32, "step": 123760, "train_speed(iter/s)": 0.286733 }, { "acc": 0.7365891, "epoch": 1.3844727765373361, "grad_norm": 7.0, "learning_rate": 2.373532276470698e-06, "loss": 1.0363102, "memory(GiB)": 142.32, "step": 123780, "train_speed(iter/s)": 0.286747 }, { "acc": 0.73399677, "epoch": 1.3846964754832947, "grad_norm": 6.0625, "learning_rate": 2.3719587450056316e-06, "loss": 1.04962788, "memory(GiB)": 142.32, "step": 123800, "train_speed(iter/s)": 0.286763 }, { "acc": 0.73929138, "epoch": 1.3849201744292532, "grad_norm": 6.0, "learning_rate": 2.3703855730941704e-06, "loss": 1.0298665, "memory(GiB)": 142.32, "step": 123820, "train_speed(iter/s)": 0.28678 }, { "acc": 0.74010582, "epoch": 1.3851438733752117, "grad_norm": 6.03125, "learning_rate": 2.3688127609515502e-06, "loss": 1.03646946, "memory(GiB)": 142.32, "step": 123840, "train_speed(iter/s)": 0.286796 }, { "acc": 0.74185824, "epoch": 1.3853675723211702, "grad_norm": 6.96875, "learning_rate": 2.3672403087929512e-06, "loss": 1.02717438, "memory(GiB)": 142.32, "step": 123860, "train_speed(iter/s)": 0.286812 }, { "acc": 0.73451977, "epoch": 1.3855912712671288, "grad_norm": 7.78125, "learning_rate": 2.3656682168335105e-06, "loss": 1.06257687, "memory(GiB)": 142.32, "step": 123880, "train_speed(iter/s)": 0.286827 }, { "acc": 0.7508955, "epoch": 1.3858149702130873, "grad_norm": 6.0, "learning_rate": 2.3640964852883108e-06, "loss": 0.98496695, "memory(GiB)": 142.32, "step": 123900, "train_speed(iter/s)": 0.286844 }, { "acc": 0.73017535, "epoch": 1.3860386691590458, "grad_norm": 6.4375, "learning_rate": 2.362525114372386e-06, "loss": 1.07695551, "memory(GiB)": 142.32, "step": 123920, "train_speed(iter/s)": 0.286859 }, { "acc": 0.74065733, "epoch": 1.3862623681050044, "grad_norm": 5.78125, "learning_rate": 2.3609541043007254e-06, "loss": 1.02934818, "memory(GiB)": 142.32, "step": 123940, "train_speed(iter/s)": 0.286874 }, { "acc": 0.73028126, "epoch": 1.386486067050963, "grad_norm": 4.5, "learning_rate": 2.3593834552882627e-06, "loss": 1.07911358, "memory(GiB)": 142.32, "step": 123960, "train_speed(iter/s)": 0.28689 }, { "acc": 0.74327087, "epoch": 1.3867097659969214, "grad_norm": 6.34375, "learning_rate": 2.3578131675498876e-06, "loss": 1.00109215, "memory(GiB)": 142.32, "step": 123980, "train_speed(iter/s)": 0.286906 }, { "acc": 0.73107824, "epoch": 1.38693346494288, "grad_norm": 6.75, "learning_rate": 2.356243241300437e-06, "loss": 1.0692297, "memory(GiB)": 142.32, "step": 124000, "train_speed(iter/s)": 0.286922 }, { "epoch": 1.38693346494288, "eval_acc": 0.6963302879831713, "eval_loss": 1.0716172456741333, "eval_runtime": 2340.9893, "eval_samples_per_second": 32.159, "eval_steps_per_second": 16.08, "step": 124000 }, { "acc": 0.73765078, "epoch": 1.3871571638888385, "grad_norm": 6.34375, "learning_rate": 2.3546736767546974e-06, "loss": 1.04050217, "memory(GiB)": 142.32, "step": 124020, "train_speed(iter/s)": 0.285359 }, { "acc": 0.74287887, "epoch": 1.387380862834797, "grad_norm": 7.03125, "learning_rate": 2.353104474127411e-06, "loss": 1.02497988, "memory(GiB)": 142.32, "step": 124040, "train_speed(iter/s)": 0.285372 }, { "acc": 0.74095993, "epoch": 1.3876045617807555, "grad_norm": 6.0, "learning_rate": 2.3515356336332633e-06, "loss": 1.03063135, "memory(GiB)": 142.32, "step": 124060, "train_speed(iter/s)": 0.285386 }, { "acc": 0.75065842, "epoch": 1.387828260726714, "grad_norm": 5.8125, "learning_rate": 2.3499671554868986e-06, "loss": 0.99735165, "memory(GiB)": 142.32, "step": 124080, "train_speed(iter/s)": 0.285401 }, { "acc": 0.74728823, "epoch": 1.3880519596726726, "grad_norm": 5.5, "learning_rate": 2.348399039902904e-06, "loss": 0.99668083, "memory(GiB)": 142.32, "step": 124100, "train_speed(iter/s)": 0.285416 }, { "acc": 0.74256601, "epoch": 1.3882756586186311, "grad_norm": 5.625, "learning_rate": 2.346831287095819e-06, "loss": 1.02441196, "memory(GiB)": 142.32, "step": 124120, "train_speed(iter/s)": 0.285433 }, { "acc": 0.74020829, "epoch": 1.3884993575645896, "grad_norm": 6.25, "learning_rate": 2.345263897280139e-06, "loss": 1.03273373, "memory(GiB)": 142.32, "step": 124140, "train_speed(iter/s)": 0.28545 }, { "acc": 0.7433845, "epoch": 1.3887230565105482, "grad_norm": 5.59375, "learning_rate": 2.3436968706703008e-06, "loss": 1.01483383, "memory(GiB)": 142.32, "step": 124160, "train_speed(iter/s)": 0.285466 }, { "acc": 0.73366966, "epoch": 1.3889467554565067, "grad_norm": 5.5, "learning_rate": 2.342130207480699e-06, "loss": 1.07191076, "memory(GiB)": 142.32, "step": 124180, "train_speed(iter/s)": 0.28548 }, { "acc": 0.74173336, "epoch": 1.3891704544024652, "grad_norm": 5.65625, "learning_rate": 2.3405639079256754e-06, "loss": 1.02073212, "memory(GiB)": 142.32, "step": 124200, "train_speed(iter/s)": 0.285495 }, { "acc": 0.73745193, "epoch": 1.3893941533484238, "grad_norm": 7.46875, "learning_rate": 2.338997972219519e-06, "loss": 1.05102921, "memory(GiB)": 142.32, "step": 124220, "train_speed(iter/s)": 0.285509 }, { "acc": 0.73597398, "epoch": 1.3896178522943823, "grad_norm": 5.9375, "learning_rate": 2.3374324005764763e-06, "loss": 1.03390121, "memory(GiB)": 142.32, "step": 124240, "train_speed(iter/s)": 0.285525 }, { "acc": 0.74331675, "epoch": 1.3898415512403408, "grad_norm": 6.5625, "learning_rate": 2.335867193210737e-06, "loss": 1.01481743, "memory(GiB)": 142.32, "step": 124260, "train_speed(iter/s)": 0.285542 }, { "acc": 0.74238734, "epoch": 1.3900652501862993, "grad_norm": 5.59375, "learning_rate": 2.334302350336446e-06, "loss": 1.01782532, "memory(GiB)": 142.32, "step": 124280, "train_speed(iter/s)": 0.28556 }, { "acc": 0.73800507, "epoch": 1.3902889491322579, "grad_norm": 5.78125, "learning_rate": 2.332737872167695e-06, "loss": 1.03652439, "memory(GiB)": 142.32, "step": 124300, "train_speed(iter/s)": 0.285577 }, { "acc": 0.73731079, "epoch": 1.3905126480782164, "grad_norm": 7.375, "learning_rate": 2.3311737589185273e-06, "loss": 1.0477869, "memory(GiB)": 142.32, "step": 124320, "train_speed(iter/s)": 0.285591 }, { "acc": 0.7305419, "epoch": 1.390736347024175, "grad_norm": 6.4375, "learning_rate": 2.329610010802934e-06, "loss": 1.06717272, "memory(GiB)": 142.32, "step": 124340, "train_speed(iter/s)": 0.285607 }, { "acc": 0.73727293, "epoch": 1.3909600459701335, "grad_norm": 5.96875, "learning_rate": 2.328046628034861e-06, "loss": 1.05164871, "memory(GiB)": 142.32, "step": 124360, "train_speed(iter/s)": 0.285622 }, { "acc": 0.74446263, "epoch": 1.391183744916092, "grad_norm": 6.625, "learning_rate": 2.3264836108282014e-06, "loss": 1.02048702, "memory(GiB)": 142.32, "step": 124380, "train_speed(iter/s)": 0.285637 }, { "acc": 0.7383069, "epoch": 1.3914074438620505, "grad_norm": 6.625, "learning_rate": 2.3249209593967946e-06, "loss": 1.04152546, "memory(GiB)": 142.32, "step": 124400, "train_speed(iter/s)": 0.285651 }, { "acc": 0.73675318, "epoch": 1.391631142808009, "grad_norm": 7.125, "learning_rate": 2.3233586739544384e-06, "loss": 1.0456358, "memory(GiB)": 142.32, "step": 124420, "train_speed(iter/s)": 0.285666 }, { "acc": 0.73659878, "epoch": 1.3918548417539676, "grad_norm": 6.65625, "learning_rate": 2.321796754714872e-06, "loss": 1.05743332, "memory(GiB)": 142.32, "step": 124440, "train_speed(iter/s)": 0.28568 }, { "acc": 0.72585301, "epoch": 1.392078540699926, "grad_norm": 6.125, "learning_rate": 2.3202352018917914e-06, "loss": 1.09243908, "memory(GiB)": 142.32, "step": 124460, "train_speed(iter/s)": 0.285698 }, { "acc": 0.73645153, "epoch": 1.3923022396458846, "grad_norm": 5.15625, "learning_rate": 2.3186740156988375e-06, "loss": 1.05175037, "memory(GiB)": 142.32, "step": 124480, "train_speed(iter/s)": 0.285714 }, { "acc": 0.74038525, "epoch": 1.3925259385918431, "grad_norm": 5.46875, "learning_rate": 2.3171131963496017e-06, "loss": 1.04292202, "memory(GiB)": 142.32, "step": 124500, "train_speed(iter/s)": 0.285729 }, { "acc": 0.75528278, "epoch": 1.3927496375378017, "grad_norm": 6.34375, "learning_rate": 2.3155527440576296e-06, "loss": 0.95097055, "memory(GiB)": 142.32, "step": 124520, "train_speed(iter/s)": 0.285747 }, { "acc": 0.73998046, "epoch": 1.3929733364837602, "grad_norm": 5.59375, "learning_rate": 2.3139926590364105e-06, "loss": 1.02848644, "memory(GiB)": 142.32, "step": 124540, "train_speed(iter/s)": 0.285763 }, { "acc": 0.73275003, "epoch": 1.3931970354297187, "grad_norm": 5.5, "learning_rate": 2.3124329414993886e-06, "loss": 1.06274433, "memory(GiB)": 142.32, "step": 124560, "train_speed(iter/s)": 0.285778 }, { "acc": 0.73404179, "epoch": 1.3934207343756773, "grad_norm": 6.15625, "learning_rate": 2.310873591659955e-06, "loss": 1.08557901, "memory(GiB)": 142.32, "step": 124580, "train_speed(iter/s)": 0.285793 }, { "acc": 0.72962704, "epoch": 1.3936444333216358, "grad_norm": 6.96875, "learning_rate": 2.3093146097314485e-06, "loss": 1.07344437, "memory(GiB)": 142.32, "step": 124600, "train_speed(iter/s)": 0.285807 }, { "acc": 0.73882408, "epoch": 1.3938681322675943, "grad_norm": 6.6875, "learning_rate": 2.307755995927164e-06, "loss": 1.04455414, "memory(GiB)": 142.32, "step": 124620, "train_speed(iter/s)": 0.28582 }, { "acc": 0.7331212, "epoch": 1.3940918312135528, "grad_norm": 7.0, "learning_rate": 2.3061977504603384e-06, "loss": 1.06373253, "memory(GiB)": 142.32, "step": 124640, "train_speed(iter/s)": 0.285835 }, { "acc": 0.73540378, "epoch": 1.3943155301595114, "grad_norm": 5.75, "learning_rate": 2.304639873544166e-06, "loss": 1.04910774, "memory(GiB)": 142.32, "step": 124660, "train_speed(iter/s)": 0.285851 }, { "acc": 0.73624206, "epoch": 1.39453922910547, "grad_norm": 6.09375, "learning_rate": 2.303082365391784e-06, "loss": 1.04197931, "memory(GiB)": 142.32, "step": 124680, "train_speed(iter/s)": 0.285867 }, { "acc": 0.74216309, "epoch": 1.3947629280514284, "grad_norm": 6.78125, "learning_rate": 2.3015252262162807e-06, "loss": 1.02670822, "memory(GiB)": 142.32, "step": 124700, "train_speed(iter/s)": 0.285884 }, { "acc": 0.74544516, "epoch": 1.394986626997387, "grad_norm": 6.0, "learning_rate": 2.2999684562306982e-06, "loss": 1.00586166, "memory(GiB)": 142.32, "step": 124720, "train_speed(iter/s)": 0.285901 }, { "acc": 0.76223359, "epoch": 1.3952103259433455, "grad_norm": 5.5625, "learning_rate": 2.298412055648022e-06, "loss": 0.92117691, "memory(GiB)": 142.32, "step": 124740, "train_speed(iter/s)": 0.285916 }, { "acc": 0.74362254, "epoch": 1.395434024889304, "grad_norm": 7.15625, "learning_rate": 2.296856024681192e-06, "loss": 1.00523376, "memory(GiB)": 142.32, "step": 124760, "train_speed(iter/s)": 0.28593 }, { "acc": 0.7532361, "epoch": 1.3956577238352625, "grad_norm": 6.75, "learning_rate": 2.2953003635430955e-06, "loss": 0.96737461, "memory(GiB)": 142.32, "step": 124780, "train_speed(iter/s)": 0.285943 }, { "acc": 0.7399724, "epoch": 1.395881422781221, "grad_norm": 4.71875, "learning_rate": 2.293745072446566e-06, "loss": 1.02039738, "memory(GiB)": 142.32, "step": 124800, "train_speed(iter/s)": 0.285958 }, { "acc": 0.74175091, "epoch": 1.3961051217271796, "grad_norm": 6.75, "learning_rate": 2.292190151604394e-06, "loss": 1.02525558, "memory(GiB)": 142.32, "step": 124820, "train_speed(iter/s)": 0.285973 }, { "acc": 0.74083958, "epoch": 1.3963288206731381, "grad_norm": 6.5, "learning_rate": 2.290635601229311e-06, "loss": 1.01705112, "memory(GiB)": 142.32, "step": 124840, "train_speed(iter/s)": 0.285989 }, { "acc": 0.73626151, "epoch": 1.3965525196190967, "grad_norm": 6.875, "learning_rate": 2.2890814215340052e-06, "loss": 1.04905682, "memory(GiB)": 142.32, "step": 124860, "train_speed(iter/s)": 0.286004 }, { "acc": 0.73462524, "epoch": 1.3967762185650552, "grad_norm": 6.3125, "learning_rate": 2.2875276127311088e-06, "loss": 1.05626907, "memory(GiB)": 142.32, "step": 124880, "train_speed(iter/s)": 0.286021 }, { "acc": 0.73857183, "epoch": 1.3969999175110137, "grad_norm": 6.1875, "learning_rate": 2.285974175033203e-06, "loss": 1.04513664, "memory(GiB)": 142.32, "step": 124900, "train_speed(iter/s)": 0.286037 }, { "acc": 0.74249258, "epoch": 1.3972236164569722, "grad_norm": 5.34375, "learning_rate": 2.2844211086528244e-06, "loss": 1.01394711, "memory(GiB)": 142.32, "step": 124920, "train_speed(iter/s)": 0.286053 }, { "acc": 0.73217206, "epoch": 1.3974473154029308, "grad_norm": 6.34375, "learning_rate": 2.2828684138024513e-06, "loss": 1.06544209, "memory(GiB)": 142.32, "step": 124940, "train_speed(iter/s)": 0.286067 }, { "acc": 0.73819661, "epoch": 1.3976710143488893, "grad_norm": 4.78125, "learning_rate": 2.2813160906945177e-06, "loss": 1.03492489, "memory(GiB)": 142.32, "step": 124960, "train_speed(iter/s)": 0.286084 }, { "acc": 0.73157806, "epoch": 1.3978947132948478, "grad_norm": 5.5625, "learning_rate": 2.2797641395414017e-06, "loss": 1.06768341, "memory(GiB)": 142.32, "step": 124980, "train_speed(iter/s)": 0.286102 }, { "acc": 0.72821274, "epoch": 1.3981184122408064, "grad_norm": 6.21875, "learning_rate": 2.2782125605554307e-06, "loss": 1.09714785, "memory(GiB)": 142.32, "step": 125000, "train_speed(iter/s)": 0.286117 }, { "acc": 0.73873281, "epoch": 1.3983421111867649, "grad_norm": 6.3125, "learning_rate": 2.276661353948886e-06, "loss": 1.04559898, "memory(GiB)": 142.32, "step": 125020, "train_speed(iter/s)": 0.286132 }, { "acc": 0.73247662, "epoch": 1.3985658101327234, "grad_norm": 6.78125, "learning_rate": 2.275110519933993e-06, "loss": 1.05695715, "memory(GiB)": 142.32, "step": 125040, "train_speed(iter/s)": 0.286146 }, { "acc": 0.71863251, "epoch": 1.398789509078682, "grad_norm": 7.0, "learning_rate": 2.2735600587229294e-06, "loss": 1.14249916, "memory(GiB)": 142.32, "step": 125060, "train_speed(iter/s)": 0.286157 }, { "acc": 0.74147687, "epoch": 1.3990132080246405, "grad_norm": 7.03125, "learning_rate": 2.2720099705278197e-06, "loss": 1.02411785, "memory(GiB)": 142.32, "step": 125080, "train_speed(iter/s)": 0.286172 }, { "acc": 0.73907776, "epoch": 1.399236906970599, "grad_norm": 6.0625, "learning_rate": 2.2704602555607363e-06, "loss": 1.04053764, "memory(GiB)": 142.32, "step": 125100, "train_speed(iter/s)": 0.286189 }, { "acc": 0.7345315, "epoch": 1.3994606059165575, "grad_norm": 6.40625, "learning_rate": 2.2689109140337064e-06, "loss": 1.07716703, "memory(GiB)": 142.32, "step": 125120, "train_speed(iter/s)": 0.286204 }, { "acc": 0.73700047, "epoch": 1.399684304862516, "grad_norm": 5.84375, "learning_rate": 2.267361946158697e-06, "loss": 1.03698072, "memory(GiB)": 142.32, "step": 125140, "train_speed(iter/s)": 0.286221 }, { "acc": 0.7444005, "epoch": 1.3999080038084746, "grad_norm": 5.0, "learning_rate": 2.2658133521476337e-06, "loss": 1.02167435, "memory(GiB)": 142.32, "step": 125160, "train_speed(iter/s)": 0.286236 }, { "acc": 0.73996077, "epoch": 1.400131702754433, "grad_norm": 6.5, "learning_rate": 2.264265132212385e-06, "loss": 1.03833942, "memory(GiB)": 142.32, "step": 125180, "train_speed(iter/s)": 0.286253 }, { "acc": 0.73251925, "epoch": 1.4003554017003916, "grad_norm": 6.5625, "learning_rate": 2.2627172865647666e-06, "loss": 1.05527973, "memory(GiB)": 142.32, "step": 125200, "train_speed(iter/s)": 0.286269 }, { "acc": 0.72293959, "epoch": 1.4005791006463502, "grad_norm": 5.6875, "learning_rate": 2.26116981541655e-06, "loss": 1.12181129, "memory(GiB)": 142.32, "step": 125220, "train_speed(iter/s)": 0.286283 }, { "acc": 0.74464703, "epoch": 1.4008027995923087, "grad_norm": 6.78125, "learning_rate": 2.259622718979448e-06, "loss": 1.02415218, "memory(GiB)": 142.32, "step": 125240, "train_speed(iter/s)": 0.286297 }, { "acc": 0.7405159, "epoch": 1.4010264985382672, "grad_norm": 4.3125, "learning_rate": 2.2580759974651283e-06, "loss": 1.02423325, "memory(GiB)": 142.32, "step": 125260, "train_speed(iter/s)": 0.286311 }, { "acc": 0.73835645, "epoch": 1.4012501974842257, "grad_norm": 5.40625, "learning_rate": 2.2565296510852035e-06, "loss": 1.03252821, "memory(GiB)": 142.32, "step": 125280, "train_speed(iter/s)": 0.286326 }, { "acc": 0.73472214, "epoch": 1.4014738964301843, "grad_norm": 6.78125, "learning_rate": 2.254983680051234e-06, "loss": 1.04627304, "memory(GiB)": 142.32, "step": 125300, "train_speed(iter/s)": 0.286341 }, { "acc": 0.75181308, "epoch": 1.4016975953761428, "grad_norm": 6.0625, "learning_rate": 2.2534380845747343e-06, "loss": 0.98088799, "memory(GiB)": 142.32, "step": 125320, "train_speed(iter/s)": 0.286357 }, { "acc": 0.74268389, "epoch": 1.4019212943221013, "grad_norm": 5.46875, "learning_rate": 2.25189286486716e-06, "loss": 1.01941013, "memory(GiB)": 142.32, "step": 125340, "train_speed(iter/s)": 0.286369 }, { "acc": 0.73603854, "epoch": 1.4021449932680599, "grad_norm": 7.125, "learning_rate": 2.250348021139924e-06, "loss": 1.03738728, "memory(GiB)": 142.32, "step": 125360, "train_speed(iter/s)": 0.286386 }, { "acc": 0.73776875, "epoch": 1.4023686922140184, "grad_norm": 5.40625, "learning_rate": 2.248803553604379e-06, "loss": 1.03955517, "memory(GiB)": 142.32, "step": 125380, "train_speed(iter/s)": 0.286402 }, { "acc": 0.74018545, "epoch": 1.402592391159977, "grad_norm": 7.90625, "learning_rate": 2.24725946247183e-06, "loss": 1.0210186, "memory(GiB)": 142.32, "step": 125400, "train_speed(iter/s)": 0.286416 }, { "acc": 0.75021467, "epoch": 1.4028160901059354, "grad_norm": 6.28125, "learning_rate": 2.2457157479535346e-06, "loss": 0.9886013, "memory(GiB)": 142.32, "step": 125420, "train_speed(iter/s)": 0.286432 }, { "acc": 0.73860493, "epoch": 1.403039789051894, "grad_norm": 6.5625, "learning_rate": 2.2441724102606906e-06, "loss": 1.04582376, "memory(GiB)": 142.32, "step": 125440, "train_speed(iter/s)": 0.286449 }, { "acc": 0.73315897, "epoch": 1.4032634879978525, "grad_norm": 6.375, "learning_rate": 2.242629449604453e-06, "loss": 1.05855675, "memory(GiB)": 142.32, "step": 125460, "train_speed(iter/s)": 0.286465 }, { "acc": 0.75169249, "epoch": 1.403487186943811, "grad_norm": 6.6875, "learning_rate": 2.241086866195918e-06, "loss": 0.96946468, "memory(GiB)": 142.32, "step": 125480, "train_speed(iter/s)": 0.286481 }, { "acc": 0.7466145, "epoch": 1.4037108858897696, "grad_norm": 6.4375, "learning_rate": 2.2395446602461335e-06, "loss": 1.00093708, "memory(GiB)": 142.32, "step": 125500, "train_speed(iter/s)": 0.286497 }, { "acc": 0.73872156, "epoch": 1.403934584835728, "grad_norm": 6.46875, "learning_rate": 2.2380028319660955e-06, "loss": 1.03914738, "memory(GiB)": 142.32, "step": 125520, "train_speed(iter/s)": 0.286511 }, { "acc": 0.74001431, "epoch": 1.4041582837816866, "grad_norm": 5.5625, "learning_rate": 2.236461381566747e-06, "loss": 1.04870987, "memory(GiB)": 142.32, "step": 125540, "train_speed(iter/s)": 0.286527 }, { "acc": 0.74398808, "epoch": 1.4043819827276451, "grad_norm": 4.78125, "learning_rate": 2.2349203092589827e-06, "loss": 1.00679054, "memory(GiB)": 142.32, "step": 125560, "train_speed(iter/s)": 0.286542 }, { "acc": 0.73953676, "epoch": 1.4046056816736037, "grad_norm": 6.5, "learning_rate": 2.23337961525364e-06, "loss": 1.03749466, "memory(GiB)": 142.32, "step": 125580, "train_speed(iter/s)": 0.286557 }, { "acc": 0.75110469, "epoch": 1.4048293806195622, "grad_norm": 6.6875, "learning_rate": 2.231839299761513e-06, "loss": 0.96248684, "memory(GiB)": 142.32, "step": 125600, "train_speed(iter/s)": 0.286572 }, { "acc": 0.73042254, "epoch": 1.4050530795655207, "grad_norm": 6.46875, "learning_rate": 2.2302993629933355e-06, "loss": 1.06109715, "memory(GiB)": 142.32, "step": 125620, "train_speed(iter/s)": 0.286588 }, { "acc": 0.73674788, "epoch": 1.4052767785114793, "grad_norm": 7.21875, "learning_rate": 2.2287598051597914e-06, "loss": 1.03355932, "memory(GiB)": 142.32, "step": 125640, "train_speed(iter/s)": 0.286604 }, { "acc": 0.73927474, "epoch": 1.4055004774574378, "grad_norm": 5.84375, "learning_rate": 2.227220626471518e-06, "loss": 1.0159481, "memory(GiB)": 142.32, "step": 125660, "train_speed(iter/s)": 0.28662 }, { "acc": 0.73471384, "epoch": 1.4057241764033963, "grad_norm": 6.375, "learning_rate": 2.225681827139093e-06, "loss": 1.04579449, "memory(GiB)": 142.32, "step": 125680, "train_speed(iter/s)": 0.286636 }, { "acc": 0.73760891, "epoch": 1.4059478753493548, "grad_norm": 5.78125, "learning_rate": 2.22414340737305e-06, "loss": 1.06299171, "memory(GiB)": 142.32, "step": 125700, "train_speed(iter/s)": 0.286652 }, { "acc": 0.73352995, "epoch": 1.4061715742953134, "grad_norm": 4.96875, "learning_rate": 2.222605367383865e-06, "loss": 1.06464186, "memory(GiB)": 142.32, "step": 125720, "train_speed(iter/s)": 0.286669 }, { "acc": 0.74086332, "epoch": 1.406395273241272, "grad_norm": 5.9375, "learning_rate": 2.2210677073819624e-06, "loss": 1.02870073, "memory(GiB)": 142.32, "step": 125740, "train_speed(iter/s)": 0.286685 }, { "acc": 0.74141965, "epoch": 1.4066189721872304, "grad_norm": 5.75, "learning_rate": 2.2195304275777193e-06, "loss": 1.03990059, "memory(GiB)": 142.32, "step": 125760, "train_speed(iter/s)": 0.286702 }, { "acc": 0.73062391, "epoch": 1.406842671133189, "grad_norm": 5.5, "learning_rate": 2.2179935281814535e-06, "loss": 1.05794773, "memory(GiB)": 142.32, "step": 125780, "train_speed(iter/s)": 0.286718 }, { "acc": 0.75191755, "epoch": 1.4070663700791475, "grad_norm": 5.90625, "learning_rate": 2.2164570094034393e-06, "loss": 0.98817062, "memory(GiB)": 142.32, "step": 125800, "train_speed(iter/s)": 0.286734 }, { "acc": 0.74952631, "epoch": 1.407290069025106, "grad_norm": 7.34375, "learning_rate": 2.2149208714538917e-06, "loss": 0.98864689, "memory(GiB)": 142.32, "step": 125820, "train_speed(iter/s)": 0.286751 }, { "acc": 0.74294925, "epoch": 1.4075137679710645, "grad_norm": 5.5625, "learning_rate": 2.213385114542976e-06, "loss": 1.02401123, "memory(GiB)": 142.32, "step": 125840, "train_speed(iter/s)": 0.286766 }, { "acc": 0.73717299, "epoch": 1.407737466917023, "grad_norm": 6.625, "learning_rate": 2.2118497388808075e-06, "loss": 1.06157742, "memory(GiB)": 142.32, "step": 125860, "train_speed(iter/s)": 0.286782 }, { "acc": 0.73873682, "epoch": 1.4079611658629816, "grad_norm": 5.0625, "learning_rate": 2.2103147446774446e-06, "loss": 1.04072971, "memory(GiB)": 142.32, "step": 125880, "train_speed(iter/s)": 0.286798 }, { "acc": 0.73625221, "epoch": 1.4081848648089401, "grad_norm": 6.5625, "learning_rate": 2.208780132142901e-06, "loss": 1.05281487, "memory(GiB)": 142.32, "step": 125900, "train_speed(iter/s)": 0.286813 }, { "acc": 0.73793097, "epoch": 1.4084085637548986, "grad_norm": 6.3125, "learning_rate": 2.2072459014871305e-06, "loss": 1.03499126, "memory(GiB)": 142.32, "step": 125920, "train_speed(iter/s)": 0.286828 }, { "acc": 0.73362522, "epoch": 1.4086322627008572, "grad_norm": 5.4375, "learning_rate": 2.2057120529200366e-06, "loss": 1.05615978, "memory(GiB)": 142.32, "step": 125940, "train_speed(iter/s)": 0.286843 }, { "acc": 0.74922619, "epoch": 1.4088559616468157, "grad_norm": 5.6875, "learning_rate": 2.2041785866514755e-06, "loss": 0.97918167, "memory(GiB)": 142.32, "step": 125960, "train_speed(iter/s)": 0.286858 }, { "acc": 0.74273005, "epoch": 1.4090796605927742, "grad_norm": 5.6875, "learning_rate": 2.2026455028912434e-06, "loss": 1.01429424, "memory(GiB)": 142.32, "step": 125980, "train_speed(iter/s)": 0.286876 }, { "acc": 0.7288372, "epoch": 1.4093033595387328, "grad_norm": 5.03125, "learning_rate": 2.201112801849092e-06, "loss": 1.08127317, "memory(GiB)": 142.32, "step": 126000, "train_speed(iter/s)": 0.28689 }, { "epoch": 1.4093033595387328, "eval_acc": 0.696301104363385, "eval_loss": 1.0716557502746582, "eval_runtime": 2341.5529, "eval_samples_per_second": 32.151, "eval_steps_per_second": 16.076, "step": 126000 }, { "acc": 0.73126345, "epoch": 1.4095270584846913, "grad_norm": 5.9375, "learning_rate": 2.199580483734714e-06, "loss": 1.06222401, "memory(GiB)": 142.32, "step": 126020, "train_speed(iter/s)": 0.285353 }, { "acc": 0.72344685, "epoch": 1.4097507574306498, "grad_norm": 5.96875, "learning_rate": 2.1980485487577513e-06, "loss": 1.10643883, "memory(GiB)": 142.32, "step": 126040, "train_speed(iter/s)": 0.285368 }, { "acc": 0.73089724, "epoch": 1.4099744563766083, "grad_norm": 5.875, "learning_rate": 2.1965169971277984e-06, "loss": 1.07554054, "memory(GiB)": 142.32, "step": 126060, "train_speed(iter/s)": 0.285383 }, { "acc": 0.73101435, "epoch": 1.4101981553225669, "grad_norm": 5.65625, "learning_rate": 2.194985829054389e-06, "loss": 1.06837578, "memory(GiB)": 142.32, "step": 126080, "train_speed(iter/s)": 0.285397 }, { "acc": 0.72280917, "epoch": 1.4104218542685254, "grad_norm": 6.28125, "learning_rate": 2.1934550447470134e-06, "loss": 1.1011157, "memory(GiB)": 142.32, "step": 126100, "train_speed(iter/s)": 0.285414 }, { "acc": 0.73066225, "epoch": 1.410645553214484, "grad_norm": 5.46875, "learning_rate": 2.1919246444151022e-06, "loss": 1.07418385, "memory(GiB)": 142.32, "step": 126120, "train_speed(iter/s)": 0.285429 }, { "acc": 0.73394947, "epoch": 1.4108692521604425, "grad_norm": 7.71875, "learning_rate": 2.1903946282680345e-06, "loss": 1.05059986, "memory(GiB)": 142.32, "step": 126140, "train_speed(iter/s)": 0.285446 }, { "acc": 0.73983908, "epoch": 1.411092951106401, "grad_norm": 6.1875, "learning_rate": 2.188864996515142e-06, "loss": 1.04009495, "memory(GiB)": 142.32, "step": 126160, "train_speed(iter/s)": 0.285461 }, { "acc": 0.74967422, "epoch": 1.4113166500523595, "grad_norm": 6.46875, "learning_rate": 2.1873357493656965e-06, "loss": 0.97470217, "memory(GiB)": 142.32, "step": 126180, "train_speed(iter/s)": 0.285476 }, { "acc": 0.73230181, "epoch": 1.411540348998318, "grad_norm": 4.96875, "learning_rate": 2.1858068870289245e-06, "loss": 1.06672401, "memory(GiB)": 142.32, "step": 126200, "train_speed(iter/s)": 0.285493 }, { "acc": 0.73042941, "epoch": 1.4117640479442766, "grad_norm": 5.875, "learning_rate": 2.1842784097139945e-06, "loss": 1.08821583, "memory(GiB)": 142.32, "step": 126220, "train_speed(iter/s)": 0.285509 }, { "acc": 0.73451872, "epoch": 1.411987746890235, "grad_norm": 7.625, "learning_rate": 2.1827503176300224e-06, "loss": 1.05272808, "memory(GiB)": 142.32, "step": 126240, "train_speed(iter/s)": 0.285523 }, { "acc": 0.73493919, "epoch": 1.4122114458361936, "grad_norm": 6.96875, "learning_rate": 2.1812226109860764e-06, "loss": 1.04624004, "memory(GiB)": 142.32, "step": 126260, "train_speed(iter/s)": 0.28554 }, { "acc": 0.73274155, "epoch": 1.4124351447821522, "grad_norm": 5.375, "learning_rate": 2.1796952899911643e-06, "loss": 1.07648964, "memory(GiB)": 142.32, "step": 126280, "train_speed(iter/s)": 0.285555 }, { "acc": 0.74862485, "epoch": 1.4126588437281107, "grad_norm": 7.15625, "learning_rate": 2.1781683548542504e-06, "loss": 0.99465275, "memory(GiB)": 142.32, "step": 126300, "train_speed(iter/s)": 0.285569 }, { "acc": 0.74122381, "epoch": 1.4128825426740692, "grad_norm": 6.8125, "learning_rate": 2.1766418057842386e-06, "loss": 1.04070721, "memory(GiB)": 142.32, "step": 126320, "train_speed(iter/s)": 0.285585 }, { "acc": 0.72914219, "epoch": 1.4131062416200277, "grad_norm": 5.8125, "learning_rate": 2.1751156429899815e-06, "loss": 1.06560841, "memory(GiB)": 142.32, "step": 126340, "train_speed(iter/s)": 0.285602 }, { "acc": 0.73912807, "epoch": 1.4133299405659863, "grad_norm": 6.15625, "learning_rate": 2.1735898666802828e-06, "loss": 1.04831371, "memory(GiB)": 142.32, "step": 126360, "train_speed(iter/s)": 0.285616 }, { "acc": 0.73709602, "epoch": 1.4135536395119448, "grad_norm": 6.25, "learning_rate": 2.172064477063887e-06, "loss": 1.04244213, "memory(GiB)": 142.32, "step": 126380, "train_speed(iter/s)": 0.285632 }, { "acc": 0.73320303, "epoch": 1.4137773384579033, "grad_norm": 5.125, "learning_rate": 2.1705394743494935e-06, "loss": 1.07585659, "memory(GiB)": 142.32, "step": 126400, "train_speed(iter/s)": 0.285646 }, { "acc": 0.74439716, "epoch": 1.4140010374038618, "grad_norm": 7.125, "learning_rate": 2.169014858745742e-06, "loss": 1.00981245, "memory(GiB)": 142.32, "step": 126420, "train_speed(iter/s)": 0.285662 }, { "acc": 0.74350586, "epoch": 1.4142247363498204, "grad_norm": 6.15625, "learning_rate": 2.16749063046122e-06, "loss": 1.01784573, "memory(GiB)": 142.32, "step": 126440, "train_speed(iter/s)": 0.285677 }, { "acc": 0.74548144, "epoch": 1.414448435295779, "grad_norm": 5.6875, "learning_rate": 2.1659667897044678e-06, "loss": 1.02050495, "memory(GiB)": 142.32, "step": 126460, "train_speed(iter/s)": 0.285692 }, { "acc": 0.74031887, "epoch": 1.4146721342417374, "grad_norm": 4.46875, "learning_rate": 2.1644433366839648e-06, "loss": 1.03746662, "memory(GiB)": 142.32, "step": 126480, "train_speed(iter/s)": 0.285706 }, { "acc": 0.72451763, "epoch": 1.414895833187696, "grad_norm": 7.25, "learning_rate": 2.1629202716081443e-06, "loss": 1.10986729, "memory(GiB)": 142.32, "step": 126500, "train_speed(iter/s)": 0.285721 }, { "acc": 0.74355478, "epoch": 1.4151195321336545, "grad_norm": 8.4375, "learning_rate": 2.1613975946853815e-06, "loss": 1.02320852, "memory(GiB)": 142.32, "step": 126520, "train_speed(iter/s)": 0.285736 }, { "acc": 0.73322759, "epoch": 1.415343231079613, "grad_norm": 6.15625, "learning_rate": 2.159875306123999e-06, "loss": 1.06655712, "memory(GiB)": 142.32, "step": 126540, "train_speed(iter/s)": 0.285752 }, { "acc": 0.74057474, "epoch": 1.4155669300255715, "grad_norm": 5.84375, "learning_rate": 2.158353406132272e-06, "loss": 1.02469196, "memory(GiB)": 142.32, "step": 126560, "train_speed(iter/s)": 0.285768 }, { "acc": 0.73787451, "epoch": 1.41579062897153, "grad_norm": 5.6875, "learning_rate": 2.156831894918413e-06, "loss": 1.04657135, "memory(GiB)": 142.32, "step": 126580, "train_speed(iter/s)": 0.285783 }, { "acc": 0.74028807, "epoch": 1.4160143279174886, "grad_norm": 5.84375, "learning_rate": 2.1553107726905907e-06, "loss": 1.04473419, "memory(GiB)": 142.32, "step": 126600, "train_speed(iter/s)": 0.285798 }, { "acc": 0.74239044, "epoch": 1.4162380268634471, "grad_norm": 5.75, "learning_rate": 2.153790039656915e-06, "loss": 1.01511812, "memory(GiB)": 142.32, "step": 126620, "train_speed(iter/s)": 0.285812 }, { "acc": 0.72462616, "epoch": 1.4164617258094057, "grad_norm": 5.15625, "learning_rate": 2.152269696025442e-06, "loss": 1.09732637, "memory(GiB)": 142.32, "step": 126640, "train_speed(iter/s)": 0.285828 }, { "acc": 0.75204468, "epoch": 1.4166854247553642, "grad_norm": 6.8125, "learning_rate": 2.150749742004179e-06, "loss": 0.97582264, "memory(GiB)": 142.32, "step": 126660, "train_speed(iter/s)": 0.285841 }, { "acc": 0.73470926, "epoch": 1.4169091237013227, "grad_norm": 6.71875, "learning_rate": 2.149230177801077e-06, "loss": 1.05330334, "memory(GiB)": 142.32, "step": 126680, "train_speed(iter/s)": 0.285857 }, { "acc": 0.74802189, "epoch": 1.4171328226472812, "grad_norm": 6.34375, "learning_rate": 2.147711003624034e-06, "loss": 0.99263229, "memory(GiB)": 142.32, "step": 126700, "train_speed(iter/s)": 0.285872 }, { "acc": 0.74396124, "epoch": 1.4173565215932398, "grad_norm": 6.0, "learning_rate": 2.1461922196808914e-06, "loss": 1.02328243, "memory(GiB)": 142.32, "step": 126720, "train_speed(iter/s)": 0.285888 }, { "acc": 0.7376565, "epoch": 1.4175802205391983, "grad_norm": 5.875, "learning_rate": 2.1446738261794466e-06, "loss": 1.0227417, "memory(GiB)": 142.32, "step": 126740, "train_speed(iter/s)": 0.285903 }, { "acc": 0.73876677, "epoch": 1.4178039194851568, "grad_norm": 6.90625, "learning_rate": 2.1431558233274337e-06, "loss": 1.03740501, "memory(GiB)": 142.32, "step": 126760, "train_speed(iter/s)": 0.285919 }, { "acc": 0.74728551, "epoch": 1.4180276184311154, "grad_norm": 6.8125, "learning_rate": 2.1416382113325356e-06, "loss": 1.01292038, "memory(GiB)": 142.32, "step": 126780, "train_speed(iter/s)": 0.285933 }, { "acc": 0.75051746, "epoch": 1.4182513173770739, "grad_norm": 5.46875, "learning_rate": 2.140120990402388e-06, "loss": 0.99205036, "memory(GiB)": 142.32, "step": 126800, "train_speed(iter/s)": 0.285949 }, { "acc": 0.7414, "epoch": 1.4184750163230324, "grad_norm": 5.90625, "learning_rate": 2.138604160744564e-06, "loss": 1.02236595, "memory(GiB)": 142.32, "step": 126820, "train_speed(iter/s)": 0.285965 }, { "acc": 0.74303064, "epoch": 1.418698715268991, "grad_norm": 5.6875, "learning_rate": 2.1370877225665913e-06, "loss": 1.02820415, "memory(GiB)": 142.32, "step": 126840, "train_speed(iter/s)": 0.28598 }, { "acc": 0.73072214, "epoch": 1.4189224142149495, "grad_norm": 5.78125, "learning_rate": 2.135571676075939e-06, "loss": 1.06412373, "memory(GiB)": 142.32, "step": 126860, "train_speed(iter/s)": 0.285996 }, { "acc": 0.73883801, "epoch": 1.419146113160908, "grad_norm": 5.625, "learning_rate": 2.1340560214800217e-06, "loss": 1.03624134, "memory(GiB)": 142.32, "step": 126880, "train_speed(iter/s)": 0.286012 }, { "acc": 0.74263206, "epoch": 1.4193698121068665, "grad_norm": 7.59375, "learning_rate": 2.1325407589862057e-06, "loss": 1.02229404, "memory(GiB)": 142.32, "step": 126900, "train_speed(iter/s)": 0.286028 }, { "acc": 0.75294895, "epoch": 1.419593511052825, "grad_norm": 6.375, "learning_rate": 2.1310258888017983e-06, "loss": 0.96592503, "memory(GiB)": 142.32, "step": 126920, "train_speed(iter/s)": 0.286045 }, { "acc": 0.73310804, "epoch": 1.4198172099987836, "grad_norm": 5.5625, "learning_rate": 2.1295114111340575e-06, "loss": 1.0647254, "memory(GiB)": 142.32, "step": 126940, "train_speed(iter/s)": 0.28606 }, { "acc": 0.72811069, "epoch": 1.420040908944742, "grad_norm": 7.5625, "learning_rate": 2.1279973261901848e-06, "loss": 1.07570553, "memory(GiB)": 142.32, "step": 126960, "train_speed(iter/s)": 0.286076 }, { "acc": 0.72358208, "epoch": 1.4202646078907006, "grad_norm": 6.40625, "learning_rate": 2.126483634177326e-06, "loss": 1.11651306, "memory(GiB)": 142.32, "step": 126980, "train_speed(iter/s)": 0.28609 }, { "acc": 0.73329906, "epoch": 1.4204883068366592, "grad_norm": 5.625, "learning_rate": 2.12497033530258e-06, "loss": 1.05473919, "memory(GiB)": 142.32, "step": 127000, "train_speed(iter/s)": 0.286105 }, { "acc": 0.73275871, "epoch": 1.4207120057826177, "grad_norm": 5.34375, "learning_rate": 2.123457429772984e-06, "loss": 1.05794725, "memory(GiB)": 142.32, "step": 127020, "train_speed(iter/s)": 0.286122 }, { "acc": 0.73443489, "epoch": 1.4209357047285762, "grad_norm": 6.53125, "learning_rate": 2.1219449177955293e-06, "loss": 1.07323408, "memory(GiB)": 142.32, "step": 127040, "train_speed(iter/s)": 0.286137 }, { "acc": 0.74341621, "epoch": 1.4211594036745347, "grad_norm": 5.625, "learning_rate": 2.1204327995771464e-06, "loss": 1.01888876, "memory(GiB)": 142.32, "step": 127060, "train_speed(iter/s)": 0.286153 }, { "acc": 0.73499928, "epoch": 1.4213831026204933, "grad_norm": 6.65625, "learning_rate": 2.1189210753247127e-06, "loss": 1.06243601, "memory(GiB)": 142.32, "step": 127080, "train_speed(iter/s)": 0.28617 }, { "acc": 0.73724222, "epoch": 1.4216068015664518, "grad_norm": 6.65625, "learning_rate": 2.117409745245058e-06, "loss": 1.04308624, "memory(GiB)": 142.32, "step": 127100, "train_speed(iter/s)": 0.286186 }, { "acc": 0.73398275, "epoch": 1.4218305005124103, "grad_norm": 5.9375, "learning_rate": 2.1158988095449502e-06, "loss": 1.05977974, "memory(GiB)": 142.32, "step": 127120, "train_speed(iter/s)": 0.2862 }, { "acc": 0.7286798, "epoch": 1.4220541994583689, "grad_norm": 7.3125, "learning_rate": 2.114388268431111e-06, "loss": 1.08949919, "memory(GiB)": 142.32, "step": 127140, "train_speed(iter/s)": 0.286214 }, { "acc": 0.73041019, "epoch": 1.4222778984043274, "grad_norm": 6.59375, "learning_rate": 2.1128781221102e-06, "loss": 1.05972176, "memory(GiB)": 142.32, "step": 127160, "train_speed(iter/s)": 0.28623 }, { "acc": 0.73398232, "epoch": 1.422501597350286, "grad_norm": 5.4375, "learning_rate": 2.111368370788828e-06, "loss": 1.0642024, "memory(GiB)": 142.32, "step": 127180, "train_speed(iter/s)": 0.286244 }, { "acc": 0.74712915, "epoch": 1.4227252962962444, "grad_norm": 4.8125, "learning_rate": 2.1098590146735522e-06, "loss": 1.00064487, "memory(GiB)": 142.32, "step": 127200, "train_speed(iter/s)": 0.286259 }, { "acc": 0.74442434, "epoch": 1.422948995242203, "grad_norm": 6.0625, "learning_rate": 2.108350053970871e-06, "loss": 1.01679287, "memory(GiB)": 142.32, "step": 127220, "train_speed(iter/s)": 0.286274 }, { "acc": 0.7425622, "epoch": 1.4231726941881615, "grad_norm": 5.1875, "learning_rate": 2.1068414888872353e-06, "loss": 1.01521091, "memory(GiB)": 142.32, "step": 127240, "train_speed(iter/s)": 0.286289 }, { "acc": 0.74519944, "epoch": 1.42339639313412, "grad_norm": 6.78125, "learning_rate": 2.105333319629037e-06, "loss": 1.01159668, "memory(GiB)": 142.32, "step": 127260, "train_speed(iter/s)": 0.286304 }, { "acc": 0.72355614, "epoch": 1.4236200920800786, "grad_norm": 5.625, "learning_rate": 2.103825546402613e-06, "loss": 1.11621218, "memory(GiB)": 142.32, "step": 127280, "train_speed(iter/s)": 0.286319 }, { "acc": 0.73640065, "epoch": 1.423843791026037, "grad_norm": 4.96875, "learning_rate": 2.102318169414252e-06, "loss": 1.0648591, "memory(GiB)": 142.32, "step": 127300, "train_speed(iter/s)": 0.286334 }, { "acc": 0.73879294, "epoch": 1.4240674899719956, "grad_norm": 6.0, "learning_rate": 2.100811188870181e-06, "loss": 1.03016853, "memory(GiB)": 142.32, "step": 127320, "train_speed(iter/s)": 0.28635 }, { "acc": 0.7380096, "epoch": 1.4242911889179541, "grad_norm": 6.5, "learning_rate": 2.0993046049765796e-06, "loss": 1.03964176, "memory(GiB)": 142.32, "step": 127340, "train_speed(iter/s)": 0.286365 }, { "acc": 0.74058514, "epoch": 1.4245148878639127, "grad_norm": 5.78125, "learning_rate": 2.0977984179395693e-06, "loss": 1.02768965, "memory(GiB)": 142.32, "step": 127360, "train_speed(iter/s)": 0.28638 }, { "acc": 0.74517708, "epoch": 1.4247385868098712, "grad_norm": 4.90625, "learning_rate": 2.096292627965216e-06, "loss": 1.01209373, "memory(GiB)": 142.32, "step": 127380, "train_speed(iter/s)": 0.286396 }, { "acc": 0.75636845, "epoch": 1.4249622857558297, "grad_norm": 5.28125, "learning_rate": 2.0947872352595353e-06, "loss": 0.96011448, "memory(GiB)": 142.32, "step": 127400, "train_speed(iter/s)": 0.286412 }, { "acc": 0.73578167, "epoch": 1.4251859847017883, "grad_norm": 7.1875, "learning_rate": 2.093282240028485e-06, "loss": 1.04795799, "memory(GiB)": 142.32, "step": 127420, "train_speed(iter/s)": 0.286426 }, { "acc": 0.74251184, "epoch": 1.4254096836477468, "grad_norm": 6.21875, "learning_rate": 2.0917776424779727e-06, "loss": 1.00126753, "memory(GiB)": 142.32, "step": 127440, "train_speed(iter/s)": 0.286441 }, { "acc": 0.74187675, "epoch": 1.4256333825937053, "grad_norm": 6.875, "learning_rate": 2.0902734428138468e-06, "loss": 1.00888157, "memory(GiB)": 142.32, "step": 127460, "train_speed(iter/s)": 0.286456 }, { "acc": 0.74508109, "epoch": 1.4258570815396638, "grad_norm": 7.5625, "learning_rate": 2.0887696412419017e-06, "loss": 1.00061188, "memory(GiB)": 142.32, "step": 127480, "train_speed(iter/s)": 0.286471 }, { "acc": 0.74693413, "epoch": 1.4260807804856224, "grad_norm": 6.625, "learning_rate": 2.0872662379678822e-06, "loss": 0.98381996, "memory(GiB)": 142.32, "step": 127500, "train_speed(iter/s)": 0.286485 }, { "acc": 0.73781271, "epoch": 1.426304479431581, "grad_norm": 5.96875, "learning_rate": 2.0857632331974725e-06, "loss": 1.03954773, "memory(GiB)": 142.32, "step": 127520, "train_speed(iter/s)": 0.286501 }, { "acc": 0.72727747, "epoch": 1.4265281783775394, "grad_norm": 5.9375, "learning_rate": 2.084260627136308e-06, "loss": 1.09563484, "memory(GiB)": 142.32, "step": 127540, "train_speed(iter/s)": 0.286515 }, { "acc": 0.73010798, "epoch": 1.4267518773234982, "grad_norm": 5.4375, "learning_rate": 2.0827584199899658e-06, "loss": 1.09201937, "memory(GiB)": 142.32, "step": 127560, "train_speed(iter/s)": 0.286529 }, { "acc": 0.73999658, "epoch": 1.4269755762694567, "grad_norm": 7.46875, "learning_rate": 2.0812566119639664e-06, "loss": 1.0419096, "memory(GiB)": 142.32, "step": 127580, "train_speed(iter/s)": 0.286543 }, { "acc": 0.73342638, "epoch": 1.4271992752154152, "grad_norm": 6.40625, "learning_rate": 2.0797552032637828e-06, "loss": 1.07120934, "memory(GiB)": 142.32, "step": 127600, "train_speed(iter/s)": 0.28656 }, { "acc": 0.73244104, "epoch": 1.4274229741613738, "grad_norm": 6.6875, "learning_rate": 2.078254194094826e-06, "loss": 1.07852802, "memory(GiB)": 142.32, "step": 127620, "train_speed(iter/s)": 0.286575 }, { "acc": 0.74216542, "epoch": 1.4276466731073323, "grad_norm": 6.15625, "learning_rate": 2.076753584662458e-06, "loss": 1.0335927, "memory(GiB)": 142.32, "step": 127640, "train_speed(iter/s)": 0.286591 }, { "acc": 0.72833056, "epoch": 1.4278703720532908, "grad_norm": 6.4375, "learning_rate": 2.0752533751719826e-06, "loss": 1.07727909, "memory(GiB)": 142.32, "step": 127660, "train_speed(iter/s)": 0.286606 }, { "acc": 0.74842467, "epoch": 1.4280940709992493, "grad_norm": 7.09375, "learning_rate": 2.0737535658286485e-06, "loss": 0.99747858, "memory(GiB)": 142.32, "step": 127680, "train_speed(iter/s)": 0.286622 }, { "acc": 0.72602801, "epoch": 1.4283177699452079, "grad_norm": 5.40625, "learning_rate": 2.0722541568376535e-06, "loss": 1.10386276, "memory(GiB)": 142.32, "step": 127700, "train_speed(iter/s)": 0.286636 }, { "acc": 0.74355507, "epoch": 1.4285414688911664, "grad_norm": 7.6875, "learning_rate": 2.0707551484041347e-06, "loss": 1.02311802, "memory(GiB)": 142.32, "step": 127720, "train_speed(iter/s)": 0.286652 }, { "acc": 0.73403277, "epoch": 1.428765167837125, "grad_norm": 7.28125, "learning_rate": 2.0692565407331834e-06, "loss": 1.04913845, "memory(GiB)": 142.32, "step": 127740, "train_speed(iter/s)": 0.286668 }, { "acc": 0.74194336, "epoch": 1.4289888667830835, "grad_norm": 5.4375, "learning_rate": 2.0677583340298263e-06, "loss": 1.01752567, "memory(GiB)": 142.32, "step": 127760, "train_speed(iter/s)": 0.286684 }, { "acc": 0.72727404, "epoch": 1.429212565729042, "grad_norm": 7.25, "learning_rate": 2.0662605284990388e-06, "loss": 1.08589649, "memory(GiB)": 142.32, "step": 127780, "train_speed(iter/s)": 0.286701 }, { "acc": 0.72910523, "epoch": 1.4294362646750005, "grad_norm": 5.09375, "learning_rate": 2.0647631243457455e-06, "loss": 1.08846645, "memory(GiB)": 142.32, "step": 127800, "train_speed(iter/s)": 0.286716 }, { "acc": 0.74842749, "epoch": 1.429659963620959, "grad_norm": 5.9375, "learning_rate": 2.0632661217748094e-06, "loss": 0.98528728, "memory(GiB)": 142.32, "step": 127820, "train_speed(iter/s)": 0.286732 }, { "acc": 0.74399729, "epoch": 1.4298836625669176, "grad_norm": 5.4375, "learning_rate": 2.0617695209910454e-06, "loss": 1.01696529, "memory(GiB)": 142.32, "step": 127840, "train_speed(iter/s)": 0.286747 }, { "acc": 0.73571072, "epoch": 1.430107361512876, "grad_norm": 6.4375, "learning_rate": 2.0602733221992077e-06, "loss": 1.04062843, "memory(GiB)": 142.32, "step": 127860, "train_speed(iter/s)": 0.286758 }, { "acc": 0.73402171, "epoch": 1.4303310604588346, "grad_norm": 5.28125, "learning_rate": 2.058777525603998e-06, "loss": 1.07043762, "memory(GiB)": 142.32, "step": 127880, "train_speed(iter/s)": 0.286772 }, { "acc": 0.72982521, "epoch": 1.4305547594047932, "grad_norm": 6.40625, "learning_rate": 2.057282131410062e-06, "loss": 1.09618835, "memory(GiB)": 142.32, "step": 127900, "train_speed(iter/s)": 0.286786 }, { "acc": 0.74700756, "epoch": 1.4307784583507517, "grad_norm": 4.65625, "learning_rate": 2.0557871398219903e-06, "loss": 1.00297928, "memory(GiB)": 142.32, "step": 127920, "train_speed(iter/s)": 0.286801 }, { "acc": 0.73431044, "epoch": 1.4310021572967102, "grad_norm": 6.4375, "learning_rate": 2.0542925510443224e-06, "loss": 1.06544542, "memory(GiB)": 142.32, "step": 127940, "train_speed(iter/s)": 0.286817 }, { "acc": 0.73345375, "epoch": 1.4312258562426687, "grad_norm": 6.375, "learning_rate": 2.0527983652815347e-06, "loss": 1.05847549, "memory(GiB)": 142.32, "step": 127960, "train_speed(iter/s)": 0.286832 }, { "acc": 0.7347146, "epoch": 1.4314495551886273, "grad_norm": 6.0625, "learning_rate": 2.0513045827380584e-06, "loss": 1.07436781, "memory(GiB)": 142.32, "step": 127980, "train_speed(iter/s)": 0.286849 }, { "acc": 0.73106194, "epoch": 1.4316732541345858, "grad_norm": 6.40625, "learning_rate": 2.0498112036182616e-06, "loss": 1.06379395, "memory(GiB)": 142.32, "step": 128000, "train_speed(iter/s)": 0.286865 }, { "epoch": 1.4316732541345858, "eval_acc": 0.6963461122094405, "eval_loss": 1.0715112686157227, "eval_runtime": 2339.0182, "eval_samples_per_second": 32.186, "eval_steps_per_second": 16.093, "step": 128000 }, { "acc": 0.73751698, "epoch": 1.4318969530805443, "grad_norm": 6.1875, "learning_rate": 2.0483182281264586e-06, "loss": 1.05344143, "memory(GiB)": 142.32, "step": 128020, "train_speed(iter/s)": 0.285352 }, { "acc": 0.73734493, "epoch": 1.4321206520265028, "grad_norm": 4.9375, "learning_rate": 2.0468256564669124e-06, "loss": 1.04113922, "memory(GiB)": 142.32, "step": 128040, "train_speed(iter/s)": 0.285367 }, { "acc": 0.73330135, "epoch": 1.4323443509724614, "grad_norm": 6.09375, "learning_rate": 2.0453334888438253e-06, "loss": 1.07540989, "memory(GiB)": 142.32, "step": 128060, "train_speed(iter/s)": 0.285381 }, { "acc": 0.7441112, "epoch": 1.43256804991842, "grad_norm": 6.1875, "learning_rate": 2.0438417254613508e-06, "loss": 1.02137527, "memory(GiB)": 142.32, "step": 128080, "train_speed(iter/s)": 0.285397 }, { "acc": 0.7399869, "epoch": 1.4327917488643784, "grad_norm": 5.625, "learning_rate": 2.042350366523582e-06, "loss": 1.04309444, "memory(GiB)": 142.32, "step": 128100, "train_speed(iter/s)": 0.285414 }, { "acc": 0.73367038, "epoch": 1.433015447810337, "grad_norm": 5.5, "learning_rate": 2.040859412234555e-06, "loss": 1.0484458, "memory(GiB)": 142.32, "step": 128120, "train_speed(iter/s)": 0.285428 }, { "acc": 0.73829384, "epoch": 1.4332391467562955, "grad_norm": 6.25, "learning_rate": 2.0393688627982585e-06, "loss": 1.05123281, "memory(GiB)": 142.32, "step": 128140, "train_speed(iter/s)": 0.285442 }, { "acc": 0.73132877, "epoch": 1.433462845702254, "grad_norm": 6.78125, "learning_rate": 2.0378787184186165e-06, "loss": 1.08112755, "memory(GiB)": 142.32, "step": 128160, "train_speed(iter/s)": 0.285458 }, { "acc": 0.73129673, "epoch": 1.4336865446482125, "grad_norm": 6.46875, "learning_rate": 2.0363889792995067e-06, "loss": 1.07175121, "memory(GiB)": 142.32, "step": 128180, "train_speed(iter/s)": 0.285473 }, { "acc": 0.73979826, "epoch": 1.433910243594171, "grad_norm": 5.65625, "learning_rate": 2.0348996456447438e-06, "loss": 1.05685101, "memory(GiB)": 142.32, "step": 128200, "train_speed(iter/s)": 0.28549 }, { "acc": 0.73370647, "epoch": 1.4341339425401296, "grad_norm": 5.4375, "learning_rate": 2.033410717658089e-06, "loss": 1.0615675, "memory(GiB)": 142.32, "step": 128220, "train_speed(iter/s)": 0.285506 }, { "acc": 0.74078999, "epoch": 1.4343576414860881, "grad_norm": 6.28125, "learning_rate": 2.0319221955432515e-06, "loss": 1.04240417, "memory(GiB)": 142.32, "step": 128240, "train_speed(iter/s)": 0.285522 }, { "acc": 0.73234329, "epoch": 1.4345813404320467, "grad_norm": 5.5625, "learning_rate": 2.03043407950388e-06, "loss": 1.07129517, "memory(GiB)": 142.32, "step": 128260, "train_speed(iter/s)": 0.285537 }, { "acc": 0.73522091, "epoch": 1.4348050393780052, "grad_norm": 6.28125, "learning_rate": 2.028946369743573e-06, "loss": 1.05974312, "memory(GiB)": 142.32, "step": 128280, "train_speed(iter/s)": 0.285553 }, { "acc": 0.74015751, "epoch": 1.4350287383239637, "grad_norm": 7.25, "learning_rate": 2.027459066465869e-06, "loss": 1.02605324, "memory(GiB)": 142.32, "step": 128300, "train_speed(iter/s)": 0.285568 }, { "acc": 0.74256554, "epoch": 1.4352524372699222, "grad_norm": 6.4375, "learning_rate": 2.02597216987425e-06, "loss": 1.01788464, "memory(GiB)": 142.32, "step": 128320, "train_speed(iter/s)": 0.285583 }, { "acc": 0.73684082, "epoch": 1.4354761362158808, "grad_norm": 6.6875, "learning_rate": 2.0244856801721484e-06, "loss": 1.03881073, "memory(GiB)": 142.32, "step": 128340, "train_speed(iter/s)": 0.285598 }, { "acc": 0.73339186, "epoch": 1.4356998351618393, "grad_norm": 6.53125, "learning_rate": 2.0229995975629348e-06, "loss": 1.06111288, "memory(GiB)": 142.32, "step": 128360, "train_speed(iter/s)": 0.285611 }, { "acc": 0.74241934, "epoch": 1.4359235341077978, "grad_norm": 6.0, "learning_rate": 2.021513922249928e-06, "loss": 1.01718178, "memory(GiB)": 142.32, "step": 128380, "train_speed(iter/s)": 0.285626 }, { "acc": 0.73222919, "epoch": 1.4361472330537564, "grad_norm": 6.3125, "learning_rate": 2.0200286544363902e-06, "loss": 1.06699791, "memory(GiB)": 142.32, "step": 128400, "train_speed(iter/s)": 0.285639 }, { "acc": 0.74177713, "epoch": 1.4363709319997149, "grad_norm": 5.15625, "learning_rate": 2.0185437943255233e-06, "loss": 1.03435287, "memory(GiB)": 142.32, "step": 128420, "train_speed(iter/s)": 0.285656 }, { "acc": 0.73918571, "epoch": 1.4365946309456734, "grad_norm": 5.625, "learning_rate": 2.017059342120482e-06, "loss": 1.03398705, "memory(GiB)": 142.32, "step": 128440, "train_speed(iter/s)": 0.28567 }, { "acc": 0.74821835, "epoch": 1.436818329891632, "grad_norm": 6.28125, "learning_rate": 2.0155752980243575e-06, "loss": 1.00404911, "memory(GiB)": 142.32, "step": 128460, "train_speed(iter/s)": 0.285685 }, { "acc": 0.7470314, "epoch": 1.4370420288375905, "grad_norm": 5.59375, "learning_rate": 2.0140916622401914e-06, "loss": 1.00729942, "memory(GiB)": 142.32, "step": 128480, "train_speed(iter/s)": 0.285701 }, { "acc": 0.74252872, "epoch": 1.437265727783549, "grad_norm": 5.8125, "learning_rate": 2.0126084349709635e-06, "loss": 1.01116924, "memory(GiB)": 142.32, "step": 128500, "train_speed(iter/s)": 0.285715 }, { "acc": 0.73373747, "epoch": 1.4374894267295075, "grad_norm": 5.34375, "learning_rate": 2.0111256164196e-06, "loss": 1.06040993, "memory(GiB)": 142.32, "step": 128520, "train_speed(iter/s)": 0.28573 }, { "acc": 0.7474556, "epoch": 1.437713125675466, "grad_norm": 5.90625, "learning_rate": 2.0096432067889752e-06, "loss": 0.99738026, "memory(GiB)": 142.32, "step": 128540, "train_speed(iter/s)": 0.285745 }, { "acc": 0.72597098, "epoch": 1.4379368246214246, "grad_norm": 5.9375, "learning_rate": 2.0081612062818995e-06, "loss": 1.09939404, "memory(GiB)": 142.32, "step": 128560, "train_speed(iter/s)": 0.285761 }, { "acc": 0.74909401, "epoch": 1.438160523567383, "grad_norm": 7.25, "learning_rate": 2.0066796151011358e-06, "loss": 0.99383316, "memory(GiB)": 142.32, "step": 128580, "train_speed(iter/s)": 0.285777 }, { "acc": 0.74370556, "epoch": 1.4383842225133416, "grad_norm": 5.71875, "learning_rate": 2.0051984334493857e-06, "loss": 1.00917835, "memory(GiB)": 142.32, "step": 128600, "train_speed(iter/s)": 0.285792 }, { "acc": 0.7393961, "epoch": 1.4386079214593002, "grad_norm": 5.90625, "learning_rate": 2.003717661529293e-06, "loss": 1.04160614, "memory(GiB)": 142.32, "step": 128620, "train_speed(iter/s)": 0.285808 }, { "acc": 0.7307436, "epoch": 1.4388316204052587, "grad_norm": 6.5, "learning_rate": 2.002237299543453e-06, "loss": 1.07194061, "memory(GiB)": 142.32, "step": 128640, "train_speed(iter/s)": 0.285824 }, { "acc": 0.71947536, "epoch": 1.4390553193512172, "grad_norm": 6.40625, "learning_rate": 2.000757347694397e-06, "loss": 1.13294525, "memory(GiB)": 142.32, "step": 128660, "train_speed(iter/s)": 0.285838 }, { "acc": 0.73212962, "epoch": 1.4392790182971757, "grad_norm": 7.6875, "learning_rate": 1.9992778061846064e-06, "loss": 1.06667681, "memory(GiB)": 142.32, "step": 128680, "train_speed(iter/s)": 0.285853 }, { "acc": 0.73768473, "epoch": 1.4395027172431343, "grad_norm": 5.84375, "learning_rate": 1.9977986752165017e-06, "loss": 1.02853565, "memory(GiB)": 142.32, "step": 128700, "train_speed(iter/s)": 0.285869 }, { "acc": 0.73962975, "epoch": 1.4397264161890928, "grad_norm": 5.5, "learning_rate": 1.996319954992448e-06, "loss": 1.03993835, "memory(GiB)": 142.32, "step": 128720, "train_speed(iter/s)": 0.285884 }, { "acc": 0.7328393, "epoch": 1.4399501151350513, "grad_norm": 5.65625, "learning_rate": 1.994841645714759e-06, "loss": 1.06377211, "memory(GiB)": 142.32, "step": 128740, "train_speed(iter/s)": 0.285899 }, { "acc": 0.73514023, "epoch": 1.4401738140810099, "grad_norm": 6.5, "learning_rate": 1.9933637475856845e-06, "loss": 1.05009956, "memory(GiB)": 142.32, "step": 128760, "train_speed(iter/s)": 0.285916 }, { "acc": 0.73811159, "epoch": 1.4403975130269684, "grad_norm": 5.5, "learning_rate": 1.9918862608074258e-06, "loss": 1.04599447, "memory(GiB)": 142.32, "step": 128780, "train_speed(iter/s)": 0.285931 }, { "acc": 0.72113476, "epoch": 1.440621211972927, "grad_norm": 6.875, "learning_rate": 1.9904091855821223e-06, "loss": 1.13724718, "memory(GiB)": 142.32, "step": 128800, "train_speed(iter/s)": 0.285947 }, { "acc": 0.74090114, "epoch": 1.4408449109188854, "grad_norm": 6.0, "learning_rate": 1.9889325221118576e-06, "loss": 1.02685127, "memory(GiB)": 142.32, "step": 128820, "train_speed(iter/s)": 0.285961 }, { "acc": 0.72505827, "epoch": 1.441068609864844, "grad_norm": 5.28125, "learning_rate": 1.987456270598664e-06, "loss": 1.08300209, "memory(GiB)": 142.32, "step": 128840, "train_speed(iter/s)": 0.285975 }, { "acc": 0.73785238, "epoch": 1.4412923088108025, "grad_norm": 5.5, "learning_rate": 1.9859804312445096e-06, "loss": 1.0368845, "memory(GiB)": 142.32, "step": 128860, "train_speed(iter/s)": 0.285989 }, { "acc": 0.74345198, "epoch": 1.441516007756761, "grad_norm": 5.5625, "learning_rate": 1.984505004251314e-06, "loss": 1.00579548, "memory(GiB)": 142.32, "step": 128880, "train_speed(iter/s)": 0.286004 }, { "acc": 0.73775959, "epoch": 1.4417397067027196, "grad_norm": 5.46875, "learning_rate": 1.983029989820936e-06, "loss": 1.04507751, "memory(GiB)": 142.32, "step": 128900, "train_speed(iter/s)": 0.286021 }, { "acc": 0.7377564, "epoch": 1.441963405648678, "grad_norm": 6.28125, "learning_rate": 1.9815553881551753e-06, "loss": 1.02814102, "memory(GiB)": 142.32, "step": 128920, "train_speed(iter/s)": 0.286037 }, { "acc": 0.73544903, "epoch": 1.4421871045946366, "grad_norm": 5.15625, "learning_rate": 1.9800811994557833e-06, "loss": 1.05040932, "memory(GiB)": 142.32, "step": 128940, "train_speed(iter/s)": 0.286053 }, { "acc": 0.72691684, "epoch": 1.4424108035405951, "grad_norm": 6.0625, "learning_rate": 1.9786074239244458e-06, "loss": 1.10089092, "memory(GiB)": 142.32, "step": 128960, "train_speed(iter/s)": 0.28607 }, { "acc": 0.72752252, "epoch": 1.4426345024865537, "grad_norm": 5.0, "learning_rate": 1.9771340617628e-06, "loss": 1.08757648, "memory(GiB)": 142.32, "step": 128980, "train_speed(iter/s)": 0.286084 }, { "acc": 0.7424614, "epoch": 1.4428582014325122, "grad_norm": 6.375, "learning_rate": 1.9756611131724215e-06, "loss": 1.03395386, "memory(GiB)": 142.32, "step": 129000, "train_speed(iter/s)": 0.286099 }, { "acc": 0.73471794, "epoch": 1.4430819003784707, "grad_norm": 6.59375, "learning_rate": 1.974188578354829e-06, "loss": 1.05090637, "memory(GiB)": 142.32, "step": 129020, "train_speed(iter/s)": 0.286115 }, { "acc": 0.73057232, "epoch": 1.4433055993244293, "grad_norm": 5.46875, "learning_rate": 1.972716457511489e-06, "loss": 1.06757069, "memory(GiB)": 142.32, "step": 129040, "train_speed(iter/s)": 0.286132 }, { "acc": 0.72897367, "epoch": 1.4435292982703878, "grad_norm": 7.3125, "learning_rate": 1.9712447508438072e-06, "loss": 1.09499464, "memory(GiB)": 142.32, "step": 129060, "train_speed(iter/s)": 0.286148 }, { "acc": 0.73835411, "epoch": 1.4437529972163463, "grad_norm": 6.53125, "learning_rate": 1.9697734585531348e-06, "loss": 1.02161007, "memory(GiB)": 142.32, "step": 129080, "train_speed(iter/s)": 0.286164 }, { "acc": 0.72449684, "epoch": 1.4439766961623048, "grad_norm": 6.59375, "learning_rate": 1.9683025808407635e-06, "loss": 1.08971405, "memory(GiB)": 142.32, "step": 129100, "train_speed(iter/s)": 0.286181 }, { "acc": 0.72552643, "epoch": 1.4442003951082634, "grad_norm": 6.125, "learning_rate": 1.9668321179079337e-06, "loss": 1.10444746, "memory(GiB)": 142.32, "step": 129120, "train_speed(iter/s)": 0.286197 }, { "acc": 0.73663111, "epoch": 1.444424094054222, "grad_norm": 6.84375, "learning_rate": 1.965362069955824e-06, "loss": 1.04913235, "memory(GiB)": 142.32, "step": 129140, "train_speed(iter/s)": 0.286214 }, { "acc": 0.7394722, "epoch": 1.4446477930001804, "grad_norm": 7.46875, "learning_rate": 1.9638924371855565e-06, "loss": 1.02246113, "memory(GiB)": 142.32, "step": 129160, "train_speed(iter/s)": 0.28623 }, { "acc": 0.73141584, "epoch": 1.444871491946139, "grad_norm": 6.59375, "learning_rate": 1.962423219798202e-06, "loss": 1.05092793, "memory(GiB)": 142.32, "step": 129180, "train_speed(iter/s)": 0.286245 }, { "acc": 0.73052888, "epoch": 1.4450951908920975, "grad_norm": 7.03125, "learning_rate": 1.9609544179947653e-06, "loss": 1.06576929, "memory(GiB)": 142.32, "step": 129200, "train_speed(iter/s)": 0.286261 }, { "acc": 0.73475547, "epoch": 1.445318889838056, "grad_norm": 5.5, "learning_rate": 1.9594860319762045e-06, "loss": 1.07412863, "memory(GiB)": 142.32, "step": 129220, "train_speed(iter/s)": 0.286275 }, { "acc": 0.75785837, "epoch": 1.4455425887840145, "grad_norm": 6.96875, "learning_rate": 1.958018061943413e-06, "loss": 0.9613802, "memory(GiB)": 142.32, "step": 129240, "train_speed(iter/s)": 0.28629 }, { "acc": 0.73054128, "epoch": 1.445766287729973, "grad_norm": 7.3125, "learning_rate": 1.9565505080972293e-06, "loss": 1.06349716, "memory(GiB)": 142.32, "step": 129260, "train_speed(iter/s)": 0.286306 }, { "acc": 0.72632504, "epoch": 1.4459899866759316, "grad_norm": 5.4375, "learning_rate": 1.955083370638438e-06, "loss": 1.10261002, "memory(GiB)": 142.32, "step": 129280, "train_speed(iter/s)": 0.286321 }, { "acc": 0.7219821, "epoch": 1.4462136856218901, "grad_norm": 6.375, "learning_rate": 1.953616649767762e-06, "loss": 1.11264973, "memory(GiB)": 142.32, "step": 129300, "train_speed(iter/s)": 0.286336 }, { "acc": 0.74126983, "epoch": 1.4464373845678486, "grad_norm": 5.8125, "learning_rate": 1.952150345685874e-06, "loss": 1.02502127, "memory(GiB)": 142.32, "step": 129320, "train_speed(iter/s)": 0.286352 }, { "acc": 0.73670907, "epoch": 1.4466610835138072, "grad_norm": 5.0625, "learning_rate": 1.9506844585933817e-06, "loss": 1.04893789, "memory(GiB)": 142.32, "step": 129340, "train_speed(iter/s)": 0.286368 }, { "acc": 0.73836432, "epoch": 1.4468847824597657, "grad_norm": 5.6875, "learning_rate": 1.949218988690838e-06, "loss": 1.03103657, "memory(GiB)": 142.32, "step": 129360, "train_speed(iter/s)": 0.286384 }, { "acc": 0.73202925, "epoch": 1.4471084814057242, "grad_norm": 6.90625, "learning_rate": 1.9477539361787447e-06, "loss": 1.07316628, "memory(GiB)": 142.32, "step": 129380, "train_speed(iter/s)": 0.286398 }, { "acc": 0.74350734, "epoch": 1.4473321803516828, "grad_norm": 7.90625, "learning_rate": 1.9462893012575373e-06, "loss": 1.03581371, "memory(GiB)": 142.32, "step": 129400, "train_speed(iter/s)": 0.286412 }, { "acc": 0.74396334, "epoch": 1.4475558792976413, "grad_norm": 4.96875, "learning_rate": 1.9448250841276033e-06, "loss": 1.02433338, "memory(GiB)": 142.32, "step": 129420, "train_speed(iter/s)": 0.286427 }, { "acc": 0.72875519, "epoch": 1.4477795782435998, "grad_norm": 5.46875, "learning_rate": 1.9433612849892664e-06, "loss": 1.09347115, "memory(GiB)": 142.32, "step": 129440, "train_speed(iter/s)": 0.286441 }, { "acc": 0.73420858, "epoch": 1.4480032771895583, "grad_norm": 7.4375, "learning_rate": 1.9418979040427934e-06, "loss": 1.05863953, "memory(GiB)": 142.32, "step": 129460, "train_speed(iter/s)": 0.286456 }, { "acc": 0.73809071, "epoch": 1.4482269761355169, "grad_norm": 5.125, "learning_rate": 1.940434941488399e-06, "loss": 1.03535042, "memory(GiB)": 142.32, "step": 129480, "train_speed(iter/s)": 0.286471 }, { "acc": 0.73762341, "epoch": 1.4484506750814754, "grad_norm": 5.1875, "learning_rate": 1.9389723975262337e-06, "loss": 1.0549757, "memory(GiB)": 142.32, "step": 129500, "train_speed(iter/s)": 0.286485 }, { "acc": 0.74511099, "epoch": 1.448674374027434, "grad_norm": 7.09375, "learning_rate": 1.937510272356399e-06, "loss": 1.00096798, "memory(GiB)": 142.32, "step": 129520, "train_speed(iter/s)": 0.2865 }, { "acc": 0.73665104, "epoch": 1.4488980729733925, "grad_norm": 9.375, "learning_rate": 1.936048566178932e-06, "loss": 1.05093861, "memory(GiB)": 142.32, "step": 129540, "train_speed(iter/s)": 0.286514 }, { "acc": 0.74404087, "epoch": 1.449121771919351, "grad_norm": 6.59375, "learning_rate": 1.934587279193813e-06, "loss": 1.01496124, "memory(GiB)": 142.32, "step": 129560, "train_speed(iter/s)": 0.28653 }, { "acc": 0.74462419, "epoch": 1.4493454708653095, "grad_norm": 5.625, "learning_rate": 1.933126411600971e-06, "loss": 1.0221899, "memory(GiB)": 142.32, "step": 129580, "train_speed(iter/s)": 0.286546 }, { "acc": 0.72727461, "epoch": 1.449569169811268, "grad_norm": 6.71875, "learning_rate": 1.9316659636002698e-06, "loss": 1.08265791, "memory(GiB)": 142.32, "step": 129600, "train_speed(iter/s)": 0.286561 }, { "acc": 0.75711203, "epoch": 1.4497928687572266, "grad_norm": 6.0, "learning_rate": 1.930205935391524e-06, "loss": 0.95416298, "memory(GiB)": 142.32, "step": 129620, "train_speed(iter/s)": 0.286576 }, { "acc": 0.73853931, "epoch": 1.450016567703185, "grad_norm": 6.03125, "learning_rate": 1.9287463271744827e-06, "loss": 1.03473988, "memory(GiB)": 142.32, "step": 129640, "train_speed(iter/s)": 0.286593 }, { "acc": 0.74504318, "epoch": 1.4502402666491436, "grad_norm": 6.6875, "learning_rate": 1.927287139148841e-06, "loss": 1.00748005, "memory(GiB)": 142.32, "step": 129660, "train_speed(iter/s)": 0.286607 }, { "acc": 0.72113738, "epoch": 1.4504639655951022, "grad_norm": 7.03125, "learning_rate": 1.925828371514239e-06, "loss": 1.13636036, "memory(GiB)": 142.32, "step": 129680, "train_speed(iter/s)": 0.28662 }, { "acc": 0.74045658, "epoch": 1.4506876645410607, "grad_norm": 5.75, "learning_rate": 1.924370024470254e-06, "loss": 1.02923889, "memory(GiB)": 142.32, "step": 129700, "train_speed(iter/s)": 0.286635 }, { "acc": 0.73234682, "epoch": 1.4509113634870192, "grad_norm": 6.0625, "learning_rate": 1.922912098216413e-06, "loss": 1.0627943, "memory(GiB)": 142.32, "step": 129720, "train_speed(iter/s)": 0.286649 }, { "acc": 0.74533424, "epoch": 1.4511350624329777, "grad_norm": 6.65625, "learning_rate": 1.921454592952178e-06, "loss": 0.99809761, "memory(GiB)": 142.32, "step": 129740, "train_speed(iter/s)": 0.286665 }, { "acc": 0.72847257, "epoch": 1.4513587613789363, "grad_norm": 5.4375, "learning_rate": 1.9199975088769558e-06, "loss": 1.0889533, "memory(GiB)": 142.32, "step": 129760, "train_speed(iter/s)": 0.286679 }, { "acc": 0.74800053, "epoch": 1.4515824603248948, "grad_norm": 5.125, "learning_rate": 1.9185408461900997e-06, "loss": 0.99836521, "memory(GiB)": 142.32, "step": 129780, "train_speed(iter/s)": 0.286694 }, { "acc": 0.72989063, "epoch": 1.4518061592708533, "grad_norm": 5.84375, "learning_rate": 1.9170846050908983e-06, "loss": 1.06860199, "memory(GiB)": 142.32, "step": 129800, "train_speed(iter/s)": 0.286709 }, { "acc": 0.74401693, "epoch": 1.4520298582168119, "grad_norm": 7.4375, "learning_rate": 1.915628785778589e-06, "loss": 1.01536713, "memory(GiB)": 142.32, "step": 129820, "train_speed(iter/s)": 0.286723 }, { "acc": 0.7394846, "epoch": 1.4522535571627704, "grad_norm": 5.875, "learning_rate": 1.9141733884523485e-06, "loss": 1.0506835, "memory(GiB)": 142.32, "step": 129840, "train_speed(iter/s)": 0.286739 }, { "acc": 0.73589268, "epoch": 1.452477256108729, "grad_norm": 7.03125, "learning_rate": 1.9127184133112923e-06, "loss": 1.05637808, "memory(GiB)": 142.32, "step": 129860, "train_speed(iter/s)": 0.286755 }, { "acc": 0.73542662, "epoch": 1.4527009550546874, "grad_norm": 5.90625, "learning_rate": 1.911263860554487e-06, "loss": 1.02707119, "memory(GiB)": 142.32, "step": 129880, "train_speed(iter/s)": 0.286771 }, { "acc": 0.73303747, "epoch": 1.452924654000646, "grad_norm": 6.625, "learning_rate": 1.909809730380932e-06, "loss": 1.06178865, "memory(GiB)": 142.32, "step": 129900, "train_speed(iter/s)": 0.286787 }, { "acc": 0.74472809, "epoch": 1.4531483529466045, "grad_norm": 6.4375, "learning_rate": 1.908356022989577e-06, "loss": 1.02023344, "memory(GiB)": 142.32, "step": 129920, "train_speed(iter/s)": 0.2868 }, { "acc": 0.74130201, "epoch": 1.453372051892563, "grad_norm": 6.875, "learning_rate": 1.906902738579307e-06, "loss": 1.02707157, "memory(GiB)": 142.32, "step": 129940, "train_speed(iter/s)": 0.286816 }, { "acc": 0.72972031, "epoch": 1.4535957508385215, "grad_norm": 5.625, "learning_rate": 1.9054498773489521e-06, "loss": 1.09289341, "memory(GiB)": 142.32, "step": 129960, "train_speed(iter/s)": 0.286831 }, { "acc": 0.74183521, "epoch": 1.45381944978448, "grad_norm": 5.96875, "learning_rate": 1.9039974394972865e-06, "loss": 1.01699543, "memory(GiB)": 142.32, "step": 129980, "train_speed(iter/s)": 0.286844 }, { "acc": 0.72408304, "epoch": 1.4540431487304386, "grad_norm": 5.71875, "learning_rate": 1.9025454252230214e-06, "loss": 1.1018198, "memory(GiB)": 142.32, "step": 130000, "train_speed(iter/s)": 0.286861 }, { "epoch": 1.4540431487304386, "eval_acc": 0.6963303865764814, "eval_loss": 1.0714620351791382, "eval_runtime": 2341.1916, "eval_samples_per_second": 32.156, "eval_steps_per_second": 16.078, "step": 130000 }, { "acc": 0.73561201, "epoch": 1.4542668476763971, "grad_norm": 5.40625, "learning_rate": 1.901093834724817e-06, "loss": 1.0399313, "memory(GiB)": 142.32, "step": 130020, "train_speed(iter/s)": 0.28537 }, { "acc": 0.75263739, "epoch": 1.4544905466223557, "grad_norm": 5.03125, "learning_rate": 1.8996426682012675e-06, "loss": 0.98200035, "memory(GiB)": 142.32, "step": 130040, "train_speed(iter/s)": 0.285385 }, { "acc": 0.73777885, "epoch": 1.4547142455683142, "grad_norm": 6.15625, "learning_rate": 1.8981919258509174e-06, "loss": 1.04818039, "memory(GiB)": 142.32, "step": 130060, "train_speed(iter/s)": 0.2854 }, { "acc": 0.74851127, "epoch": 1.4549379445142727, "grad_norm": 4.75, "learning_rate": 1.8967416078722466e-06, "loss": 0.99346037, "memory(GiB)": 142.32, "step": 130080, "train_speed(iter/s)": 0.285416 }, { "acc": 0.73253641, "epoch": 1.4551616434602312, "grad_norm": 6.0, "learning_rate": 1.8952917144636784e-06, "loss": 1.06689739, "memory(GiB)": 142.32, "step": 130100, "train_speed(iter/s)": 0.28543 }, { "acc": 0.73121037, "epoch": 1.4553853424061898, "grad_norm": 7.0, "learning_rate": 1.8938422458235816e-06, "loss": 1.06928234, "memory(GiB)": 142.32, "step": 130120, "train_speed(iter/s)": 0.285445 }, { "acc": 0.73070297, "epoch": 1.4556090413521483, "grad_norm": 6.59375, "learning_rate": 1.892393202150261e-06, "loss": 1.07702456, "memory(GiB)": 142.32, "step": 130140, "train_speed(iter/s)": 0.285459 }, { "acc": 0.7335876, "epoch": 1.4558327402981068, "grad_norm": 5.15625, "learning_rate": 1.8909445836419699e-06, "loss": 1.05525179, "memory(GiB)": 142.32, "step": 130160, "train_speed(iter/s)": 0.285474 }, { "acc": 0.74396162, "epoch": 1.4560564392440654, "grad_norm": 5.21875, "learning_rate": 1.8894963904968982e-06, "loss": 1.02490311, "memory(GiB)": 142.32, "step": 130180, "train_speed(iter/s)": 0.285488 }, { "acc": 0.73841796, "epoch": 1.4562801381900239, "grad_norm": 6.8125, "learning_rate": 1.8880486229131783e-06, "loss": 1.05042763, "memory(GiB)": 142.32, "step": 130200, "train_speed(iter/s)": 0.285504 }, { "acc": 0.74026079, "epoch": 1.4565038371359824, "grad_norm": 6.59375, "learning_rate": 1.8866012810888889e-06, "loss": 1.02446404, "memory(GiB)": 142.32, "step": 130220, "train_speed(iter/s)": 0.28552 }, { "acc": 0.73945017, "epoch": 1.456727536081941, "grad_norm": 5.28125, "learning_rate": 1.8851543652220445e-06, "loss": 1.03423271, "memory(GiB)": 142.32, "step": 130240, "train_speed(iter/s)": 0.285533 }, { "acc": 0.74183402, "epoch": 1.4569512350278995, "grad_norm": 5.75, "learning_rate": 1.883707875510604e-06, "loss": 1.03421984, "memory(GiB)": 142.32, "step": 130260, "train_speed(iter/s)": 0.285547 }, { "acc": 0.74620781, "epoch": 1.457174933973858, "grad_norm": 6.34375, "learning_rate": 1.8822618121524671e-06, "loss": 0.99705381, "memory(GiB)": 142.32, "step": 130280, "train_speed(iter/s)": 0.285562 }, { "acc": 0.72745361, "epoch": 1.4573986329198165, "grad_norm": 4.875, "learning_rate": 1.8808161753454785e-06, "loss": 1.08368874, "memory(GiB)": 142.32, "step": 130300, "train_speed(iter/s)": 0.285576 }, { "acc": 0.74068956, "epoch": 1.457622331865775, "grad_norm": 6.4375, "learning_rate": 1.8793709652874203e-06, "loss": 1.0127121, "memory(GiB)": 142.32, "step": 130320, "train_speed(iter/s)": 0.285591 }, { "acc": 0.73394952, "epoch": 1.4578460308117336, "grad_norm": 6.78125, "learning_rate": 1.877926182176017e-06, "loss": 1.06048412, "memory(GiB)": 142.32, "step": 130340, "train_speed(iter/s)": 0.285605 }, { "acc": 0.74790564, "epoch": 1.4580697297576921, "grad_norm": 6.3125, "learning_rate": 1.876481826208938e-06, "loss": 1.00280714, "memory(GiB)": 142.32, "step": 130360, "train_speed(iter/s)": 0.285621 }, { "acc": 0.73976297, "epoch": 1.4582934287036506, "grad_norm": 8.5, "learning_rate": 1.8750378975837884e-06, "loss": 1.04657135, "memory(GiB)": 142.32, "step": 130380, "train_speed(iter/s)": 0.285637 }, { "acc": 0.74908872, "epoch": 1.4585171276496092, "grad_norm": 5.9375, "learning_rate": 1.8735943964981229e-06, "loss": 0.9883604, "memory(GiB)": 142.32, "step": 130400, "train_speed(iter/s)": 0.285652 }, { "acc": 0.73703423, "epoch": 1.4587408265955677, "grad_norm": 5.96875, "learning_rate": 1.8721513231494304e-06, "loss": 1.05264549, "memory(GiB)": 142.32, "step": 130420, "train_speed(iter/s)": 0.285668 }, { "acc": 0.74275875, "epoch": 1.4589645255415262, "grad_norm": 5.75, "learning_rate": 1.8707086777351424e-06, "loss": 1.01603241, "memory(GiB)": 142.32, "step": 130440, "train_speed(iter/s)": 0.285684 }, { "acc": 0.75162659, "epoch": 1.4591882244874848, "grad_norm": 5.78125, "learning_rate": 1.8692664604526368e-06, "loss": 0.97879553, "memory(GiB)": 142.32, "step": 130460, "train_speed(iter/s)": 0.285699 }, { "acc": 0.7358532, "epoch": 1.4594119234334433, "grad_norm": 6.34375, "learning_rate": 1.867824671499226e-06, "loss": 1.05651159, "memory(GiB)": 142.32, "step": 130480, "train_speed(iter/s)": 0.285712 }, { "acc": 0.73159728, "epoch": 1.4596356223794018, "grad_norm": 6.625, "learning_rate": 1.8663833110721714e-06, "loss": 1.06363754, "memory(GiB)": 142.32, "step": 130500, "train_speed(iter/s)": 0.285727 }, { "acc": 0.72494822, "epoch": 1.4598593213253603, "grad_norm": 6.9375, "learning_rate": 1.8649423793686694e-06, "loss": 1.11258764, "memory(GiB)": 142.32, "step": 130520, "train_speed(iter/s)": 0.285742 }, { "acc": 0.74074955, "epoch": 1.4600830202713189, "grad_norm": 7.3125, "learning_rate": 1.8635018765858582e-06, "loss": 1.03550034, "memory(GiB)": 142.32, "step": 130540, "train_speed(iter/s)": 0.285757 }, { "acc": 0.73522606, "epoch": 1.4603067192172774, "grad_norm": 5.0, "learning_rate": 1.8620618029208231e-06, "loss": 1.03637409, "memory(GiB)": 142.32, "step": 130560, "train_speed(iter/s)": 0.285773 }, { "acc": 0.74152937, "epoch": 1.460530418163236, "grad_norm": 7.0, "learning_rate": 1.8606221585705831e-06, "loss": 1.01960125, "memory(GiB)": 142.32, "step": 130580, "train_speed(iter/s)": 0.285789 }, { "acc": 0.73542795, "epoch": 1.4607541171091944, "grad_norm": 5.75, "learning_rate": 1.8591829437321058e-06, "loss": 1.06121502, "memory(GiB)": 142.32, "step": 130600, "train_speed(iter/s)": 0.285803 }, { "acc": 0.73452072, "epoch": 1.460977816055153, "grad_norm": 7.75, "learning_rate": 1.8577441586022937e-06, "loss": 1.04707928, "memory(GiB)": 142.32, "step": 130620, "train_speed(iter/s)": 0.285818 }, { "acc": 0.72838516, "epoch": 1.4612015150011115, "grad_norm": 6.59375, "learning_rate": 1.856305803377993e-06, "loss": 1.08617001, "memory(GiB)": 142.32, "step": 130640, "train_speed(iter/s)": 0.285832 }, { "acc": 0.73013182, "epoch": 1.46142521394707, "grad_norm": 6.34375, "learning_rate": 1.8548678782559932e-06, "loss": 1.09056435, "memory(GiB)": 142.32, "step": 130660, "train_speed(iter/s)": 0.285848 }, { "acc": 0.72574081, "epoch": 1.4616489128930286, "grad_norm": 6.40625, "learning_rate": 1.853430383433021e-06, "loss": 1.08462124, "memory(GiB)": 142.32, "step": 130680, "train_speed(iter/s)": 0.285862 }, { "acc": 0.72761025, "epoch": 1.461872611838987, "grad_norm": 6.09375, "learning_rate": 1.8519933191057483e-06, "loss": 1.09158916, "memory(GiB)": 142.32, "step": 130700, "train_speed(iter/s)": 0.285878 }, { "acc": 0.7508955, "epoch": 1.4620963107849456, "grad_norm": 5.4375, "learning_rate": 1.8505566854707845e-06, "loss": 0.98674555, "memory(GiB)": 142.32, "step": 130720, "train_speed(iter/s)": 0.285894 }, { "acc": 0.74068885, "epoch": 1.4623200097309041, "grad_norm": 5.71875, "learning_rate": 1.8491204827246811e-06, "loss": 1.03017893, "memory(GiB)": 142.32, "step": 130740, "train_speed(iter/s)": 0.285908 }, { "acc": 0.72735734, "epoch": 1.4625437086768627, "grad_norm": 5.9375, "learning_rate": 1.847684711063934e-06, "loss": 1.09270287, "memory(GiB)": 142.32, "step": 130760, "train_speed(iter/s)": 0.285924 }, { "acc": 0.74438372, "epoch": 1.4627674076228212, "grad_norm": 7.53125, "learning_rate": 1.8462493706849733e-06, "loss": 1.01714745, "memory(GiB)": 142.32, "step": 130780, "train_speed(iter/s)": 0.285941 }, { "acc": 0.74514785, "epoch": 1.4629911065687797, "grad_norm": 5.6875, "learning_rate": 1.844814461784178e-06, "loss": 1.0240346, "memory(GiB)": 142.32, "step": 130800, "train_speed(iter/s)": 0.285956 }, { "acc": 0.73684225, "epoch": 1.4632148055147383, "grad_norm": 7.40625, "learning_rate": 1.843379984557862e-06, "loss": 1.03111906, "memory(GiB)": 142.32, "step": 130820, "train_speed(iter/s)": 0.28597 }, { "acc": 0.73892722, "epoch": 1.4634385044606968, "grad_norm": 6.9375, "learning_rate": 1.841945939202281e-06, "loss": 1.04951763, "memory(GiB)": 142.32, "step": 130840, "train_speed(iter/s)": 0.285984 }, { "acc": 0.74191322, "epoch": 1.4636622034066553, "grad_norm": 5.84375, "learning_rate": 1.8405123259136365e-06, "loss": 1.00469017, "memory(GiB)": 142.32, "step": 130860, "train_speed(iter/s)": 0.285996 }, { "acc": 0.75313358, "epoch": 1.4638859023526138, "grad_norm": 6.125, "learning_rate": 1.8390791448880635e-06, "loss": 0.97647715, "memory(GiB)": 142.32, "step": 130880, "train_speed(iter/s)": 0.28601 }, { "acc": 0.73190727, "epoch": 1.4641096012985724, "grad_norm": 6.21875, "learning_rate": 1.837646396321645e-06, "loss": 1.06737061, "memory(GiB)": 142.32, "step": 130900, "train_speed(iter/s)": 0.286027 }, { "acc": 0.73731146, "epoch": 1.464333300244531, "grad_norm": 6.78125, "learning_rate": 1.8362140804104e-06, "loss": 1.04388428, "memory(GiB)": 142.32, "step": 130920, "train_speed(iter/s)": 0.28604 }, { "acc": 0.7395462, "epoch": 1.4645569991904894, "grad_norm": 6.34375, "learning_rate": 1.8347821973502878e-06, "loss": 1.02687902, "memory(GiB)": 142.32, "step": 130940, "train_speed(iter/s)": 0.286055 }, { "acc": 0.7443635, "epoch": 1.464780698136448, "grad_norm": 5.46875, "learning_rate": 1.8333507473372142e-06, "loss": 1.03296318, "memory(GiB)": 142.32, "step": 130960, "train_speed(iter/s)": 0.28607 }, { "acc": 0.73602324, "epoch": 1.4650043970824065, "grad_norm": 6.90625, "learning_rate": 1.8319197305670189e-06, "loss": 1.04446011, "memory(GiB)": 142.32, "step": 130980, "train_speed(iter/s)": 0.286083 }, { "acc": 0.72847953, "epoch": 1.465228096028365, "grad_norm": 6.25, "learning_rate": 1.830489147235488e-06, "loss": 1.10860195, "memory(GiB)": 142.32, "step": 131000, "train_speed(iter/s)": 0.286098 }, { "acc": 0.74170003, "epoch": 1.4654517949743235, "grad_norm": 4.9375, "learning_rate": 1.829058997538345e-06, "loss": 1.02335072, "memory(GiB)": 142.32, "step": 131020, "train_speed(iter/s)": 0.286113 }, { "acc": 0.74088211, "epoch": 1.465675493920282, "grad_norm": 6.9375, "learning_rate": 1.8276292816712521e-06, "loss": 1.02992115, "memory(GiB)": 142.32, "step": 131040, "train_speed(iter/s)": 0.286127 }, { "acc": 0.74359007, "epoch": 1.4658991928662406, "grad_norm": 6.09375, "learning_rate": 1.8261999998298192e-06, "loss": 1.01670551, "memory(GiB)": 142.32, "step": 131060, "train_speed(iter/s)": 0.286142 }, { "acc": 0.74103346, "epoch": 1.4661228918121991, "grad_norm": 7.0625, "learning_rate": 1.8247711522095884e-06, "loss": 1.03566113, "memory(GiB)": 142.32, "step": 131080, "train_speed(iter/s)": 0.286157 }, { "acc": 0.73779736, "epoch": 1.4663465907581577, "grad_norm": 8.0625, "learning_rate": 1.8233427390060505e-06, "loss": 1.0475214, "memory(GiB)": 142.32, "step": 131100, "train_speed(iter/s)": 0.286173 }, { "acc": 0.74859371, "epoch": 1.4665702897041162, "grad_norm": 5.96875, "learning_rate": 1.8219147604146303e-06, "loss": 0.98870659, "memory(GiB)": 142.32, "step": 131120, "train_speed(iter/s)": 0.286188 }, { "acc": 0.7424695, "epoch": 1.4667939886500747, "grad_norm": 6.90625, "learning_rate": 1.8204872166306948e-06, "loss": 1.01596127, "memory(GiB)": 142.32, "step": 131140, "train_speed(iter/s)": 0.286201 }, { "acc": 0.748669, "epoch": 1.4670176875960332, "grad_norm": 4.90625, "learning_rate": 1.819060107849555e-06, "loss": 0.98997135, "memory(GiB)": 142.32, "step": 131160, "train_speed(iter/s)": 0.286216 }, { "acc": 0.74606113, "epoch": 1.4672413865419918, "grad_norm": 6.375, "learning_rate": 1.8176334342664576e-06, "loss": 0.99624081, "memory(GiB)": 142.32, "step": 131180, "train_speed(iter/s)": 0.286232 }, { "acc": 0.73771515, "epoch": 1.4674650854879503, "grad_norm": 5.5, "learning_rate": 1.8162071960765941e-06, "loss": 1.02067966, "memory(GiB)": 142.32, "step": 131200, "train_speed(iter/s)": 0.286247 }, { "acc": 0.74068775, "epoch": 1.4676887844339088, "grad_norm": 5.34375, "learning_rate": 1.8147813934750935e-06, "loss": 1.02467155, "memory(GiB)": 142.32, "step": 131220, "train_speed(iter/s)": 0.286263 }, { "acc": 0.74787703, "epoch": 1.4679124833798673, "grad_norm": 5.84375, "learning_rate": 1.8133560266570234e-06, "loss": 1.00976791, "memory(GiB)": 142.32, "step": 131240, "train_speed(iter/s)": 0.286277 }, { "acc": 0.74004931, "epoch": 1.4681361823258259, "grad_norm": 6.53125, "learning_rate": 1.811931095817398e-06, "loss": 1.04460678, "memory(GiB)": 142.32, "step": 131260, "train_speed(iter/s)": 0.286292 }, { "acc": 0.73959985, "epoch": 1.4683598812717844, "grad_norm": 6.5625, "learning_rate": 1.8105066011511657e-06, "loss": 1.04337788, "memory(GiB)": 142.32, "step": 131280, "train_speed(iter/s)": 0.286307 }, { "acc": 0.72611575, "epoch": 1.468583580217743, "grad_norm": 6.8125, "learning_rate": 1.8090825428532198e-06, "loss": 1.10128412, "memory(GiB)": 142.32, "step": 131300, "train_speed(iter/s)": 0.286321 }, { "acc": 0.73343487, "epoch": 1.4688072791637015, "grad_norm": 6.78125, "learning_rate": 1.8076589211183909e-06, "loss": 1.06741562, "memory(GiB)": 142.32, "step": 131320, "train_speed(iter/s)": 0.286337 }, { "acc": 0.74184637, "epoch": 1.46903097810966, "grad_norm": 5.90625, "learning_rate": 1.8062357361414496e-06, "loss": 1.0186491, "memory(GiB)": 142.32, "step": 131340, "train_speed(iter/s)": 0.286352 }, { "acc": 0.73851366, "epoch": 1.4692546770556185, "grad_norm": 6.3125, "learning_rate": 1.80481298811711e-06, "loss": 1.04777765, "memory(GiB)": 142.32, "step": 131360, "train_speed(iter/s)": 0.286366 }, { "acc": 0.74052887, "epoch": 1.469478376001577, "grad_norm": 5.28125, "learning_rate": 1.8033906772400217e-06, "loss": 1.03443966, "memory(GiB)": 142.32, "step": 131380, "train_speed(iter/s)": 0.286381 }, { "acc": 0.74663081, "epoch": 1.4697020749475356, "grad_norm": 7.03125, "learning_rate": 1.8019688037047806e-06, "loss": 0.99065561, "memory(GiB)": 142.32, "step": 131400, "train_speed(iter/s)": 0.286395 }, { "acc": 0.73164201, "epoch": 1.469925773893494, "grad_norm": 5.71875, "learning_rate": 1.8005473677059176e-06, "loss": 1.05540915, "memory(GiB)": 142.32, "step": 131420, "train_speed(iter/s)": 0.28641 }, { "acc": 0.73362761, "epoch": 1.4701494728394526, "grad_norm": 6.71875, "learning_rate": 1.7991263694379058e-06, "loss": 1.06881952, "memory(GiB)": 142.32, "step": 131440, "train_speed(iter/s)": 0.286426 }, { "acc": 0.72882628, "epoch": 1.4703731717854112, "grad_norm": 5.78125, "learning_rate": 1.7977058090951571e-06, "loss": 1.08760118, "memory(GiB)": 142.32, "step": 131460, "train_speed(iter/s)": 0.28644 }, { "acc": 0.75182037, "epoch": 1.4705968707313697, "grad_norm": 7.25, "learning_rate": 1.7962856868720236e-06, "loss": 0.99735422, "memory(GiB)": 142.32, "step": 131480, "train_speed(iter/s)": 0.286457 }, { "acc": 0.73497682, "epoch": 1.4708205696773282, "grad_norm": 6.34375, "learning_rate": 1.7948660029628013e-06, "loss": 1.05294132, "memory(GiB)": 142.32, "step": 131500, "train_speed(iter/s)": 0.286472 }, { "acc": 0.72363234, "epoch": 1.4710442686232867, "grad_norm": 5.4375, "learning_rate": 1.7934467575617204e-06, "loss": 1.10574865, "memory(GiB)": 142.32, "step": 131520, "train_speed(iter/s)": 0.286488 }, { "acc": 0.73691473, "epoch": 1.4712679675692453, "grad_norm": 6.6875, "learning_rate": 1.7920279508629569e-06, "loss": 1.0420681, "memory(GiB)": 142.32, "step": 131540, "train_speed(iter/s)": 0.286503 }, { "acc": 0.73070354, "epoch": 1.4714916665152038, "grad_norm": 6.96875, "learning_rate": 1.790609583060622e-06, "loss": 1.07102833, "memory(GiB)": 142.32, "step": 131560, "train_speed(iter/s)": 0.286517 }, { "acc": 0.73897152, "epoch": 1.4717153654611623, "grad_norm": 5.9375, "learning_rate": 1.789191654348767e-06, "loss": 1.0525321, "memory(GiB)": 142.32, "step": 131580, "train_speed(iter/s)": 0.286531 }, { "acc": 0.74319439, "epoch": 1.4719390644071209, "grad_norm": 5.96875, "learning_rate": 1.7877741649213886e-06, "loss": 1.02648897, "memory(GiB)": 142.32, "step": 131600, "train_speed(iter/s)": 0.286547 }, { "acc": 0.72305622, "epoch": 1.4721627633530794, "grad_norm": 5.78125, "learning_rate": 1.7863571149724163e-06, "loss": 1.11518154, "memory(GiB)": 142.32, "step": 131620, "train_speed(iter/s)": 0.286563 }, { "acc": 0.73855929, "epoch": 1.472386462299038, "grad_norm": 6.4375, "learning_rate": 1.7849405046957251e-06, "loss": 1.01674232, "memory(GiB)": 142.32, "step": 131640, "train_speed(iter/s)": 0.28658 }, { "acc": 0.75168953, "epoch": 1.4726101612449964, "grad_norm": 7.25, "learning_rate": 1.7835243342851277e-06, "loss": 0.98136864, "memory(GiB)": 142.32, "step": 131660, "train_speed(iter/s)": 0.286595 }, { "acc": 0.72926712, "epoch": 1.472833860190955, "grad_norm": 5.5, "learning_rate": 1.7821086039343733e-06, "loss": 1.07875471, "memory(GiB)": 142.32, "step": 131680, "train_speed(iter/s)": 0.286611 }, { "acc": 0.74758282, "epoch": 1.4730575591369135, "grad_norm": 7.0625, "learning_rate": 1.7806933138371573e-06, "loss": 1.01255207, "memory(GiB)": 142.32, "step": 131700, "train_speed(iter/s)": 0.286625 }, { "acc": 0.72972765, "epoch": 1.473281258082872, "grad_norm": 6.1875, "learning_rate": 1.7792784641871097e-06, "loss": 1.07941732, "memory(GiB)": 142.32, "step": 131720, "train_speed(iter/s)": 0.286638 }, { "acc": 0.73825665, "epoch": 1.4735049570288306, "grad_norm": 6.3125, "learning_rate": 1.7778640551778038e-06, "loss": 1.04317341, "memory(GiB)": 142.32, "step": 131740, "train_speed(iter/s)": 0.286652 }, { "acc": 0.7388545, "epoch": 1.473728655974789, "grad_norm": 6.90625, "learning_rate": 1.7764500870027507e-06, "loss": 1.03862476, "memory(GiB)": 142.32, "step": 131760, "train_speed(iter/s)": 0.286667 }, { "acc": 0.73477774, "epoch": 1.4739523549207476, "grad_norm": 4.71875, "learning_rate": 1.7750365598553988e-06, "loss": 1.0551012, "memory(GiB)": 142.32, "step": 131780, "train_speed(iter/s)": 0.286682 }, { "acc": 0.74434628, "epoch": 1.4741760538667061, "grad_norm": 6.71875, "learning_rate": 1.7736234739291424e-06, "loss": 1.02307587, "memory(GiB)": 142.32, "step": 131800, "train_speed(iter/s)": 0.286696 }, { "acc": 0.72872415, "epoch": 1.4743997528126647, "grad_norm": 6.71875, "learning_rate": 1.772210829417309e-06, "loss": 1.09104204, "memory(GiB)": 142.32, "step": 131820, "train_speed(iter/s)": 0.28671 }, { "acc": 0.74139013, "epoch": 1.4746234517586232, "grad_norm": 6.4375, "learning_rate": 1.7707986265131717e-06, "loss": 1.02940006, "memory(GiB)": 142.32, "step": 131840, "train_speed(iter/s)": 0.286723 }, { "acc": 0.75247746, "epoch": 1.4748471507045817, "grad_norm": 5.5, "learning_rate": 1.7693868654099377e-06, "loss": 0.98586092, "memory(GiB)": 142.32, "step": 131860, "train_speed(iter/s)": 0.286738 }, { "acc": 0.73771029, "epoch": 1.4750708496505403, "grad_norm": 5.6875, "learning_rate": 1.7679755463007552e-06, "loss": 1.05496254, "memory(GiB)": 142.32, "step": 131880, "train_speed(iter/s)": 0.286753 }, { "acc": 0.74276209, "epoch": 1.4752945485964988, "grad_norm": 6.1875, "learning_rate": 1.7665646693787158e-06, "loss": 1.00313377, "memory(GiB)": 142.32, "step": 131900, "train_speed(iter/s)": 0.286769 }, { "acc": 0.73797274, "epoch": 1.4755182475424573, "grad_norm": 7.1875, "learning_rate": 1.7651542348368445e-06, "loss": 1.04201756, "memory(GiB)": 142.32, "step": 131920, "train_speed(iter/s)": 0.286785 }, { "acc": 0.72539568, "epoch": 1.4757419464884158, "grad_norm": 8.4375, "learning_rate": 1.7637442428681123e-06, "loss": 1.11200657, "memory(GiB)": 142.32, "step": 131940, "train_speed(iter/s)": 0.2868 }, { "acc": 0.74079247, "epoch": 1.4759656454343744, "grad_norm": 5.40625, "learning_rate": 1.762334693665424e-06, "loss": 1.02046738, "memory(GiB)": 142.32, "step": 131960, "train_speed(iter/s)": 0.286815 }, { "acc": 0.74569788, "epoch": 1.476189344380333, "grad_norm": 7.0625, "learning_rate": 1.7609255874216252e-06, "loss": 1.02241735, "memory(GiB)": 142.32, "step": 131980, "train_speed(iter/s)": 0.286831 }, { "acc": 0.75050764, "epoch": 1.4764130433262914, "grad_norm": 6.25, "learning_rate": 1.7595169243295045e-06, "loss": 0.97309256, "memory(GiB)": 142.32, "step": 132000, "train_speed(iter/s)": 0.286846 }, { "epoch": 1.4764130433262914, "eval_acc": 0.6963585349665118, "eval_loss": 1.0714285373687744, "eval_runtime": 2344.0507, "eval_samples_per_second": 32.117, "eval_steps_per_second": 16.059, "step": 132000 }, { "acc": 0.74014921, "epoch": 1.47663674227225, "grad_norm": 4.6875, "learning_rate": 1.7581087045817841e-06, "loss": 1.02855816, "memory(GiB)": 142.32, "step": 132020, "train_speed(iter/s)": 0.285375 }, { "acc": 0.73889642, "epoch": 1.4768604412182085, "grad_norm": 6.90625, "learning_rate": 1.7567009283711322e-06, "loss": 1.03546886, "memory(GiB)": 142.32, "step": 132040, "train_speed(iter/s)": 0.285391 }, { "acc": 0.74179797, "epoch": 1.477084140164167, "grad_norm": 5.5, "learning_rate": 1.7552935958901506e-06, "loss": 1.02228031, "memory(GiB)": 142.32, "step": 132060, "train_speed(iter/s)": 0.285406 }, { "acc": 0.73191504, "epoch": 1.4773078391101255, "grad_norm": 6.65625, "learning_rate": 1.753886707331381e-06, "loss": 1.06017609, "memory(GiB)": 142.32, "step": 132080, "train_speed(iter/s)": 0.285421 }, { "acc": 0.72935853, "epoch": 1.477531538056084, "grad_norm": 6.0625, "learning_rate": 1.7524802628873089e-06, "loss": 1.09477129, "memory(GiB)": 142.32, "step": 132100, "train_speed(iter/s)": 0.285437 }, { "acc": 0.73432999, "epoch": 1.4777552370020426, "grad_norm": 6.6875, "learning_rate": 1.751074262750353e-06, "loss": 1.04640579, "memory(GiB)": 142.32, "step": 132120, "train_speed(iter/s)": 0.285452 }, { "acc": 0.74621043, "epoch": 1.4779789359480011, "grad_norm": 7.03125, "learning_rate": 1.7496687071128776e-06, "loss": 1.02279301, "memory(GiB)": 142.32, "step": 132140, "train_speed(iter/s)": 0.285467 }, { "acc": 0.7392952, "epoch": 1.4782026348939596, "grad_norm": 6.0625, "learning_rate": 1.7482635961671807e-06, "loss": 1.02540131, "memory(GiB)": 142.32, "step": 132160, "train_speed(iter/s)": 0.285482 }, { "acc": 0.73061657, "epoch": 1.4784263338399182, "grad_norm": 5.875, "learning_rate": 1.7468589301055005e-06, "loss": 1.07837877, "memory(GiB)": 142.32, "step": 132180, "train_speed(iter/s)": 0.285497 }, { "acc": 0.74208441, "epoch": 1.4786500327858767, "grad_norm": 6.03125, "learning_rate": 1.7454547091200186e-06, "loss": 1.01377754, "memory(GiB)": 142.32, "step": 132200, "train_speed(iter/s)": 0.285512 }, { "acc": 0.72695169, "epoch": 1.4788737317318352, "grad_norm": 6.25, "learning_rate": 1.7440509334028482e-06, "loss": 1.10596485, "memory(GiB)": 142.32, "step": 132220, "train_speed(iter/s)": 0.285527 }, { "acc": 0.73279505, "epoch": 1.4790974306777938, "grad_norm": 6.4375, "learning_rate": 1.742647603146051e-06, "loss": 1.07204123, "memory(GiB)": 142.32, "step": 132240, "train_speed(iter/s)": 0.285539 }, { "acc": 0.74312263, "epoch": 1.4793211296237523, "grad_norm": 6.375, "learning_rate": 1.7412447185416193e-06, "loss": 1.00576715, "memory(GiB)": 142.32, "step": 132260, "train_speed(iter/s)": 0.285554 }, { "acc": 0.73311515, "epoch": 1.4795448285697108, "grad_norm": 6.28125, "learning_rate": 1.7398422797814868e-06, "loss": 1.06936302, "memory(GiB)": 142.32, "step": 132280, "train_speed(iter/s)": 0.28557 }, { "acc": 0.73610191, "epoch": 1.4797685275156693, "grad_norm": 5.3125, "learning_rate": 1.7384402870575312e-06, "loss": 1.06019745, "memory(GiB)": 142.32, "step": 132300, "train_speed(iter/s)": 0.285586 }, { "acc": 0.73628206, "epoch": 1.4799922264616279, "grad_norm": 6.875, "learning_rate": 1.7370387405615602e-06, "loss": 1.06231613, "memory(GiB)": 142.32, "step": 132320, "train_speed(iter/s)": 0.285602 }, { "acc": 0.73956218, "epoch": 1.4802159254075864, "grad_norm": 5.96875, "learning_rate": 1.7356376404853303e-06, "loss": 1.03821411, "memory(GiB)": 142.32, "step": 132340, "train_speed(iter/s)": 0.285618 }, { "acc": 0.73935399, "epoch": 1.480439624353545, "grad_norm": 5.21875, "learning_rate": 1.7342369870205294e-06, "loss": 1.04520359, "memory(GiB)": 142.32, "step": 132360, "train_speed(iter/s)": 0.285634 }, { "acc": 0.73968534, "epoch": 1.4806633232995035, "grad_norm": 6.59375, "learning_rate": 1.7328367803587853e-06, "loss": 1.02950172, "memory(GiB)": 142.32, "step": 132380, "train_speed(iter/s)": 0.285648 }, { "acc": 0.75049105, "epoch": 1.480887022245462, "grad_norm": 5.59375, "learning_rate": 1.7314370206916703e-06, "loss": 0.98115177, "memory(GiB)": 142.32, "step": 132400, "train_speed(iter/s)": 0.285663 }, { "acc": 0.74206519, "epoch": 1.4811107211914205, "grad_norm": 8.25, "learning_rate": 1.7300377082106873e-06, "loss": 1.01747513, "memory(GiB)": 142.32, "step": 132420, "train_speed(iter/s)": 0.285676 }, { "acc": 0.72685976, "epoch": 1.481334420137379, "grad_norm": 6.0625, "learning_rate": 1.7286388431072859e-06, "loss": 1.09199276, "memory(GiB)": 142.32, "step": 132440, "train_speed(iter/s)": 0.285692 }, { "acc": 0.74250321, "epoch": 1.4815581190833376, "grad_norm": 7.09375, "learning_rate": 1.7272404255728498e-06, "loss": 1.0128314, "memory(GiB)": 142.32, "step": 132460, "train_speed(iter/s)": 0.285707 }, { "acc": 0.75088472, "epoch": 1.481781818029296, "grad_norm": 7.0625, "learning_rate": 1.7258424557987002e-06, "loss": 0.9850563, "memory(GiB)": 142.32, "step": 132480, "train_speed(iter/s)": 0.28572 }, { "acc": 0.73524303, "epoch": 1.4820055169752546, "grad_norm": 5.5625, "learning_rate": 1.7244449339761028e-06, "loss": 1.06871319, "memory(GiB)": 142.32, "step": 132500, "train_speed(iter/s)": 0.285737 }, { "acc": 0.72806482, "epoch": 1.4822292159212132, "grad_norm": 6.875, "learning_rate": 1.7230478602962553e-06, "loss": 1.07627296, "memory(GiB)": 142.32, "step": 132520, "train_speed(iter/s)": 0.285751 }, { "acc": 0.73020635, "epoch": 1.4824529148671717, "grad_norm": 6.4375, "learning_rate": 1.7216512349503001e-06, "loss": 1.08697014, "memory(GiB)": 142.32, "step": 132540, "train_speed(iter/s)": 0.285766 }, { "acc": 0.7418726, "epoch": 1.4826766138131302, "grad_norm": 7.21875, "learning_rate": 1.7202550581293147e-06, "loss": 1.02552299, "memory(GiB)": 142.32, "step": 132560, "train_speed(iter/s)": 0.285781 }, { "acc": 0.74109211, "epoch": 1.4829003127590887, "grad_norm": 6.9375, "learning_rate": 1.718859330024314e-06, "loss": 1.01690807, "memory(GiB)": 142.32, "step": 132580, "train_speed(iter/s)": 0.285797 }, { "acc": 0.74240141, "epoch": 1.4831240117050473, "grad_norm": 6.0625, "learning_rate": 1.7174640508262585e-06, "loss": 1.04299736, "memory(GiB)": 142.32, "step": 132600, "train_speed(iter/s)": 0.285812 }, { "acc": 0.74149823, "epoch": 1.4833477106510058, "grad_norm": 7.8125, "learning_rate": 1.716069220726036e-06, "loss": 1.04342098, "memory(GiB)": 142.32, "step": 132620, "train_speed(iter/s)": 0.285826 }, { "acc": 0.73653975, "epoch": 1.4835714095969643, "grad_norm": 5.59375, "learning_rate": 1.714674839914484e-06, "loss": 1.05531759, "memory(GiB)": 142.32, "step": 132640, "train_speed(iter/s)": 0.285839 }, { "acc": 0.73724194, "epoch": 1.4837951085429228, "grad_norm": 6.40625, "learning_rate": 1.7132809085823705e-06, "loss": 1.0545248, "memory(GiB)": 142.32, "step": 132660, "train_speed(iter/s)": 0.285854 }, { "acc": 0.75458822, "epoch": 1.4840188074888814, "grad_norm": 7.34375, "learning_rate": 1.7118874269204078e-06, "loss": 0.96471605, "memory(GiB)": 142.32, "step": 132680, "train_speed(iter/s)": 0.285868 }, { "acc": 0.73715687, "epoch": 1.48424250643484, "grad_norm": 8.125, "learning_rate": 1.7104943951192437e-06, "loss": 1.04494915, "memory(GiB)": 142.32, "step": 132700, "train_speed(iter/s)": 0.285882 }, { "acc": 0.74860344, "epoch": 1.4844662053807984, "grad_norm": 6.53125, "learning_rate": 1.709101813369462e-06, "loss": 0.99470825, "memory(GiB)": 142.32, "step": 132720, "train_speed(iter/s)": 0.285898 }, { "acc": 0.73726301, "epoch": 1.484689904326757, "grad_norm": 7.21875, "learning_rate": 1.7077096818615918e-06, "loss": 1.04084492, "memory(GiB)": 142.32, "step": 132740, "train_speed(iter/s)": 0.285914 }, { "acc": 0.74447856, "epoch": 1.4849136032727155, "grad_norm": 5.4375, "learning_rate": 1.7063180007860935e-06, "loss": 1.01605091, "memory(GiB)": 142.32, "step": 132760, "train_speed(iter/s)": 0.285928 }, { "acc": 0.7431942, "epoch": 1.485137302218674, "grad_norm": 6.46875, "learning_rate": 1.7049267703333715e-06, "loss": 1.02697487, "memory(GiB)": 142.32, "step": 132780, "train_speed(iter/s)": 0.285942 }, { "acc": 0.73095279, "epoch": 1.4853610011646325, "grad_norm": 6.34375, "learning_rate": 1.7035359906937649e-06, "loss": 1.06892662, "memory(GiB)": 142.32, "step": 132800, "train_speed(iter/s)": 0.285956 }, { "acc": 0.73826008, "epoch": 1.485584700110591, "grad_norm": 5.34375, "learning_rate": 1.7021456620575504e-06, "loss": 1.05219784, "memory(GiB)": 142.32, "step": 132820, "train_speed(iter/s)": 0.285971 }, { "acc": 0.72899189, "epoch": 1.4858083990565496, "grad_norm": 4.75, "learning_rate": 1.7007557846149487e-06, "loss": 1.07040215, "memory(GiB)": 142.32, "step": 132840, "train_speed(iter/s)": 0.285985 }, { "acc": 0.742449, "epoch": 1.4860320980025081, "grad_norm": 7.75, "learning_rate": 1.6993663585561105e-06, "loss": 1.02457542, "memory(GiB)": 142.32, "step": 132860, "train_speed(iter/s)": 0.286001 }, { "acc": 0.72973037, "epoch": 1.4862557969484667, "grad_norm": 5.125, "learning_rate": 1.6979773840711328e-06, "loss": 1.07959061, "memory(GiB)": 142.32, "step": 132880, "train_speed(iter/s)": 0.286017 }, { "acc": 0.73505206, "epoch": 1.4864794958944252, "grad_norm": 4.5625, "learning_rate": 1.6965888613500464e-06, "loss": 1.03910065, "memory(GiB)": 142.32, "step": 132900, "train_speed(iter/s)": 0.28603 }, { "acc": 0.73851886, "epoch": 1.4867031948403837, "grad_norm": 7.4375, "learning_rate": 1.6952007905828184e-06, "loss": 1.0383749, "memory(GiB)": 142.32, "step": 132920, "train_speed(iter/s)": 0.286045 }, { "acc": 0.74045544, "epoch": 1.4869268937863422, "grad_norm": 7.25, "learning_rate": 1.6938131719593603e-06, "loss": 1.02649727, "memory(GiB)": 142.32, "step": 132940, "train_speed(iter/s)": 0.286061 }, { "acc": 0.73457479, "epoch": 1.4871505927323008, "grad_norm": 6.46875, "learning_rate": 1.6924260056695153e-06, "loss": 1.06109409, "memory(GiB)": 142.32, "step": 132960, "train_speed(iter/s)": 0.286075 }, { "acc": 0.73040676, "epoch": 1.4873742916782593, "grad_norm": 4.90625, "learning_rate": 1.69103929190307e-06, "loss": 1.07373896, "memory(GiB)": 142.32, "step": 132980, "train_speed(iter/s)": 0.286091 }, { "acc": 0.72618165, "epoch": 1.4875979906242178, "grad_norm": 6.375, "learning_rate": 1.6896530308497455e-06, "loss": 1.09606647, "memory(GiB)": 142.32, "step": 133000, "train_speed(iter/s)": 0.286105 }, { "acc": 0.74867916, "epoch": 1.4878216895701764, "grad_norm": 5.1875, "learning_rate": 1.6882672226992008e-06, "loss": 0.97325964, "memory(GiB)": 142.32, "step": 133020, "train_speed(iter/s)": 0.28612 }, { "acc": 0.72788501, "epoch": 1.4880453885161349, "grad_norm": 6.4375, "learning_rate": 1.6868818676410376e-06, "loss": 1.09802322, "memory(GiB)": 142.32, "step": 133040, "train_speed(iter/s)": 0.286134 }, { "acc": 0.73257895, "epoch": 1.4882690874620934, "grad_norm": 5.84375, "learning_rate": 1.685496965864788e-06, "loss": 1.06193113, "memory(GiB)": 142.32, "step": 133060, "train_speed(iter/s)": 0.286146 }, { "acc": 0.75093021, "epoch": 1.488492786408052, "grad_norm": 5.78125, "learning_rate": 1.6841125175599304e-06, "loss": 0.97200356, "memory(GiB)": 142.32, "step": 133080, "train_speed(iter/s)": 0.286159 }, { "acc": 0.7398674, "epoch": 1.4887164853540105, "grad_norm": 5.625, "learning_rate": 1.6827285229158753e-06, "loss": 1.01941509, "memory(GiB)": 142.32, "step": 133100, "train_speed(iter/s)": 0.286174 }, { "acc": 0.73590446, "epoch": 1.488940184299969, "grad_norm": 5.625, "learning_rate": 1.681344982121972e-06, "loss": 1.04368992, "memory(GiB)": 142.32, "step": 133120, "train_speed(iter/s)": 0.286189 }, { "acc": 0.74054136, "epoch": 1.4891638832459275, "grad_norm": 6.28125, "learning_rate": 1.6799618953675106e-06, "loss": 1.0320961, "memory(GiB)": 142.32, "step": 133140, "train_speed(iter/s)": 0.286203 }, { "acc": 0.74794002, "epoch": 1.489387582191886, "grad_norm": 6.71875, "learning_rate": 1.6785792628417147e-06, "loss": 0.98704357, "memory(GiB)": 142.32, "step": 133160, "train_speed(iter/s)": 0.286218 }, { "acc": 0.74305096, "epoch": 1.4896112811378446, "grad_norm": 5.03125, "learning_rate": 1.677197084733751e-06, "loss": 1.00882721, "memory(GiB)": 142.32, "step": 133180, "train_speed(iter/s)": 0.286233 }, { "acc": 0.73785915, "epoch": 1.489834980083803, "grad_norm": 6.71875, "learning_rate": 1.6758153612327204e-06, "loss": 1.03967381, "memory(GiB)": 142.32, "step": 133200, "train_speed(iter/s)": 0.286251 }, { "acc": 0.73103008, "epoch": 1.4900586790297616, "grad_norm": 6.3125, "learning_rate": 1.6744340925276602e-06, "loss": 1.08693657, "memory(GiB)": 142.32, "step": 133220, "train_speed(iter/s)": 0.286265 }, { "acc": 0.7380363, "epoch": 1.4902823779757202, "grad_norm": 5.5, "learning_rate": 1.6730532788075509e-06, "loss": 1.04543629, "memory(GiB)": 142.32, "step": 133240, "train_speed(iter/s)": 0.286279 }, { "acc": 0.74144278, "epoch": 1.4905060769216787, "grad_norm": 6.875, "learning_rate": 1.6716729202613046e-06, "loss": 1.03608017, "memory(GiB)": 142.32, "step": 133260, "train_speed(iter/s)": 0.286294 }, { "acc": 0.73729844, "epoch": 1.4907297758676372, "grad_norm": 5.28125, "learning_rate": 1.6702930170777776e-06, "loss": 1.04595032, "memory(GiB)": 142.32, "step": 133280, "train_speed(iter/s)": 0.286309 }, { "acc": 0.73221397, "epoch": 1.4909534748135957, "grad_norm": 5.03125, "learning_rate": 1.6689135694457575e-06, "loss": 1.07974606, "memory(GiB)": 142.32, "step": 133300, "train_speed(iter/s)": 0.286325 }, { "acc": 0.75117507, "epoch": 1.4911771737595543, "grad_norm": 7.375, "learning_rate": 1.667534577553972e-06, "loss": 0.99124784, "memory(GiB)": 142.32, "step": 133320, "train_speed(iter/s)": 0.28634 }, { "acc": 0.73638592, "epoch": 1.4914008727055128, "grad_norm": 6.03125, "learning_rate": 1.66615604159109e-06, "loss": 1.05501652, "memory(GiB)": 142.32, "step": 133340, "train_speed(iter/s)": 0.286353 }, { "acc": 0.73486929, "epoch": 1.4916245716514713, "grad_norm": 5.5625, "learning_rate": 1.6647779617457116e-06, "loss": 1.05596771, "memory(GiB)": 142.32, "step": 133360, "train_speed(iter/s)": 0.286367 }, { "acc": 0.73905716, "epoch": 1.4918482705974299, "grad_norm": 5.6875, "learning_rate": 1.6634003382063806e-06, "loss": 1.03268499, "memory(GiB)": 142.32, "step": 133380, "train_speed(iter/s)": 0.286382 }, { "acc": 0.72141171, "epoch": 1.4920719695433884, "grad_norm": 5.46875, "learning_rate": 1.6620231711615747e-06, "loss": 1.11610279, "memory(GiB)": 142.32, "step": 133400, "train_speed(iter/s)": 0.286395 }, { "acc": 0.74682732, "epoch": 1.492295668489347, "grad_norm": 7.53125, "learning_rate": 1.6606464607997075e-06, "loss": 1.00420237, "memory(GiB)": 142.32, "step": 133420, "train_speed(iter/s)": 0.286411 }, { "acc": 0.72993779, "epoch": 1.4925193674353054, "grad_norm": 5.75, "learning_rate": 1.6592702073091371e-06, "loss": 1.07138977, "memory(GiB)": 142.32, "step": 133440, "train_speed(iter/s)": 0.286426 }, { "acc": 0.73245192, "epoch": 1.492743066381264, "grad_norm": 5.125, "learning_rate": 1.6578944108781503e-06, "loss": 1.0616169, "memory(GiB)": 142.32, "step": 133460, "train_speed(iter/s)": 0.28644 }, { "acc": 0.71713867, "epoch": 1.4929667653272225, "grad_norm": 5.34375, "learning_rate": 1.6565190716949797e-06, "loss": 1.12974815, "memory(GiB)": 142.32, "step": 133480, "train_speed(iter/s)": 0.286455 }, { "acc": 0.7449522, "epoch": 1.493190464273181, "grad_norm": 6.0625, "learning_rate": 1.6551441899477894e-06, "loss": 1.02023144, "memory(GiB)": 142.32, "step": 133500, "train_speed(iter/s)": 0.28647 }, { "acc": 0.73761334, "epoch": 1.4934141632191396, "grad_norm": 5.6875, "learning_rate": 1.6537697658246809e-06, "loss": 1.03079405, "memory(GiB)": 142.32, "step": 133520, "train_speed(iter/s)": 0.286486 }, { "acc": 0.73451309, "epoch": 1.493637862165098, "grad_norm": 6.4375, "learning_rate": 1.6523957995136992e-06, "loss": 1.04728279, "memory(GiB)": 142.32, "step": 133540, "train_speed(iter/s)": 0.286501 }, { "acc": 0.74661245, "epoch": 1.4938615611110566, "grad_norm": 6.4375, "learning_rate": 1.6510222912028185e-06, "loss": 1.00903044, "memory(GiB)": 142.32, "step": 133560, "train_speed(iter/s)": 0.286514 }, { "acc": 0.73404198, "epoch": 1.4940852600570151, "grad_norm": 6.15625, "learning_rate": 1.649649241079958e-06, "loss": 1.05718098, "memory(GiB)": 142.32, "step": 133580, "train_speed(iter/s)": 0.286529 }, { "acc": 0.73139033, "epoch": 1.4943089590029737, "grad_norm": 8.125, "learning_rate": 1.648276649332969e-06, "loss": 1.08968334, "memory(GiB)": 142.32, "step": 133600, "train_speed(iter/s)": 0.286541 }, { "acc": 0.73900585, "epoch": 1.4945326579489322, "grad_norm": 6.84375, "learning_rate": 1.6469045161496395e-06, "loss": 1.03333378, "memory(GiB)": 142.32, "step": 133620, "train_speed(iter/s)": 0.286557 }, { "acc": 0.73201609, "epoch": 1.4947563568948907, "grad_norm": 7.34375, "learning_rate": 1.6455328417177007e-06, "loss": 1.07625713, "memory(GiB)": 142.32, "step": 133640, "train_speed(iter/s)": 0.286569 }, { "acc": 0.74170728, "epoch": 1.4949800558408493, "grad_norm": 5.96875, "learning_rate": 1.6441616262248144e-06, "loss": 1.03529978, "memory(GiB)": 142.32, "step": 133660, "train_speed(iter/s)": 0.286582 }, { "acc": 0.73937955, "epoch": 1.4952037547868078, "grad_norm": 5.0625, "learning_rate": 1.6427908698585848e-06, "loss": 1.02045593, "memory(GiB)": 142.32, "step": 133680, "train_speed(iter/s)": 0.286595 }, { "acc": 0.73478289, "epoch": 1.4954274537327663, "grad_norm": 5.59375, "learning_rate": 1.6414205728065502e-06, "loss": 1.06804619, "memory(GiB)": 142.32, "step": 133700, "train_speed(iter/s)": 0.286608 }, { "acc": 0.742173, "epoch": 1.4956511526787248, "grad_norm": 7.40625, "learning_rate": 1.6400507352561846e-06, "loss": 1.02803707, "memory(GiB)": 142.32, "step": 133720, "train_speed(iter/s)": 0.286623 }, { "acc": 0.74108114, "epoch": 1.4958748516246834, "grad_norm": 6.6875, "learning_rate": 1.6386813573949044e-06, "loss": 1.03615608, "memory(GiB)": 142.32, "step": 133740, "train_speed(iter/s)": 0.286638 }, { "acc": 0.73355541, "epoch": 1.496098550570642, "grad_norm": 6.78125, "learning_rate": 1.6373124394100576e-06, "loss": 1.05020294, "memory(GiB)": 142.32, "step": 133760, "train_speed(iter/s)": 0.286654 }, { "acc": 0.72795196, "epoch": 1.4963222495166004, "grad_norm": 6.375, "learning_rate": 1.6359439814889344e-06, "loss": 1.08336725, "memory(GiB)": 142.32, "step": 133780, "train_speed(iter/s)": 0.286667 }, { "acc": 0.7405911, "epoch": 1.496545948462559, "grad_norm": 6.8125, "learning_rate": 1.6345759838187581e-06, "loss": 1.03882561, "memory(GiB)": 142.32, "step": 133800, "train_speed(iter/s)": 0.286681 }, { "acc": 0.75285807, "epoch": 1.4967696474085175, "grad_norm": 4.84375, "learning_rate": 1.6332084465866898e-06, "loss": 0.96477451, "memory(GiB)": 142.32, "step": 133820, "train_speed(iter/s)": 0.286696 }, { "acc": 0.72688999, "epoch": 1.4969933463544762, "grad_norm": 5.3125, "learning_rate": 1.631841369979829e-06, "loss": 1.09415359, "memory(GiB)": 142.32, "step": 133840, "train_speed(iter/s)": 0.28671 }, { "acc": 0.73727427, "epoch": 1.4972170453004348, "grad_norm": 6.5625, "learning_rate": 1.630474754185209e-06, "loss": 1.04095879, "memory(GiB)": 142.32, "step": 133860, "train_speed(iter/s)": 0.286723 }, { "acc": 0.72971458, "epoch": 1.4974407442463933, "grad_norm": 6.71875, "learning_rate": 1.629108599389806e-06, "loss": 1.08127565, "memory(GiB)": 142.32, "step": 133880, "train_speed(iter/s)": 0.286737 }, { "acc": 0.72508945, "epoch": 1.4976644431923518, "grad_norm": 4.96875, "learning_rate": 1.627742905780526e-06, "loss": 1.09288187, "memory(GiB)": 142.32, "step": 133900, "train_speed(iter/s)": 0.286751 }, { "acc": 0.73884134, "epoch": 1.4978881421383103, "grad_norm": 6.71875, "learning_rate": 1.6263776735442189e-06, "loss": 1.03398743, "memory(GiB)": 142.32, "step": 133920, "train_speed(iter/s)": 0.286766 }, { "acc": 0.72975054, "epoch": 1.4981118410842689, "grad_norm": 5.28125, "learning_rate": 1.6250129028676664e-06, "loss": 1.07119169, "memory(GiB)": 142.32, "step": 133940, "train_speed(iter/s)": 0.286781 }, { "acc": 0.73559008, "epoch": 1.4983355400302274, "grad_norm": 6.625, "learning_rate": 1.6236485939375867e-06, "loss": 1.04466505, "memory(GiB)": 142.32, "step": 133960, "train_speed(iter/s)": 0.286794 }, { "acc": 0.73146563, "epoch": 1.498559238976186, "grad_norm": 7.4375, "learning_rate": 1.6222847469406399e-06, "loss": 1.08261929, "memory(GiB)": 142.32, "step": 133980, "train_speed(iter/s)": 0.28681 }, { "acc": 0.74354029, "epoch": 1.4987829379221445, "grad_norm": 5.625, "learning_rate": 1.6209213620634174e-06, "loss": 1.01477785, "memory(GiB)": 142.32, "step": 134000, "train_speed(iter/s)": 0.286824 }, { "epoch": 1.4987829379221445, "eval_acc": 0.6963594223063025, "eval_loss": 1.0714521408081055, "eval_runtime": 2343.3454, "eval_samples_per_second": 32.126, "eval_steps_per_second": 16.063, "step": 134000 }, { "acc": 0.73169556, "epoch": 1.499006636868103, "grad_norm": 5.84375, "learning_rate": 1.6195584394924519e-06, "loss": 1.07803841, "memory(GiB)": 142.32, "step": 134020, "train_speed(iter/s)": 0.285376 }, { "acc": 0.74090309, "epoch": 1.4992303358140615, "grad_norm": 4.9375, "learning_rate": 1.6181959794142094e-06, "loss": 1.01942825, "memory(GiB)": 142.32, "step": 134040, "train_speed(iter/s)": 0.285389 }, { "acc": 0.73370247, "epoch": 1.49945403476002, "grad_norm": 5.25, "learning_rate": 1.6168339820150924e-06, "loss": 1.07034283, "memory(GiB)": 142.32, "step": 134060, "train_speed(iter/s)": 0.285405 }, { "acc": 0.73398533, "epoch": 1.4996777337059786, "grad_norm": 6.21875, "learning_rate": 1.6154724474814454e-06, "loss": 1.07496367, "memory(GiB)": 142.32, "step": 134080, "train_speed(iter/s)": 0.28542 }, { "acc": 0.72822056, "epoch": 1.499901432651937, "grad_norm": 6.34375, "learning_rate": 1.6141113759995414e-06, "loss": 1.07335968, "memory(GiB)": 142.32, "step": 134100, "train_speed(iter/s)": 0.285435 }, { "acc": 0.73022242, "epoch": 1.5001251315978954, "grad_norm": 5.71875, "learning_rate": 1.6127507677555988e-06, "loss": 1.07610626, "memory(GiB)": 142.32, "step": 134120, "train_speed(iter/s)": 0.285447 }, { "acc": 0.73866181, "epoch": 1.500348830543854, "grad_norm": 4.65625, "learning_rate": 1.6113906229357656e-06, "loss": 1.03415432, "memory(GiB)": 142.32, "step": 134140, "train_speed(iter/s)": 0.285462 }, { "acc": 0.73675127, "epoch": 1.5005725294898125, "grad_norm": 5.78125, "learning_rate": 1.6100309417261278e-06, "loss": 1.04598122, "memory(GiB)": 142.32, "step": 134160, "train_speed(iter/s)": 0.285477 }, { "acc": 0.74633131, "epoch": 1.500796228435771, "grad_norm": 6.875, "learning_rate": 1.6086717243127132e-06, "loss": 1.00855961, "memory(GiB)": 142.32, "step": 134180, "train_speed(iter/s)": 0.28549 }, { "acc": 0.74000707, "epoch": 1.5010199273817295, "grad_norm": 5.25, "learning_rate": 1.6073129708814783e-06, "loss": 1.03817768, "memory(GiB)": 142.32, "step": 134200, "train_speed(iter/s)": 0.285504 }, { "acc": 0.72985334, "epoch": 1.501243626327688, "grad_norm": 5.84375, "learning_rate": 1.605954681618323e-06, "loss": 1.08100014, "memory(GiB)": 142.32, "step": 134220, "train_speed(iter/s)": 0.285518 }, { "acc": 0.73236642, "epoch": 1.5014673252736466, "grad_norm": 6.53125, "learning_rate": 1.6045968567090797e-06, "loss": 1.05790577, "memory(GiB)": 142.32, "step": 134240, "train_speed(iter/s)": 0.285534 }, { "acc": 0.72427039, "epoch": 1.501691024219605, "grad_norm": 5.03125, "learning_rate": 1.6032394963395159e-06, "loss": 1.12623787, "memory(GiB)": 142.32, "step": 134260, "train_speed(iter/s)": 0.285549 }, { "acc": 0.74719238, "epoch": 1.5019147231655636, "grad_norm": 5.90625, "learning_rate": 1.6018826006953415e-06, "loss": 0.99825287, "memory(GiB)": 142.32, "step": 134280, "train_speed(iter/s)": 0.285565 }, { "acc": 0.72550564, "epoch": 1.5021384221115222, "grad_norm": 5.625, "learning_rate": 1.600526169962196e-06, "loss": 1.11126652, "memory(GiB)": 142.32, "step": 134300, "train_speed(iter/s)": 0.28558 }, { "acc": 0.73991303, "epoch": 1.5023621210574807, "grad_norm": 5.03125, "learning_rate": 1.5991702043256623e-06, "loss": 1.02575207, "memory(GiB)": 142.32, "step": 134320, "train_speed(iter/s)": 0.285596 }, { "acc": 0.73281555, "epoch": 1.5025858200034392, "grad_norm": 6.1875, "learning_rate": 1.5978147039712533e-06, "loss": 1.07916155, "memory(GiB)": 142.32, "step": 134340, "train_speed(iter/s)": 0.285611 }, { "acc": 0.7393508, "epoch": 1.5028095189493977, "grad_norm": 6.6875, "learning_rate": 1.5964596690844198e-06, "loss": 1.02252741, "memory(GiB)": 142.32, "step": 134360, "train_speed(iter/s)": 0.285624 }, { "acc": 0.72965021, "epoch": 1.5030332178953563, "grad_norm": 6.15625, "learning_rate": 1.5951050998505523e-06, "loss": 1.09488373, "memory(GiB)": 142.32, "step": 134380, "train_speed(iter/s)": 0.28564 }, { "acc": 0.72996564, "epoch": 1.5032569168413148, "grad_norm": 5.4375, "learning_rate": 1.5937509964549736e-06, "loss": 1.09092827, "memory(GiB)": 142.32, "step": 134400, "train_speed(iter/s)": 0.285656 }, { "acc": 0.73993378, "epoch": 1.5034806157872733, "grad_norm": 6.09375, "learning_rate": 1.5923973590829462e-06, "loss": 1.0322691, "memory(GiB)": 142.32, "step": 134420, "train_speed(iter/s)": 0.28567 }, { "acc": 0.73358698, "epoch": 1.5037043147332319, "grad_norm": 5.90625, "learning_rate": 1.5910441879196658e-06, "loss": 1.0712471, "memory(GiB)": 142.32, "step": 134440, "train_speed(iter/s)": 0.285684 }, { "acc": 0.72948117, "epoch": 1.5039280136791906, "grad_norm": 8.1875, "learning_rate": 1.5896914831502646e-06, "loss": 1.07449322, "memory(GiB)": 142.32, "step": 134460, "train_speed(iter/s)": 0.285697 }, { "acc": 0.74432397, "epoch": 1.5041517126251491, "grad_norm": 5.6875, "learning_rate": 1.5883392449598139e-06, "loss": 1.0047266, "memory(GiB)": 142.32, "step": 134480, "train_speed(iter/s)": 0.285712 }, { "acc": 0.74595299, "epoch": 1.5043754115711077, "grad_norm": 5.75, "learning_rate": 1.5869874735333173e-06, "loss": 0.99296484, "memory(GiB)": 142.32, "step": 134500, "train_speed(iter/s)": 0.285726 }, { "acc": 0.73791108, "epoch": 1.5045991105170662, "grad_norm": 6.625, "learning_rate": 1.5856361690557192e-06, "loss": 1.0292264, "memory(GiB)": 142.32, "step": 134520, "train_speed(iter/s)": 0.285739 }, { "acc": 0.73267102, "epoch": 1.5048228094630247, "grad_norm": 8.0, "learning_rate": 1.5842853317118957e-06, "loss": 1.07696371, "memory(GiB)": 142.32, "step": 134540, "train_speed(iter/s)": 0.285755 }, { "acc": 0.73788037, "epoch": 1.5050465084089832, "grad_norm": 5.65625, "learning_rate": 1.5829349616866591e-06, "loss": 1.05028009, "memory(GiB)": 142.32, "step": 134560, "train_speed(iter/s)": 0.285771 }, { "acc": 0.73931937, "epoch": 1.5052702073549418, "grad_norm": 6.65625, "learning_rate": 1.5815850591647618e-06, "loss": 1.03295546, "memory(GiB)": 142.32, "step": 134580, "train_speed(iter/s)": 0.285786 }, { "acc": 0.73602843, "epoch": 1.5054939063009003, "grad_norm": 7.125, "learning_rate": 1.5802356243308875e-06, "loss": 1.0278141, "memory(GiB)": 142.32, "step": 134600, "train_speed(iter/s)": 0.2858 }, { "acc": 0.73610802, "epoch": 1.5057176052468588, "grad_norm": 6.0, "learning_rate": 1.5788866573696615e-06, "loss": 1.04315319, "memory(GiB)": 142.32, "step": 134620, "train_speed(iter/s)": 0.285816 }, { "acc": 0.73730459, "epoch": 1.5059413041928174, "grad_norm": 6.0625, "learning_rate": 1.5775381584656397e-06, "loss": 1.04001665, "memory(GiB)": 142.32, "step": 134640, "train_speed(iter/s)": 0.285831 }, { "acc": 0.7377265, "epoch": 1.5061650031387759, "grad_norm": 6.25, "learning_rate": 1.5761901278033137e-06, "loss": 1.06162634, "memory(GiB)": 142.32, "step": 134660, "train_speed(iter/s)": 0.285848 }, { "acc": 0.74769211, "epoch": 1.5063887020847344, "grad_norm": 7.5625, "learning_rate": 1.5748425655671179e-06, "loss": 0.98762341, "memory(GiB)": 142.32, "step": 134680, "train_speed(iter/s)": 0.285862 }, { "acc": 0.72130203, "epoch": 1.506612401030693, "grad_norm": 6.125, "learning_rate": 1.5734954719414147e-06, "loss": 1.11592703, "memory(GiB)": 142.32, "step": 134700, "train_speed(iter/s)": 0.285878 }, { "acc": 0.72604184, "epoch": 1.5068360999766515, "grad_norm": 6.375, "learning_rate": 1.572148847110508e-06, "loss": 1.0930439, "memory(GiB)": 142.32, "step": 134720, "train_speed(iter/s)": 0.285894 }, { "acc": 0.72952323, "epoch": 1.50705979892261, "grad_norm": 5.71875, "learning_rate": 1.5708026912586343e-06, "loss": 1.07437916, "memory(GiB)": 142.32, "step": 134740, "train_speed(iter/s)": 0.285909 }, { "acc": 0.72908797, "epoch": 1.5072834978685685, "grad_norm": 6.5625, "learning_rate": 1.5694570045699658e-06, "loss": 1.10068874, "memory(GiB)": 142.32, "step": 134760, "train_speed(iter/s)": 0.285922 }, { "acc": 0.74062624, "epoch": 1.507507196814527, "grad_norm": 5.8125, "learning_rate": 1.568111787228614e-06, "loss": 1.03226929, "memory(GiB)": 142.32, "step": 134780, "train_speed(iter/s)": 0.285938 }, { "acc": 0.74726772, "epoch": 1.5077308957604856, "grad_norm": 5.90625, "learning_rate": 1.5667670394186212e-06, "loss": 1.00769596, "memory(GiB)": 142.32, "step": 134800, "train_speed(iter/s)": 0.285953 }, { "acc": 0.73841734, "epoch": 1.507954594706444, "grad_norm": 6.15625, "learning_rate": 1.5654227613239714e-06, "loss": 1.02920952, "memory(GiB)": 142.32, "step": 134820, "train_speed(iter/s)": 0.285967 }, { "acc": 0.7399765, "epoch": 1.5081782936524026, "grad_norm": 6.03125, "learning_rate": 1.5640789531285787e-06, "loss": 1.02455273, "memory(GiB)": 142.32, "step": 134840, "train_speed(iter/s)": 0.285983 }, { "acc": 0.73773561, "epoch": 1.5084019925983612, "grad_norm": 7.21875, "learning_rate": 1.5627356150162948e-06, "loss": 1.04290199, "memory(GiB)": 142.32, "step": 134860, "train_speed(iter/s)": 0.285998 }, { "acc": 0.74849558, "epoch": 1.5086256915443197, "grad_norm": 5.90625, "learning_rate": 1.5613927471709101e-06, "loss": 0.99811058, "memory(GiB)": 142.32, "step": 134880, "train_speed(iter/s)": 0.286012 }, { "acc": 0.7277154, "epoch": 1.5088493904902782, "grad_norm": 5.65625, "learning_rate": 1.5600503497761449e-06, "loss": 1.09421749, "memory(GiB)": 142.32, "step": 134900, "train_speed(iter/s)": 0.286026 }, { "acc": 0.73462019, "epoch": 1.5090730894362367, "grad_norm": 7.25, "learning_rate": 1.5587084230156618e-06, "loss": 1.0769434, "memory(GiB)": 142.32, "step": 134920, "train_speed(iter/s)": 0.28604 }, { "acc": 0.74838877, "epoch": 1.5092967883821953, "grad_norm": 6.375, "learning_rate": 1.557366967073054e-06, "loss": 0.98757553, "memory(GiB)": 142.32, "step": 134940, "train_speed(iter/s)": 0.286053 }, { "acc": 0.73804703, "epoch": 1.5095204873281538, "grad_norm": 6.375, "learning_rate": 1.5560259821318496e-06, "loss": 1.03807487, "memory(GiB)": 142.32, "step": 134960, "train_speed(iter/s)": 0.286068 }, { "acc": 0.74410172, "epoch": 1.5097441862741123, "grad_norm": 5.4375, "learning_rate": 1.5546854683755203e-06, "loss": 1.01157894, "memory(GiB)": 142.32, "step": 134980, "train_speed(iter/s)": 0.286083 }, { "acc": 0.74891644, "epoch": 1.5099678852200709, "grad_norm": 5.53125, "learning_rate": 1.5533454259874597e-06, "loss": 1.00460777, "memory(GiB)": 142.32, "step": 135000, "train_speed(iter/s)": 0.286095 }, { "acc": 0.73338404, "epoch": 1.5101915841660294, "grad_norm": 6.09375, "learning_rate": 1.5520058551510115e-06, "loss": 1.06980457, "memory(GiB)": 142.32, "step": 135020, "train_speed(iter/s)": 0.28611 }, { "acc": 0.72859106, "epoch": 1.510415283111988, "grad_norm": 6.46875, "learning_rate": 1.5506667560494432e-06, "loss": 1.08812294, "memory(GiB)": 142.32, "step": 135040, "train_speed(iter/s)": 0.286125 }, { "acc": 0.73447237, "epoch": 1.5106389820579464, "grad_norm": 7.0625, "learning_rate": 1.5493281288659672e-06, "loss": 1.07075787, "memory(GiB)": 142.32, "step": 135060, "train_speed(iter/s)": 0.28614 }, { "acc": 0.73333483, "epoch": 1.510862681003905, "grad_norm": 5.6875, "learning_rate": 1.547989973783724e-06, "loss": 1.05549135, "memory(GiB)": 142.32, "step": 135080, "train_speed(iter/s)": 0.286155 }, { "acc": 0.74489317, "epoch": 1.5110863799498635, "grad_norm": 6.0625, "learning_rate": 1.5466522909857917e-06, "loss": 1.01265583, "memory(GiB)": 142.32, "step": 135100, "train_speed(iter/s)": 0.28617 }, { "acc": 0.73894272, "epoch": 1.511310078895822, "grad_norm": 5.34375, "learning_rate": 1.5453150806551875e-06, "loss": 1.04343319, "memory(GiB)": 142.32, "step": 135120, "train_speed(iter/s)": 0.286184 }, { "acc": 0.73575621, "epoch": 1.5115337778417806, "grad_norm": 7.28125, "learning_rate": 1.5439783429748574e-06, "loss": 1.04293032, "memory(GiB)": 142.32, "step": 135140, "train_speed(iter/s)": 0.286198 }, { "acc": 0.74408617, "epoch": 1.511757476787739, "grad_norm": 6.125, "learning_rate": 1.542642078127689e-06, "loss": 1.00256939, "memory(GiB)": 142.32, "step": 135160, "train_speed(iter/s)": 0.286214 }, { "acc": 0.73274369, "epoch": 1.5119811757336976, "grad_norm": 6.8125, "learning_rate": 1.5413062862965023e-06, "loss": 1.0760252, "memory(GiB)": 142.32, "step": 135180, "train_speed(iter/s)": 0.286228 }, { "acc": 0.75099773, "epoch": 1.5122048746796561, "grad_norm": 7.96875, "learning_rate": 1.5399709676640496e-06, "loss": 0.9879735, "memory(GiB)": 142.32, "step": 135200, "train_speed(iter/s)": 0.286241 }, { "acc": 0.74015594, "epoch": 1.5124285736256147, "grad_norm": 5.875, "learning_rate": 1.5386361224130253e-06, "loss": 1.0266654, "memory(GiB)": 142.32, "step": 135220, "train_speed(iter/s)": 0.286255 }, { "acc": 0.73049402, "epoch": 1.5126522725715732, "grad_norm": 7.34375, "learning_rate": 1.5373017507260517e-06, "loss": 1.07521038, "memory(GiB)": 142.32, "step": 135240, "train_speed(iter/s)": 0.286268 }, { "acc": 0.73822479, "epoch": 1.5128759715175317, "grad_norm": 5.8125, "learning_rate": 1.5359678527856943e-06, "loss": 1.0440053, "memory(GiB)": 142.32, "step": 135260, "train_speed(iter/s)": 0.286282 }, { "acc": 0.7390027, "epoch": 1.5130996704634903, "grad_norm": 6.28125, "learning_rate": 1.5346344287744452e-06, "loss": 1.04490681, "memory(GiB)": 142.32, "step": 135280, "train_speed(iter/s)": 0.286296 }, { "acc": 0.73080912, "epoch": 1.5133233694094488, "grad_norm": 5.84375, "learning_rate": 1.5333014788747397e-06, "loss": 1.07331028, "memory(GiB)": 142.32, "step": 135300, "train_speed(iter/s)": 0.28631 }, { "acc": 0.74075398, "epoch": 1.5135470683554073, "grad_norm": 6.25, "learning_rate": 1.5319690032689417e-06, "loss": 1.02804871, "memory(GiB)": 142.32, "step": 135320, "train_speed(iter/s)": 0.286325 }, { "acc": 0.74861326, "epoch": 1.5137707673013658, "grad_norm": 6.0, "learning_rate": 1.5306370021393524e-06, "loss": 0.99443254, "memory(GiB)": 142.32, "step": 135340, "train_speed(iter/s)": 0.286339 }, { "acc": 0.74130034, "epoch": 1.5139944662473244, "grad_norm": 5.46875, "learning_rate": 1.5293054756682113e-06, "loss": 1.01387329, "memory(GiB)": 142.32, "step": 135360, "train_speed(iter/s)": 0.286353 }, { "acc": 0.73369436, "epoch": 1.514218165193283, "grad_norm": 6.78125, "learning_rate": 1.5279744240376877e-06, "loss": 1.05483055, "memory(GiB)": 142.32, "step": 135380, "train_speed(iter/s)": 0.286367 }, { "acc": 0.73698487, "epoch": 1.5144418641392414, "grad_norm": 6.9375, "learning_rate": 1.5266438474298907e-06, "loss": 1.03089457, "memory(GiB)": 142.32, "step": 135400, "train_speed(iter/s)": 0.286382 }, { "acc": 0.74087286, "epoch": 1.5146655630852, "grad_norm": 5.375, "learning_rate": 1.5253137460268612e-06, "loss": 1.03171349, "memory(GiB)": 142.32, "step": 135420, "train_speed(iter/s)": 0.286398 }, { "acc": 0.74225292, "epoch": 1.5148892620311585, "grad_norm": 6.0625, "learning_rate": 1.5239841200105743e-06, "loss": 1.03612251, "memory(GiB)": 142.32, "step": 135440, "train_speed(iter/s)": 0.286411 }, { "acc": 0.73883224, "epoch": 1.515112960977117, "grad_norm": 6.28125, "learning_rate": 1.522654969562945e-06, "loss": 1.03335581, "memory(GiB)": 142.32, "step": 135460, "train_speed(iter/s)": 0.286425 }, { "acc": 0.73827925, "epoch": 1.5153366599230755, "grad_norm": 7.53125, "learning_rate": 1.521326294865817e-06, "loss": 1.02791538, "memory(GiB)": 142.32, "step": 135480, "train_speed(iter/s)": 0.286441 }, { "acc": 0.72896633, "epoch": 1.515560358869034, "grad_norm": 6.90625, "learning_rate": 1.5199980961009754e-06, "loss": 1.10191536, "memory(GiB)": 142.32, "step": 135500, "train_speed(iter/s)": 0.286454 }, { "acc": 0.74386806, "epoch": 1.5157840578149926, "grad_norm": 7.46875, "learning_rate": 1.518670373450135e-06, "loss": 1.03160105, "memory(GiB)": 142.32, "step": 135520, "train_speed(iter/s)": 0.28647 }, { "acc": 0.74169903, "epoch": 1.5160077567609511, "grad_norm": 5.0, "learning_rate": 1.5173431270949451e-06, "loss": 1.04463081, "memory(GiB)": 142.32, "step": 135540, "train_speed(iter/s)": 0.286484 }, { "acc": 0.75092325, "epoch": 1.5162314557069096, "grad_norm": 6.84375, "learning_rate": 1.5160163572169962e-06, "loss": 1.00103388, "memory(GiB)": 142.32, "step": 135560, "train_speed(iter/s)": 0.286497 }, { "acc": 0.73324003, "epoch": 1.5164551546528682, "grad_norm": 5.84375, "learning_rate": 1.5146900639978052e-06, "loss": 1.07051945, "memory(GiB)": 142.32, "step": 135580, "train_speed(iter/s)": 0.286512 }, { "acc": 0.73070335, "epoch": 1.5166788535988267, "grad_norm": 6.59375, "learning_rate": 1.513364247618832e-06, "loss": 1.07306986, "memory(GiB)": 142.32, "step": 135600, "train_speed(iter/s)": 0.286528 }, { "acc": 0.7377923, "epoch": 1.5169025525447852, "grad_norm": 6.21875, "learning_rate": 1.512038908261465e-06, "loss": 1.04756603, "memory(GiB)": 142.32, "step": 135620, "train_speed(iter/s)": 0.286543 }, { "acc": 0.7302803, "epoch": 1.5171262514907438, "grad_norm": 7.25, "learning_rate": 1.510714046107028e-06, "loss": 1.07383976, "memory(GiB)": 142.32, "step": 135640, "train_speed(iter/s)": 0.286558 }, { "acc": 0.74231462, "epoch": 1.5173499504367023, "grad_norm": 5.5, "learning_rate": 1.5093896613367847e-06, "loss": 1.0363328, "memory(GiB)": 142.32, "step": 135660, "train_speed(iter/s)": 0.286573 }, { "acc": 0.73825989, "epoch": 1.5175736493826608, "grad_norm": 6.59375, "learning_rate": 1.5080657541319265e-06, "loss": 1.03604355, "memory(GiB)": 142.32, "step": 135680, "train_speed(iter/s)": 0.286588 }, { "acc": 0.75148907, "epoch": 1.5177973483286193, "grad_norm": 4.625, "learning_rate": 1.5067423246735857e-06, "loss": 0.98755989, "memory(GiB)": 142.32, "step": 135700, "train_speed(iter/s)": 0.286601 }, { "acc": 0.73875551, "epoch": 1.5180210472745779, "grad_norm": 5.6875, "learning_rate": 1.5054193731428257e-06, "loss": 1.04364548, "memory(GiB)": 142.32, "step": 135720, "train_speed(iter/s)": 0.286613 }, { "acc": 0.73808889, "epoch": 1.5182447462205364, "grad_norm": 7.15625, "learning_rate": 1.5040968997206423e-06, "loss": 1.03969879, "memory(GiB)": 142.32, "step": 135740, "train_speed(iter/s)": 0.286627 }, { "acc": 0.73139076, "epoch": 1.518468445166495, "grad_norm": 6.03125, "learning_rate": 1.5027749045879724e-06, "loss": 1.0718709, "memory(GiB)": 142.32, "step": 135760, "train_speed(iter/s)": 0.286642 }, { "acc": 0.74657326, "epoch": 1.5186921441124535, "grad_norm": 6.5625, "learning_rate": 1.5014533879256816e-06, "loss": 1.02549124, "memory(GiB)": 142.32, "step": 135780, "train_speed(iter/s)": 0.286657 }, { "acc": 0.7344933, "epoch": 1.518915843058412, "grad_norm": 5.46875, "learning_rate": 1.5001323499145743e-06, "loss": 1.05947828, "memory(GiB)": 142.32, "step": 135800, "train_speed(iter/s)": 0.286671 }, { "acc": 0.75325994, "epoch": 1.5191395420043705, "grad_norm": 5.71875, "learning_rate": 1.4988117907353861e-06, "loss": 0.98262672, "memory(GiB)": 142.32, "step": 135820, "train_speed(iter/s)": 0.286684 }, { "acc": 0.73460598, "epoch": 1.519363240950329, "grad_norm": 5.84375, "learning_rate": 1.497491710568787e-06, "loss": 1.06490517, "memory(GiB)": 142.32, "step": 135840, "train_speed(iter/s)": 0.286698 }, { "acc": 0.73513603, "epoch": 1.5195869398962876, "grad_norm": 6.71875, "learning_rate": 1.496172109595385e-06, "loss": 1.05755234, "memory(GiB)": 142.32, "step": 135860, "train_speed(iter/s)": 0.286713 }, { "acc": 0.74605765, "epoch": 1.519810638842246, "grad_norm": 6.5, "learning_rate": 1.494852987995719e-06, "loss": 1.01895561, "memory(GiB)": 142.32, "step": 135880, "train_speed(iter/s)": 0.286727 }, { "acc": 0.73908987, "epoch": 1.5200343377882046, "grad_norm": 5.21875, "learning_rate": 1.493534345950266e-06, "loss": 1.04411926, "memory(GiB)": 142.32, "step": 135900, "train_speed(iter/s)": 0.286742 }, { "acc": 0.72821388, "epoch": 1.5202580367341632, "grad_norm": 5.90625, "learning_rate": 1.4922161836394332e-06, "loss": 1.08068247, "memory(GiB)": 142.32, "step": 135920, "train_speed(iter/s)": 0.286756 }, { "acc": 0.73892021, "epoch": 1.5204817356801217, "grad_norm": 5.15625, "learning_rate": 1.4908985012435624e-06, "loss": 1.03440609, "memory(GiB)": 142.32, "step": 135940, "train_speed(iter/s)": 0.286768 }, { "acc": 0.73936558, "epoch": 1.5207054346260802, "grad_norm": 7.5625, "learning_rate": 1.4895812989429353e-06, "loss": 1.03923817, "memory(GiB)": 142.32, "step": 135960, "train_speed(iter/s)": 0.286783 }, { "acc": 0.74186659, "epoch": 1.5209291335720387, "grad_norm": 5.3125, "learning_rate": 1.48826457691776e-06, "loss": 1.02955647, "memory(GiB)": 142.32, "step": 135980, "train_speed(iter/s)": 0.286797 }, { "acc": 0.72639241, "epoch": 1.5211528325179973, "grad_norm": 5.65625, "learning_rate": 1.4869483353481867e-06, "loss": 1.09587564, "memory(GiB)": 142.32, "step": 136000, "train_speed(iter/s)": 0.286812 }, { "epoch": 1.5211528325179973, "eval_acc": 0.6963527672578715, "eval_loss": 1.0714740753173828, "eval_runtime": 2337.6547, "eval_samples_per_second": 32.204, "eval_steps_per_second": 16.102, "step": 136000 }, { "acc": 0.73530254, "epoch": 1.5213765314639558, "grad_norm": 5.0625, "learning_rate": 1.4856325744142936e-06, "loss": 1.06098204, "memory(GiB)": 142.32, "step": 136020, "train_speed(iter/s)": 0.285389 }, { "acc": 0.74639778, "epoch": 1.5216002304099143, "grad_norm": 5.5, "learning_rate": 1.4843172942960954e-06, "loss": 1.01022215, "memory(GiB)": 142.32, "step": 136040, "train_speed(iter/s)": 0.285405 }, { "acc": 0.72971506, "epoch": 1.5218239293558729, "grad_norm": 6.78125, "learning_rate": 1.4830024951735434e-06, "loss": 1.07476234, "memory(GiB)": 142.32, "step": 136060, "train_speed(iter/s)": 0.285419 }, { "acc": 0.72861061, "epoch": 1.5220476283018314, "grad_norm": 5.09375, "learning_rate": 1.4816881772265173e-06, "loss": 1.09690228, "memory(GiB)": 142.32, "step": 136080, "train_speed(iter/s)": 0.285432 }, { "acc": 0.74176688, "epoch": 1.52227132724779, "grad_norm": 6.59375, "learning_rate": 1.4803743406348393e-06, "loss": 1.02447042, "memory(GiB)": 142.32, "step": 136100, "train_speed(iter/s)": 0.285445 }, { "acc": 0.7395339, "epoch": 1.5224950261937484, "grad_norm": 7.28125, "learning_rate": 1.4790609855782577e-06, "loss": 1.04548817, "memory(GiB)": 142.32, "step": 136120, "train_speed(iter/s)": 0.285459 }, { "acc": 0.72982254, "epoch": 1.522718725139707, "grad_norm": 6.59375, "learning_rate": 1.4777481122364584e-06, "loss": 1.082269, "memory(GiB)": 142.32, "step": 136140, "train_speed(iter/s)": 0.285477 }, { "acc": 0.74545689, "epoch": 1.5229424240856655, "grad_norm": 7.28125, "learning_rate": 1.4764357207890645e-06, "loss": 1.01651897, "memory(GiB)": 142.32, "step": 136160, "train_speed(iter/s)": 0.285493 }, { "acc": 0.73865213, "epoch": 1.523166123031624, "grad_norm": 7.15625, "learning_rate": 1.4751238114156242e-06, "loss": 1.03394232, "memory(GiB)": 142.32, "step": 136180, "train_speed(iter/s)": 0.285507 }, { "acc": 0.73862906, "epoch": 1.5233898219775825, "grad_norm": 6.0, "learning_rate": 1.4738123842956304e-06, "loss": 1.04717283, "memory(GiB)": 142.32, "step": 136200, "train_speed(iter/s)": 0.285522 }, { "acc": 0.73325272, "epoch": 1.523613520923541, "grad_norm": 6.25, "learning_rate": 1.4725014396085014e-06, "loss": 1.06888161, "memory(GiB)": 142.32, "step": 136220, "train_speed(iter/s)": 0.285538 }, { "acc": 0.74101648, "epoch": 1.5238372198694996, "grad_norm": 6.96875, "learning_rate": 1.471190977533597e-06, "loss": 1.0287302, "memory(GiB)": 142.32, "step": 136240, "train_speed(iter/s)": 0.285555 }, { "acc": 0.7290947, "epoch": 1.5240609188154581, "grad_norm": 5.90625, "learning_rate": 1.4698809982502048e-06, "loss": 1.08312092, "memory(GiB)": 142.32, "step": 136260, "train_speed(iter/s)": 0.285571 }, { "acc": 0.74001374, "epoch": 1.5242846177614167, "grad_norm": 5.03125, "learning_rate": 1.468571501937548e-06, "loss": 1.03894539, "memory(GiB)": 142.32, "step": 136280, "train_speed(iter/s)": 0.285585 }, { "acc": 0.72436905, "epoch": 1.5245083167073752, "grad_norm": 6.5625, "learning_rate": 1.4672624887747865e-06, "loss": 1.10951214, "memory(GiB)": 142.32, "step": 136300, "train_speed(iter/s)": 0.285599 }, { "acc": 0.74432478, "epoch": 1.5247320156533337, "grad_norm": 6.5625, "learning_rate": 1.4659539589410099e-06, "loss": 1.01384354, "memory(GiB)": 142.32, "step": 136320, "train_speed(iter/s)": 0.285612 }, { "acc": 0.74458685, "epoch": 1.5249557145992922, "grad_norm": 7.15625, "learning_rate": 1.4646459126152458e-06, "loss": 1.01580219, "memory(GiB)": 142.32, "step": 136340, "train_speed(iter/s)": 0.285624 }, { "acc": 0.73862219, "epoch": 1.5251794135452508, "grad_norm": 4.875, "learning_rate": 1.4633383499764531e-06, "loss": 1.04391842, "memory(GiB)": 142.32, "step": 136360, "train_speed(iter/s)": 0.285638 }, { "acc": 0.74972439, "epoch": 1.5254031124912093, "grad_norm": 6.9375, "learning_rate": 1.4620312712035234e-06, "loss": 0.990271, "memory(GiB)": 142.32, "step": 136380, "train_speed(iter/s)": 0.285652 }, { "acc": 0.73990831, "epoch": 1.5256268114371678, "grad_norm": 5.53125, "learning_rate": 1.4607246764752858e-06, "loss": 1.03196297, "memory(GiB)": 142.32, "step": 136400, "train_speed(iter/s)": 0.285665 }, { "acc": 0.75408225, "epoch": 1.5258505103831264, "grad_norm": 7.6875, "learning_rate": 1.4594185659704995e-06, "loss": 0.97866955, "memory(GiB)": 142.32, "step": 136420, "train_speed(iter/s)": 0.285679 }, { "acc": 0.73506393, "epoch": 1.5260742093290849, "grad_norm": 6.1875, "learning_rate": 1.4581129398678612e-06, "loss": 1.05958004, "memory(GiB)": 142.32, "step": 136440, "train_speed(iter/s)": 0.285693 }, { "acc": 0.73312111, "epoch": 1.5262979082750434, "grad_norm": 6.40625, "learning_rate": 1.4568077983459982e-06, "loss": 1.06698151, "memory(GiB)": 142.32, "step": 136460, "train_speed(iter/s)": 0.285706 }, { "acc": 0.73445597, "epoch": 1.526521607221002, "grad_norm": 5.71875, "learning_rate": 1.4555031415834703e-06, "loss": 1.05798254, "memory(GiB)": 142.32, "step": 136480, "train_speed(iter/s)": 0.28572 }, { "acc": 0.73124304, "epoch": 1.5267453061669605, "grad_norm": 6.1875, "learning_rate": 1.4541989697587771e-06, "loss": 1.07645836, "memory(GiB)": 142.32, "step": 136500, "train_speed(iter/s)": 0.285735 }, { "acc": 0.74230003, "epoch": 1.526969005112919, "grad_norm": 6.21875, "learning_rate": 1.4528952830503445e-06, "loss": 1.02632227, "memory(GiB)": 142.32, "step": 136520, "train_speed(iter/s)": 0.285747 }, { "acc": 0.73484855, "epoch": 1.5271927040588775, "grad_norm": 6.5, "learning_rate": 1.451592081636538e-06, "loss": 1.05988388, "memory(GiB)": 142.32, "step": 136540, "train_speed(iter/s)": 0.285762 }, { "acc": 0.73191171, "epoch": 1.527416403004836, "grad_norm": 5.40625, "learning_rate": 1.4502893656956535e-06, "loss": 1.07200413, "memory(GiB)": 142.32, "step": 136560, "train_speed(iter/s)": 0.285775 }, { "acc": 0.72692738, "epoch": 1.5276401019507946, "grad_norm": 5.53125, "learning_rate": 1.4489871354059192e-06, "loss": 1.0990078, "memory(GiB)": 142.32, "step": 136580, "train_speed(iter/s)": 0.285791 }, { "acc": 0.74150133, "epoch": 1.527863800896753, "grad_norm": 6.75, "learning_rate": 1.4476853909455025e-06, "loss": 1.02737904, "memory(GiB)": 142.32, "step": 136600, "train_speed(iter/s)": 0.285805 }, { "acc": 0.74542923, "epoch": 1.5280874998427116, "grad_norm": 7.53125, "learning_rate": 1.4463841324924966e-06, "loss": 0.99826736, "memory(GiB)": 142.32, "step": 136620, "train_speed(iter/s)": 0.28582 }, { "acc": 0.73052559, "epoch": 1.5283111987886702, "grad_norm": 6.0, "learning_rate": 1.4450833602249359e-06, "loss": 1.0830637, "memory(GiB)": 142.32, "step": 136640, "train_speed(iter/s)": 0.285834 }, { "acc": 0.7390275, "epoch": 1.5285348977346287, "grad_norm": 6.75, "learning_rate": 1.4437830743207827e-06, "loss": 1.0311594, "memory(GiB)": 142.32, "step": 136660, "train_speed(iter/s)": 0.285849 }, { "acc": 0.73606873, "epoch": 1.5287585966805872, "grad_norm": 5.4375, "learning_rate": 1.4424832749579338e-06, "loss": 1.06298943, "memory(GiB)": 142.32, "step": 136680, "train_speed(iter/s)": 0.285864 }, { "acc": 0.73480844, "epoch": 1.5289822956265458, "grad_norm": 5.9375, "learning_rate": 1.4411839623142227e-06, "loss": 1.04929504, "memory(GiB)": 142.32, "step": 136700, "train_speed(iter/s)": 0.285878 }, { "acc": 0.73121567, "epoch": 1.5292059945725043, "grad_norm": 6.0, "learning_rate": 1.4398851365674115e-06, "loss": 1.04910879, "memory(GiB)": 142.32, "step": 136720, "train_speed(iter/s)": 0.285892 }, { "acc": 0.74736052, "epoch": 1.5294296935184628, "grad_norm": 6.375, "learning_rate": 1.4385867978952011e-06, "loss": 0.99732227, "memory(GiB)": 142.32, "step": 136740, "train_speed(iter/s)": 0.285907 }, { "acc": 0.73938789, "epoch": 1.5296533924644213, "grad_norm": 6.4375, "learning_rate": 1.4372889464752203e-06, "loss": 1.02905312, "memory(GiB)": 142.32, "step": 136760, "train_speed(iter/s)": 0.285922 }, { "acc": 0.73728504, "epoch": 1.5298770914103799, "grad_norm": 4.75, "learning_rate": 1.435991582485034e-06, "loss": 1.05105867, "memory(GiB)": 142.32, "step": 136780, "train_speed(iter/s)": 0.285935 }, { "acc": 0.75467653, "epoch": 1.5301007903563384, "grad_norm": 8.1875, "learning_rate": 1.4346947061021417e-06, "loss": 0.96005087, "memory(GiB)": 142.32, "step": 136800, "train_speed(iter/s)": 0.28595 }, { "acc": 0.74019947, "epoch": 1.530324489302297, "grad_norm": 7.15625, "learning_rate": 1.4333983175039717e-06, "loss": 1.00852785, "memory(GiB)": 142.32, "step": 136820, "train_speed(iter/s)": 0.285962 }, { "acc": 0.74299173, "epoch": 1.5305481882482554, "grad_norm": 5.90625, "learning_rate": 1.432102416867892e-06, "loss": 1.02358513, "memory(GiB)": 142.32, "step": 136840, "train_speed(iter/s)": 0.285975 }, { "acc": 0.73858204, "epoch": 1.530771887194214, "grad_norm": 5.78125, "learning_rate": 1.4308070043711992e-06, "loss": 1.03093014, "memory(GiB)": 142.32, "step": 136860, "train_speed(iter/s)": 0.285989 }, { "acc": 0.73616729, "epoch": 1.5309955861401725, "grad_norm": 6.90625, "learning_rate": 1.4295120801911216e-06, "loss": 1.05290813, "memory(GiB)": 142.32, "step": 136880, "train_speed(iter/s)": 0.286003 }, { "acc": 0.74093494, "epoch": 1.531219285086131, "grad_norm": 5.3125, "learning_rate": 1.4282176445048274e-06, "loss": 1.0339613, "memory(GiB)": 142.32, "step": 136900, "train_speed(iter/s)": 0.286016 }, { "acc": 0.7434165, "epoch": 1.5314429840320896, "grad_norm": 5.75, "learning_rate": 1.4269236974894103e-06, "loss": 1.00877733, "memory(GiB)": 142.32, "step": 136920, "train_speed(iter/s)": 0.28603 }, { "acc": 0.74733577, "epoch": 1.531666682978048, "grad_norm": 5.5625, "learning_rate": 1.4256302393219041e-06, "loss": 0.99914532, "memory(GiB)": 142.32, "step": 136940, "train_speed(iter/s)": 0.286044 }, { "acc": 0.73335781, "epoch": 1.5318903819240066, "grad_norm": 5.84375, "learning_rate": 1.4243372701792702e-06, "loss": 1.06150417, "memory(GiB)": 142.32, "step": 136960, "train_speed(iter/s)": 0.286057 }, { "acc": 0.73940392, "epoch": 1.5321140808699654, "grad_norm": 6.25, "learning_rate": 1.4230447902384049e-06, "loss": 1.03188267, "memory(GiB)": 142.32, "step": 136980, "train_speed(iter/s)": 0.286069 }, { "acc": 0.74441371, "epoch": 1.532337779815924, "grad_norm": 5.0, "learning_rate": 1.4217527996761399e-06, "loss": 1.00597067, "memory(GiB)": 142.32, "step": 137000, "train_speed(iter/s)": 0.286082 }, { "acc": 0.73468752, "epoch": 1.5325614787618824, "grad_norm": 5.8125, "learning_rate": 1.420461298669235e-06, "loss": 1.05647602, "memory(GiB)": 142.32, "step": 137020, "train_speed(iter/s)": 0.286095 }, { "acc": 0.73102245, "epoch": 1.532785177707841, "grad_norm": 5.0, "learning_rate": 1.4191702873943898e-06, "loss": 1.06508522, "memory(GiB)": 142.32, "step": 137040, "train_speed(iter/s)": 0.286108 }, { "acc": 0.75093484, "epoch": 1.5330088766537995, "grad_norm": 6.25, "learning_rate": 1.4178797660282313e-06, "loss": 0.9885006, "memory(GiB)": 142.32, "step": 137060, "train_speed(iter/s)": 0.286122 }, { "acc": 0.73356771, "epoch": 1.533232575599758, "grad_norm": 4.8125, "learning_rate": 1.416589734747319e-06, "loss": 1.06861954, "memory(GiB)": 142.32, "step": 137080, "train_speed(iter/s)": 0.286136 }, { "acc": 0.73869662, "epoch": 1.5334562745457165, "grad_norm": 4.21875, "learning_rate": 1.4153001937281512e-06, "loss": 1.04058437, "memory(GiB)": 142.32, "step": 137100, "train_speed(iter/s)": 0.28615 }, { "acc": 0.73056688, "epoch": 1.533679973491675, "grad_norm": 5.75, "learning_rate": 1.4140111431471531e-06, "loss": 1.09017153, "memory(GiB)": 142.32, "step": 137120, "train_speed(iter/s)": 0.286163 }, { "acc": 0.73744564, "epoch": 1.5339036724376336, "grad_norm": 6.53125, "learning_rate": 1.4127225831806873e-06, "loss": 1.0272686, "memory(GiB)": 142.32, "step": 137140, "train_speed(iter/s)": 0.286177 }, { "acc": 0.72602482, "epoch": 1.5341273713835921, "grad_norm": 6.9375, "learning_rate": 1.4114345140050466e-06, "loss": 1.11398582, "memory(GiB)": 142.32, "step": 137160, "train_speed(iter/s)": 0.286191 }, { "acc": 0.72898564, "epoch": 1.5343510703295506, "grad_norm": 6.125, "learning_rate": 1.4101469357964549e-06, "loss": 1.08358459, "memory(GiB)": 142.32, "step": 137180, "train_speed(iter/s)": 0.286205 }, { "acc": 0.72748032, "epoch": 1.5345747692755092, "grad_norm": 7.40625, "learning_rate": 1.408859848731075e-06, "loss": 1.08297367, "memory(GiB)": 142.32, "step": 137200, "train_speed(iter/s)": 0.28622 }, { "acc": 0.74294024, "epoch": 1.5347984682214677, "grad_norm": 7.0625, "learning_rate": 1.407573252984995e-06, "loss": 1.01784019, "memory(GiB)": 142.32, "step": 137220, "train_speed(iter/s)": 0.286233 }, { "acc": 0.74066315, "epoch": 1.5350221671674262, "grad_norm": 6.1875, "learning_rate": 1.406287148734244e-06, "loss": 1.0214304, "memory(GiB)": 142.32, "step": 137240, "train_speed(iter/s)": 0.286246 }, { "acc": 0.72937102, "epoch": 1.5352458661133848, "grad_norm": 6.25, "learning_rate": 1.4050015361547764e-06, "loss": 1.07201881, "memory(GiB)": 142.32, "step": 137260, "train_speed(iter/s)": 0.286261 }, { "acc": 0.7245429, "epoch": 1.5354695650593433, "grad_norm": 5.90625, "learning_rate": 1.4037164154224813e-06, "loss": 1.10387306, "memory(GiB)": 142.32, "step": 137280, "train_speed(iter/s)": 0.286275 }, { "acc": 0.75818844, "epoch": 1.5356932640053018, "grad_norm": 5.1875, "learning_rate": 1.4024317867131854e-06, "loss": 0.95484009, "memory(GiB)": 142.32, "step": 137300, "train_speed(iter/s)": 0.286291 }, { "acc": 0.75073423, "epoch": 1.5359169629512603, "grad_norm": 6.625, "learning_rate": 1.40114765020264e-06, "loss": 0.99095402, "memory(GiB)": 142.32, "step": 137320, "train_speed(iter/s)": 0.286306 }, { "acc": 0.73698983, "epoch": 1.5361406618972189, "grad_norm": 4.90625, "learning_rate": 1.3998640060665391e-06, "loss": 1.04879436, "memory(GiB)": 142.32, "step": 137340, "train_speed(iter/s)": 0.28632 }, { "acc": 0.73719335, "epoch": 1.5363643608431774, "grad_norm": 5.625, "learning_rate": 1.3985808544804969e-06, "loss": 1.0368598, "memory(GiB)": 142.32, "step": 137360, "train_speed(iter/s)": 0.286334 }, { "acc": 0.74611135, "epoch": 1.536588059789136, "grad_norm": 7.0, "learning_rate": 1.397298195620071e-06, "loss": 0.99386787, "memory(GiB)": 142.32, "step": 137380, "train_speed(iter/s)": 0.286348 }, { "acc": 0.74542103, "epoch": 1.5368117587350945, "grad_norm": 5.59375, "learning_rate": 1.3960160296607468e-06, "loss": 1.0168047, "memory(GiB)": 142.32, "step": 137400, "train_speed(iter/s)": 0.28636 }, { "acc": 0.7394639, "epoch": 1.537035457681053, "grad_norm": 6.0625, "learning_rate": 1.394734356777941e-06, "loss": 1.05249796, "memory(GiB)": 142.32, "step": 137420, "train_speed(iter/s)": 0.286373 }, { "acc": 0.74047728, "epoch": 1.5372591566270115, "grad_norm": 4.875, "learning_rate": 1.3934531771470078e-06, "loss": 1.02863321, "memory(GiB)": 142.32, "step": 137440, "train_speed(iter/s)": 0.286387 }, { "acc": 0.73332686, "epoch": 1.53748285557297, "grad_norm": 5.25, "learning_rate": 1.3921724909432277e-06, "loss": 1.05403328, "memory(GiB)": 142.32, "step": 137460, "train_speed(iter/s)": 0.286399 }, { "acc": 0.73309536, "epoch": 1.5377065545189286, "grad_norm": 5.625, "learning_rate": 1.3908922983418205e-06, "loss": 1.06546354, "memory(GiB)": 142.32, "step": 137480, "train_speed(iter/s)": 0.286414 }, { "acc": 0.74780235, "epoch": 1.537930253464887, "grad_norm": 6.21875, "learning_rate": 1.3896125995179328e-06, "loss": 0.99247456, "memory(GiB)": 142.32, "step": 137500, "train_speed(iter/s)": 0.286429 }, { "acc": 0.72823181, "epoch": 1.5381539524108456, "grad_norm": 6.53125, "learning_rate": 1.3883333946466443e-06, "loss": 1.07680473, "memory(GiB)": 142.32, "step": 137520, "train_speed(iter/s)": 0.286443 }, { "acc": 0.74208403, "epoch": 1.5383776513568042, "grad_norm": 6.40625, "learning_rate": 1.3870546839029713e-06, "loss": 1.01772804, "memory(GiB)": 142.32, "step": 137540, "train_speed(iter/s)": 0.286458 }, { "acc": 0.7444293, "epoch": 1.5386013503027627, "grad_norm": 6.25, "learning_rate": 1.3857764674618568e-06, "loss": 1.00912609, "memory(GiB)": 142.32, "step": 137560, "train_speed(iter/s)": 0.286471 }, { "acc": 0.73570919, "epoch": 1.5388250492487212, "grad_norm": 6.5625, "learning_rate": 1.3844987454981823e-06, "loss": 1.03973637, "memory(GiB)": 142.32, "step": 137580, "train_speed(iter/s)": 0.286484 }, { "acc": 0.74452744, "epoch": 1.5390487481946797, "grad_norm": 5.9375, "learning_rate": 1.3832215181867575e-06, "loss": 1.00807056, "memory(GiB)": 142.32, "step": 137600, "train_speed(iter/s)": 0.286498 }, { "acc": 0.73346233, "epoch": 1.5392724471406383, "grad_norm": 5.625, "learning_rate": 1.3819447857023222e-06, "loss": 1.04523067, "memory(GiB)": 142.32, "step": 137620, "train_speed(iter/s)": 0.286512 }, { "acc": 0.75023904, "epoch": 1.5394961460865968, "grad_norm": 6.8125, "learning_rate": 1.3806685482195565e-06, "loss": 0.98617668, "memory(GiB)": 142.32, "step": 137640, "train_speed(iter/s)": 0.286525 }, { "acc": 0.73756914, "epoch": 1.5397198450325553, "grad_norm": 4.84375, "learning_rate": 1.3793928059130635e-06, "loss": 1.04923134, "memory(GiB)": 142.32, "step": 137660, "train_speed(iter/s)": 0.28654 }, { "acc": 0.74516983, "epoch": 1.5399435439785139, "grad_norm": 6.9375, "learning_rate": 1.3781175589573869e-06, "loss": 1.02598133, "memory(GiB)": 142.32, "step": 137680, "train_speed(iter/s)": 0.286554 }, { "acc": 0.73407955, "epoch": 1.5401672429244724, "grad_norm": 7.0, "learning_rate": 1.3768428075269969e-06, "loss": 1.06075726, "memory(GiB)": 142.32, "step": 137700, "train_speed(iter/s)": 0.286567 }, { "acc": 0.73976345, "epoch": 1.540390941870431, "grad_norm": 6.25, "learning_rate": 1.3755685517962958e-06, "loss": 1.03164434, "memory(GiB)": 142.32, "step": 137720, "train_speed(iter/s)": 0.286582 }, { "acc": 0.73265824, "epoch": 1.5406146408163894, "grad_norm": 5.90625, "learning_rate": 1.3742947919396231e-06, "loss": 1.05385523, "memory(GiB)": 142.32, "step": 137740, "train_speed(iter/s)": 0.286596 }, { "acc": 0.73512363, "epoch": 1.540838339762348, "grad_norm": 5.21875, "learning_rate": 1.3730215281312454e-06, "loss": 1.05101566, "memory(GiB)": 142.32, "step": 137760, "train_speed(iter/s)": 0.28661 }, { "acc": 0.74261026, "epoch": 1.5410620387083065, "grad_norm": 5.59375, "learning_rate": 1.3717487605453655e-06, "loss": 1.0192914, "memory(GiB)": 142.32, "step": 137780, "train_speed(iter/s)": 0.28662 }, { "acc": 0.74624796, "epoch": 1.541285737654265, "grad_norm": 6.125, "learning_rate": 1.3704764893561145e-06, "loss": 0.99113178, "memory(GiB)": 142.32, "step": 137800, "train_speed(iter/s)": 0.286635 }, { "acc": 0.73655891, "epoch": 1.5415094366002235, "grad_norm": 6.15625, "learning_rate": 1.369204714737556e-06, "loss": 1.0511117, "memory(GiB)": 142.32, "step": 137820, "train_speed(iter/s)": 0.28665 }, { "acc": 0.73562508, "epoch": 1.541733135546182, "grad_norm": 6.09375, "learning_rate": 1.3679334368636905e-06, "loss": 1.05084343, "memory(GiB)": 142.32, "step": 137840, "train_speed(iter/s)": 0.286664 }, { "acc": 0.74354615, "epoch": 1.5419568344921406, "grad_norm": 6.28125, "learning_rate": 1.3666626559084434e-06, "loss": 1.00285044, "memory(GiB)": 142.32, "step": 137860, "train_speed(iter/s)": 0.286678 }, { "acc": 0.73106289, "epoch": 1.5421805334380991, "grad_norm": 5.75, "learning_rate": 1.3653923720456785e-06, "loss": 1.05996571, "memory(GiB)": 142.32, "step": 137880, "train_speed(iter/s)": 0.286693 }, { "acc": 0.74152718, "epoch": 1.5424042323840577, "grad_norm": 6.21875, "learning_rate": 1.364122585449188e-06, "loss": 1.0149869, "memory(GiB)": 142.32, "step": 137900, "train_speed(iter/s)": 0.286708 }, { "acc": 0.72406416, "epoch": 1.5426279313300162, "grad_norm": 6.625, "learning_rate": 1.3628532962926949e-06, "loss": 1.11688509, "memory(GiB)": 142.32, "step": 137920, "train_speed(iter/s)": 0.286721 }, { "acc": 0.73672113, "epoch": 1.5428516302759747, "grad_norm": 5.09375, "learning_rate": 1.361584504749859e-06, "loss": 1.05121574, "memory(GiB)": 142.32, "step": 137940, "train_speed(iter/s)": 0.286736 }, { "acc": 0.74566488, "epoch": 1.5430753292219332, "grad_norm": 5.71875, "learning_rate": 1.3603162109942664e-06, "loss": 1.00012341, "memory(GiB)": 142.32, "step": 137960, "train_speed(iter/s)": 0.286751 }, { "acc": 0.73314581, "epoch": 1.5432990281678918, "grad_norm": 5.09375, "learning_rate": 1.3590484151994405e-06, "loss": 1.05324841, "memory(GiB)": 142.32, "step": 137980, "train_speed(iter/s)": 0.286767 }, { "acc": 0.74209223, "epoch": 1.5435227271138503, "grad_norm": 6.21875, "learning_rate": 1.3577811175388328e-06, "loss": 1.03473358, "memory(GiB)": 142.32, "step": 138000, "train_speed(iter/s)": 0.286782 }, { "epoch": 1.5435227271138503, "eval_acc": 0.6963502038318092, "eval_loss": 1.071437954902649, "eval_runtime": 2342.8472, "eval_samples_per_second": 32.133, "eval_steps_per_second": 16.067, "step": 138000 }, { "acc": 0.73941784, "epoch": 1.5437464260598088, "grad_norm": 6.40625, "learning_rate": 1.3565143181858258e-06, "loss": 1.02944489, "memory(GiB)": 142.32, "step": 138020, "train_speed(iter/s)": 0.285377 }, { "acc": 0.72802591, "epoch": 1.5439701250057674, "grad_norm": 4.96875, "learning_rate": 1.3552480173137395e-06, "loss": 1.07880211, "memory(GiB)": 142.32, "step": 138040, "train_speed(iter/s)": 0.285391 }, { "acc": 0.73855829, "epoch": 1.5441938239517259, "grad_norm": 5.53125, "learning_rate": 1.3539822150958181e-06, "loss": 1.02224464, "memory(GiB)": 142.32, "step": 138060, "train_speed(iter/s)": 0.285404 }, { "acc": 0.75403261, "epoch": 1.5444175228976844, "grad_norm": 6.09375, "learning_rate": 1.3527169117052447e-06, "loss": 0.96637688, "memory(GiB)": 142.32, "step": 138080, "train_speed(iter/s)": 0.285416 }, { "acc": 0.73421502, "epoch": 1.544641221843643, "grad_norm": 6.71875, "learning_rate": 1.3514521073151298e-06, "loss": 1.05109615, "memory(GiB)": 142.32, "step": 138100, "train_speed(iter/s)": 0.285429 }, { "acc": 0.73547354, "epoch": 1.5448649207896015, "grad_norm": 5.375, "learning_rate": 1.3501878020985144e-06, "loss": 1.04636774, "memory(GiB)": 142.32, "step": 138120, "train_speed(iter/s)": 0.285442 }, { "acc": 0.73771844, "epoch": 1.54508861973556, "grad_norm": 5.9375, "learning_rate": 1.3489239962283774e-06, "loss": 1.05461254, "memory(GiB)": 142.32, "step": 138140, "train_speed(iter/s)": 0.285454 }, { "acc": 0.74106035, "epoch": 1.5453123186815185, "grad_norm": 6.21875, "learning_rate": 1.3476606898776217e-06, "loss": 1.03320503, "memory(GiB)": 142.32, "step": 138160, "train_speed(iter/s)": 0.285467 }, { "acc": 0.74154034, "epoch": 1.545536017627477, "grad_norm": 6.40625, "learning_rate": 1.3463978832190893e-06, "loss": 1.02642097, "memory(GiB)": 142.32, "step": 138180, "train_speed(iter/s)": 0.285482 }, { "acc": 0.73977747, "epoch": 1.5457597165734356, "grad_norm": 5.65625, "learning_rate": 1.3451355764255486e-06, "loss": 1.02870235, "memory(GiB)": 142.32, "step": 138200, "train_speed(iter/s)": 0.285497 }, { "acc": 0.75078402, "epoch": 1.545983415519394, "grad_norm": 5.03125, "learning_rate": 1.3438737696696996e-06, "loss": 0.98888016, "memory(GiB)": 142.32, "step": 138220, "train_speed(iter/s)": 0.285511 }, { "acc": 0.73302817, "epoch": 1.5462071144653526, "grad_norm": 5.3125, "learning_rate": 1.342612463124179e-06, "loss": 1.06546516, "memory(GiB)": 142.32, "step": 138240, "train_speed(iter/s)": 0.285525 }, { "acc": 0.73949428, "epoch": 1.5464308134113112, "grad_norm": 5.28125, "learning_rate": 1.341351656961547e-06, "loss": 1.04078941, "memory(GiB)": 142.32, "step": 138260, "train_speed(iter/s)": 0.285538 }, { "acc": 0.75050092, "epoch": 1.5466545123572697, "grad_norm": 6.15625, "learning_rate": 1.3400913513543045e-06, "loss": 0.98706703, "memory(GiB)": 142.32, "step": 138280, "train_speed(iter/s)": 0.285552 }, { "acc": 0.7480423, "epoch": 1.5468782113032282, "grad_norm": 5.3125, "learning_rate": 1.3388315464748775e-06, "loss": 0.99605942, "memory(GiB)": 142.32, "step": 138300, "train_speed(iter/s)": 0.285566 }, { "acc": 0.73068523, "epoch": 1.5471019102491868, "grad_norm": 6.03125, "learning_rate": 1.3375722424956233e-06, "loss": 1.07193012, "memory(GiB)": 142.32, "step": 138320, "train_speed(iter/s)": 0.28558 }, { "acc": 0.74374638, "epoch": 1.5473256091951453, "grad_norm": 5.25, "learning_rate": 1.336313439588836e-06, "loss": 1.00630598, "memory(GiB)": 142.32, "step": 138340, "train_speed(iter/s)": 0.285595 }, { "acc": 0.74727044, "epoch": 1.5475493081411038, "grad_norm": 7.9375, "learning_rate": 1.3350551379267347e-06, "loss": 0.99955425, "memory(GiB)": 142.32, "step": 138360, "train_speed(iter/s)": 0.285611 }, { "acc": 0.7513957, "epoch": 1.5477730070870623, "grad_norm": 5.3125, "learning_rate": 1.3337973376814761e-06, "loss": 0.98181963, "memory(GiB)": 142.32, "step": 138380, "train_speed(iter/s)": 0.285625 }, { "acc": 0.72584324, "epoch": 1.5479967060330209, "grad_norm": 5.375, "learning_rate": 1.3325400390251442e-06, "loss": 1.08265915, "memory(GiB)": 142.32, "step": 138400, "train_speed(iter/s)": 0.28564 }, { "acc": 0.73384242, "epoch": 1.5482204049789794, "grad_norm": 5.96875, "learning_rate": 1.3312832421297534e-06, "loss": 1.0535347, "memory(GiB)": 142.32, "step": 138420, "train_speed(iter/s)": 0.285654 }, { "acc": 0.74873829, "epoch": 1.548444103924938, "grad_norm": 5.34375, "learning_rate": 1.3300269471672545e-06, "loss": 1.0019969, "memory(GiB)": 142.32, "step": 138440, "train_speed(iter/s)": 0.285664 }, { "acc": 0.73463097, "epoch": 1.5486678028708964, "grad_norm": 6.1875, "learning_rate": 1.328771154309524e-06, "loss": 1.05741005, "memory(GiB)": 142.32, "step": 138460, "train_speed(iter/s)": 0.285679 }, { "acc": 0.72306476, "epoch": 1.548891501816855, "grad_norm": 5.625, "learning_rate": 1.3275158637283747e-06, "loss": 1.11734238, "memory(GiB)": 142.32, "step": 138480, "train_speed(iter/s)": 0.285692 }, { "acc": 0.74523067, "epoch": 1.5491152007628135, "grad_norm": 6.25, "learning_rate": 1.3262610755955468e-06, "loss": 1.02132111, "memory(GiB)": 142.32, "step": 138500, "train_speed(iter/s)": 0.285706 }, { "acc": 0.73965464, "epoch": 1.549338899708772, "grad_norm": 5.40625, "learning_rate": 1.3250067900827129e-06, "loss": 1.03409023, "memory(GiB)": 142.32, "step": 138520, "train_speed(iter/s)": 0.28572 }, { "acc": 0.73397751, "epoch": 1.5495625986547306, "grad_norm": 6.65625, "learning_rate": 1.3237530073614807e-06, "loss": 1.05820656, "memory(GiB)": 142.32, "step": 138540, "train_speed(iter/s)": 0.285733 }, { "acc": 0.73260508, "epoch": 1.549786297600689, "grad_norm": 4.9375, "learning_rate": 1.3224997276033797e-06, "loss": 1.06225395, "memory(GiB)": 142.32, "step": 138560, "train_speed(iter/s)": 0.285746 }, { "acc": 0.7400773, "epoch": 1.5500099965466476, "grad_norm": 6.75, "learning_rate": 1.321246950979881e-06, "loss": 1.0221221, "memory(GiB)": 142.32, "step": 138580, "train_speed(iter/s)": 0.28576 }, { "acc": 0.74371037, "epoch": 1.5502336954926061, "grad_norm": 6.25, "learning_rate": 1.319994677662379e-06, "loss": 1.00808601, "memory(GiB)": 142.32, "step": 138600, "train_speed(iter/s)": 0.285773 }, { "acc": 0.74769163, "epoch": 1.5504573944385647, "grad_norm": 5.90625, "learning_rate": 1.3187429078222063e-06, "loss": 1.00481529, "memory(GiB)": 142.32, "step": 138620, "train_speed(iter/s)": 0.285786 }, { "acc": 0.73452578, "epoch": 1.5506810933845232, "grad_norm": 6.84375, "learning_rate": 1.317491641630621e-06, "loss": 1.05739107, "memory(GiB)": 142.32, "step": 138640, "train_speed(iter/s)": 0.2858 }, { "acc": 0.73233976, "epoch": 1.5509047923304817, "grad_norm": 6.28125, "learning_rate": 1.3162408792588132e-06, "loss": 1.09112129, "memory(GiB)": 142.32, "step": 138660, "train_speed(iter/s)": 0.285815 }, { "acc": 0.7418602, "epoch": 1.5511284912764403, "grad_norm": 5.4375, "learning_rate": 1.3149906208779073e-06, "loss": 1.01924019, "memory(GiB)": 142.32, "step": 138680, "train_speed(iter/s)": 0.285828 }, { "acc": 0.74196129, "epoch": 1.5513521902223988, "grad_norm": 4.78125, "learning_rate": 1.313740866658954e-06, "loss": 1.01917839, "memory(GiB)": 142.32, "step": 138700, "train_speed(iter/s)": 0.285843 }, { "acc": 0.74252033, "epoch": 1.5515758891683573, "grad_norm": 5.71875, "learning_rate": 1.3124916167729407e-06, "loss": 1.0177618, "memory(GiB)": 142.32, "step": 138720, "train_speed(iter/s)": 0.285856 }, { "acc": 0.73724308, "epoch": 1.5517995881143158, "grad_norm": 5.0625, "learning_rate": 1.3112428713907804e-06, "loss": 1.04533291, "memory(GiB)": 142.32, "step": 138740, "train_speed(iter/s)": 0.285872 }, { "acc": 0.74009743, "epoch": 1.5520232870602744, "grad_norm": 7.125, "learning_rate": 1.3099946306833184e-06, "loss": 1.03572464, "memory(GiB)": 142.32, "step": 138760, "train_speed(iter/s)": 0.285885 }, { "acc": 0.75179095, "epoch": 1.552246986006233, "grad_norm": 5.0625, "learning_rate": 1.308746894821335e-06, "loss": 0.97101498, "memory(GiB)": 142.32, "step": 138780, "train_speed(iter/s)": 0.2859 }, { "acc": 0.74318371, "epoch": 1.5524706849521914, "grad_norm": 8.1875, "learning_rate": 1.307499663975535e-06, "loss": 1.01980343, "memory(GiB)": 142.32, "step": 138800, "train_speed(iter/s)": 0.285913 }, { "acc": 0.72696848, "epoch": 1.55269438389815, "grad_norm": 6.6875, "learning_rate": 1.30625293831656e-06, "loss": 1.09220343, "memory(GiB)": 142.32, "step": 138820, "train_speed(iter/s)": 0.285926 }, { "acc": 0.72178173, "epoch": 1.5529180828441085, "grad_norm": 6.28125, "learning_rate": 1.3050067180149794e-06, "loss": 1.12008381, "memory(GiB)": 142.32, "step": 138840, "train_speed(iter/s)": 0.285939 }, { "acc": 0.72340841, "epoch": 1.553141781790067, "grad_norm": 6.5625, "learning_rate": 1.3037610032412917e-06, "loss": 1.10322437, "memory(GiB)": 142.32, "step": 138860, "train_speed(iter/s)": 0.285952 }, { "acc": 0.72633381, "epoch": 1.5533654807360255, "grad_norm": 6.125, "learning_rate": 1.3025157941659316e-06, "loss": 1.09898491, "memory(GiB)": 142.32, "step": 138880, "train_speed(iter/s)": 0.285965 }, { "acc": 0.73579388, "epoch": 1.553589179681984, "grad_norm": 5.90625, "learning_rate": 1.3012710909592586e-06, "loss": 1.04206314, "memory(GiB)": 142.32, "step": 138900, "train_speed(iter/s)": 0.285979 }, { "acc": 0.73262877, "epoch": 1.5538128786279426, "grad_norm": 5.25, "learning_rate": 1.3000268937915689e-06, "loss": 1.07032261, "memory(GiB)": 142.32, "step": 138920, "train_speed(iter/s)": 0.285992 }, { "acc": 0.73606329, "epoch": 1.5540365775739011, "grad_norm": 6.53125, "learning_rate": 1.2987832028330849e-06, "loss": 1.04196281, "memory(GiB)": 142.32, "step": 138940, "train_speed(iter/s)": 0.286006 }, { "acc": 0.7470665, "epoch": 1.5542602765198597, "grad_norm": 5.65625, "learning_rate": 1.29754001825396e-06, "loss": 0.99114571, "memory(GiB)": 142.32, "step": 138960, "train_speed(iter/s)": 0.286018 }, { "acc": 0.73063793, "epoch": 1.5544839754658182, "grad_norm": 4.53125, "learning_rate": 1.2962973402242823e-06, "loss": 1.09282293, "memory(GiB)": 142.32, "step": 138980, "train_speed(iter/s)": 0.286032 }, { "acc": 0.73736644, "epoch": 1.5547076744117767, "grad_norm": 5.75, "learning_rate": 1.2950551689140651e-06, "loss": 1.04364204, "memory(GiB)": 142.32, "step": 139000, "train_speed(iter/s)": 0.286045 }, { "acc": 0.72601442, "epoch": 1.5549313733577352, "grad_norm": 6.78125, "learning_rate": 1.293813504493258e-06, "loss": 1.09384651, "memory(GiB)": 142.32, "step": 139020, "train_speed(iter/s)": 0.28606 }, { "acc": 0.7448741, "epoch": 1.5551550723036938, "grad_norm": 5.9375, "learning_rate": 1.2925723471317374e-06, "loss": 1.01067696, "memory(GiB)": 142.32, "step": 139040, "train_speed(iter/s)": 0.286074 }, { "acc": 0.73920832, "epoch": 1.5553787712496523, "grad_norm": 6.375, "learning_rate": 1.2913316969993096e-06, "loss": 1.03185959, "memory(GiB)": 142.32, "step": 139060, "train_speed(iter/s)": 0.286087 }, { "acc": 0.73759999, "epoch": 1.5556024701956108, "grad_norm": 6.03125, "learning_rate": 1.2900915542657155e-06, "loss": 1.04534607, "memory(GiB)": 142.32, "step": 139080, "train_speed(iter/s)": 0.286101 }, { "acc": 0.74051762, "epoch": 1.5558261691415693, "grad_norm": 6.3125, "learning_rate": 1.2888519191006227e-06, "loss": 1.04482965, "memory(GiB)": 142.32, "step": 139100, "train_speed(iter/s)": 0.286115 }, { "acc": 0.73030863, "epoch": 1.5560498680875279, "grad_norm": 5.84375, "learning_rate": 1.2876127916736335e-06, "loss": 1.05266981, "memory(GiB)": 142.32, "step": 139120, "train_speed(iter/s)": 0.286129 }, { "acc": 0.73366222, "epoch": 1.5562735670334864, "grad_norm": 6.25, "learning_rate": 1.2863741721542767e-06, "loss": 1.07391624, "memory(GiB)": 142.32, "step": 139140, "train_speed(iter/s)": 0.286143 }, { "acc": 0.74245682, "epoch": 1.556497265979445, "grad_norm": 6.8125, "learning_rate": 1.2851360607120112e-06, "loss": 1.02919998, "memory(GiB)": 142.32, "step": 139160, "train_speed(iter/s)": 0.286159 }, { "acc": 0.73306437, "epoch": 1.5567209649254035, "grad_norm": 6.34375, "learning_rate": 1.2838984575162316e-06, "loss": 1.07097015, "memory(GiB)": 142.32, "step": 139180, "train_speed(iter/s)": 0.286173 }, { "acc": 0.73332739, "epoch": 1.556944663871362, "grad_norm": 5.28125, "learning_rate": 1.2826613627362572e-06, "loss": 1.05497875, "memory(GiB)": 142.32, "step": 139200, "train_speed(iter/s)": 0.286186 }, { "acc": 0.74316778, "epoch": 1.5571683628173205, "grad_norm": 6.25, "learning_rate": 1.281424776541343e-06, "loss": 1.0225584, "memory(GiB)": 142.32, "step": 139220, "train_speed(iter/s)": 0.286199 }, { "acc": 0.74288378, "epoch": 1.557392061763279, "grad_norm": 7.40625, "learning_rate": 1.2801886991006695e-06, "loss": 1.02073717, "memory(GiB)": 142.32, "step": 139240, "train_speed(iter/s)": 0.286214 }, { "acc": 0.73192291, "epoch": 1.5576157607092376, "grad_norm": 5.9375, "learning_rate": 1.2789531305833497e-06, "loss": 1.08417969, "memory(GiB)": 142.32, "step": 139260, "train_speed(iter/s)": 0.286229 }, { "acc": 0.74134989, "epoch": 1.557839459655196, "grad_norm": 5.9375, "learning_rate": 1.2777180711584287e-06, "loss": 1.02896461, "memory(GiB)": 142.32, "step": 139280, "train_speed(iter/s)": 0.286243 }, { "acc": 0.73566585, "epoch": 1.5580631586011546, "grad_norm": 7.34375, "learning_rate": 1.2764835209948772e-06, "loss": 1.03761511, "memory(GiB)": 142.32, "step": 139300, "train_speed(iter/s)": 0.28626 }, { "acc": 0.74688435, "epoch": 1.5582868575471132, "grad_norm": 4.5, "learning_rate": 1.2752494802616034e-06, "loss": 1.00133991, "memory(GiB)": 142.32, "step": 139320, "train_speed(iter/s)": 0.286274 }, { "acc": 0.73006558, "epoch": 1.5585105564930717, "grad_norm": 8.6875, "learning_rate": 1.2740159491274394e-06, "loss": 1.06986942, "memory(GiB)": 142.32, "step": 139340, "train_speed(iter/s)": 0.286289 }, { "acc": 0.72209778, "epoch": 1.5587342554390302, "grad_norm": 6.4375, "learning_rate": 1.2727829277611492e-06, "loss": 1.09481144, "memory(GiB)": 142.32, "step": 139360, "train_speed(iter/s)": 0.286302 }, { "acc": 0.74636378, "epoch": 1.5589579543849887, "grad_norm": 7.1875, "learning_rate": 1.2715504163314295e-06, "loss": 0.99358177, "memory(GiB)": 142.32, "step": 139380, "train_speed(iter/s)": 0.286316 }, { "acc": 0.73092861, "epoch": 1.5591816533309473, "grad_norm": 6.78125, "learning_rate": 1.2703184150069037e-06, "loss": 1.07784519, "memory(GiB)": 142.32, "step": 139400, "train_speed(iter/s)": 0.286331 }, { "acc": 0.73215966, "epoch": 1.5594053522769058, "grad_norm": 6.40625, "learning_rate": 1.2690869239561293e-06, "loss": 1.06972885, "memory(GiB)": 142.32, "step": 139420, "train_speed(iter/s)": 0.286345 }, { "acc": 0.72739749, "epoch": 1.5596290512228643, "grad_norm": 4.75, "learning_rate": 1.2678559433475911e-06, "loss": 1.08338451, "memory(GiB)": 142.32, "step": 139440, "train_speed(iter/s)": 0.28636 }, { "acc": 0.73361759, "epoch": 1.5598527501688229, "grad_norm": 7.4375, "learning_rate": 1.266625473349703e-06, "loss": 1.05781078, "memory(GiB)": 142.32, "step": 139460, "train_speed(iter/s)": 0.286374 }, { "acc": 0.74777827, "epoch": 1.5600764491147814, "grad_norm": 4.65625, "learning_rate": 1.2653955141308132e-06, "loss": 0.9988205, "memory(GiB)": 142.32, "step": 139480, "train_speed(iter/s)": 0.286387 }, { "acc": 0.75157919, "epoch": 1.56030014806074, "grad_norm": 7.34375, "learning_rate": 1.2641660658591959e-06, "loss": 0.97849998, "memory(GiB)": 142.32, "step": 139500, "train_speed(iter/s)": 0.286401 }, { "acc": 0.74076738, "epoch": 1.5605238470066984, "grad_norm": 7.125, "learning_rate": 1.2629371287030596e-06, "loss": 1.04512596, "memory(GiB)": 142.32, "step": 139520, "train_speed(iter/s)": 0.286414 }, { "acc": 0.73613148, "epoch": 1.560747545952657, "grad_norm": 6.5, "learning_rate": 1.2617087028305392e-06, "loss": 1.0359148, "memory(GiB)": 142.32, "step": 139540, "train_speed(iter/s)": 0.286428 }, { "acc": 0.74089899, "epoch": 1.5609712448986155, "grad_norm": 5.8125, "learning_rate": 1.2604807884096986e-06, "loss": 1.02078581, "memory(GiB)": 142.32, "step": 139560, "train_speed(iter/s)": 0.286441 }, { "acc": 0.74141083, "epoch": 1.561194943844574, "grad_norm": 5.1875, "learning_rate": 1.259253385608538e-06, "loss": 1.02119007, "memory(GiB)": 142.32, "step": 139580, "train_speed(iter/s)": 0.286455 }, { "acc": 0.73876438, "epoch": 1.5614186427905326, "grad_norm": 6.125, "learning_rate": 1.2580264945949805e-06, "loss": 1.03747578, "memory(GiB)": 142.32, "step": 139600, "train_speed(iter/s)": 0.286469 }, { "acc": 0.73745842, "epoch": 1.561642341736491, "grad_norm": 6.46875, "learning_rate": 1.2568001155368853e-06, "loss": 1.04613686, "memory(GiB)": 142.32, "step": 139620, "train_speed(iter/s)": 0.286482 }, { "acc": 0.72968893, "epoch": 1.5618660406824496, "grad_norm": 5.625, "learning_rate": 1.2555742486020368e-06, "loss": 1.07240734, "memory(GiB)": 142.32, "step": 139640, "train_speed(iter/s)": 0.286496 }, { "acc": 0.72899199, "epoch": 1.5620897396284081, "grad_norm": 5.15625, "learning_rate": 1.25434889395815e-06, "loss": 1.07982407, "memory(GiB)": 142.32, "step": 139660, "train_speed(iter/s)": 0.286508 }, { "acc": 0.73008022, "epoch": 1.5623134385743667, "grad_norm": 5.09375, "learning_rate": 1.2531240517728731e-06, "loss": 1.07857323, "memory(GiB)": 142.32, "step": 139680, "train_speed(iter/s)": 0.286523 }, { "acc": 0.73765936, "epoch": 1.5625371375203252, "grad_norm": 6.53125, "learning_rate": 1.2518997222137802e-06, "loss": 1.03609734, "memory(GiB)": 142.32, "step": 139700, "train_speed(iter/s)": 0.286537 }, { "acc": 0.74779339, "epoch": 1.5627608364662837, "grad_norm": 5.46875, "learning_rate": 1.2506759054483802e-06, "loss": 1.00590105, "memory(GiB)": 142.32, "step": 139720, "train_speed(iter/s)": 0.286552 }, { "acc": 0.73575563, "epoch": 1.5629845354122422, "grad_norm": 5.0625, "learning_rate": 1.2494526016441044e-06, "loss": 1.05450087, "memory(GiB)": 142.32, "step": 139740, "train_speed(iter/s)": 0.286567 }, { "acc": 0.7382906, "epoch": 1.5632082343582008, "grad_norm": 7.46875, "learning_rate": 1.2482298109683216e-06, "loss": 1.0400713, "memory(GiB)": 142.32, "step": 139760, "train_speed(iter/s)": 0.28658 }, { "acc": 0.72407627, "epoch": 1.5634319333041593, "grad_norm": 6.5625, "learning_rate": 1.2470075335883258e-06, "loss": 1.10670948, "memory(GiB)": 142.32, "step": 139780, "train_speed(iter/s)": 0.286592 }, { "acc": 0.73953018, "epoch": 1.5636556322501178, "grad_norm": 5.84375, "learning_rate": 1.2457857696713405e-06, "loss": 1.03645153, "memory(GiB)": 142.32, "step": 139800, "train_speed(iter/s)": 0.286607 }, { "acc": 0.7442008, "epoch": 1.5638793311960764, "grad_norm": 7.8125, "learning_rate": 1.2445645193845236e-06, "loss": 1.03161049, "memory(GiB)": 142.32, "step": 139820, "train_speed(iter/s)": 0.286622 }, { "acc": 0.73312273, "epoch": 1.564103030142035, "grad_norm": 6.34375, "learning_rate": 1.2433437828949562e-06, "loss": 1.07705708, "memory(GiB)": 142.32, "step": 139840, "train_speed(iter/s)": 0.286634 }, { "acc": 0.75086479, "epoch": 1.5643267290879934, "grad_norm": 5.4375, "learning_rate": 1.2421235603696558e-06, "loss": 0.96986465, "memory(GiB)": 142.32, "step": 139860, "train_speed(iter/s)": 0.286647 }, { "acc": 0.73773103, "epoch": 1.564550428033952, "grad_norm": 5.6875, "learning_rate": 1.240903851975565e-06, "loss": 1.0326931, "memory(GiB)": 142.32, "step": 139880, "train_speed(iter/s)": 0.286661 }, { "acc": 0.7460372, "epoch": 1.5647741269799105, "grad_norm": 6.875, "learning_rate": 1.239684657879555e-06, "loss": 0.99059772, "memory(GiB)": 142.32, "step": 139900, "train_speed(iter/s)": 0.286675 }, { "acc": 0.73034954, "epoch": 1.564997825925869, "grad_norm": 4.96875, "learning_rate": 1.2384659782484338e-06, "loss": 1.05524387, "memory(GiB)": 142.32, "step": 139920, "train_speed(iter/s)": 0.286689 }, { "acc": 0.73610125, "epoch": 1.5652215248718275, "grad_norm": 6.34375, "learning_rate": 1.2372478132489291e-06, "loss": 1.05062065, "memory(GiB)": 142.32, "step": 139940, "train_speed(iter/s)": 0.286704 }, { "acc": 0.73133454, "epoch": 1.565445223817786, "grad_norm": 4.9375, "learning_rate": 1.2360301630477074e-06, "loss": 1.07349596, "memory(GiB)": 142.32, "step": 139960, "train_speed(iter/s)": 0.286717 }, { "acc": 0.72466578, "epoch": 1.5656689227637446, "grad_norm": 6.03125, "learning_rate": 1.234813027811359e-06, "loss": 1.08670349, "memory(GiB)": 142.32, "step": 139980, "train_speed(iter/s)": 0.286732 }, { "acc": 0.7408421, "epoch": 1.5658926217097031, "grad_norm": 6.90625, "learning_rate": 1.2335964077064034e-06, "loss": 1.02778282, "memory(GiB)": 142.32, "step": 140000, "train_speed(iter/s)": 0.286746 }, { "epoch": 1.5658926217097031, "eval_acc": 0.6963638590052565, "eval_loss": 1.0714579820632935, "eval_runtime": 2342.6197, "eval_samples_per_second": 32.136, "eval_steps_per_second": 16.068, "step": 140000 }, { "acc": 0.73695078, "epoch": 1.5661163206556616, "grad_norm": 5.5625, "learning_rate": 1.2323803028992953e-06, "loss": 1.04395218, "memory(GiB)": 142.32, "step": 140020, "train_speed(iter/s)": 0.285361 }, { "acc": 0.74466271, "epoch": 1.5663400196016202, "grad_norm": 5.78125, "learning_rate": 1.2311647135564119e-06, "loss": 1.02511435, "memory(GiB)": 142.32, "step": 140040, "train_speed(iter/s)": 0.285376 }, { "acc": 0.74408102, "epoch": 1.5665637185475787, "grad_norm": 7.375, "learning_rate": 1.2299496398440669e-06, "loss": 1.00970163, "memory(GiB)": 142.32, "step": 140060, "train_speed(iter/s)": 0.28539 }, { "acc": 0.73496885, "epoch": 1.5667874174935372, "grad_norm": 6.1875, "learning_rate": 1.2287350819284966e-06, "loss": 1.03136101, "memory(GiB)": 142.32, "step": 140080, "train_speed(iter/s)": 0.285405 }, { "acc": 0.73266039, "epoch": 1.5670111164394958, "grad_norm": 4.96875, "learning_rate": 1.2275210399758703e-06, "loss": 1.06558237, "memory(GiB)": 142.32, "step": 140100, "train_speed(iter/s)": 0.28542 }, { "acc": 0.73290682, "epoch": 1.5672348153854543, "grad_norm": 5.8125, "learning_rate": 1.2263075141522878e-06, "loss": 1.07209663, "memory(GiB)": 142.32, "step": 140120, "train_speed(iter/s)": 0.285435 }, { "acc": 0.7391789, "epoch": 1.5674585143314128, "grad_norm": 6.625, "learning_rate": 1.2250945046237744e-06, "loss": 1.03864079, "memory(GiB)": 142.32, "step": 140140, "train_speed(iter/s)": 0.28545 }, { "acc": 0.73649397, "epoch": 1.5676822132773713, "grad_norm": 5.5, "learning_rate": 1.2238820115562899e-06, "loss": 1.04465027, "memory(GiB)": 142.32, "step": 140160, "train_speed(iter/s)": 0.285465 }, { "acc": 0.73702335, "epoch": 1.5679059122233299, "grad_norm": 5.59375, "learning_rate": 1.22267003511572e-06, "loss": 1.04788761, "memory(GiB)": 142.32, "step": 140180, "train_speed(iter/s)": 0.285479 }, { "acc": 0.73528557, "epoch": 1.5681296111692884, "grad_norm": 7.125, "learning_rate": 1.2214585754678782e-06, "loss": 1.06336021, "memory(GiB)": 142.32, "step": 140200, "train_speed(iter/s)": 0.285494 }, { "acc": 0.72591715, "epoch": 1.568353310115247, "grad_norm": 7.28125, "learning_rate": 1.2202476327785118e-06, "loss": 1.119841, "memory(GiB)": 142.32, "step": 140220, "train_speed(iter/s)": 0.285509 }, { "acc": 0.72269711, "epoch": 1.5685770090612055, "grad_norm": 6.53125, "learning_rate": 1.219037207213294e-06, "loss": 1.10066433, "memory(GiB)": 142.32, "step": 140240, "train_speed(iter/s)": 0.285523 }, { "acc": 0.73356972, "epoch": 1.568800708007164, "grad_norm": 8.125, "learning_rate": 1.2178272989378293e-06, "loss": 1.05846539, "memory(GiB)": 142.32, "step": 140260, "train_speed(iter/s)": 0.285536 }, { "acc": 0.72780075, "epoch": 1.5690244069531225, "grad_norm": 6.4375, "learning_rate": 1.21661790811765e-06, "loss": 1.08641958, "memory(GiB)": 142.32, "step": 140280, "train_speed(iter/s)": 0.285549 }, { "acc": 0.74885645, "epoch": 1.569248105899081, "grad_norm": 7.9375, "learning_rate": 1.2154090349182163e-06, "loss": 1.00357475, "memory(GiB)": 142.32, "step": 140300, "train_speed(iter/s)": 0.285564 }, { "acc": 0.74654884, "epoch": 1.5694718048450396, "grad_norm": 7.28125, "learning_rate": 1.2142006795049227e-06, "loss": 1.01626682, "memory(GiB)": 142.32, "step": 140320, "train_speed(iter/s)": 0.285579 }, { "acc": 0.73523788, "epoch": 1.569695503790998, "grad_norm": 4.9375, "learning_rate": 1.212992842043086e-06, "loss": 1.04792271, "memory(GiB)": 142.32, "step": 140340, "train_speed(iter/s)": 0.285592 }, { "acc": 0.74634156, "epoch": 1.5699192027369566, "grad_norm": 5.25, "learning_rate": 1.2117855226979585e-06, "loss": 1.00826092, "memory(GiB)": 142.32, "step": 140360, "train_speed(iter/s)": 0.285607 }, { "acc": 0.74245338, "epoch": 1.5701429016829151, "grad_norm": 5.0, "learning_rate": 1.210578721634718e-06, "loss": 1.01818752, "memory(GiB)": 142.32, "step": 140380, "train_speed(iter/s)": 0.28562 }, { "acc": 0.74062204, "epoch": 1.5703666006288737, "grad_norm": 5.71875, "learning_rate": 1.2093724390184703e-06, "loss": 1.02849588, "memory(GiB)": 142.32, "step": 140400, "train_speed(iter/s)": 0.285634 }, { "acc": 0.74365125, "epoch": 1.5705902995748322, "grad_norm": 6.59375, "learning_rate": 1.2081666750142546e-06, "loss": 1.02105484, "memory(GiB)": 142.32, "step": 140420, "train_speed(iter/s)": 0.285649 }, { "acc": 0.73890119, "epoch": 1.5708139985207907, "grad_norm": 5.21875, "learning_rate": 1.2069614297870342e-06, "loss": 1.03905907, "memory(GiB)": 142.32, "step": 140440, "train_speed(iter/s)": 0.285663 }, { "acc": 0.73163462, "epoch": 1.5710376974667493, "grad_norm": 5.15625, "learning_rate": 1.2057567035017064e-06, "loss": 1.07863865, "memory(GiB)": 142.32, "step": 140460, "train_speed(iter/s)": 0.285676 }, { "acc": 0.74153199, "epoch": 1.5712613964127078, "grad_norm": 6.3125, "learning_rate": 1.2045524963230943e-06, "loss": 1.03031616, "memory(GiB)": 142.32, "step": 140480, "train_speed(iter/s)": 0.28569 }, { "acc": 0.74458637, "epoch": 1.5714850953586663, "grad_norm": 7.25, "learning_rate": 1.2033488084159484e-06, "loss": 1.04057293, "memory(GiB)": 142.32, "step": 140500, "train_speed(iter/s)": 0.285705 }, { "acc": 0.73823309, "epoch": 1.5717087943046248, "grad_norm": 7.0, "learning_rate": 1.2021456399449537e-06, "loss": 1.03959866, "memory(GiB)": 142.32, "step": 140520, "train_speed(iter/s)": 0.285719 }, { "acc": 0.72420902, "epoch": 1.5719324932505834, "grad_norm": 6.15625, "learning_rate": 1.2009429910747178e-06, "loss": 1.10850821, "memory(GiB)": 142.32, "step": 140540, "train_speed(iter/s)": 0.285733 }, { "acc": 0.73650446, "epoch": 1.572156192196542, "grad_norm": 5.0625, "learning_rate": 1.199740861969783e-06, "loss": 1.0593008, "memory(GiB)": 142.32, "step": 140560, "train_speed(iter/s)": 0.285748 }, { "acc": 0.73700256, "epoch": 1.5723798911425004, "grad_norm": 6.4375, "learning_rate": 1.1985392527946172e-06, "loss": 1.04422064, "memory(GiB)": 142.32, "step": 140580, "train_speed(iter/s)": 0.285761 }, { "acc": 0.73704319, "epoch": 1.572603590088459, "grad_norm": 6.71875, "learning_rate": 1.197338163713615e-06, "loss": 1.04568014, "memory(GiB)": 142.32, "step": 140600, "train_speed(iter/s)": 0.285775 }, { "acc": 0.73858013, "epoch": 1.5728272890344175, "grad_norm": 6.5, "learning_rate": 1.1961375948911058e-06, "loss": 1.03863888, "memory(GiB)": 142.32, "step": 140620, "train_speed(iter/s)": 0.285789 }, { "acc": 0.73279009, "epoch": 1.573050987980376, "grad_norm": 5.65625, "learning_rate": 1.1949375464913427e-06, "loss": 1.06904917, "memory(GiB)": 142.32, "step": 140640, "train_speed(iter/s)": 0.285803 }, { "acc": 0.73762388, "epoch": 1.5732746869263345, "grad_norm": 7.34375, "learning_rate": 1.1937380186785108e-06, "loss": 1.04340267, "memory(GiB)": 142.32, "step": 140660, "train_speed(iter/s)": 0.285817 }, { "acc": 0.74342327, "epoch": 1.573498385872293, "grad_norm": 5.8125, "learning_rate": 1.1925390116167223e-06, "loss": 1.00924463, "memory(GiB)": 142.32, "step": 140680, "train_speed(iter/s)": 0.285828 }, { "acc": 0.72720876, "epoch": 1.5737220848182516, "grad_norm": 7.15625, "learning_rate": 1.1913405254700168e-06, "loss": 1.09602346, "memory(GiB)": 142.32, "step": 140700, "train_speed(iter/s)": 0.285842 }, { "acc": 0.74963608, "epoch": 1.5739457837642101, "grad_norm": 5.96875, "learning_rate": 1.190142560402367e-06, "loss": 1.00062885, "memory(GiB)": 142.32, "step": 140720, "train_speed(iter/s)": 0.285855 }, { "acc": 0.74242764, "epoch": 1.5741694827101687, "grad_norm": 7.09375, "learning_rate": 1.1889451165776688e-06, "loss": 1.0203722, "memory(GiB)": 142.32, "step": 140740, "train_speed(iter/s)": 0.285866 }, { "acc": 0.74063983, "epoch": 1.5743931816561272, "grad_norm": 5.90625, "learning_rate": 1.1877481941597523e-06, "loss": 1.03661089, "memory(GiB)": 142.32, "step": 140760, "train_speed(iter/s)": 0.285881 }, { "acc": 0.74777517, "epoch": 1.5746168806020857, "grad_norm": 6.46875, "learning_rate": 1.1865517933123732e-06, "loss": 0.98698826, "memory(GiB)": 142.32, "step": 140780, "train_speed(iter/s)": 0.285896 }, { "acc": 0.73457222, "epoch": 1.5748405795480442, "grad_norm": 6.90625, "learning_rate": 1.1853559141992138e-06, "loss": 1.06743851, "memory(GiB)": 142.32, "step": 140800, "train_speed(iter/s)": 0.285909 }, { "acc": 0.73075886, "epoch": 1.5750642784940028, "grad_norm": 6.5, "learning_rate": 1.1841605569838905e-06, "loss": 1.07661209, "memory(GiB)": 142.32, "step": 140820, "train_speed(iter/s)": 0.285923 }, { "acc": 0.73166552, "epoch": 1.5752879774399613, "grad_norm": 6.59375, "learning_rate": 1.1829657218299428e-06, "loss": 1.07416916, "memory(GiB)": 142.32, "step": 140840, "train_speed(iter/s)": 0.285936 }, { "acc": 0.74436798, "epoch": 1.5755116763859198, "grad_norm": 6.0625, "learning_rate": 1.1817714089008436e-06, "loss": 1.02614727, "memory(GiB)": 142.32, "step": 140860, "train_speed(iter/s)": 0.285948 }, { "acc": 0.73183432, "epoch": 1.5757353753318784, "grad_norm": 6.09375, "learning_rate": 1.1805776183599904e-06, "loss": 1.04885712, "memory(GiB)": 142.32, "step": 140880, "train_speed(iter/s)": 0.285963 }, { "acc": 0.73653793, "epoch": 1.5759590742778369, "grad_norm": 6.25, "learning_rate": 1.1793843503707115e-06, "loss": 1.04518032, "memory(GiB)": 142.32, "step": 140900, "train_speed(iter/s)": 0.285977 }, { "acc": 0.72848148, "epoch": 1.5761827732237954, "grad_norm": 5.5, "learning_rate": 1.178191605096261e-06, "loss": 1.09157238, "memory(GiB)": 142.32, "step": 140920, "train_speed(iter/s)": 0.285992 }, { "acc": 0.74434872, "epoch": 1.576406472169754, "grad_norm": 6.0, "learning_rate": 1.1769993826998267e-06, "loss": 1.00910501, "memory(GiB)": 142.32, "step": 140940, "train_speed(iter/s)": 0.286005 }, { "acc": 0.72790031, "epoch": 1.5766301711157125, "grad_norm": 6.3125, "learning_rate": 1.1758076833445203e-06, "loss": 1.08762398, "memory(GiB)": 142.32, "step": 140960, "train_speed(iter/s)": 0.286019 }, { "acc": 0.73105097, "epoch": 1.576853870061671, "grad_norm": 6.6875, "learning_rate": 1.1746165071933812e-06, "loss": 1.04949636, "memory(GiB)": 142.32, "step": 140980, "train_speed(iter/s)": 0.286032 }, { "acc": 0.7342, "epoch": 1.5770775690076295, "grad_norm": 6.5, "learning_rate": 1.173425854409383e-06, "loss": 1.07646341, "memory(GiB)": 142.32, "step": 141000, "train_speed(iter/s)": 0.286046 }, { "acc": 0.73030109, "epoch": 1.577301267953588, "grad_norm": 6.375, "learning_rate": 1.172235725155421e-06, "loss": 1.05432568, "memory(GiB)": 142.32, "step": 141020, "train_speed(iter/s)": 0.286057 }, { "acc": 0.7390523, "epoch": 1.5775249668995466, "grad_norm": 6.8125, "learning_rate": 1.1710461195943245e-06, "loss": 1.05306654, "memory(GiB)": 142.32, "step": 141040, "train_speed(iter/s)": 0.286071 }, { "acc": 0.72277441, "epoch": 1.577748665845505, "grad_norm": 6.125, "learning_rate": 1.1698570378888469e-06, "loss": 1.08789997, "memory(GiB)": 142.32, "step": 141060, "train_speed(iter/s)": 0.286086 }, { "acc": 0.72811856, "epoch": 1.5779723647914636, "grad_norm": 6.15625, "learning_rate": 1.1686684802016706e-06, "loss": 1.08119974, "memory(GiB)": 142.32, "step": 141080, "train_speed(iter/s)": 0.286101 }, { "acc": 0.74615507, "epoch": 1.5781960637374222, "grad_norm": 5.875, "learning_rate": 1.1674804466954099e-06, "loss": 1.00984478, "memory(GiB)": 142.32, "step": 141100, "train_speed(iter/s)": 0.286114 }, { "acc": 0.74754276, "epoch": 1.5784197626833807, "grad_norm": 5.90625, "learning_rate": 1.166292937532602e-06, "loss": 0.99551449, "memory(GiB)": 142.32, "step": 141120, "train_speed(iter/s)": 0.286128 }, { "acc": 0.73430047, "epoch": 1.5786434616293392, "grad_norm": 5.84375, "learning_rate": 1.1651059528757186e-06, "loss": 1.05189991, "memory(GiB)": 142.32, "step": 141140, "train_speed(iter/s)": 0.286142 }, { "acc": 0.73467689, "epoch": 1.5788671605752977, "grad_norm": 6.84375, "learning_rate": 1.1639194928871533e-06, "loss": 1.04943943, "memory(GiB)": 142.32, "step": 141160, "train_speed(iter/s)": 0.286156 }, { "acc": 0.74366131, "epoch": 1.5790908595212563, "grad_norm": 5.28125, "learning_rate": 1.1627335577292303e-06, "loss": 1.01265516, "memory(GiB)": 142.32, "step": 141180, "train_speed(iter/s)": 0.286171 }, { "acc": 0.73015289, "epoch": 1.5793145584672148, "grad_norm": 6.875, "learning_rate": 1.1615481475642053e-06, "loss": 1.06815662, "memory(GiB)": 142.32, "step": 141200, "train_speed(iter/s)": 0.286183 }, { "acc": 0.73844318, "epoch": 1.5795382574131733, "grad_norm": 6.59375, "learning_rate": 1.1603632625542565e-06, "loss": 1.02876911, "memory(GiB)": 142.32, "step": 141220, "train_speed(iter/s)": 0.286197 }, { "acc": 0.74524479, "epoch": 1.5797619563591319, "grad_norm": 6.0625, "learning_rate": 1.1591789028614964e-06, "loss": 1.00178785, "memory(GiB)": 142.32, "step": 141240, "train_speed(iter/s)": 0.286213 }, { "acc": 0.74000831, "epoch": 1.5799856553050904, "grad_norm": 5.75, "learning_rate": 1.1579950686479595e-06, "loss": 1.02411718, "memory(GiB)": 142.32, "step": 141260, "train_speed(iter/s)": 0.286227 }, { "acc": 0.73429804, "epoch": 1.580209354251049, "grad_norm": 5.59375, "learning_rate": 1.1568117600756112e-06, "loss": 1.05839157, "memory(GiB)": 142.32, "step": 141280, "train_speed(iter/s)": 0.286241 }, { "acc": 0.72913814, "epoch": 1.5804330531970074, "grad_norm": 5.71875, "learning_rate": 1.1556289773063468e-06, "loss": 1.08150158, "memory(GiB)": 142.32, "step": 141300, "train_speed(iter/s)": 0.286255 }, { "acc": 0.73450093, "epoch": 1.580656752142966, "grad_norm": 5.65625, "learning_rate": 1.154446720501986e-06, "loss": 1.04297047, "memory(GiB)": 142.32, "step": 141320, "train_speed(iter/s)": 0.286269 }, { "acc": 0.74174285, "epoch": 1.5808804510889245, "grad_norm": 6.34375, "learning_rate": 1.153264989824281e-06, "loss": 1.02572803, "memory(GiB)": 142.32, "step": 141340, "train_speed(iter/s)": 0.286284 }, { "acc": 0.73915854, "epoch": 1.581104150034883, "grad_norm": 5.09375, "learning_rate": 1.1520837854349077e-06, "loss": 1.04123898, "memory(GiB)": 142.32, "step": 141360, "train_speed(iter/s)": 0.286299 }, { "acc": 0.71963863, "epoch": 1.5813278489808416, "grad_norm": 5.84375, "learning_rate": 1.1509031074954707e-06, "loss": 1.12752228, "memory(GiB)": 142.32, "step": 141380, "train_speed(iter/s)": 0.286312 }, { "acc": 0.73418193, "epoch": 1.5815515479268, "grad_norm": 5.15625, "learning_rate": 1.149722956167506e-06, "loss": 1.08225775, "memory(GiB)": 142.32, "step": 141400, "train_speed(iter/s)": 0.286326 }, { "acc": 0.74429073, "epoch": 1.5817752468727586, "grad_norm": 5.0625, "learning_rate": 1.1485433316124728e-06, "loss": 1.0185421, "memory(GiB)": 142.32, "step": 141420, "train_speed(iter/s)": 0.28634 }, { "acc": 0.72764602, "epoch": 1.5819989458187171, "grad_norm": 4.8125, "learning_rate": 1.1473642339917635e-06, "loss": 1.08300104, "memory(GiB)": 142.32, "step": 141440, "train_speed(iter/s)": 0.286355 }, { "acc": 0.73705463, "epoch": 1.5822226447646757, "grad_norm": 4.96875, "learning_rate": 1.1461856634666935e-06, "loss": 1.02806473, "memory(GiB)": 142.32, "step": 141460, "train_speed(iter/s)": 0.286372 }, { "acc": 0.74232073, "epoch": 1.5824463437106342, "grad_norm": 6.15625, "learning_rate": 1.1450076201985072e-06, "loss": 1.01822243, "memory(GiB)": 142.32, "step": 141480, "train_speed(iter/s)": 0.286387 }, { "acc": 0.73278685, "epoch": 1.5826700426565927, "grad_norm": 5.59375, "learning_rate": 1.14383010434838e-06, "loss": 1.04840031, "memory(GiB)": 142.32, "step": 141500, "train_speed(iter/s)": 0.286401 }, { "acc": 0.73555346, "epoch": 1.5828937416025513, "grad_norm": 6.34375, "learning_rate": 1.1426531160774106e-06, "loss": 1.04800014, "memory(GiB)": 142.32, "step": 141520, "train_speed(iter/s)": 0.286415 }, { "acc": 0.73442411, "epoch": 1.5831174405485098, "grad_norm": 6.0, "learning_rate": 1.1414766555466311e-06, "loss": 1.06297016, "memory(GiB)": 142.32, "step": 141540, "train_speed(iter/s)": 0.286428 }, { "acc": 0.722299, "epoch": 1.5833411394944683, "grad_norm": 6.09375, "learning_rate": 1.1403007229169955e-06, "loss": 1.0946949, "memory(GiB)": 142.32, "step": 141560, "train_speed(iter/s)": 0.286442 }, { "acc": 0.74188423, "epoch": 1.5835648384404268, "grad_norm": 5.875, "learning_rate": 1.1391253183493877e-06, "loss": 1.03282223, "memory(GiB)": 142.32, "step": 141580, "train_speed(iter/s)": 0.286455 }, { "acc": 0.73486233, "epoch": 1.5837885373863854, "grad_norm": 5.34375, "learning_rate": 1.1379504420046222e-06, "loss": 1.04549932, "memory(GiB)": 142.32, "step": 141600, "train_speed(iter/s)": 0.286468 }, { "acc": 0.73823509, "epoch": 1.584012236332344, "grad_norm": 5.28125, "learning_rate": 1.1367760940434364e-06, "loss": 1.04963903, "memory(GiB)": 142.32, "step": 141620, "train_speed(iter/s)": 0.28648 }, { "acc": 0.74660759, "epoch": 1.5842359352783024, "grad_norm": 4.875, "learning_rate": 1.1356022746265005e-06, "loss": 0.99974136, "memory(GiB)": 142.32, "step": 141640, "train_speed(iter/s)": 0.286493 }, { "acc": 0.73993168, "epoch": 1.584459634224261, "grad_norm": 5.9375, "learning_rate": 1.1344289839144084e-06, "loss": 1.04242382, "memory(GiB)": 142.32, "step": 141660, "train_speed(iter/s)": 0.286506 }, { "acc": 0.74150944, "epoch": 1.5846833331702195, "grad_norm": 7.0, "learning_rate": 1.1332562220676818e-06, "loss": 1.04308348, "memory(GiB)": 142.32, "step": 141680, "train_speed(iter/s)": 0.28652 }, { "acc": 0.73633385, "epoch": 1.584907032116178, "grad_norm": 5.875, "learning_rate": 1.132083989246774e-06, "loss": 1.05443459, "memory(GiB)": 142.32, "step": 141700, "train_speed(iter/s)": 0.286533 }, { "acc": 0.72059193, "epoch": 1.5851307310621365, "grad_norm": 7.0, "learning_rate": 1.1309122856120597e-06, "loss": 1.10837078, "memory(GiB)": 142.32, "step": 141720, "train_speed(iter/s)": 0.286547 }, { "acc": 0.72460947, "epoch": 1.585354430008095, "grad_norm": 6.8125, "learning_rate": 1.1297411113238488e-06, "loss": 1.0892971, "memory(GiB)": 142.32, "step": 141740, "train_speed(iter/s)": 0.286562 }, { "acc": 0.72977953, "epoch": 1.5855781289540536, "grad_norm": 7.34375, "learning_rate": 1.1285704665423718e-06, "loss": 1.09808235, "memory(GiB)": 142.32, "step": 141760, "train_speed(iter/s)": 0.286575 }, { "acc": 0.73622561, "epoch": 1.5858018279000121, "grad_norm": 8.3125, "learning_rate": 1.1274003514277899e-06, "loss": 1.03637409, "memory(GiB)": 142.32, "step": 141780, "train_speed(iter/s)": 0.286588 }, { "acc": 0.71505585, "epoch": 1.5860255268459706, "grad_norm": 6.5625, "learning_rate": 1.1262307661401934e-06, "loss": 1.15004635, "memory(GiB)": 142.32, "step": 141800, "train_speed(iter/s)": 0.286602 }, { "acc": 0.7366416, "epoch": 1.5862492257919292, "grad_norm": 6.53125, "learning_rate": 1.125061710839595e-06, "loss": 1.05526066, "memory(GiB)": 142.32, "step": 141820, "train_speed(iter/s)": 0.286616 }, { "acc": 0.73526258, "epoch": 1.5864729247378877, "grad_norm": 6.34375, "learning_rate": 1.123893185685942e-06, "loss": 1.05762339, "memory(GiB)": 142.32, "step": 141840, "train_speed(iter/s)": 0.28663 }, { "acc": 0.72933459, "epoch": 1.5866966236838462, "grad_norm": 5.4375, "learning_rate": 1.1227251908391034e-06, "loss": 1.08869228, "memory(GiB)": 142.32, "step": 141860, "train_speed(iter/s)": 0.286642 }, { "acc": 0.74240751, "epoch": 1.5869203226298048, "grad_norm": 5.90625, "learning_rate": 1.1215577264588767e-06, "loss": 1.01660862, "memory(GiB)": 142.32, "step": 141880, "train_speed(iter/s)": 0.286656 }, { "acc": 0.74330506, "epoch": 1.5871440215757633, "grad_norm": 6.0, "learning_rate": 1.1203907927049901e-06, "loss": 1.03215694, "memory(GiB)": 142.32, "step": 141900, "train_speed(iter/s)": 0.28667 }, { "acc": 0.74231119, "epoch": 1.5873677205217218, "grad_norm": 6.53125, "learning_rate": 1.1192243897370937e-06, "loss": 1.0310339, "memory(GiB)": 142.32, "step": 141920, "train_speed(iter/s)": 0.286684 }, { "acc": 0.7375483, "epoch": 1.5875914194676803, "grad_norm": 6.28125, "learning_rate": 1.1180585177147712e-06, "loss": 1.03543034, "memory(GiB)": 142.32, "step": 141940, "train_speed(iter/s)": 0.286698 }, { "acc": 0.73050776, "epoch": 1.5878151184136389, "grad_norm": 7.1875, "learning_rate": 1.1168931767975295e-06, "loss": 1.07340679, "memory(GiB)": 142.32, "step": 141960, "train_speed(iter/s)": 0.286712 }, { "acc": 0.72989984, "epoch": 1.5880388173595974, "grad_norm": 6.375, "learning_rate": 1.1157283671448022e-06, "loss": 1.06813431, "memory(GiB)": 142.32, "step": 141980, "train_speed(iter/s)": 0.286726 }, { "acc": 0.73742342, "epoch": 1.588262516305556, "grad_norm": 5.21875, "learning_rate": 1.1145640889159548e-06, "loss": 1.03622379, "memory(GiB)": 142.32, "step": 142000, "train_speed(iter/s)": 0.28674 }, { "epoch": 1.588262516305556, "eval_acc": 0.6963563166170348, "eval_loss": 1.0714468955993652, "eval_runtime": 2340.2482, "eval_samples_per_second": 32.169, "eval_steps_per_second": 16.085, "step": 142000 }, { "acc": 0.74732165, "epoch": 1.5884862152515145, "grad_norm": 7.03125, "learning_rate": 1.1134003422702738e-06, "loss": 0.97492104, "memory(GiB)": 142.32, "step": 142020, "train_speed(iter/s)": 0.285378 }, { "acc": 0.74163818, "epoch": 1.588709914197473, "grad_norm": 5.84375, "learning_rate": 1.1122371273669802e-06, "loss": 1.02858725, "memory(GiB)": 142.32, "step": 142040, "train_speed(iter/s)": 0.28539 }, { "acc": 0.72987213, "epoch": 1.5889336131434315, "grad_norm": 7.125, "learning_rate": 1.1110744443652161e-06, "loss": 1.08418446, "memory(GiB)": 142.32, "step": 142060, "train_speed(iter/s)": 0.285405 }, { "acc": 0.74429216, "epoch": 1.58915731208939, "grad_norm": 6.09375, "learning_rate": 1.109912293424054e-06, "loss": 1.02897892, "memory(GiB)": 142.32, "step": 142080, "train_speed(iter/s)": 0.285419 }, { "acc": 0.73690901, "epoch": 1.5893810110353486, "grad_norm": 6.46875, "learning_rate": 1.1087506747024924e-06, "loss": 1.04810486, "memory(GiB)": 142.32, "step": 142100, "train_speed(iter/s)": 0.285433 }, { "acc": 0.74293957, "epoch": 1.589604709981307, "grad_norm": 5.71875, "learning_rate": 1.107589588359455e-06, "loss": 1.01545162, "memory(GiB)": 142.32, "step": 142120, "train_speed(iter/s)": 0.285446 }, { "acc": 0.73193293, "epoch": 1.5898284089272656, "grad_norm": 6.125, "learning_rate": 1.1064290345537992e-06, "loss": 1.05385647, "memory(GiB)": 142.32, "step": 142140, "train_speed(iter/s)": 0.285458 }, { "acc": 0.73538036, "epoch": 1.5900521078732242, "grad_norm": 4.71875, "learning_rate": 1.1052690134443022e-06, "loss": 1.04045792, "memory(GiB)": 142.32, "step": 142160, "train_speed(iter/s)": 0.285471 }, { "acc": 0.73846531, "epoch": 1.5902758068191827, "grad_norm": 6.4375, "learning_rate": 1.1041095251896738e-06, "loss": 1.03776636, "memory(GiB)": 142.32, "step": 142180, "train_speed(iter/s)": 0.285484 }, { "acc": 0.74028244, "epoch": 1.5904995057651412, "grad_norm": 7.3125, "learning_rate": 1.1029505699485482e-06, "loss": 1.03876762, "memory(GiB)": 142.32, "step": 142200, "train_speed(iter/s)": 0.285499 }, { "acc": 0.73495097, "epoch": 1.5907232047110997, "grad_norm": 5.125, "learning_rate": 1.101792147879484e-06, "loss": 1.06055918, "memory(GiB)": 142.32, "step": 142220, "train_speed(iter/s)": 0.285513 }, { "acc": 0.72279158, "epoch": 1.5909469036570583, "grad_norm": 6.0, "learning_rate": 1.1006342591409742e-06, "loss": 1.10496283, "memory(GiB)": 142.32, "step": 142240, "train_speed(iter/s)": 0.285525 }, { "acc": 0.72853355, "epoch": 1.5911706026030168, "grad_norm": 5.625, "learning_rate": 1.0994769038914304e-06, "loss": 1.08692389, "memory(GiB)": 142.32, "step": 142260, "train_speed(iter/s)": 0.285538 }, { "acc": 0.73522816, "epoch": 1.5913943015489753, "grad_norm": 5.34375, "learning_rate": 1.0983200822891998e-06, "loss": 1.06643467, "memory(GiB)": 142.32, "step": 142280, "train_speed(iter/s)": 0.28555 }, { "acc": 0.73612299, "epoch": 1.5916180004949338, "grad_norm": 5.71875, "learning_rate": 1.097163794492549e-06, "loss": 1.04080105, "memory(GiB)": 142.32, "step": 142300, "train_speed(iter/s)": 0.285563 }, { "acc": 0.73356934, "epoch": 1.5918416994408924, "grad_norm": 5.8125, "learning_rate": 1.0960080406596747e-06, "loss": 1.06548252, "memory(GiB)": 142.32, "step": 142320, "train_speed(iter/s)": 0.285576 }, { "acc": 0.74804506, "epoch": 1.592065398386851, "grad_norm": 5.84375, "learning_rate": 1.0948528209487026e-06, "loss": 1.00125141, "memory(GiB)": 142.32, "step": 142340, "train_speed(iter/s)": 0.28559 }, { "acc": 0.75031891, "epoch": 1.5922890973328094, "grad_norm": 6.65625, "learning_rate": 1.0936981355176802e-06, "loss": 0.9805542, "memory(GiB)": 142.32, "step": 142360, "train_speed(iter/s)": 0.285604 }, { "acc": 0.71668482, "epoch": 1.592512796278768, "grad_norm": 6.34375, "learning_rate": 1.0925439845245883e-06, "loss": 1.13867626, "memory(GiB)": 142.32, "step": 142380, "train_speed(iter/s)": 0.285618 }, { "acc": 0.75151186, "epoch": 1.5927364952247265, "grad_norm": 5.8125, "learning_rate": 1.0913903681273303e-06, "loss": 0.98466768, "memory(GiB)": 142.32, "step": 142400, "train_speed(iter/s)": 0.285632 }, { "acc": 0.73150406, "epoch": 1.592960194170685, "grad_norm": 4.8125, "learning_rate": 1.0902372864837347e-06, "loss": 1.05951023, "memory(GiB)": 142.32, "step": 142420, "train_speed(iter/s)": 0.285645 }, { "acc": 0.73651886, "epoch": 1.5931838931166435, "grad_norm": 6.28125, "learning_rate": 1.0890847397515635e-06, "loss": 1.05662327, "memory(GiB)": 142.32, "step": 142440, "train_speed(iter/s)": 0.285659 }, { "acc": 0.73659253, "epoch": 1.593407592062602, "grad_norm": 7.0, "learning_rate": 1.0879327280884983e-06, "loss": 1.03249435, "memory(GiB)": 142.32, "step": 142460, "train_speed(iter/s)": 0.285672 }, { "acc": 0.74024391, "epoch": 1.5936312910085606, "grad_norm": 7.125, "learning_rate": 1.0867812516521537e-06, "loss": 1.02032738, "memory(GiB)": 142.32, "step": 142480, "train_speed(iter/s)": 0.285686 }, { "acc": 0.72557073, "epoch": 1.5938549899545191, "grad_norm": 5.625, "learning_rate": 1.0856303106000666e-06, "loss": 1.10652294, "memory(GiB)": 142.32, "step": 142500, "train_speed(iter/s)": 0.2857 }, { "acc": 0.72835369, "epoch": 1.5940786889004777, "grad_norm": 6.5625, "learning_rate": 1.0844799050897004e-06, "loss": 1.10425339, "memory(GiB)": 142.32, "step": 142520, "train_speed(iter/s)": 0.285715 }, { "acc": 0.74072056, "epoch": 1.5943023878464362, "grad_norm": 5.8125, "learning_rate": 1.083330035278451e-06, "loss": 1.02114115, "memory(GiB)": 142.32, "step": 142540, "train_speed(iter/s)": 0.285728 }, { "acc": 0.74081388, "epoch": 1.5945260867923947, "grad_norm": 6.21875, "learning_rate": 1.082180701323633e-06, "loss": 1.02740536, "memory(GiB)": 142.32, "step": 142560, "train_speed(iter/s)": 0.285742 }, { "acc": 0.73393497, "epoch": 1.5947497857383532, "grad_norm": 6.59375, "learning_rate": 1.081031903382495e-06, "loss": 1.05598335, "memory(GiB)": 142.32, "step": 142580, "train_speed(iter/s)": 0.285755 }, { "acc": 0.73867793, "epoch": 1.5949734846843118, "grad_norm": 5.59375, "learning_rate": 1.0798836416122078e-06, "loss": 1.02011795, "memory(GiB)": 142.32, "step": 142600, "train_speed(iter/s)": 0.285769 }, { "acc": 0.72998648, "epoch": 1.5951971836302703, "grad_norm": 6.1875, "learning_rate": 1.0787359161698684e-06, "loss": 1.0882576, "memory(GiB)": 142.32, "step": 142620, "train_speed(iter/s)": 0.285782 }, { "acc": 0.74243269, "epoch": 1.5954208825762288, "grad_norm": 5.8125, "learning_rate": 1.0775887272125046e-06, "loss": 1.01738911, "memory(GiB)": 142.32, "step": 142640, "train_speed(iter/s)": 0.285795 }, { "acc": 0.73524656, "epoch": 1.5956445815221874, "grad_norm": 5.4375, "learning_rate": 1.076442074897066e-06, "loss": 1.05190792, "memory(GiB)": 142.32, "step": 142660, "train_speed(iter/s)": 0.285807 }, { "acc": 0.73326507, "epoch": 1.5958682804681459, "grad_norm": 6.4375, "learning_rate": 1.0752959593804336e-06, "loss": 1.08263235, "memory(GiB)": 142.32, "step": 142680, "train_speed(iter/s)": 0.285821 }, { "acc": 0.74928608, "epoch": 1.5960919794141044, "grad_norm": 5.78125, "learning_rate": 1.0741503808194104e-06, "loss": 0.98290596, "memory(GiB)": 142.32, "step": 142700, "train_speed(iter/s)": 0.285835 }, { "acc": 0.75014124, "epoch": 1.596315678360063, "grad_norm": 5.09375, "learning_rate": 1.0730053393707274e-06, "loss": 0.98617249, "memory(GiB)": 142.32, "step": 142720, "train_speed(iter/s)": 0.285849 }, { "acc": 0.74315195, "epoch": 1.5965393773060215, "grad_norm": 6.78125, "learning_rate": 1.0718608351910453e-06, "loss": 1.01237965, "memory(GiB)": 142.32, "step": 142740, "train_speed(iter/s)": 0.285864 }, { "acc": 0.73464622, "epoch": 1.59676307625198, "grad_norm": 5.53125, "learning_rate": 1.070716868436945e-06, "loss": 1.06000776, "memory(GiB)": 142.32, "step": 142760, "train_speed(iter/s)": 0.285878 }, { "acc": 0.73277721, "epoch": 1.5969867751979385, "grad_norm": 5.8125, "learning_rate": 1.0695734392649415e-06, "loss": 1.0684721, "memory(GiB)": 142.32, "step": 142780, "train_speed(iter/s)": 0.285892 }, { "acc": 0.73963871, "epoch": 1.597210474143897, "grad_norm": 6.625, "learning_rate": 1.0684305478314693e-06, "loss": 1.04499607, "memory(GiB)": 142.32, "step": 142800, "train_speed(iter/s)": 0.285907 }, { "acc": 0.71854792, "epoch": 1.5974341730898556, "grad_norm": 7.5625, "learning_rate": 1.0672881942928926e-06, "loss": 1.11844702, "memory(GiB)": 142.32, "step": 142820, "train_speed(iter/s)": 0.285921 }, { "acc": 0.73129025, "epoch": 1.597657872035814, "grad_norm": 5.78125, "learning_rate": 1.0661463788055037e-06, "loss": 1.06995993, "memory(GiB)": 142.32, "step": 142840, "train_speed(iter/s)": 0.285936 }, { "acc": 0.7481771, "epoch": 1.5978815709817726, "grad_norm": 7.71875, "learning_rate": 1.0650051015255163e-06, "loss": 1.01043453, "memory(GiB)": 142.32, "step": 142860, "train_speed(iter/s)": 0.285948 }, { "acc": 0.73715329, "epoch": 1.5981052699277312, "grad_norm": 6.375, "learning_rate": 1.0638643626090766e-06, "loss": 1.05875072, "memory(GiB)": 142.32, "step": 142880, "train_speed(iter/s)": 0.285963 }, { "acc": 0.72018194, "epoch": 1.5983289688736897, "grad_norm": 6.1875, "learning_rate": 1.0627241622122525e-06, "loss": 1.12457943, "memory(GiB)": 142.32, "step": 142900, "train_speed(iter/s)": 0.285976 }, { "acc": 0.73902397, "epoch": 1.5985526678196482, "grad_norm": 6.21875, "learning_rate": 1.061584500491038e-06, "loss": 1.03875828, "memory(GiB)": 142.32, "step": 142920, "train_speed(iter/s)": 0.285991 }, { "acc": 0.73792562, "epoch": 1.5987763667656067, "grad_norm": 6.125, "learning_rate": 1.0604453776013585e-06, "loss": 1.04937458, "memory(GiB)": 142.32, "step": 142940, "train_speed(iter/s)": 0.286005 }, { "acc": 0.73012576, "epoch": 1.5990000657115653, "grad_norm": 6.09375, "learning_rate": 1.0593067936990586e-06, "loss": 1.06215439, "memory(GiB)": 142.32, "step": 142960, "train_speed(iter/s)": 0.286019 }, { "acc": 0.7452888, "epoch": 1.5992237646575238, "grad_norm": 5.78125, "learning_rate": 1.0581687489399167e-06, "loss": 1.0027791, "memory(GiB)": 142.32, "step": 142980, "train_speed(iter/s)": 0.286032 }, { "acc": 0.75017748, "epoch": 1.5994474636034823, "grad_norm": 5.96875, "learning_rate": 1.0570312434796315e-06, "loss": 0.97999287, "memory(GiB)": 142.32, "step": 143000, "train_speed(iter/s)": 0.286044 }, { "acc": 0.73398752, "epoch": 1.5996711625494409, "grad_norm": 6.03125, "learning_rate": 1.055894277473829e-06, "loss": 1.05743275, "memory(GiB)": 142.32, "step": 143020, "train_speed(iter/s)": 0.286059 }, { "acc": 0.73392687, "epoch": 1.5998948614953994, "grad_norm": 6.28125, "learning_rate": 1.0547578510780648e-06, "loss": 1.05077305, "memory(GiB)": 142.32, "step": 143040, "train_speed(iter/s)": 0.286072 }, { "acc": 0.74545865, "epoch": 1.600118560441358, "grad_norm": 5.71875, "learning_rate": 1.0536219644478157e-06, "loss": 1.00766182, "memory(GiB)": 142.32, "step": 143060, "train_speed(iter/s)": 0.286086 }, { "acc": 0.72864995, "epoch": 1.6003422593873164, "grad_norm": 5.65625, "learning_rate": 1.0524866177384896e-06, "loss": 1.08049088, "memory(GiB)": 142.32, "step": 143080, "train_speed(iter/s)": 0.286101 }, { "acc": 0.73220229, "epoch": 1.600565958333275, "grad_norm": 5.90625, "learning_rate": 1.0513518111054177e-06, "loss": 1.07952948, "memory(GiB)": 142.32, "step": 143100, "train_speed(iter/s)": 0.286115 }, { "acc": 0.72437129, "epoch": 1.6007896572792335, "grad_norm": 6.46875, "learning_rate": 1.050217544703856e-06, "loss": 1.10557261, "memory(GiB)": 142.32, "step": 143120, "train_speed(iter/s)": 0.28613 }, { "acc": 0.71940336, "epoch": 1.601013356225192, "grad_norm": 5.03125, "learning_rate": 1.0490838186889906e-06, "loss": 1.12255497, "memory(GiB)": 142.32, "step": 143140, "train_speed(iter/s)": 0.286143 }, { "acc": 0.73951025, "epoch": 1.6012370551711506, "grad_norm": 5.875, "learning_rate": 1.047950633215929e-06, "loss": 1.02743187, "memory(GiB)": 142.32, "step": 143160, "train_speed(iter/s)": 0.286158 }, { "acc": 0.73032894, "epoch": 1.601460754117109, "grad_norm": 4.71875, "learning_rate": 1.04681798843971e-06, "loss": 1.06729851, "memory(GiB)": 142.32, "step": 143180, "train_speed(iter/s)": 0.286171 }, { "acc": 0.73326783, "epoch": 1.6016844530630676, "grad_norm": 7.0625, "learning_rate": 1.045685884515294e-06, "loss": 1.06214352, "memory(GiB)": 142.32, "step": 143200, "train_speed(iter/s)": 0.286184 }, { "acc": 0.72486019, "epoch": 1.6019081520090261, "grad_norm": 4.78125, "learning_rate": 1.0445543215975683e-06, "loss": 1.1095314, "memory(GiB)": 142.32, "step": 143220, "train_speed(iter/s)": 0.286198 }, { "acc": 0.73609295, "epoch": 1.6021318509549847, "grad_norm": 6.53125, "learning_rate": 1.043423299841349e-06, "loss": 1.05449581, "memory(GiB)": 142.32, "step": 143240, "train_speed(iter/s)": 0.286212 }, { "acc": 0.73843832, "epoch": 1.6023555499009432, "grad_norm": 5.25, "learning_rate": 1.0422928194013732e-06, "loss": 1.04258862, "memory(GiB)": 142.32, "step": 143260, "train_speed(iter/s)": 0.286226 }, { "acc": 0.73460855, "epoch": 1.6025792488469017, "grad_norm": 5.75, "learning_rate": 1.041162880432311e-06, "loss": 1.04368582, "memory(GiB)": 142.32, "step": 143280, "train_speed(iter/s)": 0.28624 }, { "acc": 0.72894993, "epoch": 1.6028029477928603, "grad_norm": 6.125, "learning_rate": 1.0400334830887494e-06, "loss": 1.08290825, "memory(GiB)": 142.32, "step": 143300, "train_speed(iter/s)": 0.286252 }, { "acc": 0.73346038, "epoch": 1.6030266467388188, "grad_norm": 5.75, "learning_rate": 1.038904627525209e-06, "loss": 1.05466785, "memory(GiB)": 142.32, "step": 143320, "train_speed(iter/s)": 0.286266 }, { "acc": 0.72779455, "epoch": 1.6032503456847773, "grad_norm": 8.6875, "learning_rate": 1.0377763138961327e-06, "loss": 1.09282036, "memory(GiB)": 142.32, "step": 143340, "train_speed(iter/s)": 0.28628 }, { "acc": 0.73341904, "epoch": 1.6034740446307358, "grad_norm": 4.96875, "learning_rate": 1.0366485423558886e-06, "loss": 1.05855942, "memory(GiB)": 142.32, "step": 143360, "train_speed(iter/s)": 0.286292 }, { "acc": 0.72674279, "epoch": 1.6036977435766944, "grad_norm": 6.625, "learning_rate": 1.035521313058775e-06, "loss": 1.08497934, "memory(GiB)": 142.32, "step": 143380, "train_speed(iter/s)": 0.286306 }, { "acc": 0.7399106, "epoch": 1.603921442522653, "grad_norm": 5.96875, "learning_rate": 1.0343946261590099e-06, "loss": 1.04256077, "memory(GiB)": 142.32, "step": 143400, "train_speed(iter/s)": 0.28632 }, { "acc": 0.73619242, "epoch": 1.6041451414686114, "grad_norm": 6.40625, "learning_rate": 1.0332684818107425e-06, "loss": 1.04442806, "memory(GiB)": 142.32, "step": 143420, "train_speed(iter/s)": 0.286335 }, { "acc": 0.73006334, "epoch": 1.60436884041457, "grad_norm": 4.65625, "learning_rate": 1.0321428801680445e-06, "loss": 1.07855167, "memory(GiB)": 142.32, "step": 143440, "train_speed(iter/s)": 0.286348 }, { "acc": 0.74241924, "epoch": 1.6045925393605285, "grad_norm": 4.875, "learning_rate": 1.0310178213849126e-06, "loss": 1.04000435, "memory(GiB)": 142.32, "step": 143460, "train_speed(iter/s)": 0.286358 }, { "acc": 0.73862152, "epoch": 1.604816238306487, "grad_norm": 5.5625, "learning_rate": 1.0298933056152744e-06, "loss": 1.04280281, "memory(GiB)": 142.32, "step": 143480, "train_speed(iter/s)": 0.286372 }, { "acc": 0.75030041, "epoch": 1.6050399372524455, "grad_norm": 8.5, "learning_rate": 1.0287693330129762e-06, "loss": 0.99381514, "memory(GiB)": 142.32, "step": 143500, "train_speed(iter/s)": 0.286384 }, { "acc": 0.73447905, "epoch": 1.605263636198404, "grad_norm": 7.625, "learning_rate": 1.0276459037317972e-06, "loss": 1.04726028, "memory(GiB)": 142.32, "step": 143520, "train_speed(iter/s)": 0.286397 }, { "acc": 0.72643099, "epoch": 1.6054873351443626, "grad_norm": 5.65625, "learning_rate": 1.026523017925436e-06, "loss": 1.10164032, "memory(GiB)": 142.32, "step": 143540, "train_speed(iter/s)": 0.286411 }, { "acc": 0.73275075, "epoch": 1.6057110340903211, "grad_norm": 5.375, "learning_rate": 1.0254006757475188e-06, "loss": 1.0714859, "memory(GiB)": 142.32, "step": 143560, "train_speed(iter/s)": 0.286425 }, { "acc": 0.73050756, "epoch": 1.6059347330362796, "grad_norm": 5.65625, "learning_rate": 1.0242788773516004e-06, "loss": 1.07248993, "memory(GiB)": 142.32, "step": 143580, "train_speed(iter/s)": 0.286438 }, { "acc": 0.74491158, "epoch": 1.6061584319822382, "grad_norm": 8.25, "learning_rate": 1.0231576228911566e-06, "loss": 1.01557102, "memory(GiB)": 142.32, "step": 143600, "train_speed(iter/s)": 0.286452 }, { "acc": 0.74532657, "epoch": 1.6063821309281967, "grad_norm": 5.34375, "learning_rate": 1.0220369125195933e-06, "loss": 0.99910221, "memory(GiB)": 142.32, "step": 143620, "train_speed(iter/s)": 0.286465 }, { "acc": 0.73199248, "epoch": 1.6066058298741552, "grad_norm": 6.3125, "learning_rate": 1.020916746390239e-06, "loss": 1.07458773, "memory(GiB)": 142.32, "step": 143640, "train_speed(iter/s)": 0.286477 }, { "acc": 0.7387579, "epoch": 1.6068295288201138, "grad_norm": 7.28125, "learning_rate": 1.0197971246563465e-06, "loss": 1.05134449, "memory(GiB)": 142.32, "step": 143660, "train_speed(iter/s)": 0.286492 }, { "acc": 0.7476366, "epoch": 1.6070532277660723, "grad_norm": 7.65625, "learning_rate": 1.018678047471099e-06, "loss": 1.00907507, "memory(GiB)": 142.32, "step": 143680, "train_speed(iter/s)": 0.286508 }, { "acc": 0.74020176, "epoch": 1.6072769267120308, "grad_norm": 6.875, "learning_rate": 1.0175595149875988e-06, "loss": 1.0272831, "memory(GiB)": 142.32, "step": 143700, "train_speed(iter/s)": 0.286522 }, { "acc": 0.744979, "epoch": 1.6075006256579893, "grad_norm": 7.5625, "learning_rate": 1.0164415273588812e-06, "loss": 0.99988337, "memory(GiB)": 142.32, "step": 143720, "train_speed(iter/s)": 0.286536 }, { "acc": 0.72684512, "epoch": 1.6077243246039479, "grad_norm": 7.5, "learning_rate": 1.0153240847379003e-06, "loss": 1.09247704, "memory(GiB)": 142.32, "step": 143740, "train_speed(iter/s)": 0.28655 }, { "acc": 0.73663731, "epoch": 1.6079480235499064, "grad_norm": 5.53125, "learning_rate": 1.0142071872775378e-06, "loss": 1.06040173, "memory(GiB)": 142.32, "step": 143760, "train_speed(iter/s)": 0.286565 }, { "acc": 0.75128508, "epoch": 1.608171722495865, "grad_norm": 6.96875, "learning_rate": 1.0130908351306036e-06, "loss": 0.97005491, "memory(GiB)": 142.32, "step": 143780, "train_speed(iter/s)": 0.286578 }, { "acc": 0.73930507, "epoch": 1.6083954214418235, "grad_norm": 5.71875, "learning_rate": 1.0119750284498275e-06, "loss": 1.02987127, "memory(GiB)": 142.32, "step": 143800, "train_speed(iter/s)": 0.286593 }, { "acc": 0.73774414, "epoch": 1.608619120387782, "grad_norm": 6.5, "learning_rate": 1.0108597673878712e-06, "loss": 1.04284763, "memory(GiB)": 142.32, "step": 143820, "train_speed(iter/s)": 0.286607 }, { "acc": 0.73589096, "epoch": 1.6088428193337405, "grad_norm": 6.65625, "learning_rate": 1.0097450520973162e-06, "loss": 1.06329269, "memory(GiB)": 142.32, "step": 143840, "train_speed(iter/s)": 0.286619 }, { "acc": 0.7489511, "epoch": 1.609066518279699, "grad_norm": 5.5, "learning_rate": 1.0086308827306711e-06, "loss": 1.01026325, "memory(GiB)": 142.32, "step": 143860, "train_speed(iter/s)": 0.286632 }, { "acc": 0.74697523, "epoch": 1.6092902172256576, "grad_norm": 6.3125, "learning_rate": 1.0075172594403726e-06, "loss": 0.99528599, "memory(GiB)": 142.32, "step": 143880, "train_speed(iter/s)": 0.286647 }, { "acc": 0.73122058, "epoch": 1.609513916171616, "grad_norm": 6.71875, "learning_rate": 1.0064041823787768e-06, "loss": 1.0744215, "memory(GiB)": 142.32, "step": 143900, "train_speed(iter/s)": 0.286662 }, { "acc": 0.73304472, "epoch": 1.6097376151175746, "grad_norm": 6.4375, "learning_rate": 1.005291651698172e-06, "loss": 1.06953583, "memory(GiB)": 142.32, "step": 143920, "train_speed(iter/s)": 0.286674 }, { "acc": 0.74112778, "epoch": 1.6099613140635332, "grad_norm": 5.1875, "learning_rate": 1.004179667550767e-06, "loss": 1.02149239, "memory(GiB)": 142.32, "step": 143940, "train_speed(iter/s)": 0.286686 }, { "acc": 0.73696108, "epoch": 1.6101850130094917, "grad_norm": 6.9375, "learning_rate": 1.003068230088695e-06, "loss": 1.03877869, "memory(GiB)": 142.32, "step": 143960, "train_speed(iter/s)": 0.2867 }, { "acc": 0.72959185, "epoch": 1.6104087119554502, "grad_norm": 6.5, "learning_rate": 1.0019573394640204e-06, "loss": 1.08256702, "memory(GiB)": 142.32, "step": 143980, "train_speed(iter/s)": 0.286714 }, { "acc": 0.73840036, "epoch": 1.6106324109014087, "grad_norm": 6.3125, "learning_rate": 1.0008469958287253e-06, "loss": 1.03502693, "memory(GiB)": 142.32, "step": 144000, "train_speed(iter/s)": 0.286728 }, { "epoch": 1.6106324109014087, "eval_acc": 0.6963622322156401, "eval_loss": 1.0714056491851807, "eval_runtime": 2341.2662, "eval_samples_per_second": 32.155, "eval_steps_per_second": 16.078, "step": 144000 }, { "acc": 0.7381381, "epoch": 1.6108561098473673, "grad_norm": 5.46875, "learning_rate": 9.997371993347238e-07, "loss": 1.04175158, "memory(GiB)": 142.32, "step": 144020, "train_speed(iter/s)": 0.285383 }, { "acc": 0.73271899, "epoch": 1.6110798087933258, "grad_norm": 7.40625, "learning_rate": 9.986279501338503e-07, "loss": 1.05051975, "memory(GiB)": 142.32, "step": 144040, "train_speed(iter/s)": 0.285397 }, { "acc": 0.74667215, "epoch": 1.6113035077392843, "grad_norm": 6.65625, "learning_rate": 9.975192483778645e-07, "loss": 0.99876757, "memory(GiB)": 142.32, "step": 144060, "train_speed(iter/s)": 0.285409 }, { "acc": 0.73686113, "epoch": 1.6115272066852429, "grad_norm": 5.75, "learning_rate": 9.964110942184556e-07, "loss": 1.04602528, "memory(GiB)": 142.32, "step": 144080, "train_speed(iter/s)": 0.285423 }, { "acc": 0.74705906, "epoch": 1.6117509056312014, "grad_norm": 5.4375, "learning_rate": 9.953034878072326e-07, "loss": 1.00570068, "memory(GiB)": 142.32, "step": 144100, "train_speed(iter/s)": 0.285437 }, { "acc": 0.73804507, "epoch": 1.61197460457716, "grad_norm": 6.4375, "learning_rate": 9.941964292957335e-07, "loss": 1.05528221, "memory(GiB)": 142.32, "step": 144120, "train_speed(iter/s)": 0.28545 }, { "acc": 0.7351203, "epoch": 1.6121983035231184, "grad_norm": 7.28125, "learning_rate": 9.930899188354192e-07, "loss": 1.05356894, "memory(GiB)": 142.32, "step": 144140, "train_speed(iter/s)": 0.285463 }, { "acc": 0.74232497, "epoch": 1.612422002469077, "grad_norm": 7.09375, "learning_rate": 9.919839565776746e-07, "loss": 1.01875153, "memory(GiB)": 142.32, "step": 144160, "train_speed(iter/s)": 0.285477 }, { "acc": 0.73050156, "epoch": 1.6126457014150355, "grad_norm": 5.875, "learning_rate": 9.908785426738139e-07, "loss": 1.08287888, "memory(GiB)": 142.32, "step": 144180, "train_speed(iter/s)": 0.285491 }, { "acc": 0.73170767, "epoch": 1.612869400360994, "grad_norm": 6.90625, "learning_rate": 9.897736772750705e-07, "loss": 1.07035389, "memory(GiB)": 142.32, "step": 144200, "train_speed(iter/s)": 0.285505 }, { "acc": 0.72621489, "epoch": 1.6130930993069525, "grad_norm": 6.28125, "learning_rate": 9.886693605326086e-07, "loss": 1.11011782, "memory(GiB)": 142.32, "step": 144220, "train_speed(iter/s)": 0.285518 }, { "acc": 0.73667111, "epoch": 1.613316798252911, "grad_norm": 4.96875, "learning_rate": 9.875655925975142e-07, "loss": 1.04088421, "memory(GiB)": 142.32, "step": 144240, "train_speed(iter/s)": 0.285533 }, { "acc": 0.7504674, "epoch": 1.6135404971988696, "grad_norm": 6.53125, "learning_rate": 9.864623736207957e-07, "loss": 0.97757549, "memory(GiB)": 142.32, "step": 144260, "train_speed(iter/s)": 0.285546 }, { "acc": 0.73246546, "epoch": 1.6137641961448281, "grad_norm": 6.78125, "learning_rate": 9.853597037533924e-07, "loss": 1.06082373, "memory(GiB)": 142.32, "step": 144280, "train_speed(iter/s)": 0.28556 }, { "acc": 0.73152218, "epoch": 1.6139878950907867, "grad_norm": 6.4375, "learning_rate": 9.842575831461625e-07, "loss": 1.06869984, "memory(GiB)": 142.32, "step": 144300, "train_speed(iter/s)": 0.285571 }, { "acc": 0.72860289, "epoch": 1.6142115940367452, "grad_norm": 6.3125, "learning_rate": 9.83156011949895e-07, "loss": 1.10061436, "memory(GiB)": 142.32, "step": 144320, "train_speed(iter/s)": 0.285585 }, { "acc": 0.7391839, "epoch": 1.6144352929827037, "grad_norm": 9.5, "learning_rate": 9.820549903152992e-07, "loss": 1.03187294, "memory(GiB)": 142.32, "step": 144340, "train_speed(iter/s)": 0.285598 }, { "acc": 0.73312616, "epoch": 1.6146589919286622, "grad_norm": 6.21875, "learning_rate": 9.80954518393009e-07, "loss": 1.05781994, "memory(GiB)": 142.32, "step": 144360, "train_speed(iter/s)": 0.285609 }, { "acc": 0.71753235, "epoch": 1.6148826908746208, "grad_norm": 4.875, "learning_rate": 9.798545963335876e-07, "loss": 1.12521191, "memory(GiB)": 142.32, "step": 144380, "train_speed(iter/s)": 0.285623 }, { "acc": 0.73853369, "epoch": 1.6151063898205793, "grad_norm": 5.6875, "learning_rate": 9.787552242875165e-07, "loss": 1.04930038, "memory(GiB)": 142.32, "step": 144400, "train_speed(iter/s)": 0.285636 }, { "acc": 0.75169134, "epoch": 1.6153300887665378, "grad_norm": 6.09375, "learning_rate": 9.776564024052093e-07, "loss": 0.97685671, "memory(GiB)": 142.32, "step": 144420, "train_speed(iter/s)": 0.28565 }, { "acc": 0.74349012, "epoch": 1.6155537877124964, "grad_norm": 5.90625, "learning_rate": 9.765581308369993e-07, "loss": 1.00962296, "memory(GiB)": 142.32, "step": 144440, "train_speed(iter/s)": 0.285664 }, { "acc": 0.73503995, "epoch": 1.6157774866584549, "grad_norm": 6.375, "learning_rate": 9.754604097331444e-07, "loss": 1.060987, "memory(GiB)": 142.32, "step": 144460, "train_speed(iter/s)": 0.285677 }, { "acc": 0.73651485, "epoch": 1.6160011856044134, "grad_norm": 6.5625, "learning_rate": 9.743632392438295e-07, "loss": 1.06738682, "memory(GiB)": 142.32, "step": 144480, "train_speed(iter/s)": 0.28569 }, { "acc": 0.73625507, "epoch": 1.616224884550372, "grad_norm": 5.625, "learning_rate": 9.73266619519162e-07, "loss": 1.04837666, "memory(GiB)": 142.32, "step": 144500, "train_speed(iter/s)": 0.285703 }, { "acc": 0.73052764, "epoch": 1.6164485834963307, "grad_norm": 5.90625, "learning_rate": 9.721705507091778e-07, "loss": 1.07691612, "memory(GiB)": 142.32, "step": 144520, "train_speed(iter/s)": 0.285716 }, { "acc": 0.73219318, "epoch": 1.6166722824422892, "grad_norm": 5.78125, "learning_rate": 9.71075032963832e-07, "loss": 1.06477261, "memory(GiB)": 142.32, "step": 144540, "train_speed(iter/s)": 0.285729 }, { "acc": 0.73180475, "epoch": 1.6168959813882477, "grad_norm": 6.5, "learning_rate": 9.699800664330089e-07, "loss": 1.07312851, "memory(GiB)": 142.32, "step": 144560, "train_speed(iter/s)": 0.285742 }, { "acc": 0.72528276, "epoch": 1.6171196803342063, "grad_norm": 5.75, "learning_rate": 9.688856512665152e-07, "loss": 1.09209347, "memory(GiB)": 142.32, "step": 144580, "train_speed(iter/s)": 0.285755 }, { "acc": 0.73353701, "epoch": 1.6173433792801648, "grad_norm": 6.625, "learning_rate": 9.677917876140813e-07, "loss": 1.05342026, "memory(GiB)": 142.32, "step": 144600, "train_speed(iter/s)": 0.28577 }, { "acc": 0.73272524, "epoch": 1.6175670782261233, "grad_norm": 6.4375, "learning_rate": 9.666984756253656e-07, "loss": 1.06142712, "memory(GiB)": 142.32, "step": 144620, "train_speed(iter/s)": 0.285784 }, { "acc": 0.74590626, "epoch": 1.6177907771720819, "grad_norm": 7.28125, "learning_rate": 9.65605715449946e-07, "loss": 0.99476576, "memory(GiB)": 142.32, "step": 144640, "train_speed(iter/s)": 0.285799 }, { "acc": 0.74916329, "epoch": 1.6180144761180404, "grad_norm": 6.28125, "learning_rate": 9.645135072373308e-07, "loss": 0.99272852, "memory(GiB)": 142.32, "step": 144660, "train_speed(iter/s)": 0.285812 }, { "acc": 0.73825111, "epoch": 1.618238175063999, "grad_norm": 6.625, "learning_rate": 9.63421851136948e-07, "loss": 1.04923077, "memory(GiB)": 142.32, "step": 144680, "train_speed(iter/s)": 0.285825 }, { "acc": 0.74941602, "epoch": 1.6184618740099574, "grad_norm": 5.84375, "learning_rate": 9.623307472981509e-07, "loss": 0.98381014, "memory(GiB)": 142.32, "step": 144700, "train_speed(iter/s)": 0.285838 }, { "acc": 0.74082451, "epoch": 1.618685572955916, "grad_norm": 6.75, "learning_rate": 9.612401958702205e-07, "loss": 1.03487301, "memory(GiB)": 142.32, "step": 144720, "train_speed(iter/s)": 0.285852 }, { "acc": 0.73737059, "epoch": 1.6189092719018745, "grad_norm": 6.1875, "learning_rate": 9.601501970023569e-07, "loss": 1.0522171, "memory(GiB)": 142.32, "step": 144740, "train_speed(iter/s)": 0.285866 }, { "acc": 0.74574041, "epoch": 1.619132970847833, "grad_norm": 4.625, "learning_rate": 9.590607508436905e-07, "loss": 1.00567093, "memory(GiB)": 142.32, "step": 144760, "train_speed(iter/s)": 0.28588 }, { "acc": 0.73372602, "epoch": 1.6193566697937916, "grad_norm": 6.5625, "learning_rate": 9.579718575432722e-07, "loss": 1.04962444, "memory(GiB)": 142.32, "step": 144780, "train_speed(iter/s)": 0.285892 }, { "acc": 0.74403248, "epoch": 1.61958036873975, "grad_norm": 6.6875, "learning_rate": 9.568835172500762e-07, "loss": 1.02504959, "memory(GiB)": 142.32, "step": 144800, "train_speed(iter/s)": 0.285903 }, { "acc": 0.73196936, "epoch": 1.6198040676857086, "grad_norm": 6.34375, "learning_rate": 9.557957301130056e-07, "loss": 1.07729111, "memory(GiB)": 142.32, "step": 144820, "train_speed(iter/s)": 0.285916 }, { "acc": 0.73128977, "epoch": 1.6200277666316671, "grad_norm": 5.375, "learning_rate": 9.547084962808827e-07, "loss": 1.07785835, "memory(GiB)": 142.32, "step": 144840, "train_speed(iter/s)": 0.285929 }, { "acc": 0.74878349, "epoch": 1.6202514655776257, "grad_norm": 5.71875, "learning_rate": 9.536218159024591e-07, "loss": 0.99320927, "memory(GiB)": 142.32, "step": 144860, "train_speed(iter/s)": 0.285941 }, { "acc": 0.74222255, "epoch": 1.6204751645235842, "grad_norm": 5.875, "learning_rate": 9.525356891264076e-07, "loss": 1.01223164, "memory(GiB)": 142.32, "step": 144880, "train_speed(iter/s)": 0.285954 }, { "acc": 0.7495121, "epoch": 1.6206988634695427, "grad_norm": 7.46875, "learning_rate": 9.51450116101324e-07, "loss": 1.00048065, "memory(GiB)": 142.32, "step": 144900, "train_speed(iter/s)": 0.285968 }, { "acc": 0.73257895, "epoch": 1.6209225624155013, "grad_norm": 5.1875, "learning_rate": 9.503650969757333e-07, "loss": 1.07526045, "memory(GiB)": 142.32, "step": 144920, "train_speed(iter/s)": 0.285982 }, { "acc": 0.74180312, "epoch": 1.6211462613614598, "grad_norm": 5.96875, "learning_rate": 9.492806318980779e-07, "loss": 1.01719513, "memory(GiB)": 142.32, "step": 144940, "train_speed(iter/s)": 0.285996 }, { "acc": 0.72093258, "epoch": 1.6213699603074183, "grad_norm": 6.65625, "learning_rate": 9.48196721016732e-07, "loss": 1.10386639, "memory(GiB)": 142.32, "step": 144960, "train_speed(iter/s)": 0.28601 }, { "acc": 0.72706137, "epoch": 1.6215936592533768, "grad_norm": 5.28125, "learning_rate": 9.471133644799885e-07, "loss": 1.07899475, "memory(GiB)": 142.32, "step": 144980, "train_speed(iter/s)": 0.286024 }, { "acc": 0.72887487, "epoch": 1.6218173581993354, "grad_norm": 6.75, "learning_rate": 9.460305624360638e-07, "loss": 1.0777276, "memory(GiB)": 142.32, "step": 145000, "train_speed(iter/s)": 0.286037 }, { "acc": 0.72621751, "epoch": 1.622041057145294, "grad_norm": 5.96875, "learning_rate": 9.449483150331046e-07, "loss": 1.0933075, "memory(GiB)": 142.32, "step": 145020, "train_speed(iter/s)": 0.286051 }, { "acc": 0.73231711, "epoch": 1.6222647560912524, "grad_norm": 6.1875, "learning_rate": 9.438666224191745e-07, "loss": 1.06981316, "memory(GiB)": 142.32, "step": 145040, "train_speed(iter/s)": 0.286063 }, { "acc": 0.73176827, "epoch": 1.622488455037211, "grad_norm": 5.78125, "learning_rate": 9.427854847422674e-07, "loss": 1.06799946, "memory(GiB)": 142.32, "step": 145060, "train_speed(iter/s)": 0.286076 }, { "acc": 0.75497274, "epoch": 1.6227121539831695, "grad_norm": 5.09375, "learning_rate": 9.417049021502966e-07, "loss": 0.95185986, "memory(GiB)": 142.32, "step": 145080, "train_speed(iter/s)": 0.28609 }, { "acc": 0.74487996, "epoch": 1.622935852929128, "grad_norm": 7.375, "learning_rate": 9.406248747911007e-07, "loss": 1.01977558, "memory(GiB)": 142.32, "step": 145100, "train_speed(iter/s)": 0.286105 }, { "acc": 0.72536783, "epoch": 1.6231595518750865, "grad_norm": 7.65625, "learning_rate": 9.395454028124451e-07, "loss": 1.11869068, "memory(GiB)": 142.32, "step": 145120, "train_speed(iter/s)": 0.286119 }, { "acc": 0.75187016, "epoch": 1.623383250821045, "grad_norm": 6.0625, "learning_rate": 9.384664863620141e-07, "loss": 0.97450809, "memory(GiB)": 142.32, "step": 145140, "train_speed(iter/s)": 0.286134 }, { "acc": 0.74796844, "epoch": 1.6236069497670036, "grad_norm": 6.34375, "learning_rate": 9.373881255874217e-07, "loss": 0.98434496, "memory(GiB)": 142.32, "step": 145160, "train_speed(iter/s)": 0.286147 }, { "acc": 0.73260603, "epoch": 1.6238306487129621, "grad_norm": 6.59375, "learning_rate": 9.363103206362018e-07, "loss": 1.06185246, "memory(GiB)": 142.32, "step": 145180, "train_speed(iter/s)": 0.286159 }, { "acc": 0.74172211, "epoch": 1.6240543476589206, "grad_norm": 6.09375, "learning_rate": 9.352330716558123e-07, "loss": 1.04379978, "memory(GiB)": 142.32, "step": 145200, "train_speed(iter/s)": 0.286173 }, { "acc": 0.72951183, "epoch": 1.6242780466048792, "grad_norm": 4.59375, "learning_rate": 9.341563787936387e-07, "loss": 1.09907055, "memory(GiB)": 142.32, "step": 145220, "train_speed(iter/s)": 0.286187 }, { "acc": 0.74677038, "epoch": 1.6245017455508377, "grad_norm": 7.03125, "learning_rate": 9.330802421969859e-07, "loss": 0.99877748, "memory(GiB)": 142.32, "step": 145240, "train_speed(iter/s)": 0.286198 }, { "acc": 0.74576807, "epoch": 1.6247254444967962, "grad_norm": 5.8125, "learning_rate": 9.320046620130862e-07, "loss": 1.00179634, "memory(GiB)": 142.32, "step": 145260, "train_speed(iter/s)": 0.286213 }, { "acc": 0.73245497, "epoch": 1.6249491434427548, "grad_norm": 6.75, "learning_rate": 9.309296383890947e-07, "loss": 1.06264219, "memory(GiB)": 142.32, "step": 145280, "train_speed(iter/s)": 0.286225 }, { "acc": 0.73093929, "epoch": 1.6251728423887133, "grad_norm": 6.0, "learning_rate": 9.298551714720871e-07, "loss": 1.06879215, "memory(GiB)": 142.32, "step": 145300, "train_speed(iter/s)": 0.286238 }, { "acc": 0.72681527, "epoch": 1.6253965413346718, "grad_norm": 6.0625, "learning_rate": 9.287812614090697e-07, "loss": 1.10706968, "memory(GiB)": 142.32, "step": 145320, "train_speed(iter/s)": 0.286251 }, { "acc": 0.74236164, "epoch": 1.6256202402806303, "grad_norm": 5.53125, "learning_rate": 9.277079083469647e-07, "loss": 1.05186281, "memory(GiB)": 142.32, "step": 145340, "train_speed(iter/s)": 0.286264 }, { "acc": 0.73005323, "epoch": 1.6258439392265889, "grad_norm": 5.875, "learning_rate": 9.266351124326261e-07, "loss": 1.07326298, "memory(GiB)": 142.32, "step": 145360, "train_speed(iter/s)": 0.286278 }, { "acc": 0.71632051, "epoch": 1.6260676381725474, "grad_norm": 6.40625, "learning_rate": 9.255628738128263e-07, "loss": 1.13305817, "memory(GiB)": 142.32, "step": 145380, "train_speed(iter/s)": 0.286292 }, { "acc": 0.72875514, "epoch": 1.626291337118506, "grad_norm": 5.71875, "learning_rate": 9.244911926342609e-07, "loss": 1.07472954, "memory(GiB)": 142.32, "step": 145400, "train_speed(iter/s)": 0.286307 }, { "acc": 0.74275551, "epoch": 1.6265150360644645, "grad_norm": 4.90625, "learning_rate": 9.234200690435541e-07, "loss": 1.0235218, "memory(GiB)": 142.32, "step": 145420, "train_speed(iter/s)": 0.28632 }, { "acc": 0.74159069, "epoch": 1.626738735010423, "grad_norm": 7.5625, "learning_rate": 9.223495031872481e-07, "loss": 1.02058964, "memory(GiB)": 142.32, "step": 145440, "train_speed(iter/s)": 0.286334 }, { "acc": 0.74618864, "epoch": 1.6269624339563815, "grad_norm": 6.46875, "learning_rate": 9.212794952118143e-07, "loss": 1.01772976, "memory(GiB)": 142.32, "step": 145460, "train_speed(iter/s)": 0.286347 }, { "acc": 0.7286087, "epoch": 1.62718613290234, "grad_norm": 5.84375, "learning_rate": 9.202100452636442e-07, "loss": 1.08741398, "memory(GiB)": 142.32, "step": 145480, "train_speed(iter/s)": 0.28636 }, { "acc": 0.7294939, "epoch": 1.6274098318482986, "grad_norm": 5.5, "learning_rate": 9.191411534890521e-07, "loss": 1.0786253, "memory(GiB)": 142.32, "step": 145500, "train_speed(iter/s)": 0.286373 }, { "acc": 0.73802309, "epoch": 1.627633530794257, "grad_norm": 7.03125, "learning_rate": 9.180728200342809e-07, "loss": 1.05368309, "memory(GiB)": 142.32, "step": 145520, "train_speed(iter/s)": 0.286386 }, { "acc": 0.73564653, "epoch": 1.6278572297402156, "grad_norm": 5.15625, "learning_rate": 9.170050450454904e-07, "loss": 1.04932995, "memory(GiB)": 142.32, "step": 145540, "train_speed(iter/s)": 0.2864 }, { "acc": 0.75261555, "epoch": 1.6280809286861742, "grad_norm": 5.9375, "learning_rate": 9.159378286687703e-07, "loss": 0.98041973, "memory(GiB)": 142.32, "step": 145560, "train_speed(iter/s)": 0.286413 }, { "acc": 0.74064894, "epoch": 1.6283046276321327, "grad_norm": 6.90625, "learning_rate": 9.148711710501301e-07, "loss": 1.02831402, "memory(GiB)": 142.32, "step": 145580, "train_speed(iter/s)": 0.286427 }, { "acc": 0.72304096, "epoch": 1.6285283265780912, "grad_norm": 7.125, "learning_rate": 9.138050723355025e-07, "loss": 1.12012157, "memory(GiB)": 142.32, "step": 145600, "train_speed(iter/s)": 0.286441 }, { "acc": 0.73677077, "epoch": 1.6287520255240497, "grad_norm": 6.46875, "learning_rate": 9.127395326707472e-07, "loss": 1.04677105, "memory(GiB)": 142.32, "step": 145620, "train_speed(iter/s)": 0.286453 }, { "acc": 0.74404154, "epoch": 1.6289757244700083, "grad_norm": 5.75, "learning_rate": 9.116745522016446e-07, "loss": 1.02446337, "memory(GiB)": 142.32, "step": 145640, "train_speed(iter/s)": 0.286465 }, { "acc": 0.74347353, "epoch": 1.6291994234159668, "grad_norm": 5.90625, "learning_rate": 9.106101310738985e-07, "loss": 1.03124199, "memory(GiB)": 142.32, "step": 145660, "train_speed(iter/s)": 0.286478 }, { "acc": 0.74387641, "epoch": 1.6294231223619253, "grad_norm": 5.46875, "learning_rate": 9.095462694331364e-07, "loss": 1.02422199, "memory(GiB)": 142.32, "step": 145680, "train_speed(iter/s)": 0.286491 }, { "acc": 0.73767939, "epoch": 1.6296468213078839, "grad_norm": 5.21875, "learning_rate": 9.084829674249119e-07, "loss": 1.04984446, "memory(GiB)": 142.32, "step": 145700, "train_speed(iter/s)": 0.286505 }, { "acc": 0.73901148, "epoch": 1.6298705202538424, "grad_norm": 9.3125, "learning_rate": 9.074202251946984e-07, "loss": 1.03854256, "memory(GiB)": 142.32, "step": 145720, "train_speed(iter/s)": 0.286519 }, { "acc": 0.72946782, "epoch": 1.630094219199801, "grad_norm": 6.71875, "learning_rate": 9.063580428878938e-07, "loss": 1.08863268, "memory(GiB)": 142.32, "step": 145740, "train_speed(iter/s)": 0.286533 }, { "acc": 0.73368158, "epoch": 1.6303179181457594, "grad_norm": 5.75, "learning_rate": 9.05296420649821e-07, "loss": 1.06373272, "memory(GiB)": 142.32, "step": 145760, "train_speed(iter/s)": 0.286548 }, { "acc": 0.73306007, "epoch": 1.630541617091718, "grad_norm": 5.8125, "learning_rate": 9.042353586257241e-07, "loss": 1.06625404, "memory(GiB)": 142.32, "step": 145780, "train_speed(iter/s)": 0.286561 }, { "acc": 0.72960253, "epoch": 1.6307653160376765, "grad_norm": 6.78125, "learning_rate": 9.031748569607734e-07, "loss": 1.07631378, "memory(GiB)": 142.32, "step": 145800, "train_speed(iter/s)": 0.286575 }, { "acc": 0.73030262, "epoch": 1.630989014983635, "grad_norm": 4.5625, "learning_rate": 9.021149158000592e-07, "loss": 1.0706089, "memory(GiB)": 142.32, "step": 145820, "train_speed(iter/s)": 0.286586 }, { "acc": 0.73069153, "epoch": 1.6312127139295935, "grad_norm": 5.3125, "learning_rate": 9.010555352885952e-07, "loss": 1.07634563, "memory(GiB)": 142.32, "step": 145840, "train_speed(iter/s)": 0.286599 }, { "acc": 0.7484993, "epoch": 1.631436412875552, "grad_norm": 6.71875, "learning_rate": 8.99996715571323e-07, "loss": 0.98809252, "memory(GiB)": 142.32, "step": 145860, "train_speed(iter/s)": 0.286612 }, { "acc": 0.74399557, "epoch": 1.6316601118215106, "grad_norm": 5.90625, "learning_rate": 8.989384567931014e-07, "loss": 1.02003765, "memory(GiB)": 142.32, "step": 145880, "train_speed(iter/s)": 0.286624 }, { "acc": 0.73923559, "epoch": 1.6318838107674691, "grad_norm": 5.84375, "learning_rate": 8.978807590987177e-07, "loss": 1.05642204, "memory(GiB)": 142.32, "step": 145900, "train_speed(iter/s)": 0.286636 }, { "acc": 0.74018831, "epoch": 1.6321075097134277, "grad_norm": 6.40625, "learning_rate": 8.968236226328792e-07, "loss": 1.03645077, "memory(GiB)": 142.32, "step": 145920, "train_speed(iter/s)": 0.28665 }, { "acc": 0.73195024, "epoch": 1.6323312086593862, "grad_norm": 6.15625, "learning_rate": 8.957670475402152e-07, "loss": 1.07040615, "memory(GiB)": 142.32, "step": 145940, "train_speed(iter/s)": 0.286663 }, { "acc": 0.74361591, "epoch": 1.6325549076053447, "grad_norm": 5.09375, "learning_rate": 8.947110339652842e-07, "loss": 1.01384277, "memory(GiB)": 142.32, "step": 145960, "train_speed(iter/s)": 0.286676 }, { "acc": 0.7408474, "epoch": 1.6327786065513032, "grad_norm": 7.3125, "learning_rate": 8.9365558205256e-07, "loss": 1.03741837, "memory(GiB)": 142.32, "step": 145980, "train_speed(iter/s)": 0.286689 }, { "acc": 0.74042397, "epoch": 1.6330023054972618, "grad_norm": 5.03125, "learning_rate": 8.92600691946447e-07, "loss": 1.03557434, "memory(GiB)": 142.32, "step": 146000, "train_speed(iter/s)": 0.286704 }, { "epoch": 1.6330023054972618, "eval_acc": 0.6963760352790526, "eval_loss": 1.0714069604873657, "eval_runtime": 2340.2875, "eval_samples_per_second": 32.168, "eval_steps_per_second": 16.084, "step": 146000 }, { "acc": 0.72942295, "epoch": 1.6332260044432203, "grad_norm": 5.5625, "learning_rate": 8.915463637912675e-07, "loss": 1.07698336, "memory(GiB)": 142.32, "step": 146020, "train_speed(iter/s)": 0.285378 }, { "acc": 0.7311326, "epoch": 1.6334497033891788, "grad_norm": 6.25, "learning_rate": 8.904925977312673e-07, "loss": 1.06521492, "memory(GiB)": 142.32, "step": 146040, "train_speed(iter/s)": 0.285391 }, { "acc": 0.72297606, "epoch": 1.6336734023351374, "grad_norm": 4.8125, "learning_rate": 8.894393939106194e-07, "loss": 1.11362591, "memory(GiB)": 142.32, "step": 146060, "train_speed(iter/s)": 0.285404 }, { "acc": 0.73673248, "epoch": 1.6338971012810959, "grad_norm": 5.875, "learning_rate": 8.883867524734147e-07, "loss": 1.04969234, "memory(GiB)": 142.32, "step": 146080, "train_speed(iter/s)": 0.285419 }, { "acc": 0.74294672, "epoch": 1.6341208002270544, "grad_norm": 6.1875, "learning_rate": 8.873346735636718e-07, "loss": 1.02885208, "memory(GiB)": 142.32, "step": 146100, "train_speed(iter/s)": 0.285433 }, { "acc": 0.74304786, "epoch": 1.634344499173013, "grad_norm": 6.53125, "learning_rate": 8.862831573253295e-07, "loss": 1.02111702, "memory(GiB)": 142.32, "step": 146120, "train_speed(iter/s)": 0.285445 }, { "acc": 0.74198332, "epoch": 1.6345681981189715, "grad_norm": 6.1875, "learning_rate": 8.852322039022482e-07, "loss": 1.00856657, "memory(GiB)": 142.32, "step": 146140, "train_speed(iter/s)": 0.285458 }, { "acc": 0.74379854, "epoch": 1.63479189706493, "grad_norm": 5.75, "learning_rate": 8.841818134382163e-07, "loss": 1.00823841, "memory(GiB)": 142.32, "step": 146160, "train_speed(iter/s)": 0.285474 }, { "acc": 0.75354881, "epoch": 1.6350155960108885, "grad_norm": 5.84375, "learning_rate": 8.8313198607694e-07, "loss": 0.99127131, "memory(GiB)": 142.32, "step": 146180, "train_speed(iter/s)": 0.285486 }, { "acc": 0.72796917, "epoch": 1.635239294956847, "grad_norm": 5.96875, "learning_rate": 8.820827219620526e-07, "loss": 1.09384117, "memory(GiB)": 142.32, "step": 146200, "train_speed(iter/s)": 0.285499 }, { "acc": 0.73991585, "epoch": 1.6354629939028056, "grad_norm": 6.375, "learning_rate": 8.81034021237106e-07, "loss": 1.0266715, "memory(GiB)": 142.32, "step": 146220, "train_speed(iter/s)": 0.285513 }, { "acc": 0.733004, "epoch": 1.6356866928487641, "grad_norm": 6.15625, "learning_rate": 8.799858840455805e-07, "loss": 1.07404366, "memory(GiB)": 142.32, "step": 146240, "train_speed(iter/s)": 0.285525 }, { "acc": 0.71887312, "epoch": 1.6359103917947226, "grad_norm": 7.0, "learning_rate": 8.789383105308746e-07, "loss": 1.13007355, "memory(GiB)": 142.32, "step": 146260, "train_speed(iter/s)": 0.285538 }, { "acc": 0.73997631, "epoch": 1.6361340907406812, "grad_norm": 6.34375, "learning_rate": 8.778913008363099e-07, "loss": 1.04131517, "memory(GiB)": 142.32, "step": 146280, "train_speed(iter/s)": 0.285551 }, { "acc": 0.7341671, "epoch": 1.6363577896866397, "grad_norm": 6.4375, "learning_rate": 8.768448551051351e-07, "loss": 1.06291542, "memory(GiB)": 142.32, "step": 146300, "train_speed(iter/s)": 0.285565 }, { "acc": 0.74527268, "epoch": 1.6365814886325982, "grad_norm": 7.0, "learning_rate": 8.757989734805161e-07, "loss": 1.00134697, "memory(GiB)": 142.32, "step": 146320, "train_speed(iter/s)": 0.285578 }, { "acc": 0.72924757, "epoch": 1.6368051875785568, "grad_norm": 5.40625, "learning_rate": 8.747536561055475e-07, "loss": 1.08168049, "memory(GiB)": 142.32, "step": 146340, "train_speed(iter/s)": 0.28559 }, { "acc": 0.73360777, "epoch": 1.6370288865245153, "grad_norm": 6.8125, "learning_rate": 8.737089031232421e-07, "loss": 1.05702763, "memory(GiB)": 142.32, "step": 146360, "train_speed(iter/s)": 0.285604 }, { "acc": 0.7442565, "epoch": 1.6372525854704738, "grad_norm": 5.75, "learning_rate": 8.72664714676536e-07, "loss": 1.00421667, "memory(GiB)": 142.32, "step": 146380, "train_speed(iter/s)": 0.285618 }, { "acc": 0.73726883, "epoch": 1.6374762844164323, "grad_norm": 5.9375, "learning_rate": 8.716210909082912e-07, "loss": 1.04389133, "memory(GiB)": 142.32, "step": 146400, "train_speed(iter/s)": 0.285631 }, { "acc": 0.74649019, "epoch": 1.6376999833623909, "grad_norm": 7.625, "learning_rate": 8.705780319612877e-07, "loss": 0.97586384, "memory(GiB)": 142.32, "step": 146420, "train_speed(iter/s)": 0.285645 }, { "acc": 0.7406971, "epoch": 1.6379236823083494, "grad_norm": 5.875, "learning_rate": 8.695355379782339e-07, "loss": 1.02833939, "memory(GiB)": 142.32, "step": 146440, "train_speed(iter/s)": 0.285659 }, { "acc": 0.73546009, "epoch": 1.638147381254308, "grad_norm": 6.28125, "learning_rate": 8.684936091017565e-07, "loss": 1.04652872, "memory(GiB)": 142.32, "step": 146460, "train_speed(iter/s)": 0.285672 }, { "acc": 0.74560461, "epoch": 1.6383710802002664, "grad_norm": 7.5, "learning_rate": 8.67452245474405e-07, "loss": 0.98902092, "memory(GiB)": 142.32, "step": 146480, "train_speed(iter/s)": 0.285686 }, { "acc": 0.73837042, "epoch": 1.638594779146225, "grad_norm": 6.5625, "learning_rate": 8.664114472386553e-07, "loss": 1.04442463, "memory(GiB)": 142.32, "step": 146500, "train_speed(iter/s)": 0.285699 }, { "acc": 0.74215903, "epoch": 1.6388184780921835, "grad_norm": 5.90625, "learning_rate": 8.653712145369009e-07, "loss": 1.01942291, "memory(GiB)": 142.32, "step": 146520, "train_speed(iter/s)": 0.28571 }, { "acc": 0.74748549, "epoch": 1.639042177038142, "grad_norm": 5.90625, "learning_rate": 8.643315475114633e-07, "loss": 0.99095459, "memory(GiB)": 142.32, "step": 146540, "train_speed(iter/s)": 0.285723 }, { "acc": 0.73818655, "epoch": 1.6392658759841006, "grad_norm": 7.5, "learning_rate": 8.632924463045822e-07, "loss": 1.04103279, "memory(GiB)": 142.32, "step": 146560, "train_speed(iter/s)": 0.285736 }, { "acc": 0.7262866, "epoch": 1.639489574930059, "grad_norm": 6.1875, "learning_rate": 8.6225391105842e-07, "loss": 1.10651941, "memory(GiB)": 142.32, "step": 146580, "train_speed(iter/s)": 0.28575 }, { "acc": 0.75218058, "epoch": 1.6397132738760176, "grad_norm": 7.21875, "learning_rate": 8.612159419150662e-07, "loss": 0.9845829, "memory(GiB)": 142.32, "step": 146600, "train_speed(iter/s)": 0.285764 }, { "acc": 0.72068691, "epoch": 1.6399369728219761, "grad_norm": 5.34375, "learning_rate": 8.601785390165273e-07, "loss": 1.1066433, "memory(GiB)": 142.32, "step": 146620, "train_speed(iter/s)": 0.285777 }, { "acc": 0.73358021, "epoch": 1.6401606717679347, "grad_norm": 5.5625, "learning_rate": 8.591417025047371e-07, "loss": 1.08421612, "memory(GiB)": 142.32, "step": 146640, "train_speed(iter/s)": 0.285791 }, { "acc": 0.73680615, "epoch": 1.6403843707138932, "grad_norm": 5.46875, "learning_rate": 8.581054325215488e-07, "loss": 1.05009985, "memory(GiB)": 142.32, "step": 146660, "train_speed(iter/s)": 0.285804 }, { "acc": 0.73110561, "epoch": 1.6406080696598517, "grad_norm": 4.65625, "learning_rate": 8.570697292087371e-07, "loss": 1.0712328, "memory(GiB)": 142.32, "step": 146680, "train_speed(iter/s)": 0.285817 }, { "acc": 0.72993345, "epoch": 1.6408317686058103, "grad_norm": 6.125, "learning_rate": 8.560345927080038e-07, "loss": 1.07634296, "memory(GiB)": 142.32, "step": 146700, "train_speed(iter/s)": 0.285831 }, { "acc": 0.73077974, "epoch": 1.6410554675517688, "grad_norm": 6.4375, "learning_rate": 8.550000231609684e-07, "loss": 1.08653851, "memory(GiB)": 142.32, "step": 146720, "train_speed(iter/s)": 0.285845 }, { "acc": 0.73231401, "epoch": 1.6412791664977273, "grad_norm": 7.6875, "learning_rate": 8.53966020709176e-07, "loss": 1.06350613, "memory(GiB)": 142.32, "step": 146740, "train_speed(iter/s)": 0.285858 }, { "acc": 0.75459423, "epoch": 1.6415028654436858, "grad_norm": 6.15625, "learning_rate": 8.529325854940929e-07, "loss": 0.96280327, "memory(GiB)": 142.32, "step": 146760, "train_speed(iter/s)": 0.285872 }, { "acc": 0.73767114, "epoch": 1.6417265643896444, "grad_norm": 5.75, "learning_rate": 8.518997176571059e-07, "loss": 1.05247593, "memory(GiB)": 142.32, "step": 146780, "train_speed(iter/s)": 0.285887 }, { "acc": 0.7260458, "epoch": 1.641950263335603, "grad_norm": 4.96875, "learning_rate": 8.508674173395293e-07, "loss": 1.10348797, "memory(GiB)": 142.32, "step": 146800, "train_speed(iter/s)": 0.2859 }, { "acc": 0.72960672, "epoch": 1.6421739622815614, "grad_norm": 6.1875, "learning_rate": 8.498356846825939e-07, "loss": 1.0735323, "memory(GiB)": 142.32, "step": 146820, "train_speed(iter/s)": 0.285913 }, { "acc": 0.74347572, "epoch": 1.64239766122752, "grad_norm": 5.40625, "learning_rate": 8.48804519827457e-07, "loss": 1.02747765, "memory(GiB)": 142.32, "step": 146840, "train_speed(iter/s)": 0.285926 }, { "acc": 0.73967738, "epoch": 1.6426213601734785, "grad_norm": 5.0625, "learning_rate": 8.477739229151949e-07, "loss": 1.04606266, "memory(GiB)": 142.32, "step": 146860, "train_speed(iter/s)": 0.28594 }, { "acc": 0.73464818, "epoch": 1.642845059119437, "grad_norm": 7.40625, "learning_rate": 8.4674389408681e-07, "loss": 1.05838318, "memory(GiB)": 142.32, "step": 146880, "train_speed(iter/s)": 0.285953 }, { "acc": 0.73580313, "epoch": 1.6430687580653955, "grad_norm": 5.0625, "learning_rate": 8.45714433483224e-07, "loss": 1.05842171, "memory(GiB)": 142.32, "step": 146900, "train_speed(iter/s)": 0.285967 }, { "acc": 0.74333391, "epoch": 1.643292457011354, "grad_norm": 6.40625, "learning_rate": 8.446855412452809e-07, "loss": 1.02361012, "memory(GiB)": 142.32, "step": 146920, "train_speed(iter/s)": 0.285981 }, { "acc": 0.73646526, "epoch": 1.6435161559573126, "grad_norm": 5.46875, "learning_rate": 8.436572175137502e-07, "loss": 1.04939861, "memory(GiB)": 142.32, "step": 146940, "train_speed(iter/s)": 0.285996 }, { "acc": 0.7375989, "epoch": 1.6437398549032711, "grad_norm": 5.09375, "learning_rate": 8.426294624293186e-07, "loss": 1.04282131, "memory(GiB)": 142.32, "step": 146960, "train_speed(iter/s)": 0.28601 }, { "acc": 0.73239498, "epoch": 1.6439635538492297, "grad_norm": 5.84375, "learning_rate": 8.416022761326004e-07, "loss": 1.07358541, "memory(GiB)": 142.32, "step": 146980, "train_speed(iter/s)": 0.286023 }, { "acc": 0.73708591, "epoch": 1.6441872527951882, "grad_norm": 5.40625, "learning_rate": 8.405756587641278e-07, "loss": 1.05233631, "memory(GiB)": 142.32, "step": 147000, "train_speed(iter/s)": 0.286035 }, { "acc": 0.74263811, "epoch": 1.6444109517411467, "grad_norm": 6.28125, "learning_rate": 8.395496104643558e-07, "loss": 1.02532578, "memory(GiB)": 142.32, "step": 147020, "train_speed(iter/s)": 0.286048 }, { "acc": 0.74028282, "epoch": 1.6446346506871055, "grad_norm": 6.46875, "learning_rate": 8.385241313736647e-07, "loss": 1.00985355, "memory(GiB)": 142.32, "step": 147040, "train_speed(iter/s)": 0.286062 }, { "acc": 0.72970963, "epoch": 1.644858349633064, "grad_norm": 6.9375, "learning_rate": 8.374992216323529e-07, "loss": 1.08050671, "memory(GiB)": 142.32, "step": 147060, "train_speed(iter/s)": 0.286076 }, { "acc": 0.73320646, "epoch": 1.6450820485790225, "grad_norm": 4.5, "learning_rate": 8.364748813806444e-07, "loss": 1.06541176, "memory(GiB)": 142.32, "step": 147080, "train_speed(iter/s)": 0.286088 }, { "acc": 0.74723854, "epoch": 1.645305747524981, "grad_norm": 5.625, "learning_rate": 8.354511107586826e-07, "loss": 0.99897022, "memory(GiB)": 142.32, "step": 147100, "train_speed(iter/s)": 0.286101 }, { "acc": 0.73602281, "epoch": 1.6455294464709396, "grad_norm": 6.8125, "learning_rate": 8.344279099065328e-07, "loss": 1.03966618, "memory(GiB)": 142.32, "step": 147120, "train_speed(iter/s)": 0.286115 }, { "acc": 0.7351779, "epoch": 1.645753145416898, "grad_norm": 5.71875, "learning_rate": 8.334052789641856e-07, "loss": 1.06181164, "memory(GiB)": 142.32, "step": 147140, "train_speed(iter/s)": 0.286128 }, { "acc": 0.74196982, "epoch": 1.6459768443628566, "grad_norm": 6.78125, "learning_rate": 8.323832180715497e-07, "loss": 1.03369751, "memory(GiB)": 142.32, "step": 147160, "train_speed(iter/s)": 0.286142 }, { "acc": 0.74069724, "epoch": 1.6462005433088152, "grad_norm": 6.46875, "learning_rate": 8.313617273684593e-07, "loss": 1.03307552, "memory(GiB)": 142.32, "step": 147180, "train_speed(iter/s)": 0.286156 }, { "acc": 0.73212199, "epoch": 1.6464242422547737, "grad_norm": 5.3125, "learning_rate": 8.303408069946689e-07, "loss": 1.06773643, "memory(GiB)": 142.32, "step": 147200, "train_speed(iter/s)": 0.286171 }, { "acc": 0.73983316, "epoch": 1.6466479412007322, "grad_norm": 5.71875, "learning_rate": 8.293204570898528e-07, "loss": 1.04074335, "memory(GiB)": 142.32, "step": 147220, "train_speed(iter/s)": 0.286185 }, { "acc": 0.72628813, "epoch": 1.6468716401466907, "grad_norm": 6.9375, "learning_rate": 8.283006777936114e-07, "loss": 1.08075714, "memory(GiB)": 142.32, "step": 147240, "train_speed(iter/s)": 0.286199 }, { "acc": 0.74893274, "epoch": 1.6470953390926493, "grad_norm": 6.40625, "learning_rate": 8.272814692454644e-07, "loss": 0.98958321, "memory(GiB)": 142.32, "step": 147260, "train_speed(iter/s)": 0.286213 }, { "acc": 0.75870295, "epoch": 1.6473190380386078, "grad_norm": 6.75, "learning_rate": 8.262628315848547e-07, "loss": 0.95527897, "memory(GiB)": 142.32, "step": 147280, "train_speed(iter/s)": 0.286226 }, { "acc": 0.73777361, "epoch": 1.6475427369845663, "grad_norm": 5.40625, "learning_rate": 8.252447649511469e-07, "loss": 1.03277702, "memory(GiB)": 142.32, "step": 147300, "train_speed(iter/s)": 0.286239 }, { "acc": 0.75410328, "epoch": 1.6477664359305249, "grad_norm": 5.75, "learning_rate": 8.242272694836245e-07, "loss": 0.97406073, "memory(GiB)": 142.32, "step": 147320, "train_speed(iter/s)": 0.286253 }, { "acc": 0.73762665, "epoch": 1.6479901348764834, "grad_norm": 4.9375, "learning_rate": 8.232103453214985e-07, "loss": 1.03651695, "memory(GiB)": 142.32, "step": 147340, "train_speed(iter/s)": 0.286265 }, { "acc": 0.74015579, "epoch": 1.648213833822442, "grad_norm": 7.28125, "learning_rate": 8.221939926038963e-07, "loss": 1.0400032, "memory(GiB)": 142.32, "step": 147360, "train_speed(iter/s)": 0.286279 }, { "acc": 0.73582106, "epoch": 1.6484375327684004, "grad_norm": 5.71875, "learning_rate": 8.211782114698713e-07, "loss": 1.05948162, "memory(GiB)": 142.32, "step": 147380, "train_speed(iter/s)": 0.286294 }, { "acc": 0.72803316, "epoch": 1.648661231714359, "grad_norm": 6.5, "learning_rate": 8.201630020583967e-07, "loss": 1.088204, "memory(GiB)": 142.32, "step": 147400, "train_speed(iter/s)": 0.286308 }, { "acc": 0.72770357, "epoch": 1.6488849306603175, "grad_norm": 5.125, "learning_rate": 8.191483645083654e-07, "loss": 1.08780441, "memory(GiB)": 142.32, "step": 147420, "train_speed(iter/s)": 0.286321 }, { "acc": 0.74010248, "epoch": 1.649108629606276, "grad_norm": 7.34375, "learning_rate": 8.181342989585972e-07, "loss": 1.03663836, "memory(GiB)": 142.32, "step": 147440, "train_speed(iter/s)": 0.286335 }, { "acc": 0.73500156, "epoch": 1.6493323285522346, "grad_norm": 6.21875, "learning_rate": 8.171208055478286e-07, "loss": 1.05426216, "memory(GiB)": 142.32, "step": 147460, "train_speed(iter/s)": 0.286347 }, { "acc": 0.72855182, "epoch": 1.649556027498193, "grad_norm": 5.3125, "learning_rate": 8.161078844147219e-07, "loss": 1.08483982, "memory(GiB)": 142.32, "step": 147480, "train_speed(iter/s)": 0.286359 }, { "acc": 0.73874063, "epoch": 1.6497797264441516, "grad_norm": 8.25, "learning_rate": 8.150955356978579e-07, "loss": 1.04367313, "memory(GiB)": 142.32, "step": 147500, "train_speed(iter/s)": 0.286371 }, { "acc": 0.75238986, "epoch": 1.6500034253901101, "grad_norm": 4.6875, "learning_rate": 8.140837595357398e-07, "loss": 0.96363401, "memory(GiB)": 142.32, "step": 147520, "train_speed(iter/s)": 0.286384 }, { "acc": 0.74167933, "epoch": 1.6502271243360687, "grad_norm": 6.40625, "learning_rate": 8.130725560667957e-07, "loss": 1.0307848, "memory(GiB)": 142.32, "step": 147540, "train_speed(iter/s)": 0.286394 }, { "acc": 0.73661537, "epoch": 1.6504508232820272, "grad_norm": 5.96875, "learning_rate": 8.120619254293688e-07, "loss": 1.05079975, "memory(GiB)": 142.32, "step": 147560, "train_speed(iter/s)": 0.286408 }, { "acc": 0.73954334, "epoch": 1.6506745222279857, "grad_norm": 7.0625, "learning_rate": 8.110518677617318e-07, "loss": 1.03322163, "memory(GiB)": 142.32, "step": 147580, "train_speed(iter/s)": 0.286421 }, { "acc": 0.73266287, "epoch": 1.6508982211739442, "grad_norm": 6.0, "learning_rate": 8.100423832020726e-07, "loss": 1.06970577, "memory(GiB)": 142.32, "step": 147600, "train_speed(iter/s)": 0.286433 }, { "acc": 0.73289342, "epoch": 1.6511219201199028, "grad_norm": 6.78125, "learning_rate": 8.090334718885029e-07, "loss": 1.07237377, "memory(GiB)": 142.32, "step": 147620, "train_speed(iter/s)": 0.286445 }, { "acc": 0.74614892, "epoch": 1.6513456190658613, "grad_norm": 6.4375, "learning_rate": 8.080251339590578e-07, "loss": 1.0163023, "memory(GiB)": 142.32, "step": 147640, "train_speed(iter/s)": 0.286458 }, { "acc": 0.73663721, "epoch": 1.6515693180118198, "grad_norm": 6.875, "learning_rate": 8.070173695516908e-07, "loss": 1.0525836, "memory(GiB)": 142.32, "step": 147660, "train_speed(iter/s)": 0.286471 }, { "acc": 0.73114414, "epoch": 1.6517930169577784, "grad_norm": 5.5625, "learning_rate": 8.060101788042801e-07, "loss": 1.07455864, "memory(GiB)": 142.32, "step": 147680, "train_speed(iter/s)": 0.286486 }, { "acc": 0.73006883, "epoch": 1.6520167159037369, "grad_norm": 6.15625, "learning_rate": 8.050035618546226e-07, "loss": 1.079282, "memory(GiB)": 142.32, "step": 147700, "train_speed(iter/s)": 0.2865 }, { "acc": 0.73827615, "epoch": 1.6522404148496954, "grad_norm": 5.125, "learning_rate": 8.039975188404369e-07, "loss": 1.03211918, "memory(GiB)": 142.32, "step": 147720, "train_speed(iter/s)": 0.286514 }, { "acc": 0.73622608, "epoch": 1.652464113795654, "grad_norm": 6.375, "learning_rate": 8.02992049899367e-07, "loss": 1.05548458, "memory(GiB)": 142.32, "step": 147740, "train_speed(iter/s)": 0.286527 }, { "acc": 0.73399253, "epoch": 1.6526878127416125, "grad_norm": 7.09375, "learning_rate": 8.019871551689723e-07, "loss": 1.06186657, "memory(GiB)": 142.32, "step": 147760, "train_speed(iter/s)": 0.286541 }, { "acc": 0.7354917, "epoch": 1.652911511687571, "grad_norm": 7.59375, "learning_rate": 8.009828347867399e-07, "loss": 1.04597826, "memory(GiB)": 142.32, "step": 147780, "train_speed(iter/s)": 0.286553 }, { "acc": 0.74599266, "epoch": 1.6531352106335295, "grad_norm": 5.6875, "learning_rate": 7.999790888900727e-07, "loss": 1.01116791, "memory(GiB)": 142.32, "step": 147800, "train_speed(iter/s)": 0.286566 }, { "acc": 0.74033318, "epoch": 1.653358909579488, "grad_norm": 5.4375, "learning_rate": 7.989759176162976e-07, "loss": 1.03290062, "memory(GiB)": 142.32, "step": 147820, "train_speed(iter/s)": 0.286579 }, { "acc": 0.75005412, "epoch": 1.6535826085254466, "grad_norm": 6.375, "learning_rate": 7.979733211026641e-07, "loss": 0.97261658, "memory(GiB)": 142.32, "step": 147840, "train_speed(iter/s)": 0.286593 }, { "acc": 0.72926779, "epoch": 1.6538063074714051, "grad_norm": 5.5625, "learning_rate": 7.969712994863404e-07, "loss": 1.07520399, "memory(GiB)": 142.32, "step": 147860, "train_speed(iter/s)": 0.286606 }, { "acc": 0.73649812, "epoch": 1.6540300064173636, "grad_norm": 6.625, "learning_rate": 7.959698529044191e-07, "loss": 1.04522057, "memory(GiB)": 142.32, "step": 147880, "train_speed(iter/s)": 0.286619 }, { "acc": 0.73499727, "epoch": 1.6542537053633222, "grad_norm": 6.34375, "learning_rate": 7.949689814939115e-07, "loss": 1.05179214, "memory(GiB)": 142.32, "step": 147900, "train_speed(iter/s)": 0.286632 }, { "acc": 0.73787365, "epoch": 1.6544774043092807, "grad_norm": 6.59375, "learning_rate": 7.939686853917494e-07, "loss": 1.05863018, "memory(GiB)": 142.32, "step": 147920, "train_speed(iter/s)": 0.286646 }, { "acc": 0.71766834, "epoch": 1.6547011032552392, "grad_norm": 6.21875, "learning_rate": 7.929689647347904e-07, "loss": 1.11665764, "memory(GiB)": 142.32, "step": 147940, "train_speed(iter/s)": 0.286661 }, { "acc": 0.75124817, "epoch": 1.6549248022011978, "grad_norm": 5.375, "learning_rate": 7.919698196598086e-07, "loss": 0.97819824, "memory(GiB)": 142.32, "step": 147960, "train_speed(iter/s)": 0.286674 }, { "acc": 0.7388772, "epoch": 1.6551485011471563, "grad_norm": 8.0, "learning_rate": 7.909712503035027e-07, "loss": 1.04556913, "memory(GiB)": 142.32, "step": 147980, "train_speed(iter/s)": 0.286687 }, { "acc": 0.72745008, "epoch": 1.6553722000931148, "grad_norm": 5.1875, "learning_rate": 7.899732568024914e-07, "loss": 1.09143686, "memory(GiB)": 142.32, "step": 148000, "train_speed(iter/s)": 0.286701 }, { "epoch": 1.6553722000931148, "eval_acc": 0.696367654847695, "eval_loss": 1.0713894367218018, "eval_runtime": 2341.6794, "eval_samples_per_second": 32.149, "eval_steps_per_second": 16.075, "step": 148000 }, { "acc": 0.73880768, "epoch": 1.6555958990390733, "grad_norm": 5.34375, "learning_rate": 7.889758392933133e-07, "loss": 1.02867794, "memory(GiB)": 142.32, "step": 148020, "train_speed(iter/s)": 0.28539 }, { "acc": 0.73143444, "epoch": 1.6558195979850319, "grad_norm": 5.4375, "learning_rate": 7.879789979124297e-07, "loss": 1.0707983, "memory(GiB)": 142.32, "step": 148040, "train_speed(iter/s)": 0.285403 }, { "acc": 0.72809477, "epoch": 1.6560432969309904, "grad_norm": 6.46875, "learning_rate": 7.869827327962215e-07, "loss": 1.08645916, "memory(GiB)": 142.32, "step": 148060, "train_speed(iter/s)": 0.285417 }, { "acc": 0.74330792, "epoch": 1.656266995876949, "grad_norm": 6.1875, "learning_rate": 7.859870440809947e-07, "loss": 1.01866064, "memory(GiB)": 142.32, "step": 148080, "train_speed(iter/s)": 0.28543 }, { "acc": 0.73447886, "epoch": 1.6564906948229075, "grad_norm": 5.65625, "learning_rate": 7.849919319029714e-07, "loss": 1.05734367, "memory(GiB)": 142.32, "step": 148100, "train_speed(iter/s)": 0.285444 }, { "acc": 0.73718033, "epoch": 1.656714393768866, "grad_norm": 6.3125, "learning_rate": 7.839973963982995e-07, "loss": 1.04019394, "memory(GiB)": 142.32, "step": 148120, "train_speed(iter/s)": 0.285459 }, { "acc": 0.73682795, "epoch": 1.6569380927148245, "grad_norm": 6.4375, "learning_rate": 7.830034377030437e-07, "loss": 1.04694691, "memory(GiB)": 142.32, "step": 148140, "train_speed(iter/s)": 0.285472 }, { "acc": 0.74557657, "epoch": 1.657161791660783, "grad_norm": 5.8125, "learning_rate": 7.820100559531918e-07, "loss": 1.01773319, "memory(GiB)": 142.32, "step": 148160, "train_speed(iter/s)": 0.285484 }, { "acc": 0.74004021, "epoch": 1.6573854906067416, "grad_norm": 6.4375, "learning_rate": 7.810172512846537e-07, "loss": 1.05143814, "memory(GiB)": 142.32, "step": 148180, "train_speed(iter/s)": 0.285497 }, { "acc": 0.74252944, "epoch": 1.6576091895527, "grad_norm": 5.25, "learning_rate": 7.800250238332585e-07, "loss": 1.01510544, "memory(GiB)": 142.32, "step": 148200, "train_speed(iter/s)": 0.285511 }, { "acc": 0.74207759, "epoch": 1.6578328884986586, "grad_norm": 6.15625, "learning_rate": 7.790333737347577e-07, "loss": 1.02058544, "memory(GiB)": 142.32, "step": 148220, "train_speed(iter/s)": 0.285524 }, { "acc": 0.74503145, "epoch": 1.6580565874446171, "grad_norm": 5.5, "learning_rate": 7.780423011248234e-07, "loss": 0.99798527, "memory(GiB)": 142.32, "step": 148240, "train_speed(iter/s)": 0.285538 }, { "acc": 0.73566022, "epoch": 1.6582802863905757, "grad_norm": 5.875, "learning_rate": 7.770518061390469e-07, "loss": 1.06531887, "memory(GiB)": 142.32, "step": 148260, "train_speed(iter/s)": 0.285551 }, { "acc": 0.74702673, "epoch": 1.6585039853365342, "grad_norm": 6.25, "learning_rate": 7.76061888912944e-07, "loss": 1.00026989, "memory(GiB)": 142.32, "step": 148280, "train_speed(iter/s)": 0.285564 }, { "acc": 0.73974056, "epoch": 1.6587276842824927, "grad_norm": 5.6875, "learning_rate": 7.750725495819478e-07, "loss": 1.03786087, "memory(GiB)": 142.32, "step": 148300, "train_speed(iter/s)": 0.285576 }, { "acc": 0.735361, "epoch": 1.6589513832284513, "grad_norm": 6.625, "learning_rate": 7.740837882814156e-07, "loss": 1.05649357, "memory(GiB)": 142.32, "step": 148320, "train_speed(iter/s)": 0.285589 }, { "acc": 0.73241768, "epoch": 1.6591750821744098, "grad_norm": 7.25, "learning_rate": 7.730956051466237e-07, "loss": 1.07253542, "memory(GiB)": 142.32, "step": 148340, "train_speed(iter/s)": 0.285602 }, { "acc": 0.73312922, "epoch": 1.6593987811203683, "grad_norm": 5.46875, "learning_rate": 7.721080003127685e-07, "loss": 1.0652607, "memory(GiB)": 142.32, "step": 148360, "train_speed(iter/s)": 0.285615 }, { "acc": 0.7494441, "epoch": 1.6596224800663268, "grad_norm": 5.15625, "learning_rate": 7.711209739149706e-07, "loss": 0.99686546, "memory(GiB)": 142.32, "step": 148380, "train_speed(iter/s)": 0.285627 }, { "acc": 0.74017162, "epoch": 1.6598461790122854, "grad_norm": 5.21875, "learning_rate": 7.701345260882665e-07, "loss": 1.03599911, "memory(GiB)": 142.32, "step": 148400, "train_speed(iter/s)": 0.285638 }, { "acc": 0.73998351, "epoch": 1.660069877958244, "grad_norm": 5.1875, "learning_rate": 7.691486569676193e-07, "loss": 1.03028316, "memory(GiB)": 142.32, "step": 148420, "train_speed(iter/s)": 0.285651 }, { "acc": 0.73929, "epoch": 1.6602935769042024, "grad_norm": 5.65625, "learning_rate": 7.681633666879085e-07, "loss": 1.01987543, "memory(GiB)": 142.32, "step": 148440, "train_speed(iter/s)": 0.285665 }, { "acc": 0.73937016, "epoch": 1.660517275850161, "grad_norm": 5.0625, "learning_rate": 7.67178655383935e-07, "loss": 1.03310966, "memory(GiB)": 142.32, "step": 148460, "train_speed(iter/s)": 0.28568 }, { "acc": 0.73465986, "epoch": 1.6607409747961195, "grad_norm": 5.375, "learning_rate": 7.661945231904233e-07, "loss": 1.07405262, "memory(GiB)": 142.32, "step": 148480, "train_speed(iter/s)": 0.285692 }, { "acc": 0.73411474, "epoch": 1.660964673742078, "grad_norm": 6.9375, "learning_rate": 7.652109702420152e-07, "loss": 1.06611786, "memory(GiB)": 142.32, "step": 148500, "train_speed(iter/s)": 0.285705 }, { "acc": 0.72905416, "epoch": 1.6611883726880365, "grad_norm": 5.96875, "learning_rate": 7.642279966732763e-07, "loss": 1.08348122, "memory(GiB)": 142.32, "step": 148520, "train_speed(iter/s)": 0.285719 }, { "acc": 0.73183708, "epoch": 1.661412071633995, "grad_norm": 5.5, "learning_rate": 7.632456026186902e-07, "loss": 1.05864439, "memory(GiB)": 142.32, "step": 148540, "train_speed(iter/s)": 0.285732 }, { "acc": 0.73868685, "epoch": 1.6616357705799536, "grad_norm": 6.5625, "learning_rate": 7.622637882126622e-07, "loss": 1.04091072, "memory(GiB)": 142.32, "step": 148560, "train_speed(iter/s)": 0.285745 }, { "acc": 0.7465888, "epoch": 1.6618594695259121, "grad_norm": 6.6875, "learning_rate": 7.612825535895202e-07, "loss": 1.01623154, "memory(GiB)": 142.32, "step": 148580, "train_speed(iter/s)": 0.285759 }, { "acc": 0.72986555, "epoch": 1.6620831684718707, "grad_norm": 6.21875, "learning_rate": 7.603018988835087e-07, "loss": 1.05678387, "memory(GiB)": 142.32, "step": 148600, "train_speed(iter/s)": 0.285773 }, { "acc": 0.73586855, "epoch": 1.6623068674178292, "grad_norm": 6.65625, "learning_rate": 7.59321824228798e-07, "loss": 1.04797297, "memory(GiB)": 142.32, "step": 148620, "train_speed(iter/s)": 0.285786 }, { "acc": 0.73780465, "epoch": 1.6625305663637877, "grad_norm": 5.6875, "learning_rate": 7.583423297594755e-07, "loss": 1.03967857, "memory(GiB)": 142.32, "step": 148640, "train_speed(iter/s)": 0.285799 }, { "acc": 0.73394403, "epoch": 1.6627542653097462, "grad_norm": 6.5625, "learning_rate": 7.573634156095478e-07, "loss": 1.04091339, "memory(GiB)": 142.32, "step": 148660, "train_speed(iter/s)": 0.285812 }, { "acc": 0.72458916, "epoch": 1.6629779642557048, "grad_norm": 5.0, "learning_rate": 7.563850819129475e-07, "loss": 1.08893013, "memory(GiB)": 142.32, "step": 148680, "train_speed(iter/s)": 0.285825 }, { "acc": 0.75040989, "epoch": 1.6632016632016633, "grad_norm": 5.46875, "learning_rate": 7.554073288035218e-07, "loss": 0.98319883, "memory(GiB)": 142.32, "step": 148700, "train_speed(iter/s)": 0.285838 }, { "acc": 0.73880854, "epoch": 1.6634253621476218, "grad_norm": 6.75, "learning_rate": 7.544301564150442e-07, "loss": 1.03591833, "memory(GiB)": 142.32, "step": 148720, "train_speed(iter/s)": 0.28585 }, { "acc": 0.75044298, "epoch": 1.6636490610935804, "grad_norm": 6.21875, "learning_rate": 7.534535648812041e-07, "loss": 0.97849655, "memory(GiB)": 142.32, "step": 148740, "train_speed(iter/s)": 0.285864 }, { "acc": 0.74213557, "epoch": 1.6638727600395389, "grad_norm": 5.65625, "learning_rate": 7.524775543356128e-07, "loss": 1.01508656, "memory(GiB)": 142.32, "step": 148760, "train_speed(iter/s)": 0.285877 }, { "acc": 0.72916861, "epoch": 1.6640964589854974, "grad_norm": 6.25, "learning_rate": 7.515021249118038e-07, "loss": 1.07559643, "memory(GiB)": 142.32, "step": 148780, "train_speed(iter/s)": 0.285891 }, { "acc": 0.74713335, "epoch": 1.664320157931456, "grad_norm": 5.21875, "learning_rate": 7.505272767432281e-07, "loss": 0.9911932, "memory(GiB)": 142.32, "step": 148800, "train_speed(iter/s)": 0.285905 }, { "acc": 0.73802199, "epoch": 1.6645438568774145, "grad_norm": 5.53125, "learning_rate": 7.495530099632614e-07, "loss": 1.04168949, "memory(GiB)": 142.32, "step": 148820, "train_speed(iter/s)": 0.285918 }, { "acc": 0.74633675, "epoch": 1.664767555823373, "grad_norm": 5.4375, "learning_rate": 7.485793247051954e-07, "loss": 1.00557766, "memory(GiB)": 142.32, "step": 148840, "train_speed(iter/s)": 0.285931 }, { "acc": 0.74010763, "epoch": 1.6649912547693315, "grad_norm": 7.65625, "learning_rate": 7.476062211022433e-07, "loss": 1.03582478, "memory(GiB)": 142.32, "step": 148860, "train_speed(iter/s)": 0.285944 }, { "acc": 0.74349298, "epoch": 1.66521495371529, "grad_norm": 5.4375, "learning_rate": 7.466336992875423e-07, "loss": 1.02327728, "memory(GiB)": 142.32, "step": 148880, "train_speed(iter/s)": 0.285958 }, { "acc": 0.73757248, "epoch": 1.6654386526612486, "grad_norm": 6.375, "learning_rate": 7.456617593941451e-07, "loss": 1.04799585, "memory(GiB)": 142.32, "step": 148900, "train_speed(iter/s)": 0.28597 }, { "acc": 0.73736525, "epoch": 1.665662351607207, "grad_norm": 7.125, "learning_rate": 7.446904015550282e-07, "loss": 1.04934759, "memory(GiB)": 142.32, "step": 148920, "train_speed(iter/s)": 0.285984 }, { "acc": 0.74443293, "epoch": 1.6658860505531656, "grad_norm": 6.5, "learning_rate": 7.437196259030871e-07, "loss": 1.01902275, "memory(GiB)": 142.32, "step": 148940, "train_speed(iter/s)": 0.285998 }, { "acc": 0.72311573, "epoch": 1.6661097494991242, "grad_norm": 5.96875, "learning_rate": 7.42749432571136e-07, "loss": 1.11195717, "memory(GiB)": 142.32, "step": 148960, "train_speed(iter/s)": 0.286012 }, { "acc": 0.72988272, "epoch": 1.6663334484450827, "grad_norm": 7.625, "learning_rate": 7.417798216919142e-07, "loss": 1.07440929, "memory(GiB)": 142.32, "step": 148980, "train_speed(iter/s)": 0.286026 }, { "acc": 0.7239418, "epoch": 1.6665571473910412, "grad_norm": 5.65625, "learning_rate": 7.408107933980751e-07, "loss": 1.1005518, "memory(GiB)": 142.32, "step": 149000, "train_speed(iter/s)": 0.286039 }, { "acc": 0.72532234, "epoch": 1.6667808463369997, "grad_norm": 7.5625, "learning_rate": 7.398423478221989e-07, "loss": 1.10489998, "memory(GiB)": 142.32, "step": 149020, "train_speed(iter/s)": 0.286051 }, { "acc": 0.74558878, "epoch": 1.6670045452829583, "grad_norm": 5.59375, "learning_rate": 7.38874485096781e-07, "loss": 1.00202827, "memory(GiB)": 142.32, "step": 149040, "train_speed(iter/s)": 0.286063 }, { "acc": 0.74096155, "epoch": 1.6672282442289168, "grad_norm": 6.15625, "learning_rate": 7.37907205354238e-07, "loss": 1.03293161, "memory(GiB)": 142.32, "step": 149060, "train_speed(iter/s)": 0.286076 }, { "acc": 0.73616843, "epoch": 1.6674519431748753, "grad_norm": 5.84375, "learning_rate": 7.369405087269099e-07, "loss": 1.07308588, "memory(GiB)": 142.32, "step": 149080, "train_speed(iter/s)": 0.28609 }, { "acc": 0.71707964, "epoch": 1.6676756421208339, "grad_norm": 6.3125, "learning_rate": 7.359743953470522e-07, "loss": 1.14263191, "memory(GiB)": 142.32, "step": 149100, "train_speed(iter/s)": 0.286103 }, { "acc": 0.74593577, "epoch": 1.6678993410667924, "grad_norm": 6.71875, "learning_rate": 7.350088653468451e-07, "loss": 1.01842632, "memory(GiB)": 142.32, "step": 149120, "train_speed(iter/s)": 0.286117 }, { "acc": 0.74593024, "epoch": 1.668123040012751, "grad_norm": 7.375, "learning_rate": 7.340439188583864e-07, "loss": 1.0104578, "memory(GiB)": 142.32, "step": 149140, "train_speed(iter/s)": 0.286131 }, { "acc": 0.741572, "epoch": 1.6683467389587094, "grad_norm": 5.78125, "learning_rate": 7.330795560136928e-07, "loss": 1.02537384, "memory(GiB)": 142.32, "step": 149160, "train_speed(iter/s)": 0.286143 }, { "acc": 0.74056606, "epoch": 1.668570437904668, "grad_norm": 5.78125, "learning_rate": 7.321157769447052e-07, "loss": 1.02810354, "memory(GiB)": 142.32, "step": 149180, "train_speed(iter/s)": 0.286156 }, { "acc": 0.74969301, "epoch": 1.6687941368506265, "grad_norm": 5.59375, "learning_rate": 7.311525817832816e-07, "loss": 0.99553871, "memory(GiB)": 142.32, "step": 149200, "train_speed(iter/s)": 0.286169 }, { "acc": 0.74067087, "epoch": 1.669017835796585, "grad_norm": 4.8125, "learning_rate": 7.301899706612009e-07, "loss": 1.02474022, "memory(GiB)": 142.32, "step": 149220, "train_speed(iter/s)": 0.286182 }, { "acc": 0.73808022, "epoch": 1.6692415347425436, "grad_norm": 6.09375, "learning_rate": 7.292279437101601e-07, "loss": 1.03190699, "memory(GiB)": 142.32, "step": 149240, "train_speed(iter/s)": 0.286195 }, { "acc": 0.75095391, "epoch": 1.669465233688502, "grad_norm": 6.8125, "learning_rate": 7.282665010617817e-07, "loss": 0.96742802, "memory(GiB)": 142.32, "step": 149260, "train_speed(iter/s)": 0.286209 }, { "acc": 0.7609705, "epoch": 1.6696889326344606, "grad_norm": 6.03125, "learning_rate": 7.273056428476033e-07, "loss": 0.92822514, "memory(GiB)": 142.32, "step": 149280, "train_speed(iter/s)": 0.286221 }, { "acc": 0.72784529, "epoch": 1.6699126315804191, "grad_norm": 5.15625, "learning_rate": 7.263453691990824e-07, "loss": 1.08972988, "memory(GiB)": 142.32, "step": 149300, "train_speed(iter/s)": 0.286234 }, { "acc": 0.73869619, "epoch": 1.6701363305263777, "grad_norm": 6.96875, "learning_rate": 7.253856802476006e-07, "loss": 1.03186913, "memory(GiB)": 142.32, "step": 149320, "train_speed(iter/s)": 0.286247 }, { "acc": 0.74369111, "epoch": 1.6703600294723362, "grad_norm": 5.59375, "learning_rate": 7.244265761244551e-07, "loss": 1.02036896, "memory(GiB)": 142.32, "step": 149340, "train_speed(iter/s)": 0.286259 }, { "acc": 0.7360465, "epoch": 1.6705837284182947, "grad_norm": 7.3125, "learning_rate": 7.234680569608671e-07, "loss": 1.05582409, "memory(GiB)": 142.32, "step": 149360, "train_speed(iter/s)": 0.286273 }, { "acc": 0.72914686, "epoch": 1.6708074273642533, "grad_norm": 7.09375, "learning_rate": 7.225101228879744e-07, "loss": 1.08948288, "memory(GiB)": 142.32, "step": 149380, "train_speed(iter/s)": 0.286286 }, { "acc": 0.73900099, "epoch": 1.6710311263102118, "grad_norm": 6.9375, "learning_rate": 7.215527740368356e-07, "loss": 1.04851151, "memory(GiB)": 142.32, "step": 149400, "train_speed(iter/s)": 0.286299 }, { "acc": 0.74037175, "epoch": 1.6712548252561703, "grad_norm": 6.59375, "learning_rate": 7.205960105384313e-07, "loss": 1.01597443, "memory(GiB)": 142.32, "step": 149420, "train_speed(iter/s)": 0.286313 }, { "acc": 0.73455038, "epoch": 1.6714785242021288, "grad_norm": 7.25, "learning_rate": 7.196398325236581e-07, "loss": 1.06443501, "memory(GiB)": 142.32, "step": 149440, "train_speed(iter/s)": 0.286326 }, { "acc": 0.73253994, "epoch": 1.6717022231480874, "grad_norm": 7.65625, "learning_rate": 7.18684240123338e-07, "loss": 1.05511513, "memory(GiB)": 142.32, "step": 149460, "train_speed(iter/s)": 0.286337 }, { "acc": 0.74061975, "epoch": 1.671925922094046, "grad_norm": 5.21875, "learning_rate": 7.177292334682073e-07, "loss": 1.02775373, "memory(GiB)": 142.32, "step": 149480, "train_speed(iter/s)": 0.286351 }, { "acc": 0.73825607, "epoch": 1.6721496210400044, "grad_norm": 5.125, "learning_rate": 7.167748126889246e-07, "loss": 1.03294477, "memory(GiB)": 142.32, "step": 149500, "train_speed(iter/s)": 0.286364 }, { "acc": 0.73101568, "epoch": 1.672373319985963, "grad_norm": 6.0625, "learning_rate": 7.158209779160697e-07, "loss": 1.05695972, "memory(GiB)": 142.32, "step": 149520, "train_speed(iter/s)": 0.286377 }, { "acc": 0.73926854, "epoch": 1.6725970189319215, "grad_norm": 7.25, "learning_rate": 7.148677292801393e-07, "loss": 1.05157843, "memory(GiB)": 142.32, "step": 149540, "train_speed(iter/s)": 0.28639 }, { "acc": 0.73235512, "epoch": 1.67282071787788, "grad_norm": 6.5, "learning_rate": 7.13915066911553e-07, "loss": 1.06994314, "memory(GiB)": 142.32, "step": 149560, "train_speed(iter/s)": 0.286404 }, { "acc": 0.74499063, "epoch": 1.6730444168238385, "grad_norm": 5.875, "learning_rate": 7.129629909406483e-07, "loss": 1.02129707, "memory(GiB)": 142.32, "step": 149580, "train_speed(iter/s)": 0.286416 }, { "acc": 0.7434803, "epoch": 1.673268115769797, "grad_norm": 6.71875, "learning_rate": 7.120115014976803e-07, "loss": 1.01802902, "memory(GiB)": 142.32, "step": 149600, "train_speed(iter/s)": 0.286429 }, { "acc": 0.73501239, "epoch": 1.6734918147157556, "grad_norm": 5.8125, "learning_rate": 7.110605987128305e-07, "loss": 1.06815052, "memory(GiB)": 142.32, "step": 149620, "train_speed(iter/s)": 0.286441 }, { "acc": 0.73670421, "epoch": 1.6737155136617141, "grad_norm": 6.15625, "learning_rate": 7.101102827161921e-07, "loss": 1.05088215, "memory(GiB)": 142.32, "step": 149640, "train_speed(iter/s)": 0.286454 }, { "acc": 0.73210411, "epoch": 1.6739392126076726, "grad_norm": 6.65625, "learning_rate": 7.091605536377849e-07, "loss": 1.07208138, "memory(GiB)": 142.32, "step": 149660, "train_speed(iter/s)": 0.286468 }, { "acc": 0.73778534, "epoch": 1.6741629115536312, "grad_norm": 6.78125, "learning_rate": 7.082114116075445e-07, "loss": 1.03869457, "memory(GiB)": 142.32, "step": 149680, "train_speed(iter/s)": 0.286482 }, { "acc": 0.74316792, "epoch": 1.6743866104995897, "grad_norm": 4.875, "learning_rate": 7.072628567553253e-07, "loss": 1.00471096, "memory(GiB)": 142.32, "step": 149700, "train_speed(iter/s)": 0.286494 }, { "acc": 0.74797945, "epoch": 1.6746103094455482, "grad_norm": 5.375, "learning_rate": 7.063148892109056e-07, "loss": 1.00346508, "memory(GiB)": 142.32, "step": 149720, "train_speed(iter/s)": 0.286507 }, { "acc": 0.73430204, "epoch": 1.6748340083915068, "grad_norm": 5.90625, "learning_rate": 7.053675091039792e-07, "loss": 1.04823389, "memory(GiB)": 142.32, "step": 149740, "train_speed(iter/s)": 0.28652 }, { "acc": 0.74475088, "epoch": 1.6750577073374653, "grad_norm": 5.71875, "learning_rate": 7.044207165641631e-07, "loss": 1.00837879, "memory(GiB)": 142.32, "step": 149760, "train_speed(iter/s)": 0.286533 }, { "acc": 0.72998905, "epoch": 1.6752814062834238, "grad_norm": 5.9375, "learning_rate": 7.034745117209907e-07, "loss": 1.10051775, "memory(GiB)": 142.32, "step": 149780, "train_speed(iter/s)": 0.286545 }, { "acc": 0.72884188, "epoch": 1.6755051052293823, "grad_norm": 6.4375, "learning_rate": 7.025288947039161e-07, "loss": 1.08595715, "memory(GiB)": 142.32, "step": 149800, "train_speed(iter/s)": 0.286558 }, { "acc": 0.72569199, "epoch": 1.6757288041753409, "grad_norm": 6.5, "learning_rate": 7.015838656423141e-07, "loss": 1.11978283, "memory(GiB)": 142.32, "step": 149820, "train_speed(iter/s)": 0.286571 }, { "acc": 0.73582973, "epoch": 1.6759525031212994, "grad_norm": 5.71875, "learning_rate": 7.006394246654768e-07, "loss": 1.05514526, "memory(GiB)": 142.32, "step": 149840, "train_speed(iter/s)": 0.286584 }, { "acc": 0.72834148, "epoch": 1.676176202067258, "grad_norm": 6.34375, "learning_rate": 6.9969557190262e-07, "loss": 1.08436718, "memory(GiB)": 142.32, "step": 149860, "train_speed(iter/s)": 0.286596 }, { "acc": 0.75176082, "epoch": 1.6763999010132165, "grad_norm": 6.71875, "learning_rate": 6.987523074828739e-07, "loss": 0.97939587, "memory(GiB)": 142.32, "step": 149880, "train_speed(iter/s)": 0.28661 }, { "acc": 0.74023695, "epoch": 1.676623599959175, "grad_norm": 6.5, "learning_rate": 6.9780963153529e-07, "loss": 1.02010384, "memory(GiB)": 142.32, "step": 149900, "train_speed(iter/s)": 0.286623 }, { "acc": 0.73639441, "epoch": 1.6768472989051335, "grad_norm": 6.78125, "learning_rate": 6.968675441888422e-07, "loss": 1.06326208, "memory(GiB)": 142.32, "step": 149920, "train_speed(iter/s)": 0.286636 }, { "acc": 0.73097825, "epoch": 1.677070997851092, "grad_norm": 5.875, "learning_rate": 6.959260455724193e-07, "loss": 1.08545208, "memory(GiB)": 142.32, "step": 149940, "train_speed(iter/s)": 0.286648 }, { "acc": 0.74784193, "epoch": 1.6772946967970506, "grad_norm": 5.3125, "learning_rate": 6.949851358148335e-07, "loss": 0.99641876, "memory(GiB)": 142.32, "step": 149960, "train_speed(iter/s)": 0.286661 }, { "acc": 0.73807101, "epoch": 1.677518395743009, "grad_norm": 5.90625, "learning_rate": 6.940448150448143e-07, "loss": 1.03903551, "memory(GiB)": 142.32, "step": 149980, "train_speed(iter/s)": 0.286675 }, { "acc": 0.73430681, "epoch": 1.6777420946889676, "grad_norm": 5.8125, "learning_rate": 6.931050833910097e-07, "loss": 1.06238136, "memory(GiB)": 142.32, "step": 150000, "train_speed(iter/s)": 0.286688 }, { "epoch": 1.6777420946889676, "eval_acc": 0.6963762324656728, "eval_loss": 1.0713897943496704, "eval_runtime": 2342.1907, "eval_samples_per_second": 32.142, "eval_steps_per_second": 16.071, "step": 150000 }, { "acc": 0.74024334, "epoch": 1.6779657936349262, "grad_norm": 5.59375, "learning_rate": 6.921659409819903e-07, "loss": 1.02347984, "memory(GiB)": 142.32, "step": 150020, "train_speed(iter/s)": 0.285396 }, { "acc": 0.7423749, "epoch": 1.6781894925808847, "grad_norm": 5.96875, "learning_rate": 6.912273879462422e-07, "loss": 1.03355064, "memory(GiB)": 142.32, "step": 150040, "train_speed(iter/s)": 0.285409 }, { "acc": 0.74807696, "epoch": 1.6784131915268432, "grad_norm": 5.0625, "learning_rate": 6.90289424412175e-07, "loss": 0.99197102, "memory(GiB)": 142.32, "step": 150060, "train_speed(iter/s)": 0.285423 }, { "acc": 0.73295503, "epoch": 1.6786368904728017, "grad_norm": 5.59375, "learning_rate": 6.893520505081147e-07, "loss": 1.05885754, "memory(GiB)": 142.32, "step": 150080, "train_speed(iter/s)": 0.285437 }, { "acc": 0.73808661, "epoch": 1.6788605894187603, "grad_norm": 5.625, "learning_rate": 6.884152663623061e-07, "loss": 1.03154964, "memory(GiB)": 142.32, "step": 150100, "train_speed(iter/s)": 0.285447 }, { "acc": 0.73255439, "epoch": 1.6790842883647188, "grad_norm": 6.125, "learning_rate": 6.874790721029167e-07, "loss": 1.07062645, "memory(GiB)": 142.32, "step": 150120, "train_speed(iter/s)": 0.285461 }, { "acc": 0.73972282, "epoch": 1.6793079873106773, "grad_norm": 6.75, "learning_rate": 6.865434678580296e-07, "loss": 1.02442818, "memory(GiB)": 142.32, "step": 150140, "train_speed(iter/s)": 0.285474 }, { "acc": 0.71899643, "epoch": 1.6795316862566358, "grad_norm": 5.8125, "learning_rate": 6.856084537556507e-07, "loss": 1.1318224, "memory(GiB)": 142.32, "step": 150160, "train_speed(iter/s)": 0.285488 }, { "acc": 0.739221, "epoch": 1.6797553852025944, "grad_norm": 5.71875, "learning_rate": 6.846740299237015e-07, "loss": 1.03552876, "memory(GiB)": 142.32, "step": 150180, "train_speed(iter/s)": 0.2855 }, { "acc": 0.74254751, "epoch": 1.679979084148553, "grad_norm": 5.28125, "learning_rate": 6.837401964900248e-07, "loss": 1.01866207, "memory(GiB)": 142.32, "step": 150200, "train_speed(iter/s)": 0.285513 }, { "acc": 0.73371325, "epoch": 1.6802027830945114, "grad_norm": 6.125, "learning_rate": 6.828069535823839e-07, "loss": 1.05349684, "memory(GiB)": 142.32, "step": 150220, "train_speed(iter/s)": 0.285526 }, { "acc": 0.72351975, "epoch": 1.68042648204047, "grad_norm": 5.09375, "learning_rate": 6.818743013284573e-07, "loss": 1.11772261, "memory(GiB)": 142.32, "step": 150240, "train_speed(iter/s)": 0.285539 }, { "acc": 0.73112717, "epoch": 1.6806501809864285, "grad_norm": 6.9375, "learning_rate": 6.809422398558474e-07, "loss": 1.0772768, "memory(GiB)": 142.32, "step": 150260, "train_speed(iter/s)": 0.285551 }, { "acc": 0.73471308, "epoch": 1.680873879932387, "grad_norm": 6.28125, "learning_rate": 6.800107692920732e-07, "loss": 1.06124477, "memory(GiB)": 142.32, "step": 150280, "train_speed(iter/s)": 0.285564 }, { "acc": 0.72783403, "epoch": 1.6810975788783455, "grad_norm": 5.625, "learning_rate": 6.790798897645712e-07, "loss": 1.08834534, "memory(GiB)": 142.32, "step": 150300, "train_speed(iter/s)": 0.285578 }, { "acc": 0.73908606, "epoch": 1.681321277824304, "grad_norm": 6.5625, "learning_rate": 6.781496014007016e-07, "loss": 1.0439291, "memory(GiB)": 142.32, "step": 150320, "train_speed(iter/s)": 0.285592 }, { "acc": 0.74264278, "epoch": 1.6815449767702626, "grad_norm": 6.34375, "learning_rate": 6.772199043277389e-07, "loss": 1.0168582, "memory(GiB)": 142.32, "step": 150340, "train_speed(iter/s)": 0.285605 }, { "acc": 0.75288992, "epoch": 1.6817686757162211, "grad_norm": 6.375, "learning_rate": 6.762907986728811e-07, "loss": 0.99149933, "memory(GiB)": 142.32, "step": 150360, "train_speed(iter/s)": 0.285619 }, { "acc": 0.73579283, "epoch": 1.6819923746621797, "grad_norm": 6.375, "learning_rate": 6.753622845632424e-07, "loss": 1.05544262, "memory(GiB)": 142.32, "step": 150380, "train_speed(iter/s)": 0.285632 }, { "acc": 0.73665781, "epoch": 1.6822160736081382, "grad_norm": 7.75, "learning_rate": 6.744343621258565e-07, "loss": 1.04636955, "memory(GiB)": 142.32, "step": 150400, "train_speed(iter/s)": 0.285644 }, { "acc": 0.7386209, "epoch": 1.6824397725540967, "grad_norm": 6.375, "learning_rate": 6.735070314876757e-07, "loss": 1.04280109, "memory(GiB)": 142.32, "step": 150420, "train_speed(iter/s)": 0.285657 }, { "acc": 0.73344302, "epoch": 1.6826634715000552, "grad_norm": 5.875, "learning_rate": 6.725802927755726e-07, "loss": 1.06225853, "memory(GiB)": 142.32, "step": 150440, "train_speed(iter/s)": 0.285669 }, { "acc": 0.72118692, "epoch": 1.6828871704460138, "grad_norm": 5.34375, "learning_rate": 6.71654146116339e-07, "loss": 1.10813713, "memory(GiB)": 142.32, "step": 150460, "train_speed(iter/s)": 0.285682 }, { "acc": 0.73362026, "epoch": 1.6831108693919723, "grad_norm": 6.75, "learning_rate": 6.70728591636684e-07, "loss": 1.06777582, "memory(GiB)": 142.32, "step": 150480, "train_speed(iter/s)": 0.285696 }, { "acc": 0.73665724, "epoch": 1.6833345683379308, "grad_norm": 6.625, "learning_rate": 6.698036294632376e-07, "loss": 1.04583473, "memory(GiB)": 142.32, "step": 150500, "train_speed(iter/s)": 0.285709 }, { "acc": 0.72873569, "epoch": 1.6835582672838894, "grad_norm": 6.46875, "learning_rate": 6.68879259722548e-07, "loss": 1.06871433, "memory(GiB)": 142.32, "step": 150520, "train_speed(iter/s)": 0.285721 }, { "acc": 0.72902565, "epoch": 1.6837819662298479, "grad_norm": 6.6875, "learning_rate": 6.679554825410806e-07, "loss": 1.07325687, "memory(GiB)": 142.32, "step": 150540, "train_speed(iter/s)": 0.285734 }, { "acc": 0.75219584, "epoch": 1.6840056651758064, "grad_norm": 5.0625, "learning_rate": 6.670322980452232e-07, "loss": 0.98974819, "memory(GiB)": 142.32, "step": 150560, "train_speed(iter/s)": 0.285747 }, { "acc": 0.75071511, "epoch": 1.684229364121765, "grad_norm": 6.40625, "learning_rate": 6.661097063612787e-07, "loss": 0.98851376, "memory(GiB)": 142.32, "step": 150580, "train_speed(iter/s)": 0.28576 }, { "acc": 0.73521008, "epoch": 1.6844530630677235, "grad_norm": 6.125, "learning_rate": 6.651877076154728e-07, "loss": 1.05404072, "memory(GiB)": 142.32, "step": 150600, "train_speed(iter/s)": 0.285772 }, { "acc": 0.73300142, "epoch": 1.684676762013682, "grad_norm": 6.65625, "learning_rate": 6.642663019339479e-07, "loss": 1.04495354, "memory(GiB)": 142.32, "step": 150620, "train_speed(iter/s)": 0.285786 }, { "acc": 0.73433886, "epoch": 1.6849004609596405, "grad_norm": 6.03125, "learning_rate": 6.633454894427632e-07, "loss": 1.07046432, "memory(GiB)": 142.32, "step": 150640, "train_speed(iter/s)": 0.2858 }, { "acc": 0.74142218, "epoch": 1.685124159905599, "grad_norm": 6.625, "learning_rate": 6.624252702679018e-07, "loss": 1.02883272, "memory(GiB)": 142.32, "step": 150660, "train_speed(iter/s)": 0.285813 }, { "acc": 0.7325263, "epoch": 1.6853478588515576, "grad_norm": 6.6875, "learning_rate": 6.615056445352607e-07, "loss": 1.0677228, "memory(GiB)": 142.32, "step": 150680, "train_speed(iter/s)": 0.285825 }, { "acc": 0.73845844, "epoch": 1.685571557797516, "grad_norm": 5.375, "learning_rate": 6.6058661237066e-07, "loss": 1.02775393, "memory(GiB)": 142.32, "step": 150700, "train_speed(iter/s)": 0.285837 }, { "acc": 0.74308262, "epoch": 1.6857952567434746, "grad_norm": 6.03125, "learning_rate": 6.59668173899835e-07, "loss": 1.0136342, "memory(GiB)": 142.32, "step": 150720, "train_speed(iter/s)": 0.285851 }, { "acc": 0.74357858, "epoch": 1.6860189556894332, "grad_norm": 5.9375, "learning_rate": 6.587503292484404e-07, "loss": 1.00389366, "memory(GiB)": 142.32, "step": 150740, "train_speed(iter/s)": 0.285864 }, { "acc": 0.74230032, "epoch": 1.6862426546353917, "grad_norm": 5.03125, "learning_rate": 6.578330785420528e-07, "loss": 1.02042131, "memory(GiB)": 142.32, "step": 150760, "train_speed(iter/s)": 0.285877 }, { "acc": 0.74229259, "epoch": 1.6864663535813502, "grad_norm": 6.90625, "learning_rate": 6.569164219061625e-07, "loss": 1.02137413, "memory(GiB)": 142.32, "step": 150780, "train_speed(iter/s)": 0.285891 }, { "acc": 0.74386005, "epoch": 1.6866900525273087, "grad_norm": 7.71875, "learning_rate": 6.56000359466184e-07, "loss": 1.00709305, "memory(GiB)": 142.32, "step": 150800, "train_speed(iter/s)": 0.285904 }, { "acc": 0.7273005, "epoch": 1.6869137514732673, "grad_norm": 7.28125, "learning_rate": 6.550848913474462e-07, "loss": 1.08797703, "memory(GiB)": 142.32, "step": 150820, "train_speed(iter/s)": 0.285916 }, { "acc": 0.72869873, "epoch": 1.6871374504192258, "grad_norm": 7.15625, "learning_rate": 6.541700176751975e-07, "loss": 1.09670238, "memory(GiB)": 142.32, "step": 150840, "train_speed(iter/s)": 0.285929 }, { "acc": 0.73481708, "epoch": 1.6873611493651843, "grad_norm": 5.75, "learning_rate": 6.532557385746075e-07, "loss": 1.06513309, "memory(GiB)": 142.32, "step": 150860, "train_speed(iter/s)": 0.285942 }, { "acc": 0.74617243, "epoch": 1.6875848483111429, "grad_norm": 5.625, "learning_rate": 6.52342054170761e-07, "loss": 1.01008949, "memory(GiB)": 142.32, "step": 150880, "train_speed(iter/s)": 0.285954 }, { "acc": 0.72920485, "epoch": 1.6878085472571014, "grad_norm": 6.875, "learning_rate": 6.514289645886646e-07, "loss": 1.08739376, "memory(GiB)": 142.32, "step": 150900, "train_speed(iter/s)": 0.285968 }, { "acc": 0.73075843, "epoch": 1.68803224620306, "grad_norm": 5.375, "learning_rate": 6.505164699532412e-07, "loss": 1.08159466, "memory(GiB)": 142.32, "step": 150920, "train_speed(iter/s)": 0.285981 }, { "acc": 0.74024959, "epoch": 1.6882559451490184, "grad_norm": 5.125, "learning_rate": 6.496045703893322e-07, "loss": 1.02186356, "memory(GiB)": 142.32, "step": 150940, "train_speed(iter/s)": 0.285991 }, { "acc": 0.74252996, "epoch": 1.688479644094977, "grad_norm": 6.625, "learning_rate": 6.486932660217004e-07, "loss": 1.02532434, "memory(GiB)": 142.32, "step": 150960, "train_speed(iter/s)": 0.286002 }, { "acc": 0.73971944, "epoch": 1.6887033430409355, "grad_norm": 7.15625, "learning_rate": 6.477825569750234e-07, "loss": 1.03193321, "memory(GiB)": 142.32, "step": 150980, "train_speed(iter/s)": 0.286016 }, { "acc": 0.73027329, "epoch": 1.688927041986894, "grad_norm": 4.5, "learning_rate": 6.468724433739015e-07, "loss": 1.07987614, "memory(GiB)": 142.32, "step": 151000, "train_speed(iter/s)": 0.286029 }, { "acc": 0.74418588, "epoch": 1.6891507409328526, "grad_norm": 5.84375, "learning_rate": 6.459629253428495e-07, "loss": 1.01792507, "memory(GiB)": 142.32, "step": 151020, "train_speed(iter/s)": 0.286043 }, { "acc": 0.7316, "epoch": 1.689374439878811, "grad_norm": 6.875, "learning_rate": 6.450540030063018e-07, "loss": 1.06751251, "memory(GiB)": 142.32, "step": 151040, "train_speed(iter/s)": 0.286055 }, { "acc": 0.73704739, "epoch": 1.6895981388247696, "grad_norm": 6.25, "learning_rate": 6.441456764886139e-07, "loss": 1.04945107, "memory(GiB)": 142.32, "step": 151060, "train_speed(iter/s)": 0.286069 }, { "acc": 0.73414249, "epoch": 1.6898218377707281, "grad_norm": 6.65625, "learning_rate": 6.432379459140564e-07, "loss": 1.06924477, "memory(GiB)": 142.32, "step": 151080, "train_speed(iter/s)": 0.28608 }, { "acc": 0.72651548, "epoch": 1.6900455367166867, "grad_norm": 5.90625, "learning_rate": 6.423308114068211e-07, "loss": 1.09035931, "memory(GiB)": 142.32, "step": 151100, "train_speed(iter/s)": 0.286093 }, { "acc": 0.73337379, "epoch": 1.6902692356626452, "grad_norm": 6.34375, "learning_rate": 6.414242730910164e-07, "loss": 1.05243626, "memory(GiB)": 142.32, "step": 151120, "train_speed(iter/s)": 0.286105 }, { "acc": 0.73725424, "epoch": 1.6904929346086037, "grad_norm": 5.0625, "learning_rate": 6.405183310906682e-07, "loss": 1.06755314, "memory(GiB)": 142.32, "step": 151140, "train_speed(iter/s)": 0.286121 }, { "acc": 0.73748846, "epoch": 1.6907166335545623, "grad_norm": 4.75, "learning_rate": 6.396129855297245e-07, "loss": 1.03747988, "memory(GiB)": 142.32, "step": 151160, "train_speed(iter/s)": 0.286134 }, { "acc": 0.74575057, "epoch": 1.6909403325005208, "grad_norm": 6.84375, "learning_rate": 6.387082365320485e-07, "loss": 1.0144619, "memory(GiB)": 142.32, "step": 151180, "train_speed(iter/s)": 0.286147 }, { "acc": 0.74472189, "epoch": 1.6911640314464793, "grad_norm": 5.53125, "learning_rate": 6.378040842214234e-07, "loss": 1.01504841, "memory(GiB)": 142.32, "step": 151200, "train_speed(iter/s)": 0.28616 }, { "acc": 0.73203869, "epoch": 1.6913877303924378, "grad_norm": 6.4375, "learning_rate": 6.369005287215496e-07, "loss": 1.06637716, "memory(GiB)": 142.32, "step": 151220, "train_speed(iter/s)": 0.286173 }, { "acc": 0.74033127, "epoch": 1.6916114293383964, "grad_norm": 5.15625, "learning_rate": 6.359975701560456e-07, "loss": 1.0200491, "memory(GiB)": 142.32, "step": 151240, "train_speed(iter/s)": 0.286188 }, { "acc": 0.74755592, "epoch": 1.691835128284355, "grad_norm": 6.53125, "learning_rate": 6.350952086484513e-07, "loss": 1.00211554, "memory(GiB)": 142.32, "step": 151260, "train_speed(iter/s)": 0.286203 }, { "acc": 0.73665419, "epoch": 1.6920588272303134, "grad_norm": 5.6875, "learning_rate": 6.341934443222203e-07, "loss": 1.04880342, "memory(GiB)": 142.32, "step": 151280, "train_speed(iter/s)": 0.286215 }, { "acc": 0.75306997, "epoch": 1.692282526176272, "grad_norm": 6.78125, "learning_rate": 6.332922773007289e-07, "loss": 0.97532654, "memory(GiB)": 142.32, "step": 151300, "train_speed(iter/s)": 0.286227 }, { "acc": 0.73420181, "epoch": 1.6925062251222305, "grad_norm": 5.96875, "learning_rate": 6.323917077072683e-07, "loss": 1.05439396, "memory(GiB)": 142.32, "step": 151320, "train_speed(iter/s)": 0.28624 }, { "acc": 0.7321166, "epoch": 1.692729924068189, "grad_norm": 6.96875, "learning_rate": 6.314917356650485e-07, "loss": 1.04778605, "memory(GiB)": 142.32, "step": 151340, "train_speed(iter/s)": 0.286253 }, { "acc": 0.7347683, "epoch": 1.6929536230141475, "grad_norm": 5.53125, "learning_rate": 6.30592361297201e-07, "loss": 1.05313444, "memory(GiB)": 142.32, "step": 151360, "train_speed(iter/s)": 0.286266 }, { "acc": 0.7378499, "epoch": 1.693177321960106, "grad_norm": 6.375, "learning_rate": 6.296935847267705e-07, "loss": 1.01915855, "memory(GiB)": 142.32, "step": 151380, "train_speed(iter/s)": 0.28628 }, { "acc": 0.73511209, "epoch": 1.6934010209060646, "grad_norm": 4.96875, "learning_rate": 6.287954060767249e-07, "loss": 1.08615036, "memory(GiB)": 142.32, "step": 151400, "train_speed(iter/s)": 0.286293 }, { "acc": 0.74351606, "epoch": 1.6936247198520231, "grad_norm": 6.1875, "learning_rate": 6.278978254699464e-07, "loss": 1.01642427, "memory(GiB)": 142.32, "step": 151420, "train_speed(iter/s)": 0.286304 }, { "acc": 0.7298543, "epoch": 1.6938484187979816, "grad_norm": 6.375, "learning_rate": 6.270008430292357e-07, "loss": 1.0847517, "memory(GiB)": 142.32, "step": 151440, "train_speed(iter/s)": 0.286316 }, { "acc": 0.72345419, "epoch": 1.6940721177439402, "grad_norm": 6.84375, "learning_rate": 6.261044588773163e-07, "loss": 1.10293255, "memory(GiB)": 142.32, "step": 151460, "train_speed(iter/s)": 0.286329 }, { "acc": 0.74874649, "epoch": 1.6942958166898987, "grad_norm": 7.28125, "learning_rate": 6.252086731368224e-07, "loss": 0.98956432, "memory(GiB)": 142.32, "step": 151480, "train_speed(iter/s)": 0.286341 }, { "acc": 0.73454852, "epoch": 1.6945195156358572, "grad_norm": 6.5, "learning_rate": 6.243134859303134e-07, "loss": 1.06726122, "memory(GiB)": 142.32, "step": 151500, "train_speed(iter/s)": 0.286355 }, { "acc": 0.73473473, "epoch": 1.6947432145818158, "grad_norm": 5.5, "learning_rate": 6.234188973802629e-07, "loss": 1.05698481, "memory(GiB)": 142.32, "step": 151520, "train_speed(iter/s)": 0.286368 }, { "acc": 0.73312297, "epoch": 1.6949669135277743, "grad_norm": 6.34375, "learning_rate": 6.225249076090617e-07, "loss": 1.08193264, "memory(GiB)": 142.32, "step": 151540, "train_speed(iter/s)": 0.286379 }, { "acc": 0.73276644, "epoch": 1.6951906124737328, "grad_norm": 5.75, "learning_rate": 6.216315167390225e-07, "loss": 1.07521572, "memory(GiB)": 142.32, "step": 151560, "train_speed(iter/s)": 0.286391 }, { "acc": 0.72602577, "epoch": 1.6954143114196913, "grad_norm": 6.5, "learning_rate": 6.207387248923735e-07, "loss": 1.09444447, "memory(GiB)": 142.32, "step": 151580, "train_speed(iter/s)": 0.286405 }, { "acc": 0.74073706, "epoch": 1.6956380103656499, "grad_norm": 6.0, "learning_rate": 6.198465321912606e-07, "loss": 1.02971954, "memory(GiB)": 142.32, "step": 151600, "train_speed(iter/s)": 0.286418 }, { "acc": 0.74033723, "epoch": 1.6958617093116084, "grad_norm": 6.15625, "learning_rate": 6.189549387577482e-07, "loss": 1.0233408, "memory(GiB)": 142.32, "step": 151620, "train_speed(iter/s)": 0.286431 }, { "acc": 0.74379277, "epoch": 1.696085408257567, "grad_norm": 7.59375, "learning_rate": 6.180639447138203e-07, "loss": 1.02890863, "memory(GiB)": 142.32, "step": 151640, "train_speed(iter/s)": 0.286443 }, { "acc": 0.7251235, "epoch": 1.6963091072035255, "grad_norm": 5.34375, "learning_rate": 6.171735501813769e-07, "loss": 1.09803963, "memory(GiB)": 142.32, "step": 151660, "train_speed(iter/s)": 0.286457 }, { "acc": 0.73595448, "epoch": 1.696532806149484, "grad_norm": 6.65625, "learning_rate": 6.162837552822371e-07, "loss": 1.04909248, "memory(GiB)": 142.32, "step": 151680, "train_speed(iter/s)": 0.286468 }, { "acc": 0.75232887, "epoch": 1.6967565050954425, "grad_norm": 5.375, "learning_rate": 6.153945601381378e-07, "loss": 0.96200981, "memory(GiB)": 142.32, "step": 151700, "train_speed(iter/s)": 0.286481 }, { "acc": 0.73067517, "epoch": 1.696980204041401, "grad_norm": 6.21875, "learning_rate": 6.145059648707319e-07, "loss": 1.07417946, "memory(GiB)": 142.32, "step": 151720, "train_speed(iter/s)": 0.286494 }, { "acc": 0.73684363, "epoch": 1.6972039029873596, "grad_norm": 4.9375, "learning_rate": 6.136179696015942e-07, "loss": 1.05399933, "memory(GiB)": 142.32, "step": 151740, "train_speed(iter/s)": 0.286505 }, { "acc": 0.74158821, "epoch": 1.697427601933318, "grad_norm": 5.96875, "learning_rate": 6.127305744522127e-07, "loss": 1.0286541, "memory(GiB)": 142.32, "step": 151760, "train_speed(iter/s)": 0.286516 }, { "acc": 0.73625731, "epoch": 1.6976513008792766, "grad_norm": 5.9375, "learning_rate": 6.118437795439985e-07, "loss": 1.04294891, "memory(GiB)": 142.32, "step": 151780, "train_speed(iter/s)": 0.28653 }, { "acc": 0.74075041, "epoch": 1.6978749998252352, "grad_norm": 7.53125, "learning_rate": 6.109575849982769e-07, "loss": 1.03354053, "memory(GiB)": 142.32, "step": 151800, "train_speed(iter/s)": 0.286542 }, { "acc": 0.75044127, "epoch": 1.6980986987711937, "grad_norm": 6.03125, "learning_rate": 6.100719909362901e-07, "loss": 1.00833797, "memory(GiB)": 142.32, "step": 151820, "train_speed(iter/s)": 0.286555 }, { "acc": 0.74178538, "epoch": 1.6983223977171522, "grad_norm": 7.625, "learning_rate": 6.091869974792025e-07, "loss": 1.02752829, "memory(GiB)": 142.32, "step": 151840, "train_speed(iter/s)": 0.286566 }, { "acc": 0.73168244, "epoch": 1.6985460966631107, "grad_norm": 7.21875, "learning_rate": 6.083026047480916e-07, "loss": 1.07249527, "memory(GiB)": 142.32, "step": 151860, "train_speed(iter/s)": 0.28658 }, { "acc": 0.73794422, "epoch": 1.6987697956090693, "grad_norm": 7.28125, "learning_rate": 6.074188128639575e-07, "loss": 1.04279137, "memory(GiB)": 142.32, "step": 151880, "train_speed(iter/s)": 0.286592 }, { "acc": 0.72360959, "epoch": 1.6989934945550278, "grad_norm": 5.65625, "learning_rate": 6.065356219477142e-07, "loss": 1.09611092, "memory(GiB)": 142.32, "step": 151900, "train_speed(iter/s)": 0.286605 }, { "acc": 0.73301668, "epoch": 1.6992171935009863, "grad_norm": 5.5, "learning_rate": 6.056530321201936e-07, "loss": 1.06412163, "memory(GiB)": 142.32, "step": 151920, "train_speed(iter/s)": 0.286618 }, { "acc": 0.7393466, "epoch": 1.6994408924469449, "grad_norm": 7.90625, "learning_rate": 6.047710435021492e-07, "loss": 1.05373726, "memory(GiB)": 142.32, "step": 151940, "train_speed(iter/s)": 0.286631 }, { "acc": 0.73144693, "epoch": 1.6996645913929034, "grad_norm": 7.59375, "learning_rate": 6.038896562142477e-07, "loss": 1.08449993, "memory(GiB)": 142.32, "step": 151960, "train_speed(iter/s)": 0.286644 }, { "acc": 0.72153196, "epoch": 1.699888290338862, "grad_norm": 7.09375, "learning_rate": 6.030088703770765e-07, "loss": 1.12144604, "memory(GiB)": 142.32, "step": 151980, "train_speed(iter/s)": 0.286656 }, { "acc": 0.7219368, "epoch": 1.7001119892848204, "grad_norm": 8.0625, "learning_rate": 6.021286861111397e-07, "loss": 1.12191677, "memory(GiB)": 142.32, "step": 152000, "train_speed(iter/s)": 0.286669 }, { "epoch": 1.7001119892848204, "eval_acc": 0.6963766268389131, "eval_loss": 1.0713787078857422, "eval_runtime": 2341.1685, "eval_samples_per_second": 32.156, "eval_steps_per_second": 16.078, "step": 152000 }, { "acc": 0.74416952, "epoch": 1.700335688230779, "grad_norm": 6.53125, "learning_rate": 6.012491035368573e-07, "loss": 1.02623167, "memory(GiB)": 142.32, "step": 152020, "train_speed(iter/s)": 0.285394 }, { "acc": 0.74443674, "epoch": 1.7005593871767375, "grad_norm": 7.25, "learning_rate": 6.003701227745718e-07, "loss": 1.02236786, "memory(GiB)": 142.32, "step": 152040, "train_speed(iter/s)": 0.285407 }, { "acc": 0.73483133, "epoch": 1.700783086122696, "grad_norm": 5.1875, "learning_rate": 5.994917439445369e-07, "loss": 1.06809502, "memory(GiB)": 142.32, "step": 152060, "train_speed(iter/s)": 0.28542 }, { "acc": 0.73511715, "epoch": 1.7010067850686545, "grad_norm": 4.90625, "learning_rate": 5.986139671669305e-07, "loss": 1.05703039, "memory(GiB)": 142.32, "step": 152080, "train_speed(iter/s)": 0.285432 }, { "acc": 0.73925309, "epoch": 1.701230484014613, "grad_norm": 5.65625, "learning_rate": 5.977367925618438e-07, "loss": 1.031106, "memory(GiB)": 142.32, "step": 152100, "train_speed(iter/s)": 0.285446 }, { "acc": 0.74551487, "epoch": 1.7014541829605716, "grad_norm": 6.15625, "learning_rate": 5.968602202492857e-07, "loss": 1.0086071, "memory(GiB)": 142.32, "step": 152120, "train_speed(iter/s)": 0.285459 }, { "acc": 0.75341425, "epoch": 1.7016778819065301, "grad_norm": 5.8125, "learning_rate": 5.959842503491859e-07, "loss": 0.96878786, "memory(GiB)": 142.32, "step": 152140, "train_speed(iter/s)": 0.285471 }, { "acc": 0.73378248, "epoch": 1.7019015808524887, "grad_norm": 6.0, "learning_rate": 5.951088829813878e-07, "loss": 1.05176907, "memory(GiB)": 142.32, "step": 152160, "train_speed(iter/s)": 0.285484 }, { "acc": 0.73902893, "epoch": 1.7021252797984472, "grad_norm": 5.6875, "learning_rate": 5.942341182656552e-07, "loss": 1.03755178, "memory(GiB)": 142.32, "step": 152180, "train_speed(iter/s)": 0.285498 }, { "acc": 0.73923693, "epoch": 1.7023489787444057, "grad_norm": 7.53125, "learning_rate": 5.933599563216691e-07, "loss": 1.0360878, "memory(GiB)": 142.32, "step": 152200, "train_speed(iter/s)": 0.285513 }, { "acc": 0.72824669, "epoch": 1.7025726776903642, "grad_norm": 5.40625, "learning_rate": 5.924863972690248e-07, "loss": 1.08661919, "memory(GiB)": 142.32, "step": 152220, "train_speed(iter/s)": 0.285526 }, { "acc": 0.72408528, "epoch": 1.7027963766363228, "grad_norm": 5.46875, "learning_rate": 5.916134412272401e-07, "loss": 1.10485287, "memory(GiB)": 142.32, "step": 152240, "train_speed(iter/s)": 0.285538 }, { "acc": 0.7364521, "epoch": 1.7030200755822813, "grad_norm": 6.03125, "learning_rate": 5.90741088315746e-07, "loss": 1.05201387, "memory(GiB)": 142.32, "step": 152260, "train_speed(iter/s)": 0.285551 }, { "acc": 0.74355183, "epoch": 1.7032437745282398, "grad_norm": 5.59375, "learning_rate": 5.89869338653895e-07, "loss": 1.03007488, "memory(GiB)": 142.32, "step": 152280, "train_speed(iter/s)": 0.285563 }, { "acc": 0.74274135, "epoch": 1.7034674734741984, "grad_norm": 6.03125, "learning_rate": 5.889981923609539e-07, "loss": 1.03732042, "memory(GiB)": 142.32, "step": 152300, "train_speed(iter/s)": 0.285576 }, { "acc": 0.73864784, "epoch": 1.7036911724201569, "grad_norm": 5.0, "learning_rate": 5.881276495561067e-07, "loss": 1.05380497, "memory(GiB)": 142.32, "step": 152320, "train_speed(iter/s)": 0.285589 }, { "acc": 0.72877989, "epoch": 1.7039148713661154, "grad_norm": 6.84375, "learning_rate": 5.872577103584581e-07, "loss": 1.08418188, "memory(GiB)": 142.32, "step": 152340, "train_speed(iter/s)": 0.285601 }, { "acc": 0.72935538, "epoch": 1.704138570312074, "grad_norm": 7.0625, "learning_rate": 5.863883748870264e-07, "loss": 1.08344078, "memory(GiB)": 142.32, "step": 152360, "train_speed(iter/s)": 0.285613 }, { "acc": 0.73202982, "epoch": 1.7043622692580325, "grad_norm": 5.90625, "learning_rate": 5.855196432607513e-07, "loss": 1.06780138, "memory(GiB)": 142.32, "step": 152380, "train_speed(iter/s)": 0.285627 }, { "acc": 0.74099669, "epoch": 1.704585968203991, "grad_norm": 6.4375, "learning_rate": 5.846515155984861e-07, "loss": 1.03078699, "memory(GiB)": 142.32, "step": 152400, "train_speed(iter/s)": 0.28564 }, { "acc": 0.72649975, "epoch": 1.7048096671499495, "grad_norm": 5.15625, "learning_rate": 5.837839920190025e-07, "loss": 1.10323257, "memory(GiB)": 142.32, "step": 152420, "train_speed(iter/s)": 0.285652 }, { "acc": 0.72998228, "epoch": 1.705033366095908, "grad_norm": 6.9375, "learning_rate": 5.829170726409921e-07, "loss": 1.06308651, "memory(GiB)": 142.32, "step": 152440, "train_speed(iter/s)": 0.285665 }, { "acc": 0.74819994, "epoch": 1.7052570650418666, "grad_norm": 5.90625, "learning_rate": 5.820507575830603e-07, "loss": 1.00871143, "memory(GiB)": 142.32, "step": 152460, "train_speed(iter/s)": 0.285678 }, { "acc": 0.74434862, "epoch": 1.705480763987825, "grad_norm": 6.09375, "learning_rate": 5.811850469637326e-07, "loss": 1.02849216, "memory(GiB)": 142.32, "step": 152480, "train_speed(iter/s)": 0.285689 }, { "acc": 0.74992924, "epoch": 1.7057044629337836, "grad_norm": 5.34375, "learning_rate": 5.803199409014498e-07, "loss": 0.99971142, "memory(GiB)": 142.32, "step": 152500, "train_speed(iter/s)": 0.285701 }, { "acc": 0.73141317, "epoch": 1.7059281618797422, "grad_norm": 5.90625, "learning_rate": 5.794554395145696e-07, "loss": 1.05345268, "memory(GiB)": 142.32, "step": 152520, "train_speed(iter/s)": 0.285713 }, { "acc": 0.73540859, "epoch": 1.7061518608257007, "grad_norm": 7.21875, "learning_rate": 5.785915429213707e-07, "loss": 1.05366259, "memory(GiB)": 142.32, "step": 152540, "train_speed(iter/s)": 0.285725 }, { "acc": 0.75249071, "epoch": 1.7063755597716592, "grad_norm": 7.5625, "learning_rate": 5.777282512400445e-07, "loss": 0.97917538, "memory(GiB)": 142.32, "step": 152560, "train_speed(iter/s)": 0.285738 }, { "acc": 0.73949289, "epoch": 1.7065992587176178, "grad_norm": 6.34375, "learning_rate": 5.768655645887028e-07, "loss": 1.02816391, "memory(GiB)": 142.32, "step": 152580, "train_speed(iter/s)": 0.285751 }, { "acc": 0.74330339, "epoch": 1.7068229576635763, "grad_norm": 5.9375, "learning_rate": 5.760034830853733e-07, "loss": 1.01946354, "memory(GiB)": 142.32, "step": 152600, "train_speed(iter/s)": 0.285763 }, { "acc": 0.73297091, "epoch": 1.7070466566095348, "grad_norm": 7.5625, "learning_rate": 5.751420068479995e-07, "loss": 1.06031952, "memory(GiB)": 142.32, "step": 152620, "train_speed(iter/s)": 0.285776 }, { "acc": 0.73207788, "epoch": 1.7072703555554933, "grad_norm": 6.0, "learning_rate": 5.742811359944467e-07, "loss": 1.06994743, "memory(GiB)": 142.32, "step": 152640, "train_speed(iter/s)": 0.285787 }, { "acc": 0.72696009, "epoch": 1.7074940545014519, "grad_norm": 4.9375, "learning_rate": 5.73420870642491e-07, "loss": 1.09630585, "memory(GiB)": 142.32, "step": 152660, "train_speed(iter/s)": 0.2858 }, { "acc": 0.73419256, "epoch": 1.7077177534474104, "grad_norm": 6.71875, "learning_rate": 5.725612109098316e-07, "loss": 1.06909008, "memory(GiB)": 142.32, "step": 152680, "train_speed(iter/s)": 0.285811 }, { "acc": 0.74753208, "epoch": 1.707941452393369, "grad_norm": 6.65625, "learning_rate": 5.717021569140813e-07, "loss": 1.01342735, "memory(GiB)": 142.32, "step": 152700, "train_speed(iter/s)": 0.285822 }, { "acc": 0.74674506, "epoch": 1.7081651513393274, "grad_norm": 6.28125, "learning_rate": 5.708437087727703e-07, "loss": 1.00426292, "memory(GiB)": 142.32, "step": 152720, "train_speed(iter/s)": 0.285833 }, { "acc": 0.74256773, "epoch": 1.708388850285286, "grad_norm": 6.4375, "learning_rate": 5.699858666033475e-07, "loss": 1.02421436, "memory(GiB)": 142.32, "step": 152740, "train_speed(iter/s)": 0.285846 }, { "acc": 0.7297184, "epoch": 1.7086125492312445, "grad_norm": 7.09375, "learning_rate": 5.69128630523178e-07, "loss": 1.07369728, "memory(GiB)": 142.32, "step": 152760, "train_speed(iter/s)": 0.285859 }, { "acc": 0.74216814, "epoch": 1.708836248177203, "grad_norm": 6.5625, "learning_rate": 5.682720006495435e-07, "loss": 1.00357914, "memory(GiB)": 142.32, "step": 152780, "train_speed(iter/s)": 0.285872 }, { "acc": 0.73600311, "epoch": 1.7090599471231616, "grad_norm": 7.25, "learning_rate": 5.674159770996423e-07, "loss": 1.04872246, "memory(GiB)": 142.32, "step": 152800, "train_speed(iter/s)": 0.285885 }, { "acc": 0.73639669, "epoch": 1.70928364606912, "grad_norm": 7.9375, "learning_rate": 5.665605599905927e-07, "loss": 1.04796839, "memory(GiB)": 142.32, "step": 152820, "train_speed(iter/s)": 0.285898 }, { "acc": 0.74006929, "epoch": 1.7095073450150786, "grad_norm": 5.375, "learning_rate": 5.657057494394269e-07, "loss": 1.02929668, "memory(GiB)": 142.32, "step": 152840, "train_speed(iter/s)": 0.285911 }, { "acc": 0.73987951, "epoch": 1.7097310439610371, "grad_norm": 6.40625, "learning_rate": 5.648515455630943e-07, "loss": 1.03421803, "memory(GiB)": 142.32, "step": 152860, "train_speed(iter/s)": 0.285924 }, { "acc": 0.72102828, "epoch": 1.7099547429069957, "grad_norm": 5.71875, "learning_rate": 5.639979484784641e-07, "loss": 1.12206821, "memory(GiB)": 142.32, "step": 152880, "train_speed(iter/s)": 0.285935 }, { "acc": 0.73766875, "epoch": 1.7101784418529542, "grad_norm": 4.71875, "learning_rate": 5.631449583023191e-07, "loss": 1.02748232, "memory(GiB)": 142.32, "step": 152900, "train_speed(iter/s)": 0.285948 }, { "acc": 0.72813997, "epoch": 1.7104021407989127, "grad_norm": 5.34375, "learning_rate": 5.622925751513614e-07, "loss": 1.08987503, "memory(GiB)": 142.32, "step": 152920, "train_speed(iter/s)": 0.28596 }, { "acc": 0.73441381, "epoch": 1.7106258397448713, "grad_norm": 7.4375, "learning_rate": 5.614407991422094e-07, "loss": 1.05109587, "memory(GiB)": 142.32, "step": 152940, "train_speed(iter/s)": 0.285972 }, { "acc": 0.73997316, "epoch": 1.7108495386908298, "grad_norm": 6.03125, "learning_rate": 5.60589630391396e-07, "loss": 1.03332577, "memory(GiB)": 142.32, "step": 152960, "train_speed(iter/s)": 0.285986 }, { "acc": 0.74689445, "epoch": 1.7110732376367883, "grad_norm": 7.09375, "learning_rate": 5.597390690153765e-07, "loss": 0.99297228, "memory(GiB)": 142.32, "step": 152980, "train_speed(iter/s)": 0.285998 }, { "acc": 0.7404191, "epoch": 1.7112969365827468, "grad_norm": 5.875, "learning_rate": 5.58889115130517e-07, "loss": 1.03501244, "memory(GiB)": 142.32, "step": 153000, "train_speed(iter/s)": 0.286011 }, { "acc": 0.7358572, "epoch": 1.7115206355287054, "grad_norm": 5.5625, "learning_rate": 5.580397688531064e-07, "loss": 1.06333981, "memory(GiB)": 142.32, "step": 153020, "train_speed(iter/s)": 0.286025 }, { "acc": 0.73420601, "epoch": 1.711744334474664, "grad_norm": 7.96875, "learning_rate": 5.571910302993449e-07, "loss": 1.05600767, "memory(GiB)": 142.32, "step": 153040, "train_speed(iter/s)": 0.286038 }, { "acc": 0.72901106, "epoch": 1.7119680334206224, "grad_norm": 6.5, "learning_rate": 5.56342899585352e-07, "loss": 1.07672653, "memory(GiB)": 142.32, "step": 153060, "train_speed(iter/s)": 0.286051 }, { "acc": 0.74958224, "epoch": 1.712191732366581, "grad_norm": 6.78125, "learning_rate": 5.55495376827166e-07, "loss": 0.99513845, "memory(GiB)": 142.32, "step": 153080, "train_speed(iter/s)": 0.286064 }, { "acc": 0.73509798, "epoch": 1.7124154313125395, "grad_norm": 6.4375, "learning_rate": 5.546484621407378e-07, "loss": 1.08349266, "memory(GiB)": 142.32, "step": 153100, "train_speed(iter/s)": 0.286077 }, { "acc": 0.74595308, "epoch": 1.712639130258498, "grad_norm": 6.9375, "learning_rate": 5.538021556419399e-07, "loss": 0.9981802, "memory(GiB)": 142.32, "step": 153120, "train_speed(iter/s)": 0.28609 }, { "acc": 0.72953033, "epoch": 1.7128628292044565, "grad_norm": 5.71875, "learning_rate": 5.52956457446558e-07, "loss": 1.06033297, "memory(GiB)": 142.32, "step": 153140, "train_speed(iter/s)": 0.286102 }, { "acc": 0.73291655, "epoch": 1.713086528150415, "grad_norm": 7.75, "learning_rate": 5.521113676702944e-07, "loss": 1.06639442, "memory(GiB)": 142.32, "step": 153160, "train_speed(iter/s)": 0.286115 }, { "acc": 0.72875314, "epoch": 1.7133102270963736, "grad_norm": 7.59375, "learning_rate": 5.512668864287717e-07, "loss": 1.0762064, "memory(GiB)": 142.32, "step": 153180, "train_speed(iter/s)": 0.286129 }, { "acc": 0.74869256, "epoch": 1.7135339260423321, "grad_norm": 5.375, "learning_rate": 5.504230138375255e-07, "loss": 1.00952988, "memory(GiB)": 142.32, "step": 153200, "train_speed(iter/s)": 0.286141 }, { "acc": 0.73705645, "epoch": 1.7137576249882907, "grad_norm": 7.375, "learning_rate": 5.49579750012011e-07, "loss": 1.04708595, "memory(GiB)": 142.32, "step": 153220, "train_speed(iter/s)": 0.286154 }, { "acc": 0.7423171, "epoch": 1.7139813239342492, "grad_norm": 6.75, "learning_rate": 5.487370950675974e-07, "loss": 1.02202396, "memory(GiB)": 142.32, "step": 153240, "train_speed(iter/s)": 0.286167 }, { "acc": 0.73633451, "epoch": 1.7142050228802077, "grad_norm": 7.03125, "learning_rate": 5.478950491195717e-07, "loss": 1.04764881, "memory(GiB)": 142.32, "step": 153260, "train_speed(iter/s)": 0.28618 }, { "acc": 0.73254461, "epoch": 1.7144287218261662, "grad_norm": 7.28125, "learning_rate": 5.470536122831394e-07, "loss": 1.07577019, "memory(GiB)": 142.32, "step": 153280, "train_speed(iter/s)": 0.286194 }, { "acc": 0.74322338, "epoch": 1.7146524207721248, "grad_norm": 5.40625, "learning_rate": 5.462127846734189e-07, "loss": 1.01925993, "memory(GiB)": 142.32, "step": 153300, "train_speed(iter/s)": 0.286206 }, { "acc": 0.73246469, "epoch": 1.7148761197180833, "grad_norm": 5.375, "learning_rate": 5.453725664054504e-07, "loss": 1.06718426, "memory(GiB)": 142.32, "step": 153320, "train_speed(iter/s)": 0.286219 }, { "acc": 0.74459429, "epoch": 1.7150998186640418, "grad_norm": 6.28125, "learning_rate": 5.445329575941854e-07, "loss": 1.02125731, "memory(GiB)": 142.32, "step": 153340, "train_speed(iter/s)": 0.286231 }, { "acc": 0.72955971, "epoch": 1.7153235176100003, "grad_norm": 6.5625, "learning_rate": 5.436939583544948e-07, "loss": 1.08279591, "memory(GiB)": 142.32, "step": 153360, "train_speed(iter/s)": 0.286243 }, { "acc": 0.74068861, "epoch": 1.7155472165559589, "grad_norm": 4.78125, "learning_rate": 5.428555688011666e-07, "loss": 1.04934578, "memory(GiB)": 142.32, "step": 153380, "train_speed(iter/s)": 0.286256 }, { "acc": 0.73944464, "epoch": 1.7157709155019174, "grad_norm": 7.21875, "learning_rate": 5.420177890489026e-07, "loss": 1.03901005, "memory(GiB)": 142.32, "step": 153400, "train_speed(iter/s)": 0.286269 }, { "acc": 0.73932352, "epoch": 1.715994614447876, "grad_norm": 5.5625, "learning_rate": 5.411806192123259e-07, "loss": 1.04327126, "memory(GiB)": 142.32, "step": 153420, "train_speed(iter/s)": 0.28628 }, { "acc": 0.7319562, "epoch": 1.7162183133938345, "grad_norm": 5.96875, "learning_rate": 5.403440594059717e-07, "loss": 1.08672104, "memory(GiB)": 142.32, "step": 153440, "train_speed(iter/s)": 0.286294 }, { "acc": 0.7364676, "epoch": 1.716442012339793, "grad_norm": 6.84375, "learning_rate": 5.395081097442917e-07, "loss": 1.05436249, "memory(GiB)": 142.32, "step": 153460, "train_speed(iter/s)": 0.286304 }, { "acc": 0.7346899, "epoch": 1.7166657112857515, "grad_norm": 5.3125, "learning_rate": 5.386727703416589e-07, "loss": 1.04421597, "memory(GiB)": 142.32, "step": 153480, "train_speed(iter/s)": 0.286318 }, { "acc": 0.74103947, "epoch": 1.71688941023171, "grad_norm": 6.0, "learning_rate": 5.378380413123568e-07, "loss": 1.00261917, "memory(GiB)": 142.32, "step": 153500, "train_speed(iter/s)": 0.286331 }, { "acc": 0.72837291, "epoch": 1.7171131091776686, "grad_norm": 8.0, "learning_rate": 5.370039227705909e-07, "loss": 1.09670181, "memory(GiB)": 142.32, "step": 153520, "train_speed(iter/s)": 0.286344 }, { "acc": 0.73268256, "epoch": 1.717336808123627, "grad_norm": 6.15625, "learning_rate": 5.361704148304797e-07, "loss": 1.05615664, "memory(GiB)": 142.32, "step": 153540, "train_speed(iter/s)": 0.286358 }, { "acc": 0.73823519, "epoch": 1.7175605070695856, "grad_norm": 5.75, "learning_rate": 5.353375176060571e-07, "loss": 1.04535599, "memory(GiB)": 142.32, "step": 153560, "train_speed(iter/s)": 0.28637 }, { "acc": 0.73790207, "epoch": 1.7177842060155442, "grad_norm": 6.34375, "learning_rate": 5.345052312112775e-07, "loss": 1.0495821, "memory(GiB)": 142.32, "step": 153580, "train_speed(iter/s)": 0.286384 }, { "acc": 0.75146389, "epoch": 1.7180079049615027, "grad_norm": 6.0625, "learning_rate": 5.336735557600086e-07, "loss": 0.9824379, "memory(GiB)": 142.32, "step": 153600, "train_speed(iter/s)": 0.286397 }, { "acc": 0.74020014, "epoch": 1.7182316039074612, "grad_norm": 5.9375, "learning_rate": 5.328424913660363e-07, "loss": 1.02431011, "memory(GiB)": 142.32, "step": 153620, "train_speed(iter/s)": 0.28641 }, { "acc": 0.74479094, "epoch": 1.7184553028534197, "grad_norm": 4.84375, "learning_rate": 5.320120381430615e-07, "loss": 1.01566286, "memory(GiB)": 142.32, "step": 153640, "train_speed(iter/s)": 0.286421 }, { "acc": 0.724051, "epoch": 1.7186790017993783, "grad_norm": 6.28125, "learning_rate": 5.311821962047015e-07, "loss": 1.10226212, "memory(GiB)": 142.32, "step": 153660, "train_speed(iter/s)": 0.286434 }, { "acc": 0.737743, "epoch": 1.7189027007453368, "grad_norm": 5.4375, "learning_rate": 5.303529656644924e-07, "loss": 1.04011126, "memory(GiB)": 142.32, "step": 153680, "train_speed(iter/s)": 0.286447 }, { "acc": 0.74275308, "epoch": 1.7191263996912953, "grad_norm": 7.1875, "learning_rate": 5.295243466358823e-07, "loss": 1.00835457, "memory(GiB)": 142.32, "step": 153700, "train_speed(iter/s)": 0.286461 }, { "acc": 0.73703208, "epoch": 1.7193500986372539, "grad_norm": 5.15625, "learning_rate": 5.286963392322403e-07, "loss": 1.0373951, "memory(GiB)": 142.32, "step": 153720, "train_speed(iter/s)": 0.286473 }, { "acc": 0.73149748, "epoch": 1.7195737975832124, "grad_norm": 7.40625, "learning_rate": 5.278689435668488e-07, "loss": 1.06275597, "memory(GiB)": 142.32, "step": 153740, "train_speed(iter/s)": 0.286484 }, { "acc": 0.73696365, "epoch": 1.719797496529171, "grad_norm": 5.34375, "learning_rate": 5.270421597529063e-07, "loss": 1.04920864, "memory(GiB)": 142.32, "step": 153760, "train_speed(iter/s)": 0.286497 }, { "acc": 0.75007715, "epoch": 1.7200211954751294, "grad_norm": 7.34375, "learning_rate": 5.262159879035306e-07, "loss": 0.96897354, "memory(GiB)": 142.32, "step": 153780, "train_speed(iter/s)": 0.286504 }, { "acc": 0.74196, "epoch": 1.720244894421088, "grad_norm": 6.4375, "learning_rate": 5.253904281317518e-07, "loss": 1.01392803, "memory(GiB)": 142.32, "step": 153800, "train_speed(iter/s)": 0.286517 }, { "acc": 0.73137121, "epoch": 1.7204685933670465, "grad_norm": 5.78125, "learning_rate": 5.245654805505201e-07, "loss": 1.08443651, "memory(GiB)": 142.32, "step": 153820, "train_speed(iter/s)": 0.286531 }, { "acc": 0.73221087, "epoch": 1.720692292313005, "grad_norm": 6.125, "learning_rate": 5.237411452726998e-07, "loss": 1.06589899, "memory(GiB)": 142.32, "step": 153840, "train_speed(iter/s)": 0.286543 }, { "acc": 0.74572692, "epoch": 1.7209159912589636, "grad_norm": 5.84375, "learning_rate": 5.229174224110706e-07, "loss": 1.00791702, "memory(GiB)": 142.32, "step": 153860, "train_speed(iter/s)": 0.286557 }, { "acc": 0.74606581, "epoch": 1.721139690204922, "grad_norm": 6.53125, "learning_rate": 5.220943120783306e-07, "loss": 1.01500902, "memory(GiB)": 142.32, "step": 153880, "train_speed(iter/s)": 0.286569 }, { "acc": 0.75065131, "epoch": 1.7213633891508806, "grad_norm": 5.71875, "learning_rate": 5.212718143870921e-07, "loss": 0.98918972, "memory(GiB)": 142.32, "step": 153900, "train_speed(iter/s)": 0.286581 }, { "acc": 0.74427042, "epoch": 1.7215870880968391, "grad_norm": 6.6875, "learning_rate": 5.204499294498861e-07, "loss": 1.00980778, "memory(GiB)": 142.32, "step": 153920, "train_speed(iter/s)": 0.286594 }, { "acc": 0.74188542, "epoch": 1.7218107870427977, "grad_norm": 6.125, "learning_rate": 5.19628657379157e-07, "loss": 1.01224089, "memory(GiB)": 142.32, "step": 153940, "train_speed(iter/s)": 0.286606 }, { "acc": 0.73799458, "epoch": 1.7220344859887562, "grad_norm": 6.09375, "learning_rate": 5.188079982872673e-07, "loss": 1.0277174, "memory(GiB)": 142.32, "step": 153960, "train_speed(iter/s)": 0.286619 }, { "acc": 0.74668236, "epoch": 1.7222581849347147, "grad_norm": 5.75, "learning_rate": 5.179879522864939e-07, "loss": 0.99940634, "memory(GiB)": 142.32, "step": 153980, "train_speed(iter/s)": 0.28663 }, { "acc": 0.74032564, "epoch": 1.7224818838806732, "grad_norm": 4.96875, "learning_rate": 5.171685194890308e-07, "loss": 1.02492332, "memory(GiB)": 142.32, "step": 154000, "train_speed(iter/s)": 0.286642 }, { "epoch": 1.7224818838806732, "eval_acc": 0.6963969370607914, "eval_loss": 1.0713640451431274, "eval_runtime": 2341.3828, "eval_samples_per_second": 32.153, "eval_steps_per_second": 16.077, "step": 154000 }, { "acc": 0.74811697, "epoch": 1.7227055828266318, "grad_norm": 7.0, "learning_rate": 5.163497000069895e-07, "loss": 1.00168304, "memory(GiB)": 142.32, "step": 154020, "train_speed(iter/s)": 0.285383 }, { "acc": 0.73954916, "epoch": 1.7229292817725903, "grad_norm": 7.0, "learning_rate": 5.155314939523942e-07, "loss": 1.02571917, "memory(GiB)": 142.32, "step": 154040, "train_speed(iter/s)": 0.285395 }, { "acc": 0.73041687, "epoch": 1.7231529807185488, "grad_norm": 5.875, "learning_rate": 5.147139014371899e-07, "loss": 1.08191757, "memory(GiB)": 142.32, "step": 154060, "train_speed(iter/s)": 0.285407 }, { "acc": 0.72734442, "epoch": 1.7233766796645074, "grad_norm": 6.96875, "learning_rate": 5.138969225732326e-07, "loss": 1.0854578, "memory(GiB)": 142.32, "step": 154080, "train_speed(iter/s)": 0.285419 }, { "acc": 0.74561853, "epoch": 1.7236003786104659, "grad_norm": 5.5625, "learning_rate": 5.130805574722969e-07, "loss": 1.01532497, "memory(GiB)": 142.32, "step": 154100, "train_speed(iter/s)": 0.285432 }, { "acc": 0.72808361, "epoch": 1.7238240775564244, "grad_norm": 6.21875, "learning_rate": 5.12264806246075e-07, "loss": 1.08929749, "memory(GiB)": 142.32, "step": 154120, "train_speed(iter/s)": 0.285445 }, { "acc": 0.74925137, "epoch": 1.724047776502383, "grad_norm": 6.0, "learning_rate": 5.11449669006171e-07, "loss": 0.9936657, "memory(GiB)": 142.32, "step": 154140, "train_speed(iter/s)": 0.285459 }, { "acc": 0.7397831, "epoch": 1.7242714754483415, "grad_norm": 6.90625, "learning_rate": 5.10635145864109e-07, "loss": 1.0262084, "memory(GiB)": 142.32, "step": 154160, "train_speed(iter/s)": 0.285471 }, { "acc": 0.73730621, "epoch": 1.7244951743943, "grad_norm": 8.5, "learning_rate": 5.098212369313277e-07, "loss": 1.05697508, "memory(GiB)": 142.32, "step": 154180, "train_speed(iter/s)": 0.285483 }, { "acc": 0.72806358, "epoch": 1.7247188733402585, "grad_norm": 5.71875, "learning_rate": 5.090079423191791e-07, "loss": 1.09179516, "memory(GiB)": 142.32, "step": 154200, "train_speed(iter/s)": 0.285495 }, { "acc": 0.73958721, "epoch": 1.724942572286217, "grad_norm": 5.0, "learning_rate": 5.081952621389358e-07, "loss": 1.03034878, "memory(GiB)": 142.32, "step": 154220, "train_speed(iter/s)": 0.285506 }, { "acc": 0.73208857, "epoch": 1.7251662712321756, "grad_norm": 5.375, "learning_rate": 5.073831965017828e-07, "loss": 1.07742367, "memory(GiB)": 142.32, "step": 154240, "train_speed(iter/s)": 0.285519 }, { "acc": 0.73056269, "epoch": 1.7253899701781341, "grad_norm": 5.0625, "learning_rate": 5.065717455188235e-07, "loss": 1.08054543, "memory(GiB)": 142.32, "step": 154260, "train_speed(iter/s)": 0.285531 }, { "acc": 0.72866297, "epoch": 1.7256136691240926, "grad_norm": 6.21875, "learning_rate": 5.057609093010757e-07, "loss": 1.09631882, "memory(GiB)": 142.32, "step": 154280, "train_speed(iter/s)": 0.285544 }, { "acc": 0.74032583, "epoch": 1.7258373680700512, "grad_norm": 7.84375, "learning_rate": 5.049506879594713e-07, "loss": 1.04026327, "memory(GiB)": 142.32, "step": 154300, "train_speed(iter/s)": 0.285555 }, { "acc": 0.72842331, "epoch": 1.7260610670160097, "grad_norm": 6.6875, "learning_rate": 5.041410816048636e-07, "loss": 1.07324848, "memory(GiB)": 142.32, "step": 154320, "train_speed(iter/s)": 0.285568 }, { "acc": 0.7335145, "epoch": 1.7262847659619682, "grad_norm": 5.25, "learning_rate": 5.033320903480149e-07, "loss": 1.06795826, "memory(GiB)": 142.32, "step": 154340, "train_speed(iter/s)": 0.28558 }, { "acc": 0.73327179, "epoch": 1.7265084649079268, "grad_norm": 5.8125, "learning_rate": 5.025237142996097e-07, "loss": 1.06878595, "memory(GiB)": 142.32, "step": 154360, "train_speed(iter/s)": 0.285593 }, { "acc": 0.73492899, "epoch": 1.7267321638538853, "grad_norm": 5.59375, "learning_rate": 5.017159535702442e-07, "loss": 1.05332928, "memory(GiB)": 142.32, "step": 154380, "train_speed(iter/s)": 0.285605 }, { "acc": 0.7343082, "epoch": 1.7269558627998438, "grad_norm": 5.0625, "learning_rate": 5.009088082704305e-07, "loss": 1.06018696, "memory(GiB)": 142.32, "step": 154400, "train_speed(iter/s)": 0.285618 }, { "acc": 0.73721285, "epoch": 1.7271795617458023, "grad_norm": 6.4375, "learning_rate": 5.001022785105997e-07, "loss": 1.04076996, "memory(GiB)": 142.32, "step": 154420, "train_speed(iter/s)": 0.28563 }, { "acc": 0.74091759, "epoch": 1.7274032606917609, "grad_norm": 6.6875, "learning_rate": 4.992963644010951e-07, "loss": 1.03684464, "memory(GiB)": 142.32, "step": 154440, "train_speed(iter/s)": 0.285642 }, { "acc": 0.73842382, "epoch": 1.7276269596377194, "grad_norm": 6.71875, "learning_rate": 4.98491066052178e-07, "loss": 1.04632101, "memory(GiB)": 142.32, "step": 154460, "train_speed(iter/s)": 0.285653 }, { "acc": 0.72073793, "epoch": 1.727850658583678, "grad_norm": 7.21875, "learning_rate": 4.976863835740248e-07, "loss": 1.11841106, "memory(GiB)": 142.32, "step": 154480, "train_speed(iter/s)": 0.285666 }, { "acc": 0.7322546, "epoch": 1.7280743575296365, "grad_norm": 6.09375, "learning_rate": 4.968823170767256e-07, "loss": 1.06595764, "memory(GiB)": 142.32, "step": 154500, "train_speed(iter/s)": 0.285679 }, { "acc": 0.73338842, "epoch": 1.728298056475595, "grad_norm": 6.5625, "learning_rate": 4.960788666702915e-07, "loss": 1.04705296, "memory(GiB)": 142.32, "step": 154520, "train_speed(iter/s)": 0.285692 }, { "acc": 0.74828877, "epoch": 1.7285217554215535, "grad_norm": 6.25, "learning_rate": 4.952760324646427e-07, "loss": 1.00583982, "memory(GiB)": 142.32, "step": 154540, "train_speed(iter/s)": 0.285705 }, { "acc": 0.74924707, "epoch": 1.7287454543675123, "grad_norm": 4.96875, "learning_rate": 4.944738145696215e-07, "loss": 0.9919858, "memory(GiB)": 142.32, "step": 154560, "train_speed(iter/s)": 0.285717 }, { "acc": 0.75168886, "epoch": 1.7289691533134708, "grad_norm": 5.5, "learning_rate": 4.936722130949811e-07, "loss": 0.98901615, "memory(GiB)": 142.32, "step": 154580, "train_speed(iter/s)": 0.285729 }, { "acc": 0.74551716, "epoch": 1.7291928522594293, "grad_norm": 6.875, "learning_rate": 4.928712281503917e-07, "loss": 1.00771322, "memory(GiB)": 142.32, "step": 154600, "train_speed(iter/s)": 0.285742 }, { "acc": 0.74268265, "epoch": 1.7294165512053878, "grad_norm": 6.0, "learning_rate": 4.920708598454405e-07, "loss": 1.01775055, "memory(GiB)": 142.32, "step": 154620, "train_speed(iter/s)": 0.285755 }, { "acc": 0.73279247, "epoch": 1.7296402501513464, "grad_norm": 5.3125, "learning_rate": 4.912711082896276e-07, "loss": 1.07164383, "memory(GiB)": 142.32, "step": 154640, "train_speed(iter/s)": 0.285768 }, { "acc": 0.72926855, "epoch": 1.729863949097305, "grad_norm": 6.5625, "learning_rate": 4.90471973592373e-07, "loss": 1.06905308, "memory(GiB)": 142.32, "step": 154660, "train_speed(iter/s)": 0.285781 }, { "acc": 0.73459673, "epoch": 1.7300876480432634, "grad_norm": 6.125, "learning_rate": 4.896734558630084e-07, "loss": 1.05351315, "memory(GiB)": 142.32, "step": 154680, "train_speed(iter/s)": 0.285794 }, { "acc": 0.74091244, "epoch": 1.730311346989222, "grad_norm": 7.53125, "learning_rate": 4.888755552107815e-07, "loss": 1.01273823, "memory(GiB)": 142.32, "step": 154700, "train_speed(iter/s)": 0.285807 }, { "acc": 0.75189371, "epoch": 1.7305350459351805, "grad_norm": 6.71875, "learning_rate": 4.880782717448584e-07, "loss": 0.98345757, "memory(GiB)": 142.32, "step": 154720, "train_speed(iter/s)": 0.285817 }, { "acc": 0.74090004, "epoch": 1.730758744881139, "grad_norm": 5.15625, "learning_rate": 4.872816055743168e-07, "loss": 1.02370625, "memory(GiB)": 142.32, "step": 154740, "train_speed(iter/s)": 0.285829 }, { "acc": 0.7382431, "epoch": 1.7309824438270975, "grad_norm": 6.5625, "learning_rate": 4.864855568081545e-07, "loss": 1.04299488, "memory(GiB)": 142.32, "step": 154760, "train_speed(iter/s)": 0.285839 }, { "acc": 0.74068274, "epoch": 1.731206142773056, "grad_norm": 5.5, "learning_rate": 4.85690125555281e-07, "loss": 1.04289322, "memory(GiB)": 142.32, "step": 154780, "train_speed(iter/s)": 0.285851 }, { "acc": 0.73501959, "epoch": 1.7314298417190146, "grad_norm": 5.9375, "learning_rate": 4.848953119245215e-07, "loss": 1.06409626, "memory(GiB)": 142.32, "step": 154800, "train_speed(iter/s)": 0.285863 }, { "acc": 0.738449, "epoch": 1.7316535406649731, "grad_norm": 5.9375, "learning_rate": 4.841011160246195e-07, "loss": 1.026789, "memory(GiB)": 142.32, "step": 154820, "train_speed(iter/s)": 0.285874 }, { "acc": 0.7416986, "epoch": 1.7318772396109317, "grad_norm": 5.125, "learning_rate": 4.833075379642316e-07, "loss": 1.02652817, "memory(GiB)": 142.32, "step": 154840, "train_speed(iter/s)": 0.285886 }, { "acc": 0.74342651, "epoch": 1.7321009385568902, "grad_norm": 6.84375, "learning_rate": 4.825145778519318e-07, "loss": 1.02713146, "memory(GiB)": 142.32, "step": 154860, "train_speed(iter/s)": 0.285898 }, { "acc": 0.74593534, "epoch": 1.7323246375028487, "grad_norm": 6.46875, "learning_rate": 4.817222357962075e-07, "loss": 0.99873886, "memory(GiB)": 142.32, "step": 154880, "train_speed(iter/s)": 0.285911 }, { "acc": 0.74305696, "epoch": 1.7325483364488072, "grad_norm": 7.15625, "learning_rate": 4.809305119054613e-07, "loss": 1.01984215, "memory(GiB)": 142.32, "step": 154900, "train_speed(iter/s)": 0.285923 }, { "acc": 0.72763004, "epoch": 1.7327720353947658, "grad_norm": 6.75, "learning_rate": 4.801394062880144e-07, "loss": 1.07652168, "memory(GiB)": 142.32, "step": 154920, "train_speed(iter/s)": 0.285936 }, { "acc": 0.74522948, "epoch": 1.7329957343407243, "grad_norm": 7.1875, "learning_rate": 4.793489190521001e-07, "loss": 1.0150671, "memory(GiB)": 142.32, "step": 154940, "train_speed(iter/s)": 0.285949 }, { "acc": 0.73994231, "epoch": 1.7332194332866828, "grad_norm": 6.96875, "learning_rate": 4.785590503058691e-07, "loss": 1.04417248, "memory(GiB)": 142.32, "step": 154960, "train_speed(iter/s)": 0.285961 }, { "acc": 0.73460937, "epoch": 1.7334431322326413, "grad_norm": 6.84375, "learning_rate": 4.777698001573861e-07, "loss": 1.05481176, "memory(GiB)": 142.32, "step": 154980, "train_speed(iter/s)": 0.285974 }, { "acc": 0.7291636, "epoch": 1.7336668311785999, "grad_norm": 6.34375, "learning_rate": 4.769811687146308e-07, "loss": 1.08000526, "memory(GiB)": 142.32, "step": 155000, "train_speed(iter/s)": 0.285985 }, { "acc": 0.76096811, "epoch": 1.7338905301245584, "grad_norm": 7.40625, "learning_rate": 4.761931560855021e-07, "loss": 0.94794731, "memory(GiB)": 142.32, "step": 155020, "train_speed(iter/s)": 0.285998 }, { "acc": 0.74228454, "epoch": 1.734114229070517, "grad_norm": 6.3125, "learning_rate": 4.7540576237780854e-07, "loss": 1.0101099, "memory(GiB)": 142.32, "step": 155040, "train_speed(iter/s)": 0.286009 }, { "acc": 0.73861217, "epoch": 1.7343379280164755, "grad_norm": 6.71875, "learning_rate": 4.7461898769927904e-07, "loss": 1.04386339, "memory(GiB)": 142.32, "step": 155060, "train_speed(iter/s)": 0.286023 }, { "acc": 0.75030522, "epoch": 1.734561626962434, "grad_norm": 6.53125, "learning_rate": 4.738328321575547e-07, "loss": 0.98832397, "memory(GiB)": 142.32, "step": 155080, "train_speed(iter/s)": 0.286035 }, { "acc": 0.74413829, "epoch": 1.7347853259083925, "grad_norm": 6.5625, "learning_rate": 4.730472958601917e-07, "loss": 1.0105566, "memory(GiB)": 142.32, "step": 155100, "train_speed(iter/s)": 0.286048 }, { "acc": 0.73229856, "epoch": 1.735009024854351, "grad_norm": 5.6875, "learning_rate": 4.722623789146652e-07, "loss": 1.06882973, "memory(GiB)": 142.32, "step": 155120, "train_speed(iter/s)": 0.28606 }, { "acc": 0.73803043, "epoch": 1.7352327238003096, "grad_norm": 6.96875, "learning_rate": 4.714780814283604e-07, "loss": 1.0328455, "memory(GiB)": 142.32, "step": 155140, "train_speed(iter/s)": 0.286073 }, { "acc": 0.72829428, "epoch": 1.735456422746268, "grad_norm": 6.15625, "learning_rate": 4.7069440350858195e-07, "loss": 1.08282852, "memory(GiB)": 142.32, "step": 155160, "train_speed(iter/s)": 0.286085 }, { "acc": 0.75051527, "epoch": 1.7356801216922266, "grad_norm": 6.21875, "learning_rate": 4.69911345262547e-07, "loss": 0.98469095, "memory(GiB)": 142.32, "step": 155180, "train_speed(iter/s)": 0.286097 }, { "acc": 0.74400539, "epoch": 1.7359038206381852, "grad_norm": 6.90625, "learning_rate": 4.6912890679739086e-07, "loss": 1.00673447, "memory(GiB)": 142.32, "step": 155200, "train_speed(iter/s)": 0.28611 }, { "acc": 0.73596725, "epoch": 1.7361275195841437, "grad_norm": 7.09375, "learning_rate": 4.683470882201613e-07, "loss": 1.06335058, "memory(GiB)": 142.32, "step": 155220, "train_speed(iter/s)": 0.286122 }, { "acc": 0.72246022, "epoch": 1.7363512185301022, "grad_norm": 7.21875, "learning_rate": 4.675658896378216e-07, "loss": 1.10260305, "memory(GiB)": 142.32, "step": 155240, "train_speed(iter/s)": 0.286135 }, { "acc": 0.7352684, "epoch": 1.7365749174760607, "grad_norm": 6.59375, "learning_rate": 4.6678531115725235e-07, "loss": 1.0723362, "memory(GiB)": 142.32, "step": 155260, "train_speed(iter/s)": 0.286147 }, { "acc": 0.73625069, "epoch": 1.7367986164220193, "grad_norm": 5.78125, "learning_rate": 4.6600535288524594e-07, "loss": 1.04336882, "memory(GiB)": 142.32, "step": 155280, "train_speed(iter/s)": 0.28616 }, { "acc": 0.73333611, "epoch": 1.7370223153679778, "grad_norm": 5.875, "learning_rate": 4.6522601492851427e-07, "loss": 1.06593103, "memory(GiB)": 142.32, "step": 155300, "train_speed(iter/s)": 0.28617 }, { "acc": 0.73325572, "epoch": 1.7372460143139363, "grad_norm": 6.65625, "learning_rate": 4.644472973936803e-07, "loss": 1.05409632, "memory(GiB)": 142.32, "step": 155320, "train_speed(iter/s)": 0.286183 }, { "acc": 0.74234552, "epoch": 1.7374697132598949, "grad_norm": 6.15625, "learning_rate": 4.6366920038728335e-07, "loss": 1.03364983, "memory(GiB)": 142.32, "step": 155340, "train_speed(iter/s)": 0.286196 }, { "acc": 0.73232646, "epoch": 1.7376934122058534, "grad_norm": 5.71875, "learning_rate": 4.6289172401577984e-07, "loss": 1.07057524, "memory(GiB)": 142.32, "step": 155360, "train_speed(iter/s)": 0.286208 }, { "acc": 0.74444418, "epoch": 1.737917111151812, "grad_norm": 6.3125, "learning_rate": 4.6211486838553756e-07, "loss": 1.01222305, "memory(GiB)": 142.32, "step": 155380, "train_speed(iter/s)": 0.286221 }, { "acc": 0.75083532, "epoch": 1.7381408100977704, "grad_norm": 5.53125, "learning_rate": 4.6133863360284414e-07, "loss": 0.98469791, "memory(GiB)": 142.32, "step": 155400, "train_speed(iter/s)": 0.286234 }, { "acc": 0.74067907, "epoch": 1.738364509043729, "grad_norm": 7.125, "learning_rate": 4.605630197738975e-07, "loss": 1.01927719, "memory(GiB)": 142.32, "step": 155420, "train_speed(iter/s)": 0.286246 }, { "acc": 0.7250247, "epoch": 1.7385882079896875, "grad_norm": 5.28125, "learning_rate": 4.5978802700481216e-07, "loss": 1.09346638, "memory(GiB)": 142.32, "step": 155440, "train_speed(iter/s)": 0.286259 }, { "acc": 0.743997, "epoch": 1.738811906935646, "grad_norm": 5.5, "learning_rate": 4.5901365540162057e-07, "loss": 1.02037849, "memory(GiB)": 142.32, "step": 155460, "train_speed(iter/s)": 0.286271 }, { "acc": 0.73465996, "epoch": 1.7390356058816046, "grad_norm": 5.59375, "learning_rate": 4.582399050702657e-07, "loss": 1.06837463, "memory(GiB)": 142.32, "step": 155480, "train_speed(iter/s)": 0.286283 }, { "acc": 0.73604546, "epoch": 1.739259304827563, "grad_norm": 6.03125, "learning_rate": 4.5746677611660905e-07, "loss": 1.04399977, "memory(GiB)": 142.32, "step": 155500, "train_speed(iter/s)": 0.286295 }, { "acc": 0.74036713, "epoch": 1.7394830037735216, "grad_norm": 5.71875, "learning_rate": 4.566942686464254e-07, "loss": 1.00960503, "memory(GiB)": 142.32, "step": 155520, "train_speed(iter/s)": 0.286307 }, { "acc": 0.73411527, "epoch": 1.7397067027194801, "grad_norm": 6.28125, "learning_rate": 4.5592238276540415e-07, "loss": 1.07640514, "memory(GiB)": 142.32, "step": 155540, "train_speed(iter/s)": 0.286319 }, { "acc": 0.73631506, "epoch": 1.7399304016654387, "grad_norm": 7.1875, "learning_rate": 4.551511185791513e-07, "loss": 1.05729752, "memory(GiB)": 142.32, "step": 155560, "train_speed(iter/s)": 0.286331 }, { "acc": 0.73957596, "epoch": 1.7401541006113972, "grad_norm": 6.71875, "learning_rate": 4.5438047619318524e-07, "loss": 1.03299561, "memory(GiB)": 142.32, "step": 155580, "train_speed(iter/s)": 0.286344 }, { "acc": 0.73038983, "epoch": 1.7403777995573557, "grad_norm": 6.3125, "learning_rate": 4.536104557129428e-07, "loss": 1.07512236, "memory(GiB)": 142.32, "step": 155600, "train_speed(iter/s)": 0.286357 }, { "acc": 0.74374304, "epoch": 1.7406014985033142, "grad_norm": 7.03125, "learning_rate": 4.52841057243773e-07, "loss": 1.00786295, "memory(GiB)": 142.32, "step": 155620, "train_speed(iter/s)": 0.28637 }, { "acc": 0.73349667, "epoch": 1.7408251974492728, "grad_norm": 5.3125, "learning_rate": 4.5207228089093944e-07, "loss": 1.04774513, "memory(GiB)": 142.32, "step": 155640, "train_speed(iter/s)": 0.286383 }, { "acc": 0.73821287, "epoch": 1.7410488963952313, "grad_norm": 6.59375, "learning_rate": 4.513041267596241e-07, "loss": 1.03628483, "memory(GiB)": 142.32, "step": 155660, "train_speed(iter/s)": 0.286395 }, { "acc": 0.75440898, "epoch": 1.7412725953411898, "grad_norm": 6.90625, "learning_rate": 4.50536594954919e-07, "loss": 0.96801109, "memory(GiB)": 142.32, "step": 155680, "train_speed(iter/s)": 0.286407 }, { "acc": 0.72231331, "epoch": 1.7414962942871484, "grad_norm": 6.71875, "learning_rate": 4.497696855818351e-07, "loss": 1.10696316, "memory(GiB)": 142.32, "step": 155700, "train_speed(iter/s)": 0.28642 }, { "acc": 0.72494946, "epoch": 1.741719993233107, "grad_norm": 5.84375, "learning_rate": 4.490033987452963e-07, "loss": 1.11129789, "memory(GiB)": 142.32, "step": 155720, "train_speed(iter/s)": 0.286432 }, { "acc": 0.73996034, "epoch": 1.7419436921790654, "grad_norm": 5.59375, "learning_rate": 4.4823773455013983e-07, "loss": 1.03132811, "memory(GiB)": 142.32, "step": 155740, "train_speed(iter/s)": 0.286445 }, { "acc": 0.7394033, "epoch": 1.742167391125024, "grad_norm": 6.8125, "learning_rate": 4.474726931011225e-07, "loss": 1.04764214, "memory(GiB)": 142.32, "step": 155760, "train_speed(iter/s)": 0.286457 }, { "acc": 0.72663965, "epoch": 1.7423910900709825, "grad_norm": 6.53125, "learning_rate": 4.4670827450291e-07, "loss": 1.07669888, "memory(GiB)": 142.32, "step": 155780, "train_speed(iter/s)": 0.286469 }, { "acc": 0.734692, "epoch": 1.742614789016941, "grad_norm": 5.15625, "learning_rate": 4.459444788600881e-07, "loss": 1.06905766, "memory(GiB)": 142.32, "step": 155800, "train_speed(iter/s)": 0.286481 }, { "acc": 0.73060427, "epoch": 1.7428384879628995, "grad_norm": 6.1875, "learning_rate": 4.4518130627715426e-07, "loss": 1.10693054, "memory(GiB)": 142.32, "step": 155820, "train_speed(iter/s)": 0.286493 }, { "acc": 0.73796005, "epoch": 1.743062186908858, "grad_norm": 6.03125, "learning_rate": 4.444187568585201e-07, "loss": 1.04655209, "memory(GiB)": 142.32, "step": 155840, "train_speed(iter/s)": 0.286504 }, { "acc": 0.72800484, "epoch": 1.7432858858548166, "grad_norm": 6.3125, "learning_rate": 4.436568307085148e-07, "loss": 1.09798031, "memory(GiB)": 142.32, "step": 155860, "train_speed(iter/s)": 0.286516 }, { "acc": 0.72832146, "epoch": 1.7435095848007751, "grad_norm": 7.375, "learning_rate": 4.428955279313796e-07, "loss": 1.06899166, "memory(GiB)": 142.32, "step": 155880, "train_speed(iter/s)": 0.286527 }, { "acc": 0.7311758, "epoch": 1.7437332837467336, "grad_norm": 5.0625, "learning_rate": 4.4213484863127265e-07, "loss": 1.07089682, "memory(GiB)": 142.32, "step": 155900, "train_speed(iter/s)": 0.286538 }, { "acc": 0.7374999, "epoch": 1.7439569826926922, "grad_norm": 4.8125, "learning_rate": 4.413747929122658e-07, "loss": 1.02372503, "memory(GiB)": 142.32, "step": 155920, "train_speed(iter/s)": 0.286552 }, { "acc": 0.73781118, "epoch": 1.7441806816386507, "grad_norm": 5.6875, "learning_rate": 4.4061536087834425e-07, "loss": 1.02049236, "memory(GiB)": 142.32, "step": 155940, "train_speed(iter/s)": 0.286565 }, { "acc": 0.74242797, "epoch": 1.7444043805846092, "grad_norm": 5.71875, "learning_rate": 4.398565526334103e-07, "loss": 1.02185173, "memory(GiB)": 142.32, "step": 155960, "train_speed(iter/s)": 0.286577 }, { "acc": 0.73989701, "epoch": 1.7446280795305678, "grad_norm": 6.40625, "learning_rate": 4.3909836828127885e-07, "loss": 1.03718967, "memory(GiB)": 142.32, "step": 155980, "train_speed(iter/s)": 0.286589 }, { "acc": 0.74554811, "epoch": 1.7448517784765263, "grad_norm": 6.96875, "learning_rate": 4.3834080792568235e-07, "loss": 1.01001282, "memory(GiB)": 142.32, "step": 156000, "train_speed(iter/s)": 0.286601 }, { "epoch": 1.7448517784765263, "eval_acc": 0.6963729295897848, "eval_loss": 1.0714167356491089, "eval_runtime": 2346.2686, "eval_samples_per_second": 32.086, "eval_steps_per_second": 16.043, "step": 156000 }, { "acc": 0.74263554, "epoch": 1.7450754774224848, "grad_norm": 6.3125, "learning_rate": 4.375838716702635e-07, "loss": 1.00364742, "memory(GiB)": 142.32, "step": 156020, "train_speed(iter/s)": 0.285356 }, { "acc": 0.73670173, "epoch": 1.7452991763684433, "grad_norm": 5.28125, "learning_rate": 4.3682755961858283e-07, "loss": 1.04689579, "memory(GiB)": 142.32, "step": 156040, "train_speed(iter/s)": 0.285369 }, { "acc": 0.73347006, "epoch": 1.7455228753144019, "grad_norm": 5.875, "learning_rate": 4.3607187187411526e-07, "loss": 1.05343361, "memory(GiB)": 142.32, "step": 156060, "train_speed(iter/s)": 0.285381 }, { "acc": 0.73466415, "epoch": 1.7457465742603604, "grad_norm": 7.34375, "learning_rate": 4.3531680854024817e-07, "loss": 1.04711208, "memory(GiB)": 142.32, "step": 156080, "train_speed(iter/s)": 0.285393 }, { "acc": 0.73231993, "epoch": 1.745970273206319, "grad_norm": 5.96875, "learning_rate": 4.345623697202872e-07, "loss": 1.06882782, "memory(GiB)": 142.32, "step": 156100, "train_speed(iter/s)": 0.285406 }, { "acc": 0.73990507, "epoch": 1.7461939721522775, "grad_norm": 5.9375, "learning_rate": 4.338085555174493e-07, "loss": 1.04585161, "memory(GiB)": 142.32, "step": 156120, "train_speed(iter/s)": 0.285418 }, { "acc": 0.73800817, "epoch": 1.746417671098236, "grad_norm": 7.40625, "learning_rate": 4.3305536603486574e-07, "loss": 1.02696095, "memory(GiB)": 142.32, "step": 156140, "train_speed(iter/s)": 0.28543 }, { "acc": 0.73744974, "epoch": 1.7466413700441945, "grad_norm": 6.84375, "learning_rate": 4.3230280137558587e-07, "loss": 1.04769382, "memory(GiB)": 142.32, "step": 156160, "train_speed(iter/s)": 0.285443 }, { "acc": 0.72640128, "epoch": 1.746865068990153, "grad_norm": 5.625, "learning_rate": 4.31550861642569e-07, "loss": 1.08619938, "memory(GiB)": 142.32, "step": 156180, "train_speed(iter/s)": 0.285456 }, { "acc": 0.7231308, "epoch": 1.7470887679361116, "grad_norm": 6.0, "learning_rate": 4.307995469386933e-07, "loss": 1.12230558, "memory(GiB)": 142.32, "step": 156200, "train_speed(iter/s)": 0.285468 }, { "acc": 0.73476868, "epoch": 1.74731246688207, "grad_norm": 6.25, "learning_rate": 4.3004885736674885e-07, "loss": 1.06995354, "memory(GiB)": 142.32, "step": 156220, "train_speed(iter/s)": 0.285479 }, { "acc": 0.74651337, "epoch": 1.7475361658280286, "grad_norm": 5.0, "learning_rate": 4.2929879302943956e-07, "loss": 0.98255043, "memory(GiB)": 142.32, "step": 156240, "train_speed(iter/s)": 0.28549 }, { "acc": 0.74693365, "epoch": 1.7477598647739871, "grad_norm": 6.84375, "learning_rate": 4.285493540293867e-07, "loss": 1.00418644, "memory(GiB)": 142.32, "step": 156260, "train_speed(iter/s)": 0.285501 }, { "acc": 0.7282855, "epoch": 1.7479835637199457, "grad_norm": 6.75, "learning_rate": 4.278005404691221e-07, "loss": 1.07786102, "memory(GiB)": 142.32, "step": 156280, "train_speed(iter/s)": 0.285514 }, { "acc": 0.74985399, "epoch": 1.7482072626659042, "grad_norm": 7.1875, "learning_rate": 4.270523524510972e-07, "loss": 0.98496113, "memory(GiB)": 142.32, "step": 156300, "train_speed(iter/s)": 0.285527 }, { "acc": 0.74994593, "epoch": 1.7484309616118627, "grad_norm": 5.625, "learning_rate": 4.2630479007767124e-07, "loss": 0.97337341, "memory(GiB)": 142.32, "step": 156320, "train_speed(iter/s)": 0.28554 }, { "acc": 0.74353724, "epoch": 1.7486546605578213, "grad_norm": 6.6875, "learning_rate": 4.25557853451124e-07, "loss": 1.01699333, "memory(GiB)": 142.32, "step": 156340, "train_speed(iter/s)": 0.285548 }, { "acc": 0.73771095, "epoch": 1.7488783595037798, "grad_norm": 6.3125, "learning_rate": 4.24811542673646e-07, "loss": 1.03711948, "memory(GiB)": 142.32, "step": 156360, "train_speed(iter/s)": 0.28556 }, { "acc": 0.73829412, "epoch": 1.7491020584497383, "grad_norm": 6.21875, "learning_rate": 4.2406585784734277e-07, "loss": 1.05533981, "memory(GiB)": 142.32, "step": 156380, "train_speed(iter/s)": 0.285574 }, { "acc": 0.74485283, "epoch": 1.7493257573956968, "grad_norm": 6.21875, "learning_rate": 4.233207990742361e-07, "loss": 1.01628294, "memory(GiB)": 142.32, "step": 156400, "train_speed(iter/s)": 0.285587 }, { "acc": 0.73147612, "epoch": 1.7495494563416554, "grad_norm": 6.40625, "learning_rate": 4.2257636645625934e-07, "loss": 1.06229353, "memory(GiB)": 142.32, "step": 156420, "train_speed(iter/s)": 0.285599 }, { "acc": 0.73385987, "epoch": 1.749773155287614, "grad_norm": 5.5625, "learning_rate": 4.218325600952633e-07, "loss": 1.05556564, "memory(GiB)": 142.32, "step": 156440, "train_speed(iter/s)": 0.285611 }, { "acc": 0.73976622, "epoch": 1.7499968542335724, "grad_norm": 6.75, "learning_rate": 4.2108938009300935e-07, "loss": 1.03469505, "memory(GiB)": 142.32, "step": 156460, "train_speed(iter/s)": 0.285623 }, { "acc": 0.73538952, "epoch": 1.750220553179531, "grad_norm": 6.34375, "learning_rate": 4.2034682655117567e-07, "loss": 1.05120602, "memory(GiB)": 142.32, "step": 156480, "train_speed(iter/s)": 0.285635 }, { "acc": 0.73910403, "epoch": 1.7504442521254895, "grad_norm": 5.75, "learning_rate": 4.1960489957135475e-07, "loss": 1.02721386, "memory(GiB)": 142.32, "step": 156500, "train_speed(iter/s)": 0.285648 }, { "acc": 0.73963723, "epoch": 1.750667951071448, "grad_norm": 6.53125, "learning_rate": 4.1886359925505217e-07, "loss": 1.05180855, "memory(GiB)": 142.32, "step": 156520, "train_speed(iter/s)": 0.285659 }, { "acc": 0.73961906, "epoch": 1.7508916500174065, "grad_norm": 6.28125, "learning_rate": 4.181229257036895e-07, "loss": 1.03811188, "memory(GiB)": 142.32, "step": 156540, "train_speed(iter/s)": 0.285671 }, { "acc": 0.74525332, "epoch": 1.751115348963365, "grad_norm": 5.875, "learning_rate": 4.1738287901860075e-07, "loss": 1.00850639, "memory(GiB)": 142.32, "step": 156560, "train_speed(iter/s)": 0.285683 }, { "acc": 0.73720217, "epoch": 1.7513390479093236, "grad_norm": 5.5, "learning_rate": 4.1664345930103377e-07, "loss": 1.02827272, "memory(GiB)": 142.32, "step": 156580, "train_speed(iter/s)": 0.285696 }, { "acc": 0.72379017, "epoch": 1.7515627468552821, "grad_norm": 6.25, "learning_rate": 4.159046666521538e-07, "loss": 1.1018919, "memory(GiB)": 142.32, "step": 156600, "train_speed(iter/s)": 0.285709 }, { "acc": 0.74496946, "epoch": 1.7517864458012407, "grad_norm": 5.84375, "learning_rate": 4.15166501173036e-07, "loss": 1.01409397, "memory(GiB)": 142.32, "step": 156620, "train_speed(iter/s)": 0.285722 }, { "acc": 0.73210592, "epoch": 1.7520101447471992, "grad_norm": 5.1875, "learning_rate": 4.1442896296467416e-07, "loss": 1.06384888, "memory(GiB)": 142.32, "step": 156640, "train_speed(iter/s)": 0.285735 }, { "acc": 0.7458231, "epoch": 1.7522338436931577, "grad_norm": 6.125, "learning_rate": 4.136920521279736e-07, "loss": 0.99986649, "memory(GiB)": 142.32, "step": 156660, "train_speed(iter/s)": 0.285748 }, { "acc": 0.73460884, "epoch": 1.7524575426391162, "grad_norm": 6.0625, "learning_rate": 4.129557687637525e-07, "loss": 1.05682564, "memory(GiB)": 142.32, "step": 156680, "train_speed(iter/s)": 0.28576 }, { "acc": 0.74230223, "epoch": 1.7526812415850748, "grad_norm": 6.8125, "learning_rate": 4.122201129727471e-07, "loss": 1.02118826, "memory(GiB)": 142.32, "step": 156700, "train_speed(iter/s)": 0.285772 }, { "acc": 0.73699274, "epoch": 1.7529049405310333, "grad_norm": 6.46875, "learning_rate": 4.11485084855604e-07, "loss": 1.03098774, "memory(GiB)": 142.32, "step": 156720, "train_speed(iter/s)": 0.285784 }, { "acc": 0.73881311, "epoch": 1.7531286394769918, "grad_norm": 5.59375, "learning_rate": 4.107506845128872e-07, "loss": 1.05722227, "memory(GiB)": 142.32, "step": 156740, "train_speed(iter/s)": 0.285796 }, { "acc": 0.7520381, "epoch": 1.7533523384229504, "grad_norm": 6.46875, "learning_rate": 4.10016912045072e-07, "loss": 0.9880414, "memory(GiB)": 142.32, "step": 156760, "train_speed(iter/s)": 0.285808 }, { "acc": 0.75139189, "epoch": 1.7535760373689089, "grad_norm": 7.53125, "learning_rate": 4.092837675525485e-07, "loss": 0.99169292, "memory(GiB)": 142.32, "step": 156780, "train_speed(iter/s)": 0.285821 }, { "acc": 0.73875394, "epoch": 1.7537997363148674, "grad_norm": 5.90625, "learning_rate": 4.0855125113562324e-07, "loss": 1.03771572, "memory(GiB)": 142.32, "step": 156800, "train_speed(iter/s)": 0.285833 }, { "acc": 0.73815432, "epoch": 1.754023435260826, "grad_norm": 5.25, "learning_rate": 4.078193628945126e-07, "loss": 1.04319324, "memory(GiB)": 142.32, "step": 156820, "train_speed(iter/s)": 0.285845 }, { "acc": 0.73004155, "epoch": 1.7542471342067845, "grad_norm": 5.5, "learning_rate": 4.070881029293511e-07, "loss": 1.08752365, "memory(GiB)": 142.32, "step": 156840, "train_speed(iter/s)": 0.285858 }, { "acc": 0.73874855, "epoch": 1.754470833152743, "grad_norm": 6.53125, "learning_rate": 4.063574713401852e-07, "loss": 1.02751732, "memory(GiB)": 142.32, "step": 156860, "train_speed(iter/s)": 0.285872 }, { "acc": 0.7347456, "epoch": 1.7546945320987015, "grad_norm": 6.5, "learning_rate": 4.0562746822697454e-07, "loss": 1.05760403, "memory(GiB)": 142.32, "step": 156880, "train_speed(iter/s)": 0.285883 }, { "acc": 0.73679466, "epoch": 1.75491823104466, "grad_norm": 5.46875, "learning_rate": 4.048980936895958e-07, "loss": 1.06049328, "memory(GiB)": 142.32, "step": 156900, "train_speed(iter/s)": 0.285895 }, { "acc": 0.73547869, "epoch": 1.7551419299906186, "grad_norm": 5.78125, "learning_rate": 4.0416934782783633e-07, "loss": 1.06415272, "memory(GiB)": 142.32, "step": 156920, "train_speed(iter/s)": 0.285908 }, { "acc": 0.74609547, "epoch": 1.755365628936577, "grad_norm": 7.03125, "learning_rate": 4.034412307414004e-07, "loss": 0.99352369, "memory(GiB)": 142.32, "step": 156940, "train_speed(iter/s)": 0.285921 }, { "acc": 0.75226016, "epoch": 1.7555893278825356, "grad_norm": 6.96875, "learning_rate": 4.027137425299038e-07, "loss": 0.98597641, "memory(GiB)": 142.32, "step": 156960, "train_speed(iter/s)": 0.285933 }, { "acc": 0.75016079, "epoch": 1.7558130268284942, "grad_norm": 5.6875, "learning_rate": 4.019868832928769e-07, "loss": 0.97138376, "memory(GiB)": 142.32, "step": 156980, "train_speed(iter/s)": 0.285944 }, { "acc": 0.73854871, "epoch": 1.7560367257744527, "grad_norm": 6.25, "learning_rate": 4.0126065312976637e-07, "loss": 1.03172579, "memory(GiB)": 142.32, "step": 157000, "train_speed(iter/s)": 0.285957 }, { "acc": 0.7294476, "epoch": 1.7562604247204112, "grad_norm": 6.59375, "learning_rate": 4.0053505213992816e-07, "loss": 1.09228401, "memory(GiB)": 142.32, "step": 157020, "train_speed(iter/s)": 0.285967 }, { "acc": 0.7462678, "epoch": 1.7564841236663697, "grad_norm": 6.46875, "learning_rate": 3.9981008042263736e-07, "loss": 1.01080952, "memory(GiB)": 142.32, "step": 157040, "train_speed(iter/s)": 0.28598 }, { "acc": 0.73097191, "epoch": 1.7567078226123283, "grad_norm": 6.5, "learning_rate": 3.9908573807707963e-07, "loss": 1.07718382, "memory(GiB)": 142.32, "step": 157060, "train_speed(iter/s)": 0.285993 }, { "acc": 0.7409708, "epoch": 1.756931521558287, "grad_norm": 6.4375, "learning_rate": 3.98362025202354e-07, "loss": 1.01661549, "memory(GiB)": 142.32, "step": 157080, "train_speed(iter/s)": 0.286006 }, { "acc": 0.73340721, "epoch": 1.7571552205042456, "grad_norm": 6.4375, "learning_rate": 3.9763894189747744e-07, "loss": 1.06210194, "memory(GiB)": 142.32, "step": 157100, "train_speed(iter/s)": 0.286019 }, { "acc": 0.73322301, "epoch": 1.757378919450204, "grad_norm": 5.5, "learning_rate": 3.969164882613746e-07, "loss": 1.06684198, "memory(GiB)": 142.32, "step": 157120, "train_speed(iter/s)": 0.28603 }, { "acc": 0.7443459, "epoch": 1.7576026183961626, "grad_norm": 6.03125, "learning_rate": 3.96194664392891e-07, "loss": 1.01444359, "memory(GiB)": 142.32, "step": 157140, "train_speed(iter/s)": 0.286042 }, { "acc": 0.72925529, "epoch": 1.7578263173421211, "grad_norm": 5.71875, "learning_rate": 3.954734703907798e-07, "loss": 1.08077602, "memory(GiB)": 142.32, "step": 157160, "train_speed(iter/s)": 0.286053 }, { "acc": 0.74162359, "epoch": 1.7580500162880797, "grad_norm": 5.71875, "learning_rate": 3.9475290635371256e-07, "loss": 1.01799641, "memory(GiB)": 142.32, "step": 157180, "train_speed(iter/s)": 0.286065 }, { "acc": 0.74432774, "epoch": 1.7582737152340382, "grad_norm": 6.09375, "learning_rate": 3.940329723802716e-07, "loss": 1.026227, "memory(GiB)": 142.32, "step": 157200, "train_speed(iter/s)": 0.286077 }, { "acc": 0.73827472, "epoch": 1.7584974141799967, "grad_norm": 6.40625, "learning_rate": 3.9331366856895424e-07, "loss": 1.04041882, "memory(GiB)": 142.32, "step": 157220, "train_speed(iter/s)": 0.286089 }, { "acc": 0.73003349, "epoch": 1.7587211131259552, "grad_norm": 6.28125, "learning_rate": 3.925949950181718e-07, "loss": 1.07639923, "memory(GiB)": 142.32, "step": 157240, "train_speed(iter/s)": 0.286103 }, { "acc": 0.74056702, "epoch": 1.7589448120719138, "grad_norm": 6.09375, "learning_rate": 3.9187695182624887e-07, "loss": 1.02826195, "memory(GiB)": 142.32, "step": 157260, "train_speed(iter/s)": 0.286116 }, { "acc": 0.74108763, "epoch": 1.7591685110178723, "grad_norm": 6.4375, "learning_rate": 3.911595390914247e-07, "loss": 1.03485947, "memory(GiB)": 142.32, "step": 157280, "train_speed(iter/s)": 0.28613 }, { "acc": 0.73965197, "epoch": 1.7593922099638308, "grad_norm": 8.4375, "learning_rate": 3.904427569118513e-07, "loss": 1.03731279, "memory(GiB)": 142.32, "step": 157300, "train_speed(iter/s)": 0.286142 }, { "acc": 0.73860359, "epoch": 1.7596159089097894, "grad_norm": 5.34375, "learning_rate": 3.89726605385593e-07, "loss": 1.04139767, "memory(GiB)": 142.32, "step": 157320, "train_speed(iter/s)": 0.286155 }, { "acc": 0.72941484, "epoch": 1.759839607855748, "grad_norm": 6.0625, "learning_rate": 3.8901108461063264e-07, "loss": 1.07148094, "memory(GiB)": 142.32, "step": 157340, "train_speed(iter/s)": 0.286167 }, { "acc": 0.73564577, "epoch": 1.7600633068017064, "grad_norm": 6.40625, "learning_rate": 3.8829619468486125e-07, "loss": 1.0536046, "memory(GiB)": 142.32, "step": 157360, "train_speed(iter/s)": 0.286179 }, { "acc": 0.73609114, "epoch": 1.760287005747665, "grad_norm": 7.4375, "learning_rate": 3.8758193570608726e-07, "loss": 1.05272884, "memory(GiB)": 142.32, "step": 157380, "train_speed(iter/s)": 0.286192 }, { "acc": 0.73436527, "epoch": 1.7605107046936235, "grad_norm": 8.375, "learning_rate": 3.8686830777203143e-07, "loss": 1.07193985, "memory(GiB)": 142.32, "step": 157400, "train_speed(iter/s)": 0.286205 }, { "acc": 0.73626699, "epoch": 1.760734403639582, "grad_norm": 7.0, "learning_rate": 3.8615531098032734e-07, "loss": 1.06009407, "memory(GiB)": 142.32, "step": 157420, "train_speed(iter/s)": 0.286217 }, { "acc": 0.7297204, "epoch": 1.7609581025855405, "grad_norm": 6.15625, "learning_rate": 3.854429454285241e-07, "loss": 1.09106655, "memory(GiB)": 142.32, "step": 157440, "train_speed(iter/s)": 0.286228 }, { "acc": 0.72926702, "epoch": 1.761181801531499, "grad_norm": 5.90625, "learning_rate": 3.847312112140822e-07, "loss": 1.0806427, "memory(GiB)": 142.32, "step": 157460, "train_speed(iter/s)": 0.286241 }, { "acc": 0.73988581, "epoch": 1.7614055004774576, "grad_norm": 7.625, "learning_rate": 3.840201084343792e-07, "loss": 1.04653625, "memory(GiB)": 142.32, "step": 157480, "train_speed(iter/s)": 0.286252 }, { "acc": 0.73668413, "epoch": 1.7616291994234161, "grad_norm": 5.375, "learning_rate": 3.833096371867029e-07, "loss": 1.03960896, "memory(GiB)": 142.32, "step": 157500, "train_speed(iter/s)": 0.286264 }, { "acc": 0.73437204, "epoch": 1.7618528983693746, "grad_norm": 5.78125, "learning_rate": 3.8259979756825604e-07, "loss": 1.0579649, "memory(GiB)": 142.32, "step": 157520, "train_speed(iter/s)": 0.286277 }, { "acc": 0.73022356, "epoch": 1.7620765973153332, "grad_norm": 10.0, "learning_rate": 3.8189058967615433e-07, "loss": 1.07616186, "memory(GiB)": 142.32, "step": 157540, "train_speed(iter/s)": 0.286288 }, { "acc": 0.73439875, "epoch": 1.7623002962612917, "grad_norm": 5.34375, "learning_rate": 3.8118201360742736e-07, "loss": 1.07766457, "memory(GiB)": 142.32, "step": 157560, "train_speed(iter/s)": 0.2863 }, { "acc": 0.72984815, "epoch": 1.7625239952072502, "grad_norm": 6.46875, "learning_rate": 3.8047406945901976e-07, "loss": 1.06457891, "memory(GiB)": 142.32, "step": 157580, "train_speed(iter/s)": 0.286314 }, { "acc": 0.73659577, "epoch": 1.7627476941532088, "grad_norm": 5.0625, "learning_rate": 3.797667573277869e-07, "loss": 1.05161152, "memory(GiB)": 142.32, "step": 157600, "train_speed(iter/s)": 0.286326 }, { "acc": 0.73669991, "epoch": 1.7629713930991673, "grad_norm": 6.96875, "learning_rate": 3.7906007731050134e-07, "loss": 1.05382395, "memory(GiB)": 142.32, "step": 157620, "train_speed(iter/s)": 0.286337 }, { "acc": 0.73086748, "epoch": 1.7631950920451258, "grad_norm": 7.0625, "learning_rate": 3.7835402950384516e-07, "loss": 1.06333494, "memory(GiB)": 142.32, "step": 157640, "train_speed(iter/s)": 0.286348 }, { "acc": 0.74654818, "epoch": 1.7634187909910843, "grad_norm": 7.28125, "learning_rate": 3.7764861400441553e-07, "loss": 1.00928507, "memory(GiB)": 142.32, "step": 157660, "train_speed(iter/s)": 0.28636 }, { "acc": 0.75261765, "epoch": 1.7636424899370429, "grad_norm": 5.65625, "learning_rate": 3.7694383090872475e-07, "loss": 0.96664019, "memory(GiB)": 142.32, "step": 157680, "train_speed(iter/s)": 0.286372 }, { "acc": 0.73722878, "epoch": 1.7638661888830014, "grad_norm": 5.6875, "learning_rate": 3.7623968031319566e-07, "loss": 1.0291544, "memory(GiB)": 142.32, "step": 157700, "train_speed(iter/s)": 0.286384 }, { "acc": 0.73846159, "epoch": 1.76408988782896, "grad_norm": 5.1875, "learning_rate": 3.755361623141684e-07, "loss": 1.06010113, "memory(GiB)": 142.32, "step": 157720, "train_speed(iter/s)": 0.286398 }, { "acc": 0.73730612, "epoch": 1.7643135867749185, "grad_norm": 5.875, "learning_rate": 3.7483327700789276e-07, "loss": 1.08063164, "memory(GiB)": 142.32, "step": 157740, "train_speed(iter/s)": 0.28641 }, { "acc": 0.74219141, "epoch": 1.764537285720877, "grad_norm": 6.9375, "learning_rate": 3.7413102449053275e-07, "loss": 1.01849117, "memory(GiB)": 142.32, "step": 157760, "train_speed(iter/s)": 0.28642 }, { "acc": 0.74558086, "epoch": 1.7647609846668355, "grad_norm": 4.9375, "learning_rate": 3.734294048581688e-07, "loss": 0.99529076, "memory(GiB)": 142.32, "step": 157780, "train_speed(iter/s)": 0.286433 }, { "acc": 0.73341761, "epoch": 1.764984683612794, "grad_norm": 6.71875, "learning_rate": 3.727284182067897e-07, "loss": 1.06748848, "memory(GiB)": 142.32, "step": 157800, "train_speed(iter/s)": 0.286446 }, { "acc": 0.73604293, "epoch": 1.7652083825587526, "grad_norm": 5.28125, "learning_rate": 3.720280646323032e-07, "loss": 1.03479233, "memory(GiB)": 142.32, "step": 157820, "train_speed(iter/s)": 0.286459 }, { "acc": 0.74234452, "epoch": 1.765432081504711, "grad_norm": 6.4375, "learning_rate": 3.71328344230526e-07, "loss": 1.02065248, "memory(GiB)": 142.32, "step": 157840, "train_speed(iter/s)": 0.286472 }, { "acc": 0.72920246, "epoch": 1.7656557804506696, "grad_norm": 5.96875, "learning_rate": 3.7062925709718976e-07, "loss": 1.08032608, "memory(GiB)": 142.32, "step": 157860, "train_speed(iter/s)": 0.286485 }, { "acc": 0.73788433, "epoch": 1.7658794793966281, "grad_norm": 5.8125, "learning_rate": 3.6993080332794085e-07, "loss": 1.03346863, "memory(GiB)": 142.32, "step": 157880, "train_speed(iter/s)": 0.286497 }, { "acc": 0.74463582, "epoch": 1.7661031783425867, "grad_norm": 6.125, "learning_rate": 3.692329830183361e-07, "loss": 0.99515896, "memory(GiB)": 142.32, "step": 157900, "train_speed(iter/s)": 0.28651 }, { "acc": 0.73989334, "epoch": 1.7663268772885452, "grad_norm": 5.1875, "learning_rate": 3.6853579626384916e-07, "loss": 1.03241653, "memory(GiB)": 142.32, "step": 157920, "train_speed(iter/s)": 0.286522 }, { "acc": 0.73122711, "epoch": 1.7665505762345037, "grad_norm": 6.34375, "learning_rate": 3.678392431598643e-07, "loss": 1.07158079, "memory(GiB)": 142.32, "step": 157940, "train_speed(iter/s)": 0.286533 }, { "acc": 0.73494625, "epoch": 1.7667742751804623, "grad_norm": 6.84375, "learning_rate": 3.671433238016786e-07, "loss": 1.05388756, "memory(GiB)": 142.32, "step": 157960, "train_speed(iter/s)": 0.286546 }, { "acc": 0.73287125, "epoch": 1.7669979741264208, "grad_norm": 5.71875, "learning_rate": 3.664480382845059e-07, "loss": 1.072859, "memory(GiB)": 142.32, "step": 157980, "train_speed(iter/s)": 0.286558 }, { "acc": 0.7360281, "epoch": 1.7672216730723793, "grad_norm": 5.78125, "learning_rate": 3.657533867034696e-07, "loss": 1.03839684, "memory(GiB)": 142.32, "step": 158000, "train_speed(iter/s)": 0.286571 }, { "epoch": 1.7672216730723793, "eval_acc": 0.6963847607869955, "eval_loss": 1.0713729858398438, "eval_runtime": 2341.0434, "eval_samples_per_second": 32.158, "eval_steps_per_second": 16.079, "step": 158000 }, { "acc": 0.73859611, "epoch": 1.7674453720183378, "grad_norm": 5.4375, "learning_rate": 3.6505936915360974e-07, "loss": 1.04350357, "memory(GiB)": 142.32, "step": 158020, "train_speed(iter/s)": 0.285345 }, { "acc": 0.74664888, "epoch": 1.7676690709642964, "grad_norm": 6.9375, "learning_rate": 3.6436598572987645e-07, "loss": 1.00004444, "memory(GiB)": 142.32, "step": 158040, "train_speed(iter/s)": 0.285358 }, { "acc": 0.7418108, "epoch": 1.767892769910255, "grad_norm": 7.15625, "learning_rate": 3.636732365271339e-07, "loss": 1.01684189, "memory(GiB)": 142.32, "step": 158060, "train_speed(iter/s)": 0.285371 }, { "acc": 0.73211908, "epoch": 1.7681164688562134, "grad_norm": 5.8125, "learning_rate": 3.6298112164016185e-07, "loss": 1.05613537, "memory(GiB)": 142.32, "step": 158080, "train_speed(iter/s)": 0.285383 }, { "acc": 0.73339624, "epoch": 1.768340167802172, "grad_norm": 5.28125, "learning_rate": 3.622896411636495e-07, "loss": 1.0508213, "memory(GiB)": 142.32, "step": 158100, "train_speed(iter/s)": 0.285395 }, { "acc": 0.75260372, "epoch": 1.7685638667481305, "grad_norm": 6.21875, "learning_rate": 3.615987951922034e-07, "loss": 0.97975693, "memory(GiB)": 142.32, "step": 158120, "train_speed(iter/s)": 0.285407 }, { "acc": 0.73126969, "epoch": 1.768787565694089, "grad_norm": 8.1875, "learning_rate": 3.609085838203408e-07, "loss": 1.0543108, "memory(GiB)": 142.32, "step": 158140, "train_speed(iter/s)": 0.285419 }, { "acc": 0.74529171, "epoch": 1.7690112646400475, "grad_norm": 6.09375, "learning_rate": 3.6021900714249045e-07, "loss": 0.9991787, "memory(GiB)": 142.32, "step": 158160, "train_speed(iter/s)": 0.285432 }, { "acc": 0.72961712, "epoch": 1.769234963586006, "grad_norm": 7.15625, "learning_rate": 3.5953006525299817e-07, "loss": 1.08187838, "memory(GiB)": 142.32, "step": 158180, "train_speed(iter/s)": 0.285445 }, { "acc": 0.74448977, "epoch": 1.7694586625319646, "grad_norm": 6.34375, "learning_rate": 3.588417582461201e-07, "loss": 1.01321583, "memory(GiB)": 142.32, "step": 158200, "train_speed(iter/s)": 0.285457 }, { "acc": 0.74575, "epoch": 1.7696823614779231, "grad_norm": 5.9375, "learning_rate": 3.5815408621602767e-07, "loss": 0.99762783, "memory(GiB)": 142.32, "step": 158220, "train_speed(iter/s)": 0.285468 }, { "acc": 0.73880215, "epoch": 1.7699060604238817, "grad_norm": 7.25, "learning_rate": 3.5746704925680385e-07, "loss": 1.03088865, "memory(GiB)": 142.32, "step": 158240, "train_speed(iter/s)": 0.285481 }, { "acc": 0.73813233, "epoch": 1.7701297593698402, "grad_norm": 7.21875, "learning_rate": 3.567806474624436e-07, "loss": 1.04427156, "memory(GiB)": 142.32, "step": 158260, "train_speed(iter/s)": 0.285493 }, { "acc": 0.7289959, "epoch": 1.7703534583157987, "grad_norm": 6.09375, "learning_rate": 3.560948809268583e-07, "loss": 1.06744881, "memory(GiB)": 142.32, "step": 158280, "train_speed(iter/s)": 0.285505 }, { "acc": 0.7340662, "epoch": 1.7705771572617572, "grad_norm": 6.5625, "learning_rate": 3.5540974974386965e-07, "loss": 1.05410023, "memory(GiB)": 142.32, "step": 158300, "train_speed(iter/s)": 0.285517 }, { "acc": 0.73761749, "epoch": 1.7708008562077158, "grad_norm": 6.65625, "learning_rate": 3.547252540072144e-07, "loss": 1.02569962, "memory(GiB)": 142.32, "step": 158320, "train_speed(iter/s)": 0.285529 }, { "acc": 0.74187331, "epoch": 1.7710245551536743, "grad_norm": 6.5, "learning_rate": 3.540413938105408e-07, "loss": 1.02540369, "memory(GiB)": 142.32, "step": 158340, "train_speed(iter/s)": 0.285541 }, { "acc": 0.74184046, "epoch": 1.7712482540996328, "grad_norm": 6.9375, "learning_rate": 3.533581692474092e-07, "loss": 1.03628311, "memory(GiB)": 142.32, "step": 158360, "train_speed(iter/s)": 0.285554 }, { "acc": 0.73864222, "epoch": 1.7714719530455914, "grad_norm": 6.375, "learning_rate": 3.5267558041129745e-07, "loss": 1.05777645, "memory(GiB)": 142.32, "step": 158380, "train_speed(iter/s)": 0.285565 }, { "acc": 0.7474185, "epoch": 1.7716956519915499, "grad_norm": 7.84375, "learning_rate": 3.5199362739559037e-07, "loss": 0.98789053, "memory(GiB)": 142.32, "step": 158400, "train_speed(iter/s)": 0.285579 }, { "acc": 0.72608776, "epoch": 1.7719193509375084, "grad_norm": 6.125, "learning_rate": 3.513123102935917e-07, "loss": 1.08451204, "memory(GiB)": 142.32, "step": 158420, "train_speed(iter/s)": 0.285591 }, { "acc": 0.73781796, "epoch": 1.772143049883467, "grad_norm": 6.71875, "learning_rate": 3.5063162919851344e-07, "loss": 1.02794781, "memory(GiB)": 142.32, "step": 158440, "train_speed(iter/s)": 0.285603 }, { "acc": 0.74339218, "epoch": 1.7723667488294255, "grad_norm": 6.4375, "learning_rate": 3.4995158420348284e-07, "loss": 1.02238522, "memory(GiB)": 142.32, "step": 158460, "train_speed(iter/s)": 0.285616 }, { "acc": 0.73328209, "epoch": 1.772590447775384, "grad_norm": 6.1875, "learning_rate": 3.49272175401541e-07, "loss": 1.0661047, "memory(GiB)": 142.32, "step": 158480, "train_speed(iter/s)": 0.285627 }, { "acc": 0.72786455, "epoch": 1.7728141467213425, "grad_norm": 6.3125, "learning_rate": 3.4859340288563857e-07, "loss": 1.07743111, "memory(GiB)": 142.32, "step": 158500, "train_speed(iter/s)": 0.285641 }, { "acc": 0.72325134, "epoch": 1.773037845667301, "grad_norm": 5.375, "learning_rate": 3.479152667486435e-07, "loss": 1.10316219, "memory(GiB)": 142.32, "step": 158520, "train_speed(iter/s)": 0.285654 }, { "acc": 0.73282013, "epoch": 1.7732615446132596, "grad_norm": 6.59375, "learning_rate": 3.4723776708333325e-07, "loss": 1.08011589, "memory(GiB)": 142.32, "step": 158540, "train_speed(iter/s)": 0.285667 }, { "acc": 0.74448833, "epoch": 1.773485243559218, "grad_norm": 5.625, "learning_rate": 3.465609039823992e-07, "loss": 1.01661453, "memory(GiB)": 142.32, "step": 158560, "train_speed(iter/s)": 0.28568 }, { "acc": 0.73868947, "epoch": 1.7737089425051766, "grad_norm": 5.375, "learning_rate": 3.4588467753844725e-07, "loss": 1.04793587, "memory(GiB)": 142.32, "step": 158580, "train_speed(iter/s)": 0.285691 }, { "acc": 0.74033661, "epoch": 1.7739326414511352, "grad_norm": 7.4375, "learning_rate": 3.4520908784399287e-07, "loss": 1.03471756, "memory(GiB)": 142.32, "step": 158600, "train_speed(iter/s)": 0.285703 }, { "acc": 0.73178601, "epoch": 1.7741563403970937, "grad_norm": 5.875, "learning_rate": 3.4453413499146884e-07, "loss": 1.06202049, "memory(GiB)": 142.32, "step": 158620, "train_speed(iter/s)": 0.285716 }, { "acc": 0.73102169, "epoch": 1.7743800393430522, "grad_norm": 7.40625, "learning_rate": 3.438598190732173e-07, "loss": 1.06714621, "memory(GiB)": 142.32, "step": 158640, "train_speed(iter/s)": 0.285729 }, { "acc": 0.74102325, "epoch": 1.7746037382890107, "grad_norm": 5.625, "learning_rate": 3.4318614018149287e-07, "loss": 1.01684513, "memory(GiB)": 142.32, "step": 158660, "train_speed(iter/s)": 0.285742 }, { "acc": 0.73465247, "epoch": 1.7748274372349693, "grad_norm": 5.125, "learning_rate": 3.425130984084674e-07, "loss": 1.04641247, "memory(GiB)": 142.32, "step": 158680, "train_speed(iter/s)": 0.285754 }, { "acc": 0.73900847, "epoch": 1.7750511361809278, "grad_norm": 6.40625, "learning_rate": 3.4184069384621934e-07, "loss": 1.0378171, "memory(GiB)": 142.32, "step": 158700, "train_speed(iter/s)": 0.285765 }, { "acc": 0.73042502, "epoch": 1.7752748351268863, "grad_norm": 5.5, "learning_rate": 3.411689265867452e-07, "loss": 1.06784763, "memory(GiB)": 142.32, "step": 158720, "train_speed(iter/s)": 0.285777 }, { "acc": 0.74219322, "epoch": 1.7754985340728449, "grad_norm": 7.53125, "learning_rate": 3.404977967219514e-07, "loss": 1.01220512, "memory(GiB)": 142.32, "step": 158740, "train_speed(iter/s)": 0.285788 }, { "acc": 0.72995739, "epoch": 1.7757222330188034, "grad_norm": 5.65625, "learning_rate": 3.3982730434365954e-07, "loss": 1.08556461, "memory(GiB)": 142.32, "step": 158760, "train_speed(iter/s)": 0.285799 }, { "acc": 0.73347111, "epoch": 1.775945931964762, "grad_norm": 5.4375, "learning_rate": 3.3915744954360175e-07, "loss": 1.07768841, "memory(GiB)": 142.32, "step": 158780, "train_speed(iter/s)": 0.285811 }, { "acc": 0.74152555, "epoch": 1.7761696309107204, "grad_norm": 5.625, "learning_rate": 3.3848823241342254e-07, "loss": 1.01361361, "memory(GiB)": 142.32, "step": 158800, "train_speed(iter/s)": 0.285824 }, { "acc": 0.72985764, "epoch": 1.776393329856679, "grad_norm": 6.5, "learning_rate": 3.378196530446826e-07, "loss": 1.06354904, "memory(GiB)": 142.32, "step": 158820, "train_speed(iter/s)": 0.285836 }, { "acc": 0.73776236, "epoch": 1.7766170288026375, "grad_norm": 5.9375, "learning_rate": 3.371517115288514e-07, "loss": 1.04374619, "memory(GiB)": 142.32, "step": 158840, "train_speed(iter/s)": 0.285849 }, { "acc": 0.7424449, "epoch": 1.776840727748596, "grad_norm": 6.3125, "learning_rate": 3.3648440795731373e-07, "loss": 1.0294445, "memory(GiB)": 142.32, "step": 158860, "train_speed(iter/s)": 0.285859 }, { "acc": 0.73738613, "epoch": 1.7770644266945546, "grad_norm": 5.1875, "learning_rate": 3.3581774242136656e-07, "loss": 1.03259983, "memory(GiB)": 142.32, "step": 158880, "train_speed(iter/s)": 0.285873 }, { "acc": 0.7377964, "epoch": 1.777288125640513, "grad_norm": 6.09375, "learning_rate": 3.3515171501221745e-07, "loss": 1.04622135, "memory(GiB)": 142.32, "step": 158900, "train_speed(iter/s)": 0.285886 }, { "acc": 0.73419471, "epoch": 1.7775118245864716, "grad_norm": 6.625, "learning_rate": 3.3448632582099074e-07, "loss": 1.0657917, "memory(GiB)": 142.32, "step": 158920, "train_speed(iter/s)": 0.285898 }, { "acc": 0.73633156, "epoch": 1.7777355235324301, "grad_norm": 5.5625, "learning_rate": 3.338215749387197e-07, "loss": 1.05144444, "memory(GiB)": 142.32, "step": 158940, "train_speed(iter/s)": 0.285909 }, { "acc": 0.72420187, "epoch": 1.7779592224783887, "grad_norm": 4.46875, "learning_rate": 3.331574624563527e-07, "loss": 1.09946957, "memory(GiB)": 142.32, "step": 158960, "train_speed(iter/s)": 0.285923 }, { "acc": 0.74407988, "epoch": 1.7781829214243472, "grad_norm": 4.8125, "learning_rate": 3.324939884647499e-07, "loss": 1.02405052, "memory(GiB)": 142.32, "step": 158980, "train_speed(iter/s)": 0.285937 }, { "acc": 0.74634304, "epoch": 1.7784066203703057, "grad_norm": 5.90625, "learning_rate": 3.318311530546825e-07, "loss": 1.01238785, "memory(GiB)": 142.32, "step": 159000, "train_speed(iter/s)": 0.28595 }, { "acc": 0.74009733, "epoch": 1.7786303193162643, "grad_norm": 7.1875, "learning_rate": 3.3116895631683743e-07, "loss": 1.02046289, "memory(GiB)": 142.32, "step": 159020, "train_speed(iter/s)": 0.285962 }, { "acc": 0.73476548, "epoch": 1.7788540182622228, "grad_norm": 5.78125, "learning_rate": 3.3050739834181167e-07, "loss": 1.04182892, "memory(GiB)": 142.32, "step": 159040, "train_speed(iter/s)": 0.285973 }, { "acc": 0.74548626, "epoch": 1.7790777172081813, "grad_norm": 5.625, "learning_rate": 3.298464792201167e-07, "loss": 1.0184782, "memory(GiB)": 142.32, "step": 159060, "train_speed(iter/s)": 0.285986 }, { "acc": 0.7368701, "epoch": 1.7793014161541398, "grad_norm": 6.84375, "learning_rate": 3.291861990421752e-07, "loss": 1.03611832, "memory(GiB)": 142.32, "step": 159080, "train_speed(iter/s)": 0.285999 }, { "acc": 0.73623648, "epoch": 1.7795251151000984, "grad_norm": 6.03125, "learning_rate": 3.285265578983221e-07, "loss": 1.04934969, "memory(GiB)": 142.32, "step": 159100, "train_speed(iter/s)": 0.286011 }, { "acc": 0.73367777, "epoch": 1.779748814046057, "grad_norm": 5.25, "learning_rate": 3.27867555878808e-07, "loss": 1.06166134, "memory(GiB)": 142.32, "step": 159120, "train_speed(iter/s)": 0.286024 }, { "acc": 0.72981491, "epoch": 1.7799725129920154, "grad_norm": 6.71875, "learning_rate": 3.272091930737914e-07, "loss": 1.07110987, "memory(GiB)": 142.32, "step": 159140, "train_speed(iter/s)": 0.286036 }, { "acc": 0.73005629, "epoch": 1.780196211937974, "grad_norm": 5.25, "learning_rate": 3.265514695733474e-07, "loss": 1.09667273, "memory(GiB)": 142.32, "step": 159160, "train_speed(iter/s)": 0.286048 }, { "acc": 0.73717842, "epoch": 1.7804199108839325, "grad_norm": 6.375, "learning_rate": 3.258943854674612e-07, "loss": 1.06021252, "memory(GiB)": 142.32, "step": 159180, "train_speed(iter/s)": 0.286061 }, { "acc": 0.72808132, "epoch": 1.780643609829891, "grad_norm": 6.1875, "learning_rate": 3.2523794084603097e-07, "loss": 1.08774586, "memory(GiB)": 142.32, "step": 159200, "train_speed(iter/s)": 0.286074 }, { "acc": 0.74689751, "epoch": 1.7808673087758495, "grad_norm": 5.53125, "learning_rate": 3.245821357988688e-07, "loss": 0.98194752, "memory(GiB)": 142.32, "step": 159220, "train_speed(iter/s)": 0.286086 }, { "acc": 0.74954834, "epoch": 1.781091007721808, "grad_norm": 5.75, "learning_rate": 3.2392697041569665e-07, "loss": 0.98952351, "memory(GiB)": 142.32, "step": 159240, "train_speed(iter/s)": 0.286098 }, { "acc": 0.72431707, "epoch": 1.7813147066677666, "grad_norm": 5.1875, "learning_rate": 3.232724447861524e-07, "loss": 1.10280037, "memory(GiB)": 142.32, "step": 159260, "train_speed(iter/s)": 0.28611 }, { "acc": 0.74194851, "epoch": 1.7815384056137251, "grad_norm": 7.15625, "learning_rate": 3.2261855899978324e-07, "loss": 1.0211915, "memory(GiB)": 142.32, "step": 159280, "train_speed(iter/s)": 0.286122 }, { "acc": 0.73546991, "epoch": 1.7817621045596836, "grad_norm": 5.84375, "learning_rate": 3.219653131460498e-07, "loss": 1.05024357, "memory(GiB)": 142.32, "step": 159300, "train_speed(iter/s)": 0.286134 }, { "acc": 0.73359699, "epoch": 1.7819858035056422, "grad_norm": 7.15625, "learning_rate": 3.213127073143274e-07, "loss": 1.06133108, "memory(GiB)": 142.32, "step": 159320, "train_speed(iter/s)": 0.286147 }, { "acc": 0.73489242, "epoch": 1.7822095024516007, "grad_norm": 5.40625, "learning_rate": 3.2066074159389895e-07, "loss": 1.06633148, "memory(GiB)": 142.32, "step": 159340, "train_speed(iter/s)": 0.286158 }, { "acc": 0.74366674, "epoch": 1.7824332013975592, "grad_norm": 6.1875, "learning_rate": 3.200094160739653e-07, "loss": 1.01327152, "memory(GiB)": 142.32, "step": 159360, "train_speed(iter/s)": 0.28617 }, { "acc": 0.75090942, "epoch": 1.7826569003435178, "grad_norm": 6.78125, "learning_rate": 3.193587308436358e-07, "loss": 0.99213438, "memory(GiB)": 142.32, "step": 159380, "train_speed(iter/s)": 0.286182 }, { "acc": 0.73771248, "epoch": 1.7828805992894763, "grad_norm": 6.03125, "learning_rate": 3.1870868599193304e-07, "loss": 1.04062538, "memory(GiB)": 142.32, "step": 159400, "train_speed(iter/s)": 0.286194 }, { "acc": 0.72818427, "epoch": 1.7831042982354348, "grad_norm": 5.59375, "learning_rate": 3.1805928160779433e-07, "loss": 1.08413038, "memory(GiB)": 142.32, "step": 159420, "train_speed(iter/s)": 0.286206 }, { "acc": 0.73288202, "epoch": 1.7833279971813933, "grad_norm": 5.59375, "learning_rate": 3.174105177800646e-07, "loss": 1.05179539, "memory(GiB)": 142.32, "step": 159440, "train_speed(iter/s)": 0.286217 }, { "acc": 0.73434544, "epoch": 1.7835516961273519, "grad_norm": 5.21875, "learning_rate": 3.1676239459750735e-07, "loss": 1.07174053, "memory(GiB)": 142.32, "step": 159460, "train_speed(iter/s)": 0.28623 }, { "acc": 0.73340998, "epoch": 1.7837753950733104, "grad_norm": 7.625, "learning_rate": 3.1611491214879274e-07, "loss": 1.06682358, "memory(GiB)": 142.32, "step": 159480, "train_speed(iter/s)": 0.286243 }, { "acc": 0.73883982, "epoch": 1.783999094019269, "grad_norm": 5.03125, "learning_rate": 3.154680705225055e-07, "loss": 1.03376341, "memory(GiB)": 142.32, "step": 159500, "train_speed(iter/s)": 0.286256 }, { "acc": 0.72414122, "epoch": 1.7842227929652275, "grad_norm": 5.5625, "learning_rate": 3.148218698071448e-07, "loss": 1.1018364, "memory(GiB)": 142.32, "step": 159520, "train_speed(iter/s)": 0.286268 }, { "acc": 0.74079494, "epoch": 1.784446491911186, "grad_norm": 6.25, "learning_rate": 3.141763100911183e-07, "loss": 1.0202383, "memory(GiB)": 142.32, "step": 159540, "train_speed(iter/s)": 0.286281 }, { "acc": 0.74340506, "epoch": 1.7846701908571445, "grad_norm": 5.0625, "learning_rate": 3.135313914627486e-07, "loss": 1.02631454, "memory(GiB)": 142.32, "step": 159560, "train_speed(iter/s)": 0.286293 }, { "acc": 0.74346247, "epoch": 1.784893889803103, "grad_norm": 6.53125, "learning_rate": 3.1288711401027017e-07, "loss": 1.00469532, "memory(GiB)": 142.32, "step": 159580, "train_speed(iter/s)": 0.286306 }, { "acc": 0.74874506, "epoch": 1.7851175887490616, "grad_norm": 6.9375, "learning_rate": 3.1224347782182753e-07, "loss": 0.99827194, "memory(GiB)": 142.32, "step": 159600, "train_speed(iter/s)": 0.286317 }, { "acc": 0.74629335, "epoch": 1.78534128769502, "grad_norm": 5.4375, "learning_rate": 3.116004829854818e-07, "loss": 1.02115402, "memory(GiB)": 142.32, "step": 159620, "train_speed(iter/s)": 0.28633 }, { "acc": 0.74183636, "epoch": 1.7855649866409786, "grad_norm": 4.40625, "learning_rate": 3.109581295892017e-07, "loss": 1.02457752, "memory(GiB)": 142.32, "step": 159640, "train_speed(iter/s)": 0.286343 }, { "acc": 0.73465409, "epoch": 1.7857886855869372, "grad_norm": 5.78125, "learning_rate": 3.103164177208723e-07, "loss": 1.05993214, "memory(GiB)": 142.32, "step": 159660, "train_speed(iter/s)": 0.286355 }, { "acc": 0.72014112, "epoch": 1.7860123845328957, "grad_norm": 6.25, "learning_rate": 3.096753474682873e-07, "loss": 1.11407318, "memory(GiB)": 142.32, "step": 159680, "train_speed(iter/s)": 0.286367 }, { "acc": 0.72842073, "epoch": 1.7862360834788542, "grad_norm": 7.0625, "learning_rate": 3.0903491891915495e-07, "loss": 1.08263111, "memory(GiB)": 142.32, "step": 159700, "train_speed(iter/s)": 0.28638 }, { "acc": 0.7527751, "epoch": 1.7864597824248127, "grad_norm": 7.09375, "learning_rate": 3.083951321610956e-07, "loss": 0.9761713, "memory(GiB)": 142.32, "step": 159720, "train_speed(iter/s)": 0.286393 }, { "acc": 0.74365969, "epoch": 1.7866834813707713, "grad_norm": 7.65625, "learning_rate": 3.0775598728163994e-07, "loss": 1.01896811, "memory(GiB)": 142.32, "step": 159740, "train_speed(iter/s)": 0.286405 }, { "acc": 0.74119153, "epoch": 1.7869071803167298, "grad_norm": 5.96875, "learning_rate": 3.0711748436823343e-07, "loss": 1.02751427, "memory(GiB)": 142.32, "step": 159760, "train_speed(iter/s)": 0.286417 }, { "acc": 0.73336372, "epoch": 1.7871308792626883, "grad_norm": 5.59375, "learning_rate": 3.0647962350823177e-07, "loss": 1.06593227, "memory(GiB)": 142.32, "step": 159780, "train_speed(iter/s)": 0.286428 }, { "acc": 0.73597212, "epoch": 1.7873545782086468, "grad_norm": 6.40625, "learning_rate": 3.058424047889025e-07, "loss": 1.05686092, "memory(GiB)": 142.32, "step": 159800, "train_speed(iter/s)": 0.28644 }, { "acc": 0.73215351, "epoch": 1.7875782771546054, "grad_norm": 5.40625, "learning_rate": 3.052058282974285e-07, "loss": 1.06801786, "memory(GiB)": 142.32, "step": 159820, "train_speed(iter/s)": 0.286454 }, { "acc": 0.74108315, "epoch": 1.787801976100564, "grad_norm": 6.34375, "learning_rate": 3.045698941209002e-07, "loss": 1.02614479, "memory(GiB)": 142.32, "step": 159840, "train_speed(iter/s)": 0.286466 }, { "acc": 0.73931313, "epoch": 1.7880256750465224, "grad_norm": 5.25, "learning_rate": 3.039346023463252e-07, "loss": 1.0404211, "memory(GiB)": 142.32, "step": 159860, "train_speed(iter/s)": 0.286478 }, { "acc": 0.7420495, "epoch": 1.788249373992481, "grad_norm": 6.71875, "learning_rate": 3.0329995306061734e-07, "loss": 1.02219753, "memory(GiB)": 142.32, "step": 159880, "train_speed(iter/s)": 0.286492 }, { "acc": 0.73866329, "epoch": 1.7884730729384395, "grad_norm": 5.9375, "learning_rate": 3.0266594635060817e-07, "loss": 1.04972668, "memory(GiB)": 142.32, "step": 159900, "train_speed(iter/s)": 0.286503 }, { "acc": 0.73598881, "epoch": 1.788696771884398, "grad_norm": 8.0, "learning_rate": 3.020325823030373e-07, "loss": 1.05526466, "memory(GiB)": 142.32, "step": 159920, "train_speed(iter/s)": 0.286516 }, { "acc": 0.74018288, "epoch": 1.7889204708303565, "grad_norm": 5.03125, "learning_rate": 3.013998610045582e-07, "loss": 1.03390217, "memory(GiB)": 142.32, "step": 159940, "train_speed(iter/s)": 0.286528 }, { "acc": 0.73785067, "epoch": 1.789144169776315, "grad_norm": 6.6875, "learning_rate": 3.0076778254173767e-07, "loss": 1.03109627, "memory(GiB)": 142.32, "step": 159960, "train_speed(iter/s)": 0.286541 }, { "acc": 0.73412991, "epoch": 1.7893678687222736, "grad_norm": 6.84375, "learning_rate": 3.0013634700105045e-07, "loss": 1.06124535, "memory(GiB)": 142.32, "step": 159980, "train_speed(iter/s)": 0.286553 }, { "acc": 0.73464222, "epoch": 1.7895915676682321, "grad_norm": 6.03125, "learning_rate": 2.995055544688891e-07, "loss": 1.04463778, "memory(GiB)": 142.32, "step": 160000, "train_speed(iter/s)": 0.286565 }, { "epoch": 1.7895915676682321, "eval_acc": 0.6963874228063679, "eval_loss": 1.0713753700256348, "eval_runtime": 2344.7747, "eval_samples_per_second": 32.107, "eval_steps_per_second": 16.054, "step": 160000 }, { "acc": 0.74288931, "epoch": 1.7898152666141907, "grad_norm": 5.84375, "learning_rate": 2.9887540503155287e-07, "loss": 1.0113512, "memory(GiB)": 142.32, "step": 160020, "train_speed(iter/s)": 0.285354 }, { "acc": 0.72063313, "epoch": 1.7900389655601492, "grad_norm": 4.90625, "learning_rate": 2.982458987752551e-07, "loss": 1.12233925, "memory(GiB)": 142.32, "step": 160040, "train_speed(iter/s)": 0.285366 }, { "acc": 0.73995976, "epoch": 1.7902626645061077, "grad_norm": 6.84375, "learning_rate": 2.97617035786123e-07, "loss": 1.03073196, "memory(GiB)": 142.32, "step": 160060, "train_speed(iter/s)": 0.285378 }, { "acc": 0.74537783, "epoch": 1.7904863634520662, "grad_norm": 5.4375, "learning_rate": 2.969888161501916e-07, "loss": 1.00622959, "memory(GiB)": 142.32, "step": 160080, "train_speed(iter/s)": 0.285392 }, { "acc": 0.7475255, "epoch": 1.7907100623980248, "grad_norm": 5.46875, "learning_rate": 2.963612399534133e-07, "loss": 0.99749584, "memory(GiB)": 142.32, "step": 160100, "train_speed(iter/s)": 0.285404 }, { "acc": 0.73429594, "epoch": 1.7909337613439833, "grad_norm": 7.96875, "learning_rate": 2.957343072816471e-07, "loss": 1.07081432, "memory(GiB)": 142.32, "step": 160120, "train_speed(iter/s)": 0.285416 }, { "acc": 0.73832865, "epoch": 1.7911574602899418, "grad_norm": 7.21875, "learning_rate": 2.951080182206667e-07, "loss": 1.03068619, "memory(GiB)": 142.32, "step": 160140, "train_speed(iter/s)": 0.285428 }, { "acc": 0.73756628, "epoch": 1.7913811592359004, "grad_norm": 8.1875, "learning_rate": 2.9448237285615856e-07, "loss": 1.04092693, "memory(GiB)": 142.32, "step": 160160, "train_speed(iter/s)": 0.28544 }, { "acc": 0.73810558, "epoch": 1.7916048581818589, "grad_norm": 7.4375, "learning_rate": 2.9385737127371807e-07, "loss": 1.04618559, "memory(GiB)": 142.32, "step": 160180, "train_speed(iter/s)": 0.285453 }, { "acc": 0.74377489, "epoch": 1.7918285571278174, "grad_norm": 5.4375, "learning_rate": 2.932330135588568e-07, "loss": 1.0132102, "memory(GiB)": 142.32, "step": 160200, "train_speed(iter/s)": 0.285466 }, { "acc": 0.74018044, "epoch": 1.792052256073776, "grad_norm": 6.875, "learning_rate": 2.926092997969937e-07, "loss": 1.03378305, "memory(GiB)": 142.32, "step": 160220, "train_speed(iter/s)": 0.285477 }, { "acc": 0.73452959, "epoch": 1.7922759550197345, "grad_norm": 6.75, "learning_rate": 2.9198623007346274e-07, "loss": 1.06746368, "memory(GiB)": 142.32, "step": 160240, "train_speed(iter/s)": 0.285488 }, { "acc": 0.72404075, "epoch": 1.792499653965693, "grad_norm": 6.125, "learning_rate": 2.9136380447350843e-07, "loss": 1.12083111, "memory(GiB)": 142.32, "step": 160260, "train_speed(iter/s)": 0.285501 }, { "acc": 0.73632698, "epoch": 1.7927233529116515, "grad_norm": 7.34375, "learning_rate": 2.907420230822872e-07, "loss": 1.05641813, "memory(GiB)": 142.32, "step": 160280, "train_speed(iter/s)": 0.285513 }, { "acc": 0.72634397, "epoch": 1.79294705185761, "grad_norm": 6.625, "learning_rate": 2.9012088598486865e-07, "loss": 1.08436375, "memory(GiB)": 142.32, "step": 160300, "train_speed(iter/s)": 0.285526 }, { "acc": 0.7489954, "epoch": 1.7931707508035686, "grad_norm": 6.46875, "learning_rate": 2.895003932662327e-07, "loss": 0.99820156, "memory(GiB)": 142.32, "step": 160320, "train_speed(iter/s)": 0.285539 }, { "acc": 0.72846413, "epoch": 1.793394449749527, "grad_norm": 5.84375, "learning_rate": 2.8888054501127083e-07, "loss": 1.08267288, "memory(GiB)": 142.32, "step": 160340, "train_speed(iter/s)": 0.285552 }, { "acc": 0.72901444, "epoch": 1.7936181486954856, "grad_norm": 5.75, "learning_rate": 2.8826134130478856e-07, "loss": 1.07353773, "memory(GiB)": 142.32, "step": 160360, "train_speed(iter/s)": 0.285563 }, { "acc": 0.74104352, "epoch": 1.7938418476414442, "grad_norm": 4.875, "learning_rate": 2.8764278223149975e-07, "loss": 1.04326706, "memory(GiB)": 142.32, "step": 160380, "train_speed(iter/s)": 0.285576 }, { "acc": 0.74669762, "epoch": 1.7940655465874027, "grad_norm": 6.84375, "learning_rate": 2.870248678760351e-07, "loss": 1.00317488, "memory(GiB)": 142.32, "step": 160400, "train_speed(iter/s)": 0.285588 }, { "acc": 0.74263697, "epoch": 1.7942892455333612, "grad_norm": 5.75, "learning_rate": 2.8640759832293195e-07, "loss": 1.01888447, "memory(GiB)": 142.32, "step": 160420, "train_speed(iter/s)": 0.285598 }, { "acc": 0.73005862, "epoch": 1.7945129444793197, "grad_norm": 5.5625, "learning_rate": 2.857909736566411e-07, "loss": 1.09162111, "memory(GiB)": 142.32, "step": 160440, "train_speed(iter/s)": 0.285611 }, { "acc": 0.75636187, "epoch": 1.7947366434252783, "grad_norm": 6.125, "learning_rate": 2.8517499396152783e-07, "loss": 0.95218391, "memory(GiB)": 142.32, "step": 160460, "train_speed(iter/s)": 0.285623 }, { "acc": 0.7392283, "epoch": 1.7949603423712368, "grad_norm": 6.65625, "learning_rate": 2.8455965932186467e-07, "loss": 1.03810616, "memory(GiB)": 142.32, "step": 160480, "train_speed(iter/s)": 0.285636 }, { "acc": 0.74054189, "epoch": 1.7951840413171953, "grad_norm": 5.6875, "learning_rate": 2.839449698218405e-07, "loss": 1.02601585, "memory(GiB)": 142.32, "step": 160500, "train_speed(iter/s)": 0.285649 }, { "acc": 0.73380103, "epoch": 1.7954077402631539, "grad_norm": 5.71875, "learning_rate": 2.8333092554555185e-07, "loss": 1.06680202, "memory(GiB)": 142.32, "step": 160520, "train_speed(iter/s)": 0.28566 }, { "acc": 0.74070845, "epoch": 1.7956314392091124, "grad_norm": 6.5625, "learning_rate": 2.827175265770088e-07, "loss": 1.03467236, "memory(GiB)": 142.32, "step": 160540, "train_speed(iter/s)": 0.285671 }, { "acc": 0.73962135, "epoch": 1.795855138155071, "grad_norm": 6.09375, "learning_rate": 2.821047730001347e-07, "loss": 1.03200264, "memory(GiB)": 142.32, "step": 160560, "train_speed(iter/s)": 0.285684 }, { "acc": 0.74695802, "epoch": 1.7960788371010294, "grad_norm": 6.1875, "learning_rate": 2.814926648987609e-07, "loss": 1.00774612, "memory(GiB)": 142.32, "step": 160580, "train_speed(iter/s)": 0.285697 }, { "acc": 0.74283876, "epoch": 1.796302536046988, "grad_norm": 6.0625, "learning_rate": 2.808812023566343e-07, "loss": 1.02202759, "memory(GiB)": 142.32, "step": 160600, "train_speed(iter/s)": 0.285709 }, { "acc": 0.73522611, "epoch": 1.7965262349929465, "grad_norm": 5.3125, "learning_rate": 2.802703854574107e-07, "loss": 1.05848007, "memory(GiB)": 142.32, "step": 160620, "train_speed(iter/s)": 0.28572 }, { "acc": 0.73065538, "epoch": 1.796749933938905, "grad_norm": 6.40625, "learning_rate": 2.7966021428465885e-07, "loss": 1.07503548, "memory(GiB)": 142.32, "step": 160640, "train_speed(iter/s)": 0.285733 }, { "acc": 0.73626394, "epoch": 1.7969736328848636, "grad_norm": 5.875, "learning_rate": 2.7905068892185863e-07, "loss": 1.04307308, "memory(GiB)": 142.32, "step": 160660, "train_speed(iter/s)": 0.285747 }, { "acc": 0.73487005, "epoch": 1.797197331830822, "grad_norm": 6.84375, "learning_rate": 2.784418094524022e-07, "loss": 1.03967266, "memory(GiB)": 142.32, "step": 160680, "train_speed(iter/s)": 0.285759 }, { "acc": 0.72872362, "epoch": 1.7974210307767806, "grad_norm": 6.625, "learning_rate": 2.778335759595929e-07, "loss": 1.10015926, "memory(GiB)": 142.32, "step": 160700, "train_speed(iter/s)": 0.285771 }, { "acc": 0.74030838, "epoch": 1.7976447297227391, "grad_norm": 7.15625, "learning_rate": 2.772259885266465e-07, "loss": 1.04610214, "memory(GiB)": 142.32, "step": 160720, "train_speed(iter/s)": 0.285784 }, { "acc": 0.73711877, "epoch": 1.7978684286686977, "grad_norm": 5.5625, "learning_rate": 2.766190472366875e-07, "loss": 1.0460947, "memory(GiB)": 142.32, "step": 160740, "train_speed(iter/s)": 0.285797 }, { "acc": 0.72821751, "epoch": 1.7980921276146562, "grad_norm": 6.5, "learning_rate": 2.7601275217275626e-07, "loss": 1.08151512, "memory(GiB)": 142.32, "step": 160760, "train_speed(iter/s)": 0.285811 }, { "acc": 0.75157166, "epoch": 1.7983158265606147, "grad_norm": 6.4375, "learning_rate": 2.754071034178013e-07, "loss": 0.98166828, "memory(GiB)": 142.32, "step": 160780, "train_speed(iter/s)": 0.285822 }, { "acc": 0.72909813, "epoch": 1.7985395255065733, "grad_norm": 5.9375, "learning_rate": 2.748021010546853e-07, "loss": 1.08300724, "memory(GiB)": 142.32, "step": 160800, "train_speed(iter/s)": 0.285834 }, { "acc": 0.72865901, "epoch": 1.7987632244525318, "grad_norm": 6.3125, "learning_rate": 2.741977451661809e-07, "loss": 1.06749992, "memory(GiB)": 142.32, "step": 160820, "train_speed(iter/s)": 0.285846 }, { "acc": 0.73474226, "epoch": 1.7989869233984903, "grad_norm": 7.21875, "learning_rate": 2.735940358349709e-07, "loss": 1.05650406, "memory(GiB)": 142.32, "step": 160840, "train_speed(iter/s)": 0.285858 }, { "acc": 0.75239916, "epoch": 1.7992106223444488, "grad_norm": 7.0, "learning_rate": 2.7299097314365366e-07, "loss": 0.97111206, "memory(GiB)": 142.32, "step": 160860, "train_speed(iter/s)": 0.28587 }, { "acc": 0.73673515, "epoch": 1.7994343212904074, "grad_norm": 6.34375, "learning_rate": 2.7238855717473535e-07, "loss": 1.05503216, "memory(GiB)": 142.32, "step": 160880, "train_speed(iter/s)": 0.285882 }, { "acc": 0.7439919, "epoch": 1.799658020236366, "grad_norm": 5.5625, "learning_rate": 2.7178678801063563e-07, "loss": 1.00838375, "memory(GiB)": 142.32, "step": 160900, "train_speed(iter/s)": 0.285895 }, { "acc": 0.74049916, "epoch": 1.7998817191823244, "grad_norm": 6.28125, "learning_rate": 2.711856657336859e-07, "loss": 1.03496056, "memory(GiB)": 142.32, "step": 160920, "train_speed(iter/s)": 0.285905 }, { "acc": 0.74612837, "epoch": 1.800105418128283, "grad_norm": 6.0, "learning_rate": 2.705851904261259e-07, "loss": 0.99525213, "memory(GiB)": 142.32, "step": 160940, "train_speed(iter/s)": 0.285916 }, { "acc": 0.73684897, "epoch": 1.8003291170742415, "grad_norm": 7.34375, "learning_rate": 2.6998536217011207e-07, "loss": 1.02357359, "memory(GiB)": 142.32, "step": 160960, "train_speed(iter/s)": 0.285928 }, { "acc": 0.73597727, "epoch": 1.8005528160202, "grad_norm": 6.4375, "learning_rate": 2.693861810477072e-07, "loss": 1.07888966, "memory(GiB)": 142.32, "step": 160980, "train_speed(iter/s)": 0.285941 }, { "acc": 0.73111439, "epoch": 1.8007765149661585, "grad_norm": 7.21875, "learning_rate": 2.687876471408896e-07, "loss": 1.07272053, "memory(GiB)": 142.32, "step": 161000, "train_speed(iter/s)": 0.285953 }, { "acc": 0.73870335, "epoch": 1.801000213912117, "grad_norm": 5.46875, "learning_rate": 2.681897605315464e-07, "loss": 1.01885433, "memory(GiB)": 142.32, "step": 161020, "train_speed(iter/s)": 0.285966 }, { "acc": 0.74230347, "epoch": 1.8012239128580756, "grad_norm": 8.875, "learning_rate": 2.675925213014774e-07, "loss": 1.00913057, "memory(GiB)": 142.32, "step": 161040, "train_speed(iter/s)": 0.285978 }, { "acc": 0.7189395, "epoch": 1.8014476118040341, "grad_norm": 5.5625, "learning_rate": 2.669959295323926e-07, "loss": 1.11767616, "memory(GiB)": 142.32, "step": 161060, "train_speed(iter/s)": 0.285989 }, { "acc": 0.72683897, "epoch": 1.8016713107499926, "grad_norm": 5.875, "learning_rate": 2.663999853059146e-07, "loss": 1.09862185, "memory(GiB)": 142.32, "step": 161080, "train_speed(iter/s)": 0.286001 }, { "acc": 0.73745775, "epoch": 1.8018950096959512, "grad_norm": 5.28125, "learning_rate": 2.658046887035781e-07, "loss": 1.04608803, "memory(GiB)": 142.32, "step": 161100, "train_speed(iter/s)": 0.286014 }, { "acc": 0.73140516, "epoch": 1.8021187086419097, "grad_norm": 7.375, "learning_rate": 2.6521003980682637e-07, "loss": 1.06913347, "memory(GiB)": 142.32, "step": 161120, "train_speed(iter/s)": 0.286028 }, { "acc": 0.72689543, "epoch": 1.8023424075878682, "grad_norm": 7.9375, "learning_rate": 2.6461603869701814e-07, "loss": 1.08782539, "memory(GiB)": 142.32, "step": 161140, "train_speed(iter/s)": 0.28604 }, { "acc": 0.73697858, "epoch": 1.8025661065338268, "grad_norm": 6.125, "learning_rate": 2.6402268545541955e-07, "loss": 1.03675919, "memory(GiB)": 142.32, "step": 161160, "train_speed(iter/s)": 0.286052 }, { "acc": 0.72604618, "epoch": 1.8027898054797853, "grad_norm": 6.9375, "learning_rate": 2.6342998016320954e-07, "loss": 1.09049006, "memory(GiB)": 142.32, "step": 161180, "train_speed(iter/s)": 0.286065 }, { "acc": 0.75341029, "epoch": 1.8030135044257438, "grad_norm": 5.5625, "learning_rate": 2.6283792290148046e-07, "loss": 0.97926464, "memory(GiB)": 142.32, "step": 161200, "train_speed(iter/s)": 0.286077 }, { "acc": 0.73595839, "epoch": 1.8032372033717023, "grad_norm": 5.28125, "learning_rate": 2.622465137512326e-07, "loss": 1.05664787, "memory(GiB)": 142.32, "step": 161220, "train_speed(iter/s)": 0.28609 }, { "acc": 0.73426552, "epoch": 1.8034609023176609, "grad_norm": 6.15625, "learning_rate": 2.6165575279337995e-07, "loss": 1.0629612, "memory(GiB)": 142.32, "step": 161240, "train_speed(iter/s)": 0.286102 }, { "acc": 0.75079651, "epoch": 1.8036846012636194, "grad_norm": 7.4375, "learning_rate": 2.6106564010874744e-07, "loss": 0.9813529, "memory(GiB)": 142.32, "step": 161260, "train_speed(iter/s)": 0.286116 }, { "acc": 0.7320879, "epoch": 1.803908300209578, "grad_norm": 5.59375, "learning_rate": 2.604761757780694e-07, "loss": 1.07551327, "memory(GiB)": 142.32, "step": 161280, "train_speed(iter/s)": 0.28613 }, { "acc": 0.73805971, "epoch": 1.8041319991555365, "grad_norm": 5.65625, "learning_rate": 2.5988735988199453e-07, "loss": 1.03135738, "memory(GiB)": 142.32, "step": 161300, "train_speed(iter/s)": 0.286142 }, { "acc": 0.73124104, "epoch": 1.804355698101495, "grad_norm": 5.625, "learning_rate": 2.5929919250108016e-07, "loss": 1.07414227, "memory(GiB)": 142.32, "step": 161320, "train_speed(iter/s)": 0.286154 }, { "acc": 0.7400197, "epoch": 1.8045793970474535, "grad_norm": 6.21875, "learning_rate": 2.5871167371579684e-07, "loss": 1.02712431, "memory(GiB)": 142.32, "step": 161340, "train_speed(iter/s)": 0.286166 }, { "acc": 0.72663121, "epoch": 1.804803095993412, "grad_norm": 6.46875, "learning_rate": 2.5812480360652534e-07, "loss": 1.0924572, "memory(GiB)": 142.32, "step": 161360, "train_speed(iter/s)": 0.286179 }, { "acc": 0.73927546, "epoch": 1.8050267949393706, "grad_norm": 7.625, "learning_rate": 2.57538582253557e-07, "loss": 1.01954594, "memory(GiB)": 142.32, "step": 161380, "train_speed(iter/s)": 0.286192 }, { "acc": 0.73788109, "epoch": 1.805250493885329, "grad_norm": 6.6875, "learning_rate": 2.569530097370965e-07, "loss": 1.0486805, "memory(GiB)": 142.32, "step": 161400, "train_speed(iter/s)": 0.286204 }, { "acc": 0.73470097, "epoch": 1.8054741928312876, "grad_norm": 6.8125, "learning_rate": 2.5636808613725804e-07, "loss": 1.05379276, "memory(GiB)": 142.32, "step": 161420, "train_speed(iter/s)": 0.286215 }, { "acc": 0.73887796, "epoch": 1.8056978917772462, "grad_norm": 6.28125, "learning_rate": 2.557838115340677e-07, "loss": 1.02908707, "memory(GiB)": 142.32, "step": 161440, "train_speed(iter/s)": 0.286228 }, { "acc": 0.73069057, "epoch": 1.8059215907232047, "grad_norm": 6.09375, "learning_rate": 2.552001860074621e-07, "loss": 1.06236935, "memory(GiB)": 142.32, "step": 161460, "train_speed(iter/s)": 0.286242 }, { "acc": 0.73862576, "epoch": 1.8061452896691632, "grad_norm": 7.40625, "learning_rate": 2.546172096372895e-07, "loss": 1.02926006, "memory(GiB)": 142.32, "step": 161480, "train_speed(iter/s)": 0.286253 }, { "acc": 0.74400373, "epoch": 1.8063689886151217, "grad_norm": 5.78125, "learning_rate": 2.5403488250331066e-07, "loss": 1.00899658, "memory(GiB)": 142.32, "step": 161500, "train_speed(iter/s)": 0.286266 }, { "acc": 0.74201365, "epoch": 1.8065926875610803, "grad_norm": 5.78125, "learning_rate": 2.534532046851945e-07, "loss": 1.02401648, "memory(GiB)": 142.32, "step": 161520, "train_speed(iter/s)": 0.286277 }, { "acc": 0.7269732, "epoch": 1.8068163865070388, "grad_norm": 5.375, "learning_rate": 2.528721762625247e-07, "loss": 1.1080143, "memory(GiB)": 142.32, "step": 161540, "train_speed(iter/s)": 0.286289 }, { "acc": 0.72992845, "epoch": 1.8070400854529973, "grad_norm": 6.53125, "learning_rate": 2.522917973147926e-07, "loss": 1.06799812, "memory(GiB)": 142.32, "step": 161560, "train_speed(iter/s)": 0.286302 }, { "acc": 0.73253264, "epoch": 1.8072637843989559, "grad_norm": 5.90625, "learning_rate": 2.51712067921403e-07, "loss": 1.05670776, "memory(GiB)": 142.32, "step": 161580, "train_speed(iter/s)": 0.286314 }, { "acc": 0.7389102, "epoch": 1.8074874833449144, "grad_norm": 6.40625, "learning_rate": 2.511329881616714e-07, "loss": 1.02961435, "memory(GiB)": 142.32, "step": 161600, "train_speed(iter/s)": 0.286327 }, { "acc": 0.72926235, "epoch": 1.807711182290873, "grad_norm": 6.0625, "learning_rate": 2.505545581148239e-07, "loss": 1.068818, "memory(GiB)": 142.32, "step": 161620, "train_speed(iter/s)": 0.286339 }, { "acc": 0.73767548, "epoch": 1.8079348812368314, "grad_norm": 5.75, "learning_rate": 2.499767778599982e-07, "loss": 1.02662354, "memory(GiB)": 142.32, "step": 161640, "train_speed(iter/s)": 0.286351 }, { "acc": 0.74906616, "epoch": 1.80815858018279, "grad_norm": 6.65625, "learning_rate": 2.493996474762428e-07, "loss": 0.97993908, "memory(GiB)": 142.32, "step": 161660, "train_speed(iter/s)": 0.286363 }, { "acc": 0.72697077, "epoch": 1.8083822791287485, "grad_norm": 8.6875, "learning_rate": 2.4882316704251674e-07, "loss": 1.09020176, "memory(GiB)": 142.32, "step": 161680, "train_speed(iter/s)": 0.286376 }, { "acc": 0.74398651, "epoch": 1.808605978074707, "grad_norm": 6.125, "learning_rate": 2.4824733663769197e-07, "loss": 1.01488781, "memory(GiB)": 142.32, "step": 161700, "train_speed(iter/s)": 0.286388 }, { "acc": 0.74685431, "epoch": 1.8088296770206655, "grad_norm": 6.34375, "learning_rate": 2.476721563405493e-07, "loss": 1.01180277, "memory(GiB)": 142.32, "step": 161720, "train_speed(iter/s)": 0.2864 }, { "acc": 0.73060846, "epoch": 1.809053375966624, "grad_norm": 5.53125, "learning_rate": 2.470976262297825e-07, "loss": 1.0972456, "memory(GiB)": 142.32, "step": 161740, "train_speed(iter/s)": 0.286411 }, { "acc": 0.73404212, "epoch": 1.8092770749125826, "grad_norm": 6.65625, "learning_rate": 2.4652374638399534e-07, "loss": 1.06319065, "memory(GiB)": 142.32, "step": 161760, "train_speed(iter/s)": 0.286423 }, { "acc": 0.72814808, "epoch": 1.8095007738585411, "grad_norm": 5.3125, "learning_rate": 2.459505168817017e-07, "loss": 1.08006496, "memory(GiB)": 142.32, "step": 161780, "train_speed(iter/s)": 0.286434 }, { "acc": 0.73883109, "epoch": 1.8097244728044997, "grad_norm": 5.8125, "learning_rate": 2.4537793780132935e-07, "loss": 1.04745598, "memory(GiB)": 142.32, "step": 161800, "train_speed(iter/s)": 0.286445 }, { "acc": 0.73518562, "epoch": 1.8099481717504582, "grad_norm": 6.75, "learning_rate": 2.448060092212129e-07, "loss": 1.060958, "memory(GiB)": 142.32, "step": 161820, "train_speed(iter/s)": 0.286456 }, { "acc": 0.73520441, "epoch": 1.8101718706964167, "grad_norm": 6.71875, "learning_rate": 2.4423473121960305e-07, "loss": 1.05736475, "memory(GiB)": 142.32, "step": 161840, "train_speed(iter/s)": 0.286469 }, { "acc": 0.74134173, "epoch": 1.8103955696423752, "grad_norm": 5.5, "learning_rate": 2.4366410387465735e-07, "loss": 1.02191963, "memory(GiB)": 142.32, "step": 161860, "train_speed(iter/s)": 0.28648 }, { "acc": 0.72533569, "epoch": 1.8106192685883338, "grad_norm": 4.90625, "learning_rate": 2.4309412726444546e-07, "loss": 1.08905516, "memory(GiB)": 142.32, "step": 161880, "train_speed(iter/s)": 0.286491 }, { "acc": 0.74235096, "epoch": 1.8108429675342923, "grad_norm": 6.6875, "learning_rate": 2.42524801466949e-07, "loss": 1.02517776, "memory(GiB)": 142.32, "step": 161900, "train_speed(iter/s)": 0.286502 }, { "acc": 0.74464827, "epoch": 1.8110666664802508, "grad_norm": 5.53125, "learning_rate": 2.419561265600595e-07, "loss": 0.98700066, "memory(GiB)": 142.32, "step": 161920, "train_speed(iter/s)": 0.286515 }, { "acc": 0.73527145, "epoch": 1.8112903654262094, "grad_norm": 5.59375, "learning_rate": 2.413881026215803e-07, "loss": 1.06215038, "memory(GiB)": 142.32, "step": 161940, "train_speed(iter/s)": 0.286526 }, { "acc": 0.73622227, "epoch": 1.8115140643721679, "grad_norm": 6.46875, "learning_rate": 2.4082072972922533e-07, "loss": 1.04314537, "memory(GiB)": 142.32, "step": 161960, "train_speed(iter/s)": 0.286537 }, { "acc": 0.73431377, "epoch": 1.8117377633181264, "grad_norm": 6.65625, "learning_rate": 2.4025400796061803e-07, "loss": 1.05678349, "memory(GiB)": 142.32, "step": 161980, "train_speed(iter/s)": 0.286549 }, { "acc": 0.73463011, "epoch": 1.811961462264085, "grad_norm": 5.6875, "learning_rate": 2.396879373932953e-07, "loss": 1.06382008, "memory(GiB)": 142.32, "step": 162000, "train_speed(iter/s)": 0.286561 }, { "epoch": 1.811961462264085, "eval_acc": 0.6963849086769606, "eval_loss": 1.0713517665863037, "eval_runtime": 2340.944, "eval_samples_per_second": 32.159, "eval_steps_per_second": 16.08, "step": 162000 }, { "acc": 0.75009217, "epoch": 1.8121851612100435, "grad_norm": 5.875, "learning_rate": 2.39122518104703e-07, "loss": 0.97882462, "memory(GiB)": 142.32, "step": 162020, "train_speed(iter/s)": 0.285366 }, { "acc": 0.74071503, "epoch": 1.812408860156002, "grad_norm": 6.5625, "learning_rate": 2.385577501721992e-07, "loss": 1.03276405, "memory(GiB)": 142.32, "step": 162040, "train_speed(iter/s)": 0.285378 }, { "acc": 0.73400507, "epoch": 1.8126325591019605, "grad_norm": 6.4375, "learning_rate": 2.3799363367305206e-07, "loss": 1.04277477, "memory(GiB)": 142.32, "step": 162060, "train_speed(iter/s)": 0.285391 }, { "acc": 0.74692326, "epoch": 1.812856258047919, "grad_norm": 6.09375, "learning_rate": 2.3743016868443936e-07, "loss": 1.00586767, "memory(GiB)": 142.32, "step": 162080, "train_speed(iter/s)": 0.285403 }, { "acc": 0.74398179, "epoch": 1.8130799569938776, "grad_norm": 6.65625, "learning_rate": 2.3686735528345385e-07, "loss": 1.01099091, "memory(GiB)": 142.32, "step": 162100, "train_speed(iter/s)": 0.285416 }, { "acc": 0.732658, "epoch": 1.8133036559398361, "grad_norm": 5.875, "learning_rate": 2.363051935470939e-07, "loss": 1.0562809, "memory(GiB)": 142.32, "step": 162120, "train_speed(iter/s)": 0.285427 }, { "acc": 0.74754953, "epoch": 1.8135273548857946, "grad_norm": 5.46875, "learning_rate": 2.3574368355227306e-07, "loss": 1.00275192, "memory(GiB)": 142.32, "step": 162140, "train_speed(iter/s)": 0.285439 }, { "acc": 0.74327803, "epoch": 1.8137510538317532, "grad_norm": 6.90625, "learning_rate": 2.3518282537581316e-07, "loss": 1.00401583, "memory(GiB)": 142.32, "step": 162160, "train_speed(iter/s)": 0.285451 }, { "acc": 0.74126968, "epoch": 1.8139747527777117, "grad_norm": 5.25, "learning_rate": 2.346226190944473e-07, "loss": 1.02646103, "memory(GiB)": 142.32, "step": 162180, "train_speed(iter/s)": 0.285462 }, { "acc": 0.73957129, "epoch": 1.8141984517236702, "grad_norm": 6.6875, "learning_rate": 2.3406306478482078e-07, "loss": 1.03074608, "memory(GiB)": 142.32, "step": 162200, "train_speed(iter/s)": 0.285473 }, { "acc": 0.72852702, "epoch": 1.8144221506696288, "grad_norm": 6.75, "learning_rate": 2.3350416252348685e-07, "loss": 1.08330297, "memory(GiB)": 142.32, "step": 162220, "train_speed(iter/s)": 0.285484 }, { "acc": 0.73927536, "epoch": 1.8146458496155873, "grad_norm": 6.03125, "learning_rate": 2.3294591238691377e-07, "loss": 1.02829142, "memory(GiB)": 142.32, "step": 162240, "train_speed(iter/s)": 0.285496 }, { "acc": 0.72089443, "epoch": 1.8148695485615458, "grad_norm": 6.09375, "learning_rate": 2.3238831445147603e-07, "loss": 1.1074749, "memory(GiB)": 142.32, "step": 162260, "train_speed(iter/s)": 0.285508 }, { "acc": 0.73712273, "epoch": 1.8150932475075043, "grad_norm": 7.84375, "learning_rate": 2.3183136879346202e-07, "loss": 1.05202169, "memory(GiB)": 142.32, "step": 162280, "train_speed(iter/s)": 0.285521 }, { "acc": 0.73955259, "epoch": 1.8153169464534629, "grad_norm": 5.6875, "learning_rate": 2.312750754890697e-07, "loss": 1.02294846, "memory(GiB)": 142.32, "step": 162300, "train_speed(iter/s)": 0.285533 }, { "acc": 0.73390236, "epoch": 1.8155406453994214, "grad_norm": 6.3125, "learning_rate": 2.3071943461440705e-07, "loss": 1.0701829, "memory(GiB)": 142.32, "step": 162320, "train_speed(iter/s)": 0.285545 }, { "acc": 0.7319787, "epoch": 1.81576434434538, "grad_norm": 5.59375, "learning_rate": 2.3016444624549495e-07, "loss": 1.08582544, "memory(GiB)": 142.32, "step": 162340, "train_speed(iter/s)": 0.285555 }, { "acc": 0.73259044, "epoch": 1.8159880432913384, "grad_norm": 6.21875, "learning_rate": 2.2961011045826264e-07, "loss": 1.06832151, "memory(GiB)": 142.32, "step": 162360, "train_speed(iter/s)": 0.285568 }, { "acc": 0.73687143, "epoch": 1.816211742237297, "grad_norm": 5.65625, "learning_rate": 2.2905642732855283e-07, "loss": 1.04864101, "memory(GiB)": 142.32, "step": 162380, "train_speed(iter/s)": 0.285581 }, { "acc": 0.74476633, "epoch": 1.8164354411832555, "grad_norm": 6.15625, "learning_rate": 2.2850339693211542e-07, "loss": 0.99683418, "memory(GiB)": 142.32, "step": 162400, "train_speed(iter/s)": 0.285592 }, { "acc": 0.73043861, "epoch": 1.816659140129214, "grad_norm": 5.90625, "learning_rate": 2.279510193446144e-07, "loss": 1.08135042, "memory(GiB)": 142.32, "step": 162420, "train_speed(iter/s)": 0.285604 }, { "acc": 0.72877827, "epoch": 1.8168828390751726, "grad_norm": 5.40625, "learning_rate": 2.2739929464162202e-07, "loss": 1.06736965, "memory(GiB)": 142.32, "step": 162440, "train_speed(iter/s)": 0.285616 }, { "acc": 0.73453913, "epoch": 1.817106538021131, "grad_norm": 5.03125, "learning_rate": 2.2684822289862184e-07, "loss": 1.04405384, "memory(GiB)": 142.32, "step": 162460, "train_speed(iter/s)": 0.285627 }, { "acc": 0.73586574, "epoch": 1.8173302369670896, "grad_norm": 5.34375, "learning_rate": 2.2629780419100967e-07, "loss": 1.0639925, "memory(GiB)": 142.32, "step": 162480, "train_speed(iter/s)": 0.285638 }, { "acc": 0.73586874, "epoch": 1.8175539359130481, "grad_norm": 5.3125, "learning_rate": 2.2574803859408855e-07, "loss": 1.03880396, "memory(GiB)": 142.32, "step": 162500, "train_speed(iter/s)": 0.28565 }, { "acc": 0.72517118, "epoch": 1.8177776348590067, "grad_norm": 6.0, "learning_rate": 2.251989261830767e-07, "loss": 1.10582285, "memory(GiB)": 142.32, "step": 162520, "train_speed(iter/s)": 0.285661 }, { "acc": 0.74236803, "epoch": 1.8180013338049652, "grad_norm": 5.96875, "learning_rate": 2.24650467033099e-07, "loss": 1.01980877, "memory(GiB)": 142.32, "step": 162540, "train_speed(iter/s)": 0.285674 }, { "acc": 0.74593091, "epoch": 1.8182250327509237, "grad_norm": 7.21875, "learning_rate": 2.2410266121919266e-07, "loss": 1.01003246, "memory(GiB)": 142.32, "step": 162560, "train_speed(iter/s)": 0.285686 }, { "acc": 0.75249386, "epoch": 1.8184487316968823, "grad_norm": 5.46875, "learning_rate": 2.2355550881630604e-07, "loss": 0.96932373, "memory(GiB)": 142.32, "step": 162580, "train_speed(iter/s)": 0.285698 }, { "acc": 0.73581729, "epoch": 1.8186724306428408, "grad_norm": 5.90625, "learning_rate": 2.2300900989929654e-07, "loss": 1.03279142, "memory(GiB)": 142.32, "step": 162600, "train_speed(iter/s)": 0.285711 }, { "acc": 0.73512459, "epoch": 1.8188961295887993, "grad_norm": 7.1875, "learning_rate": 2.224631645429337e-07, "loss": 1.06608772, "memory(GiB)": 142.32, "step": 162620, "train_speed(iter/s)": 0.285724 }, { "acc": 0.73902693, "epoch": 1.8191198285347578, "grad_norm": 7.375, "learning_rate": 2.2191797282189676e-07, "loss": 1.04145527, "memory(GiB)": 142.32, "step": 162640, "train_speed(iter/s)": 0.285737 }, { "acc": 0.74163713, "epoch": 1.8193435274807164, "grad_norm": 7.09375, "learning_rate": 2.213734348107749e-07, "loss": 1.02627182, "memory(GiB)": 142.32, "step": 162660, "train_speed(iter/s)": 0.285748 }, { "acc": 0.740307, "epoch": 1.819567226426675, "grad_norm": 6.4375, "learning_rate": 2.2082955058407073e-07, "loss": 1.02776623, "memory(GiB)": 142.32, "step": 162680, "train_speed(iter/s)": 0.28576 }, { "acc": 0.72669563, "epoch": 1.8197909253726334, "grad_norm": 6.6875, "learning_rate": 2.2028632021619257e-07, "loss": 1.08939075, "memory(GiB)": 142.32, "step": 162700, "train_speed(iter/s)": 0.285771 }, { "acc": 0.73824563, "epoch": 1.820014624318592, "grad_norm": 6.96875, "learning_rate": 2.197437437814648e-07, "loss": 1.03457212, "memory(GiB)": 142.32, "step": 162720, "train_speed(iter/s)": 0.285784 }, { "acc": 0.734233, "epoch": 1.8202383232645505, "grad_norm": 5.6875, "learning_rate": 2.192018213541186e-07, "loss": 1.06833591, "memory(GiB)": 142.32, "step": 162740, "train_speed(iter/s)": 0.285795 }, { "acc": 0.73687778, "epoch": 1.820462022210509, "grad_norm": 5.28125, "learning_rate": 2.186605530082958e-07, "loss": 1.053088, "memory(GiB)": 142.32, "step": 162760, "train_speed(iter/s)": 0.285805 }, { "acc": 0.74265146, "epoch": 1.8206857211564675, "grad_norm": 5.5625, "learning_rate": 2.1811993881805104e-07, "loss": 1.02315969, "memory(GiB)": 142.32, "step": 162780, "train_speed(iter/s)": 0.285815 }, { "acc": 0.73869591, "epoch": 1.820909420102426, "grad_norm": 5.71875, "learning_rate": 2.1757997885734684e-07, "loss": 1.02696476, "memory(GiB)": 142.32, "step": 162800, "train_speed(iter/s)": 0.285828 }, { "acc": 0.73869095, "epoch": 1.8211331190483846, "grad_norm": 5.6875, "learning_rate": 2.1704067320005907e-07, "loss": 1.0550148, "memory(GiB)": 142.32, "step": 162820, "train_speed(iter/s)": 0.28584 }, { "acc": 0.73398724, "epoch": 1.8213568179943431, "grad_norm": 7.0625, "learning_rate": 2.1650202191997094e-07, "loss": 1.06765079, "memory(GiB)": 142.32, "step": 162840, "train_speed(iter/s)": 0.285849 }, { "acc": 0.72326603, "epoch": 1.8215805169403017, "grad_norm": 5.8125, "learning_rate": 2.1596402509077742e-07, "loss": 1.11976337, "memory(GiB)": 142.32, "step": 162860, "train_speed(iter/s)": 0.285862 }, { "acc": 0.73990183, "epoch": 1.8218042158862602, "grad_norm": 5.90625, "learning_rate": 2.154266827860857e-07, "loss": 1.02359257, "memory(GiB)": 142.32, "step": 162880, "train_speed(iter/s)": 0.285874 }, { "acc": 0.74730206, "epoch": 1.8220279148322187, "grad_norm": 6.34375, "learning_rate": 2.148899950794109e-07, "loss": 0.98758841, "memory(GiB)": 142.32, "step": 162900, "train_speed(iter/s)": 0.285886 }, { "acc": 0.73520613, "epoch": 1.8222516137781772, "grad_norm": 6.25, "learning_rate": 2.143539620441798e-07, "loss": 1.03715706, "memory(GiB)": 142.32, "step": 162920, "train_speed(iter/s)": 0.285898 }, { "acc": 0.74215059, "epoch": 1.8224753127241358, "grad_norm": 6.6875, "learning_rate": 2.1381858375372987e-07, "loss": 1.0247427, "memory(GiB)": 142.32, "step": 162940, "train_speed(iter/s)": 0.28591 }, { "acc": 0.74383278, "epoch": 1.8226990116700943, "grad_norm": 6.0, "learning_rate": 2.13283860281307e-07, "loss": 1.01374846, "memory(GiB)": 142.32, "step": 162960, "train_speed(iter/s)": 0.285922 }, { "acc": 0.73982377, "epoch": 1.8229227106160528, "grad_norm": 7.40625, "learning_rate": 2.1274979170007038e-07, "loss": 1.03460865, "memory(GiB)": 142.32, "step": 162980, "train_speed(iter/s)": 0.285935 }, { "acc": 0.74032145, "epoch": 1.8231464095620113, "grad_norm": 6.5, "learning_rate": 2.122163780830877e-07, "loss": 1.01879339, "memory(GiB)": 142.32, "step": 163000, "train_speed(iter/s)": 0.285947 }, { "acc": 0.74054098, "epoch": 1.8233701085079699, "grad_norm": 5.3125, "learning_rate": 2.1168361950333781e-07, "loss": 1.0379015, "memory(GiB)": 142.32, "step": 163020, "train_speed(iter/s)": 0.285957 }, { "acc": 0.74407344, "epoch": 1.8235938074539284, "grad_norm": 5.90625, "learning_rate": 2.111515160337102e-07, "loss": 1.02400284, "memory(GiB)": 142.32, "step": 163040, "train_speed(iter/s)": 0.28597 }, { "acc": 0.73289576, "epoch": 1.823817506399887, "grad_norm": 5.90625, "learning_rate": 2.1062006774700216e-07, "loss": 1.06108875, "memory(GiB)": 142.32, "step": 163060, "train_speed(iter/s)": 0.285983 }, { "acc": 0.72863741, "epoch": 1.8240412053458455, "grad_norm": 5.125, "learning_rate": 2.100892747159261e-07, "loss": 1.08350639, "memory(GiB)": 142.32, "step": 163080, "train_speed(iter/s)": 0.285995 }, { "acc": 0.7252636, "epoch": 1.824264904291804, "grad_norm": 5.9375, "learning_rate": 2.0955913701310005e-07, "loss": 1.1018549, "memory(GiB)": 142.32, "step": 163100, "train_speed(iter/s)": 0.286007 }, { "acc": 0.73125477, "epoch": 1.8244886032377625, "grad_norm": 6.625, "learning_rate": 2.0902965471105597e-07, "loss": 1.07885323, "memory(GiB)": 142.32, "step": 163120, "train_speed(iter/s)": 0.286017 }, { "acc": 0.73711438, "epoch": 1.824712302183721, "grad_norm": 6.3125, "learning_rate": 2.0850082788223424e-07, "loss": 1.06270151, "memory(GiB)": 142.32, "step": 163140, "train_speed(iter/s)": 0.286029 }, { "acc": 0.75024967, "epoch": 1.8249360011296796, "grad_norm": 7.625, "learning_rate": 2.0797265659898425e-07, "loss": 0.98533611, "memory(GiB)": 142.32, "step": 163160, "train_speed(iter/s)": 0.286041 }, { "acc": 0.74296017, "epoch": 1.825159700075638, "grad_norm": 5.03125, "learning_rate": 2.0744514093356982e-07, "loss": 1.02720871, "memory(GiB)": 142.32, "step": 163180, "train_speed(iter/s)": 0.286053 }, { "acc": 0.73608651, "epoch": 1.8253833990215966, "grad_norm": 6.28125, "learning_rate": 2.0691828095816048e-07, "loss": 1.04585142, "memory(GiB)": 142.32, "step": 163200, "train_speed(iter/s)": 0.286064 }, { "acc": 0.72402945, "epoch": 1.8256070979675552, "grad_norm": 8.0625, "learning_rate": 2.063920767448402e-07, "loss": 1.09562101, "memory(GiB)": 142.32, "step": 163220, "train_speed(iter/s)": 0.286077 }, { "acc": 0.7322073, "epoch": 1.8258307969135137, "grad_norm": 6.5, "learning_rate": 2.0586652836559973e-07, "loss": 1.08417854, "memory(GiB)": 142.32, "step": 163240, "train_speed(iter/s)": 0.286088 }, { "acc": 0.73837376, "epoch": 1.8260544958594722, "grad_norm": 6.0, "learning_rate": 2.0534163589234213e-07, "loss": 1.06117268, "memory(GiB)": 142.32, "step": 163260, "train_speed(iter/s)": 0.2861 }, { "acc": 0.72399025, "epoch": 1.8262781948054307, "grad_norm": 6.15625, "learning_rate": 2.048173993968805e-07, "loss": 1.10042, "memory(GiB)": 142.32, "step": 163280, "train_speed(iter/s)": 0.286112 }, { "acc": 0.74267263, "epoch": 1.8265018937513893, "grad_norm": 5.75, "learning_rate": 2.042938189509369e-07, "loss": 1.03637791, "memory(GiB)": 142.32, "step": 163300, "train_speed(iter/s)": 0.286124 }, { "acc": 0.7411809, "epoch": 1.8267255926973478, "grad_norm": 5.8125, "learning_rate": 2.0377089462614575e-07, "loss": 1.03257742, "memory(GiB)": 142.32, "step": 163320, "train_speed(iter/s)": 0.286137 }, { "acc": 0.74054356, "epoch": 1.8269492916433063, "grad_norm": 5.40625, "learning_rate": 2.0324862649405085e-07, "loss": 1.03374443, "memory(GiB)": 142.32, "step": 163340, "train_speed(iter/s)": 0.28615 }, { "acc": 0.75236254, "epoch": 1.8271729905892649, "grad_norm": 5.8125, "learning_rate": 2.02727014626104e-07, "loss": 0.97905607, "memory(GiB)": 142.32, "step": 163360, "train_speed(iter/s)": 0.286162 }, { "acc": 0.74030399, "epoch": 1.8273966895352234, "grad_norm": 6.71875, "learning_rate": 2.0220605909367142e-07, "loss": 1.03343945, "memory(GiB)": 142.32, "step": 163380, "train_speed(iter/s)": 0.286175 }, { "acc": 0.74328218, "epoch": 1.827620388481182, "grad_norm": 6.03125, "learning_rate": 2.0168575996802498e-07, "loss": 1.01144495, "memory(GiB)": 142.32, "step": 163400, "train_speed(iter/s)": 0.286186 }, { "acc": 0.73083611, "epoch": 1.8278440874271404, "grad_norm": 5.9375, "learning_rate": 2.011661173203522e-07, "loss": 1.06143398, "memory(GiB)": 142.32, "step": 163420, "train_speed(iter/s)": 0.286197 }, { "acc": 0.74885273, "epoch": 1.828067786373099, "grad_norm": 5.375, "learning_rate": 2.0064713122174394e-07, "loss": 1.00162067, "memory(GiB)": 142.32, "step": 163440, "train_speed(iter/s)": 0.286207 }, { "acc": 0.74923782, "epoch": 1.8282914853190575, "grad_norm": 6.9375, "learning_rate": 2.0012880174320738e-07, "loss": 1.00161266, "memory(GiB)": 142.32, "step": 163460, "train_speed(iter/s)": 0.28622 }, { "acc": 0.73096681, "epoch": 1.828515184265016, "grad_norm": 6.09375, "learning_rate": 1.996111289556568e-07, "loss": 1.07068691, "memory(GiB)": 142.32, "step": 163480, "train_speed(iter/s)": 0.286231 }, { "acc": 0.73465643, "epoch": 1.8287388832109746, "grad_norm": 5.15625, "learning_rate": 1.9909411292991677e-07, "loss": 1.05485554, "memory(GiB)": 142.32, "step": 163500, "train_speed(iter/s)": 0.286242 }, { "acc": 0.74231129, "epoch": 1.828962582156933, "grad_norm": 5.71875, "learning_rate": 1.985777537367234e-07, "loss": 1.02966566, "memory(GiB)": 142.32, "step": 163520, "train_speed(iter/s)": 0.286253 }, { "acc": 0.74048996, "epoch": 1.8291862811028916, "grad_norm": 5.6875, "learning_rate": 1.9806205144672075e-07, "loss": 1.0349988, "memory(GiB)": 142.32, "step": 163540, "train_speed(iter/s)": 0.286265 }, { "acc": 0.74135923, "epoch": 1.8294099800488501, "grad_norm": 6.21875, "learning_rate": 1.9754700613046574e-07, "loss": 1.03443642, "memory(GiB)": 142.32, "step": 163560, "train_speed(iter/s)": 0.286278 }, { "acc": 0.73670635, "epoch": 1.8296336789948087, "grad_norm": 5.6875, "learning_rate": 1.9703261785842252e-07, "loss": 1.0445364, "memory(GiB)": 142.32, "step": 163580, "train_speed(iter/s)": 0.28629 }, { "acc": 0.73899169, "epoch": 1.8298573779407672, "grad_norm": 6.03125, "learning_rate": 1.9651888670096765e-07, "loss": 1.03294868, "memory(GiB)": 142.32, "step": 163600, "train_speed(iter/s)": 0.2863 }, { "acc": 0.73022957, "epoch": 1.8300810768867257, "grad_norm": 6.34375, "learning_rate": 1.9600581272838647e-07, "loss": 1.08290367, "memory(GiB)": 142.32, "step": 163620, "train_speed(iter/s)": 0.286311 }, { "acc": 0.73908682, "epoch": 1.8303047758326842, "grad_norm": 5.25, "learning_rate": 1.9549339601087513e-07, "loss": 1.04688511, "memory(GiB)": 142.32, "step": 163640, "train_speed(iter/s)": 0.286323 }, { "acc": 0.74035826, "epoch": 1.8305284747786428, "grad_norm": 6.15625, "learning_rate": 1.9498163661853974e-07, "loss": 1.0350666, "memory(GiB)": 142.32, "step": 163660, "train_speed(iter/s)": 0.286335 }, { "acc": 0.73632483, "epoch": 1.8307521737246013, "grad_norm": 6.3125, "learning_rate": 1.9447053462139597e-07, "loss": 1.04572315, "memory(GiB)": 142.32, "step": 163680, "train_speed(iter/s)": 0.286347 }, { "acc": 0.73517294, "epoch": 1.8309758726705598, "grad_norm": 6.5625, "learning_rate": 1.93960090089369e-07, "loss": 1.05079279, "memory(GiB)": 142.32, "step": 163700, "train_speed(iter/s)": 0.28636 }, { "acc": 0.74507899, "epoch": 1.8311995716165184, "grad_norm": 6.75, "learning_rate": 1.9345030309229685e-07, "loss": 1.00733662, "memory(GiB)": 142.32, "step": 163720, "train_speed(iter/s)": 0.286372 }, { "acc": 0.74148693, "epoch": 1.831423270562477, "grad_norm": 5.71875, "learning_rate": 1.929411736999237e-07, "loss": 1.03153248, "memory(GiB)": 142.32, "step": 163740, "train_speed(iter/s)": 0.286384 }, { "acc": 0.73503962, "epoch": 1.8316469695084354, "grad_norm": 5.59375, "learning_rate": 1.9243270198190776e-07, "loss": 1.04956493, "memory(GiB)": 142.32, "step": 163760, "train_speed(iter/s)": 0.286395 }, { "acc": 0.72900076, "epoch": 1.831870668454394, "grad_norm": 5.0625, "learning_rate": 1.9192488800781394e-07, "loss": 1.06676655, "memory(GiB)": 142.32, "step": 163780, "train_speed(iter/s)": 0.286408 }, { "acc": 0.73379564, "epoch": 1.8320943674003525, "grad_norm": 6.6875, "learning_rate": 1.9141773184711832e-07, "loss": 1.05992336, "memory(GiB)": 142.32, "step": 163800, "train_speed(iter/s)": 0.28642 }, { "acc": 0.74230037, "epoch": 1.832318066346311, "grad_norm": 5.875, "learning_rate": 1.909112335692076e-07, "loss": 1.03013668, "memory(GiB)": 142.32, "step": 163820, "train_speed(iter/s)": 0.28643 }, { "acc": 0.73983903, "epoch": 1.8325417652922695, "grad_norm": 6.6875, "learning_rate": 1.9040539324337749e-07, "loss": 1.02862682, "memory(GiB)": 142.32, "step": 163840, "train_speed(iter/s)": 0.286442 }, { "acc": 0.73490887, "epoch": 1.832765464238228, "grad_norm": 6.46875, "learning_rate": 1.8990021093883482e-07, "loss": 1.05665827, "memory(GiB)": 142.32, "step": 163860, "train_speed(iter/s)": 0.286453 }, { "acc": 0.7384099, "epoch": 1.8329891631841866, "grad_norm": 6.0, "learning_rate": 1.89395686724696e-07, "loss": 1.03268595, "memory(GiB)": 142.32, "step": 163880, "train_speed(iter/s)": 0.286465 }, { "acc": 0.74834127, "epoch": 1.8332128621301451, "grad_norm": 5.75, "learning_rate": 1.8889182066998578e-07, "loss": 0.96966352, "memory(GiB)": 142.32, "step": 163900, "train_speed(iter/s)": 0.286477 }, { "acc": 0.73331318, "epoch": 1.8334365610761036, "grad_norm": 5.90625, "learning_rate": 1.8838861284364185e-07, "loss": 1.05180902, "memory(GiB)": 142.32, "step": 163920, "train_speed(iter/s)": 0.286489 }, { "acc": 0.7446722, "epoch": 1.8336602600220622, "grad_norm": 5.3125, "learning_rate": 1.8788606331450853e-07, "loss": 1.00923834, "memory(GiB)": 142.32, "step": 163940, "train_speed(iter/s)": 0.2865 }, { "acc": 0.74418044, "epoch": 1.8338839589680207, "grad_norm": 6.875, "learning_rate": 1.8738417215134308e-07, "loss": 1.02210999, "memory(GiB)": 142.32, "step": 163960, "train_speed(iter/s)": 0.28651 }, { "acc": 0.73658276, "epoch": 1.8341076579139792, "grad_norm": 6.09375, "learning_rate": 1.8688293942281167e-07, "loss": 1.0445713, "memory(GiB)": 142.32, "step": 163980, "train_speed(iter/s)": 0.286522 }, { "acc": 0.7493845, "epoch": 1.8343313568599378, "grad_norm": 6.21875, "learning_rate": 1.863823651974883e-07, "loss": 0.98705244, "memory(GiB)": 142.32, "step": 164000, "train_speed(iter/s)": 0.286534 }, { "epoch": 1.8343313568599378, "eval_acc": 0.6963762324656728, "eval_loss": 1.0713787078857422, "eval_runtime": 2339.4767, "eval_samples_per_second": 32.179, "eval_steps_per_second": 16.09, "step": 164000 }, { "acc": 0.72583241, "epoch": 1.8345550558058963, "grad_norm": 7.0625, "learning_rate": 1.8588244954386102e-07, "loss": 1.08985481, "memory(GiB)": 142.32, "step": 164020, "train_speed(iter/s)": 0.285353 }, { "acc": 0.72232308, "epoch": 1.8347787547518548, "grad_norm": 5.40625, "learning_rate": 1.8538319253032288e-07, "loss": 1.09425583, "memory(GiB)": 142.32, "step": 164040, "train_speed(iter/s)": 0.285366 }, { "acc": 0.73655477, "epoch": 1.8350024536978133, "grad_norm": 6.25, "learning_rate": 1.84884594225182e-07, "loss": 1.07116699, "memory(GiB)": 142.32, "step": 164060, "train_speed(iter/s)": 0.285379 }, { "acc": 0.7451787, "epoch": 1.8352261526437719, "grad_norm": 5.78125, "learning_rate": 1.8438665469665218e-07, "loss": 1.0261549, "memory(GiB)": 142.32, "step": 164080, "train_speed(iter/s)": 0.285392 }, { "acc": 0.71921206, "epoch": 1.8354498515897304, "grad_norm": 5.875, "learning_rate": 1.8388937401285832e-07, "loss": 1.1539588, "memory(GiB)": 142.32, "step": 164100, "train_speed(iter/s)": 0.285402 }, { "acc": 0.73873329, "epoch": 1.835673550535689, "grad_norm": 4.75, "learning_rate": 1.8339275224183662e-07, "loss": 1.02620687, "memory(GiB)": 142.32, "step": 164120, "train_speed(iter/s)": 0.285415 }, { "acc": 0.72599554, "epoch": 1.8358972494816475, "grad_norm": 5.25, "learning_rate": 1.8289678945153156e-07, "loss": 1.09684124, "memory(GiB)": 142.32, "step": 164140, "train_speed(iter/s)": 0.285427 }, { "acc": 0.73205719, "epoch": 1.836120948427606, "grad_norm": 5.8125, "learning_rate": 1.8240148570979832e-07, "loss": 1.08178091, "memory(GiB)": 142.32, "step": 164160, "train_speed(iter/s)": 0.285437 }, { "acc": 0.7307416, "epoch": 1.8363446473735645, "grad_norm": 6.6875, "learning_rate": 1.8190684108440103e-07, "loss": 1.0746314, "memory(GiB)": 142.32, "step": 164180, "train_speed(iter/s)": 0.285449 }, { "acc": 0.7306612, "epoch": 1.836568346319523, "grad_norm": 6.84375, "learning_rate": 1.8141285564301335e-07, "loss": 1.0792264, "memory(GiB)": 142.32, "step": 164200, "train_speed(iter/s)": 0.285459 }, { "acc": 0.73734922, "epoch": 1.8367920452654816, "grad_norm": 6.5, "learning_rate": 1.8091952945322178e-07, "loss": 1.041576, "memory(GiB)": 142.32, "step": 164220, "train_speed(iter/s)": 0.28547 }, { "acc": 0.73708262, "epoch": 1.83701574421144, "grad_norm": 5.5625, "learning_rate": 1.804268625825184e-07, "loss": 1.05252028, "memory(GiB)": 142.32, "step": 164240, "train_speed(iter/s)": 0.285482 }, { "acc": 0.74155912, "epoch": 1.8372394431573986, "grad_norm": 6.59375, "learning_rate": 1.799348550983082e-07, "loss": 1.02093697, "memory(GiB)": 142.32, "step": 164260, "train_speed(iter/s)": 0.285494 }, { "acc": 0.72989335, "epoch": 1.8374631421033571, "grad_norm": 6.625, "learning_rate": 1.7944350706790458e-07, "loss": 1.07719755, "memory(GiB)": 142.32, "step": 164280, "train_speed(iter/s)": 0.285504 }, { "acc": 0.72833023, "epoch": 1.8376868410493157, "grad_norm": 8.9375, "learning_rate": 1.7895281855852985e-07, "loss": 1.09647598, "memory(GiB)": 142.32, "step": 164300, "train_speed(iter/s)": 0.285516 }, { "acc": 0.73965793, "epoch": 1.8379105399952742, "grad_norm": 6.21875, "learning_rate": 1.7846278963731921e-07, "loss": 1.04110756, "memory(GiB)": 142.32, "step": 164320, "train_speed(iter/s)": 0.285529 }, { "acc": 0.7301168, "epoch": 1.8381342389412327, "grad_norm": 6.1875, "learning_rate": 1.7797342037131348e-07, "loss": 1.06079025, "memory(GiB)": 142.32, "step": 164340, "train_speed(iter/s)": 0.285541 }, { "acc": 0.73360424, "epoch": 1.8383579378871913, "grad_norm": 5.34375, "learning_rate": 1.7748471082746742e-07, "loss": 1.06174021, "memory(GiB)": 142.32, "step": 164360, "train_speed(iter/s)": 0.285551 }, { "acc": 0.73621526, "epoch": 1.8385816368331498, "grad_norm": 6.53125, "learning_rate": 1.7699666107264257e-07, "loss": 1.04249992, "memory(GiB)": 142.32, "step": 164380, "train_speed(iter/s)": 0.285563 }, { "acc": 0.72847853, "epoch": 1.8388053357791083, "grad_norm": 5.96875, "learning_rate": 1.765092711736105e-07, "loss": 1.07243443, "memory(GiB)": 142.32, "step": 164400, "train_speed(iter/s)": 0.285575 }, { "acc": 0.73428645, "epoch": 1.8390290347250668, "grad_norm": 6.8125, "learning_rate": 1.7602254119705453e-07, "loss": 1.06730928, "memory(GiB)": 142.32, "step": 164420, "train_speed(iter/s)": 0.285587 }, { "acc": 0.7392859, "epoch": 1.8392527336710254, "grad_norm": 5.6875, "learning_rate": 1.7553647120956474e-07, "loss": 1.04468613, "memory(GiB)": 142.32, "step": 164440, "train_speed(iter/s)": 0.285598 }, { "acc": 0.72230043, "epoch": 1.839476432616984, "grad_norm": 5.5625, "learning_rate": 1.7505106127764348e-07, "loss": 1.10523491, "memory(GiB)": 142.32, "step": 164460, "train_speed(iter/s)": 0.285611 }, { "acc": 0.73044472, "epoch": 1.8397001315629424, "grad_norm": 7.125, "learning_rate": 1.7456631146770154e-07, "loss": 1.08107185, "memory(GiB)": 142.32, "step": 164480, "train_speed(iter/s)": 0.285623 }, { "acc": 0.73542271, "epoch": 1.839923830508901, "grad_norm": 6.34375, "learning_rate": 1.7408222184605917e-07, "loss": 1.06260614, "memory(GiB)": 142.32, "step": 164500, "train_speed(iter/s)": 0.285635 }, { "acc": 0.74800792, "epoch": 1.8401475294548595, "grad_norm": 6.25, "learning_rate": 1.735987924789473e-07, "loss": 1.00377979, "memory(GiB)": 142.32, "step": 164520, "train_speed(iter/s)": 0.285647 }, { "acc": 0.74819698, "epoch": 1.840371228400818, "grad_norm": 5.25, "learning_rate": 1.7311602343250522e-07, "loss": 0.9872694, "memory(GiB)": 142.32, "step": 164540, "train_speed(iter/s)": 0.28566 }, { "acc": 0.72895975, "epoch": 1.8405949273467765, "grad_norm": 5.59375, "learning_rate": 1.72633914772784e-07, "loss": 1.07206135, "memory(GiB)": 142.32, "step": 164560, "train_speed(iter/s)": 0.285671 }, { "acc": 0.73053007, "epoch": 1.840818626292735, "grad_norm": 6.59375, "learning_rate": 1.7215246656574203e-07, "loss": 1.08549442, "memory(GiB)": 142.32, "step": 164580, "train_speed(iter/s)": 0.285683 }, { "acc": 0.73426199, "epoch": 1.8410423252386936, "grad_norm": 5.34375, "learning_rate": 1.7167167887724878e-07, "loss": 1.05449257, "memory(GiB)": 142.32, "step": 164600, "train_speed(iter/s)": 0.285695 }, { "acc": 0.74830117, "epoch": 1.8412660241846524, "grad_norm": 5.3125, "learning_rate": 1.7119155177308223e-07, "loss": 0.98601284, "memory(GiB)": 142.32, "step": 164620, "train_speed(iter/s)": 0.285707 }, { "acc": 0.74996777, "epoch": 1.8414897231306109, "grad_norm": 6.46875, "learning_rate": 1.707120853189309e-07, "loss": 0.99421759, "memory(GiB)": 142.32, "step": 164640, "train_speed(iter/s)": 0.285718 }, { "acc": 0.74321356, "epoch": 1.8417134220765694, "grad_norm": 6.3125, "learning_rate": 1.7023327958039292e-07, "loss": 1.02338867, "memory(GiB)": 142.32, "step": 164660, "train_speed(iter/s)": 0.28573 }, { "acc": 0.73320837, "epoch": 1.841937121022528, "grad_norm": 5.5625, "learning_rate": 1.6975513462297477e-07, "loss": 1.05163975, "memory(GiB)": 142.32, "step": 164680, "train_speed(iter/s)": 0.285742 }, { "acc": 0.73127165, "epoch": 1.8421608199684865, "grad_norm": 5.75, "learning_rate": 1.6927765051209523e-07, "loss": 1.08043518, "memory(GiB)": 142.32, "step": 164700, "train_speed(iter/s)": 0.285754 }, { "acc": 0.73641691, "epoch": 1.842384518914445, "grad_norm": 5.3125, "learning_rate": 1.6880082731308033e-07, "loss": 1.05076237, "memory(GiB)": 142.32, "step": 164720, "train_speed(iter/s)": 0.285766 }, { "acc": 0.73067517, "epoch": 1.8426082178604035, "grad_norm": 7.0, "learning_rate": 1.6832466509116462e-07, "loss": 1.0749507, "memory(GiB)": 142.32, "step": 164740, "train_speed(iter/s)": 0.285778 }, { "acc": 0.73303156, "epoch": 1.842831916806362, "grad_norm": 6.625, "learning_rate": 1.6784916391149651e-07, "loss": 1.06778564, "memory(GiB)": 142.32, "step": 164760, "train_speed(iter/s)": 0.28579 }, { "acc": 0.7338048, "epoch": 1.8430556157523206, "grad_norm": 6.28125, "learning_rate": 1.6737432383912955e-07, "loss": 1.04779358, "memory(GiB)": 142.32, "step": 164780, "train_speed(iter/s)": 0.285801 }, { "acc": 0.75254078, "epoch": 1.843279314698279, "grad_norm": 6.25, "learning_rate": 1.669001449390295e-07, "loss": 0.9690897, "memory(GiB)": 142.32, "step": 164800, "train_speed(iter/s)": 0.285813 }, { "acc": 0.72168131, "epoch": 1.8435030136442376, "grad_norm": 5.375, "learning_rate": 1.6642662727607062e-07, "loss": 1.09560251, "memory(GiB)": 142.32, "step": 164820, "train_speed(iter/s)": 0.285825 }, { "acc": 0.73612089, "epoch": 1.8437267125901962, "grad_norm": 5.9375, "learning_rate": 1.6595377091503605e-07, "loss": 1.05406055, "memory(GiB)": 142.32, "step": 164840, "train_speed(iter/s)": 0.285836 }, { "acc": 0.741572, "epoch": 1.8439504115361547, "grad_norm": 6.125, "learning_rate": 1.6548157592062075e-07, "loss": 1.01469402, "memory(GiB)": 142.32, "step": 164860, "train_speed(iter/s)": 0.285848 }, { "acc": 0.73686972, "epoch": 1.8441741104821132, "grad_norm": 4.59375, "learning_rate": 1.650100423574269e-07, "loss": 1.04703312, "memory(GiB)": 142.32, "step": 164880, "train_speed(iter/s)": 0.285861 }, { "acc": 0.7312254, "epoch": 1.8443978094280717, "grad_norm": 5.4375, "learning_rate": 1.6453917028996735e-07, "loss": 1.06583176, "memory(GiB)": 142.32, "step": 164900, "train_speed(iter/s)": 0.285872 }, { "acc": 0.73340015, "epoch": 1.8446215083740303, "grad_norm": 5.9375, "learning_rate": 1.6406895978266392e-07, "loss": 1.07439909, "memory(GiB)": 142.32, "step": 164920, "train_speed(iter/s)": 0.285883 }, { "acc": 0.7461606, "epoch": 1.8448452073199888, "grad_norm": 5.0, "learning_rate": 1.6359941089984787e-07, "loss": 1.0072958, "memory(GiB)": 142.32, "step": 164940, "train_speed(iter/s)": 0.285894 }, { "acc": 0.72558527, "epoch": 1.8450689062659473, "grad_norm": 5.875, "learning_rate": 1.6313052370576066e-07, "loss": 1.10427799, "memory(GiB)": 142.32, "step": 164960, "train_speed(iter/s)": 0.285906 }, { "acc": 0.73055658, "epoch": 1.8452926052119059, "grad_norm": 6.75, "learning_rate": 1.626622982645526e-07, "loss": 1.07831898, "memory(GiB)": 142.32, "step": 164980, "train_speed(iter/s)": 0.285918 }, { "acc": 0.74982214, "epoch": 1.8455163041578644, "grad_norm": 6.6875, "learning_rate": 1.6219473464028358e-07, "loss": 0.96989822, "memory(GiB)": 142.32, "step": 165000, "train_speed(iter/s)": 0.28593 }, { "acc": 0.75055823, "epoch": 1.845740003103823, "grad_norm": 6.65625, "learning_rate": 1.617278328969235e-07, "loss": 0.99339046, "memory(GiB)": 142.32, "step": 165020, "train_speed(iter/s)": 0.285941 }, { "acc": 0.73054609, "epoch": 1.8459637020497814, "grad_norm": 5.8125, "learning_rate": 1.6126159309835022e-07, "loss": 1.08612289, "memory(GiB)": 142.32, "step": 165040, "train_speed(iter/s)": 0.285954 }, { "acc": 0.73497987, "epoch": 1.84618740099574, "grad_norm": 6.53125, "learning_rate": 1.6079601530835264e-07, "loss": 1.05876961, "memory(GiB)": 142.32, "step": 165060, "train_speed(iter/s)": 0.285966 }, { "acc": 0.74524441, "epoch": 1.8464110999416985, "grad_norm": 5.59375, "learning_rate": 1.6033109959062765e-07, "loss": 1.00789261, "memory(GiB)": 142.32, "step": 165080, "train_speed(iter/s)": 0.285978 }, { "acc": 0.74508266, "epoch": 1.846634798887657, "grad_norm": 5.65625, "learning_rate": 1.5986684600878432e-07, "loss": 1.01784668, "memory(GiB)": 142.32, "step": 165100, "train_speed(iter/s)": 0.285988 }, { "acc": 0.72635603, "epoch": 1.8468584978336156, "grad_norm": 7.125, "learning_rate": 1.5940325462633744e-07, "loss": 1.09924564, "memory(GiB)": 142.32, "step": 165120, "train_speed(iter/s)": 0.286 }, { "acc": 0.7336144, "epoch": 1.847082196779574, "grad_norm": 5.625, "learning_rate": 1.589403255067129e-07, "loss": 1.04678669, "memory(GiB)": 142.32, "step": 165140, "train_speed(iter/s)": 0.286012 }, { "acc": 0.74473433, "epoch": 1.8473058957255326, "grad_norm": 6.625, "learning_rate": 1.5847805871324674e-07, "loss": 1.01468401, "memory(GiB)": 142.32, "step": 165160, "train_speed(iter/s)": 0.286024 }, { "acc": 0.74323306, "epoch": 1.8475295946714911, "grad_norm": 5.6875, "learning_rate": 1.5801645430918334e-07, "loss": 1.0162899, "memory(GiB)": 142.32, "step": 165180, "train_speed(iter/s)": 0.286035 }, { "acc": 0.74254808, "epoch": 1.8477532936174497, "grad_norm": 6.375, "learning_rate": 1.5755551235767776e-07, "loss": 1.02107983, "memory(GiB)": 142.32, "step": 165200, "train_speed(iter/s)": 0.286048 }, { "acc": 0.73508196, "epoch": 1.8479769925634082, "grad_norm": 5.9375, "learning_rate": 1.5709523292179174e-07, "loss": 1.04441261, "memory(GiB)": 142.32, "step": 165220, "train_speed(iter/s)": 0.286061 }, { "acc": 0.73394876, "epoch": 1.8482006915093667, "grad_norm": 7.96875, "learning_rate": 1.5663561606449884e-07, "loss": 1.06714058, "memory(GiB)": 142.32, "step": 165240, "train_speed(iter/s)": 0.286072 }, { "acc": 0.74117603, "epoch": 1.8484243904553253, "grad_norm": 5.8125, "learning_rate": 1.561766618486815e-07, "loss": 1.03660707, "memory(GiB)": 142.32, "step": 165260, "train_speed(iter/s)": 0.286084 }, { "acc": 0.73851004, "epoch": 1.8486480894012838, "grad_norm": 7.09375, "learning_rate": 1.5571837033713122e-07, "loss": 1.04708738, "memory(GiB)": 142.32, "step": 165280, "train_speed(iter/s)": 0.286095 }, { "acc": 0.74599295, "epoch": 1.8488717883472423, "grad_norm": 5.8125, "learning_rate": 1.5526074159254835e-07, "loss": 1.01616144, "memory(GiB)": 142.32, "step": 165300, "train_speed(iter/s)": 0.286107 }, { "acc": 0.72462187, "epoch": 1.8490954872932008, "grad_norm": 5.625, "learning_rate": 1.548037756775439e-07, "loss": 1.10943584, "memory(GiB)": 142.32, "step": 165320, "train_speed(iter/s)": 0.286118 }, { "acc": 0.7328414, "epoch": 1.8493191862391594, "grad_norm": 5.34375, "learning_rate": 1.5434747265463568e-07, "loss": 1.06969881, "memory(GiB)": 142.32, "step": 165340, "train_speed(iter/s)": 0.28613 }, { "acc": 0.73959522, "epoch": 1.849542885185118, "grad_norm": 8.6875, "learning_rate": 1.5389183258625428e-07, "loss": 1.02873726, "memory(GiB)": 142.32, "step": 165360, "train_speed(iter/s)": 0.286142 }, { "acc": 0.73893528, "epoch": 1.8497665841310764, "grad_norm": 6.34375, "learning_rate": 1.5343685553473653e-07, "loss": 1.03341122, "memory(GiB)": 142.32, "step": 165380, "train_speed(iter/s)": 0.286154 }, { "acc": 0.74480143, "epoch": 1.849990283077035, "grad_norm": 7.0625, "learning_rate": 1.5298254156233095e-07, "loss": 1.01236, "memory(GiB)": 142.32, "step": 165400, "train_speed(iter/s)": 0.286164 }, { "acc": 0.74140258, "epoch": 1.8502139820229935, "grad_norm": 6.0, "learning_rate": 1.5252889073119336e-07, "loss": 1.03675861, "memory(GiB)": 142.32, "step": 165420, "train_speed(iter/s)": 0.286177 }, { "acc": 0.72449703, "epoch": 1.850437680968952, "grad_norm": 5.75, "learning_rate": 1.5207590310338914e-07, "loss": 1.09832554, "memory(GiB)": 142.32, "step": 165440, "train_speed(iter/s)": 0.286188 }, { "acc": 0.72891121, "epoch": 1.8506613799149105, "grad_norm": 6.1875, "learning_rate": 1.5162357874089483e-07, "loss": 1.08821421, "memory(GiB)": 142.32, "step": 165460, "train_speed(iter/s)": 0.2862 }, { "acc": 0.737432, "epoch": 1.850885078860869, "grad_norm": 6.1875, "learning_rate": 1.5117191770559426e-07, "loss": 1.04127159, "memory(GiB)": 142.32, "step": 165480, "train_speed(iter/s)": 0.28621 }, { "acc": 0.73808122, "epoch": 1.8511087778068276, "grad_norm": 6.78125, "learning_rate": 1.5072092005928073e-07, "loss": 1.02890625, "memory(GiB)": 142.32, "step": 165500, "train_speed(iter/s)": 0.286222 }, { "acc": 0.73780112, "epoch": 1.8513324767527861, "grad_norm": 5.875, "learning_rate": 1.5027058586365828e-07, "loss": 1.05596857, "memory(GiB)": 142.32, "step": 165520, "train_speed(iter/s)": 0.286234 }, { "acc": 0.74012127, "epoch": 1.8515561756987446, "grad_norm": 6.5625, "learning_rate": 1.4982091518033703e-07, "loss": 1.03855639, "memory(GiB)": 142.32, "step": 165540, "train_speed(iter/s)": 0.286246 }, { "acc": 0.7438344, "epoch": 1.8517798746447032, "grad_norm": 5.84375, "learning_rate": 1.4937190807084057e-07, "loss": 1.0083065, "memory(GiB)": 142.32, "step": 165560, "train_speed(iter/s)": 0.286258 }, { "acc": 0.75584574, "epoch": 1.8520035735906617, "grad_norm": 6.28125, "learning_rate": 1.4892356459659807e-07, "loss": 0.95641584, "memory(GiB)": 142.32, "step": 165580, "train_speed(iter/s)": 0.286268 }, { "acc": 0.72822056, "epoch": 1.8522272725366202, "grad_norm": 5.65625, "learning_rate": 1.4847588481894993e-07, "loss": 1.07579098, "memory(GiB)": 142.32, "step": 165600, "train_speed(iter/s)": 0.286281 }, { "acc": 0.74225483, "epoch": 1.8524509714825788, "grad_norm": 7.0, "learning_rate": 1.4802886879914547e-07, "loss": 1.02752647, "memory(GiB)": 142.32, "step": 165620, "train_speed(iter/s)": 0.286291 }, { "acc": 0.73030839, "epoch": 1.8526746704285373, "grad_norm": 8.5, "learning_rate": 1.4758251659834132e-07, "loss": 1.07758512, "memory(GiB)": 142.32, "step": 165640, "train_speed(iter/s)": 0.286301 }, { "acc": 0.73408794, "epoch": 1.8528983693744958, "grad_norm": 5.3125, "learning_rate": 1.47136828277607e-07, "loss": 1.04955482, "memory(GiB)": 142.32, "step": 165660, "train_speed(iter/s)": 0.286312 }, { "acc": 0.74754238, "epoch": 1.8531220683204543, "grad_norm": 6.375, "learning_rate": 1.4669180389791705e-07, "loss": 0.97797165, "memory(GiB)": 142.32, "step": 165680, "train_speed(iter/s)": 0.286323 }, { "acc": 0.73885508, "epoch": 1.8533457672664129, "grad_norm": 6.21875, "learning_rate": 1.4624744352015885e-07, "loss": 1.04332962, "memory(GiB)": 142.32, "step": 165700, "train_speed(iter/s)": 0.286336 }, { "acc": 0.72581024, "epoch": 1.8535694662123714, "grad_norm": 6.8125, "learning_rate": 1.4580374720512659e-07, "loss": 1.10436306, "memory(GiB)": 142.32, "step": 165720, "train_speed(iter/s)": 0.286347 }, { "acc": 0.74526529, "epoch": 1.85379316515833, "grad_norm": 6.0625, "learning_rate": 1.4536071501352333e-07, "loss": 1.00422611, "memory(GiB)": 142.32, "step": 165740, "train_speed(iter/s)": 0.286358 }, { "acc": 0.73792076, "epoch": 1.8540168641042885, "grad_norm": 5.28125, "learning_rate": 1.4491834700596397e-07, "loss": 1.04545946, "memory(GiB)": 142.32, "step": 165760, "train_speed(iter/s)": 0.286371 }, { "acc": 0.74268332, "epoch": 1.854240563050247, "grad_norm": 5.6875, "learning_rate": 1.444766432429695e-07, "loss": 1.02358036, "memory(GiB)": 142.32, "step": 165780, "train_speed(iter/s)": 0.286383 }, { "acc": 0.75021243, "epoch": 1.8544642619962055, "grad_norm": 5.90625, "learning_rate": 1.440356037849716e-07, "loss": 0.98068466, "memory(GiB)": 142.32, "step": 165800, "train_speed(iter/s)": 0.286395 }, { "acc": 0.72618523, "epoch": 1.854687960942164, "grad_norm": 5.875, "learning_rate": 1.4359522869231035e-07, "loss": 1.10520239, "memory(GiB)": 142.32, "step": 165820, "train_speed(iter/s)": 0.286406 }, { "acc": 0.74585166, "epoch": 1.8549116598881226, "grad_norm": 5.65625, "learning_rate": 1.431555180252364e-07, "loss": 1.01467276, "memory(GiB)": 142.32, "step": 165840, "train_speed(iter/s)": 0.286419 }, { "acc": 0.7410408, "epoch": 1.855135358834081, "grad_norm": 6.53125, "learning_rate": 1.427164718439078e-07, "loss": 1.03135166, "memory(GiB)": 142.32, "step": 165860, "train_speed(iter/s)": 0.28643 }, { "acc": 0.73515558, "epoch": 1.8553590577800396, "grad_norm": 5.53125, "learning_rate": 1.4227809020839256e-07, "loss": 1.04190483, "memory(GiB)": 142.32, "step": 165880, "train_speed(iter/s)": 0.286441 }, { "acc": 0.7324214, "epoch": 1.8555827567259982, "grad_norm": 6.3125, "learning_rate": 1.4184037317866716e-07, "loss": 1.06229038, "memory(GiB)": 142.32, "step": 165900, "train_speed(iter/s)": 0.286453 }, { "acc": 0.73847733, "epoch": 1.8558064556719567, "grad_norm": 5.0625, "learning_rate": 1.4140332081461761e-07, "loss": 1.02315235, "memory(GiB)": 142.32, "step": 165920, "train_speed(iter/s)": 0.286465 }, { "acc": 0.73723745, "epoch": 1.8560301546179152, "grad_norm": 6.0625, "learning_rate": 1.4096693317603938e-07, "loss": 1.04019175, "memory(GiB)": 142.32, "step": 165940, "train_speed(iter/s)": 0.286476 }, { "acc": 0.75101018, "epoch": 1.8562538535638737, "grad_norm": 5.75, "learning_rate": 1.4053121032263638e-07, "loss": 0.99481564, "memory(GiB)": 142.32, "step": 165960, "train_speed(iter/s)": 0.286489 }, { "acc": 0.74666739, "epoch": 1.8564775525098323, "grad_norm": 4.90625, "learning_rate": 1.400961523140215e-07, "loss": 0.99924126, "memory(GiB)": 142.32, "step": 165980, "train_speed(iter/s)": 0.2865 }, { "acc": 0.72896166, "epoch": 1.8567012514557908, "grad_norm": 5.40625, "learning_rate": 1.3966175920971715e-07, "loss": 1.07981377, "memory(GiB)": 142.32, "step": 166000, "train_speed(iter/s)": 0.286512 }, { "epoch": 1.8567012514557908, "eval_acc": 0.6963575983300659, "eval_loss": 1.071368932723999, "eval_runtime": 2340.8738, "eval_samples_per_second": 32.16, "eval_steps_per_second": 16.08, "step": 166000 }, { "acc": 0.73117695, "epoch": 1.8569249504017493, "grad_norm": 6.25, "learning_rate": 1.3922803106915406e-07, "loss": 1.06938648, "memory(GiB)": 142.32, "step": 166020, "train_speed(iter/s)": 0.285345 }, { "acc": 0.74292831, "epoch": 1.8571486493477078, "grad_norm": 6.78125, "learning_rate": 1.3879496795167313e-07, "loss": 1.02270603, "memory(GiB)": 142.32, "step": 166040, "train_speed(iter/s)": 0.285357 }, { "acc": 0.73797607, "epoch": 1.8573723482936664, "grad_norm": 6.96875, "learning_rate": 1.3836256991652308e-07, "loss": 1.04723797, "memory(GiB)": 142.32, "step": 166060, "train_speed(iter/s)": 0.285368 }, { "acc": 0.73932648, "epoch": 1.857596047239625, "grad_norm": 5.65625, "learning_rate": 1.3793083702286214e-07, "loss": 1.02903156, "memory(GiB)": 142.32, "step": 166080, "train_speed(iter/s)": 0.285379 }, { "acc": 0.74235582, "epoch": 1.8578197461855834, "grad_norm": 5.84375, "learning_rate": 1.3749976932975807e-07, "loss": 1.01224594, "memory(GiB)": 142.32, "step": 166100, "train_speed(iter/s)": 0.285392 }, { "acc": 0.72932734, "epoch": 1.858043445131542, "grad_norm": 7.15625, "learning_rate": 1.370693668961859e-07, "loss": 1.08864861, "memory(GiB)": 142.32, "step": 166120, "train_speed(iter/s)": 0.285404 }, { "acc": 0.73234587, "epoch": 1.8582671440775005, "grad_norm": 5.9375, "learning_rate": 1.366396297810324e-07, "loss": 1.05717726, "memory(GiB)": 142.32, "step": 166140, "train_speed(iter/s)": 0.285415 }, { "acc": 0.73432589, "epoch": 1.858490843023459, "grad_norm": 6.0625, "learning_rate": 1.3621055804309115e-07, "loss": 1.05960579, "memory(GiB)": 142.32, "step": 166160, "train_speed(iter/s)": 0.285427 }, { "acc": 0.73619585, "epoch": 1.8587145419694175, "grad_norm": 4.96875, "learning_rate": 1.3578215174106403e-07, "loss": 1.04713469, "memory(GiB)": 142.32, "step": 166180, "train_speed(iter/s)": 0.285435 }, { "acc": 0.7462574, "epoch": 1.858938240915376, "grad_norm": 6.5, "learning_rate": 1.3535441093356526e-07, "loss": 1.00476761, "memory(GiB)": 142.32, "step": 166200, "train_speed(iter/s)": 0.285447 }, { "acc": 0.73418493, "epoch": 1.8591619398613346, "grad_norm": 6.34375, "learning_rate": 1.349273356791142e-07, "loss": 1.04171791, "memory(GiB)": 142.32, "step": 166220, "train_speed(iter/s)": 0.285458 }, { "acc": 0.725808, "epoch": 1.8593856388072931, "grad_norm": 6.71875, "learning_rate": 1.3450092603614185e-07, "loss": 1.07703991, "memory(GiB)": 142.32, "step": 166240, "train_speed(iter/s)": 0.28547 }, { "acc": 0.73621225, "epoch": 1.8596093377532517, "grad_norm": 6.0625, "learning_rate": 1.3407518206298708e-07, "loss": 1.04448929, "memory(GiB)": 142.32, "step": 166260, "train_speed(iter/s)": 0.285483 }, { "acc": 0.73346348, "epoch": 1.8598330366992102, "grad_norm": 6.90625, "learning_rate": 1.3365010381789722e-07, "loss": 1.07387047, "memory(GiB)": 142.32, "step": 166280, "train_speed(iter/s)": 0.285494 }, { "acc": 0.73863316, "epoch": 1.8600567356451687, "grad_norm": 5.75, "learning_rate": 1.3322569135902963e-07, "loss": 1.04974766, "memory(GiB)": 142.32, "step": 166300, "train_speed(iter/s)": 0.285505 }, { "acc": 0.73425145, "epoch": 1.8602804345911272, "grad_norm": 4.96875, "learning_rate": 1.3280194474444895e-07, "loss": 1.05245676, "memory(GiB)": 142.32, "step": 166320, "train_speed(iter/s)": 0.285515 }, { "acc": 0.7440217, "epoch": 1.8605041335370858, "grad_norm": 6.53125, "learning_rate": 1.3237886403213106e-07, "loss": 1.00610733, "memory(GiB)": 142.32, "step": 166340, "train_speed(iter/s)": 0.285527 }, { "acc": 0.74726214, "epoch": 1.8607278324830443, "grad_norm": 6.1875, "learning_rate": 1.319564492799591e-07, "loss": 1.0024622, "memory(GiB)": 142.32, "step": 166360, "train_speed(iter/s)": 0.28554 }, { "acc": 0.73899713, "epoch": 1.8609515314290028, "grad_norm": 7.3125, "learning_rate": 1.315347005457246e-07, "loss": 1.04116068, "memory(GiB)": 142.32, "step": 166380, "train_speed(iter/s)": 0.285551 }, { "acc": 0.72711143, "epoch": 1.8611752303749614, "grad_norm": 5.0625, "learning_rate": 1.3111361788713027e-07, "loss": 1.08891296, "memory(GiB)": 142.32, "step": 166400, "train_speed(iter/s)": 0.285563 }, { "acc": 0.73343649, "epoch": 1.8613989293209199, "grad_norm": 4.90625, "learning_rate": 1.3069320136178453e-07, "loss": 1.06121483, "memory(GiB)": 142.32, "step": 166420, "train_speed(iter/s)": 0.285575 }, { "acc": 0.73881989, "epoch": 1.8616226282668784, "grad_norm": 7.90625, "learning_rate": 1.30273451027208e-07, "loss": 1.03070889, "memory(GiB)": 142.32, "step": 166440, "train_speed(iter/s)": 0.285585 }, { "acc": 0.74464893, "epoch": 1.861846327212837, "grad_norm": 7.0625, "learning_rate": 1.2985436694082808e-07, "loss": 1.01819954, "memory(GiB)": 142.32, "step": 166460, "train_speed(iter/s)": 0.285598 }, { "acc": 0.73036871, "epoch": 1.8620700261587955, "grad_norm": 6.1875, "learning_rate": 1.2943594915998003e-07, "loss": 1.07146025, "memory(GiB)": 142.32, "step": 166480, "train_speed(iter/s)": 0.28561 }, { "acc": 0.73309712, "epoch": 1.862293725104754, "grad_norm": 5.40625, "learning_rate": 1.2901819774191137e-07, "loss": 1.0738965, "memory(GiB)": 142.32, "step": 166500, "train_speed(iter/s)": 0.285621 }, { "acc": 0.73131218, "epoch": 1.8625174240507125, "grad_norm": 5.34375, "learning_rate": 1.286011127437753e-07, "loss": 1.07355919, "memory(GiB)": 142.32, "step": 166520, "train_speed(iter/s)": 0.285633 }, { "acc": 0.73363447, "epoch": 1.862741122996671, "grad_norm": 7.71875, "learning_rate": 1.2818469422263558e-07, "loss": 1.06457644, "memory(GiB)": 142.32, "step": 166540, "train_speed(iter/s)": 0.285647 }, { "acc": 0.72356572, "epoch": 1.8629648219426296, "grad_norm": 6.9375, "learning_rate": 1.2776894223546442e-07, "loss": 1.11250401, "memory(GiB)": 142.32, "step": 166560, "train_speed(iter/s)": 0.28566 }, { "acc": 0.73317118, "epoch": 1.863188520888588, "grad_norm": 4.71875, "learning_rate": 1.2735385683914138e-07, "loss": 1.0488039, "memory(GiB)": 142.32, "step": 166580, "train_speed(iter/s)": 0.285672 }, { "acc": 0.74105139, "epoch": 1.8634122198345466, "grad_norm": 5.375, "learning_rate": 1.2693943809045705e-07, "loss": 1.02603111, "memory(GiB)": 142.32, "step": 166600, "train_speed(iter/s)": 0.285684 }, { "acc": 0.74343805, "epoch": 1.8636359187805052, "grad_norm": 4.5625, "learning_rate": 1.2652568604610949e-07, "loss": 1.01715422, "memory(GiB)": 142.32, "step": 166620, "train_speed(iter/s)": 0.285694 }, { "acc": 0.73265643, "epoch": 1.8638596177264637, "grad_norm": 5.5, "learning_rate": 1.261126007627067e-07, "loss": 1.06655102, "memory(GiB)": 142.32, "step": 166640, "train_speed(iter/s)": 0.285705 }, { "acc": 0.72986231, "epoch": 1.8640833166724222, "grad_norm": 5.90625, "learning_rate": 1.2570018229676352e-07, "loss": 1.09926052, "memory(GiB)": 142.32, "step": 166660, "train_speed(iter/s)": 0.285717 }, { "acc": 0.73665929, "epoch": 1.8643070156183807, "grad_norm": 6.15625, "learning_rate": 1.2528843070470531e-07, "loss": 1.03960924, "memory(GiB)": 142.32, "step": 166680, "train_speed(iter/s)": 0.285729 }, { "acc": 0.75790291, "epoch": 1.8645307145643393, "grad_norm": 6.5625, "learning_rate": 1.2487734604286595e-07, "loss": 0.95694561, "memory(GiB)": 142.32, "step": 166700, "train_speed(iter/s)": 0.28574 }, { "acc": 0.74543018, "epoch": 1.8647544135102978, "grad_norm": 4.5, "learning_rate": 1.2446692836748652e-07, "loss": 0.99698772, "memory(GiB)": 142.32, "step": 166720, "train_speed(iter/s)": 0.285752 }, { "acc": 0.74143734, "epoch": 1.8649781124562563, "grad_norm": 7.125, "learning_rate": 1.240571777347188e-07, "loss": 1.02929516, "memory(GiB)": 142.32, "step": 166740, "train_speed(iter/s)": 0.285764 }, { "acc": 0.7345602, "epoch": 1.8652018114022149, "grad_norm": 6.875, "learning_rate": 1.2364809420062295e-07, "loss": 1.05566444, "memory(GiB)": 142.32, "step": 166760, "train_speed(iter/s)": 0.285774 }, { "acc": 0.73210592, "epoch": 1.8654255103481734, "grad_norm": 5.6875, "learning_rate": 1.2323967782116642e-07, "loss": 1.05013685, "memory(GiB)": 142.32, "step": 166780, "train_speed(iter/s)": 0.285786 }, { "acc": 0.73686018, "epoch": 1.865649209294132, "grad_norm": 6.6875, "learning_rate": 1.2283192865222726e-07, "loss": 1.04418812, "memory(GiB)": 142.32, "step": 166800, "train_speed(iter/s)": 0.285798 }, { "acc": 0.73718777, "epoch": 1.8658729082400904, "grad_norm": 7.5, "learning_rate": 1.2242484674959031e-07, "loss": 1.04288931, "memory(GiB)": 142.32, "step": 166820, "train_speed(iter/s)": 0.285808 }, { "acc": 0.73518543, "epoch": 1.866096607186049, "grad_norm": 7.3125, "learning_rate": 1.2201843216895216e-07, "loss": 1.06209087, "memory(GiB)": 142.32, "step": 166840, "train_speed(iter/s)": 0.28582 }, { "acc": 0.74135351, "epoch": 1.8663203061320075, "grad_norm": 6.625, "learning_rate": 1.2161268496591438e-07, "loss": 1.03363037, "memory(GiB)": 142.32, "step": 166860, "train_speed(iter/s)": 0.285831 }, { "acc": 0.74088907, "epoch": 1.866544005077966, "grad_norm": 6.53125, "learning_rate": 1.212076051959893e-07, "loss": 1.0331501, "memory(GiB)": 142.32, "step": 166880, "train_speed(iter/s)": 0.285844 }, { "acc": 0.72692766, "epoch": 1.8667677040239246, "grad_norm": 6.0, "learning_rate": 1.2080319291459807e-07, "loss": 1.08878002, "memory(GiB)": 142.32, "step": 166900, "train_speed(iter/s)": 0.285856 }, { "acc": 0.74848032, "epoch": 1.866991402969883, "grad_norm": 7.6875, "learning_rate": 1.2039944817706928e-07, "loss": 0.98898611, "memory(GiB)": 142.32, "step": 166920, "train_speed(iter/s)": 0.285869 }, { "acc": 0.73998036, "epoch": 1.8672151019158416, "grad_norm": 6.0, "learning_rate": 1.1999637103864205e-07, "loss": 1.03625727, "memory(GiB)": 142.32, "step": 166940, "train_speed(iter/s)": 0.285881 }, { "acc": 0.72934284, "epoch": 1.8674388008618001, "grad_norm": 6.8125, "learning_rate": 1.195939615544628e-07, "loss": 1.07787895, "memory(GiB)": 142.32, "step": 166960, "train_speed(iter/s)": 0.285895 }, { "acc": 0.73492346, "epoch": 1.8676624998077587, "grad_norm": 5.5, "learning_rate": 1.1919221977958694e-07, "loss": 1.05264587, "memory(GiB)": 142.32, "step": 166980, "train_speed(iter/s)": 0.285906 }, { "acc": 0.75200033, "epoch": 1.8678861987537172, "grad_norm": 7.40625, "learning_rate": 1.1879114576897777e-07, "loss": 0.98317318, "memory(GiB)": 142.32, "step": 167000, "train_speed(iter/s)": 0.285917 }, { "acc": 0.72773457, "epoch": 1.8681098976996757, "grad_norm": 6.46875, "learning_rate": 1.1839073957750801e-07, "loss": 1.09247417, "memory(GiB)": 142.32, "step": 167020, "train_speed(iter/s)": 0.285927 }, { "acc": 0.73415537, "epoch": 1.8683335966456343, "grad_norm": 6.25, "learning_rate": 1.1799100125995999e-07, "loss": 1.06356468, "memory(GiB)": 142.32, "step": 167040, "train_speed(iter/s)": 0.28594 }, { "acc": 0.74013529, "epoch": 1.8685572955915928, "grad_norm": 6.21875, "learning_rate": 1.1759193087102216e-07, "loss": 1.03731327, "memory(GiB)": 142.32, "step": 167060, "train_speed(iter/s)": 0.285952 }, { "acc": 0.74118948, "epoch": 1.8687809945375513, "grad_norm": 6.46875, "learning_rate": 1.1719352846529475e-07, "loss": 1.03210983, "memory(GiB)": 142.32, "step": 167080, "train_speed(iter/s)": 0.285964 }, { "acc": 0.72622967, "epoch": 1.8690046934835098, "grad_norm": 5.375, "learning_rate": 1.1679579409728415e-07, "loss": 1.10596828, "memory(GiB)": 142.32, "step": 167100, "train_speed(iter/s)": 0.285975 }, { "acc": 0.74585285, "epoch": 1.8692283924294684, "grad_norm": 6.6875, "learning_rate": 1.1639872782140516e-07, "loss": 1.01183739, "memory(GiB)": 142.32, "step": 167120, "train_speed(iter/s)": 0.285988 }, { "acc": 0.73142376, "epoch": 1.8694520913754271, "grad_norm": 7.0625, "learning_rate": 1.1600232969198322e-07, "loss": 1.0704073, "memory(GiB)": 142.32, "step": 167140, "train_speed(iter/s)": 0.286 }, { "acc": 0.73251066, "epoch": 1.8696757903213856, "grad_norm": 6.25, "learning_rate": 1.1560659976325106e-07, "loss": 1.07159481, "memory(GiB)": 142.32, "step": 167160, "train_speed(iter/s)": 0.286012 }, { "acc": 0.73744102, "epoch": 1.8698994892673442, "grad_norm": 5.28125, "learning_rate": 1.1521153808935037e-07, "loss": 1.05684166, "memory(GiB)": 142.32, "step": 167180, "train_speed(iter/s)": 0.286022 }, { "acc": 0.7379425, "epoch": 1.8701231882133027, "grad_norm": 6.5, "learning_rate": 1.148171447243307e-07, "loss": 1.05149412, "memory(GiB)": 142.32, "step": 167200, "train_speed(iter/s)": 0.286035 }, { "acc": 0.74561191, "epoch": 1.8703468871592612, "grad_norm": 6.375, "learning_rate": 1.144234197221511e-07, "loss": 1.01429081, "memory(GiB)": 142.32, "step": 167220, "train_speed(iter/s)": 0.286047 }, { "acc": 0.74087453, "epoch": 1.8705705861052198, "grad_norm": 6.3125, "learning_rate": 1.1403036313667848e-07, "loss": 1.03257523, "memory(GiB)": 142.32, "step": 167240, "train_speed(iter/s)": 0.28606 }, { "acc": 0.73133178, "epoch": 1.8707942850511783, "grad_norm": 7.25, "learning_rate": 1.1363797502168872e-07, "loss": 1.07304621, "memory(GiB)": 142.32, "step": 167260, "train_speed(iter/s)": 0.286073 }, { "acc": 0.73499007, "epoch": 1.8710179839971368, "grad_norm": 7.09375, "learning_rate": 1.1324625543086665e-07, "loss": 1.04214897, "memory(GiB)": 142.32, "step": 167280, "train_speed(iter/s)": 0.286085 }, { "acc": 0.75344563, "epoch": 1.8712416829430953, "grad_norm": 6.4375, "learning_rate": 1.128552044178044e-07, "loss": 0.96315231, "memory(GiB)": 142.32, "step": 167300, "train_speed(iter/s)": 0.286098 }, { "acc": 0.73157482, "epoch": 1.8714653818890539, "grad_norm": 5.4375, "learning_rate": 1.1246482203600307e-07, "loss": 1.06168108, "memory(GiB)": 142.32, "step": 167320, "train_speed(iter/s)": 0.28611 }, { "acc": 0.73468704, "epoch": 1.8716890808350124, "grad_norm": 7.0, "learning_rate": 1.120751083388738e-07, "loss": 1.05027046, "memory(GiB)": 142.32, "step": 167340, "train_speed(iter/s)": 0.286121 }, { "acc": 0.73869715, "epoch": 1.871912779780971, "grad_norm": 6.6875, "learning_rate": 1.1168606337973398e-07, "loss": 1.0379221, "memory(GiB)": 142.32, "step": 167360, "train_speed(iter/s)": 0.286132 }, { "acc": 0.75091634, "epoch": 1.8721364787269295, "grad_norm": 5.6875, "learning_rate": 1.11297687211811e-07, "loss": 0.9923275, "memory(GiB)": 142.32, "step": 167380, "train_speed(iter/s)": 0.286145 }, { "acc": 0.73387709, "epoch": 1.872360177672888, "grad_norm": 5.6875, "learning_rate": 1.1090997988824016e-07, "loss": 1.03611717, "memory(GiB)": 142.32, "step": 167400, "train_speed(iter/s)": 0.286156 }, { "acc": 0.71785469, "epoch": 1.8725838766188465, "grad_norm": 5.71875, "learning_rate": 1.1052294146206455e-07, "loss": 1.1375968, "memory(GiB)": 142.32, "step": 167420, "train_speed(iter/s)": 0.286168 }, { "acc": 0.73302031, "epoch": 1.872807575564805, "grad_norm": 6.28125, "learning_rate": 1.1013657198623795e-07, "loss": 1.05997601, "memory(GiB)": 142.32, "step": 167440, "train_speed(iter/s)": 0.286181 }, { "acc": 0.7342267, "epoch": 1.8730312745107636, "grad_norm": 6.65625, "learning_rate": 1.0975087151361975e-07, "loss": 1.04626331, "memory(GiB)": 142.32, "step": 167460, "train_speed(iter/s)": 0.286192 }, { "acc": 0.73974695, "epoch": 1.873254973456722, "grad_norm": 6.1875, "learning_rate": 1.0936584009698104e-07, "loss": 1.01062498, "memory(GiB)": 142.32, "step": 167480, "train_speed(iter/s)": 0.286203 }, { "acc": 0.73167715, "epoch": 1.8734786724026806, "grad_norm": 5.5, "learning_rate": 1.0898147778899859e-07, "loss": 1.05269642, "memory(GiB)": 142.32, "step": 167500, "train_speed(iter/s)": 0.286214 }, { "acc": 0.73197689, "epoch": 1.8737023713486392, "grad_norm": 5.78125, "learning_rate": 1.0859778464225811e-07, "loss": 1.05322914, "memory(GiB)": 142.32, "step": 167520, "train_speed(iter/s)": 0.286225 }, { "acc": 0.74604869, "epoch": 1.8739260702945977, "grad_norm": 8.125, "learning_rate": 1.0821476070925541e-07, "loss": 1.00992336, "memory(GiB)": 142.32, "step": 167540, "train_speed(iter/s)": 0.286236 }, { "acc": 0.74809713, "epoch": 1.8741497692405562, "grad_norm": 5.625, "learning_rate": 1.0783240604239242e-07, "loss": 0.98543835, "memory(GiB)": 142.32, "step": 167560, "train_speed(iter/s)": 0.286247 }, { "acc": 0.73035994, "epoch": 1.8743734681865147, "grad_norm": 5.4375, "learning_rate": 1.0745072069398233e-07, "loss": 1.09233818, "memory(GiB)": 142.32, "step": 167580, "train_speed(iter/s)": 0.286258 }, { "acc": 0.72910004, "epoch": 1.8745971671324733, "grad_norm": 5.96875, "learning_rate": 1.0706970471624445e-07, "loss": 1.09222298, "memory(GiB)": 142.32, "step": 167600, "train_speed(iter/s)": 0.28627 }, { "acc": 0.74003534, "epoch": 1.8748208660784318, "grad_norm": 7.0, "learning_rate": 1.0668935816130599e-07, "loss": 1.04568024, "memory(GiB)": 142.32, "step": 167620, "train_speed(iter/s)": 0.28628 }, { "acc": 0.74223042, "epoch": 1.8750445650243903, "grad_norm": 7.0, "learning_rate": 1.0630968108120587e-07, "loss": 1.00760555, "memory(GiB)": 142.32, "step": 167640, "train_speed(iter/s)": 0.286292 }, { "acc": 0.7404542, "epoch": 1.8752682639703488, "grad_norm": 7.59375, "learning_rate": 1.0593067352788754e-07, "loss": 1.04292908, "memory(GiB)": 142.32, "step": 167660, "train_speed(iter/s)": 0.286304 }, { "acc": 0.73788791, "epoch": 1.8754919629163074, "grad_norm": 5.1875, "learning_rate": 1.055523355532062e-07, "loss": 1.03734121, "memory(GiB)": 142.32, "step": 167680, "train_speed(iter/s)": 0.286316 }, { "acc": 0.74897156, "epoch": 1.875715661862266, "grad_norm": 6.15625, "learning_rate": 1.0517466720892321e-07, "loss": 0.98741112, "memory(GiB)": 142.32, "step": 167700, "train_speed(iter/s)": 0.286328 }, { "acc": 0.74008555, "epoch": 1.8759393608082244, "grad_norm": 6.71875, "learning_rate": 1.047976685467078e-07, "loss": 1.03481607, "memory(GiB)": 142.32, "step": 167720, "train_speed(iter/s)": 0.28634 }, { "acc": 0.7307322, "epoch": 1.876163059754183, "grad_norm": 5.90625, "learning_rate": 1.0442133961814093e-07, "loss": 1.07929726, "memory(GiB)": 142.32, "step": 167740, "train_speed(iter/s)": 0.286352 }, { "acc": 0.74156041, "epoch": 1.8763867587001415, "grad_norm": 6.8125, "learning_rate": 1.0404568047470808e-07, "loss": 1.02543621, "memory(GiB)": 142.32, "step": 167760, "train_speed(iter/s)": 0.286364 }, { "acc": 0.73375015, "epoch": 1.8766104576461, "grad_norm": 6.6875, "learning_rate": 1.0367069116780593e-07, "loss": 1.05200853, "memory(GiB)": 142.32, "step": 167780, "train_speed(iter/s)": 0.286375 }, { "acc": 0.72969985, "epoch": 1.8768341565920585, "grad_norm": 5.8125, "learning_rate": 1.0329637174873841e-07, "loss": 1.08337383, "memory(GiB)": 142.32, "step": 167800, "train_speed(iter/s)": 0.286386 }, { "acc": 0.7274148, "epoch": 1.877057855538017, "grad_norm": 5.8125, "learning_rate": 1.0292272226871625e-07, "loss": 1.10040445, "memory(GiB)": 142.32, "step": 167820, "train_speed(iter/s)": 0.286398 }, { "acc": 0.73038692, "epoch": 1.8772815544839756, "grad_norm": 5.96875, "learning_rate": 1.0254974277886132e-07, "loss": 1.07132196, "memory(GiB)": 142.32, "step": 167840, "train_speed(iter/s)": 0.28641 }, { "acc": 0.74019489, "epoch": 1.8775052534299341, "grad_norm": 7.0625, "learning_rate": 1.0217743333020225e-07, "loss": 1.04262123, "memory(GiB)": 142.32, "step": 167860, "train_speed(iter/s)": 0.286421 }, { "acc": 0.7366807, "epoch": 1.8777289523758927, "grad_norm": 7.375, "learning_rate": 1.0180579397367662e-07, "loss": 1.03074436, "memory(GiB)": 142.32, "step": 167880, "train_speed(iter/s)": 0.286432 }, { "acc": 0.73747101, "epoch": 1.8779526513218512, "grad_norm": 5.5625, "learning_rate": 1.0143482476012933e-07, "loss": 1.02939968, "memory(GiB)": 142.32, "step": 167900, "train_speed(iter/s)": 0.286447 }, { "acc": 0.73861399, "epoch": 1.8781763502678097, "grad_norm": 5.96875, "learning_rate": 1.0106452574031477e-07, "loss": 1.04213085, "memory(GiB)": 142.32, "step": 167920, "train_speed(iter/s)": 0.286457 }, { "acc": 0.74089508, "epoch": 1.8784000492137682, "grad_norm": 7.0, "learning_rate": 1.0069489696489521e-07, "loss": 1.02992935, "memory(GiB)": 142.32, "step": 167940, "train_speed(iter/s)": 0.286469 }, { "acc": 0.72919149, "epoch": 1.8786237481597268, "grad_norm": 5.90625, "learning_rate": 1.0032593848444072e-07, "loss": 1.08005486, "memory(GiB)": 142.32, "step": 167960, "train_speed(iter/s)": 0.28648 }, { "acc": 0.74613256, "epoch": 1.8788474471056853, "grad_norm": 7.1875, "learning_rate": 9.99576503494304e-08, "loss": 1.00466328, "memory(GiB)": 142.32, "step": 167980, "train_speed(iter/s)": 0.286492 }, { "acc": 0.73688011, "epoch": 1.8790711460516438, "grad_norm": 7.03125, "learning_rate": 9.959003261025168e-08, "loss": 1.02920885, "memory(GiB)": 142.32, "step": 168000, "train_speed(iter/s)": 0.286503 }, { "epoch": 1.8790711460516438, "eval_acc": 0.6963662252446987, "eval_loss": 1.0713822841644287, "eval_runtime": 2339.7791, "eval_samples_per_second": 32.175, "eval_steps_per_second": 16.088, "step": 168000 }, { "acc": 0.73310699, "epoch": 1.8792948449976024, "grad_norm": 5.34375, "learning_rate": 9.922308531719882e-08, "loss": 1.06813869, "memory(GiB)": 142.32, "step": 168020, "train_speed(iter/s)": 0.285352 }, { "acc": 0.72942958, "epoch": 1.8795185439435609, "grad_norm": 5.625, "learning_rate": 9.885680852047607e-08, "loss": 1.09337397, "memory(GiB)": 142.32, "step": 168040, "train_speed(iter/s)": 0.285363 }, { "acc": 0.73150272, "epoch": 1.8797422428895194, "grad_norm": 5.65625, "learning_rate": 9.849120227019559e-08, "loss": 1.0729538, "memory(GiB)": 142.32, "step": 168060, "train_speed(iter/s)": 0.285375 }, { "acc": 0.73322668, "epoch": 1.879965941835478, "grad_norm": 5.5, "learning_rate": 9.812626661637792e-08, "loss": 1.06772623, "memory(GiB)": 142.32, "step": 168080, "train_speed(iter/s)": 0.285386 }, { "acc": 0.75019331, "epoch": 1.8801896407814365, "grad_norm": 6.5, "learning_rate": 9.776200160895033e-08, "loss": 0.97336102, "memory(GiB)": 142.32, "step": 168100, "train_speed(iter/s)": 0.285398 }, { "acc": 0.72886543, "epoch": 1.880413339727395, "grad_norm": 6.5625, "learning_rate": 9.739840729775019e-08, "loss": 1.07117214, "memory(GiB)": 142.32, "step": 168120, "train_speed(iter/s)": 0.285411 }, { "acc": 0.73436747, "epoch": 1.8806370386733535, "grad_norm": 7.34375, "learning_rate": 9.703548373252214e-08, "loss": 1.05455589, "memory(GiB)": 142.32, "step": 168140, "train_speed(iter/s)": 0.285422 }, { "acc": 0.73986797, "epoch": 1.880860737619312, "grad_norm": 6.5, "learning_rate": 9.667323096291981e-08, "loss": 1.03595238, "memory(GiB)": 142.32, "step": 168160, "train_speed(iter/s)": 0.285435 }, { "acc": 0.73362727, "epoch": 1.8810844365652706, "grad_norm": 6.0, "learning_rate": 9.63116490385041e-08, "loss": 1.06598015, "memory(GiB)": 142.32, "step": 168180, "train_speed(iter/s)": 0.285446 }, { "acc": 0.74967632, "epoch": 1.881308135511229, "grad_norm": 5.875, "learning_rate": 9.595073800874377e-08, "loss": 0.99485512, "memory(GiB)": 142.32, "step": 168200, "train_speed(iter/s)": 0.285457 }, { "acc": 0.73687954, "epoch": 1.8815318344571876, "grad_norm": 5.59375, "learning_rate": 9.559049792301822e-08, "loss": 1.04164228, "memory(GiB)": 142.32, "step": 168220, "train_speed(iter/s)": 0.285469 }, { "acc": 0.75130568, "epoch": 1.8817555334031462, "grad_norm": 5.375, "learning_rate": 9.523092883061246e-08, "loss": 0.9827035, "memory(GiB)": 142.32, "step": 168240, "train_speed(iter/s)": 0.285481 }, { "acc": 0.73984118, "epoch": 1.8819792323491047, "grad_norm": 5.65625, "learning_rate": 9.48720307807216e-08, "loss": 1.04555197, "memory(GiB)": 142.32, "step": 168260, "train_speed(iter/s)": 0.285493 }, { "acc": 0.72149291, "epoch": 1.8822029312950632, "grad_norm": 7.4375, "learning_rate": 9.45138038224469e-08, "loss": 1.12940607, "memory(GiB)": 142.32, "step": 168280, "train_speed(iter/s)": 0.285506 }, { "acc": 0.73461766, "epoch": 1.8824266302410217, "grad_norm": 7.0625, "learning_rate": 9.415624800479917e-08, "loss": 1.04371414, "memory(GiB)": 142.32, "step": 168300, "train_speed(iter/s)": 0.285519 }, { "acc": 0.73540707, "epoch": 1.8826503291869803, "grad_norm": 7.09375, "learning_rate": 9.37993633766976e-08, "loss": 1.05017529, "memory(GiB)": 142.32, "step": 168320, "train_speed(iter/s)": 0.285531 }, { "acc": 0.73743496, "epoch": 1.8828740281329388, "grad_norm": 6.15625, "learning_rate": 9.344314998696869e-08, "loss": 1.05389137, "memory(GiB)": 142.32, "step": 168340, "train_speed(iter/s)": 0.285543 }, { "acc": 0.74373918, "epoch": 1.8830977270788973, "grad_norm": 5.53125, "learning_rate": 9.30876078843479e-08, "loss": 1.00222855, "memory(GiB)": 142.32, "step": 168360, "train_speed(iter/s)": 0.285555 }, { "acc": 0.72997379, "epoch": 1.8833214260248559, "grad_norm": 5.25, "learning_rate": 9.273273711747855e-08, "loss": 1.06863708, "memory(GiB)": 142.32, "step": 168380, "train_speed(iter/s)": 0.285566 }, { "acc": 0.73042879, "epoch": 1.8835451249708144, "grad_norm": 5.9375, "learning_rate": 9.237853773491123e-08, "loss": 1.07104778, "memory(GiB)": 142.32, "step": 168400, "train_speed(iter/s)": 0.285577 }, { "acc": 0.73594971, "epoch": 1.883768823916773, "grad_norm": 6.0625, "learning_rate": 9.202500978510664e-08, "loss": 1.05138664, "memory(GiB)": 142.32, "step": 168420, "train_speed(iter/s)": 0.285588 }, { "acc": 0.7336153, "epoch": 1.8839925228627314, "grad_norm": 5.15625, "learning_rate": 9.167215331643109e-08, "loss": 1.06082668, "memory(GiB)": 142.32, "step": 168440, "train_speed(iter/s)": 0.285599 }, { "acc": 0.74899101, "epoch": 1.88421622180869, "grad_norm": 6.53125, "learning_rate": 9.13199683771615e-08, "loss": 0.97905064, "memory(GiB)": 142.32, "step": 168460, "train_speed(iter/s)": 0.285611 }, { "acc": 0.72887335, "epoch": 1.8844399207546485, "grad_norm": 6.875, "learning_rate": 9.096845501548213e-08, "loss": 1.07708626, "memory(GiB)": 142.32, "step": 168480, "train_speed(iter/s)": 0.285622 }, { "acc": 0.73672752, "epoch": 1.884663619700607, "grad_norm": 5.1875, "learning_rate": 9.061761327948338e-08, "loss": 1.04861908, "memory(GiB)": 142.32, "step": 168500, "train_speed(iter/s)": 0.285635 }, { "acc": 0.74124222, "epoch": 1.8848873186465656, "grad_norm": 6.1875, "learning_rate": 9.026744321716685e-08, "loss": 1.03246965, "memory(GiB)": 142.32, "step": 168520, "train_speed(iter/s)": 0.285645 }, { "acc": 0.73538952, "epoch": 1.885111017592524, "grad_norm": 5.59375, "learning_rate": 8.99179448764398e-08, "loss": 1.05918455, "memory(GiB)": 142.32, "step": 168540, "train_speed(iter/s)": 0.285656 }, { "acc": 0.73810468, "epoch": 1.8853347165384826, "grad_norm": 7.4375, "learning_rate": 8.95691183051195e-08, "loss": 1.05359898, "memory(GiB)": 142.32, "step": 168560, "train_speed(iter/s)": 0.285667 }, { "acc": 0.73281794, "epoch": 1.8855584154844411, "grad_norm": 5.84375, "learning_rate": 8.922096355093057e-08, "loss": 1.06329155, "memory(GiB)": 142.32, "step": 168580, "train_speed(iter/s)": 0.28568 }, { "acc": 0.73709607, "epoch": 1.8857821144303997, "grad_norm": 6.5, "learning_rate": 8.887348066150381e-08, "loss": 1.04767838, "memory(GiB)": 142.32, "step": 168600, "train_speed(iter/s)": 0.285691 }, { "acc": 0.74920244, "epoch": 1.8860058133763582, "grad_norm": 6.78125, "learning_rate": 8.852666968438228e-08, "loss": 0.9956953, "memory(GiB)": 142.32, "step": 168620, "train_speed(iter/s)": 0.285703 }, { "acc": 0.72906437, "epoch": 1.8862295123223167, "grad_norm": 5.875, "learning_rate": 8.818053066701248e-08, "loss": 1.08488331, "memory(GiB)": 142.32, "step": 168640, "train_speed(iter/s)": 0.285715 }, { "acc": 0.73326502, "epoch": 1.8864532112682753, "grad_norm": 7.0, "learning_rate": 8.783506365675321e-08, "loss": 1.05458736, "memory(GiB)": 142.32, "step": 168660, "train_speed(iter/s)": 0.285727 }, { "acc": 0.73448544, "epoch": 1.8866769102142338, "grad_norm": 9.0625, "learning_rate": 8.749026870086774e-08, "loss": 1.06032143, "memory(GiB)": 142.32, "step": 168680, "train_speed(iter/s)": 0.285738 }, { "acc": 0.74180927, "epoch": 1.8869006091601923, "grad_norm": 6.5, "learning_rate": 8.714614584652892e-08, "loss": 1.02928696, "memory(GiB)": 142.32, "step": 168700, "train_speed(iter/s)": 0.28575 }, { "acc": 0.72864532, "epoch": 1.8871243081061508, "grad_norm": 5.65625, "learning_rate": 8.680269514081907e-08, "loss": 1.08353539, "memory(GiB)": 142.32, "step": 168720, "train_speed(iter/s)": 0.285762 }, { "acc": 0.73336821, "epoch": 1.8873480070521094, "grad_norm": 5.90625, "learning_rate": 8.645991663072562e-08, "loss": 1.06019344, "memory(GiB)": 142.32, "step": 168740, "train_speed(iter/s)": 0.285773 }, { "acc": 0.73962202, "epoch": 1.887571705998068, "grad_norm": 5.84375, "learning_rate": 8.61178103631466e-08, "loss": 1.04344082, "memory(GiB)": 142.32, "step": 168760, "train_speed(iter/s)": 0.285784 }, { "acc": 0.7364378, "epoch": 1.8877954049440264, "grad_norm": 4.8125, "learning_rate": 8.577637638488678e-08, "loss": 1.05784578, "memory(GiB)": 142.32, "step": 168780, "train_speed(iter/s)": 0.285796 }, { "acc": 0.74424057, "epoch": 1.888019103889985, "grad_norm": 6.625, "learning_rate": 8.543561474265882e-08, "loss": 1.01844349, "memory(GiB)": 142.32, "step": 168800, "train_speed(iter/s)": 0.285809 }, { "acc": 0.73643055, "epoch": 1.8882428028359435, "grad_norm": 5.5625, "learning_rate": 8.509552548308431e-08, "loss": 1.03364534, "memory(GiB)": 142.32, "step": 168820, "train_speed(iter/s)": 0.285821 }, { "acc": 0.73427563, "epoch": 1.888466501781902, "grad_norm": 6.28125, "learning_rate": 8.475610865269158e-08, "loss": 1.04985628, "memory(GiB)": 142.32, "step": 168840, "train_speed(iter/s)": 0.285833 }, { "acc": 0.74700036, "epoch": 1.8886902007278605, "grad_norm": 5.65625, "learning_rate": 8.441736429791903e-08, "loss": 1.00230284, "memory(GiB)": 142.32, "step": 168860, "train_speed(iter/s)": 0.285845 }, { "acc": 0.73832617, "epoch": 1.888913899673819, "grad_norm": 6.4375, "learning_rate": 8.407929246511015e-08, "loss": 1.04687996, "memory(GiB)": 142.32, "step": 168880, "train_speed(iter/s)": 0.285858 }, { "acc": 0.74021049, "epoch": 1.8891375986197776, "grad_norm": 5.46875, "learning_rate": 8.374189320051907e-08, "loss": 1.02663336, "memory(GiB)": 142.32, "step": 168900, "train_speed(iter/s)": 0.285869 }, { "acc": 0.72660418, "epoch": 1.8893612975657361, "grad_norm": 7.0, "learning_rate": 8.340516655030662e-08, "loss": 1.09045639, "memory(GiB)": 142.32, "step": 168920, "train_speed(iter/s)": 0.285881 }, { "acc": 0.74081535, "epoch": 1.8895849965116946, "grad_norm": 7.59375, "learning_rate": 8.30691125605415e-08, "loss": 1.02434349, "memory(GiB)": 142.32, "step": 168940, "train_speed(iter/s)": 0.285893 }, { "acc": 0.72888727, "epoch": 1.8898086954576532, "grad_norm": 6.46875, "learning_rate": 8.273373127720141e-08, "loss": 1.08302135, "memory(GiB)": 142.32, "step": 168960, "train_speed(iter/s)": 0.285904 }, { "acc": 0.74354925, "epoch": 1.8900323944036117, "grad_norm": 6.78125, "learning_rate": 8.239902274617018e-08, "loss": 1.0039319, "memory(GiB)": 142.32, "step": 168980, "train_speed(iter/s)": 0.285916 }, { "acc": 0.73150444, "epoch": 1.8902560933495702, "grad_norm": 5.625, "learning_rate": 8.206498701324173e-08, "loss": 1.06569262, "memory(GiB)": 142.32, "step": 169000, "train_speed(iter/s)": 0.285927 }, { "acc": 0.73205433, "epoch": 1.8904797922955288, "grad_norm": 6.90625, "learning_rate": 8.173162412411672e-08, "loss": 1.06817446, "memory(GiB)": 142.32, "step": 169020, "train_speed(iter/s)": 0.285939 }, { "acc": 0.73113132, "epoch": 1.8907034912414873, "grad_norm": 6.375, "learning_rate": 8.139893412440315e-08, "loss": 1.09564133, "memory(GiB)": 142.32, "step": 169040, "train_speed(iter/s)": 0.285951 }, { "acc": 0.72878695, "epoch": 1.8909271901874458, "grad_norm": 5.65625, "learning_rate": 8.1066917059619e-08, "loss": 1.06970606, "memory(GiB)": 142.32, "step": 169060, "train_speed(iter/s)": 0.285964 }, { "acc": 0.73484979, "epoch": 1.8911508891334043, "grad_norm": 6.8125, "learning_rate": 8.073557297518797e-08, "loss": 1.0435174, "memory(GiB)": 142.32, "step": 169080, "train_speed(iter/s)": 0.285976 }, { "acc": 0.73236256, "epoch": 1.8913745880793629, "grad_norm": 5.53125, "learning_rate": 8.040490191644323e-08, "loss": 1.05823269, "memory(GiB)": 142.32, "step": 169100, "train_speed(iter/s)": 0.285987 }, { "acc": 0.72985067, "epoch": 1.8915982870253214, "grad_norm": 5.46875, "learning_rate": 8.007490392862527e-08, "loss": 1.0749258, "memory(GiB)": 142.32, "step": 169120, "train_speed(iter/s)": 0.285999 }, { "acc": 0.74276757, "epoch": 1.89182198597128, "grad_norm": 6.59375, "learning_rate": 7.974557905688185e-08, "loss": 1.0338171, "memory(GiB)": 142.32, "step": 169140, "train_speed(iter/s)": 0.286009 }, { "acc": 0.7322072, "epoch": 1.8920456849172385, "grad_norm": 5.5, "learning_rate": 7.941692734627026e-08, "loss": 1.06294403, "memory(GiB)": 142.32, "step": 169160, "train_speed(iter/s)": 0.28602 }, { "acc": 0.74725904, "epoch": 1.892269383863197, "grad_norm": 4.5625, "learning_rate": 7.908894884175455e-08, "loss": 1.00680714, "memory(GiB)": 142.32, "step": 169180, "train_speed(iter/s)": 0.286028 }, { "acc": 0.73301039, "epoch": 1.8924930828091555, "grad_norm": 6.0625, "learning_rate": 7.876164358820603e-08, "loss": 1.05786161, "memory(GiB)": 142.32, "step": 169200, "train_speed(iter/s)": 0.286038 }, { "acc": 0.7475935, "epoch": 1.892716781755114, "grad_norm": 7.03125, "learning_rate": 7.843501163040612e-08, "loss": 0.99562435, "memory(GiB)": 142.32, "step": 169220, "train_speed(iter/s)": 0.286051 }, { "acc": 0.7543437, "epoch": 1.8929404807010726, "grad_norm": 6.8125, "learning_rate": 7.810905301304129e-08, "loss": 0.97603111, "memory(GiB)": 142.32, "step": 169240, "train_speed(iter/s)": 0.286063 }, { "acc": 0.72513828, "epoch": 1.893164179647031, "grad_norm": 6.21875, "learning_rate": 7.778376778070862e-08, "loss": 1.11307716, "memory(GiB)": 142.32, "step": 169260, "train_speed(iter/s)": 0.286074 }, { "acc": 0.74710522, "epoch": 1.8933878785929896, "grad_norm": 5.65625, "learning_rate": 7.745915597791087e-08, "loss": 1.00341816, "memory(GiB)": 142.32, "step": 169280, "train_speed(iter/s)": 0.286085 }, { "acc": 0.74204664, "epoch": 1.8936115775389482, "grad_norm": 5.90625, "learning_rate": 7.713521764905974e-08, "loss": 1.01923103, "memory(GiB)": 142.32, "step": 169300, "train_speed(iter/s)": 0.286097 }, { "acc": 0.73588352, "epoch": 1.8938352764849067, "grad_norm": 4.8125, "learning_rate": 7.681195283847532e-08, "loss": 1.05327425, "memory(GiB)": 142.32, "step": 169320, "train_speed(iter/s)": 0.286107 }, { "acc": 0.73218818, "epoch": 1.8940589754308652, "grad_norm": 6.28125, "learning_rate": 7.648936159038389e-08, "loss": 1.05850716, "memory(GiB)": 142.32, "step": 169340, "train_speed(iter/s)": 0.286118 }, { "acc": 0.74030905, "epoch": 1.8942826743768237, "grad_norm": 6.71875, "learning_rate": 7.616744394892129e-08, "loss": 1.03058243, "memory(GiB)": 142.32, "step": 169360, "train_speed(iter/s)": 0.286129 }, { "acc": 0.7453413, "epoch": 1.8945063733227823, "grad_norm": 6.0, "learning_rate": 7.584619995813002e-08, "loss": 1.02141256, "memory(GiB)": 142.32, "step": 169380, "train_speed(iter/s)": 0.286139 }, { "acc": 0.73947625, "epoch": 1.8947300722687408, "grad_norm": 6.78125, "learning_rate": 7.552562966196052e-08, "loss": 1.04416924, "memory(GiB)": 142.32, "step": 169400, "train_speed(iter/s)": 0.28615 }, { "acc": 0.72507915, "epoch": 1.8949537712146993, "grad_norm": 6.34375, "learning_rate": 7.520573310427271e-08, "loss": 1.10703754, "memory(GiB)": 142.32, "step": 169420, "train_speed(iter/s)": 0.286161 }, { "acc": 0.73117542, "epoch": 1.8951774701606579, "grad_norm": 5.3125, "learning_rate": 7.488651032883154e-08, "loss": 1.07126265, "memory(GiB)": 142.32, "step": 169440, "train_speed(iter/s)": 0.286173 }, { "acc": 0.72100725, "epoch": 1.8954011691066164, "grad_norm": 5.46875, "learning_rate": 7.45679613793121e-08, "loss": 1.10930042, "memory(GiB)": 142.32, "step": 169460, "train_speed(iter/s)": 0.286185 }, { "acc": 0.73614578, "epoch": 1.895624868052575, "grad_norm": 5.21875, "learning_rate": 7.425008629929564e-08, "loss": 1.0429287, "memory(GiB)": 142.32, "step": 169480, "train_speed(iter/s)": 0.286198 }, { "acc": 0.74773197, "epoch": 1.8958485669985334, "grad_norm": 6.34375, "learning_rate": 7.393288513227293e-08, "loss": 1.00176544, "memory(GiB)": 142.32, "step": 169500, "train_speed(iter/s)": 0.286208 }, { "acc": 0.72777162, "epoch": 1.896072265944492, "grad_norm": 5.78125, "learning_rate": 7.361635792164146e-08, "loss": 1.07480869, "memory(GiB)": 142.32, "step": 169520, "train_speed(iter/s)": 0.28622 }, { "acc": 0.73685703, "epoch": 1.8962959648904505, "grad_norm": 10.375, "learning_rate": 7.330050471070604e-08, "loss": 1.03739815, "memory(GiB)": 142.32, "step": 169540, "train_speed(iter/s)": 0.286233 }, { "acc": 0.74008417, "epoch": 1.896519663836409, "grad_norm": 5.65625, "learning_rate": 7.2985325542681e-08, "loss": 1.03766041, "memory(GiB)": 142.32, "step": 169560, "train_speed(iter/s)": 0.286244 }, { "acc": 0.73477039, "epoch": 1.8967433627823675, "grad_norm": 8.125, "learning_rate": 7.26708204606863e-08, "loss": 1.06369934, "memory(GiB)": 142.32, "step": 169580, "train_speed(iter/s)": 0.286256 }, { "acc": 0.73347683, "epoch": 1.896967061728326, "grad_norm": 5.65625, "learning_rate": 7.235698950775083e-08, "loss": 1.07231293, "memory(GiB)": 142.32, "step": 169600, "train_speed(iter/s)": 0.286268 }, { "acc": 0.73912716, "epoch": 1.8971907606742846, "grad_norm": 6.375, "learning_rate": 7.204383272681193e-08, "loss": 1.04179668, "memory(GiB)": 142.32, "step": 169620, "train_speed(iter/s)": 0.28628 }, { "acc": 0.75348353, "epoch": 1.8974144596202431, "grad_norm": 6.5, "learning_rate": 7.173135016071309e-08, "loss": 0.98563623, "memory(GiB)": 142.32, "step": 169640, "train_speed(iter/s)": 0.286293 }, { "acc": 0.72905064, "epoch": 1.8976381585662017, "grad_norm": 6.40625, "learning_rate": 7.141954185220734e-08, "loss": 1.06774673, "memory(GiB)": 142.32, "step": 169660, "train_speed(iter/s)": 0.286305 }, { "acc": 0.74191446, "epoch": 1.8978618575121602, "grad_norm": 8.375, "learning_rate": 7.110840784395335e-08, "loss": 1.01853848, "memory(GiB)": 142.32, "step": 169680, "train_speed(iter/s)": 0.286317 }, { "acc": 0.73634291, "epoch": 1.8980855564581187, "grad_norm": 5.5625, "learning_rate": 7.079794817851925e-08, "loss": 1.06368141, "memory(GiB)": 142.32, "step": 169700, "train_speed(iter/s)": 0.28633 }, { "acc": 0.7395503, "epoch": 1.8983092554040772, "grad_norm": 6.46875, "learning_rate": 7.048816289838057e-08, "loss": 1.04408512, "memory(GiB)": 142.32, "step": 169720, "train_speed(iter/s)": 0.286342 }, { "acc": 0.73118615, "epoch": 1.8985329543500358, "grad_norm": 5.09375, "learning_rate": 7.017905204592001e-08, "loss": 1.07844639, "memory(GiB)": 142.32, "step": 169740, "train_speed(iter/s)": 0.286355 }, { "acc": 0.73192282, "epoch": 1.8987566532959943, "grad_norm": 4.96875, "learning_rate": 6.987061566342879e-08, "loss": 1.05565281, "memory(GiB)": 142.32, "step": 169760, "train_speed(iter/s)": 0.286368 }, { "acc": 0.73456593, "epoch": 1.8989803522419528, "grad_norm": 6.84375, "learning_rate": 6.956285379310479e-08, "loss": 1.06353569, "memory(GiB)": 142.32, "step": 169780, "train_speed(iter/s)": 0.286379 }, { "acc": 0.73259192, "epoch": 1.8992040511879114, "grad_norm": 5.4375, "learning_rate": 6.925576647705489e-08, "loss": 1.05774994, "memory(GiB)": 142.32, "step": 169800, "train_speed(iter/s)": 0.286391 }, { "acc": 0.73478642, "epoch": 1.8994277501338699, "grad_norm": 6.25, "learning_rate": 6.894935375729218e-08, "loss": 1.06078281, "memory(GiB)": 142.32, "step": 169820, "train_speed(iter/s)": 0.286401 }, { "acc": 0.74730654, "epoch": 1.8996514490798284, "grad_norm": 5.46875, "learning_rate": 6.864361567573918e-08, "loss": 1.01697178, "memory(GiB)": 142.32, "step": 169840, "train_speed(iter/s)": 0.286412 }, { "acc": 0.7389842, "epoch": 1.899875148025787, "grad_norm": 5.46875, "learning_rate": 6.833855227422525e-08, "loss": 1.03573704, "memory(GiB)": 142.32, "step": 169860, "train_speed(iter/s)": 0.286424 }, { "acc": 0.73638453, "epoch": 1.9000988469717455, "grad_norm": 7.28125, "learning_rate": 6.803416359448644e-08, "loss": 1.06139526, "memory(GiB)": 142.32, "step": 169880, "train_speed(iter/s)": 0.286434 }, { "acc": 0.7459631, "epoch": 1.900322545917704, "grad_norm": 6.40625, "learning_rate": 6.773044967816833e-08, "loss": 0.99980412, "memory(GiB)": 142.32, "step": 169900, "train_speed(iter/s)": 0.286446 }, { "acc": 0.72897415, "epoch": 1.9005462448636625, "grad_norm": 5.25, "learning_rate": 6.742741056682323e-08, "loss": 1.07334776, "memory(GiB)": 142.32, "step": 169920, "train_speed(iter/s)": 0.286457 }, { "acc": 0.73539515, "epoch": 1.900769943809621, "grad_norm": 6.71875, "learning_rate": 6.712504630191074e-08, "loss": 1.04989767, "memory(GiB)": 142.32, "step": 169940, "train_speed(iter/s)": 0.286468 }, { "acc": 0.73133259, "epoch": 1.9009936427555796, "grad_norm": 6.5625, "learning_rate": 6.682335692479946e-08, "loss": 1.07433434, "memory(GiB)": 142.32, "step": 169960, "train_speed(iter/s)": 0.28648 }, { "acc": 0.7456852, "epoch": 1.901217341701538, "grad_norm": 5.375, "learning_rate": 6.652234247676359e-08, "loss": 0.99921055, "memory(GiB)": 142.32, "step": 169980, "train_speed(iter/s)": 0.286491 }, { "acc": 0.72891541, "epoch": 1.9014410406474966, "grad_norm": 4.8125, "learning_rate": 6.62220029989874e-08, "loss": 1.08521805, "memory(GiB)": 142.32, "step": 170000, "train_speed(iter/s)": 0.286504 }, { "epoch": 1.9014410406474966, "eval_acc": 0.696383084700724, "eval_loss": 1.0713706016540527, "eval_runtime": 2338.8484, "eval_samples_per_second": 32.188, "eval_steps_per_second": 16.094, "step": 170000 }, { "acc": 0.74036818, "epoch": 1.9016647395934552, "grad_norm": 5.8125, "learning_rate": 6.592233853256136e-08, "loss": 1.03398895, "memory(GiB)": 142.32, "step": 170020, "train_speed(iter/s)": 0.285366 }, { "acc": 0.74155006, "epoch": 1.9018884385394137, "grad_norm": 5.8125, "learning_rate": 6.562334911848266e-08, "loss": 1.02337608, "memory(GiB)": 142.32, "step": 170040, "train_speed(iter/s)": 0.285378 }, { "acc": 0.72423964, "epoch": 1.9021121374853722, "grad_norm": 6.8125, "learning_rate": 6.532503479765917e-08, "loss": 1.10428762, "memory(GiB)": 142.32, "step": 170060, "train_speed(iter/s)": 0.285389 }, { "acc": 0.73502064, "epoch": 1.9023358364313308, "grad_norm": 6.28125, "learning_rate": 6.502739561090321e-08, "loss": 1.0503376, "memory(GiB)": 142.32, "step": 170080, "train_speed(iter/s)": 0.285401 }, { "acc": 0.73483782, "epoch": 1.9025595353772893, "grad_norm": 5.6875, "learning_rate": 6.473043159893722e-08, "loss": 1.06924591, "memory(GiB)": 142.32, "step": 170100, "train_speed(iter/s)": 0.285413 }, { "acc": 0.74202862, "epoch": 1.9027832343232478, "grad_norm": 6.28125, "learning_rate": 6.443414280238869e-08, "loss": 1.02624016, "memory(GiB)": 142.32, "step": 170120, "train_speed(iter/s)": 0.285424 }, { "acc": 0.74972019, "epoch": 1.9030069332692063, "grad_norm": 4.6875, "learning_rate": 6.41385292617952e-08, "loss": 0.98375626, "memory(GiB)": 142.32, "step": 170140, "train_speed(iter/s)": 0.285435 }, { "acc": 0.73665724, "epoch": 1.9032306322151649, "grad_norm": 6.46875, "learning_rate": 6.384359101760051e-08, "loss": 1.03166208, "memory(GiB)": 142.32, "step": 170160, "train_speed(iter/s)": 0.285447 }, { "acc": 0.74925389, "epoch": 1.9034543311611234, "grad_norm": 5.34375, "learning_rate": 6.354932811015679e-08, "loss": 0.97961149, "memory(GiB)": 142.32, "step": 170180, "train_speed(iter/s)": 0.285459 }, { "acc": 0.72643681, "epoch": 1.903678030107082, "grad_norm": 5.28125, "learning_rate": 6.325574057972295e-08, "loss": 1.10441589, "memory(GiB)": 142.32, "step": 170200, "train_speed(iter/s)": 0.285472 }, { "acc": 0.73296595, "epoch": 1.9039017290530404, "grad_norm": 6.0, "learning_rate": 6.296282846646629e-08, "loss": 1.08694916, "memory(GiB)": 142.32, "step": 170220, "train_speed(iter/s)": 0.285484 }, { "acc": 0.74667053, "epoch": 1.904125427998999, "grad_norm": 8.375, "learning_rate": 6.267059181046086e-08, "loss": 1.00397205, "memory(GiB)": 142.32, "step": 170240, "train_speed(iter/s)": 0.285497 }, { "acc": 0.7280695, "epoch": 1.9043491269449575, "grad_norm": 5.625, "learning_rate": 6.237903065168971e-08, "loss": 1.09315186, "memory(GiB)": 142.32, "step": 170260, "train_speed(iter/s)": 0.285507 }, { "acc": 0.73655758, "epoch": 1.904572825890916, "grad_norm": 6.15625, "learning_rate": 6.208814503004146e-08, "loss": 1.04080067, "memory(GiB)": 142.32, "step": 170280, "train_speed(iter/s)": 0.285518 }, { "acc": 0.73424649, "epoch": 1.9047965248368746, "grad_norm": 5.9375, "learning_rate": 6.179793498531427e-08, "loss": 1.06289577, "memory(GiB)": 142.32, "step": 170300, "train_speed(iter/s)": 0.28553 }, { "acc": 0.73396497, "epoch": 1.905020223782833, "grad_norm": 5.0625, "learning_rate": 6.150840055721252e-08, "loss": 1.0520647, "memory(GiB)": 142.32, "step": 170320, "train_speed(iter/s)": 0.285542 }, { "acc": 0.73659887, "epoch": 1.9052439227287916, "grad_norm": 6.84375, "learning_rate": 6.121954178534894e-08, "loss": 1.04995346, "memory(GiB)": 142.32, "step": 170340, "train_speed(iter/s)": 0.285554 }, { "acc": 0.74327793, "epoch": 1.9054676216747501, "grad_norm": 5.1875, "learning_rate": 6.093135870924416e-08, "loss": 1.01485062, "memory(GiB)": 142.32, "step": 170360, "train_speed(iter/s)": 0.285565 }, { "acc": 0.72353439, "epoch": 1.9056913206207087, "grad_norm": 5.78125, "learning_rate": 6.064385136832385e-08, "loss": 1.10713215, "memory(GiB)": 142.32, "step": 170380, "train_speed(iter/s)": 0.285578 }, { "acc": 0.73483062, "epoch": 1.9059150195666672, "grad_norm": 6.0, "learning_rate": 6.035701980192543e-08, "loss": 1.05719013, "memory(GiB)": 142.32, "step": 170400, "train_speed(iter/s)": 0.285588 }, { "acc": 0.74499149, "epoch": 1.9061387185126257, "grad_norm": 6.46875, "learning_rate": 6.00708640492903e-08, "loss": 1.00144787, "memory(GiB)": 142.32, "step": 170420, "train_speed(iter/s)": 0.285601 }, { "acc": 0.7377111, "epoch": 1.9063624174585843, "grad_norm": 5.90625, "learning_rate": 5.978538414956825e-08, "loss": 1.05956707, "memory(GiB)": 142.32, "step": 170440, "train_speed(iter/s)": 0.285612 }, { "acc": 0.73867273, "epoch": 1.9065861164045428, "grad_norm": 6.28125, "learning_rate": 5.9500580141818035e-08, "loss": 1.03118382, "memory(GiB)": 142.32, "step": 170460, "train_speed(iter/s)": 0.285623 }, { "acc": 0.74016237, "epoch": 1.9068098153505013, "grad_norm": 5.78125, "learning_rate": 5.921645206500404e-08, "loss": 1.02798891, "memory(GiB)": 142.32, "step": 170480, "train_speed(iter/s)": 0.285634 }, { "acc": 0.72978697, "epoch": 1.9070335142964598, "grad_norm": 5.9375, "learning_rate": 5.8932999957999616e-08, "loss": 1.0787755, "memory(GiB)": 142.32, "step": 170500, "train_speed(iter/s)": 0.285644 }, { "acc": 0.72075138, "epoch": 1.9072572132424184, "grad_norm": 5.5625, "learning_rate": 5.865022385958541e-08, "loss": 1.12264261, "memory(GiB)": 142.32, "step": 170520, "train_speed(iter/s)": 0.285656 }, { "acc": 0.74597383, "epoch": 1.907480912188377, "grad_norm": 7.6875, "learning_rate": 5.836812380844881e-08, "loss": 1.0103529, "memory(GiB)": 142.32, "step": 170540, "train_speed(iter/s)": 0.285666 }, { "acc": 0.7225625, "epoch": 1.9077046111343354, "grad_norm": 6.84375, "learning_rate": 5.808669984318449e-08, "loss": 1.11582289, "memory(GiB)": 142.32, "step": 170560, "train_speed(iter/s)": 0.285676 }, { "acc": 0.73658609, "epoch": 1.907928310080294, "grad_norm": 5.09375, "learning_rate": 5.780595200229611e-08, "loss": 1.04478788, "memory(GiB)": 142.32, "step": 170580, "train_speed(iter/s)": 0.285687 }, { "acc": 0.73907804, "epoch": 1.9081520090262525, "grad_norm": 5.625, "learning_rate": 5.7525880324193486e-08, "loss": 1.01877403, "memory(GiB)": 142.32, "step": 170600, "train_speed(iter/s)": 0.285699 }, { "acc": 0.74013329, "epoch": 1.908375707972211, "grad_norm": 6.21875, "learning_rate": 5.724648484719486e-08, "loss": 1.04589987, "memory(GiB)": 142.32, "step": 170620, "train_speed(iter/s)": 0.28571 }, { "acc": 0.74022198, "epoch": 1.9085994069181695, "grad_norm": 5.125, "learning_rate": 5.696776560952522e-08, "loss": 1.0232687, "memory(GiB)": 142.32, "step": 170640, "train_speed(iter/s)": 0.285721 }, { "acc": 0.74697623, "epoch": 1.908823105864128, "grad_norm": 6.75, "learning_rate": 5.668972264931738e-08, "loss": 0.99825916, "memory(GiB)": 142.32, "step": 170660, "train_speed(iter/s)": 0.285731 }, { "acc": 0.73300648, "epoch": 1.9090468048100866, "grad_norm": 6.46875, "learning_rate": 5.641235600461203e-08, "loss": 1.06383133, "memory(GiB)": 142.32, "step": 170680, "train_speed(iter/s)": 0.285742 }, { "acc": 0.74441714, "epoch": 1.9092705037560451, "grad_norm": 5.125, "learning_rate": 5.6135665713356043e-08, "loss": 1.01204033, "memory(GiB)": 142.32, "step": 170700, "train_speed(iter/s)": 0.285752 }, { "acc": 0.73237085, "epoch": 1.9094942027020037, "grad_norm": 5.21875, "learning_rate": 5.5859651813405246e-08, "loss": 1.05784225, "memory(GiB)": 142.32, "step": 170720, "train_speed(iter/s)": 0.285763 }, { "acc": 0.73816628, "epoch": 1.9097179016479622, "grad_norm": 6.34375, "learning_rate": 5.558431434252221e-08, "loss": 1.04377022, "memory(GiB)": 142.32, "step": 170740, "train_speed(iter/s)": 0.285775 }, { "acc": 0.74304876, "epoch": 1.9099416005939207, "grad_norm": 6.03125, "learning_rate": 5.530965333837734e-08, "loss": 1.01339588, "memory(GiB)": 142.32, "step": 170760, "train_speed(iter/s)": 0.285786 }, { "acc": 0.73174067, "epoch": 1.9101652995398792, "grad_norm": 7.34375, "learning_rate": 5.5035668838547276e-08, "loss": 1.06668491, "memory(GiB)": 142.32, "step": 170780, "train_speed(iter/s)": 0.285797 }, { "acc": 0.74479351, "epoch": 1.9103889984858378, "grad_norm": 5.90625, "learning_rate": 5.476236088051756e-08, "loss": 1.013305, "memory(GiB)": 142.32, "step": 170800, "train_speed(iter/s)": 0.285809 }, { "acc": 0.74564972, "epoch": 1.9106126974317963, "grad_norm": 6.5625, "learning_rate": 5.4489729501680525e-08, "loss": 1.00701504, "memory(GiB)": 142.32, "step": 170820, "train_speed(iter/s)": 0.285821 }, { "acc": 0.73360672, "epoch": 1.9108363963777548, "grad_norm": 8.125, "learning_rate": 5.421777473933687e-08, "loss": 1.06256523, "memory(GiB)": 142.32, "step": 170840, "train_speed(iter/s)": 0.285835 }, { "acc": 0.73822479, "epoch": 1.9110600953237133, "grad_norm": 5.46875, "learning_rate": 5.39464966306924e-08, "loss": 1.03193245, "memory(GiB)": 142.32, "step": 170860, "train_speed(iter/s)": 0.285847 }, { "acc": 0.73151875, "epoch": 1.9112837942696719, "grad_norm": 6.125, "learning_rate": 5.3675895212862986e-08, "loss": 1.08081865, "memory(GiB)": 142.32, "step": 170880, "train_speed(iter/s)": 0.285859 }, { "acc": 0.73916969, "epoch": 1.9115074932156304, "grad_norm": 5.9375, "learning_rate": 5.340597052287011e-08, "loss": 1.04556389, "memory(GiB)": 142.32, "step": 170900, "train_speed(iter/s)": 0.28587 }, { "acc": 0.7541141, "epoch": 1.911731192161589, "grad_norm": 7.03125, "learning_rate": 5.313672259764369e-08, "loss": 0.96677237, "memory(GiB)": 142.32, "step": 170920, "train_speed(iter/s)": 0.285882 }, { "acc": 0.75371847, "epoch": 1.9119548911075475, "grad_norm": 6.5625, "learning_rate": 5.2868151474020356e-08, "loss": 0.99868183, "memory(GiB)": 142.32, "step": 170940, "train_speed(iter/s)": 0.285894 }, { "acc": 0.7372921, "epoch": 1.912178590053506, "grad_norm": 7.09375, "learning_rate": 5.260025718874517e-08, "loss": 1.03663082, "memory(GiB)": 142.32, "step": 170960, "train_speed(iter/s)": 0.285907 }, { "acc": 0.73419218, "epoch": 1.9124022889994645, "grad_norm": 7.21875, "learning_rate": 5.233303977846882e-08, "loss": 1.05924063, "memory(GiB)": 142.32, "step": 170980, "train_speed(iter/s)": 0.285918 }, { "acc": 0.73277988, "epoch": 1.912625987945423, "grad_norm": 7.90625, "learning_rate": 5.206649927975205e-08, "loss": 1.06011133, "memory(GiB)": 142.32, "step": 171000, "train_speed(iter/s)": 0.28593 }, { "acc": 0.72543402, "epoch": 1.9128496868913816, "grad_norm": 5.9375, "learning_rate": 5.1800635729059576e-08, "loss": 1.07915478, "memory(GiB)": 142.32, "step": 171020, "train_speed(iter/s)": 0.285942 }, { "acc": 0.74717569, "epoch": 1.91307338583734, "grad_norm": 6.59375, "learning_rate": 5.153544916276621e-08, "loss": 1.00667334, "memory(GiB)": 142.32, "step": 171040, "train_speed(iter/s)": 0.285953 }, { "acc": 0.73765574, "epoch": 1.9132970847832986, "grad_norm": 5.6875, "learning_rate": 5.1270939617154016e-08, "loss": 1.03994389, "memory(GiB)": 142.32, "step": 171060, "train_speed(iter/s)": 0.285965 }, { "acc": 0.73452978, "epoch": 1.9135207837292572, "grad_norm": 6.03125, "learning_rate": 5.1007107128410174e-08, "loss": 1.06729584, "memory(GiB)": 142.32, "step": 171080, "train_speed(iter/s)": 0.285976 }, { "acc": 0.74680901, "epoch": 1.9137444826752157, "grad_norm": 6.03125, "learning_rate": 5.074395173263191e-08, "loss": 0.98887892, "memory(GiB)": 142.32, "step": 171100, "train_speed(iter/s)": 0.285987 }, { "acc": 0.73425317, "epoch": 1.9139681816211742, "grad_norm": 8.1875, "learning_rate": 5.048147346582155e-08, "loss": 1.04973803, "memory(GiB)": 142.32, "step": 171120, "train_speed(iter/s)": 0.285998 }, { "acc": 0.74416399, "epoch": 1.9141918805671327, "grad_norm": 6.375, "learning_rate": 5.021967236389147e-08, "loss": 1.00491734, "memory(GiB)": 142.32, "step": 171140, "train_speed(iter/s)": 0.286009 }, { "acc": 0.72847633, "epoch": 1.9144155795130913, "grad_norm": 5.3125, "learning_rate": 4.995854846265857e-08, "loss": 1.09207458, "memory(GiB)": 142.32, "step": 171160, "train_speed(iter/s)": 0.28602 }, { "acc": 0.7340807, "epoch": 1.9146392784590498, "grad_norm": 6.5, "learning_rate": 4.9698101797848174e-08, "loss": 1.06516476, "memory(GiB)": 142.32, "step": 171180, "train_speed(iter/s)": 0.286031 }, { "acc": 0.74136648, "epoch": 1.9148629774050083, "grad_norm": 6.40625, "learning_rate": 4.943833240509399e-08, "loss": 1.01577549, "memory(GiB)": 142.32, "step": 171200, "train_speed(iter/s)": 0.286041 }, { "acc": 0.75004354, "epoch": 1.9150866763509669, "grad_norm": 6.53125, "learning_rate": 4.917924031993537e-08, "loss": 0.9913929, "memory(GiB)": 142.32, "step": 171220, "train_speed(iter/s)": 0.286053 }, { "acc": 0.74732533, "epoch": 1.9153103752969254, "grad_norm": 7.84375, "learning_rate": 4.892082557782063e-08, "loss": 1.00758028, "memory(GiB)": 142.32, "step": 171240, "train_speed(iter/s)": 0.286064 }, { "acc": 0.72784419, "epoch": 1.915534074242884, "grad_norm": 5.0625, "learning_rate": 4.866308821410426e-08, "loss": 1.10074406, "memory(GiB)": 142.32, "step": 171260, "train_speed(iter/s)": 0.286075 }, { "acc": 0.73584547, "epoch": 1.9157577731888424, "grad_norm": 5.375, "learning_rate": 4.8406028264048055e-08, "loss": 1.04341307, "memory(GiB)": 142.32, "step": 171280, "train_speed(iter/s)": 0.286088 }, { "acc": 0.75202637, "epoch": 1.915981472134801, "grad_norm": 6.375, "learning_rate": 4.814964576282166e-08, "loss": 0.98781109, "memory(GiB)": 142.32, "step": 171300, "train_speed(iter/s)": 0.286099 }, { "acc": 0.72181129, "epoch": 1.9162051710807595, "grad_norm": 6.0625, "learning_rate": 4.789394074550202e-08, "loss": 1.11342449, "memory(GiB)": 142.32, "step": 171320, "train_speed(iter/s)": 0.28611 }, { "acc": 0.72959118, "epoch": 1.916428870026718, "grad_norm": 5.90625, "learning_rate": 4.763891324707337e-08, "loss": 1.06938515, "memory(GiB)": 142.32, "step": 171340, "train_speed(iter/s)": 0.286121 }, { "acc": 0.73485403, "epoch": 1.9166525689726766, "grad_norm": 6.65625, "learning_rate": 4.7384563302427244e-08, "loss": 1.05969009, "memory(GiB)": 142.32, "step": 171360, "train_speed(iter/s)": 0.286133 }, { "acc": 0.72921686, "epoch": 1.916876267918635, "grad_norm": 6.9375, "learning_rate": 4.713089094636136e-08, "loss": 1.07830524, "memory(GiB)": 142.32, "step": 171380, "train_speed(iter/s)": 0.286145 }, { "acc": 0.73405552, "epoch": 1.9170999668645936, "grad_norm": 5.75, "learning_rate": 4.687789621358296e-08, "loss": 1.09473629, "memory(GiB)": 142.32, "step": 171400, "train_speed(iter/s)": 0.286158 }, { "acc": 0.72971172, "epoch": 1.9173236658105521, "grad_norm": 5.625, "learning_rate": 4.6625579138704356e-08, "loss": 1.08085394, "memory(GiB)": 142.32, "step": 171420, "train_speed(iter/s)": 0.28617 }, { "acc": 0.74070997, "epoch": 1.9175473647565107, "grad_norm": 5.09375, "learning_rate": 4.6373939756246824e-08, "loss": 1.03441782, "memory(GiB)": 142.32, "step": 171440, "train_speed(iter/s)": 0.286182 }, { "acc": 0.73349743, "epoch": 1.9177710637024692, "grad_norm": 7.40625, "learning_rate": 4.612297810063782e-08, "loss": 1.06801023, "memory(GiB)": 142.32, "step": 171460, "train_speed(iter/s)": 0.286195 }, { "acc": 0.73234386, "epoch": 1.9179947626484277, "grad_norm": 6.40625, "learning_rate": 4.587269420621265e-08, "loss": 1.07438583, "memory(GiB)": 142.32, "step": 171480, "train_speed(iter/s)": 0.286206 }, { "acc": 0.73698201, "epoch": 1.9182184615943862, "grad_norm": 6.5625, "learning_rate": 4.562308810721394e-08, "loss": 1.04339371, "memory(GiB)": 142.32, "step": 171500, "train_speed(iter/s)": 0.286218 }, { "acc": 0.75010724, "epoch": 1.9184421605403448, "grad_norm": 7.125, "learning_rate": 4.5374159837790475e-08, "loss": 0.98533669, "memory(GiB)": 142.32, "step": 171520, "train_speed(iter/s)": 0.28623 }, { "acc": 0.73762903, "epoch": 1.9186658594863033, "grad_norm": 6.40625, "learning_rate": 4.512590943200057e-08, "loss": 1.03306036, "memory(GiB)": 142.32, "step": 171540, "train_speed(iter/s)": 0.286242 }, { "acc": 0.73779354, "epoch": 1.9188895584322618, "grad_norm": 5.96875, "learning_rate": 4.487833692380761e-08, "loss": 1.05505829, "memory(GiB)": 142.32, "step": 171560, "train_speed(iter/s)": 0.286254 }, { "acc": 0.73902407, "epoch": 1.9191132573782204, "grad_norm": 4.96875, "learning_rate": 4.4631442347082834e-08, "loss": 1.04783573, "memory(GiB)": 142.32, "step": 171580, "train_speed(iter/s)": 0.286264 }, { "acc": 0.73425875, "epoch": 1.9193369563241789, "grad_norm": 6.0625, "learning_rate": 4.438522573560589e-08, "loss": 1.05511713, "memory(GiB)": 142.32, "step": 171600, "train_speed(iter/s)": 0.286275 }, { "acc": 0.74272943, "epoch": 1.9195606552701374, "grad_norm": 6.28125, "learning_rate": 4.413968712306149e-08, "loss": 1.00518951, "memory(GiB)": 142.32, "step": 171620, "train_speed(iter/s)": 0.286285 }, { "acc": 0.73030148, "epoch": 1.919784354216096, "grad_norm": 5.5, "learning_rate": 4.389482654304389e-08, "loss": 1.09740248, "memory(GiB)": 142.32, "step": 171640, "train_speed(iter/s)": 0.286297 }, { "acc": 0.73147359, "epoch": 1.9200080531620545, "grad_norm": 5.0625, "learning_rate": 4.365064402905295e-08, "loss": 1.07740698, "memory(GiB)": 142.32, "step": 171660, "train_speed(iter/s)": 0.286308 }, { "acc": 0.74381866, "epoch": 1.920231752108013, "grad_norm": 7.71875, "learning_rate": 4.3407139614496384e-08, "loss": 0.99555798, "memory(GiB)": 142.32, "step": 171680, "train_speed(iter/s)": 0.28632 }, { "acc": 0.74038296, "epoch": 1.9204554510539715, "grad_norm": 5.15625, "learning_rate": 4.316431333268922e-08, "loss": 1.02348566, "memory(GiB)": 142.32, "step": 171700, "train_speed(iter/s)": 0.28633 }, { "acc": 0.73703055, "epoch": 1.92067914999993, "grad_norm": 5.71875, "learning_rate": 4.292216521685377e-08, "loss": 1.06555128, "memory(GiB)": 142.32, "step": 171720, "train_speed(iter/s)": 0.28634 }, { "acc": 0.74997816, "epoch": 1.9209028489458886, "grad_norm": 6.65625, "learning_rate": 4.268069530011853e-08, "loss": 0.98351669, "memory(GiB)": 142.32, "step": 171740, "train_speed(iter/s)": 0.286352 }, { "acc": 0.74332919, "epoch": 1.9211265478918471, "grad_norm": 5.96875, "learning_rate": 4.243990361552097e-08, "loss": 1.01947517, "memory(GiB)": 142.32, "step": 171760, "train_speed(iter/s)": 0.286361 }, { "acc": 0.72766361, "epoch": 1.9213502468378056, "grad_norm": 5.09375, "learning_rate": 4.219979019600418e-08, "loss": 1.08432474, "memory(GiB)": 142.32, "step": 171780, "train_speed(iter/s)": 0.286373 }, { "acc": 0.73556557, "epoch": 1.9215739457837642, "grad_norm": 6.3125, "learning_rate": 4.1960355074419644e-08, "loss": 1.04630051, "memory(GiB)": 142.32, "step": 171800, "train_speed(iter/s)": 0.286384 }, { "acc": 0.73712444, "epoch": 1.9217976447297227, "grad_norm": 5.28125, "learning_rate": 4.172159828352507e-08, "loss": 1.04960871, "memory(GiB)": 142.32, "step": 171820, "train_speed(iter/s)": 0.286395 }, { "acc": 0.74358325, "epoch": 1.9220213436756812, "grad_norm": 5.09375, "learning_rate": 4.148351985598598e-08, "loss": 1.01846371, "memory(GiB)": 142.32, "step": 171840, "train_speed(iter/s)": 0.286407 }, { "acc": 0.73018894, "epoch": 1.9222450426216398, "grad_norm": 6.0625, "learning_rate": 4.1246119824375205e-08, "loss": 1.07943029, "memory(GiB)": 142.32, "step": 171860, "train_speed(iter/s)": 0.286419 }, { "acc": 0.73448186, "epoch": 1.9224687415675983, "grad_norm": 8.4375, "learning_rate": 4.100939822117178e-08, "loss": 1.06331978, "memory(GiB)": 142.32, "step": 171880, "train_speed(iter/s)": 0.286431 }, { "acc": 0.7369781, "epoch": 1.9226924405135568, "grad_norm": 6.46875, "learning_rate": 4.077335507876312e-08, "loss": 1.05371389, "memory(GiB)": 142.32, "step": 171900, "train_speed(iter/s)": 0.286441 }, { "acc": 0.73867922, "epoch": 1.9229161394595153, "grad_norm": 5.15625, "learning_rate": 4.0537990429443394e-08, "loss": 1.03677807, "memory(GiB)": 142.32, "step": 171920, "train_speed(iter/s)": 0.286452 }, { "acc": 0.73432999, "epoch": 1.9231398384054739, "grad_norm": 6.5625, "learning_rate": 4.0303304305413516e-08, "loss": 1.06299973, "memory(GiB)": 142.32, "step": 171940, "train_speed(iter/s)": 0.286462 }, { "acc": 0.74776454, "epoch": 1.9233635373514324, "grad_norm": 5.5, "learning_rate": 4.006929673878168e-08, "loss": 0.99997501, "memory(GiB)": 142.32, "step": 171960, "train_speed(iter/s)": 0.286473 }, { "acc": 0.73692603, "epoch": 1.923587236297391, "grad_norm": 5.25, "learning_rate": 3.9835967761563954e-08, "loss": 1.04075356, "memory(GiB)": 142.32, "step": 171980, "train_speed(iter/s)": 0.286483 }, { "acc": 0.74432516, "epoch": 1.9238109352433495, "grad_norm": 6.1875, "learning_rate": 3.960331740568313e-08, "loss": 1.00998096, "memory(GiB)": 142.32, "step": 172000, "train_speed(iter/s)": 0.286489 }, { "epoch": 1.9238109352433495, "eval_acc": 0.6963811128345222, "eval_loss": 1.0713520050048828, "eval_runtime": 2338.5528, "eval_samples_per_second": 32.192, "eval_steps_per_second": 16.096, "step": 172000 }, { "acc": 0.74102001, "epoch": 1.924034634189308, "grad_norm": 7.09375, "learning_rate": 3.9371345702968746e-08, "loss": 1.0282732, "memory(GiB)": 142.32, "step": 172020, "train_speed(iter/s)": 0.285365 }, { "acc": 0.73900895, "epoch": 1.9242583331352665, "grad_norm": 5.625, "learning_rate": 3.9140052685157634e-08, "loss": 1.03498077, "memory(GiB)": 142.32, "step": 172040, "train_speed(iter/s)": 0.285376 }, { "acc": 0.74050884, "epoch": 1.924482032081225, "grad_norm": 6.28125, "learning_rate": 3.890943838389505e-08, "loss": 1.03066807, "memory(GiB)": 142.32, "step": 172060, "train_speed(iter/s)": 0.285386 }, { "acc": 0.74207311, "epoch": 1.9247057310271836, "grad_norm": 7.03125, "learning_rate": 3.8679502830731295e-08, "loss": 1.02955494, "memory(GiB)": 142.32, "step": 172080, "train_speed(iter/s)": 0.285398 }, { "acc": 0.73227935, "epoch": 1.924929429973142, "grad_norm": 5.375, "learning_rate": 3.845024605712511e-08, "loss": 1.04817886, "memory(GiB)": 142.32, "step": 172100, "train_speed(iter/s)": 0.285408 }, { "acc": 0.74299946, "epoch": 1.9251531289191006, "grad_norm": 7.28125, "learning_rate": 3.8221668094441947e-08, "loss": 1.02820549, "memory(GiB)": 142.32, "step": 172120, "train_speed(iter/s)": 0.285418 }, { "acc": 0.74251542, "epoch": 1.9253768278650591, "grad_norm": 6.0625, "learning_rate": 3.7993768973955125e-08, "loss": 1.01510792, "memory(GiB)": 142.32, "step": 172140, "train_speed(iter/s)": 0.28543 }, { "acc": 0.74675217, "epoch": 1.9256005268110177, "grad_norm": 5.4375, "learning_rate": 3.77665487268436e-08, "loss": 1.01252823, "memory(GiB)": 142.32, "step": 172160, "train_speed(iter/s)": 0.285442 }, { "acc": 0.74168468, "epoch": 1.9258242257569762, "grad_norm": 6.46875, "learning_rate": 3.7540007384195275e-08, "loss": 1.02939768, "memory(GiB)": 142.32, "step": 172180, "train_speed(iter/s)": 0.285452 }, { "acc": 0.73801022, "epoch": 1.9260479247029347, "grad_norm": 5.53125, "learning_rate": 3.731414497700425e-08, "loss": 1.03447876, "memory(GiB)": 142.32, "step": 172200, "train_speed(iter/s)": 0.285464 }, { "acc": 0.73336849, "epoch": 1.9262716236488933, "grad_norm": 6.0625, "learning_rate": 3.708896153617081e-08, "loss": 1.04877968, "memory(GiB)": 142.32, "step": 172220, "train_speed(iter/s)": 0.285475 }, { "acc": 0.74487376, "epoch": 1.9264953225948518, "grad_norm": 6.34375, "learning_rate": 3.6864457092503636e-08, "loss": 1.02231789, "memory(GiB)": 142.32, "step": 172240, "train_speed(iter/s)": 0.285486 }, { "acc": 0.73640556, "epoch": 1.9267190215408103, "grad_norm": 7.03125, "learning_rate": 3.664063167671816e-08, "loss": 1.04499607, "memory(GiB)": 142.32, "step": 172260, "train_speed(iter/s)": 0.285497 }, { "acc": 0.74440651, "epoch": 1.9269427204867688, "grad_norm": 6.59375, "learning_rate": 3.641748531943767e-08, "loss": 1.00817184, "memory(GiB)": 142.32, "step": 172280, "train_speed(iter/s)": 0.285509 }, { "acc": 0.73351774, "epoch": 1.9271664194327274, "grad_norm": 6.6875, "learning_rate": 3.619501805119108e-08, "loss": 1.05674133, "memory(GiB)": 142.32, "step": 172300, "train_speed(iter/s)": 0.28552 }, { "acc": 0.75140371, "epoch": 1.927390118378686, "grad_norm": 6.03125, "learning_rate": 3.5973229902414586e-08, "loss": 0.98975372, "memory(GiB)": 142.32, "step": 172320, "train_speed(iter/s)": 0.285531 }, { "acc": 0.7481998, "epoch": 1.9276138173246444, "grad_norm": 7.0625, "learning_rate": 3.575212090345281e-08, "loss": 0.99418259, "memory(GiB)": 142.32, "step": 172340, "train_speed(iter/s)": 0.285543 }, { "acc": 0.73569279, "epoch": 1.927837516270603, "grad_norm": 6.21875, "learning_rate": 3.553169108455601e-08, "loss": 1.04949455, "memory(GiB)": 142.32, "step": 172360, "train_speed(iter/s)": 0.285555 }, { "acc": 0.72913055, "epoch": 1.9280612152165615, "grad_norm": 5.8125, "learning_rate": 3.531194047588282e-08, "loss": 1.08144855, "memory(GiB)": 142.32, "step": 172380, "train_speed(iter/s)": 0.285565 }, { "acc": 0.74016056, "epoch": 1.92828491416252, "grad_norm": 6.21875, "learning_rate": 3.509286910749809e-08, "loss": 1.02086868, "memory(GiB)": 142.32, "step": 172400, "train_speed(iter/s)": 0.285576 }, { "acc": 0.74251766, "epoch": 1.9285086131084785, "grad_norm": 4.5625, "learning_rate": 3.487447700937341e-08, "loss": 1.01274128, "memory(GiB)": 142.32, "step": 172420, "train_speed(iter/s)": 0.285588 }, { "acc": 0.7371191, "epoch": 1.928732312054437, "grad_norm": 5.8125, "learning_rate": 3.4656764211388747e-08, "loss": 1.04506598, "memory(GiB)": 142.32, "step": 172440, "train_speed(iter/s)": 0.285599 }, { "acc": 0.73664389, "epoch": 1.9289560110003956, "grad_norm": 5.59375, "learning_rate": 3.443973074332918e-08, "loss": 1.05225315, "memory(GiB)": 142.32, "step": 172460, "train_speed(iter/s)": 0.28561 }, { "acc": 0.73446813, "epoch": 1.9291797099463541, "grad_norm": 5.78125, "learning_rate": 3.422337663488928e-08, "loss": 1.04647331, "memory(GiB)": 142.32, "step": 172480, "train_speed(iter/s)": 0.285622 }, { "acc": 0.74007349, "epoch": 1.9294034088923127, "grad_norm": 6.25, "learning_rate": 3.4007701915668156e-08, "loss": 1.02052574, "memory(GiB)": 142.32, "step": 172500, "train_speed(iter/s)": 0.285632 }, { "acc": 0.74299097, "epoch": 1.9296271078382712, "grad_norm": 6.0625, "learning_rate": 3.379270661517442e-08, "loss": 1.02259903, "memory(GiB)": 142.32, "step": 172520, "train_speed(iter/s)": 0.285644 }, { "acc": 0.72276454, "epoch": 1.9298508067842297, "grad_norm": 4.90625, "learning_rate": 3.357839076282121e-08, "loss": 1.12984486, "memory(GiB)": 142.32, "step": 172540, "train_speed(iter/s)": 0.285655 }, { "acc": 0.73643274, "epoch": 1.9300745057301882, "grad_norm": 6.3125, "learning_rate": 3.3364754387931186e-08, "loss": 1.04179316, "memory(GiB)": 142.32, "step": 172560, "train_speed(iter/s)": 0.285667 }, { "acc": 0.7501667, "epoch": 1.9302982046761468, "grad_norm": 5.6875, "learning_rate": 3.3151797519732074e-08, "loss": 0.98676815, "memory(GiB)": 142.32, "step": 172580, "train_speed(iter/s)": 0.285678 }, { "acc": 0.73428564, "epoch": 1.9305219036221053, "grad_norm": 5.96875, "learning_rate": 3.293952018736002e-08, "loss": 1.07236719, "memory(GiB)": 142.32, "step": 172600, "train_speed(iter/s)": 0.285689 }, { "acc": 0.72851977, "epoch": 1.9307456025680638, "grad_norm": 7.25, "learning_rate": 3.2727922419856784e-08, "loss": 1.09992714, "memory(GiB)": 142.32, "step": 172620, "train_speed(iter/s)": 0.2857 }, { "acc": 0.75043526, "epoch": 1.9309693015140224, "grad_norm": 6.28125, "learning_rate": 3.2517004246172544e-08, "loss": 0.98723469, "memory(GiB)": 142.32, "step": 172640, "train_speed(iter/s)": 0.285713 }, { "acc": 0.7413888, "epoch": 1.9311930004599809, "grad_norm": 6.6875, "learning_rate": 3.2306765695163666e-08, "loss": 1.022579, "memory(GiB)": 142.32, "step": 172660, "train_speed(iter/s)": 0.285725 }, { "acc": 0.75115914, "epoch": 1.9314166994059394, "grad_norm": 7.03125, "learning_rate": 3.209720679559436e-08, "loss": 0.98671341, "memory(GiB)": 142.32, "step": 172680, "train_speed(iter/s)": 0.285735 }, { "acc": 0.73895693, "epoch": 1.931640398351898, "grad_norm": 6.65625, "learning_rate": 3.1888327576135025e-08, "loss": 1.01734695, "memory(GiB)": 142.32, "step": 172700, "train_speed(iter/s)": 0.285745 }, { "acc": 0.7461813, "epoch": 1.9318640972978565, "grad_norm": 7.46875, "learning_rate": 3.1680128065362804e-08, "loss": 1.01896362, "memory(GiB)": 142.32, "step": 172720, "train_speed(iter/s)": 0.285757 }, { "acc": 0.73540916, "epoch": 1.932087796243815, "grad_norm": 6.53125, "learning_rate": 3.147260829176268e-08, "loss": 1.05655966, "memory(GiB)": 142.32, "step": 172740, "train_speed(iter/s)": 0.285768 }, { "acc": 0.72343216, "epoch": 1.9323114951897735, "grad_norm": 6.0625, "learning_rate": 3.126576828372641e-08, "loss": 1.1132246, "memory(GiB)": 142.32, "step": 172760, "train_speed(iter/s)": 0.28578 }, { "acc": 0.74263134, "epoch": 1.932535194135732, "grad_norm": 6.15625, "learning_rate": 3.105960806955244e-08, "loss": 1.02664061, "memory(GiB)": 142.32, "step": 172780, "train_speed(iter/s)": 0.285791 }, { "acc": 0.73655014, "epoch": 1.9327588930816906, "grad_norm": 5.96875, "learning_rate": 3.085412767744711e-08, "loss": 1.04457264, "memory(GiB)": 142.32, "step": 172800, "train_speed(iter/s)": 0.285803 }, { "acc": 0.73957329, "epoch": 1.932982592027649, "grad_norm": 6.0625, "learning_rate": 3.0649327135521824e-08, "loss": 1.02962933, "memory(GiB)": 142.32, "step": 172820, "train_speed(iter/s)": 0.285814 }, { "acc": 0.7282187, "epoch": 1.9332062909736076, "grad_norm": 5.9375, "learning_rate": 3.0445206471797496e-08, "loss": 1.0895895, "memory(GiB)": 142.32, "step": 172840, "train_speed(iter/s)": 0.285826 }, { "acc": 0.73671026, "epoch": 1.9334299899195662, "grad_norm": 6.0, "learning_rate": 3.0241765714200124e-08, "loss": 1.03458862, "memory(GiB)": 142.32, "step": 172860, "train_speed(iter/s)": 0.285837 }, { "acc": 0.73180661, "epoch": 1.9336536888655247, "grad_norm": 5.09375, "learning_rate": 3.003900489056355e-08, "loss": 1.06101532, "memory(GiB)": 142.32, "step": 172880, "train_speed(iter/s)": 0.285849 }, { "acc": 0.73545513, "epoch": 1.9338773878114832, "grad_norm": 6.21875, "learning_rate": 2.9836924028628364e-08, "loss": 1.04721241, "memory(GiB)": 142.32, "step": 172900, "train_speed(iter/s)": 0.285861 }, { "acc": 0.74085293, "epoch": 1.9341010867574417, "grad_norm": 6.1875, "learning_rate": 2.96355231560419e-08, "loss": 1.01154099, "memory(GiB)": 142.32, "step": 172920, "train_speed(iter/s)": 0.285873 }, { "acc": 0.72184372, "epoch": 1.9343247857034003, "grad_norm": 6.1875, "learning_rate": 2.9434802300358222e-08, "loss": 1.10786915, "memory(GiB)": 142.32, "step": 172940, "train_speed(iter/s)": 0.285883 }, { "acc": 0.72302847, "epoch": 1.9345484846493588, "grad_norm": 6.5, "learning_rate": 2.923476148903981e-08, "loss": 1.11610241, "memory(GiB)": 142.32, "step": 172960, "train_speed(iter/s)": 0.285895 }, { "acc": 0.73406181, "epoch": 1.9347721835953173, "grad_norm": 6.03125, "learning_rate": 2.9035400749454768e-08, "loss": 1.06777515, "memory(GiB)": 142.32, "step": 172980, "train_speed(iter/s)": 0.285905 }, { "acc": 0.74648867, "epoch": 1.9349958825412759, "grad_norm": 6.09375, "learning_rate": 2.8836720108877948e-08, "loss": 1.0102253, "memory(GiB)": 142.32, "step": 173000, "train_speed(iter/s)": 0.285918 }, { "acc": 0.73249092, "epoch": 1.9352195814872344, "grad_norm": 5.46875, "learning_rate": 2.8638719594492048e-08, "loss": 1.07071133, "memory(GiB)": 142.32, "step": 173020, "train_speed(iter/s)": 0.285928 }, { "acc": 0.72704301, "epoch": 1.935443280433193, "grad_norm": 5.53125, "learning_rate": 2.8441399233386513e-08, "loss": 1.08876181, "memory(GiB)": 142.32, "step": 173040, "train_speed(iter/s)": 0.285939 }, { "acc": 0.7364574, "epoch": 1.9356669793791514, "grad_norm": 6.53125, "learning_rate": 2.8244759052557526e-08, "loss": 1.04539032, "memory(GiB)": 142.32, "step": 173060, "train_speed(iter/s)": 0.28595 }, { "acc": 0.73562994, "epoch": 1.93589067832511, "grad_norm": 6.625, "learning_rate": 2.804879907890856e-08, "loss": 1.03886681, "memory(GiB)": 142.32, "step": 173080, "train_speed(iter/s)": 0.28596 }, { "acc": 0.72776599, "epoch": 1.9361143772710685, "grad_norm": 6.1875, "learning_rate": 2.785351933924929e-08, "loss": 1.06172705, "memory(GiB)": 142.32, "step": 173100, "train_speed(iter/s)": 0.285971 }, { "acc": 0.7288044, "epoch": 1.936338076217027, "grad_norm": 6.90625, "learning_rate": 2.7658919860296673e-08, "loss": 1.08716412, "memory(GiB)": 142.32, "step": 173120, "train_speed(iter/s)": 0.285983 }, { "acc": 0.730689, "epoch": 1.9365617751629856, "grad_norm": 7.0, "learning_rate": 2.7465000668675524e-08, "loss": 1.06102304, "memory(GiB)": 142.32, "step": 173140, "train_speed(iter/s)": 0.285996 }, { "acc": 0.73473845, "epoch": 1.936785474108944, "grad_norm": 6.1875, "learning_rate": 2.7271761790915728e-08, "loss": 1.05093861, "memory(GiB)": 142.32, "step": 173160, "train_speed(iter/s)": 0.286008 }, { "acc": 0.73806767, "epoch": 1.9370091730549026, "grad_norm": 6.1875, "learning_rate": 2.7079203253455587e-08, "loss": 1.03321819, "memory(GiB)": 142.32, "step": 173180, "train_speed(iter/s)": 0.28602 }, { "acc": 0.73493452, "epoch": 1.9372328720008611, "grad_norm": 6.1875, "learning_rate": 2.688732508263958e-08, "loss": 1.06272812, "memory(GiB)": 142.32, "step": 173200, "train_speed(iter/s)": 0.286031 }, { "acc": 0.73917866, "epoch": 1.9374565709468197, "grad_norm": 8.0, "learning_rate": 2.6696127304720044e-08, "loss": 1.03387461, "memory(GiB)": 142.32, "step": 173220, "train_speed(iter/s)": 0.286043 }, { "acc": 0.74521341, "epoch": 1.9376802698927782, "grad_norm": 4.53125, "learning_rate": 2.6505609945855494e-08, "loss": 1.03336239, "memory(GiB)": 142.32, "step": 173240, "train_speed(iter/s)": 0.286054 }, { "acc": 0.72985344, "epoch": 1.9379039688387367, "grad_norm": 6.03125, "learning_rate": 2.6315773032110638e-08, "loss": 1.06547108, "memory(GiB)": 142.32, "step": 173260, "train_speed(iter/s)": 0.286067 }, { "acc": 0.74675264, "epoch": 1.9381276677846953, "grad_norm": 5.96875, "learning_rate": 2.6126616589458585e-08, "loss": 1.00473547, "memory(GiB)": 142.32, "step": 173280, "train_speed(iter/s)": 0.286077 }, { "acc": 0.72990599, "epoch": 1.9383513667306538, "grad_norm": 4.96875, "learning_rate": 2.5938140643778086e-08, "loss": 1.08383083, "memory(GiB)": 142.32, "step": 173300, "train_speed(iter/s)": 0.286089 }, { "acc": 0.74223499, "epoch": 1.9385750656766123, "grad_norm": 5.375, "learning_rate": 2.5750345220856287e-08, "loss": 1.03717155, "memory(GiB)": 142.32, "step": 173320, "train_speed(iter/s)": 0.2861 }, { "acc": 0.7460072, "epoch": 1.9387987646225708, "grad_norm": 6.09375, "learning_rate": 2.556323034638486e-08, "loss": 1.02729197, "memory(GiB)": 142.32, "step": 173340, "train_speed(iter/s)": 0.28611 }, { "acc": 0.73970776, "epoch": 1.9390224635685294, "grad_norm": 5.6875, "learning_rate": 2.537679604596499e-08, "loss": 1.03682022, "memory(GiB)": 142.32, "step": 173360, "train_speed(iter/s)": 0.286121 }, { "acc": 0.73157558, "epoch": 1.939246162514488, "grad_norm": 5.5625, "learning_rate": 2.51910423451035e-08, "loss": 1.06591721, "memory(GiB)": 142.32, "step": 173380, "train_speed(iter/s)": 0.286133 }, { "acc": 0.75689154, "epoch": 1.9394698614604464, "grad_norm": 7.40625, "learning_rate": 2.5005969269213394e-08, "loss": 0.94871616, "memory(GiB)": 142.32, "step": 173400, "train_speed(iter/s)": 0.286144 }, { "acc": 0.74141588, "epoch": 1.939693560406405, "grad_norm": 7.125, "learning_rate": 2.482157684361608e-08, "loss": 1.03968334, "memory(GiB)": 142.32, "step": 173420, "train_speed(iter/s)": 0.286155 }, { "acc": 0.74307528, "epoch": 1.9399172593523635, "grad_norm": 5.21875, "learning_rate": 2.463786509353805e-08, "loss": 1.01091385, "memory(GiB)": 142.32, "step": 173440, "train_speed(iter/s)": 0.286167 }, { "acc": 0.73517275, "epoch": 1.940140958298322, "grad_norm": 6.65625, "learning_rate": 2.4454834044115305e-08, "loss": 1.05966167, "memory(GiB)": 142.32, "step": 173460, "train_speed(iter/s)": 0.286178 }, { "acc": 0.73873982, "epoch": 1.9403646572442805, "grad_norm": 6.0625, "learning_rate": 2.427248372038782e-08, "loss": 1.0582777, "memory(GiB)": 142.32, "step": 173480, "train_speed(iter/s)": 0.28619 }, { "acc": 0.73784986, "epoch": 1.940588356190239, "grad_norm": 5.46875, "learning_rate": 2.4090814147303964e-08, "loss": 1.05382919, "memory(GiB)": 142.32, "step": 173500, "train_speed(iter/s)": 0.2862 }, { "acc": 0.73969173, "epoch": 1.9408120551361976, "grad_norm": 6.0625, "learning_rate": 2.390982534971886e-08, "loss": 1.04256659, "memory(GiB)": 142.32, "step": 173520, "train_speed(iter/s)": 0.286212 }, { "acc": 0.73040857, "epoch": 1.9410357540821561, "grad_norm": 6.84375, "learning_rate": 2.3729517352394372e-08, "loss": 1.06920547, "memory(GiB)": 142.32, "step": 173540, "train_speed(iter/s)": 0.286223 }, { "acc": 0.74325342, "epoch": 1.9412594530281146, "grad_norm": 6.4375, "learning_rate": 2.354989017999909e-08, "loss": 1.00739193, "memory(GiB)": 142.32, "step": 173560, "train_speed(iter/s)": 0.286235 }, { "acc": 0.74187922, "epoch": 1.9414831519740732, "grad_norm": 6.375, "learning_rate": 2.337094385710892e-08, "loss": 1.01383877, "memory(GiB)": 142.32, "step": 173580, "train_speed(iter/s)": 0.286246 }, { "acc": 0.73159056, "epoch": 1.9417068509200317, "grad_norm": 5.84375, "learning_rate": 2.3192678408205937e-08, "loss": 1.06416893, "memory(GiB)": 142.32, "step": 173600, "train_speed(iter/s)": 0.286257 }, { "acc": 0.72416306, "epoch": 1.9419305498659902, "grad_norm": 5.0625, "learning_rate": 2.3015093857680083e-08, "loss": 1.10576229, "memory(GiB)": 142.32, "step": 173620, "train_speed(iter/s)": 0.286268 }, { "acc": 0.72866154, "epoch": 1.9421542488119488, "grad_norm": 6.65625, "learning_rate": 2.2838190229825807e-08, "loss": 1.08585682, "memory(GiB)": 142.32, "step": 173640, "train_speed(iter/s)": 0.286278 }, { "acc": 0.74575877, "epoch": 1.9423779477579073, "grad_norm": 6.59375, "learning_rate": 2.2661967548848197e-08, "loss": 0.99857388, "memory(GiB)": 142.32, "step": 173660, "train_speed(iter/s)": 0.28629 }, { "acc": 0.73325753, "epoch": 1.9426016467038658, "grad_norm": 5.75, "learning_rate": 2.2486425838855185e-08, "loss": 1.05060892, "memory(GiB)": 142.32, "step": 173680, "train_speed(iter/s)": 0.286302 }, { "acc": 0.73862467, "epoch": 1.9428253456498243, "grad_norm": 6.8125, "learning_rate": 2.2311565123864786e-08, "loss": 1.03737354, "memory(GiB)": 142.32, "step": 173700, "train_speed(iter/s)": 0.286313 }, { "acc": 0.73714008, "epoch": 1.9430490445957829, "grad_norm": 4.65625, "learning_rate": 2.213738542779953e-08, "loss": 1.04795475, "memory(GiB)": 142.32, "step": 173720, "train_speed(iter/s)": 0.286324 }, { "acc": 0.75160027, "epoch": 1.9432727435417414, "grad_norm": 6.34375, "learning_rate": 2.1963886774489794e-08, "loss": 0.9773015, "memory(GiB)": 142.32, "step": 173740, "train_speed(iter/s)": 0.286335 }, { "acc": 0.73500447, "epoch": 1.9434964424877, "grad_norm": 5.375, "learning_rate": 2.1791069187673264e-08, "loss": 1.03670635, "memory(GiB)": 142.32, "step": 173760, "train_speed(iter/s)": 0.286346 }, { "acc": 0.73586464, "epoch": 1.9437201414336585, "grad_norm": 5.4375, "learning_rate": 2.1618932690993244e-08, "loss": 1.04417477, "memory(GiB)": 142.32, "step": 173780, "train_speed(iter/s)": 0.286358 }, { "acc": 0.73930998, "epoch": 1.943943840379617, "grad_norm": 5.5, "learning_rate": 2.1447477308000896e-08, "loss": 1.03272381, "memory(GiB)": 142.32, "step": 173800, "train_speed(iter/s)": 0.286369 }, { "acc": 0.73652925, "epoch": 1.9441675393255755, "grad_norm": 5.4375, "learning_rate": 2.127670306215357e-08, "loss": 1.04643373, "memory(GiB)": 142.32, "step": 173820, "train_speed(iter/s)": 0.286381 }, { "acc": 0.73743277, "epoch": 1.944391238271534, "grad_norm": 6.15625, "learning_rate": 2.110660997681535e-08, "loss": 1.04158545, "memory(GiB)": 142.32, "step": 173840, "train_speed(iter/s)": 0.286392 }, { "acc": 0.73887472, "epoch": 1.9446149372174926, "grad_norm": 6.1875, "learning_rate": 2.0937198075257624e-08, "loss": 1.03364391, "memory(GiB)": 142.32, "step": 173860, "train_speed(iter/s)": 0.286403 }, { "acc": 0.7353302, "epoch": 1.944838636163451, "grad_norm": 5.40625, "learning_rate": 2.0768467380659073e-08, "loss": 1.05187969, "memory(GiB)": 142.32, "step": 173880, "train_speed(iter/s)": 0.286415 }, { "acc": 0.73139319, "epoch": 1.9450623351094096, "grad_norm": 6.5, "learning_rate": 2.06004179161029e-08, "loss": 1.06126699, "memory(GiB)": 142.32, "step": 173900, "train_speed(iter/s)": 0.286425 }, { "acc": 0.74000263, "epoch": 1.9452860340553682, "grad_norm": 5.3125, "learning_rate": 2.0433049704582375e-08, "loss": 1.03229961, "memory(GiB)": 142.32, "step": 173920, "train_speed(iter/s)": 0.286437 }, { "acc": 0.73517227, "epoch": 1.9455097330013267, "grad_norm": 5.71875, "learning_rate": 2.0266362768994184e-08, "loss": 1.05750904, "memory(GiB)": 142.32, "step": 173940, "train_speed(iter/s)": 0.286447 }, { "acc": 0.74511127, "epoch": 1.9457334319472852, "grad_norm": 6.25, "learning_rate": 2.0100357132145087e-08, "loss": 1.00217972, "memory(GiB)": 142.32, "step": 173960, "train_speed(iter/s)": 0.286458 }, { "acc": 0.74058218, "epoch": 1.9459571308932437, "grad_norm": 7.21875, "learning_rate": 1.9935032816746358e-08, "loss": 1.03035088, "memory(GiB)": 142.32, "step": 173980, "train_speed(iter/s)": 0.286469 }, { "acc": 0.74083571, "epoch": 1.9461808298392023, "grad_norm": 4.8125, "learning_rate": 1.9770389845416017e-08, "loss": 1.0376091, "memory(GiB)": 142.32, "step": 174000, "train_speed(iter/s)": 0.28648 }, { "epoch": 1.9461808298392023, "eval_acc": 0.6964031977359821, "eval_loss": 1.0713659524917603, "eval_runtime": 2339.2812, "eval_samples_per_second": 32.182, "eval_steps_per_second": 16.091, "step": 174000 }, { "acc": 0.72842636, "epoch": 1.9464045287851608, "grad_norm": 6.8125, "learning_rate": 1.9606428240680486e-08, "loss": 1.08881321, "memory(GiB)": 142.32, "step": 174020, "train_speed(iter/s)": 0.285369 }, { "acc": 0.741115, "epoch": 1.9466282277311193, "grad_norm": 6.09375, "learning_rate": 1.9443148024971827e-08, "loss": 1.01941633, "memory(GiB)": 142.32, "step": 174040, "train_speed(iter/s)": 0.285381 }, { "acc": 0.72748175, "epoch": 1.9468519266770778, "grad_norm": 7.21875, "learning_rate": 1.9280549220629386e-08, "loss": 1.09175625, "memory(GiB)": 142.32, "step": 174060, "train_speed(iter/s)": 0.285392 }, { "acc": 0.72693405, "epoch": 1.9470756256230364, "grad_norm": 4.84375, "learning_rate": 1.9118631849898707e-08, "loss": 1.10131626, "memory(GiB)": 142.32, "step": 174080, "train_speed(iter/s)": 0.285403 }, { "acc": 0.73665047, "epoch": 1.947299324568995, "grad_norm": 5.96875, "learning_rate": 1.8957395934932067e-08, "loss": 1.0533535, "memory(GiB)": 142.32, "step": 174100, "train_speed(iter/s)": 0.285415 }, { "acc": 0.72920771, "epoch": 1.9475230235149534, "grad_norm": 5.3125, "learning_rate": 1.8796841497789042e-08, "loss": 1.06513672, "memory(GiB)": 142.32, "step": 174120, "train_speed(iter/s)": 0.285428 }, { "acc": 0.73078861, "epoch": 1.947746722460912, "grad_norm": 5.9375, "learning_rate": 1.863696856043595e-08, "loss": 1.08311977, "memory(GiB)": 142.32, "step": 174140, "train_speed(iter/s)": 0.285438 }, { "acc": 0.73487139, "epoch": 1.9479704214068705, "grad_norm": 6.3125, "learning_rate": 1.8477777144745857e-08, "loss": 1.04111204, "memory(GiB)": 142.32, "step": 174160, "train_speed(iter/s)": 0.28545 }, { "acc": 0.73749595, "epoch": 1.948194120352829, "grad_norm": 5.40625, "learning_rate": 1.8319267272498552e-08, "loss": 1.05086231, "memory(GiB)": 142.32, "step": 174180, "train_speed(iter/s)": 0.285461 }, { "acc": 0.73995113, "epoch": 1.9484178192987875, "grad_norm": 5.6875, "learning_rate": 1.8161438965379474e-08, "loss": 1.04197674, "memory(GiB)": 142.32, "step": 174200, "train_speed(iter/s)": 0.285473 }, { "acc": 0.72816257, "epoch": 1.948641518244746, "grad_norm": 5.59375, "learning_rate": 1.8004292244983012e-08, "loss": 1.08518124, "memory(GiB)": 142.32, "step": 174220, "train_speed(iter/s)": 0.285484 }, { "acc": 0.72933531, "epoch": 1.9488652171907046, "grad_norm": 6.125, "learning_rate": 1.7847827132807526e-08, "loss": 1.07712097, "memory(GiB)": 142.32, "step": 174240, "train_speed(iter/s)": 0.285497 }, { "acc": 0.74192438, "epoch": 1.9490889161366631, "grad_norm": 6.46875, "learning_rate": 1.7692043650261447e-08, "loss": 1.02268095, "memory(GiB)": 142.32, "step": 174260, "train_speed(iter/s)": 0.285508 }, { "acc": 0.73822479, "epoch": 1.9493126150826217, "grad_norm": 8.0625, "learning_rate": 1.7536941818657173e-08, "loss": 1.04408588, "memory(GiB)": 142.32, "step": 174280, "train_speed(iter/s)": 0.285519 }, { "acc": 0.74206381, "epoch": 1.9495363140285802, "grad_norm": 6.65625, "learning_rate": 1.7382521659214946e-08, "loss": 1.03803825, "memory(GiB)": 142.32, "step": 174300, "train_speed(iter/s)": 0.28553 }, { "acc": 0.75158329, "epoch": 1.9497600129745387, "grad_norm": 6.25, "learning_rate": 1.7228783193062316e-08, "loss": 0.97012253, "memory(GiB)": 142.32, "step": 174320, "train_speed(iter/s)": 0.285539 }, { "acc": 0.73160281, "epoch": 1.9499837119204972, "grad_norm": 6.5625, "learning_rate": 1.70757264412319e-08, "loss": 1.06962948, "memory(GiB)": 142.32, "step": 174340, "train_speed(iter/s)": 0.28555 }, { "acc": 0.74043055, "epoch": 1.9502074108664558, "grad_norm": 6.1875, "learning_rate": 1.6923351424664724e-08, "loss": 1.02961235, "memory(GiB)": 142.32, "step": 174360, "train_speed(iter/s)": 0.285561 }, { "acc": 0.7289854, "epoch": 1.9504311098124143, "grad_norm": 6.21875, "learning_rate": 1.6771658164207448e-08, "loss": 1.09580803, "memory(GiB)": 142.32, "step": 174380, "train_speed(iter/s)": 0.285572 }, { "acc": 0.7276823, "epoch": 1.9506548087583728, "grad_norm": 5.09375, "learning_rate": 1.662064668061403e-08, "loss": 1.1086237, "memory(GiB)": 142.32, "step": 174400, "train_speed(iter/s)": 0.285583 }, { "acc": 0.73732891, "epoch": 1.9508785077043314, "grad_norm": 5.5625, "learning_rate": 1.6470316994545154e-08, "loss": 1.04749174, "memory(GiB)": 142.32, "step": 174420, "train_speed(iter/s)": 0.285594 }, { "acc": 0.73074636, "epoch": 1.9511022066502899, "grad_norm": 6.28125, "learning_rate": 1.632066912656771e-08, "loss": 1.08011713, "memory(GiB)": 142.32, "step": 174440, "train_speed(iter/s)": 0.285605 }, { "acc": 0.738239, "epoch": 1.9513259055962484, "grad_norm": 6.0, "learning_rate": 1.617170309715643e-08, "loss": 1.02489748, "memory(GiB)": 142.32, "step": 174460, "train_speed(iter/s)": 0.285616 }, { "acc": 0.73999701, "epoch": 1.951549604542207, "grad_norm": 5.625, "learning_rate": 1.6023418926691127e-08, "loss": 1.02703171, "memory(GiB)": 142.32, "step": 174480, "train_speed(iter/s)": 0.285627 }, { "acc": 0.73047557, "epoch": 1.9517733034881655, "grad_norm": 4.90625, "learning_rate": 1.587581663545945e-08, "loss": 1.07945595, "memory(GiB)": 142.32, "step": 174500, "train_speed(iter/s)": 0.285638 }, { "acc": 0.73448658, "epoch": 1.951997002434124, "grad_norm": 5.78125, "learning_rate": 1.572889624365581e-08, "loss": 1.06564484, "memory(GiB)": 142.32, "step": 174520, "train_speed(iter/s)": 0.285651 }, { "acc": 0.74053164, "epoch": 1.9522207013800825, "grad_norm": 5.65625, "learning_rate": 1.5582657771380793e-08, "loss": 1.03189774, "memory(GiB)": 142.32, "step": 174540, "train_speed(iter/s)": 0.285662 }, { "acc": 0.73871794, "epoch": 1.952444400326041, "grad_norm": 7.59375, "learning_rate": 1.5437101238642282e-08, "loss": 1.03808403, "memory(GiB)": 142.32, "step": 174560, "train_speed(iter/s)": 0.285673 }, { "acc": 0.73825397, "epoch": 1.9526680992719996, "grad_norm": 6.09375, "learning_rate": 1.5292226665353795e-08, "loss": 1.02766657, "memory(GiB)": 142.32, "step": 174580, "train_speed(iter/s)": 0.285684 }, { "acc": 0.74024158, "epoch": 1.952891798217958, "grad_norm": 3.921875, "learning_rate": 1.5148034071336137e-08, "loss": 1.03202801, "memory(GiB)": 142.32, "step": 174600, "train_speed(iter/s)": 0.285695 }, { "acc": 0.72797699, "epoch": 1.9531154971639166, "grad_norm": 8.5625, "learning_rate": 1.5004523476317978e-08, "loss": 1.09910736, "memory(GiB)": 142.32, "step": 174620, "train_speed(iter/s)": 0.285707 }, { "acc": 0.73985958, "epoch": 1.9533391961098752, "grad_norm": 7.75, "learning_rate": 1.4861694899933054e-08, "loss": 1.03705463, "memory(GiB)": 142.32, "step": 174640, "train_speed(iter/s)": 0.285716 }, { "acc": 0.74139118, "epoch": 1.9535628950558337, "grad_norm": 7.125, "learning_rate": 1.4719548361721847e-08, "loss": 1.03297606, "memory(GiB)": 142.32, "step": 174660, "train_speed(iter/s)": 0.285729 }, { "acc": 0.7527174, "epoch": 1.9537865940017924, "grad_norm": 5.78125, "learning_rate": 1.4578083881132687e-08, "loss": 0.9529974, "memory(GiB)": 142.32, "step": 174680, "train_speed(iter/s)": 0.285741 }, { "acc": 0.72949762, "epoch": 1.954010292947751, "grad_norm": 6.4375, "learning_rate": 1.4437301477520094e-08, "loss": 1.10103951, "memory(GiB)": 142.32, "step": 174700, "train_speed(iter/s)": 0.285752 }, { "acc": 0.74731331, "epoch": 1.9542339918937095, "grad_norm": 6.75, "learning_rate": 1.429720117014477e-08, "loss": 1.00550537, "memory(GiB)": 142.32, "step": 174720, "train_speed(iter/s)": 0.285762 }, { "acc": 0.73989601, "epoch": 1.954457690839668, "grad_norm": 6.71875, "learning_rate": 1.4157782978174162e-08, "loss": 1.0418973, "memory(GiB)": 142.32, "step": 174740, "train_speed(iter/s)": 0.285773 }, { "acc": 0.73073444, "epoch": 1.9546813897856266, "grad_norm": 6.375, "learning_rate": 1.401904692068301e-08, "loss": 1.07834435, "memory(GiB)": 142.32, "step": 174760, "train_speed(iter/s)": 0.285784 }, { "acc": 0.72966256, "epoch": 1.954905088731585, "grad_norm": 7.5625, "learning_rate": 1.388099301665169e-08, "loss": 1.07555733, "memory(GiB)": 142.32, "step": 174780, "train_speed(iter/s)": 0.285797 }, { "acc": 0.74814472, "epoch": 1.9551287876775436, "grad_norm": 6.5625, "learning_rate": 1.3743621284969533e-08, "loss": 0.97941942, "memory(GiB)": 142.32, "step": 174800, "train_speed(iter/s)": 0.285809 }, { "acc": 0.74566793, "epoch": 1.9553524866235021, "grad_norm": 7.09375, "learning_rate": 1.3606931744429841e-08, "loss": 1.01579247, "memory(GiB)": 142.32, "step": 174820, "train_speed(iter/s)": 0.28582 }, { "acc": 0.74364605, "epoch": 1.9555761855694607, "grad_norm": 5.5625, "learning_rate": 1.3470924413733211e-08, "loss": 1.01632242, "memory(GiB)": 142.32, "step": 174840, "train_speed(iter/s)": 0.285833 }, { "acc": 0.74550838, "epoch": 1.9557998845154192, "grad_norm": 5.75, "learning_rate": 1.3335599311488644e-08, "loss": 0.99844837, "memory(GiB)": 142.32, "step": 174860, "train_speed(iter/s)": 0.285845 }, { "acc": 0.73093424, "epoch": 1.9560235834613777, "grad_norm": 6.09375, "learning_rate": 1.3200956456209668e-08, "loss": 1.07797775, "memory(GiB)": 142.32, "step": 174880, "train_speed(iter/s)": 0.285856 }, { "acc": 0.73325758, "epoch": 1.9562472824073363, "grad_norm": 6.46875, "learning_rate": 1.306699586631821e-08, "loss": 1.07565737, "memory(GiB)": 142.32, "step": 174900, "train_speed(iter/s)": 0.285866 }, { "acc": 0.72946587, "epoch": 1.9564709813532948, "grad_norm": 6.78125, "learning_rate": 1.2933717560140723e-08, "loss": 1.07101793, "memory(GiB)": 142.32, "step": 174920, "train_speed(iter/s)": 0.28588 }, { "acc": 0.72850723, "epoch": 1.9566946802992533, "grad_norm": 6.34375, "learning_rate": 1.280112155591262e-08, "loss": 1.07512016, "memory(GiB)": 142.32, "step": 174940, "train_speed(iter/s)": 0.28589 }, { "acc": 0.73453808, "epoch": 1.9569183792452118, "grad_norm": 7.9375, "learning_rate": 1.2669207871774391e-08, "loss": 1.0540123, "memory(GiB)": 142.32, "step": 174960, "train_speed(iter/s)": 0.2859 }, { "acc": 0.75700288, "epoch": 1.9571420781911704, "grad_norm": 6.34375, "learning_rate": 1.2537976525774376e-08, "loss": 0.94423294, "memory(GiB)": 142.32, "step": 174980, "train_speed(iter/s)": 0.285911 }, { "acc": 0.7469244, "epoch": 1.957365777137129, "grad_norm": 5.0, "learning_rate": 1.240742753586599e-08, "loss": 0.99192247, "memory(GiB)": 142.32, "step": 175000, "train_speed(iter/s)": 0.285921 }, { "acc": 0.73778906, "epoch": 1.9575894760830874, "grad_norm": 5.875, "learning_rate": 1.2277560919910504e-08, "loss": 1.04740906, "memory(GiB)": 142.32, "step": 175020, "train_speed(iter/s)": 0.285931 }, { "acc": 0.7390439, "epoch": 1.957813175029046, "grad_norm": 5.96875, "learning_rate": 1.2148376695675923e-08, "loss": 1.02824211, "memory(GiB)": 142.32, "step": 175040, "train_speed(iter/s)": 0.285943 }, { "acc": 0.74233332, "epoch": 1.9580368739750045, "grad_norm": 5.59375, "learning_rate": 1.2019874880836446e-08, "loss": 1.02205582, "memory(GiB)": 142.32, "step": 175060, "train_speed(iter/s)": 0.285954 }, { "acc": 0.74909172, "epoch": 1.958260572920963, "grad_norm": 6.90625, "learning_rate": 1.1892055492972455e-08, "loss": 0.99131861, "memory(GiB)": 142.32, "step": 175080, "train_speed(iter/s)": 0.285964 }, { "acc": 0.72939663, "epoch": 1.9584842718669215, "grad_norm": 6.25, "learning_rate": 1.1764918549571624e-08, "loss": 1.09671307, "memory(GiB)": 142.32, "step": 175100, "train_speed(iter/s)": 0.285976 }, { "acc": 0.74326439, "epoch": 1.95870797081288, "grad_norm": 6.28125, "learning_rate": 1.1638464068028376e-08, "loss": 1.01327219, "memory(GiB)": 142.32, "step": 175120, "train_speed(iter/s)": 0.285986 }, { "acc": 0.73619823, "epoch": 1.9589316697588386, "grad_norm": 5.96875, "learning_rate": 1.1512692065643316e-08, "loss": 1.04018393, "memory(GiB)": 142.32, "step": 175140, "train_speed(iter/s)": 0.285997 }, { "acc": 0.74469185, "epoch": 1.9591553687047971, "grad_norm": 6.25, "learning_rate": 1.1387602559624345e-08, "loss": 1.01565628, "memory(GiB)": 142.32, "step": 175160, "train_speed(iter/s)": 0.286009 }, { "acc": 0.74230366, "epoch": 1.9593790676507556, "grad_norm": 5.875, "learning_rate": 1.126319556708444e-08, "loss": 1.01989841, "memory(GiB)": 142.32, "step": 175180, "train_speed(iter/s)": 0.286021 }, { "acc": 0.74284401, "epoch": 1.9596027665967142, "grad_norm": 5.875, "learning_rate": 1.1139471105044985e-08, "loss": 1.01779566, "memory(GiB)": 142.32, "step": 175200, "train_speed(iter/s)": 0.286033 }, { "acc": 0.73672857, "epoch": 1.9598264655426727, "grad_norm": 6.1875, "learning_rate": 1.1016429190433552e-08, "loss": 1.05044193, "memory(GiB)": 142.32, "step": 175220, "train_speed(iter/s)": 0.286043 }, { "acc": 0.74023905, "epoch": 1.9600501644886312, "grad_norm": 6.1875, "learning_rate": 1.0894069840083343e-08, "loss": 1.01302834, "memory(GiB)": 142.32, "step": 175240, "train_speed(iter/s)": 0.286053 }, { "acc": 0.7439992, "epoch": 1.9602738634345898, "grad_norm": 6.75, "learning_rate": 1.077239307073541e-08, "loss": 1.0165432, "memory(GiB)": 142.32, "step": 175260, "train_speed(iter/s)": 0.286064 }, { "acc": 0.73538275, "epoch": 1.9604975623805483, "grad_norm": 5.3125, "learning_rate": 1.065139889903588e-08, "loss": 1.05819464, "memory(GiB)": 142.32, "step": 175280, "train_speed(iter/s)": 0.286075 }, { "acc": 0.7352314, "epoch": 1.9607212613265068, "grad_norm": 5.65625, "learning_rate": 1.0531087341539847e-08, "loss": 1.06155186, "memory(GiB)": 142.32, "step": 175300, "train_speed(iter/s)": 0.286087 }, { "acc": 0.73586674, "epoch": 1.9609449602724653, "grad_norm": 6.34375, "learning_rate": 1.0411458414706921e-08, "loss": 1.05888042, "memory(GiB)": 142.32, "step": 175320, "train_speed(iter/s)": 0.286099 }, { "acc": 0.73443785, "epoch": 1.9611686592184239, "grad_norm": 6.21875, "learning_rate": 1.0292512134904009e-08, "loss": 1.06088047, "memory(GiB)": 142.32, "step": 175340, "train_speed(iter/s)": 0.286109 }, { "acc": 0.73484764, "epoch": 1.9613923581643824, "grad_norm": 6.1875, "learning_rate": 1.017424851840476e-08, "loss": 1.075881, "memory(GiB)": 142.32, "step": 175360, "train_speed(iter/s)": 0.28612 }, { "acc": 0.74040685, "epoch": 1.961616057110341, "grad_norm": 5.53125, "learning_rate": 1.0056667581389012e-08, "loss": 1.03159113, "memory(GiB)": 142.32, "step": 175380, "train_speed(iter/s)": 0.286132 }, { "acc": 0.73891344, "epoch": 1.9618397560562995, "grad_norm": 5.71875, "learning_rate": 9.93976933994445e-09, "loss": 1.02342892, "memory(GiB)": 142.32, "step": 175400, "train_speed(iter/s)": 0.286143 }, { "acc": 0.73387184, "epoch": 1.962063455002258, "grad_norm": 7.0625, "learning_rate": 9.82355381006328e-09, "loss": 1.05103779, "memory(GiB)": 142.32, "step": 175420, "train_speed(iter/s)": 0.286154 }, { "acc": 0.73628073, "epoch": 1.9622871539482165, "grad_norm": 4.8125, "learning_rate": 9.708021007646673e-09, "loss": 1.04207344, "memory(GiB)": 142.32, "step": 175440, "train_speed(iter/s)": 0.286165 }, { "acc": 0.74236741, "epoch": 1.962510852894175, "grad_norm": 6.4375, "learning_rate": 9.59317094850032e-09, "loss": 1.00205326, "memory(GiB)": 142.32, "step": 175460, "train_speed(iter/s)": 0.286175 }, { "acc": 0.72398605, "epoch": 1.9627345518401336, "grad_norm": 4.5625, "learning_rate": 9.479003648337204e-09, "loss": 1.11279678, "memory(GiB)": 142.32, "step": 175480, "train_speed(iter/s)": 0.286186 }, { "acc": 0.72924671, "epoch": 1.962958250786092, "grad_norm": 6.0625, "learning_rate": 9.36551912277761e-09, "loss": 1.0753624, "memory(GiB)": 142.32, "step": 175500, "train_speed(iter/s)": 0.286197 }, { "acc": 0.73531132, "epoch": 1.9631819497320506, "grad_norm": 6.375, "learning_rate": 9.252717387347455e-09, "loss": 1.0440176, "memory(GiB)": 142.32, "step": 175520, "train_speed(iter/s)": 0.286207 }, { "acc": 0.73639264, "epoch": 1.9634056486780092, "grad_norm": 5.96875, "learning_rate": 9.140598457479944e-09, "loss": 1.04552708, "memory(GiB)": 142.32, "step": 175540, "train_speed(iter/s)": 0.286217 }, { "acc": 0.73816242, "epoch": 1.9636293476239677, "grad_norm": 6.375, "learning_rate": 9.029162348514475e-09, "loss": 1.04587898, "memory(GiB)": 142.32, "step": 175560, "train_speed(iter/s)": 0.286228 }, { "acc": 0.73922558, "epoch": 1.9638530465699262, "grad_norm": 6.34375, "learning_rate": 8.918409075696632e-09, "loss": 1.03905354, "memory(GiB)": 142.32, "step": 175580, "train_speed(iter/s)": 0.28624 }, { "acc": 0.73846855, "epoch": 1.9640767455158847, "grad_norm": 5.5625, "learning_rate": 8.808338654179293e-09, "loss": 1.03496475, "memory(GiB)": 142.32, "step": 175600, "train_speed(iter/s)": 0.28625 }, { "acc": 0.74093132, "epoch": 1.9643004444618433, "grad_norm": 5.65625, "learning_rate": 8.698951099022079e-09, "loss": 1.03246269, "memory(GiB)": 142.32, "step": 175620, "train_speed(iter/s)": 0.286261 }, { "acc": 0.73448787, "epoch": 1.9645241434078018, "grad_norm": 6.125, "learning_rate": 8.590246425190797e-09, "loss": 1.05209599, "memory(GiB)": 142.32, "step": 175640, "train_speed(iter/s)": 0.286273 }, { "acc": 0.72278571, "epoch": 1.9647478423537603, "grad_norm": 5.28125, "learning_rate": 8.482224647557436e-09, "loss": 1.10229149, "memory(GiB)": 142.32, "step": 175660, "train_speed(iter/s)": 0.286285 }, { "acc": 0.73699517, "epoch": 1.9649715412997188, "grad_norm": 5.46875, "learning_rate": 8.374885780900177e-09, "loss": 1.06226206, "memory(GiB)": 142.32, "step": 175680, "train_speed(iter/s)": 0.286296 }, { "acc": 0.74098434, "epoch": 1.9651952402456774, "grad_norm": 7.4375, "learning_rate": 8.268229839906161e-09, "loss": 1.01249981, "memory(GiB)": 142.32, "step": 175700, "train_speed(iter/s)": 0.286305 }, { "acc": 0.73935027, "epoch": 1.965418939191636, "grad_norm": 4.75, "learning_rate": 8.162256839166494e-09, "loss": 1.03106194, "memory(GiB)": 142.32, "step": 175720, "train_speed(iter/s)": 0.286315 }, { "acc": 0.72956724, "epoch": 1.9656426381375944, "grad_norm": 6.09375, "learning_rate": 8.056966793179577e-09, "loss": 1.08564644, "memory(GiB)": 142.32, "step": 175740, "train_speed(iter/s)": 0.286327 }, { "acc": 0.74976988, "epoch": 1.965866337083553, "grad_norm": 7.875, "learning_rate": 7.95235971635111e-09, "loss": 1.00206079, "memory(GiB)": 142.32, "step": 175760, "train_speed(iter/s)": 0.286337 }, { "acc": 0.73001018, "epoch": 1.9660900360295115, "grad_norm": 7.28125, "learning_rate": 7.848435622992423e-09, "loss": 1.08151073, "memory(GiB)": 142.32, "step": 175780, "train_speed(iter/s)": 0.286348 }, { "acc": 0.74270449, "epoch": 1.96631373497547, "grad_norm": 6.09375, "learning_rate": 7.745194527322141e-09, "loss": 1.02196732, "memory(GiB)": 142.32, "step": 175800, "train_speed(iter/s)": 0.28636 }, { "acc": 0.74865093, "epoch": 1.9665374339214285, "grad_norm": 6.3125, "learning_rate": 7.64263644346508e-09, "loss": 0.98849573, "memory(GiB)": 142.32, "step": 175820, "train_speed(iter/s)": 0.286371 }, { "acc": 0.73524752, "epoch": 1.966761132867387, "grad_norm": 5.96875, "learning_rate": 7.540761385452233e-09, "loss": 1.04939804, "memory(GiB)": 142.32, "step": 175840, "train_speed(iter/s)": 0.286382 }, { "acc": 0.743887, "epoch": 1.9669848318133456, "grad_norm": 6.09375, "learning_rate": 7.4395693672224545e-09, "loss": 0.99978685, "memory(GiB)": 142.32, "step": 175860, "train_speed(iter/s)": 0.286393 }, { "acc": 0.74302149, "epoch": 1.9672085307593041, "grad_norm": 6.53125, "learning_rate": 7.339060402619668e-09, "loss": 1.03060064, "memory(GiB)": 142.32, "step": 175880, "train_speed(iter/s)": 0.286405 }, { "acc": 0.7340539, "epoch": 1.9674322297052627, "grad_norm": 5.78125, "learning_rate": 7.239234505394543e-09, "loss": 1.05179234, "memory(GiB)": 142.32, "step": 175900, "train_speed(iter/s)": 0.286416 }, { "acc": 0.74833941, "epoch": 1.9676559286512212, "grad_norm": 6.46875, "learning_rate": 7.1400916892055974e-09, "loss": 0.98697987, "memory(GiB)": 142.32, "step": 175920, "train_speed(iter/s)": 0.286428 }, { "acc": 0.74786482, "epoch": 1.9678796275971797, "grad_norm": 5.46875, "learning_rate": 7.0416319676164246e-09, "loss": 0.99864931, "memory(GiB)": 142.32, "step": 175940, "train_speed(iter/s)": 0.28644 }, { "acc": 0.74264688, "epoch": 1.9681033265431382, "grad_norm": 4.96875, "learning_rate": 6.943855354097362e-09, "loss": 1.02155762, "memory(GiB)": 142.32, "step": 175960, "train_speed(iter/s)": 0.286452 }, { "acc": 0.73431187, "epoch": 1.9683270254890968, "grad_norm": 6.21875, "learning_rate": 6.846761862026597e-09, "loss": 1.05050259, "memory(GiB)": 142.32, "step": 175980, "train_speed(iter/s)": 0.286461 }, { "acc": 0.74643164, "epoch": 1.9685507244350553, "grad_norm": 6.8125, "learning_rate": 6.7503515046873914e-09, "loss": 1.00934362, "memory(GiB)": 142.32, "step": 176000, "train_speed(iter/s)": 0.286473 }, { "epoch": 1.9685507244350553, "eval_acc": 0.6963874721030229, "eval_loss": 1.0713635683059692, "eval_runtime": 2336.9986, "eval_samples_per_second": 32.214, "eval_steps_per_second": 16.107, "step": 176000 }, { "acc": 0.74145699, "epoch": 1.9687744233810138, "grad_norm": 5.375, "learning_rate": 6.654624295269751e-09, "loss": 1.01185007, "memory(GiB)": 142.32, "step": 176020, "train_speed(iter/s)": 0.285375 }, { "acc": 0.74462662, "epoch": 1.9689981223269724, "grad_norm": 7.4375, "learning_rate": 6.559580246870978e-09, "loss": 1.02320156, "memory(GiB)": 142.32, "step": 176040, "train_speed(iter/s)": 0.285386 }, { "acc": 0.73821664, "epoch": 1.9692218212729309, "grad_norm": 5.90625, "learning_rate": 6.465219372494558e-09, "loss": 1.02282715, "memory(GiB)": 142.32, "step": 176060, "train_speed(iter/s)": 0.285398 }, { "acc": 0.74631104, "epoch": 1.9694455202188894, "grad_norm": 5.8125, "learning_rate": 6.371541685050164e-09, "loss": 1.0017065, "memory(GiB)": 142.32, "step": 176080, "train_speed(iter/s)": 0.285408 }, { "acc": 0.74035931, "epoch": 1.969669219164848, "grad_norm": 6.6875, "learning_rate": 6.278547197354212e-09, "loss": 1.03666687, "memory(GiB)": 142.32, "step": 176100, "train_speed(iter/s)": 0.28542 }, { "acc": 0.72664042, "epoch": 1.9698929181108065, "grad_norm": 5.40625, "learning_rate": 6.186235922129302e-09, "loss": 1.09097948, "memory(GiB)": 142.32, "step": 176120, "train_speed(iter/s)": 0.285432 }, { "acc": 0.74237695, "epoch": 1.970116617056765, "grad_norm": 5.0, "learning_rate": 6.094607872005886e-09, "loss": 1.02614136, "memory(GiB)": 142.32, "step": 176140, "train_speed(iter/s)": 0.285443 }, { "acc": 0.74100418, "epoch": 1.9703403160027235, "grad_norm": 6.65625, "learning_rate": 6.0036630595194936e-09, "loss": 1.02676086, "memory(GiB)": 142.32, "step": 176160, "train_speed(iter/s)": 0.285455 }, { "acc": 0.73579969, "epoch": 1.970564014948682, "grad_norm": 5.71875, "learning_rate": 5.913401497112392e-09, "loss": 1.04835091, "memory(GiB)": 142.32, "step": 176180, "train_speed(iter/s)": 0.285467 }, { "acc": 0.73031344, "epoch": 1.9707877138946406, "grad_norm": 6.34375, "learning_rate": 5.823823197133594e-09, "loss": 1.08686628, "memory(GiB)": 142.32, "step": 176200, "train_speed(iter/s)": 0.285478 }, { "acc": 0.74521341, "epoch": 1.971011412840599, "grad_norm": 6.25, "learning_rate": 5.7349281718394045e-09, "loss": 1.0233839, "memory(GiB)": 142.32, "step": 176220, "train_speed(iter/s)": 0.285488 }, { "acc": 0.74909611, "epoch": 1.9712351117865576, "grad_norm": 7.4375, "learning_rate": 5.646716433391763e-09, "loss": 0.98793144, "memory(GiB)": 142.32, "step": 176240, "train_speed(iter/s)": 0.2855 }, { "acc": 0.73263559, "epoch": 1.9714588107325162, "grad_norm": 6.0625, "learning_rate": 5.559187993858794e-09, "loss": 1.05533466, "memory(GiB)": 142.32, "step": 176260, "train_speed(iter/s)": 0.285511 }, { "acc": 0.73164835, "epoch": 1.9716825096784747, "grad_norm": 5.59375, "learning_rate": 5.472342865215918e-09, "loss": 1.06562672, "memory(GiB)": 142.32, "step": 176280, "train_speed(iter/s)": 0.285523 }, { "acc": 0.73597422, "epoch": 1.9719062086244332, "grad_norm": 6.6875, "learning_rate": 5.386181059344741e-09, "loss": 1.03987122, "memory(GiB)": 142.32, "step": 176300, "train_speed(iter/s)": 0.285533 }, { "acc": 0.74600477, "epoch": 1.9721299075703917, "grad_norm": 7.34375, "learning_rate": 5.3007025880336125e-09, "loss": 1.00083838, "memory(GiB)": 142.32, "step": 176320, "train_speed(iter/s)": 0.285544 }, { "acc": 0.741646, "epoch": 1.9723536065163503, "grad_norm": 6.5625, "learning_rate": 5.2159074629770655e-09, "loss": 1.02766228, "memory(GiB)": 142.32, "step": 176340, "train_speed(iter/s)": 0.285556 }, { "acc": 0.7423749, "epoch": 1.9725773054623088, "grad_norm": 6.625, "learning_rate": 5.1317956957763755e-09, "loss": 1.0475523, "memory(GiB)": 142.32, "step": 176360, "train_speed(iter/s)": 0.285564 }, { "acc": 0.73120055, "epoch": 1.9728010044082673, "grad_norm": 5.03125, "learning_rate": 5.0483672979390054e-09, "loss": 1.07517023, "memory(GiB)": 142.32, "step": 176380, "train_speed(iter/s)": 0.285575 }, { "acc": 0.73773489, "epoch": 1.9730247033542259, "grad_norm": 4.5625, "learning_rate": 4.965622280879157e-09, "loss": 1.04134293, "memory(GiB)": 142.32, "step": 176400, "train_speed(iter/s)": 0.285587 }, { "acc": 0.73583698, "epoch": 1.9732484023001844, "grad_norm": 6.1875, "learning_rate": 4.883560655917774e-09, "loss": 1.054039, "memory(GiB)": 142.32, "step": 176420, "train_speed(iter/s)": 0.285598 }, { "acc": 0.74270387, "epoch": 1.973472101246143, "grad_norm": 5.5, "learning_rate": 4.802182434281988e-09, "loss": 1.00855827, "memory(GiB)": 142.32, "step": 176440, "train_speed(iter/s)": 0.28561 }, { "acc": 0.75040717, "epoch": 1.9736958001921014, "grad_norm": 5.6875, "learning_rate": 4.721487627105669e-09, "loss": 1.00173302, "memory(GiB)": 142.32, "step": 176460, "train_speed(iter/s)": 0.28562 }, { "acc": 0.74554863, "epoch": 1.97391949913806, "grad_norm": 9.875, "learning_rate": 4.641476245428877e-09, "loss": 0.99881802, "memory(GiB)": 142.32, "step": 176480, "train_speed(iter/s)": 0.285631 }, { "acc": 0.73658485, "epoch": 1.9741431980840185, "grad_norm": 5.4375, "learning_rate": 4.562148300197855e-09, "loss": 1.05360966, "memory(GiB)": 142.32, "step": 176500, "train_speed(iter/s)": 0.285641 }, { "acc": 0.74653721, "epoch": 1.974366897029977, "grad_norm": 6.5, "learning_rate": 4.483503802266143e-09, "loss": 1.01018982, "memory(GiB)": 142.32, "step": 176520, "train_speed(iter/s)": 0.285653 }, { "acc": 0.73692465, "epoch": 1.9745905959759356, "grad_norm": 5.90625, "learning_rate": 4.4055427623940215e-09, "loss": 1.03880529, "memory(GiB)": 142.32, "step": 176540, "train_speed(iter/s)": 0.285665 }, { "acc": 0.74803019, "epoch": 1.974814294921894, "grad_norm": 6.875, "learning_rate": 4.32826519124685e-09, "loss": 0.99245577, "memory(GiB)": 142.32, "step": 176560, "train_speed(iter/s)": 0.285676 }, { "acc": 0.7402245, "epoch": 1.9750379938678526, "grad_norm": 6.125, "learning_rate": 4.251671099397836e-09, "loss": 1.03381119, "memory(GiB)": 142.32, "step": 176580, "train_speed(iter/s)": 0.285686 }, { "acc": 0.74382362, "epoch": 1.9752616928138111, "grad_norm": 5.25, "learning_rate": 4.175760497325265e-09, "loss": 1.01998892, "memory(GiB)": 142.32, "step": 176600, "train_speed(iter/s)": 0.285697 }, { "acc": 0.73538008, "epoch": 1.9754853917597697, "grad_norm": 6.40625, "learning_rate": 4.100533395415829e-09, "loss": 1.0411828, "memory(GiB)": 142.32, "step": 176620, "train_speed(iter/s)": 0.285709 }, { "acc": 0.74230356, "epoch": 1.9757090907057282, "grad_norm": 4.5625, "learning_rate": 4.025989803961294e-09, "loss": 1.02469845, "memory(GiB)": 142.32, "step": 176640, "train_speed(iter/s)": 0.28572 }, { "acc": 0.73516102, "epoch": 1.9759327896516867, "grad_norm": 6.53125, "learning_rate": 3.952129733160726e-09, "loss": 1.03164444, "memory(GiB)": 142.32, "step": 176660, "train_speed(iter/s)": 0.285733 }, { "acc": 0.73411446, "epoch": 1.9761564885976453, "grad_norm": 7.34375, "learning_rate": 3.878953193118262e-09, "loss": 1.06293507, "memory(GiB)": 142.32, "step": 176680, "train_speed(iter/s)": 0.285743 }, { "acc": 0.74488621, "epoch": 1.9763801875436038, "grad_norm": 6.03125, "learning_rate": 3.806460193845896e-09, "loss": 0.98984842, "memory(GiB)": 142.32, "step": 176700, "train_speed(iter/s)": 0.285753 }, { "acc": 0.73583417, "epoch": 1.9766038864895623, "grad_norm": 6.15625, "learning_rate": 3.734650745262358e-09, "loss": 1.04971743, "memory(GiB)": 142.32, "step": 176720, "train_speed(iter/s)": 0.285764 }, { "acc": 0.74210839, "epoch": 1.9768275854355208, "grad_norm": 6.125, "learning_rate": 3.6635248571914583e-09, "loss": 1.03148651, "memory(GiB)": 142.32, "step": 176740, "train_speed(iter/s)": 0.285775 }, { "acc": 0.7345706, "epoch": 1.9770512843814794, "grad_norm": 4.75, "learning_rate": 3.593082539364301e-09, "loss": 1.05787296, "memory(GiB)": 142.32, "step": 176760, "train_speed(iter/s)": 0.285787 }, { "acc": 0.73403602, "epoch": 1.977274983327438, "grad_norm": 5.375, "learning_rate": 3.523323801418732e-09, "loss": 1.0670536, "memory(GiB)": 142.32, "step": 176780, "train_speed(iter/s)": 0.285798 }, { "acc": 0.72479091, "epoch": 1.9774986822733964, "grad_norm": 6.21875, "learning_rate": 3.454248652898229e-09, "loss": 1.11584206, "memory(GiB)": 142.32, "step": 176800, "train_speed(iter/s)": 0.285808 }, { "acc": 0.73148775, "epoch": 1.977722381219355, "grad_norm": 5.59375, "learning_rate": 3.38585710325412e-09, "loss": 1.07826509, "memory(GiB)": 142.32, "step": 176820, "train_speed(iter/s)": 0.285819 }, { "acc": 0.74365978, "epoch": 1.9779460801653135, "grad_norm": 5.65625, "learning_rate": 3.3181491618422546e-09, "loss": 1.02291546, "memory(GiB)": 142.32, "step": 176840, "train_speed(iter/s)": 0.28583 }, { "acc": 0.73024511, "epoch": 1.978169779111272, "grad_norm": 5.96875, "learning_rate": 3.251124837926889e-09, "loss": 1.08441048, "memory(GiB)": 142.32, "step": 176860, "train_speed(iter/s)": 0.285841 }, { "acc": 0.72699451, "epoch": 1.9783934780572305, "grad_norm": 5.46875, "learning_rate": 3.18478414067791e-09, "loss": 1.09307432, "memory(GiB)": 142.32, "step": 176880, "train_speed(iter/s)": 0.285852 }, { "acc": 0.73123627, "epoch": 1.978617177003189, "grad_norm": 6.53125, "learning_rate": 3.1191270791713912e-09, "loss": 1.0901103, "memory(GiB)": 142.32, "step": 176900, "train_speed(iter/s)": 0.285862 }, { "acc": 0.74008427, "epoch": 1.9788408759491476, "grad_norm": 7.5625, "learning_rate": 3.0541536623895916e-09, "loss": 1.02502365, "memory(GiB)": 142.32, "step": 176920, "train_speed(iter/s)": 0.285874 }, { "acc": 0.7467979, "epoch": 1.9790645748951061, "grad_norm": 7.875, "learning_rate": 2.9898638992231776e-09, "loss": 0.98713608, "memory(GiB)": 142.32, "step": 176940, "train_speed(iter/s)": 0.285886 }, { "acc": 0.73766718, "epoch": 1.9792882738410646, "grad_norm": 7.65625, "learning_rate": 2.9262577984662255e-09, "loss": 1.03911552, "memory(GiB)": 142.32, "step": 176960, "train_speed(iter/s)": 0.285895 }, { "acc": 0.74580193, "epoch": 1.9795119727870232, "grad_norm": 5.6875, "learning_rate": 2.8633353688223285e-09, "loss": 1.00013342, "memory(GiB)": 142.32, "step": 176980, "train_speed(iter/s)": 0.285907 }, { "acc": 0.73230433, "epoch": 1.9797356717329817, "grad_norm": 6.21875, "learning_rate": 2.8010966189001567e-09, "loss": 1.06537361, "memory(GiB)": 142.32, "step": 177000, "train_speed(iter/s)": 0.285917 }, { "acc": 0.73624477, "epoch": 1.9799593706789402, "grad_norm": 6.125, "learning_rate": 2.739541557213454e-09, "loss": 1.07023478, "memory(GiB)": 142.32, "step": 177020, "train_speed(iter/s)": 0.285927 }, { "acc": 0.74868312, "epoch": 1.9801830696248988, "grad_norm": 5.625, "learning_rate": 2.678670192185484e-09, "loss": 0.9826807, "memory(GiB)": 142.32, "step": 177040, "train_speed(iter/s)": 0.285939 }, { "acc": 0.73196712, "epoch": 1.9804067685708573, "grad_norm": 6.15625, "learning_rate": 2.6184825321434737e-09, "loss": 1.06605434, "memory(GiB)": 142.32, "step": 177060, "train_speed(iter/s)": 0.285949 }, { "acc": 0.74335818, "epoch": 1.9806304675168158, "grad_norm": 5.65625, "learning_rate": 2.5589785853219474e-09, "loss": 1.01068134, "memory(GiB)": 142.32, "step": 177080, "train_speed(iter/s)": 0.285961 }, { "acc": 0.73794708, "epoch": 1.9808541664627743, "grad_norm": 6.0, "learning_rate": 2.5001583598621705e-09, "loss": 1.04977779, "memory(GiB)": 142.32, "step": 177100, "train_speed(iter/s)": 0.285972 }, { "acc": 0.7434391, "epoch": 1.9810778654087329, "grad_norm": 6.46875, "learning_rate": 2.4420218638110393e-09, "loss": 1.02961884, "memory(GiB)": 142.32, "step": 177120, "train_speed(iter/s)": 0.285983 }, { "acc": 0.73042917, "epoch": 1.9813015643546914, "grad_norm": 6.75, "learning_rate": 2.3845691051233022e-09, "loss": 1.08477945, "memory(GiB)": 142.32, "step": 177140, "train_speed(iter/s)": 0.285993 }, { "acc": 0.73902273, "epoch": 1.98152526330065, "grad_norm": 6.40625, "learning_rate": 2.327800091658783e-09, "loss": 1.03111296, "memory(GiB)": 142.32, "step": 177160, "train_speed(iter/s)": 0.286003 }, { "acc": 0.74311018, "epoch": 1.9817489622466087, "grad_norm": 5.15625, "learning_rate": 2.2717148311846016e-09, "loss": 1.02630329, "memory(GiB)": 142.32, "step": 177180, "train_speed(iter/s)": 0.286014 }, { "acc": 0.73649273, "epoch": 1.9819726611925672, "grad_norm": 6.25, "learning_rate": 2.2163133313740648e-09, "loss": 1.05937328, "memory(GiB)": 142.32, "step": 177200, "train_speed(iter/s)": 0.286025 }, { "acc": 0.74010873, "epoch": 1.9821963601385257, "grad_norm": 6.875, "learning_rate": 2.16159559980611e-09, "loss": 1.02862406, "memory(GiB)": 142.32, "step": 177220, "train_speed(iter/s)": 0.286036 }, { "acc": 0.74359264, "epoch": 1.9824200590844843, "grad_norm": 5.75, "learning_rate": 2.1075616439675263e-09, "loss": 1.02312546, "memory(GiB)": 142.32, "step": 177240, "train_speed(iter/s)": 0.286046 }, { "acc": 0.73221574, "epoch": 1.9826437580304428, "grad_norm": 8.6875, "learning_rate": 2.0542114712507333e-09, "loss": 1.0632618, "memory(GiB)": 142.32, "step": 177260, "train_speed(iter/s)": 0.286057 }, { "acc": 0.73891888, "epoch": 1.9828674569764013, "grad_norm": 5.46875, "learning_rate": 2.0015450889554477e-09, "loss": 1.04451962, "memory(GiB)": 142.32, "step": 177280, "train_speed(iter/s)": 0.286068 }, { "acc": 0.74086576, "epoch": 1.9830911559223598, "grad_norm": 5.9375, "learning_rate": 1.949562504285907e-09, "loss": 1.01912422, "memory(GiB)": 142.32, "step": 177300, "train_speed(iter/s)": 0.286079 }, { "acc": 0.7381465, "epoch": 1.9833148548683184, "grad_norm": 6.9375, "learning_rate": 1.8982637243553095e-09, "loss": 1.04638367, "memory(GiB)": 142.32, "step": 177320, "train_speed(iter/s)": 0.286091 }, { "acc": 0.74795995, "epoch": 1.983538553814277, "grad_norm": 5.59375, "learning_rate": 1.8476487561813749e-09, "loss": 0.99720793, "memory(GiB)": 142.32, "step": 177340, "train_speed(iter/s)": 0.286102 }, { "acc": 0.73623996, "epoch": 1.9837622527602354, "grad_norm": 6.1875, "learning_rate": 1.797717606689675e-09, "loss": 1.05333118, "memory(GiB)": 142.32, "step": 177360, "train_speed(iter/s)": 0.286113 }, { "acc": 0.73700457, "epoch": 1.983985951706194, "grad_norm": 6.5, "learning_rate": 1.7484702827103018e-09, "loss": 1.02814484, "memory(GiB)": 142.32, "step": 177380, "train_speed(iter/s)": 0.286125 }, { "acc": 0.73550105, "epoch": 1.9842096506521525, "grad_norm": 5.6875, "learning_rate": 1.6999067909823086e-09, "loss": 1.06020975, "memory(GiB)": 142.32, "step": 177400, "train_speed(iter/s)": 0.286136 }, { "acc": 0.73245835, "epoch": 1.984433349598111, "grad_norm": 6.53125, "learning_rate": 1.6520271381487152e-09, "loss": 1.05062637, "memory(GiB)": 142.32, "step": 177420, "train_speed(iter/s)": 0.286148 }, { "acc": 0.74267898, "epoch": 1.9846570485440695, "grad_norm": 6.28125, "learning_rate": 1.6048313307603925e-09, "loss": 1.02758141, "memory(GiB)": 142.32, "step": 177440, "train_speed(iter/s)": 0.286159 }, { "acc": 0.73170953, "epoch": 1.984880747490028, "grad_norm": 5.34375, "learning_rate": 1.5583193752755077e-09, "loss": 1.05838928, "memory(GiB)": 142.32, "step": 177460, "train_speed(iter/s)": 0.286171 }, { "acc": 0.72439413, "epoch": 1.9851044464359866, "grad_norm": 5.8125, "learning_rate": 1.512491278056194e-09, "loss": 1.1163558, "memory(GiB)": 142.32, "step": 177480, "train_speed(iter/s)": 0.286182 }, { "acc": 0.73941545, "epoch": 1.9853281453819451, "grad_norm": 6.21875, "learning_rate": 1.467347045372991e-09, "loss": 1.041572, "memory(GiB)": 142.32, "step": 177500, "train_speed(iter/s)": 0.286193 }, { "acc": 0.74092836, "epoch": 1.9855518443279037, "grad_norm": 6.40625, "learning_rate": 1.4228866834020693e-09, "loss": 1.026968, "memory(GiB)": 142.32, "step": 177520, "train_speed(iter/s)": 0.286203 }, { "acc": 0.72329054, "epoch": 1.9857755432738622, "grad_norm": 5.96875, "learning_rate": 1.3791101982268962e-09, "loss": 1.10552216, "memory(GiB)": 142.32, "step": 177540, "train_speed(iter/s)": 0.286214 }, { "acc": 0.74682989, "epoch": 1.9859992422198207, "grad_norm": 5.84375, "learning_rate": 1.3360175958354593e-09, "loss": 1.00895891, "memory(GiB)": 142.32, "step": 177560, "train_speed(iter/s)": 0.286225 }, { "acc": 0.7290226, "epoch": 1.9862229411657792, "grad_norm": 6.15625, "learning_rate": 1.2936088821247083e-09, "loss": 1.07235746, "memory(GiB)": 142.32, "step": 177580, "train_speed(iter/s)": 0.286236 }, { "acc": 0.72528257, "epoch": 1.9864466401117378, "grad_norm": 7.15625, "learning_rate": 1.2518840628961137e-09, "loss": 1.10422516, "memory(GiB)": 142.32, "step": 177600, "train_speed(iter/s)": 0.286245 }, { "acc": 0.73809347, "epoch": 1.9866703390576963, "grad_norm": 7.21875, "learning_rate": 1.2108431438584423e-09, "loss": 1.03391666, "memory(GiB)": 142.32, "step": 177620, "train_speed(iter/s)": 0.286256 }, { "acc": 0.73941565, "epoch": 1.9868940380036548, "grad_norm": 6.28125, "learning_rate": 1.1704861306266469e-09, "loss": 1.0473424, "memory(GiB)": 142.32, "step": 177640, "train_speed(iter/s)": 0.286266 }, { "acc": 0.72805877, "epoch": 1.9871177369496134, "grad_norm": 4.375, "learning_rate": 1.1308130287218666e-09, "loss": 1.08293266, "memory(GiB)": 142.32, "step": 177660, "train_speed(iter/s)": 0.286277 }, { "acc": 0.74333458, "epoch": 1.9873414358955719, "grad_norm": 6.125, "learning_rate": 1.091823843571982e-09, "loss": 1.00878162, "memory(GiB)": 142.32, "step": 177680, "train_speed(iter/s)": 0.286287 }, { "acc": 0.73663478, "epoch": 1.9875651348415304, "grad_norm": 5.875, "learning_rate": 1.0535185805121695e-09, "loss": 1.06401711, "memory(GiB)": 142.32, "step": 177700, "train_speed(iter/s)": 0.286297 }, { "acc": 0.7429739, "epoch": 1.987788833787489, "grad_norm": 7.0, "learning_rate": 1.0158972447815718e-09, "loss": 1.0178688, "memory(GiB)": 142.32, "step": 177720, "train_speed(iter/s)": 0.286308 }, { "acc": 0.73770761, "epoch": 1.9880125327334475, "grad_norm": 7.53125, "learning_rate": 9.789598415288482e-10, "loss": 1.04618416, "memory(GiB)": 142.32, "step": 177740, "train_speed(iter/s)": 0.286318 }, { "acc": 0.74811978, "epoch": 1.988236231679406, "grad_norm": 5.625, "learning_rate": 9.427063758066235e-10, "loss": 0.99421062, "memory(GiB)": 142.32, "step": 177760, "train_speed(iter/s)": 0.286329 }, { "acc": 0.74177213, "epoch": 1.9884599306253645, "grad_norm": 5.9375, "learning_rate": 9.071368525753743e-10, "loss": 1.03141222, "memory(GiB)": 142.32, "step": 177780, "train_speed(iter/s)": 0.286339 }, { "acc": 0.7428196, "epoch": 1.988683629571323, "grad_norm": 5.9375, "learning_rate": 8.72251276700653e-10, "loss": 1.01723566, "memory(GiB)": 142.32, "step": 177800, "train_speed(iter/s)": 0.28635 }, { "acc": 0.74426622, "epoch": 1.9889073285172816, "grad_norm": 5.96875, "learning_rate": 8.380496529564186e-10, "loss": 1.01369648, "memory(GiB)": 142.32, "step": 177820, "train_speed(iter/s)": 0.28636 }, { "acc": 0.74313393, "epoch": 1.98913102746324, "grad_norm": 6.75, "learning_rate": 8.045319860217061e-10, "loss": 1.03084793, "memory(GiB)": 142.32, "step": 177840, "train_speed(iter/s)": 0.28637 }, { "acc": 0.72903309, "epoch": 1.9893547264091986, "grad_norm": 4.78125, "learning_rate": 7.716982804817364e-10, "loss": 1.08766499, "memory(GiB)": 142.32, "step": 177860, "train_speed(iter/s)": 0.286381 }, { "acc": 0.73258009, "epoch": 1.9895784253551572, "grad_norm": 4.96875, "learning_rate": 7.395485408290271e-10, "loss": 1.08045006, "memory(GiB)": 142.32, "step": 177880, "train_speed(iter/s)": 0.286392 }, { "acc": 0.72404213, "epoch": 1.9898021243011157, "grad_norm": 5.34375, "learning_rate": 7.080827714617266e-10, "loss": 1.10645599, "memory(GiB)": 142.32, "step": 177900, "train_speed(iter/s)": 0.286404 }, { "acc": 0.73752079, "epoch": 1.9900258232470742, "grad_norm": 6.46875, "learning_rate": 6.77300976684725e-10, "loss": 1.03371038, "memory(GiB)": 142.32, "step": 177920, "train_speed(iter/s)": 0.286418 }, { "acc": 0.72861977, "epoch": 1.9902495221930327, "grad_norm": 6.15625, "learning_rate": 6.472031607102081e-10, "loss": 1.08699799, "memory(GiB)": 142.32, "step": 177940, "train_speed(iter/s)": 0.286428 }, { "acc": 0.73387947, "epoch": 1.9904732211389913, "grad_norm": 6.875, "learning_rate": 6.17789327655438e-10, "loss": 1.05481796, "memory(GiB)": 142.32, "step": 177960, "train_speed(iter/s)": 0.286439 }, { "acc": 0.73295517, "epoch": 1.9906969200849498, "grad_norm": 6.3125, "learning_rate": 5.890594815449735e-10, "loss": 1.06257877, "memory(GiB)": 142.32, "step": 177980, "train_speed(iter/s)": 0.28645 }, { "acc": 0.75113792, "epoch": 1.9909206190309083, "grad_norm": 5.75, "learning_rate": 5.610136263090038e-10, "loss": 0.97541809, "memory(GiB)": 142.32, "step": 178000, "train_speed(iter/s)": 0.286461 }, { "epoch": 1.9909206190309083, "eval_acc": 0.6963812114278323, "eval_loss": 1.0713574886322021, "eval_runtime": 2338.963, "eval_samples_per_second": 32.186, "eval_steps_per_second": 16.093, "step": 178000 } ], "logging_steps": 20, "max_steps": 178810, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 2000, "total_flos": 3.2669462371395174e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }