{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999149532682873, "eval_steps": 400, "global_step": 10655, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005630680168568487, "grad_norm": 2.0098321437835693, "learning_rate": 1.5196998123827392e-06, "loss": 3.0138, "step": 6 }, { "epoch": 0.0011261360337136974, "grad_norm": 1.8856154680252075, "learning_rate": 3.0393996247654785e-06, "loss": 2.874, "step": 12 }, { "epoch": 0.0016892040505705464, "grad_norm": 1.1700998544692993, "learning_rate": 4.559099437148218e-06, "loss": 2.8487, "step": 18 }, { "epoch": 0.002252272067427395, "grad_norm": 1.0079023838043213, "learning_rate": 6.078799249530957e-06, "loss": 2.6911, "step": 24 }, { "epoch": 0.0028153400842842436, "grad_norm": 0.7912698984146118, "learning_rate": 7.598499061913696e-06, "loss": 2.658, "step": 30 }, { "epoch": 0.0033784081011410927, "grad_norm": 0.6042062640190125, "learning_rate": 9.118198874296435e-06, "loss": 2.5924, "step": 36 }, { "epoch": 0.0039414761179979415, "grad_norm": 0.5435839891433716, "learning_rate": 1.0637898686679175e-05, "loss": 2.4944, "step": 42 }, { "epoch": 0.00450454413485479, "grad_norm": 0.516063928604126, "learning_rate": 1.2157598499061914e-05, "loss": 2.5139, "step": 48 }, { "epoch": 0.005067612151711639, "grad_norm": 0.5141405463218689, "learning_rate": 1.3677298311444653e-05, "loss": 2.4391, "step": 54 }, { "epoch": 0.005630680168568487, "grad_norm": 0.4787834584712982, "learning_rate": 1.5196998123827392e-05, "loss": 2.4051, "step": 60 }, { "epoch": 0.006193748185425336, "grad_norm": 0.4127436578273773, "learning_rate": 1.671669793621013e-05, "loss": 2.3656, "step": 66 }, { "epoch": 0.0067568162022821855, "grad_norm": 0.436476469039917, "learning_rate": 1.823639774859287e-05, "loss": 2.4106, "step": 72 }, { "epoch": 0.007319884219139034, "grad_norm": 0.4225737750530243, "learning_rate": 1.975609756097561e-05, "loss": 2.3683, "step": 78 }, { "epoch": 0.007882952235995883, "grad_norm": 0.4904423952102661, "learning_rate": 2.127579737335835e-05, "loss": 2.3596, "step": 84 }, { "epoch": 0.008446020252852732, "grad_norm": 0.4283861517906189, "learning_rate": 2.2795497185741085e-05, "loss": 2.3585, "step": 90 }, { "epoch": 0.00900908826970958, "grad_norm": 0.4704267680644989, "learning_rate": 2.4315196998123828e-05, "loss": 2.3056, "step": 96 }, { "epoch": 0.009572156286566429, "grad_norm": 0.4405248761177063, "learning_rate": 2.5834896810506567e-05, "loss": 2.3288, "step": 102 }, { "epoch": 0.010135224303423278, "grad_norm": 0.40695321559906006, "learning_rate": 2.7354596622889306e-05, "loss": 2.2924, "step": 108 }, { "epoch": 0.010698292320280127, "grad_norm": 0.4295126497745514, "learning_rate": 2.8874296435272046e-05, "loss": 2.3041, "step": 114 }, { "epoch": 0.011261360337136974, "grad_norm": 0.4459642171859741, "learning_rate": 3.0393996247654785e-05, "loss": 2.3286, "step": 120 }, { "epoch": 0.011824428353993824, "grad_norm": 0.4520178437232971, "learning_rate": 3.191369606003753e-05, "loss": 2.2717, "step": 126 }, { "epoch": 0.012387496370850673, "grad_norm": 0.42942941188812256, "learning_rate": 3.343339587242026e-05, "loss": 2.2997, "step": 132 }, { "epoch": 0.012950564387707522, "grad_norm": 0.5361221432685852, "learning_rate": 3.4953095684803e-05, "loss": 2.2425, "step": 138 }, { "epoch": 0.013513632404564371, "grad_norm": 0.4384731352329254, "learning_rate": 3.647279549718574e-05, "loss": 2.2655, "step": 144 }, { "epoch": 0.014076700421421218, "grad_norm": 0.5022804141044617, "learning_rate": 3.799249530956848e-05, "loss": 2.2439, "step": 150 }, { "epoch": 0.014639768438278068, "grad_norm": 0.4743802845478058, "learning_rate": 3.951219512195122e-05, "loss": 2.3152, "step": 156 }, { "epoch": 0.015202836455134917, "grad_norm": 0.47555288672447205, "learning_rate": 4.103189493433396e-05, "loss": 2.2911, "step": 162 }, { "epoch": 0.015765904471991766, "grad_norm": 0.47877249121665955, "learning_rate": 4.25515947467167e-05, "loss": 2.3031, "step": 168 }, { "epoch": 0.016328972488848615, "grad_norm": 0.5176670551300049, "learning_rate": 4.4071294559099435e-05, "loss": 2.2395, "step": 174 }, { "epoch": 0.016892040505705464, "grad_norm": 0.4753139317035675, "learning_rate": 4.559099437148217e-05, "loss": 2.2721, "step": 180 }, { "epoch": 0.01745510852256231, "grad_norm": 0.6060550212860107, "learning_rate": 4.711069418386492e-05, "loss": 2.2489, "step": 186 }, { "epoch": 0.01801817653941916, "grad_norm": 0.5679858326911926, "learning_rate": 4.8630393996247656e-05, "loss": 2.242, "step": 192 }, { "epoch": 0.018581244556276008, "grad_norm": 0.48603692650794983, "learning_rate": 5.015009380863039e-05, "loss": 2.25, "step": 198 }, { "epoch": 0.019144312573132857, "grad_norm": 0.48188212513923645, "learning_rate": 5.1669793621013134e-05, "loss": 2.2376, "step": 204 }, { "epoch": 0.019707380589989706, "grad_norm": 0.441798597574234, "learning_rate": 5.318949343339588e-05, "loss": 2.2407, "step": 210 }, { "epoch": 0.020270448606846556, "grad_norm": 0.4708426296710968, "learning_rate": 5.470919324577861e-05, "loss": 2.2832, "step": 216 }, { "epoch": 0.020833516623703405, "grad_norm": 0.5132296085357666, "learning_rate": 5.622889305816135e-05, "loss": 2.2816, "step": 222 }, { "epoch": 0.021396584640560254, "grad_norm": 0.4854268729686737, "learning_rate": 5.774859287054409e-05, "loss": 2.2528, "step": 228 }, { "epoch": 0.021959652657417103, "grad_norm": 0.6619467735290527, "learning_rate": 5.926829268292683e-05, "loss": 2.2416, "step": 234 }, { "epoch": 0.02252272067427395, "grad_norm": 0.4827911853790283, "learning_rate": 6.078799249530957e-05, "loss": 2.242, "step": 240 }, { "epoch": 0.023085788691130798, "grad_norm": 0.5112887620925903, "learning_rate": 6.23076923076923e-05, "loss": 2.2551, "step": 246 }, { "epoch": 0.023648856707987647, "grad_norm": 0.5310215353965759, "learning_rate": 6.382739212007505e-05, "loss": 2.2418, "step": 252 }, { "epoch": 0.024211924724844496, "grad_norm": 0.5903606414794922, "learning_rate": 6.534709193245779e-05, "loss": 2.2273, "step": 258 }, { "epoch": 0.024774992741701345, "grad_norm": 0.5666975378990173, "learning_rate": 6.686679174484053e-05, "loss": 2.2269, "step": 264 }, { "epoch": 0.025338060758558194, "grad_norm": 0.5110777020454407, "learning_rate": 6.838649155722326e-05, "loss": 2.2108, "step": 270 }, { "epoch": 0.025901128775415044, "grad_norm": 0.5167705416679382, "learning_rate": 6.9906191369606e-05, "loss": 2.2251, "step": 276 }, { "epoch": 0.026464196792271893, "grad_norm": 0.6298703551292419, "learning_rate": 7.142589118198875e-05, "loss": 2.1766, "step": 282 }, { "epoch": 0.027027264809128742, "grad_norm": 0.4743683636188507, "learning_rate": 7.294559099437148e-05, "loss": 2.1973, "step": 288 }, { "epoch": 0.027590332825985588, "grad_norm": 0.5053961873054504, "learning_rate": 7.446529080675422e-05, "loss": 2.2652, "step": 294 }, { "epoch": 0.028153400842842437, "grad_norm": 0.5965043306350708, "learning_rate": 7.598499061913696e-05, "loss": 2.2155, "step": 300 }, { "epoch": 0.028716468859699286, "grad_norm": 0.6304689049720764, "learning_rate": 7.75046904315197e-05, "loss": 2.1703, "step": 306 }, { "epoch": 0.029279536876556135, "grad_norm": 0.6420917510986328, "learning_rate": 7.902439024390244e-05, "loss": 2.2406, "step": 312 }, { "epoch": 0.029842604893412984, "grad_norm": 0.541969358921051, "learning_rate": 8.054409005628518e-05, "loss": 2.2174, "step": 318 }, { "epoch": 0.030405672910269833, "grad_norm": 0.6104909181594849, "learning_rate": 8.206378986866793e-05, "loss": 2.213, "step": 324 }, { "epoch": 0.030968740927126683, "grad_norm": 0.5623666048049927, "learning_rate": 8.358348968105066e-05, "loss": 2.2161, "step": 330 }, { "epoch": 0.03153180894398353, "grad_norm": 0.5236659049987793, "learning_rate": 8.51031894934334e-05, "loss": 2.1963, "step": 336 }, { "epoch": 0.03209487696084038, "grad_norm": 0.506647527217865, "learning_rate": 8.662288930581613e-05, "loss": 2.2562, "step": 342 }, { "epoch": 0.03265794497769723, "grad_norm": 0.47973525524139404, "learning_rate": 8.814258911819887e-05, "loss": 2.2007, "step": 348 }, { "epoch": 0.03322101299455408, "grad_norm": 0.5386266708374023, "learning_rate": 8.96622889305816e-05, "loss": 2.1867, "step": 354 }, { "epoch": 0.03378408101141093, "grad_norm": 0.6700956225395203, "learning_rate": 9.118198874296434e-05, "loss": 2.1987, "step": 360 }, { "epoch": 0.03434714902826778, "grad_norm": 0.5135593414306641, "learning_rate": 9.27016885553471e-05, "loss": 2.2231, "step": 366 }, { "epoch": 0.03491021704512462, "grad_norm": 0.5517572164535522, "learning_rate": 9.422138836772984e-05, "loss": 2.2385, "step": 372 }, { "epoch": 0.03547328506198147, "grad_norm": 0.6875866651535034, "learning_rate": 9.574108818011258e-05, "loss": 2.2392, "step": 378 }, { "epoch": 0.03603635307883832, "grad_norm": 0.4503846764564514, "learning_rate": 9.726078799249531e-05, "loss": 2.2224, "step": 384 }, { "epoch": 0.03659942109569517, "grad_norm": 0.48854905366897583, "learning_rate": 9.878048780487805e-05, "loss": 2.1978, "step": 390 }, { "epoch": 0.037162489112552016, "grad_norm": 0.4786721169948578, "learning_rate": 0.00010030018761726078, "loss": 2.1884, "step": 396 }, { "epoch": 0.037537867790456585, "eval_accuracy": 0.546890386869871, "eval_loss": 2.230736494064331, "eval_runtime": 87.4569, "eval_samples_per_second": 4.574, "eval_steps_per_second": 1.143, "step": 400 }, { "epoch": 0.037725557129408865, "grad_norm": 0.5907689929008484, "learning_rate": 0.00010181988742964352, "loss": 2.1913, "step": 402 }, { "epoch": 0.038288625146265715, "grad_norm": 0.5317164063453674, "learning_rate": 0.00010333958724202627, "loss": 2.2098, "step": 408 }, { "epoch": 0.038851693163122564, "grad_norm": 0.513721764087677, "learning_rate": 0.00010485928705440902, "loss": 2.1646, "step": 414 }, { "epoch": 0.03941476117997941, "grad_norm": 0.4749634563922882, "learning_rate": 0.00010637898686679175, "loss": 2.2116, "step": 420 }, { "epoch": 0.03997782919683626, "grad_norm": 0.594358503818512, "learning_rate": 0.00010789868667917449, "loss": 2.2123, "step": 426 }, { "epoch": 0.04054089721369311, "grad_norm": 0.8683858513832092, "learning_rate": 0.00010941838649155723, "loss": 2.1919, "step": 432 }, { "epoch": 0.04110396523054996, "grad_norm": 0.8690674901008606, "learning_rate": 0.00011093808630393996, "loss": 2.1614, "step": 438 }, { "epoch": 0.04166703324740681, "grad_norm": 0.5689049959182739, "learning_rate": 0.0001124577861163227, "loss": 2.249, "step": 444 }, { "epoch": 0.04223010126426366, "grad_norm": 0.6344694495201111, "learning_rate": 0.00011397748592870545, "loss": 2.1863, "step": 450 }, { "epoch": 0.04279316928112051, "grad_norm": 0.5258616209030151, "learning_rate": 0.00011549718574108818, "loss": 2.163, "step": 456 }, { "epoch": 0.04335623729797736, "grad_norm": 0.5147134065628052, "learning_rate": 0.00011701688555347092, "loss": 2.2098, "step": 462 }, { "epoch": 0.043919305314834206, "grad_norm": 0.5254240036010742, "learning_rate": 0.00011853658536585365, "loss": 2.1793, "step": 468 }, { "epoch": 0.04448237333169105, "grad_norm": 0.600652277469635, "learning_rate": 0.0001200562851782364, "loss": 2.1917, "step": 474 }, { "epoch": 0.0450454413485479, "grad_norm": 0.5646069645881653, "learning_rate": 0.00012157598499061914, "loss": 2.195, "step": 480 }, { "epoch": 0.04560850936540475, "grad_norm": 0.48661041259765625, "learning_rate": 0.00012309568480300186, "loss": 2.1983, "step": 486 }, { "epoch": 0.046171577382261596, "grad_norm": 0.5766115188598633, "learning_rate": 0.0001246153846153846, "loss": 2.2465, "step": 492 }, { "epoch": 0.046734645399118445, "grad_norm": 0.5215041041374207, "learning_rate": 0.00012613508442776736, "loss": 2.1306, "step": 498 }, { "epoch": 0.047297713415975294, "grad_norm": 0.5461471676826477, "learning_rate": 0.0001276547842401501, "loss": 2.1735, "step": 504 }, { "epoch": 0.04786078143283214, "grad_norm": 0.5795019268989563, "learning_rate": 0.00012917448405253283, "loss": 2.2116, "step": 510 }, { "epoch": 0.04842384944968899, "grad_norm": 0.6677626371383667, "learning_rate": 0.00013069418386491558, "loss": 2.2175, "step": 516 }, { "epoch": 0.04898691746654584, "grad_norm": 0.5724667906761169, "learning_rate": 0.0001322138836772983, "loss": 2.2586, "step": 522 }, { "epoch": 0.04954998548340269, "grad_norm": 0.709949254989624, "learning_rate": 0.00013373358348968105, "loss": 2.1952, "step": 528 }, { "epoch": 0.05011305350025954, "grad_norm": 0.5406577587127686, "learning_rate": 0.00013487353627200874, "loss": 2.2382, "step": 534 }, { "epoch": 0.05067612151711639, "grad_norm": 0.5364143252372742, "learning_rate": 0.00013412214582238086, "loss": 2.1886, "step": 540 }, { "epoch": 0.05123918953397324, "grad_norm": 0.5183210968971252, "learning_rate": 0.00013338317520801275, "loss": 2.169, "step": 546 }, { "epoch": 0.05180225755083009, "grad_norm": 0.5824753046035767, "learning_rate": 0.00013265628600783044, "loss": 2.2453, "step": 552 }, { "epoch": 0.052365325567686936, "grad_norm": 0.5645071864128113, "learning_rate": 0.00013194115257165022, "loss": 2.1985, "step": 558 }, { "epoch": 0.052928393584543786, "grad_norm": 0.6309590935707092, "learning_rate": 0.00013123746140715974, "loss": 2.1494, "step": 564 }, { "epoch": 0.053491461601400635, "grad_norm": 0.4858923852443695, "learning_rate": 0.00013054491060248395, "loss": 2.2275, "step": 570 }, { "epoch": 0.054054529618257484, "grad_norm": 0.51877760887146, "learning_rate": 0.00012986320928192095, "loss": 2.202, "step": 576 }, { "epoch": 0.054617597635114326, "grad_norm": 0.5425084233283997, "learning_rate": 0.0001291920770926188, "loss": 2.1469, "step": 582 }, { "epoch": 0.055180665651971175, "grad_norm": 0.5455939173698425, "learning_rate": 0.0001285312437201356, "loss": 2.2304, "step": 588 }, { "epoch": 0.055743733668828024, "grad_norm": 0.624026894569397, "learning_rate": 0.00012788044843098218, "loss": 2.2236, "step": 594 }, { "epoch": 0.056306801685684874, "grad_norm": 0.4480319619178772, "learning_rate": 0.00012723943964038824, "loss": 2.1442, "step": 600 }, { "epoch": 0.05686986970254172, "grad_norm": 0.44737479090690613, "learning_rate": 0.00012660797450366541, "loss": 2.2249, "step": 606 }, { "epoch": 0.05743293771939857, "grad_norm": 0.44034454226493835, "learning_rate": 0.00012598581852965926, "loss": 2.1868, "step": 612 }, { "epoch": 0.05799600573625542, "grad_norm": 0.5695493817329407, "learning_rate": 0.00012537274521489286, "loss": 2.1748, "step": 618 }, { "epoch": 0.05855907375311227, "grad_norm": 0.6109303832054138, "learning_rate": 0.00012476853569710596, "loss": 2.1288, "step": 624 }, { "epoch": 0.05912214176996912, "grad_norm": 0.5233228802680969, "learning_rate": 0.00012417297842698538, "loss": 2.1868, "step": 630 }, { "epoch": 0.05968520978682597, "grad_norm": 0.48908835649490356, "learning_rate": 0.00012358586885696956, "loss": 2.164, "step": 636 }, { "epoch": 0.06024827780368282, "grad_norm": 0.3833492696285248, "learning_rate": 0.00012300700914608654, "loss": 2.1544, "step": 642 }, { "epoch": 0.06081134582053967, "grad_norm": 0.3997538685798645, "learning_rate": 0.0001224362078798588, "loss": 2.1265, "step": 648 }, { "epoch": 0.061374413837396516, "grad_norm": 0.4559822082519531, "learning_rate": 0.00012187327980437398, "loss": 2.1812, "step": 654 }, { "epoch": 0.061937481854253365, "grad_norm": 0.47324708104133606, "learning_rate": 0.00012131804557368281, "loss": 2.1445, "step": 660 }, { "epoch": 0.06250054987111021, "grad_norm": 0.5981423854827881, "learning_rate": 0.00012077033150974198, "loss": 2.141, "step": 666 }, { "epoch": 0.06306361788796706, "grad_norm": 0.43759846687316895, "learning_rate": 0.00012022996937417178, "loss": 2.165, "step": 672 }, { "epoch": 0.06362668590482391, "grad_norm": 0.44400447607040405, "learning_rate": 0.00011969679615114833, "loss": 2.1422, "step": 678 }, { "epoch": 0.06418975392168076, "grad_norm": 0.47775810956954956, "learning_rate": 0.00011917065384079298, "loss": 2.1632, "step": 684 }, { "epoch": 0.06475282193853761, "grad_norm": 0.5651953816413879, "learning_rate": 0.00011865138926246521, "loss": 2.1877, "step": 690 }, { "epoch": 0.06531588995539446, "grad_norm": 0.476750910282135, "learning_rate": 0.00011813885386740235, "loss": 2.1548, "step": 696 }, { "epoch": 0.06587895797225131, "grad_norm": 0.4165765643119812, "learning_rate": 0.0001176329035601859, "loss": 2.1624, "step": 702 }, { "epoch": 0.06644202598910816, "grad_norm": 0.5530439615249634, "learning_rate": 0.00011713339852854718, "loss": 2.1722, "step": 708 }, { "epoch": 0.06700509400596501, "grad_norm": 0.4450165629386902, "learning_rate": 0.00011664020308105571, "loss": 2.1695, "step": 714 }, { "epoch": 0.06756816202282186, "grad_norm": 0.5588933229446411, "learning_rate": 0.00011615318549226277, "loss": 2.1849, "step": 720 }, { "epoch": 0.0681312300396787, "grad_norm": 0.44493842124938965, "learning_rate": 0.00011567221785489839, "loss": 2.1961, "step": 726 }, { "epoch": 0.06869429805653555, "grad_norm": 0.4788070023059845, "learning_rate": 0.00011519717593874553, "loss": 2.1694, "step": 732 }, { "epoch": 0.06925736607339239, "grad_norm": 0.44972002506256104, "learning_rate": 0.00011472793905583765, "loss": 2.1753, "step": 738 }, { "epoch": 0.06982043409024924, "grad_norm": 0.4528462886810303, "learning_rate": 0.00011426438993164762, "loss": 2.124, "step": 744 }, { "epoch": 0.07038350210710609, "grad_norm": 0.5124619603157043, "learning_rate": 0.00011380641458195579, "loss": 2.172, "step": 750 }, { "epoch": 0.07094657012396294, "grad_norm": 0.48104843497276306, "learning_rate": 0.00011335390219510372, "loss": 2.1276, "step": 756 }, { "epoch": 0.07150963814081979, "grad_norm": 0.43749359250068665, "learning_rate": 0.00011290674501935721, "loss": 2.1268, "step": 762 }, { "epoch": 0.07207270615767664, "grad_norm": 0.41453948616981506, "learning_rate": 0.00011246483825511865, "loss": 2.1657, "step": 768 }, { "epoch": 0.07263577417453349, "grad_norm": 0.44287821650505066, "learning_rate": 0.00011202807995174433, "loss": 2.1328, "step": 774 }, { "epoch": 0.07319884219139033, "grad_norm": 0.4652189314365387, "learning_rate": 0.00011159637090873521, "loss": 2.1848, "step": 780 }, { "epoch": 0.07376191020824718, "grad_norm": 0.5598775744438171, "learning_rate": 0.00011116961458108448, "loss": 2.11, "step": 786 }, { "epoch": 0.07432497822510403, "grad_norm": 0.49142366647720337, "learning_rate": 0.00011074771698857642, "loss": 2.1394, "step": 792 }, { "epoch": 0.07488804624196088, "grad_norm": 0.4415854513645172, "learning_rate": 0.00011033058662884315, "loss": 2.1215, "step": 798 }, { "epoch": 0.07507573558091317, "eval_accuracy": 0.555494333724111, "eval_loss": 2.1776483058929443, "eval_runtime": 87.9469, "eval_samples_per_second": 4.548, "eval_steps_per_second": 1.137, "step": 800 }, { "epoch": 0.07545111425881773, "grad_norm": 0.48106423020362854, "learning_rate": 0.00010991813439399668, "loss": 2.1499, "step": 804 }, { "epoch": 0.07601418227567458, "grad_norm": 0.4785069227218628, "learning_rate": 0.00010951027349066389, "loss": 2.1574, "step": 810 }, { "epoch": 0.07657725029253143, "grad_norm": 0.49595943093299866, "learning_rate": 0.00010910691936326119, "loss": 2.0972, "step": 816 }, { "epoch": 0.07714031830938828, "grad_norm": 0.4293830692768097, "learning_rate": 0.00010870798962035527, "loss": 2.2009, "step": 822 }, { "epoch": 0.07770338632624513, "grad_norm": 0.5432091355323792, "learning_rate": 0.00010831340396396408, "loss": 2.158, "step": 828 }, { "epoch": 0.07826645434310198, "grad_norm": 0.45801395177841187, "learning_rate": 0.00010792308412166016, "loss": 2.1849, "step": 834 }, { "epoch": 0.07882952235995883, "grad_norm": 0.5347604155540466, "learning_rate": 0.00010753695378134639, "loss": 2.1521, "step": 840 }, { "epoch": 0.07939259037681567, "grad_norm": 0.4341467320919037, "learning_rate": 0.00010715493852858034, "loss": 2.1956, "step": 846 }, { "epoch": 0.07995565839367252, "grad_norm": 0.4284418821334839, "learning_rate": 0.00010677696578633057, "loss": 2.1276, "step": 852 }, { "epoch": 0.08051872641052937, "grad_norm": 0.48132291436195374, "learning_rate": 0.00010640296475705415, "loss": 2.1754, "step": 858 }, { "epoch": 0.08108179442738622, "grad_norm": 0.4238269031047821, "learning_rate": 0.00010603286636699018, "loss": 2.1587, "step": 864 }, { "epoch": 0.08164486244424307, "grad_norm": 0.4699987769126892, "learning_rate": 0.00010566660321257024, "loss": 2.1206, "step": 870 }, { "epoch": 0.08220793046109992, "grad_norm": 0.4153059720993042, "learning_rate": 0.00010530410950885087, "loss": 2.1453, "step": 876 }, { "epoch": 0.08277099847795677, "grad_norm": 0.4487610459327698, "learning_rate": 0.00010494532103987897, "loss": 2.1319, "step": 882 }, { "epoch": 0.08333406649481362, "grad_norm": 0.42891180515289307, "learning_rate": 0.00010459017511090481, "loss": 2.1478, "step": 888 }, { "epoch": 0.08389713451167047, "grad_norm": 0.48686015605926514, "learning_rate": 0.00010423861050236179, "loss": 2.1553, "step": 894 }, { "epoch": 0.08446020252852732, "grad_norm": 0.4616381824016571, "learning_rate": 0.00010389056742553676, "loss": 2.1714, "step": 900 }, { "epoch": 0.08502327054538417, "grad_norm": 0.40427857637405396, "learning_rate": 0.00010354598747985751, "loss": 2.1043, "step": 906 }, { "epoch": 0.08558633856224102, "grad_norm": 0.44368940591812134, "learning_rate": 0.00010320481361172829, "loss": 2.1822, "step": 912 }, { "epoch": 0.08614940657909786, "grad_norm": 0.5172756910324097, "learning_rate": 0.00010286699007484775, "loss": 2.1347, "step": 918 }, { "epoch": 0.08671247459595471, "grad_norm": 0.43407630920410156, "learning_rate": 0.00010253246239194612, "loss": 2.1172, "step": 924 }, { "epoch": 0.08727554261281156, "grad_norm": 0.4051315486431122, "learning_rate": 0.00010220117731788266, "loss": 2.1095, "step": 930 }, { "epoch": 0.08783861062966841, "grad_norm": 0.42174217104911804, "learning_rate": 0.00010187308280404595, "loss": 2.134, "step": 936 }, { "epoch": 0.08840167864652526, "grad_norm": 0.3932819366455078, "learning_rate": 0.0001015481279640035, "loss": 2.1702, "step": 942 }, { "epoch": 0.0889647466633821, "grad_norm": 0.41248735785484314, "learning_rate": 0.00010122626304034899, "loss": 2.1581, "step": 948 }, { "epoch": 0.08952781468023895, "grad_norm": 0.4185050427913666, "learning_rate": 0.00010090743937269798, "loss": 2.1806, "step": 954 }, { "epoch": 0.0900908826970958, "grad_norm": 0.3725639879703522, "learning_rate": 0.00010059160936678565, "loss": 2.1549, "step": 960 }, { "epoch": 0.09065395071395264, "grad_norm": 0.397498220205307, "learning_rate": 0.00010027872646462189, "loss": 2.1715, "step": 966 }, { "epoch": 0.0912170187308095, "grad_norm": 0.40115973353385925, "learning_rate": 9.996874511566103e-05, "loss": 2.1411, "step": 972 }, { "epoch": 0.09178008674766634, "grad_norm": 0.4632989466190338, "learning_rate": 9.966162074894601e-05, "loss": 2.0854, "step": 978 }, { "epoch": 0.09234315476452319, "grad_norm": 0.5472243428230286, "learning_rate": 9.935730974618828e-05, "loss": 2.1444, "step": 984 }, { "epoch": 0.09290622278138004, "grad_norm": 0.48974740505218506, "learning_rate": 9.905576941574607e-05, "loss": 2.1238, "step": 990 }, { "epoch": 0.09346929079823689, "grad_norm": 0.4146271347999573, "learning_rate": 9.875695796746654e-05, "loss": 2.118, "step": 996 }, { "epoch": 0.09403235881509374, "grad_norm": 0.43020641803741455, "learning_rate": 9.846083448835742e-05, "loss": 2.0847, "step": 1002 }, { "epoch": 0.09459542683195059, "grad_norm": 0.4030325412750244, "learning_rate": 9.816735891905647e-05, "loss": 2.1046, "step": 1008 }, { "epoch": 0.09515849484880744, "grad_norm": 0.5563904643058777, "learning_rate": 9.787649203106788e-05, "loss": 2.1221, "step": 1014 }, { "epoch": 0.09572156286566429, "grad_norm": 0.4427477717399597, "learning_rate": 9.758819540473601e-05, "loss": 2.1399, "step": 1020 }, { "epoch": 0.09628463088252114, "grad_norm": 0.4915330111980438, "learning_rate": 9.730243140792903e-05, "loss": 2.1063, "step": 1026 }, { "epoch": 0.09684769889937798, "grad_norm": 0.3732217848300934, "learning_rate": 9.701916317540476e-05, "loss": 2.1361, "step": 1032 }, { "epoch": 0.09741076691623483, "grad_norm": 0.4579565227031708, "learning_rate": 9.673835458883366e-05, "loss": 2.191, "step": 1038 }, { "epoch": 0.09797383493309168, "grad_norm": 0.4258158504962921, "learning_rate": 9.645997025745429e-05, "loss": 2.128, "step": 1044 }, { "epoch": 0.09853690294994853, "grad_norm": 0.3904872536659241, "learning_rate": 9.618397549933743e-05, "loss": 2.1437, "step": 1050 }, { "epoch": 0.09909997096680538, "grad_norm": 0.42212560772895813, "learning_rate": 9.591033632323664e-05, "loss": 2.1096, "step": 1056 }, { "epoch": 0.09966303898366223, "grad_norm": 0.43507489562034607, "learning_rate": 9.563901941100351e-05, "loss": 2.1546, "step": 1062 }, { "epoch": 0.10022610700051908, "grad_norm": 0.4443258047103882, "learning_rate": 9.536999210054716e-05, "loss": 2.1104, "step": 1068 }, { "epoch": 0.10078917501737593, "grad_norm": 0.4024173617362976, "learning_rate": 9.510322236931792e-05, "loss": 2.1004, "step": 1074 }, { "epoch": 0.10135224303423278, "grad_norm": 0.4750373363494873, "learning_rate": 9.483867881829651e-05, "loss": 2.1307, "step": 1080 }, { "epoch": 0.10191531105108963, "grad_norm": 0.42277786135673523, "learning_rate": 9.457633065647022e-05, "loss": 2.0832, "step": 1086 }, { "epoch": 0.10247837906794648, "grad_norm": 0.40229344367980957, "learning_rate": 9.431614768577921e-05, "loss": 2.1351, "step": 1092 }, { "epoch": 0.10304144708480333, "grad_norm": 0.4866289794445038, "learning_rate": 9.405810028651543e-05, "loss": 2.1452, "step": 1098 }, { "epoch": 0.10360451510166017, "grad_norm": 0.46909454464912415, "learning_rate": 9.380215940315901e-05, "loss": 2.084, "step": 1104 }, { "epoch": 0.10416758311851702, "grad_norm": 0.4079931974411011, "learning_rate": 9.35482965306359e-05, "loss": 2.1841, "step": 1110 }, { "epoch": 0.10473065113537387, "grad_norm": 0.4802832007408142, "learning_rate": 9.329648370098276e-05, "loss": 2.1256, "step": 1116 }, { "epoch": 0.10529371915223072, "grad_norm": 0.5322299599647522, "learning_rate": 9.304669347040442e-05, "loss": 2.1006, "step": 1122 }, { "epoch": 0.10585678716908757, "grad_norm": 0.4667050540447235, "learning_rate": 9.279889890671048e-05, "loss": 2.1196, "step": 1128 }, { "epoch": 0.10641985518594442, "grad_norm": 0.4636259973049164, "learning_rate": 9.255307357711806e-05, "loss": 2.1338, "step": 1134 }, { "epoch": 0.10698292320280127, "grad_norm": 0.4346386790275574, "learning_rate": 9.230919153640802e-05, "loss": 2.129, "step": 1140 }, { "epoch": 0.10754599121965812, "grad_norm": 0.43968209624290466, "learning_rate": 9.206722731542283e-05, "loss": 2.1171, "step": 1146 }, { "epoch": 0.10810905923651497, "grad_norm": 0.43884578347206116, "learning_rate": 9.182715590989411e-05, "loss": 2.0585, "step": 1152 }, { "epoch": 0.1086721272533718, "grad_norm": 0.456012099981308, "learning_rate": 9.158895276958926e-05, "loss": 2.0795, "step": 1158 }, { "epoch": 0.10923519527022865, "grad_norm": 0.426696240901947, "learning_rate": 9.135259378776598e-05, "loss": 2.0788, "step": 1164 }, { "epoch": 0.1097982632870855, "grad_norm": 0.4199647605419159, "learning_rate": 9.111805529092465e-05, "loss": 2.0985, "step": 1170 }, { "epoch": 0.11036133130394235, "grad_norm": 0.39858222007751465, "learning_rate": 9.088531402884875e-05, "loss": 2.1274, "step": 1176 }, { "epoch": 0.1109243993207992, "grad_norm": 0.40872737765312195, "learning_rate": 9.065434716492353e-05, "loss": 2.1284, "step": 1182 }, { "epoch": 0.11148746733765605, "grad_norm": 0.4115428626537323, "learning_rate": 9.042513226672409e-05, "loss": 2.0914, "step": 1188 }, { "epoch": 0.1120505353545129, "grad_norm": 0.48554012179374695, "learning_rate": 9.019764729686382e-05, "loss": 2.1253, "step": 1194 }, { "epoch": 0.11261360337136975, "grad_norm": 0.4830922484397888, "learning_rate": 8.997187060409492e-05, "loss": 2.1304, "step": 1200 }, { "epoch": 0.11261360337136975, "eval_accuracy": 0.5614390386869871, "eval_loss": 2.1418612003326416, "eval_runtime": 88.1101, "eval_samples_per_second": 4.54, "eval_steps_per_second": 1.135, "step": 1200 }, { "epoch": 0.1131766713882266, "grad_norm": 0.4679175913333893, "learning_rate": 8.974778091465258e-05, "loss": 2.0973, "step": 1206 }, { "epoch": 0.11373973940508345, "grad_norm": 0.4429488182067871, "learning_rate": 8.952535732383532e-05, "loss": 2.1574, "step": 1212 }, { "epoch": 0.1143028074219403, "grad_norm": 0.37007570266723633, "learning_rate": 8.930457928781368e-05, "loss": 2.0641, "step": 1218 }, { "epoch": 0.11486587543879714, "grad_norm": 0.43866094946861267, "learning_rate": 8.908542661565986e-05, "loss": 2.1494, "step": 1224 }, { "epoch": 0.11542894345565399, "grad_norm": 0.43514716625213623, "learning_rate": 8.886787946159176e-05, "loss": 2.1031, "step": 1230 }, { "epoch": 0.11599201147251084, "grad_norm": 0.4231458902359009, "learning_rate": 8.865191831742403e-05, "loss": 2.1736, "step": 1236 }, { "epoch": 0.11655507948936769, "grad_norm": 0.46445292234420776, "learning_rate": 8.843752400522028e-05, "loss": 2.142, "step": 1242 }, { "epoch": 0.11711814750622454, "grad_norm": 0.38551396131515503, "learning_rate": 8.822467767013945e-05, "loss": 2.0441, "step": 1248 }, { "epoch": 0.11768121552308139, "grad_norm": 0.5045029520988464, "learning_rate": 8.801336077347091e-05, "loss": 2.0946, "step": 1254 }, { "epoch": 0.11824428353993824, "grad_norm": 0.4801216423511505, "learning_rate": 8.780355508585223e-05, "loss": 2.1007, "step": 1260 }, { "epoch": 0.11880735155679509, "grad_norm": 0.4105584919452667, "learning_rate": 8.759524268066365e-05, "loss": 2.079, "step": 1266 }, { "epoch": 0.11937041957365194, "grad_norm": 0.42829808592796326, "learning_rate": 8.738840592759453e-05, "loss": 2.1333, "step": 1272 }, { "epoch": 0.11993348759050879, "grad_norm": 0.46542689204216003, "learning_rate": 8.718302748637569e-05, "loss": 2.1329, "step": 1278 }, { "epoch": 0.12049655560736564, "grad_norm": 0.37131962180137634, "learning_rate": 8.697909030067348e-05, "loss": 2.1119, "step": 1284 }, { "epoch": 0.12105962362422248, "grad_norm": 0.37895721197128296, "learning_rate": 8.677657759213975e-05, "loss": 2.1152, "step": 1290 }, { "epoch": 0.12162269164107933, "grad_norm": 0.3756585121154785, "learning_rate": 8.657547285461397e-05, "loss": 2.1079, "step": 1296 }, { "epoch": 0.12218575965793618, "grad_norm": 0.558789849281311, "learning_rate": 8.637575984847208e-05, "loss": 2.0946, "step": 1302 }, { "epoch": 0.12274882767479303, "grad_norm": 0.40680012106895447, "learning_rate": 8.617742259511835e-05, "loss": 2.1401, "step": 1308 }, { "epoch": 0.12331189569164988, "grad_norm": 0.3852643072605133, "learning_rate": 8.598044537161558e-05, "loss": 2.0949, "step": 1314 }, { "epoch": 0.12387496370850673, "grad_norm": 0.4692872166633606, "learning_rate": 8.578481270544974e-05, "loss": 2.1053, "step": 1320 }, { "epoch": 0.12443803172536358, "grad_norm": 0.4869590997695923, "learning_rate": 8.559050936942511e-05, "loss": 2.094, "step": 1326 }, { "epoch": 0.12500109974222043, "grad_norm": 0.48363006114959717, "learning_rate": 8.539752037668592e-05, "loss": 2.1249, "step": 1332 }, { "epoch": 0.12556416775907728, "grad_norm": 0.45796316862106323, "learning_rate": 8.5205830975861e-05, "loss": 2.095, "step": 1338 }, { "epoch": 0.12612723577593413, "grad_norm": 0.4385221302509308, "learning_rate": 8.501542664632779e-05, "loss": 2.1257, "step": 1344 }, { "epoch": 0.12669030379279098, "grad_norm": 0.4110018312931061, "learning_rate": 8.482629309359217e-05, "loss": 2.0637, "step": 1350 }, { "epoch": 0.12725337180964783, "grad_norm": 0.43518000841140747, "learning_rate": 8.463841624478083e-05, "loss": 2.1224, "step": 1356 }, { "epoch": 0.12781643982650467, "grad_norm": 0.4879107177257538, "learning_rate": 8.445178224424323e-05, "loss": 2.118, "step": 1362 }, { "epoch": 0.12837950784336152, "grad_norm": 0.5114567279815674, "learning_rate": 8.42663774492594e-05, "loss": 2.0766, "step": 1368 }, { "epoch": 0.12894257586021837, "grad_norm": 0.41177934408187866, "learning_rate": 8.408218842585123e-05, "loss": 2.1182, "step": 1374 }, { "epoch": 0.12950564387707522, "grad_norm": 0.4221201539039612, "learning_rate": 8.389920194469387e-05, "loss": 2.1146, "step": 1380 }, { "epoch": 0.13006871189393207, "grad_norm": 0.3679744601249695, "learning_rate": 8.371740497712465e-05, "loss": 2.0689, "step": 1386 }, { "epoch": 0.13063177991078892, "grad_norm": 0.45015183091163635, "learning_rate": 8.353678469124678e-05, "loss": 2.0646, "step": 1392 }, { "epoch": 0.13119484792764577, "grad_norm": 0.43882912397384644, "learning_rate": 8.335732844812498e-05, "loss": 2.0777, "step": 1398 }, { "epoch": 0.13175791594450262, "grad_norm": 0.4096289873123169, "learning_rate": 8.317902379807064e-05, "loss": 2.0983, "step": 1404 }, { "epoch": 0.13232098396135947, "grad_norm": 0.43430134654045105, "learning_rate": 8.300185847701422e-05, "loss": 2.1383, "step": 1410 }, { "epoch": 0.13288405197821632, "grad_norm": 0.44193264842033386, "learning_rate": 8.282582040296207e-05, "loss": 2.0803, "step": 1416 }, { "epoch": 0.13344711999507317, "grad_norm": 0.42731207609176636, "learning_rate": 8.265089767253559e-05, "loss": 2.1034, "step": 1422 }, { "epoch": 0.13401018801193001, "grad_norm": 0.45263174176216125, "learning_rate": 8.247707855759053e-05, "loss": 2.0889, "step": 1428 }, { "epoch": 0.13457325602878686, "grad_norm": 0.39663687348365784, "learning_rate": 8.23043515019141e-05, "loss": 2.0787, "step": 1434 }, { "epoch": 0.1351363240456437, "grad_norm": 0.45102155208587646, "learning_rate": 8.21327051179979e-05, "loss": 2.1004, "step": 1440 }, { "epoch": 0.13569939206250056, "grad_norm": 0.4846293032169342, "learning_rate": 8.196212818388463e-05, "loss": 2.0786, "step": 1446 }, { "epoch": 0.1362624600793574, "grad_norm": 0.3850274384021759, "learning_rate": 8.179260964008629e-05, "loss": 2.1828, "step": 1452 }, { "epoch": 0.13682552809621426, "grad_norm": 0.36706045269966125, "learning_rate": 8.162413858657253e-05, "loss": 2.0787, "step": 1458 }, { "epoch": 0.1373885961130711, "grad_norm": 0.43489155173301697, "learning_rate": 8.145670427982677e-05, "loss": 2.0888, "step": 1464 }, { "epoch": 0.13795166412992793, "grad_norm": 0.5299365520477295, "learning_rate": 8.129029612996842e-05, "loss": 2.126, "step": 1470 }, { "epoch": 0.13851473214678478, "grad_norm": 0.42338883876800537, "learning_rate": 8.112490369793977e-05, "loss": 2.076, "step": 1476 }, { "epoch": 0.13907780016364163, "grad_norm": 2.1590936183929443, "learning_rate": 8.096051669275522e-05, "loss": 2.1066, "step": 1482 }, { "epoch": 0.13964086818049848, "grad_norm": 0.39690083265304565, "learning_rate": 8.079712496881189e-05, "loss": 2.1045, "step": 1488 }, { "epoch": 0.14020393619735533, "grad_norm": 0.4019521176815033, "learning_rate": 8.063471852325958e-05, "loss": 2.0922, "step": 1494 }, { "epoch": 0.14076700421421218, "grad_norm": 0.4089321494102478, "learning_rate": 8.047328749342853e-05, "loss": 2.0298, "step": 1500 }, { "epoch": 0.14133007223106903, "grad_norm": 0.3862135112285614, "learning_rate": 8.03128221543138e-05, "loss": 2.0808, "step": 1506 }, { "epoch": 0.14189314024792588, "grad_norm": 0.3389727473258972, "learning_rate": 8.015331291611452e-05, "loss": 2.0986, "step": 1512 }, { "epoch": 0.14245620826478272, "grad_norm": 0.42676207423210144, "learning_rate": 7.999475032182664e-05, "loss": 2.1002, "step": 1518 }, { "epoch": 0.14301927628163957, "grad_norm": 0.3740767538547516, "learning_rate": 7.983712504488794e-05, "loss": 2.0948, "step": 1524 }, { "epoch": 0.14358234429849642, "grad_norm": 0.38225531578063965, "learning_rate": 7.968042788687378e-05, "loss": 2.1272, "step": 1530 }, { "epoch": 0.14414541231535327, "grad_norm": 0.3941880166530609, "learning_rate": 7.952464977524264e-05, "loss": 2.0633, "step": 1536 }, { "epoch": 0.14470848033221012, "grad_norm": 0.4058215618133545, "learning_rate": 7.936978176112959e-05, "loss": 2.1275, "step": 1542 }, { "epoch": 0.14527154834906697, "grad_norm": 0.48998016119003296, "learning_rate": 7.921581501718713e-05, "loss": 2.147, "step": 1548 }, { "epoch": 0.14583461636592382, "grad_norm": 0.42124542593955994, "learning_rate": 7.906274083547185e-05, "loss": 2.1347, "step": 1554 }, { "epoch": 0.14639768438278067, "grad_norm": 0.4518285393714905, "learning_rate": 7.891055062537582e-05, "loss": 2.1247, "step": 1560 }, { "epoch": 0.14696075239963752, "grad_norm": 0.4741106629371643, "learning_rate": 7.875923591160157e-05, "loss": 2.0805, "step": 1566 }, { "epoch": 0.14752382041649437, "grad_norm": 0.40501755475997925, "learning_rate": 7.860878833217973e-05, "loss": 2.0831, "step": 1572 }, { "epoch": 0.14808688843335122, "grad_norm": 0.4189727306365967, "learning_rate": 7.845919963652804e-05, "loss": 2.0384, "step": 1578 }, { "epoch": 0.14864995645020807, "grad_norm": 0.35300374031066895, "learning_rate": 7.8310461683551e-05, "loss": 2.0666, "step": 1584 }, { "epoch": 0.14921302446706491, "grad_norm": 0.40332961082458496, "learning_rate": 7.816256643977876e-05, "loss": 2.0911, "step": 1590 }, { "epoch": 0.14977609248392176, "grad_norm": 0.40466591715812683, "learning_rate": 7.801550597754482e-05, "loss": 2.0552, "step": 1596 }, { "epoch": 0.15015147116182634, "eval_accuracy": 0.5652764751856194, "eval_loss": 2.1181135177612305, "eval_runtime": 88.3044, "eval_samples_per_second": 4.53, "eval_steps_per_second": 1.132, "step": 1600 }, { "epoch": 0.1503391605007786, "grad_norm": 0.35561028122901917, "learning_rate": 7.7869272473201e-05, "loss": 2.0863, "step": 1602 }, { "epoch": 0.15090222851763546, "grad_norm": 0.37869158387184143, "learning_rate": 7.772385820536933e-05, "loss": 2.0754, "step": 1608 }, { "epoch": 0.1514652965344923, "grad_norm": 0.5729403495788574, "learning_rate": 7.757925555322962e-05, "loss": 2.0683, "step": 1614 }, { "epoch": 0.15202836455134916, "grad_norm": 0.4816875159740448, "learning_rate": 7.743545699484184e-05, "loss": 2.1182, "step": 1620 }, { "epoch": 0.152591432568206, "grad_norm": 0.47446584701538086, "learning_rate": 7.729245510550278e-05, "loss": 2.0451, "step": 1626 }, { "epoch": 0.15315450058506286, "grad_norm": 0.37788906693458557, "learning_rate": 7.715024255613581e-05, "loss": 2.0634, "step": 1632 }, { "epoch": 0.1537175686019197, "grad_norm": 0.5028734803199768, "learning_rate": 7.70088121117132e-05, "loss": 2.1098, "step": 1638 }, { "epoch": 0.15428063661877656, "grad_norm": 0.46816110610961914, "learning_rate": 7.686815662971004e-05, "loss": 2.0484, "step": 1644 }, { "epoch": 0.1548437046356334, "grad_norm": 0.48828208446502686, "learning_rate": 7.67282690585893e-05, "loss": 2.0786, "step": 1650 }, { "epoch": 0.15540677265249025, "grad_norm": 0.4531189799308777, "learning_rate": 7.658914243631687e-05, "loss": 2.1227, "step": 1656 }, { "epoch": 0.1559698406693471, "grad_norm": 0.4399756193161011, "learning_rate": 7.645076988890626e-05, "loss": 2.0839, "step": 1662 }, { "epoch": 0.15653290868620395, "grad_norm": 0.4496377110481262, "learning_rate": 7.631314462899213e-05, "loss": 2.1002, "step": 1668 }, { "epoch": 0.1570959767030608, "grad_norm": 0.4543704390525818, "learning_rate": 7.617625995443176e-05, "loss": 2.1156, "step": 1674 }, { "epoch": 0.15765904471991765, "grad_norm": 0.4478834867477417, "learning_rate": 7.604010924693437e-05, "loss": 2.0873, "step": 1680 }, { "epoch": 0.1582221127367745, "grad_norm": 0.37818101048469543, "learning_rate": 7.590468597071703e-05, "loss": 2.0265, "step": 1686 }, { "epoch": 0.15878518075363135, "grad_norm": 0.4982151687145233, "learning_rate": 7.576998367118681e-05, "loss": 2.0215, "step": 1692 }, { "epoch": 0.1593482487704882, "grad_norm": 0.4497143626213074, "learning_rate": 7.563599597364865e-05, "loss": 2.0852, "step": 1698 }, { "epoch": 0.15991131678734505, "grad_norm": 0.3602100908756256, "learning_rate": 7.550271658203833e-05, "loss": 2.0479, "step": 1704 }, { "epoch": 0.1604743848042019, "grad_norm": 0.43396827578544617, "learning_rate": 7.537013927767975e-05, "loss": 2.1216, "step": 1710 }, { "epoch": 0.16103745282105875, "grad_norm": 0.41697368025779724, "learning_rate": 7.523825791806622e-05, "loss": 2.0755, "step": 1716 }, { "epoch": 0.1616005208379156, "grad_norm": 0.3902270495891571, "learning_rate": 7.510706643566521e-05, "loss": 2.0377, "step": 1722 }, { "epoch": 0.16216358885477244, "grad_norm": 0.46782979369163513, "learning_rate": 7.497655883674577e-05, "loss": 2.0788, "step": 1728 }, { "epoch": 0.1627266568716293, "grad_norm": 0.432955265045166, "learning_rate": 7.484672920022837e-05, "loss": 2.069, "step": 1734 }, { "epoch": 0.16328972488848614, "grad_norm": 0.5338298082351685, "learning_rate": 7.471757167655665e-05, "loss": 2.0672, "step": 1740 }, { "epoch": 0.163852792905343, "grad_norm": 0.37978705763816833, "learning_rate": 7.458908048659033e-05, "loss": 2.0275, "step": 1746 }, { "epoch": 0.16441586092219984, "grad_norm": 0.39591217041015625, "learning_rate": 7.446124992051925e-05, "loss": 2.0952, "step": 1752 }, { "epoch": 0.1649789289390567, "grad_norm": 0.3606795072555542, "learning_rate": 7.433407433679747e-05, "loss": 2.0522, "step": 1758 }, { "epoch": 0.16554199695591354, "grad_norm": 0.40163201093673706, "learning_rate": 7.420754816109769e-05, "loss": 2.0506, "step": 1764 }, { "epoch": 0.1661050649727704, "grad_norm": 0.44406571984291077, "learning_rate": 7.408166588528488e-05, "loss": 2.0743, "step": 1770 }, { "epoch": 0.16666813298962724, "grad_norm": 0.4531542658805847, "learning_rate": 7.395642206640926e-05, "loss": 2.0794, "step": 1776 }, { "epoch": 0.1672312010064841, "grad_norm": 0.4056132435798645, "learning_rate": 7.383181132571762e-05, "loss": 2.0874, "step": 1782 }, { "epoch": 0.16779426902334094, "grad_norm": 0.3953051269054413, "learning_rate": 7.370782834768328e-05, "loss": 2.0852, "step": 1788 }, { "epoch": 0.16835733704019779, "grad_norm": 0.37753647565841675, "learning_rate": 7.358446787905365e-05, "loss": 2.0516, "step": 1794 }, { "epoch": 0.16892040505705463, "grad_norm": 0.3379611372947693, "learning_rate": 7.346172472791527e-05, "loss": 2.0941, "step": 1800 }, { "epoch": 0.16948347307391148, "grad_norm": 0.4238949120044708, "learning_rate": 7.333959376277617e-05, "loss": 2.0545, "step": 1806 }, { "epoch": 0.17004654109076833, "grad_norm": 0.4323907494544983, "learning_rate": 7.321806991166461e-05, "loss": 2.0603, "step": 1812 }, { "epoch": 0.17060960910762518, "grad_norm": 0.3597232401371002, "learning_rate": 7.309714816124451e-05, "loss": 2.0772, "step": 1818 }, { "epoch": 0.17117267712448203, "grad_norm": 0.45222851634025574, "learning_rate": 7.297682355594677e-05, "loss": 2.133, "step": 1824 }, { "epoch": 0.17173574514133888, "grad_norm": 0.3663848340511322, "learning_rate": 7.285709119711628e-05, "loss": 2.0321, "step": 1830 }, { "epoch": 0.17229881315819573, "grad_norm": 0.3802311420440674, "learning_rate": 7.273794624217411e-05, "loss": 2.0747, "step": 1836 }, { "epoch": 0.17286188117505258, "grad_norm": 0.4087699055671692, "learning_rate": 7.26193839037951e-05, "loss": 2.0695, "step": 1842 }, { "epoch": 0.17342494919190943, "grad_norm": 0.3881111443042755, "learning_rate": 7.250139944909976e-05, "loss": 2.0485, "step": 1848 }, { "epoch": 0.17398801720876628, "grad_norm": 0.41190746426582336, "learning_rate": 7.238398819886077e-05, "loss": 2.0716, "step": 1854 }, { "epoch": 0.17455108522562313, "grad_norm": 0.3849361836910248, "learning_rate": 7.22671455267236e-05, "loss": 2.0489, "step": 1860 }, { "epoch": 0.17511415324247998, "grad_norm": 0.42296290397644043, "learning_rate": 7.21508668584407e-05, "loss": 2.0747, "step": 1866 }, { "epoch": 0.17567722125933682, "grad_norm": 0.42117878794670105, "learning_rate": 7.203514767111955e-05, "loss": 2.1052, "step": 1872 }, { "epoch": 0.17624028927619367, "grad_norm": 0.4024169445037842, "learning_rate": 7.191998349248354e-05, "loss": 2.1141, "step": 1878 }, { "epoch": 0.17680335729305052, "grad_norm": 0.3742789328098297, "learning_rate": 7.180536990014614e-05, "loss": 2.0407, "step": 1884 }, { "epoch": 0.17736642530990734, "grad_norm": 0.39530882239341736, "learning_rate": 7.169130252089759e-05, "loss": 2.0502, "step": 1890 }, { "epoch": 0.1779294933267642, "grad_norm": 0.43266841769218445, "learning_rate": 7.157777703000395e-05, "loss": 2.0308, "step": 1896 }, { "epoch": 0.17849256134362104, "grad_norm": 0.394989013671875, "learning_rate": 7.146478915051855e-05, "loss": 2.0512, "step": 1902 }, { "epoch": 0.1790556293604779, "grad_norm": 0.4730623960494995, "learning_rate": 7.135233465260516e-05, "loss": 2.065, "step": 1908 }, { "epoch": 0.17961869737733474, "grad_norm": 0.35876256227493286, "learning_rate": 7.124040935287284e-05, "loss": 2.0411, "step": 1914 }, { "epoch": 0.1801817653941916, "grad_norm": 0.4595516324043274, "learning_rate": 7.112900911372237e-05, "loss": 2.1105, "step": 1920 }, { "epoch": 0.18074483341104844, "grad_norm": 0.44887006282806396, "learning_rate": 7.101812984270383e-05, "loss": 2.1185, "step": 1926 }, { "epoch": 0.1813079014279053, "grad_norm": 0.4761316478252411, "learning_rate": 7.090776749188504e-05, "loss": 2.1177, "step": 1932 }, { "epoch": 0.18187096944476214, "grad_norm": 0.3656371533870697, "learning_rate": 7.079791805723096e-05, "loss": 2.0452, "step": 1938 }, { "epoch": 0.182434037461619, "grad_norm": 0.47010138630867004, "learning_rate": 7.068857757799346e-05, "loss": 2.0719, "step": 1944 }, { "epoch": 0.18299710547847584, "grad_norm": 0.4701285660266876, "learning_rate": 7.057974213611155e-05, "loss": 2.0236, "step": 1950 }, { "epoch": 0.18356017349533268, "grad_norm": 0.43752458691596985, "learning_rate": 7.047140785562165e-05, "loss": 2.0602, "step": 1956 }, { "epoch": 0.18412324151218953, "grad_norm": 0.471031129360199, "learning_rate": 7.03635709020779e-05, "loss": 2.0707, "step": 1962 }, { "epoch": 0.18468630952904638, "grad_norm": 0.44324010610580444, "learning_rate": 7.025622748198198e-05, "loss": 2.0389, "step": 1968 }, { "epoch": 0.18524937754590323, "grad_norm": 0.39876291155815125, "learning_rate": 7.014937384222276e-05, "loss": 2.0257, "step": 1974 }, { "epoch": 0.18581244556276008, "grad_norm": 0.43958184123039246, "learning_rate": 7.004300626952507e-05, "loss": 2.032, "step": 1980 }, { "epoch": 0.18637551357961693, "grad_norm": 0.3778688311576843, "learning_rate": 6.993712108990769e-05, "loss": 2.1139, "step": 1986 }, { "epoch": 0.18693858159647378, "grad_norm": 0.451320618391037, "learning_rate": 6.983171466815044e-05, "loss": 2.0186, "step": 1992 }, { "epoch": 0.18750164961333063, "grad_norm": 0.3794098198413849, "learning_rate": 6.972678340726987e-05, "loss": 2.07, "step": 1998 }, { "epoch": 0.18768933895228293, "eval_accuracy": 0.5678683079327862, "eval_loss": 2.1013710498809814, "eval_runtime": 87.9844, "eval_samples_per_second": 4.546, "eval_steps_per_second": 1.137, "step": 2000 }, { "epoch": 0.18806471763018748, "grad_norm": 0.41135984659194946, "learning_rate": 6.962232374800382e-05, "loss": 2.0599, "step": 2004 }, { "epoch": 0.18862778564704433, "grad_norm": 0.4069685935974121, "learning_rate": 6.951833216830425e-05, "loss": 2.0573, "step": 2010 }, { "epoch": 0.18919085366390118, "grad_norm": 0.4652012586593628, "learning_rate": 6.941480518283855e-05, "loss": 2.099, "step": 2016 }, { "epoch": 0.18975392168075803, "grad_norm": 0.3845677971839905, "learning_rate": 6.931173934249878e-05, "loss": 2.0472, "step": 2022 }, { "epoch": 0.19031698969761487, "grad_norm": 0.5444955229759216, "learning_rate": 6.920913123391918e-05, "loss": 2.0897, "step": 2028 }, { "epoch": 0.19088005771447172, "grad_norm": 0.494041383266449, "learning_rate": 6.910697747900125e-05, "loss": 2.0209, "step": 2034 }, { "epoch": 0.19144312573132857, "grad_norm": 0.46688124537467957, "learning_rate": 6.900527473444672e-05, "loss": 2.0482, "step": 2040 }, { "epoch": 0.19200619374818542, "grad_norm": 0.45177990198135376, "learning_rate": 6.89040196912979e-05, "loss": 2.0431, "step": 2046 }, { "epoch": 0.19256926176504227, "grad_norm": 0.4476774036884308, "learning_rate": 6.880320907448553e-05, "loss": 2.0977, "step": 2052 }, { "epoch": 0.19313232978189912, "grad_norm": 0.4231646955013275, "learning_rate": 6.870283964238388e-05, "loss": 2.0507, "step": 2058 }, { "epoch": 0.19369539779875597, "grad_norm": 0.4136286675930023, "learning_rate": 6.860290818637287e-05, "loss": 2.0221, "step": 2064 }, { "epoch": 0.19425846581561282, "grad_norm": 0.3533666431903839, "learning_rate": 6.850341153040736e-05, "loss": 2.0536, "step": 2070 }, { "epoch": 0.19482153383246967, "grad_norm": 0.34639379382133484, "learning_rate": 6.840434653059304e-05, "loss": 2.0639, "step": 2076 }, { "epoch": 0.19538460184932652, "grad_norm": 0.4379017949104309, "learning_rate": 6.830571007476932e-05, "loss": 2.0598, "step": 2082 }, { "epoch": 0.19594766986618337, "grad_norm": 0.46008849143981934, "learning_rate": 6.820749908209861e-05, "loss": 2.0441, "step": 2088 }, { "epoch": 0.19651073788304022, "grad_norm": 0.39759039878845215, "learning_rate": 6.810971050266215e-05, "loss": 2.076, "step": 2094 }, { "epoch": 0.19707380589989706, "grad_norm": 0.42170873284339905, "learning_rate": 6.801234131706223e-05, "loss": 2.0645, "step": 2100 }, { "epoch": 0.1976368739167539, "grad_norm": 0.46593090891838074, "learning_rate": 6.791538853603062e-05, "loss": 2.0742, "step": 2106 }, { "epoch": 0.19819994193361076, "grad_norm": 0.46976691484451294, "learning_rate": 6.781884920004306e-05, "loss": 2.0167, "step": 2112 }, { "epoch": 0.1987630099504676, "grad_norm": 0.3913651406764984, "learning_rate": 6.772272037893988e-05, "loss": 2.0959, "step": 2118 }, { "epoch": 0.19932607796732446, "grad_norm": 0.35092833638191223, "learning_rate": 6.762699917155243e-05, "loss": 2.0725, "step": 2124 }, { "epoch": 0.1998891459841813, "grad_norm": 0.4150833487510681, "learning_rate": 6.753168270533531e-05, "loss": 2.068, "step": 2130 }, { "epoch": 0.20045221400103816, "grad_norm": 0.35213208198547363, "learning_rate": 6.743676813600437e-05, "loss": 2.0435, "step": 2136 }, { "epoch": 0.201015282017895, "grad_norm": 0.3910273611545563, "learning_rate": 6.734225264718012e-05, "loss": 2.1254, "step": 2142 }, { "epoch": 0.20157835003475186, "grad_norm": 0.37565183639526367, "learning_rate": 6.724813345003686e-05, "loss": 2.0649, "step": 2148 }, { "epoch": 0.2021414180516087, "grad_norm": 0.3882327973842621, "learning_rate": 6.715440778295693e-05, "loss": 2.074, "step": 2154 }, { "epoch": 0.20270448606846556, "grad_norm": 0.36626866459846497, "learning_rate": 6.706107291119043e-05, "loss": 2.0665, "step": 2160 }, { "epoch": 0.2032675540853224, "grad_norm": 0.38924655318260193, "learning_rate": 6.696812612652013e-05, "loss": 2.0142, "step": 2166 }, { "epoch": 0.20383062210217925, "grad_norm": 0.44027435779571533, "learning_rate": 6.687556474693125e-05, "loss": 2.0478, "step": 2172 }, { "epoch": 0.2043936901190361, "grad_norm": 0.36333200335502625, "learning_rate": 6.678338611628662e-05, "loss": 2.0345, "step": 2178 }, { "epoch": 0.20495675813589295, "grad_norm": 0.3367884159088135, "learning_rate": 6.669158760400638e-05, "loss": 2.062, "step": 2184 }, { "epoch": 0.2055198261527498, "grad_norm": 0.37709954380989075, "learning_rate": 6.660016660475273e-05, "loss": 2.0349, "step": 2190 }, { "epoch": 0.20608289416960665, "grad_norm": 0.40647539496421814, "learning_rate": 6.650912053811943e-05, "loss": 2.0022, "step": 2196 }, { "epoch": 0.2066459621864635, "grad_norm": 0.3865557610988617, "learning_rate": 6.641844684832573e-05, "loss": 2.0749, "step": 2202 }, { "epoch": 0.20720903020332035, "grad_norm": 0.47110477089881897, "learning_rate": 6.632814300391522e-05, "loss": 2.101, "step": 2208 }, { "epoch": 0.2077720982201772, "grad_norm": 0.40132007002830505, "learning_rate": 6.623820649745885e-05, "loss": 2.0584, "step": 2214 }, { "epoch": 0.20833516623703405, "grad_norm": 0.3928954601287842, "learning_rate": 6.614863484526261e-05, "loss": 2.0474, "step": 2220 }, { "epoch": 0.2088982342538909, "grad_norm": 0.36073553562164307, "learning_rate": 6.605942558707938e-05, "loss": 2.0426, "step": 2226 }, { "epoch": 0.20946130227074775, "grad_norm": 0.4960843622684479, "learning_rate": 6.597057628582511e-05, "loss": 2.1127, "step": 2232 }, { "epoch": 0.2100243702876046, "grad_norm": 0.5345079898834229, "learning_rate": 6.588208452729921e-05, "loss": 2.0406, "step": 2238 }, { "epoch": 0.21058743830446144, "grad_norm": 0.48983460664749146, "learning_rate": 6.579394791990903e-05, "loss": 2.0785, "step": 2244 }, { "epoch": 0.2111505063213183, "grad_norm": 0.4585428833961487, "learning_rate": 6.570616409439833e-05, "loss": 2.0549, "step": 2250 }, { "epoch": 0.21171357433817514, "grad_norm": 0.40239760279655457, "learning_rate": 6.561873070357987e-05, "loss": 2.0437, "step": 2256 }, { "epoch": 0.212276642355032, "grad_norm": 0.3720134496688843, "learning_rate": 6.553164542207175e-05, "loss": 2.0554, "step": 2262 }, { "epoch": 0.21283971037188884, "grad_norm": 0.3556349277496338, "learning_rate": 6.544490594603766e-05, "loss": 2.0351, "step": 2268 }, { "epoch": 0.2134027783887457, "grad_norm": 0.3526850640773773, "learning_rate": 6.535850999293085e-05, "loss": 2.0638, "step": 2274 }, { "epoch": 0.21396584640560254, "grad_norm": 0.3612268567085266, "learning_rate": 6.527245530124197e-05, "loss": 2.0293, "step": 2280 }, { "epoch": 0.2145289144224594, "grad_norm": 0.43372833728790283, "learning_rate": 6.518673963025031e-05, "loss": 2.0539, "step": 2286 }, { "epoch": 0.21509198243931624, "grad_norm": 0.38292548060417175, "learning_rate": 6.510136075977882e-05, "loss": 2.0108, "step": 2292 }, { "epoch": 0.2156550504561731, "grad_norm": 0.4530802369117737, "learning_rate": 6.501631648995252e-05, "loss": 2.0306, "step": 2298 }, { "epoch": 0.21621811847302994, "grad_norm": 0.44018489122390747, "learning_rate": 6.493160464096047e-05, "loss": 2.0693, "step": 2304 }, { "epoch": 0.21678118648988676, "grad_norm": 0.4362567663192749, "learning_rate": 6.484722305282114e-05, "loss": 2.0486, "step": 2310 }, { "epoch": 0.2173442545067436, "grad_norm": 0.42122185230255127, "learning_rate": 6.476316958515099e-05, "loss": 2.0518, "step": 2316 }, { "epoch": 0.21790732252360046, "grad_norm": 0.402195006608963, "learning_rate": 6.467944211693651e-05, "loss": 1.9958, "step": 2322 }, { "epoch": 0.2184703905404573, "grad_norm": 0.43360304832458496, "learning_rate": 6.45960385463094e-05, "loss": 2.0721, "step": 2328 }, { "epoch": 0.21903345855731415, "grad_norm": 0.4220491945743561, "learning_rate": 6.451295679032496e-05, "loss": 2.0723, "step": 2334 }, { "epoch": 0.219596526574171, "grad_norm": 0.45116424560546875, "learning_rate": 6.443019478474358e-05, "loss": 2.0712, "step": 2340 }, { "epoch": 0.22015959459102785, "grad_norm": 0.4368237257003784, "learning_rate": 6.43477504838154e-05, "loss": 2.072, "step": 2346 }, { "epoch": 0.2207226626078847, "grad_norm": 0.3944617807865143, "learning_rate": 6.42656218600678e-05, "loss": 2.0417, "step": 2352 }, { "epoch": 0.22128573062474155, "grad_norm": 0.33972054719924927, "learning_rate": 6.418380690409608e-05, "loss": 2.0536, "step": 2358 }, { "epoch": 0.2218487986415984, "grad_norm": 0.3861721456050873, "learning_rate": 6.410230362435689e-05, "loss": 2.0773, "step": 2364 }, { "epoch": 0.22241186665845525, "grad_norm": 0.36385035514831543, "learning_rate": 6.402111004696475e-05, "loss": 2.0491, "step": 2370 }, { "epoch": 0.2229749346753121, "grad_norm": 0.4009208083152771, "learning_rate": 6.394022421549109e-05, "loss": 2.0291, "step": 2376 }, { "epoch": 0.22353800269216895, "grad_norm": 0.43149232864379883, "learning_rate": 6.385964419076639e-05, "loss": 2.0883, "step": 2382 }, { "epoch": 0.2241010707090258, "grad_norm": 0.417315810918808, "learning_rate": 6.377936805068487e-05, "loss": 2.0532, "step": 2388 }, { "epoch": 0.22466413872588264, "grad_norm": 0.37928810715675354, "learning_rate": 6.36993938900119e-05, "loss": 2.0404, "step": 2394 }, { "epoch": 0.2252272067427395, "grad_norm": 0.4288337230682373, "learning_rate": 6.361971982019412e-05, "loss": 2.0684, "step": 2400 }, { "epoch": 0.2252272067427395, "eval_accuracy": 0.5699872997264557, "eval_loss": 2.0868825912475586, "eval_runtime": 88.1274, "eval_samples_per_second": 4.539, "eval_steps_per_second": 1.135, "step": 2400 }, { "epoch": 0.22579027475959634, "grad_norm": 0.39167284965515137, "learning_rate": 6.354034396917204e-05, "loss": 2.0309, "step": 2406 }, { "epoch": 0.2263533427764532, "grad_norm": 0.45797017216682434, "learning_rate": 6.346126448119546e-05, "loss": 2.0324, "step": 2412 }, { "epoch": 0.22691641079331004, "grad_norm": 0.3881801664829254, "learning_rate": 6.338247951664103e-05, "loss": 2.0016, "step": 2418 }, { "epoch": 0.2274794788101669, "grad_norm": 0.3974197208881378, "learning_rate": 6.330398725183271e-05, "loss": 2.0592, "step": 2424 }, { "epoch": 0.22804254682702374, "grad_norm": 0.3651202917098999, "learning_rate": 6.322578587886434e-05, "loss": 2.0618, "step": 2430 }, { "epoch": 0.2286056148438806, "grad_norm": 0.4772557020187378, "learning_rate": 6.314787360542476e-05, "loss": 2.0374, "step": 2436 }, { "epoch": 0.22916868286073744, "grad_norm": 0.4444785714149475, "learning_rate": 6.307024865462531e-05, "loss": 2.0465, "step": 2442 }, { "epoch": 0.2297317508775943, "grad_norm": 0.40455254912376404, "learning_rate": 6.299290926482963e-05, "loss": 2.082, "step": 2448 }, { "epoch": 0.23029481889445114, "grad_norm": 0.45888659358024597, "learning_rate": 6.291585368948571e-05, "loss": 2.0492, "step": 2454 }, { "epoch": 0.23085788691130799, "grad_norm": 0.4138409495353699, "learning_rate": 6.283908019696024e-05, "loss": 2.0763, "step": 2460 }, { "epoch": 0.23142095492816483, "grad_norm": 0.3871881663799286, "learning_rate": 6.276258707037516e-05, "loss": 2.0195, "step": 2466 }, { "epoch": 0.23198402294502168, "grad_norm": 0.36518746614456177, "learning_rate": 6.268637260744643e-05, "loss": 2.0724, "step": 2472 }, { "epoch": 0.23254709096187853, "grad_norm": 0.35598716139793396, "learning_rate": 6.261043512032493e-05, "loss": 2.0158, "step": 2478 }, { "epoch": 0.23311015897873538, "grad_norm": 0.4468774199485779, "learning_rate": 6.253477293543935e-05, "loss": 2.1059, "step": 2484 }, { "epoch": 0.23367322699559223, "grad_norm": 0.4021683931350708, "learning_rate": 6.245938439334141e-05, "loss": 2.026, "step": 2490 }, { "epoch": 0.23423629501244908, "grad_norm": 0.40791988372802734, "learning_rate": 6.238426784855298e-05, "loss": 2.1173, "step": 2496 }, { "epoch": 0.23479936302930593, "grad_norm": 0.3916425406932831, "learning_rate": 6.230942166941513e-05, "loss": 2.0438, "step": 2502 }, { "epoch": 0.23536243104616278, "grad_norm": 0.4245172441005707, "learning_rate": 6.223484423793936e-05, "loss": 2.0043, "step": 2508 }, { "epoch": 0.23592549906301963, "grad_norm": 0.428930401802063, "learning_rate": 6.21605339496607e-05, "loss": 2.0718, "step": 2514 }, { "epoch": 0.23648856707987648, "grad_norm": 0.4980619251728058, "learning_rate": 6.208648921349269e-05, "loss": 2.0652, "step": 2520 }, { "epoch": 0.23705163509673333, "grad_norm": 0.42203381657600403, "learning_rate": 6.201270845158424e-05, "loss": 2.0915, "step": 2526 }, { "epoch": 0.23761470311359018, "grad_norm": 0.38787636160850525, "learning_rate": 6.193919009917858e-05, "loss": 2.0714, "step": 2532 }, { "epoch": 0.23817777113044702, "grad_norm": 0.3857220411300659, "learning_rate": 6.186593260447366e-05, "loss": 2.0452, "step": 2538 }, { "epoch": 0.23874083914730387, "grad_norm": 0.3777729868888855, "learning_rate": 6.179293442848478e-05, "loss": 2.0535, "step": 2544 }, { "epoch": 0.23930390716416072, "grad_norm": 0.42431920766830444, "learning_rate": 6.172019404490864e-05, "loss": 2.0589, "step": 2550 }, { "epoch": 0.23986697518101757, "grad_norm": 0.42051079869270325, "learning_rate": 6.164770993998942e-05, "loss": 2.0711, "step": 2556 }, { "epoch": 0.24043004319787442, "grad_norm": 0.3624160587787628, "learning_rate": 6.157548061238637e-05, "loss": 2.0478, "step": 2562 }, { "epoch": 0.24099311121473127, "grad_norm": 0.37505748867988586, "learning_rate": 6.150350457304327e-05, "loss": 2.0362, "step": 2568 }, { "epoch": 0.24155617923158812, "grad_norm": 0.4620640277862549, "learning_rate": 6.143178034505949e-05, "loss": 2.0839, "step": 2574 }, { "epoch": 0.24211924724844497, "grad_norm": 0.35475900769233704, "learning_rate": 6.136030646356263e-05, "loss": 2.0821, "step": 2580 }, { "epoch": 0.24268231526530182, "grad_norm": 0.41738393902778625, "learning_rate": 6.128908147558298e-05, "loss": 2.0746, "step": 2586 }, { "epoch": 0.24324538328215867, "grad_norm": 0.4684674143791199, "learning_rate": 6.12181039399294e-05, "loss": 2.0366, "step": 2592 }, { "epoch": 0.24380845129901552, "grad_norm": 0.389291375875473, "learning_rate": 6.114737242706682e-05, "loss": 2.0515, "step": 2598 }, { "epoch": 0.24437151931587237, "grad_norm": 0.4097399413585663, "learning_rate": 6.107688551899533e-05, "loss": 2.0642, "step": 2604 }, { "epoch": 0.24493458733272921, "grad_norm": 0.3804687261581421, "learning_rate": 6.10066418091308e-05, "loss": 2.0171, "step": 2610 }, { "epoch": 0.24549765534958606, "grad_norm": 0.3644648790359497, "learning_rate": 6.093663990218699e-05, "loss": 1.9995, "step": 2616 }, { "epoch": 0.2460607233664429, "grad_norm": 0.35695719718933105, "learning_rate": 6.08668784140591e-05, "loss": 2.0424, "step": 2622 }, { "epoch": 0.24662379138329976, "grad_norm": 0.49889206886291504, "learning_rate": 6.0797355971708884e-05, "loss": 2.0189, "step": 2628 }, { "epoch": 0.2471868594001566, "grad_norm": 0.3799843490123749, "learning_rate": 6.072807121305108e-05, "loss": 2.0575, "step": 2634 }, { "epoch": 0.24774992741701346, "grad_norm": 0.41617363691329956, "learning_rate": 6.065902278684141e-05, "loss": 2.0544, "step": 2640 }, { "epoch": 0.2483129954338703, "grad_norm": 0.4012010097503662, "learning_rate": 6.059020935256583e-05, "loss": 2.0938, "step": 2646 }, { "epoch": 0.24887606345072716, "grad_norm": 0.4061677157878876, "learning_rate": 6.052162958033122e-05, "loss": 2.0418, "step": 2652 }, { "epoch": 0.249439131467584, "grad_norm": 0.40574613213539124, "learning_rate": 6.0453282150757516e-05, "loss": 2.0218, "step": 2658 }, { "epoch": 0.25000219948444086, "grad_norm": 0.49144884943962097, "learning_rate": 6.038516575487099e-05, "loss": 2.0415, "step": 2664 }, { "epoch": 0.2505652675012977, "grad_norm": 0.3971235156059265, "learning_rate": 6.031727909399902e-05, "loss": 2.0529, "step": 2670 }, { "epoch": 0.25112833551815456, "grad_norm": 0.3698517680168152, "learning_rate": 6.02496208796661e-05, "loss": 2.0446, "step": 2676 }, { "epoch": 0.2516914035350114, "grad_norm": 0.35364294052124023, "learning_rate": 6.0182189833491124e-05, "loss": 2.0007, "step": 2682 }, { "epoch": 0.25225447155186825, "grad_norm": 0.40880894660949707, "learning_rate": 6.011498468708589e-05, "loss": 2.017, "step": 2688 }, { "epoch": 0.2528175395687251, "grad_norm": 0.35359126329421997, "learning_rate": 6.004800418195496e-05, "loss": 2.0232, "step": 2694 }, { "epoch": 0.25338060758558195, "grad_norm": 0.3863252103328705, "learning_rate": 5.998124706939661e-05, "loss": 2.0224, "step": 2700 }, { "epoch": 0.2539436756024388, "grad_norm": 0.543828547000885, "learning_rate": 5.991471211040513e-05, "loss": 2.0464, "step": 2706 }, { "epoch": 0.25450674361929565, "grad_norm": 0.3437909483909607, "learning_rate": 5.9848398075574166e-05, "loss": 2.0044, "step": 2712 }, { "epoch": 0.2550698116361525, "grad_norm": 0.35814785957336426, "learning_rate": 5.978230374500136e-05, "loss": 2.0213, "step": 2718 }, { "epoch": 0.25563287965300935, "grad_norm": 0.37194880843162537, "learning_rate": 5.9716427908194056e-05, "loss": 2.0479, "step": 2724 }, { "epoch": 0.2561959476698662, "grad_norm": 0.3700201213359833, "learning_rate": 5.965076936397624e-05, "loss": 2.0779, "step": 2730 }, { "epoch": 0.25675901568672305, "grad_norm": 0.39581894874572754, "learning_rate": 5.958532692039649e-05, "loss": 2.0844, "step": 2736 }, { "epoch": 0.2573220837035799, "grad_norm": 0.4304547607898712, "learning_rate": 5.952009939463713e-05, "loss": 2.0307, "step": 2742 }, { "epoch": 0.25788515172043674, "grad_norm": 0.37728819251060486, "learning_rate": 5.945508561292444e-05, "loss": 2.0305, "step": 2748 }, { "epoch": 0.2584482197372936, "grad_norm": 0.3683187961578369, "learning_rate": 5.93902844104399e-05, "loss": 2.027, "step": 2754 }, { "epoch": 0.25901128775415044, "grad_norm": 0.48876839876174927, "learning_rate": 5.9325694631232605e-05, "loss": 2.0415, "step": 2760 }, { "epoch": 0.2595743557710073, "grad_norm": 0.38156595826148987, "learning_rate": 5.9261315128132574e-05, "loss": 2.0641, "step": 2766 }, { "epoch": 0.26013742378786414, "grad_norm": 0.38213634490966797, "learning_rate": 5.9197144762665264e-05, "loss": 2.0125, "step": 2772 }, { "epoch": 0.260700491804721, "grad_norm": 0.39129582047462463, "learning_rate": 5.9133182404966956e-05, "loss": 2.0884, "step": 2778 }, { "epoch": 0.26126355982157784, "grad_norm": 0.3738529682159424, "learning_rate": 5.9069426933701174e-05, "loss": 2.0231, "step": 2784 }, { "epoch": 0.2618266278384347, "grad_norm": 0.35130468010902405, "learning_rate": 5.900587723597622e-05, "loss": 2.0255, "step": 2790 }, { "epoch": 0.26238969585529154, "grad_norm": 0.36425885558128357, "learning_rate": 5.894253220726348e-05, "loss": 2.0446, "step": 2796 }, { "epoch": 0.2627650745331961, "eval_accuracy": 0.5724208675263774, "eval_loss": 2.0747954845428467, "eval_runtime": 87.9764, "eval_samples_per_second": 4.547, "eval_steps_per_second": 1.137, "step": 2800 }, { "epoch": 0.2629527638721484, "grad_norm": 0.3615624010562897, "learning_rate": 5.8879390751316825e-05, "loss": 2.0338, "step": 2802 }, { "epoch": 0.26351583188900524, "grad_norm": 0.4102693498134613, "learning_rate": 5.881645178009295e-05, "loss": 2.0254, "step": 2808 }, { "epoch": 0.2640788999058621, "grad_norm": 0.3537704646587372, "learning_rate": 5.875371421367264e-05, "loss": 2.0115, "step": 2814 }, { "epoch": 0.26464196792271893, "grad_norm": 0.3698138892650604, "learning_rate": 5.869117698018288e-05, "loss": 2.0565, "step": 2820 }, { "epoch": 0.2652050359395758, "grad_norm": 0.39750632643699646, "learning_rate": 5.8628839015719986e-05, "loss": 2.0529, "step": 2826 }, { "epoch": 0.26576810395643263, "grad_norm": 0.43249914050102234, "learning_rate": 5.856669926427359e-05, "loss": 2.0286, "step": 2832 }, { "epoch": 0.2663311719732895, "grad_norm": 0.39230218529701233, "learning_rate": 5.8504756677651487e-05, "loss": 2.0733, "step": 2838 }, { "epoch": 0.26689423999014633, "grad_norm": 0.36384284496307373, "learning_rate": 5.8443010215405354e-05, "loss": 2.066, "step": 2844 }, { "epoch": 0.2674573080070032, "grad_norm": 0.3756430149078369, "learning_rate": 5.838145884475742e-05, "loss": 2.0186, "step": 2850 }, { "epoch": 0.26802037602386003, "grad_norm": 0.3530266284942627, "learning_rate": 5.8320101540527855e-05, "loss": 2.0172, "step": 2856 }, { "epoch": 0.2685834440407169, "grad_norm": 0.38201698660850525, "learning_rate": 5.8258937285063016e-05, "loss": 2.0407, "step": 2862 }, { "epoch": 0.26914651205757373, "grad_norm": 0.36573222279548645, "learning_rate": 5.8197965068164656e-05, "loss": 2.0128, "step": 2868 }, { "epoch": 0.2697095800744306, "grad_norm": 0.35366177558898926, "learning_rate": 5.8137183887019783e-05, "loss": 2.0428, "step": 2874 }, { "epoch": 0.2702726480912874, "grad_norm": 0.3702300488948822, "learning_rate": 5.8076592746131384e-05, "loss": 2.02, "step": 2880 }, { "epoch": 0.2708357161081443, "grad_norm": 0.3710714876651764, "learning_rate": 5.8016190657249965e-05, "loss": 2.0353, "step": 2886 }, { "epoch": 0.2713987841250011, "grad_norm": 0.49205076694488525, "learning_rate": 5.795597663930587e-05, "loss": 2.0223, "step": 2892 }, { "epoch": 0.271961852141858, "grad_norm": 0.4656328856945038, "learning_rate": 5.7895949718342306e-05, "loss": 2.0662, "step": 2898 }, { "epoch": 0.2725249201587148, "grad_norm": 0.4834170937538147, "learning_rate": 5.7836108927449193e-05, "loss": 2.0453, "step": 2904 }, { "epoch": 0.27308798817557167, "grad_norm": 0.4475932717323303, "learning_rate": 5.7776453306697806e-05, "loss": 2.0475, "step": 2910 }, { "epoch": 0.2736510561924285, "grad_norm": 0.42237889766693115, "learning_rate": 5.7716981903075986e-05, "loss": 2.0166, "step": 2916 }, { "epoch": 0.27421412420928537, "grad_norm": 0.4259392023086548, "learning_rate": 5.7657693770424295e-05, "loss": 2.0139, "step": 2922 }, { "epoch": 0.2747771922261422, "grad_norm": 0.36421456933021545, "learning_rate": 5.759858796937277e-05, "loss": 2.0544, "step": 2928 }, { "epoch": 0.27534026024299907, "grad_norm": 0.4016430675983429, "learning_rate": 5.7539663567278374e-05, "loss": 1.9988, "step": 2934 }, { "epoch": 0.27590332825985586, "grad_norm": 0.38135629892349243, "learning_rate": 5.748091963816323e-05, "loss": 1.9818, "step": 2940 }, { "epoch": 0.2764663962767127, "grad_norm": 0.41381314396858215, "learning_rate": 5.742235526265347e-05, "loss": 2.0478, "step": 2946 }, { "epoch": 0.27702946429356956, "grad_norm": 0.3991234004497528, "learning_rate": 5.7363969527918824e-05, "loss": 2.0464, "step": 2952 }, { "epoch": 0.2775925323104264, "grad_norm": 0.43262916803359985, "learning_rate": 5.7305761527612875e-05, "loss": 2.0753, "step": 2958 }, { "epoch": 0.27815560032728326, "grad_norm": 0.37170666456222534, "learning_rate": 5.724773036181389e-05, "loss": 2.055, "step": 2964 }, { "epoch": 0.2787186683441401, "grad_norm": 0.38720643520355225, "learning_rate": 5.71898751369665e-05, "loss": 2.0689, "step": 2970 }, { "epoch": 0.27928173636099696, "grad_norm": 0.465340793132782, "learning_rate": 5.713219496582381e-05, "loss": 1.9902, "step": 2976 }, { "epoch": 0.2798448043778538, "grad_norm": 0.4295141398906708, "learning_rate": 5.707468896739032e-05, "loss": 2.0434, "step": 2982 }, { "epoch": 0.28040787239471066, "grad_norm": 0.4417092502117157, "learning_rate": 5.701735626686536e-05, "loss": 2.0067, "step": 2988 }, { "epoch": 0.2809709404115675, "grad_norm": 0.44249144196510315, "learning_rate": 5.696019599558723e-05, "loss": 2.0258, "step": 2994 }, { "epoch": 0.28153400842842435, "grad_norm": 0.41725948452949524, "learning_rate": 5.6903207290977895e-05, "loss": 2.0643, "step": 3000 }, { "epoch": 0.2820970764452812, "grad_norm": 0.3545498549938202, "learning_rate": 5.684638929648833e-05, "loss": 2.0409, "step": 3006 }, { "epoch": 0.28266014446213805, "grad_norm": 0.3477479815483093, "learning_rate": 5.6789741161544485e-05, "loss": 2.0328, "step": 3012 }, { "epoch": 0.2832232124789949, "grad_norm": 0.3443724811077118, "learning_rate": 5.673326204149368e-05, "loss": 2.0101, "step": 3018 }, { "epoch": 0.28378628049585175, "grad_norm": 0.38158220052719116, "learning_rate": 5.667695109755186e-05, "loss": 1.9945, "step": 3024 }, { "epoch": 0.2843493485127086, "grad_norm": 0.4364180862903595, "learning_rate": 5.662080749675122e-05, "loss": 2.0643, "step": 3030 }, { "epoch": 0.28491241652956545, "grad_norm": 0.38518819212913513, "learning_rate": 5.656483041188837e-05, "loss": 2.0264, "step": 3036 }, { "epoch": 0.2854754845464223, "grad_norm": 0.3753647804260254, "learning_rate": 5.650901902147329e-05, "loss": 2.0396, "step": 3042 }, { "epoch": 0.28603855256327915, "grad_norm": 0.4051103889942169, "learning_rate": 5.6453372509678604e-05, "loss": 2.0169, "step": 3048 }, { "epoch": 0.286601620580136, "grad_norm": 0.4188491702079773, "learning_rate": 5.639789006628947e-05, "loss": 2.0293, "step": 3054 }, { "epoch": 0.28716468859699285, "grad_norm": 0.4279583692550659, "learning_rate": 5.634257088665414e-05, "loss": 2.0516, "step": 3060 }, { "epoch": 0.2877277566138497, "grad_norm": 0.4570727050304413, "learning_rate": 5.6287414171634846e-05, "loss": 2.0575, "step": 3066 }, { "epoch": 0.28829082463070654, "grad_norm": 0.3688872456550598, "learning_rate": 5.6232419127559326e-05, "loss": 2.0717, "step": 3072 }, { "epoch": 0.2888538926475634, "grad_norm": 0.39403700828552246, "learning_rate": 5.6177584966172934e-05, "loss": 2.0547, "step": 3078 }, { "epoch": 0.28941696066442024, "grad_norm": 0.4359886348247528, "learning_rate": 5.612291090459109e-05, "loss": 2.0335, "step": 3084 }, { "epoch": 0.2899800286812771, "grad_norm": 0.3721937835216522, "learning_rate": 5.606839616525239e-05, "loss": 2.0388, "step": 3090 }, { "epoch": 0.29054309669813394, "grad_norm": 0.38884449005126953, "learning_rate": 5.6014039975872164e-05, "loss": 2.0225, "step": 3096 }, { "epoch": 0.2911061647149908, "grad_norm": 0.3890187740325928, "learning_rate": 5.595984156939648e-05, "loss": 2.0147, "step": 3102 }, { "epoch": 0.29166923273184764, "grad_norm": 0.39999085664749146, "learning_rate": 5.590580018395672e-05, "loss": 2.0036, "step": 3108 }, { "epoch": 0.2922323007487045, "grad_norm": 0.46363580226898193, "learning_rate": 5.585191506282458e-05, "loss": 2.0849, "step": 3114 }, { "epoch": 0.29279536876556134, "grad_norm": 0.4399106204509735, "learning_rate": 5.5798185454367605e-05, "loss": 2.0546, "step": 3120 }, { "epoch": 0.2933584367824182, "grad_norm": 0.4152950048446655, "learning_rate": 5.574461061200508e-05, "loss": 2.0366, "step": 3126 }, { "epoch": 0.29392150479927504, "grad_norm": 0.39241957664489746, "learning_rate": 5.5691189794164535e-05, "loss": 2.006, "step": 3132 }, { "epoch": 0.2944845728161319, "grad_norm": 0.4056050479412079, "learning_rate": 5.563792226423857e-05, "loss": 2.0223, "step": 3138 }, { "epoch": 0.29504764083298873, "grad_norm": 0.4625614583492279, "learning_rate": 5.558480729054224e-05, "loss": 2.003, "step": 3144 }, { "epoch": 0.2956107088498456, "grad_norm": 0.45702484250068665, "learning_rate": 5.553184414627084e-05, "loss": 1.9814, "step": 3150 }, { "epoch": 0.29617377686670243, "grad_norm": 0.4293862283229828, "learning_rate": 5.5479032109458086e-05, "loss": 2.036, "step": 3156 }, { "epoch": 0.2967368448835593, "grad_norm": 0.4018986225128174, "learning_rate": 5.542637046293486e-05, "loss": 2.0142, "step": 3162 }, { "epoch": 0.29729991290041613, "grad_norm": 0.37095019221305847, "learning_rate": 5.537385849428821e-05, "loss": 2.0189, "step": 3168 }, { "epoch": 0.297862980917273, "grad_norm": 0.38865411281585693, "learning_rate": 5.5321495495820976e-05, "loss": 2.0082, "step": 3174 }, { "epoch": 0.29842604893412983, "grad_norm": 0.37039583921432495, "learning_rate": 5.5269280764511624e-05, "loss": 2.0314, "step": 3180 }, { "epoch": 0.2989891169509867, "grad_norm": 0.3515841066837311, "learning_rate": 5.5217213601974714e-05, "loss": 2.0142, "step": 3186 }, { "epoch": 0.2995521849678435, "grad_norm": 0.38627445697784424, "learning_rate": 5.516529331442158e-05, "loss": 2.0108, "step": 3192 }, { "epoch": 0.3001152529847004, "grad_norm": 0.504530131816864, "learning_rate": 5.511351921262152e-05, "loss": 2.0709, "step": 3198 }, { "epoch": 0.3003029423236527, "eval_accuracy": 0.5744783118405627, "eval_loss": 2.0626447200775146, "eval_runtime": 88.0995, "eval_samples_per_second": 4.54, "eval_steps_per_second": 1.135, "step": 3200 }, { "epoch": 0.3006783210015572, "grad_norm": 0.415378600358963, "learning_rate": 5.5061890611863385e-05, "loss": 2.0141, "step": 3204 }, { "epoch": 0.3012413890184141, "grad_norm": 0.42509496212005615, "learning_rate": 5.501040683191758e-05, "loss": 1.9947, "step": 3210 }, { "epoch": 0.3018044570352709, "grad_norm": 0.4218599498271942, "learning_rate": 5.495906719699834e-05, "loss": 2.0641, "step": 3216 }, { "epoch": 0.3023675250521278, "grad_norm": 0.4598177671432495, "learning_rate": 5.4907871035726546e-05, "loss": 2.0384, "step": 3222 }, { "epoch": 0.3029305930689846, "grad_norm": 0.511275053024292, "learning_rate": 5.485681768109278e-05, "loss": 1.9988, "step": 3228 }, { "epoch": 0.30349366108584147, "grad_norm": 0.42812275886535645, "learning_rate": 5.4805906470420936e-05, "loss": 2.0184, "step": 3234 }, { "epoch": 0.3040567291026983, "grad_norm": 0.38051459193229675, "learning_rate": 5.475513674533194e-05, "loss": 2.0134, "step": 3240 }, { "epoch": 0.30461979711955517, "grad_norm": 0.44344890117645264, "learning_rate": 5.470450785170814e-05, "loss": 2.0382, "step": 3246 }, { "epoch": 0.305182865136412, "grad_norm": 0.40117329359054565, "learning_rate": 5.4654019139657804e-05, "loss": 2.0119, "step": 3252 }, { "epoch": 0.30574593315326887, "grad_norm": 0.37831515073776245, "learning_rate": 5.460366996348014e-05, "loss": 2.0252, "step": 3258 }, { "epoch": 0.3063090011701257, "grad_norm": 0.37700262665748596, "learning_rate": 5.4553459681630594e-05, "loss": 2.0193, "step": 3264 }, { "epoch": 0.30687206918698257, "grad_norm": 0.4053364396095276, "learning_rate": 5.45033876566865e-05, "loss": 1.9863, "step": 3270 }, { "epoch": 0.3074351372038394, "grad_norm": 0.3868497908115387, "learning_rate": 5.4453453255313126e-05, "loss": 2.0093, "step": 3276 }, { "epoch": 0.30799820522069626, "grad_norm": 0.3937080502510071, "learning_rate": 5.440365584823001e-05, "loss": 1.9924, "step": 3282 }, { "epoch": 0.3085612732375531, "grad_norm": 0.3999801278114319, "learning_rate": 5.4353994810177634e-05, "loss": 2.0308, "step": 3288 }, { "epoch": 0.30912434125440996, "grad_norm": 0.4131228029727936, "learning_rate": 5.430446951988451e-05, "loss": 1.9673, "step": 3294 }, { "epoch": 0.3096874092712668, "grad_norm": 0.3545723557472229, "learning_rate": 5.425507936003445e-05, "loss": 2.0361, "step": 3300 }, { "epoch": 0.31025047728812366, "grad_norm": 0.34958207607269287, "learning_rate": 5.4205823717234335e-05, "loss": 2.0333, "step": 3306 }, { "epoch": 0.3108135453049805, "grad_norm": 0.4413776099681854, "learning_rate": 5.415670198198204e-05, "loss": 2.0359, "step": 3312 }, { "epoch": 0.31137661332183736, "grad_norm": 0.3996366262435913, "learning_rate": 5.410771354863483e-05, "loss": 2.0283, "step": 3318 }, { "epoch": 0.3119396813386942, "grad_norm": 0.4103921055793762, "learning_rate": 5.4058857815377944e-05, "loss": 2.0092, "step": 3324 }, { "epoch": 0.31250274935555106, "grad_norm": 0.44243037700653076, "learning_rate": 5.401013418419357e-05, "loss": 2.0154, "step": 3330 }, { "epoch": 0.3130658173724079, "grad_norm": 0.38803765177726746, "learning_rate": 5.396154206083008e-05, "loss": 1.9631, "step": 3336 }, { "epoch": 0.31362888538926476, "grad_norm": 0.40536633133888245, "learning_rate": 5.391308085477161e-05, "loss": 2.0559, "step": 3342 }, { "epoch": 0.3141919534061216, "grad_norm": 0.39018675684928894, "learning_rate": 5.3864749979207926e-05, "loss": 2.0851, "step": 3348 }, { "epoch": 0.31475502142297845, "grad_norm": 0.46329465508461, "learning_rate": 5.381654885100454e-05, "loss": 1.9985, "step": 3354 }, { "epoch": 0.3153180894398353, "grad_norm": 0.5089358687400818, "learning_rate": 5.3768476890673196e-05, "loss": 2.0476, "step": 3360 }, { "epoch": 0.31588115745669215, "grad_norm": 0.4741554260253906, "learning_rate": 5.3720533522342584e-05, "loss": 2.0086, "step": 3366 }, { "epoch": 0.316444225473549, "grad_norm": 0.4743395447731018, "learning_rate": 5.367271817372941e-05, "loss": 2.0013, "step": 3372 }, { "epoch": 0.31700729349040585, "grad_norm": 0.4550338089466095, "learning_rate": 5.362503027610965e-05, "loss": 2.0521, "step": 3378 }, { "epoch": 0.3175703615072627, "grad_norm": 0.42378419637680054, "learning_rate": 5.357746926429017e-05, "loss": 2.0185, "step": 3384 }, { "epoch": 0.31813342952411955, "grad_norm": 0.4124879539012909, "learning_rate": 5.353003457658057e-05, "loss": 2.0365, "step": 3390 }, { "epoch": 0.3186964975409764, "grad_norm": 0.3821447491645813, "learning_rate": 5.348272565476537e-05, "loss": 2.0758, "step": 3396 }, { "epoch": 0.31925956555783325, "grad_norm": 0.4227793514728546, "learning_rate": 5.343554194407635e-05, "loss": 2.0069, "step": 3402 }, { "epoch": 0.3198226335746901, "grad_norm": 0.37183356285095215, "learning_rate": 5.3388482893165285e-05, "loss": 2.0061, "step": 3408 }, { "epoch": 0.32038570159154695, "grad_norm": 0.5581812262535095, "learning_rate": 5.334154795407688e-05, "loss": 2.024, "step": 3414 }, { "epoch": 0.3209487696084038, "grad_norm": 0.4077145457267761, "learning_rate": 5.3294736582221894e-05, "loss": 1.9898, "step": 3420 }, { "epoch": 0.32151183762526064, "grad_norm": 0.38057756423950195, "learning_rate": 5.324804823635075e-05, "loss": 2.0474, "step": 3426 }, { "epoch": 0.3220749056421175, "grad_norm": 0.4186228811740875, "learning_rate": 5.320148237852708e-05, "loss": 2.0002, "step": 3432 }, { "epoch": 0.32263797365897434, "grad_norm": 0.3820471167564392, "learning_rate": 5.315503847410182e-05, "loss": 2.0109, "step": 3438 }, { "epoch": 0.3232010416758312, "grad_norm": 0.48680955171585083, "learning_rate": 5.310871599168741e-05, "loss": 2.0044, "step": 3444 }, { "epoch": 0.32376410969268804, "grad_norm": 0.38966673612594604, "learning_rate": 5.3062514403132166e-05, "loss": 2.0028, "step": 3450 }, { "epoch": 0.3243271777095449, "grad_norm": 0.4388502836227417, "learning_rate": 5.301643318349509e-05, "loss": 2.01, "step": 3456 }, { "epoch": 0.32489024572640174, "grad_norm": 0.3897734582424164, "learning_rate": 5.2970471811020804e-05, "loss": 1.9788, "step": 3462 }, { "epoch": 0.3254533137432586, "grad_norm": 0.35381340980529785, "learning_rate": 5.292462976711467e-05, "loss": 2.0155, "step": 3468 }, { "epoch": 0.32601638176011544, "grad_norm": 0.4205129146575928, "learning_rate": 5.287890653631827e-05, "loss": 2.0296, "step": 3474 }, { "epoch": 0.3265794497769723, "grad_norm": 0.40094494819641113, "learning_rate": 5.283330160628512e-05, "loss": 2.0031, "step": 3480 }, { "epoch": 0.32714251779382914, "grad_norm": 0.394264817237854, "learning_rate": 5.27878144677564e-05, "loss": 2.0439, "step": 3486 }, { "epoch": 0.327705585810686, "grad_norm": 0.3710891604423523, "learning_rate": 5.274244461453722e-05, "loss": 2.0294, "step": 3492 }, { "epoch": 0.32826865382754283, "grad_norm": 0.3848089873790741, "learning_rate": 5.269719154347287e-05, "loss": 2.0674, "step": 3498 }, { "epoch": 0.3288317218443997, "grad_norm": 0.36767858266830444, "learning_rate": 5.265205475442544e-05, "loss": 1.9797, "step": 3504 }, { "epoch": 0.32939478986125653, "grad_norm": 0.3754799962043762, "learning_rate": 5.260703375025054e-05, "loss": 2.0472, "step": 3510 }, { "epoch": 0.3299578578781134, "grad_norm": 0.382953941822052, "learning_rate": 5.256212803677441e-05, "loss": 2.0221, "step": 3516 }, { "epoch": 0.33052092589497023, "grad_norm": 0.46734532713890076, "learning_rate": 5.251733712277099e-05, "loss": 2.0134, "step": 3522 }, { "epoch": 0.3310839939118271, "grad_norm": 0.4192453920841217, "learning_rate": 5.2472660519939486e-05, "loss": 2.0411, "step": 3528 }, { "epoch": 0.33164706192868393, "grad_norm": 0.37821507453918457, "learning_rate": 5.242809774288194e-05, "loss": 2.0763, "step": 3534 }, { "epoch": 0.3322101299455408, "grad_norm": 0.47242215275764465, "learning_rate": 5.238364830908107e-05, "loss": 2.062, "step": 3540 }, { "epoch": 0.3327731979623976, "grad_norm": 0.35488608479499817, "learning_rate": 5.2339311738878386e-05, "loss": 2.0342, "step": 3546 }, { "epoch": 0.3333362659792545, "grad_norm": 0.40076401829719543, "learning_rate": 5.2295087555452404e-05, "loss": 2.0577, "step": 3552 }, { "epoch": 0.3338993339961113, "grad_norm": 0.3867778778076172, "learning_rate": 5.225097528479711e-05, "loss": 2.0176, "step": 3558 }, { "epoch": 0.3344624020129682, "grad_norm": 0.40837720036506653, "learning_rate": 5.220697445570066e-05, "loss": 1.98, "step": 3564 }, { "epoch": 0.335025470029825, "grad_norm": 0.3657865524291992, "learning_rate": 5.2163084599724194e-05, "loss": 2.0948, "step": 3570 }, { "epoch": 0.3355885380466819, "grad_norm": 0.41901615262031555, "learning_rate": 5.211930525118089e-05, "loss": 1.9741, "step": 3576 }, { "epoch": 0.3361516060635387, "grad_norm": 0.369616836309433, "learning_rate": 5.207563594711526e-05, "loss": 2.0069, "step": 3582 }, { "epoch": 0.33671467408039557, "grad_norm": 0.3916521370410919, "learning_rate": 5.203207622728253e-05, "loss": 2.0424, "step": 3588 }, { "epoch": 0.3372777420972524, "grad_norm": 0.3725515604019165, "learning_rate": 5.19886256341283e-05, "loss": 2.0209, "step": 3594 }, { "epoch": 0.33784081011410927, "grad_norm": 0.34721139073371887, "learning_rate": 5.194528371276838e-05, "loss": 2.0322, "step": 3600 }, { "epoch": 0.33784081011410927, "eval_accuracy": 0.5755509964830011, "eval_loss": 2.0539650917053223, "eval_runtime": 88.0551, "eval_samples_per_second": 4.543, "eval_steps_per_second": 1.136, "step": 3600 }, { "epoch": 0.3384038781309661, "grad_norm": 0.5088382959365845, "learning_rate": 5.1902050010968727e-05, "loss": 2.0801, "step": 3606 }, { "epoch": 0.33896694614782297, "grad_norm": 0.3996741473674774, "learning_rate": 5.185892407912565e-05, "loss": 2.0454, "step": 3612 }, { "epoch": 0.3395300141646798, "grad_norm": 0.40185314416885376, "learning_rate": 5.181590547024622e-05, "loss": 1.9889, "step": 3618 }, { "epoch": 0.34009308218153667, "grad_norm": 0.4054214060306549, "learning_rate": 5.1772993739928756e-05, "loss": 2.0184, "step": 3624 }, { "epoch": 0.3406561501983935, "grad_norm": 0.4095800817012787, "learning_rate": 5.1730188446343546e-05, "loss": 1.9936, "step": 3630 }, { "epoch": 0.34121921821525036, "grad_norm": 0.44072896242141724, "learning_rate": 5.168748915021376e-05, "loss": 2.0167, "step": 3636 }, { "epoch": 0.3417822862321072, "grad_norm": 0.4206717312335968, "learning_rate": 5.164489541479653e-05, "loss": 2.0268, "step": 3642 }, { "epoch": 0.34234535424896406, "grad_norm": 0.41684889793395996, "learning_rate": 5.1602406805864146e-05, "loss": 1.9614, "step": 3648 }, { "epoch": 0.3429084222658209, "grad_norm": 0.4334743320941925, "learning_rate": 5.15600228916855e-05, "loss": 1.9908, "step": 3654 }, { "epoch": 0.34347149028267776, "grad_norm": 0.3616984188556671, "learning_rate": 5.1517743243007634e-05, "loss": 1.9746, "step": 3660 }, { "epoch": 0.3440345582995346, "grad_norm": 0.3747680187225342, "learning_rate": 5.1475567433037505e-05, "loss": 2.0203, "step": 3666 }, { "epoch": 0.34459762631639146, "grad_norm": 0.37610551714897156, "learning_rate": 5.1433495037423875e-05, "loss": 1.9802, "step": 3672 }, { "epoch": 0.3451606943332483, "grad_norm": 0.4405878186225891, "learning_rate": 5.1391525634239364e-05, "loss": 2.0131, "step": 3678 }, { "epoch": 0.34572376235010516, "grad_norm": 0.3651449680328369, "learning_rate": 5.134965880396273e-05, "loss": 1.9685, "step": 3684 }, { "epoch": 0.346286830366962, "grad_norm": 0.3731696903705597, "learning_rate": 5.1307894129461214e-05, "loss": 2.0114, "step": 3690 }, { "epoch": 0.34684989838381886, "grad_norm": 0.43238288164138794, "learning_rate": 5.126623119597306e-05, "loss": 2.0264, "step": 3696 }, { "epoch": 0.3474129664006757, "grad_norm": 0.3562651574611664, "learning_rate": 5.1224669591090287e-05, "loss": 2.0289, "step": 3702 }, { "epoch": 0.34797603441753255, "grad_norm": 0.4091859757900238, "learning_rate": 5.118320890474148e-05, "loss": 2.0538, "step": 3708 }, { "epoch": 0.3485391024343894, "grad_norm": 0.4634217321872711, "learning_rate": 5.114184872917485e-05, "loss": 1.9525, "step": 3714 }, { "epoch": 0.34910217045124625, "grad_norm": 0.48576030135154724, "learning_rate": 5.110058865894133e-05, "loss": 2.0112, "step": 3720 }, { "epoch": 0.3496652384681031, "grad_norm": 0.38545283675193787, "learning_rate": 5.105942829087792e-05, "loss": 2.015, "step": 3726 }, { "epoch": 0.35022830648495995, "grad_norm": 0.40605759620666504, "learning_rate": 5.101836722409116e-05, "loss": 1.9975, "step": 3732 }, { "epoch": 0.3507913745018168, "grad_norm": 0.3971734046936035, "learning_rate": 5.097740505994068e-05, "loss": 1.9739, "step": 3738 }, { "epoch": 0.35135444251867365, "grad_norm": 0.4183933436870575, "learning_rate": 5.0936541402022973e-05, "loss": 2.0291, "step": 3744 }, { "epoch": 0.3519175105355305, "grad_norm": 0.3829009532928467, "learning_rate": 5.08957758561553e-05, "loss": 2.0456, "step": 3750 }, { "epoch": 0.35248057855238735, "grad_norm": 0.40464216470718384, "learning_rate": 5.085510803035967e-05, "loss": 2.0033, "step": 3756 }, { "epoch": 0.3530436465692442, "grad_norm": 0.4127351939678192, "learning_rate": 5.081453753484708e-05, "loss": 1.9991, "step": 3762 }, { "epoch": 0.35360671458610105, "grad_norm": 0.4243849217891693, "learning_rate": 5.077406398200175e-05, "loss": 2.0463, "step": 3768 }, { "epoch": 0.3541697826029579, "grad_norm": 0.4474215805530548, "learning_rate": 5.073368698636562e-05, "loss": 2.0189, "step": 3774 }, { "epoch": 0.3547328506198147, "grad_norm": 0.44173893332481384, "learning_rate": 5.0693406164622915e-05, "loss": 2.0221, "step": 3780 }, { "epoch": 0.35529591863667154, "grad_norm": 0.40083175897598267, "learning_rate": 5.065322113558485e-05, "loss": 2.0486, "step": 3786 }, { "epoch": 0.3558589866535284, "grad_norm": 0.37787899374961853, "learning_rate": 5.0613131520174496e-05, "loss": 1.9856, "step": 3792 }, { "epoch": 0.35642205467038524, "grad_norm": 0.3796723484992981, "learning_rate": 5.057313694141176e-05, "loss": 1.9868, "step": 3798 }, { "epoch": 0.3569851226872421, "grad_norm": 0.39724838733673096, "learning_rate": 5.0533237024398486e-05, "loss": 2.026, "step": 3804 }, { "epoch": 0.35754819070409893, "grad_norm": 0.3485679626464844, "learning_rate": 5.049343139630371e-05, "loss": 1.9794, "step": 3810 }, { "epoch": 0.3581112587209558, "grad_norm": 0.40003934502601624, "learning_rate": 5.045371968634899e-05, "loss": 2.0121, "step": 3816 }, { "epoch": 0.35867432673781263, "grad_norm": 0.35561761260032654, "learning_rate": 5.041410152579396e-05, "loss": 2.0258, "step": 3822 }, { "epoch": 0.3592373947546695, "grad_norm": 0.3979058265686035, "learning_rate": 5.037457654792192e-05, "loss": 2.0205, "step": 3828 }, { "epoch": 0.35980046277152633, "grad_norm": 0.4447910785675049, "learning_rate": 5.0335144388025555e-05, "loss": 1.9857, "step": 3834 }, { "epoch": 0.3603635307883832, "grad_norm": 0.39398401975631714, "learning_rate": 5.029580468339283e-05, "loss": 1.9912, "step": 3840 }, { "epoch": 0.36092659880524003, "grad_norm": 0.40648922324180603, "learning_rate": 5.025655707329298e-05, "loss": 1.9833, "step": 3846 }, { "epoch": 0.3614896668220969, "grad_norm": 0.4858560562133789, "learning_rate": 5.02174011989626e-05, "loss": 2.0078, "step": 3852 }, { "epoch": 0.3620527348389537, "grad_norm": 0.4390614926815033, "learning_rate": 5.0178336703591874e-05, "loss": 2.0338, "step": 3858 }, { "epoch": 0.3626158028558106, "grad_norm": 0.34914150834083557, "learning_rate": 5.0139363232310944e-05, "loss": 2.0491, "step": 3864 }, { "epoch": 0.3631788708726674, "grad_norm": 0.40928298234939575, "learning_rate": 5.010048043217634e-05, "loss": 1.9826, "step": 3870 }, { "epoch": 0.3637419388895243, "grad_norm": 0.3748873174190521, "learning_rate": 5.006168795215754e-05, "loss": 2.0241, "step": 3876 }, { "epoch": 0.3643050069063811, "grad_norm": 0.4088262617588043, "learning_rate": 5.002298544312372e-05, "loss": 2.0365, "step": 3882 }, { "epoch": 0.364868074923238, "grad_norm": 0.4868689477443695, "learning_rate": 4.9984372557830514e-05, "loss": 1.9677, "step": 3888 }, { "epoch": 0.3654311429400948, "grad_norm": 0.4113070070743561, "learning_rate": 4.994584895090693e-05, "loss": 2.0327, "step": 3894 }, { "epoch": 0.36599421095695167, "grad_norm": 0.38294121623039246, "learning_rate": 4.990741427884237e-05, "loss": 1.9781, "step": 3900 }, { "epoch": 0.3665572789738085, "grad_norm": 0.3510473370552063, "learning_rate": 4.9869068199973826e-05, "loss": 2.0155, "step": 3906 }, { "epoch": 0.36712034699066537, "grad_norm": 0.3893468976020813, "learning_rate": 4.9830810374473006e-05, "loss": 2.0131, "step": 3912 }, { "epoch": 0.3676834150075222, "grad_norm": 0.3671349287033081, "learning_rate": 4.9792640464333823e-05, "loss": 1.9962, "step": 3918 }, { "epoch": 0.36824648302437907, "grad_norm": 0.39604249596595764, "learning_rate": 4.975455813335972e-05, "loss": 1.9888, "step": 3924 }, { "epoch": 0.3688095510412359, "grad_norm": 0.40294209122657776, "learning_rate": 4.971656304715134e-05, "loss": 2.01, "step": 3930 }, { "epoch": 0.36937261905809277, "grad_norm": 0.3786156177520752, "learning_rate": 4.967865487309414e-05, "loss": 2.0356, "step": 3936 }, { "epoch": 0.3699356870749496, "grad_norm": 0.3874383568763733, "learning_rate": 4.9640833280346165e-05, "loss": 1.9859, "step": 3942 }, { "epoch": 0.37049875509180646, "grad_norm": 0.49237367510795593, "learning_rate": 4.960309793982594e-05, "loss": 2.0653, "step": 3948 }, { "epoch": 0.3710618231086633, "grad_norm": 0.516211211681366, "learning_rate": 4.95654485242004e-05, "loss": 1.9784, "step": 3954 }, { "epoch": 0.37162489112552016, "grad_norm": 0.458514541387558, "learning_rate": 4.952788470787304e-05, "loss": 2.0039, "step": 3960 }, { "epoch": 0.372187959142377, "grad_norm": 0.4534285068511963, "learning_rate": 4.9490406166972006e-05, "loss": 2.0362, "step": 3966 }, { "epoch": 0.37275102715923386, "grad_norm": 0.4010961055755615, "learning_rate": 4.945301257933843e-05, "loss": 2.0181, "step": 3972 }, { "epoch": 0.3733140951760907, "grad_norm": 0.35976648330688477, "learning_rate": 4.941570362451477e-05, "loss": 1.9662, "step": 3978 }, { "epoch": 0.37387716319294756, "grad_norm": 0.41424092650413513, "learning_rate": 4.937847898373327e-05, "loss": 2.0182, "step": 3984 }, { "epoch": 0.3744402312098044, "grad_norm": 0.4216844439506531, "learning_rate": 4.934133833990452e-05, "loss": 2.0284, "step": 3990 }, { "epoch": 0.37500329922666126, "grad_norm": 0.43283116817474365, "learning_rate": 4.9304281377606166e-05, "loss": 1.9582, "step": 3996 }, { "epoch": 0.37537867790456586, "eval_accuracy": 0.5764497850722938, "eval_loss": 2.0474069118499756, "eval_runtime": 88.2833, "eval_samples_per_second": 4.531, "eval_steps_per_second": 1.133, "step": 4000 }, { "epoch": 0.3755663672435181, "grad_norm": 0.4382220208644867, "learning_rate": 4.9267307783071566e-05, "loss": 2.0259, "step": 4002 }, { "epoch": 0.37612943526037496, "grad_norm": 0.4296628534793854, "learning_rate": 4.923041724417871e-05, "loss": 2.0151, "step": 4008 }, { "epoch": 0.3766925032772318, "grad_norm": 0.4568118453025818, "learning_rate": 4.919360945043911e-05, "loss": 1.9888, "step": 4014 }, { "epoch": 0.37725557129408865, "grad_norm": 0.4683358371257782, "learning_rate": 4.915688409298685e-05, "loss": 2.0306, "step": 4020 }, { "epoch": 0.3778186393109455, "grad_norm": 0.5152496099472046, "learning_rate": 4.9120240864567675e-05, "loss": 2.0577, "step": 4026 }, { "epoch": 0.37838170732780235, "grad_norm": 0.4130321443080902, "learning_rate": 4.908367945952824e-05, "loss": 1.9695, "step": 4032 }, { "epoch": 0.3789447753446592, "grad_norm": 0.45234936475753784, "learning_rate": 4.904719957380533e-05, "loss": 2.075, "step": 4038 }, { "epoch": 0.37950784336151605, "grad_norm": 0.4091279208660126, "learning_rate": 4.9010800904915305e-05, "loss": 1.9953, "step": 4044 }, { "epoch": 0.3800709113783729, "grad_norm": 0.4063579738140106, "learning_rate": 4.8974483151943525e-05, "loss": 2.0496, "step": 4050 }, { "epoch": 0.38063397939522975, "grad_norm": 0.3787045180797577, "learning_rate": 4.893824601553394e-05, "loss": 1.9658, "step": 4056 }, { "epoch": 0.3811970474120866, "grad_norm": 0.4505304992198944, "learning_rate": 4.890208919787868e-05, "loss": 2.0261, "step": 4062 }, { "epoch": 0.38176011542894345, "grad_norm": 0.3929736018180847, "learning_rate": 4.8866012402707804e-05, "loss": 2.043, "step": 4068 }, { "epoch": 0.3823231834458003, "grad_norm": 0.43084853887557983, "learning_rate": 4.883001533527911e-05, "loss": 1.9827, "step": 4074 }, { "epoch": 0.38288625146265715, "grad_norm": 0.4066733717918396, "learning_rate": 4.879409770236801e-05, "loss": 1.9889, "step": 4080 }, { "epoch": 0.383449319479514, "grad_norm": 0.41796305775642395, "learning_rate": 4.8758259212257476e-05, "loss": 1.9582, "step": 4086 }, { "epoch": 0.38401238749637084, "grad_norm": 0.37358757853507996, "learning_rate": 4.872249957472815e-05, "loss": 2.0255, "step": 4092 }, { "epoch": 0.3845754555132277, "grad_norm": 0.4157041907310486, "learning_rate": 4.868681850104837e-05, "loss": 1.9793, "step": 4098 }, { "epoch": 0.38513852353008454, "grad_norm": 0.4109339714050293, "learning_rate": 4.8651215703964516e-05, "loss": 1.9642, "step": 4104 }, { "epoch": 0.3857015915469414, "grad_norm": 0.3696960508823395, "learning_rate": 4.8615690897691185e-05, "loss": 1.9797, "step": 4110 }, { "epoch": 0.38626465956379824, "grad_norm": 0.3811885416507721, "learning_rate": 4.85802437979016e-05, "loss": 2.0403, "step": 4116 }, { "epoch": 0.3868277275806551, "grad_norm": 0.36819127202033997, "learning_rate": 4.854487412171804e-05, "loss": 1.9569, "step": 4122 }, { "epoch": 0.38739079559751194, "grad_norm": 0.38054630160331726, "learning_rate": 4.850958158770238e-05, "loss": 1.9631, "step": 4128 }, { "epoch": 0.3879538636143688, "grad_norm": 0.4304635226726532, "learning_rate": 4.847436591584666e-05, "loss": 2.0104, "step": 4134 }, { "epoch": 0.38851693163122564, "grad_norm": 0.42490389943122864, "learning_rate": 4.843922682756378e-05, "loss": 1.9927, "step": 4140 }, { "epoch": 0.3890799996480825, "grad_norm": 0.38235440850257874, "learning_rate": 4.84041640456782e-05, "loss": 1.9741, "step": 4146 }, { "epoch": 0.38964306766493934, "grad_norm": 0.393965482711792, "learning_rate": 4.836917729441683e-05, "loss": 1.9587, "step": 4152 }, { "epoch": 0.3902061356817962, "grad_norm": 0.4099915623664856, "learning_rate": 4.833426629939983e-05, "loss": 2.0443, "step": 4158 }, { "epoch": 0.39076920369865303, "grad_norm": 0.47078028321266174, "learning_rate": 4.829943078763167e-05, "loss": 2.0187, "step": 4164 }, { "epoch": 0.3913322717155099, "grad_norm": 0.4085240662097931, "learning_rate": 4.826467048749206e-05, "loss": 2.0193, "step": 4170 }, { "epoch": 0.39189533973236673, "grad_norm": 0.4600004255771637, "learning_rate": 4.8229985128727144e-05, "loss": 1.991, "step": 4176 }, { "epoch": 0.3924584077492236, "grad_norm": 0.3831058740615845, "learning_rate": 4.8195374442440624e-05, "loss": 2.027, "step": 4182 }, { "epoch": 0.39302147576608043, "grad_norm": 0.39269423484802246, "learning_rate": 4.816083816108502e-05, "loss": 2.0258, "step": 4188 }, { "epoch": 0.3935845437829373, "grad_norm": 0.376082181930542, "learning_rate": 4.8126376018453e-05, "loss": 2.018, "step": 4194 }, { "epoch": 0.39414761179979413, "grad_norm": 0.41746577620506287, "learning_rate": 4.8091987749668716e-05, "loss": 1.9914, "step": 4200 }, { "epoch": 0.394710679816651, "grad_norm": 0.3821529746055603, "learning_rate": 4.80576730911793e-05, "loss": 2.0761, "step": 4206 }, { "epoch": 0.3952737478335078, "grad_norm": 0.36599040031433105, "learning_rate": 4.802343178074636e-05, "loss": 1.9882, "step": 4212 }, { "epoch": 0.3958368158503647, "grad_norm": 0.46529850363731384, "learning_rate": 4.798926355743758e-05, "loss": 2.009, "step": 4218 }, { "epoch": 0.3963998838672215, "grad_norm": 0.37118858098983765, "learning_rate": 4.795516816161832e-05, "loss": 2.0014, "step": 4224 }, { "epoch": 0.3969629518840784, "grad_norm": 0.4417169392108917, "learning_rate": 4.7921145334943384e-05, "loss": 1.9857, "step": 4230 }, { "epoch": 0.3975260199009352, "grad_norm": 0.4320991635322571, "learning_rate": 4.788719482034879e-05, "loss": 2.0061, "step": 4236 }, { "epoch": 0.3980890879177921, "grad_norm": 0.39100182056427, "learning_rate": 4.7853316362043574e-05, "loss": 2.0054, "step": 4242 }, { "epoch": 0.3986521559346489, "grad_norm": 0.37149932980537415, "learning_rate": 4.7819509705501754e-05, "loss": 2.0364, "step": 4248 }, { "epoch": 0.39921522395150577, "grad_norm": 0.46323665976524353, "learning_rate": 4.778577459745424e-05, "loss": 2.025, "step": 4254 }, { "epoch": 0.3997782919683626, "grad_norm": 0.4209620952606201, "learning_rate": 4.775211078588089e-05, "loss": 1.9509, "step": 4260 }, { "epoch": 0.40034135998521947, "grad_norm": 0.4448121190071106, "learning_rate": 4.771851802000263e-05, "loss": 1.9863, "step": 4266 }, { "epoch": 0.4009044280020763, "grad_norm": 0.41466081142425537, "learning_rate": 4.768499605027358e-05, "loss": 2.0874, "step": 4272 }, { "epoch": 0.40146749601893317, "grad_norm": 0.4168870747089386, "learning_rate": 4.7651544628373225e-05, "loss": 2.0091, "step": 4278 }, { "epoch": 0.40203056403579, "grad_norm": 0.39112240076065063, "learning_rate": 4.76181635071988e-05, "loss": 1.9781, "step": 4284 }, { "epoch": 0.40259363205264687, "grad_norm": 0.43508586287498474, "learning_rate": 4.75848524408575e-05, "loss": 2.008, "step": 4290 }, { "epoch": 0.4031567000695037, "grad_norm": 0.37801817059516907, "learning_rate": 4.755161118465896e-05, "loss": 1.9984, "step": 4296 }, { "epoch": 0.40371976808636056, "grad_norm": 0.43554967641830444, "learning_rate": 4.7518439495107674e-05, "loss": 1.9949, "step": 4302 }, { "epoch": 0.4042828361032174, "grad_norm": 0.4210146963596344, "learning_rate": 4.748533712989552e-05, "loss": 2.0356, "step": 4308 }, { "epoch": 0.40484590412007426, "grad_norm": 0.42051735520362854, "learning_rate": 4.7452303847894245e-05, "loss": 2.0075, "step": 4314 }, { "epoch": 0.4054089721369311, "grad_norm": 0.42944321036338806, "learning_rate": 4.7419339409148256e-05, "loss": 1.9982, "step": 4320 }, { "epoch": 0.40597204015378796, "grad_norm": 0.38112640380859375, "learning_rate": 4.738644357486711e-05, "loss": 1.989, "step": 4326 }, { "epoch": 0.4065351081706448, "grad_norm": 0.40997588634490967, "learning_rate": 4.7353616107418384e-05, "loss": 2.0755, "step": 4332 }, { "epoch": 0.40709817618750166, "grad_norm": 0.38680556416511536, "learning_rate": 4.73208567703204e-05, "loss": 1.9681, "step": 4338 }, { "epoch": 0.4076612442043585, "grad_norm": 0.3942596912384033, "learning_rate": 4.728816532823511e-05, "loss": 2.0298, "step": 4344 }, { "epoch": 0.40822431222121536, "grad_norm": 0.36778849363327026, "learning_rate": 4.725554154696094e-05, "loss": 2.0226, "step": 4350 }, { "epoch": 0.4087873802380722, "grad_norm": 0.3804781436920166, "learning_rate": 4.72229851934258e-05, "loss": 2.0029, "step": 4356 }, { "epoch": 0.40935044825492906, "grad_norm": 0.4098118543624878, "learning_rate": 4.719049603568006e-05, "loss": 2.0034, "step": 4362 }, { "epoch": 0.4099135162717859, "grad_norm": 0.39653199911117554, "learning_rate": 4.7158073842889605e-05, "loss": 1.9716, "step": 4368 }, { "epoch": 0.41047658428864275, "grad_norm": 0.38280466198921204, "learning_rate": 4.7125718385328973e-05, "loss": 1.9738, "step": 4374 }, { "epoch": 0.4110396523054996, "grad_norm": 0.38685503602027893, "learning_rate": 4.7093429434374506e-05, "loss": 2.0109, "step": 4380 }, { "epoch": 0.41160272032235645, "grad_norm": 0.37603771686553955, "learning_rate": 4.706120676249755e-05, "loss": 1.9575, "step": 4386 }, { "epoch": 0.4121657883392133, "grad_norm": 0.41713500022888184, "learning_rate": 4.702905014325772e-05, "loss": 1.9747, "step": 4392 }, { "epoch": 0.41272885635607015, "grad_norm": 0.3733316957950592, "learning_rate": 4.6996959351296275e-05, "loss": 1.9826, "step": 4398 }, { "epoch": 0.4129165456950224, "eval_accuracy": 0.5780920281359906, "eval_loss": 2.0386855602264404, "eval_runtime": 88.1863, "eval_samples_per_second": 4.536, "eval_steps_per_second": 1.134, "step": 4400 }, { "epoch": 0.413291924372927, "grad_norm": 0.3832037150859833, "learning_rate": 4.696493416232939e-05, "loss": 1.994, "step": 4404 }, { "epoch": 0.41385499238978385, "grad_norm": 0.3938111662864685, "learning_rate": 4.693297435314167e-05, "loss": 2.0431, "step": 4410 }, { "epoch": 0.4144180604066407, "grad_norm": 0.4139000475406647, "learning_rate": 4.690107970157951e-05, "loss": 2.0532, "step": 4416 }, { "epoch": 0.41498112842349755, "grad_norm": 0.41459357738494873, "learning_rate": 4.6869249986544704e-05, "loss": 1.9905, "step": 4422 }, { "epoch": 0.4155441964403544, "grad_norm": 0.4231138825416565, "learning_rate": 4.683748498798799e-05, "loss": 1.9892, "step": 4428 }, { "epoch": 0.41610726445721125, "grad_norm": 0.44362714886665344, "learning_rate": 4.6805784486902586e-05, "loss": 2.0089, "step": 4434 }, { "epoch": 0.4166703324740681, "grad_norm": 0.40625667572021484, "learning_rate": 4.677414826531795e-05, "loss": 2.0155, "step": 4440 }, { "epoch": 0.41723340049092494, "grad_norm": 0.3711986839771271, "learning_rate": 4.674257610629341e-05, "loss": 2.0009, "step": 4446 }, { "epoch": 0.4177964685077818, "grad_norm": 0.3925608694553375, "learning_rate": 4.671106779391197e-05, "loss": 2.0385, "step": 4452 }, { "epoch": 0.41835953652463864, "grad_norm": 0.39015141129493713, "learning_rate": 4.6679623113274044e-05, "loss": 2.0483, "step": 4458 }, { "epoch": 0.4189226045414955, "grad_norm": 0.34900400042533875, "learning_rate": 4.664824185049138e-05, "loss": 2.0053, "step": 4464 }, { "epoch": 0.41948567255835234, "grad_norm": 0.4000098407268524, "learning_rate": 4.661692379268089e-05, "loss": 1.9928, "step": 4470 }, { "epoch": 0.4200487405752092, "grad_norm": 0.3886759281158447, "learning_rate": 4.6585668727958596e-05, "loss": 2.028, "step": 4476 }, { "epoch": 0.42061180859206604, "grad_norm": 0.5590499639511108, "learning_rate": 4.6554476445433625e-05, "loss": 2.036, "step": 4482 }, { "epoch": 0.4211748766089229, "grad_norm": 0.4035160541534424, "learning_rate": 4.652334673520221e-05, "loss": 2.0097, "step": 4488 }, { "epoch": 0.42173794462577974, "grad_norm": 0.4141244888305664, "learning_rate": 4.64922793883418e-05, "loss": 2.031, "step": 4494 }, { "epoch": 0.4223010126426366, "grad_norm": 0.3954647183418274, "learning_rate": 4.6461274196905105e-05, "loss": 2.0029, "step": 4500 }, { "epoch": 0.42286408065949344, "grad_norm": 0.38948148488998413, "learning_rate": 4.643033095391431e-05, "loss": 1.9743, "step": 4506 }, { "epoch": 0.4234271486763503, "grad_norm": 0.483926922082901, "learning_rate": 4.639944945335524e-05, "loss": 2.0529, "step": 4512 }, { "epoch": 0.42399021669320713, "grad_norm": 0.3625122010707855, "learning_rate": 4.636862949017162e-05, "loss": 1.9732, "step": 4518 }, { "epoch": 0.424553284710064, "grad_norm": 0.4112747311592102, "learning_rate": 4.633787086025931e-05, "loss": 1.9889, "step": 4524 }, { "epoch": 0.42511635272692083, "grad_norm": 0.37921956181526184, "learning_rate": 4.630717336046071e-05, "loss": 1.9923, "step": 4530 }, { "epoch": 0.4256794207437777, "grad_norm": 0.3868277072906494, "learning_rate": 4.627653678855903e-05, "loss": 1.9997, "step": 4536 }, { "epoch": 0.42624248876063453, "grad_norm": 0.40097272396087646, "learning_rate": 4.624596094327277e-05, "loss": 2.0283, "step": 4542 }, { "epoch": 0.4268055567774914, "grad_norm": 0.3810982406139374, "learning_rate": 4.6215445624250144e-05, "loss": 2.0183, "step": 4548 }, { "epoch": 0.42736862479434823, "grad_norm": 0.4347774088382721, "learning_rate": 4.618499063206351e-05, "loss": 2.0331, "step": 4554 }, { "epoch": 0.4279316928112051, "grad_norm": 0.4025699198246002, "learning_rate": 4.615459576820401e-05, "loss": 2.0223, "step": 4560 }, { "epoch": 0.4284947608280619, "grad_norm": 0.379475861787796, "learning_rate": 4.612426083507603e-05, "loss": 2.0116, "step": 4566 }, { "epoch": 0.4290578288449188, "grad_norm": 0.3645590543746948, "learning_rate": 4.609398563599186e-05, "loss": 2.0268, "step": 4572 }, { "epoch": 0.4296208968617756, "grad_norm": 0.34019747376441956, "learning_rate": 4.606376997516631e-05, "loss": 2.0189, "step": 4578 }, { "epoch": 0.4301839648786325, "grad_norm": 0.3727477192878723, "learning_rate": 4.6033613657711415e-05, "loss": 2.0145, "step": 4584 }, { "epoch": 0.4307470328954893, "grad_norm": 0.3858342170715332, "learning_rate": 4.600351648963115e-05, "loss": 1.9879, "step": 4590 }, { "epoch": 0.4313101009123462, "grad_norm": 0.3761114776134491, "learning_rate": 4.597347827781617e-05, "loss": 1.9813, "step": 4596 }, { "epoch": 0.431873168929203, "grad_norm": 0.40534552931785583, "learning_rate": 4.5943498830038654e-05, "loss": 2.0073, "step": 4602 }, { "epoch": 0.43243623694605987, "grad_norm": 0.3589653968811035, "learning_rate": 4.5913577954947056e-05, "loss": 2.0054, "step": 4608 }, { "epoch": 0.43299930496291666, "grad_norm": 0.37278643250465393, "learning_rate": 4.5883715462061064e-05, "loss": 1.9911, "step": 4614 }, { "epoch": 0.4335623729797735, "grad_norm": 0.3925362825393677, "learning_rate": 4.5853911161766435e-05, "loss": 1.9848, "step": 4620 }, { "epoch": 0.43412544099663036, "grad_norm": 0.4189651310443878, "learning_rate": 4.582416486531002e-05, "loss": 2.0345, "step": 4626 }, { "epoch": 0.4346885090134872, "grad_norm": 0.37379634380340576, "learning_rate": 4.579447638479463e-05, "loss": 2.0225, "step": 4632 }, { "epoch": 0.43525157703034406, "grad_norm": 0.39949163794517517, "learning_rate": 4.576484553317417e-05, "loss": 1.9517, "step": 4638 }, { "epoch": 0.4358146450472009, "grad_norm": 0.42105695605278015, "learning_rate": 4.573527212424859e-05, "loss": 2.0237, "step": 4644 }, { "epoch": 0.43637771306405776, "grad_norm": 0.41173410415649414, "learning_rate": 4.570575597265904e-05, "loss": 2.0322, "step": 4650 }, { "epoch": 0.4369407810809146, "grad_norm": 0.37722301483154297, "learning_rate": 4.567629689388299e-05, "loss": 2.006, "step": 4656 }, { "epoch": 0.43750384909777146, "grad_norm": 0.39493557810783386, "learning_rate": 4.564689470422929e-05, "loss": 2.0129, "step": 4662 }, { "epoch": 0.4380669171146283, "grad_norm": 0.4211169183254242, "learning_rate": 4.5617549220833506e-05, "loss": 2.006, "step": 4668 }, { "epoch": 0.43862998513148516, "grad_norm": 0.39213791489601135, "learning_rate": 4.5588260261653006e-05, "loss": 1.974, "step": 4674 }, { "epoch": 0.439193053148342, "grad_norm": 0.40885406732559204, "learning_rate": 4.5559027645462324e-05, "loss": 1.9886, "step": 4680 }, { "epoch": 0.43975612116519885, "grad_norm": 0.41522645950317383, "learning_rate": 4.5529851191848374e-05, "loss": 2.0222, "step": 4686 }, { "epoch": 0.4403191891820557, "grad_norm": 0.44755819439888, "learning_rate": 4.550073072120582e-05, "loss": 1.9889, "step": 4692 }, { "epoch": 0.44088225719891255, "grad_norm": 0.5971991419792175, "learning_rate": 4.5471666054732403e-05, "loss": 1.95, "step": 4698 }, { "epoch": 0.4414453252157694, "grad_norm": 0.431736022233963, "learning_rate": 4.5442657014424375e-05, "loss": 2.0113, "step": 4704 }, { "epoch": 0.44200839323262625, "grad_norm": 0.3869287371635437, "learning_rate": 4.541370342307183e-05, "loss": 1.9741, "step": 4710 }, { "epoch": 0.4425714612494831, "grad_norm": 0.40412795543670654, "learning_rate": 4.538480510425427e-05, "loss": 1.9473, "step": 4716 }, { "epoch": 0.44313452926633995, "grad_norm": 0.3960817754268646, "learning_rate": 4.535596188233603e-05, "loss": 1.9749, "step": 4722 }, { "epoch": 0.4436975972831968, "grad_norm": 0.40139082074165344, "learning_rate": 4.532717358246177e-05, "loss": 1.9571, "step": 4728 }, { "epoch": 0.44426066530005365, "grad_norm": 0.4051312804222107, "learning_rate": 4.529844003055206e-05, "loss": 2.0426, "step": 4734 }, { "epoch": 0.4448237333169105, "grad_norm": 0.354390412569046, "learning_rate": 4.526976105329898e-05, "loss": 2.0233, "step": 4740 }, { "epoch": 0.44538680133376735, "grad_norm": 0.39987653493881226, "learning_rate": 4.524113647816168e-05, "loss": 1.9685, "step": 4746 }, { "epoch": 0.4459498693506242, "grad_norm": 0.4512884318828583, "learning_rate": 4.5212566133362046e-05, "loss": 2.0169, "step": 4752 }, { "epoch": 0.44651293736748104, "grad_norm": 0.43869003653526306, "learning_rate": 4.518404984788037e-05, "loss": 1.9892, "step": 4758 }, { "epoch": 0.4470760053843379, "grad_norm": 0.3767068684101105, "learning_rate": 4.5155587451451034e-05, "loss": 1.9931, "step": 4764 }, { "epoch": 0.44763907340119474, "grad_norm": 0.3433118760585785, "learning_rate": 4.512717877455828e-05, "loss": 1.9523, "step": 4770 }, { "epoch": 0.4482021414180516, "grad_norm": 0.3721299469470978, "learning_rate": 4.509882364843191e-05, "loss": 1.9919, "step": 4776 }, { "epoch": 0.44876520943490844, "grad_norm": 0.38583362102508545, "learning_rate": 4.5070521905043136e-05, "loss": 2.0561, "step": 4782 }, { "epoch": 0.4493282774517653, "grad_norm": 0.36249709129333496, "learning_rate": 4.5042273377100356e-05, "loss": 1.9842, "step": 4788 }, { "epoch": 0.44989134546862214, "grad_norm": 0.39112961292266846, "learning_rate": 4.501407789804501e-05, "loss": 1.9677, "step": 4794 }, { "epoch": 0.450454413485479, "grad_norm": 0.3742753565311432, "learning_rate": 4.498593530204746e-05, "loss": 1.992, "step": 4800 }, { "epoch": 0.450454413485479, "eval_accuracy": 0.5789419695193435, "eval_loss": 2.034294843673706, "eval_runtime": 88.1828, "eval_samples_per_second": 4.536, "eval_steps_per_second": 1.134, "step": 4800 }, { "epoch": 0.45101748150233584, "grad_norm": 0.38137221336364746, "learning_rate": 4.495784542400291e-05, "loss": 1.9651, "step": 4806 }, { "epoch": 0.4515805495191927, "grad_norm": 0.37463250756263733, "learning_rate": 4.4929808099527306e-05, "loss": 2.0504, "step": 4812 }, { "epoch": 0.45214361753604954, "grad_norm": 0.39512187242507935, "learning_rate": 4.490182316495331e-05, "loss": 1.9737, "step": 4818 }, { "epoch": 0.4527066855529064, "grad_norm": 0.39524558186531067, "learning_rate": 4.487389045732629e-05, "loss": 2.0189, "step": 4824 }, { "epoch": 0.45326975356976323, "grad_norm": 0.5054473876953125, "learning_rate": 4.484600981440034e-05, "loss": 1.9489, "step": 4830 }, { "epoch": 0.4538328215866201, "grad_norm": 0.4812467694282532, "learning_rate": 4.481818107463432e-05, "loss": 2.008, "step": 4836 }, { "epoch": 0.45439588960347693, "grad_norm": 0.36081430315971375, "learning_rate": 4.479040407718789e-05, "loss": 1.9865, "step": 4842 }, { "epoch": 0.4549589576203338, "grad_norm": 0.3785586655139923, "learning_rate": 4.476267866191766e-05, "loss": 1.9991, "step": 4848 }, { "epoch": 0.45552202563719063, "grad_norm": 0.37796053290367126, "learning_rate": 4.473500466937329e-05, "loss": 1.9765, "step": 4854 }, { "epoch": 0.4560850936540475, "grad_norm": 0.40865832567214966, "learning_rate": 4.4707381940793626e-05, "loss": 1.9833, "step": 4860 }, { "epoch": 0.45664816167090433, "grad_norm": 0.4423794150352478, "learning_rate": 4.467981031810288e-05, "loss": 2.0219, "step": 4866 }, { "epoch": 0.4572112296877612, "grad_norm": 0.37965404987335205, "learning_rate": 4.465228964390684e-05, "loss": 2.016, "step": 4872 }, { "epoch": 0.457774297704618, "grad_norm": 0.3970780372619629, "learning_rate": 4.4624819761489096e-05, "loss": 2.0022, "step": 4878 }, { "epoch": 0.4583373657214749, "grad_norm": 0.3551555573940277, "learning_rate": 4.459740051480728e-05, "loss": 2.018, "step": 4884 }, { "epoch": 0.4589004337383317, "grad_norm": 0.4176592230796814, "learning_rate": 4.4570031748489365e-05, "loss": 2.0084, "step": 4890 }, { "epoch": 0.4594635017551886, "grad_norm": 0.36661621928215027, "learning_rate": 4.454271330782993e-05, "loss": 1.9953, "step": 4896 }, { "epoch": 0.4600265697720454, "grad_norm": 0.39188647270202637, "learning_rate": 4.451544503878653e-05, "loss": 1.965, "step": 4902 }, { "epoch": 0.4605896377889023, "grad_norm": 0.3898799419403076, "learning_rate": 4.448822678797601e-05, "loss": 2.0041, "step": 4908 }, { "epoch": 0.4611527058057591, "grad_norm": 0.3701639175415039, "learning_rate": 4.4461058402670924e-05, "loss": 2.0506, "step": 4914 }, { "epoch": 0.46171577382261597, "grad_norm": 0.36612963676452637, "learning_rate": 4.443393973079588e-05, "loss": 1.9376, "step": 4920 }, { "epoch": 0.4622788418394728, "grad_norm": 0.38877561688423157, "learning_rate": 4.4406870620924e-05, "loss": 2.0064, "step": 4926 }, { "epoch": 0.46284190985632967, "grad_norm": 0.3941739499568939, "learning_rate": 4.437985092227341e-05, "loss": 1.9803, "step": 4932 }, { "epoch": 0.4634049778731865, "grad_norm": 0.42990630865097046, "learning_rate": 4.435288048470359e-05, "loss": 1.9902, "step": 4938 }, { "epoch": 0.46396804589004337, "grad_norm": 0.3804304003715515, "learning_rate": 4.4325959158712016e-05, "loss": 1.9673, "step": 4944 }, { "epoch": 0.4645311139069002, "grad_norm": 0.37610408663749695, "learning_rate": 4.429908679543057e-05, "loss": 1.934, "step": 4950 }, { "epoch": 0.46509418192375707, "grad_norm": 0.4213414490222931, "learning_rate": 4.427226324662212e-05, "loss": 1.99, "step": 4956 }, { "epoch": 0.4656572499406139, "grad_norm": 0.3729839622974396, "learning_rate": 4.4245488364677115e-05, "loss": 2.0783, "step": 4962 }, { "epoch": 0.46622031795747076, "grad_norm": 0.36352238059043884, "learning_rate": 4.421876200261014e-05, "loss": 2.0745, "step": 4968 }, { "epoch": 0.4667833859743276, "grad_norm": 0.41008704900741577, "learning_rate": 4.419208401405651e-05, "loss": 2.0273, "step": 4974 }, { "epoch": 0.46734645399118446, "grad_norm": 0.46867361664772034, "learning_rate": 4.416545425326893e-05, "loss": 1.9564, "step": 4980 }, { "epoch": 0.4679095220080413, "grad_norm": 0.42757487297058105, "learning_rate": 4.413887257511417e-05, "loss": 1.9665, "step": 4986 }, { "epoch": 0.46847259002489816, "grad_norm": 0.4368777871131897, "learning_rate": 4.411233883506972e-05, "loss": 1.9814, "step": 4992 }, { "epoch": 0.469035658041755, "grad_norm": 0.3792327642440796, "learning_rate": 4.408585288922046e-05, "loss": 1.9478, "step": 4998 }, { "epoch": 0.46959872605861186, "grad_norm": 0.38210752606391907, "learning_rate": 4.4059414594255445e-05, "loss": 1.9757, "step": 5004 }, { "epoch": 0.4701617940754687, "grad_norm": 0.42559003829956055, "learning_rate": 4.403302380746458e-05, "loss": 1.9743, "step": 5010 }, { "epoch": 0.47072486209232556, "grad_norm": 0.3685985803604126, "learning_rate": 4.4006680386735456e-05, "loss": 2.0325, "step": 5016 }, { "epoch": 0.4712879301091824, "grad_norm": 0.40414172410964966, "learning_rate": 4.3980384190550076e-05, "loss": 1.9765, "step": 5022 }, { "epoch": 0.47185099812603926, "grad_norm": 0.42191949486732483, "learning_rate": 4.395413507798169e-05, "loss": 2.0242, "step": 5028 }, { "epoch": 0.4724140661428961, "grad_norm": 0.3976402282714844, "learning_rate": 4.3927932908691624e-05, "loss": 1.9984, "step": 5034 }, { "epoch": 0.47297713415975295, "grad_norm": 0.35657799243927, "learning_rate": 4.3901777542926114e-05, "loss": 2.0282, "step": 5040 }, { "epoch": 0.4735402021766098, "grad_norm": 0.380347341299057, "learning_rate": 4.387566884151318e-05, "loss": 1.9466, "step": 5046 }, { "epoch": 0.47410327019346665, "grad_norm": 0.35433387756347656, "learning_rate": 4.384960666585954e-05, "loss": 1.9652, "step": 5052 }, { "epoch": 0.4746663382103235, "grad_norm": 0.42322662472724915, "learning_rate": 4.382359087794749e-05, "loss": 1.9941, "step": 5058 }, { "epoch": 0.47522940622718035, "grad_norm": 0.39936837553977966, "learning_rate": 4.379762134033183e-05, "loss": 1.9999, "step": 5064 }, { "epoch": 0.4757924742440372, "grad_norm": 0.4206814765930176, "learning_rate": 4.377169791613685e-05, "loss": 1.9692, "step": 5070 }, { "epoch": 0.47635554226089405, "grad_norm": 0.4499512314796448, "learning_rate": 4.374582046905325e-05, "loss": 2.0242, "step": 5076 }, { "epoch": 0.4769186102777509, "grad_norm": 0.3798932135105133, "learning_rate": 4.3719988863335196e-05, "loss": 1.9667, "step": 5082 }, { "epoch": 0.47748167829460775, "grad_norm": 0.46176213026046753, "learning_rate": 4.3694202963797264e-05, "loss": 1.9712, "step": 5088 }, { "epoch": 0.4780447463114646, "grad_norm": 0.39314746856689453, "learning_rate": 4.36684626358115e-05, "loss": 2.0008, "step": 5094 }, { "epoch": 0.47860781432832145, "grad_norm": 0.3828774094581604, "learning_rate": 4.3642767745304464e-05, "loss": 1.9985, "step": 5100 }, { "epoch": 0.4791708823451783, "grad_norm": 0.36102715134620667, "learning_rate": 4.361711815875432e-05, "loss": 1.983, "step": 5106 }, { "epoch": 0.47973395036203514, "grad_norm": 0.3579404354095459, "learning_rate": 4.3591513743187846e-05, "loss": 2.0368, "step": 5112 }, { "epoch": 0.480297018378892, "grad_norm": 0.39704570174217224, "learning_rate": 4.3565954366177646e-05, "loss": 1.9834, "step": 5118 }, { "epoch": 0.48086008639574884, "grad_norm": 0.39848726987838745, "learning_rate": 4.354043989583919e-05, "loss": 1.9676, "step": 5124 }, { "epoch": 0.4814231544126057, "grad_norm": 0.42678242921829224, "learning_rate": 4.3514970200827985e-05, "loss": 1.965, "step": 5130 }, { "epoch": 0.48198622242946254, "grad_norm": 0.4154205322265625, "learning_rate": 4.348954515033674e-05, "loss": 2.0133, "step": 5136 }, { "epoch": 0.4825492904463194, "grad_norm": 0.3604782521724701, "learning_rate": 4.346416461409254e-05, "loss": 1.9554, "step": 5142 }, { "epoch": 0.48311235846317624, "grad_norm": 0.38928836584091187, "learning_rate": 4.343882846235402e-05, "loss": 1.9729, "step": 5148 }, { "epoch": 0.4836754264800331, "grad_norm": 0.4028972089290619, "learning_rate": 4.341353656590866e-05, "loss": 2.0059, "step": 5154 }, { "epoch": 0.48423849449688994, "grad_norm": 0.3868143856525421, "learning_rate": 4.3388288796069875e-05, "loss": 1.9648, "step": 5160 }, { "epoch": 0.4848015625137468, "grad_norm": 0.3591245412826538, "learning_rate": 4.336308502467442e-05, "loss": 2.0175, "step": 5166 }, { "epoch": 0.48536463053060364, "grad_norm": 0.45276692509651184, "learning_rate": 4.3337925124079545e-05, "loss": 2.0237, "step": 5172 }, { "epoch": 0.4859276985474605, "grad_norm": 0.43265360593795776, "learning_rate": 4.331280896716033e-05, "loss": 2.0134, "step": 5178 }, { "epoch": 0.48649076656431733, "grad_norm": 0.39135798811912537, "learning_rate": 4.328773642730699e-05, "loss": 1.9668, "step": 5184 }, { "epoch": 0.4870538345811742, "grad_norm": 0.38416069746017456, "learning_rate": 4.3262707378422155e-05, "loss": 1.9807, "step": 5190 }, { "epoch": 0.48761690259803103, "grad_norm": 0.36530598998069763, "learning_rate": 4.323772169491826e-05, "loss": 1.9851, "step": 5196 }, { "epoch": 0.4879922812759356, "eval_accuracy": 0.5802002735443532, "eval_loss": 2.0278139114379883, "eval_runtime": 88.1787, "eval_samples_per_second": 4.536, "eval_steps_per_second": 1.134, "step": 5200 }, { "epoch": 0.4881799706148879, "grad_norm": 0.37099042534828186, "learning_rate": 4.321277925171487e-05, "loss": 1.9903, "step": 5202 }, { "epoch": 0.48874303863174473, "grad_norm": 0.4254438579082489, "learning_rate": 4.318787992423604e-05, "loss": 1.9744, "step": 5208 }, { "epoch": 0.4893061066486016, "grad_norm": 0.37186649441719055, "learning_rate": 4.3163023588407716e-05, "loss": 2.0106, "step": 5214 }, { "epoch": 0.48986917466545843, "grad_norm": 0.41129064559936523, "learning_rate": 4.3138210120655134e-05, "loss": 1.9958, "step": 5220 }, { "epoch": 0.4904322426823153, "grad_norm": 0.4904578924179077, "learning_rate": 4.311343939790026e-05, "loss": 1.9849, "step": 5226 }, { "epoch": 0.4909953106991721, "grad_norm": 0.4175708591938019, "learning_rate": 4.3088711297559174e-05, "loss": 1.9602, "step": 5232 }, { "epoch": 0.491558378716029, "grad_norm": 0.4567301571369171, "learning_rate": 4.306402569753959e-05, "loss": 1.9912, "step": 5238 }, { "epoch": 0.4921214467328858, "grad_norm": 0.4682433307170868, "learning_rate": 4.303938247623828e-05, "loss": 1.9792, "step": 5244 }, { "epoch": 0.4926845147497427, "grad_norm": 0.3930492401123047, "learning_rate": 4.301478151253856e-05, "loss": 1.9866, "step": 5250 }, { "epoch": 0.4932475827665995, "grad_norm": 0.39510783553123474, "learning_rate": 4.299022268580779e-05, "loss": 1.9449, "step": 5256 }, { "epoch": 0.4938106507834564, "grad_norm": 0.3905639052391052, "learning_rate": 4.296570587589492e-05, "loss": 1.9231, "step": 5262 }, { "epoch": 0.4943737188003132, "grad_norm": 0.3940321207046509, "learning_rate": 4.2941230963127985e-05, "loss": 2.0034, "step": 5268 }, { "epoch": 0.49493678681717007, "grad_norm": 0.4676946997642517, "learning_rate": 4.291679782831167e-05, "loss": 2.0171, "step": 5274 }, { "epoch": 0.4954998548340269, "grad_norm": 0.3989827632904053, "learning_rate": 4.289240635272487e-05, "loss": 1.9295, "step": 5280 }, { "epoch": 0.49606292285088377, "grad_norm": 0.3598518669605255, "learning_rate": 4.286805641811825e-05, "loss": 1.9618, "step": 5286 }, { "epoch": 0.4966259908677406, "grad_norm": 0.38201889395713806, "learning_rate": 4.2843747906711866e-05, "loss": 1.9556, "step": 5292 }, { "epoch": 0.49718905888459747, "grad_norm": 0.3844050168991089, "learning_rate": 4.281948070119275e-05, "loss": 1.9712, "step": 5298 }, { "epoch": 0.4977521269014543, "grad_norm": 0.40299567580223083, "learning_rate": 4.2795254684712555e-05, "loss": 2.0329, "step": 5304 }, { "epoch": 0.49831519491831117, "grad_norm": 0.3820916414260864, "learning_rate": 4.2771069740885146e-05, "loss": 2.0085, "step": 5310 }, { "epoch": 0.498878262935168, "grad_norm": 0.4059142768383026, "learning_rate": 4.274692575378432e-05, "loss": 1.9922, "step": 5316 }, { "epoch": 0.49944133095202486, "grad_norm": 0.3828839957714081, "learning_rate": 4.2722822607941395e-05, "loss": 1.9574, "step": 5322 }, { "epoch": 0.5000043989688817, "grad_norm": 0.41781193017959595, "learning_rate": 4.269876018834296e-05, "loss": 1.958, "step": 5328 }, { "epoch": 0.5005674669857385, "grad_norm": 0.3976834714412689, "learning_rate": 4.2674738380428535e-05, "loss": 1.9632, "step": 5334 }, { "epoch": 0.5011305350025954, "grad_norm": 0.4100625514984131, "learning_rate": 4.265075707008828e-05, "loss": 2.0404, "step": 5340 }, { "epoch": 0.5016936030194522, "grad_norm": 0.44064298272132874, "learning_rate": 4.262681614366072e-05, "loss": 1.9679, "step": 5346 }, { "epoch": 0.5022566710363091, "grad_norm": 0.38148877024650574, "learning_rate": 4.26029154879305e-05, "loss": 2.0058, "step": 5352 }, { "epoch": 0.5028197390531659, "grad_norm": 0.5082322359085083, "learning_rate": 4.2579054990126114e-05, "loss": 1.985, "step": 5358 }, { "epoch": 0.5033828070700228, "grad_norm": 0.43715718388557434, "learning_rate": 4.255523453791768e-05, "loss": 1.9994, "step": 5364 }, { "epoch": 0.5039458750868796, "grad_norm": 0.3919329345226288, "learning_rate": 4.25314540194147e-05, "loss": 1.9609, "step": 5370 }, { "epoch": 0.5045089431037365, "grad_norm": 0.4321536421775818, "learning_rate": 4.2507713323163895e-05, "loss": 1.986, "step": 5376 }, { "epoch": 0.5050720111205933, "grad_norm": 0.3757963180541992, "learning_rate": 4.2484012338147e-05, "loss": 1.9682, "step": 5382 }, { "epoch": 0.5056350791374502, "grad_norm": 0.43193474411964417, "learning_rate": 4.2460350953778513e-05, "loss": 1.9565, "step": 5388 }, { "epoch": 0.506198147154307, "grad_norm": 0.3811191916465759, "learning_rate": 4.2436729059903646e-05, "loss": 2.0171, "step": 5394 }, { "epoch": 0.5067612151711639, "grad_norm": 0.3768790662288666, "learning_rate": 4.241314654679608e-05, "loss": 2.0314, "step": 5400 }, { "epoch": 0.5073242831880207, "grad_norm": 0.3737028241157532, "learning_rate": 4.238960330515585e-05, "loss": 2.0421, "step": 5406 }, { "epoch": 0.5078873512048776, "grad_norm": 0.41352012753486633, "learning_rate": 4.236609922610723e-05, "loss": 1.9534, "step": 5412 }, { "epoch": 0.5084504192217344, "grad_norm": 0.3723868131637573, "learning_rate": 4.234263420119662e-05, "loss": 1.9472, "step": 5418 }, { "epoch": 0.5090134872385913, "grad_norm": 0.3860258460044861, "learning_rate": 4.2319208122390415e-05, "loss": 1.9744, "step": 5424 }, { "epoch": 0.5095765552554481, "grad_norm": 0.40669429302215576, "learning_rate": 4.229582088207294e-05, "loss": 1.9667, "step": 5430 }, { "epoch": 0.510139623272305, "grad_norm": 0.3971322774887085, "learning_rate": 4.227247237304439e-05, "loss": 2.0016, "step": 5436 }, { "epoch": 0.5107026912891618, "grad_norm": 0.3952597379684448, "learning_rate": 4.224916248851871e-05, "loss": 2.0048, "step": 5442 }, { "epoch": 0.5112657593060187, "grad_norm": 0.39064130187034607, "learning_rate": 4.222589112212161e-05, "loss": 2.0167, "step": 5448 }, { "epoch": 0.5118288273228755, "grad_norm": 0.3681340515613556, "learning_rate": 4.220265816788847e-05, "loss": 2.0448, "step": 5454 }, { "epoch": 0.5123918953397324, "grad_norm": 0.3815629482269287, "learning_rate": 4.217946352026236e-05, "loss": 1.9873, "step": 5460 }, { "epoch": 0.5129549633565892, "grad_norm": 0.3496919870376587, "learning_rate": 4.215630707409197e-05, "loss": 2.0434, "step": 5466 }, { "epoch": 0.5135180313734461, "grad_norm": 0.3726310133934021, "learning_rate": 4.21331887246297e-05, "loss": 1.9301, "step": 5472 }, { "epoch": 0.5140810993903029, "grad_norm": 0.36616119742393494, "learning_rate": 4.2110108367529544e-05, "loss": 1.9825, "step": 5478 }, { "epoch": 0.5146441674071598, "grad_norm": 0.368679404258728, "learning_rate": 4.208706589884524e-05, "loss": 2.0039, "step": 5484 }, { "epoch": 0.5152072354240166, "grad_norm": 0.4155595898628235, "learning_rate": 4.206406121502819e-05, "loss": 1.9718, "step": 5490 }, { "epoch": 0.5157703034408735, "grad_norm": 0.3916139006614685, "learning_rate": 4.2041094212925614e-05, "loss": 2.0137, "step": 5496 }, { "epoch": 0.5163333714577303, "grad_norm": 0.38437360525131226, "learning_rate": 4.20181647897785e-05, "loss": 1.9531, "step": 5502 }, { "epoch": 0.5168964394745872, "grad_norm": 0.3994635045528412, "learning_rate": 4.199527284321976e-05, "loss": 2.0357, "step": 5508 }, { "epoch": 0.517459507491444, "grad_norm": 0.3694004714488983, "learning_rate": 4.1972418271272256e-05, "loss": 1.9798, "step": 5514 }, { "epoch": 0.5180225755083009, "grad_norm": 0.39978957176208496, "learning_rate": 4.1949600972346934e-05, "loss": 1.9764, "step": 5520 }, { "epoch": 0.5185856435251577, "grad_norm": 0.3891492486000061, "learning_rate": 4.1926820845240884e-05, "loss": 1.9808, "step": 5526 }, { "epoch": 0.5191487115420146, "grad_norm": 0.3843173682689667, "learning_rate": 4.190407778913548e-05, "loss": 2.028, "step": 5532 }, { "epoch": 0.5197117795588714, "grad_norm": 0.4036204218864441, "learning_rate": 4.188137170359452e-05, "loss": 1.9625, "step": 5538 }, { "epoch": 0.5202748475757283, "grad_norm": 0.40590202808380127, "learning_rate": 4.185870248856233e-05, "loss": 2.0048, "step": 5544 }, { "epoch": 0.5208379155925851, "grad_norm": 0.45766258239746094, "learning_rate": 4.1836070044361916e-05, "loss": 2.0167, "step": 5550 }, { "epoch": 0.521400983609442, "grad_norm": 0.3446559011936188, "learning_rate": 4.181347427169317e-05, "loss": 1.9946, "step": 5556 }, { "epoch": 0.5219640516262988, "grad_norm": 0.34969398379325867, "learning_rate": 4.179091507163096e-05, "loss": 1.997, "step": 5562 }, { "epoch": 0.5225271196431557, "grad_norm": 0.336882084608078, "learning_rate": 4.176839234562339e-05, "loss": 2.0171, "step": 5568 }, { "epoch": 0.5230901876600125, "grad_norm": 0.3853786289691925, "learning_rate": 4.1745905995489955e-05, "loss": 2.0432, "step": 5574 }, { "epoch": 0.5236532556768694, "grad_norm": 0.3777616620063782, "learning_rate": 4.1723455923419725e-05, "loss": 1.9698, "step": 5580 }, { "epoch": 0.5242163236937262, "grad_norm": 0.4077838361263275, "learning_rate": 4.170104203196959e-05, "loss": 1.9435, "step": 5586 }, { "epoch": 0.5247793917105831, "grad_norm": 0.3969927728176117, "learning_rate": 4.167866422406249e-05, "loss": 2.006, "step": 5592 }, { "epoch": 0.5253424597274399, "grad_norm": 0.36376693844795227, "learning_rate": 4.165632240298559e-05, "loss": 1.982, "step": 5598 }, { "epoch": 0.5255301490663922, "eval_accuracy": 0.5807346619773349, "eval_loss": 2.0233240127563477, "eval_runtime": 88.1458, "eval_samples_per_second": 4.538, "eval_steps_per_second": 1.134, "step": 5600 }, { "epoch": 0.5259055277442968, "grad_norm": 0.4011813700199127, "learning_rate": 4.1634016472388606e-05, "loss": 1.9611, "step": 5604 }, { "epoch": 0.5264685957611536, "grad_norm": 0.3519646227359772, "learning_rate": 4.161174633628203e-05, "loss": 2.0213, "step": 5610 }, { "epoch": 0.5270316637780105, "grad_norm": 0.44300225377082825, "learning_rate": 4.158951189903532e-05, "loss": 2.0176, "step": 5616 }, { "epoch": 0.5275947317948673, "grad_norm": 0.4334258735179901, "learning_rate": 4.156731306537529e-05, "loss": 1.9705, "step": 5622 }, { "epoch": 0.5281577998117242, "grad_norm": 0.39069312810897827, "learning_rate": 4.1545149740384374e-05, "loss": 1.9718, "step": 5628 }, { "epoch": 0.528720867828581, "grad_norm": 0.38930240273475647, "learning_rate": 4.152302182949881e-05, "loss": 1.978, "step": 5634 }, { "epoch": 0.5292839358454379, "grad_norm": 0.3829316198825836, "learning_rate": 4.150092923850711e-05, "loss": 2.0176, "step": 5640 }, { "epoch": 0.5298470038622947, "grad_norm": 0.4023854434490204, "learning_rate": 4.1478871873548244e-05, "loss": 1.9787, "step": 5646 }, { "epoch": 0.5304100718791516, "grad_norm": 0.34888702630996704, "learning_rate": 4.1456849641110036e-05, "loss": 1.9791, "step": 5652 }, { "epoch": 0.5309731398960084, "grad_norm": 0.38115644454956055, "learning_rate": 4.143486244802747e-05, "loss": 1.9956, "step": 5658 }, { "epoch": 0.5315362079128653, "grad_norm": 0.3804568648338318, "learning_rate": 4.1412910201481036e-05, "loss": 2.0282, "step": 5664 }, { "epoch": 0.5320992759297221, "grad_norm": 0.4192138612270355, "learning_rate": 4.1390992808995126e-05, "loss": 2.0244, "step": 5670 }, { "epoch": 0.532662343946579, "grad_norm": 0.3765939474105835, "learning_rate": 4.136911017843632e-05, "loss": 2.0021, "step": 5676 }, { "epoch": 0.5332254119634358, "grad_norm": 0.3987525999546051, "learning_rate": 4.1347262218011806e-05, "loss": 2.029, "step": 5682 }, { "epoch": 0.5337884799802927, "grad_norm": 0.4318542182445526, "learning_rate": 4.1325448836267796e-05, "loss": 1.9305, "step": 5688 }, { "epoch": 0.5343515479971495, "grad_norm": 0.382104754447937, "learning_rate": 4.130366994208785e-05, "loss": 1.9998, "step": 5694 }, { "epoch": 0.5349146160140064, "grad_norm": 0.3847979009151459, "learning_rate": 4.128192544469132e-05, "loss": 1.9599, "step": 5700 }, { "epoch": 0.5354776840308632, "grad_norm": 0.39705976843833923, "learning_rate": 4.126021525363174e-05, "loss": 1.9778, "step": 5706 }, { "epoch": 0.5360407520477201, "grad_norm": 0.4714218080043793, "learning_rate": 4.1238539278795265e-05, "loss": 2.0187, "step": 5712 }, { "epoch": 0.5366038200645769, "grad_norm": 0.4245346784591675, "learning_rate": 4.121689743039907e-05, "loss": 2.0008, "step": 5718 }, { "epoch": 0.5371668880814338, "grad_norm": 0.37979698181152344, "learning_rate": 4.1195289618989857e-05, "loss": 1.9908, "step": 5724 }, { "epoch": 0.5377299560982906, "grad_norm": 0.37260618805885315, "learning_rate": 4.117371575544218e-05, "loss": 1.9809, "step": 5730 }, { "epoch": 0.5382930241151475, "grad_norm": 0.3810240626335144, "learning_rate": 4.115217575095705e-05, "loss": 1.9944, "step": 5736 }, { "epoch": 0.5388560921320042, "grad_norm": 0.3896656930446625, "learning_rate": 4.113066951706026e-05, "loss": 2.0138, "step": 5742 }, { "epoch": 0.5394191601488612, "grad_norm": 0.3967185318470001, "learning_rate": 4.110919696560098e-05, "loss": 2.0092, "step": 5748 }, { "epoch": 0.539982228165718, "grad_norm": 0.40669170022010803, "learning_rate": 4.108775800875013e-05, "loss": 2.007, "step": 5754 }, { "epoch": 0.5405452961825749, "grad_norm": 0.4059445559978485, "learning_rate": 4.106635255899895e-05, "loss": 1.973, "step": 5760 }, { "epoch": 0.5411083641994316, "grad_norm": 0.40218299627304077, "learning_rate": 4.10449805291575e-05, "loss": 1.9988, "step": 5766 }, { "epoch": 0.5416714322162886, "grad_norm": 0.3900507688522339, "learning_rate": 4.102364183235308e-05, "loss": 1.987, "step": 5772 }, { "epoch": 0.5422345002331453, "grad_norm": 0.4266583323478699, "learning_rate": 4.100233638202885e-05, "loss": 2.0225, "step": 5778 }, { "epoch": 0.5427975682500022, "grad_norm": 0.4902029037475586, "learning_rate": 4.098106409194232e-05, "loss": 1.9949, "step": 5784 }, { "epoch": 0.543360636266859, "grad_norm": 0.38995635509490967, "learning_rate": 4.0959824876163846e-05, "loss": 2.0069, "step": 5790 }, { "epoch": 0.543923704283716, "grad_norm": 0.3783828020095825, "learning_rate": 4.0938618649075226e-05, "loss": 2.0285, "step": 5796 }, { "epoch": 0.5444867723005727, "grad_norm": 0.3786715269088745, "learning_rate": 4.091744532536824e-05, "loss": 2.0177, "step": 5802 }, { "epoch": 0.5450498403174296, "grad_norm": 0.3907123804092407, "learning_rate": 4.0896304820043145e-05, "loss": 2.0151, "step": 5808 }, { "epoch": 0.5456129083342864, "grad_norm": 0.4417511224746704, "learning_rate": 4.087519704840737e-05, "loss": 1.9682, "step": 5814 }, { "epoch": 0.5461759763511433, "grad_norm": 0.4394706189632416, "learning_rate": 4.085412192607394e-05, "loss": 1.9332, "step": 5820 }, { "epoch": 0.5467390443680001, "grad_norm": 0.4520696699619293, "learning_rate": 4.083307936896019e-05, "loss": 2.0031, "step": 5826 }, { "epoch": 0.547302112384857, "grad_norm": 0.3709902763366699, "learning_rate": 4.0812069293286266e-05, "loss": 2.0281, "step": 5832 }, { "epoch": 0.5478651804017138, "grad_norm": 0.4162772595882416, "learning_rate": 4.0791091615573786e-05, "loss": 1.9531, "step": 5838 }, { "epoch": 0.5484282484185707, "grad_norm": 0.35189324617385864, "learning_rate": 4.077014625264438e-05, "loss": 1.9935, "step": 5844 }, { "epoch": 0.5489913164354275, "grad_norm": 0.4410395622253418, "learning_rate": 4.074923312161837e-05, "loss": 1.9707, "step": 5850 }, { "epoch": 0.5495543844522844, "grad_norm": 0.3544395864009857, "learning_rate": 4.072835213991339e-05, "loss": 1.9886, "step": 5856 }, { "epoch": 0.5501174524691412, "grad_norm": 0.39896485209465027, "learning_rate": 4.070750322524292e-05, "loss": 1.9666, "step": 5862 }, { "epoch": 0.5506805204859981, "grad_norm": 0.43210774660110474, "learning_rate": 4.068668629561507e-05, "loss": 1.987, "step": 5868 }, { "epoch": 0.5512435885028549, "grad_norm": 0.37000924348831177, "learning_rate": 4.066590126933112e-05, "loss": 1.9952, "step": 5874 }, { "epoch": 0.5518066565197117, "grad_norm": 0.34932956099510193, "learning_rate": 4.064514806498421e-05, "loss": 1.9578, "step": 5880 }, { "epoch": 0.5523697245365686, "grad_norm": 0.42461860179901123, "learning_rate": 4.062442660145799e-05, "loss": 1.9773, "step": 5886 }, { "epoch": 0.5529327925534254, "grad_norm": 0.40360283851623535, "learning_rate": 4.060373679792531e-05, "loss": 1.9834, "step": 5892 }, { "epoch": 0.5534958605702823, "grad_norm": 0.4303828775882721, "learning_rate": 4.058307857384685e-05, "loss": 1.988, "step": 5898 }, { "epoch": 0.5540589285871391, "grad_norm": 0.4159733057022095, "learning_rate": 4.0562451848969884e-05, "loss": 1.9096, "step": 5904 }, { "epoch": 0.554621996603996, "grad_norm": 0.3848516047000885, "learning_rate": 4.0541856543326863e-05, "loss": 1.977, "step": 5910 }, { "epoch": 0.5551850646208528, "grad_norm": 0.35450857877731323, "learning_rate": 4.052129257723423e-05, "loss": 1.9543, "step": 5916 }, { "epoch": 0.5557481326377097, "grad_norm": 0.4022143483161926, "learning_rate": 4.0500759871291013e-05, "loss": 2.0292, "step": 5922 }, { "epoch": 0.5563112006545665, "grad_norm": 0.35112500190734863, "learning_rate": 4.048025834637761e-05, "loss": 1.9552, "step": 5928 }, { "epoch": 0.5568742686714234, "grad_norm": 0.44715118408203125, "learning_rate": 4.045978792365449e-05, "loss": 2.021, "step": 5934 }, { "epoch": 0.5574373366882802, "grad_norm": 0.39389798045158386, "learning_rate": 4.043934852456094e-05, "loss": 1.9982, "step": 5940 }, { "epoch": 0.5580004047051371, "grad_norm": 0.38352641463279724, "learning_rate": 4.041894007081374e-05, "loss": 2.022, "step": 5946 }, { "epoch": 0.5585634727219939, "grad_norm": 0.3996003568172455, "learning_rate": 4.0398562484405944e-05, "loss": 1.9648, "step": 5952 }, { "epoch": 0.5591265407388508, "grad_norm": 0.3656814694404602, "learning_rate": 4.037821568760565e-05, "loss": 1.9455, "step": 5958 }, { "epoch": 0.5596896087557076, "grad_norm": 0.4010196626186371, "learning_rate": 4.0357899602954724e-05, "loss": 1.9773, "step": 5964 }, { "epoch": 0.5602526767725645, "grad_norm": 0.44710975885391235, "learning_rate": 4.033761415326754e-05, "loss": 2.0405, "step": 5970 }, { "epoch": 0.5608157447894213, "grad_norm": 0.39366260170936584, "learning_rate": 4.031735926162979e-05, "loss": 1.9554, "step": 5976 }, { "epoch": 0.5613788128062782, "grad_norm": 0.3697490096092224, "learning_rate": 4.029713485139725e-05, "loss": 1.9916, "step": 5982 }, { "epoch": 0.561941880823135, "grad_norm": 0.37148427963256836, "learning_rate": 4.027694084619456e-05, "loss": 1.9588, "step": 5988 }, { "epoch": 0.5625049488399919, "grad_norm": 0.4041338562965393, "learning_rate": 4.0256777169913983e-05, "loss": 1.9591, "step": 5994 }, { "epoch": 0.5630680168568487, "grad_norm": 0.44425681233406067, "learning_rate": 4.0236643746714266e-05, "loss": 1.964, "step": 6000 }, { "epoch": 0.5630680168568487, "eval_accuracy": 0.5813032434544744, "eval_loss": 2.019303321838379, "eval_runtime": 88.0718, "eval_samples_per_second": 4.542, "eval_steps_per_second": 1.135, "step": 6000 }, { "epoch": 0.5636310848737056, "grad_norm": 0.37781983613967896, "learning_rate": 4.021654050101935e-05, "loss": 1.9946, "step": 6006 }, { "epoch": 0.5641941528905624, "grad_norm": 0.4044470489025116, "learning_rate": 4.019646735751728e-05, "loss": 1.9303, "step": 6012 }, { "epoch": 0.5647572209074193, "grad_norm": 0.41582611203193665, "learning_rate": 4.0176424241158936e-05, "loss": 1.9341, "step": 6018 }, { "epoch": 0.5653202889242761, "grad_norm": 0.36913079023361206, "learning_rate": 4.01564110771569e-05, "loss": 1.9961, "step": 6024 }, { "epoch": 0.565883356941133, "grad_norm": 0.4392784535884857, "learning_rate": 4.013642779098429e-05, "loss": 1.9828, "step": 6030 }, { "epoch": 0.5664464249579898, "grad_norm": 0.377360075712204, "learning_rate": 4.011647430837353e-05, "loss": 1.9792, "step": 6036 }, { "epoch": 0.5670094929748467, "grad_norm": 0.3980574607849121, "learning_rate": 4.0096550555315296e-05, "loss": 2.0204, "step": 6042 }, { "epoch": 0.5675725609917035, "grad_norm": 0.41038304567337036, "learning_rate": 4.007665645805726e-05, "loss": 1.9897, "step": 6048 }, { "epoch": 0.5681356290085604, "grad_norm": 0.3898538053035736, "learning_rate": 4.005679194310302e-05, "loss": 1.9637, "step": 6054 }, { "epoch": 0.5686986970254172, "grad_norm": 0.42959845066070557, "learning_rate": 4.0036956937210896e-05, "loss": 2.0207, "step": 6060 }, { "epoch": 0.5692617650422741, "grad_norm": 0.4774961471557617, "learning_rate": 4.001715136739284e-05, "loss": 1.984, "step": 6066 }, { "epoch": 0.5698248330591309, "grad_norm": 0.40249085426330566, "learning_rate": 3.999737516091332e-05, "loss": 2.003, "step": 6072 }, { "epoch": 0.5703879010759878, "grad_norm": 0.36520400643348694, "learning_rate": 3.9977628245288136e-05, "loss": 1.9854, "step": 6078 }, { "epoch": 0.5709509690928446, "grad_norm": 0.405719131231308, "learning_rate": 3.995791054828337e-05, "loss": 1.9834, "step": 6084 }, { "epoch": 0.5715140371097015, "grad_norm": 0.42953357100486755, "learning_rate": 3.993822199791423e-05, "loss": 1.9432, "step": 6090 }, { "epoch": 0.5720771051265583, "grad_norm": 0.3763583302497864, "learning_rate": 3.991856252244397e-05, "loss": 2.0298, "step": 6096 }, { "epoch": 0.5726401731434152, "grad_norm": 0.41984015703201294, "learning_rate": 3.989893205038277e-05, "loss": 1.9869, "step": 6102 }, { "epoch": 0.573203241160272, "grad_norm": 0.4197002649307251, "learning_rate": 3.987933051048671e-05, "loss": 1.9573, "step": 6108 }, { "epoch": 0.5737663091771289, "grad_norm": 0.3574821650981903, "learning_rate": 3.985975783175659e-05, "loss": 1.9507, "step": 6114 }, { "epoch": 0.5743293771939857, "grad_norm": 0.3665715456008911, "learning_rate": 3.984021394343689e-05, "loss": 2.0247, "step": 6120 }, { "epoch": 0.5748924452108426, "grad_norm": 0.41787222027778625, "learning_rate": 3.9820698775014734e-05, "loss": 1.9686, "step": 6126 }, { "epoch": 0.5754555132276994, "grad_norm": 0.47998836636543274, "learning_rate": 3.9801212256218766e-05, "loss": 1.9563, "step": 6132 }, { "epoch": 0.5760185812445563, "grad_norm": 0.40270689129829407, "learning_rate": 3.978175431701812e-05, "loss": 2.0253, "step": 6138 }, { "epoch": 0.5765816492614131, "grad_norm": 0.3729938268661499, "learning_rate": 3.976232488762132e-05, "loss": 1.9416, "step": 6144 }, { "epoch": 0.57714471727827, "grad_norm": 0.35704505443573, "learning_rate": 3.974292389847531e-05, "loss": 1.9887, "step": 6150 }, { "epoch": 0.5777077852951268, "grad_norm": 0.3545173704624176, "learning_rate": 3.972355128026433e-05, "loss": 2.0352, "step": 6156 }, { "epoch": 0.5782708533119837, "grad_norm": 0.35938069224357605, "learning_rate": 3.970420696390889e-05, "loss": 1.9313, "step": 6162 }, { "epoch": 0.5788339213288405, "grad_norm": 0.35183244943618774, "learning_rate": 3.9684890880564796e-05, "loss": 1.9698, "step": 6168 }, { "epoch": 0.5793969893456974, "grad_norm": 0.5035154819488525, "learning_rate": 3.9665602961622025e-05, "loss": 1.9527, "step": 6174 }, { "epoch": 0.5799600573625542, "grad_norm": 0.3815072178840637, "learning_rate": 3.9646343138703784e-05, "loss": 2.0499, "step": 6180 }, { "epoch": 0.5805231253794111, "grad_norm": 0.4054972529411316, "learning_rate": 3.9627111343665453e-05, "loss": 1.9997, "step": 6186 }, { "epoch": 0.5810861933962679, "grad_norm": 0.35555070638656616, "learning_rate": 3.9607907508593564e-05, "loss": 1.9835, "step": 6192 }, { "epoch": 0.5816492614131248, "grad_norm": 0.3769945502281189, "learning_rate": 3.958873156580482e-05, "loss": 2.0264, "step": 6198 }, { "epoch": 0.5822123294299816, "grad_norm": 0.40851595997810364, "learning_rate": 3.95695834478451e-05, "loss": 2.0144, "step": 6204 }, { "epoch": 0.5827753974468385, "grad_norm": 0.35131165385246277, "learning_rate": 3.95504630874884e-05, "loss": 2.0026, "step": 6210 }, { "epoch": 0.5833384654636953, "grad_norm": 0.39233967661857605, "learning_rate": 3.9531370417735925e-05, "loss": 1.9898, "step": 6216 }, { "epoch": 0.5839015334805522, "grad_norm": 0.36482545733451843, "learning_rate": 3.9512305371815044e-05, "loss": 1.9722, "step": 6222 }, { "epoch": 0.584464601497409, "grad_norm": 0.45449182391166687, "learning_rate": 3.9493267883178343e-05, "loss": 1.9298, "step": 6228 }, { "epoch": 0.5850276695142659, "grad_norm": 0.4093194603919983, "learning_rate": 3.94742578855026e-05, "loss": 1.9663, "step": 6234 }, { "epoch": 0.5855907375311227, "grad_norm": 0.4480719566345215, "learning_rate": 3.945527531268791e-05, "loss": 1.9938, "step": 6240 }, { "epoch": 0.5861538055479796, "grad_norm": 0.38587144017219543, "learning_rate": 3.94363200988566e-05, "loss": 1.99, "step": 6246 }, { "epoch": 0.5867168735648364, "grad_norm": 0.36470258235931396, "learning_rate": 3.9417392178352367e-05, "loss": 1.9565, "step": 6252 }, { "epoch": 0.5872799415816933, "grad_norm": 0.39844486117362976, "learning_rate": 3.939849148573927e-05, "loss": 1.9639, "step": 6258 }, { "epoch": 0.5878430095985501, "grad_norm": 0.42090922594070435, "learning_rate": 3.9379617955800785e-05, "loss": 1.9964, "step": 6264 }, { "epoch": 0.588406077615407, "grad_norm": 0.3681942820549011, "learning_rate": 3.9360771523538886e-05, "loss": 1.9939, "step": 6270 }, { "epoch": 0.5889691456322638, "grad_norm": 0.40321579575538635, "learning_rate": 3.934195212417307e-05, "loss": 2.0181, "step": 6276 }, { "epoch": 0.5895322136491207, "grad_norm": 0.4560224413871765, "learning_rate": 3.932315969313948e-05, "loss": 1.9846, "step": 6282 }, { "epoch": 0.5900952816659775, "grad_norm": 0.3909851312637329, "learning_rate": 3.9304394166089865e-05, "loss": 2.003, "step": 6288 }, { "epoch": 0.5906583496828344, "grad_norm": 0.4251001179218292, "learning_rate": 3.928565547889078e-05, "loss": 2.0079, "step": 6294 }, { "epoch": 0.5912214176996912, "grad_norm": 0.39706355333328247, "learning_rate": 3.926694356762259e-05, "loss": 1.9886, "step": 6300 }, { "epoch": 0.5917844857165481, "grad_norm": 0.39942666888237, "learning_rate": 3.9248258368578574e-05, "loss": 1.9719, "step": 6306 }, { "epoch": 0.5923475537334049, "grad_norm": 0.415943443775177, "learning_rate": 3.922959981826402e-05, "loss": 2.0211, "step": 6312 }, { "epoch": 0.5929106217502618, "grad_norm": 0.3943447470664978, "learning_rate": 3.921096785339531e-05, "loss": 1.9656, "step": 6318 }, { "epoch": 0.5934736897671186, "grad_norm": 0.36723560094833374, "learning_rate": 3.9192362410899e-05, "loss": 1.991, "step": 6324 }, { "epoch": 0.5940367577839755, "grad_norm": 0.4864744544029236, "learning_rate": 3.917378342791097e-05, "loss": 1.9978, "step": 6330 }, { "epoch": 0.5945998258008323, "grad_norm": 0.3999435603618622, "learning_rate": 3.91552308417755e-05, "loss": 1.9837, "step": 6336 }, { "epoch": 0.5951628938176892, "grad_norm": 0.3919436037540436, "learning_rate": 3.913670459004438e-05, "loss": 1.9717, "step": 6342 }, { "epoch": 0.595725961834546, "grad_norm": 0.42146408557891846, "learning_rate": 3.911820461047606e-05, "loss": 2.0219, "step": 6348 }, { "epoch": 0.5962890298514029, "grad_norm": 0.47247040271759033, "learning_rate": 3.909973084103469e-05, "loss": 1.9678, "step": 6354 }, { "epoch": 0.5968520978682597, "grad_norm": 0.39644455909729004, "learning_rate": 3.908128321988938e-05, "loss": 1.9303, "step": 6360 }, { "epoch": 0.5974151658851166, "grad_norm": 0.390960156917572, "learning_rate": 3.90628616854132e-05, "loss": 1.9821, "step": 6366 }, { "epoch": 0.5979782339019734, "grad_norm": 0.38516461849212646, "learning_rate": 3.904446617618239e-05, "loss": 1.9724, "step": 6372 }, { "epoch": 0.5985413019188303, "grad_norm": 0.3723876476287842, "learning_rate": 3.9026096630975474e-05, "loss": 1.9985, "step": 6378 }, { "epoch": 0.599104369935687, "grad_norm": 0.41217973828315735, "learning_rate": 3.900775298877241e-05, "loss": 1.9881, "step": 6384 }, { "epoch": 0.599667437952544, "grad_norm": 0.365089476108551, "learning_rate": 3.8989435188753736e-05, "loss": 1.9351, "step": 6390 }, { "epoch": 0.6002305059694008, "grad_norm": 0.4036978483200073, "learning_rate": 3.897114317029974e-05, "loss": 2.0033, "step": 6396 }, { "epoch": 0.6006058846473054, "eval_accuracy": 0.5818132082844861, "eval_loss": 2.0161798000335693, "eval_runtime": 88.2032, "eval_samples_per_second": 4.535, "eval_steps_per_second": 1.134, "step": 6400 }, { "epoch": 0.6007935739862577, "grad_norm": 0.35811367630958557, "learning_rate": 3.8952876872989586e-05, "loss": 1.9778, "step": 6402 }, { "epoch": 0.6013566420031144, "grad_norm": 0.36226609349250793, "learning_rate": 3.89346362366005e-05, "loss": 1.9735, "step": 6408 }, { "epoch": 0.6019197100199714, "grad_norm": 0.40958431363105774, "learning_rate": 3.891642120110692e-05, "loss": 2.0134, "step": 6414 }, { "epoch": 0.6024827780368281, "grad_norm": 0.4320530891418457, "learning_rate": 3.88982317066797e-05, "loss": 1.9886, "step": 6420 }, { "epoch": 0.603045846053685, "grad_norm": 0.4495929777622223, "learning_rate": 3.888006769368524e-05, "loss": 1.9429, "step": 6426 }, { "epoch": 0.6036089140705418, "grad_norm": 0.42731255292892456, "learning_rate": 3.8861929102684667e-05, "loss": 1.9402, "step": 6432 }, { "epoch": 0.6041719820873988, "grad_norm": 0.37746015191078186, "learning_rate": 3.884381587443308e-05, "loss": 1.9567, "step": 6438 }, { "epoch": 0.6047350501042555, "grad_norm": 0.43025797605514526, "learning_rate": 3.8825727949878655e-05, "loss": 1.9837, "step": 6444 }, { "epoch": 0.6052981181211124, "grad_norm": 0.40397879481315613, "learning_rate": 3.88076652701619e-05, "loss": 2.0043, "step": 6450 }, { "epoch": 0.6058611861379692, "grad_norm": 0.3777856230735779, "learning_rate": 3.878962777661481e-05, "loss": 1.97, "step": 6456 }, { "epoch": 0.6064242541548261, "grad_norm": 0.4164550006389618, "learning_rate": 3.877161541076008e-05, "loss": 1.9955, "step": 6462 }, { "epoch": 0.6069873221716829, "grad_norm": 0.4218258857727051, "learning_rate": 3.8753628114310326e-05, "loss": 2.0071, "step": 6468 }, { "epoch": 0.6075503901885398, "grad_norm": 0.503016471862793, "learning_rate": 3.8735665829167254e-05, "loss": 2.0065, "step": 6474 }, { "epoch": 0.6081134582053966, "grad_norm": 0.39448899030685425, "learning_rate": 3.871772849742092e-05, "loss": 2.015, "step": 6480 }, { "epoch": 0.6086765262222535, "grad_norm": 0.395650714635849, "learning_rate": 3.8699816061348904e-05, "loss": 2.0183, "step": 6486 }, { "epoch": 0.6092395942391103, "grad_norm": 0.43221452832221985, "learning_rate": 3.868192846341556e-05, "loss": 2.01, "step": 6492 }, { "epoch": 0.6098026622559672, "grad_norm": 0.3857628405094147, "learning_rate": 3.86640656462712e-05, "loss": 1.9654, "step": 6498 }, { "epoch": 0.610365730272824, "grad_norm": 0.3724585771560669, "learning_rate": 3.864622755275139e-05, "loss": 1.9956, "step": 6504 }, { "epoch": 0.6109287982896809, "grad_norm": 0.42504069209098816, "learning_rate": 3.86284141258761e-05, "loss": 1.9677, "step": 6510 }, { "epoch": 0.6114918663065377, "grad_norm": 0.39714619517326355, "learning_rate": 3.861062530884901e-05, "loss": 1.9673, "step": 6516 }, { "epoch": 0.6120549343233946, "grad_norm": 0.45507359504699707, "learning_rate": 3.85928610450567e-05, "loss": 1.9891, "step": 6522 }, { "epoch": 0.6126180023402514, "grad_norm": 0.40843355655670166, "learning_rate": 3.8575121278067906e-05, "loss": 1.9965, "step": 6528 }, { "epoch": 0.6131810703571083, "grad_norm": 0.4982570707798004, "learning_rate": 3.855740595163279e-05, "loss": 1.9335, "step": 6534 }, { "epoch": 0.6137441383739651, "grad_norm": 0.43439215421676636, "learning_rate": 3.853971500968219e-05, "loss": 2.022, "step": 6540 }, { "epoch": 0.614307206390822, "grad_norm": 0.396650105714798, "learning_rate": 3.8522048396326824e-05, "loss": 1.9668, "step": 6546 }, { "epoch": 0.6148702744076788, "grad_norm": 0.3932853043079376, "learning_rate": 3.85044060558566e-05, "loss": 1.9385, "step": 6552 }, { "epoch": 0.6154333424245357, "grad_norm": 0.37701502442359924, "learning_rate": 3.8486787932739844e-05, "loss": 1.9984, "step": 6558 }, { "epoch": 0.6159964104413925, "grad_norm": 0.3989914655685425, "learning_rate": 3.846919397162261e-05, "loss": 1.9868, "step": 6564 }, { "epoch": 0.6165594784582494, "grad_norm": 0.4196706712245941, "learning_rate": 3.8451624117327916e-05, "loss": 1.9709, "step": 6570 }, { "epoch": 0.6171225464751062, "grad_norm": 0.40802663564682007, "learning_rate": 3.843407831485502e-05, "loss": 1.9472, "step": 6576 }, { "epoch": 0.6176856144919631, "grad_norm": 0.3687900900840759, "learning_rate": 3.841655650937869e-05, "loss": 1.9707, "step": 6582 }, { "epoch": 0.6182486825088199, "grad_norm": 0.3762992322444916, "learning_rate": 3.839905864624851e-05, "loss": 1.9094, "step": 6588 }, { "epoch": 0.6188117505256768, "grad_norm": 0.357194721698761, "learning_rate": 3.8381584670988155e-05, "loss": 2.0029, "step": 6594 }, { "epoch": 0.6193748185425336, "grad_norm": 0.36560729146003723, "learning_rate": 3.836413452929465e-05, "loss": 2.0049, "step": 6600 }, { "epoch": 0.6199378865593905, "grad_norm": 0.35655179619789124, "learning_rate": 3.834670816703771e-05, "loss": 1.9645, "step": 6606 }, { "epoch": 0.6205009545762473, "grad_norm": 0.384278267621994, "learning_rate": 3.832930553025899e-05, "loss": 2.0287, "step": 6612 }, { "epoch": 0.6210640225931042, "grad_norm": 0.4420374631881714, "learning_rate": 3.8311926565171403e-05, "loss": 1.9676, "step": 6618 }, { "epoch": 0.621627090609961, "grad_norm": 0.5364717245101929, "learning_rate": 3.8294571218158436e-05, "loss": 1.9948, "step": 6624 }, { "epoch": 0.6221901586268179, "grad_norm": 0.41895872354507446, "learning_rate": 3.827723943577344e-05, "loss": 2.032, "step": 6630 }, { "epoch": 0.6227532266436747, "grad_norm": 0.41526928544044495, "learning_rate": 3.8259931164738924e-05, "loss": 1.9714, "step": 6636 }, { "epoch": 0.6233162946605316, "grad_norm": 0.40324461460113525, "learning_rate": 3.824264635194589e-05, "loss": 1.9495, "step": 6642 }, { "epoch": 0.6238793626773884, "grad_norm": 0.43266692757606506, "learning_rate": 3.822538494445313e-05, "loss": 1.9955, "step": 6648 }, { "epoch": 0.6244424306942453, "grad_norm": 0.4115055501461029, "learning_rate": 3.82081468894866e-05, "loss": 1.9779, "step": 6654 }, { "epoch": 0.6250054987111021, "grad_norm": 0.40066128969192505, "learning_rate": 3.819093213443863e-05, "loss": 1.9228, "step": 6660 }, { "epoch": 0.625568566727959, "grad_norm": 0.3730969727039337, "learning_rate": 3.817374062686738e-05, "loss": 1.9712, "step": 6666 }, { "epoch": 0.6261316347448158, "grad_norm": 0.3869776725769043, "learning_rate": 3.8156572314496066e-05, "loss": 1.9876, "step": 6672 }, { "epoch": 0.6266947027616727, "grad_norm": 0.3770878314971924, "learning_rate": 3.813942714521234e-05, "loss": 1.988, "step": 6678 }, { "epoch": 0.6272577707785295, "grad_norm": 0.41099244356155396, "learning_rate": 3.812230506706764e-05, "loss": 1.9567, "step": 6684 }, { "epoch": 0.6278208387953864, "grad_norm": 0.36439961194992065, "learning_rate": 3.810520602827649e-05, "loss": 1.9709, "step": 6690 }, { "epoch": 0.6283839068122432, "grad_norm": 0.3720015287399292, "learning_rate": 3.808812997721588e-05, "loss": 1.9722, "step": 6696 }, { "epoch": 0.6289469748291001, "grad_norm": 0.4005125164985657, "learning_rate": 3.807107686242454e-05, "loss": 1.9955, "step": 6702 }, { "epoch": 0.6295100428459569, "grad_norm": 0.36164671182632446, "learning_rate": 3.8054046632602414e-05, "loss": 1.9937, "step": 6708 }, { "epoch": 0.6300731108628137, "grad_norm": 0.3621242642402649, "learning_rate": 3.8037039236609874e-05, "loss": 1.9683, "step": 6714 }, { "epoch": 0.6306361788796706, "grad_norm": 0.45920518040657043, "learning_rate": 3.8020054623467186e-05, "loss": 1.9658, "step": 6720 }, { "epoch": 0.6311992468965274, "grad_norm": 0.4063742160797119, "learning_rate": 3.800309274235378e-05, "loss": 2.0209, "step": 6726 }, { "epoch": 0.6317623149133843, "grad_norm": 0.38678812980651855, "learning_rate": 3.798615354260769e-05, "loss": 1.952, "step": 6732 }, { "epoch": 0.6323253829302411, "grad_norm": 0.3813757002353668, "learning_rate": 3.796923697372486e-05, "loss": 1.9565, "step": 6738 }, { "epoch": 0.632888450947098, "grad_norm": 0.43353238701820374, "learning_rate": 3.795234298535852e-05, "loss": 1.9209, "step": 6744 }, { "epoch": 0.6334515189639548, "grad_norm": 0.4134998619556427, "learning_rate": 3.7935471527318596e-05, "loss": 1.984, "step": 6750 }, { "epoch": 0.6340145869808117, "grad_norm": 0.3529532253742218, "learning_rate": 3.791862254957105e-05, "loss": 1.9411, "step": 6756 }, { "epoch": 0.6345776549976685, "grad_norm": 0.3921270966529846, "learning_rate": 3.7901796002237264e-05, "loss": 1.9697, "step": 6762 }, { "epoch": 0.6351407230145254, "grad_norm": 0.3699937164783478, "learning_rate": 3.7884991835593404e-05, "loss": 2.0023, "step": 6768 }, { "epoch": 0.6357037910313822, "grad_norm": 0.3940852880477905, "learning_rate": 3.7868210000069835e-05, "loss": 1.9312, "step": 6774 }, { "epoch": 0.6362668590482391, "grad_norm": 0.3512316346168518, "learning_rate": 3.7851450446250484e-05, "loss": 2.0172, "step": 6780 }, { "epoch": 0.6368299270650959, "grad_norm": 0.38011234998703003, "learning_rate": 3.7834713124872234e-05, "loss": 1.9837, "step": 6786 }, { "epoch": 0.6373929950819528, "grad_norm": 0.41160330176353455, "learning_rate": 3.7817997986824325e-05, "loss": 1.9556, "step": 6792 }, { "epoch": 0.6379560630988096, "grad_norm": 0.3537224531173706, "learning_rate": 3.780130498314774e-05, "loss": 1.9992, "step": 6798 }, { "epoch": 0.6381437524377619, "eval_accuracy": 0.5824364986322782, "eval_loss": 2.013535737991333, "eval_runtime": 88.1798, "eval_samples_per_second": 4.536, "eval_steps_per_second": 1.134, "step": 6800 }, { "epoch": 0.6385191311156665, "grad_norm": 0.40040162205696106, "learning_rate": 3.778463406503458e-05, "loss": 1.9614, "step": 6804 }, { "epoch": 0.6390821991325233, "grad_norm": 0.46792563796043396, "learning_rate": 3.7767985183827516e-05, "loss": 1.9895, "step": 6810 }, { "epoch": 0.6396452671493802, "grad_norm": 0.4619467854499817, "learning_rate": 3.7751358291019166e-05, "loss": 1.9948, "step": 6816 }, { "epoch": 0.640208335166237, "grad_norm": 0.40062960982322693, "learning_rate": 3.773475333825147e-05, "loss": 1.9609, "step": 6822 }, { "epoch": 0.6407714031830939, "grad_norm": 0.37203484773635864, "learning_rate": 3.7718170277315164e-05, "loss": 2.0028, "step": 6828 }, { "epoch": 0.6413344711999507, "grad_norm": 0.3750365972518921, "learning_rate": 3.770160906014914e-05, "loss": 1.9674, "step": 6834 }, { "epoch": 0.6418975392168076, "grad_norm": 0.39040395617485046, "learning_rate": 3.768506963883988e-05, "loss": 2.0035, "step": 6840 }, { "epoch": 0.6424606072336644, "grad_norm": 0.34079399704933167, "learning_rate": 3.766855196562085e-05, "loss": 2.0266, "step": 6846 }, { "epoch": 0.6430236752505213, "grad_norm": 0.36824363470077515, "learning_rate": 3.7652055992872e-05, "loss": 1.9603, "step": 6852 }, { "epoch": 0.6435867432673781, "grad_norm": 0.4937073886394501, "learning_rate": 3.763558167311907e-05, "loss": 1.9511, "step": 6858 }, { "epoch": 0.644149811284235, "grad_norm": 0.42677006125450134, "learning_rate": 3.761912895903311e-05, "loss": 1.9978, "step": 6864 }, { "epoch": 0.6447128793010918, "grad_norm": 0.3888424038887024, "learning_rate": 3.7602697803429877e-05, "loss": 2.0, "step": 6870 }, { "epoch": 0.6452759473179487, "grad_norm": 0.38250523805618286, "learning_rate": 3.7586288159269235e-05, "loss": 2.005, "step": 6876 }, { "epoch": 0.6458390153348055, "grad_norm": 0.37467533349990845, "learning_rate": 3.756989997965466e-05, "loss": 1.93, "step": 6882 }, { "epoch": 0.6464020833516624, "grad_norm": 0.3594309985637665, "learning_rate": 3.7553533217832604e-05, "loss": 1.9649, "step": 6888 }, { "epoch": 0.6469651513685192, "grad_norm": 0.38040682673454285, "learning_rate": 3.7537187827191995e-05, "loss": 1.9638, "step": 6894 }, { "epoch": 0.6475282193853761, "grad_norm": 0.3573301434516907, "learning_rate": 3.7520863761263605e-05, "loss": 1.9599, "step": 6900 }, { "epoch": 0.6480912874022329, "grad_norm": 0.37074151635169983, "learning_rate": 3.750456097371959e-05, "loss": 2.0169, "step": 6906 }, { "epoch": 0.6486543554190898, "grad_norm": 0.371880441904068, "learning_rate": 3.7488279418372884e-05, "loss": 1.952, "step": 6912 }, { "epoch": 0.6492174234359466, "grad_norm": 0.4159197509288788, "learning_rate": 3.747201904917663e-05, "loss": 1.9634, "step": 6918 }, { "epoch": 0.6497804914528035, "grad_norm": 0.4010482132434845, "learning_rate": 3.745577982022367e-05, "loss": 1.9285, "step": 6924 }, { "epoch": 0.6503435594696603, "grad_norm": 0.36435064673423767, "learning_rate": 3.7439561685745995e-05, "loss": 1.9827, "step": 6930 }, { "epoch": 0.6509066274865172, "grad_norm": 0.38765305280685425, "learning_rate": 3.742336460011418e-05, "loss": 1.9432, "step": 6936 }, { "epoch": 0.651469695503374, "grad_norm": 0.39336585998535156, "learning_rate": 3.74071885178369e-05, "loss": 1.9405, "step": 6942 }, { "epoch": 0.6520327635202309, "grad_norm": 0.3766852617263794, "learning_rate": 3.739103339356031e-05, "loss": 2.0283, "step": 6948 }, { "epoch": 0.6525958315370877, "grad_norm": 0.3888882100582123, "learning_rate": 3.7374899182067575e-05, "loss": 1.9481, "step": 6954 }, { "epoch": 0.6531588995539446, "grad_norm": 0.38194769620895386, "learning_rate": 3.7358785838278324e-05, "loss": 1.9665, "step": 6960 }, { "epoch": 0.6537219675708014, "grad_norm": 0.4055176377296448, "learning_rate": 3.7342693317248105e-05, "loss": 1.967, "step": 6966 }, { "epoch": 0.6542850355876583, "grad_norm": 0.37927111983299255, "learning_rate": 3.73266215741679e-05, "loss": 2.0434, "step": 6972 }, { "epoch": 0.6548481036045151, "grad_norm": 0.38141560554504395, "learning_rate": 3.731057056436351e-05, "loss": 1.9331, "step": 6978 }, { "epoch": 0.655411171621372, "grad_norm": 0.39817070960998535, "learning_rate": 3.729454024329517e-05, "loss": 1.9677, "step": 6984 }, { "epoch": 0.6559742396382288, "grad_norm": 0.3717750608921051, "learning_rate": 3.7278530566556894e-05, "loss": 1.9881, "step": 6990 }, { "epoch": 0.6565373076550857, "grad_norm": 0.390986829996109, "learning_rate": 3.7262541489876053e-05, "loss": 2.0411, "step": 6996 }, { "epoch": 0.6571003756719425, "grad_norm": 0.4088055491447449, "learning_rate": 3.724657296911281e-05, "loss": 2.0283, "step": 7002 }, { "epoch": 0.6576634436887994, "grad_norm": 0.3909282386302948, "learning_rate": 3.7230624960259624e-05, "loss": 1.9363, "step": 7008 }, { "epoch": 0.6582265117056562, "grad_norm": 0.3950212895870209, "learning_rate": 3.7214697419440754e-05, "loss": 2.068, "step": 7014 }, { "epoch": 0.6587895797225131, "grad_norm": 0.3636452257633209, "learning_rate": 3.7198790302911735e-05, "loss": 1.9331, "step": 7020 }, { "epoch": 0.6593526477393699, "grad_norm": 0.41783154010772705, "learning_rate": 3.7182903567058866e-05, "loss": 1.9906, "step": 7026 }, { "epoch": 0.6599157157562268, "grad_norm": 0.37947478890419006, "learning_rate": 3.7167037168398735e-05, "loss": 1.9736, "step": 7032 }, { "epoch": 0.6604787837730836, "grad_norm": 0.410206139087677, "learning_rate": 3.7151191063577696e-05, "loss": 1.9782, "step": 7038 }, { "epoch": 0.6610418517899405, "grad_norm": 0.3837246894836426, "learning_rate": 3.7135365209371374e-05, "loss": 1.9522, "step": 7044 }, { "epoch": 0.6616049198067973, "grad_norm": 0.4738423228263855, "learning_rate": 3.711955956268419e-05, "loss": 1.9555, "step": 7050 }, { "epoch": 0.6621679878236542, "grad_norm": 0.4042760133743286, "learning_rate": 3.7103774080548846e-05, "loss": 1.998, "step": 7056 }, { "epoch": 0.662731055840511, "grad_norm": 0.3969372808933258, "learning_rate": 3.7088008720125826e-05, "loss": 2.0088, "step": 7062 }, { "epoch": 0.6632941238573679, "grad_norm": 0.43925297260284424, "learning_rate": 3.707226343870294e-05, "loss": 1.9616, "step": 7068 }, { "epoch": 0.6638571918742247, "grad_norm": 0.41445526480674744, "learning_rate": 3.705653819369483e-05, "loss": 1.9746, "step": 7074 }, { "epoch": 0.6644202598910816, "grad_norm": 0.368735671043396, "learning_rate": 3.704083294264244e-05, "loss": 2.0154, "step": 7080 }, { "epoch": 0.6649833279079383, "grad_norm": 0.3975413739681244, "learning_rate": 3.702514764321263e-05, "loss": 1.9672, "step": 7086 }, { "epoch": 0.6655463959247953, "grad_norm": 0.37592795491218567, "learning_rate": 3.700948225319758e-05, "loss": 1.9708, "step": 7092 }, { "epoch": 0.666109463941652, "grad_norm": 0.36831220984458923, "learning_rate": 3.69938367305144e-05, "loss": 1.9383, "step": 7098 }, { "epoch": 0.666672531958509, "grad_norm": 0.40441715717315674, "learning_rate": 3.697821103320463e-05, "loss": 1.9647, "step": 7104 }, { "epoch": 0.6672355999753657, "grad_norm": 0.40096864104270935, "learning_rate": 3.696260511943375e-05, "loss": 1.9299, "step": 7110 }, { "epoch": 0.6677986679922226, "grad_norm": 0.38981834053993225, "learning_rate": 3.6947018947490736e-05, "loss": 1.9979, "step": 7116 }, { "epoch": 0.6683617360090794, "grad_norm": 0.36453720927238464, "learning_rate": 3.693145247578758e-05, "loss": 1.9979, "step": 7122 }, { "epoch": 0.6689248040259363, "grad_norm": 0.3832484185695648, "learning_rate": 3.691590566285881e-05, "loss": 1.9168, "step": 7128 }, { "epoch": 0.6694878720427931, "grad_norm": 0.3875962793827057, "learning_rate": 3.690037846736105e-05, "loss": 1.9312, "step": 7134 }, { "epoch": 0.67005094005965, "grad_norm": 0.4123656451702118, "learning_rate": 3.688487084807254e-05, "loss": 1.953, "step": 7140 }, { "epoch": 0.6706140080765068, "grad_norm": 0.35591596364974976, "learning_rate": 3.68693827638927e-05, "loss": 1.9712, "step": 7146 }, { "epoch": 0.6711770760933637, "grad_norm": 0.4561684727668762, "learning_rate": 3.685391417384164e-05, "loss": 1.935, "step": 7152 }, { "epoch": 0.6717401441102205, "grad_norm": 0.4021289348602295, "learning_rate": 3.683846503705973e-05, "loss": 1.9859, "step": 7158 }, { "epoch": 0.6723032121270774, "grad_norm": 0.3604682981967926, "learning_rate": 3.682303531280714e-05, "loss": 1.9316, "step": 7164 }, { "epoch": 0.6728662801439342, "grad_norm": 0.45049095153808594, "learning_rate": 3.6807624960463366e-05, "loss": 1.9953, "step": 7170 }, { "epoch": 0.6734293481607911, "grad_norm": 0.4109976291656494, "learning_rate": 3.6792233939526824e-05, "loss": 1.9334, "step": 7176 }, { "epoch": 0.6739924161776479, "grad_norm": 0.4282071590423584, "learning_rate": 3.677686220961439e-05, "loss": 1.9241, "step": 7182 }, { "epoch": 0.6745554841945048, "grad_norm": 0.4123218357563019, "learning_rate": 3.676150973046091e-05, "loss": 1.9987, "step": 7188 }, { "epoch": 0.6751185522113616, "grad_norm": 0.3913956582546234, "learning_rate": 3.6746176461918804e-05, "loss": 1.9635, "step": 7194 }, { "epoch": 0.6756816202282185, "grad_norm": 0.40118473768234253, "learning_rate": 3.673086236395764e-05, "loss": 2.0023, "step": 7200 }, { "epoch": 0.6756816202282185, "eval_accuracy": 0.5833128175068386, "eval_loss": 2.0078184604644775, "eval_runtime": 88.1841, "eval_samples_per_second": 4.536, "eval_steps_per_second": 1.134, "step": 7200 }, { "epoch": 0.6762446882450753, "grad_norm": 0.40749770402908325, "learning_rate": 3.671556739666365e-05, "loss": 1.9503, "step": 7206 }, { "epoch": 0.6768077562619322, "grad_norm": 0.4109352231025696, "learning_rate": 3.67002915202393e-05, "loss": 1.9975, "step": 7212 }, { "epoch": 0.677370824278789, "grad_norm": 0.39660871028900146, "learning_rate": 3.668503469500289e-05, "loss": 1.9765, "step": 7218 }, { "epoch": 0.6779338922956459, "grad_norm": 0.3976249694824219, "learning_rate": 3.6669796881388083e-05, "loss": 1.9904, "step": 7224 }, { "epoch": 0.6784969603125027, "grad_norm": 0.4379390478134155, "learning_rate": 3.6654578039943485e-05, "loss": 1.9539, "step": 7230 }, { "epoch": 0.6790600283293596, "grad_norm": 0.3851417899131775, "learning_rate": 3.663937813133223e-05, "loss": 1.9822, "step": 7236 }, { "epoch": 0.6796230963462164, "grad_norm": 0.35688313841819763, "learning_rate": 3.662419711633154e-05, "loss": 2.0002, "step": 7242 }, { "epoch": 0.6801861643630733, "grad_norm": 0.44306492805480957, "learning_rate": 3.6609034955832304e-05, "loss": 1.9705, "step": 7248 }, { "epoch": 0.6807492323799301, "grad_norm": 0.37084150314331055, "learning_rate": 3.659389161083864e-05, "loss": 1.9871, "step": 7254 }, { "epoch": 0.681312300396787, "grad_norm": 0.38667091727256775, "learning_rate": 3.6578767042467514e-05, "loss": 1.9596, "step": 7260 }, { "epoch": 0.6818753684136438, "grad_norm": 0.38415420055389404, "learning_rate": 3.6563661211948274e-05, "loss": 1.9708, "step": 7266 }, { "epoch": 0.6824384364305007, "grad_norm": 0.4058648347854614, "learning_rate": 3.6548574080622256e-05, "loss": 2.0097, "step": 7272 }, { "epoch": 0.6830015044473575, "grad_norm": 0.4687325358390808, "learning_rate": 3.6533505609942366e-05, "loss": 1.9409, "step": 7278 }, { "epoch": 0.6835645724642144, "grad_norm": 0.4553219676017761, "learning_rate": 3.651845576147266e-05, "loss": 2.0254, "step": 7284 }, { "epoch": 0.6841276404810712, "grad_norm": 0.5270876884460449, "learning_rate": 3.6503424496887964e-05, "loss": 1.9647, "step": 7290 }, { "epoch": 0.6846907084979281, "grad_norm": 0.4004805088043213, "learning_rate": 3.648841177797339e-05, "loss": 2.0211, "step": 7296 }, { "epoch": 0.6852537765147849, "grad_norm": 0.37324100732803345, "learning_rate": 3.647341756662403e-05, "loss": 1.9638, "step": 7302 }, { "epoch": 0.6858168445316418, "grad_norm": 0.412540078163147, "learning_rate": 3.645844182484444e-05, "loss": 1.9286, "step": 7308 }, { "epoch": 0.6863799125484986, "grad_norm": 0.3937520980834961, "learning_rate": 3.644348451474834e-05, "loss": 2.0108, "step": 7314 }, { "epoch": 0.6869429805653555, "grad_norm": 0.4147574007511139, "learning_rate": 3.642854559855814e-05, "loss": 2.0032, "step": 7320 }, { "epoch": 0.6875060485822123, "grad_norm": 0.3924231231212616, "learning_rate": 3.641362503860456e-05, "loss": 1.9339, "step": 7326 }, { "epoch": 0.6880691165990692, "grad_norm": 0.3666737377643585, "learning_rate": 3.639872279732622e-05, "loss": 1.9919, "step": 7332 }, { "epoch": 0.688632184615926, "grad_norm": 0.3755744397640228, "learning_rate": 3.63838388372693e-05, "loss": 2.0152, "step": 7338 }, { "epoch": 0.6891952526327829, "grad_norm": 0.37046104669570923, "learning_rate": 3.6368973121087056e-05, "loss": 1.998, "step": 7344 }, { "epoch": 0.6897583206496397, "grad_norm": 0.37386026978492737, "learning_rate": 3.63541256115395e-05, "loss": 1.959, "step": 7350 }, { "epoch": 0.6903213886664966, "grad_norm": 0.383626252412796, "learning_rate": 3.633929627149295e-05, "loss": 1.9983, "step": 7356 }, { "epoch": 0.6908844566833534, "grad_norm": 0.37021809816360474, "learning_rate": 3.632448506391969e-05, "loss": 1.9967, "step": 7362 }, { "epoch": 0.6914475247002103, "grad_norm": 0.3737945854663849, "learning_rate": 3.630969195189755e-05, "loss": 1.9676, "step": 7368 }, { "epoch": 0.6920105927170671, "grad_norm": 0.4062651991844177, "learning_rate": 3.629491689860955e-05, "loss": 1.9816, "step": 7374 }, { "epoch": 0.692573660733924, "grad_norm": 0.39062774181365967, "learning_rate": 3.628015986734347e-05, "loss": 1.9946, "step": 7380 }, { "epoch": 0.6931367287507808, "grad_norm": 0.3759336769580841, "learning_rate": 3.626542082149151e-05, "loss": 1.9686, "step": 7386 }, { "epoch": 0.6936997967676377, "grad_norm": 0.4347020089626312, "learning_rate": 3.625069972454988e-05, "loss": 1.942, "step": 7392 }, { "epoch": 0.6942628647844945, "grad_norm": 0.38715800642967224, "learning_rate": 3.623599654011843e-05, "loss": 2.0024, "step": 7398 }, { "epoch": 0.6948259328013514, "grad_norm": 0.4498685300350189, "learning_rate": 3.6221311231900275e-05, "loss": 1.9541, "step": 7404 }, { "epoch": 0.6953890008182082, "grad_norm": 0.4342799782752991, "learning_rate": 3.620664376370142e-05, "loss": 1.9671, "step": 7410 }, { "epoch": 0.6959520688350651, "grad_norm": 0.3925463855266571, "learning_rate": 3.6191994099430384e-05, "loss": 2.0179, "step": 7416 }, { "epoch": 0.6965151368519219, "grad_norm": 0.36476460099220276, "learning_rate": 3.6177362203097826e-05, "loss": 2.0028, "step": 7422 }, { "epoch": 0.6970782048687788, "grad_norm": 0.3994980752468109, "learning_rate": 3.6162748038816155e-05, "loss": 1.9548, "step": 7428 }, { "epoch": 0.6976412728856356, "grad_norm": 0.37360233068466187, "learning_rate": 3.61481515707992e-05, "loss": 1.9497, "step": 7434 }, { "epoch": 0.6982043409024925, "grad_norm": 0.40021395683288574, "learning_rate": 3.61335727633618e-05, "loss": 1.977, "step": 7440 }, { "epoch": 0.6987674089193493, "grad_norm": 0.39066028594970703, "learning_rate": 3.611901158091947e-05, "loss": 1.9958, "step": 7446 }, { "epoch": 0.6993304769362062, "grad_norm": 0.3877423405647278, "learning_rate": 3.610446798798802e-05, "loss": 1.9877, "step": 7452 }, { "epoch": 0.699893544953063, "grad_norm": 0.3660953640937805, "learning_rate": 3.608994194918322e-05, "loss": 1.9477, "step": 7458 }, { "epoch": 0.7004566129699199, "grad_norm": 0.41111746430397034, "learning_rate": 3.607543342922035e-05, "loss": 1.9676, "step": 7464 }, { "epoch": 0.7010196809867767, "grad_norm": 0.376044362783432, "learning_rate": 3.606094239291398e-05, "loss": 1.9895, "step": 7470 }, { "epoch": 0.7015827490036336, "grad_norm": 0.3439435064792633, "learning_rate": 3.6046468805177465e-05, "loss": 2.0259, "step": 7476 }, { "epoch": 0.7021458170204904, "grad_norm": 0.35587993264198303, "learning_rate": 3.603201263102272e-05, "loss": 2.0034, "step": 7482 }, { "epoch": 0.7027088850373473, "grad_norm": 0.4326861500740051, "learning_rate": 3.6017573835559776e-05, "loss": 1.9269, "step": 7488 }, { "epoch": 0.7032719530542041, "grad_norm": 0.426268070936203, "learning_rate": 3.6003152383996434e-05, "loss": 1.9762, "step": 7494 }, { "epoch": 0.703835021071061, "grad_norm": 0.42990854382514954, "learning_rate": 3.598874824163797e-05, "loss": 2.0513, "step": 7500 }, { "epoch": 0.7043980890879178, "grad_norm": 0.3862810730934143, "learning_rate": 3.5974361373886725e-05, "loss": 1.9533, "step": 7506 }, { "epoch": 0.7049611571047747, "grad_norm": 0.4440693259239197, "learning_rate": 3.595999174624177e-05, "loss": 2.0278, "step": 7512 }, { "epoch": 0.7055242251216315, "grad_norm": 0.3862363398075104, "learning_rate": 3.59456393242986e-05, "loss": 1.9687, "step": 7518 }, { "epoch": 0.7060872931384884, "grad_norm": 0.3690185844898224, "learning_rate": 3.593130407374872e-05, "loss": 1.9872, "step": 7524 }, { "epoch": 0.7066503611553452, "grad_norm": 0.4289971590042114, "learning_rate": 3.591698596037936e-05, "loss": 1.9789, "step": 7530 }, { "epoch": 0.7072134291722021, "grad_norm": 0.4030942916870117, "learning_rate": 3.590268495007307e-05, "loss": 1.9326, "step": 7536 }, { "epoch": 0.7077764971890589, "grad_norm": 0.3630010783672333, "learning_rate": 3.5888401008807485e-05, "loss": 1.9598, "step": 7542 }, { "epoch": 0.7083395652059158, "grad_norm": 0.36757519841194153, "learning_rate": 3.587413410265483e-05, "loss": 1.9509, "step": 7548 }, { "epoch": 0.7089026332227726, "grad_norm": 0.4057968854904175, "learning_rate": 3.585988419778174e-05, "loss": 1.9252, "step": 7554 }, { "epoch": 0.7094657012396294, "grad_norm": 0.3869026005268097, "learning_rate": 3.5845651260448795e-05, "loss": 1.9553, "step": 7560 }, { "epoch": 0.7100287692564863, "grad_norm": 0.5217390656471252, "learning_rate": 3.583143525701028e-05, "loss": 1.9199, "step": 7566 }, { "epoch": 0.7105918372733431, "grad_norm": 0.38928520679473877, "learning_rate": 3.58172361539138e-05, "loss": 1.9579, "step": 7572 }, { "epoch": 0.7111549052902, "grad_norm": 0.42109161615371704, "learning_rate": 3.580305391769994e-05, "loss": 2.0431, "step": 7578 }, { "epoch": 0.7117179733070568, "grad_norm": 0.3834002912044525, "learning_rate": 3.5788888515001974e-05, "loss": 1.9226, "step": 7584 }, { "epoch": 0.7122810413239137, "grad_norm": 0.39533498883247375, "learning_rate": 3.5774739912545506e-05, "loss": 1.987, "step": 7590 }, { "epoch": 0.7128441093407705, "grad_norm": 0.398948073387146, "learning_rate": 3.576060807714815e-05, "loss": 1.977, "step": 7596 }, { "epoch": 0.7132194880186751, "eval_accuracy": 0.5838159437280187, "eval_loss": 2.005033254623413, "eval_runtime": 88.0744, "eval_samples_per_second": 4.542, "eval_steps_per_second": 1.135, "step": 7600 }, { "epoch": 0.7134071773576274, "grad_norm": 0.3781101107597351, "learning_rate": 3.574649297571919e-05, "loss": 1.9426, "step": 7602 }, { "epoch": 0.7139702453744842, "grad_norm": 0.4426119923591614, "learning_rate": 3.5732394575259276e-05, "loss": 1.9437, "step": 7608 }, { "epoch": 0.7145333133913411, "grad_norm": 0.3957236409187317, "learning_rate": 3.571831284286012e-05, "loss": 1.9446, "step": 7614 }, { "epoch": 0.7150963814081979, "grad_norm": 0.45682474970817566, "learning_rate": 3.570424774570407e-05, "loss": 1.9791, "step": 7620 }, { "epoch": 0.7156594494250548, "grad_norm": 0.4267377555370331, "learning_rate": 3.569019925106394e-05, "loss": 1.9102, "step": 7626 }, { "epoch": 0.7162225174419116, "grad_norm": 0.36490267515182495, "learning_rate": 3.567616732630258e-05, "loss": 2.0466, "step": 7632 }, { "epoch": 0.7167855854587685, "grad_norm": 0.3970522880554199, "learning_rate": 3.566215193887258e-05, "loss": 2.0039, "step": 7638 }, { "epoch": 0.7173486534756253, "grad_norm": 0.38292717933654785, "learning_rate": 3.564815305631599e-05, "loss": 1.9798, "step": 7644 }, { "epoch": 0.7179117214924822, "grad_norm": 0.39947909116744995, "learning_rate": 3.563417064626394e-05, "loss": 1.9924, "step": 7650 }, { "epoch": 0.718474789509339, "grad_norm": 0.41775065660476685, "learning_rate": 3.562020467643642e-05, "loss": 2.0031, "step": 7656 }, { "epoch": 0.7190378575261959, "grad_norm": 0.39935892820358276, "learning_rate": 3.5606255114641846e-05, "loss": 1.9616, "step": 7662 }, { "epoch": 0.7196009255430527, "grad_norm": 0.44099992513656616, "learning_rate": 3.559232192877686e-05, "loss": 1.9628, "step": 7668 }, { "epoch": 0.7201639935599096, "grad_norm": 0.3703477680683136, "learning_rate": 3.557840508682596e-05, "loss": 1.9234, "step": 7674 }, { "epoch": 0.7207270615767664, "grad_norm": 0.38969701528549194, "learning_rate": 3.556450455686118e-05, "loss": 1.9714, "step": 7680 }, { "epoch": 0.7212901295936233, "grad_norm": 0.3755648732185364, "learning_rate": 3.5550620307041853e-05, "loss": 1.9639, "step": 7686 }, { "epoch": 0.7218531976104801, "grad_norm": 0.4677676856517792, "learning_rate": 3.553675230561422e-05, "loss": 1.9763, "step": 7692 }, { "epoch": 0.722416265627337, "grad_norm": 0.42161738872528076, "learning_rate": 3.5522900520911166e-05, "loss": 2.0456, "step": 7698 }, { "epoch": 0.7229793336441938, "grad_norm": 0.3988853096961975, "learning_rate": 3.550906492135191e-05, "loss": 1.9953, "step": 7704 }, { "epoch": 0.7235424016610507, "grad_norm": 0.41087397933006287, "learning_rate": 3.549524547544175e-05, "loss": 1.9448, "step": 7710 }, { "epoch": 0.7241054696779075, "grad_norm": 0.4454058110713959, "learning_rate": 3.548144215177165e-05, "loss": 2.04, "step": 7716 }, { "epoch": 0.7246685376947644, "grad_norm": 0.3935624659061432, "learning_rate": 3.546765491901805e-05, "loss": 1.9711, "step": 7722 }, { "epoch": 0.7252316057116212, "grad_norm": 0.40041229128837585, "learning_rate": 3.545388374594252e-05, "loss": 1.9453, "step": 7728 }, { "epoch": 0.7257946737284781, "grad_norm": 0.4587880074977875, "learning_rate": 3.5440128601391464e-05, "loss": 1.9568, "step": 7734 }, { "epoch": 0.7263577417453349, "grad_norm": 0.35514307022094727, "learning_rate": 3.542638945429582e-05, "loss": 1.9439, "step": 7740 }, { "epoch": 0.7269208097621918, "grad_norm": 0.37032076716423035, "learning_rate": 3.5412666273670776e-05, "loss": 1.9258, "step": 7746 }, { "epoch": 0.7274838777790485, "grad_norm": 0.36563533544540405, "learning_rate": 3.539895902861548e-05, "loss": 1.9732, "step": 7752 }, { "epoch": 0.7280469457959055, "grad_norm": 0.3970559537410736, "learning_rate": 3.538526768831275e-05, "loss": 1.9736, "step": 7758 }, { "epoch": 0.7286100138127622, "grad_norm": 0.38019853830337524, "learning_rate": 3.537159222202874e-05, "loss": 1.9939, "step": 7764 }, { "epoch": 0.7291730818296192, "grad_norm": 0.3873913288116455, "learning_rate": 3.5357932599112716e-05, "loss": 1.9595, "step": 7770 }, { "epoch": 0.729736149846476, "grad_norm": 0.39359939098358154, "learning_rate": 3.534428878899673e-05, "loss": 1.9195, "step": 7776 }, { "epoch": 0.7302992178633329, "grad_norm": 0.3982885181903839, "learning_rate": 3.5330660761195334e-05, "loss": 1.9605, "step": 7782 }, { "epoch": 0.7308622858801896, "grad_norm": 0.3919159770011902, "learning_rate": 3.53170484853053e-05, "loss": 2.0307, "step": 7788 }, { "epoch": 0.7314253538970465, "grad_norm": 0.4182370901107788, "learning_rate": 3.5303451931005325e-05, "loss": 2.0218, "step": 7794 }, { "epoch": 0.7319884219139033, "grad_norm": 0.3828820586204529, "learning_rate": 3.528987106805577e-05, "loss": 1.9391, "step": 7800 }, { "epoch": 0.7325514899307602, "grad_norm": 0.3585250973701477, "learning_rate": 3.527630586629836e-05, "loss": 1.8921, "step": 7806 }, { "epoch": 0.733114557947617, "grad_norm": 0.44462716579437256, "learning_rate": 3.5262756295655905e-05, "loss": 1.9938, "step": 7812 }, { "epoch": 0.733677625964474, "grad_norm": 0.48742231726646423, "learning_rate": 3.524922232613201e-05, "loss": 1.9545, "step": 7818 }, { "epoch": 0.7342406939813307, "grad_norm": 0.41367968916893005, "learning_rate": 3.523570392781083e-05, "loss": 1.9276, "step": 7824 }, { "epoch": 0.7348037619981876, "grad_norm": 0.3732398450374603, "learning_rate": 3.5222201070856745e-05, "loss": 1.9556, "step": 7830 }, { "epoch": 0.7353668300150444, "grad_norm": 0.36246633529663086, "learning_rate": 3.520871372551413e-05, "loss": 1.9308, "step": 7836 }, { "epoch": 0.7359298980319013, "grad_norm": 0.3910323977470398, "learning_rate": 3.519524186210703e-05, "loss": 1.9743, "step": 7842 }, { "epoch": 0.7364929660487581, "grad_norm": 0.44693106412887573, "learning_rate": 3.518178545103895e-05, "loss": 1.9473, "step": 7848 }, { "epoch": 0.737056034065615, "grad_norm": 0.3921654224395752, "learning_rate": 3.516834446279251e-05, "loss": 1.9945, "step": 7854 }, { "epoch": 0.7376191020824718, "grad_norm": 0.3527200520038605, "learning_rate": 3.515491886792924e-05, "loss": 1.9832, "step": 7860 }, { "epoch": 0.7381821700993287, "grad_norm": 0.3801862299442291, "learning_rate": 3.514150863708924e-05, "loss": 1.9771, "step": 7866 }, { "epoch": 0.7387452381161855, "grad_norm": 0.36322107911109924, "learning_rate": 3.512811374099099e-05, "loss": 1.9112, "step": 7872 }, { "epoch": 0.7393083061330424, "grad_norm": 0.3639414310455322, "learning_rate": 3.5114734150431e-05, "loss": 1.9258, "step": 7878 }, { "epoch": 0.7398713741498992, "grad_norm": 0.3855288624763489, "learning_rate": 3.510136983628362e-05, "loss": 1.9877, "step": 7884 }, { "epoch": 0.7404344421667561, "grad_norm": 0.4075772762298584, "learning_rate": 3.50880207695007e-05, "loss": 1.9926, "step": 7890 }, { "epoch": 0.7409975101836129, "grad_norm": 0.3856334686279297, "learning_rate": 3.507468692111138e-05, "loss": 1.9459, "step": 7896 }, { "epoch": 0.7415605782004698, "grad_norm": 0.401824027299881, "learning_rate": 3.5061368262221805e-05, "loss": 1.9852, "step": 7902 }, { "epoch": 0.7421236462173266, "grad_norm": 0.3914256989955902, "learning_rate": 3.504806476401485e-05, "loss": 1.9644, "step": 7908 }, { "epoch": 0.7426867142341835, "grad_norm": 0.48440101742744446, "learning_rate": 3.50347763977499e-05, "loss": 2.0059, "step": 7914 }, { "epoch": 0.7432497822510403, "grad_norm": 0.4204399883747101, "learning_rate": 3.5021503134762534e-05, "loss": 1.973, "step": 7920 }, { "epoch": 0.7438128502678972, "grad_norm": 0.3703163266181946, "learning_rate": 3.500824494646429e-05, "loss": 2.0106, "step": 7926 }, { "epoch": 0.744375918284754, "grad_norm": 0.38786181807518005, "learning_rate": 3.4995001804342435e-05, "loss": 1.9535, "step": 7932 }, { "epoch": 0.7449389863016109, "grad_norm": 0.3797600269317627, "learning_rate": 3.498177367995966e-05, "loss": 1.9336, "step": 7938 }, { "epoch": 0.7455020543184677, "grad_norm": 0.41417446732521057, "learning_rate": 3.4968560544953846e-05, "loss": 1.97, "step": 7944 }, { "epoch": 0.7460651223353246, "grad_norm": 0.40871796011924744, "learning_rate": 3.495536237103781e-05, "loss": 2.0133, "step": 7950 }, { "epoch": 0.7466281903521814, "grad_norm": 0.36400946974754333, "learning_rate": 3.494217912999905e-05, "loss": 1.9277, "step": 7956 }, { "epoch": 0.7471912583690383, "grad_norm": 0.4025874137878418, "learning_rate": 3.492901079369949e-05, "loss": 2.0018, "step": 7962 }, { "epoch": 0.7477543263858951, "grad_norm": 0.38932889699935913, "learning_rate": 3.491585733407522e-05, "loss": 2.0048, "step": 7968 }, { "epoch": 0.748317394402752, "grad_norm": 0.4791739881038666, "learning_rate": 3.4902718723136255e-05, "loss": 1.9666, "step": 7974 }, { "epoch": 0.7488804624196088, "grad_norm": 0.46065428853034973, "learning_rate": 3.488959493296628e-05, "loss": 1.9143, "step": 7980 }, { "epoch": 0.7494435304364657, "grad_norm": 0.4609152674674988, "learning_rate": 3.487648593572241e-05, "loss": 1.9984, "step": 7986 }, { "epoch": 0.7500065984533225, "grad_norm": 0.38375088572502136, "learning_rate": 3.4863391703634936e-05, "loss": 1.9601, "step": 7992 }, { "epoch": 0.7505696664701794, "grad_norm": 0.39126455783843994, "learning_rate": 3.485031220900706e-05, "loss": 1.9846, "step": 7998 }, { "epoch": 0.7507573558091317, "eval_accuracy": 0.5846004298554123, "eval_loss": 2.000138759613037, "eval_runtime": 88.0379, "eval_samples_per_second": 4.543, "eval_steps_per_second": 1.136, "step": 8000 }, { "epoch": 0.7511327344870362, "grad_norm": 0.3836112916469574, "learning_rate": 3.4837247424214674e-05, "loss": 1.9339, "step": 8004 }, { "epoch": 0.7516958025038931, "grad_norm": 0.394758939743042, "learning_rate": 3.482419732170612e-05, "loss": 1.9277, "step": 8010 }, { "epoch": 0.7522588705207499, "grad_norm": 0.3572465777397156, "learning_rate": 3.481116187400191e-05, "loss": 1.9228, "step": 8016 }, { "epoch": 0.7528219385376068, "grad_norm": 0.38521361351013184, "learning_rate": 3.4798141053694517e-05, "loss": 1.9756, "step": 8022 }, { "epoch": 0.7533850065544636, "grad_norm": 0.3838768005371094, "learning_rate": 3.4785134833448124e-05, "loss": 1.9735, "step": 8028 }, { "epoch": 0.7539480745713205, "grad_norm": 0.3647937476634979, "learning_rate": 3.477214318599837e-05, "loss": 1.971, "step": 8034 }, { "epoch": 0.7545111425881773, "grad_norm": 0.382097065448761, "learning_rate": 3.4759166084152125e-05, "loss": 1.9631, "step": 8040 }, { "epoch": 0.7550742106050342, "grad_norm": 0.4123421013355255, "learning_rate": 3.4746203500787255e-05, "loss": 1.972, "step": 8046 }, { "epoch": 0.755637278621891, "grad_norm": 0.3972260653972626, "learning_rate": 3.4733255408852373e-05, "loss": 1.9445, "step": 8052 }, { "epoch": 0.7562003466387479, "grad_norm": 0.3801904618740082, "learning_rate": 3.4720321781366573e-05, "loss": 1.9743, "step": 8058 }, { "epoch": 0.7567634146556047, "grad_norm": 0.3866405487060547, "learning_rate": 3.470740259141927e-05, "loss": 2.0207, "step": 8064 }, { "epoch": 0.7573264826724616, "grad_norm": 0.39437583088874817, "learning_rate": 3.46944978121699e-05, "loss": 1.8884, "step": 8070 }, { "epoch": 0.7578895506893184, "grad_norm": 0.3957318365573883, "learning_rate": 3.4681607416847684e-05, "loss": 1.9605, "step": 8076 }, { "epoch": 0.7584526187061753, "grad_norm": 0.39457759261131287, "learning_rate": 3.466873137875147e-05, "loss": 1.9937, "step": 8082 }, { "epoch": 0.7590156867230321, "grad_norm": 0.3554055988788605, "learning_rate": 3.465586967124939e-05, "loss": 1.9414, "step": 8088 }, { "epoch": 0.759578754739889, "grad_norm": 0.46388471126556396, "learning_rate": 3.464302226777871e-05, "loss": 1.954, "step": 8094 }, { "epoch": 0.7601418227567458, "grad_norm": 0.4880917966365814, "learning_rate": 3.463018914184559e-05, "loss": 1.9479, "step": 8100 }, { "epoch": 0.7607048907736027, "grad_norm": 0.40936535596847534, "learning_rate": 3.461737026702481e-05, "loss": 2.0034, "step": 8106 }, { "epoch": 0.7612679587904595, "grad_norm": 0.40068137645721436, "learning_rate": 3.460456561695959e-05, "loss": 1.9724, "step": 8112 }, { "epoch": 0.7618310268073164, "grad_norm": 0.37068653106689453, "learning_rate": 3.459177516536134e-05, "loss": 1.9972, "step": 8118 }, { "epoch": 0.7623940948241732, "grad_norm": 0.3963022828102112, "learning_rate": 3.457899888600942e-05, "loss": 1.9983, "step": 8124 }, { "epoch": 0.7629571628410301, "grad_norm": 0.35001254081726074, "learning_rate": 3.456623675275098e-05, "loss": 1.9813, "step": 8130 }, { "epoch": 0.7635202308578869, "grad_norm": 0.3785606026649475, "learning_rate": 3.4553488739500625e-05, "loss": 1.977, "step": 8136 }, { "epoch": 0.7640832988747438, "grad_norm": 0.38054734468460083, "learning_rate": 3.454075482024029e-05, "loss": 2.0104, "step": 8142 }, { "epoch": 0.7646463668916006, "grad_norm": 0.3801315128803253, "learning_rate": 3.452803496901897e-05, "loss": 1.9686, "step": 8148 }, { "epoch": 0.7652094349084575, "grad_norm": 0.3760085701942444, "learning_rate": 3.4515329159952506e-05, "loss": 2.0074, "step": 8154 }, { "epoch": 0.7657725029253143, "grad_norm": 0.3910846412181854, "learning_rate": 3.450263736722336e-05, "loss": 1.9772, "step": 8160 }, { "epoch": 0.7663355709421712, "grad_norm": 0.4067443907260895, "learning_rate": 3.448995956508041e-05, "loss": 1.9353, "step": 8166 }, { "epoch": 0.766898638959028, "grad_norm": 0.4505312144756317, "learning_rate": 3.447729572783872e-05, "loss": 1.9978, "step": 8172 }, { "epoch": 0.7674617069758849, "grad_norm": 0.4159944951534271, "learning_rate": 3.446464582987931e-05, "loss": 1.9922, "step": 8178 }, { "epoch": 0.7680247749927417, "grad_norm": 0.4128129780292511, "learning_rate": 3.445200984564895e-05, "loss": 1.9449, "step": 8184 }, { "epoch": 0.7685878430095986, "grad_norm": 0.4210661053657532, "learning_rate": 3.443938774965996e-05, "loss": 1.9939, "step": 8190 }, { "epoch": 0.7691509110264554, "grad_norm": 0.4222375154495239, "learning_rate": 3.442677951648997e-05, "loss": 2.0213, "step": 8196 }, { "epoch": 0.7697139790433123, "grad_norm": 0.4665868878364563, "learning_rate": 3.44141851207817e-05, "loss": 1.9355, "step": 8202 }, { "epoch": 0.7702770470601691, "grad_norm": 0.43431010842323303, "learning_rate": 3.4401604537242767e-05, "loss": 2.0164, "step": 8208 }, { "epoch": 0.770840115077026, "grad_norm": 0.4578784108161926, "learning_rate": 3.438903774064547e-05, "loss": 1.9716, "step": 8214 }, { "epoch": 0.7714031830938828, "grad_norm": 0.4845302700996399, "learning_rate": 3.437648470582656e-05, "loss": 1.975, "step": 8220 }, { "epoch": 0.7719662511107397, "grad_norm": 0.36762380599975586, "learning_rate": 3.436394540768703e-05, "loss": 1.9434, "step": 8226 }, { "epoch": 0.7725293191275965, "grad_norm": 0.4544452726840973, "learning_rate": 3.435141982119194e-05, "loss": 2.012, "step": 8232 }, { "epoch": 0.7730923871444534, "grad_norm": 0.38828518986701965, "learning_rate": 3.4338907921370154e-05, "loss": 1.9554, "step": 8238 }, { "epoch": 0.7736554551613102, "grad_norm": 0.35577309131622314, "learning_rate": 3.432640968331417e-05, "loss": 1.9228, "step": 8244 }, { "epoch": 0.7742185231781671, "grad_norm": 0.3738974928855896, "learning_rate": 3.43139250821799e-05, "loss": 1.9829, "step": 8250 }, { "epoch": 0.7747815911950239, "grad_norm": 0.3606838285923004, "learning_rate": 3.4301454093186436e-05, "loss": 2.0056, "step": 8256 }, { "epoch": 0.7753446592118808, "grad_norm": 0.3882864713668823, "learning_rate": 3.428899669161591e-05, "loss": 1.9526, "step": 8262 }, { "epoch": 0.7759077272287376, "grad_norm": 0.3913410007953644, "learning_rate": 3.427655285281323e-05, "loss": 1.9368, "step": 8268 }, { "epoch": 0.7764707952455945, "grad_norm": 0.3903445303440094, "learning_rate": 3.426412255218586e-05, "loss": 1.9475, "step": 8274 }, { "epoch": 0.7770338632624513, "grad_norm": 0.43475666642189026, "learning_rate": 3.425170576520368e-05, "loss": 1.9869, "step": 8280 }, { "epoch": 0.7775969312793082, "grad_norm": 0.39863237738609314, "learning_rate": 3.423930246739876e-05, "loss": 1.9532, "step": 8286 }, { "epoch": 0.778159999296165, "grad_norm": 0.41893458366394043, "learning_rate": 3.422691263436513e-05, "loss": 2.0201, "step": 8292 }, { "epoch": 0.7787230673130219, "grad_norm": 0.4425799250602722, "learning_rate": 3.421453624175859e-05, "loss": 1.9337, "step": 8298 }, { "epoch": 0.7792861353298787, "grad_norm": 0.4273892343044281, "learning_rate": 3.420217326529652e-05, "loss": 1.9513, "step": 8304 }, { "epoch": 0.7798492033467356, "grad_norm": 0.38947421312332153, "learning_rate": 3.4189823680757695e-05, "loss": 1.9206, "step": 8310 }, { "epoch": 0.7804122713635924, "grad_norm": 0.37935513257980347, "learning_rate": 3.4177487463982037e-05, "loss": 1.9847, "step": 8316 }, { "epoch": 0.7809753393804493, "grad_norm": 0.3708648681640625, "learning_rate": 3.416516459087047e-05, "loss": 1.9774, "step": 8322 }, { "epoch": 0.7815384073973061, "grad_norm": 0.35795021057128906, "learning_rate": 3.415285503738466e-05, "loss": 1.9493, "step": 8328 }, { "epoch": 0.782101475414163, "grad_norm": 0.38157418370246887, "learning_rate": 3.4140558779546907e-05, "loss": 1.9625, "step": 8334 }, { "epoch": 0.7826645434310198, "grad_norm": 0.3852948844432831, "learning_rate": 3.4128275793439866e-05, "loss": 1.9144, "step": 8340 }, { "epoch": 0.7832276114478767, "grad_norm": 0.3767140209674835, "learning_rate": 3.411600605520638e-05, "loss": 1.9683, "step": 8346 }, { "epoch": 0.7837906794647335, "grad_norm": 0.4172358512878418, "learning_rate": 3.4103749541049306e-05, "loss": 1.9564, "step": 8352 }, { "epoch": 0.7843537474815904, "grad_norm": 0.40248605608940125, "learning_rate": 3.409150622723129e-05, "loss": 1.9872, "step": 8358 }, { "epoch": 0.7849168154984472, "grad_norm": 0.38719263672828674, "learning_rate": 3.4079276090074587e-05, "loss": 1.9599, "step": 8364 }, { "epoch": 0.7854798835153041, "grad_norm": 0.36014750599861145, "learning_rate": 3.406705910596088e-05, "loss": 2.0145, "step": 8370 }, { "epoch": 0.7860429515321609, "grad_norm": 0.3820033669471741, "learning_rate": 3.4054855251331076e-05, "loss": 1.9608, "step": 8376 }, { "epoch": 0.7866060195490178, "grad_norm": 0.36340025067329407, "learning_rate": 3.404266450268511e-05, "loss": 1.9603, "step": 8382 }, { "epoch": 0.7871690875658746, "grad_norm": 0.42531833052635193, "learning_rate": 3.4030486836581754e-05, "loss": 1.9491, "step": 8388 }, { "epoch": 0.7877321555827314, "grad_norm": 0.3907950520515442, "learning_rate": 3.401832222963846e-05, "loss": 2.0175, "step": 8394 }, { "epoch": 0.7882952235995883, "grad_norm": 0.40734371542930603, "learning_rate": 3.4006170658531116e-05, "loss": 1.9894, "step": 8400 }, { "epoch": 0.7882952235995883, "eval_accuracy": 0.5850400547088707, "eval_loss": 1.9982987642288208, "eval_runtime": 88.1101, "eval_samples_per_second": 4.54, "eval_steps_per_second": 1.135, "step": 8400 }, { "epoch": 0.788858291616445, "grad_norm": 0.37503156065940857, "learning_rate": 3.399403209999393e-05, "loss": 1.9336, "step": 8406 }, { "epoch": 0.789421359633302, "grad_norm": 0.3932798504829407, "learning_rate": 3.398190653081915e-05, "loss": 1.9087, "step": 8412 }, { "epoch": 0.7899844276501587, "grad_norm": 0.37500983476638794, "learning_rate": 3.396979392785698e-05, "loss": 1.9573, "step": 8418 }, { "epoch": 0.7905474956670157, "grad_norm": 0.3650251030921936, "learning_rate": 3.395769426801531e-05, "loss": 1.9585, "step": 8424 }, { "epoch": 0.7911105636838724, "grad_norm": 0.3855944275856018, "learning_rate": 3.394560752825958e-05, "loss": 1.9868, "step": 8430 }, { "epoch": 0.7916736317007294, "grad_norm": 0.39262717962265015, "learning_rate": 3.393353368561257e-05, "loss": 1.9684, "step": 8436 }, { "epoch": 0.7922366997175861, "grad_norm": 0.3787207007408142, "learning_rate": 3.392147271715425e-05, "loss": 1.9406, "step": 8442 }, { "epoch": 0.792799767734443, "grad_norm": 0.4026426374912262, "learning_rate": 3.390942460002153e-05, "loss": 1.9688, "step": 8448 }, { "epoch": 0.7933628357512998, "grad_norm": 0.3971922695636749, "learning_rate": 3.389738931140818e-05, "loss": 1.9658, "step": 8454 }, { "epoch": 0.7939259037681567, "grad_norm": 0.38063785433769226, "learning_rate": 3.388536682856455e-05, "loss": 1.9589, "step": 8460 }, { "epoch": 0.7944889717850135, "grad_norm": 0.37502774596214294, "learning_rate": 3.387335712879745e-05, "loss": 1.9652, "step": 8466 }, { "epoch": 0.7950520398018704, "grad_norm": 0.41701793670654297, "learning_rate": 3.386136018946994e-05, "loss": 1.9708, "step": 8472 }, { "epoch": 0.7956151078187272, "grad_norm": 0.4339619278907776, "learning_rate": 3.384937598800117e-05, "loss": 1.975, "step": 8478 }, { "epoch": 0.7961781758355841, "grad_norm": 0.3745657801628113, "learning_rate": 3.383740450186618e-05, "loss": 1.9486, "step": 8484 }, { "epoch": 0.7967412438524409, "grad_norm": 0.4028776288032532, "learning_rate": 3.382544570859576e-05, "loss": 1.9629, "step": 8490 }, { "epoch": 0.7973043118692978, "grad_norm": 0.3997963070869446, "learning_rate": 3.3813499585776214e-05, "loss": 1.9725, "step": 8496 }, { "epoch": 0.7978673798861546, "grad_norm": 0.4024353623390198, "learning_rate": 3.380156611104925e-05, "loss": 1.9339, "step": 8502 }, { "epoch": 0.7984304479030115, "grad_norm": 0.3997054696083069, "learning_rate": 3.378964526211175e-05, "loss": 1.9923, "step": 8508 }, { "epoch": 0.7989935159198683, "grad_norm": 0.42767786979675293, "learning_rate": 3.3777737016715634e-05, "loss": 1.9503, "step": 8514 }, { "epoch": 0.7995565839367252, "grad_norm": 0.41181913018226624, "learning_rate": 3.3765841352667656e-05, "loss": 1.966, "step": 8520 }, { "epoch": 0.800119651953582, "grad_norm": 0.3655342161655426, "learning_rate": 3.3753958247829244e-05, "loss": 1.9448, "step": 8526 }, { "epoch": 0.8006827199704389, "grad_norm": 0.39619022607803345, "learning_rate": 3.374208768011633e-05, "loss": 1.905, "step": 8532 }, { "epoch": 0.8012457879872957, "grad_norm": 0.4261336624622345, "learning_rate": 3.3730229627499175e-05, "loss": 2.016, "step": 8538 }, { "epoch": 0.8018088560041526, "grad_norm": 0.40768730640411377, "learning_rate": 3.3718384068002185e-05, "loss": 1.9198, "step": 8544 }, { "epoch": 0.8023719240210094, "grad_norm": 0.45761510729789734, "learning_rate": 3.3706550979703755e-05, "loss": 1.9793, "step": 8550 }, { "epoch": 0.8029349920378663, "grad_norm": 0.36893948912620544, "learning_rate": 3.369473034073611e-05, "loss": 1.9166, "step": 8556 }, { "epoch": 0.8034980600547231, "grad_norm": 0.3611001968383789, "learning_rate": 3.368292212928511e-05, "loss": 1.9643, "step": 8562 }, { "epoch": 0.80406112807158, "grad_norm": 0.37042495608329773, "learning_rate": 3.367112632359006e-05, "loss": 1.9304, "step": 8568 }, { "epoch": 0.8046241960884368, "grad_norm": 0.3723931610584259, "learning_rate": 3.3659342901943626e-05, "loss": 1.9385, "step": 8574 }, { "epoch": 0.8051872641052937, "grad_norm": 0.3636028468608856, "learning_rate": 3.364757184269158e-05, "loss": 1.9654, "step": 8580 }, { "epoch": 0.8057503321221505, "grad_norm": 0.41643384099006653, "learning_rate": 3.363581312423266e-05, "loss": 1.979, "step": 8586 }, { "epoch": 0.8063134001390074, "grad_norm": 0.38418376445770264, "learning_rate": 3.362406672501843e-05, "loss": 1.977, "step": 8592 }, { "epoch": 0.8068764681558642, "grad_norm": 0.37700155377388, "learning_rate": 3.3612332623553087e-05, "loss": 2.0038, "step": 8598 }, { "epoch": 0.8074395361727211, "grad_norm": 0.42334243655204773, "learning_rate": 3.36006107983933e-05, "loss": 1.9576, "step": 8604 }, { "epoch": 0.8080026041895779, "grad_norm": 0.3770296573638916, "learning_rate": 3.358890122814805e-05, "loss": 1.9613, "step": 8610 }, { "epoch": 0.8085656722064348, "grad_norm": 0.3738340437412262, "learning_rate": 3.357720389147846e-05, "loss": 1.9877, "step": 8616 }, { "epoch": 0.8091287402232916, "grad_norm": 0.4223477244377136, "learning_rate": 3.356551876709765e-05, "loss": 1.9989, "step": 8622 }, { "epoch": 0.8096918082401485, "grad_norm": 0.3993932008743286, "learning_rate": 3.3553845833770526e-05, "loss": 1.944, "step": 8628 }, { "epoch": 0.8102548762570053, "grad_norm": 0.3896678388118744, "learning_rate": 3.3542185070313693e-05, "loss": 1.9372, "step": 8634 }, { "epoch": 0.8108179442738622, "grad_norm": 0.4034690260887146, "learning_rate": 3.3530536455595216e-05, "loss": 1.9539, "step": 8640 }, { "epoch": 0.811381012290719, "grad_norm": 0.4175611138343811, "learning_rate": 3.3518899968534544e-05, "loss": 1.9085, "step": 8646 }, { "epoch": 0.8119440803075759, "grad_norm": 0.40881142020225525, "learning_rate": 3.3507275588102234e-05, "loss": 1.9556, "step": 8652 }, { "epoch": 0.8125071483244327, "grad_norm": 0.3851398229598999, "learning_rate": 3.3495663293319924e-05, "loss": 2.0045, "step": 8658 }, { "epoch": 0.8130702163412896, "grad_norm": 0.40550872683525085, "learning_rate": 3.3484063063260065e-05, "loss": 1.9658, "step": 8664 }, { "epoch": 0.8136332843581464, "grad_norm": 0.38699740171432495, "learning_rate": 3.347247487704582e-05, "loss": 1.9636, "step": 8670 }, { "epoch": 0.8141963523750033, "grad_norm": 0.3549595773220062, "learning_rate": 3.3460898713850906e-05, "loss": 1.9711, "step": 8676 }, { "epoch": 0.8147594203918601, "grad_norm": 0.41606447100639343, "learning_rate": 3.34493345528994e-05, "loss": 2.0004, "step": 8682 }, { "epoch": 0.815322488408717, "grad_norm": 0.4422591030597687, "learning_rate": 3.3437782373465626e-05, "loss": 1.9628, "step": 8688 }, { "epoch": 0.8158855564255738, "grad_norm": 0.4435579776763916, "learning_rate": 3.342624215487396e-05, "loss": 1.9436, "step": 8694 }, { "epoch": 0.8164486244424307, "grad_norm": 0.3788127303123474, "learning_rate": 3.3414713876498716e-05, "loss": 1.9088, "step": 8700 }, { "epoch": 0.8170116924592875, "grad_norm": 0.38239002227783203, "learning_rate": 3.3403197517763936e-05, "loss": 1.9767, "step": 8706 }, { "epoch": 0.8175747604761444, "grad_norm": 0.4029586613178253, "learning_rate": 3.339169305814331e-05, "loss": 1.994, "step": 8712 }, { "epoch": 0.8181378284930012, "grad_norm": 0.40495699644088745, "learning_rate": 3.338020047715995e-05, "loss": 1.931, "step": 8718 }, { "epoch": 0.8187008965098581, "grad_norm": 0.3782508671283722, "learning_rate": 3.3368719754386255e-05, "loss": 2.0063, "step": 8724 }, { "epoch": 0.8192639645267149, "grad_norm": 0.3620794117450714, "learning_rate": 3.335725086944382e-05, "loss": 1.886, "step": 8730 }, { "epoch": 0.8198270325435718, "grad_norm": 0.43919533491134644, "learning_rate": 3.334579380200319e-05, "loss": 2.0075, "step": 8736 }, { "epoch": 0.8203901005604286, "grad_norm": 0.3692356050014496, "learning_rate": 3.333434853178377e-05, "loss": 1.9617, "step": 8742 }, { "epoch": 0.8209531685772855, "grad_norm": 0.34397009015083313, "learning_rate": 3.332291503855368e-05, "loss": 1.9706, "step": 8748 }, { "epoch": 0.8215162365941423, "grad_norm": 0.37551331520080566, "learning_rate": 3.3311493302129526e-05, "loss": 1.9469, "step": 8754 }, { "epoch": 0.8220793046109992, "grad_norm": 0.40155109763145447, "learning_rate": 3.3300083302376366e-05, "loss": 1.9382, "step": 8760 }, { "epoch": 0.822642372627856, "grad_norm": 0.40719571709632874, "learning_rate": 3.328868501920747e-05, "loss": 1.9079, "step": 8766 }, { "epoch": 0.8232054406447129, "grad_norm": 0.36370590329170227, "learning_rate": 3.327729843258422e-05, "loss": 1.9849, "step": 8772 }, { "epoch": 0.8237685086615697, "grad_norm": 0.38854455947875977, "learning_rate": 3.326592352251593e-05, "loss": 1.9597, "step": 8778 }, { "epoch": 0.8243315766784266, "grad_norm": 0.36376461386680603, "learning_rate": 3.325456026905971e-05, "loss": 1.9429, "step": 8784 }, { "epoch": 0.8248946446952834, "grad_norm": 0.3613795042037964, "learning_rate": 3.3243208652320356e-05, "loss": 1.992, "step": 8790 }, { "epoch": 0.8254577127121403, "grad_norm": 0.3482646942138672, "learning_rate": 3.323186865245013e-05, "loss": 1.9683, "step": 8796 }, { "epoch": 0.8258330913900448, "eval_accuracy": 0.5853507229386479, "eval_loss": 1.9948217868804932, "eval_runtime": 88.1468, "eval_samples_per_second": 4.538, "eval_steps_per_second": 1.134, "step": 8800 }, { "epoch": 0.8260207807289971, "grad_norm": 0.3850889205932617, "learning_rate": 3.322054024964868e-05, "loss": 1.9263, "step": 8802 }, { "epoch": 0.826583848745854, "grad_norm": 0.3809353709220886, "learning_rate": 3.3209223424162865e-05, "loss": 1.9346, "step": 8808 }, { "epoch": 0.8271469167627108, "grad_norm": 0.4036242961883545, "learning_rate": 3.319791815628662e-05, "loss": 1.964, "step": 8814 }, { "epoch": 0.8277099847795677, "grad_norm": 0.39300206303596497, "learning_rate": 3.318662442636079e-05, "loss": 1.9507, "step": 8820 }, { "epoch": 0.8282730527964245, "grad_norm": 0.457164466381073, "learning_rate": 3.317534221477303e-05, "loss": 2.0059, "step": 8826 }, { "epoch": 0.8288361208132814, "grad_norm": 0.40241411328315735, "learning_rate": 3.316407150195761e-05, "loss": 1.9149, "step": 8832 }, { "epoch": 0.8293991888301382, "grad_norm": 0.4136147201061249, "learning_rate": 3.315281226839531e-05, "loss": 1.9857, "step": 8838 }, { "epoch": 0.8299622568469951, "grad_norm": 0.39899858832359314, "learning_rate": 3.3141564494613264e-05, "loss": 1.9799, "step": 8844 }, { "epoch": 0.8305253248638519, "grad_norm": 0.4028012454509735, "learning_rate": 3.313032816118483e-05, "loss": 1.9609, "step": 8850 }, { "epoch": 0.8310883928807088, "grad_norm": 0.41206884384155273, "learning_rate": 3.3119103248729423e-05, "loss": 2.0117, "step": 8856 }, { "epoch": 0.8316514608975656, "grad_norm": 0.3733574450016022, "learning_rate": 3.310788973791241e-05, "loss": 1.9608, "step": 8862 }, { "epoch": 0.8322145289144225, "grad_norm": 0.4146965742111206, "learning_rate": 3.3096687609444924e-05, "loss": 2.0024, "step": 8868 }, { "epoch": 0.8327775969312793, "grad_norm": 0.411347359418869, "learning_rate": 3.3085496844083785e-05, "loss": 1.9337, "step": 8874 }, { "epoch": 0.8333406649481362, "grad_norm": 0.3731308579444885, "learning_rate": 3.3074317422631307e-05, "loss": 1.999, "step": 8880 }, { "epoch": 0.833903732964993, "grad_norm": 0.4018101096153259, "learning_rate": 3.306314932593519e-05, "loss": 1.925, "step": 8886 }, { "epoch": 0.8344668009818499, "grad_norm": 0.3949969410896301, "learning_rate": 3.3051992534888356e-05, "loss": 1.9301, "step": 8892 }, { "epoch": 0.8350298689987067, "grad_norm": 0.41097021102905273, "learning_rate": 3.3040847030428865e-05, "loss": 1.9519, "step": 8898 }, { "epoch": 0.8355929370155636, "grad_norm": 0.3554755449295044, "learning_rate": 3.302971279353969e-05, "loss": 1.9284, "step": 8904 }, { "epoch": 0.8361560050324204, "grad_norm": 0.36937373876571655, "learning_rate": 3.301858980524869e-05, "loss": 1.9295, "step": 8910 }, { "epoch": 0.8367190730492773, "grad_norm": 0.3943465054035187, "learning_rate": 3.3007478046628376e-05, "loss": 2.0025, "step": 8916 }, { "epoch": 0.8372821410661341, "grad_norm": 0.4117925465106964, "learning_rate": 3.299637749879583e-05, "loss": 1.9323, "step": 8922 }, { "epoch": 0.837845209082991, "grad_norm": 0.41620519757270813, "learning_rate": 3.2985288142912556e-05, "loss": 1.9474, "step": 8928 }, { "epoch": 0.8384082770998478, "grad_norm": 0.35099557042121887, "learning_rate": 3.297420996018435e-05, "loss": 1.9406, "step": 8934 }, { "epoch": 0.8389713451167047, "grad_norm": 0.4031484127044678, "learning_rate": 3.296314293186116e-05, "loss": 1.9322, "step": 8940 }, { "epoch": 0.8395344131335615, "grad_norm": 0.39607474207878113, "learning_rate": 3.295208703923697e-05, "loss": 1.9804, "step": 8946 }, { "epoch": 0.8400974811504184, "grad_norm": 0.37675487995147705, "learning_rate": 3.2941042263649606e-05, "loss": 1.9555, "step": 8952 }, { "epoch": 0.8406605491672752, "grad_norm": 0.43703773617744446, "learning_rate": 3.293000858648071e-05, "loss": 1.9518, "step": 8958 }, { "epoch": 0.8412236171841321, "grad_norm": 0.37094470858573914, "learning_rate": 3.2918985989155515e-05, "loss": 1.8987, "step": 8964 }, { "epoch": 0.8417866852009889, "grad_norm": 0.37659260630607605, "learning_rate": 3.2907974453142745e-05, "loss": 1.978, "step": 8970 }, { "epoch": 0.8423497532178458, "grad_norm": 0.4053337872028351, "learning_rate": 3.2896973959954514e-05, "loss": 1.9161, "step": 8976 }, { "epoch": 0.8429128212347026, "grad_norm": 0.39191868901252747, "learning_rate": 3.288598449114613e-05, "loss": 1.9134, "step": 8982 }, { "epoch": 0.8434758892515595, "grad_norm": 0.3756552040576935, "learning_rate": 3.2875006028316036e-05, "loss": 1.943, "step": 8988 }, { "epoch": 0.8440389572684163, "grad_norm": 0.3830448389053345, "learning_rate": 3.2864038553105636e-05, "loss": 1.9452, "step": 8994 }, { "epoch": 0.8446020252852732, "grad_norm": 0.3783881962299347, "learning_rate": 3.2853082047199166e-05, "loss": 2.0025, "step": 9000 }, { "epoch": 0.84516509330213, "grad_norm": 0.39876589179039, "learning_rate": 3.2842136492323596e-05, "loss": 1.9445, "step": 9006 }, { "epoch": 0.8457281613189869, "grad_norm": 0.4345170259475708, "learning_rate": 3.283120187024847e-05, "loss": 1.9965, "step": 9012 }, { "epoch": 0.8462912293358437, "grad_norm": 0.4467675983905792, "learning_rate": 3.28202781627858e-05, "loss": 1.9744, "step": 9018 }, { "epoch": 0.8468542973527006, "grad_norm": 0.3719506561756134, "learning_rate": 3.2809365351789936e-05, "loss": 1.9375, "step": 9024 }, { "epoch": 0.8474173653695574, "grad_norm": 0.40250223875045776, "learning_rate": 3.279846341915742e-05, "loss": 1.9088, "step": 9030 }, { "epoch": 0.8479804333864143, "grad_norm": 0.4048844575881958, "learning_rate": 3.278757234682688e-05, "loss": 1.9379, "step": 9036 }, { "epoch": 0.8485435014032711, "grad_norm": 0.4125281870365143, "learning_rate": 3.277669211677889e-05, "loss": 2.002, "step": 9042 }, { "epoch": 0.849106569420128, "grad_norm": 0.3707270324230194, "learning_rate": 3.2765822711035876e-05, "loss": 1.9625, "step": 9048 }, { "epoch": 0.8496696374369848, "grad_norm": 0.3665633201599121, "learning_rate": 3.275496411166195e-05, "loss": 1.9346, "step": 9054 }, { "epoch": 0.8502327054538417, "grad_norm": 0.37000006437301636, "learning_rate": 3.274411630076281e-05, "loss": 1.9458, "step": 9060 }, { "epoch": 0.8507957734706985, "grad_norm": 0.3748253285884857, "learning_rate": 3.273327926048561e-05, "loss": 1.9631, "step": 9066 }, { "epoch": 0.8513588414875554, "grad_norm": 0.35877230763435364, "learning_rate": 3.272245297301883e-05, "loss": 1.953, "step": 9072 }, { "epoch": 0.8519219095044122, "grad_norm": 0.3925628364086151, "learning_rate": 3.271163742059216e-05, "loss": 1.9985, "step": 9078 }, { "epoch": 0.8524849775212691, "grad_norm": 0.3697543442249298, "learning_rate": 3.270083258547641e-05, "loss": 1.9672, "step": 9084 }, { "epoch": 0.8530480455381259, "grad_norm": 0.3840068578720093, "learning_rate": 3.2690038449983296e-05, "loss": 1.9488, "step": 9090 }, { "epoch": 0.8536111135549828, "grad_norm": 0.39720016717910767, "learning_rate": 3.267925499646543e-05, "loss": 1.9614, "step": 9096 }, { "epoch": 0.8541741815718396, "grad_norm": 0.3421434760093689, "learning_rate": 3.2668482207316116e-05, "loss": 1.95, "step": 9102 }, { "epoch": 0.8547372495886965, "grad_norm": 0.4017122685909271, "learning_rate": 3.265772006496929e-05, "loss": 1.9565, "step": 9108 }, { "epoch": 0.8553003176055533, "grad_norm": 0.4200017750263214, "learning_rate": 3.264696855189932e-05, "loss": 1.922, "step": 9114 }, { "epoch": 0.8558633856224102, "grad_norm": 0.39358532428741455, "learning_rate": 3.263622765062099e-05, "loss": 2.0118, "step": 9120 }, { "epoch": 0.856426453639267, "grad_norm": 0.4083099067211151, "learning_rate": 3.262549734368929e-05, "loss": 1.9691, "step": 9126 }, { "epoch": 0.8569895216561239, "grad_norm": 0.4010527431964874, "learning_rate": 3.2614777613699353e-05, "loss": 1.9212, "step": 9132 }, { "epoch": 0.8575525896729806, "grad_norm": 0.3995525538921356, "learning_rate": 3.2604068443286305e-05, "loss": 1.9157, "step": 9138 }, { "epoch": 0.8581156576898376, "grad_norm": 0.3746720254421234, "learning_rate": 3.2593369815125156e-05, "loss": 1.9115, "step": 9144 }, { "epoch": 0.8586787257066943, "grad_norm": 0.4368610084056854, "learning_rate": 3.25826817119307e-05, "loss": 1.9727, "step": 9150 }, { "epoch": 0.8592417937235513, "grad_norm": 0.4019470512866974, "learning_rate": 3.2572004116457384e-05, "loss": 1.9359, "step": 9156 }, { "epoch": 0.859804861740408, "grad_norm": 0.38947227597236633, "learning_rate": 3.256133701149915e-05, "loss": 1.952, "step": 9162 }, { "epoch": 0.860367929757265, "grad_norm": 0.3747156858444214, "learning_rate": 3.255068037988941e-05, "loss": 1.9789, "step": 9168 }, { "epoch": 0.8609309977741217, "grad_norm": 0.37910282611846924, "learning_rate": 3.254003420450085e-05, "loss": 1.9738, "step": 9174 }, { "epoch": 0.8614940657909786, "grad_norm": 0.36365416646003723, "learning_rate": 3.2529398468245345e-05, "loss": 1.952, "step": 9180 }, { "epoch": 0.8620571338078354, "grad_norm": 0.3798990547657013, "learning_rate": 3.251877315407384e-05, "loss": 1.9261, "step": 9186 }, { "epoch": 0.8626202018246923, "grad_norm": 0.3802495002746582, "learning_rate": 3.250815824497626e-05, "loss": 1.8847, "step": 9192 }, { "epoch": 0.8631832698415491, "grad_norm": 0.3571455776691437, "learning_rate": 3.249755372398133e-05, "loss": 1.9349, "step": 9198 }, { "epoch": 0.8633709591805014, "eval_accuracy": 0.5856604142243064, "eval_loss": 1.9930232763290405, "eval_runtime": 88.06, "eval_samples_per_second": 4.542, "eval_steps_per_second": 1.136, "step": 9200 }, { "epoch": 0.863746337858406, "grad_norm": 0.3956944942474365, "learning_rate": 3.248695957415654e-05, "loss": 1.9595, "step": 9204 }, { "epoch": 0.8643094058752628, "grad_norm": 0.3841532766819, "learning_rate": 3.247637577860798e-05, "loss": 2.0001, "step": 9210 }, { "epoch": 0.8648724738921197, "grad_norm": 0.4102056324481964, "learning_rate": 3.246580232048024e-05, "loss": 1.9915, "step": 9216 }, { "epoch": 0.8654355419089765, "grad_norm": 0.37308117747306824, "learning_rate": 3.24552391829563e-05, "loss": 1.8931, "step": 9222 }, { "epoch": 0.8659986099258333, "grad_norm": 0.40245312452316284, "learning_rate": 3.244468634925741e-05, "loss": 1.9347, "step": 9228 }, { "epoch": 0.8665616779426902, "grad_norm": 0.3715590834617615, "learning_rate": 3.243414380264302e-05, "loss": 1.9623, "step": 9234 }, { "epoch": 0.867124745959547, "grad_norm": 0.371250718832016, "learning_rate": 3.242361152641057e-05, "loss": 2.0109, "step": 9240 }, { "epoch": 0.8676878139764039, "grad_norm": 0.3663799464702606, "learning_rate": 3.24130895038955e-05, "loss": 1.9722, "step": 9246 }, { "epoch": 0.8682508819932607, "grad_norm": 0.45195579528808594, "learning_rate": 3.240257771847105e-05, "loss": 1.9144, "step": 9252 }, { "epoch": 0.8688139500101176, "grad_norm": 0.4033130705356598, "learning_rate": 3.239207615354819e-05, "loss": 1.9949, "step": 9258 }, { "epoch": 0.8693770180269744, "grad_norm": 0.38628217577934265, "learning_rate": 3.2381584792575496e-05, "loss": 1.9425, "step": 9264 }, { "epoch": 0.8699400860438313, "grad_norm": 0.4411437511444092, "learning_rate": 3.2371103619039055e-05, "loss": 1.9603, "step": 9270 }, { "epoch": 0.8705031540606881, "grad_norm": 0.38852837681770325, "learning_rate": 3.236063261646233e-05, "loss": 1.9423, "step": 9276 }, { "epoch": 0.871066222077545, "grad_norm": 0.37564632296562195, "learning_rate": 3.2350171768406085e-05, "loss": 1.9425, "step": 9282 }, { "epoch": 0.8716292900944018, "grad_norm": 0.41267696022987366, "learning_rate": 3.2339721058468254e-05, "loss": 1.9558, "step": 9288 }, { "epoch": 0.8721923581112587, "grad_norm": 0.38808706402778625, "learning_rate": 3.232928047028383e-05, "loss": 1.9554, "step": 9294 }, { "epoch": 0.8727554261281155, "grad_norm": 0.370883047580719, "learning_rate": 3.231884998752476e-05, "loss": 1.9873, "step": 9300 }, { "epoch": 0.8733184941449724, "grad_norm": 0.371262788772583, "learning_rate": 3.2308429593899865e-05, "loss": 1.9884, "step": 9306 }, { "epoch": 0.8738815621618292, "grad_norm": 0.4044255316257477, "learning_rate": 3.22980192731547e-05, "loss": 1.9736, "step": 9312 }, { "epoch": 0.8744446301786861, "grad_norm": 0.371586412191391, "learning_rate": 3.228761900907144e-05, "loss": 1.9654, "step": 9318 }, { "epoch": 0.8750076981955429, "grad_norm": 0.36450275778770447, "learning_rate": 3.227722878546884e-05, "loss": 2.0038, "step": 9324 }, { "epoch": 0.8755707662123998, "grad_norm": 0.400814414024353, "learning_rate": 3.2266848586202025e-05, "loss": 1.9818, "step": 9330 }, { "epoch": 0.8761338342292566, "grad_norm": 0.39048540592193604, "learning_rate": 3.225647839516248e-05, "loss": 1.9407, "step": 9336 }, { "epoch": 0.8766969022461135, "grad_norm": 0.4229477047920227, "learning_rate": 3.224611819627788e-05, "loss": 1.9271, "step": 9342 }, { "epoch": 0.8772599702629703, "grad_norm": 0.42068013548851013, "learning_rate": 3.223576797351205e-05, "loss": 2.0042, "step": 9348 }, { "epoch": 0.8778230382798272, "grad_norm": 0.3993867039680481, "learning_rate": 3.222542771086478e-05, "loss": 1.9418, "step": 9354 }, { "epoch": 0.878386106296684, "grad_norm": 0.3810088634490967, "learning_rate": 3.221509739237179e-05, "loss": 1.9334, "step": 9360 }, { "epoch": 0.8789491743135409, "grad_norm": 0.3738817274570465, "learning_rate": 3.220477700210459e-05, "loss": 1.9853, "step": 9366 }, { "epoch": 0.8795122423303977, "grad_norm": 0.37888726592063904, "learning_rate": 3.219446652417039e-05, "loss": 1.9289, "step": 9372 }, { "epoch": 0.8800753103472546, "grad_norm": 0.40941867232322693, "learning_rate": 3.2184165942712e-05, "loss": 1.9734, "step": 9378 }, { "epoch": 0.8806383783641114, "grad_norm": 0.38799768686294556, "learning_rate": 3.21738752419077e-05, "loss": 1.9557, "step": 9384 }, { "epoch": 0.8812014463809683, "grad_norm": 0.3755500614643097, "learning_rate": 3.216359440597119e-05, "loss": 1.9522, "step": 9390 }, { "epoch": 0.8817645143978251, "grad_norm": 0.36881741881370544, "learning_rate": 3.215332341915143e-05, "loss": 1.9696, "step": 9396 }, { "epoch": 0.882327582414682, "grad_norm": 0.3972366154193878, "learning_rate": 3.214306226573258e-05, "loss": 1.9297, "step": 9402 }, { "epoch": 0.8828906504315388, "grad_norm": 0.3602343201637268, "learning_rate": 3.21328109300339e-05, "loss": 1.8998, "step": 9408 }, { "epoch": 0.8834537184483957, "grad_norm": 0.40044376254081726, "learning_rate": 3.212256939640961e-05, "loss": 1.9711, "step": 9414 }, { "epoch": 0.8840167864652525, "grad_norm": 0.5214889049530029, "learning_rate": 3.211233764924882e-05, "loss": 1.9371, "step": 9420 }, { "epoch": 0.8845798544821094, "grad_norm": 0.44645512104034424, "learning_rate": 3.210211567297543e-05, "loss": 1.9476, "step": 9426 }, { "epoch": 0.8851429224989662, "grad_norm": 0.41275012493133545, "learning_rate": 3.209190345204804e-05, "loss": 1.9284, "step": 9432 }, { "epoch": 0.8857059905158231, "grad_norm": 0.3787823021411896, "learning_rate": 3.20817009709598e-05, "loss": 1.9601, "step": 9438 }, { "epoch": 0.8862690585326799, "grad_norm": 0.38770022988319397, "learning_rate": 3.207150821423837e-05, "loss": 1.9616, "step": 9444 }, { "epoch": 0.8868321265495368, "grad_norm": 0.3845151960849762, "learning_rate": 3.206132516644581e-05, "loss": 1.9889, "step": 9450 }, { "epoch": 0.8873951945663936, "grad_norm": 0.3821997344493866, "learning_rate": 3.2051151812178444e-05, "loss": 1.9225, "step": 9456 }, { "epoch": 0.8879582625832505, "grad_norm": 0.45653244853019714, "learning_rate": 3.2040988136066804e-05, "loss": 1.9942, "step": 9462 }, { "epoch": 0.8885213306001073, "grad_norm": 0.3708157241344452, "learning_rate": 3.203083412277552e-05, "loss": 1.9105, "step": 9468 }, { "epoch": 0.8890843986169642, "grad_norm": 0.41360709071159363, "learning_rate": 3.2020689757003204e-05, "loss": 2.0118, "step": 9474 }, { "epoch": 0.889647466633821, "grad_norm": 0.36699047684669495, "learning_rate": 3.2010555023482374e-05, "loss": 1.9558, "step": 9480 }, { "epoch": 0.8902105346506779, "grad_norm": 0.3978089392185211, "learning_rate": 3.200042990697936e-05, "loss": 1.9139, "step": 9486 }, { "epoch": 0.8907736026675347, "grad_norm": 0.3821735978126526, "learning_rate": 3.1990314392294205e-05, "loss": 1.9397, "step": 9492 }, { "epoch": 0.8913366706843916, "grad_norm": 0.3824523687362671, "learning_rate": 3.1980208464260545e-05, "loss": 1.9475, "step": 9498 }, { "epoch": 0.8918997387012484, "grad_norm": 0.38726651668548584, "learning_rate": 3.1970112107745546e-05, "loss": 1.9215, "step": 9504 }, { "epoch": 0.8924628067181053, "grad_norm": 0.38234198093414307, "learning_rate": 3.1960025307649794e-05, "loss": 1.9723, "step": 9510 }, { "epoch": 0.8930258747349621, "grad_norm": 0.40177488327026367, "learning_rate": 3.1949948048907196e-05, "loss": 1.9701, "step": 9516 }, { "epoch": 0.893588942751819, "grad_norm": 0.3648139536380768, "learning_rate": 3.1939880316484903e-05, "loss": 1.9714, "step": 9522 }, { "epoch": 0.8941520107686758, "grad_norm": 0.40424102544784546, "learning_rate": 3.1929822095383195e-05, "loss": 2.0174, "step": 9528 }, { "epoch": 0.8947150787855327, "grad_norm": 0.39874470233917236, "learning_rate": 3.1919773370635405e-05, "loss": 1.9417, "step": 9534 }, { "epoch": 0.8952781468023895, "grad_norm": 0.3891903758049011, "learning_rate": 3.1909734127307795e-05, "loss": 1.951, "step": 9540 }, { "epoch": 0.8958412148192464, "grad_norm": 0.51023268699646, "learning_rate": 3.18997043504995e-05, "loss": 1.9427, "step": 9546 }, { "epoch": 0.8964042828361032, "grad_norm": 0.3599036931991577, "learning_rate": 3.1889684025342437e-05, "loss": 1.9179, "step": 9552 }, { "epoch": 0.8969673508529601, "grad_norm": 0.3587019145488739, "learning_rate": 3.187967313700117e-05, "loss": 1.9227, "step": 9558 }, { "epoch": 0.8975304188698169, "grad_norm": 0.36857011914253235, "learning_rate": 3.186967167067284e-05, "loss": 1.9589, "step": 9564 }, { "epoch": 0.8980934868866738, "grad_norm": 0.37425538897514343, "learning_rate": 3.185967961158709e-05, "loss": 1.95, "step": 9570 }, { "epoch": 0.8986565549035306, "grad_norm": 0.38583794236183167, "learning_rate": 3.184969694500595e-05, "loss": 2.033, "step": 9576 }, { "epoch": 0.8992196229203875, "grad_norm": 0.3778344988822937, "learning_rate": 3.183972365622378e-05, "loss": 1.9749, "step": 9582 }, { "epoch": 0.8997826909372443, "grad_norm": 0.37402454018592834, "learning_rate": 3.1829759730567116e-05, "loss": 2.0232, "step": 9588 }, { "epoch": 0.9003457589541012, "grad_norm": 0.4074670970439911, "learning_rate": 3.181980515339464e-05, "loss": 1.9344, "step": 9594 }, { "epoch": 0.900908826970958, "grad_norm": 0.37147244811058044, "learning_rate": 3.180985991009706e-05, "loss": 1.9911, "step": 9600 }, { "epoch": 0.900908826970958, "eval_accuracy": 0.5864527159046502, "eval_loss": 1.9884545803070068, "eval_runtime": 88.1158, "eval_samples_per_second": 4.539, "eval_steps_per_second": 1.135, "step": 9600 }, { "epoch": 0.9014718949878149, "grad_norm": 0.3862541615962982, "learning_rate": 3.179992398609703e-05, "loss": 1.9632, "step": 9606 }, { "epoch": 0.9020349630046717, "grad_norm": 0.3696303069591522, "learning_rate": 3.178999736684905e-05, "loss": 1.9399, "step": 9612 }, { "epoch": 0.9025980310215286, "grad_norm": 0.3520839512348175, "learning_rate": 3.17800800378394e-05, "loss": 1.9599, "step": 9618 }, { "epoch": 0.9031610990383854, "grad_norm": 0.3755667209625244, "learning_rate": 3.177017198458602e-05, "loss": 1.9264, "step": 9624 }, { "epoch": 0.9037241670552423, "grad_norm": 0.3818325996398926, "learning_rate": 3.176027319263844e-05, "loss": 1.9545, "step": 9630 }, { "epoch": 0.9042872350720991, "grad_norm": 0.42685505747795105, "learning_rate": 3.175038364757769e-05, "loss": 1.9639, "step": 9636 }, { "epoch": 0.904850303088956, "grad_norm": 0.40599679946899414, "learning_rate": 3.1740503335016195e-05, "loss": 1.9166, "step": 9642 }, { "epoch": 0.9054133711058128, "grad_norm": 0.39855843782424927, "learning_rate": 3.173063224059773e-05, "loss": 1.9339, "step": 9648 }, { "epoch": 0.9059764391226697, "grad_norm": 0.3713553547859192, "learning_rate": 3.172077034999726e-05, "loss": 1.9595, "step": 9654 }, { "epoch": 0.9065395071395265, "grad_norm": 0.42200320959091187, "learning_rate": 3.1710917648920955e-05, "loss": 1.9982, "step": 9660 }, { "epoch": 0.9071025751563834, "grad_norm": 0.4081229269504547, "learning_rate": 3.1701074123105976e-05, "loss": 1.9672, "step": 9666 }, { "epoch": 0.9076656431732402, "grad_norm": 0.3960514962673187, "learning_rate": 3.1691239758320516e-05, "loss": 1.9485, "step": 9672 }, { "epoch": 0.9082287111900971, "grad_norm": 0.3812468945980072, "learning_rate": 3.168141454036362e-05, "loss": 1.9387, "step": 9678 }, { "epoch": 0.9087917792069539, "grad_norm": 0.3966079652309418, "learning_rate": 3.167159845506514e-05, "loss": 1.9363, "step": 9684 }, { "epoch": 0.9093548472238108, "grad_norm": 0.37505200505256653, "learning_rate": 3.166179148828566e-05, "loss": 1.9418, "step": 9690 }, { "epoch": 0.9099179152406676, "grad_norm": 0.4363604784011841, "learning_rate": 3.1651993625916354e-05, "loss": 1.9572, "step": 9696 }, { "epoch": 0.9104809832575245, "grad_norm": 0.3731118142604828, "learning_rate": 3.164220485387898e-05, "loss": 1.9409, "step": 9702 }, { "epoch": 0.9110440512743813, "grad_norm": 0.3694874346256256, "learning_rate": 3.1632425158125724e-05, "loss": 1.9569, "step": 9708 }, { "epoch": 0.9116071192912382, "grad_norm": 0.3757215142250061, "learning_rate": 3.162265452463917e-05, "loss": 1.9421, "step": 9714 }, { "epoch": 0.912170187308095, "grad_norm": 0.427449494600296, "learning_rate": 3.161289293943217e-05, "loss": 1.9403, "step": 9720 }, { "epoch": 0.9127332553249519, "grad_norm": 0.386810302734375, "learning_rate": 3.160314038854778e-05, "loss": 1.9646, "step": 9726 }, { "epoch": 0.9132963233418087, "grad_norm": 0.38273054361343384, "learning_rate": 3.159339685805922e-05, "loss": 2.0068, "step": 9732 }, { "epoch": 0.9138593913586656, "grad_norm": 0.40770789980888367, "learning_rate": 3.158366233406969e-05, "loss": 1.9591, "step": 9738 }, { "epoch": 0.9144224593755224, "grad_norm": 0.3706749975681305, "learning_rate": 3.157393680271238e-05, "loss": 1.9821, "step": 9744 }, { "epoch": 0.9149855273923793, "grad_norm": 0.3477626442909241, "learning_rate": 3.156422025015033e-05, "loss": 1.9294, "step": 9750 }, { "epoch": 0.915548595409236, "grad_norm": 0.3916824162006378, "learning_rate": 3.155451266257639e-05, "loss": 1.9655, "step": 9756 }, { "epoch": 0.916111663426093, "grad_norm": 0.36156710982322693, "learning_rate": 3.15448140262131e-05, "loss": 1.9764, "step": 9762 }, { "epoch": 0.9166747314429498, "grad_norm": 0.4031555652618408, "learning_rate": 3.1535124327312655e-05, "loss": 1.9685, "step": 9768 }, { "epoch": 0.9172377994598067, "grad_norm": 0.4135826528072357, "learning_rate": 3.1525443552156735e-05, "loss": 1.9277, "step": 9774 }, { "epoch": 0.9178008674766635, "grad_norm": 0.3952166438102722, "learning_rate": 3.1515771687056546e-05, "loss": 1.9582, "step": 9780 }, { "epoch": 0.9183639354935204, "grad_norm": 0.4089564383029938, "learning_rate": 3.150610871835262e-05, "loss": 1.909, "step": 9786 }, { "epoch": 0.9189270035103771, "grad_norm": 0.35195696353912354, "learning_rate": 3.1496454632414815e-05, "loss": 1.9575, "step": 9792 }, { "epoch": 0.919490071527234, "grad_norm": 0.3907264471054077, "learning_rate": 3.1486809415642214e-05, "loss": 1.9645, "step": 9798 }, { "epoch": 0.9200531395440908, "grad_norm": 0.4442322254180908, "learning_rate": 3.147717305446301e-05, "loss": 1.9234, "step": 9804 }, { "epoch": 0.9206162075609478, "grad_norm": 0.35055217146873474, "learning_rate": 3.146754553533448e-05, "loss": 1.9139, "step": 9810 }, { "epoch": 0.9211792755778045, "grad_norm": 0.3579668402671814, "learning_rate": 3.1457926844742855e-05, "loss": 1.9437, "step": 9816 }, { "epoch": 0.9217423435946615, "grad_norm": 0.40800175070762634, "learning_rate": 3.144831696920329e-05, "loss": 1.9653, "step": 9822 }, { "epoch": 0.9223054116115182, "grad_norm": 0.38536494970321655, "learning_rate": 3.143871589525973e-05, "loss": 1.9601, "step": 9828 }, { "epoch": 0.9228684796283751, "grad_norm": 0.3938160836696625, "learning_rate": 3.142912360948489e-05, "loss": 1.9748, "step": 9834 }, { "epoch": 0.9234315476452319, "grad_norm": 0.3731740415096283, "learning_rate": 3.141954009848012e-05, "loss": 1.9726, "step": 9840 }, { "epoch": 0.9239946156620888, "grad_norm": 0.4037737250328064, "learning_rate": 3.140996534887535e-05, "loss": 1.96, "step": 9846 }, { "epoch": 0.9245576836789456, "grad_norm": 0.3677574694156647, "learning_rate": 3.140039934732904e-05, "loss": 1.9314, "step": 9852 }, { "epoch": 0.9251207516958025, "grad_norm": 0.36950281262397766, "learning_rate": 3.139084208052804e-05, "loss": 1.9629, "step": 9858 }, { "epoch": 0.9256838197126593, "grad_norm": 0.3666466772556305, "learning_rate": 3.138129353518758e-05, "loss": 1.9446, "step": 9864 }, { "epoch": 0.9262468877295162, "grad_norm": 0.3922479748725891, "learning_rate": 3.1371753698051136e-05, "loss": 1.9799, "step": 9870 }, { "epoch": 0.926809955746373, "grad_norm": 0.3709563612937927, "learning_rate": 3.13622225558904e-05, "loss": 1.9219, "step": 9876 }, { "epoch": 0.9273730237632299, "grad_norm": 0.4073300063610077, "learning_rate": 3.135270009550515e-05, "loss": 1.9436, "step": 9882 }, { "epoch": 0.9279360917800867, "grad_norm": 0.4735027551651001, "learning_rate": 3.1343186303723216e-05, "loss": 1.9417, "step": 9888 }, { "epoch": 0.9284991597969436, "grad_norm": 0.3504180610179901, "learning_rate": 3.133368116740041e-05, "loss": 1.9497, "step": 9894 }, { "epoch": 0.9290622278138004, "grad_norm": 0.36758220195770264, "learning_rate": 3.13241846734204e-05, "loss": 1.9504, "step": 9900 }, { "epoch": 0.9296252958306573, "grad_norm": 0.3575434982776642, "learning_rate": 3.1314696808694673e-05, "loss": 1.8972, "step": 9906 }, { "epoch": 0.9301883638475141, "grad_norm": 0.35772043466567993, "learning_rate": 3.1305217560162464e-05, "loss": 2.0042, "step": 9912 }, { "epoch": 0.930751431864371, "grad_norm": 0.3853083550930023, "learning_rate": 3.129574691479064e-05, "loss": 1.9433, "step": 9918 }, { "epoch": 0.9313144998812278, "grad_norm": 0.43719708919525146, "learning_rate": 3.1286284859573685e-05, "loss": 1.9371, "step": 9924 }, { "epoch": 0.9318775678980847, "grad_norm": 0.3808354437351227, "learning_rate": 3.127683138153356e-05, "loss": 1.9719, "step": 9930 }, { "epoch": 0.9324406359149415, "grad_norm": 0.37888312339782715, "learning_rate": 3.1267386467719674e-05, "loss": 1.9497, "step": 9936 }, { "epoch": 0.9330037039317984, "grad_norm": 0.39421749114990234, "learning_rate": 3.125795010520879e-05, "loss": 1.9755, "step": 9942 }, { "epoch": 0.9335667719486552, "grad_norm": 0.3888514041900635, "learning_rate": 3.124852228110498e-05, "loss": 2.0048, "step": 9948 }, { "epoch": 0.9341298399655121, "grad_norm": 0.38524529337882996, "learning_rate": 3.1239102982539485e-05, "loss": 1.9753, "step": 9954 }, { "epoch": 0.9346929079823689, "grad_norm": 0.3764738142490387, "learning_rate": 3.1229692196670705e-05, "loss": 1.9518, "step": 9960 }, { "epoch": 0.9352559759992258, "grad_norm": 0.46206945180892944, "learning_rate": 3.122028991068411e-05, "loss": 1.9711, "step": 9966 }, { "epoch": 0.9358190440160826, "grad_norm": 0.4121486246585846, "learning_rate": 3.121089611179216e-05, "loss": 1.9437, "step": 9972 }, { "epoch": 0.9363821120329395, "grad_norm": 0.3985231816768646, "learning_rate": 3.1201510787234214e-05, "loss": 1.9514, "step": 9978 }, { "epoch": 0.9369451800497963, "grad_norm": 0.3805953860282898, "learning_rate": 3.119213392427649e-05, "loss": 1.959, "step": 9984 }, { "epoch": 0.9375082480666532, "grad_norm": 0.37793779373168945, "learning_rate": 3.118276551021197e-05, "loss": 1.9445, "step": 9990 }, { "epoch": 0.93807131608351, "grad_norm": 0.3659811317920685, "learning_rate": 3.117340553236034e-05, "loss": 1.9578, "step": 9996 }, { "epoch": 0.9384466947614146, "eval_accuracy": 0.5870154357170769, "eval_loss": 1.9860219955444336, "eval_runtime": 88.0983, "eval_samples_per_second": 4.54, "eval_steps_per_second": 1.135, "step": 10000 }, { "epoch": 0.9386343841003669, "grad_norm": 0.3702959716320038, "learning_rate": 3.1164053978067914e-05, "loss": 1.961, "step": 10002 }, { "epoch": 0.9391974521172237, "grad_norm": 0.3879670202732086, "learning_rate": 3.115471083470756e-05, "loss": 1.9866, "step": 10008 }, { "epoch": 0.9397605201340806, "grad_norm": 0.3765878975391388, "learning_rate": 3.1145376089678633e-05, "loss": 1.9121, "step": 10014 }, { "epoch": 0.9403235881509374, "grad_norm": 0.36340415477752686, "learning_rate": 3.1136049730406894e-05, "loss": 1.8842, "step": 10020 }, { "epoch": 0.9408866561677943, "grad_norm": 0.404559463262558, "learning_rate": 3.112673174434445e-05, "loss": 1.9149, "step": 10026 }, { "epoch": 0.9414497241846511, "grad_norm": 0.37986552715301514, "learning_rate": 3.111742211896968e-05, "loss": 1.9, "step": 10032 }, { "epoch": 0.942012792201508, "grad_norm": 0.3724873960018158, "learning_rate": 3.110812084178715e-05, "loss": 1.9184, "step": 10038 }, { "epoch": 0.9425758602183648, "grad_norm": 0.4172978699207306, "learning_rate": 3.1098827900327584e-05, "loss": 1.923, "step": 10044 }, { "epoch": 0.9431389282352217, "grad_norm": 0.4021415412425995, "learning_rate": 3.108954328214773e-05, "loss": 1.9164, "step": 10050 }, { "epoch": 0.9437019962520785, "grad_norm": 0.3755131959915161, "learning_rate": 3.108026697483035e-05, "loss": 1.9901, "step": 10056 }, { "epoch": 0.9442650642689354, "grad_norm": 0.37166106700897217, "learning_rate": 3.107099896598411e-05, "loss": 1.8931, "step": 10062 }, { "epoch": 0.9448281322857922, "grad_norm": 0.37471044063568115, "learning_rate": 3.1061739243243544e-05, "loss": 1.9782, "step": 10068 }, { "epoch": 0.945391200302649, "grad_norm": 0.3785649240016937, "learning_rate": 3.1052487794268946e-05, "loss": 1.9452, "step": 10074 }, { "epoch": 0.9459542683195059, "grad_norm": 0.392055869102478, "learning_rate": 3.1043244606746344e-05, "loss": 1.9373, "step": 10080 }, { "epoch": 0.9465173363363627, "grad_norm": 0.39918309450149536, "learning_rate": 3.1034009668387376e-05, "loss": 1.9177, "step": 10086 }, { "epoch": 0.9470804043532196, "grad_norm": 0.42963945865631104, "learning_rate": 3.102478296692929e-05, "loss": 1.945, "step": 10092 }, { "epoch": 0.9476434723700764, "grad_norm": 0.4004127085208893, "learning_rate": 3.101556449013481e-05, "loss": 1.9467, "step": 10098 }, { "epoch": 0.9482065403869333, "grad_norm": 0.4446695148944855, "learning_rate": 3.100635422579212e-05, "loss": 1.9421, "step": 10104 }, { "epoch": 0.9487696084037901, "grad_norm": 0.42058244347572327, "learning_rate": 3.099715216171477e-05, "loss": 1.9249, "step": 10110 }, { "epoch": 0.949332676420647, "grad_norm": 0.39542973041534424, "learning_rate": 3.09879582857416e-05, "loss": 1.9262, "step": 10116 }, { "epoch": 0.9498957444375038, "grad_norm": 0.4077974259853363, "learning_rate": 3.097877258573669e-05, "loss": 1.9829, "step": 10122 }, { "epoch": 0.9504588124543607, "grad_norm": 0.35841768980026245, "learning_rate": 3.096959504958929e-05, "loss": 2.0058, "step": 10128 }, { "epoch": 0.9510218804712175, "grad_norm": 0.4319620728492737, "learning_rate": 3.096042566521374e-05, "loss": 1.9384, "step": 10134 }, { "epoch": 0.9515849484880744, "grad_norm": 0.3852229118347168, "learning_rate": 3.095126442054944e-05, "loss": 1.9388, "step": 10140 }, { "epoch": 0.9521480165049312, "grad_norm": 0.36925965547561646, "learning_rate": 3.094211130356071e-05, "loss": 1.9638, "step": 10146 }, { "epoch": 0.9527110845217881, "grad_norm": 0.3909997045993805, "learning_rate": 3.093296630223683e-05, "loss": 1.9331, "step": 10152 }, { "epoch": 0.9532741525386449, "grad_norm": 0.3804713785648346, "learning_rate": 3.0923829404591856e-05, "loss": 1.9893, "step": 10158 }, { "epoch": 0.9538372205555018, "grad_norm": 0.41485777497291565, "learning_rate": 3.0914700598664656e-05, "loss": 1.9356, "step": 10164 }, { "epoch": 0.9544002885723586, "grad_norm": 0.4173021912574768, "learning_rate": 3.0905579872518766e-05, "loss": 1.9127, "step": 10170 }, { "epoch": 0.9549633565892155, "grad_norm": 0.3770638704299927, "learning_rate": 3.089646721424239e-05, "loss": 1.9391, "step": 10176 }, { "epoch": 0.9555264246060723, "grad_norm": 0.4797912538051605, "learning_rate": 3.088736261194827e-05, "loss": 1.9866, "step": 10182 }, { "epoch": 0.9560894926229292, "grad_norm": 0.37178194522857666, "learning_rate": 3.087826605377369e-05, "loss": 1.9605, "step": 10188 }, { "epoch": 0.956652560639786, "grad_norm": 0.38409048318862915, "learning_rate": 3.086917752788034e-05, "loss": 1.9647, "step": 10194 }, { "epoch": 0.9572156286566429, "grad_norm": 0.41321849822998047, "learning_rate": 3.086009702245432e-05, "loss": 2.0029, "step": 10200 }, { "epoch": 0.9577786966734997, "grad_norm": 0.38576197624206543, "learning_rate": 3.085102452570602e-05, "loss": 1.9636, "step": 10206 }, { "epoch": 0.9583417646903566, "grad_norm": 0.43905162811279297, "learning_rate": 3.084196002587008e-05, "loss": 1.975, "step": 10212 }, { "epoch": 0.9589048327072134, "grad_norm": 0.4242101013660431, "learning_rate": 3.0832903511205325e-05, "loss": 1.9514, "step": 10218 }, { "epoch": 0.9594679007240703, "grad_norm": 0.38106513023376465, "learning_rate": 3.082385496999471e-05, "loss": 1.9755, "step": 10224 }, { "epoch": 0.9600309687409271, "grad_norm": 0.44776099920272827, "learning_rate": 3.0814814390545235e-05, "loss": 1.989, "step": 10230 }, { "epoch": 0.960594036757784, "grad_norm": 0.37672579288482666, "learning_rate": 3.080578176118789e-05, "loss": 1.9297, "step": 10236 }, { "epoch": 0.9611571047746408, "grad_norm": 0.3746655583381653, "learning_rate": 3.079675707027762e-05, "loss": 1.9233, "step": 10242 }, { "epoch": 0.9617201727914977, "grad_norm": 0.41289401054382324, "learning_rate": 3.078774030619319e-05, "loss": 1.9654, "step": 10248 }, { "epoch": 0.9622832408083545, "grad_norm": 0.4446800947189331, "learning_rate": 3.07787314573372e-05, "loss": 1.9407, "step": 10254 }, { "epoch": 0.9628463088252114, "grad_norm": 0.411948025226593, "learning_rate": 3.0769730512136015e-05, "loss": 1.9605, "step": 10260 }, { "epoch": 0.9634093768420682, "grad_norm": 0.3884698748588562, "learning_rate": 3.076073745903961e-05, "loss": 1.949, "step": 10266 }, { "epoch": 0.9639724448589251, "grad_norm": 0.419141560792923, "learning_rate": 3.0751752286521635e-05, "loss": 1.9729, "step": 10272 }, { "epoch": 0.9645355128757819, "grad_norm": 0.3598424196243286, "learning_rate": 3.074277498307928e-05, "loss": 1.9646, "step": 10278 }, { "epoch": 0.9650985808926388, "grad_norm": 0.3791840672492981, "learning_rate": 3.0733805537233216e-05, "loss": 1.9707, "step": 10284 }, { "epoch": 0.9656616489094956, "grad_norm": 0.3914964497089386, "learning_rate": 3.072484393752754e-05, "loss": 1.9328, "step": 10290 }, { "epoch": 0.9662247169263525, "grad_norm": 0.38034605979919434, "learning_rate": 3.071589017252974e-05, "loss": 1.975, "step": 10296 }, { "epoch": 0.9667877849432093, "grad_norm": 0.39466121792793274, "learning_rate": 3.07069442308306e-05, "loss": 1.9811, "step": 10302 }, { "epoch": 0.9673508529600662, "grad_norm": 0.37380218505859375, "learning_rate": 3.069800610104415e-05, "loss": 1.9459, "step": 10308 }, { "epoch": 0.967913920976923, "grad_norm": 0.3680315315723419, "learning_rate": 3.068907577180761e-05, "loss": 1.934, "step": 10314 }, { "epoch": 0.9684769889937799, "grad_norm": 0.3989509642124176, "learning_rate": 3.068015323178131e-05, "loss": 1.9229, "step": 10320 }, { "epoch": 0.9690400570106367, "grad_norm": 0.3797784745693207, "learning_rate": 3.067123846964868e-05, "loss": 1.9667, "step": 10326 }, { "epoch": 0.9696031250274936, "grad_norm": 0.3958461880683899, "learning_rate": 3.066233147411611e-05, "loss": 1.9744, "step": 10332 }, { "epoch": 0.9701661930443504, "grad_norm": 0.3977872133255005, "learning_rate": 3.065343223391297e-05, "loss": 1.9619, "step": 10338 }, { "epoch": 0.9707292610612073, "grad_norm": 0.40070608258247375, "learning_rate": 3.064454073779149e-05, "loss": 1.9502, "step": 10344 }, { "epoch": 0.9712923290780641, "grad_norm": 0.387321799993515, "learning_rate": 3.063565697452675e-05, "loss": 1.9357, "step": 10350 }, { "epoch": 0.971855397094921, "grad_norm": 0.35796454548835754, "learning_rate": 3.0626780932916574e-05, "loss": 1.9552, "step": 10356 }, { "epoch": 0.9724184651117778, "grad_norm": 0.39839133620262146, "learning_rate": 3.061791260178149e-05, "loss": 1.978, "step": 10362 }, { "epoch": 0.9729815331286347, "grad_norm": 0.40770214796066284, "learning_rate": 3.06090519699647e-05, "loss": 1.967, "step": 10368 }, { "epoch": 0.9735446011454915, "grad_norm": 0.3807249367237091, "learning_rate": 3.060019902633197e-05, "loss": 1.9333, "step": 10374 }, { "epoch": 0.9741076691623484, "grad_norm": 0.39396604895591736, "learning_rate": 3.0591353759771584e-05, "loss": 1.9491, "step": 10380 }, { "epoch": 0.9746707371792052, "grad_norm": 0.37708649039268494, "learning_rate": 3.0582516159194345e-05, "loss": 1.9346, "step": 10386 }, { "epoch": 0.9752338051960621, "grad_norm": 0.3898712992668152, "learning_rate": 3.057368621353341e-05, "loss": 1.9644, "step": 10392 }, { "epoch": 0.9757968732129189, "grad_norm": 0.4210917353630066, "learning_rate": 3.056486391174433e-05, "loss": 1.9099, "step": 10398 }, { "epoch": 0.9759845625518712, "eval_accuracy": 0.5869900351699883, "eval_loss": 1.9850906133651733, "eval_runtime": 88.1227, "eval_samples_per_second": 4.539, "eval_steps_per_second": 1.135, "step": 10400 }, { "epoch": 0.9763599412297758, "grad_norm": 0.43469569087028503, "learning_rate": 3.0556049242804926e-05, "loss": 1.9279, "step": 10404 }, { "epoch": 0.9769230092466326, "grad_norm": 0.36712446808815, "learning_rate": 3.05472421957153e-05, "loss": 1.8801, "step": 10410 }, { "epoch": 0.9774860772634895, "grad_norm": 0.4147302210330963, "learning_rate": 3.053844275949766e-05, "loss": 1.9819, "step": 10416 }, { "epoch": 0.9780491452803463, "grad_norm": 0.38622787594795227, "learning_rate": 3.052965092319642e-05, "loss": 1.9404, "step": 10422 }, { "epoch": 0.9786122132972032, "grad_norm": 0.4137551486492157, "learning_rate": 3.0520866675878e-05, "loss": 1.9515, "step": 10428 }, { "epoch": 0.97917528131406, "grad_norm": 0.39437612891197205, "learning_rate": 3.0512090006630867e-05, "loss": 1.9199, "step": 10434 }, { "epoch": 0.9797383493309169, "grad_norm": 0.3909685015678406, "learning_rate": 3.05033209045654e-05, "loss": 1.958, "step": 10440 }, { "epoch": 0.9803014173477737, "grad_norm": 0.3557184934616089, "learning_rate": 3.0494559358813913e-05, "loss": 1.9473, "step": 10446 }, { "epoch": 0.9808644853646306, "grad_norm": 0.4142801761627197, "learning_rate": 3.0485805358530533e-05, "loss": 1.928, "step": 10452 }, { "epoch": 0.9814275533814873, "grad_norm": 0.3811998665332794, "learning_rate": 3.0477058892891184e-05, "loss": 2.0048, "step": 10458 }, { "epoch": 0.9819906213983443, "grad_norm": 0.3731052875518799, "learning_rate": 3.0468319951093496e-05, "loss": 1.9509, "step": 10464 }, { "epoch": 0.982553689415201, "grad_norm": 0.37513041496276855, "learning_rate": 3.0459588522356785e-05, "loss": 1.9267, "step": 10470 }, { "epoch": 0.983116757432058, "grad_norm": 0.448738157749176, "learning_rate": 3.0450864595921988e-05, "loss": 1.9057, "step": 10476 }, { "epoch": 0.9836798254489147, "grad_norm": 0.4294474422931671, "learning_rate": 3.044214816105158e-05, "loss": 2.0175, "step": 10482 }, { "epoch": 0.9842428934657717, "grad_norm": 0.40496277809143066, "learning_rate": 3.043343920702955e-05, "loss": 1.9308, "step": 10488 }, { "epoch": 0.9848059614826284, "grad_norm": 0.37892308831214905, "learning_rate": 3.0424737723161333e-05, "loss": 1.9018, "step": 10494 }, { "epoch": 0.9853690294994853, "grad_norm": 0.41418689489364624, "learning_rate": 3.0416043698773747e-05, "loss": 1.9086, "step": 10500 }, { "epoch": 0.9859320975163421, "grad_norm": 0.38455674052238464, "learning_rate": 3.040735712321497e-05, "loss": 1.9557, "step": 10506 }, { "epoch": 0.986495165533199, "grad_norm": 0.38525018095970154, "learning_rate": 3.0398677985854442e-05, "loss": 1.9609, "step": 10512 }, { "epoch": 0.9870582335500558, "grad_norm": 0.39015087485313416, "learning_rate": 3.039000627608283e-05, "loss": 1.9324, "step": 10518 }, { "epoch": 0.9876213015669127, "grad_norm": 0.37166082859039307, "learning_rate": 3.038134198331199e-05, "loss": 1.9303, "step": 10524 }, { "epoch": 0.9881843695837695, "grad_norm": 0.3551478981971741, "learning_rate": 3.037268509697488e-05, "loss": 1.9447, "step": 10530 }, { "epoch": 0.9887474376006264, "grad_norm": 0.39377501606941223, "learning_rate": 3.036403560652554e-05, "loss": 1.9156, "step": 10536 }, { "epoch": 0.9893105056174832, "grad_norm": 0.3845690190792084, "learning_rate": 3.0355393501439007e-05, "loss": 1.934, "step": 10542 }, { "epoch": 0.9898735736343401, "grad_norm": 0.4104871451854706, "learning_rate": 3.034675877121128e-05, "loss": 1.987, "step": 10548 }, { "epoch": 0.9904366416511969, "grad_norm": 0.42940935492515564, "learning_rate": 3.0338131405359264e-05, "loss": 2.0361, "step": 10554 }, { "epoch": 0.9909997096680538, "grad_norm": 0.4018247723579407, "learning_rate": 3.0329511393420704e-05, "loss": 1.884, "step": 10560 }, { "epoch": 0.9915627776849106, "grad_norm": 0.4772709012031555, "learning_rate": 3.0320898724954162e-05, "loss": 1.9679, "step": 10566 }, { "epoch": 0.9921258457017675, "grad_norm": 0.4571984112262726, "learning_rate": 3.0312293389538914e-05, "loss": 1.9726, "step": 10572 }, { "epoch": 0.9926889137186243, "grad_norm": 0.36707180738449097, "learning_rate": 3.030369537677496e-05, "loss": 1.934, "step": 10578 }, { "epoch": 0.9932519817354812, "grad_norm": 0.3697710335254669, "learning_rate": 3.0295104676282914e-05, "loss": 1.9738, "step": 10584 }, { "epoch": 0.993815049752338, "grad_norm": 0.37338554859161377, "learning_rate": 3.028652127770398e-05, "loss": 1.9409, "step": 10590 }, { "epoch": 0.9943781177691949, "grad_norm": 0.37625664472579956, "learning_rate": 3.02779451706999e-05, "loss": 1.9477, "step": 10596 }, { "epoch": 0.9949411857860517, "grad_norm": 0.4084542989730835, "learning_rate": 3.0269376344952893e-05, "loss": 1.937, "step": 10602 }, { "epoch": 0.9955042538029086, "grad_norm": 0.40196311473846436, "learning_rate": 3.026081479016561e-05, "loss": 1.9079, "step": 10608 }, { "epoch": 0.9960673218197654, "grad_norm": 0.39217957854270935, "learning_rate": 3.025226049606108e-05, "loss": 1.9792, "step": 10614 }, { "epoch": 0.9966303898366223, "grad_norm": 0.3680386543273926, "learning_rate": 3.024371345238264e-05, "loss": 1.9463, "step": 10620 }, { "epoch": 0.9971934578534791, "grad_norm": 0.40704578161239624, "learning_rate": 3.0235173648893914e-05, "loss": 1.9156, "step": 10626 }, { "epoch": 0.997756525870336, "grad_norm": 0.4476427733898163, "learning_rate": 3.0226641075378758e-05, "loss": 1.9733, "step": 10632 }, { "epoch": 0.9983195938871928, "grad_norm": 0.39435121417045593, "learning_rate": 3.0218115721641177e-05, "loss": 1.9179, "step": 10638 }, { "epoch": 0.9988826619040497, "grad_norm": 0.39758792519569397, "learning_rate": 3.02095975775053e-05, "loss": 1.9179, "step": 10644 }, { "epoch": 0.9994457299209065, "grad_norm": 0.4091814160346985, "learning_rate": 3.0201086632815333e-05, "loss": 1.9691, "step": 10650 }, { "epoch": 0.9999149532682873, "step": 10655, "total_flos": 6.861219031857234e+18, "train_loss": 2.0199434388696393, "train_runtime": 95478.6823, "train_samples_per_second": 14.285, "train_steps_per_second": 0.112 } ], "logging_steps": 6, "max_steps": 10655, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "total_flos": 6.861219031857234e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }