diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,165787 @@ +{ + "best_metric": 0.8879019021987915, + "best_model_checkpoint": "saves_323_ml_20241202/llama3-3b/full/pt/checkpoint-231500", + "epoch": 17.939478476500447, + "eval_steps": 500, + "global_step": 231500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007749234763067147, + "grad_norm": 4.636014863613732, + "learning_rate": 3.874767513949163e-11, + "loss": 1.6324, + "step": 10 + }, + { + "epoch": 0.0015498469526134295, + "grad_norm": 4.159714289443761, + "learning_rate": 7.749535027898326e-11, + "loss": 1.6342, + "step": 20 + }, + { + "epoch": 0.002324770428920144, + "grad_norm": 7.325738720698286, + "learning_rate": 1.1624302541847491e-10, + "loss": 1.6082, + "step": 30 + }, + { + "epoch": 0.003099693905226859, + "grad_norm": 3.9846465062419036, + "learning_rate": 1.5499070055796653e-10, + "loss": 1.6145, + "step": 40 + }, + { + "epoch": 0.0038746173815335737, + "grad_norm": 4.041855942340654, + "learning_rate": 1.9373837569745816e-10, + "loss": 1.5946, + "step": 50 + }, + { + "epoch": 0.004649540857840288, + "grad_norm": 4.717724619532811, + "learning_rate": 2.3248605083694982e-10, + "loss": 1.6147, + "step": 60 + }, + { + "epoch": 0.005424464334147003, + "grad_norm": 4.448001769716281, + "learning_rate": 2.7123372597644143e-10, + "loss": 1.6684, + "step": 70 + }, + { + "epoch": 0.006199387810453718, + "grad_norm": 7.33004285043044, + "learning_rate": 3.0998140111593306e-10, + "loss": 1.6147, + "step": 80 + }, + { + "epoch": 0.006974311286760433, + "grad_norm": 4.371426588129316, + "learning_rate": 3.487290762554247e-10, + "loss": 1.6205, + "step": 90 + }, + { + "epoch": 0.0077492347630671475, + "grad_norm": 4.719210208312149, + "learning_rate": 3.874767513949163e-10, + "loss": 1.6336, + "step": 100 + }, + { + "epoch": 0.008524158239373861, + "grad_norm": 4.367122704469949, + "learning_rate": 4.2622442653440796e-10, + "loss": 1.6198, + "step": 110 + }, + { + "epoch": 0.009299081715680576, + "grad_norm": 4.521055926788438, + "learning_rate": 4.6497210167389964e-10, + "loss": 1.6257, + "step": 120 + }, + { + "epoch": 0.010074005191987291, + "grad_norm": 4.017097912673022, + "learning_rate": 5.037197768133913e-10, + "loss": 1.6396, + "step": 130 + }, + { + "epoch": 0.010848928668294006, + "grad_norm": 4.485855250457777, + "learning_rate": 5.424674519528829e-10, + "loss": 1.5985, + "step": 140 + }, + { + "epoch": 0.01162385214460072, + "grad_norm": 4.582100879279215, + "learning_rate": 5.812151270923745e-10, + "loss": 1.6244, + "step": 150 + }, + { + "epoch": 0.012398775620907436, + "grad_norm": 4.3362954987300135, + "learning_rate": 6.199628022318661e-10, + "loss": 1.6121, + "step": 160 + }, + { + "epoch": 0.01317369909721415, + "grad_norm": 4.522883465665204, + "learning_rate": 6.587104773713578e-10, + "loss": 1.6115, + "step": 170 + }, + { + "epoch": 0.013948622573520865, + "grad_norm": 5.256178016741185, + "learning_rate": 6.974581525108494e-10, + "loss": 1.6158, + "step": 180 + }, + { + "epoch": 0.01472354604982758, + "grad_norm": 4.242896388478094, + "learning_rate": 7.36205827650341e-10, + "loss": 1.6174, + "step": 190 + }, + { + "epoch": 0.015498469526134295, + "grad_norm": 4.198955182056819, + "learning_rate": 7.749535027898326e-10, + "loss": 1.6514, + "step": 200 + }, + { + "epoch": 0.01627339300244101, + "grad_norm": 4.154810975335848, + "learning_rate": 8.137011779293242e-10, + "loss": 1.606, + "step": 210 + }, + { + "epoch": 0.017048316478747723, + "grad_norm": 4.801961339762207, + "learning_rate": 8.524488530688159e-10, + "loss": 1.6, + "step": 220 + }, + { + "epoch": 0.01782323995505444, + "grad_norm": 4.7176204059756355, + "learning_rate": 8.911965282083075e-10, + "loss": 1.6093, + "step": 230 + }, + { + "epoch": 0.018598163431361153, + "grad_norm": 4.497417048853554, + "learning_rate": 9.299442033477993e-10, + "loss": 1.6066, + "step": 240 + }, + { + "epoch": 0.01937308690766787, + "grad_norm": 4.33556527884099, + "learning_rate": 9.68691878487291e-10, + "loss": 1.6059, + "step": 250 + }, + { + "epoch": 0.020148010383974582, + "grad_norm": 4.76848334362276, + "learning_rate": 1.0074395536267825e-09, + "loss": 1.6129, + "step": 260 + }, + { + "epoch": 0.0209229338602813, + "grad_norm": 4.625384401532896, + "learning_rate": 1.0461872287662741e-09, + "loss": 1.602, + "step": 270 + }, + { + "epoch": 0.021697857336588012, + "grad_norm": 4.6910215624711675, + "learning_rate": 1.0849349039057657e-09, + "loss": 1.6156, + "step": 280 + }, + { + "epoch": 0.022472780812894725, + "grad_norm": 4.184940899019704, + "learning_rate": 1.1236825790452575e-09, + "loss": 1.5934, + "step": 290 + }, + { + "epoch": 0.02324770428920144, + "grad_norm": 4.451654659918566, + "learning_rate": 1.162430254184749e-09, + "loss": 1.6169, + "step": 300 + }, + { + "epoch": 0.024022627765508155, + "grad_norm": 13.364375806120705, + "learning_rate": 1.2011779293242407e-09, + "loss": 1.6204, + "step": 310 + }, + { + "epoch": 0.02479755124181487, + "grad_norm": 4.391502294575251, + "learning_rate": 1.2399256044637322e-09, + "loss": 1.6211, + "step": 320 + }, + { + "epoch": 0.025572474718121584, + "grad_norm": 4.5754700537035475, + "learning_rate": 1.2786732796032238e-09, + "loss": 1.5978, + "step": 330 + }, + { + "epoch": 0.0263473981944283, + "grad_norm": 4.253515581647408, + "learning_rate": 1.3174209547427156e-09, + "loss": 1.6081, + "step": 340 + }, + { + "epoch": 0.027122321670735014, + "grad_norm": 4.311002221715996, + "learning_rate": 1.3561686298822072e-09, + "loss": 1.606, + "step": 350 + }, + { + "epoch": 0.02789724514704173, + "grad_norm": 4.44387913842637, + "learning_rate": 1.3949163050216988e-09, + "loss": 1.6172, + "step": 360 + }, + { + "epoch": 0.028672168623348444, + "grad_norm": 4.387490514812309, + "learning_rate": 1.4336639801611903e-09, + "loss": 1.6287, + "step": 370 + }, + { + "epoch": 0.02944709209965516, + "grad_norm": 4.279441369736356, + "learning_rate": 1.472411655300682e-09, + "loss": 1.6224, + "step": 380 + }, + { + "epoch": 0.030222015575961873, + "grad_norm": 4.481168685328298, + "learning_rate": 1.5111593304401737e-09, + "loss": 1.592, + "step": 390 + }, + { + "epoch": 0.03099693905226859, + "grad_norm": 4.3218448830188265, + "learning_rate": 1.5499070055796653e-09, + "loss": 1.6158, + "step": 400 + }, + { + "epoch": 0.031771862528575306, + "grad_norm": 4.190621374322848, + "learning_rate": 1.5886546807191569e-09, + "loss": 1.5707, + "step": 410 + }, + { + "epoch": 0.03254678600488202, + "grad_norm": 4.710311529399391, + "learning_rate": 1.6274023558586485e-09, + "loss": 1.6134, + "step": 420 + }, + { + "epoch": 0.03332170948118873, + "grad_norm": 3.940116567578523, + "learning_rate": 1.6661500309981402e-09, + "loss": 1.5934, + "step": 430 + }, + { + "epoch": 0.034096632957495446, + "grad_norm": 4.539155952229941, + "learning_rate": 1.7048977061376318e-09, + "loss": 1.6021, + "step": 440 + }, + { + "epoch": 0.03487155643380216, + "grad_norm": 4.536232935543118, + "learning_rate": 1.7436453812771234e-09, + "loss": 1.6073, + "step": 450 + }, + { + "epoch": 0.03564647991010888, + "grad_norm": 4.661521726616842, + "learning_rate": 1.782393056416615e-09, + "loss": 1.6156, + "step": 460 + }, + { + "epoch": 0.03642140338641559, + "grad_norm": 4.265058167442682, + "learning_rate": 1.8211407315561066e-09, + "loss": 1.5894, + "step": 470 + }, + { + "epoch": 0.037196326862722305, + "grad_norm": 5.774690862175508, + "learning_rate": 1.8598884066955986e-09, + "loss": 1.6035, + "step": 480 + }, + { + "epoch": 0.03797125033902902, + "grad_norm": 4.790647823945311, + "learning_rate": 1.89863608183509e-09, + "loss": 1.6181, + "step": 490 + }, + { + "epoch": 0.03874617381533574, + "grad_norm": 4.644410075391135, + "learning_rate": 1.937383756974582e-09, + "loss": 1.6431, + "step": 500 + }, + { + "epoch": 0.03874617381533574, + "eval_loss": 1.6119384765625, + "eval_runtime": 316.6461, + "eval_samples_per_second": 36.227, + "eval_steps_per_second": 9.057, + "step": 500 + }, + { + "epoch": 0.03952109729164245, + "grad_norm": 4.789316355486449, + "learning_rate": 1.9761314321140733e-09, + "loss": 1.614, + "step": 510 + }, + { + "epoch": 0.040296020767949164, + "grad_norm": 4.534942367622371, + "learning_rate": 2.014879107253565e-09, + "loss": 1.6194, + "step": 520 + }, + { + "epoch": 0.04107094424425588, + "grad_norm": 4.247318986756639, + "learning_rate": 2.0536267823930565e-09, + "loss": 1.6222, + "step": 530 + }, + { + "epoch": 0.0418458677205626, + "grad_norm": 3.967613310912756, + "learning_rate": 2.0923744575325483e-09, + "loss": 1.6223, + "step": 540 + }, + { + "epoch": 0.04262079119686931, + "grad_norm": 4.106392277909388, + "learning_rate": 2.13112213267204e-09, + "loss": 1.6191, + "step": 550 + }, + { + "epoch": 0.043395714673176024, + "grad_norm": 4.614681698163825, + "learning_rate": 2.1698698078115314e-09, + "loss": 1.6037, + "step": 560 + }, + { + "epoch": 0.04417063814948274, + "grad_norm": 4.034678190536517, + "learning_rate": 2.208617482951023e-09, + "loss": 1.5993, + "step": 570 + }, + { + "epoch": 0.04494556162578945, + "grad_norm": 4.1883827945820835, + "learning_rate": 2.247365158090515e-09, + "loss": 1.6173, + "step": 580 + }, + { + "epoch": 0.04572048510209617, + "grad_norm": 4.0277051405027215, + "learning_rate": 2.2861128332300064e-09, + "loss": 1.6157, + "step": 590 + }, + { + "epoch": 0.04649540857840288, + "grad_norm": 4.815641306291401, + "learning_rate": 2.324860508369498e-09, + "loss": 1.6296, + "step": 600 + }, + { + "epoch": 0.047270332054709596, + "grad_norm": 4.4942359136030365, + "learning_rate": 2.3636081835089895e-09, + "loss": 1.6386, + "step": 610 + }, + { + "epoch": 0.04804525553101631, + "grad_norm": 5.01014297684587, + "learning_rate": 2.4023558586484813e-09, + "loss": 1.6158, + "step": 620 + }, + { + "epoch": 0.04882017900732303, + "grad_norm": 4.253635936648828, + "learning_rate": 2.441103533787973e-09, + "loss": 1.6201, + "step": 630 + }, + { + "epoch": 0.04959510248362974, + "grad_norm": 4.496778312318948, + "learning_rate": 2.4798512089274645e-09, + "loss": 1.6088, + "step": 640 + }, + { + "epoch": 0.050370025959936456, + "grad_norm": 4.4487905828882335, + "learning_rate": 2.5185988840669563e-09, + "loss": 1.6051, + "step": 650 + }, + { + "epoch": 0.05114494943624317, + "grad_norm": 5.5510645649069685, + "learning_rate": 2.5573465592064476e-09, + "loss": 1.6405, + "step": 660 + }, + { + "epoch": 0.05191987291254989, + "grad_norm": 4.661427880531987, + "learning_rate": 2.5960942343459394e-09, + "loss": 1.6107, + "step": 670 + }, + { + "epoch": 0.0526947963888566, + "grad_norm": 4.507194608213907, + "learning_rate": 2.6348419094854312e-09, + "loss": 1.5819, + "step": 680 + }, + { + "epoch": 0.053469719865163315, + "grad_norm": 4.846814759964465, + "learning_rate": 2.6735895846249226e-09, + "loss": 1.6156, + "step": 690 + }, + { + "epoch": 0.05424464334147003, + "grad_norm": 4.2069344701467575, + "learning_rate": 2.7123372597644144e-09, + "loss": 1.5977, + "step": 700 + }, + { + "epoch": 0.05501956681777675, + "grad_norm": 4.346818015428174, + "learning_rate": 2.7510849349039057e-09, + "loss": 1.6361, + "step": 710 + }, + { + "epoch": 0.05579449029408346, + "grad_norm": 4.511203905565294, + "learning_rate": 2.7898326100433975e-09, + "loss": 1.6099, + "step": 720 + }, + { + "epoch": 0.056569413770390174, + "grad_norm": 4.407683330250345, + "learning_rate": 2.8285802851828893e-09, + "loss": 1.6299, + "step": 730 + }, + { + "epoch": 0.05734433724669689, + "grad_norm": 4.270096388615882, + "learning_rate": 2.8673279603223807e-09, + "loss": 1.6253, + "step": 740 + }, + { + "epoch": 0.0581192607230036, + "grad_norm": 4.551645952913065, + "learning_rate": 2.9060756354618725e-09, + "loss": 1.6093, + "step": 750 + }, + { + "epoch": 0.05889418419931032, + "grad_norm": 4.407422347461121, + "learning_rate": 2.944823310601364e-09, + "loss": 1.6041, + "step": 760 + }, + { + "epoch": 0.059669107675617034, + "grad_norm": 4.521172878646271, + "learning_rate": 2.9835709857408556e-09, + "loss": 1.6311, + "step": 770 + }, + { + "epoch": 0.06044403115192375, + "grad_norm": 4.28287221472258, + "learning_rate": 3.0223186608803474e-09, + "loss": 1.5938, + "step": 780 + }, + { + "epoch": 0.06121895462823046, + "grad_norm": 4.911678585381728, + "learning_rate": 3.061066336019839e-09, + "loss": 1.6011, + "step": 790 + }, + { + "epoch": 0.06199387810453718, + "grad_norm": 4.466852150750014, + "learning_rate": 3.0998140111593306e-09, + "loss": 1.5903, + "step": 800 + }, + { + "epoch": 0.06276880158084389, + "grad_norm": 4.0473795245097985, + "learning_rate": 3.1385616862988224e-09, + "loss": 1.632, + "step": 810 + }, + { + "epoch": 0.06354372505715061, + "grad_norm": 4.089687730360117, + "learning_rate": 3.1773093614383138e-09, + "loss": 1.5994, + "step": 820 + }, + { + "epoch": 0.06431864853345733, + "grad_norm": 4.431273007184106, + "learning_rate": 3.2160570365778055e-09, + "loss": 1.5982, + "step": 830 + }, + { + "epoch": 0.06509357200976404, + "grad_norm": 5.301481136690969, + "learning_rate": 3.254804711717297e-09, + "loss": 1.6166, + "step": 840 + }, + { + "epoch": 0.06586849548607075, + "grad_norm": 4.422629320577847, + "learning_rate": 3.2935523868567887e-09, + "loss": 1.5719, + "step": 850 + }, + { + "epoch": 0.06664341896237747, + "grad_norm": 4.373679477165583, + "learning_rate": 3.3323000619962805e-09, + "loss": 1.6028, + "step": 860 + }, + { + "epoch": 0.06741834243868418, + "grad_norm": 4.211265973149877, + "learning_rate": 3.371047737135772e-09, + "loss": 1.6255, + "step": 870 + }, + { + "epoch": 0.06819326591499089, + "grad_norm": 4.419688729459902, + "learning_rate": 3.4097954122752636e-09, + "loss": 1.6106, + "step": 880 + }, + { + "epoch": 0.0689681893912976, + "grad_norm": 4.639028896016265, + "learning_rate": 3.448543087414755e-09, + "loss": 1.5847, + "step": 890 + }, + { + "epoch": 0.06974311286760432, + "grad_norm": 4.7386871225047456, + "learning_rate": 3.487290762554247e-09, + "loss": 1.6151, + "step": 900 + }, + { + "epoch": 0.07051803634391104, + "grad_norm": 3.966034086404883, + "learning_rate": 3.5260384376937386e-09, + "loss": 1.616, + "step": 910 + }, + { + "epoch": 0.07129295982021776, + "grad_norm": 4.743720872743189, + "learning_rate": 3.56478611283323e-09, + "loss": 1.6256, + "step": 920 + }, + { + "epoch": 0.07206788329652447, + "grad_norm": 5.041660629288632, + "learning_rate": 3.6035337879727218e-09, + "loss": 1.6294, + "step": 930 + }, + { + "epoch": 0.07284280677283118, + "grad_norm": 3.9939239610581603, + "learning_rate": 3.642281463112213e-09, + "loss": 1.6119, + "step": 940 + }, + { + "epoch": 0.0736177302491379, + "grad_norm": 4.816046650389995, + "learning_rate": 3.6810291382517053e-09, + "loss": 1.6134, + "step": 950 + }, + { + "epoch": 0.07439265372544461, + "grad_norm": 4.377946419420401, + "learning_rate": 3.719776813391197e-09, + "loss": 1.6098, + "step": 960 + }, + { + "epoch": 0.07516757720175132, + "grad_norm": 4.255026226596184, + "learning_rate": 3.758524488530689e-09, + "loss": 1.6124, + "step": 970 + }, + { + "epoch": 0.07594250067805804, + "grad_norm": 8.569535453128996, + "learning_rate": 3.79727216367018e-09, + "loss": 1.6305, + "step": 980 + }, + { + "epoch": 0.07671742415436476, + "grad_norm": 4.403310817728333, + "learning_rate": 3.836019838809672e-09, + "loss": 1.6067, + "step": 990 + }, + { + "epoch": 0.07749234763067148, + "grad_norm": 4.716711325703831, + "learning_rate": 3.874767513949164e-09, + "loss": 1.6087, + "step": 1000 + }, + { + "epoch": 0.07749234763067148, + "eval_loss": 1.6101443767547607, + "eval_runtime": 317.3549, + "eval_samples_per_second": 36.146, + "eval_steps_per_second": 9.037, + "step": 1000 + }, + { + "epoch": 0.07826727110697819, + "grad_norm": 4.6431578645206, + "learning_rate": 3.913515189088655e-09, + "loss": 1.6044, + "step": 1010 + }, + { + "epoch": 0.0790421945832849, + "grad_norm": 4.057684589645631, + "learning_rate": 3.952262864228147e-09, + "loss": 1.6297, + "step": 1020 + }, + { + "epoch": 0.07981711805959162, + "grad_norm": 4.708960365351832, + "learning_rate": 3.991010539367639e-09, + "loss": 1.6395, + "step": 1030 + }, + { + "epoch": 0.08059204153589833, + "grad_norm": 4.802761188912857, + "learning_rate": 4.02975821450713e-09, + "loss": 1.648, + "step": 1040 + }, + { + "epoch": 0.08136696501220504, + "grad_norm": 4.236912552791146, + "learning_rate": 4.0685058896466216e-09, + "loss": 1.6086, + "step": 1050 + }, + { + "epoch": 0.08214188848851176, + "grad_norm": 4.6958312979448635, + "learning_rate": 4.107253564786113e-09, + "loss": 1.6526, + "step": 1060 + }, + { + "epoch": 0.08291681196481847, + "grad_norm": 4.598828646577646, + "learning_rate": 4.146001239925605e-09, + "loss": 1.6113, + "step": 1070 + }, + { + "epoch": 0.0836917354411252, + "grad_norm": 4.462194487526267, + "learning_rate": 4.1847489150650965e-09, + "loss": 1.6328, + "step": 1080 + }, + { + "epoch": 0.08446665891743191, + "grad_norm": 4.3475847029782395, + "learning_rate": 4.223496590204588e-09, + "loss": 1.6145, + "step": 1090 + }, + { + "epoch": 0.08524158239373862, + "grad_norm": 4.115128534729173, + "learning_rate": 4.26224426534408e-09, + "loss": 1.5877, + "step": 1100 + }, + { + "epoch": 0.08601650587004533, + "grad_norm": 4.134270747948412, + "learning_rate": 4.3009919404835715e-09, + "loss": 1.6217, + "step": 1110 + }, + { + "epoch": 0.08679142934635205, + "grad_norm": 4.1784812222145105, + "learning_rate": 4.339739615623063e-09, + "loss": 1.6297, + "step": 1120 + }, + { + "epoch": 0.08756635282265876, + "grad_norm": 3.683280899432023, + "learning_rate": 4.378487290762555e-09, + "loss": 1.5789, + "step": 1130 + }, + { + "epoch": 0.08834127629896547, + "grad_norm": 3.9690510527906597, + "learning_rate": 4.417234965902046e-09, + "loss": 1.6132, + "step": 1140 + }, + { + "epoch": 0.08911619977527219, + "grad_norm": 4.299954495981797, + "learning_rate": 4.455982641041538e-09, + "loss": 1.5925, + "step": 1150 + }, + { + "epoch": 0.0898911232515789, + "grad_norm": 4.208475034221307, + "learning_rate": 4.49473031618103e-09, + "loss": 1.6307, + "step": 1160 + }, + { + "epoch": 0.09066604672788563, + "grad_norm": 4.585758743637265, + "learning_rate": 4.533477991320521e-09, + "loss": 1.6098, + "step": 1170 + }, + { + "epoch": 0.09144097020419234, + "grad_norm": 4.069018964950438, + "learning_rate": 4.572225666460013e-09, + "loss": 1.583, + "step": 1180 + }, + { + "epoch": 0.09221589368049905, + "grad_norm": 4.281635949952725, + "learning_rate": 4.610973341599504e-09, + "loss": 1.5793, + "step": 1190 + }, + { + "epoch": 0.09299081715680577, + "grad_norm": 4.004756813982838, + "learning_rate": 4.649721016738996e-09, + "loss": 1.5971, + "step": 1200 + }, + { + "epoch": 0.09376574063311248, + "grad_norm": 4.267561505586098, + "learning_rate": 4.688468691878488e-09, + "loss": 1.5748, + "step": 1210 + }, + { + "epoch": 0.09454066410941919, + "grad_norm": 4.3070968552499895, + "learning_rate": 4.727216367017979e-09, + "loss": 1.622, + "step": 1220 + }, + { + "epoch": 0.0953155875857259, + "grad_norm": 3.8006073284703845, + "learning_rate": 4.765964042157471e-09, + "loss": 1.6142, + "step": 1230 + }, + { + "epoch": 0.09609051106203262, + "grad_norm": 4.790441922087329, + "learning_rate": 4.804711717296963e-09, + "loss": 1.5806, + "step": 1240 + }, + { + "epoch": 0.09686543453833935, + "grad_norm": 4.3902241745431665, + "learning_rate": 4.843459392436454e-09, + "loss": 1.623, + "step": 1250 + }, + { + "epoch": 0.09764035801464606, + "grad_norm": 4.4523724101982545, + "learning_rate": 4.882207067575946e-09, + "loss": 1.5914, + "step": 1260 + }, + { + "epoch": 0.09841528149095277, + "grad_norm": 3.96942622405592, + "learning_rate": 4.9209547427154376e-09, + "loss": 1.6038, + "step": 1270 + }, + { + "epoch": 0.09919020496725948, + "grad_norm": 4.141596396000178, + "learning_rate": 4.959702417854929e-09, + "loss": 1.6385, + "step": 1280 + }, + { + "epoch": 0.0999651284435662, + "grad_norm": 4.228189004959891, + "learning_rate": 4.99845009299442e-09, + "loss": 1.6223, + "step": 1290 + }, + { + "epoch": 0.10074005191987291, + "grad_norm": 4.327615649330958, + "learning_rate": 5.0371977681339125e-09, + "loss": 1.6088, + "step": 1300 + }, + { + "epoch": 0.10151497539617962, + "grad_norm": 4.2639165752635995, + "learning_rate": 5.075945443273404e-09, + "loss": 1.6005, + "step": 1310 + }, + { + "epoch": 0.10228989887248634, + "grad_norm": 4.102411854585701, + "learning_rate": 5.114693118412895e-09, + "loss": 1.6075, + "step": 1320 + }, + { + "epoch": 0.10306482234879305, + "grad_norm": 5.36015617271848, + "learning_rate": 5.1534407935523875e-09, + "loss": 1.6365, + "step": 1330 + }, + { + "epoch": 0.10383974582509978, + "grad_norm": 4.379484886897155, + "learning_rate": 5.192188468691879e-09, + "loss": 1.603, + "step": 1340 + }, + { + "epoch": 0.10461466930140649, + "grad_norm": 4.052430798962801, + "learning_rate": 5.23093614383137e-09, + "loss": 1.6012, + "step": 1350 + }, + { + "epoch": 0.1053895927777132, + "grad_norm": 3.7362991289892107, + "learning_rate": 5.2696838189708624e-09, + "loss": 1.6206, + "step": 1360 + }, + { + "epoch": 0.10616451625401992, + "grad_norm": 4.393573587570734, + "learning_rate": 5.308431494110354e-09, + "loss": 1.5885, + "step": 1370 + }, + { + "epoch": 0.10693943973032663, + "grad_norm": 3.732908394732237, + "learning_rate": 5.347179169249845e-09, + "loss": 1.5961, + "step": 1380 + }, + { + "epoch": 0.10771436320663334, + "grad_norm": 4.384569639671992, + "learning_rate": 5.385926844389337e-09, + "loss": 1.6054, + "step": 1390 + }, + { + "epoch": 0.10848928668294006, + "grad_norm": 4.5640203283538385, + "learning_rate": 5.424674519528829e-09, + "loss": 1.6145, + "step": 1400 + }, + { + "epoch": 0.10926421015924677, + "grad_norm": 3.8632392086017946, + "learning_rate": 5.46342219466832e-09, + "loss": 1.6087, + "step": 1410 + }, + { + "epoch": 0.1100391336355535, + "grad_norm": 4.392700712547041, + "learning_rate": 5.5021698698078115e-09, + "loss": 1.6047, + "step": 1420 + }, + { + "epoch": 0.11081405711186021, + "grad_norm": 3.809744638426645, + "learning_rate": 5.540917544947304e-09, + "loss": 1.5934, + "step": 1430 + }, + { + "epoch": 0.11158898058816692, + "grad_norm": 3.9710762490671723, + "learning_rate": 5.579665220086795e-09, + "loss": 1.5866, + "step": 1440 + }, + { + "epoch": 0.11236390406447364, + "grad_norm": 3.951639927557524, + "learning_rate": 5.6184128952262864e-09, + "loss": 1.6351, + "step": 1450 + }, + { + "epoch": 0.11313882754078035, + "grad_norm": 4.008416897939947, + "learning_rate": 5.657160570365779e-09, + "loss": 1.6049, + "step": 1460 + }, + { + "epoch": 0.11391375101708706, + "grad_norm": 4.122668402940883, + "learning_rate": 5.69590824550527e-09, + "loss": 1.6119, + "step": 1470 + }, + { + "epoch": 0.11468867449339377, + "grad_norm": 4.039409551280387, + "learning_rate": 5.734655920644761e-09, + "loss": 1.5917, + "step": 1480 + }, + { + "epoch": 0.11546359796970049, + "grad_norm": 3.7111209397655274, + "learning_rate": 5.7734035957842536e-09, + "loss": 1.6104, + "step": 1490 + }, + { + "epoch": 0.1162385214460072, + "grad_norm": 3.772382643112357, + "learning_rate": 5.812151270923745e-09, + "loss": 1.6396, + "step": 1500 + }, + { + "epoch": 0.1162385214460072, + "eval_loss": 1.6006035804748535, + "eval_runtime": 319.9765, + "eval_samples_per_second": 35.85, + "eval_steps_per_second": 8.963, + "step": 1500 + }, + { + "epoch": 0.11701344492231393, + "grad_norm": 3.836625066441132, + "learning_rate": 5.850898946063236e-09, + "loss": 1.5886, + "step": 1510 + }, + { + "epoch": 0.11778836839862064, + "grad_norm": 3.7414878972801717, + "learning_rate": 5.889646621202728e-09, + "loss": 1.5951, + "step": 1520 + }, + { + "epoch": 0.11856329187492735, + "grad_norm": 3.6533975578330633, + "learning_rate": 5.92839429634222e-09, + "loss": 1.6173, + "step": 1530 + }, + { + "epoch": 0.11933821535123407, + "grad_norm": 3.5742503125886715, + "learning_rate": 5.967141971481711e-09, + "loss": 1.6161, + "step": 1540 + }, + { + "epoch": 0.12011313882754078, + "grad_norm": 3.694294011619015, + "learning_rate": 6.005889646621203e-09, + "loss": 1.5954, + "step": 1550 + }, + { + "epoch": 0.1208880623038475, + "grad_norm": 3.7489276315651017, + "learning_rate": 6.044637321760695e-09, + "loss": 1.6301, + "step": 1560 + }, + { + "epoch": 0.1216629857801542, + "grad_norm": 3.6477504267148095, + "learning_rate": 6.083384996900186e-09, + "loss": 1.5906, + "step": 1570 + }, + { + "epoch": 0.12243790925646092, + "grad_norm": 3.8281082121738614, + "learning_rate": 6.122132672039678e-09, + "loss": 1.5864, + "step": 1580 + }, + { + "epoch": 0.12321283273276763, + "grad_norm": 3.6378471458472417, + "learning_rate": 6.16088034717917e-09, + "loss": 1.5683, + "step": 1590 + }, + { + "epoch": 0.12398775620907436, + "grad_norm": 3.94277076404456, + "learning_rate": 6.199628022318661e-09, + "loss": 1.6113, + "step": 1600 + }, + { + "epoch": 0.12476267968538107, + "grad_norm": 3.573522368845293, + "learning_rate": 6.2383756974581526e-09, + "loss": 1.581, + "step": 1610 + }, + { + "epoch": 0.12553760316168777, + "grad_norm": 3.8631833184986424, + "learning_rate": 6.277123372597645e-09, + "loss": 1.5753, + "step": 1620 + }, + { + "epoch": 0.1263125266379945, + "grad_norm": 3.5227149736055043, + "learning_rate": 6.315871047737136e-09, + "loss": 1.5877, + "step": 1630 + }, + { + "epoch": 0.12708745011430123, + "grad_norm": 3.4711323358639445, + "learning_rate": 6.3546187228766275e-09, + "loss": 1.5861, + "step": 1640 + }, + { + "epoch": 0.12786237359060793, + "grad_norm": 4.553504522971683, + "learning_rate": 6.393366398016119e-09, + "loss": 1.6177, + "step": 1650 + }, + { + "epoch": 0.12863729706691465, + "grad_norm": 3.8208208158844656, + "learning_rate": 6.432114073155611e-09, + "loss": 1.5651, + "step": 1660 + }, + { + "epoch": 0.12941222054322135, + "grad_norm": 3.6688229881834493, + "learning_rate": 6.4708617482951025e-09, + "loss": 1.6125, + "step": 1670 + }, + { + "epoch": 0.13018714401952808, + "grad_norm": 3.5778519475154735, + "learning_rate": 6.509609423434594e-09, + "loss": 1.5994, + "step": 1680 + }, + { + "epoch": 0.13096206749583478, + "grad_norm": 3.3321250054514353, + "learning_rate": 6.548357098574086e-09, + "loss": 1.5892, + "step": 1690 + }, + { + "epoch": 0.1317369909721415, + "grad_norm": 3.544761127807508, + "learning_rate": 6.587104773713577e-09, + "loss": 1.5973, + "step": 1700 + }, + { + "epoch": 0.1325119144484482, + "grad_norm": 3.4020519538212413, + "learning_rate": 6.625852448853069e-09, + "loss": 1.5767, + "step": 1710 + }, + { + "epoch": 0.13328683792475493, + "grad_norm": 3.5119171443232315, + "learning_rate": 6.664600123992561e-09, + "loss": 1.5718, + "step": 1720 + }, + { + "epoch": 0.13406176140106166, + "grad_norm": 3.5169225627473337, + "learning_rate": 6.703347799132052e-09, + "loss": 1.6034, + "step": 1730 + }, + { + "epoch": 0.13483668487736836, + "grad_norm": 3.762520917332192, + "learning_rate": 6.742095474271544e-09, + "loss": 1.5937, + "step": 1740 + }, + { + "epoch": 0.13561160835367508, + "grad_norm": 4.088350531315349, + "learning_rate": 6.780843149411035e-09, + "loss": 1.5893, + "step": 1750 + }, + { + "epoch": 0.13638653182998178, + "grad_norm": 3.403571061752429, + "learning_rate": 6.819590824550527e-09, + "loss": 1.5704, + "step": 1760 + }, + { + "epoch": 0.1371614553062885, + "grad_norm": 3.312195597703196, + "learning_rate": 6.858338499690019e-09, + "loss": 1.5823, + "step": 1770 + }, + { + "epoch": 0.1379363787825952, + "grad_norm": 3.821006607956483, + "learning_rate": 6.89708617482951e-09, + "loss": 1.6186, + "step": 1780 + }, + { + "epoch": 0.13871130225890194, + "grad_norm": 3.797004443866766, + "learning_rate": 6.935833849969002e-09, + "loss": 1.5654, + "step": 1790 + }, + { + "epoch": 0.13948622573520864, + "grad_norm": 3.6275371399393146, + "learning_rate": 6.974581525108494e-09, + "loss": 1.6016, + "step": 1800 + }, + { + "epoch": 0.14026114921151536, + "grad_norm": 3.804526514738555, + "learning_rate": 7.013329200247985e-09, + "loss": 1.5674, + "step": 1810 + }, + { + "epoch": 0.1410360726878221, + "grad_norm": 3.3146429378100186, + "learning_rate": 7.052076875387477e-09, + "loss": 1.586, + "step": 1820 + }, + { + "epoch": 0.1418109961641288, + "grad_norm": 3.6689913050315175, + "learning_rate": 7.0908245505269686e-09, + "loss": 1.5658, + "step": 1830 + }, + { + "epoch": 0.14258591964043552, + "grad_norm": 3.551502095643226, + "learning_rate": 7.12957222566646e-09, + "loss": 1.6024, + "step": 1840 + }, + { + "epoch": 0.14336084311674221, + "grad_norm": 3.385947413621861, + "learning_rate": 7.168319900805951e-09, + "loss": 1.6017, + "step": 1850 + }, + { + "epoch": 0.14413576659304894, + "grad_norm": 3.3341898322304053, + "learning_rate": 7.2070675759454435e-09, + "loss": 1.5936, + "step": 1860 + }, + { + "epoch": 0.14491069006935564, + "grad_norm": 3.9664144987419325, + "learning_rate": 7.245815251084935e-09, + "loss": 1.6126, + "step": 1870 + }, + { + "epoch": 0.14568561354566237, + "grad_norm": 3.5808559948554985, + "learning_rate": 7.284562926224426e-09, + "loss": 1.5856, + "step": 1880 + }, + { + "epoch": 0.14646053702196907, + "grad_norm": 3.6364840023746763, + "learning_rate": 7.3233106013639185e-09, + "loss": 1.5729, + "step": 1890 + }, + { + "epoch": 0.1472354604982758, + "grad_norm": 3.4535193607381136, + "learning_rate": 7.362058276503411e-09, + "loss": 1.5536, + "step": 1900 + }, + { + "epoch": 0.14801038397458252, + "grad_norm": 3.406541094258975, + "learning_rate": 7.400805951642903e-09, + "loss": 1.6137, + "step": 1910 + }, + { + "epoch": 0.14878530745088922, + "grad_norm": 3.736148076184637, + "learning_rate": 7.439553626782394e-09, + "loss": 1.5863, + "step": 1920 + }, + { + "epoch": 0.14956023092719595, + "grad_norm": 3.3301698115972007, + "learning_rate": 7.478301301921886e-09, + "loss": 1.5821, + "step": 1930 + }, + { + "epoch": 0.15033515440350265, + "grad_norm": 3.5339852824887257, + "learning_rate": 7.517048977061378e-09, + "loss": 1.5733, + "step": 1940 + }, + { + "epoch": 0.15111007787980937, + "grad_norm": 3.967364866913959, + "learning_rate": 7.555796652200868e-09, + "loss": 1.5685, + "step": 1950 + }, + { + "epoch": 0.15188500135611607, + "grad_norm": 3.602027663616697, + "learning_rate": 7.59454432734036e-09, + "loss": 1.5726, + "step": 1960 + }, + { + "epoch": 0.1526599248324228, + "grad_norm": 3.0632689962810766, + "learning_rate": 7.633292002479853e-09, + "loss": 1.5567, + "step": 1970 + }, + { + "epoch": 0.15343484830872953, + "grad_norm": 3.333492192901838, + "learning_rate": 7.672039677619343e-09, + "loss": 1.5916, + "step": 1980 + }, + { + "epoch": 0.15420977178503623, + "grad_norm": 3.671588021004419, + "learning_rate": 7.710787352758836e-09, + "loss": 1.5895, + "step": 1990 + }, + { + "epoch": 0.15498469526134295, + "grad_norm": 3.329363735660189, + "learning_rate": 7.749535027898328e-09, + "loss": 1.586, + "step": 2000 + }, + { + "epoch": 0.15498469526134295, + "eval_loss": 1.5870592594146729, + "eval_runtime": 318.1783, + "eval_samples_per_second": 36.052, + "eval_steps_per_second": 9.014, + "step": 2000 + }, + { + "epoch": 0.15575961873764965, + "grad_norm": 3.648564941684134, + "learning_rate": 7.788282703037818e-09, + "loss": 1.5966, + "step": 2010 + }, + { + "epoch": 0.15653454221395638, + "grad_norm": 3.290098016149067, + "learning_rate": 7.82703037817731e-09, + "loss": 1.5801, + "step": 2020 + }, + { + "epoch": 0.15730946569026308, + "grad_norm": 3.234762852128186, + "learning_rate": 7.865778053316803e-09, + "loss": 1.5946, + "step": 2030 + }, + { + "epoch": 0.1580843891665698, + "grad_norm": 3.296704063943758, + "learning_rate": 7.904525728456293e-09, + "loss": 1.5912, + "step": 2040 + }, + { + "epoch": 0.1588593126428765, + "grad_norm": 3.7932973149243705, + "learning_rate": 7.943273403595785e-09, + "loss": 1.5928, + "step": 2050 + }, + { + "epoch": 0.15963423611918323, + "grad_norm": 3.285882816624262, + "learning_rate": 7.982021078735278e-09, + "loss": 1.5565, + "step": 2060 + }, + { + "epoch": 0.16040915959548996, + "grad_norm": 3.4076587044543736, + "learning_rate": 8.020768753874768e-09, + "loss": 1.5708, + "step": 2070 + }, + { + "epoch": 0.16118408307179666, + "grad_norm": 20.995879520711295, + "learning_rate": 8.05951642901426e-09, + "loss": 1.5658, + "step": 2080 + }, + { + "epoch": 0.16195900654810338, + "grad_norm": 3.1600427225593415, + "learning_rate": 8.098264104153753e-09, + "loss": 1.5736, + "step": 2090 + }, + { + "epoch": 0.16273393002441008, + "grad_norm": 3.223874287313102, + "learning_rate": 8.137011779293243e-09, + "loss": 1.5926, + "step": 2100 + }, + { + "epoch": 0.1635088535007168, + "grad_norm": 3.0103817400707937, + "learning_rate": 8.175759454432735e-09, + "loss": 1.6165, + "step": 2110 + }, + { + "epoch": 0.1642837769770235, + "grad_norm": 3.2886627725196678, + "learning_rate": 8.214507129572226e-09, + "loss": 1.567, + "step": 2120 + }, + { + "epoch": 0.16505870045333024, + "grad_norm": 3.019104207448854, + "learning_rate": 8.253254804711718e-09, + "loss": 1.5518, + "step": 2130 + }, + { + "epoch": 0.16583362392963694, + "grad_norm": 3.2900896294818236, + "learning_rate": 8.29200247985121e-09, + "loss": 1.5799, + "step": 2140 + }, + { + "epoch": 0.16660854740594366, + "grad_norm": 2.894458578245081, + "learning_rate": 8.3307501549907e-09, + "loss": 1.6026, + "step": 2150 + }, + { + "epoch": 0.1673834708822504, + "grad_norm": 3.016227238581772, + "learning_rate": 8.369497830130193e-09, + "loss": 1.5865, + "step": 2160 + }, + { + "epoch": 0.1681583943585571, + "grad_norm": 3.1640762593210767, + "learning_rate": 8.408245505269685e-09, + "loss": 1.5769, + "step": 2170 + }, + { + "epoch": 0.16893331783486382, + "grad_norm": 3.113753904648018, + "learning_rate": 8.446993180409176e-09, + "loss": 1.555, + "step": 2180 + }, + { + "epoch": 0.16970824131117052, + "grad_norm": 3.0332455402611562, + "learning_rate": 8.485740855548668e-09, + "loss": 1.5731, + "step": 2190 + }, + { + "epoch": 0.17048316478747724, + "grad_norm": 3.595360650454923, + "learning_rate": 8.52448853068816e-09, + "loss": 1.5512, + "step": 2200 + }, + { + "epoch": 0.17125808826378394, + "grad_norm": 3.0187509831689097, + "learning_rate": 8.56323620582765e-09, + "loss": 1.571, + "step": 2210 + }, + { + "epoch": 0.17203301174009067, + "grad_norm": 2.911095366901569, + "learning_rate": 8.601983880967143e-09, + "loss": 1.5486, + "step": 2220 + }, + { + "epoch": 0.17280793521639737, + "grad_norm": 2.9655325255426743, + "learning_rate": 8.640731556106635e-09, + "loss": 1.5631, + "step": 2230 + }, + { + "epoch": 0.1735828586927041, + "grad_norm": 2.9257344879587457, + "learning_rate": 8.679479231246126e-09, + "loss": 1.5415, + "step": 2240 + }, + { + "epoch": 0.17435778216901082, + "grad_norm": 2.9301308516365236, + "learning_rate": 8.718226906385618e-09, + "loss": 1.5771, + "step": 2250 + }, + { + "epoch": 0.17513270564531752, + "grad_norm": 2.9606103069299707, + "learning_rate": 8.75697458152511e-09, + "loss": 1.5721, + "step": 2260 + }, + { + "epoch": 0.17590762912162425, + "grad_norm": 12.741497611670667, + "learning_rate": 8.7957222566646e-09, + "loss": 1.5673, + "step": 2270 + }, + { + "epoch": 0.17668255259793095, + "grad_norm": 3.112154083592631, + "learning_rate": 8.834469931804093e-09, + "loss": 1.5667, + "step": 2280 + }, + { + "epoch": 0.17745747607423767, + "grad_norm": 2.9039643145277534, + "learning_rate": 8.873217606943585e-09, + "loss": 1.5724, + "step": 2290 + }, + { + "epoch": 0.17823239955054437, + "grad_norm": 3.3838041508926353, + "learning_rate": 8.911965282083076e-09, + "loss": 1.5312, + "step": 2300 + }, + { + "epoch": 0.1790073230268511, + "grad_norm": 3.049800552618511, + "learning_rate": 8.950712957222568e-09, + "loss": 1.5359, + "step": 2310 + }, + { + "epoch": 0.1797822465031578, + "grad_norm": 3.105858896021684, + "learning_rate": 8.98946063236206e-09, + "loss": 1.5472, + "step": 2320 + }, + { + "epoch": 0.18055716997946453, + "grad_norm": 2.7519231528964627, + "learning_rate": 9.02820830750155e-09, + "loss": 1.5552, + "step": 2330 + }, + { + "epoch": 0.18133209345577125, + "grad_norm": 2.8886938178268027, + "learning_rate": 9.066955982641043e-09, + "loss": 1.5612, + "step": 2340 + }, + { + "epoch": 0.18210701693207795, + "grad_norm": 2.8809160255158726, + "learning_rate": 9.105703657780533e-09, + "loss": 1.572, + "step": 2350 + }, + { + "epoch": 0.18288194040838468, + "grad_norm": 2.7393727262831638, + "learning_rate": 9.144451332920025e-09, + "loss": 1.5855, + "step": 2360 + }, + { + "epoch": 0.18365686388469138, + "grad_norm": 3.1135452151463356, + "learning_rate": 9.183199008059518e-09, + "loss": 1.5401, + "step": 2370 + }, + { + "epoch": 0.1844317873609981, + "grad_norm": 3.0530596692116805, + "learning_rate": 9.221946683199008e-09, + "loss": 1.5545, + "step": 2380 + }, + { + "epoch": 0.1852067108373048, + "grad_norm": 2.9611842648217697, + "learning_rate": 9.2606943583385e-09, + "loss": 1.5449, + "step": 2390 + }, + { + "epoch": 0.18598163431361153, + "grad_norm": 2.938766118413425, + "learning_rate": 9.299442033477993e-09, + "loss": 1.5619, + "step": 2400 + }, + { + "epoch": 0.18675655778991826, + "grad_norm": 2.893764595431157, + "learning_rate": 9.338189708617483e-09, + "loss": 1.591, + "step": 2410 + }, + { + "epoch": 0.18753148126622496, + "grad_norm": 2.693560813384404, + "learning_rate": 9.376937383756975e-09, + "loss": 1.5403, + "step": 2420 + }, + { + "epoch": 0.18830640474253169, + "grad_norm": 2.7869488171825147, + "learning_rate": 9.415685058896468e-09, + "loss": 1.56, + "step": 2430 + }, + { + "epoch": 0.18908132821883838, + "grad_norm": 2.9668482204438105, + "learning_rate": 9.454432734035958e-09, + "loss": 1.5862, + "step": 2440 + }, + { + "epoch": 0.1898562516951451, + "grad_norm": 2.822234373157727, + "learning_rate": 9.49318040917545e-09, + "loss": 1.5596, + "step": 2450 + }, + { + "epoch": 0.1906311751714518, + "grad_norm": 2.7806698744201035, + "learning_rate": 9.531928084314942e-09, + "loss": 1.549, + "step": 2460 + }, + { + "epoch": 0.19140609864775854, + "grad_norm": 3.265646753037088, + "learning_rate": 9.570675759454433e-09, + "loss": 1.5567, + "step": 2470 + }, + { + "epoch": 0.19218102212406524, + "grad_norm": 2.992716400603076, + "learning_rate": 9.609423434593925e-09, + "loss": 1.5763, + "step": 2480 + }, + { + "epoch": 0.19295594560037196, + "grad_norm": 2.696639362943408, + "learning_rate": 9.648171109733417e-09, + "loss": 1.56, + "step": 2490 + }, + { + "epoch": 0.1937308690766787, + "grad_norm": 2.7112377788085196, + "learning_rate": 9.686918784872908e-09, + "loss": 1.5629, + "step": 2500 + }, + { + "epoch": 0.1937308690766787, + "eval_loss": 1.5555708408355713, + "eval_runtime": 317.7064, + "eval_samples_per_second": 36.106, + "eval_steps_per_second": 9.027, + "step": 2500 + }, + { + "epoch": 0.1945057925529854, + "grad_norm": 2.6206773968801196, + "learning_rate": 9.7256664600124e-09, + "loss": 1.5457, + "step": 2510 + }, + { + "epoch": 0.19528071602929212, + "grad_norm": 3.015409358048921, + "learning_rate": 9.764414135151892e-09, + "loss": 1.5517, + "step": 2520 + }, + { + "epoch": 0.19605563950559882, + "grad_norm": 2.761835871978777, + "learning_rate": 9.803161810291383e-09, + "loss": 1.5507, + "step": 2530 + }, + { + "epoch": 0.19683056298190554, + "grad_norm": 2.5920951511774994, + "learning_rate": 9.841909485430875e-09, + "loss": 1.5756, + "step": 2540 + }, + { + "epoch": 0.19760548645821224, + "grad_norm": 2.9001874693935052, + "learning_rate": 9.880657160570367e-09, + "loss": 1.5665, + "step": 2550 + }, + { + "epoch": 0.19838040993451897, + "grad_norm": 2.7568194147075866, + "learning_rate": 9.919404835709858e-09, + "loss": 1.5573, + "step": 2560 + }, + { + "epoch": 0.19915533341082567, + "grad_norm": 2.8014268011942547, + "learning_rate": 9.95815251084935e-09, + "loss": 1.5479, + "step": 2570 + }, + { + "epoch": 0.1999302568871324, + "grad_norm": 3.009869637011403, + "learning_rate": 9.99690018598884e-09, + "loss": 1.536, + "step": 2580 + }, + { + "epoch": 0.20070518036343912, + "grad_norm": 2.6978397814979105, + "learning_rate": 1.0035647861128333e-08, + "loss": 1.5783, + "step": 2590 + }, + { + "epoch": 0.20148010383974582, + "grad_norm": 2.614925202023332, + "learning_rate": 1.0074395536267825e-08, + "loss": 1.5489, + "step": 2600 + }, + { + "epoch": 0.20225502731605255, + "grad_norm": 2.8264999407058977, + "learning_rate": 1.0113143211407316e-08, + "loss": 1.5543, + "step": 2610 + }, + { + "epoch": 0.20302995079235925, + "grad_norm": 2.6475718624773346, + "learning_rate": 1.0151890886546808e-08, + "loss": 1.5311, + "step": 2620 + }, + { + "epoch": 0.20380487426866598, + "grad_norm": 2.974037905491895, + "learning_rate": 1.01906385616863e-08, + "loss": 1.5849, + "step": 2630 + }, + { + "epoch": 0.20457979774497267, + "grad_norm": 2.713429643189924, + "learning_rate": 1.022938623682579e-08, + "loss": 1.5508, + "step": 2640 + }, + { + "epoch": 0.2053547212212794, + "grad_norm": 2.7252402744846096, + "learning_rate": 1.0268133911965283e-08, + "loss": 1.5569, + "step": 2650 + }, + { + "epoch": 0.2061296446975861, + "grad_norm": 2.6806983258245776, + "learning_rate": 1.0306881587104775e-08, + "loss": 1.5575, + "step": 2660 + }, + { + "epoch": 0.20690456817389283, + "grad_norm": 2.7865888691293557, + "learning_rate": 1.0345629262244265e-08, + "loss": 1.5539, + "step": 2670 + }, + { + "epoch": 0.20767949165019955, + "grad_norm": 2.5938280253006143, + "learning_rate": 1.0384376937383758e-08, + "loss": 1.5589, + "step": 2680 + }, + { + "epoch": 0.20845441512650625, + "grad_norm": 2.85375507480992, + "learning_rate": 1.042312461252325e-08, + "loss": 1.5096, + "step": 2690 + }, + { + "epoch": 0.20922933860281298, + "grad_norm": 2.5025947703521156, + "learning_rate": 1.046187228766274e-08, + "loss": 1.5579, + "step": 2700 + }, + { + "epoch": 0.21000426207911968, + "grad_norm": 2.6947161266173834, + "learning_rate": 1.0500619962802233e-08, + "loss": 1.5179, + "step": 2710 + }, + { + "epoch": 0.2107791855554264, + "grad_norm": 2.7078401554948637, + "learning_rate": 1.0539367637941725e-08, + "loss": 1.5474, + "step": 2720 + }, + { + "epoch": 0.2115541090317331, + "grad_norm": 2.6670934237403, + "learning_rate": 1.0578115313081215e-08, + "loss": 1.571, + "step": 2730 + }, + { + "epoch": 0.21232903250803983, + "grad_norm": 2.626079159020465, + "learning_rate": 1.0616862988220708e-08, + "loss": 1.5886, + "step": 2740 + }, + { + "epoch": 0.21310395598434653, + "grad_norm": 2.665132497233358, + "learning_rate": 1.06556106633602e-08, + "loss": 1.5483, + "step": 2750 + }, + { + "epoch": 0.21387887946065326, + "grad_norm": 2.999449540369886, + "learning_rate": 1.069435833849969e-08, + "loss": 1.5512, + "step": 2760 + }, + { + "epoch": 0.21465380293696, + "grad_norm": 2.658261378171169, + "learning_rate": 1.0733106013639183e-08, + "loss": 1.5521, + "step": 2770 + }, + { + "epoch": 0.21542872641326669, + "grad_norm": 4.169999295483339, + "learning_rate": 1.0771853688778675e-08, + "loss": 1.5585, + "step": 2780 + }, + { + "epoch": 0.2162036498895734, + "grad_norm": 2.7587570341525627, + "learning_rate": 1.0810601363918165e-08, + "loss": 1.5253, + "step": 2790 + }, + { + "epoch": 0.2169785733658801, + "grad_norm": 2.7794634622298964, + "learning_rate": 1.0849349039057657e-08, + "loss": 1.5722, + "step": 2800 + }, + { + "epoch": 0.21775349684218684, + "grad_norm": 2.6543300035069057, + "learning_rate": 1.0888096714197148e-08, + "loss": 1.5301, + "step": 2810 + }, + { + "epoch": 0.21852842031849354, + "grad_norm": 2.7129268623118565, + "learning_rate": 1.092684438933664e-08, + "loss": 1.5309, + "step": 2820 + }, + { + "epoch": 0.21930334379480027, + "grad_norm": 3.1047079800680466, + "learning_rate": 1.0965592064476132e-08, + "loss": 1.5383, + "step": 2830 + }, + { + "epoch": 0.220078267271107, + "grad_norm": 2.608976146452637, + "learning_rate": 1.1004339739615623e-08, + "loss": 1.5508, + "step": 2840 + }, + { + "epoch": 0.2208531907474137, + "grad_norm": 2.4272851136885305, + "learning_rate": 1.1043087414755115e-08, + "loss": 1.5552, + "step": 2850 + }, + { + "epoch": 0.22162811422372042, + "grad_norm": 2.8278104509083306, + "learning_rate": 1.1081835089894607e-08, + "loss": 1.5445, + "step": 2860 + }, + { + "epoch": 0.22240303770002712, + "grad_norm": 2.589434545609631, + "learning_rate": 1.1120582765034098e-08, + "loss": 1.5423, + "step": 2870 + }, + { + "epoch": 0.22317796117633384, + "grad_norm": 2.5843114992976584, + "learning_rate": 1.115933044017359e-08, + "loss": 1.5281, + "step": 2880 + }, + { + "epoch": 0.22395288465264054, + "grad_norm": 2.5757615809130865, + "learning_rate": 1.1198078115313082e-08, + "loss": 1.5234, + "step": 2890 + }, + { + "epoch": 0.22472780812894727, + "grad_norm": 5.581812895232684, + "learning_rate": 1.1236825790452573e-08, + "loss": 1.5365, + "step": 2900 + }, + { + "epoch": 0.22550273160525397, + "grad_norm": 2.7034926690197585, + "learning_rate": 1.1275573465592065e-08, + "loss": 1.5079, + "step": 2910 + }, + { + "epoch": 0.2262776550815607, + "grad_norm": 2.662653617538462, + "learning_rate": 1.1314321140731557e-08, + "loss": 1.5483, + "step": 2920 + }, + { + "epoch": 0.22705257855786742, + "grad_norm": 2.4258173653411905, + "learning_rate": 1.1353068815871048e-08, + "loss": 1.582, + "step": 2930 + }, + { + "epoch": 0.22782750203417412, + "grad_norm": 2.5204548918714123, + "learning_rate": 1.139181649101054e-08, + "loss": 1.5328, + "step": 2940 + }, + { + "epoch": 0.22860242551048085, + "grad_norm": 2.6478433846214724, + "learning_rate": 1.1430564166150032e-08, + "loss": 1.5423, + "step": 2950 + }, + { + "epoch": 0.22937734898678755, + "grad_norm": 2.522124948204053, + "learning_rate": 1.1469311841289523e-08, + "loss": 1.5419, + "step": 2960 + }, + { + "epoch": 0.23015227246309428, + "grad_norm": 2.32309785770899, + "learning_rate": 1.1508059516429015e-08, + "loss": 1.5245, + "step": 2970 + }, + { + "epoch": 0.23092719593940098, + "grad_norm": 2.442999983464966, + "learning_rate": 1.1546807191568507e-08, + "loss": 1.5455, + "step": 2980 + }, + { + "epoch": 0.2317021194157077, + "grad_norm": 2.527444691825758, + "learning_rate": 1.1585554866707998e-08, + "loss": 1.5316, + "step": 2990 + }, + { + "epoch": 0.2324770428920144, + "grad_norm": 2.721199957753387, + "learning_rate": 1.162430254184749e-08, + "loss": 1.5223, + "step": 3000 + }, + { + "epoch": 0.2324770428920144, + "eval_loss": 1.527623176574707, + "eval_runtime": 317.8993, + "eval_samples_per_second": 36.084, + "eval_steps_per_second": 9.022, + "step": 3000 + }, + { + "epoch": 0.23325196636832113, + "grad_norm": 2.527534021856482, + "learning_rate": 1.1663050216986982e-08, + "loss": 1.5149, + "step": 3010 + }, + { + "epoch": 0.23402688984462786, + "grad_norm": 2.4608657151041076, + "learning_rate": 1.1701797892126473e-08, + "loss": 1.527, + "step": 3020 + }, + { + "epoch": 0.23480181332093455, + "grad_norm": 2.5499955387372255, + "learning_rate": 1.1740545567265965e-08, + "loss": 1.525, + "step": 3030 + }, + { + "epoch": 0.23557673679724128, + "grad_norm": 2.4753332369432766, + "learning_rate": 1.1779293242405455e-08, + "loss": 1.5377, + "step": 3040 + }, + { + "epoch": 0.23635166027354798, + "grad_norm": 2.4380092224179513, + "learning_rate": 1.1818040917544948e-08, + "loss": 1.5313, + "step": 3050 + }, + { + "epoch": 0.2371265837498547, + "grad_norm": 2.734589072906729, + "learning_rate": 1.185678859268444e-08, + "loss": 1.5343, + "step": 3060 + }, + { + "epoch": 0.2379015072261614, + "grad_norm": 2.389353360196365, + "learning_rate": 1.189553626782393e-08, + "loss": 1.5141, + "step": 3070 + }, + { + "epoch": 0.23867643070246813, + "grad_norm": 2.755460241097715, + "learning_rate": 1.1934283942963423e-08, + "loss": 1.5106, + "step": 3080 + }, + { + "epoch": 0.23945135417877483, + "grad_norm": 2.641717203187733, + "learning_rate": 1.1973031618102915e-08, + "loss": 1.5348, + "step": 3090 + }, + { + "epoch": 0.24022627765508156, + "grad_norm": 2.6540310290754476, + "learning_rate": 1.2011779293242405e-08, + "loss": 1.5064, + "step": 3100 + }, + { + "epoch": 0.2410012011313883, + "grad_norm": 2.4492016328198125, + "learning_rate": 1.2050526968381898e-08, + "loss": 1.5176, + "step": 3110 + }, + { + "epoch": 0.241776124607695, + "grad_norm": 2.4503624112102114, + "learning_rate": 1.208927464352139e-08, + "loss": 1.5, + "step": 3120 + }, + { + "epoch": 0.2425510480840017, + "grad_norm": 2.3738794029218746, + "learning_rate": 1.212802231866088e-08, + "loss": 1.5209, + "step": 3130 + }, + { + "epoch": 0.2433259715603084, + "grad_norm": 2.804601688701839, + "learning_rate": 1.2166769993800372e-08, + "loss": 1.5284, + "step": 3140 + }, + { + "epoch": 0.24410089503661514, + "grad_norm": 2.266366195377394, + "learning_rate": 1.2205517668939865e-08, + "loss": 1.5221, + "step": 3150 + }, + { + "epoch": 0.24487581851292184, + "grad_norm": 2.80247160290531, + "learning_rate": 1.2244265344079355e-08, + "loss": 1.5359, + "step": 3160 + }, + { + "epoch": 0.24565074198922857, + "grad_norm": 2.61597379771551, + "learning_rate": 1.2283013019218847e-08, + "loss": 1.5191, + "step": 3170 + }, + { + "epoch": 0.24642566546553527, + "grad_norm": 2.4382834971340497, + "learning_rate": 1.232176069435834e-08, + "loss": 1.5318, + "step": 3180 + }, + { + "epoch": 0.247200588941842, + "grad_norm": 2.354574696302971, + "learning_rate": 1.236050836949783e-08, + "loss": 1.543, + "step": 3190 + }, + { + "epoch": 0.24797551241814872, + "grad_norm": 2.356315260918934, + "learning_rate": 1.2399256044637322e-08, + "loss": 1.4771, + "step": 3200 + }, + { + "epoch": 0.24875043589445542, + "grad_norm": 2.522549011410741, + "learning_rate": 1.2438003719776815e-08, + "loss": 1.5218, + "step": 3210 + }, + { + "epoch": 0.24952535937076215, + "grad_norm": 2.2553812220398615, + "learning_rate": 1.2476751394916305e-08, + "loss": 1.4928, + "step": 3220 + }, + { + "epoch": 0.25030028284706884, + "grad_norm": 2.3609081280195796, + "learning_rate": 1.2515499070055797e-08, + "loss": 1.5128, + "step": 3230 + }, + { + "epoch": 0.25107520632337554, + "grad_norm": 2.2391018496527675, + "learning_rate": 1.255424674519529e-08, + "loss": 1.4951, + "step": 3240 + }, + { + "epoch": 0.2518501297996823, + "grad_norm": 2.7061942588066517, + "learning_rate": 1.259299442033478e-08, + "loss": 1.5282, + "step": 3250 + }, + { + "epoch": 0.252625053275989, + "grad_norm": 2.2678757292800245, + "learning_rate": 1.2631742095474272e-08, + "loss": 1.5152, + "step": 3260 + }, + { + "epoch": 0.2533999767522957, + "grad_norm": 2.3612532831665805, + "learning_rate": 1.2670489770613763e-08, + "loss": 1.5244, + "step": 3270 + }, + { + "epoch": 0.25417490022860245, + "grad_norm": 2.4755943697057243, + "learning_rate": 1.2709237445753255e-08, + "loss": 1.5147, + "step": 3280 + }, + { + "epoch": 0.25494982370490915, + "grad_norm": 2.296878529024576, + "learning_rate": 1.2747985120892747e-08, + "loss": 1.5285, + "step": 3290 + }, + { + "epoch": 0.25572474718121585, + "grad_norm": 2.230079262159547, + "learning_rate": 1.2786732796032238e-08, + "loss": 1.5106, + "step": 3300 + }, + { + "epoch": 0.25649967065752255, + "grad_norm": 2.3356945934793902, + "learning_rate": 1.282548047117173e-08, + "loss": 1.5129, + "step": 3310 + }, + { + "epoch": 0.2572745941338293, + "grad_norm": 2.306803876457089, + "learning_rate": 1.2864228146311222e-08, + "loss": 1.5139, + "step": 3320 + }, + { + "epoch": 0.258049517610136, + "grad_norm": 2.1750016305263196, + "learning_rate": 1.2902975821450713e-08, + "loss": 1.5375, + "step": 3330 + }, + { + "epoch": 0.2588244410864427, + "grad_norm": 2.3489873507616865, + "learning_rate": 1.2941723496590205e-08, + "loss": 1.4992, + "step": 3340 + }, + { + "epoch": 0.2595993645627494, + "grad_norm": 2.2127423581171577, + "learning_rate": 1.2980471171729697e-08, + "loss": 1.5009, + "step": 3350 + }, + { + "epoch": 0.26037428803905616, + "grad_norm": 2.521545535949093, + "learning_rate": 1.3019218846869188e-08, + "loss": 1.4933, + "step": 3360 + }, + { + "epoch": 0.26114921151536286, + "grad_norm": 2.3168799693325117, + "learning_rate": 1.305796652200868e-08, + "loss": 1.4789, + "step": 3370 + }, + { + "epoch": 0.26192413499166955, + "grad_norm": 2.4494500607770937, + "learning_rate": 1.3096714197148172e-08, + "loss": 1.5001, + "step": 3380 + }, + { + "epoch": 0.2626990584679763, + "grad_norm": 2.1604746465098423, + "learning_rate": 1.3135461872287663e-08, + "loss": 1.4854, + "step": 3390 + }, + { + "epoch": 0.263473981944283, + "grad_norm": 2.1253394728903965, + "learning_rate": 1.3174209547427155e-08, + "loss": 1.5091, + "step": 3400 + }, + { + "epoch": 0.2642489054205897, + "grad_norm": 2.2944698361495917, + "learning_rate": 1.3212957222566647e-08, + "loss": 1.4711, + "step": 3410 + }, + { + "epoch": 0.2650238288968964, + "grad_norm": 2.322450414499471, + "learning_rate": 1.3251704897706138e-08, + "loss": 1.5033, + "step": 3420 + }, + { + "epoch": 0.26579875237320316, + "grad_norm": 2.239284884882306, + "learning_rate": 1.329045257284563e-08, + "loss": 1.47, + "step": 3430 + }, + { + "epoch": 0.26657367584950986, + "grad_norm": 2.310065383000257, + "learning_rate": 1.3329200247985122e-08, + "loss": 1.4843, + "step": 3440 + }, + { + "epoch": 0.26734859932581656, + "grad_norm": 2.3401972422967594, + "learning_rate": 1.3367947923124612e-08, + "loss": 1.5193, + "step": 3450 + }, + { + "epoch": 0.2681235228021233, + "grad_norm": 27.89082302571771, + "learning_rate": 1.3406695598264105e-08, + "loss": 1.4852, + "step": 3460 + }, + { + "epoch": 0.26889844627843, + "grad_norm": 2.6410907464719813, + "learning_rate": 1.3445443273403595e-08, + "loss": 1.522, + "step": 3470 + }, + { + "epoch": 0.2696733697547367, + "grad_norm": 2.141016546863239, + "learning_rate": 1.3484190948543087e-08, + "loss": 1.5128, + "step": 3480 + }, + { + "epoch": 0.2704482932310434, + "grad_norm": 2.334804113607199, + "learning_rate": 1.352293862368258e-08, + "loss": 1.504, + "step": 3490 + }, + { + "epoch": 0.27122321670735017, + "grad_norm": 2.247694549326328, + "learning_rate": 1.356168629882207e-08, + "loss": 1.4746, + "step": 3500 + }, + { + "epoch": 0.27122321670735017, + "eval_loss": 1.4922304153442383, + "eval_runtime": 316.6002, + "eval_samples_per_second": 36.232, + "eval_steps_per_second": 9.059, + "step": 3500 + }, + { + "epoch": 0.27199814018365687, + "grad_norm": 2.2854116777842712, + "learning_rate": 1.3600433973961562e-08, + "loss": 1.4742, + "step": 3510 + }, + { + "epoch": 0.27277306365996357, + "grad_norm": 2.3443159890497967, + "learning_rate": 1.3639181649101055e-08, + "loss": 1.5068, + "step": 3520 + }, + { + "epoch": 0.2735479871362703, + "grad_norm": 2.3891709500352265, + "learning_rate": 1.3677929324240545e-08, + "loss": 1.4667, + "step": 3530 + }, + { + "epoch": 0.274322910612577, + "grad_norm": 2.2131639578246216, + "learning_rate": 1.3716676999380037e-08, + "loss": 1.5072, + "step": 3540 + }, + { + "epoch": 0.2750978340888837, + "grad_norm": 2.3114490824941942, + "learning_rate": 1.375542467451953e-08, + "loss": 1.454, + "step": 3550 + }, + { + "epoch": 0.2758727575651904, + "grad_norm": 2.3423796343767602, + "learning_rate": 1.379417234965902e-08, + "loss": 1.5146, + "step": 3560 + }, + { + "epoch": 0.2766476810414972, + "grad_norm": 2.2417929382117747, + "learning_rate": 1.3832920024798512e-08, + "loss": 1.4955, + "step": 3570 + }, + { + "epoch": 0.2774226045178039, + "grad_norm": 2.1190104070721874, + "learning_rate": 1.3871667699938004e-08, + "loss": 1.4851, + "step": 3580 + }, + { + "epoch": 0.27819752799411057, + "grad_norm": 2.161835294978339, + "learning_rate": 1.3910415375077495e-08, + "loss": 1.4936, + "step": 3590 + }, + { + "epoch": 0.27897245147041727, + "grad_norm": 4.291623130914577, + "learning_rate": 1.3949163050216987e-08, + "loss": 1.4959, + "step": 3600 + }, + { + "epoch": 0.279747374946724, + "grad_norm": 2.2362796012846324, + "learning_rate": 1.398791072535648e-08, + "loss": 1.4772, + "step": 3610 + }, + { + "epoch": 0.2805222984230307, + "grad_norm": 2.378881526717449, + "learning_rate": 1.402665840049597e-08, + "loss": 1.4877, + "step": 3620 + }, + { + "epoch": 0.2812972218993374, + "grad_norm": 2.406233985122958, + "learning_rate": 1.4065406075635462e-08, + "loss": 1.5132, + "step": 3630 + }, + { + "epoch": 0.2820721453756442, + "grad_norm": 2.028756834169269, + "learning_rate": 1.4104153750774954e-08, + "loss": 1.5092, + "step": 3640 + }, + { + "epoch": 0.2828470688519509, + "grad_norm": 2.1858403214181723, + "learning_rate": 1.4142901425914445e-08, + "loss": 1.4829, + "step": 3650 + }, + { + "epoch": 0.2836219923282576, + "grad_norm": 2.126727914254776, + "learning_rate": 1.4181649101053937e-08, + "loss": 1.5199, + "step": 3660 + }, + { + "epoch": 0.2843969158045643, + "grad_norm": 2.258830446995321, + "learning_rate": 1.422039677619343e-08, + "loss": 1.4951, + "step": 3670 + }, + { + "epoch": 0.28517183928087103, + "grad_norm": 2.20439476505417, + "learning_rate": 1.425914445133292e-08, + "loss": 1.525, + "step": 3680 + }, + { + "epoch": 0.28594676275717773, + "grad_norm": 2.2082412650430356, + "learning_rate": 1.4297892126472412e-08, + "loss": 1.4433, + "step": 3690 + }, + { + "epoch": 0.28672168623348443, + "grad_norm": 2.1411530259946443, + "learning_rate": 1.4336639801611903e-08, + "loss": 1.4896, + "step": 3700 + }, + { + "epoch": 0.2874966097097912, + "grad_norm": 16.50674658553526, + "learning_rate": 1.4375387476751395e-08, + "loss": 1.479, + "step": 3710 + }, + { + "epoch": 0.2882715331860979, + "grad_norm": 2.523783459533072, + "learning_rate": 1.4414135151890887e-08, + "loss": 1.5025, + "step": 3720 + }, + { + "epoch": 0.2890464566624046, + "grad_norm": 2.1016590276871843, + "learning_rate": 1.4452882827030378e-08, + "loss": 1.4872, + "step": 3730 + }, + { + "epoch": 0.2898213801387113, + "grad_norm": 2.0891381087755607, + "learning_rate": 1.449163050216987e-08, + "loss": 1.4838, + "step": 3740 + }, + { + "epoch": 0.29059630361501804, + "grad_norm": 2.146080337595682, + "learning_rate": 1.4530378177309362e-08, + "loss": 1.4996, + "step": 3750 + }, + { + "epoch": 0.29137122709132474, + "grad_norm": 2.2681402189540654, + "learning_rate": 1.4569125852448853e-08, + "loss": 1.4735, + "step": 3760 + }, + { + "epoch": 0.29214615056763144, + "grad_norm": 2.094541392008989, + "learning_rate": 1.4607873527588345e-08, + "loss": 1.4924, + "step": 3770 + }, + { + "epoch": 0.29292107404393813, + "grad_norm": 2.130804000161082, + "learning_rate": 1.4646621202727837e-08, + "loss": 1.4713, + "step": 3780 + }, + { + "epoch": 0.2936959975202449, + "grad_norm": 2.0477068807660403, + "learning_rate": 1.468536887786733e-08, + "loss": 1.4817, + "step": 3790 + }, + { + "epoch": 0.2944709209965516, + "grad_norm": 2.0372172288919232, + "learning_rate": 1.4724116553006821e-08, + "loss": 1.4956, + "step": 3800 + }, + { + "epoch": 0.2952458444728583, + "grad_norm": 5.765593523360565, + "learning_rate": 1.4762864228146314e-08, + "loss": 1.4918, + "step": 3810 + }, + { + "epoch": 0.29602076794916504, + "grad_norm": 2.083252363183582, + "learning_rate": 1.4801611903285806e-08, + "loss": 1.4806, + "step": 3820 + }, + { + "epoch": 0.29679569142547174, + "grad_norm": 2.0857534678757133, + "learning_rate": 1.4840359578425296e-08, + "loss": 1.4582, + "step": 3830 + }, + { + "epoch": 0.29757061490177844, + "grad_norm": 2.207123126288962, + "learning_rate": 1.4879107253564788e-08, + "loss": 1.4692, + "step": 3840 + }, + { + "epoch": 0.29834553837808514, + "grad_norm": 2.3187791482097166, + "learning_rate": 1.491785492870428e-08, + "loss": 1.4807, + "step": 3850 + }, + { + "epoch": 0.2991204618543919, + "grad_norm": 2.2285916851887184, + "learning_rate": 1.495660260384377e-08, + "loss": 1.475, + "step": 3860 + }, + { + "epoch": 0.2998953853306986, + "grad_norm": 2.086709450066472, + "learning_rate": 1.4995350278983263e-08, + "loss": 1.4622, + "step": 3870 + }, + { + "epoch": 0.3006703088070053, + "grad_norm": 2.2473779521563575, + "learning_rate": 1.5034097954122756e-08, + "loss": 1.5019, + "step": 3880 + }, + { + "epoch": 0.30144523228331205, + "grad_norm": 2.1280014651081056, + "learning_rate": 1.5072845629262248e-08, + "loss": 1.4694, + "step": 3890 + }, + { + "epoch": 0.30222015575961875, + "grad_norm": 2.0305453495316503, + "learning_rate": 1.5111593304401737e-08, + "loss": 1.4683, + "step": 3900 + }, + { + "epoch": 0.30299507923592545, + "grad_norm": 2.0849773432233225, + "learning_rate": 1.515034097954123e-08, + "loss": 1.4662, + "step": 3910 + }, + { + "epoch": 0.30377000271223215, + "grad_norm": 2.0263896960309857, + "learning_rate": 1.518908865468072e-08, + "loss": 1.4676, + "step": 3920 + }, + { + "epoch": 0.3045449261885389, + "grad_norm": 2.2002886335729985, + "learning_rate": 1.5227836329820213e-08, + "loss": 1.5185, + "step": 3930 + }, + { + "epoch": 0.3053198496648456, + "grad_norm": 2.202961631561911, + "learning_rate": 1.5266584004959706e-08, + "loss": 1.4545, + "step": 3940 + }, + { + "epoch": 0.3060947731411523, + "grad_norm": 1.9439649719628256, + "learning_rate": 1.5305331680099198e-08, + "loss": 1.4368, + "step": 3950 + }, + { + "epoch": 0.30686969661745905, + "grad_norm": 2.069050696121599, + "learning_rate": 1.5344079355238687e-08, + "loss": 1.4687, + "step": 3960 + }, + { + "epoch": 0.30764462009376575, + "grad_norm": 1.9624705002517318, + "learning_rate": 1.538282703037818e-08, + "loss": 1.4573, + "step": 3970 + }, + { + "epoch": 0.30841954357007245, + "grad_norm": 2.047349445107125, + "learning_rate": 1.542157470551767e-08, + "loss": 1.477, + "step": 3980 + }, + { + "epoch": 0.30919446704637915, + "grad_norm": 2.1784199401405187, + "learning_rate": 1.5460322380657163e-08, + "loss": 1.4916, + "step": 3990 + }, + { + "epoch": 0.3099693905226859, + "grad_norm": 2.018482946768411, + "learning_rate": 1.5499070055796655e-08, + "loss": 1.4576, + "step": 4000 + }, + { + "epoch": 0.3099693905226859, + "eval_loss": 1.46771240234375, + "eval_runtime": 317.8562, + "eval_samples_per_second": 36.089, + "eval_steps_per_second": 9.023, + "step": 4000 + }, + { + "epoch": 0.3107443139989926, + "grad_norm": 2.032551388014473, + "learning_rate": 1.5537817730936148e-08, + "loss": 1.4753, + "step": 4010 + }, + { + "epoch": 0.3115192374752993, + "grad_norm": 2.2593157847958363, + "learning_rate": 1.5576565406075637e-08, + "loss": 1.4673, + "step": 4020 + }, + { + "epoch": 0.312294160951606, + "grad_norm": 1.9782030419759056, + "learning_rate": 1.561531308121513e-08, + "loss": 1.4729, + "step": 4030 + }, + { + "epoch": 0.31306908442791276, + "grad_norm": 2.103351604871065, + "learning_rate": 1.565406075635462e-08, + "loss": 1.4635, + "step": 4040 + }, + { + "epoch": 0.31384400790421946, + "grad_norm": 2.0975839785701287, + "learning_rate": 1.5692808431494113e-08, + "loss": 1.4459, + "step": 4050 + }, + { + "epoch": 0.31461893138052616, + "grad_norm": 2.0654428567830045, + "learning_rate": 1.5731556106633605e-08, + "loss": 1.4425, + "step": 4060 + }, + { + "epoch": 0.3153938548568329, + "grad_norm": 2.2579630761240264, + "learning_rate": 1.5770303781773094e-08, + "loss": 1.4841, + "step": 4070 + }, + { + "epoch": 0.3161687783331396, + "grad_norm": 2.0648773640533884, + "learning_rate": 1.5809051456912586e-08, + "loss": 1.4534, + "step": 4080 + }, + { + "epoch": 0.3169437018094463, + "grad_norm": 1.9277227339297653, + "learning_rate": 1.584779913205208e-08, + "loss": 1.4649, + "step": 4090 + }, + { + "epoch": 0.317718625285753, + "grad_norm": 2.0620087191759797, + "learning_rate": 1.588654680719157e-08, + "loss": 1.438, + "step": 4100 + }, + { + "epoch": 0.31849354876205976, + "grad_norm": 2.094421571918141, + "learning_rate": 1.5925294482331063e-08, + "loss": 1.473, + "step": 4110 + }, + { + "epoch": 0.31926847223836646, + "grad_norm": 2.0464913737918806, + "learning_rate": 1.5964042157470555e-08, + "loss": 1.4467, + "step": 4120 + }, + { + "epoch": 0.32004339571467316, + "grad_norm": 2.063675857738863, + "learning_rate": 1.6002789832610044e-08, + "loss": 1.4534, + "step": 4130 + }, + { + "epoch": 0.3208183191909799, + "grad_norm": 2.0159752929344474, + "learning_rate": 1.6041537507749536e-08, + "loss": 1.462, + "step": 4140 + }, + { + "epoch": 0.3215932426672866, + "grad_norm": 2.2552122666138366, + "learning_rate": 1.608028518288903e-08, + "loss": 1.4454, + "step": 4150 + }, + { + "epoch": 0.3223681661435933, + "grad_norm": 2.0845077083157193, + "learning_rate": 1.611903285802852e-08, + "loss": 1.4637, + "step": 4160 + }, + { + "epoch": 0.3231430896199, + "grad_norm": 1.9892586755611343, + "learning_rate": 1.6157780533168013e-08, + "loss": 1.4594, + "step": 4170 + }, + { + "epoch": 0.32391801309620677, + "grad_norm": 2.0473643154479277, + "learning_rate": 1.6196528208307505e-08, + "loss": 1.4729, + "step": 4180 + }, + { + "epoch": 0.32469293657251347, + "grad_norm": 2.064553974619053, + "learning_rate": 1.6235275883446994e-08, + "loss": 1.4388, + "step": 4190 + }, + { + "epoch": 0.32546786004882017, + "grad_norm": 1.980222683947851, + "learning_rate": 1.6274023558586486e-08, + "loss": 1.4593, + "step": 4200 + }, + { + "epoch": 0.32624278352512687, + "grad_norm": 2.004372569102924, + "learning_rate": 1.631277123372598e-08, + "loss": 1.4599, + "step": 4210 + }, + { + "epoch": 0.3270177070014336, + "grad_norm": 1.8624427194595095, + "learning_rate": 1.635151890886547e-08, + "loss": 1.4693, + "step": 4220 + }, + { + "epoch": 0.3277926304777403, + "grad_norm": 2.012614021443024, + "learning_rate": 1.6390266584004963e-08, + "loss": 1.4566, + "step": 4230 + }, + { + "epoch": 0.328567553954047, + "grad_norm": 1.954014568743121, + "learning_rate": 1.6429014259144452e-08, + "loss": 1.441, + "step": 4240 + }, + { + "epoch": 0.3293424774303538, + "grad_norm": 2.12274565598196, + "learning_rate": 1.6467761934283944e-08, + "loss": 1.4653, + "step": 4250 + }, + { + "epoch": 0.3301174009066605, + "grad_norm": 2.0969244773780265, + "learning_rate": 1.6506509609423436e-08, + "loss": 1.4768, + "step": 4260 + }, + { + "epoch": 0.3308923243829672, + "grad_norm": 2.2042540021517674, + "learning_rate": 1.6545257284562928e-08, + "loss": 1.4586, + "step": 4270 + }, + { + "epoch": 0.3316672478592739, + "grad_norm": 2.3198762342010975, + "learning_rate": 1.658400495970242e-08, + "loss": 1.4492, + "step": 4280 + }, + { + "epoch": 0.3324421713355806, + "grad_norm": 1.954567500125166, + "learning_rate": 1.6622752634841913e-08, + "loss": 1.4795, + "step": 4290 + }, + { + "epoch": 0.3332170948118873, + "grad_norm": 1.8624179214068792, + "learning_rate": 1.66615003099814e-08, + "loss": 1.4483, + "step": 4300 + }, + { + "epoch": 0.333992018288194, + "grad_norm": 1.9240580268869203, + "learning_rate": 1.6700247985120894e-08, + "loss": 1.4536, + "step": 4310 + }, + { + "epoch": 0.3347669417645008, + "grad_norm": 2.0153524345982756, + "learning_rate": 1.6738995660260386e-08, + "loss": 1.4501, + "step": 4320 + }, + { + "epoch": 0.3355418652408075, + "grad_norm": 1.933107877270082, + "learning_rate": 1.6777743335399878e-08, + "loss": 1.4527, + "step": 4330 + }, + { + "epoch": 0.3363167887171142, + "grad_norm": 1.9480889097207332, + "learning_rate": 1.681649101053937e-08, + "loss": 1.4603, + "step": 4340 + }, + { + "epoch": 0.3370917121934209, + "grad_norm": 2.001961972974772, + "learning_rate": 1.6855238685678863e-08, + "loss": 1.462, + "step": 4350 + }, + { + "epoch": 0.33786663566972763, + "grad_norm": 1.9008890739545212, + "learning_rate": 1.689398636081835e-08, + "loss": 1.4357, + "step": 4360 + }, + { + "epoch": 0.33864155914603433, + "grad_norm": 1.9022208101974156, + "learning_rate": 1.6932734035957844e-08, + "loss": 1.4497, + "step": 4370 + }, + { + "epoch": 0.33941648262234103, + "grad_norm": 2.348494890386764, + "learning_rate": 1.6971481711097336e-08, + "loss": 1.4249, + "step": 4380 + }, + { + "epoch": 0.3401914060986478, + "grad_norm": 1.9980797215301493, + "learning_rate": 1.7010229386236828e-08, + "loss": 1.4532, + "step": 4390 + }, + { + "epoch": 0.3409663295749545, + "grad_norm": 1.8233836842746247, + "learning_rate": 1.704897706137632e-08, + "loss": 1.4129, + "step": 4400 + }, + { + "epoch": 0.3417412530512612, + "grad_norm": 1.8755909798777675, + "learning_rate": 1.7087724736515813e-08, + "loss": 1.4381, + "step": 4410 + }, + { + "epoch": 0.3425161765275679, + "grad_norm": 1.898254331813473, + "learning_rate": 1.71264724116553e-08, + "loss": 1.4434, + "step": 4420 + }, + { + "epoch": 0.34329110000387464, + "grad_norm": 2.038799316902501, + "learning_rate": 1.7165220086794794e-08, + "loss": 1.435, + "step": 4430 + }, + { + "epoch": 0.34406602348018134, + "grad_norm": 2.3475960534078513, + "learning_rate": 1.7203967761934286e-08, + "loss": 1.4525, + "step": 4440 + }, + { + "epoch": 0.34484094695648804, + "grad_norm": 1.9125749446053195, + "learning_rate": 1.7242715437073778e-08, + "loss": 1.4374, + "step": 4450 + }, + { + "epoch": 0.34561587043279474, + "grad_norm": 1.951086727983187, + "learning_rate": 1.728146311221327e-08, + "loss": 1.4312, + "step": 4460 + }, + { + "epoch": 0.3463907939091015, + "grad_norm": 2.000825179382526, + "learning_rate": 1.732021078735276e-08, + "loss": 1.4455, + "step": 4470 + }, + { + "epoch": 0.3471657173854082, + "grad_norm": 2.5827393573139372, + "learning_rate": 1.735895846249225e-08, + "loss": 1.4417, + "step": 4480 + }, + { + "epoch": 0.3479406408617149, + "grad_norm": 2.0740932348797214, + "learning_rate": 1.7397706137631744e-08, + "loss": 1.4323, + "step": 4490 + }, + { + "epoch": 0.34871556433802164, + "grad_norm": 1.9543484161209685, + "learning_rate": 1.7436453812771236e-08, + "loss": 1.4365, + "step": 4500 + }, + { + "epoch": 0.34871556433802164, + "eval_loss": 1.4376304149627686, + "eval_runtime": 317.289, + "eval_samples_per_second": 36.153, + "eval_steps_per_second": 9.039, + "step": 4500 + }, + { + "epoch": 0.34949048781432834, + "grad_norm": 2.1141701625955625, + "learning_rate": 1.7475201487910728e-08, + "loss": 1.4245, + "step": 4510 + }, + { + "epoch": 0.35026541129063504, + "grad_norm": 1.8294668951498207, + "learning_rate": 1.751394916305022e-08, + "loss": 1.4409, + "step": 4520 + }, + { + "epoch": 0.35104033476694174, + "grad_norm": 1.923093252561926, + "learning_rate": 1.755269683818971e-08, + "loss": 1.3984, + "step": 4530 + }, + { + "epoch": 0.3518152582432485, + "grad_norm": 2.0839553036471266, + "learning_rate": 1.75914445133292e-08, + "loss": 1.4214, + "step": 4540 + }, + { + "epoch": 0.3525901817195552, + "grad_norm": 1.9024500732546843, + "learning_rate": 1.7630192188468693e-08, + "loss": 1.4659, + "step": 4550 + }, + { + "epoch": 0.3533651051958619, + "grad_norm": 2.2105441326359463, + "learning_rate": 1.7668939863608186e-08, + "loss": 1.4419, + "step": 4560 + }, + { + "epoch": 0.35414002867216865, + "grad_norm": 1.9204159887016312, + "learning_rate": 1.7707687538747678e-08, + "loss": 1.4369, + "step": 4570 + }, + { + "epoch": 0.35491495214847535, + "grad_norm": 2.098366870066466, + "learning_rate": 1.774643521388717e-08, + "loss": 1.4277, + "step": 4580 + }, + { + "epoch": 0.35568987562478205, + "grad_norm": 1.8425051814954072, + "learning_rate": 1.778518288902666e-08, + "loss": 1.4357, + "step": 4590 + }, + { + "epoch": 0.35646479910108875, + "grad_norm": 1.9664746285177312, + "learning_rate": 1.782393056416615e-08, + "loss": 1.4141, + "step": 4600 + }, + { + "epoch": 0.3572397225773955, + "grad_norm": 1.827365321624947, + "learning_rate": 1.7862678239305643e-08, + "loss": 1.417, + "step": 4610 + }, + { + "epoch": 0.3580146460537022, + "grad_norm": 1.937377584841897, + "learning_rate": 1.7901425914445135e-08, + "loss": 1.4309, + "step": 4620 + }, + { + "epoch": 0.3587895695300089, + "grad_norm": 2.0767843596002864, + "learning_rate": 1.7940173589584628e-08, + "loss": 1.4098, + "step": 4630 + }, + { + "epoch": 0.3595644930063156, + "grad_norm": 2.0000107824161693, + "learning_rate": 1.797892126472412e-08, + "loss": 1.4185, + "step": 4640 + }, + { + "epoch": 0.36033941648262235, + "grad_norm": 1.8163994007798316, + "learning_rate": 1.801766893986361e-08, + "loss": 1.4745, + "step": 4650 + }, + { + "epoch": 0.36111433995892905, + "grad_norm": 1.9558842156880607, + "learning_rate": 1.80564166150031e-08, + "loss": 1.4491, + "step": 4660 + }, + { + "epoch": 0.36188926343523575, + "grad_norm": 2.0482532234681847, + "learning_rate": 1.8095164290142593e-08, + "loss": 1.4199, + "step": 4670 + }, + { + "epoch": 0.3626641869115425, + "grad_norm": 1.9015779167301499, + "learning_rate": 1.8133911965282085e-08, + "loss": 1.4584, + "step": 4680 + }, + { + "epoch": 0.3634391103878492, + "grad_norm": 1.8652158950688125, + "learning_rate": 1.8172659640421578e-08, + "loss": 1.3979, + "step": 4690 + }, + { + "epoch": 0.3642140338641559, + "grad_norm": 2.1203168009720024, + "learning_rate": 1.8211407315561066e-08, + "loss": 1.4232, + "step": 4700 + }, + { + "epoch": 0.3649889573404626, + "grad_norm": 1.923678434579324, + "learning_rate": 1.825015499070056e-08, + "loss": 1.4253, + "step": 4710 + }, + { + "epoch": 0.36576388081676936, + "grad_norm": 2.07107366919737, + "learning_rate": 1.828890266584005e-08, + "loss": 1.4141, + "step": 4720 + }, + { + "epoch": 0.36653880429307606, + "grad_norm": 2.086913383231131, + "learning_rate": 1.8327650340979543e-08, + "loss": 1.4412, + "step": 4730 + }, + { + "epoch": 0.36731372776938276, + "grad_norm": 1.9930551769822478, + "learning_rate": 1.8366398016119035e-08, + "loss": 1.4438, + "step": 4740 + }, + { + "epoch": 0.3680886512456895, + "grad_norm": 1.9257170730027384, + "learning_rate": 1.8405145691258527e-08, + "loss": 1.4385, + "step": 4750 + }, + { + "epoch": 0.3688635747219962, + "grad_norm": 2.026755631347902, + "learning_rate": 1.8443893366398016e-08, + "loss": 1.4419, + "step": 4760 + }, + { + "epoch": 0.3696384981983029, + "grad_norm": 1.88996294861281, + "learning_rate": 1.848264104153751e-08, + "loss": 1.4572, + "step": 4770 + }, + { + "epoch": 0.3704134216746096, + "grad_norm": 1.9383463857356458, + "learning_rate": 1.8521388716677e-08, + "loss": 1.4243, + "step": 4780 + }, + { + "epoch": 0.37118834515091637, + "grad_norm": 1.881331891271936, + "learning_rate": 1.8560136391816493e-08, + "loss": 1.4349, + "step": 4790 + }, + { + "epoch": 0.37196326862722306, + "grad_norm": 1.9722163312869534, + "learning_rate": 1.8598884066955985e-08, + "loss": 1.4239, + "step": 4800 + }, + { + "epoch": 0.37273819210352976, + "grad_norm": 2.0174636350560426, + "learning_rate": 1.8637631742095477e-08, + "loss": 1.4412, + "step": 4810 + }, + { + "epoch": 0.3735131155798365, + "grad_norm": 1.8939780069662269, + "learning_rate": 1.8676379417234966e-08, + "loss": 1.389, + "step": 4820 + }, + { + "epoch": 0.3742880390561432, + "grad_norm": 1.942300837712324, + "learning_rate": 1.871512709237446e-08, + "loss": 1.4456, + "step": 4830 + }, + { + "epoch": 0.3750629625324499, + "grad_norm": 2.0682657801688644, + "learning_rate": 1.875387476751395e-08, + "loss": 1.4274, + "step": 4840 + }, + { + "epoch": 0.3758378860087566, + "grad_norm": 1.9771029447297441, + "learning_rate": 1.8792622442653443e-08, + "loss": 1.4311, + "step": 4850 + }, + { + "epoch": 0.37661280948506337, + "grad_norm": 1.990371612872751, + "learning_rate": 1.8831370117792935e-08, + "loss": 1.4036, + "step": 4860 + }, + { + "epoch": 0.37738773296137007, + "grad_norm": 2.0073607534098876, + "learning_rate": 1.8870117792932427e-08, + "loss": 1.446, + "step": 4870 + }, + { + "epoch": 0.37816265643767677, + "grad_norm": 1.799198381741255, + "learning_rate": 1.8908865468071916e-08, + "loss": 1.428, + "step": 4880 + }, + { + "epoch": 0.37893757991398347, + "grad_norm": 1.8321483210363987, + "learning_rate": 1.894761314321141e-08, + "loss": 1.4188, + "step": 4890 + }, + { + "epoch": 0.3797125033902902, + "grad_norm": 1.804386140428998, + "learning_rate": 1.89863608183509e-08, + "loss": 1.4427, + "step": 4900 + }, + { + "epoch": 0.3804874268665969, + "grad_norm": 1.7519556752312881, + "learning_rate": 1.9025108493490393e-08, + "loss": 1.4279, + "step": 4910 + }, + { + "epoch": 0.3812623503429036, + "grad_norm": 1.9905856575509207, + "learning_rate": 1.9063856168629885e-08, + "loss": 1.4152, + "step": 4920 + }, + { + "epoch": 0.3820372738192104, + "grad_norm": 1.8589991794281375, + "learning_rate": 1.9102603843769374e-08, + "loss": 1.4158, + "step": 4930 + }, + { + "epoch": 0.3828121972955171, + "grad_norm": 1.9658752676772593, + "learning_rate": 1.9141351518908866e-08, + "loss": 1.4153, + "step": 4940 + }, + { + "epoch": 0.3835871207718238, + "grad_norm": 1.9734611056383533, + "learning_rate": 1.9180099194048358e-08, + "loss": 1.4081, + "step": 4950 + }, + { + "epoch": 0.3843620442481305, + "grad_norm": 1.8244603487327848, + "learning_rate": 1.921884686918785e-08, + "loss": 1.4058, + "step": 4960 + }, + { + "epoch": 0.38513696772443723, + "grad_norm": 1.8930426726654723, + "learning_rate": 1.9257594544327343e-08, + "loss": 1.3869, + "step": 4970 + }, + { + "epoch": 0.38591189120074393, + "grad_norm": 2.162366094249367, + "learning_rate": 1.9296342219466835e-08, + "loss": 1.4175, + "step": 4980 + }, + { + "epoch": 0.3866868146770506, + "grad_norm": 1.8756777067753827, + "learning_rate": 1.9335089894606324e-08, + "loss": 1.405, + "step": 4990 + }, + { + "epoch": 0.3874617381533574, + "grad_norm": 2.0005098607490397, + "learning_rate": 1.9373837569745816e-08, + "loss": 1.4481, + "step": 5000 + }, + { + "epoch": 0.3874617381533574, + "eval_loss": 1.4081398248672485, + "eval_runtime": 318.4877, + "eval_samples_per_second": 36.017, + "eval_steps_per_second": 9.005, + "step": 5000 + }, + { + "epoch": 0.3882366616296641, + "grad_norm": 1.9324335287514265, + "learning_rate": 1.9412585244885308e-08, + "loss": 1.4376, + "step": 5010 + }, + { + "epoch": 0.3890115851059708, + "grad_norm": 1.7935206214936383, + "learning_rate": 1.94513329200248e-08, + "loss": 1.4217, + "step": 5020 + }, + { + "epoch": 0.3897865085822775, + "grad_norm": 1.8160666974991877, + "learning_rate": 1.9490080595164293e-08, + "loss": 1.3888, + "step": 5030 + }, + { + "epoch": 0.39056143205858423, + "grad_norm": 1.7705634155748964, + "learning_rate": 1.9528828270303785e-08, + "loss": 1.3967, + "step": 5040 + }, + { + "epoch": 0.39133635553489093, + "grad_norm": 1.8888661145345023, + "learning_rate": 1.9567575945443274e-08, + "loss": 1.4168, + "step": 5050 + }, + { + "epoch": 0.39211127901119763, + "grad_norm": 1.8019450453923476, + "learning_rate": 1.9606323620582766e-08, + "loss": 1.3869, + "step": 5060 + }, + { + "epoch": 0.39288620248750433, + "grad_norm": 1.774729623957452, + "learning_rate": 1.9645071295722258e-08, + "loss": 1.3824, + "step": 5070 + }, + { + "epoch": 0.3936611259638111, + "grad_norm": 1.9238421760408075, + "learning_rate": 1.968381897086175e-08, + "loss": 1.4071, + "step": 5080 + }, + { + "epoch": 0.3944360494401178, + "grad_norm": 1.8026806063470113, + "learning_rate": 1.9722566646001242e-08, + "loss": 1.407, + "step": 5090 + }, + { + "epoch": 0.3952109729164245, + "grad_norm": 1.8427702097283123, + "learning_rate": 1.9761314321140735e-08, + "loss": 1.4281, + "step": 5100 + }, + { + "epoch": 0.39598589639273124, + "grad_norm": 1.8066550241240436, + "learning_rate": 1.9800061996280224e-08, + "loss": 1.3969, + "step": 5110 + }, + { + "epoch": 0.39676081986903794, + "grad_norm": 1.985393258860536, + "learning_rate": 1.9838809671419716e-08, + "loss": 1.4, + "step": 5120 + }, + { + "epoch": 0.39753574334534464, + "grad_norm": 1.9112764242131048, + "learning_rate": 1.9877557346559208e-08, + "loss": 1.4155, + "step": 5130 + }, + { + "epoch": 0.39831066682165134, + "grad_norm": 1.7322379490774231, + "learning_rate": 1.99163050216987e-08, + "loss": 1.4246, + "step": 5140 + }, + { + "epoch": 0.3990855902979581, + "grad_norm": 1.8947171455375242, + "learning_rate": 1.9955052696838192e-08, + "loss": 1.4396, + "step": 5150 + }, + { + "epoch": 0.3998605137742648, + "grad_norm": 1.7528846830709879, + "learning_rate": 1.999380037197768e-08, + "loss": 1.4309, + "step": 5160 + }, + { + "epoch": 0.4006354372505715, + "grad_norm": 1.7720818318261586, + "learning_rate": 2.0032548047117173e-08, + "loss": 1.4112, + "step": 5170 + }, + { + "epoch": 0.40141036072687825, + "grad_norm": 1.7317741722962714, + "learning_rate": 2.0071295722256666e-08, + "loss": 1.3983, + "step": 5180 + }, + { + "epoch": 0.40218528420318495, + "grad_norm": 1.7772901549921187, + "learning_rate": 2.0110043397396158e-08, + "loss": 1.3927, + "step": 5190 + }, + { + "epoch": 0.40296020767949164, + "grad_norm": 1.6756670934196745, + "learning_rate": 2.014879107253565e-08, + "loss": 1.3723, + "step": 5200 + }, + { + "epoch": 0.40373513115579834, + "grad_norm": 1.7539152622932865, + "learning_rate": 2.0187538747675142e-08, + "loss": 1.3849, + "step": 5210 + }, + { + "epoch": 0.4045100546321051, + "grad_norm": 1.7642326357444957, + "learning_rate": 2.022628642281463e-08, + "loss": 1.3916, + "step": 5220 + }, + { + "epoch": 0.4052849781084118, + "grad_norm": 1.842691379533282, + "learning_rate": 2.0265034097954123e-08, + "loss": 1.4075, + "step": 5230 + }, + { + "epoch": 0.4060599015847185, + "grad_norm": 1.8744861311090635, + "learning_rate": 2.0303781773093616e-08, + "loss": 1.3785, + "step": 5240 + }, + { + "epoch": 0.40683482506102525, + "grad_norm": 1.8838746258971883, + "learning_rate": 2.0342529448233108e-08, + "loss": 1.4145, + "step": 5250 + }, + { + "epoch": 0.40760974853733195, + "grad_norm": 1.885317911355544, + "learning_rate": 2.03812771233726e-08, + "loss": 1.3797, + "step": 5260 + }, + { + "epoch": 0.40838467201363865, + "grad_norm": 2.292397316164498, + "learning_rate": 2.0420024798512092e-08, + "loss": 1.3755, + "step": 5270 + }, + { + "epoch": 0.40915959548994535, + "grad_norm": 1.9044548421949006, + "learning_rate": 2.045877247365158e-08, + "loss": 1.4002, + "step": 5280 + }, + { + "epoch": 0.4099345189662521, + "grad_norm": 1.6518783899021756, + "learning_rate": 2.0497520148791073e-08, + "loss": 1.3864, + "step": 5290 + }, + { + "epoch": 0.4107094424425588, + "grad_norm": 1.69223985980797, + "learning_rate": 2.0536267823930565e-08, + "loss": 1.3792, + "step": 5300 + }, + { + "epoch": 0.4114843659188655, + "grad_norm": 1.863706244792208, + "learning_rate": 2.0575015499070058e-08, + "loss": 1.4195, + "step": 5310 + }, + { + "epoch": 0.4122592893951722, + "grad_norm": 1.9375913202463957, + "learning_rate": 2.061376317420955e-08, + "loss": 1.3954, + "step": 5320 + }, + { + "epoch": 0.41303421287147896, + "grad_norm": 1.7493151248066647, + "learning_rate": 2.0652510849349042e-08, + "loss": 1.373, + "step": 5330 + }, + { + "epoch": 0.41380913634778566, + "grad_norm": 2.02581581701517, + "learning_rate": 2.069125852448853e-08, + "loss": 1.4073, + "step": 5340 + }, + { + "epoch": 0.41458405982409235, + "grad_norm": 1.8600352555164068, + "learning_rate": 2.0730006199628023e-08, + "loss": 1.3691, + "step": 5350 + }, + { + "epoch": 0.4153589833003991, + "grad_norm": 1.7731436837514571, + "learning_rate": 2.0768753874767515e-08, + "loss": 1.3954, + "step": 5360 + }, + { + "epoch": 0.4161339067767058, + "grad_norm": 1.687670202549249, + "learning_rate": 2.0807501549907008e-08, + "loss": 1.3687, + "step": 5370 + }, + { + "epoch": 0.4169088302530125, + "grad_norm": 1.811648664351653, + "learning_rate": 2.08462492250465e-08, + "loss": 1.3925, + "step": 5380 + }, + { + "epoch": 0.4176837537293192, + "grad_norm": 1.8448436528013474, + "learning_rate": 2.088499690018599e-08, + "loss": 1.3718, + "step": 5390 + }, + { + "epoch": 0.41845867720562596, + "grad_norm": 1.8579764467741446, + "learning_rate": 2.092374457532548e-08, + "loss": 1.3617, + "step": 5400 + }, + { + "epoch": 0.41923360068193266, + "grad_norm": 1.917950722719247, + "learning_rate": 2.0962492250464973e-08, + "loss": 1.3814, + "step": 5410 + }, + { + "epoch": 0.42000852415823936, + "grad_norm": 1.7891346230019944, + "learning_rate": 2.1001239925604465e-08, + "loss": 1.3926, + "step": 5420 + }, + { + "epoch": 0.4207834476345461, + "grad_norm": 2.034392303493986, + "learning_rate": 2.1039987600743957e-08, + "loss": 1.3382, + "step": 5430 + }, + { + "epoch": 0.4215583711108528, + "grad_norm": 1.7354763112348361, + "learning_rate": 2.107873527588345e-08, + "loss": 1.3873, + "step": 5440 + }, + { + "epoch": 0.4223332945871595, + "grad_norm": 1.901080320981215, + "learning_rate": 2.111748295102294e-08, + "loss": 1.3914, + "step": 5450 + }, + { + "epoch": 0.4231082180634662, + "grad_norm": 1.7996408794447196, + "learning_rate": 2.115623062616243e-08, + "loss": 1.3767, + "step": 5460 + }, + { + "epoch": 0.42388314153977297, + "grad_norm": 1.832708856562478, + "learning_rate": 2.1194978301301923e-08, + "loss": 1.3649, + "step": 5470 + }, + { + "epoch": 0.42465806501607967, + "grad_norm": 1.8189571232576658, + "learning_rate": 2.1233725976441415e-08, + "loss": 1.4109, + "step": 5480 + }, + { + "epoch": 0.42543298849238637, + "grad_norm": 2.110891714838711, + "learning_rate": 2.1272473651580907e-08, + "loss": 1.3977, + "step": 5490 + }, + { + "epoch": 0.42620791196869307, + "grad_norm": 1.7894338499139657, + "learning_rate": 2.13112213267204e-08, + "loss": 1.3635, + "step": 5500 + }, + { + "epoch": 0.42620791196869307, + "eval_loss": 1.3836323022842407, + "eval_runtime": 318.922, + "eval_samples_per_second": 35.968, + "eval_steps_per_second": 8.993, + "step": 5500 + }, + { + "epoch": 0.4269828354449998, + "grad_norm": 1.9602202050575965, + "learning_rate": 2.134996900185989e-08, + "loss": 1.407, + "step": 5510 + }, + { + "epoch": 0.4277577589213065, + "grad_norm": 1.790109461990561, + "learning_rate": 2.138871667699938e-08, + "loss": 1.3783, + "step": 5520 + }, + { + "epoch": 0.4285326823976132, + "grad_norm": 1.675748889238598, + "learning_rate": 2.1427464352138873e-08, + "loss": 1.3785, + "step": 5530 + }, + { + "epoch": 0.42930760587392, + "grad_norm": 1.8027433194928235, + "learning_rate": 2.1466212027278365e-08, + "loss": 1.3811, + "step": 5540 + }, + { + "epoch": 0.43008252935022667, + "grad_norm": 1.7696993558084997, + "learning_rate": 2.1504959702417857e-08, + "loss": 1.374, + "step": 5550 + }, + { + "epoch": 0.43085745282653337, + "grad_norm": 1.760601195793188, + "learning_rate": 2.154370737755735e-08, + "loss": 1.3911, + "step": 5560 + }, + { + "epoch": 0.43163237630284007, + "grad_norm": 1.9217544818989145, + "learning_rate": 2.158245505269684e-08, + "loss": 1.3969, + "step": 5570 + }, + { + "epoch": 0.4324072997791468, + "grad_norm": 1.7652393846256893, + "learning_rate": 2.162120272783633e-08, + "loss": 1.3838, + "step": 5580 + }, + { + "epoch": 0.4331822232554535, + "grad_norm": 1.7651200832098881, + "learning_rate": 2.1659950402975823e-08, + "loss": 1.3499, + "step": 5590 + }, + { + "epoch": 0.4339571467317602, + "grad_norm": 1.6807711949389503, + "learning_rate": 2.1698698078115315e-08, + "loss": 1.3646, + "step": 5600 + }, + { + "epoch": 0.434732070208067, + "grad_norm": 1.9243103231317142, + "learning_rate": 2.1737445753254807e-08, + "loss": 1.3881, + "step": 5610 + }, + { + "epoch": 0.4355069936843737, + "grad_norm": 1.7038927526763856, + "learning_rate": 2.1776193428394296e-08, + "loss": 1.3535, + "step": 5620 + }, + { + "epoch": 0.4362819171606804, + "grad_norm": 1.8879458879070954, + "learning_rate": 2.1814941103533788e-08, + "loss": 1.4003, + "step": 5630 + }, + { + "epoch": 0.4370568406369871, + "grad_norm": 1.8237057929645564, + "learning_rate": 2.185368877867328e-08, + "loss": 1.3984, + "step": 5640 + }, + { + "epoch": 0.43783176411329383, + "grad_norm": 1.8366544197665295, + "learning_rate": 2.1892436453812773e-08, + "loss": 1.3807, + "step": 5650 + }, + { + "epoch": 0.43860668758960053, + "grad_norm": 1.888871552321471, + "learning_rate": 2.1931184128952265e-08, + "loss": 1.3849, + "step": 5660 + }, + { + "epoch": 0.43938161106590723, + "grad_norm": 1.7294049745586926, + "learning_rate": 2.1969931804091757e-08, + "loss": 1.3863, + "step": 5670 + }, + { + "epoch": 0.440156534542214, + "grad_norm": 1.7754955276195574, + "learning_rate": 2.2008679479231246e-08, + "loss": 1.3773, + "step": 5680 + }, + { + "epoch": 0.4409314580185207, + "grad_norm": 1.8542974033476263, + "learning_rate": 2.2047427154370738e-08, + "loss": 1.3884, + "step": 5690 + }, + { + "epoch": 0.4417063814948274, + "grad_norm": 1.7798835236752109, + "learning_rate": 2.208617482951023e-08, + "loss": 1.3602, + "step": 5700 + }, + { + "epoch": 0.4424813049711341, + "grad_norm": 1.952644616001783, + "learning_rate": 2.2124922504649723e-08, + "loss": 1.3839, + "step": 5710 + }, + { + "epoch": 0.44325622844744084, + "grad_norm": 1.724390695273649, + "learning_rate": 2.2163670179789215e-08, + "loss": 1.3829, + "step": 5720 + }, + { + "epoch": 0.44403115192374754, + "grad_norm": 1.7284935108242139, + "learning_rate": 2.2202417854928707e-08, + "loss": 1.3734, + "step": 5730 + }, + { + "epoch": 0.44480607540005423, + "grad_norm": 1.8290376974920637, + "learning_rate": 2.2241165530068196e-08, + "loss": 1.3784, + "step": 5740 + }, + { + "epoch": 0.44558099887636093, + "grad_norm": 1.9775704921016792, + "learning_rate": 2.2279913205207688e-08, + "loss": 1.3627, + "step": 5750 + }, + { + "epoch": 0.4463559223526677, + "grad_norm": 1.7336633116882796, + "learning_rate": 2.231866088034718e-08, + "loss": 1.3825, + "step": 5760 + }, + { + "epoch": 0.4471308458289744, + "grad_norm": 1.686771878256135, + "learning_rate": 2.2357408555486672e-08, + "loss": 1.3643, + "step": 5770 + }, + { + "epoch": 0.4479057693052811, + "grad_norm": 1.805152107093289, + "learning_rate": 2.2396156230626165e-08, + "loss": 1.3691, + "step": 5780 + }, + { + "epoch": 0.44868069278158784, + "grad_norm": 1.7761163891558707, + "learning_rate": 2.2434903905765657e-08, + "loss": 1.3672, + "step": 5790 + }, + { + "epoch": 0.44945561625789454, + "grad_norm": 1.8829530953912879, + "learning_rate": 2.2473651580905146e-08, + "loss": 1.3867, + "step": 5800 + }, + { + "epoch": 0.45023053973420124, + "grad_norm": 1.626842277595545, + "learning_rate": 2.2512399256044638e-08, + "loss": 1.3515, + "step": 5810 + }, + { + "epoch": 0.45100546321050794, + "grad_norm": 1.6405142292796924, + "learning_rate": 2.255114693118413e-08, + "loss": 1.3428, + "step": 5820 + }, + { + "epoch": 0.4517803866868147, + "grad_norm": 1.7817305997558786, + "learning_rate": 2.2589894606323622e-08, + "loss": 1.3764, + "step": 5830 + }, + { + "epoch": 0.4525553101631214, + "grad_norm": 1.8754716464118986, + "learning_rate": 2.2628642281463115e-08, + "loss": 1.4059, + "step": 5840 + }, + { + "epoch": 0.4533302336394281, + "grad_norm": 1.7585705003502603, + "learning_rate": 2.2667389956602603e-08, + "loss": 1.3686, + "step": 5850 + }, + { + "epoch": 0.45410515711573485, + "grad_norm": 1.8664108082768094, + "learning_rate": 2.2706137631742096e-08, + "loss": 1.372, + "step": 5860 + }, + { + "epoch": 0.45488008059204155, + "grad_norm": 1.8839858617919918, + "learning_rate": 2.2744885306881588e-08, + "loss": 1.3536, + "step": 5870 + }, + { + "epoch": 0.45565500406834825, + "grad_norm": 1.8058949644790432, + "learning_rate": 2.278363298202108e-08, + "loss": 1.3867, + "step": 5880 + }, + { + "epoch": 0.45642992754465495, + "grad_norm": 1.6600514964508026, + "learning_rate": 2.2822380657160572e-08, + "loss": 1.3724, + "step": 5890 + }, + { + "epoch": 0.4572048510209617, + "grad_norm": 1.7009310558385118, + "learning_rate": 2.2861128332300064e-08, + "loss": 1.3728, + "step": 5900 + }, + { + "epoch": 0.4579797744972684, + "grad_norm": 1.7606057858518953, + "learning_rate": 2.2899876007439553e-08, + "loss": 1.3758, + "step": 5910 + }, + { + "epoch": 0.4587546979735751, + "grad_norm": 1.7037353267867907, + "learning_rate": 2.2938623682579046e-08, + "loss": 1.3471, + "step": 5920 + }, + { + "epoch": 0.4595296214498818, + "grad_norm": 1.7849952240837152, + "learning_rate": 2.2977371357718538e-08, + "loss": 1.3647, + "step": 5930 + }, + { + "epoch": 0.46030454492618855, + "grad_norm": 1.805030205028309, + "learning_rate": 2.301611903285803e-08, + "loss": 1.3845, + "step": 5940 + }, + { + "epoch": 0.46107946840249525, + "grad_norm": 1.802625140091779, + "learning_rate": 2.3054866707997522e-08, + "loss": 1.3634, + "step": 5950 + }, + { + "epoch": 0.46185439187880195, + "grad_norm": 1.8097335823468899, + "learning_rate": 2.3093614383137014e-08, + "loss": 1.3707, + "step": 5960 + }, + { + "epoch": 0.4626293153551087, + "grad_norm": 1.7476173940700597, + "learning_rate": 2.3132362058276503e-08, + "loss": 1.362, + "step": 5970 + }, + { + "epoch": 0.4634042388314154, + "grad_norm": 1.7103875315344328, + "learning_rate": 2.3171109733415995e-08, + "loss": 1.3685, + "step": 5980 + }, + { + "epoch": 0.4641791623077221, + "grad_norm": 1.8173998469223034, + "learning_rate": 2.3209857408555488e-08, + "loss": 1.3634, + "step": 5990 + }, + { + "epoch": 0.4649540857840288, + "grad_norm": 1.7408472208037848, + "learning_rate": 2.324860508369498e-08, + "loss": 1.3511, + "step": 6000 + }, + { + "epoch": 0.4649540857840288, + "eval_loss": 1.3619437217712402, + "eval_runtime": 316.8229, + "eval_samples_per_second": 36.206, + "eval_steps_per_second": 9.052, + "step": 6000 + }, + { + "epoch": 0.46572900926033556, + "grad_norm": 1.9058415201675094, + "learning_rate": 2.3287352758834472e-08, + "loss": 1.3773, + "step": 6010 + }, + { + "epoch": 0.46650393273664226, + "grad_norm": 1.6661860539632767, + "learning_rate": 2.3326100433973964e-08, + "loss": 1.3837, + "step": 6020 + }, + { + "epoch": 0.46727885621294896, + "grad_norm": 1.6943004403254949, + "learning_rate": 2.3364848109113453e-08, + "loss": 1.3804, + "step": 6030 + }, + { + "epoch": 0.4680537796892557, + "grad_norm": 1.687276671210281, + "learning_rate": 2.3403595784252945e-08, + "loss": 1.3684, + "step": 6040 + }, + { + "epoch": 0.4688287031655624, + "grad_norm": 1.7655531898234809, + "learning_rate": 2.3442343459392438e-08, + "loss": 1.3548, + "step": 6050 + }, + { + "epoch": 0.4696036266418691, + "grad_norm": 1.7745061716010473, + "learning_rate": 2.348109113453193e-08, + "loss": 1.351, + "step": 6060 + }, + { + "epoch": 0.4703785501181758, + "grad_norm": 1.6426036198473566, + "learning_rate": 2.3519838809671422e-08, + "loss": 1.3625, + "step": 6070 + }, + { + "epoch": 0.47115347359448256, + "grad_norm": 1.7243806500107675, + "learning_rate": 2.355858648481091e-08, + "loss": 1.3759, + "step": 6080 + }, + { + "epoch": 0.47192839707078926, + "grad_norm": 1.5768778364321812, + "learning_rate": 2.3597334159950403e-08, + "loss": 1.359, + "step": 6090 + }, + { + "epoch": 0.47270332054709596, + "grad_norm": 1.725783773924587, + "learning_rate": 2.3636081835089895e-08, + "loss": 1.3758, + "step": 6100 + }, + { + "epoch": 0.47347824402340266, + "grad_norm": 1.7332974377421835, + "learning_rate": 2.3674829510229387e-08, + "loss": 1.3766, + "step": 6110 + }, + { + "epoch": 0.4742531674997094, + "grad_norm": 1.656027403145756, + "learning_rate": 2.371357718536888e-08, + "loss": 1.3566, + "step": 6120 + }, + { + "epoch": 0.4750280909760161, + "grad_norm": 2.240219206936125, + "learning_rate": 2.3752324860508372e-08, + "loss": 1.3859, + "step": 6130 + }, + { + "epoch": 0.4758030144523228, + "grad_norm": 1.6651148791234727, + "learning_rate": 2.379107253564786e-08, + "loss": 1.3591, + "step": 6140 + }, + { + "epoch": 0.47657793792862957, + "grad_norm": 1.792391947473312, + "learning_rate": 2.3829820210787353e-08, + "loss": 1.3854, + "step": 6150 + }, + { + "epoch": 0.47735286140493627, + "grad_norm": 1.680860301145216, + "learning_rate": 2.3868567885926845e-08, + "loss": 1.3595, + "step": 6160 + }, + { + "epoch": 0.47812778488124297, + "grad_norm": 1.6606085889554205, + "learning_rate": 2.3907315561066337e-08, + "loss": 1.374, + "step": 6170 + }, + { + "epoch": 0.47890270835754967, + "grad_norm": 1.87849798582432, + "learning_rate": 2.394606323620583e-08, + "loss": 1.3766, + "step": 6180 + }, + { + "epoch": 0.4796776318338564, + "grad_norm": 1.5435109800846687, + "learning_rate": 2.3984810911345322e-08, + "loss": 1.3547, + "step": 6190 + }, + { + "epoch": 0.4804525553101631, + "grad_norm": 1.736543282744245, + "learning_rate": 2.402355858648481e-08, + "loss": 1.3632, + "step": 6200 + }, + { + "epoch": 0.4812274787864698, + "grad_norm": 1.6470356665946961, + "learning_rate": 2.4062306261624303e-08, + "loss": 1.3861, + "step": 6210 + }, + { + "epoch": 0.4820024022627766, + "grad_norm": 3.4522664913227183, + "learning_rate": 2.4101053936763795e-08, + "loss": 1.3515, + "step": 6220 + }, + { + "epoch": 0.4827773257390833, + "grad_norm": 1.6749118513722498, + "learning_rate": 2.4139801611903287e-08, + "loss": 1.345, + "step": 6230 + }, + { + "epoch": 0.48355224921539, + "grad_norm": 1.7510669821882832, + "learning_rate": 2.417854928704278e-08, + "loss": 1.3587, + "step": 6240 + }, + { + "epoch": 0.4843271726916967, + "grad_norm": 1.6877364543891622, + "learning_rate": 2.421729696218227e-08, + "loss": 1.372, + "step": 6250 + }, + { + "epoch": 0.4851020961680034, + "grad_norm": 1.7464717516752892, + "learning_rate": 2.425604463732176e-08, + "loss": 1.354, + "step": 6260 + }, + { + "epoch": 0.4858770196443101, + "grad_norm": 1.794737799233447, + "learning_rate": 2.4294792312461253e-08, + "loss": 1.3779, + "step": 6270 + }, + { + "epoch": 0.4866519431206168, + "grad_norm": 1.7178343411023833, + "learning_rate": 2.4333539987600745e-08, + "loss": 1.3753, + "step": 6280 + }, + { + "epoch": 0.4874268665969236, + "grad_norm": 1.7526426491436013, + "learning_rate": 2.4372287662740237e-08, + "loss": 1.341, + "step": 6290 + }, + { + "epoch": 0.4882017900732303, + "grad_norm": 1.733954431572153, + "learning_rate": 2.441103533787973e-08, + "loss": 1.3503, + "step": 6300 + }, + { + "epoch": 0.488976713549537, + "grad_norm": 1.7585844967123496, + "learning_rate": 2.4449783013019218e-08, + "loss": 1.3387, + "step": 6310 + }, + { + "epoch": 0.4897516370258437, + "grad_norm": 1.6674600775001258, + "learning_rate": 2.448853068815871e-08, + "loss": 1.3172, + "step": 6320 + }, + { + "epoch": 0.49052656050215043, + "grad_norm": 1.8001432497946814, + "learning_rate": 2.4527278363298203e-08, + "loss": 1.3663, + "step": 6330 + }, + { + "epoch": 0.49130148397845713, + "grad_norm": 1.622624554206836, + "learning_rate": 2.4566026038437695e-08, + "loss": 1.3708, + "step": 6340 + }, + { + "epoch": 0.49207640745476383, + "grad_norm": 1.8097000336675153, + "learning_rate": 2.4604773713577187e-08, + "loss": 1.3676, + "step": 6350 + }, + { + "epoch": 0.49285133093107053, + "grad_norm": 1.8291351504989712, + "learning_rate": 2.464352138871668e-08, + "loss": 1.3228, + "step": 6360 + }, + { + "epoch": 0.4936262544073773, + "grad_norm": 1.7315503675053898, + "learning_rate": 2.4682269063856168e-08, + "loss": 1.3449, + "step": 6370 + }, + { + "epoch": 0.494401177883684, + "grad_norm": 1.7613138932356855, + "learning_rate": 2.472101673899566e-08, + "loss": 1.3389, + "step": 6380 + }, + { + "epoch": 0.4951761013599907, + "grad_norm": 1.8328752905442396, + "learning_rate": 2.4759764414135153e-08, + "loss": 1.3362, + "step": 6390 + }, + { + "epoch": 0.49595102483629744, + "grad_norm": 1.9505584207017517, + "learning_rate": 2.4798512089274645e-08, + "loss": 1.3279, + "step": 6400 + }, + { + "epoch": 0.49672594831260414, + "grad_norm": 1.6167900647848852, + "learning_rate": 2.4837259764414137e-08, + "loss": 1.3506, + "step": 6410 + }, + { + "epoch": 0.49750087178891084, + "grad_norm": 1.7948352395199292, + "learning_rate": 2.487600743955363e-08, + "loss": 1.358, + "step": 6420 + }, + { + "epoch": 0.49827579526521754, + "grad_norm": 1.6940965925453084, + "learning_rate": 2.4914755114693118e-08, + "loss": 1.3471, + "step": 6430 + }, + { + "epoch": 0.4990507187415243, + "grad_norm": 1.714596991524616, + "learning_rate": 2.495350278983261e-08, + "loss": 1.3631, + "step": 6440 + }, + { + "epoch": 0.499825642217831, + "grad_norm": 1.6353086214346768, + "learning_rate": 2.4992250464972102e-08, + "loss": 1.3552, + "step": 6450 + }, + { + "epoch": 0.5006005656941377, + "grad_norm": 3.938398796826866, + "learning_rate": 2.5030998140111595e-08, + "loss": 1.343, + "step": 6460 + }, + { + "epoch": 0.5013754891704444, + "grad_norm": 1.6654674353855954, + "learning_rate": 2.5069745815251087e-08, + "loss": 1.3214, + "step": 6470 + }, + { + "epoch": 0.5021504126467511, + "grad_norm": 1.7570298444544976, + "learning_rate": 2.510849349039058e-08, + "loss": 1.3579, + "step": 6480 + }, + { + "epoch": 0.5029253361230579, + "grad_norm": 1.7007874828932403, + "learning_rate": 2.5147241165530068e-08, + "loss": 1.3344, + "step": 6490 + }, + { + "epoch": 0.5037002595993646, + "grad_norm": 1.645461255281783, + "learning_rate": 2.518598884066956e-08, + "loss": 1.3454, + "step": 6500 + }, + { + "epoch": 0.5037002595993646, + "eval_loss": 1.3429144620895386, + "eval_runtime": 317.9938, + "eval_samples_per_second": 36.073, + "eval_steps_per_second": 9.019, + "step": 6500 + }, + { + "epoch": 0.5044751830756713, + "grad_norm": 1.7559613252293755, + "learning_rate": 2.5224736515809052e-08, + "loss": 1.3284, + "step": 6510 + }, + { + "epoch": 0.505250106551978, + "grad_norm": 1.6401359122948365, + "learning_rate": 2.5263484190948545e-08, + "loss": 1.351, + "step": 6520 + }, + { + "epoch": 0.5060250300282847, + "grad_norm": 1.646396041377876, + "learning_rate": 2.5302231866088037e-08, + "loss": 1.3116, + "step": 6530 + }, + { + "epoch": 0.5067999535045914, + "grad_norm": 1.8666953588301065, + "learning_rate": 2.5340979541227526e-08, + "loss": 1.3479, + "step": 6540 + }, + { + "epoch": 0.5075748769808981, + "grad_norm": 1.6797266397610309, + "learning_rate": 2.5379727216367018e-08, + "loss": 1.3092, + "step": 6550 + }, + { + "epoch": 0.5083498004572049, + "grad_norm": 1.6717868264952032, + "learning_rate": 2.541847489150651e-08, + "loss": 1.3389, + "step": 6560 + }, + { + "epoch": 0.5091247239335116, + "grad_norm": 1.7624792990497924, + "learning_rate": 2.5457222566646002e-08, + "loss": 1.3223, + "step": 6570 + }, + { + "epoch": 0.5098996474098183, + "grad_norm": 2.0870013029459638, + "learning_rate": 2.5495970241785494e-08, + "loss": 1.3502, + "step": 6580 + }, + { + "epoch": 0.510674570886125, + "grad_norm": 1.7950726729375333, + "learning_rate": 2.5534717916924987e-08, + "loss": 1.36, + "step": 6590 + }, + { + "epoch": 0.5114494943624317, + "grad_norm": 1.6226307646324147, + "learning_rate": 2.5573465592064476e-08, + "loss": 1.3298, + "step": 6600 + }, + { + "epoch": 0.5122244178387384, + "grad_norm": 2.4989087065832445, + "learning_rate": 2.5612213267203968e-08, + "loss": 1.332, + "step": 6610 + }, + { + "epoch": 0.5129993413150451, + "grad_norm": 1.9315589939287499, + "learning_rate": 2.565096094234346e-08, + "loss": 1.3257, + "step": 6620 + }, + { + "epoch": 0.5137742647913519, + "grad_norm": 1.669217492587651, + "learning_rate": 2.5689708617482952e-08, + "loss": 1.3343, + "step": 6630 + }, + { + "epoch": 0.5145491882676586, + "grad_norm": 13.069815295593893, + "learning_rate": 2.5728456292622444e-08, + "loss": 1.3646, + "step": 6640 + }, + { + "epoch": 0.5153241117439653, + "grad_norm": 1.6861028785635372, + "learning_rate": 2.5767203967761937e-08, + "loss": 1.3267, + "step": 6650 + }, + { + "epoch": 0.516099035220272, + "grad_norm": 1.7516487741870432, + "learning_rate": 2.5805951642901425e-08, + "loss": 1.3278, + "step": 6660 + }, + { + "epoch": 0.5168739586965787, + "grad_norm": 1.745872189396125, + "learning_rate": 2.5844699318040918e-08, + "loss": 1.3392, + "step": 6670 + }, + { + "epoch": 0.5176488821728854, + "grad_norm": 1.6485207560934654, + "learning_rate": 2.588344699318041e-08, + "loss": 1.3264, + "step": 6680 + }, + { + "epoch": 0.5184238056491921, + "grad_norm": 1.6449577858512967, + "learning_rate": 2.5922194668319902e-08, + "loss": 1.3421, + "step": 6690 + }, + { + "epoch": 0.5191987291254988, + "grad_norm": 1.711975774528819, + "learning_rate": 2.5960942343459394e-08, + "loss": 1.3209, + "step": 6700 + }, + { + "epoch": 0.5199736526018056, + "grad_norm": 1.6377360837667565, + "learning_rate": 2.5999690018598886e-08, + "loss": 1.3449, + "step": 6710 + }, + { + "epoch": 0.5207485760781123, + "grad_norm": 1.5978324497063212, + "learning_rate": 2.6038437693738375e-08, + "loss": 1.3249, + "step": 6720 + }, + { + "epoch": 0.521523499554419, + "grad_norm": 2.9098786299889725, + "learning_rate": 2.6077185368877867e-08, + "loss": 1.3253, + "step": 6730 + }, + { + "epoch": 0.5222984230307257, + "grad_norm": 1.7571026071418236, + "learning_rate": 2.611593304401736e-08, + "loss": 1.3241, + "step": 6740 + }, + { + "epoch": 0.5230733465070324, + "grad_norm": 1.7761317485501877, + "learning_rate": 2.6154680719156852e-08, + "loss": 1.3486, + "step": 6750 + }, + { + "epoch": 0.5238482699833391, + "grad_norm": 1.661778067885993, + "learning_rate": 2.6193428394296344e-08, + "loss": 1.3624, + "step": 6760 + }, + { + "epoch": 0.5246231934596458, + "grad_norm": 1.5505206330333121, + "learning_rate": 2.6232176069435833e-08, + "loss": 1.3456, + "step": 6770 + }, + { + "epoch": 0.5253981169359526, + "grad_norm": 1.7135646162101914, + "learning_rate": 2.6270923744575325e-08, + "loss": 1.327, + "step": 6780 + }, + { + "epoch": 0.5261730404122593, + "grad_norm": 1.8176468816061562, + "learning_rate": 2.6309671419714817e-08, + "loss": 1.3528, + "step": 6790 + }, + { + "epoch": 0.526947963888566, + "grad_norm": 1.6202815718131272, + "learning_rate": 2.634841909485431e-08, + "loss": 1.3057, + "step": 6800 + }, + { + "epoch": 0.5277228873648727, + "grad_norm": 22.363511928928375, + "learning_rate": 2.6387166769993802e-08, + "loss": 1.3203, + "step": 6810 + }, + { + "epoch": 0.5284978108411794, + "grad_norm": 1.712175703621639, + "learning_rate": 2.6425914445133294e-08, + "loss": 1.3444, + "step": 6820 + }, + { + "epoch": 0.5292727343174861, + "grad_norm": 2.132910794369731, + "learning_rate": 2.6464662120272783e-08, + "loss": 1.3173, + "step": 6830 + }, + { + "epoch": 0.5300476577937928, + "grad_norm": 1.7803233532208362, + "learning_rate": 2.6503409795412275e-08, + "loss": 1.3446, + "step": 6840 + }, + { + "epoch": 0.5308225812700996, + "grad_norm": 1.6355524588717312, + "learning_rate": 2.6542157470551767e-08, + "loss": 1.3101, + "step": 6850 + }, + { + "epoch": 0.5315975047464063, + "grad_norm": 1.5334006658576282, + "learning_rate": 2.658090514569126e-08, + "loss": 1.319, + "step": 6860 + }, + { + "epoch": 0.532372428222713, + "grad_norm": 1.6798878647705162, + "learning_rate": 2.6619652820830752e-08, + "loss": 1.3125, + "step": 6870 + }, + { + "epoch": 0.5331473516990197, + "grad_norm": 1.7153998294977557, + "learning_rate": 2.6658400495970244e-08, + "loss": 1.3121, + "step": 6880 + }, + { + "epoch": 0.5339222751753264, + "grad_norm": 1.6235346926122447, + "learning_rate": 2.6697148171109733e-08, + "loss": 1.3115, + "step": 6890 + }, + { + "epoch": 0.5346971986516331, + "grad_norm": 1.7958272971829368, + "learning_rate": 2.6735895846249225e-08, + "loss": 1.3256, + "step": 6900 + }, + { + "epoch": 0.5354721221279398, + "grad_norm": 1.839037156028821, + "learning_rate": 2.6774643521388717e-08, + "loss": 1.3127, + "step": 6910 + }, + { + "epoch": 0.5362470456042466, + "grad_norm": 1.947471095061869, + "learning_rate": 2.681339119652821e-08, + "loss": 1.3314, + "step": 6920 + }, + { + "epoch": 0.5370219690805533, + "grad_norm": 1.6855833543235383, + "learning_rate": 2.68521388716677e-08, + "loss": 1.3174, + "step": 6930 + }, + { + "epoch": 0.53779689255686, + "grad_norm": 1.6537777805453178, + "learning_rate": 2.689088654680719e-08, + "loss": 1.3177, + "step": 6940 + }, + { + "epoch": 0.5385718160331667, + "grad_norm": 1.6910192105345918, + "learning_rate": 2.6929634221946683e-08, + "loss": 1.3527, + "step": 6950 + }, + { + "epoch": 0.5393467395094734, + "grad_norm": 1.7181115673393028, + "learning_rate": 2.6968381897086175e-08, + "loss": 1.3278, + "step": 6960 + }, + { + "epoch": 0.5401216629857801, + "grad_norm": 1.7059863938107513, + "learning_rate": 2.7007129572225667e-08, + "loss": 1.312, + "step": 6970 + }, + { + "epoch": 0.5408965864620868, + "grad_norm": 1.7146450101143826, + "learning_rate": 2.704587724736516e-08, + "loss": 1.3096, + "step": 6980 + }, + { + "epoch": 0.5416715099383936, + "grad_norm": 1.6553143562971095, + "learning_rate": 2.708462492250465e-08, + "loss": 1.3043, + "step": 6990 + }, + { + "epoch": 0.5424464334147003, + "grad_norm": 1.7539548144383597, + "learning_rate": 2.712337259764414e-08, + "loss": 1.3213, + "step": 7000 + }, + { + "epoch": 0.5424464334147003, + "eval_loss": 1.3243813514709473, + "eval_runtime": 318.3543, + "eval_samples_per_second": 36.032, + "eval_steps_per_second": 9.009, + "step": 7000 + }, + { + "epoch": 0.543221356891007, + "grad_norm": 1.5559625630075224, + "learning_rate": 2.7162120272783633e-08, + "loss": 1.3127, + "step": 7010 + }, + { + "epoch": 0.5439962803673137, + "grad_norm": 1.5782806615246001, + "learning_rate": 2.7200867947923125e-08, + "loss": 1.3369, + "step": 7020 + }, + { + "epoch": 0.5447712038436204, + "grad_norm": 1.5204272700740222, + "learning_rate": 2.7239615623062617e-08, + "loss": 1.3103, + "step": 7030 + }, + { + "epoch": 0.5455461273199271, + "grad_norm": 1.5363925450503308, + "learning_rate": 2.727836329820211e-08, + "loss": 1.3053, + "step": 7040 + }, + { + "epoch": 0.5463210507962338, + "grad_norm": 1.7004109119295385, + "learning_rate": 2.73171109733416e-08, + "loss": 1.3157, + "step": 7050 + }, + { + "epoch": 0.5470959742725406, + "grad_norm": 1.692359470288306, + "learning_rate": 2.735585864848109e-08, + "loss": 1.337, + "step": 7060 + }, + { + "epoch": 0.5478708977488473, + "grad_norm": 1.5789593270041324, + "learning_rate": 2.7394606323620582e-08, + "loss": 1.3132, + "step": 7070 + }, + { + "epoch": 0.548645821225154, + "grad_norm": 1.6660663184421363, + "learning_rate": 2.7433353998760075e-08, + "loss": 1.3424, + "step": 7080 + }, + { + "epoch": 0.5494207447014607, + "grad_norm": 1.6206200968905162, + "learning_rate": 2.7472101673899567e-08, + "loss": 1.308, + "step": 7090 + }, + { + "epoch": 0.5501956681777674, + "grad_norm": 1.549901076755582, + "learning_rate": 2.751084934903906e-08, + "loss": 1.3205, + "step": 7100 + }, + { + "epoch": 0.5509705916540741, + "grad_norm": 1.6351892900748402, + "learning_rate": 2.754959702417855e-08, + "loss": 1.3272, + "step": 7110 + }, + { + "epoch": 0.5517455151303808, + "grad_norm": 1.6791958472662993, + "learning_rate": 2.758834469931804e-08, + "loss": 1.3079, + "step": 7120 + }, + { + "epoch": 0.5525204386066875, + "grad_norm": 1.7021373421685357, + "learning_rate": 2.7627092374457532e-08, + "loss": 1.3246, + "step": 7130 + }, + { + "epoch": 0.5532953620829943, + "grad_norm": 1.6802751385462784, + "learning_rate": 2.7665840049597025e-08, + "loss": 1.3001, + "step": 7140 + }, + { + "epoch": 0.554070285559301, + "grad_norm": 1.6156583226435977, + "learning_rate": 2.7704587724736517e-08, + "loss": 1.3417, + "step": 7150 + }, + { + "epoch": 0.5548452090356077, + "grad_norm": 1.6582528314263159, + "learning_rate": 2.774333539987601e-08, + "loss": 1.3289, + "step": 7160 + }, + { + "epoch": 0.5556201325119144, + "grad_norm": 2.029658466196419, + "learning_rate": 2.7782083075015498e-08, + "loss": 1.3038, + "step": 7170 + }, + { + "epoch": 0.5563950559882211, + "grad_norm": 1.7763570010395875, + "learning_rate": 2.782083075015499e-08, + "loss": 1.3256, + "step": 7180 + }, + { + "epoch": 0.5571699794645278, + "grad_norm": 1.6139773542675953, + "learning_rate": 2.7859578425294482e-08, + "loss": 1.3294, + "step": 7190 + }, + { + "epoch": 0.5579449029408345, + "grad_norm": 1.5751902113321474, + "learning_rate": 2.7898326100433974e-08, + "loss": 1.3121, + "step": 7200 + }, + { + "epoch": 0.5587198264171414, + "grad_norm": 1.6657186361801153, + "learning_rate": 2.7937073775573467e-08, + "loss": 1.2982, + "step": 7210 + }, + { + "epoch": 0.559494749893448, + "grad_norm": 1.6435125705701372, + "learning_rate": 2.797582145071296e-08, + "loss": 1.3238, + "step": 7220 + }, + { + "epoch": 0.5602696733697548, + "grad_norm": 1.5715299778238412, + "learning_rate": 2.8014569125852448e-08, + "loss": 1.326, + "step": 7230 + }, + { + "epoch": 0.5610445968460614, + "grad_norm": 1.756420902479542, + "learning_rate": 2.805331680099194e-08, + "loss": 1.3145, + "step": 7240 + }, + { + "epoch": 0.5618195203223681, + "grad_norm": 1.7992160098841954, + "learning_rate": 2.8092064476131432e-08, + "loss": 1.3206, + "step": 7250 + }, + { + "epoch": 0.5625944437986748, + "grad_norm": 1.6204687933218693, + "learning_rate": 2.8130812151270924e-08, + "loss": 1.3134, + "step": 7260 + }, + { + "epoch": 0.5633693672749815, + "grad_norm": 1.593343579217188, + "learning_rate": 2.8169559826410417e-08, + "loss": 1.3463, + "step": 7270 + }, + { + "epoch": 0.5641442907512884, + "grad_norm": 1.6082965662010495, + "learning_rate": 2.820830750154991e-08, + "loss": 1.3222, + "step": 7280 + }, + { + "epoch": 0.5649192142275951, + "grad_norm": 1.5103344325284234, + "learning_rate": 2.8247055176689398e-08, + "loss": 1.332, + "step": 7290 + }, + { + "epoch": 0.5656941377039018, + "grad_norm": 1.5687897532081858, + "learning_rate": 2.828580285182889e-08, + "loss": 1.297, + "step": 7300 + }, + { + "epoch": 0.5664690611802085, + "grad_norm": 1.6463473069583978, + "learning_rate": 2.8324550526968382e-08, + "loss": 1.3028, + "step": 7310 + }, + { + "epoch": 0.5672439846565152, + "grad_norm": 1.691667035790162, + "learning_rate": 2.8363298202107874e-08, + "loss": 1.3202, + "step": 7320 + }, + { + "epoch": 0.5680189081328219, + "grad_norm": 1.6396537679945486, + "learning_rate": 2.8402045877247366e-08, + "loss": 1.2911, + "step": 7330 + }, + { + "epoch": 0.5687938316091286, + "grad_norm": 1.8027732255042506, + "learning_rate": 2.844079355238686e-08, + "loss": 1.3153, + "step": 7340 + }, + { + "epoch": 0.5695687550854354, + "grad_norm": 1.6295967561968552, + "learning_rate": 2.8479541227526348e-08, + "loss": 1.3204, + "step": 7350 + }, + { + "epoch": 0.5703436785617421, + "grad_norm": 1.50826794553936, + "learning_rate": 2.851828890266584e-08, + "loss": 1.3268, + "step": 7360 + }, + { + "epoch": 0.5711186020380488, + "grad_norm": 1.630826534247081, + "learning_rate": 2.8557036577805332e-08, + "loss": 1.304, + "step": 7370 + }, + { + "epoch": 0.5718935255143555, + "grad_norm": 1.6118524250337825, + "learning_rate": 2.8595784252944824e-08, + "loss": 1.3467, + "step": 7380 + }, + { + "epoch": 0.5726684489906622, + "grad_norm": 1.6315278162134161, + "learning_rate": 2.8634531928084316e-08, + "loss": 1.3083, + "step": 7390 + }, + { + "epoch": 0.5734433724669689, + "grad_norm": 1.6794307770567816, + "learning_rate": 2.8673279603223805e-08, + "loss": 1.2902, + "step": 7400 + }, + { + "epoch": 0.5742182959432756, + "grad_norm": 1.5859781974693337, + "learning_rate": 2.8712027278363297e-08, + "loss": 1.3203, + "step": 7410 + }, + { + "epoch": 0.5749932194195824, + "grad_norm": 1.695112427943916, + "learning_rate": 2.875077495350279e-08, + "loss": 1.3026, + "step": 7420 + }, + { + "epoch": 0.5757681428958891, + "grad_norm": 1.6335130967817877, + "learning_rate": 2.8789522628642282e-08, + "loss": 1.3326, + "step": 7430 + }, + { + "epoch": 0.5765430663721958, + "grad_norm": 1.9331955069991196, + "learning_rate": 2.8828270303781774e-08, + "loss": 1.3175, + "step": 7440 + }, + { + "epoch": 0.5773179898485025, + "grad_norm": 1.7493017638662738, + "learning_rate": 2.8867017978921266e-08, + "loss": 1.3328, + "step": 7450 + }, + { + "epoch": 0.5780929133248092, + "grad_norm": 1.6437719649261073, + "learning_rate": 2.8905765654060755e-08, + "loss": 1.2984, + "step": 7460 + }, + { + "epoch": 0.5788678368011159, + "grad_norm": 1.5059479856151214, + "learning_rate": 2.8944513329200247e-08, + "loss": 1.2938, + "step": 7470 + }, + { + "epoch": 0.5796427602774226, + "grad_norm": 1.6525577038120522, + "learning_rate": 2.898326100433974e-08, + "loss": 1.2738, + "step": 7480 + }, + { + "epoch": 0.5804176837537294, + "grad_norm": 1.535239845898143, + "learning_rate": 2.9022008679479232e-08, + "loss": 1.2803, + "step": 7490 + }, + { + "epoch": 0.5811926072300361, + "grad_norm": 1.6183860446594918, + "learning_rate": 2.9060756354618724e-08, + "loss": 1.2856, + "step": 7500 + }, + { + "epoch": 0.5811926072300361, + "eval_loss": 1.3070015907287598, + "eval_runtime": 318.4776, + "eval_samples_per_second": 36.018, + "eval_steps_per_second": 9.005, + "step": 7500 + }, + { + "epoch": 0.5819675307063428, + "grad_norm": 1.7480384424875814, + "learning_rate": 2.9099504029758216e-08, + "loss": 1.3046, + "step": 7510 + }, + { + "epoch": 0.5827424541826495, + "grad_norm": 1.5297155164483658, + "learning_rate": 2.9138251704897705e-08, + "loss": 1.3183, + "step": 7520 + }, + { + "epoch": 0.5835173776589562, + "grad_norm": 1.5181780329332786, + "learning_rate": 2.9176999380037197e-08, + "loss": 1.3049, + "step": 7530 + }, + { + "epoch": 0.5842923011352629, + "grad_norm": 1.6273168942218779, + "learning_rate": 2.921574705517669e-08, + "loss": 1.2846, + "step": 7540 + }, + { + "epoch": 0.5850672246115696, + "grad_norm": 1.69102464195553, + "learning_rate": 2.9254494730316182e-08, + "loss": 1.3101, + "step": 7550 + }, + { + "epoch": 0.5858421480878763, + "grad_norm": 1.7363563873686767, + "learning_rate": 2.9293242405455674e-08, + "loss": 1.3258, + "step": 7560 + }, + { + "epoch": 0.5866170715641831, + "grad_norm": 1.6140445848903562, + "learning_rate": 2.933199008059517e-08, + "loss": 1.3022, + "step": 7570 + }, + { + "epoch": 0.5873919950404898, + "grad_norm": 1.623913282622136, + "learning_rate": 2.937073775573466e-08, + "loss": 1.3106, + "step": 7580 + }, + { + "epoch": 0.5881669185167965, + "grad_norm": 1.512363590304044, + "learning_rate": 2.940948543087415e-08, + "loss": 1.3177, + "step": 7590 + }, + { + "epoch": 0.5889418419931032, + "grad_norm": 1.70081343355845, + "learning_rate": 2.9448233106013643e-08, + "loss": 1.3106, + "step": 7600 + }, + { + "epoch": 0.5897167654694099, + "grad_norm": 1.5647224116394058, + "learning_rate": 2.9486980781153135e-08, + "loss": 1.3068, + "step": 7610 + }, + { + "epoch": 0.5904916889457166, + "grad_norm": 1.88121078324001, + "learning_rate": 2.9525728456292627e-08, + "loss": 1.2792, + "step": 7620 + }, + { + "epoch": 0.5912666124220233, + "grad_norm": 1.6126293515632215, + "learning_rate": 2.956447613143212e-08, + "loss": 1.3076, + "step": 7630 + }, + { + "epoch": 0.5920415358983301, + "grad_norm": 1.672888599292994, + "learning_rate": 2.960322380657161e-08, + "loss": 1.322, + "step": 7640 + }, + { + "epoch": 0.5928164593746368, + "grad_norm": 1.6183399832539127, + "learning_rate": 2.96419714817111e-08, + "loss": 1.2804, + "step": 7650 + }, + { + "epoch": 0.5935913828509435, + "grad_norm": 1.7167081630214114, + "learning_rate": 2.9680719156850593e-08, + "loss": 1.3089, + "step": 7660 + }, + { + "epoch": 0.5943663063272502, + "grad_norm": 1.5958639973985163, + "learning_rate": 2.9719466831990085e-08, + "loss": 1.2759, + "step": 7670 + }, + { + "epoch": 0.5951412298035569, + "grad_norm": 1.6171462293946834, + "learning_rate": 2.9758214507129577e-08, + "loss": 1.3227, + "step": 7680 + }, + { + "epoch": 0.5959161532798636, + "grad_norm": 1.6461705590132978, + "learning_rate": 2.979696218226907e-08, + "loss": 1.3, + "step": 7690 + }, + { + "epoch": 0.5966910767561703, + "grad_norm": 1.6332307795919572, + "learning_rate": 2.983570985740856e-08, + "loss": 1.2951, + "step": 7700 + }, + { + "epoch": 0.5974660002324771, + "grad_norm": 1.6717537939045188, + "learning_rate": 2.9874457532548054e-08, + "loss": 1.2647, + "step": 7710 + }, + { + "epoch": 0.5982409237087838, + "grad_norm": 1.6195035503380406, + "learning_rate": 2.991320520768754e-08, + "loss": 1.2785, + "step": 7720 + }, + { + "epoch": 0.5990158471850905, + "grad_norm": 1.6300186985065814, + "learning_rate": 2.995195288282704e-08, + "loss": 1.3109, + "step": 7730 + }, + { + "epoch": 0.5997907706613972, + "grad_norm": 1.5386953475417953, + "learning_rate": 2.999070055796653e-08, + "loss": 1.3133, + "step": 7740 + }, + { + "epoch": 0.6005656941377039, + "grad_norm": 1.820320346647064, + "learning_rate": 3.0029448233106016e-08, + "loss": 1.3053, + "step": 7750 + }, + { + "epoch": 0.6013406176140106, + "grad_norm": 1.5657623579281204, + "learning_rate": 3.006819590824551e-08, + "loss": 1.277, + "step": 7760 + }, + { + "epoch": 0.6021155410903173, + "grad_norm": 1.5530012650097356, + "learning_rate": 3.0106943583385e-08, + "loss": 1.2942, + "step": 7770 + }, + { + "epoch": 0.6028904645666241, + "grad_norm": 1.5469748892255744, + "learning_rate": 3.0145691258524496e-08, + "loss": 1.2931, + "step": 7780 + }, + { + "epoch": 0.6036653880429308, + "grad_norm": 1.520216662079202, + "learning_rate": 3.0184438933663985e-08, + "loss": 1.2893, + "step": 7790 + }, + { + "epoch": 0.6044403115192375, + "grad_norm": 1.6814519263049112, + "learning_rate": 3.0223186608803473e-08, + "loss": 1.2891, + "step": 7800 + }, + { + "epoch": 0.6052152349955442, + "grad_norm": 1.4766566461227053, + "learning_rate": 3.026193428394297e-08, + "loss": 1.29, + "step": 7810 + }, + { + "epoch": 0.6059901584718509, + "grad_norm": 1.6457133998275884, + "learning_rate": 3.030068195908246e-08, + "loss": 1.2876, + "step": 7820 + }, + { + "epoch": 0.6067650819481576, + "grad_norm": 1.5243589433111908, + "learning_rate": 3.0339429634221953e-08, + "loss": 1.3057, + "step": 7830 + }, + { + "epoch": 0.6075400054244643, + "grad_norm": 1.7107387507939642, + "learning_rate": 3.037817730936144e-08, + "loss": 1.2871, + "step": 7840 + }, + { + "epoch": 0.6083149289007711, + "grad_norm": 1.5318357875275745, + "learning_rate": 3.041692498450093e-08, + "loss": 1.3237, + "step": 7850 + }, + { + "epoch": 0.6090898523770778, + "grad_norm": 1.5872361316188688, + "learning_rate": 3.0455672659640427e-08, + "loss": 1.3026, + "step": 7860 + }, + { + "epoch": 0.6098647758533845, + "grad_norm": 1.6632206793696955, + "learning_rate": 3.0494420334779916e-08, + "loss": 1.2855, + "step": 7870 + }, + { + "epoch": 0.6106396993296912, + "grad_norm": 1.653408108725867, + "learning_rate": 3.053316800991941e-08, + "loss": 1.3117, + "step": 7880 + }, + { + "epoch": 0.6114146228059979, + "grad_norm": 1.6395193706314968, + "learning_rate": 3.05719156850589e-08, + "loss": 1.2747, + "step": 7890 + }, + { + "epoch": 0.6121895462823046, + "grad_norm": 1.7108925040158176, + "learning_rate": 3.0610663360198395e-08, + "loss": 1.2941, + "step": 7900 + }, + { + "epoch": 0.6129644697586113, + "grad_norm": 1.7339402608423398, + "learning_rate": 3.0649411035337884e-08, + "loss": 1.3, + "step": 7910 + }, + { + "epoch": 0.6137393932349181, + "grad_norm": 1.7162128160081327, + "learning_rate": 3.068815871047737e-08, + "loss": 1.2763, + "step": 7920 + }, + { + "epoch": 0.6145143167112248, + "grad_norm": 1.6197835055254148, + "learning_rate": 3.072690638561687e-08, + "loss": 1.2769, + "step": 7930 + }, + { + "epoch": 0.6152892401875315, + "grad_norm": 1.5929552183966182, + "learning_rate": 3.076565406075636e-08, + "loss": 1.3108, + "step": 7940 + }, + { + "epoch": 0.6160641636638382, + "grad_norm": 1.6491466601265907, + "learning_rate": 3.080440173589585e-08, + "loss": 1.2819, + "step": 7950 + }, + { + "epoch": 0.6168390871401449, + "grad_norm": 1.5640939939110179, + "learning_rate": 3.084314941103534e-08, + "loss": 1.2791, + "step": 7960 + }, + { + "epoch": 0.6176140106164516, + "grad_norm": 1.676636257202115, + "learning_rate": 3.088189708617483e-08, + "loss": 1.3327, + "step": 7970 + }, + { + "epoch": 0.6183889340927583, + "grad_norm": 1.475281943073299, + "learning_rate": 3.0920644761314326e-08, + "loss": 1.2698, + "step": 7980 + }, + { + "epoch": 0.619163857569065, + "grad_norm": 1.7881467936447657, + "learning_rate": 3.0959392436453815e-08, + "loss": 1.3396, + "step": 7990 + }, + { + "epoch": 0.6199387810453718, + "grad_norm": 1.5942449301550743, + "learning_rate": 3.099814011159331e-08, + "loss": 1.2965, + "step": 8000 + }, + { + "epoch": 0.6199387810453718, + "eval_loss": 1.290848970413208, + "eval_runtime": 317.6555, + "eval_samples_per_second": 36.111, + "eval_steps_per_second": 9.029, + "step": 8000 + }, + { + "epoch": 0.6207137045216785, + "grad_norm": 1.592764993308704, + "learning_rate": 3.10368877867328e-08, + "loss": 1.3131, + "step": 8010 + }, + { + "epoch": 0.6214886279979852, + "grad_norm": 1.551295843091275, + "learning_rate": 3.1075635461872295e-08, + "loss": 1.2748, + "step": 8020 + }, + { + "epoch": 0.6222635514742919, + "grad_norm": 1.750360318899126, + "learning_rate": 3.1114383137011784e-08, + "loss": 1.2815, + "step": 8030 + }, + { + "epoch": 0.6230384749505986, + "grad_norm": 1.532315207765978, + "learning_rate": 3.115313081215127e-08, + "loss": 1.2857, + "step": 8040 + }, + { + "epoch": 0.6238133984269053, + "grad_norm": 1.5853606432082201, + "learning_rate": 3.119187848729077e-08, + "loss": 1.2985, + "step": 8050 + }, + { + "epoch": 0.624588321903212, + "grad_norm": 1.8428911025196812, + "learning_rate": 3.123062616243026e-08, + "loss": 1.2899, + "step": 8060 + }, + { + "epoch": 0.6253632453795188, + "grad_norm": 2.233261734255558, + "learning_rate": 3.126937383756975e-08, + "loss": 1.2855, + "step": 8070 + }, + { + "epoch": 0.6261381688558255, + "grad_norm": 1.6032598210661013, + "learning_rate": 3.130812151270924e-08, + "loss": 1.2986, + "step": 8080 + }, + { + "epoch": 0.6269130923321322, + "grad_norm": 1.5578290556612022, + "learning_rate": 3.134686918784873e-08, + "loss": 1.2888, + "step": 8090 + }, + { + "epoch": 0.6276880158084389, + "grad_norm": 1.7088444054020964, + "learning_rate": 3.1385616862988226e-08, + "loss": 1.2586, + "step": 8100 + }, + { + "epoch": 0.6284629392847456, + "grad_norm": 1.5689792530179711, + "learning_rate": 3.1424364538127715e-08, + "loss": 1.2798, + "step": 8110 + }, + { + "epoch": 0.6292378627610523, + "grad_norm": 1.5107149940600821, + "learning_rate": 3.146311221326721e-08, + "loss": 1.2992, + "step": 8120 + }, + { + "epoch": 0.630012786237359, + "grad_norm": 2.4207712439466293, + "learning_rate": 3.15018598884067e-08, + "loss": 1.2981, + "step": 8130 + }, + { + "epoch": 0.6307877097136658, + "grad_norm": 1.5203509337487753, + "learning_rate": 3.154060756354619e-08, + "loss": 1.293, + "step": 8140 + }, + { + "epoch": 0.6315626331899725, + "grad_norm": 1.5957848245153947, + "learning_rate": 3.1579355238685684e-08, + "loss": 1.2666, + "step": 8150 + }, + { + "epoch": 0.6323375566662792, + "grad_norm": 1.5597755191353115, + "learning_rate": 3.161810291382517e-08, + "loss": 1.2708, + "step": 8160 + }, + { + "epoch": 0.6331124801425859, + "grad_norm": 1.6459877391192783, + "learning_rate": 3.165685058896467e-08, + "loss": 1.2995, + "step": 8170 + }, + { + "epoch": 0.6338874036188926, + "grad_norm": 1.6413627549631784, + "learning_rate": 3.169559826410416e-08, + "loss": 1.2872, + "step": 8180 + }, + { + "epoch": 0.6346623270951993, + "grad_norm": 1.640634219298847, + "learning_rate": 3.173434593924365e-08, + "loss": 1.2912, + "step": 8190 + }, + { + "epoch": 0.635437250571506, + "grad_norm": 1.5553645661940259, + "learning_rate": 3.177309361438314e-08, + "loss": 1.2543, + "step": 8200 + }, + { + "epoch": 0.6362121740478128, + "grad_norm": 1.6564761695676435, + "learning_rate": 3.181184128952263e-08, + "loss": 1.2872, + "step": 8210 + }, + { + "epoch": 0.6369870975241195, + "grad_norm": 1.5181100015981632, + "learning_rate": 3.1850588964662126e-08, + "loss": 1.2999, + "step": 8220 + }, + { + "epoch": 0.6377620210004262, + "grad_norm": 1.6519563533058852, + "learning_rate": 3.1889336639801615e-08, + "loss": 1.261, + "step": 8230 + }, + { + "epoch": 0.6385369444767329, + "grad_norm": 1.6652609988941263, + "learning_rate": 3.192808431494111e-08, + "loss": 1.2839, + "step": 8240 + }, + { + "epoch": 0.6393118679530396, + "grad_norm": 1.629007552465449, + "learning_rate": 3.19668319900806e-08, + "loss": 1.2955, + "step": 8250 + }, + { + "epoch": 0.6400867914293463, + "grad_norm": 1.5116387362529295, + "learning_rate": 3.200557966522009e-08, + "loss": 1.2885, + "step": 8260 + }, + { + "epoch": 0.640861714905653, + "grad_norm": 1.574858053361504, + "learning_rate": 3.2044327340359584e-08, + "loss": 1.277, + "step": 8270 + }, + { + "epoch": 0.6416366383819598, + "grad_norm": 1.5072244051488521, + "learning_rate": 3.208307501549907e-08, + "loss": 1.2933, + "step": 8280 + }, + { + "epoch": 0.6424115618582665, + "grad_norm": 2.069666776824591, + "learning_rate": 3.212182269063857e-08, + "loss": 1.2795, + "step": 8290 + }, + { + "epoch": 0.6431864853345732, + "grad_norm": 1.6817587539509642, + "learning_rate": 3.216057036577806e-08, + "loss": 1.2788, + "step": 8300 + }, + { + "epoch": 0.6439614088108799, + "grad_norm": 1.5246113390682856, + "learning_rate": 3.2199318040917546e-08, + "loss": 1.319, + "step": 8310 + }, + { + "epoch": 0.6447363322871866, + "grad_norm": 1.649829270081839, + "learning_rate": 3.223806571605704e-08, + "loss": 1.2747, + "step": 8320 + }, + { + "epoch": 0.6455112557634933, + "grad_norm": 1.4867364644854448, + "learning_rate": 3.227681339119653e-08, + "loss": 1.2584, + "step": 8330 + }, + { + "epoch": 0.6462861792398, + "grad_norm": 4.1325756792455275, + "learning_rate": 3.2315561066336026e-08, + "loss": 1.2919, + "step": 8340 + }, + { + "epoch": 0.6470611027161068, + "grad_norm": 1.5575902581160725, + "learning_rate": 3.2354308741475515e-08, + "loss": 1.2762, + "step": 8350 + }, + { + "epoch": 0.6478360261924135, + "grad_norm": 1.7805575340437454, + "learning_rate": 3.239305641661501e-08, + "loss": 1.2514, + "step": 8360 + }, + { + "epoch": 0.6486109496687202, + "grad_norm": 1.5736688888809882, + "learning_rate": 3.24318040917545e-08, + "loss": 1.2695, + "step": 8370 + }, + { + "epoch": 0.6493858731450269, + "grad_norm": 2.223380769930216, + "learning_rate": 3.247055176689399e-08, + "loss": 1.3027, + "step": 8380 + }, + { + "epoch": 0.6501607966213336, + "grad_norm": 1.5981995044401767, + "learning_rate": 3.2509299442033484e-08, + "loss": 1.2907, + "step": 8390 + }, + { + "epoch": 0.6509357200976403, + "grad_norm": 1.5412614003310725, + "learning_rate": 3.254804711717297e-08, + "loss": 1.2901, + "step": 8400 + }, + { + "epoch": 0.651710643573947, + "grad_norm": 1.5827678429371816, + "learning_rate": 3.258679479231247e-08, + "loss": 1.2755, + "step": 8410 + }, + { + "epoch": 0.6524855670502537, + "grad_norm": 1.6342605783470783, + "learning_rate": 3.262554246745196e-08, + "loss": 1.2739, + "step": 8420 + }, + { + "epoch": 0.6532604905265605, + "grad_norm": 1.5798090399718117, + "learning_rate": 3.2664290142591446e-08, + "loss": 1.2937, + "step": 8430 + }, + { + "epoch": 0.6540354140028672, + "grad_norm": 1.610650052298721, + "learning_rate": 3.270303781773094e-08, + "loss": 1.3061, + "step": 8440 + }, + { + "epoch": 0.6548103374791739, + "grad_norm": 1.591818698466337, + "learning_rate": 3.274178549287043e-08, + "loss": 1.2987, + "step": 8450 + }, + { + "epoch": 0.6555852609554806, + "grad_norm": 1.5717397192655875, + "learning_rate": 3.2780533168009926e-08, + "loss": 1.2989, + "step": 8460 + }, + { + "epoch": 0.6563601844317873, + "grad_norm": 1.5822994290427468, + "learning_rate": 3.2819280843149415e-08, + "loss": 1.2785, + "step": 8470 + }, + { + "epoch": 0.657135107908094, + "grad_norm": 1.650351804022458, + "learning_rate": 3.2858028518288903e-08, + "loss": 1.2781, + "step": 8480 + }, + { + "epoch": 0.6579100313844007, + "grad_norm": 1.5304129064558547, + "learning_rate": 3.28967761934284e-08, + "loss": 1.281, + "step": 8490 + }, + { + "epoch": 0.6586849548607075, + "grad_norm": 1.5686075925710916, + "learning_rate": 3.293552386856789e-08, + "loss": 1.2696, + "step": 8500 + }, + { + "epoch": 0.6586849548607075, + "eval_loss": 1.2756379842758179, + "eval_runtime": 319.8787, + "eval_samples_per_second": 35.86, + "eval_steps_per_second": 8.966, + "step": 8500 + }, + { + "epoch": 0.6594598783370142, + "grad_norm": 1.60247391388705, + "learning_rate": 3.2974271543707383e-08, + "loss": 1.2772, + "step": 8510 + }, + { + "epoch": 0.660234801813321, + "grad_norm": 1.775272120083721, + "learning_rate": 3.301301921884687e-08, + "loss": 1.2817, + "step": 8520 + }, + { + "epoch": 0.6610097252896276, + "grad_norm": 1.58768095929185, + "learning_rate": 3.305176689398637e-08, + "loss": 1.2961, + "step": 8530 + }, + { + "epoch": 0.6617846487659343, + "grad_norm": 1.6404323230349431, + "learning_rate": 3.3090514569125857e-08, + "loss": 1.3151, + "step": 8540 + }, + { + "epoch": 0.662559572242241, + "grad_norm": 1.6742956494132422, + "learning_rate": 3.3129262244265346e-08, + "loss": 1.2573, + "step": 8550 + }, + { + "epoch": 0.6633344957185477, + "grad_norm": 1.5423791672833023, + "learning_rate": 3.316800991940484e-08, + "loss": 1.28, + "step": 8560 + }, + { + "epoch": 0.6641094191948546, + "grad_norm": 1.5045380061702016, + "learning_rate": 3.320675759454433e-08, + "loss": 1.2807, + "step": 8570 + }, + { + "epoch": 0.6648843426711613, + "grad_norm": 1.6100483296532677, + "learning_rate": 3.3245505269683825e-08, + "loss": 1.2836, + "step": 8580 + }, + { + "epoch": 0.665659266147468, + "grad_norm": 1.7154631774181113, + "learning_rate": 3.3284252944823314e-08, + "loss": 1.2528, + "step": 8590 + }, + { + "epoch": 0.6664341896237747, + "grad_norm": 1.7378383255558754, + "learning_rate": 3.33230006199628e-08, + "loss": 1.2572, + "step": 8600 + }, + { + "epoch": 0.6672091131000814, + "grad_norm": 1.525429723294493, + "learning_rate": 3.33617482951023e-08, + "loss": 1.2789, + "step": 8610 + }, + { + "epoch": 0.667984036576388, + "grad_norm": 1.5091442347594428, + "learning_rate": 3.340049597024179e-08, + "loss": 1.3066, + "step": 8620 + }, + { + "epoch": 0.6687589600526948, + "grad_norm": 1.6016635767683405, + "learning_rate": 3.343924364538128e-08, + "loss": 1.2857, + "step": 8630 + }, + { + "epoch": 0.6695338835290016, + "grad_norm": 1.4879044345134305, + "learning_rate": 3.347799132052077e-08, + "loss": 1.2555, + "step": 8640 + }, + { + "epoch": 0.6703088070053083, + "grad_norm": 1.6432900001404154, + "learning_rate": 3.351673899566027e-08, + "loss": 1.2946, + "step": 8650 + }, + { + "epoch": 0.671083730481615, + "grad_norm": 1.5391280968309513, + "learning_rate": 3.3555486670799756e-08, + "loss": 1.2963, + "step": 8660 + }, + { + "epoch": 0.6718586539579217, + "grad_norm": 1.6586423649465902, + "learning_rate": 3.3594234345939245e-08, + "loss": 1.3174, + "step": 8670 + }, + { + "epoch": 0.6726335774342284, + "grad_norm": 1.6619726389298048, + "learning_rate": 3.363298202107874e-08, + "loss": 1.2544, + "step": 8680 + }, + { + "epoch": 0.6734085009105351, + "grad_norm": 1.5460887661939366, + "learning_rate": 3.367172969621823e-08, + "loss": 1.2981, + "step": 8690 + }, + { + "epoch": 0.6741834243868418, + "grad_norm": 1.9542356655445645, + "learning_rate": 3.3710477371357725e-08, + "loss": 1.3036, + "step": 8700 + }, + { + "epoch": 0.6749583478631486, + "grad_norm": 1.5831306597336057, + "learning_rate": 3.3749225046497214e-08, + "loss": 1.2739, + "step": 8710 + }, + { + "epoch": 0.6757332713394553, + "grad_norm": 1.6456221548205674, + "learning_rate": 3.37879727216367e-08, + "loss": 1.2503, + "step": 8720 + }, + { + "epoch": 0.676508194815762, + "grad_norm": 1.5164313929645292, + "learning_rate": 3.38267203967762e-08, + "loss": 1.2691, + "step": 8730 + }, + { + "epoch": 0.6772831182920687, + "grad_norm": 1.538574581293163, + "learning_rate": 3.386546807191569e-08, + "loss": 1.2949, + "step": 8740 + }, + { + "epoch": 0.6780580417683754, + "grad_norm": 1.542376124354385, + "learning_rate": 3.390421574705518e-08, + "loss": 1.2651, + "step": 8750 + }, + { + "epoch": 0.6788329652446821, + "grad_norm": 1.4825537032272367, + "learning_rate": 3.394296342219467e-08, + "loss": 1.2978, + "step": 8760 + }, + { + "epoch": 0.6796078887209888, + "grad_norm": 4.892407820841158, + "learning_rate": 3.398171109733416e-08, + "loss": 1.2606, + "step": 8770 + }, + { + "epoch": 0.6803828121972956, + "grad_norm": 1.71839169470713, + "learning_rate": 3.4020458772473656e-08, + "loss": 1.2855, + "step": 8780 + }, + { + "epoch": 0.6811577356736023, + "grad_norm": 1.7379576662225906, + "learning_rate": 3.4059206447613145e-08, + "loss": 1.2558, + "step": 8790 + }, + { + "epoch": 0.681932659149909, + "grad_norm": 1.4043006481485314, + "learning_rate": 3.409795412275264e-08, + "loss": 1.2558, + "step": 8800 + }, + { + "epoch": 0.6827075826262157, + "grad_norm": 1.559712196016308, + "learning_rate": 3.413670179789213e-08, + "loss": 1.2599, + "step": 8810 + }, + { + "epoch": 0.6834825061025224, + "grad_norm": 1.4538296869697924, + "learning_rate": 3.4175449473031625e-08, + "loss": 1.2506, + "step": 8820 + }, + { + "epoch": 0.6842574295788291, + "grad_norm": 1.5519813540523484, + "learning_rate": 3.4214197148171114e-08, + "loss": 1.2751, + "step": 8830 + }, + { + "epoch": 0.6850323530551358, + "grad_norm": 1.4889750808020366, + "learning_rate": 3.42529448233106e-08, + "loss": 1.27, + "step": 8840 + }, + { + "epoch": 0.6858072765314425, + "grad_norm": 1.7775152709229314, + "learning_rate": 3.42916924984501e-08, + "loss": 1.2797, + "step": 8850 + }, + { + "epoch": 0.6865822000077493, + "grad_norm": 1.4796934814682667, + "learning_rate": 3.433044017358959e-08, + "loss": 1.2768, + "step": 8860 + }, + { + "epoch": 0.687357123484056, + "grad_norm": 1.6544258231742504, + "learning_rate": 3.436918784872908e-08, + "loss": 1.2649, + "step": 8870 + }, + { + "epoch": 0.6881320469603627, + "grad_norm": 1.4810660522135293, + "learning_rate": 3.440793552386857e-08, + "loss": 1.2304, + "step": 8880 + }, + { + "epoch": 0.6889069704366694, + "grad_norm": 1.527843384097687, + "learning_rate": 3.444668319900806e-08, + "loss": 1.2806, + "step": 8890 + }, + { + "epoch": 0.6896818939129761, + "grad_norm": 1.5127803337552455, + "learning_rate": 3.4485430874147556e-08, + "loss": 1.2456, + "step": 8900 + }, + { + "epoch": 0.6904568173892828, + "grad_norm": 1.434029844512033, + "learning_rate": 3.4524178549287045e-08, + "loss": 1.2692, + "step": 8910 + }, + { + "epoch": 0.6912317408655895, + "grad_norm": 1.6300478628829231, + "learning_rate": 3.456292622442654e-08, + "loss": 1.2615, + "step": 8920 + }, + { + "epoch": 0.6920066643418963, + "grad_norm": 1.7998837598022908, + "learning_rate": 3.460167389956603e-08, + "loss": 1.2829, + "step": 8930 + }, + { + "epoch": 0.692781587818203, + "grad_norm": 1.574839823666908, + "learning_rate": 3.464042157470552e-08, + "loss": 1.2878, + "step": 8940 + }, + { + "epoch": 0.6935565112945097, + "grad_norm": 1.6252912736921088, + "learning_rate": 3.4679169249845014e-08, + "loss": 1.2746, + "step": 8950 + }, + { + "epoch": 0.6943314347708164, + "grad_norm": 1.485286135149687, + "learning_rate": 3.47179169249845e-08, + "loss": 1.2651, + "step": 8960 + }, + { + "epoch": 0.6951063582471231, + "grad_norm": 1.4978701050134227, + "learning_rate": 3.4756664600124e-08, + "loss": 1.2381, + "step": 8970 + }, + { + "epoch": 0.6958812817234298, + "grad_norm": 1.4863058201766013, + "learning_rate": 3.479541227526349e-08, + "loss": 1.2721, + "step": 8980 + }, + { + "epoch": 0.6966562051997365, + "grad_norm": 1.483499406177941, + "learning_rate": 3.483415995040298e-08, + "loss": 1.2733, + "step": 8990 + }, + { + "epoch": 0.6974311286760433, + "grad_norm": 1.5435986758508624, + "learning_rate": 3.487290762554247e-08, + "loss": 1.2679, + "step": 9000 + }, + { + "epoch": 0.6974311286760433, + "eval_loss": 1.2620388269424438, + "eval_runtime": 317.7771, + "eval_samples_per_second": 36.098, + "eval_steps_per_second": 9.025, + "step": 9000 + }, + { + "epoch": 0.69820605215235, + "grad_norm": 1.5554169009633247, + "learning_rate": 3.491165530068196e-08, + "loss": 1.2718, + "step": 9010 + }, + { + "epoch": 0.6989809756286567, + "grad_norm": 1.5518081964068586, + "learning_rate": 3.4950402975821456e-08, + "loss": 1.2463, + "step": 9020 + }, + { + "epoch": 0.6997558991049634, + "grad_norm": 1.7038547155146884, + "learning_rate": 3.4989150650960945e-08, + "loss": 1.2787, + "step": 9030 + }, + { + "epoch": 0.7005308225812701, + "grad_norm": 1.6904662431723645, + "learning_rate": 3.502789832610044e-08, + "loss": 1.2729, + "step": 9040 + }, + { + "epoch": 0.7013057460575768, + "grad_norm": 1.5363796712498987, + "learning_rate": 3.506664600123993e-08, + "loss": 1.265, + "step": 9050 + }, + { + "epoch": 0.7020806695338835, + "grad_norm": 1.4704838642783713, + "learning_rate": 3.510539367637942e-08, + "loss": 1.2509, + "step": 9060 + }, + { + "epoch": 0.7028555930101903, + "grad_norm": 1.6263754117685318, + "learning_rate": 3.5144141351518914e-08, + "loss": 1.2702, + "step": 9070 + }, + { + "epoch": 0.703630516486497, + "grad_norm": 1.5692932886467348, + "learning_rate": 3.51828890266584e-08, + "loss": 1.2855, + "step": 9080 + }, + { + "epoch": 0.7044054399628037, + "grad_norm": 1.573892013631362, + "learning_rate": 3.52216367017979e-08, + "loss": 1.2525, + "step": 9090 + }, + { + "epoch": 0.7051803634391104, + "grad_norm": 1.7120895494159751, + "learning_rate": 3.526038437693739e-08, + "loss": 1.2742, + "step": 9100 + }, + { + "epoch": 0.7059552869154171, + "grad_norm": 1.576188639591934, + "learning_rate": 3.529913205207688e-08, + "loss": 1.2455, + "step": 9110 + }, + { + "epoch": 0.7067302103917238, + "grad_norm": 1.7319595158918417, + "learning_rate": 3.533787972721637e-08, + "loss": 1.2583, + "step": 9120 + }, + { + "epoch": 0.7075051338680305, + "grad_norm": 1.495397599242847, + "learning_rate": 3.537662740235586e-08, + "loss": 1.263, + "step": 9130 + }, + { + "epoch": 0.7082800573443373, + "grad_norm": 1.5152422597997877, + "learning_rate": 3.5415375077495356e-08, + "loss": 1.2702, + "step": 9140 + }, + { + "epoch": 0.709054980820644, + "grad_norm": 1.5343458828811851, + "learning_rate": 3.5454122752634844e-08, + "loss": 1.2575, + "step": 9150 + }, + { + "epoch": 0.7098299042969507, + "grad_norm": 1.509556048694585, + "learning_rate": 3.549287042777434e-08, + "loss": 1.2518, + "step": 9160 + }, + { + "epoch": 0.7106048277732574, + "grad_norm": 1.6774729589126591, + "learning_rate": 3.553161810291383e-08, + "loss": 1.2629, + "step": 9170 + }, + { + "epoch": 0.7113797512495641, + "grad_norm": 1.661195669777254, + "learning_rate": 3.557036577805332e-08, + "loss": 1.2496, + "step": 9180 + }, + { + "epoch": 0.7121546747258708, + "grad_norm": 1.4788451322480591, + "learning_rate": 3.560911345319281e-08, + "loss": 1.2419, + "step": 9190 + }, + { + "epoch": 0.7129295982021775, + "grad_norm": 1.5374397686591446, + "learning_rate": 3.56478611283323e-08, + "loss": 1.2681, + "step": 9200 + }, + { + "epoch": 0.7137045216784843, + "grad_norm": 1.6267150384084905, + "learning_rate": 3.56866088034718e-08, + "loss": 1.3139, + "step": 9210 + }, + { + "epoch": 0.714479445154791, + "grad_norm": 1.5127453706883922, + "learning_rate": 3.5725356478611287e-08, + "loss": 1.229, + "step": 9220 + }, + { + "epoch": 0.7152543686310977, + "grad_norm": 1.6299129181403833, + "learning_rate": 3.5764104153750775e-08, + "loss": 1.2569, + "step": 9230 + }, + { + "epoch": 0.7160292921074044, + "grad_norm": 1.5500208532736681, + "learning_rate": 3.580285182889027e-08, + "loss": 1.2326, + "step": 9240 + }, + { + "epoch": 0.7168042155837111, + "grad_norm": 1.5419071465316898, + "learning_rate": 3.584159950402976e-08, + "loss": 1.2669, + "step": 9250 + }, + { + "epoch": 0.7175791390600178, + "grad_norm": 1.6172666054767528, + "learning_rate": 3.5880347179169255e-08, + "loss": 1.2537, + "step": 9260 + }, + { + "epoch": 0.7183540625363245, + "grad_norm": 2.0195303507998545, + "learning_rate": 3.5919094854308744e-08, + "loss": 1.2503, + "step": 9270 + }, + { + "epoch": 0.7191289860126312, + "grad_norm": 1.4216459549586429, + "learning_rate": 3.595784252944824e-08, + "loss": 1.2471, + "step": 9280 + }, + { + "epoch": 0.719903909488938, + "grad_norm": 2.5472504766180593, + "learning_rate": 3.599659020458773e-08, + "loss": 1.2462, + "step": 9290 + }, + { + "epoch": 0.7206788329652447, + "grad_norm": 1.8557339861558952, + "learning_rate": 3.603533787972722e-08, + "loss": 1.2595, + "step": 9300 + }, + { + "epoch": 0.7214537564415514, + "grad_norm": 1.7046931817272817, + "learning_rate": 3.607408555486671e-08, + "loss": 1.2822, + "step": 9310 + }, + { + "epoch": 0.7222286799178581, + "grad_norm": 1.5017153697828636, + "learning_rate": 3.61128332300062e-08, + "loss": 1.2684, + "step": 9320 + }, + { + "epoch": 0.7230036033941648, + "grad_norm": 1.7245896974228816, + "learning_rate": 3.61515809051457e-08, + "loss": 1.254, + "step": 9330 + }, + { + "epoch": 0.7237785268704715, + "grad_norm": 1.5354163058271773, + "learning_rate": 3.6190328580285186e-08, + "loss": 1.26, + "step": 9340 + }, + { + "epoch": 0.7245534503467782, + "grad_norm": 1.4597198626591992, + "learning_rate": 3.6229076255424675e-08, + "loss": 1.2279, + "step": 9350 + }, + { + "epoch": 0.725328373823085, + "grad_norm": 1.5302498384786907, + "learning_rate": 3.626782393056417e-08, + "loss": 1.2511, + "step": 9360 + }, + { + "epoch": 0.7261032972993917, + "grad_norm": 1.6071364599985951, + "learning_rate": 3.630657160570366e-08, + "loss": 1.2362, + "step": 9370 + }, + { + "epoch": 0.7268782207756984, + "grad_norm": 1.6308732819049305, + "learning_rate": 3.6345319280843155e-08, + "loss": 1.2595, + "step": 9380 + }, + { + "epoch": 0.7276531442520051, + "grad_norm": 1.6052317149704647, + "learning_rate": 3.6384066955982644e-08, + "loss": 1.2376, + "step": 9390 + }, + { + "epoch": 0.7284280677283118, + "grad_norm": 1.6782569055293146, + "learning_rate": 3.642281463112213e-08, + "loss": 1.2421, + "step": 9400 + }, + { + "epoch": 0.7292029912046185, + "grad_norm": 1.5437687640192272, + "learning_rate": 3.646156230626163e-08, + "loss": 1.2562, + "step": 9410 + }, + { + "epoch": 0.7299779146809252, + "grad_norm": 1.5104768348458935, + "learning_rate": 3.650030998140112e-08, + "loss": 1.264, + "step": 9420 + }, + { + "epoch": 0.730752838157232, + "grad_norm": 1.4579031252173718, + "learning_rate": 3.653905765654061e-08, + "loss": 1.2374, + "step": 9430 + }, + { + "epoch": 0.7315277616335387, + "grad_norm": 1.7925750384639196, + "learning_rate": 3.65778053316801e-08, + "loss": 1.2554, + "step": 9440 + }, + { + "epoch": 0.7323026851098454, + "grad_norm": 1.558103781497252, + "learning_rate": 3.66165530068196e-08, + "loss": 1.2329, + "step": 9450 + }, + { + "epoch": 0.7330776085861521, + "grad_norm": 1.5781844856559977, + "learning_rate": 3.6655300681959086e-08, + "loss": 1.2403, + "step": 9460 + }, + { + "epoch": 0.7338525320624588, + "grad_norm": 1.5861054887480428, + "learning_rate": 3.6694048357098575e-08, + "loss": 1.2435, + "step": 9470 + }, + { + "epoch": 0.7346274555387655, + "grad_norm": 1.5427219019113685, + "learning_rate": 3.673279603223807e-08, + "loss": 1.2593, + "step": 9480 + }, + { + "epoch": 0.7354023790150722, + "grad_norm": 1.5721575693513974, + "learning_rate": 3.677154370737756e-08, + "loss": 1.2742, + "step": 9490 + }, + { + "epoch": 0.736177302491379, + "grad_norm": 1.641522169660689, + "learning_rate": 3.6810291382517055e-08, + "loss": 1.2351, + "step": 9500 + }, + { + "epoch": 0.736177302491379, + "eval_loss": 1.2493815422058105, + "eval_runtime": 319.0616, + "eval_samples_per_second": 35.952, + "eval_steps_per_second": 8.989, + "step": 9500 + }, + { + "epoch": 0.7369522259676857, + "grad_norm": 1.4652391466981913, + "learning_rate": 3.6849039057656544e-08, + "loss": 1.2582, + "step": 9510 + }, + { + "epoch": 0.7377271494439924, + "grad_norm": 1.4880964209009262, + "learning_rate": 3.688778673279603e-08, + "loss": 1.2316, + "step": 9520 + }, + { + "epoch": 0.7385020729202991, + "grad_norm": 1.6442876504732484, + "learning_rate": 3.692653440793553e-08, + "loss": 1.2415, + "step": 9530 + }, + { + "epoch": 0.7392769963966058, + "grad_norm": 1.6252226973068364, + "learning_rate": 3.696528208307502e-08, + "loss": 1.2475, + "step": 9540 + }, + { + "epoch": 0.7400519198729125, + "grad_norm": 1.4925884431426946, + "learning_rate": 3.700402975821451e-08, + "loss": 1.2479, + "step": 9550 + }, + { + "epoch": 0.7408268433492192, + "grad_norm": 1.5527531519700357, + "learning_rate": 3.7042777433354e-08, + "loss": 1.2514, + "step": 9560 + }, + { + "epoch": 0.741601766825526, + "grad_norm": 1.5025963347284534, + "learning_rate": 3.70815251084935e-08, + "loss": 1.2519, + "step": 9570 + }, + { + "epoch": 0.7423766903018327, + "grad_norm": 1.411542918629515, + "learning_rate": 3.7120272783632986e-08, + "loss": 1.2913, + "step": 9580 + }, + { + "epoch": 0.7431516137781394, + "grad_norm": 1.5300228441335642, + "learning_rate": 3.7159020458772475e-08, + "loss": 1.2383, + "step": 9590 + }, + { + "epoch": 0.7439265372544461, + "grad_norm": 1.5267135692041522, + "learning_rate": 3.719776813391197e-08, + "loss": 1.2358, + "step": 9600 + }, + { + "epoch": 0.7447014607307528, + "grad_norm": 1.5435501980206334, + "learning_rate": 3.723651580905146e-08, + "loss": 1.2666, + "step": 9610 + }, + { + "epoch": 0.7454763842070595, + "grad_norm": 1.7489323672077113, + "learning_rate": 3.7275263484190955e-08, + "loss": 1.2109, + "step": 9620 + }, + { + "epoch": 0.7462513076833662, + "grad_norm": 2.4803074894993307, + "learning_rate": 3.7314011159330444e-08, + "loss": 1.2558, + "step": 9630 + }, + { + "epoch": 0.747026231159673, + "grad_norm": 1.520366501299349, + "learning_rate": 3.735275883446993e-08, + "loss": 1.2686, + "step": 9640 + }, + { + "epoch": 0.7478011546359797, + "grad_norm": 1.4744303304048294, + "learning_rate": 3.739150650960943e-08, + "loss": 1.2223, + "step": 9650 + }, + { + "epoch": 0.7485760781122864, + "grad_norm": 1.3971093814542468, + "learning_rate": 3.743025418474892e-08, + "loss": 1.2573, + "step": 9660 + }, + { + "epoch": 0.7493510015885931, + "grad_norm": 1.480248766485944, + "learning_rate": 3.746900185988841e-08, + "loss": 1.2717, + "step": 9670 + }, + { + "epoch": 0.7501259250648998, + "grad_norm": 1.515711821266614, + "learning_rate": 3.75077495350279e-08, + "loss": 1.2702, + "step": 9680 + }, + { + "epoch": 0.7509008485412065, + "grad_norm": 1.4740563106185167, + "learning_rate": 3.754649721016739e-08, + "loss": 1.2471, + "step": 9690 + }, + { + "epoch": 0.7516757720175132, + "grad_norm": 1.5958904262542362, + "learning_rate": 3.7585244885306886e-08, + "loss": 1.2463, + "step": 9700 + }, + { + "epoch": 0.7524506954938199, + "grad_norm": 1.508620314026431, + "learning_rate": 3.7623992560446375e-08, + "loss": 1.2185, + "step": 9710 + }, + { + "epoch": 0.7532256189701267, + "grad_norm": 1.5606915872931273, + "learning_rate": 3.766274023558587e-08, + "loss": 1.2525, + "step": 9720 + }, + { + "epoch": 0.7540005424464334, + "grad_norm": 1.5707832163356317, + "learning_rate": 3.770148791072536e-08, + "loss": 1.2438, + "step": 9730 + }, + { + "epoch": 0.7547754659227401, + "grad_norm": 1.5345021886008199, + "learning_rate": 3.7740235585864855e-08, + "loss": 1.2169, + "step": 9740 + }, + { + "epoch": 0.7555503893990468, + "grad_norm": 1.5217529117359705, + "learning_rate": 3.7778983261004343e-08, + "loss": 1.2104, + "step": 9750 + }, + { + "epoch": 0.7563253128753535, + "grad_norm": 1.5795163081015748, + "learning_rate": 3.781773093614383e-08, + "loss": 1.2538, + "step": 9760 + }, + { + "epoch": 0.7571002363516602, + "grad_norm": 1.5173770097980512, + "learning_rate": 3.785647861128333e-08, + "loss": 1.2726, + "step": 9770 + }, + { + "epoch": 0.7578751598279669, + "grad_norm": 1.4682612119789376, + "learning_rate": 3.789522628642282e-08, + "loss": 1.2418, + "step": 9780 + }, + { + "epoch": 0.7586500833042737, + "grad_norm": 1.4688631248715385, + "learning_rate": 3.793397396156231e-08, + "loss": 1.2511, + "step": 9790 + }, + { + "epoch": 0.7594250067805804, + "grad_norm": 1.3972409665995147, + "learning_rate": 3.79727216367018e-08, + "loss": 1.241, + "step": 9800 + }, + { + "epoch": 0.7601999302568871, + "grad_norm": 1.5073568330953224, + "learning_rate": 3.801146931184129e-08, + "loss": 1.2547, + "step": 9810 + }, + { + "epoch": 0.7609748537331938, + "grad_norm": 1.5073868735436253, + "learning_rate": 3.8050216986980786e-08, + "loss": 1.2533, + "step": 9820 + }, + { + "epoch": 0.7617497772095005, + "grad_norm": 1.5258443403655098, + "learning_rate": 3.8088964662120274e-08, + "loss": 1.2328, + "step": 9830 + }, + { + "epoch": 0.7625247006858072, + "grad_norm": 1.6010454381700212, + "learning_rate": 3.812771233725977e-08, + "loss": 1.2388, + "step": 9840 + }, + { + "epoch": 0.7632996241621139, + "grad_norm": 2.0589237619945893, + "learning_rate": 3.816646001239926e-08, + "loss": 1.2528, + "step": 9850 + }, + { + "epoch": 0.7640745476384208, + "grad_norm": 1.423519644474255, + "learning_rate": 3.820520768753875e-08, + "loss": 1.2167, + "step": 9860 + }, + { + "epoch": 0.7648494711147275, + "grad_norm": 1.5479626873577206, + "learning_rate": 3.824395536267824e-08, + "loss": 1.228, + "step": 9870 + }, + { + "epoch": 0.7656243945910342, + "grad_norm": 1.5336104719462236, + "learning_rate": 3.828270303781773e-08, + "loss": 1.2441, + "step": 9880 + }, + { + "epoch": 0.7663993180673409, + "grad_norm": 1.519229272398965, + "learning_rate": 3.832145071295723e-08, + "loss": 1.2227, + "step": 9890 + }, + { + "epoch": 0.7671742415436476, + "grad_norm": 1.5248398850065108, + "learning_rate": 3.8360198388096717e-08, + "loss": 1.2497, + "step": 9900 + }, + { + "epoch": 0.7679491650199542, + "grad_norm": 1.491902868377861, + "learning_rate": 3.839894606323621e-08, + "loss": 1.2331, + "step": 9910 + }, + { + "epoch": 0.768724088496261, + "grad_norm": 1.4784731004449323, + "learning_rate": 3.84376937383757e-08, + "loss": 1.2295, + "step": 9920 + }, + { + "epoch": 0.7694990119725678, + "grad_norm": 1.4785771257072957, + "learning_rate": 3.847644141351519e-08, + "loss": 1.2343, + "step": 9930 + }, + { + "epoch": 0.7702739354488745, + "grad_norm": 1.5104388495226568, + "learning_rate": 3.8515189088654685e-08, + "loss": 1.2504, + "step": 9940 + }, + { + "epoch": 0.7710488589251812, + "grad_norm": 1.6513293435743668, + "learning_rate": 3.8553936763794174e-08, + "loss": 1.2231, + "step": 9950 + }, + { + "epoch": 0.7718237824014879, + "grad_norm": 1.51222684972077, + "learning_rate": 3.859268443893367e-08, + "loss": 1.2433, + "step": 9960 + }, + { + "epoch": 0.7725987058777946, + "grad_norm": 1.7069851486317837, + "learning_rate": 3.863143211407316e-08, + "loss": 1.2503, + "step": 9970 + }, + { + "epoch": 0.7733736293541013, + "grad_norm": 1.4778829744408626, + "learning_rate": 3.867017978921265e-08, + "loss": 1.2482, + "step": 9980 + }, + { + "epoch": 0.774148552830408, + "grad_norm": 1.7856136793058652, + "learning_rate": 3.870892746435214e-08, + "loss": 1.2633, + "step": 9990 + }, + { + "epoch": 0.7749234763067148, + "grad_norm": 1.4353890873066026, + "learning_rate": 3.874767513949163e-08, + "loss": 1.2252, + "step": 10000 + }, + { + "epoch": 0.7749234763067148, + "eval_loss": 1.2375032901763916, + "eval_runtime": 319.7854, + "eval_samples_per_second": 35.871, + "eval_steps_per_second": 8.969, + "step": 10000 + }, + { + "epoch": 0.7756983997830215, + "grad_norm": 1.6643610938975022, + "learning_rate": 3.878642281463113e-08, + "loss": 1.2475, + "step": 10010 + }, + { + "epoch": 0.7764733232593282, + "grad_norm": 1.5346822233395128, + "learning_rate": 3.8825170489770616e-08, + "loss": 1.2356, + "step": 10020 + }, + { + "epoch": 0.7772482467356349, + "grad_norm": 1.4954800261032313, + "learning_rate": 3.886391816491011e-08, + "loss": 1.2261, + "step": 10030 + }, + { + "epoch": 0.7780231702119416, + "grad_norm": 1.4146377632647487, + "learning_rate": 3.89026658400496e-08, + "loss": 1.2388, + "step": 10040 + }, + { + "epoch": 0.7787980936882483, + "grad_norm": 1.61067606895235, + "learning_rate": 3.894141351518909e-08, + "loss": 1.2433, + "step": 10050 + }, + { + "epoch": 0.779573017164555, + "grad_norm": 1.4413661579688894, + "learning_rate": 3.8980161190328585e-08, + "loss": 1.2479, + "step": 10060 + }, + { + "epoch": 0.7803479406408618, + "grad_norm": 1.4629641110287206, + "learning_rate": 3.9018908865468074e-08, + "loss": 1.2619, + "step": 10070 + }, + { + "epoch": 0.7811228641171685, + "grad_norm": 1.5181758503924605, + "learning_rate": 3.905765654060757e-08, + "loss": 1.2417, + "step": 10080 + }, + { + "epoch": 0.7818977875934752, + "grad_norm": 1.5726119109627836, + "learning_rate": 3.909640421574706e-08, + "loss": 1.2455, + "step": 10090 + }, + { + "epoch": 0.7826727110697819, + "grad_norm": 1.5621348456692952, + "learning_rate": 3.913515189088655e-08, + "loss": 1.2423, + "step": 10100 + }, + { + "epoch": 0.7834476345460886, + "grad_norm": 1.5269429785563784, + "learning_rate": 3.917389956602604e-08, + "loss": 1.239, + "step": 10110 + }, + { + "epoch": 0.7842225580223953, + "grad_norm": 1.467922994015552, + "learning_rate": 3.921264724116553e-08, + "loss": 1.2323, + "step": 10120 + }, + { + "epoch": 0.784997481498702, + "grad_norm": 1.6187861083777157, + "learning_rate": 3.925139491630503e-08, + "loss": 1.2239, + "step": 10130 + }, + { + "epoch": 0.7857724049750087, + "grad_norm": 1.5289223079230154, + "learning_rate": 3.9290142591444516e-08, + "loss": 1.2683, + "step": 10140 + }, + { + "epoch": 0.7865473284513155, + "grad_norm": 2.5731526784735754, + "learning_rate": 3.9328890266584005e-08, + "loss": 1.27, + "step": 10150 + }, + { + "epoch": 0.7873222519276222, + "grad_norm": 1.4679069792190311, + "learning_rate": 3.93676379417235e-08, + "loss": 1.2294, + "step": 10160 + }, + { + "epoch": 0.7880971754039289, + "grad_norm": 1.5223222834989774, + "learning_rate": 3.940638561686299e-08, + "loss": 1.2473, + "step": 10170 + }, + { + "epoch": 0.7888720988802356, + "grad_norm": 1.5714000416549392, + "learning_rate": 3.9445133292002485e-08, + "loss": 1.2519, + "step": 10180 + }, + { + "epoch": 0.7896470223565423, + "grad_norm": 1.5243198762636858, + "learning_rate": 3.9483880967141974e-08, + "loss": 1.2369, + "step": 10190 + }, + { + "epoch": 0.790421945832849, + "grad_norm": 1.543152445975422, + "learning_rate": 3.952262864228147e-08, + "loss": 1.2025, + "step": 10200 + }, + { + "epoch": 0.7911968693091557, + "grad_norm": 1.4330105544333849, + "learning_rate": 3.956137631742096e-08, + "loss": 1.2149, + "step": 10210 + }, + { + "epoch": 0.7919717927854625, + "grad_norm": 1.5283975504481615, + "learning_rate": 3.960012399256045e-08, + "loss": 1.234, + "step": 10220 + }, + { + "epoch": 0.7927467162617692, + "grad_norm": 1.5114056336180282, + "learning_rate": 3.963887166769994e-08, + "loss": 1.2238, + "step": 10230 + }, + { + "epoch": 0.7935216397380759, + "grad_norm": 1.5146340624869545, + "learning_rate": 3.967761934283943e-08, + "loss": 1.2116, + "step": 10240 + }, + { + "epoch": 0.7942965632143826, + "grad_norm": 1.660997178383639, + "learning_rate": 3.971636701797893e-08, + "loss": 1.2487, + "step": 10250 + }, + { + "epoch": 0.7950714866906893, + "grad_norm": 1.5510277368951122, + "learning_rate": 3.9755114693118416e-08, + "loss": 1.2322, + "step": 10260 + }, + { + "epoch": 0.795846410166996, + "grad_norm": 1.4217165449195965, + "learning_rate": 3.9793862368257905e-08, + "loss": 1.2553, + "step": 10270 + }, + { + "epoch": 0.7966213336433027, + "grad_norm": 1.4105078480137248, + "learning_rate": 3.98326100433974e-08, + "loss": 1.2458, + "step": 10280 + }, + { + "epoch": 0.7973962571196095, + "grad_norm": 1.5009619972830783, + "learning_rate": 3.987135771853689e-08, + "loss": 1.2321, + "step": 10290 + }, + { + "epoch": 0.7981711805959162, + "grad_norm": 1.5089072447825471, + "learning_rate": 3.9910105393676385e-08, + "loss": 1.2262, + "step": 10300 + }, + { + "epoch": 0.7989461040722229, + "grad_norm": 1.3943039154353778, + "learning_rate": 3.9948853068815874e-08, + "loss": 1.2585, + "step": 10310 + }, + { + "epoch": 0.7997210275485296, + "grad_norm": 1.6225277920244232, + "learning_rate": 3.998760074395536e-08, + "loss": 1.2391, + "step": 10320 + }, + { + "epoch": 0.8004959510248363, + "grad_norm": 1.4375951508341591, + "learning_rate": 4.002634841909486e-08, + "loss": 1.2263, + "step": 10330 + }, + { + "epoch": 0.801270874501143, + "grad_norm": 1.4910939633350877, + "learning_rate": 4.006509609423435e-08, + "loss": 1.2235, + "step": 10340 + }, + { + "epoch": 0.8020457979774497, + "grad_norm": 1.3850579329842383, + "learning_rate": 4.010384376937384e-08, + "loss": 1.2204, + "step": 10350 + }, + { + "epoch": 0.8028207214537565, + "grad_norm": 1.5451172102455353, + "learning_rate": 4.014259144451333e-08, + "loss": 1.2755, + "step": 10360 + }, + { + "epoch": 0.8035956449300632, + "grad_norm": 1.614992860834169, + "learning_rate": 4.018133911965283e-08, + "loss": 1.2195, + "step": 10370 + }, + { + "epoch": 0.8043705684063699, + "grad_norm": 4.183937458035448, + "learning_rate": 4.0220086794792316e-08, + "loss": 1.2201, + "step": 10380 + }, + { + "epoch": 0.8051454918826766, + "grad_norm": 1.5964219158688508, + "learning_rate": 4.0258834469931805e-08, + "loss": 1.2181, + "step": 10390 + }, + { + "epoch": 0.8059204153589833, + "grad_norm": 1.488943571933079, + "learning_rate": 4.02975821450713e-08, + "loss": 1.2093, + "step": 10400 + }, + { + "epoch": 0.80669533883529, + "grad_norm": 1.4951834222230016, + "learning_rate": 4.033632982021079e-08, + "loss": 1.2247, + "step": 10410 + }, + { + "epoch": 0.8074702623115967, + "grad_norm": 1.57338631176016, + "learning_rate": 4.0375077495350285e-08, + "loss": 1.2109, + "step": 10420 + }, + { + "epoch": 0.8082451857879035, + "grad_norm": 1.6031978996776564, + "learning_rate": 4.0413825170489773e-08, + "loss": 1.2454, + "step": 10430 + }, + { + "epoch": 0.8090201092642102, + "grad_norm": 1.7060795450442938, + "learning_rate": 4.045257284562926e-08, + "loss": 1.2435, + "step": 10440 + }, + { + "epoch": 0.8097950327405169, + "grad_norm": 1.33747359104178, + "learning_rate": 4.049132052076876e-08, + "loss": 1.2194, + "step": 10450 + }, + { + "epoch": 0.8105699562168236, + "grad_norm": 1.5520370092772953, + "learning_rate": 4.053006819590825e-08, + "loss": 1.2252, + "step": 10460 + }, + { + "epoch": 0.8113448796931303, + "grad_norm": 4.769583265897173, + "learning_rate": 4.056881587104774e-08, + "loss": 1.2383, + "step": 10470 + }, + { + "epoch": 0.812119803169437, + "grad_norm": 1.4399418133064166, + "learning_rate": 4.060756354618723e-08, + "loss": 1.2267, + "step": 10480 + }, + { + "epoch": 0.8128947266457437, + "grad_norm": 1.5502619734794034, + "learning_rate": 4.0646311221326727e-08, + "loss": 1.2343, + "step": 10490 + }, + { + "epoch": 0.8136696501220505, + "grad_norm": 1.581043631920284, + "learning_rate": 4.0685058896466216e-08, + "loss": 1.2325, + "step": 10500 + }, + { + "epoch": 0.8136696501220505, + "eval_loss": 1.226379156112671, + "eval_runtime": 319.6237, + "eval_samples_per_second": 35.889, + "eval_steps_per_second": 8.973, + "step": 10500 + }, + { + "epoch": 0.8144445735983572, + "grad_norm": 1.6058247116261752, + "learning_rate": 4.0723806571605704e-08, + "loss": 1.2107, + "step": 10510 + }, + { + "epoch": 0.8152194970746639, + "grad_norm": 1.525508189453348, + "learning_rate": 4.07625542467452e-08, + "loss": 1.237, + "step": 10520 + }, + { + "epoch": 0.8159944205509706, + "grad_norm": 1.5299173132112818, + "learning_rate": 4.080130192188469e-08, + "loss": 1.2291, + "step": 10530 + }, + { + "epoch": 0.8167693440272773, + "grad_norm": 1.49701932147412, + "learning_rate": 4.0840049597024184e-08, + "loss": 1.2521, + "step": 10540 + }, + { + "epoch": 0.817544267503584, + "grad_norm": 1.582994728365688, + "learning_rate": 4.087879727216367e-08, + "loss": 1.2253, + "step": 10550 + }, + { + "epoch": 0.8183191909798907, + "grad_norm": 1.5471113937576282, + "learning_rate": 4.091754494730316e-08, + "loss": 1.1951, + "step": 10560 + }, + { + "epoch": 0.8190941144561974, + "grad_norm": 1.441389378551863, + "learning_rate": 4.095629262244266e-08, + "loss": 1.1918, + "step": 10570 + }, + { + "epoch": 0.8198690379325042, + "grad_norm": 1.6915420645627748, + "learning_rate": 4.0995040297582147e-08, + "loss": 1.1918, + "step": 10580 + }, + { + "epoch": 0.8206439614088109, + "grad_norm": 1.4809400924389855, + "learning_rate": 4.103378797272164e-08, + "loss": 1.2377, + "step": 10590 + }, + { + "epoch": 0.8214188848851176, + "grad_norm": 1.5147912754208035, + "learning_rate": 4.107253564786113e-08, + "loss": 1.2019, + "step": 10600 + }, + { + "epoch": 0.8221938083614243, + "grad_norm": 1.5130831951564552, + "learning_rate": 4.111128332300062e-08, + "loss": 1.244, + "step": 10610 + }, + { + "epoch": 0.822968731837731, + "grad_norm": 1.7185779349287142, + "learning_rate": 4.1150030998140115e-08, + "loss": 1.2079, + "step": 10620 + }, + { + "epoch": 0.8237436553140377, + "grad_norm": 2.620089076941645, + "learning_rate": 4.1188778673279604e-08, + "loss": 1.1965, + "step": 10630 + }, + { + "epoch": 0.8245185787903444, + "grad_norm": 1.637316857800361, + "learning_rate": 4.12275263484191e-08, + "loss": 1.2006, + "step": 10640 + }, + { + "epoch": 0.8252935022666512, + "grad_norm": 1.4495445236759186, + "learning_rate": 4.126627402355859e-08, + "loss": 1.1887, + "step": 10650 + }, + { + "epoch": 0.8260684257429579, + "grad_norm": 1.5623277779935585, + "learning_rate": 4.1305021698698084e-08, + "loss": 1.226, + "step": 10660 + }, + { + "epoch": 0.8268433492192646, + "grad_norm": 1.492459245959197, + "learning_rate": 4.134376937383757e-08, + "loss": 1.2477, + "step": 10670 + }, + { + "epoch": 0.8276182726955713, + "grad_norm": 1.555876111658894, + "learning_rate": 4.138251704897706e-08, + "loss": 1.2153, + "step": 10680 + }, + { + "epoch": 0.828393196171878, + "grad_norm": 1.4744742083289633, + "learning_rate": 4.142126472411656e-08, + "loss": 1.19, + "step": 10690 + }, + { + "epoch": 0.8291681196481847, + "grad_norm": 1.535957393292872, + "learning_rate": 4.1460012399256046e-08, + "loss": 1.2413, + "step": 10700 + }, + { + "epoch": 0.8299430431244914, + "grad_norm": 1.5692241413177714, + "learning_rate": 4.149876007439554e-08, + "loss": 1.2253, + "step": 10710 + }, + { + "epoch": 0.8307179666007982, + "grad_norm": 1.4940737631988905, + "learning_rate": 4.153750774953503e-08, + "loss": 1.217, + "step": 10720 + }, + { + "epoch": 0.8314928900771049, + "grad_norm": 1.5873355212865687, + "learning_rate": 4.157625542467452e-08, + "loss": 1.2081, + "step": 10730 + }, + { + "epoch": 0.8322678135534116, + "grad_norm": 1.4274344279057314, + "learning_rate": 4.1615003099814015e-08, + "loss": 1.2241, + "step": 10740 + }, + { + "epoch": 0.8330427370297183, + "grad_norm": 1.4366588508801872, + "learning_rate": 4.1653750774953504e-08, + "loss": 1.2005, + "step": 10750 + }, + { + "epoch": 0.833817660506025, + "grad_norm": 1.4728171007484332, + "learning_rate": 4.1692498450093e-08, + "loss": 1.2029, + "step": 10760 + }, + { + "epoch": 0.8345925839823317, + "grad_norm": 1.6676086125051512, + "learning_rate": 4.173124612523249e-08, + "loss": 1.217, + "step": 10770 + }, + { + "epoch": 0.8353675074586384, + "grad_norm": 1.5226607932412977, + "learning_rate": 4.176999380037198e-08, + "loss": 1.2364, + "step": 10780 + }, + { + "epoch": 0.8361424309349452, + "grad_norm": 1.4049874337009838, + "learning_rate": 4.180874147551147e-08, + "loss": 1.2115, + "step": 10790 + }, + { + "epoch": 0.8369173544112519, + "grad_norm": 1.5949769373292886, + "learning_rate": 4.184748915065096e-08, + "loss": 1.2253, + "step": 10800 + }, + { + "epoch": 0.8376922778875586, + "grad_norm": 1.4087269257232091, + "learning_rate": 4.188623682579046e-08, + "loss": 1.1998, + "step": 10810 + }, + { + "epoch": 0.8384672013638653, + "grad_norm": 1.7793191609905288, + "learning_rate": 4.1924984500929946e-08, + "loss": 1.2528, + "step": 10820 + }, + { + "epoch": 0.839242124840172, + "grad_norm": 1.5854398866309718, + "learning_rate": 4.196373217606944e-08, + "loss": 1.2518, + "step": 10830 + }, + { + "epoch": 0.8400170483164787, + "grad_norm": 1.460590388280138, + "learning_rate": 4.200247985120893e-08, + "loss": 1.2409, + "step": 10840 + }, + { + "epoch": 0.8407919717927854, + "grad_norm": 1.4800727866125751, + "learning_rate": 4.204122752634842e-08, + "loss": 1.2086, + "step": 10850 + }, + { + "epoch": 0.8415668952690922, + "grad_norm": 1.5766042013935082, + "learning_rate": 4.2079975201487915e-08, + "loss": 1.2717, + "step": 10860 + }, + { + "epoch": 0.8423418187453989, + "grad_norm": 1.4996595654594305, + "learning_rate": 4.2118722876627404e-08, + "loss": 1.2291, + "step": 10870 + }, + { + "epoch": 0.8431167422217056, + "grad_norm": 1.4874594557512937, + "learning_rate": 4.21574705517669e-08, + "loss": 1.2412, + "step": 10880 + }, + { + "epoch": 0.8438916656980123, + "grad_norm": 1.6678035255325314, + "learning_rate": 4.219621822690639e-08, + "loss": 1.2249, + "step": 10890 + }, + { + "epoch": 0.844666589174319, + "grad_norm": 1.426856300914683, + "learning_rate": 4.223496590204588e-08, + "loss": 1.2162, + "step": 10900 + }, + { + "epoch": 0.8454415126506257, + "grad_norm": 1.5732146275983077, + "learning_rate": 4.227371357718537e-08, + "loss": 1.2579, + "step": 10910 + }, + { + "epoch": 0.8462164361269324, + "grad_norm": 1.6848529712485945, + "learning_rate": 4.231246125232486e-08, + "loss": 1.2481, + "step": 10920 + }, + { + "epoch": 0.8469913596032392, + "grad_norm": 1.428602185484097, + "learning_rate": 4.235120892746436e-08, + "loss": 1.2135, + "step": 10930 + }, + { + "epoch": 0.8477662830795459, + "grad_norm": 1.5587319750380266, + "learning_rate": 4.2389956602603846e-08, + "loss": 1.2168, + "step": 10940 + }, + { + "epoch": 0.8485412065558526, + "grad_norm": 1.6164780675612622, + "learning_rate": 4.242870427774334e-08, + "loss": 1.2012, + "step": 10950 + }, + { + "epoch": 0.8493161300321593, + "grad_norm": 1.5317215175961618, + "learning_rate": 4.246745195288283e-08, + "loss": 1.2336, + "step": 10960 + }, + { + "epoch": 0.850091053508466, + "grad_norm": 1.524311279027627, + "learning_rate": 4.250619962802232e-08, + "loss": 1.2216, + "step": 10970 + }, + { + "epoch": 0.8508659769847727, + "grad_norm": 1.4563822333119456, + "learning_rate": 4.2544947303161815e-08, + "loss": 1.2245, + "step": 10980 + }, + { + "epoch": 0.8516409004610794, + "grad_norm": 1.4633313167520805, + "learning_rate": 4.2583694978301304e-08, + "loss": 1.205, + "step": 10990 + }, + { + "epoch": 0.8524158239373861, + "grad_norm": 1.7031284863096294, + "learning_rate": 4.26224426534408e-08, + "loss": 1.2156, + "step": 11000 + }, + { + "epoch": 0.8524158239373861, + "eval_loss": 1.2160017490386963, + "eval_runtime": 320.2693, + "eval_samples_per_second": 35.817, + "eval_steps_per_second": 8.955, + "step": 11000 + }, + { + "epoch": 0.8531907474136929, + "grad_norm": 1.6537962509768966, + "learning_rate": 4.266119032858029e-08, + "loss": 1.2063, + "step": 11010 + }, + { + "epoch": 0.8539656708899996, + "grad_norm": 1.4443632510619808, + "learning_rate": 4.269993800371978e-08, + "loss": 1.2126, + "step": 11020 + }, + { + "epoch": 0.8547405943663063, + "grad_norm": 1.5152117171767754, + "learning_rate": 4.273868567885927e-08, + "loss": 1.2227, + "step": 11030 + }, + { + "epoch": 0.855515517842613, + "grad_norm": 1.4117161189395613, + "learning_rate": 4.277743335399876e-08, + "loss": 1.2302, + "step": 11040 + }, + { + "epoch": 0.8562904413189197, + "grad_norm": 1.5313870331289186, + "learning_rate": 4.281618102913826e-08, + "loss": 1.2235, + "step": 11050 + }, + { + "epoch": 0.8570653647952264, + "grad_norm": 1.5399026160582874, + "learning_rate": 4.2854928704277746e-08, + "loss": 1.2321, + "step": 11060 + }, + { + "epoch": 0.8578402882715331, + "grad_norm": 1.4343391599725415, + "learning_rate": 4.2893676379417235e-08, + "loss": 1.1854, + "step": 11070 + }, + { + "epoch": 0.85861521174784, + "grad_norm": 1.5701698954260606, + "learning_rate": 4.293242405455673e-08, + "loss": 1.2337, + "step": 11080 + }, + { + "epoch": 0.8593901352241466, + "grad_norm": 1.4205935397821614, + "learning_rate": 4.297117172969622e-08, + "loss": 1.2027, + "step": 11090 + }, + { + "epoch": 0.8601650587004533, + "grad_norm": 1.5267479703862639, + "learning_rate": 4.3009919404835715e-08, + "loss": 1.2332, + "step": 11100 + }, + { + "epoch": 0.86093998217676, + "grad_norm": 1.4926462893966703, + "learning_rate": 4.3048667079975203e-08, + "loss": 1.2035, + "step": 11110 + }, + { + "epoch": 0.8617149056530667, + "grad_norm": 1.4905015474386865, + "learning_rate": 4.30874147551147e-08, + "loss": 1.2274, + "step": 11120 + }, + { + "epoch": 0.8624898291293734, + "grad_norm": 1.5130787393108134, + "learning_rate": 4.312616243025419e-08, + "loss": 1.215, + "step": 11130 + }, + { + "epoch": 0.8632647526056801, + "grad_norm": 1.4932328479759003, + "learning_rate": 4.316491010539368e-08, + "loss": 1.2206, + "step": 11140 + }, + { + "epoch": 0.864039676081987, + "grad_norm": 1.4573883743914784, + "learning_rate": 4.320365778053317e-08, + "loss": 1.1881, + "step": 11150 + }, + { + "epoch": 0.8648145995582937, + "grad_norm": 1.4415813476022117, + "learning_rate": 4.324240545567266e-08, + "loss": 1.2074, + "step": 11160 + }, + { + "epoch": 0.8655895230346003, + "grad_norm": 1.456855882832978, + "learning_rate": 4.3281153130812157e-08, + "loss": 1.1986, + "step": 11170 + }, + { + "epoch": 0.866364446510907, + "grad_norm": 1.3907146633573462, + "learning_rate": 4.3319900805951646e-08, + "loss": 1.2067, + "step": 11180 + }, + { + "epoch": 0.8671393699872137, + "grad_norm": 1.5258340220488096, + "learning_rate": 4.3358648481091134e-08, + "loss": 1.2019, + "step": 11190 + }, + { + "epoch": 0.8679142934635204, + "grad_norm": 1.3640231167652843, + "learning_rate": 4.339739615623063e-08, + "loss": 1.1768, + "step": 11200 + }, + { + "epoch": 0.8686892169398271, + "grad_norm": 1.674835776814399, + "learning_rate": 4.343614383137012e-08, + "loss": 1.1999, + "step": 11210 + }, + { + "epoch": 0.869464140416134, + "grad_norm": 1.572183314419079, + "learning_rate": 4.3474891506509614e-08, + "loss": 1.218, + "step": 11220 + }, + { + "epoch": 0.8702390638924407, + "grad_norm": 1.4191677916839163, + "learning_rate": 4.35136391816491e-08, + "loss": 1.2334, + "step": 11230 + }, + { + "epoch": 0.8710139873687474, + "grad_norm": 1.4040938766590845, + "learning_rate": 4.355238685678859e-08, + "loss": 1.2231, + "step": 11240 + }, + { + "epoch": 0.871788910845054, + "grad_norm": 1.513308681004946, + "learning_rate": 4.359113453192809e-08, + "loss": 1.2053, + "step": 11250 + }, + { + "epoch": 0.8725638343213608, + "grad_norm": 1.4661752583965983, + "learning_rate": 4.3629882207067576e-08, + "loss": 1.2027, + "step": 11260 + }, + { + "epoch": 0.8733387577976675, + "grad_norm": 1.3592276953902167, + "learning_rate": 4.366862988220707e-08, + "loss": 1.209, + "step": 11270 + }, + { + "epoch": 0.8741136812739742, + "grad_norm": 1.543178504986452, + "learning_rate": 4.370737755734656e-08, + "loss": 1.2455, + "step": 11280 + }, + { + "epoch": 0.874888604750281, + "grad_norm": 1.3709174733754315, + "learning_rate": 4.3746125232486056e-08, + "loss": 1.1881, + "step": 11290 + }, + { + "epoch": 0.8756635282265877, + "grad_norm": 1.5056731595492605, + "learning_rate": 4.3784872907625545e-08, + "loss": 1.2108, + "step": 11300 + }, + { + "epoch": 0.8764384517028944, + "grad_norm": 1.5697690121739305, + "learning_rate": 4.3823620582765034e-08, + "loss": 1.2213, + "step": 11310 + }, + { + "epoch": 0.8772133751792011, + "grad_norm": 1.6020023400843917, + "learning_rate": 4.386236825790453e-08, + "loss": 1.2214, + "step": 11320 + }, + { + "epoch": 0.8779882986555078, + "grad_norm": 1.573036949871703, + "learning_rate": 4.390111593304402e-08, + "loss": 1.2058, + "step": 11330 + }, + { + "epoch": 0.8787632221318145, + "grad_norm": 1.4953981399817307, + "learning_rate": 4.3939863608183514e-08, + "loss": 1.2133, + "step": 11340 + }, + { + "epoch": 0.8795381456081212, + "grad_norm": 1.3988353892393417, + "learning_rate": 4.3978611283323e-08, + "loss": 1.1933, + "step": 11350 + }, + { + "epoch": 0.880313069084428, + "grad_norm": 1.4846633678109136, + "learning_rate": 4.401735895846249e-08, + "loss": 1.2226, + "step": 11360 + }, + { + "epoch": 0.8810879925607347, + "grad_norm": 1.5224542471751241, + "learning_rate": 4.405610663360199e-08, + "loss": 1.2385, + "step": 11370 + }, + { + "epoch": 0.8818629160370414, + "grad_norm": 1.509660860823185, + "learning_rate": 4.4094854308741476e-08, + "loss": 1.2134, + "step": 11380 + }, + { + "epoch": 0.8826378395133481, + "grad_norm": 1.3951970313374897, + "learning_rate": 4.413360198388097e-08, + "loss": 1.1921, + "step": 11390 + }, + { + "epoch": 0.8834127629896548, + "grad_norm": 1.5683789554219763, + "learning_rate": 4.417234965902046e-08, + "loss": 1.2107, + "step": 11400 + }, + { + "epoch": 0.8841876864659615, + "grad_norm": 1.4611416590832935, + "learning_rate": 4.421109733415995e-08, + "loss": 1.2133, + "step": 11410 + }, + { + "epoch": 0.8849626099422682, + "grad_norm": 1.4598540068707353, + "learning_rate": 4.4249845009299445e-08, + "loss": 1.2197, + "step": 11420 + }, + { + "epoch": 0.8857375334185749, + "grad_norm": 1.440151551869026, + "learning_rate": 4.4288592684438934e-08, + "loss": 1.2445, + "step": 11430 + }, + { + "epoch": 0.8865124568948817, + "grad_norm": 1.5596230001622502, + "learning_rate": 4.432734035957843e-08, + "loss": 1.2096, + "step": 11440 + }, + { + "epoch": 0.8872873803711884, + "grad_norm": 1.5402714787648588, + "learning_rate": 4.436608803471792e-08, + "loss": 1.2153, + "step": 11450 + }, + { + "epoch": 0.8880623038474951, + "grad_norm": 1.4025204416106203, + "learning_rate": 4.4404835709857414e-08, + "loss": 1.195, + "step": 11460 + }, + { + "epoch": 0.8888372273238018, + "grad_norm": 1.4051328990796128, + "learning_rate": 4.44435833849969e-08, + "loss": 1.2009, + "step": 11470 + }, + { + "epoch": 0.8896121508001085, + "grad_norm": 1.4853999928464294, + "learning_rate": 4.448233106013639e-08, + "loss": 1.2014, + "step": 11480 + }, + { + "epoch": 0.8903870742764152, + "grad_norm": 1.4620036029289356, + "learning_rate": 4.452107873527589e-08, + "loss": 1.2332, + "step": 11490 + }, + { + "epoch": 0.8911619977527219, + "grad_norm": 1.436438424019568, + "learning_rate": 4.4559826410415376e-08, + "loss": 1.2129, + "step": 11500 + }, + { + "epoch": 0.8911619977527219, + "eval_loss": 1.206390380859375, + "eval_runtime": 321.0049, + "eval_samples_per_second": 35.735, + "eval_steps_per_second": 8.934, + "step": 11500 + }, + { + "epoch": 0.8919369212290287, + "grad_norm": 1.5500368189385076, + "learning_rate": 4.459857408555487e-08, + "loss": 1.1903, + "step": 11510 + }, + { + "epoch": 0.8927118447053354, + "grad_norm": 1.4191477512266482, + "learning_rate": 4.463732176069436e-08, + "loss": 1.1976, + "step": 11520 + }, + { + "epoch": 0.8934867681816421, + "grad_norm": 1.6079768447677902, + "learning_rate": 4.467606943583385e-08, + "loss": 1.203, + "step": 11530 + }, + { + "epoch": 0.8942616916579488, + "grad_norm": 1.3954118002215992, + "learning_rate": 4.4714817110973345e-08, + "loss": 1.1683, + "step": 11540 + }, + { + "epoch": 0.8950366151342555, + "grad_norm": 1.4161378208283106, + "learning_rate": 4.4753564786112834e-08, + "loss": 1.1973, + "step": 11550 + }, + { + "epoch": 0.8958115386105622, + "grad_norm": 1.594332498775422, + "learning_rate": 4.479231246125233e-08, + "loss": 1.2314, + "step": 11560 + }, + { + "epoch": 0.8965864620868689, + "grad_norm": 1.4364082964698321, + "learning_rate": 4.483106013639182e-08, + "loss": 1.187, + "step": 11570 + }, + { + "epoch": 0.8973613855631757, + "grad_norm": 1.4872807778138963, + "learning_rate": 4.4869807811531314e-08, + "loss": 1.2082, + "step": 11580 + }, + { + "epoch": 0.8981363090394824, + "grad_norm": 1.510927784733966, + "learning_rate": 4.49085554866708e-08, + "loss": 1.1899, + "step": 11590 + }, + { + "epoch": 0.8989112325157891, + "grad_norm": 1.4461195196142478, + "learning_rate": 4.494730316181029e-08, + "loss": 1.24, + "step": 11600 + }, + { + "epoch": 0.8996861559920958, + "grad_norm": 1.3505263104307315, + "learning_rate": 4.498605083694979e-08, + "loss": 1.2192, + "step": 11610 + }, + { + "epoch": 0.9004610794684025, + "grad_norm": 1.4675401113530673, + "learning_rate": 4.5024798512089276e-08, + "loss": 1.2163, + "step": 11620 + }, + { + "epoch": 0.9012360029447092, + "grad_norm": 1.5551778235641869, + "learning_rate": 4.506354618722877e-08, + "loss": 1.238, + "step": 11630 + }, + { + "epoch": 0.9020109264210159, + "grad_norm": 1.3892655686312583, + "learning_rate": 4.510229386236826e-08, + "loss": 1.1838, + "step": 11640 + }, + { + "epoch": 0.9027858498973227, + "grad_norm": 1.4524124857009981, + "learning_rate": 4.514104153750775e-08, + "loss": 1.2059, + "step": 11650 + }, + { + "epoch": 0.9035607733736294, + "grad_norm": 1.3850279094693374, + "learning_rate": 4.5179789212647245e-08, + "loss": 1.2102, + "step": 11660 + }, + { + "epoch": 0.9043356968499361, + "grad_norm": 1.5505095998215583, + "learning_rate": 4.5218536887786734e-08, + "loss": 1.2206, + "step": 11670 + }, + { + "epoch": 0.9051106203262428, + "grad_norm": 1.4360155006719406, + "learning_rate": 4.525728456292623e-08, + "loss": 1.2143, + "step": 11680 + }, + { + "epoch": 0.9058855438025495, + "grad_norm": 1.5743422640545075, + "learning_rate": 4.529603223806572e-08, + "loss": 1.2176, + "step": 11690 + }, + { + "epoch": 0.9066604672788562, + "grad_norm": 1.5677474662384667, + "learning_rate": 4.533477991320521e-08, + "loss": 1.2149, + "step": 11700 + }, + { + "epoch": 0.9074353907551629, + "grad_norm": 1.535846406188768, + "learning_rate": 4.53735275883447e-08, + "loss": 1.235, + "step": 11710 + }, + { + "epoch": 0.9082103142314697, + "grad_norm": 1.5093638059134276, + "learning_rate": 4.541227526348419e-08, + "loss": 1.1947, + "step": 11720 + }, + { + "epoch": 0.9089852377077764, + "grad_norm": 1.4555442484756282, + "learning_rate": 4.545102293862369e-08, + "loss": 1.1955, + "step": 11730 + }, + { + "epoch": 0.9097601611840831, + "grad_norm": 1.4735896717079793, + "learning_rate": 4.5489770613763176e-08, + "loss": 1.2126, + "step": 11740 + }, + { + "epoch": 0.9105350846603898, + "grad_norm": 1.353391146082438, + "learning_rate": 4.552851828890267e-08, + "loss": 1.1734, + "step": 11750 + }, + { + "epoch": 0.9113100081366965, + "grad_norm": 1.4842425671589294, + "learning_rate": 4.556726596404216e-08, + "loss": 1.2028, + "step": 11760 + }, + { + "epoch": 0.9120849316130032, + "grad_norm": 1.4521921118164693, + "learning_rate": 4.560601363918165e-08, + "loss": 1.1809, + "step": 11770 + }, + { + "epoch": 0.9128598550893099, + "grad_norm": 1.4853913403778911, + "learning_rate": 4.5644761314321144e-08, + "loss": 1.208, + "step": 11780 + }, + { + "epoch": 0.9136347785656166, + "grad_norm": 1.4808982046440213, + "learning_rate": 4.5683508989460633e-08, + "loss": 1.2111, + "step": 11790 + }, + { + "epoch": 0.9144097020419234, + "grad_norm": 1.400397614300953, + "learning_rate": 4.572225666460013e-08, + "loss": 1.1961, + "step": 11800 + }, + { + "epoch": 0.9151846255182301, + "grad_norm": 1.3980246356351806, + "learning_rate": 4.576100433973962e-08, + "loss": 1.1685, + "step": 11810 + }, + { + "epoch": 0.9159595489945368, + "grad_norm": 1.4813108847168244, + "learning_rate": 4.5799752014879107e-08, + "loss": 1.2025, + "step": 11820 + }, + { + "epoch": 0.9167344724708435, + "grad_norm": 1.4950319968630477, + "learning_rate": 4.58384996900186e-08, + "loss": 1.1794, + "step": 11830 + }, + { + "epoch": 0.9175093959471502, + "grad_norm": 1.517139108775405, + "learning_rate": 4.587724736515809e-08, + "loss": 1.2166, + "step": 11840 + }, + { + "epoch": 0.9182843194234569, + "grad_norm": 1.649882644392438, + "learning_rate": 4.5915995040297587e-08, + "loss": 1.2069, + "step": 11850 + }, + { + "epoch": 0.9190592428997636, + "grad_norm": 1.3598652617601823, + "learning_rate": 4.5954742715437075e-08, + "loss": 1.181, + "step": 11860 + }, + { + "epoch": 0.9198341663760704, + "grad_norm": 1.467446116092554, + "learning_rate": 4.5993490390576564e-08, + "loss": 1.1784, + "step": 11870 + }, + { + "epoch": 0.9206090898523771, + "grad_norm": 1.5140236427404523, + "learning_rate": 4.603223806571606e-08, + "loss": 1.2269, + "step": 11880 + }, + { + "epoch": 0.9213840133286838, + "grad_norm": 1.3432671318527363, + "learning_rate": 4.607098574085555e-08, + "loss": 1.2058, + "step": 11890 + }, + { + "epoch": 0.9221589368049905, + "grad_norm": 1.4787093332006698, + "learning_rate": 4.6109733415995044e-08, + "loss": 1.2071, + "step": 11900 + }, + { + "epoch": 0.9229338602812972, + "grad_norm": 1.470616437981984, + "learning_rate": 4.614848109113453e-08, + "loss": 1.2112, + "step": 11910 + }, + { + "epoch": 0.9237087837576039, + "grad_norm": 1.4824484572571621, + "learning_rate": 4.618722876627403e-08, + "loss": 1.1911, + "step": 11920 + }, + { + "epoch": 0.9244837072339106, + "grad_norm": 1.4656114500498827, + "learning_rate": 4.622597644141352e-08, + "loss": 1.2156, + "step": 11930 + }, + { + "epoch": 0.9252586307102174, + "grad_norm": 1.4445313682595091, + "learning_rate": 4.6264724116553006e-08, + "loss": 1.2031, + "step": 11940 + }, + { + "epoch": 0.9260335541865241, + "grad_norm": 1.492625306692464, + "learning_rate": 4.63034717916925e-08, + "loss": 1.2096, + "step": 11950 + }, + { + "epoch": 0.9268084776628308, + "grad_norm": 1.6455426732711738, + "learning_rate": 4.634221946683199e-08, + "loss": 1.2193, + "step": 11960 + }, + { + "epoch": 0.9275834011391375, + "grad_norm": 1.4898498032290879, + "learning_rate": 4.6380967141971486e-08, + "loss": 1.1853, + "step": 11970 + }, + { + "epoch": 0.9283583246154442, + "grad_norm": 1.397777273957871, + "learning_rate": 4.6419714817110975e-08, + "loss": 1.1993, + "step": 11980 + }, + { + "epoch": 0.9291332480917509, + "grad_norm": 1.3402444008749075, + "learning_rate": 4.6458462492250464e-08, + "loss": 1.1835, + "step": 11990 + }, + { + "epoch": 0.9299081715680576, + "grad_norm": 1.497151014310327, + "learning_rate": 4.649721016738996e-08, + "loss": 1.2026, + "step": 12000 + }, + { + "epoch": 0.9299081715680576, + "eval_loss": 1.1973105669021606, + "eval_runtime": 319.1351, + "eval_samples_per_second": 35.944, + "eval_steps_per_second": 8.987, + "step": 12000 + }, + { + "epoch": 0.9306830950443644, + "grad_norm": 1.4835484775155086, + "learning_rate": 4.653595784252945e-08, + "loss": 1.2007, + "step": 12010 + }, + { + "epoch": 0.9314580185206711, + "grad_norm": 1.4382058149754826, + "learning_rate": 4.6574705517668944e-08, + "loss": 1.1591, + "step": 12020 + }, + { + "epoch": 0.9322329419969778, + "grad_norm": 1.3774710027842343, + "learning_rate": 4.661345319280843e-08, + "loss": 1.187, + "step": 12030 + }, + { + "epoch": 0.9330078654732845, + "grad_norm": 1.4487842521980094, + "learning_rate": 4.665220086794793e-08, + "loss": 1.2149, + "step": 12040 + }, + { + "epoch": 0.9337827889495912, + "grad_norm": 1.4281731776022961, + "learning_rate": 4.669094854308742e-08, + "loss": 1.1951, + "step": 12050 + }, + { + "epoch": 0.9345577124258979, + "grad_norm": 1.5565496029808525, + "learning_rate": 4.6729696218226906e-08, + "loss": 1.2191, + "step": 12060 + }, + { + "epoch": 0.9353326359022046, + "grad_norm": 1.408616445478415, + "learning_rate": 4.67684438933664e-08, + "loss": 1.2152, + "step": 12070 + }, + { + "epoch": 0.9361075593785114, + "grad_norm": 1.4444482923875686, + "learning_rate": 4.680719156850589e-08, + "loss": 1.1943, + "step": 12080 + }, + { + "epoch": 0.9368824828548181, + "grad_norm": 1.4081842812301963, + "learning_rate": 4.6845939243645386e-08, + "loss": 1.2038, + "step": 12090 + }, + { + "epoch": 0.9376574063311248, + "grad_norm": 1.5699996679230883, + "learning_rate": 4.6884686918784875e-08, + "loss": 1.2002, + "step": 12100 + }, + { + "epoch": 0.9384323298074315, + "grad_norm": 1.4426732739013137, + "learning_rate": 4.6923434593924364e-08, + "loss": 1.2164, + "step": 12110 + }, + { + "epoch": 0.9392072532837382, + "grad_norm": 1.45691015237672, + "learning_rate": 4.696218226906386e-08, + "loss": 1.2104, + "step": 12120 + }, + { + "epoch": 0.9399821767600449, + "grad_norm": 1.4761548167846044, + "learning_rate": 4.700092994420335e-08, + "loss": 1.2081, + "step": 12130 + }, + { + "epoch": 0.9407571002363516, + "grad_norm": 1.4134256372680123, + "learning_rate": 4.7039677619342844e-08, + "loss": 1.1811, + "step": 12140 + }, + { + "epoch": 0.9415320237126584, + "grad_norm": 1.4124268216968432, + "learning_rate": 4.707842529448233e-08, + "loss": 1.2177, + "step": 12150 + }, + { + "epoch": 0.9423069471889651, + "grad_norm": 1.526924227023754, + "learning_rate": 4.711717296962182e-08, + "loss": 1.1945, + "step": 12160 + }, + { + "epoch": 0.9430818706652718, + "grad_norm": 1.4536963837451586, + "learning_rate": 4.715592064476132e-08, + "loss": 1.1699, + "step": 12170 + }, + { + "epoch": 0.9438567941415785, + "grad_norm": 1.4311638305885617, + "learning_rate": 4.7194668319900806e-08, + "loss": 1.1796, + "step": 12180 + }, + { + "epoch": 0.9446317176178852, + "grad_norm": 1.4518441221880614, + "learning_rate": 4.72334159950403e-08, + "loss": 1.197, + "step": 12190 + }, + { + "epoch": 0.9454066410941919, + "grad_norm": 1.6601654248542665, + "learning_rate": 4.727216367017979e-08, + "loss": 1.238, + "step": 12200 + }, + { + "epoch": 0.9461815645704986, + "grad_norm": 1.576048635506624, + "learning_rate": 4.7310911345319286e-08, + "loss": 1.1771, + "step": 12210 + }, + { + "epoch": 0.9469564880468053, + "grad_norm": 1.5472195640572706, + "learning_rate": 4.7349659020458775e-08, + "loss": 1.2009, + "step": 12220 + }, + { + "epoch": 0.9477314115231121, + "grad_norm": 1.4350874499844422, + "learning_rate": 4.7388406695598264e-08, + "loss": 1.1862, + "step": 12230 + }, + { + "epoch": 0.9485063349994188, + "grad_norm": 1.424221456321788, + "learning_rate": 4.742715437073776e-08, + "loss": 1.1944, + "step": 12240 + }, + { + "epoch": 0.9492812584757255, + "grad_norm": 1.4389829354553407, + "learning_rate": 4.746590204587725e-08, + "loss": 1.1959, + "step": 12250 + }, + { + "epoch": 0.9500561819520322, + "grad_norm": 1.4225522068455798, + "learning_rate": 4.7504649721016744e-08, + "loss": 1.1845, + "step": 12260 + }, + { + "epoch": 0.9508311054283389, + "grad_norm": 1.4424866246701062, + "learning_rate": 4.754339739615623e-08, + "loss": 1.2287, + "step": 12270 + }, + { + "epoch": 0.9516060289046456, + "grad_norm": 1.4546425483425631, + "learning_rate": 4.758214507129572e-08, + "loss": 1.1925, + "step": 12280 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 1.4835118065422723, + "learning_rate": 4.762089274643522e-08, + "loss": 1.1849, + "step": 12290 + }, + { + "epoch": 0.9531558758572591, + "grad_norm": 1.4532129680192214, + "learning_rate": 4.7659640421574706e-08, + "loss": 1.2148, + "step": 12300 + }, + { + "epoch": 0.9539307993335658, + "grad_norm": 1.5309076817536955, + "learning_rate": 4.76983880967142e-08, + "loss": 1.2033, + "step": 12310 + }, + { + "epoch": 0.9547057228098725, + "grad_norm": 1.6639353691956489, + "learning_rate": 4.773713577185369e-08, + "loss": 1.1755, + "step": 12320 + }, + { + "epoch": 0.9554806462861792, + "grad_norm": 1.5122010283123022, + "learning_rate": 4.777588344699318e-08, + "loss": 1.1818, + "step": 12330 + }, + { + "epoch": 0.9562555697624859, + "grad_norm": 1.445451126351922, + "learning_rate": 4.7814631122132675e-08, + "loss": 1.189, + "step": 12340 + }, + { + "epoch": 0.9570304932387926, + "grad_norm": 1.4441156758235094, + "learning_rate": 4.7853378797272164e-08, + "loss": 1.1789, + "step": 12350 + }, + { + "epoch": 0.9578054167150993, + "grad_norm": 1.4870199270692956, + "learning_rate": 4.789212647241166e-08, + "loss": 1.1989, + "step": 12360 + }, + { + "epoch": 0.9585803401914061, + "grad_norm": 1.4666667983650805, + "learning_rate": 4.793087414755115e-08, + "loss": 1.1914, + "step": 12370 + }, + { + "epoch": 0.9593552636677128, + "grad_norm": 1.4875536934528273, + "learning_rate": 4.7969621822690643e-08, + "loss": 1.1945, + "step": 12380 + }, + { + "epoch": 0.9601301871440195, + "grad_norm": 1.5074311231389015, + "learning_rate": 4.800836949783013e-08, + "loss": 1.1998, + "step": 12390 + }, + { + "epoch": 0.9609051106203262, + "grad_norm": 1.4693091305108916, + "learning_rate": 4.804711717296962e-08, + "loss": 1.2074, + "step": 12400 + }, + { + "epoch": 0.9616800340966329, + "grad_norm": 1.4884337568967092, + "learning_rate": 4.808586484810912e-08, + "loss": 1.2121, + "step": 12410 + }, + { + "epoch": 0.9624549575729396, + "grad_norm": 1.4851063788815924, + "learning_rate": 4.8124612523248606e-08, + "loss": 1.1744, + "step": 12420 + }, + { + "epoch": 0.9632298810492463, + "grad_norm": 1.4422578023864576, + "learning_rate": 4.81633601983881e-08, + "loss": 1.1691, + "step": 12430 + }, + { + "epoch": 0.9640048045255531, + "grad_norm": 1.4511562518621561, + "learning_rate": 4.820210787352759e-08, + "loss": 1.179, + "step": 12440 + }, + { + "epoch": 0.9647797280018598, + "grad_norm": 1.5135481360171625, + "learning_rate": 4.824085554866708e-08, + "loss": 1.2061, + "step": 12450 + }, + { + "epoch": 0.9655546514781665, + "grad_norm": 1.6387488779812396, + "learning_rate": 4.8279603223806574e-08, + "loss": 1.1714, + "step": 12460 + }, + { + "epoch": 0.9663295749544732, + "grad_norm": 1.3970107716895621, + "learning_rate": 4.8318350898946063e-08, + "loss": 1.1919, + "step": 12470 + }, + { + "epoch": 0.96710449843078, + "grad_norm": 1.4366960199986, + "learning_rate": 4.835709857408556e-08, + "loss": 1.1869, + "step": 12480 + }, + { + "epoch": 0.9678794219070866, + "grad_norm": 1.4492418696879024, + "learning_rate": 4.839584624922505e-08, + "loss": 1.1937, + "step": 12490 + }, + { + "epoch": 0.9686543453833933, + "grad_norm": 1.4273221706710268, + "learning_rate": 4.843459392436454e-08, + "loss": 1.2104, + "step": 12500 + }, + { + "epoch": 0.9686543453833933, + "eval_loss": 1.188887119293213, + "eval_runtime": 318.108, + "eval_samples_per_second": 36.06, + "eval_steps_per_second": 9.016, + "step": 12500 + }, + { + "epoch": 0.9694292688597002, + "grad_norm": 1.4627638138400594, + "learning_rate": 4.847334159950403e-08, + "loss": 1.1999, + "step": 12510 + }, + { + "epoch": 0.9702041923360069, + "grad_norm": 1.482203537747869, + "learning_rate": 4.851208927464352e-08, + "loss": 1.2123, + "step": 12520 + }, + { + "epoch": 0.9709791158123136, + "grad_norm": 1.483639168091874, + "learning_rate": 4.8550836949783017e-08, + "loss": 1.1539, + "step": 12530 + }, + { + "epoch": 0.9717540392886203, + "grad_norm": 1.6582204769314695, + "learning_rate": 4.8589584624922505e-08, + "loss": 1.1995, + "step": 12540 + }, + { + "epoch": 0.972528962764927, + "grad_norm": 1.4353105854789079, + "learning_rate": 4.8628332300062e-08, + "loss": 1.1928, + "step": 12550 + }, + { + "epoch": 0.9733038862412337, + "grad_norm": 1.4083380313946923, + "learning_rate": 4.866707997520149e-08, + "loss": 1.1977, + "step": 12560 + }, + { + "epoch": 0.9740788097175404, + "grad_norm": 1.4205198660325031, + "learning_rate": 4.870582765034098e-08, + "loss": 1.1827, + "step": 12570 + }, + { + "epoch": 0.9748537331938472, + "grad_norm": 1.4625865404608225, + "learning_rate": 4.8744575325480474e-08, + "loss": 1.1721, + "step": 12580 + }, + { + "epoch": 0.9756286566701539, + "grad_norm": 1.4034398976712643, + "learning_rate": 4.878332300061996e-08, + "loss": 1.1987, + "step": 12590 + }, + { + "epoch": 0.9764035801464606, + "grad_norm": 1.5357236320053564, + "learning_rate": 4.882207067575946e-08, + "loss": 1.1825, + "step": 12600 + }, + { + "epoch": 0.9771785036227673, + "grad_norm": 1.6238566851774867, + "learning_rate": 4.886081835089895e-08, + "loss": 1.1791, + "step": 12610 + }, + { + "epoch": 0.977953427099074, + "grad_norm": 1.4194682564026198, + "learning_rate": 4.8899566026038436e-08, + "loss": 1.1699, + "step": 12620 + }, + { + "epoch": 0.9787283505753807, + "grad_norm": 1.4126766535502762, + "learning_rate": 4.893831370117793e-08, + "loss": 1.1942, + "step": 12630 + }, + { + "epoch": 0.9795032740516874, + "grad_norm": 1.3569033475168424, + "learning_rate": 4.897706137631742e-08, + "loss": 1.1836, + "step": 12640 + }, + { + "epoch": 0.980278197527994, + "grad_norm": 1.4798265424684853, + "learning_rate": 4.9015809051456916e-08, + "loss": 1.2291, + "step": 12650 + }, + { + "epoch": 0.9810531210043009, + "grad_norm": 1.4670252392677992, + "learning_rate": 4.9054556726596405e-08, + "loss": 1.1875, + "step": 12660 + }, + { + "epoch": 0.9818280444806076, + "grad_norm": 1.4520623548225513, + "learning_rate": 4.90933044017359e-08, + "loss": 1.1831, + "step": 12670 + }, + { + "epoch": 0.9826029679569143, + "grad_norm": 1.3886125826680755, + "learning_rate": 4.913205207687539e-08, + "loss": 1.1849, + "step": 12680 + }, + { + "epoch": 0.983377891433221, + "grad_norm": 1.3914442782106728, + "learning_rate": 4.917079975201488e-08, + "loss": 1.1876, + "step": 12690 + }, + { + "epoch": 0.9841528149095277, + "grad_norm": 1.426187123396002, + "learning_rate": 4.9209547427154374e-08, + "loss": 1.1922, + "step": 12700 + }, + { + "epoch": 0.9849277383858344, + "grad_norm": 1.525826227228044, + "learning_rate": 4.924829510229386e-08, + "loss": 1.1713, + "step": 12710 + }, + { + "epoch": 0.9857026618621411, + "grad_norm": 1.5104726868348086, + "learning_rate": 4.928704277743336e-08, + "loss": 1.1865, + "step": 12720 + }, + { + "epoch": 0.9864775853384479, + "grad_norm": 1.43840372477839, + "learning_rate": 4.932579045257285e-08, + "loss": 1.1806, + "step": 12730 + }, + { + "epoch": 0.9872525088147546, + "grad_norm": 1.372421034054577, + "learning_rate": 4.9364538127712336e-08, + "loss": 1.1524, + "step": 12740 + }, + { + "epoch": 0.9880274322910613, + "grad_norm": 1.5718261909040678, + "learning_rate": 4.940328580285183e-08, + "loss": 1.1759, + "step": 12750 + }, + { + "epoch": 0.988802355767368, + "grad_norm": 1.3961572582226334, + "learning_rate": 4.944203347799132e-08, + "loss": 1.1681, + "step": 12760 + }, + { + "epoch": 0.9895772792436747, + "grad_norm": 1.5541611649785163, + "learning_rate": 4.9480781153130816e-08, + "loss": 1.1722, + "step": 12770 + }, + { + "epoch": 0.9903522027199814, + "grad_norm": 1.4410140265607057, + "learning_rate": 4.9519528828270305e-08, + "loss": 1.1769, + "step": 12780 + }, + { + "epoch": 0.9911271261962881, + "grad_norm": 1.5437150606159806, + "learning_rate": 4.9558276503409794e-08, + "loss": 1.2009, + "step": 12790 + }, + { + "epoch": 0.9919020496725949, + "grad_norm": 1.4457640018287492, + "learning_rate": 4.959702417854929e-08, + "loss": 1.1612, + "step": 12800 + }, + { + "epoch": 0.9926769731489016, + "grad_norm": 1.5059292758038088, + "learning_rate": 4.963577185368878e-08, + "loss": 1.1595, + "step": 12810 + }, + { + "epoch": 0.9934518966252083, + "grad_norm": 1.4771036049067767, + "learning_rate": 4.9674519528828274e-08, + "loss": 1.1786, + "step": 12820 + }, + { + "epoch": 0.994226820101515, + "grad_norm": 1.3946727183930374, + "learning_rate": 4.971326720396776e-08, + "loss": 1.1958, + "step": 12830 + }, + { + "epoch": 0.9950017435778217, + "grad_norm": 1.3567996227610806, + "learning_rate": 4.975201487910726e-08, + "loss": 1.1733, + "step": 12840 + }, + { + "epoch": 0.9957766670541284, + "grad_norm": 1.3766264745439676, + "learning_rate": 4.979076255424675e-08, + "loss": 1.1776, + "step": 12850 + }, + { + "epoch": 0.9965515905304351, + "grad_norm": 1.6656386847278137, + "learning_rate": 4.9829510229386236e-08, + "loss": 1.1833, + "step": 12860 + }, + { + "epoch": 0.9973265140067419, + "grad_norm": 1.4114048958316372, + "learning_rate": 4.986825790452573e-08, + "loss": 1.2066, + "step": 12870 + }, + { + "epoch": 0.9981014374830486, + "grad_norm": 1.4677433975352736, + "learning_rate": 4.990700557966522e-08, + "loss": 1.1732, + "step": 12880 + }, + { + "epoch": 0.9988763609593553, + "grad_norm": 1.4383692872071803, + "learning_rate": 4.9945753254804716e-08, + "loss": 1.2044, + "step": 12890 + }, + { + "epoch": 0.999651284435662, + "grad_norm": 2.15667854245829, + "learning_rate": 4.9984500929944205e-08, + "loss": 1.1985, + "step": 12900 + }, + { + "epoch": 1.0004262079119688, + "grad_norm": 1.405448731152059, + "learning_rate": 5.0023248605083694e-08, + "loss": 1.177, + "step": 12910 + }, + { + "epoch": 1.0012011313882754, + "grad_norm": 1.5834010355554942, + "learning_rate": 5.006199628022319e-08, + "loss": 1.1972, + "step": 12920 + }, + { + "epoch": 1.0019760548645822, + "grad_norm": 1.383906016617411, + "learning_rate": 5.010074395536268e-08, + "loss": 1.1707, + "step": 12930 + }, + { + "epoch": 1.0027509783408888, + "grad_norm": 1.4462814910981303, + "learning_rate": 5.0139491630502174e-08, + "loss": 1.208, + "step": 12940 + }, + { + "epoch": 1.0035259018171956, + "grad_norm": 1.4082361915400368, + "learning_rate": 5.017823930564166e-08, + "loss": 1.216, + "step": 12950 + }, + { + "epoch": 1.0043008252935022, + "grad_norm": 1.5667032781110084, + "learning_rate": 5.021698698078116e-08, + "loss": 1.2026, + "step": 12960 + }, + { + "epoch": 1.005075748769809, + "grad_norm": 1.523931570704954, + "learning_rate": 5.025573465592065e-08, + "loss": 1.1824, + "step": 12970 + }, + { + "epoch": 1.0058506722461158, + "grad_norm": 1.4678253312633198, + "learning_rate": 5.0294482331060136e-08, + "loss": 1.2018, + "step": 12980 + }, + { + "epoch": 1.0066255957224224, + "grad_norm": 1.6719544229190182, + "learning_rate": 5.033323000619963e-08, + "loss": 1.1917, + "step": 12990 + }, + { + "epoch": 1.0074005191987292, + "grad_norm": 1.59592703067135, + "learning_rate": 5.037197768133912e-08, + "loss": 1.1747, + "step": 13000 + }, + { + "epoch": 1.0074005191987292, + "eval_loss": 1.180772304534912, + "eval_runtime": 320.4299, + "eval_samples_per_second": 35.799, + "eval_steps_per_second": 8.95, + "step": 13000 + }, + { + "epoch": 1.0081754426750358, + "grad_norm": 1.3996454337041404, + "learning_rate": 5.0410725356478616e-08, + "loss": 1.1723, + "step": 13010 + }, + { + "epoch": 1.0089503661513426, + "grad_norm": 1.643828352014698, + "learning_rate": 5.0449473031618105e-08, + "loss": 1.1928, + "step": 13020 + }, + { + "epoch": 1.0097252896276492, + "grad_norm": 1.579189862198854, + "learning_rate": 5.0488220706757594e-08, + "loss": 1.1747, + "step": 13030 + }, + { + "epoch": 1.010500213103956, + "grad_norm": 1.399826535739909, + "learning_rate": 5.052696838189709e-08, + "loss": 1.1945, + "step": 13040 + }, + { + "epoch": 1.0112751365802628, + "grad_norm": 1.4330583313296184, + "learning_rate": 5.056571605703658e-08, + "loss": 1.1835, + "step": 13050 + }, + { + "epoch": 1.0120500600565694, + "grad_norm": 1.394486027664434, + "learning_rate": 5.0604463732176073e-08, + "loss": 1.1715, + "step": 13060 + }, + { + "epoch": 1.0128249835328762, + "grad_norm": 1.49339221850534, + "learning_rate": 5.064321140731556e-08, + "loss": 1.1742, + "step": 13070 + }, + { + "epoch": 1.0135999070091828, + "grad_norm": 1.4016522148240556, + "learning_rate": 5.068195908245505e-08, + "loss": 1.1861, + "step": 13080 + }, + { + "epoch": 1.0143748304854896, + "grad_norm": 1.3834616228975694, + "learning_rate": 5.072070675759455e-08, + "loss": 1.1601, + "step": 13090 + }, + { + "epoch": 1.0151497539617962, + "grad_norm": 1.4123852279201505, + "learning_rate": 5.0759454432734036e-08, + "loss": 1.1661, + "step": 13100 + }, + { + "epoch": 1.015924677438103, + "grad_norm": 1.4895226101280274, + "learning_rate": 5.079820210787353e-08, + "loss": 1.197, + "step": 13110 + }, + { + "epoch": 1.0166996009144098, + "grad_norm": 1.4309318525555708, + "learning_rate": 5.083694978301302e-08, + "loss": 1.2005, + "step": 13120 + }, + { + "epoch": 1.0174745243907164, + "grad_norm": 1.3270336579240327, + "learning_rate": 5.0875697458152516e-08, + "loss": 1.1601, + "step": 13130 + }, + { + "epoch": 1.0182494478670232, + "grad_norm": 1.4609181805574762, + "learning_rate": 5.0914445133292004e-08, + "loss": 1.1548, + "step": 13140 + }, + { + "epoch": 1.0190243713433298, + "grad_norm": 1.6066937515915873, + "learning_rate": 5.0953192808431493e-08, + "loss": 1.178, + "step": 13150 + }, + { + "epoch": 1.0197992948196366, + "grad_norm": 1.5494436244124818, + "learning_rate": 5.099194048357099e-08, + "loss": 1.1774, + "step": 13160 + }, + { + "epoch": 1.0205742182959432, + "grad_norm": 1.4694114096481632, + "learning_rate": 5.103068815871048e-08, + "loss": 1.1857, + "step": 13170 + }, + { + "epoch": 1.02134914177225, + "grad_norm": 1.4441600575064957, + "learning_rate": 5.106943583384997e-08, + "loss": 1.1873, + "step": 13180 + }, + { + "epoch": 1.0221240652485568, + "grad_norm": 1.456418443115685, + "learning_rate": 5.110818350898946e-08, + "loss": 1.1882, + "step": 13190 + }, + { + "epoch": 1.0228989887248634, + "grad_norm": 1.44491295767725, + "learning_rate": 5.114693118412895e-08, + "loss": 1.1678, + "step": 13200 + }, + { + "epoch": 1.0236739122011702, + "grad_norm": 1.445530152718081, + "learning_rate": 5.1185678859268447e-08, + "loss": 1.1817, + "step": 13210 + }, + { + "epoch": 1.0244488356774768, + "grad_norm": 1.53580152407905, + "learning_rate": 5.1224426534407935e-08, + "loss": 1.1958, + "step": 13220 + }, + { + "epoch": 1.0252237591537836, + "grad_norm": 1.7824748834940596, + "learning_rate": 5.126317420954743e-08, + "loss": 1.2013, + "step": 13230 + }, + { + "epoch": 1.0259986826300902, + "grad_norm": 1.5101953052693906, + "learning_rate": 5.130192188468692e-08, + "loss": 1.1781, + "step": 13240 + }, + { + "epoch": 1.026773606106397, + "grad_norm": 1.3816355453901696, + "learning_rate": 5.134066955982641e-08, + "loss": 1.1939, + "step": 13250 + }, + { + "epoch": 1.0275485295827038, + "grad_norm": 1.4097023423474597, + "learning_rate": 5.1379417234965904e-08, + "loss": 1.1786, + "step": 13260 + }, + { + "epoch": 1.0283234530590104, + "grad_norm": 1.6138985649698192, + "learning_rate": 5.141816491010539e-08, + "loss": 1.1884, + "step": 13270 + }, + { + "epoch": 1.0290983765353172, + "grad_norm": 1.3868727539743344, + "learning_rate": 5.145691258524489e-08, + "loss": 1.1584, + "step": 13280 + }, + { + "epoch": 1.0298733000116238, + "grad_norm": 1.485850560806282, + "learning_rate": 5.149566026038438e-08, + "loss": 1.1824, + "step": 13290 + }, + { + "epoch": 1.0306482234879306, + "grad_norm": 1.4786096878954536, + "learning_rate": 5.153440793552387e-08, + "loss": 1.1601, + "step": 13300 + }, + { + "epoch": 1.0314231469642372, + "grad_norm": 1.4961687464747933, + "learning_rate": 5.157315561066336e-08, + "loss": 1.185, + "step": 13310 + }, + { + "epoch": 1.032198070440544, + "grad_norm": 1.4226908316940592, + "learning_rate": 5.161190328580285e-08, + "loss": 1.1727, + "step": 13320 + }, + { + "epoch": 1.0329729939168506, + "grad_norm": 1.4473050125681923, + "learning_rate": 5.1650650960942346e-08, + "loss": 1.1899, + "step": 13330 + }, + { + "epoch": 1.0337479173931574, + "grad_norm": 1.4468523714026926, + "learning_rate": 5.1689398636081835e-08, + "loss": 1.1706, + "step": 13340 + }, + { + "epoch": 1.0345228408694642, + "grad_norm": 1.511507902058324, + "learning_rate": 5.172814631122133e-08, + "loss": 1.1923, + "step": 13350 + }, + { + "epoch": 1.0352977643457708, + "grad_norm": 1.5237396863083819, + "learning_rate": 5.176689398636082e-08, + "loss": 1.1685, + "step": 13360 + }, + { + "epoch": 1.0360726878220776, + "grad_norm": 1.429475766718335, + "learning_rate": 5.180564166150031e-08, + "loss": 1.1886, + "step": 13370 + }, + { + "epoch": 1.0368476112983842, + "grad_norm": 1.3701997356024405, + "learning_rate": 5.1844389336639804e-08, + "loss": 1.1489, + "step": 13380 + }, + { + "epoch": 1.037622534774691, + "grad_norm": 1.498124421146957, + "learning_rate": 5.188313701177929e-08, + "loss": 1.1979, + "step": 13390 + }, + { + "epoch": 1.0383974582509976, + "grad_norm": 1.4444413605272872, + "learning_rate": 5.192188468691879e-08, + "loss": 1.1525, + "step": 13400 + }, + { + "epoch": 1.0391723817273044, + "grad_norm": 1.438331975978573, + "learning_rate": 5.196063236205828e-08, + "loss": 1.1648, + "step": 13410 + }, + { + "epoch": 1.0399473052036112, + "grad_norm": 1.4495037793279717, + "learning_rate": 5.199938003719777e-08, + "loss": 1.1619, + "step": 13420 + }, + { + "epoch": 1.0407222286799178, + "grad_norm": 1.4697057300669891, + "learning_rate": 5.203812771233726e-08, + "loss": 1.1815, + "step": 13430 + }, + { + "epoch": 1.0414971521562246, + "grad_norm": 1.4506450240286777, + "learning_rate": 5.207687538747675e-08, + "loss": 1.1943, + "step": 13440 + }, + { + "epoch": 1.0422720756325312, + "grad_norm": 1.4298325457112608, + "learning_rate": 5.2115623062616246e-08, + "loss": 1.2127, + "step": 13450 + }, + { + "epoch": 1.043046999108838, + "grad_norm": 1.4874906778766723, + "learning_rate": 5.2154370737755735e-08, + "loss": 1.167, + "step": 13460 + }, + { + "epoch": 1.0438219225851446, + "grad_norm": 1.4220066755752625, + "learning_rate": 5.219311841289523e-08, + "loss": 1.1545, + "step": 13470 + }, + { + "epoch": 1.0445968460614514, + "grad_norm": 1.4176911159172887, + "learning_rate": 5.223186608803472e-08, + "loss": 1.1885, + "step": 13480 + }, + { + "epoch": 1.0453717695377582, + "grad_norm": 1.433162453266465, + "learning_rate": 5.227061376317421e-08, + "loss": 1.17, + "step": 13490 + }, + { + "epoch": 1.0461466930140648, + "grad_norm": 1.4192586982250581, + "learning_rate": 5.2309361438313704e-08, + "loss": 1.1712, + "step": 13500 + }, + { + "epoch": 1.0461466930140648, + "eval_loss": 1.1729521751403809, + "eval_runtime": 319.5472, + "eval_samples_per_second": 35.898, + "eval_steps_per_second": 8.975, + "step": 13500 + }, + { + "epoch": 1.0469216164903716, + "grad_norm": 1.3761073080135229, + "learning_rate": 5.234810911345319e-08, + "loss": 1.1783, + "step": 13510 + }, + { + "epoch": 1.0476965399666782, + "grad_norm": 1.4240470460011918, + "learning_rate": 5.238685678859269e-08, + "loss": 1.172, + "step": 13520 + }, + { + "epoch": 1.048471463442985, + "grad_norm": 1.4566714187111705, + "learning_rate": 5.242560446373218e-08, + "loss": 1.1754, + "step": 13530 + }, + { + "epoch": 1.0492463869192916, + "grad_norm": 1.5022567722442268, + "learning_rate": 5.2464352138871666e-08, + "loss": 1.1948, + "step": 13540 + }, + { + "epoch": 1.0500213103955984, + "grad_norm": 1.4450381486291863, + "learning_rate": 5.250309981401116e-08, + "loss": 1.167, + "step": 13550 + }, + { + "epoch": 1.0507962338719052, + "grad_norm": 1.3202477615576236, + "learning_rate": 5.254184748915065e-08, + "loss": 1.1562, + "step": 13560 + }, + { + "epoch": 1.0515711573482118, + "grad_norm": 1.5263434065561485, + "learning_rate": 5.2580595164290146e-08, + "loss": 1.1287, + "step": 13570 + }, + { + "epoch": 1.0523460808245186, + "grad_norm": 1.4954904580900572, + "learning_rate": 5.2619342839429635e-08, + "loss": 1.1751, + "step": 13580 + }, + { + "epoch": 1.0531210043008252, + "grad_norm": 1.5338224796248092, + "learning_rate": 5.265809051456913e-08, + "loss": 1.193, + "step": 13590 + }, + { + "epoch": 1.053895927777132, + "grad_norm": 1.3082610160874364, + "learning_rate": 5.269683818970862e-08, + "loss": 1.1722, + "step": 13600 + }, + { + "epoch": 1.0546708512534386, + "grad_norm": 1.5244479394128223, + "learning_rate": 5.273558586484811e-08, + "loss": 1.1701, + "step": 13610 + }, + { + "epoch": 1.0554457747297454, + "grad_norm": 1.4326136345151987, + "learning_rate": 5.2774333539987604e-08, + "loss": 1.2089, + "step": 13620 + }, + { + "epoch": 1.0562206982060522, + "grad_norm": 1.4594887247106199, + "learning_rate": 5.281308121512709e-08, + "loss": 1.1485, + "step": 13630 + }, + { + "epoch": 1.0569956216823588, + "grad_norm": 1.4158417191987858, + "learning_rate": 5.285182889026659e-08, + "loss": 1.1599, + "step": 13640 + }, + { + "epoch": 1.0577705451586656, + "grad_norm": 1.381559032737556, + "learning_rate": 5.289057656540608e-08, + "loss": 1.1712, + "step": 13650 + }, + { + "epoch": 1.0585454686349722, + "grad_norm": 1.4028510875653482, + "learning_rate": 5.2929324240545566e-08, + "loss": 1.1539, + "step": 13660 + }, + { + "epoch": 1.059320392111279, + "grad_norm": 1.3873982086129424, + "learning_rate": 5.296807191568506e-08, + "loss": 1.1669, + "step": 13670 + }, + { + "epoch": 1.0600953155875856, + "grad_norm": 1.398534036095938, + "learning_rate": 5.300681959082455e-08, + "loss": 1.19, + "step": 13680 + }, + { + "epoch": 1.0608702390638924, + "grad_norm": 1.4628806442624256, + "learning_rate": 5.3045567265964046e-08, + "loss": 1.1765, + "step": 13690 + }, + { + "epoch": 1.0616451625401992, + "grad_norm": 1.394350920369252, + "learning_rate": 5.3084314941103535e-08, + "loss": 1.1727, + "step": 13700 + }, + { + "epoch": 1.0624200860165058, + "grad_norm": 3.785348743266935, + "learning_rate": 5.3123062616243023e-08, + "loss": 1.1606, + "step": 13710 + }, + { + "epoch": 1.0631950094928126, + "grad_norm": 1.494183541970159, + "learning_rate": 5.316181029138252e-08, + "loss": 1.1931, + "step": 13720 + }, + { + "epoch": 1.0639699329691192, + "grad_norm": 1.4764330882811298, + "learning_rate": 5.320055796652201e-08, + "loss": 1.1493, + "step": 13730 + }, + { + "epoch": 1.064744856445426, + "grad_norm": 1.4352481147870744, + "learning_rate": 5.3239305641661503e-08, + "loss": 1.1723, + "step": 13740 + }, + { + "epoch": 1.0655197799217326, + "grad_norm": 1.447935715515503, + "learning_rate": 5.327805331680099e-08, + "loss": 1.2011, + "step": 13750 + }, + { + "epoch": 1.0662947033980394, + "grad_norm": 1.5079859752186908, + "learning_rate": 5.331680099194049e-08, + "loss": 1.1451, + "step": 13760 + }, + { + "epoch": 1.0670696268743463, + "grad_norm": 1.4167085883368684, + "learning_rate": 5.3355548667079977e-08, + "loss": 1.1716, + "step": 13770 + }, + { + "epoch": 1.0678445503506528, + "grad_norm": 1.5705843570170412, + "learning_rate": 5.3394296342219466e-08, + "loss": 1.1863, + "step": 13780 + }, + { + "epoch": 1.0686194738269597, + "grad_norm": 1.5432358673681597, + "learning_rate": 5.343304401735896e-08, + "loss": 1.1907, + "step": 13790 + }, + { + "epoch": 1.0693943973032662, + "grad_norm": 1.372728245878669, + "learning_rate": 5.347179169249845e-08, + "loss": 1.1799, + "step": 13800 + }, + { + "epoch": 1.070169320779573, + "grad_norm": 1.4589465462224176, + "learning_rate": 5.3510539367637945e-08, + "loss": 1.1759, + "step": 13810 + }, + { + "epoch": 1.0709442442558796, + "grad_norm": 1.4935351596487199, + "learning_rate": 5.3549287042777434e-08, + "loss": 1.1785, + "step": 13820 + }, + { + "epoch": 1.0717191677321865, + "grad_norm": 1.2939190079339733, + "learning_rate": 5.358803471791692e-08, + "loss": 1.1608, + "step": 13830 + }, + { + "epoch": 1.0724940912084933, + "grad_norm": 1.3701084903754803, + "learning_rate": 5.362678239305642e-08, + "loss": 1.1586, + "step": 13840 + }, + { + "epoch": 1.0732690146847998, + "grad_norm": 1.4710368248109253, + "learning_rate": 5.366553006819591e-08, + "loss": 1.1677, + "step": 13850 + }, + { + "epoch": 1.0740439381611067, + "grad_norm": 1.493427584642573, + "learning_rate": 5.37042777433354e-08, + "loss": 1.1363, + "step": 13860 + }, + { + "epoch": 1.0748188616374132, + "grad_norm": 1.5977500177495585, + "learning_rate": 5.374302541847489e-08, + "loss": 1.188, + "step": 13870 + }, + { + "epoch": 1.07559378511372, + "grad_norm": 1.546626094292046, + "learning_rate": 5.378177309361438e-08, + "loss": 1.164, + "step": 13880 + }, + { + "epoch": 1.0763687085900266, + "grad_norm": 1.37237914212265, + "learning_rate": 5.3820520768753876e-08, + "loss": 1.2092, + "step": 13890 + }, + { + "epoch": 1.0771436320663335, + "grad_norm": 1.4049678803495096, + "learning_rate": 5.3859268443893365e-08, + "loss": 1.1735, + "step": 13900 + }, + { + "epoch": 1.0779185555426403, + "grad_norm": 1.5383096040885331, + "learning_rate": 5.389801611903286e-08, + "loss": 1.1825, + "step": 13910 + }, + { + "epoch": 1.0786934790189469, + "grad_norm": 1.4492135839323663, + "learning_rate": 5.393676379417235e-08, + "loss": 1.1808, + "step": 13920 + }, + { + "epoch": 1.0794684024952537, + "grad_norm": 1.4630696525009115, + "learning_rate": 5.3975511469311845e-08, + "loss": 1.1554, + "step": 13930 + }, + { + "epoch": 1.0802433259715603, + "grad_norm": 1.6764858690936744, + "learning_rate": 5.4014259144451334e-08, + "loss": 1.1571, + "step": 13940 + }, + { + "epoch": 1.081018249447867, + "grad_norm": 1.5209946218981854, + "learning_rate": 5.405300681959082e-08, + "loss": 1.1683, + "step": 13950 + }, + { + "epoch": 1.0817931729241737, + "grad_norm": 1.3893094800760324, + "learning_rate": 5.409175449473032e-08, + "loss": 1.1539, + "step": 13960 + }, + { + "epoch": 1.0825680964004805, + "grad_norm": 1.4190789597308562, + "learning_rate": 5.413050216986981e-08, + "loss": 1.1597, + "step": 13970 + }, + { + "epoch": 1.083343019876787, + "grad_norm": 1.4950446586441288, + "learning_rate": 5.41692498450093e-08, + "loss": 1.1949, + "step": 13980 + }, + { + "epoch": 1.0841179433530939, + "grad_norm": 1.4015546496851392, + "learning_rate": 5.420799752014879e-08, + "loss": 1.1678, + "step": 13990 + }, + { + "epoch": 1.0848928668294007, + "grad_norm": 2.989010290282467, + "learning_rate": 5.424674519528828e-08, + "loss": 1.169, + "step": 14000 + }, + { + "epoch": 1.0848928668294007, + "eval_loss": 1.1655501127243042, + "eval_runtime": 319.9057, + "eval_samples_per_second": 35.857, + "eval_steps_per_second": 8.965, + "step": 14000 + }, + { + "epoch": 1.0856677903057073, + "grad_norm": 1.5148470887732841, + "learning_rate": 5.4285492870427776e-08, + "loss": 1.1533, + "step": 14010 + }, + { + "epoch": 1.086442713782014, + "grad_norm": 1.3939883535993634, + "learning_rate": 5.4324240545567265e-08, + "loss": 1.1576, + "step": 14020 + }, + { + "epoch": 1.0872176372583207, + "grad_norm": 1.4520014555972993, + "learning_rate": 5.436298822070676e-08, + "loss": 1.1807, + "step": 14030 + }, + { + "epoch": 1.0879925607346275, + "grad_norm": 1.4513136384348262, + "learning_rate": 5.440173589584625e-08, + "loss": 1.1586, + "step": 14040 + }, + { + "epoch": 1.0887674842109343, + "grad_norm": 1.568747683548411, + "learning_rate": 5.4440483570985745e-08, + "loss": 1.1576, + "step": 14050 + }, + { + "epoch": 1.0895424076872409, + "grad_norm": 1.472119265947027, + "learning_rate": 5.4479231246125234e-08, + "loss": 1.1789, + "step": 14060 + }, + { + "epoch": 1.0903173311635477, + "grad_norm": 1.3365629482835641, + "learning_rate": 5.451797892126472e-08, + "loss": 1.1912, + "step": 14070 + }, + { + "epoch": 1.0910922546398543, + "grad_norm": 1.378482825347371, + "learning_rate": 5.455672659640422e-08, + "loss": 1.1538, + "step": 14080 + }, + { + "epoch": 1.091867178116161, + "grad_norm": 1.415792644222042, + "learning_rate": 5.459547427154371e-08, + "loss": 1.1393, + "step": 14090 + }, + { + "epoch": 1.0926421015924677, + "grad_norm": 1.4848387763674058, + "learning_rate": 5.46342219466832e-08, + "loss": 1.1392, + "step": 14100 + }, + { + "epoch": 1.0934170250687745, + "grad_norm": 1.6362301625057716, + "learning_rate": 5.467296962182269e-08, + "loss": 1.1648, + "step": 14110 + }, + { + "epoch": 1.094191948545081, + "grad_norm": 1.4505680998589394, + "learning_rate": 5.471171729696218e-08, + "loss": 1.1865, + "step": 14120 + }, + { + "epoch": 1.0949668720213879, + "grad_norm": 2.335664810970196, + "learning_rate": 5.4750464972101676e-08, + "loss": 1.191, + "step": 14130 + }, + { + "epoch": 1.0957417954976947, + "grad_norm": 1.3551913871387091, + "learning_rate": 5.4789212647241165e-08, + "loss": 1.161, + "step": 14140 + }, + { + "epoch": 1.0965167189740013, + "grad_norm": 1.4864866952755762, + "learning_rate": 5.482796032238066e-08, + "loss": 1.1866, + "step": 14150 + }, + { + "epoch": 1.097291642450308, + "grad_norm": 1.4944055557347349, + "learning_rate": 5.486670799752015e-08, + "loss": 1.1792, + "step": 14160 + }, + { + "epoch": 1.0980665659266147, + "grad_norm": 1.393780884254536, + "learning_rate": 5.490545567265964e-08, + "loss": 1.1791, + "step": 14170 + }, + { + "epoch": 1.0988414894029215, + "grad_norm": 1.3632228062050467, + "learning_rate": 5.4944203347799134e-08, + "loss": 1.1803, + "step": 14180 + }, + { + "epoch": 1.0996164128792283, + "grad_norm": 1.4397640027117589, + "learning_rate": 5.498295102293862e-08, + "loss": 1.1557, + "step": 14190 + }, + { + "epoch": 1.1003913363555349, + "grad_norm": 1.4353361776910802, + "learning_rate": 5.502169869807812e-08, + "loss": 1.1768, + "step": 14200 + }, + { + "epoch": 1.1011662598318417, + "grad_norm": 1.3531191827953368, + "learning_rate": 5.506044637321761e-08, + "loss": 1.1563, + "step": 14210 + }, + { + "epoch": 1.1019411833081483, + "grad_norm": 1.5526318119601206, + "learning_rate": 5.50991940483571e-08, + "loss": 1.1539, + "step": 14220 + }, + { + "epoch": 1.102716106784455, + "grad_norm": 1.4989753336699185, + "learning_rate": 5.513794172349659e-08, + "loss": 1.149, + "step": 14230 + }, + { + "epoch": 1.1034910302607617, + "grad_norm": 1.359448038873486, + "learning_rate": 5.517668939863608e-08, + "loss": 1.1609, + "step": 14240 + }, + { + "epoch": 1.1042659537370685, + "grad_norm": 1.3783348558200976, + "learning_rate": 5.5215437073775576e-08, + "loss": 1.1727, + "step": 14250 + }, + { + "epoch": 1.105040877213375, + "grad_norm": 1.4687167930356566, + "learning_rate": 5.5254184748915065e-08, + "loss": 1.1566, + "step": 14260 + }, + { + "epoch": 1.1058158006896819, + "grad_norm": 1.4659741291067852, + "learning_rate": 5.529293242405456e-08, + "loss": 1.1457, + "step": 14270 + }, + { + "epoch": 1.1065907241659887, + "grad_norm": 1.5262664863985318, + "learning_rate": 5.533168009919405e-08, + "loss": 1.1775, + "step": 14280 + }, + { + "epoch": 1.1073656476422953, + "grad_norm": 1.3734826627295473, + "learning_rate": 5.537042777433354e-08, + "loss": 1.1579, + "step": 14290 + }, + { + "epoch": 1.108140571118602, + "grad_norm": 1.501788744833084, + "learning_rate": 5.5409175449473034e-08, + "loss": 1.175, + "step": 14300 + }, + { + "epoch": 1.1089154945949087, + "grad_norm": 1.402135120683112, + "learning_rate": 5.544792312461252e-08, + "loss": 1.151, + "step": 14310 + }, + { + "epoch": 1.1096904180712155, + "grad_norm": 1.342709753051904, + "learning_rate": 5.548667079975202e-08, + "loss": 1.1451, + "step": 14320 + }, + { + "epoch": 1.1104653415475223, + "grad_norm": 1.4886701861197233, + "learning_rate": 5.552541847489151e-08, + "loss": 1.158, + "step": 14330 + }, + { + "epoch": 1.1112402650238289, + "grad_norm": 1.462375965672272, + "learning_rate": 5.5564166150030996e-08, + "loss": 1.1654, + "step": 14340 + }, + { + "epoch": 1.1120151885001357, + "grad_norm": 1.4646455715301472, + "learning_rate": 5.560291382517049e-08, + "loss": 1.1307, + "step": 14350 + }, + { + "epoch": 1.1127901119764423, + "grad_norm": 1.3818063941480732, + "learning_rate": 5.564166150030998e-08, + "loss": 1.1892, + "step": 14360 + }, + { + "epoch": 1.113565035452749, + "grad_norm": 1.4308024993567963, + "learning_rate": 5.5680409175449476e-08, + "loss": 1.1543, + "step": 14370 + }, + { + "epoch": 1.1143399589290557, + "grad_norm": 1.414979752541956, + "learning_rate": 5.5719156850588965e-08, + "loss": 1.159, + "step": 14380 + }, + { + "epoch": 1.1151148824053625, + "grad_norm": 1.3685941878392465, + "learning_rate": 5.575790452572846e-08, + "loss": 1.1652, + "step": 14390 + }, + { + "epoch": 1.115889805881669, + "grad_norm": 1.3522380309955804, + "learning_rate": 5.579665220086795e-08, + "loss": 1.161, + "step": 14400 + }, + { + "epoch": 1.116664729357976, + "grad_norm": 1.3472039443093224, + "learning_rate": 5.583539987600744e-08, + "loss": 1.1649, + "step": 14410 + }, + { + "epoch": 1.1174396528342827, + "grad_norm": 1.4135690848497124, + "learning_rate": 5.5874147551146933e-08, + "loss": 1.1667, + "step": 14420 + }, + { + "epoch": 1.1182145763105893, + "grad_norm": 1.4676650338333375, + "learning_rate": 5.591289522628642e-08, + "loss": 1.1598, + "step": 14430 + }, + { + "epoch": 1.118989499786896, + "grad_norm": 1.4300003420207048, + "learning_rate": 5.595164290142592e-08, + "loss": 1.1579, + "step": 14440 + }, + { + "epoch": 1.1197644232632027, + "grad_norm": 1.5898094596681132, + "learning_rate": 5.5990390576565407e-08, + "loss": 1.1719, + "step": 14450 + }, + { + "epoch": 1.1205393467395095, + "grad_norm": 1.388415486476465, + "learning_rate": 5.6029138251704896e-08, + "loss": 1.1615, + "step": 14460 + }, + { + "epoch": 1.121314270215816, + "grad_norm": 1.421230930560616, + "learning_rate": 5.606788592684439e-08, + "loss": 1.132, + "step": 14470 + }, + { + "epoch": 1.122089193692123, + "grad_norm": 1.5136137585671192, + "learning_rate": 5.610663360198388e-08, + "loss": 1.1545, + "step": 14480 + }, + { + "epoch": 1.1228641171684297, + "grad_norm": 1.513263803997006, + "learning_rate": 5.6145381277123375e-08, + "loss": 1.1721, + "step": 14490 + }, + { + "epoch": 1.1236390406447363, + "grad_norm": 1.486740442542544, + "learning_rate": 5.6184128952262864e-08, + "loss": 1.1705, + "step": 14500 + }, + { + "epoch": 1.1236390406447363, + "eval_loss": 1.1583832502365112, + "eval_runtime": 321.3101, + "eval_samples_per_second": 35.701, + "eval_steps_per_second": 8.926, + "step": 14500 + }, + { + "epoch": 1.124413964121043, + "grad_norm": 1.3645535545410896, + "learning_rate": 5.622287662740236e-08, + "loss": 1.1368, + "step": 14510 + }, + { + "epoch": 1.1251888875973497, + "grad_norm": 1.411816586117511, + "learning_rate": 5.626162430254185e-08, + "loss": 1.1435, + "step": 14520 + }, + { + "epoch": 1.1259638110736565, + "grad_norm": 1.4076120541222064, + "learning_rate": 5.630037197768134e-08, + "loss": 1.1663, + "step": 14530 + }, + { + "epoch": 1.126738734549963, + "grad_norm": 3.5630597226860523, + "learning_rate": 5.633911965282083e-08, + "loss": 1.1676, + "step": 14540 + }, + { + "epoch": 1.12751365802627, + "grad_norm": 1.4839320048449718, + "learning_rate": 5.637786732796032e-08, + "loss": 1.1703, + "step": 14550 + }, + { + "epoch": 1.1282885815025767, + "grad_norm": 1.3704251560432714, + "learning_rate": 5.641661500309982e-08, + "loss": 1.1743, + "step": 14560 + }, + { + "epoch": 1.1290635049788833, + "grad_norm": 1.5171921794740844, + "learning_rate": 5.6455362678239306e-08, + "loss": 1.1355, + "step": 14570 + }, + { + "epoch": 1.1298384284551901, + "grad_norm": 1.4384346470058138, + "learning_rate": 5.6494110353378795e-08, + "loss": 1.1629, + "step": 14580 + }, + { + "epoch": 1.1306133519314967, + "grad_norm": 1.407117437736937, + "learning_rate": 5.653285802851829e-08, + "loss": 1.1493, + "step": 14590 + }, + { + "epoch": 1.1313882754078035, + "grad_norm": 1.8928982861287296, + "learning_rate": 5.657160570365778e-08, + "loss": 1.1424, + "step": 14600 + }, + { + "epoch": 1.13216319888411, + "grad_norm": 1.4344481549709844, + "learning_rate": 5.6610353378797275e-08, + "loss": 1.1677, + "step": 14610 + }, + { + "epoch": 1.132938122360417, + "grad_norm": 1.4268694202844432, + "learning_rate": 5.6649101053936764e-08, + "loss": 1.1449, + "step": 14620 + }, + { + "epoch": 1.1337130458367237, + "grad_norm": 1.5105259176044241, + "learning_rate": 5.668784872907625e-08, + "loss": 1.1588, + "step": 14630 + }, + { + "epoch": 1.1344879693130303, + "grad_norm": 1.3968005782294652, + "learning_rate": 5.672659640421575e-08, + "loss": 1.164, + "step": 14640 + }, + { + "epoch": 1.1352628927893371, + "grad_norm": 1.4228729790415076, + "learning_rate": 5.676534407935524e-08, + "loss": 1.1455, + "step": 14650 + }, + { + "epoch": 1.1360378162656437, + "grad_norm": 1.4692393061443991, + "learning_rate": 5.680409175449473e-08, + "loss": 1.1952, + "step": 14660 + }, + { + "epoch": 1.1368127397419505, + "grad_norm": 1.3943094733155323, + "learning_rate": 5.684283942963422e-08, + "loss": 1.1655, + "step": 14670 + }, + { + "epoch": 1.137587663218257, + "grad_norm": 1.3504345881641608, + "learning_rate": 5.688158710477372e-08, + "loss": 1.1452, + "step": 14680 + }, + { + "epoch": 1.138362586694564, + "grad_norm": 1.3912363616398848, + "learning_rate": 5.6920334779913206e-08, + "loss": 1.1876, + "step": 14690 + }, + { + "epoch": 1.1391375101708707, + "grad_norm": 1.4298571161580516, + "learning_rate": 5.6959082455052695e-08, + "loss": 1.1534, + "step": 14700 + }, + { + "epoch": 1.1399124336471773, + "grad_norm": 1.4136506463743186, + "learning_rate": 5.699783013019219e-08, + "loss": 1.1556, + "step": 14710 + }, + { + "epoch": 1.1406873571234841, + "grad_norm": 1.4604026368634975, + "learning_rate": 5.703657780533168e-08, + "loss": 1.1892, + "step": 14720 + }, + { + "epoch": 1.1414622805997907, + "grad_norm": 1.39869420652595, + "learning_rate": 5.7075325480471175e-08, + "loss": 1.1456, + "step": 14730 + }, + { + "epoch": 1.1422372040760975, + "grad_norm": 1.4944277219384905, + "learning_rate": 5.7114073155610664e-08, + "loss": 1.1975, + "step": 14740 + }, + { + "epoch": 1.1430121275524041, + "grad_norm": 1.4186709093962275, + "learning_rate": 5.715282083075015e-08, + "loss": 1.1429, + "step": 14750 + }, + { + "epoch": 1.143787051028711, + "grad_norm": 1.4186675096577293, + "learning_rate": 5.719156850588965e-08, + "loss": 1.1679, + "step": 14760 + }, + { + "epoch": 1.1445619745050175, + "grad_norm": 1.398481098407177, + "learning_rate": 5.723031618102914e-08, + "loss": 1.144, + "step": 14770 + }, + { + "epoch": 1.1453368979813243, + "grad_norm": 1.3679152929148193, + "learning_rate": 5.726906385616863e-08, + "loss": 1.1442, + "step": 14780 + }, + { + "epoch": 1.1461118214576311, + "grad_norm": 1.3108238936832994, + "learning_rate": 5.730781153130812e-08, + "loss": 1.1521, + "step": 14790 + }, + { + "epoch": 1.1468867449339377, + "grad_norm": 1.422243508262765, + "learning_rate": 5.734655920644761e-08, + "loss": 1.1671, + "step": 14800 + }, + { + "epoch": 1.1476616684102445, + "grad_norm": 1.337379465767717, + "learning_rate": 5.7385306881587106e-08, + "loss": 1.1906, + "step": 14810 + }, + { + "epoch": 1.1484365918865511, + "grad_norm": 1.4211526579085303, + "learning_rate": 5.7424054556726595e-08, + "loss": 1.1631, + "step": 14820 + }, + { + "epoch": 1.149211515362858, + "grad_norm": 1.4360524402606833, + "learning_rate": 5.746280223186609e-08, + "loss": 1.143, + "step": 14830 + }, + { + "epoch": 1.1499864388391647, + "grad_norm": 1.6275863487572304, + "learning_rate": 5.750154990700558e-08, + "loss": 1.1282, + "step": 14840 + }, + { + "epoch": 1.1507613623154713, + "grad_norm": 3.5835991125648596, + "learning_rate": 5.7540297582145075e-08, + "loss": 1.1652, + "step": 14850 + }, + { + "epoch": 1.1515362857917781, + "grad_norm": 1.3223423353257096, + "learning_rate": 5.7579045257284564e-08, + "loss": 1.1432, + "step": 14860 + }, + { + "epoch": 1.1523112092680847, + "grad_norm": 1.3559564672318603, + "learning_rate": 5.761779293242405e-08, + "loss": 1.1544, + "step": 14870 + }, + { + "epoch": 1.1530861327443915, + "grad_norm": 1.292617311010894, + "learning_rate": 5.765654060756355e-08, + "loss": 1.1399, + "step": 14880 + }, + { + "epoch": 1.1538610562206981, + "grad_norm": 1.3417551000505419, + "learning_rate": 5.769528828270304e-08, + "loss": 1.1759, + "step": 14890 + }, + { + "epoch": 1.154635979697005, + "grad_norm": 1.3504539692827682, + "learning_rate": 5.773403595784253e-08, + "loss": 1.1679, + "step": 14900 + }, + { + "epoch": 1.1554109031733115, + "grad_norm": 1.5071963358410891, + "learning_rate": 5.777278363298202e-08, + "loss": 1.1665, + "step": 14910 + }, + { + "epoch": 1.1561858266496183, + "grad_norm": 1.404964332615735, + "learning_rate": 5.781153130812151e-08, + "loss": 1.1476, + "step": 14920 + }, + { + "epoch": 1.1569607501259251, + "grad_norm": 1.3934157022065017, + "learning_rate": 5.7850278983261006e-08, + "loss": 1.1513, + "step": 14930 + }, + { + "epoch": 1.1577356736022317, + "grad_norm": 1.4534685099805944, + "learning_rate": 5.7889026658400495e-08, + "loss": 1.1483, + "step": 14940 + }, + { + "epoch": 1.1585105970785385, + "grad_norm": 1.4142207839213365, + "learning_rate": 5.792777433353999e-08, + "loss": 1.1638, + "step": 14950 + }, + { + "epoch": 1.1592855205548451, + "grad_norm": 1.4015195123683715, + "learning_rate": 5.796652200867948e-08, + "loss": 1.1553, + "step": 14960 + }, + { + "epoch": 1.160060444031152, + "grad_norm": 1.5586366795509523, + "learning_rate": 5.8005269683818975e-08, + "loss": 1.1853, + "step": 14970 + }, + { + "epoch": 1.1608353675074587, + "grad_norm": 1.3749723530776752, + "learning_rate": 5.8044017358958464e-08, + "loss": 1.1566, + "step": 14980 + }, + { + "epoch": 1.1616102909837653, + "grad_norm": 1.378014948375665, + "learning_rate": 5.808276503409795e-08, + "loss": 1.129, + "step": 14990 + }, + { + "epoch": 1.1623852144600721, + "grad_norm": 1.3475400422596533, + "learning_rate": 5.812151270923745e-08, + "loss": 1.1636, + "step": 15000 + }, + { + "epoch": 1.1623852144600721, + "eval_loss": 1.1515921354293823, + "eval_runtime": 320.5452, + "eval_samples_per_second": 35.786, + "eval_steps_per_second": 8.947, + "step": 15000 + }, + { + "epoch": 1.1631601379363787, + "grad_norm": 1.5417735859519563, + "learning_rate": 5.816026038437694e-08, + "loss": 1.1472, + "step": 15010 + }, + { + "epoch": 1.1639350614126855, + "grad_norm": 1.3812250066397582, + "learning_rate": 5.819900805951643e-08, + "loss": 1.1827, + "step": 15020 + }, + { + "epoch": 1.1647099848889921, + "grad_norm": 1.4325531807938794, + "learning_rate": 5.823775573465592e-08, + "loss": 1.1704, + "step": 15030 + }, + { + "epoch": 1.165484908365299, + "grad_norm": 1.391404450226171, + "learning_rate": 5.827650340979541e-08, + "loss": 1.1357, + "step": 15040 + }, + { + "epoch": 1.1662598318416055, + "grad_norm": 1.472864707241478, + "learning_rate": 5.8315251084934906e-08, + "loss": 1.1673, + "step": 15050 + }, + { + "epoch": 1.1670347553179123, + "grad_norm": 1.3445286768322318, + "learning_rate": 5.8353998760074395e-08, + "loss": 1.1435, + "step": 15060 + }, + { + "epoch": 1.1678096787942192, + "grad_norm": 1.400134399236306, + "learning_rate": 5.839274643521389e-08, + "loss": 1.1402, + "step": 15070 + }, + { + "epoch": 1.1685846022705257, + "grad_norm": 1.3315620183213648, + "learning_rate": 5.843149411035338e-08, + "loss": 1.1618, + "step": 15080 + }, + { + "epoch": 1.1693595257468326, + "grad_norm": 1.3278792918049618, + "learning_rate": 5.847024178549287e-08, + "loss": 1.152, + "step": 15090 + }, + { + "epoch": 1.1701344492231391, + "grad_norm": 1.524643694511296, + "learning_rate": 5.8508989460632363e-08, + "loss": 1.133, + "step": 15100 + }, + { + "epoch": 1.170909372699446, + "grad_norm": 1.3106913634691444, + "learning_rate": 5.854773713577185e-08, + "loss": 1.148, + "step": 15110 + }, + { + "epoch": 1.1716842961757528, + "grad_norm": 1.4116585328304074, + "learning_rate": 5.858648481091135e-08, + "loss": 1.1781, + "step": 15120 + }, + { + "epoch": 1.1724592196520593, + "grad_norm": 1.4213211766658718, + "learning_rate": 5.862523248605084e-08, + "loss": 1.1336, + "step": 15130 + }, + { + "epoch": 1.1732341431283662, + "grad_norm": 1.3869108343085665, + "learning_rate": 5.866398016119034e-08, + "loss": 1.1422, + "step": 15140 + }, + { + "epoch": 1.1740090666046727, + "grad_norm": 1.3911725614539339, + "learning_rate": 5.870272783632983e-08, + "loss": 1.139, + "step": 15150 + }, + { + "epoch": 1.1747839900809796, + "grad_norm": 1.3511450372431764, + "learning_rate": 5.874147551146932e-08, + "loss": 1.1247, + "step": 15160 + }, + { + "epoch": 1.1755589135572861, + "grad_norm": 1.3460019749640018, + "learning_rate": 5.878022318660881e-08, + "loss": 1.1546, + "step": 15170 + }, + { + "epoch": 1.176333837033593, + "grad_norm": 1.340145852556935, + "learning_rate": 5.88189708617483e-08, + "loss": 1.174, + "step": 15180 + }, + { + "epoch": 1.1771087605098995, + "grad_norm": 1.5413894967506179, + "learning_rate": 5.8857718536887796e-08, + "loss": 1.1368, + "step": 15190 + }, + { + "epoch": 1.1778836839862064, + "grad_norm": 1.3792237481325311, + "learning_rate": 5.8896466212027285e-08, + "loss": 1.1678, + "step": 15200 + }, + { + "epoch": 1.1786586074625132, + "grad_norm": 1.4074363665016905, + "learning_rate": 5.893521388716678e-08, + "loss": 1.1478, + "step": 15210 + }, + { + "epoch": 1.1794335309388198, + "grad_norm": 1.4251186097301223, + "learning_rate": 5.897396156230627e-08, + "loss": 1.1533, + "step": 15220 + }, + { + "epoch": 1.1802084544151266, + "grad_norm": 1.332399748388334, + "learning_rate": 5.9012709237445765e-08, + "loss": 1.123, + "step": 15230 + }, + { + "epoch": 1.1809833778914332, + "grad_norm": 1.3060937359662714, + "learning_rate": 5.9051456912585254e-08, + "loss": 1.1557, + "step": 15240 + }, + { + "epoch": 1.18175830136774, + "grad_norm": 1.5382732148322456, + "learning_rate": 5.909020458772474e-08, + "loss": 1.1654, + "step": 15250 + }, + { + "epoch": 1.1825332248440468, + "grad_norm": 1.4323260923539445, + "learning_rate": 5.912895226286424e-08, + "loss": 1.1589, + "step": 15260 + }, + { + "epoch": 1.1833081483203534, + "grad_norm": 1.4034658835998135, + "learning_rate": 5.916769993800373e-08, + "loss": 1.1113, + "step": 15270 + }, + { + "epoch": 1.1840830717966602, + "grad_norm": 1.4381825387035145, + "learning_rate": 5.920644761314322e-08, + "loss": 1.1469, + "step": 15280 + }, + { + "epoch": 1.1848579952729668, + "grad_norm": 1.470028253414063, + "learning_rate": 5.924519528828271e-08, + "loss": 1.1655, + "step": 15290 + }, + { + "epoch": 1.1856329187492736, + "grad_norm": 1.3812874943839795, + "learning_rate": 5.92839429634222e-08, + "loss": 1.157, + "step": 15300 + }, + { + "epoch": 1.1864078422255802, + "grad_norm": 1.4184201110781889, + "learning_rate": 5.9322690638561696e-08, + "loss": 1.1562, + "step": 15310 + }, + { + "epoch": 1.187182765701887, + "grad_norm": 1.4498395641946442, + "learning_rate": 5.9361438313701185e-08, + "loss": 1.1672, + "step": 15320 + }, + { + "epoch": 1.1879576891781936, + "grad_norm": 1.4338036053802325, + "learning_rate": 5.940018598884068e-08, + "loss": 1.1668, + "step": 15330 + }, + { + "epoch": 1.1887326126545004, + "grad_norm": 1.4874743777663602, + "learning_rate": 5.943893366398017e-08, + "loss": 1.1571, + "step": 15340 + }, + { + "epoch": 1.1895075361308072, + "grad_norm": 1.4301401716526854, + "learning_rate": 5.9477681339119665e-08, + "loss": 1.1789, + "step": 15350 + }, + { + "epoch": 1.1902824596071138, + "grad_norm": 1.5149580714652466, + "learning_rate": 5.9516429014259154e-08, + "loss": 1.1376, + "step": 15360 + }, + { + "epoch": 1.1910573830834206, + "grad_norm": 1.4025730374618883, + "learning_rate": 5.955517668939864e-08, + "loss": 1.1384, + "step": 15370 + }, + { + "epoch": 1.1918323065597272, + "grad_norm": 1.3403582749646858, + "learning_rate": 5.959392436453814e-08, + "loss": 1.1479, + "step": 15380 + }, + { + "epoch": 1.192607230036034, + "grad_norm": 1.421720562517927, + "learning_rate": 5.963267203967763e-08, + "loss": 1.1557, + "step": 15390 + }, + { + "epoch": 1.1933821535123406, + "grad_norm": 1.4902001437399324, + "learning_rate": 5.967141971481712e-08, + "loss": 1.1532, + "step": 15400 + }, + { + "epoch": 1.1941570769886474, + "grad_norm": 1.3889819959272476, + "learning_rate": 5.971016738995661e-08, + "loss": 1.1476, + "step": 15410 + }, + { + "epoch": 1.1949320004649542, + "grad_norm": 1.4266590418447094, + "learning_rate": 5.974891506509611e-08, + "loss": 1.174, + "step": 15420 + }, + { + "epoch": 1.1957069239412608, + "grad_norm": 1.4388083715523285, + "learning_rate": 5.978766274023559e-08, + "loss": 1.1227, + "step": 15430 + }, + { + "epoch": 1.1964818474175676, + "grad_norm": 1.3831689393189586, + "learning_rate": 5.982641041537508e-08, + "loss": 1.1624, + "step": 15440 + }, + { + "epoch": 1.1972567708938742, + "grad_norm": 1.4489643309150497, + "learning_rate": 5.986515809051458e-08, + "loss": 1.1627, + "step": 15450 + }, + { + "epoch": 1.198031694370181, + "grad_norm": 1.4355994323196197, + "learning_rate": 5.990390576565408e-08, + "loss": 1.1499, + "step": 15460 + }, + { + "epoch": 1.1988066178464876, + "grad_norm": 1.3398840786616362, + "learning_rate": 5.994265344079356e-08, + "loss": 1.1461, + "step": 15470 + }, + { + "epoch": 1.1995815413227944, + "grad_norm": 1.4649965889300391, + "learning_rate": 5.998140111593305e-08, + "loss": 1.164, + "step": 15480 + }, + { + "epoch": 1.2003564647991012, + "grad_norm": 1.3779686089193925, + "learning_rate": 6.002014879107255e-08, + "loss": 1.1508, + "step": 15490 + }, + { + "epoch": 1.2011313882754078, + "grad_norm": 1.4197362269578746, + "learning_rate": 6.005889646621203e-08, + "loss": 1.1456, + "step": 15500 + }, + { + "epoch": 1.2011313882754078, + "eval_loss": 1.1451845169067383, + "eval_runtime": 319.8901, + "eval_samples_per_second": 35.859, + "eval_steps_per_second": 8.966, + "step": 15500 + }, + { + "epoch": 1.2019063117517146, + "grad_norm": 1.3886605611463738, + "learning_rate": 6.009764414135153e-08, + "loss": 1.1538, + "step": 15510 + }, + { + "epoch": 1.2026812352280212, + "grad_norm": 1.3542307250735348, + "learning_rate": 6.013639181649102e-08, + "loss": 1.1398, + "step": 15520 + }, + { + "epoch": 1.203456158704328, + "grad_norm": 1.505620330934728, + "learning_rate": 6.01751394916305e-08, + "loss": 1.1387, + "step": 15530 + }, + { + "epoch": 1.2042310821806346, + "grad_norm": 1.4033125929885277, + "learning_rate": 6.021388716677e-08, + "loss": 1.1301, + "step": 15540 + }, + { + "epoch": 1.2050060056569414, + "grad_norm": 1.4313110897816521, + "learning_rate": 6.02526348419095e-08, + "loss": 1.1249, + "step": 15550 + }, + { + "epoch": 1.205780929133248, + "grad_norm": 1.445682261518311, + "learning_rate": 6.029138251704899e-08, + "loss": 1.1454, + "step": 15560 + }, + { + "epoch": 1.2065558526095548, + "grad_norm": 1.4430098005360397, + "learning_rate": 6.033013019218847e-08, + "loss": 1.1434, + "step": 15570 + }, + { + "epoch": 1.2073307760858616, + "grad_norm": 1.5010934090044457, + "learning_rate": 6.036887786732797e-08, + "loss": 1.1377, + "step": 15580 + }, + { + "epoch": 1.2081056995621682, + "grad_norm": 1.4522274840641372, + "learning_rate": 6.040762554246746e-08, + "loss": 1.1427, + "step": 15590 + }, + { + "epoch": 1.208880623038475, + "grad_norm": 1.3777559688001473, + "learning_rate": 6.044637321760695e-08, + "loss": 1.1476, + "step": 15600 + }, + { + "epoch": 1.2096555465147816, + "grad_norm": 1.3840333396189484, + "learning_rate": 6.048512089274644e-08, + "loss": 1.1515, + "step": 15610 + }, + { + "epoch": 1.2104304699910884, + "grad_norm": 1.320859060996204, + "learning_rate": 6.052386856788594e-08, + "loss": 1.1133, + "step": 15620 + }, + { + "epoch": 1.2112053934673952, + "grad_norm": 1.3478759309580082, + "learning_rate": 6.056261624302543e-08, + "loss": 1.1377, + "step": 15630 + }, + { + "epoch": 1.2119803169437018, + "grad_norm": 1.4678243285866965, + "learning_rate": 6.060136391816492e-08, + "loss": 1.1373, + "step": 15640 + }, + { + "epoch": 1.2127552404200086, + "grad_norm": 1.4625461527039143, + "learning_rate": 6.064011159330441e-08, + "loss": 1.1227, + "step": 15650 + }, + { + "epoch": 1.2135301638963152, + "grad_norm": 1.4106563032230295, + "learning_rate": 6.067885926844391e-08, + "loss": 1.127, + "step": 15660 + }, + { + "epoch": 1.214305087372622, + "grad_norm": 1.4892222046762726, + "learning_rate": 6.071760694358339e-08, + "loss": 1.1499, + "step": 15670 + }, + { + "epoch": 1.2150800108489286, + "grad_norm": 1.3757659584379336, + "learning_rate": 6.075635461872288e-08, + "loss": 1.1427, + "step": 15680 + }, + { + "epoch": 1.2158549343252354, + "grad_norm": 1.4352757337796838, + "learning_rate": 6.079510229386238e-08, + "loss": 1.1484, + "step": 15690 + }, + { + "epoch": 1.216629857801542, + "grad_norm": 1.274412180220279, + "learning_rate": 6.083384996900186e-08, + "loss": 1.1471, + "step": 15700 + }, + { + "epoch": 1.2174047812778488, + "grad_norm": 1.3742434708452158, + "learning_rate": 6.087259764414136e-08, + "loss": 1.1173, + "step": 15710 + }, + { + "epoch": 1.2181797047541556, + "grad_norm": 1.348325066665464, + "learning_rate": 6.091134531928085e-08, + "loss": 1.1383, + "step": 15720 + }, + { + "epoch": 1.2189546282304622, + "grad_norm": 1.3308356685454392, + "learning_rate": 6.095009299442035e-08, + "loss": 1.1598, + "step": 15730 + }, + { + "epoch": 1.219729551706769, + "grad_norm": 1.5020450996929935, + "learning_rate": 6.098884066955983e-08, + "loss": 1.1139, + "step": 15740 + }, + { + "epoch": 1.2205044751830756, + "grad_norm": 1.3773428961208394, + "learning_rate": 6.102758834469933e-08, + "loss": 1.1399, + "step": 15750 + }, + { + "epoch": 1.2212793986593824, + "grad_norm": 1.2884405821612472, + "learning_rate": 6.106633601983882e-08, + "loss": 1.1318, + "step": 15760 + }, + { + "epoch": 1.2220543221356892, + "grad_norm": 1.3578153941915978, + "learning_rate": 6.11050836949783e-08, + "loss": 1.1069, + "step": 15770 + }, + { + "epoch": 1.2228292456119958, + "grad_norm": 1.4705114359578777, + "learning_rate": 6.11438313701178e-08, + "loss": 1.1424, + "step": 15780 + }, + { + "epoch": 1.2236041690883026, + "grad_norm": 1.3069067623232693, + "learning_rate": 6.11825790452573e-08, + "loss": 1.1693, + "step": 15790 + }, + { + "epoch": 1.2243790925646092, + "grad_norm": 1.3693444887817432, + "learning_rate": 6.122132672039679e-08, + "loss": 1.1478, + "step": 15800 + }, + { + "epoch": 1.225154016040916, + "grad_norm": 1.457055510087766, + "learning_rate": 6.126007439553627e-08, + "loss": 1.1335, + "step": 15810 + }, + { + "epoch": 1.2259289395172226, + "grad_norm": 1.3674059767846147, + "learning_rate": 6.129882207067577e-08, + "loss": 1.1275, + "step": 15820 + }, + { + "epoch": 1.2267038629935294, + "grad_norm": 1.3292533115538832, + "learning_rate": 6.133756974581526e-08, + "loss": 1.1175, + "step": 15830 + }, + { + "epoch": 1.227478786469836, + "grad_norm": 1.5422852158436453, + "learning_rate": 6.137631742095475e-08, + "loss": 1.1245, + "step": 15840 + }, + { + "epoch": 1.2282537099461428, + "grad_norm": 1.4356714379122744, + "learning_rate": 6.141506509609424e-08, + "loss": 1.1339, + "step": 15850 + }, + { + "epoch": 1.2290286334224496, + "grad_norm": 1.9881825389963335, + "learning_rate": 6.145381277123374e-08, + "loss": 1.1425, + "step": 15860 + }, + { + "epoch": 1.2298035568987562, + "grad_norm": 1.4750292576305928, + "learning_rate": 6.149256044637322e-08, + "loss": 1.1522, + "step": 15870 + }, + { + "epoch": 1.230578480375063, + "grad_norm": 1.4654408878017762, + "learning_rate": 6.153130812151272e-08, + "loss": 1.1384, + "step": 15880 + }, + { + "epoch": 1.2313534038513696, + "grad_norm": 1.3566038378560232, + "learning_rate": 6.157005579665221e-08, + "loss": 1.1453, + "step": 15890 + }, + { + "epoch": 1.2321283273276764, + "grad_norm": 1.3146723625342351, + "learning_rate": 6.16088034717917e-08, + "loss": 1.1674, + "step": 15900 + }, + { + "epoch": 1.2329032508039832, + "grad_norm": 1.3635559703116533, + "learning_rate": 6.164755114693119e-08, + "loss": 1.1573, + "step": 15910 + }, + { + "epoch": 1.2336781742802898, + "grad_norm": 1.2869914637941524, + "learning_rate": 6.168629882207068e-08, + "loss": 1.0836, + "step": 15920 + }, + { + "epoch": 1.2344530977565966, + "grad_norm": 1.5378238198670788, + "learning_rate": 6.172504649721018e-08, + "loss": 1.1498, + "step": 15930 + }, + { + "epoch": 1.2352280212329032, + "grad_norm": 1.5219319744786708, + "learning_rate": 6.176379417234966e-08, + "loss": 1.1475, + "step": 15940 + }, + { + "epoch": 1.23600294470921, + "grad_norm": 1.2948907915259853, + "learning_rate": 6.180254184748916e-08, + "loss": 1.121, + "step": 15950 + }, + { + "epoch": 1.2367778681855166, + "grad_norm": 1.3986764810077168, + "learning_rate": 6.184128952262865e-08, + "loss": 1.1279, + "step": 15960 + }, + { + "epoch": 1.2375527916618234, + "grad_norm": 1.380236221486562, + "learning_rate": 6.188003719776815e-08, + "loss": 1.1277, + "step": 15970 + }, + { + "epoch": 1.23832771513813, + "grad_norm": 1.4518454001610546, + "learning_rate": 6.191878487290763e-08, + "loss": 1.1201, + "step": 15980 + }, + { + "epoch": 1.2391026386144368, + "grad_norm": 1.5231808818443904, + "learning_rate": 6.195753254804713e-08, + "loss": 1.1184, + "step": 15990 + }, + { + "epoch": 1.2398775620907436, + "grad_norm": 1.4581992056736874, + "learning_rate": 6.199628022318662e-08, + "loss": 1.1376, + "step": 16000 + }, + { + "epoch": 1.2398775620907436, + "eval_loss": 1.1389532089233398, + "eval_runtime": 321.4598, + "eval_samples_per_second": 35.684, + "eval_steps_per_second": 8.922, + "step": 16000 + }, + { + "epoch": 1.2406524855670502, + "grad_norm": 1.3821809566390144, + "learning_rate": 6.20350278983261e-08, + "loss": 1.1179, + "step": 16010 + }, + { + "epoch": 1.241427409043357, + "grad_norm": 1.4250234769310843, + "learning_rate": 6.20737755734656e-08, + "loss": 1.1333, + "step": 16020 + }, + { + "epoch": 1.2422023325196636, + "grad_norm": 1.3979488616085667, + "learning_rate": 6.21125232486051e-08, + "loss": 1.1114, + "step": 16030 + }, + { + "epoch": 1.2429772559959704, + "grad_norm": 1.3076553265344755, + "learning_rate": 6.215127092374459e-08, + "loss": 1.1527, + "step": 16040 + }, + { + "epoch": 1.2437521794722772, + "grad_norm": 1.4870735899742153, + "learning_rate": 6.219001859888407e-08, + "loss": 1.1272, + "step": 16050 + }, + { + "epoch": 1.2445271029485838, + "grad_norm": 1.4510855396856888, + "learning_rate": 6.222876627402357e-08, + "loss": 1.1512, + "step": 16060 + }, + { + "epoch": 1.2453020264248906, + "grad_norm": 1.5295409975376921, + "learning_rate": 6.226751394916306e-08, + "loss": 1.1324, + "step": 16070 + }, + { + "epoch": 1.2460769499011972, + "grad_norm": 1.3750048145300269, + "learning_rate": 6.230626162430255e-08, + "loss": 1.1236, + "step": 16080 + }, + { + "epoch": 1.246851873377504, + "grad_norm": 1.4071150384032838, + "learning_rate": 6.234500929944204e-08, + "loss": 1.1384, + "step": 16090 + }, + { + "epoch": 1.2476267968538106, + "grad_norm": 1.3870356280008682, + "learning_rate": 6.238375697458154e-08, + "loss": 1.151, + "step": 16100 + }, + { + "epoch": 1.2484017203301174, + "grad_norm": 1.524185250790658, + "learning_rate": 6.242250464972102e-08, + "loss": 1.1318, + "step": 16110 + }, + { + "epoch": 1.249176643806424, + "grad_norm": 1.3848479358613313, + "learning_rate": 6.246125232486051e-08, + "loss": 1.1414, + "step": 16120 + }, + { + "epoch": 1.2499515672827308, + "grad_norm": 1.4732122793666387, + "learning_rate": 6.250000000000001e-08, + "loss": 1.1521, + "step": 16130 + }, + { + "epoch": 1.2507264907590376, + "grad_norm": 1.4674110028329688, + "learning_rate": 6.25387476751395e-08, + "loss": 1.1431, + "step": 16140 + }, + { + "epoch": 1.2515014142353442, + "grad_norm": 1.3369962563812408, + "learning_rate": 6.257749535027899e-08, + "loss": 1.1303, + "step": 16150 + }, + { + "epoch": 1.252276337711651, + "grad_norm": 1.3722219139409484, + "learning_rate": 6.261624302541848e-08, + "loss": 1.1251, + "step": 16160 + }, + { + "epoch": 1.2530512611879576, + "grad_norm": 1.4607933048201387, + "learning_rate": 6.265499070055798e-08, + "loss": 1.1428, + "step": 16170 + }, + { + "epoch": 1.2538261846642644, + "grad_norm": 1.4512393704393365, + "learning_rate": 6.269373837569746e-08, + "loss": 1.1213, + "step": 16180 + }, + { + "epoch": 1.2546011081405712, + "grad_norm": 1.4140081672482445, + "learning_rate": 6.273248605083696e-08, + "loss": 1.1338, + "step": 16190 + }, + { + "epoch": 1.2553760316168778, + "grad_norm": 1.4150747497835605, + "learning_rate": 6.277123372597645e-08, + "loss": 1.1365, + "step": 16200 + }, + { + "epoch": 1.2561509550931844, + "grad_norm": 1.4091977003469287, + "learning_rate": 6.280998140111595e-08, + "loss": 1.111, + "step": 16210 + }, + { + "epoch": 1.2569258785694912, + "grad_norm": 1.3527273889568456, + "learning_rate": 6.284872907625543e-08, + "loss": 1.1172, + "step": 16220 + }, + { + "epoch": 1.257700802045798, + "grad_norm": 1.345177788131581, + "learning_rate": 6.288747675139493e-08, + "loss": 1.1254, + "step": 16230 + }, + { + "epoch": 1.2584757255221046, + "grad_norm": 1.5404494256521943, + "learning_rate": 6.292622442653442e-08, + "loss": 1.1175, + "step": 16240 + }, + { + "epoch": 1.2592506489984114, + "grad_norm": 1.3737415585598842, + "learning_rate": 6.29649721016739e-08, + "loss": 1.1357, + "step": 16250 + }, + { + "epoch": 1.260025572474718, + "grad_norm": 1.3522587769107635, + "learning_rate": 6.30037197768134e-08, + "loss": 1.1388, + "step": 16260 + }, + { + "epoch": 1.2608004959510248, + "grad_norm": 1.2601679551076164, + "learning_rate": 6.30424674519529e-08, + "loss": 1.1286, + "step": 16270 + }, + { + "epoch": 1.2615754194273316, + "grad_norm": 1.388522214413641, + "learning_rate": 6.308121512709238e-08, + "loss": 1.0916, + "step": 16280 + }, + { + "epoch": 1.2623503429036382, + "grad_norm": 1.434324223418016, + "learning_rate": 6.311996280223187e-08, + "loss": 1.1306, + "step": 16290 + }, + { + "epoch": 1.263125266379945, + "grad_norm": 1.4345523484047695, + "learning_rate": 6.315871047737137e-08, + "loss": 1.1344, + "step": 16300 + }, + { + "epoch": 1.2639001898562516, + "grad_norm": 1.5077249840723836, + "learning_rate": 6.319745815251086e-08, + "loss": 1.1441, + "step": 16310 + }, + { + "epoch": 1.2646751133325584, + "grad_norm": 1.3807957425305855, + "learning_rate": 6.323620582765035e-08, + "loss": 1.1447, + "step": 16320 + }, + { + "epoch": 1.2654500368088653, + "grad_norm": 1.335616949229422, + "learning_rate": 6.327495350278984e-08, + "loss": 1.1275, + "step": 16330 + }, + { + "epoch": 1.2662249602851718, + "grad_norm": 1.3691311554798953, + "learning_rate": 6.331370117792934e-08, + "loss": 1.1332, + "step": 16340 + }, + { + "epoch": 1.2669998837614784, + "grad_norm": 1.2983432709933977, + "learning_rate": 6.335244885306882e-08, + "loss": 1.1525, + "step": 16350 + }, + { + "epoch": 1.2677748072377852, + "grad_norm": 1.6260723897989542, + "learning_rate": 6.339119652820831e-08, + "loss": 1.1368, + "step": 16360 + }, + { + "epoch": 1.268549730714092, + "grad_norm": 1.4072309640559157, + "learning_rate": 6.342994420334781e-08, + "loss": 1.1225, + "step": 16370 + }, + { + "epoch": 1.2693246541903986, + "grad_norm": 1.4356277434582985, + "learning_rate": 6.34686918784873e-08, + "loss": 1.1378, + "step": 16380 + }, + { + "epoch": 1.2700995776667054, + "grad_norm": 1.3807195915265533, + "learning_rate": 6.350743955362679e-08, + "loss": 1.1511, + "step": 16390 + }, + { + "epoch": 1.270874501143012, + "grad_norm": 1.7839905331713781, + "learning_rate": 6.354618722876628e-08, + "loss": 1.1446, + "step": 16400 + }, + { + "epoch": 1.2716494246193188, + "grad_norm": 4.155712690855731, + "learning_rate": 6.358493490390578e-08, + "loss": 1.1091, + "step": 16410 + }, + { + "epoch": 1.2724243480956257, + "grad_norm": 1.3939763354356094, + "learning_rate": 6.362368257904526e-08, + "loss": 1.1602, + "step": 16420 + }, + { + "epoch": 1.2731992715719322, + "grad_norm": 1.3591795444842787, + "learning_rate": 6.366243025418476e-08, + "loss": 1.1384, + "step": 16430 + }, + { + "epoch": 1.273974195048239, + "grad_norm": 1.5324786454899617, + "learning_rate": 6.370117792932425e-08, + "loss": 1.1293, + "step": 16440 + }, + { + "epoch": 1.2747491185245456, + "grad_norm": 1.5010662613680352, + "learning_rate": 6.373992560446373e-08, + "loss": 1.1443, + "step": 16450 + }, + { + "epoch": 1.2755240420008525, + "grad_norm": 1.4377233400788683, + "learning_rate": 6.377867327960323e-08, + "loss": 1.1614, + "step": 16460 + }, + { + "epoch": 1.2762989654771593, + "grad_norm": 2.5336392671491437, + "learning_rate": 6.381742095474273e-08, + "loss": 1.1046, + "step": 16470 + }, + { + "epoch": 1.2770738889534659, + "grad_norm": 1.2709414947724866, + "learning_rate": 6.385616862988222e-08, + "loss": 1.1056, + "step": 16480 + }, + { + "epoch": 1.2778488124297724, + "grad_norm": 1.4264398268872573, + "learning_rate": 6.38949163050217e-08, + "loss": 1.1518, + "step": 16490 + }, + { + "epoch": 1.2786237359060793, + "grad_norm": 1.458995510977672, + "learning_rate": 6.39336639801612e-08, + "loss": 1.1203, + "step": 16500 + }, + { + "epoch": 1.2786237359060793, + "eval_loss": 1.1331018209457397, + "eval_runtime": 319.7586, + "eval_samples_per_second": 35.874, + "eval_steps_per_second": 8.969, + "step": 16500 + }, + { + "epoch": 1.279398659382386, + "grad_norm": 1.3210427185696325, + "learning_rate": 6.39724116553007e-08, + "loss": 1.158, + "step": 16510 + }, + { + "epoch": 1.2801735828586926, + "grad_norm": 1.4418210021497746, + "learning_rate": 6.401115933044018e-08, + "loss": 1.1694, + "step": 16520 + }, + { + "epoch": 1.2809485063349995, + "grad_norm": 1.3916825231370273, + "learning_rate": 6.404990700557967e-08, + "loss": 1.1253, + "step": 16530 + }, + { + "epoch": 1.281723429811306, + "grad_norm": 1.3105229035036627, + "learning_rate": 6.408865468071917e-08, + "loss": 1.1318, + "step": 16540 + }, + { + "epoch": 1.2824983532876129, + "grad_norm": 1.462361088281064, + "learning_rate": 6.412740235585866e-08, + "loss": 1.1482, + "step": 16550 + }, + { + "epoch": 1.2832732767639197, + "grad_norm": 1.3972042294653222, + "learning_rate": 6.416615003099815e-08, + "loss": 1.1246, + "step": 16560 + }, + { + "epoch": 1.2840482002402263, + "grad_norm": 1.3860924564481252, + "learning_rate": 6.420489770613764e-08, + "loss": 1.1386, + "step": 16570 + }, + { + "epoch": 1.284823123716533, + "grad_norm": 1.4403022731550386, + "learning_rate": 6.424364538127714e-08, + "loss": 1.1552, + "step": 16580 + }, + { + "epoch": 1.2855980471928397, + "grad_norm": 1.4226064127630245, + "learning_rate": 6.428239305641662e-08, + "loss": 1.1598, + "step": 16590 + }, + { + "epoch": 1.2863729706691465, + "grad_norm": 1.3806600244811267, + "learning_rate": 6.432114073155611e-08, + "loss": 1.1195, + "step": 16600 + }, + { + "epoch": 1.2871478941454533, + "grad_norm": 1.4894178324900464, + "learning_rate": 6.435988840669561e-08, + "loss": 1.1185, + "step": 16610 + }, + { + "epoch": 1.2879228176217599, + "grad_norm": 1.3880417051324276, + "learning_rate": 6.439863608183509e-08, + "loss": 1.1505, + "step": 16620 + }, + { + "epoch": 1.2886977410980665, + "grad_norm": 1.4630869689358692, + "learning_rate": 6.443738375697459e-08, + "loss": 1.1356, + "step": 16630 + }, + { + "epoch": 1.2894726645743733, + "grad_norm": 1.352975041370612, + "learning_rate": 6.447613143211408e-08, + "loss": 1.1397, + "step": 16640 + }, + { + "epoch": 1.29024758805068, + "grad_norm": 1.4854684175449497, + "learning_rate": 6.451487910725358e-08, + "loss": 1.1342, + "step": 16650 + }, + { + "epoch": 1.2910225115269867, + "grad_norm": 1.4521921222692875, + "learning_rate": 6.455362678239306e-08, + "loss": 1.1172, + "step": 16660 + }, + { + "epoch": 1.2917974350032935, + "grad_norm": 1.2693279044480656, + "learning_rate": 6.459237445753256e-08, + "loss": 1.1285, + "step": 16670 + }, + { + "epoch": 1.2925723584796, + "grad_norm": 1.4145995307609458, + "learning_rate": 6.463112213267205e-08, + "loss": 1.1313, + "step": 16680 + }, + { + "epoch": 1.2933472819559069, + "grad_norm": 1.3804285492278563, + "learning_rate": 6.466986980781153e-08, + "loss": 1.1461, + "step": 16690 + }, + { + "epoch": 1.2941222054322137, + "grad_norm": 1.3315501058720651, + "learning_rate": 6.470861748295103e-08, + "loss": 1.1281, + "step": 16700 + }, + { + "epoch": 1.2948971289085203, + "grad_norm": 1.820444284173459, + "learning_rate": 6.474736515809052e-08, + "loss": 1.1712, + "step": 16710 + }, + { + "epoch": 1.295672052384827, + "grad_norm": 1.4825553484533538, + "learning_rate": 6.478611283323002e-08, + "loss": 1.1162, + "step": 16720 + }, + { + "epoch": 1.2964469758611337, + "grad_norm": 1.4359374617173795, + "learning_rate": 6.48248605083695e-08, + "loss": 1.1223, + "step": 16730 + }, + { + "epoch": 1.2972218993374405, + "grad_norm": 1.3950008675124728, + "learning_rate": 6.4863608183509e-08, + "loss": 1.1248, + "step": 16740 + }, + { + "epoch": 1.297996822813747, + "grad_norm": 2.272533031326909, + "learning_rate": 6.49023558586485e-08, + "loss": 1.1344, + "step": 16750 + }, + { + "epoch": 1.2987717462900539, + "grad_norm": 1.4198794760838416, + "learning_rate": 6.494110353378798e-08, + "loss": 1.1237, + "step": 16760 + }, + { + "epoch": 1.2995466697663605, + "grad_norm": 1.3292365330745268, + "learning_rate": 6.497985120892747e-08, + "loss": 1.137, + "step": 16770 + }, + { + "epoch": 1.3003215932426673, + "grad_norm": 1.414409425669799, + "learning_rate": 6.501859888406697e-08, + "loss": 1.1562, + "step": 16780 + }, + { + "epoch": 1.301096516718974, + "grad_norm": 1.3436687071831108, + "learning_rate": 6.505734655920645e-08, + "loss": 1.1474, + "step": 16790 + }, + { + "epoch": 1.3018714401952807, + "grad_norm": 1.5222715270383493, + "learning_rate": 6.509609423434594e-08, + "loss": 1.1291, + "step": 16800 + }, + { + "epoch": 1.3026463636715875, + "grad_norm": 1.3880767765455932, + "learning_rate": 6.513484190948544e-08, + "loss": 1.131, + "step": 16810 + }, + { + "epoch": 1.303421287147894, + "grad_norm": 1.4643625013141566, + "learning_rate": 6.517358958462494e-08, + "loss": 1.1538, + "step": 16820 + }, + { + "epoch": 1.3041962106242009, + "grad_norm": 1.4305716143551463, + "learning_rate": 6.521233725976442e-08, + "loss": 1.1375, + "step": 16830 + }, + { + "epoch": 1.3049711341005077, + "grad_norm": 1.4508425528315767, + "learning_rate": 6.525108493490391e-08, + "loss": 1.1533, + "step": 16840 + }, + { + "epoch": 1.3057460575768143, + "grad_norm": 1.31349511487263, + "learning_rate": 6.528983261004341e-08, + "loss": 1.1229, + "step": 16850 + }, + { + "epoch": 1.3065209810531209, + "grad_norm": 1.3477599014289263, + "learning_rate": 6.532858028518289e-08, + "loss": 1.1531, + "step": 16860 + }, + { + "epoch": 1.3072959045294277, + "grad_norm": 1.542992864845555, + "learning_rate": 6.536732796032239e-08, + "loss": 1.111, + "step": 16870 + }, + { + "epoch": 1.3080708280057345, + "grad_norm": 1.6671675201266307, + "learning_rate": 6.540607563546188e-08, + "loss": 1.1325, + "step": 16880 + }, + { + "epoch": 1.308845751482041, + "grad_norm": 1.3699799039068832, + "learning_rate": 6.544482331060138e-08, + "loss": 1.129, + "step": 16890 + }, + { + "epoch": 1.3096206749583479, + "grad_norm": 1.3899581948185815, + "learning_rate": 6.548357098574086e-08, + "loss": 1.1478, + "step": 16900 + }, + { + "epoch": 1.3103955984346545, + "grad_norm": 1.3428451685667266, + "learning_rate": 6.552231866088036e-08, + "loss": 1.1351, + "step": 16910 + }, + { + "epoch": 1.3111705219109613, + "grad_norm": 1.361626604969262, + "learning_rate": 6.556106633601985e-08, + "loss": 1.126, + "step": 16920 + }, + { + "epoch": 1.311945445387268, + "grad_norm": 2.503583858546546, + "learning_rate": 6.559981401115933e-08, + "loss": 1.117, + "step": 16930 + }, + { + "epoch": 1.3127203688635747, + "grad_norm": 1.4605710264231917, + "learning_rate": 6.563856168629883e-08, + "loss": 1.1243, + "step": 16940 + }, + { + "epoch": 1.3134952923398815, + "grad_norm": 1.6566171594034649, + "learning_rate": 6.567730936143832e-08, + "loss": 1.1486, + "step": 16950 + }, + { + "epoch": 1.314270215816188, + "grad_norm": 1.7591717684154724, + "learning_rate": 6.571605703657781e-08, + "loss": 1.13, + "step": 16960 + }, + { + "epoch": 1.315045139292495, + "grad_norm": 1.3921007551242264, + "learning_rate": 6.57548047117173e-08, + "loss": 1.154, + "step": 16970 + }, + { + "epoch": 1.3158200627688017, + "grad_norm": 1.348042266402069, + "learning_rate": 6.57935523868568e-08, + "loss": 1.1218, + "step": 16980 + }, + { + "epoch": 1.3165949862451083, + "grad_norm": 1.381349590737265, + "learning_rate": 6.583230006199629e-08, + "loss": 1.1356, + "step": 16990 + }, + { + "epoch": 1.3173699097214149, + "grad_norm": 1.3174558609667808, + "learning_rate": 6.587104773713578e-08, + "loss": 1.1347, + "step": 17000 + }, + { + "epoch": 1.3173699097214149, + "eval_loss": 1.127612829208374, + "eval_runtime": 319.6849, + "eval_samples_per_second": 35.882, + "eval_steps_per_second": 8.971, + "step": 17000 + }, + { + "epoch": 1.3181448331977217, + "grad_norm": 1.3312133984231944, + "learning_rate": 6.590979541227527e-08, + "loss": 1.1173, + "step": 17010 + }, + { + "epoch": 1.3189197566740285, + "grad_norm": 1.3726434839450474, + "learning_rate": 6.594854308741477e-08, + "loss": 1.1542, + "step": 17020 + }, + { + "epoch": 1.319694680150335, + "grad_norm": 1.4386619161766154, + "learning_rate": 6.598729076255425e-08, + "loss": 1.1295, + "step": 17030 + }, + { + "epoch": 1.320469603626642, + "grad_norm": 1.4622335255923309, + "learning_rate": 6.602603843769374e-08, + "loss": 1.127, + "step": 17040 + }, + { + "epoch": 1.3212445271029485, + "grad_norm": 1.4836366751123666, + "learning_rate": 6.606478611283324e-08, + "loss": 1.1171, + "step": 17050 + }, + { + "epoch": 1.3220194505792553, + "grad_norm": 1.3434126654565115, + "learning_rate": 6.610353378797274e-08, + "loss": 1.1049, + "step": 17060 + }, + { + "epoch": 1.322794374055562, + "grad_norm": 1.500257682632651, + "learning_rate": 6.614228146311222e-08, + "loss": 1.1235, + "step": 17070 + }, + { + "epoch": 1.3235692975318687, + "grad_norm": 1.4699307693435435, + "learning_rate": 6.618102913825171e-08, + "loss": 1.1821, + "step": 17080 + }, + { + "epoch": 1.3243442210081755, + "grad_norm": 1.3815147090863795, + "learning_rate": 6.621977681339121e-08, + "loss": 1.1072, + "step": 17090 + }, + { + "epoch": 1.325119144484482, + "grad_norm": 1.528117673552901, + "learning_rate": 6.625852448853069e-08, + "loss": 1.1422, + "step": 17100 + }, + { + "epoch": 1.325894067960789, + "grad_norm": 1.3638292158852918, + "learning_rate": 6.629727216367019e-08, + "loss": 1.1045, + "step": 17110 + }, + { + "epoch": 1.3266689914370957, + "grad_norm": 1.5054519396226433, + "learning_rate": 6.633601983880968e-08, + "loss": 1.1482, + "step": 17120 + }, + { + "epoch": 1.3274439149134023, + "grad_norm": 1.3304679410341582, + "learning_rate": 6.637476751394918e-08, + "loss": 1.1472, + "step": 17130 + }, + { + "epoch": 1.328218838389709, + "grad_norm": 1.373013127031986, + "learning_rate": 6.641351518908866e-08, + "loss": 1.1371, + "step": 17140 + }, + { + "epoch": 1.3289937618660157, + "grad_norm": 2.4340179651509843, + "learning_rate": 6.645226286422816e-08, + "loss": 1.1436, + "step": 17150 + }, + { + "epoch": 1.3297686853423225, + "grad_norm": 1.3956812322941723, + "learning_rate": 6.649101053936765e-08, + "loss": 1.127, + "step": 17160 + }, + { + "epoch": 1.330543608818629, + "grad_norm": 1.3477874451278384, + "learning_rate": 6.652975821450713e-08, + "loss": 1.1064, + "step": 17170 + }, + { + "epoch": 1.331318532294936, + "grad_norm": 1.5614076152592549, + "learning_rate": 6.656850588964663e-08, + "loss": 1.1225, + "step": 17180 + }, + { + "epoch": 1.3320934557712425, + "grad_norm": 1.3838995491840484, + "learning_rate": 6.660725356478612e-08, + "loss": 1.1073, + "step": 17190 + }, + { + "epoch": 1.3328683792475493, + "grad_norm": 1.3813770245788, + "learning_rate": 6.66460012399256e-08, + "loss": 1.122, + "step": 17200 + }, + { + "epoch": 1.3336433027238561, + "grad_norm": 1.3405137082105054, + "learning_rate": 6.66847489150651e-08, + "loss": 1.1186, + "step": 17210 + }, + { + "epoch": 1.3344182262001627, + "grad_norm": 1.4199937164992462, + "learning_rate": 6.67234965902046e-08, + "loss": 1.1193, + "step": 17220 + }, + { + "epoch": 1.3351931496764695, + "grad_norm": 1.3476570561012167, + "learning_rate": 6.676224426534409e-08, + "loss": 1.1308, + "step": 17230 + }, + { + "epoch": 1.335968073152776, + "grad_norm": 1.4640997638841182, + "learning_rate": 6.680099194048358e-08, + "loss": 1.1443, + "step": 17240 + }, + { + "epoch": 1.336742996629083, + "grad_norm": 1.893619705299882, + "learning_rate": 6.683973961562307e-08, + "loss": 1.1385, + "step": 17250 + }, + { + "epoch": 1.3375179201053897, + "grad_norm": 1.370213210542643, + "learning_rate": 6.687848729076257e-08, + "loss": 1.1535, + "step": 17260 + }, + { + "epoch": 1.3382928435816963, + "grad_norm": 1.3940033286557931, + "learning_rate": 6.691723496590205e-08, + "loss": 1.1299, + "step": 17270 + }, + { + "epoch": 1.339067767058003, + "grad_norm": 1.3416214881965658, + "learning_rate": 6.695598264104154e-08, + "loss": 1.1294, + "step": 17280 + }, + { + "epoch": 1.3398426905343097, + "grad_norm": 1.5446750989335398, + "learning_rate": 6.699473031618104e-08, + "loss": 1.1353, + "step": 17290 + }, + { + "epoch": 1.3406176140106165, + "grad_norm": 1.3766803281431272, + "learning_rate": 6.703347799132054e-08, + "loss": 1.1177, + "step": 17300 + }, + { + "epoch": 1.341392537486923, + "grad_norm": 1.3872908053863655, + "learning_rate": 6.707222566646002e-08, + "loss": 1.1229, + "step": 17310 + }, + { + "epoch": 1.34216746096323, + "grad_norm": 1.3672006958869658, + "learning_rate": 6.711097334159951e-08, + "loss": 1.1365, + "step": 17320 + }, + { + "epoch": 1.3429423844395365, + "grad_norm": 1.3955718932345593, + "learning_rate": 6.714972101673901e-08, + "loss": 1.1219, + "step": 17330 + }, + { + "epoch": 1.3437173079158433, + "grad_norm": 1.4111537881126983, + "learning_rate": 6.718846869187849e-08, + "loss": 1.1342, + "step": 17340 + }, + { + "epoch": 1.3444922313921501, + "grad_norm": 1.4351593855951537, + "learning_rate": 6.722721636701799e-08, + "loss": 1.1003, + "step": 17350 + }, + { + "epoch": 1.3452671548684567, + "grad_norm": 1.4524790222522528, + "learning_rate": 6.726596404215748e-08, + "loss": 1.1296, + "step": 17360 + }, + { + "epoch": 1.3460420783447635, + "grad_norm": 1.3091813095751692, + "learning_rate": 6.730471171729696e-08, + "loss": 1.1216, + "step": 17370 + }, + { + "epoch": 1.3468170018210701, + "grad_norm": 1.383498106609951, + "learning_rate": 6.734345939243646e-08, + "loss": 1.1286, + "step": 17380 + }, + { + "epoch": 1.347591925297377, + "grad_norm": 1.4733139076340602, + "learning_rate": 6.738220706757595e-08, + "loss": 1.1449, + "step": 17390 + }, + { + "epoch": 1.3483668487736837, + "grad_norm": 1.3445768539005476, + "learning_rate": 6.742095474271545e-08, + "loss": 1.1492, + "step": 17400 + }, + { + "epoch": 1.3491417722499903, + "grad_norm": 1.397604489172335, + "learning_rate": 6.745970241785493e-08, + "loss": 1.1273, + "step": 17410 + }, + { + "epoch": 1.349916695726297, + "grad_norm": 1.3928854212822501, + "learning_rate": 6.749845009299443e-08, + "loss": 1.1081, + "step": 17420 + }, + { + "epoch": 1.3506916192026037, + "grad_norm": 1.6981327773885961, + "learning_rate": 6.753719776813392e-08, + "loss": 1.1201, + "step": 17430 + }, + { + "epoch": 1.3514665426789105, + "grad_norm": 1.3654913327347902, + "learning_rate": 6.75759454432734e-08, + "loss": 1.1298, + "step": 17440 + }, + { + "epoch": 1.3522414661552171, + "grad_norm": 1.328512641481709, + "learning_rate": 6.76146931184129e-08, + "loss": 1.1163, + "step": 17450 + }, + { + "epoch": 1.353016389631524, + "grad_norm": 1.4832650103997684, + "learning_rate": 6.76534407935524e-08, + "loss": 1.1303, + "step": 17460 + }, + { + "epoch": 1.3537913131078305, + "grad_norm": 1.3082427164265968, + "learning_rate": 6.769218846869189e-08, + "loss": 1.1274, + "step": 17470 + }, + { + "epoch": 1.3545662365841373, + "grad_norm": 1.288169983491863, + "learning_rate": 6.773093614383137e-08, + "loss": 1.1166, + "step": 17480 + }, + { + "epoch": 1.3553411600604441, + "grad_norm": 1.3386556124685434, + "learning_rate": 6.776968381897087e-08, + "loss": 1.0999, + "step": 17490 + }, + { + "epoch": 1.3561160835367507, + "grad_norm": 1.3610193387075584, + "learning_rate": 6.780843149411037e-08, + "loss": 1.1314, + "step": 17500 + }, + { + "epoch": 1.3561160835367507, + "eval_loss": 1.1223490238189697, + "eval_runtime": 321.2109, + "eval_samples_per_second": 35.712, + "eval_steps_per_second": 8.929, + "step": 17500 + }, + { + "epoch": 1.3568910070130575, + "grad_norm": 1.3480379030387266, + "learning_rate": 6.784717916924985e-08, + "loss": 1.1265, + "step": 17510 + }, + { + "epoch": 1.3576659304893641, + "grad_norm": 1.4192025103660386, + "learning_rate": 6.788592684438934e-08, + "loss": 1.1137, + "step": 17520 + }, + { + "epoch": 1.358440853965671, + "grad_norm": 1.4307185987273106, + "learning_rate": 6.792467451952884e-08, + "loss": 1.1299, + "step": 17530 + }, + { + "epoch": 1.3592157774419775, + "grad_norm": 1.3401173547390464, + "learning_rate": 6.796342219466832e-08, + "loss": 1.1109, + "step": 17540 + }, + { + "epoch": 1.3599907009182843, + "grad_norm": 1.432918925854167, + "learning_rate": 6.800216986980782e-08, + "loss": 1.1432, + "step": 17550 + }, + { + "epoch": 1.360765624394591, + "grad_norm": 1.561480592243417, + "learning_rate": 6.804091754494731e-08, + "loss": 1.1299, + "step": 17560 + }, + { + "epoch": 1.3615405478708977, + "grad_norm": 1.4401146705425594, + "learning_rate": 6.807966522008681e-08, + "loss": 1.1329, + "step": 17570 + }, + { + "epoch": 1.3623154713472045, + "grad_norm": 1.3179790403918616, + "learning_rate": 6.811841289522629e-08, + "loss": 1.1329, + "step": 17580 + }, + { + "epoch": 1.3630903948235111, + "grad_norm": 1.4606728506060194, + "learning_rate": 6.815716057036579e-08, + "loss": 1.147, + "step": 17590 + }, + { + "epoch": 1.363865318299818, + "grad_norm": 1.4309201300632726, + "learning_rate": 6.819590824550528e-08, + "loss": 1.1201, + "step": 17600 + }, + { + "epoch": 1.3646402417761245, + "grad_norm": 1.7669001849991386, + "learning_rate": 6.823465592064476e-08, + "loss": 1.1362, + "step": 17610 + }, + { + "epoch": 1.3654151652524313, + "grad_norm": 1.2878183493823807, + "learning_rate": 6.827340359578426e-08, + "loss": 1.1075, + "step": 17620 + }, + { + "epoch": 1.3661900887287382, + "grad_norm": 1.3363350935877338, + "learning_rate": 6.831215127092375e-08, + "loss": 1.0926, + "step": 17630 + }, + { + "epoch": 1.3669650122050447, + "grad_norm": 1.4554606293203296, + "learning_rate": 6.835089894606325e-08, + "loss": 1.1235, + "step": 17640 + }, + { + "epoch": 1.3677399356813515, + "grad_norm": 1.5737254546535921, + "learning_rate": 6.838964662120273e-08, + "loss": 1.1088, + "step": 17650 + }, + { + "epoch": 1.3685148591576581, + "grad_norm": 1.4547668511283343, + "learning_rate": 6.842839429634223e-08, + "loss": 1.1397, + "step": 17660 + }, + { + "epoch": 1.369289782633965, + "grad_norm": 1.6178188109239946, + "learning_rate": 6.846714197148172e-08, + "loss": 1.1382, + "step": 17670 + }, + { + "epoch": 1.3700647061102715, + "grad_norm": 1.4450917485968005, + "learning_rate": 6.85058896466212e-08, + "loss": 1.1256, + "step": 17680 + }, + { + "epoch": 1.3708396295865783, + "grad_norm": 1.307505681666164, + "learning_rate": 6.85446373217607e-08, + "loss": 1.1665, + "step": 17690 + }, + { + "epoch": 1.371614553062885, + "grad_norm": 1.4238840737875509, + "learning_rate": 6.85833849969002e-08, + "loss": 1.1111, + "step": 17700 + }, + { + "epoch": 1.3723894765391917, + "grad_norm": 1.389602324780525, + "learning_rate": 6.862213267203968e-08, + "loss": 1.1192, + "step": 17710 + }, + { + "epoch": 1.3731644000154986, + "grad_norm": 1.3851036621170538, + "learning_rate": 6.866088034717917e-08, + "loss": 1.1104, + "step": 17720 + }, + { + "epoch": 1.3739393234918051, + "grad_norm": 1.3090966337045735, + "learning_rate": 6.869962802231867e-08, + "loss": 1.1198, + "step": 17730 + }, + { + "epoch": 1.374714246968112, + "grad_norm": 1.405745576491021, + "learning_rate": 6.873837569745817e-08, + "loss": 1.1104, + "step": 17740 + }, + { + "epoch": 1.3754891704444185, + "grad_norm": 1.3844175824031717, + "learning_rate": 6.877712337259765e-08, + "loss": 1.1046, + "step": 17750 + }, + { + "epoch": 1.3762640939207254, + "grad_norm": 1.325781756255059, + "learning_rate": 6.881587104773714e-08, + "loss": 1.1204, + "step": 17760 + }, + { + "epoch": 1.3770390173970322, + "grad_norm": 1.4031618951758862, + "learning_rate": 6.885461872287664e-08, + "loss": 1.1281, + "step": 17770 + }, + { + "epoch": 1.3778139408733387, + "grad_norm": 1.3320262132030172, + "learning_rate": 6.889336639801612e-08, + "loss": 1.1177, + "step": 17780 + }, + { + "epoch": 1.3785888643496453, + "grad_norm": 1.3326882293424505, + "learning_rate": 6.893211407315562e-08, + "loss": 1.127, + "step": 17790 + }, + { + "epoch": 1.3793637878259521, + "grad_norm": 1.428186602825759, + "learning_rate": 6.897086174829511e-08, + "loss": 1.104, + "step": 17800 + }, + { + "epoch": 1.380138711302259, + "grad_norm": 1.852692864721474, + "learning_rate": 6.900960942343461e-08, + "loss": 1.1081, + "step": 17810 + }, + { + "epoch": 1.3809136347785655, + "grad_norm": 1.3268696067088614, + "learning_rate": 6.904835709857409e-08, + "loss": 1.1, + "step": 17820 + }, + { + "epoch": 1.3816885582548724, + "grad_norm": 1.433207358466287, + "learning_rate": 6.908710477371359e-08, + "loss": 1.1032, + "step": 17830 + }, + { + "epoch": 1.382463481731179, + "grad_norm": 1.3470024009832362, + "learning_rate": 6.912585244885308e-08, + "loss": 1.1334, + "step": 17840 + }, + { + "epoch": 1.3832384052074858, + "grad_norm": 1.2732997516701867, + "learning_rate": 6.916460012399256e-08, + "loss": 1.1066, + "step": 17850 + }, + { + "epoch": 1.3840133286837926, + "grad_norm": 1.437470365990163, + "learning_rate": 6.920334779913206e-08, + "loss": 1.1018, + "step": 17860 + }, + { + "epoch": 1.3847882521600992, + "grad_norm": 1.3635360799205294, + "learning_rate": 6.924209547427155e-08, + "loss": 1.1084, + "step": 17870 + }, + { + "epoch": 1.385563175636406, + "grad_norm": 1.4161092905427182, + "learning_rate": 6.928084314941104e-08, + "loss": 1.1109, + "step": 17880 + }, + { + "epoch": 1.3863380991127126, + "grad_norm": 1.3503186210645775, + "learning_rate": 6.931959082455053e-08, + "loss": 1.0967, + "step": 17890 + }, + { + "epoch": 1.3871130225890194, + "grad_norm": 1.3677534794699668, + "learning_rate": 6.935833849969003e-08, + "loss": 1.1301, + "step": 17900 + }, + { + "epoch": 1.3878879460653262, + "grad_norm": 1.350737189026979, + "learning_rate": 6.939708617482952e-08, + "loss": 1.0958, + "step": 17910 + }, + { + "epoch": 1.3886628695416328, + "grad_norm": 1.404042430262787, + "learning_rate": 6.9435833849969e-08, + "loss": 1.13, + "step": 17920 + }, + { + "epoch": 1.3894377930179393, + "grad_norm": 1.4354944122198963, + "learning_rate": 6.94745815251085e-08, + "loss": 1.0999, + "step": 17930 + }, + { + "epoch": 1.3902127164942462, + "grad_norm": 1.3400595085451594, + "learning_rate": 6.9513329200248e-08, + "loss": 1.1343, + "step": 17940 + }, + { + "epoch": 1.390987639970553, + "grad_norm": 1.4259413896728717, + "learning_rate": 6.955207687538748e-08, + "loss": 1.0953, + "step": 17950 + }, + { + "epoch": 1.3917625634468596, + "grad_norm": 1.4373793977979945, + "learning_rate": 6.959082455052697e-08, + "loss": 1.1158, + "step": 17960 + }, + { + "epoch": 1.3925374869231664, + "grad_norm": 1.4563304477612193, + "learning_rate": 6.962957222566647e-08, + "loss": 1.1464, + "step": 17970 + }, + { + "epoch": 1.393312410399473, + "grad_norm": 1.341458311517623, + "learning_rate": 6.966831990080597e-08, + "loss": 1.1002, + "step": 17980 + }, + { + "epoch": 1.3940873338757798, + "grad_norm": 1.383682722575486, + "learning_rate": 6.970706757594545e-08, + "loss": 1.1484, + "step": 17990 + }, + { + "epoch": 1.3948622573520866, + "grad_norm": 1.3933964185504335, + "learning_rate": 6.974581525108494e-08, + "loss": 1.1489, + "step": 18000 + }, + { + "epoch": 1.3948622573520866, + "eval_loss": 1.1172149181365967, + "eval_runtime": 320.0565, + "eval_samples_per_second": 35.841, + "eval_steps_per_second": 8.961, + "step": 18000 + }, + { + "epoch": 1.3956371808283932, + "grad_norm": 1.3785776446572384, + "learning_rate": 6.978456292622444e-08, + "loss": 1.102, + "step": 18010 + }, + { + "epoch": 1.3964121043047, + "grad_norm": 1.3950538490162079, + "learning_rate": 6.982331060136392e-08, + "loss": 1.1018, + "step": 18020 + }, + { + "epoch": 1.3971870277810066, + "grad_norm": 1.3623739552608116, + "learning_rate": 6.986205827650342e-08, + "loss": 1.1309, + "step": 18030 + }, + { + "epoch": 1.3979619512573134, + "grad_norm": 1.4382774009230241, + "learning_rate": 6.990080595164291e-08, + "loss": 1.1148, + "step": 18040 + }, + { + "epoch": 1.3987368747336202, + "grad_norm": 1.3936425096962448, + "learning_rate": 6.993955362678241e-08, + "loss": 1.099, + "step": 18050 + }, + { + "epoch": 1.3995117982099268, + "grad_norm": 1.329262801399352, + "learning_rate": 6.997830130192189e-08, + "loss": 1.1175, + "step": 18060 + }, + { + "epoch": 1.4002867216862334, + "grad_norm": 1.2955865399598856, + "learning_rate": 7.001704897706138e-08, + "loss": 1.0982, + "step": 18070 + }, + { + "epoch": 1.4010616451625402, + "grad_norm": 1.4798246979769407, + "learning_rate": 7.005579665220088e-08, + "loss": 1.1362, + "step": 18080 + }, + { + "epoch": 1.401836568638847, + "grad_norm": 1.3981421635345008, + "learning_rate": 7.009454432734036e-08, + "loss": 1.0924, + "step": 18090 + }, + { + "epoch": 1.4026114921151536, + "grad_norm": 1.3113447964987304, + "learning_rate": 7.013329200247986e-08, + "loss": 1.112, + "step": 18100 + }, + { + "epoch": 1.4033864155914604, + "grad_norm": 2.0050274724597865, + "learning_rate": 7.017203967761935e-08, + "loss": 1.1103, + "step": 18110 + }, + { + "epoch": 1.404161339067767, + "grad_norm": 1.3959773517213323, + "learning_rate": 7.021078735275884e-08, + "loss": 1.1231, + "step": 18120 + }, + { + "epoch": 1.4049362625440738, + "grad_norm": 1.4169927370531588, + "learning_rate": 7.024953502789833e-08, + "loss": 1.1184, + "step": 18130 + }, + { + "epoch": 1.4057111860203806, + "grad_norm": 1.3933386192359667, + "learning_rate": 7.028828270303783e-08, + "loss": 1.1102, + "step": 18140 + }, + { + "epoch": 1.4064861094966872, + "grad_norm": 1.402899689722201, + "learning_rate": 7.032703037817732e-08, + "loss": 1.1087, + "step": 18150 + }, + { + "epoch": 1.407261032972994, + "grad_norm": 1.26475772875576, + "learning_rate": 7.03657780533168e-08, + "loss": 1.1143, + "step": 18160 + }, + { + "epoch": 1.4080359564493006, + "grad_norm": 1.3865150686541459, + "learning_rate": 7.04045257284563e-08, + "loss": 1.1004, + "step": 18170 + }, + { + "epoch": 1.4088108799256074, + "grad_norm": 1.4090555609190503, + "learning_rate": 7.04432734035958e-08, + "loss": 1.1357, + "step": 18180 + }, + { + "epoch": 1.4095858034019142, + "grad_norm": 1.3722655542368143, + "learning_rate": 7.048202107873528e-08, + "loss": 1.0996, + "step": 18190 + }, + { + "epoch": 1.4103607268782208, + "grad_norm": 1.3700351156661277, + "learning_rate": 7.052076875387477e-08, + "loss": 1.1295, + "step": 18200 + }, + { + "epoch": 1.4111356503545274, + "grad_norm": 1.3151606642201987, + "learning_rate": 7.055951642901427e-08, + "loss": 1.0974, + "step": 18210 + }, + { + "epoch": 1.4119105738308342, + "grad_norm": 1.4417985987255875, + "learning_rate": 7.059826410415376e-08, + "loss": 1.1256, + "step": 18220 + }, + { + "epoch": 1.412685497307141, + "grad_norm": 1.3606077267947876, + "learning_rate": 7.063701177929325e-08, + "loss": 1.1207, + "step": 18230 + }, + { + "epoch": 1.4134604207834476, + "grad_norm": 1.3065544893521277, + "learning_rate": 7.067575945443274e-08, + "loss": 1.1321, + "step": 18240 + }, + { + "epoch": 1.4142353442597544, + "grad_norm": 1.3045231647246147, + "learning_rate": 7.071450712957224e-08, + "loss": 1.1052, + "step": 18250 + }, + { + "epoch": 1.415010267736061, + "grad_norm": 1.4878795755977468, + "learning_rate": 7.075325480471172e-08, + "loss": 1.107, + "step": 18260 + }, + { + "epoch": 1.4157851912123678, + "grad_norm": 1.4223257374376952, + "learning_rate": 7.079200247985122e-08, + "loss": 1.144, + "step": 18270 + }, + { + "epoch": 1.4165601146886746, + "grad_norm": 1.3428492598613855, + "learning_rate": 7.083075015499071e-08, + "loss": 1.1161, + "step": 18280 + }, + { + "epoch": 1.4173350381649812, + "grad_norm": 1.3230631438648746, + "learning_rate": 7.08694978301302e-08, + "loss": 1.12, + "step": 18290 + }, + { + "epoch": 1.418109961641288, + "grad_norm": 1.3529292543284375, + "learning_rate": 7.090824550526969e-08, + "loss": 1.1152, + "step": 18300 + }, + { + "epoch": 1.4188848851175946, + "grad_norm": 1.2397474943123814, + "learning_rate": 7.094699318040918e-08, + "loss": 1.1094, + "step": 18310 + }, + { + "epoch": 1.4196598085939014, + "grad_norm": 1.451568970981212, + "learning_rate": 7.098574085554868e-08, + "loss": 1.1083, + "step": 18320 + }, + { + "epoch": 1.420434732070208, + "grad_norm": 1.412259747961268, + "learning_rate": 7.102448853068816e-08, + "loss": 1.1055, + "step": 18330 + }, + { + "epoch": 1.4212096555465148, + "grad_norm": 1.3303647626986683, + "learning_rate": 7.106323620582766e-08, + "loss": 1.1081, + "step": 18340 + }, + { + "epoch": 1.4219845790228214, + "grad_norm": 1.3900007078074126, + "learning_rate": 7.110198388096715e-08, + "loss": 1.1135, + "step": 18350 + }, + { + "epoch": 1.4227595024991282, + "grad_norm": 1.4095476446447588, + "learning_rate": 7.114073155610664e-08, + "loss": 1.1289, + "step": 18360 + }, + { + "epoch": 1.423534425975435, + "grad_norm": 1.3308318397526602, + "learning_rate": 7.117947923124613e-08, + "loss": 1.1049, + "step": 18370 + }, + { + "epoch": 1.4243093494517416, + "grad_norm": 1.4204502768702558, + "learning_rate": 7.121822690638563e-08, + "loss": 1.1182, + "step": 18380 + }, + { + "epoch": 1.4250842729280484, + "grad_norm": 1.419306204890056, + "learning_rate": 7.125697458152512e-08, + "loss": 1.1191, + "step": 18390 + }, + { + "epoch": 1.425859196404355, + "grad_norm": 1.3538994734111567, + "learning_rate": 7.12957222566646e-08, + "loss": 1.1177, + "step": 18400 + }, + { + "epoch": 1.4266341198806618, + "grad_norm": 1.385920902866615, + "learning_rate": 7.13344699318041e-08, + "loss": 1.0882, + "step": 18410 + }, + { + "epoch": 1.4274090433569686, + "grad_norm": 1.4167857427336472, + "learning_rate": 7.13732176069436e-08, + "loss": 1.1089, + "step": 18420 + }, + { + "epoch": 1.4281839668332752, + "grad_norm": 1.3068541527801696, + "learning_rate": 7.141196528208308e-08, + "loss": 1.0943, + "step": 18430 + }, + { + "epoch": 1.428958890309582, + "grad_norm": 1.3249360667642893, + "learning_rate": 7.145071295722257e-08, + "loss": 1.1087, + "step": 18440 + }, + { + "epoch": 1.4297338137858886, + "grad_norm": 1.366863603500205, + "learning_rate": 7.148946063236207e-08, + "loss": 1.1507, + "step": 18450 + }, + { + "epoch": 1.4305087372621954, + "grad_norm": 1.4893598759811983, + "learning_rate": 7.152820830750155e-08, + "loss": 1.1037, + "step": 18460 + }, + { + "epoch": 1.431283660738502, + "grad_norm": 1.536822451348275, + "learning_rate": 7.156695598264105e-08, + "loss": 1.1392, + "step": 18470 + }, + { + "epoch": 1.4320585842148088, + "grad_norm": 1.2876845288462218, + "learning_rate": 7.160570365778054e-08, + "loss": 1.0999, + "step": 18480 + }, + { + "epoch": 1.4328335076911154, + "grad_norm": 1.4050204428177788, + "learning_rate": 7.164445133292004e-08, + "loss": 1.099, + "step": 18490 + }, + { + "epoch": 1.4336084311674222, + "grad_norm": 1.352383904390125, + "learning_rate": 7.168319900805952e-08, + "loss": 1.0959, + "step": 18500 + }, + { + "epoch": 1.4336084311674222, + "eval_loss": 1.1122339963912964, + "eval_runtime": 319.0762, + "eval_samples_per_second": 35.951, + "eval_steps_per_second": 8.988, + "step": 18500 + }, + { + "epoch": 1.434383354643729, + "grad_norm": 1.346192341103638, + "learning_rate": 7.172194668319902e-08, + "loss": 1.156, + "step": 18510 + }, + { + "epoch": 1.4351582781200356, + "grad_norm": 1.4196824154443202, + "learning_rate": 7.176069435833851e-08, + "loss": 1.0909, + "step": 18520 + }, + { + "epoch": 1.4359332015963424, + "grad_norm": 1.3560910811411846, + "learning_rate": 7.179944203347799e-08, + "loss": 1.1067, + "step": 18530 + }, + { + "epoch": 1.436708125072649, + "grad_norm": 1.3987562055026777, + "learning_rate": 7.183818970861749e-08, + "loss": 1.1014, + "step": 18540 + }, + { + "epoch": 1.4374830485489558, + "grad_norm": 1.339103940362661, + "learning_rate": 7.187693738375698e-08, + "loss": 1.1145, + "step": 18550 + }, + { + "epoch": 1.4382579720252626, + "grad_norm": 1.4223744305028256, + "learning_rate": 7.191568505889648e-08, + "loss": 1.0955, + "step": 18560 + }, + { + "epoch": 1.4390328955015692, + "grad_norm": 1.3659975901580939, + "learning_rate": 7.195443273403596e-08, + "loss": 1.1158, + "step": 18570 + }, + { + "epoch": 1.4398078189778758, + "grad_norm": 1.442648853090171, + "learning_rate": 7.199318040917546e-08, + "loss": 1.1139, + "step": 18580 + }, + { + "epoch": 1.4405827424541826, + "grad_norm": 1.3213678208604855, + "learning_rate": 7.203192808431495e-08, + "loss": 1.0976, + "step": 18590 + }, + { + "epoch": 1.4413576659304894, + "grad_norm": 1.4420171604051484, + "learning_rate": 7.207067575945444e-08, + "loss": 1.113, + "step": 18600 + }, + { + "epoch": 1.442132589406796, + "grad_norm": 1.3879505263659297, + "learning_rate": 7.210942343459393e-08, + "loss": 1.1128, + "step": 18610 + }, + { + "epoch": 1.4429075128831028, + "grad_norm": 1.3782551523443674, + "learning_rate": 7.214817110973343e-08, + "loss": 1.1065, + "step": 18620 + }, + { + "epoch": 1.4436824363594094, + "grad_norm": 1.3711118944252296, + "learning_rate": 7.218691878487291e-08, + "loss": 1.1137, + "step": 18630 + }, + { + "epoch": 1.4444573598357162, + "grad_norm": 1.3172524813785498, + "learning_rate": 7.22256664600124e-08, + "loss": 1.1284, + "step": 18640 + }, + { + "epoch": 1.445232283312023, + "grad_norm": 1.321336942718469, + "learning_rate": 7.22644141351519e-08, + "loss": 1.1209, + "step": 18650 + }, + { + "epoch": 1.4460072067883296, + "grad_norm": 1.3965215810851654, + "learning_rate": 7.23031618102914e-08, + "loss": 1.1037, + "step": 18660 + }, + { + "epoch": 1.4467821302646364, + "grad_norm": 1.4760366445444293, + "learning_rate": 7.234190948543088e-08, + "loss": 1.0867, + "step": 18670 + }, + { + "epoch": 1.447557053740943, + "grad_norm": 1.4220609103919941, + "learning_rate": 7.238065716057037e-08, + "loss": 1.0924, + "step": 18680 + }, + { + "epoch": 1.4483319772172498, + "grad_norm": 1.4066822985368317, + "learning_rate": 7.241940483570987e-08, + "loss": 1.1263, + "step": 18690 + }, + { + "epoch": 1.4491069006935566, + "grad_norm": 1.352760183731444, + "learning_rate": 7.245815251084935e-08, + "loss": 1.126, + "step": 18700 + }, + { + "epoch": 1.4498818241698632, + "grad_norm": 1.3337998355527962, + "learning_rate": 7.249690018598885e-08, + "loss": 1.1206, + "step": 18710 + }, + { + "epoch": 1.4506567476461698, + "grad_norm": 1.4985558392037515, + "learning_rate": 7.253564786112834e-08, + "loss": 1.0974, + "step": 18720 + }, + { + "epoch": 1.4514316711224766, + "grad_norm": 1.456056525771984, + "learning_rate": 7.257439553626784e-08, + "loss": 1.1288, + "step": 18730 + }, + { + "epoch": 1.4522065945987834, + "grad_norm": 1.365452300996824, + "learning_rate": 7.261314321140732e-08, + "loss": 1.1172, + "step": 18740 + }, + { + "epoch": 1.45298151807509, + "grad_norm": 1.39990737647484, + "learning_rate": 7.265189088654681e-08, + "loss": 1.0985, + "step": 18750 + }, + { + "epoch": 1.4537564415513968, + "grad_norm": 1.3011804239353681, + "learning_rate": 7.269063856168631e-08, + "loss": 1.139, + "step": 18760 + }, + { + "epoch": 1.4545313650277034, + "grad_norm": 1.3594263528064057, + "learning_rate": 7.272938623682579e-08, + "loss": 1.1049, + "step": 18770 + }, + { + "epoch": 1.4553062885040102, + "grad_norm": 1.3539695305479609, + "learning_rate": 7.276813391196529e-08, + "loss": 1.1393, + "step": 18780 + }, + { + "epoch": 1.456081211980317, + "grad_norm": 1.4088943869501818, + "learning_rate": 7.280688158710478e-08, + "loss": 1.1175, + "step": 18790 + }, + { + "epoch": 1.4568561354566236, + "grad_norm": 1.3518213068413012, + "learning_rate": 7.284562926224427e-08, + "loss": 1.1209, + "step": 18800 + }, + { + "epoch": 1.4576310589329304, + "grad_norm": 1.4191684322259754, + "learning_rate": 7.288437693738376e-08, + "loss": 1.1511, + "step": 18810 + }, + { + "epoch": 1.458405982409237, + "grad_norm": 1.4161854060913879, + "learning_rate": 7.292312461252326e-08, + "loss": 1.1127, + "step": 18820 + }, + { + "epoch": 1.4591809058855438, + "grad_norm": 1.4297974410371952, + "learning_rate": 7.296187228766275e-08, + "loss": 1.1138, + "step": 18830 + }, + { + "epoch": 1.4599558293618506, + "grad_norm": 1.350958569293082, + "learning_rate": 7.300061996280223e-08, + "loss": 1.1092, + "step": 18840 + }, + { + "epoch": 1.4607307528381572, + "grad_norm": 1.4110029583187809, + "learning_rate": 7.303936763794173e-08, + "loss": 1.1031, + "step": 18850 + }, + { + "epoch": 1.4615056763144638, + "grad_norm": 1.3869530435228525, + "learning_rate": 7.307811531308123e-08, + "loss": 1.1155, + "step": 18860 + }, + { + "epoch": 1.4622805997907706, + "grad_norm": 1.2717907542863434, + "learning_rate": 7.311686298822071e-08, + "loss": 1.1137, + "step": 18870 + }, + { + "epoch": 1.4630555232670774, + "grad_norm": 1.26030478394414, + "learning_rate": 7.31556106633602e-08, + "loss": 1.1105, + "step": 18880 + }, + { + "epoch": 1.463830446743384, + "grad_norm": 1.4138242542253838, + "learning_rate": 7.31943583384997e-08, + "loss": 1.1139, + "step": 18890 + }, + { + "epoch": 1.4646053702196908, + "grad_norm": 1.3467116520103017, + "learning_rate": 7.32331060136392e-08, + "loss": 1.1099, + "step": 18900 + }, + { + "epoch": 1.4653802936959974, + "grad_norm": 1.3225968136669761, + "learning_rate": 7.327185368877868e-08, + "loss": 1.0908, + "step": 18910 + }, + { + "epoch": 1.4661552171723042, + "grad_norm": 1.3434441292174764, + "learning_rate": 7.331060136391817e-08, + "loss": 1.1153, + "step": 18920 + }, + { + "epoch": 1.466930140648611, + "grad_norm": 1.3995872004233296, + "learning_rate": 7.334934903905767e-08, + "loss": 1.1191, + "step": 18930 + }, + { + "epoch": 1.4677050641249176, + "grad_norm": 1.4359570719569599, + "learning_rate": 7.338809671419715e-08, + "loss": 1.0979, + "step": 18940 + }, + { + "epoch": 1.4684799876012244, + "grad_norm": 1.3233097414475006, + "learning_rate": 7.342684438933665e-08, + "loss": 1.1074, + "step": 18950 + }, + { + "epoch": 1.469254911077531, + "grad_norm": 1.427874108141418, + "learning_rate": 7.346559206447614e-08, + "loss": 1.1129, + "step": 18960 + }, + { + "epoch": 1.4700298345538378, + "grad_norm": 1.4017112475474915, + "learning_rate": 7.350433973961564e-08, + "loss": 1.1138, + "step": 18970 + }, + { + "epoch": 1.4708047580301447, + "grad_norm": 1.3356392094203333, + "learning_rate": 7.354308741475512e-08, + "loss": 1.0962, + "step": 18980 + }, + { + "epoch": 1.4715796815064512, + "grad_norm": 1.4277469417153732, + "learning_rate": 7.358183508989461e-08, + "loss": 1.1206, + "step": 18990 + }, + { + "epoch": 1.4723546049827578, + "grad_norm": 1.3487301979315471, + "learning_rate": 7.362058276503411e-08, + "loss": 1.0922, + "step": 19000 + }, + { + "epoch": 1.4723546049827578, + "eval_loss": 1.107385277748108, + "eval_runtime": 320.0256, + "eval_samples_per_second": 35.844, + "eval_steps_per_second": 8.962, + "step": 19000 + }, + { + "epoch": 1.4731295284590646, + "grad_norm": 1.3811991520196294, + "learning_rate": 7.365933044017359e-08, + "loss": 1.1193, + "step": 19010 + }, + { + "epoch": 1.4739044519353715, + "grad_norm": 1.4196990469056998, + "learning_rate": 7.369807811531309e-08, + "loss": 1.1377, + "step": 19020 + }, + { + "epoch": 1.474679375411678, + "grad_norm": 1.2759966426460834, + "learning_rate": 7.373682579045258e-08, + "loss": 1.1169, + "step": 19030 + }, + { + "epoch": 1.4754542988879848, + "grad_norm": 1.3738495162127766, + "learning_rate": 7.377557346559207e-08, + "loss": 1.1313, + "step": 19040 + }, + { + "epoch": 1.4762292223642914, + "grad_norm": 1.3694612505305146, + "learning_rate": 7.381432114073156e-08, + "loss": 1.1188, + "step": 19050 + }, + { + "epoch": 1.4770041458405982, + "grad_norm": 1.3247693433551142, + "learning_rate": 7.385306881587106e-08, + "loss": 1.0985, + "step": 19060 + }, + { + "epoch": 1.477779069316905, + "grad_norm": 1.3880554347603702, + "learning_rate": 7.389181649101055e-08, + "loss": 1.1087, + "step": 19070 + }, + { + "epoch": 1.4785539927932116, + "grad_norm": 1.407827584781016, + "learning_rate": 7.393056416615003e-08, + "loss": 1.1073, + "step": 19080 + }, + { + "epoch": 1.4793289162695185, + "grad_norm": 1.2883723173401733, + "learning_rate": 7.396931184128953e-08, + "loss": 1.0985, + "step": 19090 + }, + { + "epoch": 1.480103839745825, + "grad_norm": 1.2987712690371471, + "learning_rate": 7.400805951642903e-08, + "loss": 1.104, + "step": 19100 + }, + { + "epoch": 1.4808787632221319, + "grad_norm": 1.4026637327800606, + "learning_rate": 7.404680719156851e-08, + "loss": 1.1004, + "step": 19110 + }, + { + "epoch": 1.4816536866984387, + "grad_norm": 1.357984022471409, + "learning_rate": 7.4085554866708e-08, + "loss": 1.1066, + "step": 19120 + }, + { + "epoch": 1.4824286101747453, + "grad_norm": 1.995535143008599, + "learning_rate": 7.41243025418475e-08, + "loss": 1.1153, + "step": 19130 + }, + { + "epoch": 1.4832035336510518, + "grad_norm": 1.4251279848796783, + "learning_rate": 7.4163050216987e-08, + "loss": 1.1127, + "step": 19140 + }, + { + "epoch": 1.4839784571273587, + "grad_norm": 1.3832267314229871, + "learning_rate": 7.420179789212648e-08, + "loss": 1.1467, + "step": 19150 + }, + { + "epoch": 1.4847533806036655, + "grad_norm": 1.3196338117966586, + "learning_rate": 7.424054556726597e-08, + "loss": 1.094, + "step": 19160 + }, + { + "epoch": 1.485528304079972, + "grad_norm": 1.2894593568241628, + "learning_rate": 7.427929324240547e-08, + "loss": 1.0978, + "step": 19170 + }, + { + "epoch": 1.4863032275562789, + "grad_norm": 1.3625150899731446, + "learning_rate": 7.431804091754495e-08, + "loss": 1.1227, + "step": 19180 + }, + { + "epoch": 1.4870781510325854, + "grad_norm": 1.7581995326701354, + "learning_rate": 7.435678859268445e-08, + "loss": 1.1162, + "step": 19190 + }, + { + "epoch": 1.4878530745088923, + "grad_norm": 1.3417249858621023, + "learning_rate": 7.439553626782394e-08, + "loss": 1.1049, + "step": 19200 + }, + { + "epoch": 1.488627997985199, + "grad_norm": 1.3476269745130431, + "learning_rate": 7.443428394296342e-08, + "loss": 1.102, + "step": 19210 + }, + { + "epoch": 1.4894029214615057, + "grad_norm": 1.444852233149233, + "learning_rate": 7.447303161810292e-08, + "loss": 1.1115, + "step": 19220 + }, + { + "epoch": 1.4901778449378125, + "grad_norm": 1.5485979319913323, + "learning_rate": 7.451177929324241e-08, + "loss": 1.1305, + "step": 19230 + }, + { + "epoch": 1.490952768414119, + "grad_norm": 1.513358562777132, + "learning_rate": 7.455052696838191e-08, + "loss": 1.1049, + "step": 19240 + }, + { + "epoch": 1.4917276918904259, + "grad_norm": 1.3663868386193743, + "learning_rate": 7.458927464352139e-08, + "loss": 1.0825, + "step": 19250 + }, + { + "epoch": 1.4925026153667325, + "grad_norm": 1.4586282891171884, + "learning_rate": 7.462802231866089e-08, + "loss": 1.1118, + "step": 19260 + }, + { + "epoch": 1.4932775388430393, + "grad_norm": 1.7118388338721584, + "learning_rate": 7.466676999380038e-08, + "loss": 1.1286, + "step": 19270 + }, + { + "epoch": 1.4940524623193459, + "grad_norm": 1.3221608169401033, + "learning_rate": 7.470551766893987e-08, + "loss": 1.0922, + "step": 19280 + }, + { + "epoch": 1.4948273857956527, + "grad_norm": 1.4349946259044901, + "learning_rate": 7.474426534407936e-08, + "loss": 1.097, + "step": 19290 + }, + { + "epoch": 1.4956023092719595, + "grad_norm": 1.341965669411586, + "learning_rate": 7.478301301921886e-08, + "loss": 1.0883, + "step": 19300 + }, + { + "epoch": 1.496377232748266, + "grad_norm": 1.413620001797467, + "learning_rate": 7.482176069435835e-08, + "loss": 1.1402, + "step": 19310 + }, + { + "epoch": 1.4971521562245729, + "grad_norm": 1.4282951313735552, + "learning_rate": 7.486050836949783e-08, + "loss": 1.1094, + "step": 19320 + }, + { + "epoch": 1.4979270797008795, + "grad_norm": 1.379606233146707, + "learning_rate": 7.489925604463733e-08, + "loss": 1.1224, + "step": 19330 + }, + { + "epoch": 1.4987020031771863, + "grad_norm": 1.3372073893207406, + "learning_rate": 7.493800371977682e-08, + "loss": 1.0974, + "step": 19340 + }, + { + "epoch": 1.499476926653493, + "grad_norm": 1.4350739498983571, + "learning_rate": 7.497675139491631e-08, + "loss": 1.1113, + "step": 19350 + }, + { + "epoch": 1.5002518501297997, + "grad_norm": 1.3635110791052718, + "learning_rate": 7.50154990700558e-08, + "loss": 1.0967, + "step": 19360 + }, + { + "epoch": 1.5010267736061063, + "grad_norm": 1.4194189785399305, + "learning_rate": 7.50542467451953e-08, + "loss": 1.1178, + "step": 19370 + }, + { + "epoch": 1.501801697082413, + "grad_norm": 1.3464380639931846, + "learning_rate": 7.509299442033478e-08, + "loss": 1.0917, + "step": 19380 + }, + { + "epoch": 1.5025766205587199, + "grad_norm": 1.4225347180646044, + "learning_rate": 7.513174209547428e-08, + "loss": 1.1047, + "step": 19390 + }, + { + "epoch": 1.5033515440350267, + "grad_norm": 1.4448176028120205, + "learning_rate": 7.517048977061377e-08, + "loss": 1.1094, + "step": 19400 + }, + { + "epoch": 1.5041264675113333, + "grad_norm": 1.3506950745039783, + "learning_rate": 7.520923744575327e-08, + "loss": 1.0835, + "step": 19410 + }, + { + "epoch": 1.5049013909876399, + "grad_norm": 1.5236159221594634, + "learning_rate": 7.524798512089275e-08, + "loss": 1.1377, + "step": 19420 + }, + { + "epoch": 1.5056763144639467, + "grad_norm": 1.4955086426403852, + "learning_rate": 7.528673279603224e-08, + "loss": 1.107, + "step": 19430 + }, + { + "epoch": 1.5064512379402535, + "grad_norm": 1.4122727300207445, + "learning_rate": 7.532548047117174e-08, + "loss": 1.0875, + "step": 19440 + }, + { + "epoch": 1.50722616141656, + "grad_norm": 1.294225948083921, + "learning_rate": 7.536422814631122e-08, + "loss": 1.1161, + "step": 19450 + }, + { + "epoch": 1.5080010848928669, + "grad_norm": 1.5522823001317312, + "learning_rate": 7.540297582145072e-08, + "loss": 1.0916, + "step": 19460 + }, + { + "epoch": 1.5087760083691735, + "grad_norm": 1.2858704022672156, + "learning_rate": 7.544172349659021e-08, + "loss": 1.073, + "step": 19470 + }, + { + "epoch": 1.5095509318454803, + "grad_norm": 1.3849886026422105, + "learning_rate": 7.548047117172971e-08, + "loss": 1.0691, + "step": 19480 + }, + { + "epoch": 1.510325855321787, + "grad_norm": 1.3803897342070526, + "learning_rate": 7.551921884686919e-08, + "loss": 1.1003, + "step": 19490 + }, + { + "epoch": 1.5111007787980937, + "grad_norm": 1.321016901610216, + "learning_rate": 7.555796652200869e-08, + "loss": 1.0835, + "step": 19500 + }, + { + "epoch": 1.5111007787980937, + "eval_loss": 1.1027668714523315, + "eval_runtime": 318.3463, + "eval_samples_per_second": 36.033, + "eval_steps_per_second": 9.009, + "step": 19500 + }, + { + "epoch": 1.5118757022744003, + "grad_norm": 1.397817098581279, + "learning_rate": 7.559671419714818e-08, + "loss": 1.121, + "step": 19510 + }, + { + "epoch": 1.512650625750707, + "grad_norm": 1.3910248235071256, + "learning_rate": 7.563546187228766e-08, + "loss": 1.1118, + "step": 19520 + }, + { + "epoch": 1.5134255492270139, + "grad_norm": 1.3569107621282703, + "learning_rate": 7.567420954742716e-08, + "loss": 1.107, + "step": 19530 + }, + { + "epoch": 1.5142004727033207, + "grad_norm": 1.368537134669878, + "learning_rate": 7.571295722256666e-08, + "loss": 1.1112, + "step": 19540 + }, + { + "epoch": 1.5149753961796273, + "grad_norm": 1.4040421490328223, + "learning_rate": 7.575170489770614e-08, + "loss": 1.1116, + "step": 19550 + }, + { + "epoch": 1.5157503196559339, + "grad_norm": 1.3112757859386122, + "learning_rate": 7.579045257284563e-08, + "loss": 1.1198, + "step": 19560 + }, + { + "epoch": 1.5165252431322407, + "grad_norm": 1.393218894970085, + "learning_rate": 7.582920024798513e-08, + "loss": 1.1092, + "step": 19570 + }, + { + "epoch": 1.5173001666085475, + "grad_norm": 1.4467145436330122, + "learning_rate": 7.586794792312462e-08, + "loss": 1.1291, + "step": 19580 + }, + { + "epoch": 1.518075090084854, + "grad_norm": 1.363710466167471, + "learning_rate": 7.590669559826411e-08, + "loss": 1.1011, + "step": 19590 + }, + { + "epoch": 1.5188500135611607, + "grad_norm": 1.4356089479710397, + "learning_rate": 7.59454432734036e-08, + "loss": 1.1047, + "step": 19600 + }, + { + "epoch": 1.5196249370374675, + "grad_norm": 1.3940411692765464, + "learning_rate": 7.59841909485431e-08, + "loss": 1.1089, + "step": 19610 + }, + { + "epoch": 1.5203998605137743, + "grad_norm": 1.3201443298596922, + "learning_rate": 7.602293862368258e-08, + "loss": 1.0956, + "step": 19620 + }, + { + "epoch": 1.521174783990081, + "grad_norm": 1.4411993538936394, + "learning_rate": 7.606168629882208e-08, + "loss": 1.1283, + "step": 19630 + }, + { + "epoch": 1.5219497074663877, + "grad_norm": 1.3970940902812368, + "learning_rate": 7.610043397396157e-08, + "loss": 1.0905, + "step": 19640 + }, + { + "epoch": 1.5227246309426943, + "grad_norm": 1.3078119323511854, + "learning_rate": 7.613918164910107e-08, + "loss": 1.1207, + "step": 19650 + }, + { + "epoch": 1.523499554419001, + "grad_norm": 2.021263166291192, + "learning_rate": 7.617792932424055e-08, + "loss": 1.0968, + "step": 19660 + }, + { + "epoch": 1.524274477895308, + "grad_norm": 1.317182724251208, + "learning_rate": 7.621667699938004e-08, + "loss": 1.0931, + "step": 19670 + }, + { + "epoch": 1.5250494013716147, + "grad_norm": 1.443566992210088, + "learning_rate": 7.625542467451954e-08, + "loss": 1.1032, + "step": 19680 + }, + { + "epoch": 1.5258243248479213, + "grad_norm": 1.388388922965222, + "learning_rate": 7.629417234965902e-08, + "loss": 1.1132, + "step": 19690 + }, + { + "epoch": 1.5265992483242279, + "grad_norm": 1.4265161564623114, + "learning_rate": 7.633292002479852e-08, + "loss": 1.0803, + "step": 19700 + }, + { + "epoch": 1.5273741718005347, + "grad_norm": 1.4056892894793265, + "learning_rate": 7.637166769993801e-08, + "loss": 1.081, + "step": 19710 + }, + { + "epoch": 1.5281490952768415, + "grad_norm": 1.3361031101786651, + "learning_rate": 7.64104153750775e-08, + "loss": 1.0926, + "step": 19720 + }, + { + "epoch": 1.528924018753148, + "grad_norm": 1.3681844006505735, + "learning_rate": 7.644916305021699e-08, + "loss": 1.0964, + "step": 19730 + }, + { + "epoch": 1.5296989422294547, + "grad_norm": 1.435776308508404, + "learning_rate": 7.648791072535649e-08, + "loss": 1.0922, + "step": 19740 + }, + { + "epoch": 1.5304738657057615, + "grad_norm": 1.7482453072417905, + "learning_rate": 7.652665840049598e-08, + "loss": 1.0904, + "step": 19750 + }, + { + "epoch": 1.5312487891820683, + "grad_norm": 1.4547945931146824, + "learning_rate": 7.656540607563546e-08, + "loss": 1.1007, + "step": 19760 + }, + { + "epoch": 1.5320237126583751, + "grad_norm": 1.3080563800723861, + "learning_rate": 7.660415375077496e-08, + "loss": 1.0949, + "step": 19770 + }, + { + "epoch": 1.5327986361346817, + "grad_norm": 1.3292192893003631, + "learning_rate": 7.664290142591446e-08, + "loss": 1.1435, + "step": 19780 + }, + { + "epoch": 1.5335735596109883, + "grad_norm": 1.3249635440968452, + "learning_rate": 7.668164910105394e-08, + "loss": 1.1008, + "step": 19790 + }, + { + "epoch": 1.534348483087295, + "grad_norm": 1.3521227339556743, + "learning_rate": 7.672039677619343e-08, + "loss": 1.1239, + "step": 19800 + }, + { + "epoch": 1.535123406563602, + "grad_norm": 1.349389353224472, + "learning_rate": 7.675914445133293e-08, + "loss": 1.0811, + "step": 19810 + }, + { + "epoch": 1.5358983300399087, + "grad_norm": 1.3396563492140474, + "learning_rate": 7.679789212647242e-08, + "loss": 1.116, + "step": 19820 + }, + { + "epoch": 1.5366732535162153, + "grad_norm": 1.3545803567597459, + "learning_rate": 7.68366398016119e-08, + "loss": 1.1031, + "step": 19830 + }, + { + "epoch": 1.537448176992522, + "grad_norm": 1.4136348083105506, + "learning_rate": 7.68753874767514e-08, + "loss": 1.0983, + "step": 19840 + }, + { + "epoch": 1.5382231004688287, + "grad_norm": 1.3504633814457698, + "learning_rate": 7.69141351518909e-08, + "loss": 1.1068, + "step": 19850 + }, + { + "epoch": 1.5389980239451355, + "grad_norm": 1.3736190576899652, + "learning_rate": 7.695288282703038e-08, + "loss": 1.0904, + "step": 19860 + }, + { + "epoch": 1.539772947421442, + "grad_norm": 1.3923610493310628, + "learning_rate": 7.699163050216988e-08, + "loss": 1.1097, + "step": 19870 + }, + { + "epoch": 1.5405478708977487, + "grad_norm": 1.3464588285487205, + "learning_rate": 7.703037817730937e-08, + "loss": 1.1179, + "step": 19880 + }, + { + "epoch": 1.5413227943740555, + "grad_norm": 1.3201839270866185, + "learning_rate": 7.706912585244885e-08, + "loss": 1.0893, + "step": 19890 + }, + { + "epoch": 1.5420977178503623, + "grad_norm": 1.2351783030458672, + "learning_rate": 7.710787352758835e-08, + "loss": 1.1117, + "step": 19900 + }, + { + "epoch": 1.5428726413266691, + "grad_norm": 1.3779564145962824, + "learning_rate": 7.714662120272784e-08, + "loss": 1.1047, + "step": 19910 + }, + { + "epoch": 1.5436475648029757, + "grad_norm": 1.3472054559311237, + "learning_rate": 7.718536887786734e-08, + "loss": 1.1011, + "step": 19920 + }, + { + "epoch": 1.5444224882792823, + "grad_norm": 1.4574297559593055, + "learning_rate": 7.722411655300682e-08, + "loss": 1.1073, + "step": 19930 + }, + { + "epoch": 1.5451974117555891, + "grad_norm": 1.3729522329143566, + "learning_rate": 7.726286422814632e-08, + "loss": 1.0963, + "step": 19940 + }, + { + "epoch": 1.545972335231896, + "grad_norm": 1.3788132045782842, + "learning_rate": 7.730161190328581e-08, + "loss": 1.0901, + "step": 19950 + }, + { + "epoch": 1.5467472587082025, + "grad_norm": 1.4754392355244268, + "learning_rate": 7.73403595784253e-08, + "loss": 1.0873, + "step": 19960 + }, + { + "epoch": 1.5475221821845093, + "grad_norm": 1.3489887760211607, + "learning_rate": 7.737910725356479e-08, + "loss": 1.1069, + "step": 19970 + }, + { + "epoch": 1.548297105660816, + "grad_norm": 1.3422700315008105, + "learning_rate": 7.741785492870429e-08, + "loss": 1.0884, + "step": 19980 + }, + { + "epoch": 1.5490720291371227, + "grad_norm": 1.3849760339351247, + "learning_rate": 7.745660260384378e-08, + "loss": 1.0979, + "step": 19990 + }, + { + "epoch": 1.5498469526134295, + "grad_norm": 1.3659745036808884, + "learning_rate": 7.749535027898326e-08, + "loss": 1.0946, + "step": 20000 + }, + { + "epoch": 1.5498469526134295, + "eval_loss": 1.0983129739761353, + "eval_runtime": 318.4889, + "eval_samples_per_second": 36.017, + "eval_steps_per_second": 9.005, + "step": 20000 + }, + { + "epoch": 1.5506218760897361, + "grad_norm": 1.4251573572217415, + "learning_rate": 7.753409795412276e-08, + "loss": 1.1061, + "step": 20010 + }, + { + "epoch": 1.5513967995660427, + "grad_norm": 1.3725596252867154, + "learning_rate": 7.757284562926225e-08, + "loss": 1.0843, + "step": 20020 + }, + { + "epoch": 1.5521717230423495, + "grad_norm": 1.5399828227201708, + "learning_rate": 7.761159330440174e-08, + "loss": 1.1002, + "step": 20030 + }, + { + "epoch": 1.5529466465186563, + "grad_norm": 1.4356586903674284, + "learning_rate": 7.765034097954123e-08, + "loss": 1.0812, + "step": 20040 + }, + { + "epoch": 1.5537215699949631, + "grad_norm": 1.3295666248175293, + "learning_rate": 7.768908865468073e-08, + "loss": 1.0626, + "step": 20050 + }, + { + "epoch": 1.5544964934712697, + "grad_norm": 1.3128441038489833, + "learning_rate": 7.772783632982022e-08, + "loss": 1.1082, + "step": 20060 + }, + { + "epoch": 1.5552714169475763, + "grad_norm": 1.3574263693595658, + "learning_rate": 7.77665840049597e-08, + "loss": 1.1009, + "step": 20070 + }, + { + "epoch": 1.5560463404238831, + "grad_norm": 1.3595063467720434, + "learning_rate": 7.78053316800992e-08, + "loss": 1.1211, + "step": 20080 + }, + { + "epoch": 1.55682126390019, + "grad_norm": 1.3733623829511004, + "learning_rate": 7.78440793552387e-08, + "loss": 1.1316, + "step": 20090 + }, + { + "epoch": 1.5575961873764965, + "grad_norm": 1.2731843228585853, + "learning_rate": 7.788282703037818e-08, + "loss": 1.0941, + "step": 20100 + }, + { + "epoch": 1.5583711108528033, + "grad_norm": 1.249822731393116, + "learning_rate": 7.792157470551767e-08, + "loss": 1.0918, + "step": 20110 + }, + { + "epoch": 1.55914603432911, + "grad_norm": 1.4300295890131491, + "learning_rate": 7.796032238065717e-08, + "loss": 1.1152, + "step": 20120 + }, + { + "epoch": 1.5599209578054167, + "grad_norm": 1.300194151995854, + "learning_rate": 7.799907005579665e-08, + "loss": 1.0886, + "step": 20130 + }, + { + "epoch": 1.5606958812817235, + "grad_norm": 1.3533566785709237, + "learning_rate": 7.803781773093615e-08, + "loss": 1.0932, + "step": 20140 + }, + { + "epoch": 1.5614708047580301, + "grad_norm": 1.3496421867891857, + "learning_rate": 7.807656540607564e-08, + "loss": 1.0762, + "step": 20150 + }, + { + "epoch": 1.5622457282343367, + "grad_norm": 1.2912195706941567, + "learning_rate": 7.811531308121514e-08, + "loss": 1.1099, + "step": 20160 + }, + { + "epoch": 1.5630206517106435, + "grad_norm": 1.3677411289329926, + "learning_rate": 7.815406075635462e-08, + "loss": 1.0712, + "step": 20170 + }, + { + "epoch": 1.5637955751869503, + "grad_norm": 1.3610119825483227, + "learning_rate": 7.819280843149412e-08, + "loss": 1.1046, + "step": 20180 + }, + { + "epoch": 1.5645704986632571, + "grad_norm": 1.298064307074768, + "learning_rate": 7.823155610663361e-08, + "loss": 1.0865, + "step": 20190 + }, + { + "epoch": 1.5653454221395637, + "grad_norm": 1.3477810506876293, + "learning_rate": 7.82703037817731e-08, + "loss": 1.1023, + "step": 20200 + }, + { + "epoch": 1.5661203456158703, + "grad_norm": 1.2612063164331275, + "learning_rate": 7.830905145691259e-08, + "loss": 1.0856, + "step": 20210 + }, + { + "epoch": 1.5668952690921771, + "grad_norm": 1.3426594549735758, + "learning_rate": 7.834779913205209e-08, + "loss": 1.0922, + "step": 20220 + }, + { + "epoch": 1.567670192568484, + "grad_norm": 1.4004358939754225, + "learning_rate": 7.838654680719158e-08, + "loss": 1.0906, + "step": 20230 + }, + { + "epoch": 1.5684451160447905, + "grad_norm": 1.2953616811145734, + "learning_rate": 7.842529448233106e-08, + "loss": 1.1093, + "step": 20240 + }, + { + "epoch": 1.5692200395210973, + "grad_norm": 1.2474914505400114, + "learning_rate": 7.846404215747056e-08, + "loss": 1.1131, + "step": 20250 + }, + { + "epoch": 1.569994962997404, + "grad_norm": 1.4151319304187246, + "learning_rate": 7.850278983261005e-08, + "loss": 1.0868, + "step": 20260 + }, + { + "epoch": 1.5707698864737107, + "grad_norm": 1.3308472829385378, + "learning_rate": 7.854153750774954e-08, + "loss": 1.1192, + "step": 20270 + }, + { + "epoch": 1.5715448099500176, + "grad_norm": 1.4208214736677918, + "learning_rate": 7.858028518288903e-08, + "loss": 1.1031, + "step": 20280 + }, + { + "epoch": 1.5723197334263241, + "grad_norm": 1.279563381134807, + "learning_rate": 7.861903285802853e-08, + "loss": 1.0939, + "step": 20290 + }, + { + "epoch": 1.5730946569026307, + "grad_norm": 1.42051782238107, + "learning_rate": 7.865778053316801e-08, + "loss": 1.0982, + "step": 20300 + }, + { + "epoch": 1.5738695803789375, + "grad_norm": 1.3515250747761596, + "learning_rate": 7.86965282083075e-08, + "loss": 1.1162, + "step": 20310 + }, + { + "epoch": 1.5746445038552443, + "grad_norm": 1.3113296802232732, + "learning_rate": 7.8735275883447e-08, + "loss": 1.085, + "step": 20320 + }, + { + "epoch": 1.5754194273315512, + "grad_norm": 1.2965680474855439, + "learning_rate": 7.87740235585865e-08, + "loss": 1.1073, + "step": 20330 + }, + { + "epoch": 1.5761943508078577, + "grad_norm": 1.4504924981359075, + "learning_rate": 7.881277123372598e-08, + "loss": 1.1219, + "step": 20340 + }, + { + "epoch": 1.5769692742841643, + "grad_norm": 1.3375014407138062, + "learning_rate": 7.885151890886547e-08, + "loss": 1.0995, + "step": 20350 + }, + { + "epoch": 1.5777441977604711, + "grad_norm": 1.3814503675506427, + "learning_rate": 7.889026658400497e-08, + "loss": 1.1201, + "step": 20360 + }, + { + "epoch": 1.578519121236778, + "grad_norm": 1.4018224111556479, + "learning_rate": 7.892901425914445e-08, + "loss": 1.1001, + "step": 20370 + }, + { + "epoch": 1.5792940447130845, + "grad_norm": 1.4234587928495177, + "learning_rate": 7.896776193428395e-08, + "loss": 1.1003, + "step": 20380 + }, + { + "epoch": 1.5800689681893911, + "grad_norm": 1.261680131756925, + "learning_rate": 7.900650960942344e-08, + "loss": 1.1342, + "step": 20390 + }, + { + "epoch": 1.580843891665698, + "grad_norm": 1.3230503442219403, + "learning_rate": 7.904525728456294e-08, + "loss": 1.112, + "step": 20400 + }, + { + "epoch": 1.5816188151420048, + "grad_norm": 1.370181117992767, + "learning_rate": 7.908400495970242e-08, + "loss": 1.0943, + "step": 20410 + }, + { + "epoch": 1.5823937386183116, + "grad_norm": 1.3523173664036034, + "learning_rate": 7.912275263484192e-08, + "loss": 1.1063, + "step": 20420 + }, + { + "epoch": 1.5831686620946182, + "grad_norm": 2.5774548091954963, + "learning_rate": 7.916150030998141e-08, + "loss": 1.0991, + "step": 20430 + }, + { + "epoch": 1.5839435855709247, + "grad_norm": 1.3374690257721138, + "learning_rate": 7.92002479851209e-08, + "loss": 1.0881, + "step": 20440 + }, + { + "epoch": 1.5847185090472315, + "grad_norm": 1.3344506968616923, + "learning_rate": 7.923899566026039e-08, + "loss": 1.0897, + "step": 20450 + }, + { + "epoch": 1.5854934325235384, + "grad_norm": 1.4219364282421003, + "learning_rate": 7.927774333539989e-08, + "loss": 1.1048, + "step": 20460 + }, + { + "epoch": 1.5862683559998452, + "grad_norm": 1.3429752442584693, + "learning_rate": 7.931649101053937e-08, + "loss": 1.0745, + "step": 20470 + }, + { + "epoch": 1.5870432794761518, + "grad_norm": 1.451532506309598, + "learning_rate": 7.935523868567886e-08, + "loss": 1.0845, + "step": 20480 + }, + { + "epoch": 1.5878182029524583, + "grad_norm": 1.3213457454522806, + "learning_rate": 7.939398636081836e-08, + "loss": 1.0969, + "step": 20490 + }, + { + "epoch": 1.5885931264287652, + "grad_norm": 1.3089035765078367, + "learning_rate": 7.943273403595785e-08, + "loss": 1.0926, + "step": 20500 + }, + { + "epoch": 1.5885931264287652, + "eval_loss": 1.0940340757369995, + "eval_runtime": 320.3777, + "eval_samples_per_second": 35.805, + "eval_steps_per_second": 8.952, + "step": 20500 + }, + { + "epoch": 1.589368049905072, + "grad_norm": 1.4285366529108932, + "learning_rate": 7.947148171109734e-08, + "loss": 1.0858, + "step": 20510 + }, + { + "epoch": 1.5901429733813786, + "grad_norm": 1.4112164417234314, + "learning_rate": 7.951022938623683e-08, + "loss": 1.1121, + "step": 20520 + }, + { + "epoch": 1.5909178968576851, + "grad_norm": 1.3910957586772186, + "learning_rate": 7.954897706137633e-08, + "loss": 1.1085, + "step": 20530 + }, + { + "epoch": 1.591692820333992, + "grad_norm": 1.6512273566262619, + "learning_rate": 7.958772473651581e-08, + "loss": 1.0805, + "step": 20540 + }, + { + "epoch": 1.5924677438102988, + "grad_norm": 1.3494411980453833, + "learning_rate": 7.96264724116553e-08, + "loss": 1.0974, + "step": 20550 + }, + { + "epoch": 1.5932426672866056, + "grad_norm": 1.3919870250565574, + "learning_rate": 7.96652200867948e-08, + "loss": 1.0841, + "step": 20560 + }, + { + "epoch": 1.5940175907629122, + "grad_norm": 1.40458910714019, + "learning_rate": 7.97039677619343e-08, + "loss": 1.1131, + "step": 20570 + }, + { + "epoch": 1.5947925142392188, + "grad_norm": 1.3555290375810047, + "learning_rate": 7.974271543707378e-08, + "loss": 1.0867, + "step": 20580 + }, + { + "epoch": 1.5955674377155256, + "grad_norm": 1.2361824316186205, + "learning_rate": 7.978146311221327e-08, + "loss": 1.0857, + "step": 20590 + }, + { + "epoch": 1.5963423611918324, + "grad_norm": 1.2847506504755486, + "learning_rate": 7.982021078735277e-08, + "loss": 1.083, + "step": 20600 + }, + { + "epoch": 1.5971172846681392, + "grad_norm": 1.330311893560439, + "learning_rate": 7.985895846249225e-08, + "loss": 1.0931, + "step": 20610 + }, + { + "epoch": 1.5978922081444458, + "grad_norm": 1.3570391353336548, + "learning_rate": 7.989770613763175e-08, + "loss": 1.0674, + "step": 20620 + }, + { + "epoch": 1.5986671316207524, + "grad_norm": 1.420252744806801, + "learning_rate": 7.993645381277124e-08, + "loss": 1.0764, + "step": 20630 + }, + { + "epoch": 1.5994420550970592, + "grad_norm": 1.391031247831753, + "learning_rate": 7.997520148791073e-08, + "loss": 1.1006, + "step": 20640 + }, + { + "epoch": 1.600216978573366, + "grad_norm": 1.4114749674670966, + "learning_rate": 8.001394916305022e-08, + "loss": 1.0964, + "step": 20650 + }, + { + "epoch": 1.6009919020496726, + "grad_norm": 1.4176951119210857, + "learning_rate": 8.005269683818972e-08, + "loss": 1.0969, + "step": 20660 + }, + { + "epoch": 1.6017668255259792, + "grad_norm": 1.3936519890364067, + "learning_rate": 8.009144451332921e-08, + "loss": 1.1147, + "step": 20670 + }, + { + "epoch": 1.602541749002286, + "grad_norm": 1.4140851942570136, + "learning_rate": 8.01301921884687e-08, + "loss": 1.1018, + "step": 20680 + }, + { + "epoch": 1.6033166724785928, + "grad_norm": 1.2763008391560335, + "learning_rate": 8.016893986360819e-08, + "loss": 1.0636, + "step": 20690 + }, + { + "epoch": 1.6040915959548996, + "grad_norm": 1.2282422422819277, + "learning_rate": 8.020768753874768e-08, + "loss": 1.0658, + "step": 20700 + }, + { + "epoch": 1.6048665194312062, + "grad_norm": 1.3752386980240063, + "learning_rate": 8.024643521388717e-08, + "loss": 1.0936, + "step": 20710 + }, + { + "epoch": 1.6056414429075128, + "grad_norm": 1.4230892479469102, + "learning_rate": 8.028518288902666e-08, + "loss": 1.0934, + "step": 20720 + }, + { + "epoch": 1.6064163663838196, + "grad_norm": 1.6791432100065742, + "learning_rate": 8.032393056416616e-08, + "loss": 1.0997, + "step": 20730 + }, + { + "epoch": 1.6071912898601264, + "grad_norm": 1.3288599714149585, + "learning_rate": 8.036267823930565e-08, + "loss": 1.0811, + "step": 20740 + }, + { + "epoch": 1.6079662133364332, + "grad_norm": 1.3653355453669305, + "learning_rate": 8.040142591444514e-08, + "loss": 1.1083, + "step": 20750 + }, + { + "epoch": 1.6087411368127398, + "grad_norm": 1.2900133048972287, + "learning_rate": 8.044017358958463e-08, + "loss": 1.1013, + "step": 20760 + }, + { + "epoch": 1.6095160602890464, + "grad_norm": 1.302479123339873, + "learning_rate": 8.047892126472413e-08, + "loss": 1.0755, + "step": 20770 + }, + { + "epoch": 1.6102909837653532, + "grad_norm": 1.3625037186809819, + "learning_rate": 8.051766893986361e-08, + "loss": 1.1074, + "step": 20780 + }, + { + "epoch": 1.61106590724166, + "grad_norm": 1.2706037382535973, + "learning_rate": 8.05564166150031e-08, + "loss": 1.0757, + "step": 20790 + }, + { + "epoch": 1.6118408307179666, + "grad_norm": 1.3508461288327838, + "learning_rate": 8.05951642901426e-08, + "loss": 1.1136, + "step": 20800 + }, + { + "epoch": 1.6126157541942732, + "grad_norm": 1.4040516399536709, + "learning_rate": 8.063391196528208e-08, + "loss": 1.0982, + "step": 20810 + }, + { + "epoch": 1.61339067767058, + "grad_norm": 1.3154669977430584, + "learning_rate": 8.067265964042158e-08, + "loss": 1.0892, + "step": 20820 + }, + { + "epoch": 1.6141656011468868, + "grad_norm": 1.3582339946236395, + "learning_rate": 8.071140731556107e-08, + "loss": 1.0898, + "step": 20830 + }, + { + "epoch": 1.6149405246231936, + "grad_norm": 1.4213875329528942, + "learning_rate": 8.075015499070057e-08, + "loss": 1.0805, + "step": 20840 + }, + { + "epoch": 1.6157154480995002, + "grad_norm": 1.387364388495007, + "learning_rate": 8.078890266584005e-08, + "loss": 1.1063, + "step": 20850 + }, + { + "epoch": 1.6164903715758068, + "grad_norm": 1.2903464108997407, + "learning_rate": 8.082765034097955e-08, + "loss": 1.1044, + "step": 20860 + }, + { + "epoch": 1.6172652950521136, + "grad_norm": 1.2491937177683248, + "learning_rate": 8.086639801611904e-08, + "loss": 1.0766, + "step": 20870 + }, + { + "epoch": 1.6180402185284204, + "grad_norm": 1.3420344907949255, + "learning_rate": 8.090514569125852e-08, + "loss": 1.0817, + "step": 20880 + }, + { + "epoch": 1.618815142004727, + "grad_norm": 1.390788800165181, + "learning_rate": 8.094389336639802e-08, + "loss": 1.1118, + "step": 20890 + }, + { + "epoch": 1.6195900654810338, + "grad_norm": 1.3854936566960985, + "learning_rate": 8.098264104153752e-08, + "loss": 1.0786, + "step": 20900 + }, + { + "epoch": 1.6203649889573404, + "grad_norm": 1.469846781832145, + "learning_rate": 8.102138871667701e-08, + "loss": 1.0857, + "step": 20910 + }, + { + "epoch": 1.6211399124336472, + "grad_norm": 1.4102907319202032, + "learning_rate": 8.10601363918165e-08, + "loss": 1.0759, + "step": 20920 + }, + { + "epoch": 1.621914835909954, + "grad_norm": 1.321745162989723, + "learning_rate": 8.109888406695599e-08, + "loss": 1.0831, + "step": 20930 + }, + { + "epoch": 1.6226897593862606, + "grad_norm": 1.262868253729103, + "learning_rate": 8.113763174209548e-08, + "loss": 1.0905, + "step": 20940 + }, + { + "epoch": 1.6234646828625672, + "grad_norm": 1.3282438001898536, + "learning_rate": 8.117637941723497e-08, + "loss": 1.0806, + "step": 20950 + }, + { + "epoch": 1.624239606338874, + "grad_norm": 1.3118543515782717, + "learning_rate": 8.121512709237446e-08, + "loss": 1.0846, + "step": 20960 + }, + { + "epoch": 1.6250145298151808, + "grad_norm": 1.3003745557029938, + "learning_rate": 8.125387476751396e-08, + "loss": 1.0953, + "step": 20970 + }, + { + "epoch": 1.6257894532914876, + "grad_norm": 1.280888904139647, + "learning_rate": 8.129262244265345e-08, + "loss": 1.1, + "step": 20980 + }, + { + "epoch": 1.6265643767677942, + "grad_norm": 1.4818060836368747, + "learning_rate": 8.133137011779294e-08, + "loss": 1.1132, + "step": 20990 + }, + { + "epoch": 1.6273393002441008, + "grad_norm": 1.7053948207513212, + "learning_rate": 8.137011779293243e-08, + "loss": 1.1221, + "step": 21000 + }, + { + "epoch": 1.6273393002441008, + "eval_loss": 1.0899423360824585, + "eval_runtime": 319.0922, + "eval_samples_per_second": 35.949, + "eval_steps_per_second": 8.988, + "step": 21000 + }, + { + "epoch": 1.6281142237204076, + "grad_norm": 1.3013862640249425, + "learning_rate": 8.140886546807193e-08, + "loss": 1.0846, + "step": 21010 + }, + { + "epoch": 1.6288891471967144, + "grad_norm": 1.4880019484849596, + "learning_rate": 8.144761314321141e-08, + "loss": 1.1037, + "step": 21020 + }, + { + "epoch": 1.629664070673021, + "grad_norm": 1.3900979737049317, + "learning_rate": 8.14863608183509e-08, + "loss": 1.0933, + "step": 21030 + }, + { + "epoch": 1.6304389941493278, + "grad_norm": 1.363951986442343, + "learning_rate": 8.15251084934904e-08, + "loss": 1.0747, + "step": 21040 + }, + { + "epoch": 1.6312139176256344, + "grad_norm": 1.4216823877559939, + "learning_rate": 8.156385616862988e-08, + "loss": 1.0835, + "step": 21050 + }, + { + "epoch": 1.6319888411019412, + "grad_norm": 1.3985865472390187, + "learning_rate": 8.160260384376938e-08, + "loss": 1.096, + "step": 21060 + }, + { + "epoch": 1.632763764578248, + "grad_norm": 1.333015507559394, + "learning_rate": 8.164135151890887e-08, + "loss": 1.0827, + "step": 21070 + }, + { + "epoch": 1.6335386880545546, + "grad_norm": 1.4390532820117758, + "learning_rate": 8.168009919404837e-08, + "loss": 1.0792, + "step": 21080 + }, + { + "epoch": 1.6343136115308612, + "grad_norm": 1.3915458818775392, + "learning_rate": 8.171884686918785e-08, + "loss": 1.1034, + "step": 21090 + }, + { + "epoch": 1.635088535007168, + "grad_norm": 1.3445652273509474, + "learning_rate": 8.175759454432735e-08, + "loss": 1.074, + "step": 21100 + }, + { + "epoch": 1.6358634584834748, + "grad_norm": 1.267946677284684, + "learning_rate": 8.179634221946684e-08, + "loss": 1.0839, + "step": 21110 + }, + { + "epoch": 1.6366383819597816, + "grad_norm": 1.2578047074910534, + "learning_rate": 8.183508989460632e-08, + "loss": 1.0908, + "step": 21120 + }, + { + "epoch": 1.6374133054360882, + "grad_norm": 1.372263490147757, + "learning_rate": 8.187383756974582e-08, + "loss": 1.0862, + "step": 21130 + }, + { + "epoch": 1.6381882289123948, + "grad_norm": 1.3561794165877905, + "learning_rate": 8.191258524488532e-08, + "loss": 1.0989, + "step": 21140 + }, + { + "epoch": 1.6389631523887016, + "grad_norm": 1.399501420473453, + "learning_rate": 8.195133292002481e-08, + "loss": 1.0801, + "step": 21150 + }, + { + "epoch": 1.6397380758650084, + "grad_norm": 1.7296500201562546, + "learning_rate": 8.199008059516429e-08, + "loss": 1.1185, + "step": 21160 + }, + { + "epoch": 1.640512999341315, + "grad_norm": 1.3186083706629408, + "learning_rate": 8.202882827030379e-08, + "loss": 1.0868, + "step": 21170 + }, + { + "epoch": 1.6412879228176218, + "grad_norm": 1.3263644737799378, + "learning_rate": 8.206757594544328e-08, + "loss": 1.1086, + "step": 21180 + }, + { + "epoch": 1.6420628462939284, + "grad_norm": 1.3493225832014695, + "learning_rate": 8.210632362058277e-08, + "loss": 1.0878, + "step": 21190 + }, + { + "epoch": 1.6428377697702352, + "grad_norm": 1.4525723643659363, + "learning_rate": 8.214507129572226e-08, + "loss": 1.0638, + "step": 21200 + }, + { + "epoch": 1.643612693246542, + "grad_norm": 1.4276991268983161, + "learning_rate": 8.218381897086176e-08, + "loss": 1.0752, + "step": 21210 + }, + { + "epoch": 1.6443876167228486, + "grad_norm": 1.361686209007067, + "learning_rate": 8.222256664600124e-08, + "loss": 1.099, + "step": 21220 + }, + { + "epoch": 1.6451625401991552, + "grad_norm": 1.4577197380525861, + "learning_rate": 8.226131432114074e-08, + "loss": 1.1086, + "step": 21230 + }, + { + "epoch": 1.645937463675462, + "grad_norm": 1.319714292009487, + "learning_rate": 8.230006199628023e-08, + "loss": 1.0907, + "step": 21240 + }, + { + "epoch": 1.6467123871517688, + "grad_norm": 1.38140012215908, + "learning_rate": 8.233880967141973e-08, + "loss": 1.1145, + "step": 21250 + }, + { + "epoch": 1.6474873106280756, + "grad_norm": 1.36066474424862, + "learning_rate": 8.237755734655921e-08, + "loss": 1.099, + "step": 21260 + }, + { + "epoch": 1.6482622341043822, + "grad_norm": 1.3218322452225533, + "learning_rate": 8.24163050216987e-08, + "loss": 1.0825, + "step": 21270 + }, + { + "epoch": 1.6490371575806888, + "grad_norm": 1.3505867589747975, + "learning_rate": 8.24550526968382e-08, + "loss": 1.111, + "step": 21280 + }, + { + "epoch": 1.6498120810569956, + "grad_norm": 1.3168435963242635, + "learning_rate": 8.249380037197768e-08, + "loss": 1.0774, + "step": 21290 + }, + { + "epoch": 1.6505870045333024, + "grad_norm": 1.3292446052747706, + "learning_rate": 8.253254804711718e-08, + "loss": 1.0678, + "step": 21300 + }, + { + "epoch": 1.651361928009609, + "grad_norm": 1.5616682298555296, + "learning_rate": 8.257129572225667e-08, + "loss": 1.0906, + "step": 21310 + }, + { + "epoch": 1.6521368514859156, + "grad_norm": 1.342377062267312, + "learning_rate": 8.261004339739617e-08, + "loss": 1.0768, + "step": 21320 + }, + { + "epoch": 1.6529117749622224, + "grad_norm": 1.3226737955264796, + "learning_rate": 8.264879107253565e-08, + "loss": 1.0887, + "step": 21330 + }, + { + "epoch": 1.6536866984385292, + "grad_norm": 1.341602137346081, + "learning_rate": 8.268753874767515e-08, + "loss": 1.0657, + "step": 21340 + }, + { + "epoch": 1.654461621914836, + "grad_norm": 1.4281941201286572, + "learning_rate": 8.272628642281464e-08, + "loss": 1.1051, + "step": 21350 + }, + { + "epoch": 1.6552365453911426, + "grad_norm": 1.3084294398931242, + "learning_rate": 8.276503409795412e-08, + "loss": 1.0578, + "step": 21360 + }, + { + "epoch": 1.6560114688674492, + "grad_norm": 1.799537560397924, + "learning_rate": 8.280378177309362e-08, + "loss": 1.0905, + "step": 21370 + }, + { + "epoch": 1.656786392343756, + "grad_norm": 1.433208839020938, + "learning_rate": 8.284252944823311e-08, + "loss": 1.1207, + "step": 21380 + }, + { + "epoch": 1.6575613158200628, + "grad_norm": 1.44853197425508, + "learning_rate": 8.28812771233726e-08, + "loss": 1.0787, + "step": 21390 + }, + { + "epoch": 1.6583362392963696, + "grad_norm": 1.5564277048393356, + "learning_rate": 8.292002479851209e-08, + "loss": 1.0876, + "step": 21400 + }, + { + "epoch": 1.6591111627726762, + "grad_norm": 1.465092443321648, + "learning_rate": 8.295877247365159e-08, + "loss": 1.0883, + "step": 21410 + }, + { + "epoch": 1.6598860862489828, + "grad_norm": 1.3178135013662013, + "learning_rate": 8.299752014879108e-08, + "loss": 1.0547, + "step": 21420 + }, + { + "epoch": 1.6606610097252896, + "grad_norm": 1.4178860204150336, + "learning_rate": 8.303626782393057e-08, + "loss": 1.1042, + "step": 21430 + }, + { + "epoch": 1.6614359332015964, + "grad_norm": 1.4416099162806015, + "learning_rate": 8.307501549907006e-08, + "loss": 1.0961, + "step": 21440 + }, + { + "epoch": 1.662210856677903, + "grad_norm": 1.354116827086681, + "learning_rate": 8.311376317420956e-08, + "loss": 1.0783, + "step": 21450 + }, + { + "epoch": 1.6629857801542096, + "grad_norm": 1.4083216446153215, + "learning_rate": 8.315251084934904e-08, + "loss": 1.0924, + "step": 21460 + }, + { + "epoch": 1.6637607036305164, + "grad_norm": 1.3472634431535924, + "learning_rate": 8.319125852448853e-08, + "loss": 1.0945, + "step": 21470 + }, + { + "epoch": 1.6645356271068232, + "grad_norm": 1.3110014095101907, + "learning_rate": 8.323000619962803e-08, + "loss": 1.0735, + "step": 21480 + }, + { + "epoch": 1.66531055058313, + "grad_norm": 1.3768501405378766, + "learning_rate": 8.326875387476753e-08, + "loss": 1.0858, + "step": 21490 + }, + { + "epoch": 1.6660854740594366, + "grad_norm": 1.37196083733312, + "learning_rate": 8.330750154990701e-08, + "loss": 1.0873, + "step": 21500 + }, + { + "epoch": 1.6660854740594366, + "eval_loss": 1.0860689878463745, + "eval_runtime": 319.2655, + "eval_samples_per_second": 35.929, + "eval_steps_per_second": 8.983, + "step": 21500 + }, + { + "epoch": 1.6668603975357432, + "grad_norm": 1.441736162798698, + "learning_rate": 8.33462492250465e-08, + "loss": 1.1044, + "step": 21510 + }, + { + "epoch": 1.66763532101205, + "grad_norm": 1.5035845086987345, + "learning_rate": 8.3384996900186e-08, + "loss": 1.0966, + "step": 21520 + }, + { + "epoch": 1.6684102444883568, + "grad_norm": 1.3380549880063084, + "learning_rate": 8.342374457532548e-08, + "loss": 1.0924, + "step": 21530 + }, + { + "epoch": 1.6691851679646637, + "grad_norm": 1.5473798264400869, + "learning_rate": 8.346249225046498e-08, + "loss": 1.0843, + "step": 21540 + }, + { + "epoch": 1.6699600914409702, + "grad_norm": 1.402967679279267, + "learning_rate": 8.350123992560447e-08, + "loss": 1.0856, + "step": 21550 + }, + { + "epoch": 1.6707350149172768, + "grad_norm": 1.4461906441873402, + "learning_rate": 8.353998760074395e-08, + "loss": 1.0853, + "step": 21560 + }, + { + "epoch": 1.6715099383935836, + "grad_norm": 1.3475509923100772, + "learning_rate": 8.357873527588345e-08, + "loss": 1.1019, + "step": 21570 + }, + { + "epoch": 1.6722848618698904, + "grad_norm": 1.6801834488578105, + "learning_rate": 8.361748295102295e-08, + "loss": 1.1223, + "step": 21580 + }, + { + "epoch": 1.673059785346197, + "grad_norm": 1.4755950203566697, + "learning_rate": 8.365623062616244e-08, + "loss": 1.083, + "step": 21590 + }, + { + "epoch": 1.6738347088225036, + "grad_norm": 1.4073191380658232, + "learning_rate": 8.369497830130192e-08, + "loss": 1.0904, + "step": 21600 + }, + { + "epoch": 1.6746096322988104, + "grad_norm": 1.3467548422729232, + "learning_rate": 8.373372597644142e-08, + "loss": 1.0832, + "step": 21610 + }, + { + "epoch": 1.6753845557751172, + "grad_norm": 1.3793939347688156, + "learning_rate": 8.377247365158091e-08, + "loss": 1.0807, + "step": 21620 + }, + { + "epoch": 1.676159479251424, + "grad_norm": 1.402515038311812, + "learning_rate": 8.38112213267204e-08, + "loss": 1.0862, + "step": 21630 + }, + { + "epoch": 1.6769344027277306, + "grad_norm": 1.3603810748141183, + "learning_rate": 8.384996900185989e-08, + "loss": 1.0757, + "step": 21640 + }, + { + "epoch": 1.6777093262040372, + "grad_norm": 1.4953245263112345, + "learning_rate": 8.388871667699939e-08, + "loss": 1.0757, + "step": 21650 + }, + { + "epoch": 1.678484249680344, + "grad_norm": 1.3516878236836052, + "learning_rate": 8.392746435213888e-08, + "loss": 1.0691, + "step": 21660 + }, + { + "epoch": 1.6792591731566509, + "grad_norm": 1.3426797555141, + "learning_rate": 8.396621202727837e-08, + "loss": 1.0914, + "step": 21670 + }, + { + "epoch": 1.6800340966329574, + "grad_norm": 1.4091582854752511, + "learning_rate": 8.400495970241786e-08, + "loss": 1.0835, + "step": 21680 + }, + { + "epoch": 1.6808090201092643, + "grad_norm": 1.3627347176436693, + "learning_rate": 8.404370737755736e-08, + "loss": 1.077, + "step": 21690 + }, + { + "epoch": 1.6815839435855708, + "grad_norm": 1.332253868868632, + "learning_rate": 8.408245505269684e-08, + "loss": 1.0823, + "step": 21700 + }, + { + "epoch": 1.6823588670618776, + "grad_norm": 1.3242184667359644, + "learning_rate": 8.412120272783633e-08, + "loss": 1.0677, + "step": 21710 + }, + { + "epoch": 1.6831337905381845, + "grad_norm": 1.3349015433275198, + "learning_rate": 8.415995040297583e-08, + "loss": 1.0648, + "step": 21720 + }, + { + "epoch": 1.683908714014491, + "grad_norm": 1.3977515228322017, + "learning_rate": 8.419869807811531e-08, + "loss": 1.1131, + "step": 21730 + }, + { + "epoch": 1.6846836374907976, + "grad_norm": 1.2584333225254452, + "learning_rate": 8.423744575325481e-08, + "loss": 1.0847, + "step": 21740 + }, + { + "epoch": 1.6854585609671044, + "grad_norm": 1.3467029885079085, + "learning_rate": 8.42761934283943e-08, + "loss": 1.0794, + "step": 21750 + }, + { + "epoch": 1.6862334844434113, + "grad_norm": 1.3345298950044608, + "learning_rate": 8.43149411035338e-08, + "loss": 1.0829, + "step": 21760 + }, + { + "epoch": 1.687008407919718, + "grad_norm": 1.3137644314742951, + "learning_rate": 8.435368877867328e-08, + "loss": 1.0825, + "step": 21770 + }, + { + "epoch": 1.6877833313960247, + "grad_norm": 1.25932372319807, + "learning_rate": 8.439243645381278e-08, + "loss": 1.0689, + "step": 21780 + }, + { + "epoch": 1.6885582548723312, + "grad_norm": 1.3711149431499543, + "learning_rate": 8.443118412895227e-08, + "loss": 1.1162, + "step": 21790 + }, + { + "epoch": 1.689333178348638, + "grad_norm": 1.331911168389847, + "learning_rate": 8.446993180409175e-08, + "loss": 1.0757, + "step": 21800 + }, + { + "epoch": 1.6901081018249449, + "grad_norm": 4.033504692009767, + "learning_rate": 8.450867947923125e-08, + "loss": 1.0842, + "step": 21810 + }, + { + "epoch": 1.6908830253012515, + "grad_norm": 1.3380402885320992, + "learning_rate": 8.454742715437075e-08, + "loss": 1.1011, + "step": 21820 + }, + { + "epoch": 1.6916579487775583, + "grad_norm": 1.335998296408576, + "learning_rate": 8.458617482951024e-08, + "loss": 1.0962, + "step": 21830 + }, + { + "epoch": 1.6924328722538649, + "grad_norm": 1.3205098054814988, + "learning_rate": 8.462492250464972e-08, + "loss": 1.1233, + "step": 21840 + }, + { + "epoch": 1.6932077957301717, + "grad_norm": 1.3456071799335527, + "learning_rate": 8.466367017978922e-08, + "loss": 1.0988, + "step": 21850 + }, + { + "epoch": 1.6939827192064785, + "grad_norm": 1.3946763734277252, + "learning_rate": 8.470241785492871e-08, + "loss": 1.0802, + "step": 21860 + }, + { + "epoch": 1.694757642682785, + "grad_norm": 1.40520184556234, + "learning_rate": 8.47411655300682e-08, + "loss": 1.0562, + "step": 21870 + }, + { + "epoch": 1.6955325661590916, + "grad_norm": 1.3762576842931586, + "learning_rate": 8.477991320520769e-08, + "loss": 1.0757, + "step": 21880 + }, + { + "epoch": 1.6963074896353985, + "grad_norm": 1.347379779449589, + "learning_rate": 8.481866088034719e-08, + "loss": 1.0524, + "step": 21890 + }, + { + "epoch": 1.6970824131117053, + "grad_norm": 1.344714333057377, + "learning_rate": 8.485740855548668e-08, + "loss": 1.0586, + "step": 21900 + }, + { + "epoch": 1.697857336588012, + "grad_norm": 1.3419395363477058, + "learning_rate": 8.489615623062617e-08, + "loss": 1.0664, + "step": 21910 + }, + { + "epoch": 1.6986322600643187, + "grad_norm": 1.4270849299607913, + "learning_rate": 8.493490390576566e-08, + "loss": 1.0886, + "step": 21920 + }, + { + "epoch": 1.6994071835406253, + "grad_norm": 1.3182949773883563, + "learning_rate": 8.497365158090516e-08, + "loss": 1.0948, + "step": 21930 + }, + { + "epoch": 1.700182107016932, + "grad_norm": 1.5339506275218395, + "learning_rate": 8.501239925604464e-08, + "loss": 1.073, + "step": 21940 + }, + { + "epoch": 1.7009570304932389, + "grad_norm": 1.2195272295464932, + "learning_rate": 8.505114693118413e-08, + "loss": 1.0717, + "step": 21950 + }, + { + "epoch": 1.7017319539695455, + "grad_norm": 1.3724272464729594, + "learning_rate": 8.508989460632363e-08, + "loss": 1.0812, + "step": 21960 + }, + { + "epoch": 1.7025068774458523, + "grad_norm": 1.315710728608015, + "learning_rate": 8.512864228146311e-08, + "loss": 1.0782, + "step": 21970 + }, + { + "epoch": 1.7032818009221589, + "grad_norm": 1.28950337854254, + "learning_rate": 8.516738995660261e-08, + "loss": 1.1052, + "step": 21980 + }, + { + "epoch": 1.7040567243984657, + "grad_norm": 1.3628569178759224, + "learning_rate": 8.52061376317421e-08, + "loss": 1.0609, + "step": 21990 + }, + { + "epoch": 1.7048316478747725, + "grad_norm": 1.3144357032482599, + "learning_rate": 8.52448853068816e-08, + "loss": 1.0767, + "step": 22000 + }, + { + "epoch": 1.7048316478747725, + "eval_loss": 1.0822482109069824, + "eval_runtime": 320.4459, + "eval_samples_per_second": 35.797, + "eval_steps_per_second": 8.95, + "step": 22000 + }, + { + "epoch": 1.705606571351079, + "grad_norm": 1.3512488716830422, + "learning_rate": 8.528363298202108e-08, + "loss": 1.0716, + "step": 22010 + }, + { + "epoch": 1.7063814948273857, + "grad_norm": 1.4106997404509278, + "learning_rate": 8.532238065716058e-08, + "loss": 1.0638, + "step": 22020 + }, + { + "epoch": 1.7071564183036925, + "grad_norm": 1.2967381730780203, + "learning_rate": 8.536112833230007e-08, + "loss": 1.0882, + "step": 22030 + }, + { + "epoch": 1.7079313417799993, + "grad_norm": 1.3443198644972358, + "learning_rate": 8.539987600743955e-08, + "loss": 1.0717, + "step": 22040 + }, + { + "epoch": 1.708706265256306, + "grad_norm": 1.2760148483167475, + "learning_rate": 8.543862368257905e-08, + "loss": 1.0862, + "step": 22050 + }, + { + "epoch": 1.7094811887326127, + "grad_norm": 1.3477627876231721, + "learning_rate": 8.547737135771854e-08, + "loss": 1.1174, + "step": 22060 + }, + { + "epoch": 1.7102561122089193, + "grad_norm": 1.3736618830129113, + "learning_rate": 8.551611903285804e-08, + "loss": 1.0842, + "step": 22070 + }, + { + "epoch": 1.711031035685226, + "grad_norm": 1.3425970114158, + "learning_rate": 8.555486670799752e-08, + "loss": 1.0869, + "step": 22080 + }, + { + "epoch": 1.7118059591615329, + "grad_norm": 1.431021881899628, + "learning_rate": 8.559361438313702e-08, + "loss": 1.0746, + "step": 22090 + }, + { + "epoch": 1.7125808826378395, + "grad_norm": 1.3113429013528377, + "learning_rate": 8.563236205827651e-08, + "loss": 1.0934, + "step": 22100 + }, + { + "epoch": 1.713355806114146, + "grad_norm": 1.2916857938986506, + "learning_rate": 8.5671109733416e-08, + "loss": 1.0689, + "step": 22110 + }, + { + "epoch": 1.7141307295904529, + "grad_norm": 1.3737292350611843, + "learning_rate": 8.570985740855549e-08, + "loss": 1.0775, + "step": 22120 + }, + { + "epoch": 1.7149056530667597, + "grad_norm": 1.32456802328816, + "learning_rate": 8.574860508369499e-08, + "loss": 1.094, + "step": 22130 + }, + { + "epoch": 1.7156805765430665, + "grad_norm": 1.3251098036874769, + "learning_rate": 8.578735275883447e-08, + "loss": 1.0729, + "step": 22140 + }, + { + "epoch": 1.716455500019373, + "grad_norm": 1.316773504614761, + "learning_rate": 8.582610043397396e-08, + "loss": 1.0637, + "step": 22150 + }, + { + "epoch": 1.7172304234956797, + "grad_norm": 1.3931577513605875, + "learning_rate": 8.586484810911346e-08, + "loss": 1.0851, + "step": 22160 + }, + { + "epoch": 1.7180053469719865, + "grad_norm": 1.3458531982464872, + "learning_rate": 8.590359578425296e-08, + "loss": 1.1048, + "step": 22170 + }, + { + "epoch": 1.7187802704482933, + "grad_norm": 1.3150124375425276, + "learning_rate": 8.594234345939244e-08, + "loss": 1.1128, + "step": 22180 + }, + { + "epoch": 1.7195551939246, + "grad_norm": 1.325047117297204, + "learning_rate": 8.598109113453193e-08, + "loss": 1.0782, + "step": 22190 + }, + { + "epoch": 1.7203301174009067, + "grad_norm": 1.3167646953309904, + "learning_rate": 8.601983880967143e-08, + "loss": 1.066, + "step": 22200 + }, + { + "epoch": 1.7211050408772133, + "grad_norm": 1.2570170142034451, + "learning_rate": 8.605858648481091e-08, + "loss": 1.0953, + "step": 22210 + }, + { + "epoch": 1.72187996435352, + "grad_norm": 1.4508558778506022, + "learning_rate": 8.609733415995041e-08, + "loss": 1.0766, + "step": 22220 + }, + { + "epoch": 1.722654887829827, + "grad_norm": 1.3072835943734615, + "learning_rate": 8.61360818350899e-08, + "loss": 1.0828, + "step": 22230 + }, + { + "epoch": 1.7234298113061335, + "grad_norm": 1.3580219656004695, + "learning_rate": 8.61748295102294e-08, + "loss": 1.0905, + "step": 22240 + }, + { + "epoch": 1.72420473478244, + "grad_norm": 1.3401565974186882, + "learning_rate": 8.621357718536888e-08, + "loss": 1.1184, + "step": 22250 + }, + { + "epoch": 1.7249796582587469, + "grad_norm": 1.3249613629541475, + "learning_rate": 8.625232486050838e-08, + "loss": 1.0598, + "step": 22260 + }, + { + "epoch": 1.7257545817350537, + "grad_norm": 1.3655606730700505, + "learning_rate": 8.629107253564787e-08, + "loss": 1.0731, + "step": 22270 + }, + { + "epoch": 1.7265295052113605, + "grad_norm": 1.281614590133607, + "learning_rate": 8.632982021078735e-08, + "loss": 1.0866, + "step": 22280 + }, + { + "epoch": 1.727304428687667, + "grad_norm": 1.397957923389028, + "learning_rate": 8.636856788592685e-08, + "loss": 1.0885, + "step": 22290 + }, + { + "epoch": 1.7280793521639737, + "grad_norm": 1.3016777474376426, + "learning_rate": 8.640731556106634e-08, + "loss": 1.0868, + "step": 22300 + }, + { + "epoch": 1.7288542756402805, + "grad_norm": 1.3578256694397908, + "learning_rate": 8.644606323620583e-08, + "loss": 1.0686, + "step": 22310 + }, + { + "epoch": 1.7296291991165873, + "grad_norm": 1.3225825920292718, + "learning_rate": 8.648481091134532e-08, + "loss": 1.1028, + "step": 22320 + }, + { + "epoch": 1.7304041225928941, + "grad_norm": 1.3942355800419324, + "learning_rate": 8.652355858648482e-08, + "loss": 1.0516, + "step": 22330 + }, + { + "epoch": 1.7311790460692007, + "grad_norm": 1.5799494320926646, + "learning_rate": 8.656230626162431e-08, + "loss": 1.1015, + "step": 22340 + }, + { + "epoch": 1.7319539695455073, + "grad_norm": 1.3566095342005444, + "learning_rate": 8.66010539367638e-08, + "loss": 1.0827, + "step": 22350 + }, + { + "epoch": 1.732728893021814, + "grad_norm": 1.3004719099061515, + "learning_rate": 8.663980161190329e-08, + "loss": 1.0881, + "step": 22360 + }, + { + "epoch": 1.733503816498121, + "grad_norm": 1.3332409720876506, + "learning_rate": 8.667854928704279e-08, + "loss": 1.0563, + "step": 22370 + }, + { + "epoch": 1.7342787399744275, + "grad_norm": 1.3467144157808673, + "learning_rate": 8.671729696218227e-08, + "loss": 1.0963, + "step": 22380 + }, + { + "epoch": 1.735053663450734, + "grad_norm": 1.381298548810399, + "learning_rate": 8.675604463732176e-08, + "loss": 1.0923, + "step": 22390 + }, + { + "epoch": 1.735828586927041, + "grad_norm": 1.4039291111755963, + "learning_rate": 8.679479231246126e-08, + "loss": 1.0963, + "step": 22400 + }, + { + "epoch": 1.7366035104033477, + "grad_norm": 1.2982951274919177, + "learning_rate": 8.683353998760076e-08, + "loss": 1.0748, + "step": 22410 + }, + { + "epoch": 1.7373784338796545, + "grad_norm": 1.368599675091323, + "learning_rate": 8.687228766274024e-08, + "loss": 1.0763, + "step": 22420 + }, + { + "epoch": 1.738153357355961, + "grad_norm": 1.309567573615782, + "learning_rate": 8.691103533787973e-08, + "loss": 1.0787, + "step": 22430 + }, + { + "epoch": 1.7389282808322677, + "grad_norm": 1.3519304706775088, + "learning_rate": 8.694978301301923e-08, + "loss": 1.1149, + "step": 22440 + }, + { + "epoch": 1.7397032043085745, + "grad_norm": 1.2427041570331698, + "learning_rate": 8.698853068815871e-08, + "loss": 1.0794, + "step": 22450 + }, + { + "epoch": 1.7404781277848813, + "grad_norm": 1.359738671494722, + "learning_rate": 8.70272783632982e-08, + "loss": 1.0828, + "step": 22460 + }, + { + "epoch": 1.7412530512611881, + "grad_norm": 1.3706063089980034, + "learning_rate": 8.70660260384377e-08, + "loss": 1.0746, + "step": 22470 + }, + { + "epoch": 1.7420279747374947, + "grad_norm": 1.3086939866637357, + "learning_rate": 8.710477371357718e-08, + "loss": 1.0594, + "step": 22480 + }, + { + "epoch": 1.7428028982138013, + "grad_norm": 1.3413699087282454, + "learning_rate": 8.714352138871668e-08, + "loss": 1.0928, + "step": 22490 + }, + { + "epoch": 1.743577821690108, + "grad_norm": 1.386856320798816, + "learning_rate": 8.718226906385618e-08, + "loss": 1.0604, + "step": 22500 + }, + { + "epoch": 1.743577821690108, + "eval_loss": 1.0785870552062988, + "eval_runtime": 319.0196, + "eval_samples_per_second": 35.957, + "eval_steps_per_second": 8.99, + "step": 22500 + }, + { + "epoch": 1.744352745166415, + "grad_norm": 1.349734875467202, + "learning_rate": 8.722101673899567e-08, + "loss": 1.0891, + "step": 22510 + }, + { + "epoch": 1.7451276686427215, + "grad_norm": 1.3578785677100735, + "learning_rate": 8.725976441413515e-08, + "loss": 1.0863, + "step": 22520 + }, + { + "epoch": 1.745902592119028, + "grad_norm": 1.3784113894657601, + "learning_rate": 8.729851208927465e-08, + "loss": 1.0633, + "step": 22530 + }, + { + "epoch": 1.746677515595335, + "grad_norm": 1.330628675334231, + "learning_rate": 8.733725976441414e-08, + "loss": 1.1027, + "step": 22540 + }, + { + "epoch": 1.7474524390716417, + "grad_norm": 1.3558352144337313, + "learning_rate": 8.737600743955363e-08, + "loss": 1.0606, + "step": 22550 + }, + { + "epoch": 1.7482273625479485, + "grad_norm": 1.3426654418679822, + "learning_rate": 8.741475511469312e-08, + "loss": 1.093, + "step": 22560 + }, + { + "epoch": 1.7490022860242551, + "grad_norm": 1.2744148972829235, + "learning_rate": 8.745350278983262e-08, + "loss": 1.0844, + "step": 22570 + }, + { + "epoch": 1.7497772095005617, + "grad_norm": 1.448944543902253, + "learning_rate": 8.749225046497211e-08, + "loss": 1.1092, + "step": 22580 + }, + { + "epoch": 1.7505521329768685, + "grad_norm": 1.2787860009347773, + "learning_rate": 8.75309981401116e-08, + "loss": 1.0687, + "step": 22590 + }, + { + "epoch": 1.7513270564531753, + "grad_norm": 1.3025488089852797, + "learning_rate": 8.756974581525109e-08, + "loss": 1.0969, + "step": 22600 + }, + { + "epoch": 1.752101979929482, + "grad_norm": 1.3105642973829048, + "learning_rate": 8.760849349039059e-08, + "loss": 1.0754, + "step": 22610 + }, + { + "epoch": 1.7528769034057887, + "grad_norm": 1.2922871087775303, + "learning_rate": 8.764724116553007e-08, + "loss": 1.0701, + "step": 22620 + }, + { + "epoch": 1.7536518268820953, + "grad_norm": 1.4019957264633394, + "learning_rate": 8.768598884066956e-08, + "loss": 1.0522, + "step": 22630 + }, + { + "epoch": 1.7544267503584021, + "grad_norm": 1.3627802329159304, + "learning_rate": 8.772473651580906e-08, + "loss": 1.0543, + "step": 22640 + }, + { + "epoch": 1.755201673834709, + "grad_norm": 1.6825996370861216, + "learning_rate": 8.776348419094854e-08, + "loss": 1.0882, + "step": 22650 + }, + { + "epoch": 1.7559765973110155, + "grad_norm": 1.2576455571966263, + "learning_rate": 8.780223186608804e-08, + "loss": 1.0599, + "step": 22660 + }, + { + "epoch": 1.756751520787322, + "grad_norm": 1.4040022851255438, + "learning_rate": 8.784097954122753e-08, + "loss": 1.0742, + "step": 22670 + }, + { + "epoch": 1.757526444263629, + "grad_norm": 1.2772583782083655, + "learning_rate": 8.787972721636703e-08, + "loss": 1.0783, + "step": 22680 + }, + { + "epoch": 1.7583013677399357, + "grad_norm": 1.386912874068717, + "learning_rate": 8.791847489150651e-08, + "loss": 1.0744, + "step": 22690 + }, + { + "epoch": 1.7590762912162425, + "grad_norm": 1.4124555724351298, + "learning_rate": 8.7957222566646e-08, + "loss": 1.0847, + "step": 22700 + }, + { + "epoch": 1.7598512146925491, + "grad_norm": 1.2669691796688454, + "learning_rate": 8.79959702417855e-08, + "loss": 1.0729, + "step": 22710 + }, + { + "epoch": 1.7606261381688557, + "grad_norm": 1.3135970694664045, + "learning_rate": 8.803471791692498e-08, + "loss": 1.0825, + "step": 22720 + }, + { + "epoch": 1.7614010616451625, + "grad_norm": 1.3608092731565364, + "learning_rate": 8.807346559206448e-08, + "loss": 1.0551, + "step": 22730 + }, + { + "epoch": 1.7621759851214693, + "grad_norm": 1.340361783273507, + "learning_rate": 8.811221326720397e-08, + "loss": 1.0661, + "step": 22740 + }, + { + "epoch": 1.762950908597776, + "grad_norm": 1.4858512023011246, + "learning_rate": 8.815096094234347e-08, + "loss": 1.095, + "step": 22750 + }, + { + "epoch": 1.7637258320740827, + "grad_norm": 1.3990199284178921, + "learning_rate": 8.818970861748295e-08, + "loss": 1.0737, + "step": 22760 + }, + { + "epoch": 1.7645007555503893, + "grad_norm": 1.3417003168290178, + "learning_rate": 8.822845629262245e-08, + "loss": 1.0721, + "step": 22770 + }, + { + "epoch": 1.7652756790266961, + "grad_norm": 1.240581505393346, + "learning_rate": 8.826720396776194e-08, + "loss": 1.0496, + "step": 22780 + }, + { + "epoch": 1.766050602503003, + "grad_norm": 1.388521418367984, + "learning_rate": 8.830595164290143e-08, + "loss": 1.0755, + "step": 22790 + }, + { + "epoch": 1.7668255259793095, + "grad_norm": 1.4553891300019852, + "learning_rate": 8.834469931804092e-08, + "loss": 1.0827, + "step": 22800 + }, + { + "epoch": 1.7676004494556161, + "grad_norm": 1.3885352463758294, + "learning_rate": 8.838344699318042e-08, + "loss": 1.0775, + "step": 22810 + }, + { + "epoch": 1.768375372931923, + "grad_norm": 1.2758451446572112, + "learning_rate": 8.84221946683199e-08, + "loss": 1.0686, + "step": 22820 + }, + { + "epoch": 1.7691502964082297, + "grad_norm": 1.3573772566819393, + "learning_rate": 8.84609423434594e-08, + "loss": 1.0873, + "step": 22830 + }, + { + "epoch": 1.7699252198845365, + "grad_norm": 1.3207532663419255, + "learning_rate": 8.849969001859889e-08, + "loss": 1.077, + "step": 22840 + }, + { + "epoch": 1.7707001433608431, + "grad_norm": 1.3755013855474323, + "learning_rate": 8.853843769373839e-08, + "loss": 1.0856, + "step": 22850 + }, + { + "epoch": 1.7714750668371497, + "grad_norm": 1.4074852620727905, + "learning_rate": 8.857718536887787e-08, + "loss": 1.0578, + "step": 22860 + }, + { + "epoch": 1.7722499903134565, + "grad_norm": 1.2934331700472934, + "learning_rate": 8.861593304401736e-08, + "loss": 1.0626, + "step": 22870 + }, + { + "epoch": 1.7730249137897633, + "grad_norm": 1.2621534118982014, + "learning_rate": 8.865468071915686e-08, + "loss": 1.0798, + "step": 22880 + }, + { + "epoch": 1.77379983726607, + "grad_norm": 1.3234812214100118, + "learning_rate": 8.869342839429634e-08, + "loss": 1.0753, + "step": 22890 + }, + { + "epoch": 1.7745747607423767, + "grad_norm": 1.3031051103473408, + "learning_rate": 8.873217606943584e-08, + "loss": 1.0807, + "step": 22900 + }, + { + "epoch": 1.7753496842186833, + "grad_norm": 1.3270120472921703, + "learning_rate": 8.877092374457533e-08, + "loss": 1.0668, + "step": 22910 + }, + { + "epoch": 1.7761246076949901, + "grad_norm": 1.3944947290914285, + "learning_rate": 8.880967141971483e-08, + "loss": 1.0714, + "step": 22920 + }, + { + "epoch": 1.776899531171297, + "grad_norm": 1.2982806504366988, + "learning_rate": 8.884841909485431e-08, + "loss": 1.1194, + "step": 22930 + }, + { + "epoch": 1.7776744546476035, + "grad_norm": 1.3317279312148897, + "learning_rate": 8.88871667699938e-08, + "loss": 1.0787, + "step": 22940 + }, + { + "epoch": 1.7784493781239101, + "grad_norm": 1.3336768331520774, + "learning_rate": 8.89259144451333e-08, + "loss": 1.0656, + "step": 22950 + }, + { + "epoch": 1.779224301600217, + "grad_norm": 1.3066020225197212, + "learning_rate": 8.896466212027278e-08, + "loss": 1.076, + "step": 22960 + }, + { + "epoch": 1.7799992250765238, + "grad_norm": 1.2318402157563164, + "learning_rate": 8.900340979541228e-08, + "loss": 1.0423, + "step": 22970 + }, + { + "epoch": 1.7807741485528306, + "grad_norm": 1.3125641332031641, + "learning_rate": 8.904215747055177e-08, + "loss": 1.0888, + "step": 22980 + }, + { + "epoch": 1.7815490720291371, + "grad_norm": 1.4071846702488362, + "learning_rate": 8.908090514569127e-08, + "loss": 1.0682, + "step": 22990 + }, + { + "epoch": 1.7823239955054437, + "grad_norm": 1.3767100618720298, + "learning_rate": 8.911965282083075e-08, + "loss": 1.0758, + "step": 23000 + }, + { + "epoch": 1.7823239955054437, + "eval_loss": 1.07499361038208, + "eval_runtime": 319.6988, + "eval_samples_per_second": 35.881, + "eval_steps_per_second": 8.971, + "step": 23000 + }, + { + "epoch": 1.7830989189817505, + "grad_norm": 1.285602279863542, + "learning_rate": 8.915840049597025e-08, + "loss": 1.0556, + "step": 23010 + }, + { + "epoch": 1.7838738424580574, + "grad_norm": 1.4279342335934821, + "learning_rate": 8.919714817110974e-08, + "loss": 1.0708, + "step": 23020 + }, + { + "epoch": 1.784648765934364, + "grad_norm": 1.2671968135206402, + "learning_rate": 8.923589584624923e-08, + "loss": 1.0654, + "step": 23030 + }, + { + "epoch": 1.7854236894106705, + "grad_norm": 1.3085077216959264, + "learning_rate": 8.927464352138872e-08, + "loss": 1.0939, + "step": 23040 + }, + { + "epoch": 1.7861986128869773, + "grad_norm": 1.4503660957864786, + "learning_rate": 8.931339119652822e-08, + "loss": 1.0789, + "step": 23050 + }, + { + "epoch": 1.7869735363632842, + "grad_norm": 1.3421887225875377, + "learning_rate": 8.93521388716677e-08, + "loss": 1.0511, + "step": 23060 + }, + { + "epoch": 1.787748459839591, + "grad_norm": 1.275183972028468, + "learning_rate": 8.93908865468072e-08, + "loss": 1.0558, + "step": 23070 + }, + { + "epoch": 1.7885233833158976, + "grad_norm": 1.3870358206771796, + "learning_rate": 8.942963422194669e-08, + "loss": 1.0826, + "step": 23080 + }, + { + "epoch": 1.7892983067922041, + "grad_norm": 1.3664214110947546, + "learning_rate": 8.946838189708619e-08, + "loss": 1.0585, + "step": 23090 + }, + { + "epoch": 1.790073230268511, + "grad_norm": 1.2815130015787772, + "learning_rate": 8.950712957222567e-08, + "loss": 1.0753, + "step": 23100 + }, + { + "epoch": 1.7908481537448178, + "grad_norm": 1.4040453071596732, + "learning_rate": 8.954587724736516e-08, + "loss": 1.0755, + "step": 23110 + }, + { + "epoch": 1.7916230772211246, + "grad_norm": 1.369854862184728, + "learning_rate": 8.958462492250466e-08, + "loss": 1.0693, + "step": 23120 + }, + { + "epoch": 1.7923980006974312, + "grad_norm": 1.3148882631397538, + "learning_rate": 8.962337259764414e-08, + "loss": 1.0618, + "step": 23130 + }, + { + "epoch": 1.7931729241737377, + "grad_norm": 1.931426002427859, + "learning_rate": 8.966212027278364e-08, + "loss": 1.0911, + "step": 23140 + }, + { + "epoch": 1.7939478476500446, + "grad_norm": 1.458519743589764, + "learning_rate": 8.970086794792313e-08, + "loss": 1.0786, + "step": 23150 + }, + { + "epoch": 1.7947227711263514, + "grad_norm": 1.2743352376737067, + "learning_rate": 8.973961562306263e-08, + "loss": 1.0884, + "step": 23160 + }, + { + "epoch": 1.795497694602658, + "grad_norm": 1.3410254303385616, + "learning_rate": 8.977836329820211e-08, + "loss": 1.0574, + "step": 23170 + }, + { + "epoch": 1.7962726180789645, + "grad_norm": 1.3430273528945096, + "learning_rate": 8.98171109733416e-08, + "loss": 1.0815, + "step": 23180 + }, + { + "epoch": 1.7970475415552714, + "grad_norm": 1.3238789763050334, + "learning_rate": 8.98558586484811e-08, + "loss": 1.056, + "step": 23190 + }, + { + "epoch": 1.7978224650315782, + "grad_norm": 1.3231847530626688, + "learning_rate": 8.989460632362058e-08, + "loss": 1.0728, + "step": 23200 + }, + { + "epoch": 1.798597388507885, + "grad_norm": 1.3578035870035186, + "learning_rate": 8.993335399876008e-08, + "loss": 1.0738, + "step": 23210 + }, + { + "epoch": 1.7993723119841916, + "grad_norm": 1.3560000316898633, + "learning_rate": 8.997210167389957e-08, + "loss": 1.0804, + "step": 23220 + }, + { + "epoch": 1.8001472354604982, + "grad_norm": 1.2989489526058478, + "learning_rate": 9.001084934903906e-08, + "loss": 1.0825, + "step": 23230 + }, + { + "epoch": 1.800922158936805, + "grad_norm": 1.3348241823501932, + "learning_rate": 9.004959702417855e-08, + "loss": 1.059, + "step": 23240 + }, + { + "epoch": 1.8016970824131118, + "grad_norm": 1.3496498031126014, + "learning_rate": 9.008834469931805e-08, + "loss": 1.0773, + "step": 23250 + }, + { + "epoch": 1.8024720058894186, + "grad_norm": 1.324691355087382, + "learning_rate": 9.012709237445754e-08, + "loss": 1.0893, + "step": 23260 + }, + { + "epoch": 1.8032469293657252, + "grad_norm": 1.3033374852636095, + "learning_rate": 9.016584004959703e-08, + "loss": 1.0627, + "step": 23270 + }, + { + "epoch": 1.8040218528420318, + "grad_norm": 1.3131280300254455, + "learning_rate": 9.020458772473652e-08, + "loss": 1.0702, + "step": 23280 + }, + { + "epoch": 1.8047967763183386, + "grad_norm": 1.4056264713266737, + "learning_rate": 9.024333539987602e-08, + "loss": 1.0668, + "step": 23290 + }, + { + "epoch": 1.8055716997946454, + "grad_norm": 1.2700993218658267, + "learning_rate": 9.02820830750155e-08, + "loss": 1.0681, + "step": 23300 + }, + { + "epoch": 1.806346623270952, + "grad_norm": 1.376355369820361, + "learning_rate": 9.0320830750155e-08, + "loss": 1.1086, + "step": 23310 + }, + { + "epoch": 1.8071215467472586, + "grad_norm": 1.3845451820698291, + "learning_rate": 9.035957842529449e-08, + "loss": 1.0602, + "step": 23320 + }, + { + "epoch": 1.8078964702235654, + "grad_norm": 1.4832975742977388, + "learning_rate": 9.039832610043398e-08, + "loss": 1.0776, + "step": 23330 + }, + { + "epoch": 1.8086713936998722, + "grad_norm": 1.3270462053260996, + "learning_rate": 9.043707377557347e-08, + "loss": 1.0768, + "step": 23340 + }, + { + "epoch": 1.809446317176179, + "grad_norm": 1.377588961591723, + "learning_rate": 9.047582145071296e-08, + "loss": 1.0597, + "step": 23350 + }, + { + "epoch": 1.8102212406524856, + "grad_norm": 1.3398840365583602, + "learning_rate": 9.051456912585246e-08, + "loss": 1.0761, + "step": 23360 + }, + { + "epoch": 1.8109961641287922, + "grad_norm": 1.373009271241925, + "learning_rate": 9.055331680099194e-08, + "loss": 1.0748, + "step": 23370 + }, + { + "epoch": 1.811771087605099, + "grad_norm": 1.369115442113301, + "learning_rate": 9.059206447613144e-08, + "loss": 1.0708, + "step": 23380 + }, + { + "epoch": 1.8125460110814058, + "grad_norm": 1.3117531813353531, + "learning_rate": 9.063081215127093e-08, + "loss": 1.0626, + "step": 23390 + }, + { + "epoch": 1.8133209345577124, + "grad_norm": 1.3368295391643477, + "learning_rate": 9.066955982641041e-08, + "loss": 1.073, + "step": 23400 + }, + { + "epoch": 1.8140958580340192, + "grad_norm": 1.3822105045958135, + "learning_rate": 9.070830750154991e-08, + "loss": 1.0908, + "step": 23410 + }, + { + "epoch": 1.8148707815103258, + "grad_norm": 1.2601261937036328, + "learning_rate": 9.07470551766894e-08, + "loss": 1.0946, + "step": 23420 + }, + { + "epoch": 1.8156457049866326, + "grad_norm": 1.392507917046196, + "learning_rate": 9.07858028518289e-08, + "loss": 1.0705, + "step": 23430 + }, + { + "epoch": 1.8164206284629394, + "grad_norm": 1.5032348009504963, + "learning_rate": 9.082455052696838e-08, + "loss": 1.0598, + "step": 23440 + }, + { + "epoch": 1.817195551939246, + "grad_norm": 1.37901095793463, + "learning_rate": 9.086329820210788e-08, + "loss": 1.0668, + "step": 23450 + }, + { + "epoch": 1.8179704754155526, + "grad_norm": 1.3363160332385622, + "learning_rate": 9.090204587724737e-08, + "loss": 1.0574, + "step": 23460 + }, + { + "epoch": 1.8187453988918594, + "grad_norm": 1.4905393236109452, + "learning_rate": 9.094079355238686e-08, + "loss": 1.0969, + "step": 23470 + }, + { + "epoch": 1.8195203223681662, + "grad_norm": 1.437013353278802, + "learning_rate": 9.097954122752635e-08, + "loss": 1.0946, + "step": 23480 + }, + { + "epoch": 1.820295245844473, + "grad_norm": 1.339165297118242, + "learning_rate": 9.101828890266585e-08, + "loss": 1.0708, + "step": 23490 + }, + { + "epoch": 1.8210701693207796, + "grad_norm": 1.4098351991355569, + "learning_rate": 9.105703657780534e-08, + "loss": 1.0679, + "step": 23500 + }, + { + "epoch": 1.8210701693207796, + "eval_loss": 1.0715937614440918, + "eval_runtime": 320.6131, + "eval_samples_per_second": 35.778, + "eval_steps_per_second": 8.945, + "step": 23500 + }, + { + "epoch": 1.8218450927970862, + "grad_norm": 1.3372858716949185, + "learning_rate": 9.109578425294482e-08, + "loss": 1.0733, + "step": 23510 + }, + { + "epoch": 1.822620016273393, + "grad_norm": 1.5312582710358313, + "learning_rate": 9.113453192808432e-08, + "loss": 1.0849, + "step": 23520 + }, + { + "epoch": 1.8233949397496998, + "grad_norm": 1.313379042033025, + "learning_rate": 9.117327960322382e-08, + "loss": 1.0674, + "step": 23530 + }, + { + "epoch": 1.8241698632260064, + "grad_norm": 1.2893556184940032, + "learning_rate": 9.12120272783633e-08, + "loss": 1.0661, + "step": 23540 + }, + { + "epoch": 1.8249447867023132, + "grad_norm": 1.3685675044047592, + "learning_rate": 9.12507749535028e-08, + "loss": 1.0898, + "step": 23550 + }, + { + "epoch": 1.8257197101786198, + "grad_norm": 1.4337444910756918, + "learning_rate": 9.128952262864229e-08, + "loss": 1.0933, + "step": 23560 + }, + { + "epoch": 1.8264946336549266, + "grad_norm": 1.3927713997590387, + "learning_rate": 9.132827030378177e-08, + "loss": 1.0619, + "step": 23570 + }, + { + "epoch": 1.8272695571312334, + "grad_norm": 1.4174842013822506, + "learning_rate": 9.136701797892127e-08, + "loss": 1.0672, + "step": 23580 + }, + { + "epoch": 1.82804448060754, + "grad_norm": 1.3367623356368021, + "learning_rate": 9.140576565406076e-08, + "loss": 1.0757, + "step": 23590 + }, + { + "epoch": 1.8288194040838466, + "grad_norm": 1.4039907719861966, + "learning_rate": 9.144451332920026e-08, + "loss": 1.0538, + "step": 23600 + }, + { + "epoch": 1.8295943275601534, + "grad_norm": 1.2642097271375936, + "learning_rate": 9.148326100433974e-08, + "loss": 1.0769, + "step": 23610 + }, + { + "epoch": 1.8303692510364602, + "grad_norm": 1.3420751556301465, + "learning_rate": 9.152200867947924e-08, + "loss": 1.0858, + "step": 23620 + }, + { + "epoch": 1.831144174512767, + "grad_norm": 1.2965747383172679, + "learning_rate": 9.156075635461873e-08, + "loss": 1.0625, + "step": 23630 + }, + { + "epoch": 1.8319190979890736, + "grad_norm": 1.353193307001166, + "learning_rate": 9.159950402975821e-08, + "loss": 1.0809, + "step": 23640 + }, + { + "epoch": 1.8326940214653802, + "grad_norm": 1.3239636173708076, + "learning_rate": 9.163825170489771e-08, + "loss": 1.0678, + "step": 23650 + }, + { + "epoch": 1.833468944941687, + "grad_norm": 1.375599214487265, + "learning_rate": 9.16769993800372e-08, + "loss": 1.0734, + "step": 23660 + }, + { + "epoch": 1.8342438684179938, + "grad_norm": 1.3522407840328305, + "learning_rate": 9.17157470551767e-08, + "loss": 1.0629, + "step": 23670 + }, + { + "epoch": 1.8350187918943004, + "grad_norm": 1.3819281336319222, + "learning_rate": 9.175449473031618e-08, + "loss": 1.0708, + "step": 23680 + }, + { + "epoch": 1.8357937153706072, + "grad_norm": 1.3873184047471692, + "learning_rate": 9.179324240545568e-08, + "loss": 1.0878, + "step": 23690 + }, + { + "epoch": 1.8365686388469138, + "grad_norm": 1.3415046989729142, + "learning_rate": 9.183199008059517e-08, + "loss": 1.0512, + "step": 23700 + }, + { + "epoch": 1.8373435623232206, + "grad_norm": 1.4353275514430388, + "learning_rate": 9.187073775573466e-08, + "loss": 1.0604, + "step": 23710 + }, + { + "epoch": 1.8381184857995274, + "grad_norm": 1.2974161563598934, + "learning_rate": 9.190948543087415e-08, + "loss": 1.0693, + "step": 23720 + }, + { + "epoch": 1.838893409275834, + "grad_norm": 1.3438482685589352, + "learning_rate": 9.194823310601365e-08, + "loss": 1.084, + "step": 23730 + }, + { + "epoch": 1.8396683327521406, + "grad_norm": 1.344206325397171, + "learning_rate": 9.198698078115313e-08, + "loss": 1.0669, + "step": 23740 + }, + { + "epoch": 1.8404432562284474, + "grad_norm": 1.488301017887555, + "learning_rate": 9.202572845629262e-08, + "loss": 1.088, + "step": 23750 + }, + { + "epoch": 1.8412181797047542, + "grad_norm": 1.3623273099050341, + "learning_rate": 9.206447613143212e-08, + "loss": 1.0754, + "step": 23760 + }, + { + "epoch": 1.841993103181061, + "grad_norm": 1.3381882158662728, + "learning_rate": 9.210322380657162e-08, + "loss": 1.0825, + "step": 23770 + }, + { + "epoch": 1.8427680266573676, + "grad_norm": 1.3162882232494835, + "learning_rate": 9.21419714817111e-08, + "loss": 1.08, + "step": 23780 + }, + { + "epoch": 1.8435429501336742, + "grad_norm": 1.3111389027444982, + "learning_rate": 9.218071915685059e-08, + "loss": 1.0464, + "step": 23790 + }, + { + "epoch": 1.844317873609981, + "grad_norm": 1.470753190884087, + "learning_rate": 9.221946683199009e-08, + "loss": 1.083, + "step": 23800 + }, + { + "epoch": 1.8450927970862878, + "grad_norm": 1.2931275398933713, + "learning_rate": 9.225821450712957e-08, + "loss": 1.0659, + "step": 23810 + }, + { + "epoch": 1.8458677205625944, + "grad_norm": 1.297106863464114, + "learning_rate": 9.229696218226907e-08, + "loss": 1.0689, + "step": 23820 + }, + { + "epoch": 1.846642644038901, + "grad_norm": 1.2901476170915602, + "learning_rate": 9.233570985740856e-08, + "loss": 1.0823, + "step": 23830 + }, + { + "epoch": 1.8474175675152078, + "grad_norm": 1.4043870758577757, + "learning_rate": 9.237445753254806e-08, + "loss": 1.0913, + "step": 23840 + }, + { + "epoch": 1.8481924909915146, + "grad_norm": 1.27358016052527, + "learning_rate": 9.241320520768754e-08, + "loss": 1.0684, + "step": 23850 + }, + { + "epoch": 1.8489674144678214, + "grad_norm": 1.2954129189655204, + "learning_rate": 9.245195288282704e-08, + "loss": 1.1231, + "step": 23860 + }, + { + "epoch": 1.849742337944128, + "grad_norm": 1.2834781206137096, + "learning_rate": 9.249070055796653e-08, + "loss": 1.0541, + "step": 23870 + }, + { + "epoch": 1.8505172614204346, + "grad_norm": 1.3877423000855704, + "learning_rate": 9.252944823310601e-08, + "loss": 1.0967, + "step": 23880 + }, + { + "epoch": 1.8512921848967414, + "grad_norm": 1.3675610767342603, + "learning_rate": 9.256819590824551e-08, + "loss": 1.0434, + "step": 23890 + }, + { + "epoch": 1.8520671083730482, + "grad_norm": 1.4347941365925376, + "learning_rate": 9.2606943583385e-08, + "loss": 1.0787, + "step": 23900 + }, + { + "epoch": 1.852842031849355, + "grad_norm": 1.371663294271692, + "learning_rate": 9.26456912585245e-08, + "loss": 1.0288, + "step": 23910 + }, + { + "epoch": 1.8536169553256616, + "grad_norm": 1.3058747109617777, + "learning_rate": 9.268443893366398e-08, + "loss": 1.0663, + "step": 23920 + }, + { + "epoch": 1.8543918788019682, + "grad_norm": 1.6952538602199185, + "learning_rate": 9.272318660880348e-08, + "loss": 1.0718, + "step": 23930 + }, + { + "epoch": 1.855166802278275, + "grad_norm": 1.3945194233967484, + "learning_rate": 9.276193428394297e-08, + "loss": 1.0602, + "step": 23940 + }, + { + "epoch": 1.8559417257545818, + "grad_norm": 1.345919581378995, + "learning_rate": 9.280068195908246e-08, + "loss": 1.0598, + "step": 23950 + }, + { + "epoch": 1.8567166492308884, + "grad_norm": 1.3457401937857971, + "learning_rate": 9.283942963422195e-08, + "loss": 1.0742, + "step": 23960 + }, + { + "epoch": 1.857491572707195, + "grad_norm": 1.296237607155456, + "learning_rate": 9.287817730936145e-08, + "loss": 1.0509, + "step": 23970 + }, + { + "epoch": 1.8582664961835018, + "grad_norm": 1.5184544167212704, + "learning_rate": 9.291692498450093e-08, + "loss": 1.0854, + "step": 23980 + }, + { + "epoch": 1.8590414196598086, + "grad_norm": 1.2774815652796798, + "learning_rate": 9.295567265964042e-08, + "loss": 1.0432, + "step": 23990 + }, + { + "epoch": 1.8598163431361154, + "grad_norm": 1.4410822632021592, + "learning_rate": 9.299442033477992e-08, + "loss": 1.0919, + "step": 24000 + }, + { + "epoch": 1.8598163431361154, + "eval_loss": 1.0682460069656372, + "eval_runtime": 318.0692, + "eval_samples_per_second": 36.064, + "eval_steps_per_second": 9.017, + "step": 24000 + }, + { + "epoch": 1.860591266612422, + "grad_norm": 1.3325913414034867, + "learning_rate": 9.303316800991941e-08, + "loss": 1.0455, + "step": 24010 + }, + { + "epoch": 1.8613661900887286, + "grad_norm": 1.4360187427771485, + "learning_rate": 9.30719156850589e-08, + "loss": 1.0693, + "step": 24020 + }, + { + "epoch": 1.8621411135650354, + "grad_norm": 1.3163798395825603, + "learning_rate": 9.311066336019839e-08, + "loss": 1.0701, + "step": 24030 + }, + { + "epoch": 1.8629160370413422, + "grad_norm": 1.342340269609767, + "learning_rate": 9.314941103533789e-08, + "loss": 1.0616, + "step": 24040 + }, + { + "epoch": 1.863690960517649, + "grad_norm": 1.3899242968960397, + "learning_rate": 9.318815871047737e-08, + "loss": 1.0571, + "step": 24050 + }, + { + "epoch": 1.8644658839939556, + "grad_norm": 1.317177453797997, + "learning_rate": 9.322690638561687e-08, + "loss": 1.0928, + "step": 24060 + }, + { + "epoch": 1.8652408074702622, + "grad_norm": 1.301621050709291, + "learning_rate": 9.326565406075636e-08, + "loss": 1.0777, + "step": 24070 + }, + { + "epoch": 1.866015730946569, + "grad_norm": 1.355779927180908, + "learning_rate": 9.330440173589586e-08, + "loss": 1.0553, + "step": 24080 + }, + { + "epoch": 1.8667906544228758, + "grad_norm": 1.4159272811637766, + "learning_rate": 9.334314941103534e-08, + "loss": 1.0747, + "step": 24090 + }, + { + "epoch": 1.8675655778991824, + "grad_norm": 1.5006072218471886, + "learning_rate": 9.338189708617483e-08, + "loss": 1.0474, + "step": 24100 + }, + { + "epoch": 1.868340501375489, + "grad_norm": 1.3333485702546355, + "learning_rate": 9.342064476131433e-08, + "loss": 1.0581, + "step": 24110 + }, + { + "epoch": 1.8691154248517958, + "grad_norm": 1.4688485207129505, + "learning_rate": 9.345939243645381e-08, + "loss": 1.0868, + "step": 24120 + }, + { + "epoch": 1.8698903483281026, + "grad_norm": 1.3005029405629198, + "learning_rate": 9.349814011159331e-08, + "loss": 1.0665, + "step": 24130 + }, + { + "epoch": 1.8706652718044094, + "grad_norm": 1.3269318224760427, + "learning_rate": 9.35368877867328e-08, + "loss": 1.0729, + "step": 24140 + }, + { + "epoch": 1.871440195280716, + "grad_norm": 1.359169709468867, + "learning_rate": 9.357563546187229e-08, + "loss": 1.0911, + "step": 24150 + }, + { + "epoch": 1.8722151187570226, + "grad_norm": 1.361169727979565, + "learning_rate": 9.361438313701178e-08, + "loss": 1.0654, + "step": 24160 + }, + { + "epoch": 1.8729900422333294, + "grad_norm": 1.341851781319987, + "learning_rate": 9.365313081215128e-08, + "loss": 1.0485, + "step": 24170 + }, + { + "epoch": 1.8737649657096362, + "grad_norm": 1.2804989679039003, + "learning_rate": 9.369187848729077e-08, + "loss": 1.0868, + "step": 24180 + }, + { + "epoch": 1.874539889185943, + "grad_norm": 1.3955965930339844, + "learning_rate": 9.373062616243025e-08, + "loss": 1.0738, + "step": 24190 + }, + { + "epoch": 1.8753148126622496, + "grad_norm": 1.402712279467875, + "learning_rate": 9.376937383756975e-08, + "loss": 1.0819, + "step": 24200 + }, + { + "epoch": 1.8760897361385562, + "grad_norm": 1.3663535511575116, + "learning_rate": 9.380812151270925e-08, + "loss": 1.0735, + "step": 24210 + }, + { + "epoch": 1.876864659614863, + "grad_norm": 1.3243137870073713, + "learning_rate": 9.384686918784873e-08, + "loss": 1.0664, + "step": 24220 + }, + { + "epoch": 1.8776395830911699, + "grad_norm": 1.3574204173321194, + "learning_rate": 9.388561686298822e-08, + "loss": 1.0556, + "step": 24230 + }, + { + "epoch": 1.8784145065674764, + "grad_norm": 1.320735415486837, + "learning_rate": 9.392436453812772e-08, + "loss": 1.0741, + "step": 24240 + }, + { + "epoch": 1.879189430043783, + "grad_norm": 1.3882598627913412, + "learning_rate": 9.396311221326721e-08, + "loss": 1.0729, + "step": 24250 + }, + { + "epoch": 1.8799643535200898, + "grad_norm": 1.3712309370084104, + "learning_rate": 9.40018598884067e-08, + "loss": 1.0693, + "step": 24260 + }, + { + "epoch": 1.8807392769963966, + "grad_norm": 1.4622984034987498, + "learning_rate": 9.404060756354619e-08, + "loss": 1.0748, + "step": 24270 + }, + { + "epoch": 1.8815142004727035, + "grad_norm": 1.3212453384940444, + "learning_rate": 9.407935523868569e-08, + "loss": 1.0611, + "step": 24280 + }, + { + "epoch": 1.88228912394901, + "grad_norm": 1.368646064490842, + "learning_rate": 9.411810291382517e-08, + "loss": 1.0721, + "step": 24290 + }, + { + "epoch": 1.8830640474253166, + "grad_norm": 1.3294480549630907, + "learning_rate": 9.415685058896467e-08, + "loss": 1.0531, + "step": 24300 + }, + { + "epoch": 1.8838389709016234, + "grad_norm": 1.361237438717162, + "learning_rate": 9.419559826410416e-08, + "loss": 1.037, + "step": 24310 + }, + { + "epoch": 1.8846138943779303, + "grad_norm": 1.3780096688805963, + "learning_rate": 9.423434593924364e-08, + "loss": 1.0597, + "step": 24320 + }, + { + "epoch": 1.8853888178542368, + "grad_norm": 1.365741125421088, + "learning_rate": 9.427309361438314e-08, + "loss": 1.0693, + "step": 24330 + }, + { + "epoch": 1.8861637413305437, + "grad_norm": 1.3645645579208285, + "learning_rate": 9.431184128952263e-08, + "loss": 1.0957, + "step": 24340 + }, + { + "epoch": 1.8869386648068502, + "grad_norm": 1.2923015548883263, + "learning_rate": 9.435058896466213e-08, + "loss": 1.1099, + "step": 24350 + }, + { + "epoch": 1.887713588283157, + "grad_norm": 1.3715070938799403, + "learning_rate": 9.438933663980161e-08, + "loss": 1.0675, + "step": 24360 + }, + { + "epoch": 1.8884885117594639, + "grad_norm": 1.3426486540903992, + "learning_rate": 9.442808431494111e-08, + "loss": 1.052, + "step": 24370 + }, + { + "epoch": 1.8892634352357704, + "grad_norm": 1.2590287518466459, + "learning_rate": 9.44668319900806e-08, + "loss": 1.0491, + "step": 24380 + }, + { + "epoch": 1.890038358712077, + "grad_norm": 1.302325713783644, + "learning_rate": 9.450557966522009e-08, + "loss": 1.1089, + "step": 24390 + }, + { + "epoch": 1.8908132821883838, + "grad_norm": 1.3269541187696456, + "learning_rate": 9.454432734035958e-08, + "loss": 1.0629, + "step": 24400 + }, + { + "epoch": 1.8915882056646907, + "grad_norm": 1.372626864746578, + "learning_rate": 9.458307501549908e-08, + "loss": 1.0616, + "step": 24410 + }, + { + "epoch": 1.8923631291409975, + "grad_norm": 1.324797074210446, + "learning_rate": 9.462182269063857e-08, + "loss": 1.083, + "step": 24420 + }, + { + "epoch": 1.893138052617304, + "grad_norm": 1.3420042300967026, + "learning_rate": 9.466057036577805e-08, + "loss": 1.0757, + "step": 24430 + }, + { + "epoch": 1.8939129760936106, + "grad_norm": 1.3236302632635883, + "learning_rate": 9.469931804091755e-08, + "loss": 1.0839, + "step": 24440 + }, + { + "epoch": 1.8946878995699175, + "grad_norm": 1.3087890315287727, + "learning_rate": 9.473806571605705e-08, + "loss": 1.0761, + "step": 24450 + }, + { + "epoch": 1.8954628230462243, + "grad_norm": 1.426059101073239, + "learning_rate": 9.477681339119653e-08, + "loss": 1.0632, + "step": 24460 + }, + { + "epoch": 1.8962377465225309, + "grad_norm": 1.2829255635199475, + "learning_rate": 9.481556106633602e-08, + "loss": 1.0711, + "step": 24470 + }, + { + "epoch": 1.8970126699988377, + "grad_norm": 1.3107781086186738, + "learning_rate": 9.485430874147552e-08, + "loss": 1.0413, + "step": 24480 + }, + { + "epoch": 1.8977875934751443, + "grad_norm": 1.3911642868210456, + "learning_rate": 9.4893056416615e-08, + "loss": 1.0574, + "step": 24490 + }, + { + "epoch": 1.898562516951451, + "grad_norm": 1.456730844475671, + "learning_rate": 9.49318040917545e-08, + "loss": 1.0555, + "step": 24500 + }, + { + "epoch": 1.898562516951451, + "eval_loss": 1.0651272535324097, + "eval_runtime": 319.6302, + "eval_samples_per_second": 35.888, + "eval_steps_per_second": 8.973, + "step": 24500 + }, + { + "epoch": 1.8993374404277579, + "grad_norm": 1.3605912399914442, + "learning_rate": 9.497055176689399e-08, + "loss": 1.0762, + "step": 24510 + }, + { + "epoch": 1.9001123639040645, + "grad_norm": 1.3920489546499117, + "learning_rate": 9.500929944203349e-08, + "loss": 1.0591, + "step": 24520 + }, + { + "epoch": 1.900887287380371, + "grad_norm": 1.2267705389126564, + "learning_rate": 9.504804711717297e-08, + "loss": 1.063, + "step": 24530 + }, + { + "epoch": 1.9016622108566779, + "grad_norm": 1.3160227797110045, + "learning_rate": 9.508679479231247e-08, + "loss": 1.0723, + "step": 24540 + }, + { + "epoch": 1.9024371343329847, + "grad_norm": 1.4263619591849828, + "learning_rate": 9.512554246745196e-08, + "loss": 1.0704, + "step": 24550 + }, + { + "epoch": 1.9032120578092915, + "grad_norm": 1.4349401932369095, + "learning_rate": 9.516429014259144e-08, + "loss": 1.0659, + "step": 24560 + }, + { + "epoch": 1.903986981285598, + "grad_norm": 1.3472661014828842, + "learning_rate": 9.520303781773094e-08, + "loss": 1.0765, + "step": 24570 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 1.4776810989409328, + "learning_rate": 9.524178549287043e-08, + "loss": 1.0694, + "step": 24580 + }, + { + "epoch": 1.9055368282382115, + "grad_norm": 1.2880129770921735, + "learning_rate": 9.528053316800993e-08, + "loss": 1.0576, + "step": 24590 + }, + { + "epoch": 1.9063117517145183, + "grad_norm": 1.2902969628083518, + "learning_rate": 9.531928084314941e-08, + "loss": 1.0439, + "step": 24600 + }, + { + "epoch": 1.9070866751908249, + "grad_norm": 1.3487029244794366, + "learning_rate": 9.535802851828891e-08, + "loss": 1.0948, + "step": 24610 + }, + { + "epoch": 1.9078615986671317, + "grad_norm": 1.400008351098338, + "learning_rate": 9.53967761934284e-08, + "loss": 1.0946, + "step": 24620 + }, + { + "epoch": 1.9086365221434383, + "grad_norm": 1.3615590761952763, + "learning_rate": 9.543552386856788e-08, + "loss": 1.0923, + "step": 24630 + }, + { + "epoch": 1.909411445619745, + "grad_norm": 1.4550855282901327, + "learning_rate": 9.547427154370738e-08, + "loss": 1.0796, + "step": 24640 + }, + { + "epoch": 1.9101863690960519, + "grad_norm": 1.4519495785266507, + "learning_rate": 9.551301921884688e-08, + "loss": 1.058, + "step": 24650 + }, + { + "epoch": 1.9109612925723585, + "grad_norm": 1.2970683302211854, + "learning_rate": 9.555176689398636e-08, + "loss": 1.0751, + "step": 24660 + }, + { + "epoch": 1.911736216048665, + "grad_norm": 1.3682812991423678, + "learning_rate": 9.559051456912585e-08, + "loss": 1.0763, + "step": 24670 + }, + { + "epoch": 1.9125111395249719, + "grad_norm": 1.4875802592251177, + "learning_rate": 9.562926224426535e-08, + "loss": 1.0901, + "step": 24680 + }, + { + "epoch": 1.9132860630012787, + "grad_norm": 1.3441737941616807, + "learning_rate": 9.566800991940484e-08, + "loss": 1.0714, + "step": 24690 + }, + { + "epoch": 1.9140609864775855, + "grad_norm": 1.3822908119992232, + "learning_rate": 9.570675759454433e-08, + "loss": 1.0673, + "step": 24700 + }, + { + "epoch": 1.914835909953892, + "grad_norm": 1.2772434945481377, + "learning_rate": 9.574550526968382e-08, + "loss": 1.0535, + "step": 24710 + }, + { + "epoch": 1.9156108334301987, + "grad_norm": 1.3779943211752206, + "learning_rate": 9.578425294482332e-08, + "loss": 1.0577, + "step": 24720 + }, + { + "epoch": 1.9163857569065055, + "grad_norm": 1.292951593305023, + "learning_rate": 9.58230006199628e-08, + "loss": 1.0548, + "step": 24730 + }, + { + "epoch": 1.9171606803828123, + "grad_norm": 1.3274550803977025, + "learning_rate": 9.58617482951023e-08, + "loss": 1.0756, + "step": 24740 + }, + { + "epoch": 1.9179356038591189, + "grad_norm": 1.3598582504141894, + "learning_rate": 9.590049597024179e-08, + "loss": 1.0694, + "step": 24750 + }, + { + "epoch": 1.9187105273354255, + "grad_norm": 1.303508973257161, + "learning_rate": 9.593924364538129e-08, + "loss": 1.0699, + "step": 24760 + }, + { + "epoch": 1.9194854508117323, + "grad_norm": 1.3258127485816136, + "learning_rate": 9.597799132052077e-08, + "loss": 1.0665, + "step": 24770 + }, + { + "epoch": 1.920260374288039, + "grad_norm": 1.3008710837656707, + "learning_rate": 9.601673899566026e-08, + "loss": 1.0442, + "step": 24780 + }, + { + "epoch": 1.921035297764346, + "grad_norm": 1.2870553314292834, + "learning_rate": 9.605548667079976e-08, + "loss": 1.0416, + "step": 24790 + }, + { + "epoch": 1.9218102212406525, + "grad_norm": 1.2919139015174708, + "learning_rate": 9.609423434593924e-08, + "loss": 1.0513, + "step": 24800 + }, + { + "epoch": 1.922585144716959, + "grad_norm": 1.3543956859094162, + "learning_rate": 9.613298202107874e-08, + "loss": 1.0546, + "step": 24810 + }, + { + "epoch": 1.9233600681932659, + "grad_norm": 1.356540325279654, + "learning_rate": 9.617172969621823e-08, + "loss": 1.0745, + "step": 24820 + }, + { + "epoch": 1.9241349916695727, + "grad_norm": 1.4317735093283444, + "learning_rate": 9.621047737135772e-08, + "loss": 1.0636, + "step": 24830 + }, + { + "epoch": 1.9249099151458795, + "grad_norm": 1.2950823674627134, + "learning_rate": 9.624922504649721e-08, + "loss": 1.0409, + "step": 24840 + }, + { + "epoch": 1.925684838622186, + "grad_norm": 1.2911011624003923, + "learning_rate": 9.628797272163671e-08, + "loss": 1.0328, + "step": 24850 + }, + { + "epoch": 1.9264597620984927, + "grad_norm": 1.2968869708148518, + "learning_rate": 9.63267203967762e-08, + "loss": 1.0479, + "step": 24860 + }, + { + "epoch": 1.9272346855747995, + "grad_norm": 1.3030782622280876, + "learning_rate": 9.636546807191568e-08, + "loss": 1.0696, + "step": 24870 + }, + { + "epoch": 1.9280096090511063, + "grad_norm": 1.3936652370137432, + "learning_rate": 9.640421574705518e-08, + "loss": 1.0495, + "step": 24880 + }, + { + "epoch": 1.9287845325274129, + "grad_norm": 1.4024146788527618, + "learning_rate": 9.644296342219468e-08, + "loss": 1.0545, + "step": 24890 + }, + { + "epoch": 1.9295594560037195, + "grad_norm": 1.348703607876724, + "learning_rate": 9.648171109733416e-08, + "loss": 1.0658, + "step": 24900 + }, + { + "epoch": 1.9303343794800263, + "grad_norm": 1.383670389571402, + "learning_rate": 9.652045877247365e-08, + "loss": 1.0568, + "step": 24910 + }, + { + "epoch": 1.931109302956333, + "grad_norm": 1.370620422406879, + "learning_rate": 9.655920644761315e-08, + "loss": 1.0667, + "step": 24920 + }, + { + "epoch": 1.93188422643264, + "grad_norm": 1.2432264399287916, + "learning_rate": 9.659795412275264e-08, + "loss": 1.0611, + "step": 24930 + }, + { + "epoch": 1.9326591499089465, + "grad_norm": 1.2600766519505393, + "learning_rate": 9.663670179789213e-08, + "loss": 1.0433, + "step": 24940 + }, + { + "epoch": 1.933434073385253, + "grad_norm": 1.2629532641564882, + "learning_rate": 9.667544947303162e-08, + "loss": 1.0311, + "step": 24950 + }, + { + "epoch": 1.93420899686156, + "grad_norm": 1.5275587428691786, + "learning_rate": 9.671419714817112e-08, + "loss": 1.0811, + "step": 24960 + }, + { + "epoch": 1.9349839203378667, + "grad_norm": 1.2430014459929006, + "learning_rate": 9.67529448233106e-08, + "loss": 1.0417, + "step": 24970 + }, + { + "epoch": 1.9357588438141735, + "grad_norm": 1.3029901825310417, + "learning_rate": 9.67916924984501e-08, + "loss": 1.0833, + "step": 24980 + }, + { + "epoch": 1.93653376729048, + "grad_norm": 1.413866735168716, + "learning_rate": 9.683044017358959e-08, + "loss": 1.0754, + "step": 24990 + }, + { + "epoch": 1.9373086907667867, + "grad_norm": 1.360411199127255, + "learning_rate": 9.686918784872909e-08, + "loss": 1.0669, + "step": 25000 + }, + { + "epoch": 1.9373086907667867, + "eval_loss": 1.0621033906936646, + "eval_runtime": 319.6685, + "eval_samples_per_second": 35.884, + "eval_steps_per_second": 8.972, + "step": 25000 + }, + { + "epoch": 1.9380836142430935, + "grad_norm": 1.3139271437864857, + "learning_rate": 9.690793552386857e-08, + "loss": 1.0422, + "step": 25010 + }, + { + "epoch": 1.9388585377194003, + "grad_norm": 1.401052550526291, + "learning_rate": 9.694668319900806e-08, + "loss": 1.0635, + "step": 25020 + }, + { + "epoch": 1.939633461195707, + "grad_norm": 1.5044108984380702, + "learning_rate": 9.698543087414756e-08, + "loss": 1.0826, + "step": 25030 + }, + { + "epoch": 1.9404083846720135, + "grad_norm": 1.3654795778569835, + "learning_rate": 9.702417854928704e-08, + "loss": 1.0545, + "step": 25040 + }, + { + "epoch": 1.9411833081483203, + "grad_norm": 1.3818750519426741, + "learning_rate": 9.706292622442654e-08, + "loss": 1.0644, + "step": 25050 + }, + { + "epoch": 1.941958231624627, + "grad_norm": 1.3299682564634592, + "learning_rate": 9.710167389956603e-08, + "loss": 1.0424, + "step": 25060 + }, + { + "epoch": 1.942733155100934, + "grad_norm": 1.4019376646959145, + "learning_rate": 9.714042157470552e-08, + "loss": 1.0712, + "step": 25070 + }, + { + "epoch": 1.9435080785772405, + "grad_norm": 1.3244431701665633, + "learning_rate": 9.717916924984501e-08, + "loss": 1.0745, + "step": 25080 + }, + { + "epoch": 1.944283002053547, + "grad_norm": 1.3603113670051563, + "learning_rate": 9.72179169249845e-08, + "loss": 1.0502, + "step": 25090 + }, + { + "epoch": 1.945057925529854, + "grad_norm": 1.4617676287587102, + "learning_rate": 9.7256664600124e-08, + "loss": 1.0431, + "step": 25100 + }, + { + "epoch": 1.9458328490061607, + "grad_norm": 1.2995390020302742, + "learning_rate": 9.729541227526348e-08, + "loss": 1.0802, + "step": 25110 + }, + { + "epoch": 1.9466077724824673, + "grad_norm": 1.35226756489035, + "learning_rate": 9.733415995040298e-08, + "loss": 1.0467, + "step": 25120 + }, + { + "epoch": 1.9473826959587741, + "grad_norm": 1.405357938816198, + "learning_rate": 9.737290762554248e-08, + "loss": 1.0709, + "step": 25130 + }, + { + "epoch": 1.9481576194350807, + "grad_norm": 1.3246190656008978, + "learning_rate": 9.741165530068196e-08, + "loss": 1.0757, + "step": 25140 + }, + { + "epoch": 1.9489325429113875, + "grad_norm": 1.2871951898243605, + "learning_rate": 9.745040297582145e-08, + "loss": 1.0668, + "step": 25150 + }, + { + "epoch": 1.9497074663876943, + "grad_norm": 2.1088696179076583, + "learning_rate": 9.748915065096095e-08, + "loss": 1.054, + "step": 25160 + }, + { + "epoch": 1.950482389864001, + "grad_norm": 1.2999393276819218, + "learning_rate": 9.752789832610044e-08, + "loss": 1.0538, + "step": 25170 + }, + { + "epoch": 1.9512573133403075, + "grad_norm": 1.3129913479039201, + "learning_rate": 9.756664600123993e-08, + "loss": 1.0672, + "step": 25180 + }, + { + "epoch": 1.9520322368166143, + "grad_norm": 1.3254999093458177, + "learning_rate": 9.760539367637942e-08, + "loss": 1.0882, + "step": 25190 + }, + { + "epoch": 1.9528071602929211, + "grad_norm": 1.2822099992356348, + "learning_rate": 9.764414135151892e-08, + "loss": 1.0956, + "step": 25200 + }, + { + "epoch": 1.953582083769228, + "grad_norm": 1.3790405575964095, + "learning_rate": 9.76828890266584e-08, + "loss": 1.0665, + "step": 25210 + }, + { + "epoch": 1.9543570072455345, + "grad_norm": 1.4389261451132598, + "learning_rate": 9.77216367017979e-08, + "loss": 1.0664, + "step": 25220 + }, + { + "epoch": 1.955131930721841, + "grad_norm": 1.281848101751462, + "learning_rate": 9.776038437693739e-08, + "loss": 1.0734, + "step": 25230 + }, + { + "epoch": 1.955906854198148, + "grad_norm": 1.4056734581407069, + "learning_rate": 9.779913205207687e-08, + "loss": 1.0592, + "step": 25240 + }, + { + "epoch": 1.9566817776744547, + "grad_norm": 1.2756747282931884, + "learning_rate": 9.783787972721637e-08, + "loss": 1.0643, + "step": 25250 + }, + { + "epoch": 1.9574567011507613, + "grad_norm": 1.4073324317564715, + "learning_rate": 9.787662740235586e-08, + "loss": 1.057, + "step": 25260 + }, + { + "epoch": 1.9582316246270681, + "grad_norm": 1.368763950351524, + "learning_rate": 9.791537507749536e-08, + "loss": 1.0837, + "step": 25270 + }, + { + "epoch": 1.9590065481033747, + "grad_norm": 1.3536827245791736, + "learning_rate": 9.795412275263484e-08, + "loss": 1.0659, + "step": 25280 + }, + { + "epoch": 1.9597814715796815, + "grad_norm": 1.3643213014616218, + "learning_rate": 9.799287042777434e-08, + "loss": 1.0738, + "step": 25290 + }, + { + "epoch": 1.9605563950559883, + "grad_norm": 1.2926574919815508, + "learning_rate": 9.803161810291383e-08, + "loss": 1.0852, + "step": 25300 + }, + { + "epoch": 1.961331318532295, + "grad_norm": 1.276102202455619, + "learning_rate": 9.807036577805331e-08, + "loss": 1.0855, + "step": 25310 + }, + { + "epoch": 1.9621062420086015, + "grad_norm": 1.288768748049967, + "learning_rate": 9.810911345319281e-08, + "loss": 1.0548, + "step": 25320 + }, + { + "epoch": 1.9628811654849083, + "grad_norm": 1.2856991926689318, + "learning_rate": 9.81478611283323e-08, + "loss": 1.044, + "step": 25330 + }, + { + "epoch": 1.9636560889612151, + "grad_norm": 1.3207050461467997, + "learning_rate": 9.81866088034718e-08, + "loss": 1.0507, + "step": 25340 + }, + { + "epoch": 1.964431012437522, + "grad_norm": 1.306122895924902, + "learning_rate": 9.822535647861128e-08, + "loss": 1.0572, + "step": 25350 + }, + { + "epoch": 1.9652059359138285, + "grad_norm": 1.3112849952853585, + "learning_rate": 9.826410415375078e-08, + "loss": 1.0656, + "step": 25360 + }, + { + "epoch": 1.9659808593901351, + "grad_norm": 1.2644863720846147, + "learning_rate": 9.830285182889027e-08, + "loss": 1.0704, + "step": 25370 + }, + { + "epoch": 1.966755782866442, + "grad_norm": 1.356201599759232, + "learning_rate": 9.834159950402976e-08, + "loss": 1.0549, + "step": 25380 + }, + { + "epoch": 1.9675307063427487, + "grad_norm": 1.3448590338975426, + "learning_rate": 9.838034717916925e-08, + "loss": 1.0449, + "step": 25390 + }, + { + "epoch": 1.9683056298190553, + "grad_norm": 1.3626493763742396, + "learning_rate": 9.841909485430875e-08, + "loss": 1.0662, + "step": 25400 + }, + { + "epoch": 1.9690805532953621, + "grad_norm": 1.3936419468066912, + "learning_rate": 9.845784252944823e-08, + "loss": 1.0471, + "step": 25410 + }, + { + "epoch": 1.9698554767716687, + "grad_norm": 1.3096670596610864, + "learning_rate": 9.849659020458773e-08, + "loss": 1.0734, + "step": 25420 + }, + { + "epoch": 1.9706304002479755, + "grad_norm": 1.3172735013082397, + "learning_rate": 9.853533787972722e-08, + "loss": 1.0492, + "step": 25430 + }, + { + "epoch": 1.9714053237242823, + "grad_norm": 1.2701138432341148, + "learning_rate": 9.857408555486672e-08, + "loss": 1.0383, + "step": 25440 + }, + { + "epoch": 1.972180247200589, + "grad_norm": 1.3269266565769364, + "learning_rate": 9.86128332300062e-08, + "loss": 1.0385, + "step": 25450 + }, + { + "epoch": 1.9729551706768955, + "grad_norm": 1.3946881344453166, + "learning_rate": 9.86515809051457e-08, + "loss": 1.078, + "step": 25460 + }, + { + "epoch": 1.9737300941532023, + "grad_norm": 1.3361713346127337, + "learning_rate": 9.869032858028519e-08, + "loss": 1.0859, + "step": 25470 + }, + { + "epoch": 1.9745050176295091, + "grad_norm": 1.3303652693115022, + "learning_rate": 9.872907625542467e-08, + "loss": 1.0348, + "step": 25480 + }, + { + "epoch": 1.975279941105816, + "grad_norm": 1.320547940893084, + "learning_rate": 9.876782393056417e-08, + "loss": 1.0508, + "step": 25490 + }, + { + "epoch": 1.9760548645821225, + "grad_norm": 1.363385982139487, + "learning_rate": 9.880657160570366e-08, + "loss": 1.0653, + "step": 25500 + }, + { + "epoch": 1.9760548645821225, + "eval_loss": 1.0590498447418213, + "eval_runtime": 315.8507, + "eval_samples_per_second": 36.318, + "eval_steps_per_second": 9.08, + "step": 25500 + }, + { + "epoch": 1.9768297880584291, + "grad_norm": 1.3687440018452282, + "learning_rate": 9.884531928084316e-08, + "loss": 1.0367, + "step": 25510 + }, + { + "epoch": 1.977604711534736, + "grad_norm": 1.3759766931663129, + "learning_rate": 9.888406695598264e-08, + "loss": 1.0672, + "step": 25520 + }, + { + "epoch": 1.9783796350110427, + "grad_norm": 1.3257466534785163, + "learning_rate": 9.892281463112214e-08, + "loss": 1.0649, + "step": 25530 + }, + { + "epoch": 1.9791545584873493, + "grad_norm": 1.402137910898998, + "learning_rate": 9.896156230626163e-08, + "loss": 1.0657, + "step": 25540 + }, + { + "epoch": 1.979929481963656, + "grad_norm": 1.3360538141134246, + "learning_rate": 9.900030998140111e-08, + "loss": 1.0352, + "step": 25550 + }, + { + "epoch": 1.9807044054399627, + "grad_norm": 1.3924267443216896, + "learning_rate": 9.903905765654061e-08, + "loss": 1.0728, + "step": 25560 + }, + { + "epoch": 1.9814793289162695, + "grad_norm": 1.2869273444132057, + "learning_rate": 9.90778053316801e-08, + "loss": 1.0538, + "step": 25570 + }, + { + "epoch": 1.9822542523925764, + "grad_norm": 1.3491909162543712, + "learning_rate": 9.911655300681959e-08, + "loss": 1.0543, + "step": 25580 + }, + { + "epoch": 1.983029175868883, + "grad_norm": 1.402726996598221, + "learning_rate": 9.915530068195908e-08, + "loss": 1.0702, + "step": 25590 + }, + { + "epoch": 1.9838040993451895, + "grad_norm": 1.3340709053115571, + "learning_rate": 9.919404835709858e-08, + "loss": 1.0723, + "step": 25600 + }, + { + "epoch": 1.9845790228214963, + "grad_norm": 1.324119220040636, + "learning_rate": 9.923279603223807e-08, + "loss": 1.0584, + "step": 25610 + }, + { + "epoch": 1.9853539462978032, + "grad_norm": 1.452882684811193, + "learning_rate": 9.927154370737756e-08, + "loss": 1.0551, + "step": 25620 + }, + { + "epoch": 1.98612886977411, + "grad_norm": 1.3617517137569057, + "learning_rate": 9.931029138251705e-08, + "loss": 1.0494, + "step": 25630 + }, + { + "epoch": 1.9869037932504166, + "grad_norm": 1.308769326522026, + "learning_rate": 9.934903905765655e-08, + "loss": 1.066, + "step": 25640 + }, + { + "epoch": 1.9876787167267231, + "grad_norm": 1.3363249370320944, + "learning_rate": 9.938778673279603e-08, + "loss": 1.0555, + "step": 25650 + }, + { + "epoch": 1.98845364020303, + "grad_norm": 1.3164095767930009, + "learning_rate": 9.942653440793553e-08, + "loss": 1.0456, + "step": 25660 + }, + { + "epoch": 1.9892285636793368, + "grad_norm": 1.4346769060337743, + "learning_rate": 9.946528208307502e-08, + "loss": 1.0671, + "step": 25670 + }, + { + "epoch": 1.9900034871556433, + "grad_norm": 1.375166387201116, + "learning_rate": 9.950402975821452e-08, + "loss": 1.0416, + "step": 25680 + }, + { + "epoch": 1.99077841063195, + "grad_norm": 1.412229514864043, + "learning_rate": 9.9542777433354e-08, + "loss": 1.0776, + "step": 25690 + }, + { + "epoch": 1.9915533341082567, + "grad_norm": 1.307229741685703, + "learning_rate": 9.95815251084935e-08, + "loss": 1.0462, + "step": 25700 + }, + { + "epoch": 1.9923282575845636, + "grad_norm": 1.3806992598035435, + "learning_rate": 9.962027278363299e-08, + "loss": 1.0618, + "step": 25710 + }, + { + "epoch": 1.9931031810608704, + "grad_norm": 1.441212894035071, + "learning_rate": 9.965902045877247e-08, + "loss": 1.0792, + "step": 25720 + }, + { + "epoch": 1.993878104537177, + "grad_norm": 1.2972224847868794, + "learning_rate": 9.969776813391197e-08, + "loss": 1.0454, + "step": 25730 + }, + { + "epoch": 1.9946530280134835, + "grad_norm": 1.306301069438066, + "learning_rate": 9.973651580905146e-08, + "loss": 1.0495, + "step": 25740 + }, + { + "epoch": 1.9954279514897904, + "grad_norm": 1.3769393272441528, + "learning_rate": 9.977526348419095e-08, + "loss": 1.0448, + "step": 25750 + }, + { + "epoch": 1.9962028749660972, + "grad_norm": 1.3764254077198566, + "learning_rate": 9.981401115933044e-08, + "loss": 1.0597, + "step": 25760 + }, + { + "epoch": 1.996977798442404, + "grad_norm": 1.3353095296372113, + "learning_rate": 9.985275883446994e-08, + "loss": 1.0457, + "step": 25770 + }, + { + "epoch": 1.9977527219187106, + "grad_norm": 1.3700804638075894, + "learning_rate": 9.989150650960943e-08, + "loss": 1.047, + "step": 25780 + }, + { + "epoch": 1.9985276453950171, + "grad_norm": 1.308909776599131, + "learning_rate": 9.993025418474891e-08, + "loss": 1.0843, + "step": 25790 + }, + { + "epoch": 1.999302568871324, + "grad_norm": 1.2909176712279875, + "learning_rate": 9.996900185988841e-08, + "loss": 1.0656, + "step": 25800 + }, + { + "epoch": 2.0000774923476308, + "grad_norm": 1.2916472128375247, + "learning_rate": 1.000077495350279e-07, + "loss": 1.0655, + "step": 25810 + }, + { + "epoch": 2.0008524158239376, + "grad_norm": 1.4559521536885, + "learning_rate": 1.0004649721016739e-07, + "loss": 1.0624, + "step": 25820 + }, + { + "epoch": 2.001627339300244, + "grad_norm": 1.2875595384186709, + "learning_rate": 1.0008524488530688e-07, + "loss": 1.0576, + "step": 25830 + }, + { + "epoch": 2.0024022627765508, + "grad_norm": 1.333177855030872, + "learning_rate": 1.0012399256044638e-07, + "loss": 1.1008, + "step": 25840 + }, + { + "epoch": 2.0031771862528576, + "grad_norm": 1.2908212529866323, + "learning_rate": 1.0016274023558587e-07, + "loss": 1.0779, + "step": 25850 + }, + { + "epoch": 2.0039521097291644, + "grad_norm": 1.2640887152275933, + "learning_rate": 1.0020148791072536e-07, + "loss": 1.0603, + "step": 25860 + }, + { + "epoch": 2.0047270332054707, + "grad_norm": 1.324904780472473, + "learning_rate": 1.0024023558586485e-07, + "loss": 1.0709, + "step": 25870 + }, + { + "epoch": 2.0055019566817776, + "grad_norm": 1.407109624325278, + "learning_rate": 1.0027898326100435e-07, + "loss": 1.0446, + "step": 25880 + }, + { + "epoch": 2.0062768801580844, + "grad_norm": 1.2856571835320023, + "learning_rate": 1.0031773093614383e-07, + "loss": 1.0555, + "step": 25890 + }, + { + "epoch": 2.007051803634391, + "grad_norm": 1.3726770222926024, + "learning_rate": 1.0035647861128333e-07, + "loss": 1.0524, + "step": 25900 + }, + { + "epoch": 2.007826727110698, + "grad_norm": 1.3480328898138625, + "learning_rate": 1.0039522628642282e-07, + "loss": 1.05, + "step": 25910 + }, + { + "epoch": 2.0086016505870044, + "grad_norm": 1.3955278274637943, + "learning_rate": 1.0043397396156232e-07, + "loss": 1.0722, + "step": 25920 + }, + { + "epoch": 2.009376574063311, + "grad_norm": 1.3304089387934308, + "learning_rate": 1.004727216367018e-07, + "loss": 1.078, + "step": 25930 + }, + { + "epoch": 2.010151497539618, + "grad_norm": 1.3096546562512377, + "learning_rate": 1.005114693118413e-07, + "loss": 1.0542, + "step": 25940 + }, + { + "epoch": 2.010926421015925, + "grad_norm": 1.3361346145272002, + "learning_rate": 1.0055021698698079e-07, + "loss": 1.0649, + "step": 25950 + }, + { + "epoch": 2.0117013444922316, + "grad_norm": 1.3923755128145983, + "learning_rate": 1.0058896466212027e-07, + "loss": 1.0774, + "step": 25960 + }, + { + "epoch": 2.012476267968538, + "grad_norm": 1.3240932861007966, + "learning_rate": 1.0062771233725977e-07, + "loss": 1.0181, + "step": 25970 + }, + { + "epoch": 2.0132511914448448, + "grad_norm": 1.4225267019553942, + "learning_rate": 1.0066646001239926e-07, + "loss": 1.0548, + "step": 25980 + }, + { + "epoch": 2.0140261149211516, + "grad_norm": 1.2246273204718, + "learning_rate": 1.0070520768753874e-07, + "loss": 1.0642, + "step": 25990 + }, + { + "epoch": 2.0148010383974584, + "grad_norm": 1.37018855990621, + "learning_rate": 1.0074395536267824e-07, + "loss": 1.0602, + "step": 26000 + }, + { + "epoch": 2.0148010383974584, + "eval_loss": 1.0561400651931763, + "eval_runtime": 316.618, + "eval_samples_per_second": 36.23, + "eval_steps_per_second": 9.058, + "step": 26000 + }, + { + "epoch": 2.0155759618737648, + "grad_norm": 1.349243456842036, + "learning_rate": 1.0078270303781774e-07, + "loss": 1.0692, + "step": 26010 + }, + { + "epoch": 2.0163508853500716, + "grad_norm": 1.2718168536621763, + "learning_rate": 1.0082145071295723e-07, + "loss": 1.0539, + "step": 26020 + }, + { + "epoch": 2.0171258088263784, + "grad_norm": 1.3910660604572702, + "learning_rate": 1.0086019838809671e-07, + "loss": 1.0378, + "step": 26030 + }, + { + "epoch": 2.017900732302685, + "grad_norm": 1.353055321467941, + "learning_rate": 1.0089894606323621e-07, + "loss": 1.0523, + "step": 26040 + }, + { + "epoch": 2.018675655778992, + "grad_norm": 1.2724389641114828, + "learning_rate": 1.009376937383757e-07, + "loss": 1.0447, + "step": 26050 + }, + { + "epoch": 2.0194505792552984, + "grad_norm": 1.3119997375012882, + "learning_rate": 1.0097644141351519e-07, + "loss": 1.057, + "step": 26060 + }, + { + "epoch": 2.020225502731605, + "grad_norm": 1.3309366435858174, + "learning_rate": 1.0101518908865468e-07, + "loss": 1.0349, + "step": 26070 + }, + { + "epoch": 2.021000426207912, + "grad_norm": 1.309439896598238, + "learning_rate": 1.0105393676379418e-07, + "loss": 1.0612, + "step": 26080 + }, + { + "epoch": 2.021775349684219, + "grad_norm": 1.3867179317351568, + "learning_rate": 1.0109268443893367e-07, + "loss": 1.0578, + "step": 26090 + }, + { + "epoch": 2.0225502731605256, + "grad_norm": 1.46699208331649, + "learning_rate": 1.0113143211407316e-07, + "loss": 1.0636, + "step": 26100 + }, + { + "epoch": 2.023325196636832, + "grad_norm": 1.3570185690490457, + "learning_rate": 1.0117017978921265e-07, + "loss": 1.0911, + "step": 26110 + }, + { + "epoch": 2.024100120113139, + "grad_norm": 1.3802473217279085, + "learning_rate": 1.0120892746435215e-07, + "loss": 1.0406, + "step": 26120 + }, + { + "epoch": 2.0248750435894456, + "grad_norm": 1.446305033162465, + "learning_rate": 1.0124767513949163e-07, + "loss": 1.0553, + "step": 26130 + }, + { + "epoch": 2.0256499670657524, + "grad_norm": 1.3214677878272325, + "learning_rate": 1.0128642281463112e-07, + "loss": 1.0439, + "step": 26140 + }, + { + "epoch": 2.0264248905420588, + "grad_norm": 1.281861716418427, + "learning_rate": 1.0132517048977062e-07, + "loss": 1.0317, + "step": 26150 + }, + { + "epoch": 2.0271998140183656, + "grad_norm": 1.3484315307415164, + "learning_rate": 1.013639181649101e-07, + "loss": 1.0742, + "step": 26160 + }, + { + "epoch": 2.0279747374946724, + "grad_norm": 1.4636548793665654, + "learning_rate": 1.014026658400496e-07, + "loss": 1.0577, + "step": 26170 + }, + { + "epoch": 2.028749660970979, + "grad_norm": 1.3088561545186093, + "learning_rate": 1.014414135151891e-07, + "loss": 1.0471, + "step": 26180 + }, + { + "epoch": 2.029524584447286, + "grad_norm": 1.3116531507830425, + "learning_rate": 1.0148016119032859e-07, + "loss": 1.0446, + "step": 26190 + }, + { + "epoch": 2.0302995079235924, + "grad_norm": 1.31753457394349, + "learning_rate": 1.0151890886546807e-07, + "loss": 1.0402, + "step": 26200 + }, + { + "epoch": 2.031074431399899, + "grad_norm": 1.3359081257495764, + "learning_rate": 1.0155765654060757e-07, + "loss": 1.0472, + "step": 26210 + }, + { + "epoch": 2.031849354876206, + "grad_norm": 1.3637343761544352, + "learning_rate": 1.0159640421574706e-07, + "loss": 1.0373, + "step": 26220 + }, + { + "epoch": 2.032624278352513, + "grad_norm": 1.2959217733483452, + "learning_rate": 1.0163515189088654e-07, + "loss": 1.038, + "step": 26230 + }, + { + "epoch": 2.0333992018288196, + "grad_norm": 1.3444955379353687, + "learning_rate": 1.0167389956602604e-07, + "loss": 1.0333, + "step": 26240 + }, + { + "epoch": 2.034174125305126, + "grad_norm": 1.332508193876807, + "learning_rate": 1.0171264724116554e-07, + "loss": 1.0453, + "step": 26250 + }, + { + "epoch": 2.034949048781433, + "grad_norm": 1.3349898976994132, + "learning_rate": 1.0175139491630503e-07, + "loss": 1.0395, + "step": 26260 + }, + { + "epoch": 2.0357239722577396, + "grad_norm": 1.3385421965286166, + "learning_rate": 1.0179014259144451e-07, + "loss": 1.0595, + "step": 26270 + }, + { + "epoch": 2.0364988957340464, + "grad_norm": 1.3340672311256032, + "learning_rate": 1.0182889026658401e-07, + "loss": 1.0723, + "step": 26280 + }, + { + "epoch": 2.0372738192103528, + "grad_norm": 1.3092118626976952, + "learning_rate": 1.018676379417235e-07, + "loss": 1.0408, + "step": 26290 + }, + { + "epoch": 2.0380487426866596, + "grad_norm": 1.3214567510295026, + "learning_rate": 1.0190638561686299e-07, + "loss": 1.037, + "step": 26300 + }, + { + "epoch": 2.0388236661629664, + "grad_norm": 1.3107049384390992, + "learning_rate": 1.0194513329200248e-07, + "loss": 1.0504, + "step": 26310 + }, + { + "epoch": 2.039598589639273, + "grad_norm": 1.4327556248608828, + "learning_rate": 1.0198388096714198e-07, + "loss": 1.0787, + "step": 26320 + }, + { + "epoch": 2.04037351311558, + "grad_norm": 1.3626200317048438, + "learning_rate": 1.0202262864228146e-07, + "loss": 1.0817, + "step": 26330 + }, + { + "epoch": 2.0411484365918864, + "grad_norm": 1.3063746135022245, + "learning_rate": 1.0206137631742096e-07, + "loss": 1.0454, + "step": 26340 + }, + { + "epoch": 2.041923360068193, + "grad_norm": 1.3142304071563424, + "learning_rate": 1.0210012399256045e-07, + "loss": 1.0432, + "step": 26350 + }, + { + "epoch": 2.0426982835445, + "grad_norm": 1.3149287713913282, + "learning_rate": 1.0213887166769995e-07, + "loss": 1.0551, + "step": 26360 + }, + { + "epoch": 2.043473207020807, + "grad_norm": 1.4342916479854684, + "learning_rate": 1.0217761934283943e-07, + "loss": 1.0468, + "step": 26370 + }, + { + "epoch": 2.0442481304971136, + "grad_norm": 1.367003106124302, + "learning_rate": 1.0221636701797892e-07, + "loss": 1.0615, + "step": 26380 + }, + { + "epoch": 2.04502305397342, + "grad_norm": 1.376228321335437, + "learning_rate": 1.0225511469311842e-07, + "loss": 1.0349, + "step": 26390 + }, + { + "epoch": 2.045797977449727, + "grad_norm": 1.3561889422907416, + "learning_rate": 1.022938623682579e-07, + "loss": 1.0443, + "step": 26400 + }, + { + "epoch": 2.0465729009260336, + "grad_norm": 1.3463232367975504, + "learning_rate": 1.023326100433974e-07, + "loss": 1.0587, + "step": 26410 + }, + { + "epoch": 2.0473478244023404, + "grad_norm": 1.3852616522809698, + "learning_rate": 1.0237135771853689e-07, + "loss": 1.0536, + "step": 26420 + }, + { + "epoch": 2.048122747878647, + "grad_norm": 1.3149318161341463, + "learning_rate": 1.0241010539367639e-07, + "loss": 1.0523, + "step": 26430 + }, + { + "epoch": 2.0488976713549536, + "grad_norm": 1.3820031363087095, + "learning_rate": 1.0244885306881587e-07, + "loss": 1.0692, + "step": 26440 + }, + { + "epoch": 2.0496725948312604, + "grad_norm": 1.3899414420037246, + "learning_rate": 1.0248760074395537e-07, + "loss": 1.0655, + "step": 26450 + }, + { + "epoch": 2.050447518307567, + "grad_norm": 1.2955896097437334, + "learning_rate": 1.0252634841909486e-07, + "loss": 1.0646, + "step": 26460 + }, + { + "epoch": 2.051222441783874, + "grad_norm": 1.359457100394692, + "learning_rate": 1.0256509609423434e-07, + "loss": 1.0618, + "step": 26470 + }, + { + "epoch": 2.0519973652601804, + "grad_norm": 1.3960466835468215, + "learning_rate": 1.0260384376937384e-07, + "loss": 1.0592, + "step": 26480 + }, + { + "epoch": 2.052772288736487, + "grad_norm": 1.396815773991179, + "learning_rate": 1.0264259144451334e-07, + "loss": 1.0507, + "step": 26490 + }, + { + "epoch": 2.053547212212794, + "grad_norm": 1.4194068675271005, + "learning_rate": 1.0268133911965282e-07, + "loss": 1.0749, + "step": 26500 + }, + { + "epoch": 2.053547212212794, + "eval_loss": 1.0534113645553589, + "eval_runtime": 317.3201, + "eval_samples_per_second": 36.15, + "eval_steps_per_second": 9.038, + "step": 26500 + }, + { + "epoch": 2.054322135689101, + "grad_norm": 1.3325792483749428, + "learning_rate": 1.0272008679479231e-07, + "loss": 1.0373, + "step": 26510 + }, + { + "epoch": 2.0550970591654076, + "grad_norm": 1.257421209077693, + "learning_rate": 1.0275883446993181e-07, + "loss": 1.0464, + "step": 26520 + }, + { + "epoch": 2.055871982641714, + "grad_norm": 1.2885438554194764, + "learning_rate": 1.027975821450713e-07, + "loss": 1.0568, + "step": 26530 + }, + { + "epoch": 2.056646906118021, + "grad_norm": 1.336231661709952, + "learning_rate": 1.0283632982021079e-07, + "loss": 1.0545, + "step": 26540 + }, + { + "epoch": 2.0574218295943276, + "grad_norm": 1.2897288693504636, + "learning_rate": 1.0287507749535028e-07, + "loss": 1.034, + "step": 26550 + }, + { + "epoch": 2.0581967530706344, + "grad_norm": 1.3428578480418802, + "learning_rate": 1.0291382517048978e-07, + "loss": 1.0594, + "step": 26560 + }, + { + "epoch": 2.058971676546941, + "grad_norm": 1.423755124069085, + "learning_rate": 1.0295257284562926e-07, + "loss": 1.0809, + "step": 26570 + }, + { + "epoch": 2.0597466000232476, + "grad_norm": 1.271988251855651, + "learning_rate": 1.0299132052076876e-07, + "loss": 1.0405, + "step": 26580 + }, + { + "epoch": 2.0605215234995544, + "grad_norm": 1.3438213920181197, + "learning_rate": 1.0303006819590825e-07, + "loss": 1.0608, + "step": 26590 + }, + { + "epoch": 2.0612964469758612, + "grad_norm": 1.398945154512971, + "learning_rate": 1.0306881587104775e-07, + "loss": 1.0675, + "step": 26600 + }, + { + "epoch": 2.062071370452168, + "grad_norm": 1.3114956619590414, + "learning_rate": 1.0310756354618723e-07, + "loss": 1.0473, + "step": 26610 + }, + { + "epoch": 2.0628462939284744, + "grad_norm": 1.2928737119153269, + "learning_rate": 1.0314631122132672e-07, + "loss": 1.033, + "step": 26620 + }, + { + "epoch": 2.063621217404781, + "grad_norm": 1.3631380939231446, + "learning_rate": 1.0318505889646622e-07, + "loss": 1.0854, + "step": 26630 + }, + { + "epoch": 2.064396140881088, + "grad_norm": 1.2766856173813736, + "learning_rate": 1.032238065716057e-07, + "loss": 1.0413, + "step": 26640 + }, + { + "epoch": 2.065171064357395, + "grad_norm": 1.3806165605718514, + "learning_rate": 1.032625542467452e-07, + "loss": 1.0722, + "step": 26650 + }, + { + "epoch": 2.065945987833701, + "grad_norm": 1.3281416320130452, + "learning_rate": 1.0330130192188469e-07, + "loss": 1.0642, + "step": 26660 + }, + { + "epoch": 2.066720911310008, + "grad_norm": 1.3312971011076178, + "learning_rate": 1.0334004959702417e-07, + "loss": 1.0612, + "step": 26670 + }, + { + "epoch": 2.067495834786315, + "grad_norm": 1.3567268922244209, + "learning_rate": 1.0337879727216367e-07, + "loss": 1.0597, + "step": 26680 + }, + { + "epoch": 2.0682707582626216, + "grad_norm": 1.3391762000548455, + "learning_rate": 1.0341754494730317e-07, + "loss": 1.0428, + "step": 26690 + }, + { + "epoch": 2.0690456817389284, + "grad_norm": 1.3653382819692041, + "learning_rate": 1.0345629262244266e-07, + "loss": 1.0395, + "step": 26700 + }, + { + "epoch": 2.069820605215235, + "grad_norm": 1.3910219708448373, + "learning_rate": 1.0349504029758214e-07, + "loss": 1.0534, + "step": 26710 + }, + { + "epoch": 2.0705955286915416, + "grad_norm": 1.40913352744336, + "learning_rate": 1.0353378797272164e-07, + "loss": 1.0528, + "step": 26720 + }, + { + "epoch": 2.0713704521678484, + "grad_norm": 1.3750469658109679, + "learning_rate": 1.0357253564786113e-07, + "loss": 1.0592, + "step": 26730 + }, + { + "epoch": 2.0721453756441552, + "grad_norm": 1.3390795202720138, + "learning_rate": 1.0361128332300062e-07, + "loss": 1.0655, + "step": 26740 + }, + { + "epoch": 2.072920299120462, + "grad_norm": 1.325296440228338, + "learning_rate": 1.0365003099814011e-07, + "loss": 1.0515, + "step": 26750 + }, + { + "epoch": 2.0736952225967684, + "grad_norm": 1.2766580150246971, + "learning_rate": 1.0368877867327961e-07, + "loss": 1.0617, + "step": 26760 + }, + { + "epoch": 2.0744701460730752, + "grad_norm": 1.2558498680397308, + "learning_rate": 1.037275263484191e-07, + "loss": 1.0385, + "step": 26770 + }, + { + "epoch": 2.075245069549382, + "grad_norm": 1.302656326684559, + "learning_rate": 1.0376627402355859e-07, + "loss": 1.0726, + "step": 26780 + }, + { + "epoch": 2.076019993025689, + "grad_norm": 1.6017244190688849, + "learning_rate": 1.0380502169869808e-07, + "loss": 1.075, + "step": 26790 + }, + { + "epoch": 2.076794916501995, + "grad_norm": 1.288697462845318, + "learning_rate": 1.0384376937383758e-07, + "loss": 1.0348, + "step": 26800 + }, + { + "epoch": 2.077569839978302, + "grad_norm": 1.2979486503004654, + "learning_rate": 1.0388251704897706e-07, + "loss": 1.03, + "step": 26810 + }, + { + "epoch": 2.078344763454609, + "grad_norm": 1.271474415611516, + "learning_rate": 1.0392126472411655e-07, + "loss": 1.0433, + "step": 26820 + }, + { + "epoch": 2.0791196869309156, + "grad_norm": 1.3933995510691444, + "learning_rate": 1.0396001239925605e-07, + "loss": 1.0522, + "step": 26830 + }, + { + "epoch": 2.0798946104072225, + "grad_norm": 1.3108922486391181, + "learning_rate": 1.0399876007439555e-07, + "loss": 1.0507, + "step": 26840 + }, + { + "epoch": 2.080669533883529, + "grad_norm": 1.3781660017948154, + "learning_rate": 1.0403750774953503e-07, + "loss": 1.0361, + "step": 26850 + }, + { + "epoch": 2.0814444573598356, + "grad_norm": 1.342816326365809, + "learning_rate": 1.0407625542467452e-07, + "loss": 1.0405, + "step": 26860 + }, + { + "epoch": 2.0822193808361424, + "grad_norm": 1.359381263099218, + "learning_rate": 1.0411500309981402e-07, + "loss": 1.0694, + "step": 26870 + }, + { + "epoch": 2.0829943043124493, + "grad_norm": 1.3783154963295137, + "learning_rate": 1.041537507749535e-07, + "loss": 1.0536, + "step": 26880 + }, + { + "epoch": 2.083769227788756, + "grad_norm": 1.3344382159523747, + "learning_rate": 1.04192498450093e-07, + "loss": 1.0205, + "step": 26890 + }, + { + "epoch": 2.0845441512650624, + "grad_norm": 1.524547970907624, + "learning_rate": 1.0423124612523249e-07, + "loss": 1.0369, + "step": 26900 + }, + { + "epoch": 2.0853190747413692, + "grad_norm": 1.3741681977929103, + "learning_rate": 1.0426999380037197e-07, + "loss": 1.0607, + "step": 26910 + }, + { + "epoch": 2.086093998217676, + "grad_norm": 1.2468079379233814, + "learning_rate": 1.0430874147551147e-07, + "loss": 1.0443, + "step": 26920 + }, + { + "epoch": 2.086868921693983, + "grad_norm": 1.2651660337857287, + "learning_rate": 1.0434748915065097e-07, + "loss": 1.0288, + "step": 26930 + }, + { + "epoch": 2.0876438451702892, + "grad_norm": 1.3544319654228105, + "learning_rate": 1.0438623682579046e-07, + "loss": 1.0347, + "step": 26940 + }, + { + "epoch": 2.088418768646596, + "grad_norm": 1.3026557544853576, + "learning_rate": 1.0442498450092994e-07, + "loss": 1.1015, + "step": 26950 + }, + { + "epoch": 2.089193692122903, + "grad_norm": 1.3082417188798359, + "learning_rate": 1.0446373217606944e-07, + "loss": 1.0396, + "step": 26960 + }, + { + "epoch": 2.0899686155992097, + "grad_norm": 1.2724629101474607, + "learning_rate": 1.0450247985120893e-07, + "loss": 1.0533, + "step": 26970 + }, + { + "epoch": 2.0907435390755165, + "grad_norm": 1.3729821579617891, + "learning_rate": 1.0454122752634842e-07, + "loss": 1.06, + "step": 26980 + }, + { + "epoch": 2.091518462551823, + "grad_norm": 1.4420458675464118, + "learning_rate": 1.0457997520148791e-07, + "loss": 1.0519, + "step": 26990 + }, + { + "epoch": 2.0922933860281296, + "grad_norm": 1.2706100447902033, + "learning_rate": 1.0461872287662741e-07, + "loss": 1.0307, + "step": 27000 + }, + { + "epoch": 2.0922933860281296, + "eval_loss": 1.0506844520568848, + "eval_runtime": 315.1894, + "eval_samples_per_second": 36.394, + "eval_steps_per_second": 9.099, + "step": 27000 + }, + { + "epoch": 2.0930683095044365, + "grad_norm": 1.816427715213832, + "learning_rate": 1.046574705517669e-07, + "loss": 1.0389, + "step": 27010 + }, + { + "epoch": 2.0938432329807433, + "grad_norm": 1.2567544620613196, + "learning_rate": 1.0469621822690639e-07, + "loss": 1.0519, + "step": 27020 + }, + { + "epoch": 2.09461815645705, + "grad_norm": 1.3042295863000075, + "learning_rate": 1.0473496590204588e-07, + "loss": 1.0534, + "step": 27030 + }, + { + "epoch": 2.0953930799333564, + "grad_norm": 1.4386734522921498, + "learning_rate": 1.0477371357718538e-07, + "loss": 1.0664, + "step": 27040 + }, + { + "epoch": 2.0961680034096632, + "grad_norm": 1.5066041906723153, + "learning_rate": 1.0481246125232486e-07, + "loss": 1.0671, + "step": 27050 + }, + { + "epoch": 2.09694292688597, + "grad_norm": 1.3017831479843651, + "learning_rate": 1.0485120892746435e-07, + "loss": 1.0412, + "step": 27060 + }, + { + "epoch": 2.097717850362277, + "grad_norm": 1.2972844547959643, + "learning_rate": 1.0488995660260385e-07, + "loss": 1.0529, + "step": 27070 + }, + { + "epoch": 2.0984927738385832, + "grad_norm": 1.314303093870134, + "learning_rate": 1.0492870427774333e-07, + "loss": 1.0672, + "step": 27080 + }, + { + "epoch": 2.09926769731489, + "grad_norm": 1.3938610078511, + "learning_rate": 1.0496745195288283e-07, + "loss": 1.0723, + "step": 27090 + }, + { + "epoch": 2.100042620791197, + "grad_norm": 1.234521773708635, + "learning_rate": 1.0500619962802232e-07, + "loss": 1.0443, + "step": 27100 + }, + { + "epoch": 2.1008175442675037, + "grad_norm": 1.3030356294582786, + "learning_rate": 1.0504494730316182e-07, + "loss": 1.0557, + "step": 27110 + }, + { + "epoch": 2.1015924677438105, + "grad_norm": 1.34856169719472, + "learning_rate": 1.050836949783013e-07, + "loss": 1.0562, + "step": 27120 + }, + { + "epoch": 2.102367391220117, + "grad_norm": 1.2707529410339564, + "learning_rate": 1.051224426534408e-07, + "loss": 1.0511, + "step": 27130 + }, + { + "epoch": 2.1031423146964237, + "grad_norm": 1.3046260240472662, + "learning_rate": 1.0516119032858029e-07, + "loss": 1.0647, + "step": 27140 + }, + { + "epoch": 2.1039172381727305, + "grad_norm": 1.3115761688385197, + "learning_rate": 1.0519993800371977e-07, + "loss": 1.0423, + "step": 27150 + }, + { + "epoch": 2.1046921616490373, + "grad_norm": 1.3398050527916114, + "learning_rate": 1.0523868567885927e-07, + "loss": 1.0666, + "step": 27160 + }, + { + "epoch": 2.105467085125344, + "grad_norm": 1.3095127902030914, + "learning_rate": 1.0527743335399877e-07, + "loss": 1.0532, + "step": 27170 + }, + { + "epoch": 2.1062420086016505, + "grad_norm": 1.3511364062979452, + "learning_rate": 1.0531618102913826e-07, + "loss": 1.0193, + "step": 27180 + }, + { + "epoch": 2.1070169320779573, + "grad_norm": 1.3167114732694287, + "learning_rate": 1.0535492870427774e-07, + "loss": 1.0504, + "step": 27190 + }, + { + "epoch": 2.107791855554264, + "grad_norm": 1.3114500138369438, + "learning_rate": 1.0539367637941724e-07, + "loss": 1.0288, + "step": 27200 + }, + { + "epoch": 2.108566779030571, + "grad_norm": 1.341885148561864, + "learning_rate": 1.0543242405455673e-07, + "loss": 1.0296, + "step": 27210 + }, + { + "epoch": 2.1093417025068772, + "grad_norm": 1.2421538438275888, + "learning_rate": 1.0547117172969622e-07, + "loss": 1.0388, + "step": 27220 + }, + { + "epoch": 2.110116625983184, + "grad_norm": 1.354125216827104, + "learning_rate": 1.0550991940483571e-07, + "loss": 1.044, + "step": 27230 + }, + { + "epoch": 2.110891549459491, + "grad_norm": 1.3674792348621354, + "learning_rate": 1.0554866707997521e-07, + "loss": 1.0459, + "step": 27240 + }, + { + "epoch": 2.1116664729357977, + "grad_norm": 1.3169700201596366, + "learning_rate": 1.0558741475511469e-07, + "loss": 1.0579, + "step": 27250 + }, + { + "epoch": 2.1124413964121045, + "grad_norm": 1.2877088081253871, + "learning_rate": 1.0562616243025418e-07, + "loss": 1.0265, + "step": 27260 + }, + { + "epoch": 2.113216319888411, + "grad_norm": 1.3055543406381667, + "learning_rate": 1.0566491010539368e-07, + "loss": 1.0676, + "step": 27270 + }, + { + "epoch": 2.1139912433647177, + "grad_norm": 1.2702308154892121, + "learning_rate": 1.0570365778053318e-07, + "loss": 1.0537, + "step": 27280 + }, + { + "epoch": 2.1147661668410245, + "grad_norm": 1.2747500382168186, + "learning_rate": 1.0574240545567266e-07, + "loss": 1.052, + "step": 27290 + }, + { + "epoch": 2.1155410903173313, + "grad_norm": 1.294950837590732, + "learning_rate": 1.0578115313081215e-07, + "loss": 1.0498, + "step": 27300 + }, + { + "epoch": 2.1163160137936377, + "grad_norm": 1.2970196327315717, + "learning_rate": 1.0581990080595165e-07, + "loss": 1.0353, + "step": 27310 + }, + { + "epoch": 2.1170909372699445, + "grad_norm": 1.3687240090061878, + "learning_rate": 1.0585864848109113e-07, + "loss": 1.0547, + "step": 27320 + }, + { + "epoch": 2.1178658607462513, + "grad_norm": 1.377473092637541, + "learning_rate": 1.0589739615623063e-07, + "loss": 1.0643, + "step": 27330 + }, + { + "epoch": 2.118640784222558, + "grad_norm": 1.1963320056450961, + "learning_rate": 1.0593614383137012e-07, + "loss": 1.0271, + "step": 27340 + }, + { + "epoch": 2.119415707698865, + "grad_norm": 1.3987142970038204, + "learning_rate": 1.0597489150650962e-07, + "loss": 1.0632, + "step": 27350 + }, + { + "epoch": 2.1201906311751713, + "grad_norm": 1.4885615527355294, + "learning_rate": 1.060136391816491e-07, + "loss": 1.063, + "step": 27360 + }, + { + "epoch": 2.120965554651478, + "grad_norm": 1.3370880253200532, + "learning_rate": 1.060523868567886e-07, + "loss": 1.0446, + "step": 27370 + }, + { + "epoch": 2.121740478127785, + "grad_norm": 1.32216820835169, + "learning_rate": 1.0609113453192809e-07, + "loss": 1.0292, + "step": 27380 + }, + { + "epoch": 2.1225154016040917, + "grad_norm": 1.2192655438730542, + "learning_rate": 1.0612988220706757e-07, + "loss": 1.0392, + "step": 27390 + }, + { + "epoch": 2.1232903250803985, + "grad_norm": 1.4873679793917998, + "learning_rate": 1.0616862988220707e-07, + "loss": 1.0631, + "step": 27400 + }, + { + "epoch": 2.124065248556705, + "grad_norm": 1.3073398108546328, + "learning_rate": 1.0620737755734656e-07, + "loss": 1.0499, + "step": 27410 + }, + { + "epoch": 2.1248401720330117, + "grad_norm": 1.3465913511213552, + "learning_rate": 1.0624612523248605e-07, + "loss": 1.047, + "step": 27420 + }, + { + "epoch": 2.1256150955093185, + "grad_norm": 1.2952170758768726, + "learning_rate": 1.0628487290762554e-07, + "loss": 1.0524, + "step": 27430 + }, + { + "epoch": 2.1263900189856253, + "grad_norm": 1.4296730204223123, + "learning_rate": 1.0632362058276504e-07, + "loss": 1.0533, + "step": 27440 + }, + { + "epoch": 2.127164942461932, + "grad_norm": 1.2764992530512032, + "learning_rate": 1.0636236825790453e-07, + "loss": 1.0439, + "step": 27450 + }, + { + "epoch": 2.1279398659382385, + "grad_norm": 1.2265029074210028, + "learning_rate": 1.0640111593304402e-07, + "loss": 1.0492, + "step": 27460 + }, + { + "epoch": 2.1287147894145453, + "grad_norm": 1.2860717039222331, + "learning_rate": 1.0643986360818351e-07, + "loss": 1.0427, + "step": 27470 + }, + { + "epoch": 2.129489712890852, + "grad_norm": 1.2907521608971826, + "learning_rate": 1.0647861128332301e-07, + "loss": 1.0673, + "step": 27480 + }, + { + "epoch": 2.130264636367159, + "grad_norm": 1.330257754520052, + "learning_rate": 1.0651735895846249e-07, + "loss": 1.0342, + "step": 27490 + }, + { + "epoch": 2.1310395598434653, + "grad_norm": 1.3421906671180028, + "learning_rate": 1.0655610663360198e-07, + "loss": 1.0475, + "step": 27500 + }, + { + "epoch": 2.1310395598434653, + "eval_loss": 1.048052191734314, + "eval_runtime": 321.06, + "eval_samples_per_second": 35.729, + "eval_steps_per_second": 8.933, + "step": 27500 + }, + { + "epoch": 2.131814483319772, + "grad_norm": 2.5151582168485556, + "learning_rate": 1.0659485430874148e-07, + "loss": 1.0435, + "step": 27510 + }, + { + "epoch": 2.132589406796079, + "grad_norm": 1.4730286181111119, + "learning_rate": 1.0663360198388098e-07, + "loss": 1.0595, + "step": 27520 + }, + { + "epoch": 2.1333643302723857, + "grad_norm": 1.2978061264580283, + "learning_rate": 1.0667234965902046e-07, + "loss": 1.0427, + "step": 27530 + }, + { + "epoch": 2.1341392537486925, + "grad_norm": 1.3210371108179346, + "learning_rate": 1.0671109733415995e-07, + "loss": 1.0411, + "step": 27540 + }, + { + "epoch": 2.134914177224999, + "grad_norm": 1.2742350796978719, + "learning_rate": 1.0674984500929945e-07, + "loss": 1.041, + "step": 27550 + }, + { + "epoch": 2.1356891007013057, + "grad_norm": 1.3480168669461097, + "learning_rate": 1.0678859268443893e-07, + "loss": 1.0519, + "step": 27560 + }, + { + "epoch": 2.1364640241776125, + "grad_norm": 1.3634668852666751, + "learning_rate": 1.0682734035957843e-07, + "loss": 1.0417, + "step": 27570 + }, + { + "epoch": 2.1372389476539193, + "grad_norm": 1.3030570269635813, + "learning_rate": 1.0686608803471792e-07, + "loss": 1.0173, + "step": 27580 + }, + { + "epoch": 2.1380138711302257, + "grad_norm": 1.3158458279323721, + "learning_rate": 1.069048357098574e-07, + "loss": 1.0563, + "step": 27590 + }, + { + "epoch": 2.1387887946065325, + "grad_norm": 1.3856567086357083, + "learning_rate": 1.069435833849969e-07, + "loss": 1.0532, + "step": 27600 + }, + { + "epoch": 2.1395637180828393, + "grad_norm": 1.383971057082803, + "learning_rate": 1.069823310601364e-07, + "loss": 1.0466, + "step": 27610 + }, + { + "epoch": 2.140338641559146, + "grad_norm": 1.363825192995587, + "learning_rate": 1.0702107873527589e-07, + "loss": 1.0378, + "step": 27620 + }, + { + "epoch": 2.141113565035453, + "grad_norm": 1.2326949797655893, + "learning_rate": 1.0705982641041537e-07, + "loss": 1.0298, + "step": 27630 + }, + { + "epoch": 2.1418884885117593, + "grad_norm": 1.4240734683681195, + "learning_rate": 1.0709857408555487e-07, + "loss": 1.0549, + "step": 27640 + }, + { + "epoch": 2.142663411988066, + "grad_norm": 1.3286755880443875, + "learning_rate": 1.0713732176069436e-07, + "loss": 1.0353, + "step": 27650 + }, + { + "epoch": 2.143438335464373, + "grad_norm": 1.2706548730190128, + "learning_rate": 1.0717606943583385e-07, + "loss": 1.0302, + "step": 27660 + }, + { + "epoch": 2.1442132589406797, + "grad_norm": 1.3454258702117585, + "learning_rate": 1.0721481711097334e-07, + "loss": 1.0499, + "step": 27670 + }, + { + "epoch": 2.1449881824169865, + "grad_norm": 1.3416544393847063, + "learning_rate": 1.0725356478611284e-07, + "loss": 1.0238, + "step": 27680 + }, + { + "epoch": 2.145763105893293, + "grad_norm": 1.3387730869388976, + "learning_rate": 1.0729231246125233e-07, + "loss": 1.0632, + "step": 27690 + }, + { + "epoch": 2.1465380293695997, + "grad_norm": 1.2477415698855299, + "learning_rate": 1.0733106013639182e-07, + "loss": 1.0609, + "step": 27700 + }, + { + "epoch": 2.1473129528459065, + "grad_norm": 1.2956544373909844, + "learning_rate": 1.0736980781153131e-07, + "loss": 1.0472, + "step": 27710 + }, + { + "epoch": 2.1480878763222133, + "grad_norm": 1.2952082168400036, + "learning_rate": 1.074085554866708e-07, + "loss": 1.0434, + "step": 27720 + }, + { + "epoch": 2.14886279979852, + "grad_norm": 1.312016795183587, + "learning_rate": 1.0744730316181029e-07, + "loss": 1.0408, + "step": 27730 + }, + { + "epoch": 2.1496377232748265, + "grad_norm": 1.4336983202086164, + "learning_rate": 1.0748605083694978e-07, + "loss": 1.0446, + "step": 27740 + }, + { + "epoch": 2.1504126467511333, + "grad_norm": 1.2729866720997716, + "learning_rate": 1.0752479851208928e-07, + "loss": 1.0498, + "step": 27750 + }, + { + "epoch": 2.15118757022744, + "grad_norm": 1.3277090012562986, + "learning_rate": 1.0756354618722876e-07, + "loss": 1.0406, + "step": 27760 + }, + { + "epoch": 2.151962493703747, + "grad_norm": 1.3164250577889356, + "learning_rate": 1.0760229386236826e-07, + "loss": 1.0462, + "step": 27770 + }, + { + "epoch": 2.1527374171800533, + "grad_norm": 1.2200889016594092, + "learning_rate": 1.0764104153750775e-07, + "loss": 1.0476, + "step": 27780 + }, + { + "epoch": 2.15351234065636, + "grad_norm": 1.3478991351201666, + "learning_rate": 1.0767978921264725e-07, + "loss": 1.027, + "step": 27790 + }, + { + "epoch": 2.154287264132667, + "grad_norm": 1.3504535195147704, + "learning_rate": 1.0771853688778673e-07, + "loss": 1.043, + "step": 27800 + }, + { + "epoch": 2.1550621876089737, + "grad_norm": 1.3700113593684253, + "learning_rate": 1.0775728456292623e-07, + "loss": 1.0711, + "step": 27810 + }, + { + "epoch": 2.1558371110852805, + "grad_norm": 1.4087902250029851, + "learning_rate": 1.0779603223806572e-07, + "loss": 1.0744, + "step": 27820 + }, + { + "epoch": 2.156612034561587, + "grad_norm": 1.3330980886365476, + "learning_rate": 1.078347799132052e-07, + "loss": 1.041, + "step": 27830 + }, + { + "epoch": 2.1573869580378937, + "grad_norm": 1.3727310396819903, + "learning_rate": 1.078735275883447e-07, + "loss": 1.0304, + "step": 27840 + }, + { + "epoch": 2.1581618815142005, + "grad_norm": 1.2696747485936577, + "learning_rate": 1.079122752634842e-07, + "loss": 1.0471, + "step": 27850 + }, + { + "epoch": 2.1589368049905073, + "grad_norm": 1.2675165864689082, + "learning_rate": 1.0795102293862369e-07, + "loss": 1.061, + "step": 27860 + }, + { + "epoch": 2.1597117284668137, + "grad_norm": 1.3093140687640397, + "learning_rate": 1.0798977061376317e-07, + "loss": 1.0524, + "step": 27870 + }, + { + "epoch": 2.1604866519431205, + "grad_norm": 1.309750957337762, + "learning_rate": 1.0802851828890267e-07, + "loss": 1.0632, + "step": 27880 + }, + { + "epoch": 2.1612615754194273, + "grad_norm": 1.278958355527116, + "learning_rate": 1.0806726596404216e-07, + "loss": 1.0473, + "step": 27890 + }, + { + "epoch": 2.162036498895734, + "grad_norm": 1.4375615101944348, + "learning_rate": 1.0810601363918165e-07, + "loss": 1.0516, + "step": 27900 + }, + { + "epoch": 2.162811422372041, + "grad_norm": 1.351474359006033, + "learning_rate": 1.0814476131432114e-07, + "loss": 1.0347, + "step": 27910 + }, + { + "epoch": 2.1635863458483473, + "grad_norm": 1.4496606028773888, + "learning_rate": 1.0818350898946064e-07, + "loss": 1.0507, + "step": 27920 + }, + { + "epoch": 2.164361269324654, + "grad_norm": 1.3421262483015108, + "learning_rate": 1.0822225666460013e-07, + "loss": 1.059, + "step": 27930 + }, + { + "epoch": 2.165136192800961, + "grad_norm": 1.8642293595740325, + "learning_rate": 1.0826100433973961e-07, + "loss": 1.0368, + "step": 27940 + }, + { + "epoch": 2.1659111162772677, + "grad_norm": 1.3001675447154677, + "learning_rate": 1.0829975201487911e-07, + "loss": 1.0555, + "step": 27950 + }, + { + "epoch": 2.166686039753574, + "grad_norm": 1.3350611453807306, + "learning_rate": 1.083384996900186e-07, + "loss": 1.0466, + "step": 27960 + }, + { + "epoch": 2.167460963229881, + "grad_norm": 1.2956176364722016, + "learning_rate": 1.0837724736515809e-07, + "loss": 1.0559, + "step": 27970 + }, + { + "epoch": 2.1682358867061877, + "grad_norm": 1.2764501760519076, + "learning_rate": 1.0841599504029758e-07, + "loss": 1.0328, + "step": 27980 + }, + { + "epoch": 2.1690108101824945, + "grad_norm": 1.2798383221370928, + "learning_rate": 1.0845474271543708e-07, + "loss": 1.046, + "step": 27990 + }, + { + "epoch": 2.1697857336588013, + "grad_norm": 1.3637338480388492, + "learning_rate": 1.0849349039057656e-07, + "loss": 1.0386, + "step": 28000 + }, + { + "epoch": 2.1697857336588013, + "eval_loss": 1.0455366373062134, + "eval_runtime": 320.276, + "eval_samples_per_second": 35.816, + "eval_steps_per_second": 8.955, + "step": 28000 + }, + { + "epoch": 2.1705606571351077, + "grad_norm": 1.29318766458794, + "learning_rate": 1.0853223806571606e-07, + "loss": 1.0671, + "step": 28010 + }, + { + "epoch": 2.1713355806114145, + "grad_norm": 1.4353161958240075, + "learning_rate": 1.0857098574085555e-07, + "loss": 1.041, + "step": 28020 + }, + { + "epoch": 2.1721105040877213, + "grad_norm": 1.3683577224892198, + "learning_rate": 1.0860973341599505e-07, + "loss": 1.0466, + "step": 28030 + }, + { + "epoch": 2.172885427564028, + "grad_norm": 1.4026803419184688, + "learning_rate": 1.0864848109113453e-07, + "loss": 1.0298, + "step": 28040 + }, + { + "epoch": 2.173660351040335, + "grad_norm": 1.3227778603970766, + "learning_rate": 1.0868722876627403e-07, + "loss": 1.0371, + "step": 28050 + }, + { + "epoch": 2.1744352745166413, + "grad_norm": 1.3017858160755709, + "learning_rate": 1.0872597644141352e-07, + "loss": 1.0404, + "step": 28060 + }, + { + "epoch": 2.175210197992948, + "grad_norm": 1.3106657190264699, + "learning_rate": 1.08764724116553e-07, + "loss": 1.0362, + "step": 28070 + }, + { + "epoch": 2.175985121469255, + "grad_norm": 1.3542781177268612, + "learning_rate": 1.088034717916925e-07, + "loss": 1.0798, + "step": 28080 + }, + { + "epoch": 2.1767600449455617, + "grad_norm": 1.3875691482723278, + "learning_rate": 1.08842219466832e-07, + "loss": 1.0351, + "step": 28090 + }, + { + "epoch": 2.1775349684218686, + "grad_norm": 1.4358569745567564, + "learning_rate": 1.0888096714197149e-07, + "loss": 1.0525, + "step": 28100 + }, + { + "epoch": 2.178309891898175, + "grad_norm": 1.3768407443588606, + "learning_rate": 1.0891971481711097e-07, + "loss": 1.0598, + "step": 28110 + }, + { + "epoch": 2.1790848153744817, + "grad_norm": 1.4145915695236702, + "learning_rate": 1.0895846249225047e-07, + "loss": 1.0384, + "step": 28120 + }, + { + "epoch": 2.1798597388507885, + "grad_norm": 1.425046546074581, + "learning_rate": 1.0899721016738996e-07, + "loss": 1.0505, + "step": 28130 + }, + { + "epoch": 2.1806346623270954, + "grad_norm": 1.2741950822067931, + "learning_rate": 1.0903595784252945e-07, + "loss": 1.0308, + "step": 28140 + }, + { + "epoch": 2.1814095858034017, + "grad_norm": 1.4422773440551129, + "learning_rate": 1.0907470551766894e-07, + "loss": 1.0602, + "step": 28150 + }, + { + "epoch": 2.1821845092797085, + "grad_norm": 1.3858509492888749, + "learning_rate": 1.0911345319280844e-07, + "loss": 1.0341, + "step": 28160 + }, + { + "epoch": 2.1829594327560153, + "grad_norm": 1.3689771477252295, + "learning_rate": 1.0915220086794792e-07, + "loss": 1.0517, + "step": 28170 + }, + { + "epoch": 2.183734356232322, + "grad_norm": 1.3607526665570466, + "learning_rate": 1.0919094854308741e-07, + "loss": 1.0451, + "step": 28180 + }, + { + "epoch": 2.184509279708629, + "grad_norm": 1.3124134296456478, + "learning_rate": 1.0922969621822691e-07, + "loss": 1.0564, + "step": 28190 + }, + { + "epoch": 2.1852842031849353, + "grad_norm": 1.292705142449864, + "learning_rate": 1.092684438933664e-07, + "loss": 1.0327, + "step": 28200 + }, + { + "epoch": 2.186059126661242, + "grad_norm": 1.3443850596202078, + "learning_rate": 1.0930719156850589e-07, + "loss": 1.0363, + "step": 28210 + }, + { + "epoch": 2.186834050137549, + "grad_norm": 1.2434800737004992, + "learning_rate": 1.0934593924364538e-07, + "loss": 1.0302, + "step": 28220 + }, + { + "epoch": 2.1876089736138558, + "grad_norm": 1.3151010771705594, + "learning_rate": 1.0938468691878488e-07, + "loss": 1.0482, + "step": 28230 + }, + { + "epoch": 2.188383897090162, + "grad_norm": 1.3044074729160586, + "learning_rate": 1.0942343459392436e-07, + "loss": 1.0455, + "step": 28240 + }, + { + "epoch": 2.189158820566469, + "grad_norm": 1.3501991892831018, + "learning_rate": 1.0946218226906386e-07, + "loss": 1.0479, + "step": 28250 + }, + { + "epoch": 2.1899337440427757, + "grad_norm": 1.2713877476776219, + "learning_rate": 1.0950092994420335e-07, + "loss": 1.0686, + "step": 28260 + }, + { + "epoch": 2.1907086675190826, + "grad_norm": 1.2777286038683293, + "learning_rate": 1.0953967761934285e-07, + "loss": 1.0408, + "step": 28270 + }, + { + "epoch": 2.1914835909953894, + "grad_norm": 1.2943918675483097, + "learning_rate": 1.0957842529448233e-07, + "loss": 1.0413, + "step": 28280 + }, + { + "epoch": 2.1922585144716957, + "grad_norm": 1.5006012608566919, + "learning_rate": 1.0961717296962183e-07, + "loss": 1.0591, + "step": 28290 + }, + { + "epoch": 2.1930334379480025, + "grad_norm": 1.4349611717988795, + "learning_rate": 1.0965592064476132e-07, + "loss": 1.0571, + "step": 28300 + }, + { + "epoch": 2.1938083614243093, + "grad_norm": 1.3245991975627052, + "learning_rate": 1.096946683199008e-07, + "loss": 1.058, + "step": 28310 + }, + { + "epoch": 2.194583284900616, + "grad_norm": 1.3081163169745846, + "learning_rate": 1.097334159950403e-07, + "loss": 1.0421, + "step": 28320 + }, + { + "epoch": 2.195358208376923, + "grad_norm": 1.3944367323747973, + "learning_rate": 1.097721636701798e-07, + "loss": 1.0511, + "step": 28330 + }, + { + "epoch": 2.1961331318532293, + "grad_norm": 1.236848497865013, + "learning_rate": 1.0981091134531928e-07, + "loss": 1.0332, + "step": 28340 + }, + { + "epoch": 2.196908055329536, + "grad_norm": 1.344658198954809, + "learning_rate": 1.0984965902045877e-07, + "loss": 1.0717, + "step": 28350 + }, + { + "epoch": 2.197682978805843, + "grad_norm": 1.4533339247151198, + "learning_rate": 1.0988840669559827e-07, + "loss": 1.041, + "step": 28360 + }, + { + "epoch": 2.1984579022821498, + "grad_norm": 1.207320960634886, + "learning_rate": 1.0992715437073776e-07, + "loss": 1.0519, + "step": 28370 + }, + { + "epoch": 2.1992328257584566, + "grad_norm": 1.2764650574944836, + "learning_rate": 1.0996590204587725e-07, + "loss": 1.0383, + "step": 28380 + }, + { + "epoch": 2.200007749234763, + "grad_norm": 1.328398141439983, + "learning_rate": 1.1000464972101674e-07, + "loss": 1.0313, + "step": 28390 + }, + { + "epoch": 2.2007826727110698, + "grad_norm": 1.3360771915393375, + "learning_rate": 1.1004339739615624e-07, + "loss": 1.0706, + "step": 28400 + }, + { + "epoch": 2.2015575961873766, + "grad_norm": 1.2504053917676, + "learning_rate": 1.1008214507129572e-07, + "loss": 1.0389, + "step": 28410 + }, + { + "epoch": 2.2023325196636834, + "grad_norm": 1.2632669741409308, + "learning_rate": 1.1012089274643521e-07, + "loss": 1.03, + "step": 28420 + }, + { + "epoch": 2.2031074431399897, + "grad_norm": 1.321005653112248, + "learning_rate": 1.1015964042157471e-07, + "loss": 1.0424, + "step": 28430 + }, + { + "epoch": 2.2038823666162966, + "grad_norm": 1.3094591900366825, + "learning_rate": 1.101983880967142e-07, + "loss": 1.0121, + "step": 28440 + }, + { + "epoch": 2.2046572900926034, + "grad_norm": 1.3706094977974093, + "learning_rate": 1.1023713577185369e-07, + "loss": 1.0489, + "step": 28450 + }, + { + "epoch": 2.20543221356891, + "grad_norm": 1.3860429548637199, + "learning_rate": 1.1027588344699318e-07, + "loss": 1.0364, + "step": 28460 + }, + { + "epoch": 2.206207137045217, + "grad_norm": 1.290898872108468, + "learning_rate": 1.1031463112213268e-07, + "loss": 1.0393, + "step": 28470 + }, + { + "epoch": 2.2069820605215233, + "grad_norm": 1.313397441207088, + "learning_rate": 1.1035337879727216e-07, + "loss": 1.0511, + "step": 28480 + }, + { + "epoch": 2.20775698399783, + "grad_norm": 1.32647159772131, + "learning_rate": 1.1039212647241166e-07, + "loss": 1.046, + "step": 28490 + }, + { + "epoch": 2.208531907474137, + "grad_norm": 1.2399203129941458, + "learning_rate": 1.1043087414755115e-07, + "loss": 1.0228, + "step": 28500 + }, + { + "epoch": 2.208531907474137, + "eval_loss": 1.0430920124053955, + "eval_runtime": 317.9672, + "eval_samples_per_second": 36.076, + "eval_steps_per_second": 9.02, + "step": 28500 + }, + { + "epoch": 2.209306830950444, + "grad_norm": 1.3063490919854452, + "learning_rate": 1.1046962182269063e-07, + "loss": 1.0339, + "step": 28510 + }, + { + "epoch": 2.21008175442675, + "grad_norm": 1.3431509923764224, + "learning_rate": 1.1050836949783013e-07, + "loss": 1.0507, + "step": 28520 + }, + { + "epoch": 2.210856677903057, + "grad_norm": 1.2676409835429054, + "learning_rate": 1.1054711717296963e-07, + "loss": 1.0389, + "step": 28530 + }, + { + "epoch": 2.2116316013793638, + "grad_norm": 1.3721626737129209, + "learning_rate": 1.1058586484810912e-07, + "loss": 1.0556, + "step": 28540 + }, + { + "epoch": 2.2124065248556706, + "grad_norm": 1.4033784699591225, + "learning_rate": 1.106246125232486e-07, + "loss": 1.0633, + "step": 28550 + }, + { + "epoch": 2.2131814483319774, + "grad_norm": 1.3411428201626168, + "learning_rate": 1.106633601983881e-07, + "loss": 1.0551, + "step": 28560 + }, + { + "epoch": 2.2139563718082838, + "grad_norm": 1.4004314615373643, + "learning_rate": 1.107021078735276e-07, + "loss": 1.0417, + "step": 28570 + }, + { + "epoch": 2.2147312952845906, + "grad_norm": 1.283457843317271, + "learning_rate": 1.1074085554866708e-07, + "loss": 1.0473, + "step": 28580 + }, + { + "epoch": 2.2155062187608974, + "grad_norm": 1.3763541704498559, + "learning_rate": 1.1077960322380657e-07, + "loss": 1.0269, + "step": 28590 + }, + { + "epoch": 2.216281142237204, + "grad_norm": 1.300071016744356, + "learning_rate": 1.1081835089894607e-07, + "loss": 1.027, + "step": 28600 + }, + { + "epoch": 2.217056065713511, + "grad_norm": 1.2881047179489353, + "learning_rate": 1.1085709857408556e-07, + "loss": 1.0292, + "step": 28610 + }, + { + "epoch": 2.2178309891898174, + "grad_norm": 1.3793116655420277, + "learning_rate": 1.1089584624922504e-07, + "loss": 1.0234, + "step": 28620 + }, + { + "epoch": 2.218605912666124, + "grad_norm": 1.3580137258631921, + "learning_rate": 1.1093459392436454e-07, + "loss": 1.0297, + "step": 28630 + }, + { + "epoch": 2.219380836142431, + "grad_norm": 1.4113748063723623, + "learning_rate": 1.1097334159950404e-07, + "loss": 1.0356, + "step": 28640 + }, + { + "epoch": 2.220155759618738, + "grad_norm": 1.397298692629597, + "learning_rate": 1.1101208927464352e-07, + "loss": 1.0433, + "step": 28650 + }, + { + "epoch": 2.2209306830950446, + "grad_norm": 1.275990639832436, + "learning_rate": 1.1105083694978301e-07, + "loss": 1.0337, + "step": 28660 + }, + { + "epoch": 2.221705606571351, + "grad_norm": 1.3917510365085295, + "learning_rate": 1.1108958462492251e-07, + "loss": 1.0202, + "step": 28670 + }, + { + "epoch": 2.2224805300476578, + "grad_norm": 1.3233377013963452, + "learning_rate": 1.1112833230006199e-07, + "loss": 1.0365, + "step": 28680 + }, + { + "epoch": 2.2232554535239646, + "grad_norm": 1.2964692615475646, + "learning_rate": 1.1116707997520149e-07, + "loss": 1.0346, + "step": 28690 + }, + { + "epoch": 2.2240303770002714, + "grad_norm": 1.421963930415503, + "learning_rate": 1.1120582765034098e-07, + "loss": 1.0457, + "step": 28700 + }, + { + "epoch": 2.2248053004765778, + "grad_norm": 1.3068082133464274, + "learning_rate": 1.1124457532548048e-07, + "loss": 1.0272, + "step": 28710 + }, + { + "epoch": 2.2255802239528846, + "grad_norm": 1.310503390898841, + "learning_rate": 1.1128332300061996e-07, + "loss": 1.0469, + "step": 28720 + }, + { + "epoch": 2.2263551474291914, + "grad_norm": 1.3450510523197694, + "learning_rate": 1.1132207067575946e-07, + "loss": 1.0556, + "step": 28730 + }, + { + "epoch": 2.227130070905498, + "grad_norm": 1.393804838354062, + "learning_rate": 1.1136081835089895e-07, + "loss": 1.0507, + "step": 28740 + }, + { + "epoch": 2.227904994381805, + "grad_norm": 1.3125747105124133, + "learning_rate": 1.1139956602603843e-07, + "loss": 1.0278, + "step": 28750 + }, + { + "epoch": 2.2286799178581114, + "grad_norm": 1.3139041711232335, + "learning_rate": 1.1143831370117793e-07, + "loss": 1.0469, + "step": 28760 + }, + { + "epoch": 2.229454841334418, + "grad_norm": 1.293978085557227, + "learning_rate": 1.1147706137631742e-07, + "loss": 1.0449, + "step": 28770 + }, + { + "epoch": 2.230229764810725, + "grad_norm": 1.4303053353674606, + "learning_rate": 1.1151580905145692e-07, + "loss": 1.0461, + "step": 28780 + }, + { + "epoch": 2.231004688287032, + "grad_norm": 1.3065202993886096, + "learning_rate": 1.115545567265964e-07, + "loss": 1.0719, + "step": 28790 + }, + { + "epoch": 2.231779611763338, + "grad_norm": 1.4200994203088741, + "learning_rate": 1.115933044017359e-07, + "loss": 1.0366, + "step": 28800 + }, + { + "epoch": 2.232554535239645, + "grad_norm": 1.3577951036077693, + "learning_rate": 1.116320520768754e-07, + "loss": 1.0443, + "step": 28810 + }, + { + "epoch": 2.233329458715952, + "grad_norm": 1.7027258824774005, + "learning_rate": 1.1167079975201488e-07, + "loss": 1.0645, + "step": 28820 + }, + { + "epoch": 2.2341043821922586, + "grad_norm": 1.4245767276432408, + "learning_rate": 1.1170954742715437e-07, + "loss": 1.0454, + "step": 28830 + }, + { + "epoch": 2.2348793056685654, + "grad_norm": 1.2661266404601532, + "learning_rate": 1.1174829510229387e-07, + "loss": 1.054, + "step": 28840 + }, + { + "epoch": 2.2356542291448718, + "grad_norm": 1.4005917413046594, + "learning_rate": 1.1178704277743336e-07, + "loss": 1.0318, + "step": 28850 + }, + { + "epoch": 2.2364291526211786, + "grad_norm": 1.7424564808804057, + "learning_rate": 1.1182579045257284e-07, + "loss": 1.0527, + "step": 28860 + }, + { + "epoch": 2.2372040760974854, + "grad_norm": 1.2880294475843206, + "learning_rate": 1.1186453812771234e-07, + "loss": 1.0356, + "step": 28870 + }, + { + "epoch": 2.237978999573792, + "grad_norm": 1.3366506722435196, + "learning_rate": 1.1190328580285184e-07, + "loss": 1.0589, + "step": 28880 + }, + { + "epoch": 2.2387539230500986, + "grad_norm": 1.339922747828946, + "learning_rate": 1.1194203347799132e-07, + "loss": 1.0422, + "step": 28890 + }, + { + "epoch": 2.2395288465264054, + "grad_norm": 1.3797985680365743, + "learning_rate": 1.1198078115313081e-07, + "loss": 1.0494, + "step": 28900 + }, + { + "epoch": 2.240303770002712, + "grad_norm": 1.3126229211632272, + "learning_rate": 1.1201952882827031e-07, + "loss": 1.0311, + "step": 28910 + }, + { + "epoch": 2.241078693479019, + "grad_norm": 1.2866030569853972, + "learning_rate": 1.1205827650340979e-07, + "loss": 1.044, + "step": 28920 + }, + { + "epoch": 2.241853616955326, + "grad_norm": 1.2741903451626748, + "learning_rate": 1.1209702417854929e-07, + "loss": 1.034, + "step": 28930 + }, + { + "epoch": 2.242628540431632, + "grad_norm": 1.4043222081958995, + "learning_rate": 1.1213577185368878e-07, + "loss": 1.0587, + "step": 28940 + }, + { + "epoch": 2.243403463907939, + "grad_norm": 1.3718592620660697, + "learning_rate": 1.1217451952882828e-07, + "loss": 1.0454, + "step": 28950 + }, + { + "epoch": 2.244178387384246, + "grad_norm": 1.2877572231211991, + "learning_rate": 1.1221326720396776e-07, + "loss": 1.0468, + "step": 28960 + }, + { + "epoch": 2.2449533108605526, + "grad_norm": 1.3313033974210469, + "learning_rate": 1.1225201487910726e-07, + "loss": 1.032, + "step": 28970 + }, + { + "epoch": 2.2457282343368594, + "grad_norm": 1.2859866228270427, + "learning_rate": 1.1229076255424675e-07, + "loss": 1.023, + "step": 28980 + }, + { + "epoch": 2.246503157813166, + "grad_norm": 1.3253893938403072, + "learning_rate": 1.1232951022938623e-07, + "loss": 1.0408, + "step": 28990 + }, + { + "epoch": 2.2472780812894726, + "grad_norm": 1.2587611792777451, + "learning_rate": 1.1236825790452573e-07, + "loss": 1.0454, + "step": 29000 + }, + { + "epoch": 2.2472780812894726, + "eval_loss": 1.0407191514968872, + "eval_runtime": 320.0487, + "eval_samples_per_second": 35.841, + "eval_steps_per_second": 8.961, + "step": 29000 + }, + { + "epoch": 2.2480530047657794, + "grad_norm": 1.353942025088661, + "learning_rate": 1.1240700557966522e-07, + "loss": 1.0366, + "step": 29010 + }, + { + "epoch": 2.248827928242086, + "grad_norm": 1.4254323086350775, + "learning_rate": 1.1244575325480472e-07, + "loss": 1.018, + "step": 29020 + }, + { + "epoch": 2.249602851718393, + "grad_norm": 1.415634893831238, + "learning_rate": 1.124845009299442e-07, + "loss": 1.0399, + "step": 29030 + }, + { + "epoch": 2.2503777751946994, + "grad_norm": 1.2893884980771588, + "learning_rate": 1.125232486050837e-07, + "loss": 1.0249, + "step": 29040 + }, + { + "epoch": 2.251152698671006, + "grad_norm": 1.4373512394343337, + "learning_rate": 1.1256199628022319e-07, + "loss": 1.0815, + "step": 29050 + }, + { + "epoch": 2.251927622147313, + "grad_norm": 1.32315253562018, + "learning_rate": 1.1260074395536268e-07, + "loss": 1.0375, + "step": 29060 + }, + { + "epoch": 2.25270254562362, + "grad_norm": 1.3075122591397594, + "learning_rate": 1.1263949163050217e-07, + "loss": 1.0457, + "step": 29070 + }, + { + "epoch": 2.253477469099926, + "grad_norm": 1.4183251491005504, + "learning_rate": 1.1267823930564167e-07, + "loss": 1.0391, + "step": 29080 + }, + { + "epoch": 2.254252392576233, + "grad_norm": 1.3432245954294766, + "learning_rate": 1.1271698698078115e-07, + "loss": 1.0467, + "step": 29090 + }, + { + "epoch": 2.25502731605254, + "grad_norm": 1.3982839756692567, + "learning_rate": 1.1275573465592064e-07, + "loss": 1.0533, + "step": 29100 + }, + { + "epoch": 2.2558022395288466, + "grad_norm": 1.454848540713707, + "learning_rate": 1.1279448233106014e-07, + "loss": 1.0366, + "step": 29110 + }, + { + "epoch": 2.2565771630051534, + "grad_norm": 1.3215609988162849, + "learning_rate": 1.1283323000619964e-07, + "loss": 1.0839, + "step": 29120 + }, + { + "epoch": 2.25735208648146, + "grad_norm": 1.2793412595392406, + "learning_rate": 1.1287197768133912e-07, + "loss": 1.0232, + "step": 29130 + }, + { + "epoch": 2.2581270099577666, + "grad_norm": 1.3514902525983121, + "learning_rate": 1.1291072535647861e-07, + "loss": 1.0558, + "step": 29140 + }, + { + "epoch": 2.2589019334340734, + "grad_norm": 1.415021754511848, + "learning_rate": 1.1294947303161811e-07, + "loss": 1.0714, + "step": 29150 + }, + { + "epoch": 2.2596768569103802, + "grad_norm": 1.2546830334453711, + "learning_rate": 1.1298822070675759e-07, + "loss": 1.0798, + "step": 29160 + }, + { + "epoch": 2.2604517803866866, + "grad_norm": 1.3258292959132325, + "learning_rate": 1.1302696838189709e-07, + "loss": 1.062, + "step": 29170 + }, + { + "epoch": 2.2612267038629934, + "grad_norm": 1.3617659599933614, + "learning_rate": 1.1306571605703658e-07, + "loss": 1.0446, + "step": 29180 + }, + { + "epoch": 2.2620016273393, + "grad_norm": 1.2745725769323437, + "learning_rate": 1.1310446373217608e-07, + "loss": 1.0242, + "step": 29190 + }, + { + "epoch": 2.262776550815607, + "grad_norm": 1.3954681271586118, + "learning_rate": 1.1314321140731556e-07, + "loss": 1.0191, + "step": 29200 + }, + { + "epoch": 2.263551474291914, + "grad_norm": 1.3894249221384576, + "learning_rate": 1.1318195908245506e-07, + "loss": 1.0308, + "step": 29210 + }, + { + "epoch": 2.26432639776822, + "grad_norm": 1.305718241598138, + "learning_rate": 1.1322070675759455e-07, + "loss": 1.0379, + "step": 29220 + }, + { + "epoch": 2.265101321244527, + "grad_norm": 1.337391170773476, + "learning_rate": 1.1325945443273403e-07, + "loss": 1.0363, + "step": 29230 + }, + { + "epoch": 2.265876244720834, + "grad_norm": 1.2486634085993817, + "learning_rate": 1.1329820210787353e-07, + "loss": 1.0309, + "step": 29240 + }, + { + "epoch": 2.2666511681971406, + "grad_norm": 1.3560295367560065, + "learning_rate": 1.1333694978301302e-07, + "loss": 1.043, + "step": 29250 + }, + { + "epoch": 2.2674260916734474, + "grad_norm": 1.3351057850409787, + "learning_rate": 1.133756974581525e-07, + "loss": 1.0293, + "step": 29260 + }, + { + "epoch": 2.268201015149754, + "grad_norm": 1.2793798777370455, + "learning_rate": 1.13414445133292e-07, + "loss": 1.0345, + "step": 29270 + }, + { + "epoch": 2.2689759386260606, + "grad_norm": 1.2728070884632143, + "learning_rate": 1.134531928084315e-07, + "loss": 1.0466, + "step": 29280 + }, + { + "epoch": 2.2697508621023674, + "grad_norm": 1.2790213672309687, + "learning_rate": 1.1349194048357099e-07, + "loss": 1.05, + "step": 29290 + }, + { + "epoch": 2.2705257855786742, + "grad_norm": 1.4513697897859206, + "learning_rate": 1.1353068815871047e-07, + "loss": 1.0234, + "step": 29300 + }, + { + "epoch": 2.271300709054981, + "grad_norm": 1.3442048235450121, + "learning_rate": 1.1356943583384997e-07, + "loss": 1.0133, + "step": 29310 + }, + { + "epoch": 2.2720756325312874, + "grad_norm": 1.3723934485793534, + "learning_rate": 1.1360818350898947e-07, + "loss": 1.0378, + "step": 29320 + }, + { + "epoch": 2.2728505560075942, + "grad_norm": 1.3754546311305336, + "learning_rate": 1.1364693118412895e-07, + "loss": 1.0363, + "step": 29330 + }, + { + "epoch": 2.273625479483901, + "grad_norm": 1.3203921146009172, + "learning_rate": 1.1368567885926844e-07, + "loss": 1.0438, + "step": 29340 + }, + { + "epoch": 2.274400402960208, + "grad_norm": 1.4221102274888944, + "learning_rate": 1.1372442653440794e-07, + "loss": 1.0502, + "step": 29350 + }, + { + "epoch": 2.275175326436514, + "grad_norm": 1.3022329786205742, + "learning_rate": 1.1376317420954743e-07, + "loss": 1.043, + "step": 29360 + }, + { + "epoch": 2.275950249912821, + "grad_norm": 1.3174376275780264, + "learning_rate": 1.1380192188468692e-07, + "loss": 1.0148, + "step": 29370 + }, + { + "epoch": 2.276725173389128, + "grad_norm": 1.379646892472586, + "learning_rate": 1.1384066955982641e-07, + "loss": 1.0474, + "step": 29380 + }, + { + "epoch": 2.2775000968654346, + "grad_norm": 1.3071545848329067, + "learning_rate": 1.1387941723496591e-07, + "loss": 1.0422, + "step": 29390 + }, + { + "epoch": 2.2782750203417415, + "grad_norm": 1.4197725939218508, + "learning_rate": 1.1391816491010539e-07, + "loss": 1.0627, + "step": 29400 + }, + { + "epoch": 2.279049943818048, + "grad_norm": 1.3655730124244028, + "learning_rate": 1.1395691258524489e-07, + "loss": 1.047, + "step": 29410 + }, + { + "epoch": 2.2798248672943546, + "grad_norm": 1.2271567717542797, + "learning_rate": 1.1399566026038438e-07, + "loss": 1.0256, + "step": 29420 + }, + { + "epoch": 2.2805997907706614, + "grad_norm": 1.3053235930999876, + "learning_rate": 1.1403440793552386e-07, + "loss": 1.0577, + "step": 29430 + }, + { + "epoch": 2.2813747142469682, + "grad_norm": 1.371389871505357, + "learning_rate": 1.1407315561066336e-07, + "loss": 1.054, + "step": 29440 + }, + { + "epoch": 2.2821496377232746, + "grad_norm": 1.2753694413358463, + "learning_rate": 1.1411190328580285e-07, + "loss": 1.0372, + "step": 29450 + }, + { + "epoch": 2.2829245611995814, + "grad_norm": 1.3889982946266544, + "learning_rate": 1.1415065096094235e-07, + "loss": 1.0477, + "step": 29460 + }, + { + "epoch": 2.2836994846758882, + "grad_norm": 1.2828144210800416, + "learning_rate": 1.1418939863608183e-07, + "loss": 1.0535, + "step": 29470 + }, + { + "epoch": 2.284474408152195, + "grad_norm": 1.2908950550879699, + "learning_rate": 1.1422814631122133e-07, + "loss": 1.0419, + "step": 29480 + }, + { + "epoch": 2.285249331628502, + "grad_norm": 1.3533941540605037, + "learning_rate": 1.1426689398636082e-07, + "loss": 1.0482, + "step": 29490 + }, + { + "epoch": 2.2860242551048082, + "grad_norm": 1.3924278834352644, + "learning_rate": 1.143056416615003e-07, + "loss": 1.0381, + "step": 29500 + }, + { + "epoch": 2.2860242551048082, + "eval_loss": 1.0383321046829224, + "eval_runtime": 321.2312, + "eval_samples_per_second": 35.709, + "eval_steps_per_second": 8.928, + "step": 29500 + }, + { + "epoch": 2.286799178581115, + "grad_norm": 1.2825280825361207, + "learning_rate": 1.143443893366398e-07, + "loss": 1.044, + "step": 29510 + }, + { + "epoch": 2.287574102057422, + "grad_norm": 1.3286014955044338, + "learning_rate": 1.143831370117793e-07, + "loss": 1.0334, + "step": 29520 + }, + { + "epoch": 2.2883490255337287, + "grad_norm": 1.3709191695036202, + "learning_rate": 1.1442188468691879e-07, + "loss": 1.0423, + "step": 29530 + }, + { + "epoch": 2.289123949010035, + "grad_norm": 1.2919069194563675, + "learning_rate": 1.1446063236205827e-07, + "loss": 1.022, + "step": 29540 + }, + { + "epoch": 2.289898872486342, + "grad_norm": 1.2908562125343133, + "learning_rate": 1.1449938003719777e-07, + "loss": 1.033, + "step": 29550 + }, + { + "epoch": 2.2906737959626486, + "grad_norm": 1.2609518296651474, + "learning_rate": 1.1453812771233727e-07, + "loss": 1.0505, + "step": 29560 + }, + { + "epoch": 2.2914487194389555, + "grad_norm": 1.3613426060800298, + "learning_rate": 1.1457687538747675e-07, + "loss": 1.0308, + "step": 29570 + }, + { + "epoch": 2.2922236429152623, + "grad_norm": 1.3358026908804284, + "learning_rate": 1.1461562306261624e-07, + "loss": 1.0465, + "step": 29580 + }, + { + "epoch": 2.292998566391569, + "grad_norm": 1.2871509605607012, + "learning_rate": 1.1465437073775574e-07, + "loss": 1.0374, + "step": 29590 + }, + { + "epoch": 2.2937734898678754, + "grad_norm": 1.3077120334814845, + "learning_rate": 1.1469311841289522e-07, + "loss": 1.0451, + "step": 29600 + }, + { + "epoch": 2.2945484133441822, + "grad_norm": 1.287262566363879, + "learning_rate": 1.1473186608803472e-07, + "loss": 1.0475, + "step": 29610 + }, + { + "epoch": 2.295323336820489, + "grad_norm": 1.3792486060635356, + "learning_rate": 1.1477061376317421e-07, + "loss": 1.0744, + "step": 29620 + }, + { + "epoch": 2.296098260296796, + "grad_norm": 1.3251670297324751, + "learning_rate": 1.1480936143831371e-07, + "loss": 1.0458, + "step": 29630 + }, + { + "epoch": 2.2968731837731022, + "grad_norm": 1.3890816479519514, + "learning_rate": 1.1484810911345319e-07, + "loss": 1.0418, + "step": 29640 + }, + { + "epoch": 2.297648107249409, + "grad_norm": 1.296456205717341, + "learning_rate": 1.1488685678859269e-07, + "loss": 1.0476, + "step": 29650 + }, + { + "epoch": 2.298423030725716, + "grad_norm": 1.3436160163259774, + "learning_rate": 1.1492560446373218e-07, + "loss": 1.0424, + "step": 29660 + }, + { + "epoch": 2.2991979542020227, + "grad_norm": 1.439991460822752, + "learning_rate": 1.1496435213887166e-07, + "loss": 1.0293, + "step": 29670 + }, + { + "epoch": 2.2999728776783295, + "grad_norm": 1.3207745950197742, + "learning_rate": 1.1500309981401116e-07, + "loss": 1.0317, + "step": 29680 + }, + { + "epoch": 2.300747801154636, + "grad_norm": 1.3581094169557946, + "learning_rate": 1.1504184748915065e-07, + "loss": 1.0535, + "step": 29690 + }, + { + "epoch": 2.3015227246309427, + "grad_norm": 1.329160397805074, + "learning_rate": 1.1508059516429015e-07, + "loss": 1.0319, + "step": 29700 + }, + { + "epoch": 2.3022976481072495, + "grad_norm": 1.4179471309228653, + "learning_rate": 1.1511934283942963e-07, + "loss": 1.0234, + "step": 29710 + }, + { + "epoch": 2.3030725715835563, + "grad_norm": 1.3207713300918351, + "learning_rate": 1.1515809051456913e-07, + "loss": 1.0163, + "step": 29720 + }, + { + "epoch": 2.3038474950598626, + "grad_norm": 1.329956959327521, + "learning_rate": 1.1519683818970862e-07, + "loss": 1.0526, + "step": 29730 + }, + { + "epoch": 2.3046224185361694, + "grad_norm": 1.3611236496150831, + "learning_rate": 1.152355858648481e-07, + "loss": 1.0388, + "step": 29740 + }, + { + "epoch": 2.3053973420124763, + "grad_norm": 1.3401675663423114, + "learning_rate": 1.152743335399876e-07, + "loss": 1.016, + "step": 29750 + }, + { + "epoch": 2.306172265488783, + "grad_norm": 1.3055866951854864, + "learning_rate": 1.153130812151271e-07, + "loss": 1.0397, + "step": 29760 + }, + { + "epoch": 2.30694718896509, + "grad_norm": 1.3040408498367269, + "learning_rate": 1.1535182889026659e-07, + "loss": 1.0421, + "step": 29770 + }, + { + "epoch": 2.3077221124413962, + "grad_norm": 1.4140766477813171, + "learning_rate": 1.1539057656540607e-07, + "loss": 1.0677, + "step": 29780 + }, + { + "epoch": 2.308497035917703, + "grad_norm": 1.2410972393836135, + "learning_rate": 1.1542932424054557e-07, + "loss": 1.0358, + "step": 29790 + }, + { + "epoch": 2.30927195939401, + "grad_norm": 1.3966206291466436, + "learning_rate": 1.1546807191568507e-07, + "loss": 1.0254, + "step": 29800 + }, + { + "epoch": 2.3100468828703167, + "grad_norm": 1.2986001826480298, + "learning_rate": 1.1550681959082455e-07, + "loss": 1.0428, + "step": 29810 + }, + { + "epoch": 2.310821806346623, + "grad_norm": 1.2869846548137822, + "learning_rate": 1.1554556726596404e-07, + "loss": 1.0638, + "step": 29820 + }, + { + "epoch": 2.31159672982293, + "grad_norm": 1.2891845809813018, + "learning_rate": 1.1558431494110354e-07, + "loss": 1.0325, + "step": 29830 + }, + { + "epoch": 2.3123716532992367, + "grad_norm": 1.4116913031191651, + "learning_rate": 1.1562306261624302e-07, + "loss": 1.026, + "step": 29840 + }, + { + "epoch": 2.3131465767755435, + "grad_norm": 1.2853505033810007, + "learning_rate": 1.1566181029138252e-07, + "loss": 1.0048, + "step": 29850 + }, + { + "epoch": 2.3139215002518503, + "grad_norm": 1.2882202531005655, + "learning_rate": 1.1570055796652201e-07, + "loss": 1.0162, + "step": 29860 + }, + { + "epoch": 2.314696423728157, + "grad_norm": 1.3878762137290144, + "learning_rate": 1.1573930564166151e-07, + "loss": 1.0314, + "step": 29870 + }, + { + "epoch": 2.3154713472044635, + "grad_norm": 2.7739687604217877, + "learning_rate": 1.1577805331680099e-07, + "loss": 1.0394, + "step": 29880 + }, + { + "epoch": 2.3162462706807703, + "grad_norm": 1.3282736976520424, + "learning_rate": 1.1581680099194048e-07, + "loss": 1.024, + "step": 29890 + }, + { + "epoch": 2.317021194157077, + "grad_norm": 1.3459600280734028, + "learning_rate": 1.1585554866707998e-07, + "loss": 1.0316, + "step": 29900 + }, + { + "epoch": 2.317796117633384, + "grad_norm": 1.3459246716497306, + "learning_rate": 1.1589429634221946e-07, + "loss": 1.0223, + "step": 29910 + }, + { + "epoch": 2.3185710411096903, + "grad_norm": 1.2697433289496947, + "learning_rate": 1.1593304401735896e-07, + "loss": 1.0329, + "step": 29920 + }, + { + "epoch": 2.319345964585997, + "grad_norm": 1.2714214824562595, + "learning_rate": 1.1597179169249845e-07, + "loss": 1.025, + "step": 29930 + }, + { + "epoch": 2.320120888062304, + "grad_norm": 1.2940864032818067, + "learning_rate": 1.1601053936763795e-07, + "loss": 1.0413, + "step": 29940 + }, + { + "epoch": 2.3208958115386107, + "grad_norm": 1.290131779563987, + "learning_rate": 1.1604928704277743e-07, + "loss": 1.0506, + "step": 29950 + }, + { + "epoch": 2.3216707350149175, + "grad_norm": 1.4271751745143593, + "learning_rate": 1.1608803471791693e-07, + "loss": 1.0082, + "step": 29960 + }, + { + "epoch": 2.322445658491224, + "grad_norm": 1.2670323998787143, + "learning_rate": 1.1612678239305642e-07, + "loss": 1.0367, + "step": 29970 + }, + { + "epoch": 2.3232205819675307, + "grad_norm": 1.315511893035948, + "learning_rate": 1.161655300681959e-07, + "loss": 1.0369, + "step": 29980 + }, + { + "epoch": 2.3239955054438375, + "grad_norm": 1.2737901542387626, + "learning_rate": 1.162042777433354e-07, + "loss": 1.0325, + "step": 29990 + }, + { + "epoch": 2.3247704289201443, + "grad_norm": 1.2672153854642207, + "learning_rate": 1.162430254184749e-07, + "loss": 1.0417, + "step": 30000 + }, + { + "epoch": 2.3247704289201443, + "eval_loss": 1.0361366271972656, + "eval_runtime": 318.4234, + "eval_samples_per_second": 36.024, + "eval_steps_per_second": 9.007, + "step": 30000 + }, + { + "epoch": 2.3255453523964507, + "grad_norm": 1.275006228790495, + "learning_rate": 1.1628177309361438e-07, + "loss": 1.0342, + "step": 30010 + }, + { + "epoch": 2.3263202758727575, + "grad_norm": 1.3646431193394915, + "learning_rate": 1.1632052076875387e-07, + "loss": 1.0424, + "step": 30020 + }, + { + "epoch": 2.3270951993490643, + "grad_norm": 1.405165650314217, + "learning_rate": 1.1635926844389337e-07, + "loss": 1.0424, + "step": 30030 + }, + { + "epoch": 2.327870122825371, + "grad_norm": 1.3834046971493157, + "learning_rate": 1.1639801611903286e-07, + "loss": 1.0406, + "step": 30040 + }, + { + "epoch": 2.328645046301678, + "grad_norm": 1.2973892880999836, + "learning_rate": 1.1643676379417235e-07, + "loss": 1.0333, + "step": 30050 + }, + { + "epoch": 2.3294199697779843, + "grad_norm": 1.3643331014645808, + "learning_rate": 1.1647551146931184e-07, + "loss": 1.0373, + "step": 30060 + }, + { + "epoch": 2.330194893254291, + "grad_norm": 1.3045607763545068, + "learning_rate": 1.1651425914445134e-07, + "loss": 1.0644, + "step": 30070 + }, + { + "epoch": 2.330969816730598, + "grad_norm": 1.322284159586191, + "learning_rate": 1.1655300681959082e-07, + "loss": 1.0422, + "step": 30080 + }, + { + "epoch": 2.3317447402069047, + "grad_norm": 1.3767893367632715, + "learning_rate": 1.1659175449473032e-07, + "loss": 1.0384, + "step": 30090 + }, + { + "epoch": 2.332519663683211, + "grad_norm": 1.3327355025902867, + "learning_rate": 1.1663050216986981e-07, + "loss": 1.0221, + "step": 30100 + }, + { + "epoch": 2.333294587159518, + "grad_norm": 1.3153754589437212, + "learning_rate": 1.1666924984500931e-07, + "loss": 1.0394, + "step": 30110 + }, + { + "epoch": 2.3340695106358247, + "grad_norm": 1.290907823006538, + "learning_rate": 1.1670799752014879e-07, + "loss": 1.0451, + "step": 30120 + }, + { + "epoch": 2.3348444341121315, + "grad_norm": 1.937916151167588, + "learning_rate": 1.1674674519528828e-07, + "loss": 1.0729, + "step": 30130 + }, + { + "epoch": 2.3356193575884383, + "grad_norm": 1.377907612069236, + "learning_rate": 1.1678549287042778e-07, + "loss": 1.0295, + "step": 30140 + }, + { + "epoch": 2.3363942810647447, + "grad_norm": 1.2139270842779837, + "learning_rate": 1.1682424054556726e-07, + "loss": 1.0042, + "step": 30150 + }, + { + "epoch": 2.3371692045410515, + "grad_norm": 1.3119542700684435, + "learning_rate": 1.1686298822070676e-07, + "loss": 1.0202, + "step": 30160 + }, + { + "epoch": 2.3379441280173583, + "grad_norm": 1.248411940271664, + "learning_rate": 1.1690173589584625e-07, + "loss": 1.0018, + "step": 30170 + }, + { + "epoch": 2.338719051493665, + "grad_norm": 1.3945295999147844, + "learning_rate": 1.1694048357098574e-07, + "loss": 1.0344, + "step": 30180 + }, + { + "epoch": 2.3394939749699715, + "grad_norm": 1.3208244357063954, + "learning_rate": 1.1697923124612523e-07, + "loss": 1.0474, + "step": 30190 + }, + { + "epoch": 2.3402688984462783, + "grad_norm": 1.2740169288430763, + "learning_rate": 1.1701797892126473e-07, + "loss": 1.0117, + "step": 30200 + }, + { + "epoch": 2.341043821922585, + "grad_norm": 1.3118750791522376, + "learning_rate": 1.1705672659640422e-07, + "loss": 1.0497, + "step": 30210 + }, + { + "epoch": 2.341818745398892, + "grad_norm": 1.2912790603938915, + "learning_rate": 1.170954742715437e-07, + "loss": 1.0299, + "step": 30220 + }, + { + "epoch": 2.3425936688751987, + "grad_norm": 1.2777204502133914, + "learning_rate": 1.171342219466832e-07, + "loss": 1.0308, + "step": 30230 + }, + { + "epoch": 2.3433685923515055, + "grad_norm": 1.3327648646046635, + "learning_rate": 1.171729696218227e-07, + "loss": 1.0436, + "step": 30240 + }, + { + "epoch": 2.344143515827812, + "grad_norm": 1.3757575676867304, + "learning_rate": 1.172117172969622e-07, + "loss": 1.0367, + "step": 30250 + }, + { + "epoch": 2.3449184393041187, + "grad_norm": 1.2621533000306213, + "learning_rate": 1.1725046497210169e-07, + "loss": 1.0271, + "step": 30260 + }, + { + "epoch": 2.3456933627804255, + "grad_norm": 1.3743176335340712, + "learning_rate": 1.1728921264724118e-07, + "loss": 1.057, + "step": 30270 + }, + { + "epoch": 2.3464682862567323, + "grad_norm": 1.376080040424906, + "learning_rate": 1.1732796032238068e-07, + "loss": 1.0362, + "step": 30280 + }, + { + "epoch": 2.3472432097330387, + "grad_norm": 1.2700332276416346, + "learning_rate": 1.1736670799752017e-07, + "loss": 1.0336, + "step": 30290 + }, + { + "epoch": 2.3480181332093455, + "grad_norm": 1.3747265182833974, + "learning_rate": 1.1740545567265966e-07, + "loss": 1.0183, + "step": 30300 + }, + { + "epoch": 2.3487930566856523, + "grad_norm": 1.3591211675011758, + "learning_rate": 1.1744420334779915e-07, + "loss": 1.0432, + "step": 30310 + }, + { + "epoch": 2.349567980161959, + "grad_norm": 1.2778377528225, + "learning_rate": 1.1748295102293865e-07, + "loss": 1.0621, + "step": 30320 + }, + { + "epoch": 2.350342903638266, + "grad_norm": 1.349032878311865, + "learning_rate": 1.1752169869807813e-07, + "loss": 1.0548, + "step": 30330 + }, + { + "epoch": 2.3511178271145723, + "grad_norm": 1.2034101112670297, + "learning_rate": 1.1756044637321762e-07, + "loss": 1.0295, + "step": 30340 + }, + { + "epoch": 2.351892750590879, + "grad_norm": 1.3561081956878083, + "learning_rate": 1.1759919404835712e-07, + "loss": 1.0601, + "step": 30350 + }, + { + "epoch": 2.352667674067186, + "grad_norm": 1.297474247636885, + "learning_rate": 1.176379417234966e-07, + "loss": 1.036, + "step": 30360 + }, + { + "epoch": 2.3534425975434927, + "grad_norm": 1.2422002542055686, + "learning_rate": 1.176766893986361e-07, + "loss": 1.0314, + "step": 30370 + }, + { + "epoch": 2.354217521019799, + "grad_norm": 1.4322859162467505, + "learning_rate": 1.1771543707377559e-07, + "loss": 1.042, + "step": 30380 + }, + { + "epoch": 2.354992444496106, + "grad_norm": 1.4134632000126504, + "learning_rate": 1.1775418474891509e-07, + "loss": 1.0697, + "step": 30390 + }, + { + "epoch": 2.3557673679724127, + "grad_norm": 1.328482219841296, + "learning_rate": 1.1779293242405457e-07, + "loss": 1.0059, + "step": 30400 + }, + { + "epoch": 2.3565422914487195, + "grad_norm": 1.3634949448802904, + "learning_rate": 1.1783168009919407e-07, + "loss": 1.0299, + "step": 30410 + }, + { + "epoch": 2.3573172149250263, + "grad_norm": 1.2458355497306646, + "learning_rate": 1.1787042777433356e-07, + "loss": 1.0468, + "step": 30420 + }, + { + "epoch": 2.3580921384013327, + "grad_norm": 1.3257820924844586, + "learning_rate": 1.1790917544947304e-07, + "loss": 1.0191, + "step": 30430 + }, + { + "epoch": 2.3588670618776395, + "grad_norm": 1.3331167011088285, + "learning_rate": 1.1794792312461254e-07, + "loss": 1.0345, + "step": 30440 + }, + { + "epoch": 2.3596419853539463, + "grad_norm": 1.3186285979111996, + "learning_rate": 1.1798667079975203e-07, + "loss": 1.0127, + "step": 30450 + }, + { + "epoch": 2.360416908830253, + "grad_norm": 1.2278961788793694, + "learning_rate": 1.1802541847489153e-07, + "loss": 1.0028, + "step": 30460 + }, + { + "epoch": 2.3611918323065595, + "grad_norm": 1.3380187022634271, + "learning_rate": 1.1806416615003101e-07, + "loss": 1.0624, + "step": 30470 + }, + { + "epoch": 2.3619667557828663, + "grad_norm": 1.2126897497296112, + "learning_rate": 1.1810291382517051e-07, + "loss": 1.0202, + "step": 30480 + }, + { + "epoch": 2.362741679259173, + "grad_norm": 1.2059158660395237, + "learning_rate": 1.1814166150031e-07, + "loss": 1.0354, + "step": 30490 + }, + { + "epoch": 2.36351660273548, + "grad_norm": 1.2747598959459807, + "learning_rate": 1.1818040917544949e-07, + "loss": 1.0389, + "step": 30500 + }, + { + "epoch": 2.36351660273548, + "eval_loss": 1.0339261293411255, + "eval_runtime": 320.2962, + "eval_samples_per_second": 35.814, + "eval_steps_per_second": 8.954, + "step": 30500 + }, + { + "epoch": 2.3642915262117867, + "grad_norm": 1.3575305766017676, + "learning_rate": 1.1821915685058898e-07, + "loss": 1.0417, + "step": 30510 + }, + { + "epoch": 2.3650664496880935, + "grad_norm": 1.33663361549576, + "learning_rate": 1.1825790452572848e-07, + "loss": 1.0409, + "step": 30520 + }, + { + "epoch": 2.3658413731644, + "grad_norm": 1.2721188511480654, + "learning_rate": 1.1829665220086797e-07, + "loss": 1.0065, + "step": 30530 + }, + { + "epoch": 2.3666162966407067, + "grad_norm": 1.2278890194120735, + "learning_rate": 1.1833539987600745e-07, + "loss": 1.0314, + "step": 30540 + }, + { + "epoch": 2.3673912201170135, + "grad_norm": 1.4436452110743951, + "learning_rate": 1.1837414755114695e-07, + "loss": 1.0368, + "step": 30550 + }, + { + "epoch": 2.3681661435933203, + "grad_norm": 1.3436641067714232, + "learning_rate": 1.1841289522628645e-07, + "loss": 1.0259, + "step": 30560 + }, + { + "epoch": 2.3689410670696267, + "grad_norm": 1.3399767450312932, + "learning_rate": 1.1845164290142593e-07, + "loss": 1.042, + "step": 30570 + }, + { + "epoch": 2.3697159905459335, + "grad_norm": 1.3676799297012714, + "learning_rate": 1.1849039057656542e-07, + "loss": 1.0262, + "step": 30580 + }, + { + "epoch": 2.3704909140222403, + "grad_norm": 1.3561643918993542, + "learning_rate": 1.1852913825170492e-07, + "loss": 1.0443, + "step": 30590 + }, + { + "epoch": 2.371265837498547, + "grad_norm": 1.3272961126101177, + "learning_rate": 1.185678859268444e-07, + "loss": 1.0457, + "step": 30600 + }, + { + "epoch": 2.372040760974854, + "grad_norm": 1.3181519074246646, + "learning_rate": 1.186066336019839e-07, + "loss": 1.018, + "step": 30610 + }, + { + "epoch": 2.3728156844511603, + "grad_norm": 1.296328764104945, + "learning_rate": 1.1864538127712339e-07, + "loss": 1.0271, + "step": 30620 + }, + { + "epoch": 2.373590607927467, + "grad_norm": 1.2164109828440741, + "learning_rate": 1.1868412895226289e-07, + "loss": 1.0385, + "step": 30630 + }, + { + "epoch": 2.374365531403774, + "grad_norm": 1.2891277115600557, + "learning_rate": 1.1872287662740237e-07, + "loss": 1.0313, + "step": 30640 + }, + { + "epoch": 2.3751404548800807, + "grad_norm": 1.3692151413975473, + "learning_rate": 1.1876162430254187e-07, + "loss": 1.0208, + "step": 30650 + }, + { + "epoch": 2.375915378356387, + "grad_norm": 1.3035040047701703, + "learning_rate": 1.1880037197768136e-07, + "loss": 1.0349, + "step": 30660 + }, + { + "epoch": 2.376690301832694, + "grad_norm": 1.3737077758041198, + "learning_rate": 1.1883911965282084e-07, + "loss": 1.0568, + "step": 30670 + }, + { + "epoch": 2.3774652253090007, + "grad_norm": 1.3777737591139667, + "learning_rate": 1.1887786732796034e-07, + "loss": 1.034, + "step": 30680 + }, + { + "epoch": 2.3782401487853075, + "grad_norm": 1.3975206417353523, + "learning_rate": 1.1891661500309983e-07, + "loss": 1.0118, + "step": 30690 + }, + { + "epoch": 2.3790150722616143, + "grad_norm": 1.2978593502230347, + "learning_rate": 1.1895536267823933e-07, + "loss": 1.0243, + "step": 30700 + }, + { + "epoch": 2.3797899957379207, + "grad_norm": 1.337420188718056, + "learning_rate": 1.1899411035337881e-07, + "loss": 1.0385, + "step": 30710 + }, + { + "epoch": 2.3805649192142275, + "grad_norm": 1.332074910939702, + "learning_rate": 1.1903285802851831e-07, + "loss": 1.0534, + "step": 30720 + }, + { + "epoch": 2.3813398426905343, + "grad_norm": 1.398492585917639, + "learning_rate": 1.190716057036578e-07, + "loss": 1.0377, + "step": 30730 + }, + { + "epoch": 2.382114766166841, + "grad_norm": 1.9527203480648239, + "learning_rate": 1.1911035337879729e-07, + "loss": 1.0531, + "step": 30740 + }, + { + "epoch": 2.3828896896431475, + "grad_norm": 1.3912610363692655, + "learning_rate": 1.1914910105393678e-07, + "loss": 1.0218, + "step": 30750 + }, + { + "epoch": 2.3836646131194543, + "grad_norm": 1.4451215196150762, + "learning_rate": 1.1918784872907628e-07, + "loss": 1.0394, + "step": 30760 + }, + { + "epoch": 2.384439536595761, + "grad_norm": 1.2827497311172995, + "learning_rate": 1.1922659640421577e-07, + "loss": 1.0297, + "step": 30770 + }, + { + "epoch": 2.385214460072068, + "grad_norm": 1.2531277642197398, + "learning_rate": 1.1926534407935527e-07, + "loss": 1.0401, + "step": 30780 + }, + { + "epoch": 2.3859893835483748, + "grad_norm": 1.2476456539643275, + "learning_rate": 1.1930409175449474e-07, + "loss": 1.0345, + "step": 30790 + }, + { + "epoch": 2.386764307024681, + "grad_norm": 1.431457176441591, + "learning_rate": 1.1934283942963423e-07, + "loss": 1.0221, + "step": 30800 + }, + { + "epoch": 2.387539230500988, + "grad_norm": 1.285622718037392, + "learning_rate": 1.1938158710477373e-07, + "loss": 1.0459, + "step": 30810 + }, + { + "epoch": 2.3883141539772947, + "grad_norm": 1.387337169075615, + "learning_rate": 1.1942033477991322e-07, + "loss": 1.0553, + "step": 30820 + }, + { + "epoch": 2.3890890774536016, + "grad_norm": 1.2834962580485367, + "learning_rate": 1.1945908245505272e-07, + "loss": 1.0174, + "step": 30830 + }, + { + "epoch": 2.3898640009299084, + "grad_norm": 1.3806854863122795, + "learning_rate": 1.1949783013019221e-07, + "loss": 1.026, + "step": 30840 + }, + { + "epoch": 2.3906389244062147, + "grad_norm": 1.3728749295747644, + "learning_rate": 1.195365778053317e-07, + "loss": 1.0477, + "step": 30850 + }, + { + "epoch": 2.3914138478825215, + "grad_norm": 1.4442185303540052, + "learning_rate": 1.1957532548047118e-07, + "loss": 1.0603, + "step": 30860 + }, + { + "epoch": 2.3921887713588283, + "grad_norm": 1.3584584702180902, + "learning_rate": 1.1961407315561067e-07, + "loss": 1.0449, + "step": 30870 + }, + { + "epoch": 2.392963694835135, + "grad_norm": 1.4737866230092842, + "learning_rate": 1.1965282083075017e-07, + "loss": 1.0301, + "step": 30880 + }, + { + "epoch": 2.393738618311442, + "grad_norm": 1.3351862715439469, + "learning_rate": 1.1969156850588967e-07, + "loss": 1.0448, + "step": 30890 + }, + { + "epoch": 2.3945135417877483, + "grad_norm": 1.3435568801741105, + "learning_rate": 1.1973031618102916e-07, + "loss": 1.0223, + "step": 30900 + }, + { + "epoch": 2.395288465264055, + "grad_norm": 1.3789358523738846, + "learning_rate": 1.1976906385616866e-07, + "loss": 1.0426, + "step": 30910 + }, + { + "epoch": 2.396063388740362, + "grad_norm": 1.2943457606468898, + "learning_rate": 1.1980781153130815e-07, + "loss": 1.0158, + "step": 30920 + }, + { + "epoch": 2.3968383122166688, + "grad_norm": 1.2875666609914653, + "learning_rate": 1.1984655920644762e-07, + "loss": 1.0207, + "step": 30930 + }, + { + "epoch": 2.397613235692975, + "grad_norm": 1.257022722856215, + "learning_rate": 1.1988530688158712e-07, + "loss": 1.0399, + "step": 30940 + }, + { + "epoch": 2.398388159169282, + "grad_norm": 1.2876690179185037, + "learning_rate": 1.199240545567266e-07, + "loss": 1.0307, + "step": 30950 + }, + { + "epoch": 2.3991630826455888, + "grad_norm": 1.3160378015254668, + "learning_rate": 1.199628022318661e-07, + "loss": 1.0214, + "step": 30960 + }, + { + "epoch": 2.3999380061218956, + "grad_norm": 1.3589867043468395, + "learning_rate": 1.200015499070056e-07, + "loss": 1.0287, + "step": 30970 + }, + { + "epoch": 2.4007129295982024, + "grad_norm": 1.3717419313703105, + "learning_rate": 1.200402975821451e-07, + "loss": 1.0165, + "step": 30980 + }, + { + "epoch": 2.4014878530745087, + "grad_norm": 1.3007999667538066, + "learning_rate": 1.200790452572846e-07, + "loss": 1.0337, + "step": 30990 + }, + { + "epoch": 2.4022627765508155, + "grad_norm": 1.3508493250771691, + "learning_rate": 1.2011779293242406e-07, + "loss": 1.0549, + "step": 31000 + }, + { + "epoch": 2.4022627765508155, + "eval_loss": 1.031859278678894, + "eval_runtime": 320.3988, + "eval_samples_per_second": 35.802, + "eval_steps_per_second": 8.951, + "step": 31000 + }, + { + "epoch": 2.4030377000271224, + "grad_norm": 1.324798636407297, + "learning_rate": 1.2015654060756356e-07, + "loss": 1.0564, + "step": 31010 + }, + { + "epoch": 2.403812623503429, + "grad_norm": 1.289790390002471, + "learning_rate": 1.2019528828270305e-07, + "loss": 1.0326, + "step": 31020 + }, + { + "epoch": 2.4045875469797355, + "grad_norm": 1.2286431065192582, + "learning_rate": 1.2023403595784255e-07, + "loss": 1.0222, + "step": 31030 + }, + { + "epoch": 2.4053624704560423, + "grad_norm": 1.3420671862699227, + "learning_rate": 1.2027278363298205e-07, + "loss": 1.0473, + "step": 31040 + }, + { + "epoch": 2.406137393932349, + "grad_norm": 1.3797477806265523, + "learning_rate": 1.2031153130812154e-07, + "loss": 1.0378, + "step": 31050 + }, + { + "epoch": 2.406912317408656, + "grad_norm": 1.4629065948186246, + "learning_rate": 1.20350278983261e-07, + "loss": 1.0431, + "step": 31060 + }, + { + "epoch": 2.4076872408849628, + "grad_norm": 1.298130123155499, + "learning_rate": 1.203890266584005e-07, + "loss": 1.0183, + "step": 31070 + }, + { + "epoch": 2.408462164361269, + "grad_norm": 1.2939092042882738, + "learning_rate": 1.2042777433354e-07, + "loss": 1.0313, + "step": 31080 + }, + { + "epoch": 2.409237087837576, + "grad_norm": 1.2972549728311613, + "learning_rate": 1.204665220086795e-07, + "loss": 1.0366, + "step": 31090 + }, + { + "epoch": 2.4100120113138828, + "grad_norm": 1.4158718199759603, + "learning_rate": 1.20505269683819e-07, + "loss": 1.0532, + "step": 31100 + }, + { + "epoch": 2.4107869347901896, + "grad_norm": 1.3648783755698137, + "learning_rate": 1.205440173589585e-07, + "loss": 1.0073, + "step": 31110 + }, + { + "epoch": 2.411561858266496, + "grad_norm": 1.2647135043384583, + "learning_rate": 1.2058276503409798e-07, + "loss": 1.0172, + "step": 31120 + }, + { + "epoch": 2.4123367817428027, + "grad_norm": 1.3784660728806875, + "learning_rate": 1.2062151270923745e-07, + "loss": 1.0208, + "step": 31130 + }, + { + "epoch": 2.4131117052191096, + "grad_norm": 1.3246461753424392, + "learning_rate": 1.2066026038437695e-07, + "loss": 1.0247, + "step": 31140 + }, + { + "epoch": 2.4138866286954164, + "grad_norm": 1.2965753044162345, + "learning_rate": 1.2069900805951644e-07, + "loss": 1.0342, + "step": 31150 + }, + { + "epoch": 2.414661552171723, + "grad_norm": 1.2262378616336314, + "learning_rate": 1.2073775573465594e-07, + "loss": 1.0383, + "step": 31160 + }, + { + "epoch": 2.41543647564803, + "grad_norm": 1.3951927348928208, + "learning_rate": 1.2077650340979543e-07, + "loss": 1.0442, + "step": 31170 + }, + { + "epoch": 2.4162113991243364, + "grad_norm": 1.3704755618993891, + "learning_rate": 1.2081525108493493e-07, + "loss": 1.0459, + "step": 31180 + }, + { + "epoch": 2.416986322600643, + "grad_norm": 1.271110315980442, + "learning_rate": 1.2085399876007442e-07, + "loss": 1.0371, + "step": 31190 + }, + { + "epoch": 2.41776124607695, + "grad_norm": 1.2941961977797551, + "learning_rate": 1.208927464352139e-07, + "loss": 1.0484, + "step": 31200 + }, + { + "epoch": 2.418536169553257, + "grad_norm": 1.3623036257748344, + "learning_rate": 1.209314941103534e-07, + "loss": 1.0597, + "step": 31210 + }, + { + "epoch": 2.419311093029563, + "grad_norm": 1.3386694338432163, + "learning_rate": 1.2097024178549288e-07, + "loss": 1.0445, + "step": 31220 + }, + { + "epoch": 2.42008601650587, + "grad_norm": 1.3350016538171052, + "learning_rate": 1.2100898946063238e-07, + "loss": 1.0421, + "step": 31230 + }, + { + "epoch": 2.4208609399821768, + "grad_norm": 1.339531358967365, + "learning_rate": 1.2104773713577188e-07, + "loss": 1.0229, + "step": 31240 + }, + { + "epoch": 2.4216358634584836, + "grad_norm": 1.2828298044940647, + "learning_rate": 1.2108648481091137e-07, + "loss": 1.0486, + "step": 31250 + }, + { + "epoch": 2.4224107869347904, + "grad_norm": 1.2943661135164368, + "learning_rate": 1.2112523248605087e-07, + "loss": 1.0281, + "step": 31260 + }, + { + "epoch": 2.4231857104110968, + "grad_norm": 1.2901550115200353, + "learning_rate": 1.2116398016119034e-07, + "loss": 1.0299, + "step": 31270 + }, + { + "epoch": 2.4239606338874036, + "grad_norm": 1.2917315879241935, + "learning_rate": 1.2120272783632983e-07, + "loss": 1.0268, + "step": 31280 + }, + { + "epoch": 2.4247355573637104, + "grad_norm": 1.4280417464189397, + "learning_rate": 1.2124147551146933e-07, + "loss": 1.0194, + "step": 31290 + }, + { + "epoch": 2.425510480840017, + "grad_norm": 1.353728286448043, + "learning_rate": 1.2128022318660882e-07, + "loss": 1.0341, + "step": 31300 + }, + { + "epoch": 2.4262854043163236, + "grad_norm": 1.3365269592925433, + "learning_rate": 1.2131897086174832e-07, + "loss": 1.017, + "step": 31310 + }, + { + "epoch": 2.4270603277926304, + "grad_norm": 1.3740540564661003, + "learning_rate": 1.2135771853688781e-07, + "loss": 1.0457, + "step": 31320 + }, + { + "epoch": 2.427835251268937, + "grad_norm": 1.3393682117274606, + "learning_rate": 1.213964662120273e-07, + "loss": 1.0195, + "step": 31330 + }, + { + "epoch": 2.428610174745244, + "grad_norm": 1.2312521091701811, + "learning_rate": 1.2143521388716678e-07, + "loss": 1.026, + "step": 31340 + }, + { + "epoch": 2.429385098221551, + "grad_norm": 1.3243721996885038, + "learning_rate": 1.2147396156230627e-07, + "loss": 1.0134, + "step": 31350 + }, + { + "epoch": 2.430160021697857, + "grad_norm": 1.3300601329092523, + "learning_rate": 1.2151270923744577e-07, + "loss": 1.0099, + "step": 31360 + }, + { + "epoch": 2.430934945174164, + "grad_norm": 1.3476227411428316, + "learning_rate": 1.2155145691258526e-07, + "loss": 1.0294, + "step": 31370 + }, + { + "epoch": 2.431709868650471, + "grad_norm": 1.37899465908892, + "learning_rate": 1.2159020458772476e-07, + "loss": 1.0615, + "step": 31380 + }, + { + "epoch": 2.4324847921267776, + "grad_norm": 1.2810453764315617, + "learning_rate": 1.2162895226286426e-07, + "loss": 1.0372, + "step": 31390 + }, + { + "epoch": 2.433259715603084, + "grad_norm": 1.3451042262538788, + "learning_rate": 1.2166769993800372e-07, + "loss": 1.0369, + "step": 31400 + }, + { + "epoch": 2.4340346390793908, + "grad_norm": 1.3277589046782243, + "learning_rate": 1.2170644761314322e-07, + "loss": 1.0382, + "step": 31410 + }, + { + "epoch": 2.4348095625556976, + "grad_norm": 1.4014611457903299, + "learning_rate": 1.2174519528828272e-07, + "loss": 1.0424, + "step": 31420 + }, + { + "epoch": 2.4355844860320044, + "grad_norm": 1.3367598583258167, + "learning_rate": 1.217839429634222e-07, + "loss": 1.0358, + "step": 31430 + }, + { + "epoch": 2.436359409508311, + "grad_norm": 1.2890474382416368, + "learning_rate": 1.218226906385617e-07, + "loss": 1.0316, + "step": 31440 + }, + { + "epoch": 2.437134332984618, + "grad_norm": 1.4521704034617409, + "learning_rate": 1.218614383137012e-07, + "loss": 1.0202, + "step": 31450 + }, + { + "epoch": 2.4379092564609244, + "grad_norm": 1.3210915556104028, + "learning_rate": 1.219001859888407e-07, + "loss": 1.0528, + "step": 31460 + }, + { + "epoch": 2.438684179937231, + "grad_norm": 1.2967164968744949, + "learning_rate": 1.2193893366398017e-07, + "loss": 1.0195, + "step": 31470 + }, + { + "epoch": 2.439459103413538, + "grad_norm": 1.336896426647807, + "learning_rate": 1.2197768133911966e-07, + "loss": 1.0214, + "step": 31480 + }, + { + "epoch": 2.440234026889845, + "grad_norm": 1.3069231571035729, + "learning_rate": 1.2201642901425916e-07, + "loss": 1.0216, + "step": 31490 + }, + { + "epoch": 2.441008950366151, + "grad_norm": 1.244276561609857, + "learning_rate": 1.2205517668939865e-07, + "loss": 1.0396, + "step": 31500 + }, + { + "epoch": 2.441008950366151, + "eval_loss": 1.0297197103500366, + "eval_runtime": 320.9743, + "eval_samples_per_second": 35.738, + "eval_steps_per_second": 8.935, + "step": 31500 + }, + { + "epoch": 2.441783873842458, + "grad_norm": 1.2858143628111187, + "learning_rate": 1.2209392436453815e-07, + "loss": 1.0204, + "step": 31510 + }, + { + "epoch": 2.442558797318765, + "grad_norm": 1.330195647067226, + "learning_rate": 1.2213267203967764e-07, + "loss": 1.0369, + "step": 31520 + }, + { + "epoch": 2.4433337207950716, + "grad_norm": 1.3154196432269971, + "learning_rate": 1.2217141971481714e-07, + "loss": 1.0254, + "step": 31530 + }, + { + "epoch": 2.4441086442713784, + "grad_norm": 1.3164728837034838, + "learning_rate": 1.222101673899566e-07, + "loss": 1.0458, + "step": 31540 + }, + { + "epoch": 2.444883567747685, + "grad_norm": 1.2536582426884193, + "learning_rate": 1.222489150650961e-07, + "loss": 1.0152, + "step": 31550 + }, + { + "epoch": 2.4456584912239916, + "grad_norm": 1.3809849715501024, + "learning_rate": 1.222876627402356e-07, + "loss": 1.0497, + "step": 31560 + }, + { + "epoch": 2.4464334147002984, + "grad_norm": 1.4617061430374498, + "learning_rate": 1.223264104153751e-07, + "loss": 1.0285, + "step": 31570 + }, + { + "epoch": 2.447208338176605, + "grad_norm": 3.4165085650477316, + "learning_rate": 1.223651580905146e-07, + "loss": 1.0181, + "step": 31580 + }, + { + "epoch": 2.4479832616529116, + "grad_norm": 1.2776273965128744, + "learning_rate": 1.2240390576565409e-07, + "loss": 1.0086, + "step": 31590 + }, + { + "epoch": 2.4487581851292184, + "grad_norm": 1.2564034937236086, + "learning_rate": 1.2244265344079358e-07, + "loss": 1.0196, + "step": 31600 + }, + { + "epoch": 2.449533108605525, + "grad_norm": 1.423461626033995, + "learning_rate": 1.2248140111593305e-07, + "loss": 1.0524, + "step": 31610 + }, + { + "epoch": 2.450308032081832, + "grad_norm": 1.3238153089817561, + "learning_rate": 1.2252014879107255e-07, + "loss": 1.0213, + "step": 31620 + }, + { + "epoch": 2.451082955558139, + "grad_norm": 1.3556763051302958, + "learning_rate": 1.2255889646621204e-07, + "loss": 1.0271, + "step": 31630 + }, + { + "epoch": 2.451857879034445, + "grad_norm": 2.7328522517211193, + "learning_rate": 1.2259764414135154e-07, + "loss": 1.028, + "step": 31640 + }, + { + "epoch": 2.452632802510752, + "grad_norm": 1.2991077655076535, + "learning_rate": 1.2263639181649103e-07, + "loss": 1.0604, + "step": 31650 + }, + { + "epoch": 2.453407725987059, + "grad_norm": 1.3188205553164627, + "learning_rate": 1.2267513949163053e-07, + "loss": 1.0353, + "step": 31660 + }, + { + "epoch": 2.4541826494633656, + "grad_norm": 1.281617284528667, + "learning_rate": 1.2271388716677002e-07, + "loss": 1.0144, + "step": 31670 + }, + { + "epoch": 2.454957572939672, + "grad_norm": 1.2571982268839819, + "learning_rate": 1.227526348419095e-07, + "loss": 0.9876, + "step": 31680 + }, + { + "epoch": 2.455732496415979, + "grad_norm": 1.3171657610906509, + "learning_rate": 1.22791382517049e-07, + "loss": 1.0272, + "step": 31690 + }, + { + "epoch": 2.4565074198922856, + "grad_norm": 1.2979303685655155, + "learning_rate": 1.2283013019218848e-07, + "loss": 1.0118, + "step": 31700 + }, + { + "epoch": 2.4572823433685924, + "grad_norm": 1.3458288532327174, + "learning_rate": 1.2286887786732798e-07, + "loss": 1.0204, + "step": 31710 + }, + { + "epoch": 2.4580572668448992, + "grad_norm": 1.4000442972813647, + "learning_rate": 1.2290762554246748e-07, + "loss": 1.0192, + "step": 31720 + }, + { + "epoch": 2.4588321903212056, + "grad_norm": 1.3908304847952226, + "learning_rate": 1.2294637321760697e-07, + "loss": 1.0235, + "step": 31730 + }, + { + "epoch": 2.4596071137975124, + "grad_norm": 1.84394417716968, + "learning_rate": 1.2298512089274644e-07, + "loss": 1.0183, + "step": 31740 + }, + { + "epoch": 2.460382037273819, + "grad_norm": 1.3343989066661128, + "learning_rate": 1.2302386856788594e-07, + "loss": 0.9992, + "step": 31750 + }, + { + "epoch": 2.461156960750126, + "grad_norm": 1.3578503447871424, + "learning_rate": 1.2306261624302543e-07, + "loss": 1.0291, + "step": 31760 + }, + { + "epoch": 2.461931884226433, + "grad_norm": 1.3970342020575997, + "learning_rate": 1.2310136391816493e-07, + "loss": 1.0156, + "step": 31770 + }, + { + "epoch": 2.462706807702739, + "grad_norm": 1.26924337458599, + "learning_rate": 1.2314011159330442e-07, + "loss": 1.0153, + "step": 31780 + }, + { + "epoch": 2.463481731179046, + "grad_norm": 1.2452868421963825, + "learning_rate": 1.2317885926844392e-07, + "loss": 0.9969, + "step": 31790 + }, + { + "epoch": 2.464256654655353, + "grad_norm": 1.369009285982752, + "learning_rate": 1.232176069435834e-07, + "loss": 1.0395, + "step": 31800 + }, + { + "epoch": 2.4650315781316596, + "grad_norm": 1.3118533677697375, + "learning_rate": 1.2325635461872288e-07, + "loss": 1.0216, + "step": 31810 + }, + { + "epoch": 2.4658065016079664, + "grad_norm": 1.730626462198374, + "learning_rate": 1.2329510229386238e-07, + "loss": 1.0292, + "step": 31820 + }, + { + "epoch": 2.466581425084273, + "grad_norm": 1.3040949208426924, + "learning_rate": 1.2333384996900187e-07, + "loss": 1.055, + "step": 31830 + }, + { + "epoch": 2.4673563485605796, + "grad_norm": 1.3645091483009908, + "learning_rate": 1.2337259764414137e-07, + "loss": 1.043, + "step": 31840 + }, + { + "epoch": 2.4681312720368864, + "grad_norm": 1.2958621445726826, + "learning_rate": 1.2341134531928086e-07, + "loss": 1.0411, + "step": 31850 + }, + { + "epoch": 2.4689061955131932, + "grad_norm": 1.371596241228174, + "learning_rate": 1.2345009299442036e-07, + "loss": 1.0333, + "step": 31860 + }, + { + "epoch": 2.4696811189894996, + "grad_norm": 1.3340024173807916, + "learning_rate": 1.2348884066955985e-07, + "loss": 1.0393, + "step": 31870 + }, + { + "epoch": 2.4704560424658064, + "grad_norm": 1.3247231240198232, + "learning_rate": 1.2352758834469932e-07, + "loss": 1.0534, + "step": 31880 + }, + { + "epoch": 2.4712309659421132, + "grad_norm": 1.3028292431537039, + "learning_rate": 1.2356633601983882e-07, + "loss": 1.0129, + "step": 31890 + }, + { + "epoch": 2.47200588941842, + "grad_norm": 1.2990628008269474, + "learning_rate": 1.2360508369497831e-07, + "loss": 1.0278, + "step": 31900 + }, + { + "epoch": 2.472780812894727, + "grad_norm": 1.4129120267373514, + "learning_rate": 1.236438313701178e-07, + "loss": 1.0191, + "step": 31910 + }, + { + "epoch": 2.473555736371033, + "grad_norm": 1.3483282519197224, + "learning_rate": 1.236825790452573e-07, + "loss": 1.0108, + "step": 31920 + }, + { + "epoch": 2.47433065984734, + "grad_norm": 1.3581829273806385, + "learning_rate": 1.237213267203968e-07, + "loss": 1.0411, + "step": 31930 + }, + { + "epoch": 2.475105583323647, + "grad_norm": 1.3358295831557816, + "learning_rate": 1.237600743955363e-07, + "loss": 1.0012, + "step": 31940 + }, + { + "epoch": 2.4758805067999536, + "grad_norm": 1.3864583596253226, + "learning_rate": 1.2379882207067577e-07, + "loss": 1.0383, + "step": 31950 + }, + { + "epoch": 2.47665543027626, + "grad_norm": 1.2834451727859257, + "learning_rate": 1.2383756974581526e-07, + "loss": 1.0317, + "step": 31960 + }, + { + "epoch": 2.477430353752567, + "grad_norm": 1.2803263158187301, + "learning_rate": 1.2387631742095476e-07, + "loss": 1.027, + "step": 31970 + }, + { + "epoch": 2.4782052772288736, + "grad_norm": 1.2562540369364092, + "learning_rate": 1.2391506509609425e-07, + "loss": 1.0178, + "step": 31980 + }, + { + "epoch": 2.4789802007051804, + "grad_norm": 1.3337016704547182, + "learning_rate": 1.2395381277123375e-07, + "loss": 1.0119, + "step": 31990 + }, + { + "epoch": 2.4797551241814872, + "grad_norm": 1.3768736007308462, + "learning_rate": 1.2399256044637324e-07, + "loss": 1.0266, + "step": 32000 + }, + { + "epoch": 2.4797551241814872, + "eval_loss": 1.027754783630371, + "eval_runtime": 320.5901, + "eval_samples_per_second": 35.781, + "eval_steps_per_second": 8.946, + "step": 32000 + }, + { + "epoch": 2.4805300476577936, + "grad_norm": 1.349588742167206, + "learning_rate": 1.2403130812151274e-07, + "loss": 1.034, + "step": 32010 + }, + { + "epoch": 2.4813049711341004, + "grad_norm": 1.2631035710951612, + "learning_rate": 1.240700557966522e-07, + "loss": 1.0407, + "step": 32020 + }, + { + "epoch": 2.4820798946104072, + "grad_norm": 1.3991371194103568, + "learning_rate": 1.241088034717917e-07, + "loss": 1.0191, + "step": 32030 + }, + { + "epoch": 2.482854818086714, + "grad_norm": 1.2889156982405483, + "learning_rate": 1.241475511469312e-07, + "loss": 1.0373, + "step": 32040 + }, + { + "epoch": 2.4836297415630204, + "grad_norm": 1.3927551572615802, + "learning_rate": 1.241862988220707e-07, + "loss": 1.0238, + "step": 32050 + }, + { + "epoch": 2.484404665039327, + "grad_norm": 1.3470571978154762, + "learning_rate": 1.242250464972102e-07, + "loss": 1.0184, + "step": 32060 + }, + { + "epoch": 2.485179588515634, + "grad_norm": 1.2488620086600997, + "learning_rate": 1.2426379417234969e-07, + "loss": 1.0065, + "step": 32070 + }, + { + "epoch": 2.485954511991941, + "grad_norm": 1.3663843642080011, + "learning_rate": 1.2430254184748918e-07, + "loss": 1.0237, + "step": 32080 + }, + { + "epoch": 2.4867294354682477, + "grad_norm": 2.6654057198251184, + "learning_rate": 1.2434128952262865e-07, + "loss": 1.0237, + "step": 32090 + }, + { + "epoch": 2.4875043589445545, + "grad_norm": 1.3622192947161451, + "learning_rate": 1.2438003719776815e-07, + "loss": 1.0264, + "step": 32100 + }, + { + "epoch": 2.488279282420861, + "grad_norm": 1.322122011894797, + "learning_rate": 1.2441878487290764e-07, + "loss": 1.0273, + "step": 32110 + }, + { + "epoch": 2.4890542058971676, + "grad_norm": 1.2616269138982896, + "learning_rate": 1.2445753254804714e-07, + "loss": 1.0053, + "step": 32120 + }, + { + "epoch": 2.4898291293734744, + "grad_norm": 1.3620843245693632, + "learning_rate": 1.2449628022318663e-07, + "loss": 1.0469, + "step": 32130 + }, + { + "epoch": 2.4906040528497813, + "grad_norm": 1.2557613964685195, + "learning_rate": 1.2453502789832613e-07, + "loss": 1.0268, + "step": 32140 + }, + { + "epoch": 2.4913789763260876, + "grad_norm": 1.2442346649151903, + "learning_rate": 1.245737755734656e-07, + "loss": 1.034, + "step": 32150 + }, + { + "epoch": 2.4921538998023944, + "grad_norm": 1.315798375931804, + "learning_rate": 1.246125232486051e-07, + "loss": 1.0389, + "step": 32160 + }, + { + "epoch": 2.4929288232787012, + "grad_norm": 1.352392696518136, + "learning_rate": 1.246512709237446e-07, + "loss": 1.0277, + "step": 32170 + }, + { + "epoch": 2.493703746755008, + "grad_norm": 1.321655128792712, + "learning_rate": 1.2469001859888408e-07, + "loss": 1.0272, + "step": 32180 + }, + { + "epoch": 2.494478670231315, + "grad_norm": 1.2927838769120266, + "learning_rate": 1.2472876627402358e-07, + "loss": 1.036, + "step": 32190 + }, + { + "epoch": 2.4952535937076212, + "grad_norm": 1.1948614214976243, + "learning_rate": 1.2476751394916307e-07, + "loss": 1.0155, + "step": 32200 + }, + { + "epoch": 2.496028517183928, + "grad_norm": 1.3869524047107948, + "learning_rate": 1.2480626162430257e-07, + "loss": 1.0287, + "step": 32210 + }, + { + "epoch": 2.496803440660235, + "grad_norm": 1.4146220612142506, + "learning_rate": 1.2484500929944204e-07, + "loss": 1.0295, + "step": 32220 + }, + { + "epoch": 2.4975783641365417, + "grad_norm": 1.416089548057688, + "learning_rate": 1.2488375697458153e-07, + "loss": 1.0225, + "step": 32230 + }, + { + "epoch": 2.498353287612848, + "grad_norm": 1.346449028849504, + "learning_rate": 1.2492250464972103e-07, + "loss": 1.0519, + "step": 32240 + }, + { + "epoch": 2.499128211089155, + "grad_norm": 1.3832332661557538, + "learning_rate": 1.2496125232486053e-07, + "loss": 1.0387, + "step": 32250 + }, + { + "epoch": 2.4999031345654616, + "grad_norm": 1.3564030637800086, + "learning_rate": 1.2500000000000002e-07, + "loss": 1.0207, + "step": 32260 + }, + { + "epoch": 2.5006780580417685, + "grad_norm": 1.3098747342392496, + "learning_rate": 1.2503874767513952e-07, + "loss": 1.0068, + "step": 32270 + }, + { + "epoch": 2.5014529815180753, + "grad_norm": 1.350998095294408, + "learning_rate": 1.25077495350279e-07, + "loss": 1.0228, + "step": 32280 + }, + { + "epoch": 2.502227904994382, + "grad_norm": 1.317337076631063, + "learning_rate": 1.2511624302541848e-07, + "loss": 1.0225, + "step": 32290 + }, + { + "epoch": 2.5030028284706884, + "grad_norm": 1.4398824442885465, + "learning_rate": 1.2515499070055798e-07, + "loss": 1.0342, + "step": 32300 + }, + { + "epoch": 2.5037777519469953, + "grad_norm": 1.307036759382867, + "learning_rate": 1.2519373837569747e-07, + "loss": 1.0153, + "step": 32310 + }, + { + "epoch": 2.504552675423302, + "grad_norm": 1.377961298815634, + "learning_rate": 1.2523248605083697e-07, + "loss": 1.0094, + "step": 32320 + }, + { + "epoch": 2.5053275988996084, + "grad_norm": 1.3563604294624894, + "learning_rate": 1.2527123372597646e-07, + "loss": 1.032, + "step": 32330 + }, + { + "epoch": 2.5061025223759152, + "grad_norm": 1.343821673898946, + "learning_rate": 1.2530998140111596e-07, + "loss": 1.0247, + "step": 32340 + }, + { + "epoch": 2.506877445852222, + "grad_norm": 1.2971738330831923, + "learning_rate": 1.2534872907625545e-07, + "loss": 1.032, + "step": 32350 + }, + { + "epoch": 2.507652369328529, + "grad_norm": 1.3068124078595513, + "learning_rate": 1.2538747675139492e-07, + "loss": 1.0357, + "step": 32360 + }, + { + "epoch": 2.5084272928048357, + "grad_norm": 1.2929174571901803, + "learning_rate": 1.2542622442653442e-07, + "loss": 1.0366, + "step": 32370 + }, + { + "epoch": 2.5092022162811425, + "grad_norm": 1.4190482371399105, + "learning_rate": 1.2546497210167391e-07, + "loss": 1.0222, + "step": 32380 + }, + { + "epoch": 2.509977139757449, + "grad_norm": 1.3800757076875305, + "learning_rate": 1.255037197768134e-07, + "loss": 1.0419, + "step": 32390 + }, + { + "epoch": 2.5107520632337557, + "grad_norm": 1.4538775874728775, + "learning_rate": 1.255424674519529e-07, + "loss": 1.025, + "step": 32400 + }, + { + "epoch": 2.5115269867100625, + "grad_norm": 1.3121818683593067, + "learning_rate": 1.255812151270924e-07, + "loss": 1.0153, + "step": 32410 + }, + { + "epoch": 2.512301910186369, + "grad_norm": 1.3763828782340983, + "learning_rate": 1.256199628022319e-07, + "loss": 1.0363, + "step": 32420 + }, + { + "epoch": 2.5130768336626756, + "grad_norm": 1.4011680730830747, + "learning_rate": 1.2565871047737137e-07, + "loss": 1.0239, + "step": 32430 + }, + { + "epoch": 2.5138517571389825, + "grad_norm": 1.2907634166807993, + "learning_rate": 1.2569745815251086e-07, + "loss": 1.0208, + "step": 32440 + }, + { + "epoch": 2.5146266806152893, + "grad_norm": 1.3419115167429376, + "learning_rate": 1.2573620582765036e-07, + "loss": 1.0208, + "step": 32450 + }, + { + "epoch": 2.515401604091596, + "grad_norm": 1.2820123870462914, + "learning_rate": 1.2577495350278985e-07, + "loss": 1.0394, + "step": 32460 + }, + { + "epoch": 2.516176527567903, + "grad_norm": 1.4568822540025637, + "learning_rate": 1.2581370117792935e-07, + "loss": 1.0192, + "step": 32470 + }, + { + "epoch": 2.5169514510442093, + "grad_norm": 1.7128536730671213, + "learning_rate": 1.2585244885306884e-07, + "loss": 1.0657, + "step": 32480 + }, + { + "epoch": 2.517726374520516, + "grad_norm": 1.4295420618628263, + "learning_rate": 1.258911965282083e-07, + "loss": 1.023, + "step": 32490 + }, + { + "epoch": 2.518501297996823, + "grad_norm": 1.3847170494051215, + "learning_rate": 1.259299442033478e-07, + "loss": 1.0189, + "step": 32500 + }, + { + "epoch": 2.518501297996823, + "eval_loss": 1.025810956954956, + "eval_runtime": 321.9241, + "eval_samples_per_second": 35.633, + "eval_steps_per_second": 8.909, + "step": 32500 + }, + { + "epoch": 2.5192762214731297, + "grad_norm": 1.264775710446259, + "learning_rate": 1.259686918784873e-07, + "loss": 1.0144, + "step": 32510 + }, + { + "epoch": 2.520051144949436, + "grad_norm": 1.3792523118932545, + "learning_rate": 1.260074395536268e-07, + "loss": 1.0138, + "step": 32520 + }, + { + "epoch": 2.520826068425743, + "grad_norm": 1.276304896842322, + "learning_rate": 1.260461872287663e-07, + "loss": 1.0407, + "step": 32530 + }, + { + "epoch": 2.5216009919020497, + "grad_norm": 1.3276216523749989, + "learning_rate": 1.260849349039058e-07, + "loss": 1.0373, + "step": 32540 + }, + { + "epoch": 2.5223759153783565, + "grad_norm": 1.3326497086242073, + "learning_rate": 1.2612368257904528e-07, + "loss": 1.0221, + "step": 32550 + }, + { + "epoch": 2.5231508388546633, + "grad_norm": 1.3407260669949446, + "learning_rate": 1.2616243025418475e-07, + "loss": 1.0229, + "step": 32560 + }, + { + "epoch": 2.5239257623309697, + "grad_norm": 1.4437060632862626, + "learning_rate": 1.2620117792932425e-07, + "loss": 1.0285, + "step": 32570 + }, + { + "epoch": 2.5247006858072765, + "grad_norm": 1.369016806401981, + "learning_rate": 1.2623992560446374e-07, + "loss": 1.0123, + "step": 32580 + }, + { + "epoch": 2.5254756092835833, + "grad_norm": 1.3823704407368578, + "learning_rate": 1.2627867327960324e-07, + "loss": 1.0153, + "step": 32590 + }, + { + "epoch": 2.52625053275989, + "grad_norm": 1.2590055112262855, + "learning_rate": 1.2631742095474274e-07, + "loss": 1.0107, + "step": 32600 + }, + { + "epoch": 2.5270254562361965, + "grad_norm": 1.373729029075316, + "learning_rate": 1.2635616862988223e-07, + "loss": 1.0214, + "step": 32610 + }, + { + "epoch": 2.5278003797125033, + "grad_norm": 1.2920418618758838, + "learning_rate": 1.2639491630502173e-07, + "loss": 1.028, + "step": 32620 + }, + { + "epoch": 2.52857530318881, + "grad_norm": 1.3458968707786285, + "learning_rate": 1.264336639801612e-07, + "loss": 1.0215, + "step": 32630 + }, + { + "epoch": 2.529350226665117, + "grad_norm": 1.3482384245818784, + "learning_rate": 1.264724116553007e-07, + "loss": 1.0401, + "step": 32640 + }, + { + "epoch": 2.5301251501414237, + "grad_norm": 1.3446729076883401, + "learning_rate": 1.265111593304402e-07, + "loss": 1.0066, + "step": 32650 + }, + { + "epoch": 2.5309000736177305, + "grad_norm": 1.304055980609436, + "learning_rate": 1.2654990700557968e-07, + "loss": 1.035, + "step": 32660 + }, + { + "epoch": 2.531674997094037, + "grad_norm": 1.3056356149711974, + "learning_rate": 1.2658865468071918e-07, + "loss": 1.018, + "step": 32670 + }, + { + "epoch": 2.5324499205703437, + "grad_norm": 1.2326188446822726, + "learning_rate": 1.2662740235585867e-07, + "loss": 1.0205, + "step": 32680 + }, + { + "epoch": 2.5332248440466505, + "grad_norm": 1.3934034230151404, + "learning_rate": 1.2666615003099817e-07, + "loss": 1.0208, + "step": 32690 + }, + { + "epoch": 2.533999767522957, + "grad_norm": 1.311477627044682, + "learning_rate": 1.2670489770613764e-07, + "loss": 1.0208, + "step": 32700 + }, + { + "epoch": 2.5347746909992637, + "grad_norm": 1.338569673784658, + "learning_rate": 1.2674364538127713e-07, + "loss": 1.03, + "step": 32710 + }, + { + "epoch": 2.5355496144755705, + "grad_norm": 1.358366965145062, + "learning_rate": 1.2678239305641663e-07, + "loss": 1.0253, + "step": 32720 + }, + { + "epoch": 2.5363245379518773, + "grad_norm": 1.325785759288938, + "learning_rate": 1.2682114073155612e-07, + "loss": 1.0211, + "step": 32730 + }, + { + "epoch": 2.537099461428184, + "grad_norm": 1.2495618392995334, + "learning_rate": 1.2685988840669562e-07, + "loss": 1.0377, + "step": 32740 + }, + { + "epoch": 2.537874384904491, + "grad_norm": 1.397576459034045, + "learning_rate": 1.2689863608183512e-07, + "loss": 1.0085, + "step": 32750 + }, + { + "epoch": 2.5386493083807973, + "grad_norm": 1.3937162327587493, + "learning_rate": 1.269373837569746e-07, + "loss": 1.0252, + "step": 32760 + }, + { + "epoch": 2.539424231857104, + "grad_norm": 1.2916200398996667, + "learning_rate": 1.2697613143211408e-07, + "loss": 1.032, + "step": 32770 + }, + { + "epoch": 2.540199155333411, + "grad_norm": 1.2583179577325223, + "learning_rate": 1.2701487910725358e-07, + "loss": 1.0131, + "step": 32780 + }, + { + "epoch": 2.5409740788097177, + "grad_norm": 1.2735528727996273, + "learning_rate": 1.2705362678239307e-07, + "loss": 1.0342, + "step": 32790 + }, + { + "epoch": 2.541749002286024, + "grad_norm": 1.3429686312753268, + "learning_rate": 1.2709237445753257e-07, + "loss": 1.0355, + "step": 32800 + }, + { + "epoch": 2.542523925762331, + "grad_norm": 1.3243450171905906, + "learning_rate": 1.2713112213267206e-07, + "loss": 1.0169, + "step": 32810 + }, + { + "epoch": 2.5432988492386377, + "grad_norm": 1.5168031507972835, + "learning_rate": 1.2716986980781156e-07, + "loss": 1.0375, + "step": 32820 + }, + { + "epoch": 2.5440737727149445, + "grad_norm": 1.3018289324333847, + "learning_rate": 1.2720861748295103e-07, + "loss": 1.042, + "step": 32830 + }, + { + "epoch": 2.5448486961912513, + "grad_norm": 1.3432134957706996, + "learning_rate": 1.2724736515809052e-07, + "loss": 1.0385, + "step": 32840 + }, + { + "epoch": 2.5456236196675577, + "grad_norm": 1.311720124839164, + "learning_rate": 1.2728611283323002e-07, + "loss": 1.0071, + "step": 32850 + }, + { + "epoch": 2.5463985431438645, + "grad_norm": 1.2783224702923834, + "learning_rate": 1.273248605083695e-07, + "loss": 1.018, + "step": 32860 + }, + { + "epoch": 2.5471734666201713, + "grad_norm": 1.2580585066211052, + "learning_rate": 1.27363608183509e-07, + "loss": 0.9947, + "step": 32870 + }, + { + "epoch": 2.547948390096478, + "grad_norm": 1.3133352096867756, + "learning_rate": 1.274023558586485e-07, + "loss": 1.016, + "step": 32880 + }, + { + "epoch": 2.5487233135727845, + "grad_norm": 1.3871616957033246, + "learning_rate": 1.27441103533788e-07, + "loss": 1.017, + "step": 32890 + }, + { + "epoch": 2.5494982370490913, + "grad_norm": 1.3546910721364283, + "learning_rate": 1.2747985120892747e-07, + "loss": 1.0354, + "step": 32900 + }, + { + "epoch": 2.550273160525398, + "grad_norm": 1.3262107695859433, + "learning_rate": 1.2751859888406696e-07, + "loss": 1.0345, + "step": 32910 + }, + { + "epoch": 2.551048084001705, + "grad_norm": 1.3008533851648467, + "learning_rate": 1.2755734655920646e-07, + "loss": 1.0071, + "step": 32920 + }, + { + "epoch": 2.5518230074780117, + "grad_norm": 1.2625679772826806, + "learning_rate": 1.2759609423434596e-07, + "loss": 1.0261, + "step": 32930 + }, + { + "epoch": 2.5525979309543185, + "grad_norm": 1.2771129176850142, + "learning_rate": 1.2763484190948545e-07, + "loss": 1.0197, + "step": 32940 + }, + { + "epoch": 2.553372854430625, + "grad_norm": 1.4684638347806738, + "learning_rate": 1.2767358958462495e-07, + "loss": 1.0256, + "step": 32950 + }, + { + "epoch": 2.5541477779069317, + "grad_norm": 1.3044366704849037, + "learning_rate": 1.2771233725976444e-07, + "loss": 1.0261, + "step": 32960 + }, + { + "epoch": 2.5549227013832385, + "grad_norm": 1.227119996832115, + "learning_rate": 1.277510849349039e-07, + "loss": 1.0336, + "step": 32970 + }, + { + "epoch": 2.555697624859545, + "grad_norm": 1.3356489708543164, + "learning_rate": 1.277898326100434e-07, + "loss": 0.9958, + "step": 32980 + }, + { + "epoch": 2.5564725483358517, + "grad_norm": 1.2751458267267135, + "learning_rate": 1.278285802851829e-07, + "loss": 1.0374, + "step": 32990 + }, + { + "epoch": 2.5572474718121585, + "grad_norm": 1.360361482606436, + "learning_rate": 1.278673279603224e-07, + "loss": 1.0251, + "step": 33000 + }, + { + "epoch": 2.5572474718121585, + "eval_loss": 1.0238994359970093, + "eval_runtime": 319.8471, + "eval_samples_per_second": 35.864, + "eval_steps_per_second": 8.967, + "step": 33000 + }, + { + "epoch": 2.5580223952884653, + "grad_norm": 1.2844904843480223, + "learning_rate": 1.279060756354619e-07, + "loss": 1.0211, + "step": 33010 + }, + { + "epoch": 2.558797318764772, + "grad_norm": 1.3706581522117605, + "learning_rate": 1.279448233106014e-07, + "loss": 1.0259, + "step": 33020 + }, + { + "epoch": 2.559572242241079, + "grad_norm": 1.400529206184911, + "learning_rate": 1.2798357098574088e-07, + "loss": 1.0172, + "step": 33030 + }, + { + "epoch": 2.5603471657173853, + "grad_norm": 1.3488207652944857, + "learning_rate": 1.2802231866088035e-07, + "loss": 1.0278, + "step": 33040 + }, + { + "epoch": 2.561122089193692, + "grad_norm": 1.3899740668328748, + "learning_rate": 1.2806106633601985e-07, + "loss": 1.0451, + "step": 33050 + }, + { + "epoch": 2.561897012669999, + "grad_norm": 1.2700914248874442, + "learning_rate": 1.2809981401115934e-07, + "loss": 1.0556, + "step": 33060 + }, + { + "epoch": 2.5626719361463053, + "grad_norm": 1.3053489784442065, + "learning_rate": 1.2813856168629884e-07, + "loss": 0.9895, + "step": 33070 + }, + { + "epoch": 2.563446859622612, + "grad_norm": 1.3586106148499464, + "learning_rate": 1.2817730936143833e-07, + "loss": 1.0172, + "step": 33080 + }, + { + "epoch": 2.564221783098919, + "grad_norm": 1.3451172395005853, + "learning_rate": 1.2821605703657783e-07, + "loss": 1.035, + "step": 33090 + }, + { + "epoch": 2.5649967065752257, + "grad_norm": 1.2969737846532658, + "learning_rate": 1.2825480471171733e-07, + "loss": 1.0119, + "step": 33100 + }, + { + "epoch": 2.5657716300515325, + "grad_norm": 1.3125073466704635, + "learning_rate": 1.282935523868568e-07, + "loss": 1.0158, + "step": 33110 + }, + { + "epoch": 2.5665465535278393, + "grad_norm": 1.3729100872166609, + "learning_rate": 1.283323000619963e-07, + "loss": 1.0179, + "step": 33120 + }, + { + "epoch": 2.5673214770041457, + "grad_norm": 1.2794993877967704, + "learning_rate": 1.2837104773713579e-07, + "loss": 1.013, + "step": 33130 + }, + { + "epoch": 2.5680964004804525, + "grad_norm": 1.2838069192389336, + "learning_rate": 1.2840979541227528e-07, + "loss": 1.0343, + "step": 33140 + }, + { + "epoch": 2.5688713239567593, + "grad_norm": 1.3243420111551287, + "learning_rate": 1.2844854308741478e-07, + "loss": 1.0097, + "step": 33150 + }, + { + "epoch": 2.569646247433066, + "grad_norm": 1.2846810768851185, + "learning_rate": 1.2848729076255427e-07, + "loss": 1.031, + "step": 33160 + }, + { + "epoch": 2.5704211709093725, + "grad_norm": 1.4963596590674808, + "learning_rate": 1.2852603843769377e-07, + "loss": 1.0166, + "step": 33170 + }, + { + "epoch": 2.5711960943856793, + "grad_norm": 1.3365307014787233, + "learning_rate": 1.2856478611283324e-07, + "loss": 1.0315, + "step": 33180 + }, + { + "epoch": 2.571971017861986, + "grad_norm": 1.3163546219927935, + "learning_rate": 1.2860353378797273e-07, + "loss": 1.0118, + "step": 33190 + }, + { + "epoch": 2.572745941338293, + "grad_norm": 1.3149485767833227, + "learning_rate": 1.2864228146311223e-07, + "loss": 1.0148, + "step": 33200 + }, + { + "epoch": 2.5735208648145997, + "grad_norm": 1.491715457833688, + "learning_rate": 1.2868102913825172e-07, + "loss": 1.045, + "step": 33210 + }, + { + "epoch": 2.5742957882909066, + "grad_norm": 1.3648472123802553, + "learning_rate": 1.2871977681339122e-07, + "loss": 1.038, + "step": 33220 + }, + { + "epoch": 2.575070711767213, + "grad_norm": 1.3084645092596272, + "learning_rate": 1.2875852448853071e-07, + "loss": 1.028, + "step": 33230 + }, + { + "epoch": 2.5758456352435197, + "grad_norm": 1.3651511517968309, + "learning_rate": 1.2879727216367018e-07, + "loss": 1.0201, + "step": 33240 + }, + { + "epoch": 2.5766205587198265, + "grad_norm": 1.3836994577475081, + "learning_rate": 1.2883601983880968e-07, + "loss": 1.0417, + "step": 33250 + }, + { + "epoch": 2.577395482196133, + "grad_norm": 1.3198861514090312, + "learning_rate": 1.2887476751394917e-07, + "loss": 1.0397, + "step": 33260 + }, + { + "epoch": 2.5781704056724397, + "grad_norm": 1.3565735407088908, + "learning_rate": 1.2891351518908867e-07, + "loss": 1.0061, + "step": 33270 + }, + { + "epoch": 2.5789453291487465, + "grad_norm": 1.36367618021678, + "learning_rate": 1.2895226286422817e-07, + "loss": 1.0323, + "step": 33280 + }, + { + "epoch": 2.5797202526250533, + "grad_norm": 1.2991625330040855, + "learning_rate": 1.2899101053936766e-07, + "loss": 1.0166, + "step": 33290 + }, + { + "epoch": 2.58049517610136, + "grad_norm": 1.3503006865931866, + "learning_rate": 1.2902975821450716e-07, + "loss": 1.0152, + "step": 33300 + }, + { + "epoch": 2.581270099577667, + "grad_norm": 1.301098846985475, + "learning_rate": 1.2906850588964663e-07, + "loss": 1.0189, + "step": 33310 + }, + { + "epoch": 2.5820450230539733, + "grad_norm": 1.3660008058253228, + "learning_rate": 1.2910725356478612e-07, + "loss": 1.0281, + "step": 33320 + }, + { + "epoch": 2.58281994653028, + "grad_norm": 1.3384325263833423, + "learning_rate": 1.2914600123992562e-07, + "loss": 1.0188, + "step": 33330 + }, + { + "epoch": 2.583594870006587, + "grad_norm": 1.2343450214378109, + "learning_rate": 1.291847489150651e-07, + "loss": 1.0312, + "step": 33340 + }, + { + "epoch": 2.5843697934828933, + "grad_norm": 1.335413748655022, + "learning_rate": 1.292234965902046e-07, + "loss": 1.0246, + "step": 33350 + }, + { + "epoch": 2.5851447169592, + "grad_norm": 1.2713733585481035, + "learning_rate": 1.292622442653441e-07, + "loss": 1.0096, + "step": 33360 + }, + { + "epoch": 2.585919640435507, + "grad_norm": 1.3208978519962058, + "learning_rate": 1.293009919404836e-07, + "loss": 1.0327, + "step": 33370 + }, + { + "epoch": 2.5866945639118137, + "grad_norm": 1.4212136598451865, + "learning_rate": 1.2933973961562307e-07, + "loss": 1.0453, + "step": 33380 + }, + { + "epoch": 2.5874694873881205, + "grad_norm": 1.3573849941489535, + "learning_rate": 1.2937848729076256e-07, + "loss": 1.0223, + "step": 33390 + }, + { + "epoch": 2.5882444108644274, + "grad_norm": 1.399024568426201, + "learning_rate": 1.2941723496590206e-07, + "loss": 1.025, + "step": 33400 + }, + { + "epoch": 2.5890193343407337, + "grad_norm": 1.356914120499604, + "learning_rate": 1.2945598264104155e-07, + "loss": 1.0384, + "step": 33410 + }, + { + "epoch": 2.5897942578170405, + "grad_norm": 1.2902139323075392, + "learning_rate": 1.2949473031618105e-07, + "loss": 1.0114, + "step": 33420 + }, + { + "epoch": 2.5905691812933473, + "grad_norm": 1.3153163540296684, + "learning_rate": 1.2953347799132055e-07, + "loss": 1.0381, + "step": 33430 + }, + { + "epoch": 2.591344104769654, + "grad_norm": 1.359344631264724, + "learning_rate": 1.2957222566646004e-07, + "loss": 1.03, + "step": 33440 + }, + { + "epoch": 2.5921190282459605, + "grad_norm": 1.2164525549883167, + "learning_rate": 1.296109733415995e-07, + "loss": 1.0065, + "step": 33450 + }, + { + "epoch": 2.5928939517222673, + "grad_norm": 1.230768644002472, + "learning_rate": 1.29649721016739e-07, + "loss": 1.0347, + "step": 33460 + }, + { + "epoch": 2.593668875198574, + "grad_norm": 1.2940443305739076, + "learning_rate": 1.296884686918785e-07, + "loss": 1.029, + "step": 33470 + }, + { + "epoch": 2.594443798674881, + "grad_norm": 1.327544179163975, + "learning_rate": 1.29727216367018e-07, + "loss": 1.0299, + "step": 33480 + }, + { + "epoch": 2.5952187221511878, + "grad_norm": 1.3376577293590546, + "learning_rate": 1.297659640421575e-07, + "loss": 1.0355, + "step": 33490 + }, + { + "epoch": 2.595993645627494, + "grad_norm": 1.4361612902716654, + "learning_rate": 1.29804711717297e-07, + "loss": 1.0031, + "step": 33500 + }, + { + "epoch": 2.595993645627494, + "eval_loss": 1.0220338106155396, + "eval_runtime": 319.3424, + "eval_samples_per_second": 35.921, + "eval_steps_per_second": 8.981, + "step": 33500 + }, + { + "epoch": 2.596768569103801, + "grad_norm": 1.3222340122843383, + "learning_rate": 1.2984345939243648e-07, + "loss": 1.0179, + "step": 33510 + }, + { + "epoch": 2.5975434925801077, + "grad_norm": 1.332287889997271, + "learning_rate": 1.2988220706757595e-07, + "loss": 1.0394, + "step": 33520 + }, + { + "epoch": 2.5983184160564146, + "grad_norm": 1.3124428285164733, + "learning_rate": 1.2992095474271545e-07, + "loss": 1.0202, + "step": 33530 + }, + { + "epoch": 2.599093339532721, + "grad_norm": 1.2495488979418716, + "learning_rate": 1.2995970241785494e-07, + "loss": 1.0227, + "step": 33540 + }, + { + "epoch": 2.5998682630090277, + "grad_norm": 1.2736451206941735, + "learning_rate": 1.2999845009299444e-07, + "loss": 1.0331, + "step": 33550 + }, + { + "epoch": 2.6006431864853345, + "grad_norm": 1.334937703947115, + "learning_rate": 1.3003719776813393e-07, + "loss": 1.0042, + "step": 33560 + }, + { + "epoch": 2.6014181099616414, + "grad_norm": 1.3018317738150398, + "learning_rate": 1.3007594544327343e-07, + "loss": 1.0356, + "step": 33570 + }, + { + "epoch": 2.602193033437948, + "grad_norm": 1.374619086439277, + "learning_rate": 1.301146931184129e-07, + "loss": 1.03, + "step": 33580 + }, + { + "epoch": 2.602967956914255, + "grad_norm": 1.3727416071874796, + "learning_rate": 1.301534407935524e-07, + "loss": 1.0108, + "step": 33590 + }, + { + "epoch": 2.6037428803905613, + "grad_norm": 1.325949615270174, + "learning_rate": 1.301921884686919e-07, + "loss": 1.0108, + "step": 33600 + }, + { + "epoch": 2.604517803866868, + "grad_norm": 1.3510457279596848, + "learning_rate": 1.3023093614383139e-07, + "loss": 1.0285, + "step": 33610 + }, + { + "epoch": 2.605292727343175, + "grad_norm": 1.3161866334769297, + "learning_rate": 1.3026968381897088e-07, + "loss": 1.0323, + "step": 33620 + }, + { + "epoch": 2.6060676508194813, + "grad_norm": 1.301827451620502, + "learning_rate": 1.3030843149411038e-07, + "loss": 1.0067, + "step": 33630 + }, + { + "epoch": 2.606842574295788, + "grad_norm": 1.5011296976449602, + "learning_rate": 1.3034717916924987e-07, + "loss": 1.0226, + "step": 33640 + }, + { + "epoch": 2.607617497772095, + "grad_norm": 1.4202325398553588, + "learning_rate": 1.3038592684438934e-07, + "loss": 1.0352, + "step": 33650 + }, + { + "epoch": 2.6083924212484018, + "grad_norm": 1.2892154035141419, + "learning_rate": 1.3042467451952884e-07, + "loss": 1.0042, + "step": 33660 + }, + { + "epoch": 2.6091673447247086, + "grad_norm": 1.289360530171919, + "learning_rate": 1.3046342219466833e-07, + "loss": 1.0626, + "step": 33670 + }, + { + "epoch": 2.6099422682010154, + "grad_norm": 1.2894617216880195, + "learning_rate": 1.3050216986980783e-07, + "loss": 1.0107, + "step": 33680 + }, + { + "epoch": 2.6107171916773217, + "grad_norm": 1.3641493860069733, + "learning_rate": 1.3054091754494732e-07, + "loss": 1.0095, + "step": 33690 + }, + { + "epoch": 2.6114921151536286, + "grad_norm": 1.2672495128515393, + "learning_rate": 1.3057966522008682e-07, + "loss": 1.0292, + "step": 33700 + }, + { + "epoch": 2.6122670386299354, + "grad_norm": 1.3385495135406, + "learning_rate": 1.3061841289522631e-07, + "loss": 1.0136, + "step": 33710 + }, + { + "epoch": 2.6130419621062417, + "grad_norm": 1.4938087965239613, + "learning_rate": 1.3065716057036578e-07, + "loss": 1.0392, + "step": 33720 + }, + { + "epoch": 2.6138168855825485, + "grad_norm": 1.3484883575377729, + "learning_rate": 1.3069590824550528e-07, + "loss": 1.0224, + "step": 33730 + }, + { + "epoch": 2.6145918090588554, + "grad_norm": 1.318122281381031, + "learning_rate": 1.3073465592064477e-07, + "loss": 1.0302, + "step": 33740 + }, + { + "epoch": 2.615366732535162, + "grad_norm": 1.4722486812291626, + "learning_rate": 1.3077340359578427e-07, + "loss": 1.03, + "step": 33750 + }, + { + "epoch": 2.616141656011469, + "grad_norm": 1.304446546206479, + "learning_rate": 1.3081215127092376e-07, + "loss": 1.0263, + "step": 33760 + }, + { + "epoch": 2.616916579487776, + "grad_norm": 1.229328186376154, + "learning_rate": 1.3085089894606326e-07, + "loss": 1.013, + "step": 33770 + }, + { + "epoch": 2.617691502964082, + "grad_norm": 1.296458634180164, + "learning_rate": 1.3088964662120276e-07, + "loss": 1.0088, + "step": 33780 + }, + { + "epoch": 2.618466426440389, + "grad_norm": 1.3785623108243334, + "learning_rate": 1.3092839429634223e-07, + "loss": 1.032, + "step": 33790 + }, + { + "epoch": 2.6192413499166958, + "grad_norm": 1.3542907961411215, + "learning_rate": 1.3096714197148172e-07, + "loss": 1.0309, + "step": 33800 + }, + { + "epoch": 2.6200162733930026, + "grad_norm": 1.429257174558624, + "learning_rate": 1.3100588964662122e-07, + "loss": 1.0428, + "step": 33810 + }, + { + "epoch": 2.620791196869309, + "grad_norm": 1.2981950117224377, + "learning_rate": 1.310446373217607e-07, + "loss": 1.0304, + "step": 33820 + }, + { + "epoch": 2.6215661203456158, + "grad_norm": 1.3776499886381788, + "learning_rate": 1.310833849969002e-07, + "loss": 1.0182, + "step": 33830 + }, + { + "epoch": 2.6223410438219226, + "grad_norm": 1.3576394479219918, + "learning_rate": 1.311221326720397e-07, + "loss": 1.0158, + "step": 33840 + }, + { + "epoch": 2.6231159672982294, + "grad_norm": 1.3900905938426402, + "learning_rate": 1.311608803471792e-07, + "loss": 1.0244, + "step": 33850 + }, + { + "epoch": 2.623890890774536, + "grad_norm": 1.374320814805697, + "learning_rate": 1.3119962802231867e-07, + "loss": 1.0091, + "step": 33860 + }, + { + "epoch": 2.624665814250843, + "grad_norm": 1.3745644561972938, + "learning_rate": 1.3123837569745816e-07, + "loss": 1.0112, + "step": 33870 + }, + { + "epoch": 2.6254407377271494, + "grad_norm": 1.3174550826485214, + "learning_rate": 1.3127712337259766e-07, + "loss": 1.0037, + "step": 33880 + }, + { + "epoch": 2.626215661203456, + "grad_norm": 1.3404729432636202, + "learning_rate": 1.3131587104773715e-07, + "loss": 1.0357, + "step": 33890 + }, + { + "epoch": 2.626990584679763, + "grad_norm": 1.3429196543276725, + "learning_rate": 1.3135461872287665e-07, + "loss": 1.0187, + "step": 33900 + }, + { + "epoch": 2.6277655081560694, + "grad_norm": 1.265837284983085, + "learning_rate": 1.3139336639801614e-07, + "loss": 1.0356, + "step": 33910 + }, + { + "epoch": 2.628540431632376, + "grad_norm": 1.3094715042693807, + "learning_rate": 1.3143211407315561e-07, + "loss": 1.0228, + "step": 33920 + }, + { + "epoch": 2.629315355108683, + "grad_norm": 1.266725405628592, + "learning_rate": 1.314708617482951e-07, + "loss": 1.0083, + "step": 33930 + }, + { + "epoch": 2.63009027858499, + "grad_norm": 1.277328087712583, + "learning_rate": 1.315096094234346e-07, + "loss": 1.0042, + "step": 33940 + }, + { + "epoch": 2.6308652020612966, + "grad_norm": 1.3250817727386535, + "learning_rate": 1.315483570985741e-07, + "loss": 1.0105, + "step": 33950 + }, + { + "epoch": 2.6316401255376034, + "grad_norm": 1.3068954556109809, + "learning_rate": 1.315871047737136e-07, + "loss": 1.01, + "step": 33960 + }, + { + "epoch": 2.6324150490139098, + "grad_norm": 1.3455962344179329, + "learning_rate": 1.316258524488531e-07, + "loss": 1.021, + "step": 33970 + }, + { + "epoch": 2.6331899724902166, + "grad_norm": 1.3295256113558702, + "learning_rate": 1.3166460012399259e-07, + "loss": 1.0151, + "step": 33980 + }, + { + "epoch": 2.6339648959665234, + "grad_norm": 1.326562430212986, + "learning_rate": 1.3170334779913206e-07, + "loss": 1.0144, + "step": 33990 + }, + { + "epoch": 2.6347398194428298, + "grad_norm": 1.3723842794815695, + "learning_rate": 1.3174209547427155e-07, + "loss": 1.0107, + "step": 34000 + }, + { + "epoch": 2.6347398194428298, + "eval_loss": 1.0201956033706665, + "eval_runtime": 323.332, + "eval_samples_per_second": 35.477, + "eval_steps_per_second": 8.87, + "step": 34000 + }, + { + "epoch": 2.6355147429191366, + "grad_norm": 1.3100209138846586, + "learning_rate": 1.3178084314941105e-07, + "loss": 1.0142, + "step": 34010 + }, + { + "epoch": 2.6362896663954434, + "grad_norm": 1.3027826331044914, + "learning_rate": 1.3181959082455054e-07, + "loss": 1.0231, + "step": 34020 + }, + { + "epoch": 2.63706458987175, + "grad_norm": 1.3614673017945944, + "learning_rate": 1.3185833849969004e-07, + "loss": 1.0293, + "step": 34030 + }, + { + "epoch": 2.637839513348057, + "grad_norm": 1.303762051468111, + "learning_rate": 1.3189708617482953e-07, + "loss": 1.0205, + "step": 34040 + }, + { + "epoch": 2.638614436824364, + "grad_norm": 1.3037444178028994, + "learning_rate": 1.3193583384996903e-07, + "loss": 1.0137, + "step": 34050 + }, + { + "epoch": 2.63938936030067, + "grad_norm": 1.3202168149752398, + "learning_rate": 1.319745815251085e-07, + "loss": 1.0117, + "step": 34060 + }, + { + "epoch": 2.640164283776977, + "grad_norm": 1.3577628681332594, + "learning_rate": 1.32013329200248e-07, + "loss": 1.0211, + "step": 34070 + }, + { + "epoch": 2.640939207253284, + "grad_norm": 1.3684225299512651, + "learning_rate": 1.320520768753875e-07, + "loss": 1.02, + "step": 34080 + }, + { + "epoch": 2.6417141307295906, + "grad_norm": 1.716582438806454, + "learning_rate": 1.3209082455052698e-07, + "loss": 1.0215, + "step": 34090 + }, + { + "epoch": 2.642489054205897, + "grad_norm": 1.4006890285348426, + "learning_rate": 1.3212957222566648e-07, + "loss": 1.0136, + "step": 34100 + }, + { + "epoch": 2.643263977682204, + "grad_norm": 1.3027439781702734, + "learning_rate": 1.3216831990080598e-07, + "loss": 1.0123, + "step": 34110 + }, + { + "epoch": 2.6440389011585106, + "grad_norm": 1.3318734126352703, + "learning_rate": 1.3220706757594547e-07, + "loss": 1.0089, + "step": 34120 + }, + { + "epoch": 2.6448138246348174, + "grad_norm": 1.3795310374954017, + "learning_rate": 1.3224581525108494e-07, + "loss": 1.0085, + "step": 34130 + }, + { + "epoch": 2.645588748111124, + "grad_norm": 1.2683589166214473, + "learning_rate": 1.3228456292622444e-07, + "loss": 1.0246, + "step": 34140 + }, + { + "epoch": 2.6463636715874306, + "grad_norm": 1.3705079315967255, + "learning_rate": 1.3232331060136393e-07, + "loss": 1.0037, + "step": 34150 + }, + { + "epoch": 2.6471385950637374, + "grad_norm": 1.3348482077602817, + "learning_rate": 1.3236205827650343e-07, + "loss": 1.0014, + "step": 34160 + }, + { + "epoch": 2.647913518540044, + "grad_norm": 1.235127898999522, + "learning_rate": 1.3240080595164292e-07, + "loss": 0.9994, + "step": 34170 + }, + { + "epoch": 2.648688442016351, + "grad_norm": 1.3133057447117105, + "learning_rate": 1.3243955362678242e-07, + "loss": 1.008, + "step": 34180 + }, + { + "epoch": 2.6494633654926574, + "grad_norm": 1.3687514771631493, + "learning_rate": 1.324783013019219e-07, + "loss": 1.0412, + "step": 34190 + }, + { + "epoch": 2.650238288968964, + "grad_norm": 1.3579713521714332, + "learning_rate": 1.3251704897706138e-07, + "loss": 1.0133, + "step": 34200 + }, + { + "epoch": 2.651013212445271, + "grad_norm": 1.2693540738348799, + "learning_rate": 1.3255579665220088e-07, + "loss": 1.0425, + "step": 34210 + }, + { + "epoch": 2.651788135921578, + "grad_norm": 1.3308812042523592, + "learning_rate": 1.3259454432734037e-07, + "loss": 1.0249, + "step": 34220 + }, + { + "epoch": 2.6525630593978846, + "grad_norm": 1.3682173361241694, + "learning_rate": 1.3263329200247987e-07, + "loss": 1.0158, + "step": 34230 + }, + { + "epoch": 2.6533379828741914, + "grad_norm": 1.3374517546424962, + "learning_rate": 1.3267203967761936e-07, + "loss": 1.006, + "step": 34240 + }, + { + "epoch": 2.654112906350498, + "grad_norm": 1.2686606801573823, + "learning_rate": 1.3271078735275886e-07, + "loss": 1.0094, + "step": 34250 + }, + { + "epoch": 2.6548878298268046, + "grad_norm": 1.3187387271909954, + "learning_rate": 1.3274953502789836e-07, + "loss": 1.026, + "step": 34260 + }, + { + "epoch": 2.6556627533031114, + "grad_norm": 1.38735619002685, + "learning_rate": 1.3278828270303782e-07, + "loss": 1.0402, + "step": 34270 + }, + { + "epoch": 2.656437676779418, + "grad_norm": 1.2677345353086957, + "learning_rate": 1.3282703037817732e-07, + "loss": 1.0156, + "step": 34280 + }, + { + "epoch": 2.6572126002557246, + "grad_norm": 1.3697104772490343, + "learning_rate": 1.3286577805331682e-07, + "loss": 1.0231, + "step": 34290 + }, + { + "epoch": 2.6579875237320314, + "grad_norm": 1.2476233169280853, + "learning_rate": 1.329045257284563e-07, + "loss": 1.0326, + "step": 34300 + }, + { + "epoch": 2.658762447208338, + "grad_norm": 1.2666799708368213, + "learning_rate": 1.329432734035958e-07, + "loss": 1.0159, + "step": 34310 + }, + { + "epoch": 2.659537370684645, + "grad_norm": 1.3306858392801049, + "learning_rate": 1.329820210787353e-07, + "loss": 1.032, + "step": 34320 + }, + { + "epoch": 2.660312294160952, + "grad_norm": 1.365167025618513, + "learning_rate": 1.3302076875387477e-07, + "loss": 1.0309, + "step": 34330 + }, + { + "epoch": 2.661087217637258, + "grad_norm": 1.397059368743592, + "learning_rate": 1.3305951642901427e-07, + "loss": 1.0243, + "step": 34340 + }, + { + "epoch": 2.661862141113565, + "grad_norm": 1.383217898941549, + "learning_rate": 1.3309826410415376e-07, + "loss": 1.0317, + "step": 34350 + }, + { + "epoch": 2.662637064589872, + "grad_norm": 1.2803016278421202, + "learning_rate": 1.3313701177929326e-07, + "loss": 1.0362, + "step": 34360 + }, + { + "epoch": 2.6634119880661786, + "grad_norm": 1.264613497017584, + "learning_rate": 1.3317575945443275e-07, + "loss": 1.0247, + "step": 34370 + }, + { + "epoch": 2.664186911542485, + "grad_norm": 1.4243003240941559, + "learning_rate": 1.3321450712957225e-07, + "loss": 1.0279, + "step": 34380 + }, + { + "epoch": 2.664961835018792, + "grad_norm": 1.374472276234064, + "learning_rate": 1.3325325480471174e-07, + "loss": 1.0239, + "step": 34390 + }, + { + "epoch": 2.6657367584950986, + "grad_norm": 1.2991958985984773, + "learning_rate": 1.332920024798512e-07, + "loss": 1.0362, + "step": 34400 + }, + { + "epoch": 2.6665116819714054, + "grad_norm": 1.3443935622735694, + "learning_rate": 1.333307501549907e-07, + "loss": 1.0082, + "step": 34410 + }, + { + "epoch": 2.6672866054477122, + "grad_norm": 1.288260815712817, + "learning_rate": 1.333694978301302e-07, + "loss": 1.0243, + "step": 34420 + }, + { + "epoch": 2.6680615289240186, + "grad_norm": 1.4052748375905175, + "learning_rate": 1.334082455052697e-07, + "loss": 1.0195, + "step": 34430 + }, + { + "epoch": 2.6688364524003254, + "grad_norm": 1.3149461902231556, + "learning_rate": 1.334469931804092e-07, + "loss": 1.0034, + "step": 34440 + }, + { + "epoch": 2.669611375876632, + "grad_norm": 1.3468003295280362, + "learning_rate": 1.334857408555487e-07, + "loss": 1.0465, + "step": 34450 + }, + { + "epoch": 2.670386299352939, + "grad_norm": 1.3286464728859462, + "learning_rate": 1.3352448853068819e-07, + "loss": 1.022, + "step": 34460 + }, + { + "epoch": 2.6711612228292454, + "grad_norm": 1.4297333992008063, + "learning_rate": 1.3356323620582765e-07, + "loss": 1.0084, + "step": 34470 + }, + { + "epoch": 2.671936146305552, + "grad_norm": 1.4216947117795578, + "learning_rate": 1.3360198388096715e-07, + "loss": 1.046, + "step": 34480 + }, + { + "epoch": 2.672711069781859, + "grad_norm": 1.3150473491552457, + "learning_rate": 1.3364073155610665e-07, + "loss": 1.0341, + "step": 34490 + }, + { + "epoch": 2.673485993258166, + "grad_norm": 1.3028007033795146, + "learning_rate": 1.3367947923124614e-07, + "loss": 1.0248, + "step": 34500 + }, + { + "epoch": 2.673485993258166, + "eval_loss": 1.0184437036514282, + "eval_runtime": 320.4836, + "eval_samples_per_second": 35.793, + "eval_steps_per_second": 8.949, + "step": 34500 + }, + { + "epoch": 2.6742609167344726, + "grad_norm": 1.346558995201955, + "learning_rate": 1.3371822690638564e-07, + "loss": 1.0284, + "step": 34510 + }, + { + "epoch": 2.6750358402107794, + "grad_norm": 1.369012844675878, + "learning_rate": 1.3375697458152513e-07, + "loss": 1.0141, + "step": 34520 + }, + { + "epoch": 2.675810763687086, + "grad_norm": 1.3614179040287893, + "learning_rate": 1.3379572225666463e-07, + "loss": 1.0193, + "step": 34530 + }, + { + "epoch": 2.6765856871633926, + "grad_norm": 1.4221209537117905, + "learning_rate": 1.338344699318041e-07, + "loss": 1.0281, + "step": 34540 + }, + { + "epoch": 2.6773606106396994, + "grad_norm": 1.4073454713666558, + "learning_rate": 1.338732176069436e-07, + "loss": 1.017, + "step": 34550 + }, + { + "epoch": 2.678135534116006, + "grad_norm": 1.485002103882593, + "learning_rate": 1.339119652820831e-07, + "loss": 1.0212, + "step": 34560 + }, + { + "epoch": 2.6789104575923126, + "grad_norm": 1.314937283602613, + "learning_rate": 1.3395071295722258e-07, + "loss": 1.0131, + "step": 34570 + }, + { + "epoch": 2.6796853810686194, + "grad_norm": 1.3723157525967142, + "learning_rate": 1.3398946063236208e-07, + "loss": 1.0065, + "step": 34580 + }, + { + "epoch": 2.6804603045449262, + "grad_norm": 1.4104861874324588, + "learning_rate": 1.3402820830750157e-07, + "loss": 1.0105, + "step": 34590 + }, + { + "epoch": 2.681235228021233, + "grad_norm": 1.3238338994182097, + "learning_rate": 1.3406695598264107e-07, + "loss": 1.0102, + "step": 34600 + }, + { + "epoch": 2.68201015149754, + "grad_norm": 1.3237234739553219, + "learning_rate": 1.3410570365778054e-07, + "loss": 1.0183, + "step": 34610 + }, + { + "epoch": 2.682785074973846, + "grad_norm": 1.3038390456642233, + "learning_rate": 1.3414445133292003e-07, + "loss": 1.0206, + "step": 34620 + }, + { + "epoch": 2.683559998450153, + "grad_norm": 1.2854421198750572, + "learning_rate": 1.3418319900805953e-07, + "loss": 1.0123, + "step": 34630 + }, + { + "epoch": 2.68433492192646, + "grad_norm": 1.3999710223891253, + "learning_rate": 1.3422194668319903e-07, + "loss": 1.0023, + "step": 34640 + }, + { + "epoch": 2.685109845402766, + "grad_norm": 1.3029174846157563, + "learning_rate": 1.3426069435833852e-07, + "loss": 1.0245, + "step": 34650 + }, + { + "epoch": 2.685884768879073, + "grad_norm": 1.3246882102845274, + "learning_rate": 1.3429944203347802e-07, + "loss": 1.0127, + "step": 34660 + }, + { + "epoch": 2.68665969235538, + "grad_norm": 1.3096301716652246, + "learning_rate": 1.3433818970861749e-07, + "loss": 1.0121, + "step": 34670 + }, + { + "epoch": 2.6874346158316866, + "grad_norm": 1.267010180209304, + "learning_rate": 1.3437693738375698e-07, + "loss": 1.0143, + "step": 34680 + }, + { + "epoch": 2.6882095393079934, + "grad_norm": 1.3044441741825286, + "learning_rate": 1.3441568505889648e-07, + "loss": 1.0177, + "step": 34690 + }, + { + "epoch": 2.6889844627843003, + "grad_norm": 1.315704063017301, + "learning_rate": 1.3445443273403597e-07, + "loss": 1.0234, + "step": 34700 + }, + { + "epoch": 2.6897593862606066, + "grad_norm": 1.2364845146238943, + "learning_rate": 1.3449318040917547e-07, + "loss": 1.0222, + "step": 34710 + }, + { + "epoch": 2.6905343097369134, + "grad_norm": 1.3287912412933471, + "learning_rate": 1.3453192808431496e-07, + "loss": 1.0001, + "step": 34720 + }, + { + "epoch": 2.6913092332132202, + "grad_norm": 1.3003753775907283, + "learning_rate": 1.3457067575945446e-07, + "loss": 1.0211, + "step": 34730 + }, + { + "epoch": 2.692084156689527, + "grad_norm": 1.3484475567036962, + "learning_rate": 1.3460942343459393e-07, + "loss": 1.0311, + "step": 34740 + }, + { + "epoch": 2.6928590801658334, + "grad_norm": 1.3139788558332754, + "learning_rate": 1.3464817110973342e-07, + "loss": 1.0028, + "step": 34750 + }, + { + "epoch": 2.6936340036421402, + "grad_norm": 1.361592275924966, + "learning_rate": 1.3468691878487292e-07, + "loss": 1.0268, + "step": 34760 + }, + { + "epoch": 2.694408927118447, + "grad_norm": 1.3397920138291777, + "learning_rate": 1.3472566646001241e-07, + "loss": 1.0051, + "step": 34770 + }, + { + "epoch": 2.695183850594754, + "grad_norm": 1.3141469764897695, + "learning_rate": 1.347644141351519e-07, + "loss": 0.997, + "step": 34780 + }, + { + "epoch": 2.6959587740710607, + "grad_norm": 1.2220222982942224, + "learning_rate": 1.348031618102914e-07, + "loss": 0.9972, + "step": 34790 + }, + { + "epoch": 2.6967336975473675, + "grad_norm": 1.3554712266371294, + "learning_rate": 1.348419094854309e-07, + "loss": 0.9921, + "step": 34800 + }, + { + "epoch": 2.697508621023674, + "grad_norm": 1.2660541087716637, + "learning_rate": 1.3488065716057037e-07, + "loss": 1.0235, + "step": 34810 + }, + { + "epoch": 2.6982835444999806, + "grad_norm": 1.3819882768517382, + "learning_rate": 1.3491940483570987e-07, + "loss": 1.0187, + "step": 34820 + }, + { + "epoch": 2.6990584679762875, + "grad_norm": 1.2429053645720094, + "learning_rate": 1.3495815251084936e-07, + "loss": 1.0044, + "step": 34830 + }, + { + "epoch": 2.699833391452594, + "grad_norm": 1.3462282631169848, + "learning_rate": 1.3499690018598886e-07, + "loss": 1.0341, + "step": 34840 + }, + { + "epoch": 2.7006083149289006, + "grad_norm": 1.3225870548070984, + "learning_rate": 1.3503564786112835e-07, + "loss": 1.0271, + "step": 34850 + }, + { + "epoch": 2.7013832384052074, + "grad_norm": 1.3722118012095617, + "learning_rate": 1.3507439553626785e-07, + "loss": 1.0271, + "step": 34860 + }, + { + "epoch": 2.7021581618815143, + "grad_norm": 1.3021188176154255, + "learning_rate": 1.3511314321140734e-07, + "loss": 1.0223, + "step": 34870 + }, + { + "epoch": 2.702933085357821, + "grad_norm": 1.3304916993268827, + "learning_rate": 1.351518908865468e-07, + "loss": 1.0223, + "step": 34880 + }, + { + "epoch": 2.703708008834128, + "grad_norm": 1.279906222593239, + "learning_rate": 1.351906385616863e-07, + "loss": 1.0031, + "step": 34890 + }, + { + "epoch": 2.7044829323104342, + "grad_norm": 1.2966193156202976, + "learning_rate": 1.352293862368258e-07, + "loss": 1.0179, + "step": 34900 + }, + { + "epoch": 2.705257855786741, + "grad_norm": 1.3772874701468552, + "learning_rate": 1.352681339119653e-07, + "loss": 1.0599, + "step": 34910 + }, + { + "epoch": 2.706032779263048, + "grad_norm": 1.3462671288310017, + "learning_rate": 1.353068815871048e-07, + "loss": 1.0131, + "step": 34920 + }, + { + "epoch": 2.7068077027393542, + "grad_norm": 1.2908259460258773, + "learning_rate": 1.353456292622443e-07, + "loss": 1.0066, + "step": 34930 + }, + { + "epoch": 2.707582626215661, + "grad_norm": 1.3527410617319566, + "learning_rate": 1.3538437693738379e-07, + "loss": 1.0224, + "step": 34940 + }, + { + "epoch": 2.708357549691968, + "grad_norm": 1.2606443771111744, + "learning_rate": 1.3542312461252325e-07, + "loss": 1.0367, + "step": 34950 + }, + { + "epoch": 2.7091324731682747, + "grad_norm": 1.3243915247976246, + "learning_rate": 1.3546187228766275e-07, + "loss": 1.0444, + "step": 34960 + }, + { + "epoch": 2.7099073966445815, + "grad_norm": 1.33605838038335, + "learning_rate": 1.3550061996280225e-07, + "loss": 1.0292, + "step": 34970 + }, + { + "epoch": 2.7106823201208883, + "grad_norm": 1.2413827056074491, + "learning_rate": 1.3553936763794174e-07, + "loss": 1.0041, + "step": 34980 + }, + { + "epoch": 2.7114572435971946, + "grad_norm": 1.2698745706891084, + "learning_rate": 1.3557811531308124e-07, + "loss": 0.9923, + "step": 34990 + }, + { + "epoch": 2.7122321670735015, + "grad_norm": 1.3088057105550197, + "learning_rate": 1.3561686298822073e-07, + "loss": 0.9938, + "step": 35000 + }, + { + "epoch": 2.7122321670735015, + "eval_loss": 1.0166804790496826, + "eval_runtime": 321.3465, + "eval_samples_per_second": 35.697, + "eval_steps_per_second": 8.925, + "step": 35000 + }, + { + "epoch": 2.7130070905498083, + "grad_norm": 1.2875813196845725, + "learning_rate": 1.3565561066336023e-07, + "loss": 1.0164, + "step": 35010 + }, + { + "epoch": 2.713782014026115, + "grad_norm": 1.245769935860255, + "learning_rate": 1.356943583384997e-07, + "loss": 1.005, + "step": 35020 + }, + { + "epoch": 2.7145569375024214, + "grad_norm": 1.3597863689922607, + "learning_rate": 1.357331060136392e-07, + "loss": 0.9868, + "step": 35030 + }, + { + "epoch": 2.7153318609787283, + "grad_norm": 1.3345399495678745, + "learning_rate": 1.357718536887787e-07, + "loss": 1.0222, + "step": 35040 + }, + { + "epoch": 2.716106784455035, + "grad_norm": 1.3471766763360737, + "learning_rate": 1.3581060136391818e-07, + "loss": 1.0184, + "step": 35050 + }, + { + "epoch": 2.716881707931342, + "grad_norm": 1.3407299829448565, + "learning_rate": 1.3584934903905768e-07, + "loss": 1.0204, + "step": 35060 + }, + { + "epoch": 2.7176566314076487, + "grad_norm": 1.343096232194896, + "learning_rate": 1.3588809671419717e-07, + "loss": 1.0097, + "step": 35070 + }, + { + "epoch": 2.718431554883955, + "grad_norm": 1.2914753363133307, + "learning_rate": 1.3592684438933664e-07, + "loss": 1.0095, + "step": 35080 + }, + { + "epoch": 2.719206478360262, + "grad_norm": 1.2814989390299374, + "learning_rate": 1.3596559206447614e-07, + "loss": 1.055, + "step": 35090 + }, + { + "epoch": 2.7199814018365687, + "grad_norm": 1.3175559274361994, + "learning_rate": 1.3600433973961563e-07, + "loss": 1.0216, + "step": 35100 + }, + { + "epoch": 2.7207563253128755, + "grad_norm": 1.3141473438263538, + "learning_rate": 1.3604308741475513e-07, + "loss": 0.9928, + "step": 35110 + }, + { + "epoch": 2.721531248789182, + "grad_norm": 1.273150459804345, + "learning_rate": 1.3608183508989462e-07, + "loss": 1.0356, + "step": 35120 + }, + { + "epoch": 2.7223061722654887, + "grad_norm": 1.2141756324017305, + "learning_rate": 1.3612058276503412e-07, + "loss": 1.0291, + "step": 35130 + }, + { + "epoch": 2.7230810957417955, + "grad_norm": 1.2600081092442084, + "learning_rate": 1.3615933044017362e-07, + "loss": 1.0148, + "step": 35140 + }, + { + "epoch": 2.7238560192181023, + "grad_norm": 1.3306827718704701, + "learning_rate": 1.3619807811531308e-07, + "loss": 1.0203, + "step": 35150 + }, + { + "epoch": 2.724630942694409, + "grad_norm": 1.3588128682651823, + "learning_rate": 1.3623682579045258e-07, + "loss": 1.0124, + "step": 35160 + }, + { + "epoch": 2.725405866170716, + "grad_norm": 1.406881597654078, + "learning_rate": 1.3627557346559208e-07, + "loss": 1.007, + "step": 35170 + }, + { + "epoch": 2.7261807896470223, + "grad_norm": 1.3036108023257535, + "learning_rate": 1.3631432114073157e-07, + "loss": 0.9999, + "step": 35180 + }, + { + "epoch": 2.726955713123329, + "grad_norm": 1.3325619635639074, + "learning_rate": 1.3635306881587107e-07, + "loss": 1.0139, + "step": 35190 + }, + { + "epoch": 2.727730636599636, + "grad_norm": 1.3051406690504241, + "learning_rate": 1.3639181649101056e-07, + "loss": 1.0494, + "step": 35200 + }, + { + "epoch": 2.7285055600759422, + "grad_norm": 1.3819354551475687, + "learning_rate": 1.3643056416615006e-07, + "loss": 1.0246, + "step": 35210 + }, + { + "epoch": 2.729280483552249, + "grad_norm": 1.2709511100955875, + "learning_rate": 1.3646931184128953e-07, + "loss": 1.0065, + "step": 35220 + }, + { + "epoch": 2.730055407028556, + "grad_norm": 1.2710113130841334, + "learning_rate": 1.3650805951642902e-07, + "loss": 0.9986, + "step": 35230 + }, + { + "epoch": 2.7308303305048627, + "grad_norm": 1.259650226441796, + "learning_rate": 1.3654680719156852e-07, + "loss": 1.0151, + "step": 35240 + }, + { + "epoch": 2.7316052539811695, + "grad_norm": 1.3577339942665083, + "learning_rate": 1.3658555486670801e-07, + "loss": 1.021, + "step": 35250 + }, + { + "epoch": 2.7323801774574763, + "grad_norm": 1.3671233027997423, + "learning_rate": 1.366243025418475e-07, + "loss": 1.0175, + "step": 35260 + }, + { + "epoch": 2.7331551009337827, + "grad_norm": 1.3385580242196542, + "learning_rate": 1.36663050216987e-07, + "loss": 1.0146, + "step": 35270 + }, + { + "epoch": 2.7339300244100895, + "grad_norm": 1.378283760158229, + "learning_rate": 1.367017978921265e-07, + "loss": 1.0182, + "step": 35280 + }, + { + "epoch": 2.7347049478863963, + "grad_norm": 1.3132664988582954, + "learning_rate": 1.3674054556726597e-07, + "loss": 1.0296, + "step": 35290 + }, + { + "epoch": 2.735479871362703, + "grad_norm": 1.375288742494518, + "learning_rate": 1.3677929324240546e-07, + "loss": 1.0019, + "step": 35300 + }, + { + "epoch": 2.7362547948390095, + "grad_norm": 1.2392202042126985, + "learning_rate": 1.3681804091754496e-07, + "loss": 0.995, + "step": 35310 + }, + { + "epoch": 2.7370297183153163, + "grad_norm": 1.3409128623093716, + "learning_rate": 1.3685678859268446e-07, + "loss": 1.0241, + "step": 35320 + }, + { + "epoch": 2.737804641791623, + "grad_norm": 1.3512241753962473, + "learning_rate": 1.3689553626782395e-07, + "loss": 1.0091, + "step": 35330 + }, + { + "epoch": 2.73857956526793, + "grad_norm": 1.371577213932334, + "learning_rate": 1.3693428394296345e-07, + "loss": 1.0314, + "step": 35340 + }, + { + "epoch": 2.7393544887442367, + "grad_norm": 1.3469384639306587, + "learning_rate": 1.3697303161810294e-07, + "loss": 1.0292, + "step": 35350 + }, + { + "epoch": 2.740129412220543, + "grad_norm": 1.3510277359552418, + "learning_rate": 1.370117792932424e-07, + "loss": 1.0284, + "step": 35360 + }, + { + "epoch": 2.74090433569685, + "grad_norm": 1.2456225436990593, + "learning_rate": 1.370505269683819e-07, + "loss": 1.0136, + "step": 35370 + }, + { + "epoch": 2.7416792591731567, + "grad_norm": 1.3431069404143199, + "learning_rate": 1.370892746435214e-07, + "loss": 1.0118, + "step": 35380 + }, + { + "epoch": 2.7424541826494635, + "grad_norm": 1.3295718522366473, + "learning_rate": 1.371280223186609e-07, + "loss": 1.0048, + "step": 35390 + }, + { + "epoch": 2.74322910612577, + "grad_norm": 1.4499260808725254, + "learning_rate": 1.371667699938004e-07, + "loss": 1.0096, + "step": 35400 + }, + { + "epoch": 2.7440040296020767, + "grad_norm": 1.309314346044385, + "learning_rate": 1.372055176689399e-07, + "loss": 0.9918, + "step": 35410 + }, + { + "epoch": 2.7447789530783835, + "grad_norm": 1.3146619677826703, + "learning_rate": 1.3724426534407936e-07, + "loss": 0.9923, + "step": 35420 + }, + { + "epoch": 2.7455538765546903, + "grad_norm": 1.3898085226457886, + "learning_rate": 1.3728301301921885e-07, + "loss": 1.0194, + "step": 35430 + }, + { + "epoch": 2.746328800030997, + "grad_norm": 1.327219917235149, + "learning_rate": 1.3732176069435835e-07, + "loss": 1.0136, + "step": 35440 + }, + { + "epoch": 2.747103723507304, + "grad_norm": 1.3072298894224086, + "learning_rate": 1.3736050836949784e-07, + "loss": 1.0117, + "step": 35450 + }, + { + "epoch": 2.7478786469836103, + "grad_norm": 1.4075938194818929, + "learning_rate": 1.3739925604463734e-07, + "loss": 1.0302, + "step": 35460 + }, + { + "epoch": 2.748653570459917, + "grad_norm": 1.7561113560823376, + "learning_rate": 1.3743800371977684e-07, + "loss": 1.006, + "step": 35470 + }, + { + "epoch": 2.749428493936224, + "grad_norm": 1.4256943440592493, + "learning_rate": 1.3747675139491633e-07, + "loss": 0.9953, + "step": 35480 + }, + { + "epoch": 2.7502034174125303, + "grad_norm": 1.27360790275181, + "learning_rate": 1.375154990700558e-07, + "loss": 1.0178, + "step": 35490 + }, + { + "epoch": 2.750978340888837, + "grad_norm": 1.3243019148215749, + "learning_rate": 1.375542467451953e-07, + "loss": 1.0178, + "step": 35500 + }, + { + "epoch": 2.750978340888837, + "eval_loss": 1.015048623085022, + "eval_runtime": 323.1238, + "eval_samples_per_second": 35.5, + "eval_steps_per_second": 8.876, + "step": 35500 + }, + { + "epoch": 2.751753264365144, + "grad_norm": 1.2855316420911815, + "learning_rate": 1.375929944203348e-07, + "loss": 1.008, + "step": 35510 + }, + { + "epoch": 2.7525281878414507, + "grad_norm": 1.3740843954874964, + "learning_rate": 1.3763174209547429e-07, + "loss": 1.0182, + "step": 35520 + }, + { + "epoch": 2.7533031113177575, + "grad_norm": 1.2289354973895263, + "learning_rate": 1.3767048977061378e-07, + "loss": 1.0032, + "step": 35530 + }, + { + "epoch": 2.7540780347940643, + "grad_norm": 1.3313207057375556, + "learning_rate": 1.3770923744575328e-07, + "loss": 1.03, + "step": 35540 + }, + { + "epoch": 2.7548529582703707, + "grad_norm": 1.3460915625850396, + "learning_rate": 1.3774798512089277e-07, + "loss": 1.0153, + "step": 35550 + }, + { + "epoch": 2.7556278817466775, + "grad_norm": 1.3262357209466007, + "learning_rate": 1.3778673279603224e-07, + "loss": 1.0177, + "step": 35560 + }, + { + "epoch": 2.7564028052229843, + "grad_norm": 1.3199140059931513, + "learning_rate": 1.3782548047117174e-07, + "loss": 1.0124, + "step": 35570 + }, + { + "epoch": 2.7571777286992907, + "grad_norm": 1.3409751087768231, + "learning_rate": 1.3786422814631123e-07, + "loss": 0.9901, + "step": 35580 + }, + { + "epoch": 2.7579526521755975, + "grad_norm": 1.2121835511854828, + "learning_rate": 1.3790297582145073e-07, + "loss": 0.9955, + "step": 35590 + }, + { + "epoch": 2.7587275756519043, + "grad_norm": 1.2789799336290064, + "learning_rate": 1.3794172349659022e-07, + "loss": 1.0516, + "step": 35600 + }, + { + "epoch": 2.759502499128211, + "grad_norm": 1.3239335098135903, + "learning_rate": 1.3798047117172972e-07, + "loss": 1.0028, + "step": 35610 + }, + { + "epoch": 2.760277422604518, + "grad_norm": 1.2955297672164856, + "learning_rate": 1.3801921884686922e-07, + "loss": 1.0333, + "step": 35620 + }, + { + "epoch": 2.7610523460808247, + "grad_norm": 1.394584909735403, + "learning_rate": 1.3805796652200868e-07, + "loss": 1.0119, + "step": 35630 + }, + { + "epoch": 2.761827269557131, + "grad_norm": 1.3337174184282465, + "learning_rate": 1.3809671419714818e-07, + "loss": 1.029, + "step": 35640 + }, + { + "epoch": 2.762602193033438, + "grad_norm": 1.248241738992346, + "learning_rate": 1.3813546187228768e-07, + "loss": 1.0069, + "step": 35650 + }, + { + "epoch": 2.7633771165097447, + "grad_norm": 1.2534544516277153, + "learning_rate": 1.3817420954742717e-07, + "loss": 0.9883, + "step": 35660 + }, + { + "epoch": 2.7641520399860515, + "grad_norm": 1.3255888790658519, + "learning_rate": 1.3821295722256667e-07, + "loss": 1.0075, + "step": 35670 + }, + { + "epoch": 2.764926963462358, + "grad_norm": 1.2973031363816936, + "learning_rate": 1.3825170489770616e-07, + "loss": 1.0073, + "step": 35680 + }, + { + "epoch": 2.7657018869386647, + "grad_norm": 1.298073824227892, + "learning_rate": 1.3829045257284566e-07, + "loss": 1.0041, + "step": 35690 + }, + { + "epoch": 2.7664768104149715, + "grad_norm": 1.2304416168791004, + "learning_rate": 1.3832920024798513e-07, + "loss": 1.0242, + "step": 35700 + }, + { + "epoch": 2.7672517338912783, + "grad_norm": 1.2674928328777129, + "learning_rate": 1.3836794792312462e-07, + "loss": 1.0208, + "step": 35710 + }, + { + "epoch": 2.768026657367585, + "grad_norm": 1.254898923443534, + "learning_rate": 1.3840669559826412e-07, + "loss": 1.0339, + "step": 35720 + }, + { + "epoch": 2.768801580843892, + "grad_norm": 1.328483774462737, + "learning_rate": 1.384454432734036e-07, + "loss": 1.0263, + "step": 35730 + }, + { + "epoch": 2.7695765043201983, + "grad_norm": 1.4918390953330602, + "learning_rate": 1.384841909485431e-07, + "loss": 1.018, + "step": 35740 + }, + { + "epoch": 2.770351427796505, + "grad_norm": 1.2558911054703141, + "learning_rate": 1.385229386236826e-07, + "loss": 1.01, + "step": 35750 + }, + { + "epoch": 2.771126351272812, + "grad_norm": 1.2985256212866474, + "learning_rate": 1.3856168629882207e-07, + "loss": 1.009, + "step": 35760 + }, + { + "epoch": 2.7719012747491183, + "grad_norm": 1.2816008683383664, + "learning_rate": 1.3860043397396157e-07, + "loss": 1.0108, + "step": 35770 + }, + { + "epoch": 2.772676198225425, + "grad_norm": 1.332259710169729, + "learning_rate": 1.3863918164910106e-07, + "loss": 1.0217, + "step": 35780 + }, + { + "epoch": 2.773451121701732, + "grad_norm": 1.3003775800531494, + "learning_rate": 1.3867792932424056e-07, + "loss": 1.0209, + "step": 35790 + }, + { + "epoch": 2.7742260451780387, + "grad_norm": 1.4332741220288707, + "learning_rate": 1.3871667699938005e-07, + "loss": 1.0013, + "step": 35800 + }, + { + "epoch": 2.7750009686543455, + "grad_norm": 1.3072833316703472, + "learning_rate": 1.3875542467451955e-07, + "loss": 1.0234, + "step": 35810 + }, + { + "epoch": 2.7757758921306523, + "grad_norm": 1.3641039776466286, + "learning_rate": 1.3879417234965905e-07, + "loss": 1.0259, + "step": 35820 + }, + { + "epoch": 2.7765508156069587, + "grad_norm": 1.3290749068633416, + "learning_rate": 1.3883292002479851e-07, + "loss": 0.9992, + "step": 35830 + }, + { + "epoch": 2.7773257390832655, + "grad_norm": 1.3400696670830745, + "learning_rate": 1.38871667699938e-07, + "loss": 1.0081, + "step": 35840 + }, + { + "epoch": 2.7781006625595723, + "grad_norm": 1.3028706290841998, + "learning_rate": 1.389104153750775e-07, + "loss": 1.0311, + "step": 35850 + }, + { + "epoch": 2.7788755860358787, + "grad_norm": 1.2483379951246296, + "learning_rate": 1.38949163050217e-07, + "loss": 1.0041, + "step": 35860 + }, + { + "epoch": 2.7796505095121855, + "grad_norm": 1.3585149661854228, + "learning_rate": 1.389879107253565e-07, + "loss": 1.0309, + "step": 35870 + }, + { + "epoch": 2.7804254329884923, + "grad_norm": 1.3742495387079716, + "learning_rate": 1.39026658400496e-07, + "loss": 1.0024, + "step": 35880 + }, + { + "epoch": 2.781200356464799, + "grad_norm": 1.358571439881564, + "learning_rate": 1.390654060756355e-07, + "loss": 0.9952, + "step": 35890 + }, + { + "epoch": 2.781975279941106, + "grad_norm": 1.3362528863521836, + "learning_rate": 1.3910415375077496e-07, + "loss": 1.0083, + "step": 35900 + }, + { + "epoch": 2.7827502034174127, + "grad_norm": 1.2708459908801282, + "learning_rate": 1.3914290142591445e-07, + "loss": 1.029, + "step": 35910 + }, + { + "epoch": 2.783525126893719, + "grad_norm": 1.3495802044420893, + "learning_rate": 1.3918164910105395e-07, + "loss": 1.0343, + "step": 35920 + }, + { + "epoch": 2.784300050370026, + "grad_norm": 1.3009351478182565, + "learning_rate": 1.3922039677619344e-07, + "loss": 1.0294, + "step": 35930 + }, + { + "epoch": 2.7850749738463327, + "grad_norm": 1.2850066987850788, + "learning_rate": 1.3925914445133294e-07, + "loss": 1.0331, + "step": 35940 + }, + { + "epoch": 2.7858498973226395, + "grad_norm": 1.3310890391995367, + "learning_rate": 1.3929789212647243e-07, + "loss": 1.0043, + "step": 35950 + }, + { + "epoch": 2.786624820798946, + "grad_norm": 1.4386498093536404, + "learning_rate": 1.3933663980161193e-07, + "loss": 1.0209, + "step": 35960 + }, + { + "epoch": 2.7873997442752527, + "grad_norm": 1.3415984764758186, + "learning_rate": 1.393753874767514e-07, + "loss": 1.0233, + "step": 35970 + }, + { + "epoch": 2.7881746677515595, + "grad_norm": 1.3423435795240966, + "learning_rate": 1.394141351518909e-07, + "loss": 1.0092, + "step": 35980 + }, + { + "epoch": 2.7889495912278663, + "grad_norm": 1.2883584130078338, + "learning_rate": 1.394528828270304e-07, + "loss": 1.0097, + "step": 35990 + }, + { + "epoch": 2.789724514704173, + "grad_norm": 1.309049095559936, + "learning_rate": 1.3949163050216989e-07, + "loss": 1.0185, + "step": 36000 + }, + { + "epoch": 2.789724514704173, + "eval_loss": 1.0134021043777466, + "eval_runtime": 321.8161, + "eval_samples_per_second": 35.645, + "eval_steps_per_second": 8.912, + "step": 36000 + }, + { + "epoch": 2.7904994381804795, + "grad_norm": 1.440912602736193, + "learning_rate": 1.3953037817730938e-07, + "loss": 1.0329, + "step": 36010 + }, + { + "epoch": 2.7912743616567863, + "grad_norm": 1.3408979796931835, + "learning_rate": 1.3956912585244888e-07, + "loss": 1.0299, + "step": 36020 + }, + { + "epoch": 2.792049285133093, + "grad_norm": 1.3409441909356332, + "learning_rate": 1.3960787352758837e-07, + "loss": 0.9957, + "step": 36030 + }, + { + "epoch": 2.7928242086094, + "grad_norm": 1.3811351667200689, + "learning_rate": 1.3964662120272784e-07, + "loss": 1.0361, + "step": 36040 + }, + { + "epoch": 2.7935991320857063, + "grad_norm": 1.4018500352007377, + "learning_rate": 1.3968536887786734e-07, + "loss": 1.016, + "step": 36050 + }, + { + "epoch": 2.794374055562013, + "grad_norm": 1.3167639876960535, + "learning_rate": 1.3972411655300683e-07, + "loss": 1.0081, + "step": 36060 + }, + { + "epoch": 2.79514897903832, + "grad_norm": 1.394297182606109, + "learning_rate": 1.3976286422814633e-07, + "loss": 1.0291, + "step": 36070 + }, + { + "epoch": 2.7959239025146267, + "grad_norm": 1.3231697762229995, + "learning_rate": 1.3980161190328582e-07, + "loss": 0.9904, + "step": 36080 + }, + { + "epoch": 2.7966988259909336, + "grad_norm": 1.3107935753916795, + "learning_rate": 1.3984035957842532e-07, + "loss": 1.05, + "step": 36090 + }, + { + "epoch": 2.7974737494672404, + "grad_norm": 1.340345982687143, + "learning_rate": 1.3987910725356481e-07, + "loss": 1.0384, + "step": 36100 + }, + { + "epoch": 2.7982486729435467, + "grad_norm": 1.2740826122262514, + "learning_rate": 1.3991785492870428e-07, + "loss": 1.0162, + "step": 36110 + }, + { + "epoch": 2.7990235964198535, + "grad_norm": 1.4487258072893365, + "learning_rate": 1.3995660260384378e-07, + "loss": 1.0144, + "step": 36120 + }, + { + "epoch": 2.7997985198961604, + "grad_norm": 1.4426201230217561, + "learning_rate": 1.3999535027898327e-07, + "loss": 0.9943, + "step": 36130 + }, + { + "epoch": 2.8005734433724667, + "grad_norm": 1.2738689401992314, + "learning_rate": 1.4003409795412277e-07, + "loss": 1.0366, + "step": 36140 + }, + { + "epoch": 2.8013483668487735, + "grad_norm": 1.3130115729158536, + "learning_rate": 1.4007284562926227e-07, + "loss": 1.0033, + "step": 36150 + }, + { + "epoch": 2.8021232903250803, + "grad_norm": 1.3190240653167937, + "learning_rate": 1.4011159330440176e-07, + "loss": 1.0086, + "step": 36160 + }, + { + "epoch": 2.802898213801387, + "grad_norm": 1.3326339285942956, + "learning_rate": 1.4015034097954123e-07, + "loss": 1.0138, + "step": 36170 + }, + { + "epoch": 2.803673137277694, + "grad_norm": 1.301151855495348, + "learning_rate": 1.4018908865468073e-07, + "loss": 1.0115, + "step": 36180 + }, + { + "epoch": 2.8044480607540008, + "grad_norm": 1.357930882096236, + "learning_rate": 1.4022783632982022e-07, + "loss": 1.0183, + "step": 36190 + }, + { + "epoch": 2.805222984230307, + "grad_norm": 1.3257740185724427, + "learning_rate": 1.4026658400495972e-07, + "loss": 1.0233, + "step": 36200 + }, + { + "epoch": 2.805997907706614, + "grad_norm": 1.4143705626293364, + "learning_rate": 1.403053316800992e-07, + "loss": 1.041, + "step": 36210 + }, + { + "epoch": 2.8067728311829208, + "grad_norm": 1.2703924965293802, + "learning_rate": 1.403440793552387e-07, + "loss": 1.0223, + "step": 36220 + }, + { + "epoch": 2.807547754659227, + "grad_norm": 1.3307736742332297, + "learning_rate": 1.403828270303782e-07, + "loss": 0.9987, + "step": 36230 + }, + { + "epoch": 2.808322678135534, + "grad_norm": 1.2758486906293556, + "learning_rate": 1.4042157470551767e-07, + "loss": 1.0169, + "step": 36240 + }, + { + "epoch": 2.8090976016118407, + "grad_norm": 1.3774450797286135, + "learning_rate": 1.4046032238065717e-07, + "loss": 1.0099, + "step": 36250 + }, + { + "epoch": 2.8098725250881476, + "grad_norm": 1.2772572913959273, + "learning_rate": 1.4049907005579666e-07, + "loss": 1.0099, + "step": 36260 + }, + { + "epoch": 2.8106474485644544, + "grad_norm": 1.2928033452139482, + "learning_rate": 1.4053781773093616e-07, + "loss": 1.0358, + "step": 36270 + }, + { + "epoch": 2.811422372040761, + "grad_norm": 1.3351671462296577, + "learning_rate": 1.4057656540607565e-07, + "loss": 0.9971, + "step": 36280 + }, + { + "epoch": 2.8121972955170675, + "grad_norm": 1.4068513112426078, + "learning_rate": 1.4061531308121515e-07, + "loss": 1.012, + "step": 36290 + }, + { + "epoch": 2.8129722189933744, + "grad_norm": 1.2723869000950798, + "learning_rate": 1.4065406075635465e-07, + "loss": 1.0089, + "step": 36300 + }, + { + "epoch": 2.813747142469681, + "grad_norm": 1.3169825073431949, + "learning_rate": 1.4069280843149411e-07, + "loss": 1.0051, + "step": 36310 + }, + { + "epoch": 2.814522065945988, + "grad_norm": 1.3069046585507216, + "learning_rate": 1.407315561066336e-07, + "loss": 1.0111, + "step": 36320 + }, + { + "epoch": 2.8152969894222943, + "grad_norm": 1.3412400932581, + "learning_rate": 1.407703037817731e-07, + "loss": 1.0041, + "step": 36330 + }, + { + "epoch": 2.816071912898601, + "grad_norm": 1.2908562648472268, + "learning_rate": 1.408090514569126e-07, + "loss": 0.9971, + "step": 36340 + }, + { + "epoch": 2.816846836374908, + "grad_norm": 1.3127432119745082, + "learning_rate": 1.408477991320521e-07, + "loss": 1.0012, + "step": 36350 + }, + { + "epoch": 2.8176217598512148, + "grad_norm": 1.3018230640001034, + "learning_rate": 1.408865468071916e-07, + "loss": 1.0386, + "step": 36360 + }, + { + "epoch": 2.8183966833275216, + "grad_norm": 1.3853763739419112, + "learning_rate": 1.409252944823311e-07, + "loss": 1.0102, + "step": 36370 + }, + { + "epoch": 2.8191716068038284, + "grad_norm": 1.3292632274289762, + "learning_rate": 1.4096404215747056e-07, + "loss": 0.9895, + "step": 36380 + }, + { + "epoch": 2.8199465302801348, + "grad_norm": 1.489784712845495, + "learning_rate": 1.4100278983261005e-07, + "loss": 1.0186, + "step": 36390 + }, + { + "epoch": 2.8207214537564416, + "grad_norm": 1.3262621290462095, + "learning_rate": 1.4104153750774955e-07, + "loss": 1.013, + "step": 36400 + }, + { + "epoch": 2.8214963772327484, + "grad_norm": 1.7414160147056261, + "learning_rate": 1.4108028518288904e-07, + "loss": 1.037, + "step": 36410 + }, + { + "epoch": 2.8222713007090547, + "grad_norm": 1.3335776176877931, + "learning_rate": 1.4111903285802854e-07, + "loss": 1.0474, + "step": 36420 + }, + { + "epoch": 2.8230462241853616, + "grad_norm": 1.3287573051570047, + "learning_rate": 1.4115778053316803e-07, + "loss": 1.022, + "step": 36430 + }, + { + "epoch": 2.8238211476616684, + "grad_norm": 1.2663546701948891, + "learning_rate": 1.4119652820830753e-07, + "loss": 1.021, + "step": 36440 + }, + { + "epoch": 2.824596071137975, + "grad_norm": 1.3057680538247451, + "learning_rate": 1.41235275883447e-07, + "loss": 1.0189, + "step": 36450 + }, + { + "epoch": 2.825370994614282, + "grad_norm": 1.2403941302664054, + "learning_rate": 1.412740235585865e-07, + "loss": 1.0225, + "step": 36460 + }, + { + "epoch": 2.826145918090589, + "grad_norm": 1.3460661021290243, + "learning_rate": 1.41312771233726e-07, + "loss": 1.0007, + "step": 36470 + }, + { + "epoch": 2.826920841566895, + "grad_norm": 1.3635007308888514, + "learning_rate": 1.4135151890886548e-07, + "loss": 1.0117, + "step": 36480 + }, + { + "epoch": 2.827695765043202, + "grad_norm": 1.3987787539927352, + "learning_rate": 1.4139026658400498e-07, + "loss": 1.0431, + "step": 36490 + }, + { + "epoch": 2.828470688519509, + "grad_norm": 1.418397415804187, + "learning_rate": 1.4142901425914448e-07, + "loss": 1.0066, + "step": 36500 + }, + { + "epoch": 2.828470688519509, + "eval_loss": 1.0118157863616943, + "eval_runtime": 320.3254, + "eval_samples_per_second": 35.81, + "eval_steps_per_second": 8.953, + "step": 36500 + }, + { + "epoch": 2.829245611995815, + "grad_norm": 1.3465113125195065, + "learning_rate": 1.4146776193428394e-07, + "loss": 0.999, + "step": 36510 + }, + { + "epoch": 2.830020535472122, + "grad_norm": 1.3090074937855884, + "learning_rate": 1.4150650960942344e-07, + "loss": 0.9959, + "step": 36520 + }, + { + "epoch": 2.8307954589484288, + "grad_norm": 1.319257525236101, + "learning_rate": 1.4154525728456294e-07, + "loss": 1.0266, + "step": 36530 + }, + { + "epoch": 2.8315703824247356, + "grad_norm": 1.302531120230377, + "learning_rate": 1.4158400495970243e-07, + "loss": 1.0143, + "step": 36540 + }, + { + "epoch": 2.8323453059010424, + "grad_norm": 1.2932936463625133, + "learning_rate": 1.4162275263484193e-07, + "loss": 1.0213, + "step": 36550 + }, + { + "epoch": 2.833120229377349, + "grad_norm": 1.3503810035492483, + "learning_rate": 1.4166150030998142e-07, + "loss": 0.9984, + "step": 36560 + }, + { + "epoch": 2.8338951528536556, + "grad_norm": 1.334086913759021, + "learning_rate": 1.4170024798512092e-07, + "loss": 1.0362, + "step": 36570 + }, + { + "epoch": 2.8346700763299624, + "grad_norm": 1.3452897252887577, + "learning_rate": 1.417389956602604e-07, + "loss": 1.0411, + "step": 36580 + }, + { + "epoch": 2.835444999806269, + "grad_norm": 1.5204685534124789, + "learning_rate": 1.4177774333539988e-07, + "loss": 0.9997, + "step": 36590 + }, + { + "epoch": 2.836219923282576, + "grad_norm": 1.3424093444706333, + "learning_rate": 1.4181649101053938e-07, + "loss": 0.9993, + "step": 36600 + }, + { + "epoch": 2.8369948467588824, + "grad_norm": 1.3250277184762091, + "learning_rate": 1.4185523868567887e-07, + "loss": 1.0146, + "step": 36610 + }, + { + "epoch": 2.837769770235189, + "grad_norm": 1.3365096043505493, + "learning_rate": 1.4189398636081837e-07, + "loss": 1.0171, + "step": 36620 + }, + { + "epoch": 2.838544693711496, + "grad_norm": 1.2847548893849814, + "learning_rate": 1.4193273403595786e-07, + "loss": 0.9908, + "step": 36630 + }, + { + "epoch": 2.839319617187803, + "grad_norm": 1.3075090313160642, + "learning_rate": 1.4197148171109736e-07, + "loss": 1.0062, + "step": 36640 + }, + { + "epoch": 2.8400945406641096, + "grad_norm": 1.2846645331874156, + "learning_rate": 1.4201022938623683e-07, + "loss": 1.0242, + "step": 36650 + }, + { + "epoch": 2.840869464140416, + "grad_norm": 1.3842509171749204, + "learning_rate": 1.4204897706137632e-07, + "loss": 1.043, + "step": 36660 + }, + { + "epoch": 2.8416443876167228, + "grad_norm": 1.316551440809172, + "learning_rate": 1.4208772473651582e-07, + "loss": 1.0072, + "step": 36670 + }, + { + "epoch": 2.8424193110930296, + "grad_norm": 1.4050535013092393, + "learning_rate": 1.4212647241165532e-07, + "loss": 1.012, + "step": 36680 + }, + { + "epoch": 2.8431942345693364, + "grad_norm": 1.3296814472828273, + "learning_rate": 1.421652200867948e-07, + "loss": 1.0178, + "step": 36690 + }, + { + "epoch": 2.8439691580456428, + "grad_norm": 1.2928520589321755, + "learning_rate": 1.422039677619343e-07, + "loss": 1.0198, + "step": 36700 + }, + { + "epoch": 2.8447440815219496, + "grad_norm": 1.311500366311242, + "learning_rate": 1.422427154370738e-07, + "loss": 1.0061, + "step": 36710 + }, + { + "epoch": 2.8455190049982564, + "grad_norm": 1.4032497925455238, + "learning_rate": 1.4228146311221327e-07, + "loss": 1.0263, + "step": 36720 + }, + { + "epoch": 2.846293928474563, + "grad_norm": 1.2861423955500197, + "learning_rate": 1.4232021078735277e-07, + "loss": 1.0088, + "step": 36730 + }, + { + "epoch": 2.84706885195087, + "grad_norm": 1.4099352973626367, + "learning_rate": 1.4235895846249226e-07, + "loss": 1.0071, + "step": 36740 + }, + { + "epoch": 2.847843775427177, + "grad_norm": 1.2481388513899794, + "learning_rate": 1.4239770613763176e-07, + "loss": 0.9891, + "step": 36750 + }, + { + "epoch": 2.848618698903483, + "grad_norm": 1.2684188954705469, + "learning_rate": 1.4243645381277125e-07, + "loss": 1.0076, + "step": 36760 + }, + { + "epoch": 2.84939362237979, + "grad_norm": 1.221765229276093, + "learning_rate": 1.4247520148791075e-07, + "loss": 0.9984, + "step": 36770 + }, + { + "epoch": 2.850168545856097, + "grad_norm": 1.4067251185048906, + "learning_rate": 1.4251394916305024e-07, + "loss": 1.0184, + "step": 36780 + }, + { + "epoch": 2.850943469332403, + "grad_norm": 1.3428205286383743, + "learning_rate": 1.4255269683818971e-07, + "loss": 1.04, + "step": 36790 + }, + { + "epoch": 2.85171839280871, + "grad_norm": 1.360165111400424, + "learning_rate": 1.425914445133292e-07, + "loss": 1.0195, + "step": 36800 + }, + { + "epoch": 2.852493316285017, + "grad_norm": 1.332830557379784, + "learning_rate": 1.426301921884687e-07, + "loss": 1.0354, + "step": 36810 + }, + { + "epoch": 2.8532682397613236, + "grad_norm": 1.3530932437242356, + "learning_rate": 1.426689398636082e-07, + "loss": 1.0012, + "step": 36820 + }, + { + "epoch": 2.8540431632376304, + "grad_norm": 1.2865232478141533, + "learning_rate": 1.427076875387477e-07, + "loss": 0.9875, + "step": 36830 + }, + { + "epoch": 2.854818086713937, + "grad_norm": 1.40895028403881, + "learning_rate": 1.427464352138872e-07, + "loss": 1.0095, + "step": 36840 + }, + { + "epoch": 2.8555930101902436, + "grad_norm": 1.329776695233931, + "learning_rate": 1.4278518288902666e-07, + "loss": 1.0077, + "step": 36850 + }, + { + "epoch": 2.8563679336665504, + "grad_norm": 1.3861754420803105, + "learning_rate": 1.4282393056416616e-07, + "loss": 1.0065, + "step": 36860 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 1.317157209265148, + "learning_rate": 1.4286267823930565e-07, + "loss": 1.018, + "step": 36870 + }, + { + "epoch": 2.857917780619164, + "grad_norm": 1.3583613245098878, + "learning_rate": 1.4290142591444515e-07, + "loss": 1.0065, + "step": 36880 + }, + { + "epoch": 2.8586927040954704, + "grad_norm": 1.3373893492099793, + "learning_rate": 1.4294017358958464e-07, + "loss": 1.009, + "step": 36890 + }, + { + "epoch": 2.859467627571777, + "grad_norm": 1.348383843939623, + "learning_rate": 1.4297892126472414e-07, + "loss": 1.0162, + "step": 36900 + }, + { + "epoch": 2.860242551048084, + "grad_norm": 1.315462165153355, + "learning_rate": 1.4301766893986363e-07, + "loss": 0.9874, + "step": 36910 + }, + { + "epoch": 2.861017474524391, + "grad_norm": 1.3022939109672058, + "learning_rate": 1.430564166150031e-07, + "loss": 0.994, + "step": 36920 + }, + { + "epoch": 2.8617923980006976, + "grad_norm": 1.3790043315575773, + "learning_rate": 1.430951642901426e-07, + "loss": 1.0536, + "step": 36930 + }, + { + "epoch": 2.862567321477004, + "grad_norm": 1.2892216558517655, + "learning_rate": 1.431339119652821e-07, + "loss": 0.991, + "step": 36940 + }, + { + "epoch": 2.863342244953311, + "grad_norm": 1.3206341534793296, + "learning_rate": 1.431726596404216e-07, + "loss": 0.998, + "step": 36950 + }, + { + "epoch": 2.8641171684296176, + "grad_norm": 1.4002636958742891, + "learning_rate": 1.4321140731556108e-07, + "loss": 1.0049, + "step": 36960 + }, + { + "epoch": 2.8648920919059244, + "grad_norm": 1.3485489559799648, + "learning_rate": 1.4325015499070058e-07, + "loss": 1.0276, + "step": 36970 + }, + { + "epoch": 2.865667015382231, + "grad_norm": 1.3295444136895094, + "learning_rate": 1.4328890266584008e-07, + "loss": 1.0712, + "step": 36980 + }, + { + "epoch": 2.8664419388585376, + "grad_norm": 1.317736175575181, + "learning_rate": 1.4332765034097954e-07, + "loss": 1.0003, + "step": 36990 + }, + { + "epoch": 2.8672168623348444, + "grad_norm": 1.2884540237849849, + "learning_rate": 1.4336639801611904e-07, + "loss": 0.9978, + "step": 37000 + }, + { + "epoch": 2.8672168623348444, + "eval_loss": 1.0101838111877441, + "eval_runtime": 320.8251, + "eval_samples_per_second": 35.755, + "eval_steps_per_second": 8.939, + "step": 37000 + }, + { + "epoch": 2.867991785811151, + "grad_norm": 1.351131102381207, + "learning_rate": 1.4340514569125854e-07, + "loss": 1.0045, + "step": 37010 + }, + { + "epoch": 2.868766709287458, + "grad_norm": 1.4260784766503207, + "learning_rate": 1.4344389336639803e-07, + "loss": 1.0055, + "step": 37020 + }, + { + "epoch": 2.869541632763765, + "grad_norm": 1.4183003032181363, + "learning_rate": 1.4348264104153753e-07, + "loss": 1.0071, + "step": 37030 + }, + { + "epoch": 2.870316556240071, + "grad_norm": 1.2651630922302857, + "learning_rate": 1.4352138871667702e-07, + "loss": 1.0245, + "step": 37040 + }, + { + "epoch": 2.871091479716378, + "grad_norm": 1.3861270830012533, + "learning_rate": 1.4356013639181652e-07, + "loss": 0.9985, + "step": 37050 + }, + { + "epoch": 2.871866403192685, + "grad_norm": 1.278054042973367, + "learning_rate": 1.4359888406695599e-07, + "loss": 1.0172, + "step": 37060 + }, + { + "epoch": 2.872641326668991, + "grad_norm": 1.3393548494341327, + "learning_rate": 1.4363763174209548e-07, + "loss": 1.0217, + "step": 37070 + }, + { + "epoch": 2.873416250145298, + "grad_norm": 1.3674474945935602, + "learning_rate": 1.4367637941723498e-07, + "loss": 1.0451, + "step": 37080 + }, + { + "epoch": 2.874191173621605, + "grad_norm": 1.387130025737813, + "learning_rate": 1.4371512709237447e-07, + "loss": 0.9963, + "step": 37090 + }, + { + "epoch": 2.8749660970979116, + "grad_norm": 1.293328676138041, + "learning_rate": 1.4375387476751397e-07, + "loss": 1.0291, + "step": 37100 + }, + { + "epoch": 2.8757410205742184, + "grad_norm": 1.3097369146271747, + "learning_rate": 1.4379262244265346e-07, + "loss": 1.0027, + "step": 37110 + }, + { + "epoch": 2.8765159440505252, + "grad_norm": 1.3020047596151436, + "learning_rate": 1.4383137011779296e-07, + "loss": 0.9803, + "step": 37120 + }, + { + "epoch": 2.8772908675268316, + "grad_norm": 1.2504564918130554, + "learning_rate": 1.4387011779293243e-07, + "loss": 1.0286, + "step": 37130 + }, + { + "epoch": 2.8780657910031384, + "grad_norm": 1.300032433782937, + "learning_rate": 1.4390886546807192e-07, + "loss": 1.0015, + "step": 37140 + }, + { + "epoch": 2.8788407144794452, + "grad_norm": 1.387341504984675, + "learning_rate": 1.4394761314321142e-07, + "loss": 1.0183, + "step": 37150 + }, + { + "epoch": 2.8796156379557516, + "grad_norm": 1.6661555920727253, + "learning_rate": 1.4398636081835091e-07, + "loss": 1.002, + "step": 37160 + }, + { + "epoch": 2.8803905614320584, + "grad_norm": 1.2427822606994738, + "learning_rate": 1.440251084934904e-07, + "loss": 0.9986, + "step": 37170 + }, + { + "epoch": 2.881165484908365, + "grad_norm": 1.3039637147393484, + "learning_rate": 1.440638561686299e-07, + "loss": 1.0005, + "step": 37180 + }, + { + "epoch": 2.881940408384672, + "grad_norm": 1.3130864533152886, + "learning_rate": 1.441026038437694e-07, + "loss": 1.005, + "step": 37190 + }, + { + "epoch": 2.882715331860979, + "grad_norm": 1.2786918180134499, + "learning_rate": 1.4414135151890887e-07, + "loss": 1.0182, + "step": 37200 + }, + { + "epoch": 2.8834902553372856, + "grad_norm": 1.2982897565395297, + "learning_rate": 1.4418009919404837e-07, + "loss": 1.016, + "step": 37210 + }, + { + "epoch": 2.884265178813592, + "grad_norm": 1.3106449971296463, + "learning_rate": 1.4421884686918786e-07, + "loss": 1.0316, + "step": 37220 + }, + { + "epoch": 2.885040102289899, + "grad_norm": 1.365807148279343, + "learning_rate": 1.4425759454432736e-07, + "loss": 1.0141, + "step": 37230 + }, + { + "epoch": 2.8858150257662056, + "grad_norm": 1.3638222965195066, + "learning_rate": 1.4429634221946685e-07, + "loss": 0.979, + "step": 37240 + }, + { + "epoch": 2.8865899492425124, + "grad_norm": 1.2546701497511832, + "learning_rate": 1.4433508989460635e-07, + "loss": 1.0067, + "step": 37250 + }, + { + "epoch": 2.887364872718819, + "grad_norm": 1.3111929595562197, + "learning_rate": 1.4437383756974582e-07, + "loss": 0.9949, + "step": 37260 + }, + { + "epoch": 2.8881397961951256, + "grad_norm": 1.3649511542148836, + "learning_rate": 1.444125852448853e-07, + "loss": 0.9944, + "step": 37270 + }, + { + "epoch": 2.8889147196714324, + "grad_norm": 1.4004352385186523, + "learning_rate": 1.444513329200248e-07, + "loss": 1.0044, + "step": 37280 + }, + { + "epoch": 2.8896896431477392, + "grad_norm": 1.3650460970425715, + "learning_rate": 1.444900805951643e-07, + "loss": 1.0253, + "step": 37290 + }, + { + "epoch": 2.890464566624046, + "grad_norm": 1.2927658912143876, + "learning_rate": 1.445288282703038e-07, + "loss": 1.0139, + "step": 37300 + }, + { + "epoch": 2.891239490100353, + "grad_norm": 1.3827891723292707, + "learning_rate": 1.445675759454433e-07, + "loss": 1.0074, + "step": 37310 + }, + { + "epoch": 2.8920144135766592, + "grad_norm": 2.389079979481399, + "learning_rate": 1.446063236205828e-07, + "loss": 1.0207, + "step": 37320 + }, + { + "epoch": 2.892789337052966, + "grad_norm": 1.3024981131962734, + "learning_rate": 1.4464507129572226e-07, + "loss": 1.0042, + "step": 37330 + }, + { + "epoch": 2.893564260529273, + "grad_norm": 1.2643114258357084, + "learning_rate": 1.4468381897086175e-07, + "loss": 1.0328, + "step": 37340 + }, + { + "epoch": 2.894339184005579, + "grad_norm": 1.377719348022848, + "learning_rate": 1.4472256664600125e-07, + "loss": 1.0193, + "step": 37350 + }, + { + "epoch": 2.895114107481886, + "grad_norm": 1.4510226798663355, + "learning_rate": 1.4476131432114075e-07, + "loss": 1.0158, + "step": 37360 + }, + { + "epoch": 2.895889030958193, + "grad_norm": 1.338699332357397, + "learning_rate": 1.4480006199628024e-07, + "loss": 1.0278, + "step": 37370 + }, + { + "epoch": 2.8966639544344996, + "grad_norm": 1.3051165769194721, + "learning_rate": 1.4483880967141974e-07, + "loss": 1.008, + "step": 37380 + }, + { + "epoch": 2.8974388779108065, + "grad_norm": 1.3027805462127926, + "learning_rate": 1.4487755734655923e-07, + "loss": 0.9984, + "step": 37390 + }, + { + "epoch": 2.8982138013871133, + "grad_norm": 1.4883648803784497, + "learning_rate": 1.449163050216987e-07, + "loss": 1.0076, + "step": 37400 + }, + { + "epoch": 2.8989887248634196, + "grad_norm": 1.2836460919859904, + "learning_rate": 1.449550526968382e-07, + "loss": 0.9873, + "step": 37410 + }, + { + "epoch": 2.8997636483397264, + "grad_norm": 1.2882556540251464, + "learning_rate": 1.449938003719777e-07, + "loss": 1.0075, + "step": 37420 + }, + { + "epoch": 2.9005385718160333, + "grad_norm": 1.2758818491400796, + "learning_rate": 1.450325480471172e-07, + "loss": 1.0174, + "step": 37430 + }, + { + "epoch": 2.9013134952923396, + "grad_norm": 1.2862439518578694, + "learning_rate": 1.4507129572225668e-07, + "loss": 1.0244, + "step": 37440 + }, + { + "epoch": 2.9020884187686464, + "grad_norm": 1.3165455677756293, + "learning_rate": 1.4511004339739618e-07, + "loss": 1.0114, + "step": 37450 + }, + { + "epoch": 2.9028633422449532, + "grad_norm": 1.3828126115659292, + "learning_rate": 1.4514879107253567e-07, + "loss": 1.0148, + "step": 37460 + }, + { + "epoch": 2.90363826572126, + "grad_norm": 1.3297557397817734, + "learning_rate": 1.4518753874767514e-07, + "loss": 1.0123, + "step": 37470 + }, + { + "epoch": 2.904413189197567, + "grad_norm": 1.726999067921104, + "learning_rate": 1.4522628642281464e-07, + "loss": 1.0402, + "step": 37480 + }, + { + "epoch": 2.9051881126738737, + "grad_norm": 1.2846148504899557, + "learning_rate": 1.4526503409795413e-07, + "loss": 1.0146, + "step": 37490 + }, + { + "epoch": 2.90596303615018, + "grad_norm": 1.3403103130367688, + "learning_rate": 1.4530378177309363e-07, + "loss": 1.0292, + "step": 37500 + }, + { + "epoch": 2.90596303615018, + "eval_loss": 1.0086870193481445, + "eval_runtime": 318.5128, + "eval_samples_per_second": 36.014, + "eval_steps_per_second": 9.004, + "step": 37500 + }, + { + "epoch": 2.906737959626487, + "grad_norm": 1.2807352130428176, + "learning_rate": 1.4534252944823313e-07, + "loss": 1.0011, + "step": 37510 + }, + { + "epoch": 2.9075128831027937, + "grad_norm": 1.3767898885999863, + "learning_rate": 1.4538127712337262e-07, + "loss": 1.0158, + "step": 37520 + }, + { + "epoch": 2.9082878065791005, + "grad_norm": 1.4642008740109218, + "learning_rate": 1.4542002479851212e-07, + "loss": 1.0055, + "step": 37530 + }, + { + "epoch": 2.909062730055407, + "grad_norm": 1.32408101207633, + "learning_rate": 1.4545877247365159e-07, + "loss": 0.9883, + "step": 37540 + }, + { + "epoch": 2.9098376535317136, + "grad_norm": 1.790300028586483, + "learning_rate": 1.4549752014879108e-07, + "loss": 1.0339, + "step": 37550 + }, + { + "epoch": 2.9106125770080205, + "grad_norm": 1.2630176852681325, + "learning_rate": 1.4553626782393058e-07, + "loss": 1.0339, + "step": 37560 + }, + { + "epoch": 2.9113875004843273, + "grad_norm": 1.2906117816625966, + "learning_rate": 1.4557501549907007e-07, + "loss": 1.0019, + "step": 37570 + }, + { + "epoch": 2.912162423960634, + "grad_norm": 1.3760888867362788, + "learning_rate": 1.4561376317420957e-07, + "loss": 0.9932, + "step": 37580 + }, + { + "epoch": 2.9129373474369404, + "grad_norm": 1.2572934825947537, + "learning_rate": 1.4565251084934906e-07, + "loss": 1.0125, + "step": 37590 + }, + { + "epoch": 2.9137122709132472, + "grad_norm": 1.3799228693911743, + "learning_rate": 1.4569125852448853e-07, + "loss": 1.0045, + "step": 37600 + }, + { + "epoch": 2.914487194389554, + "grad_norm": 1.3733148550420926, + "learning_rate": 1.4573000619962803e-07, + "loss": 1.0134, + "step": 37610 + }, + { + "epoch": 2.915262117865861, + "grad_norm": 1.3848649315495984, + "learning_rate": 1.4576875387476752e-07, + "loss": 1.0067, + "step": 37620 + }, + { + "epoch": 2.9160370413421672, + "grad_norm": 1.3352433896879603, + "learning_rate": 1.4580750154990702e-07, + "loss": 1.0006, + "step": 37630 + }, + { + "epoch": 2.916811964818474, + "grad_norm": 1.3545529688815003, + "learning_rate": 1.4584624922504651e-07, + "loss": 1.0007, + "step": 37640 + }, + { + "epoch": 2.917586888294781, + "grad_norm": 1.319308808728763, + "learning_rate": 1.45884996900186e-07, + "loss": 1.0223, + "step": 37650 + }, + { + "epoch": 2.9183618117710877, + "grad_norm": 1.2382061158352888, + "learning_rate": 1.459237445753255e-07, + "loss": 0.9868, + "step": 37660 + }, + { + "epoch": 2.9191367352473945, + "grad_norm": 1.2880297079158896, + "learning_rate": 1.4596249225046497e-07, + "loss": 1.0023, + "step": 37670 + }, + { + "epoch": 2.9199116587237013, + "grad_norm": 1.2923021907659384, + "learning_rate": 1.4600123992560447e-07, + "loss": 1.03, + "step": 37680 + }, + { + "epoch": 2.9206865822000077, + "grad_norm": 1.3954972554060092, + "learning_rate": 1.4603998760074397e-07, + "loss": 1.0089, + "step": 37690 + }, + { + "epoch": 2.9214615056763145, + "grad_norm": 1.4151923321387165, + "learning_rate": 1.4607873527588346e-07, + "loss": 1.0358, + "step": 37700 + }, + { + "epoch": 2.9222364291526213, + "grad_norm": 1.3279346112421098, + "learning_rate": 1.4611748295102296e-07, + "loss": 1.0005, + "step": 37710 + }, + { + "epoch": 2.9230113526289276, + "grad_norm": 1.2909235723974872, + "learning_rate": 1.4615623062616245e-07, + "loss": 1.0187, + "step": 37720 + }, + { + "epoch": 2.9237862761052344, + "grad_norm": 1.2726440672660615, + "learning_rate": 1.4619497830130195e-07, + "loss": 0.9986, + "step": 37730 + }, + { + "epoch": 2.9245611995815413, + "grad_norm": 1.2891610391347836, + "learning_rate": 1.4623372597644142e-07, + "loss": 0.9893, + "step": 37740 + }, + { + "epoch": 2.925336123057848, + "grad_norm": 1.3342700989972907, + "learning_rate": 1.462724736515809e-07, + "loss": 1.0327, + "step": 37750 + }, + { + "epoch": 2.926111046534155, + "grad_norm": 1.2656521886833192, + "learning_rate": 1.463112213267204e-07, + "loss": 1.0143, + "step": 37760 + }, + { + "epoch": 2.9268859700104617, + "grad_norm": 1.2913978337063363, + "learning_rate": 1.463499690018599e-07, + "loss": 1.0115, + "step": 37770 + }, + { + "epoch": 2.927660893486768, + "grad_norm": 1.4216262270436633, + "learning_rate": 1.463887166769994e-07, + "loss": 1.0213, + "step": 37780 + }, + { + "epoch": 2.928435816963075, + "grad_norm": 1.3033789708298789, + "learning_rate": 1.464274643521389e-07, + "loss": 1.041, + "step": 37790 + }, + { + "epoch": 2.9292107404393817, + "grad_norm": 1.595855466427719, + "learning_rate": 1.464662120272784e-07, + "loss": 1.0252, + "step": 37800 + }, + { + "epoch": 2.9299856639156885, + "grad_norm": 1.3879743166584528, + "learning_rate": 1.4650495970241786e-07, + "loss": 1.0007, + "step": 37810 + }, + { + "epoch": 2.930760587391995, + "grad_norm": 1.2928739505532296, + "learning_rate": 1.4654370737755735e-07, + "loss": 0.9911, + "step": 37820 + }, + { + "epoch": 2.9315355108683017, + "grad_norm": 1.3572631072479755, + "learning_rate": 1.4658245505269685e-07, + "loss": 1.0202, + "step": 37830 + }, + { + "epoch": 2.9323104343446085, + "grad_norm": 1.3180717853436528, + "learning_rate": 1.4662120272783634e-07, + "loss": 0.9991, + "step": 37840 + }, + { + "epoch": 2.9330853578209153, + "grad_norm": 1.3956261728077985, + "learning_rate": 1.4665995040297584e-07, + "loss": 0.9776, + "step": 37850 + }, + { + "epoch": 2.933860281297222, + "grad_norm": 1.414123496892295, + "learning_rate": 1.4669869807811534e-07, + "loss": 0.9881, + "step": 37860 + }, + { + "epoch": 2.9346352047735285, + "grad_norm": 1.2910088901139867, + "learning_rate": 1.4673744575325483e-07, + "loss": 0.996, + "step": 37870 + }, + { + "epoch": 2.9354101282498353, + "grad_norm": 1.2228488806990645, + "learning_rate": 1.467761934283943e-07, + "loss": 1.0053, + "step": 37880 + }, + { + "epoch": 2.936185051726142, + "grad_norm": 1.3513563892920695, + "learning_rate": 1.468149411035338e-07, + "loss": 0.9989, + "step": 37890 + }, + { + "epoch": 2.936959975202449, + "grad_norm": 1.293517317026724, + "learning_rate": 1.468536887786733e-07, + "loss": 0.9905, + "step": 37900 + }, + { + "epoch": 2.9377348986787553, + "grad_norm": 1.2944338699639784, + "learning_rate": 1.468924364538128e-07, + "loss": 1.0205, + "step": 37910 + }, + { + "epoch": 2.938509822155062, + "grad_norm": 1.3130668462679307, + "learning_rate": 1.4693118412895228e-07, + "loss": 1.0073, + "step": 37920 + }, + { + "epoch": 2.939284745631369, + "grad_norm": 1.269445892721158, + "learning_rate": 1.4696993180409178e-07, + "loss": 1.0016, + "step": 37930 + }, + { + "epoch": 2.9400596691076757, + "grad_norm": 1.3558964587247762, + "learning_rate": 1.4700867947923127e-07, + "loss": 0.9841, + "step": 37940 + }, + { + "epoch": 2.9408345925839825, + "grad_norm": 1.3679276010282333, + "learning_rate": 1.4704742715437074e-07, + "loss": 1.0174, + "step": 37950 + }, + { + "epoch": 2.9416095160602893, + "grad_norm": 1.3507622843848621, + "learning_rate": 1.4708617482951024e-07, + "loss": 1.0157, + "step": 37960 + }, + { + "epoch": 2.9423844395365957, + "grad_norm": 1.2834504644913758, + "learning_rate": 1.4712492250464973e-07, + "loss": 1.0196, + "step": 37970 + }, + { + "epoch": 2.9431593630129025, + "grad_norm": 1.3991734442976542, + "learning_rate": 1.4716367017978923e-07, + "loss": 1.0134, + "step": 37980 + }, + { + "epoch": 2.9439342864892093, + "grad_norm": 1.2771962577446034, + "learning_rate": 1.4720241785492872e-07, + "loss": 0.9996, + "step": 37990 + }, + { + "epoch": 2.9447092099655157, + "grad_norm": 1.3844008047930878, + "learning_rate": 1.4724116553006822e-07, + "loss": 0.992, + "step": 38000 + }, + { + "epoch": 2.9447092099655157, + "eval_loss": 1.0071407556533813, + "eval_runtime": 320.2037, + "eval_samples_per_second": 35.824, + "eval_steps_per_second": 8.957, + "step": 38000 + }, + { + "epoch": 2.9454841334418225, + "grad_norm": 1.2831909158416364, + "learning_rate": 1.472799132052077e-07, + "loss": 1.0317, + "step": 38010 + }, + { + "epoch": 2.9462590569181293, + "grad_norm": 1.2813647061850189, + "learning_rate": 1.4731866088034718e-07, + "loss": 0.9959, + "step": 38020 + }, + { + "epoch": 2.947033980394436, + "grad_norm": 1.3046135736206672, + "learning_rate": 1.4735740855548668e-07, + "loss": 1.034, + "step": 38030 + }, + { + "epoch": 2.947808903870743, + "grad_norm": 1.2947354988452808, + "learning_rate": 1.4739615623062618e-07, + "loss": 0.9957, + "step": 38040 + }, + { + "epoch": 2.9485838273470497, + "grad_norm": 1.3167547416848977, + "learning_rate": 1.4743490390576567e-07, + "loss": 1.0052, + "step": 38050 + }, + { + "epoch": 2.949358750823356, + "grad_norm": 1.4485573160328098, + "learning_rate": 1.4747365158090517e-07, + "loss": 1.0162, + "step": 38060 + }, + { + "epoch": 2.950133674299663, + "grad_norm": 1.4029198899233264, + "learning_rate": 1.4751239925604466e-07, + "loss": 1.0125, + "step": 38070 + }, + { + "epoch": 2.9509085977759697, + "grad_norm": 1.3256109022394595, + "learning_rate": 1.4755114693118413e-07, + "loss": 1.0227, + "step": 38080 + }, + { + "epoch": 2.951683521252276, + "grad_norm": 1.3353520251373117, + "learning_rate": 1.4758989460632363e-07, + "loss": 1.0428, + "step": 38090 + }, + { + "epoch": 2.952458444728583, + "grad_norm": 1.256671063974885, + "learning_rate": 1.4762864228146312e-07, + "loss": 1.0142, + "step": 38100 + }, + { + "epoch": 2.9532333682048897, + "grad_norm": 1.3744236002189032, + "learning_rate": 1.4766738995660262e-07, + "loss": 1.0118, + "step": 38110 + }, + { + "epoch": 2.9540082916811965, + "grad_norm": 1.3065457246428873, + "learning_rate": 1.477061376317421e-07, + "loss": 1.0081, + "step": 38120 + }, + { + "epoch": 2.9547832151575033, + "grad_norm": 1.3251666721490731, + "learning_rate": 1.477448853068816e-07, + "loss": 1.0041, + "step": 38130 + }, + { + "epoch": 2.95555813863381, + "grad_norm": 1.2641999321149238, + "learning_rate": 1.477836329820211e-07, + "loss": 0.9936, + "step": 38140 + }, + { + "epoch": 2.9563330621101165, + "grad_norm": 1.3246889628032226, + "learning_rate": 1.4782238065716057e-07, + "loss": 1.0113, + "step": 38150 + }, + { + "epoch": 2.9571079855864233, + "grad_norm": 1.2601976580951681, + "learning_rate": 1.4786112833230007e-07, + "loss": 0.9988, + "step": 38160 + }, + { + "epoch": 2.95788290906273, + "grad_norm": 1.3632979799006044, + "learning_rate": 1.4789987600743956e-07, + "loss": 0.9973, + "step": 38170 + }, + { + "epoch": 2.958657832539037, + "grad_norm": 1.2836504416185444, + "learning_rate": 1.4793862368257906e-07, + "loss": 0.9943, + "step": 38180 + }, + { + "epoch": 2.9594327560153433, + "grad_norm": 1.348034274207801, + "learning_rate": 1.4797737135771856e-07, + "loss": 1.0118, + "step": 38190 + }, + { + "epoch": 2.96020767949165, + "grad_norm": 1.2812215947766015, + "learning_rate": 1.4801611903285805e-07, + "loss": 0.9967, + "step": 38200 + }, + { + "epoch": 2.960982602967957, + "grad_norm": 1.3254759740468094, + "learning_rate": 1.4805486670799755e-07, + "loss": 1.0088, + "step": 38210 + }, + { + "epoch": 2.9617575264442637, + "grad_norm": 1.3020289111471908, + "learning_rate": 1.4809361438313702e-07, + "loss": 1.0197, + "step": 38220 + }, + { + "epoch": 2.9625324499205705, + "grad_norm": 1.2781371211077035, + "learning_rate": 1.481323620582765e-07, + "loss": 0.9957, + "step": 38230 + }, + { + "epoch": 2.9633073733968773, + "grad_norm": 1.3248132839861018, + "learning_rate": 1.48171109733416e-07, + "loss": 1.017, + "step": 38240 + }, + { + "epoch": 2.9640822968731837, + "grad_norm": 1.3569750335533133, + "learning_rate": 1.482098574085555e-07, + "loss": 1.003, + "step": 38250 + }, + { + "epoch": 2.9648572203494905, + "grad_norm": 1.399090609927002, + "learning_rate": 1.48248605083695e-07, + "loss": 1.0201, + "step": 38260 + }, + { + "epoch": 2.9656321438257973, + "grad_norm": 1.4367959674207214, + "learning_rate": 1.482873527588345e-07, + "loss": 1.0072, + "step": 38270 + }, + { + "epoch": 2.9664070673021037, + "grad_norm": 1.3173851557217753, + "learning_rate": 1.48326100433974e-07, + "loss": 1.027, + "step": 38280 + }, + { + "epoch": 2.9671819907784105, + "grad_norm": 1.3812145685579134, + "learning_rate": 1.4836484810911346e-07, + "loss": 0.9775, + "step": 38290 + }, + { + "epoch": 2.9679569142547173, + "grad_norm": 1.3432176799365632, + "learning_rate": 1.4840359578425295e-07, + "loss": 1.009, + "step": 38300 + }, + { + "epoch": 2.968731837731024, + "grad_norm": 1.3245816092236755, + "learning_rate": 1.4844234345939245e-07, + "loss": 0.9991, + "step": 38310 + }, + { + "epoch": 2.969506761207331, + "grad_norm": 1.3145682588684748, + "learning_rate": 1.4848109113453194e-07, + "loss": 1.023, + "step": 38320 + }, + { + "epoch": 2.9702816846836377, + "grad_norm": 1.331348495884396, + "learning_rate": 1.4851983880967144e-07, + "loss": 1.0183, + "step": 38330 + }, + { + "epoch": 2.971056608159944, + "grad_norm": 1.4302379915894878, + "learning_rate": 1.4855858648481093e-07, + "loss": 1.0118, + "step": 38340 + }, + { + "epoch": 2.971831531636251, + "grad_norm": 1.3415179433215967, + "learning_rate": 1.485973341599504e-07, + "loss": 0.9856, + "step": 38350 + }, + { + "epoch": 2.9726064551125577, + "grad_norm": 1.3050212918758983, + "learning_rate": 1.486360818350899e-07, + "loss": 0.9965, + "step": 38360 + }, + { + "epoch": 2.973381378588864, + "grad_norm": 1.2848903637040843, + "learning_rate": 1.486748295102294e-07, + "loss": 0.9889, + "step": 38370 + }, + { + "epoch": 2.974156302065171, + "grad_norm": 1.276009800475485, + "learning_rate": 1.487135771853689e-07, + "loss": 0.9977, + "step": 38380 + }, + { + "epoch": 2.9749312255414777, + "grad_norm": 1.3753815016466415, + "learning_rate": 1.4875232486050839e-07, + "loss": 0.9981, + "step": 38390 + }, + { + "epoch": 2.9757061490177845, + "grad_norm": 1.3956151943062063, + "learning_rate": 1.4879107253564788e-07, + "loss": 1.0525, + "step": 38400 + }, + { + "epoch": 2.9764810724940913, + "grad_norm": 1.2766443878536913, + "learning_rate": 1.4882982021078738e-07, + "loss": 1.0234, + "step": 38410 + }, + { + "epoch": 2.977255995970398, + "grad_norm": 1.3219854866948675, + "learning_rate": 1.4886856788592685e-07, + "loss": 1.0356, + "step": 38420 + }, + { + "epoch": 2.9780309194467045, + "grad_norm": 1.2291250541349399, + "learning_rate": 1.4890731556106634e-07, + "loss": 0.9647, + "step": 38430 + }, + { + "epoch": 2.9788058429230113, + "grad_norm": 1.248807880158372, + "learning_rate": 1.4894606323620584e-07, + "loss": 1.0282, + "step": 38440 + }, + { + "epoch": 2.979580766399318, + "grad_norm": 1.3476491984853636, + "learning_rate": 1.4898481091134533e-07, + "loss": 0.9998, + "step": 38450 + }, + { + "epoch": 2.980355689875625, + "grad_norm": 1.2874578871379165, + "learning_rate": 1.4902355858648483e-07, + "loss": 0.9993, + "step": 38460 + }, + { + "epoch": 2.9811306133519313, + "grad_norm": 1.3253115613374447, + "learning_rate": 1.4906230626162432e-07, + "loss": 1.0004, + "step": 38470 + }, + { + "epoch": 2.981905536828238, + "grad_norm": 1.2656487921598563, + "learning_rate": 1.4910105393676382e-07, + "loss": 0.9872, + "step": 38480 + }, + { + "epoch": 2.982680460304545, + "grad_norm": 1.2748376108137098, + "learning_rate": 1.491398016119033e-07, + "loss": 0.9877, + "step": 38490 + }, + { + "epoch": 2.9834553837808517, + "grad_norm": 1.3986198185266658, + "learning_rate": 1.4917854928704278e-07, + "loss": 1.0266, + "step": 38500 + }, + { + "epoch": 2.9834553837808517, + "eval_loss": 1.0057400465011597, + "eval_runtime": 320.394, + "eval_samples_per_second": 35.803, + "eval_steps_per_second": 8.951, + "step": 38500 + }, + { + "epoch": 2.9842303072571585, + "grad_norm": 1.3605572925798037, + "learning_rate": 1.4921729696218228e-07, + "loss": 1.0325, + "step": 38510 + }, + { + "epoch": 2.985005230733465, + "grad_norm": 1.282539941861199, + "learning_rate": 1.4925604463732177e-07, + "loss": 0.9914, + "step": 38520 + }, + { + "epoch": 2.9857801542097717, + "grad_norm": 1.3179332384440476, + "learning_rate": 1.4929479231246127e-07, + "loss": 0.9989, + "step": 38530 + }, + { + "epoch": 2.9865550776860785, + "grad_norm": 1.335920334590378, + "learning_rate": 1.4933353998760077e-07, + "loss": 0.9895, + "step": 38540 + }, + { + "epoch": 2.9873300011623853, + "grad_norm": 1.2929615795297869, + "learning_rate": 1.4937228766274026e-07, + "loss": 1.0073, + "step": 38550 + }, + { + "epoch": 2.9881049246386917, + "grad_norm": 1.3444846576903262, + "learning_rate": 1.4941103533787973e-07, + "loss": 0.9892, + "step": 38560 + }, + { + "epoch": 2.9888798481149985, + "grad_norm": 1.376656999386922, + "learning_rate": 1.4944978301301923e-07, + "loss": 1.0147, + "step": 38570 + }, + { + "epoch": 2.9896547715913053, + "grad_norm": 1.3000951707527257, + "learning_rate": 1.4948853068815872e-07, + "loss": 1.0124, + "step": 38580 + }, + { + "epoch": 2.990429695067612, + "grad_norm": 1.3933068294720032, + "learning_rate": 1.4952727836329822e-07, + "loss": 1.0037, + "step": 38590 + }, + { + "epoch": 2.991204618543919, + "grad_norm": 1.3621506729071335, + "learning_rate": 1.495660260384377e-07, + "loss": 0.9948, + "step": 38600 + }, + { + "epoch": 2.9919795420202258, + "grad_norm": 1.3593297266591404, + "learning_rate": 1.496047737135772e-07, + "loss": 0.9928, + "step": 38610 + }, + { + "epoch": 2.992754465496532, + "grad_norm": 1.2937935158937348, + "learning_rate": 1.496435213887167e-07, + "loss": 1.0082, + "step": 38620 + }, + { + "epoch": 2.993529388972839, + "grad_norm": 1.4097932829718733, + "learning_rate": 1.4968226906385617e-07, + "loss": 1.0091, + "step": 38630 + }, + { + "epoch": 2.9943043124491457, + "grad_norm": 1.340315313932516, + "learning_rate": 1.4972101673899567e-07, + "loss": 1.0267, + "step": 38640 + }, + { + "epoch": 2.995079235925452, + "grad_norm": 1.3263486206366728, + "learning_rate": 1.4975976441413516e-07, + "loss": 0.9939, + "step": 38650 + }, + { + "epoch": 2.995854159401759, + "grad_norm": 1.3178364889090595, + "learning_rate": 1.4979851208927466e-07, + "loss": 0.9877, + "step": 38660 + }, + { + "epoch": 2.9966290828780657, + "grad_norm": 1.3154912202038966, + "learning_rate": 1.4983725976441415e-07, + "loss": 1.0226, + "step": 38670 + }, + { + "epoch": 2.9974040063543725, + "grad_norm": 1.3158892308376944, + "learning_rate": 1.4987600743955365e-07, + "loss": 0.9939, + "step": 38680 + }, + { + "epoch": 2.9981789298306794, + "grad_norm": 1.409449898540306, + "learning_rate": 1.4991475511469312e-07, + "loss": 1.0062, + "step": 38690 + }, + { + "epoch": 2.998953853306986, + "grad_norm": 1.2774371293019815, + "learning_rate": 1.4995350278983261e-07, + "loss": 0.9731, + "step": 38700 + }, + { + "epoch": 2.9997287767832925, + "grad_norm": 1.2837784469067306, + "learning_rate": 1.499922504649721e-07, + "loss": 0.994, + "step": 38710 + }, + { + "epoch": 3.0005037002595993, + "grad_norm": 1.2599567844283357, + "learning_rate": 1.500309981401116e-07, + "loss": 1.0236, + "step": 38720 + }, + { + "epoch": 3.001278623735906, + "grad_norm": 1.3532081244625027, + "learning_rate": 1.500697458152511e-07, + "loss": 1.0355, + "step": 38730 + }, + { + "epoch": 3.002053547212213, + "grad_norm": 1.2691524458717447, + "learning_rate": 1.501084934903906e-07, + "loss": 1.0105, + "step": 38740 + }, + { + "epoch": 3.0028284706885193, + "grad_norm": 1.2788148799736794, + "learning_rate": 1.501472411655301e-07, + "loss": 1.0175, + "step": 38750 + }, + { + "epoch": 3.003603394164826, + "grad_norm": 1.2709803187243154, + "learning_rate": 1.5018598884066956e-07, + "loss": 1.0255, + "step": 38760 + }, + { + "epoch": 3.004378317641133, + "grad_norm": 1.3233809812484543, + "learning_rate": 1.5022473651580906e-07, + "loss": 1.0178, + "step": 38770 + }, + { + "epoch": 3.0051532411174398, + "grad_norm": 1.3187338503849442, + "learning_rate": 1.5026348419094855e-07, + "loss": 1.0024, + "step": 38780 + }, + { + "epoch": 3.0059281645937466, + "grad_norm": 1.366487813220754, + "learning_rate": 1.5030223186608805e-07, + "loss": 1.0079, + "step": 38790 + }, + { + "epoch": 3.006703088070053, + "grad_norm": 1.3955541777501825, + "learning_rate": 1.5034097954122754e-07, + "loss": 0.983, + "step": 38800 + }, + { + "epoch": 3.0074780115463597, + "grad_norm": 1.27170778788769, + "learning_rate": 1.5037972721636704e-07, + "loss": 1.0287, + "step": 38810 + }, + { + "epoch": 3.0082529350226666, + "grad_norm": 1.2932841425166042, + "learning_rate": 1.5041847489150653e-07, + "loss": 1.0078, + "step": 38820 + }, + { + "epoch": 3.0090278584989734, + "grad_norm": 1.3859583569148253, + "learning_rate": 1.50457222566646e-07, + "loss": 1.0084, + "step": 38830 + }, + { + "epoch": 3.0098027819752797, + "grad_norm": 1.3279055160499156, + "learning_rate": 1.504959702417855e-07, + "loss": 1.0228, + "step": 38840 + }, + { + "epoch": 3.0105777054515865, + "grad_norm": 1.3070512372966594, + "learning_rate": 1.50534717916925e-07, + "loss": 1.0183, + "step": 38850 + }, + { + "epoch": 3.0113526289278933, + "grad_norm": 1.367152752815744, + "learning_rate": 1.505734655920645e-07, + "loss": 1.0007, + "step": 38860 + }, + { + "epoch": 3.0121275524042, + "grad_norm": 1.309220846405779, + "learning_rate": 1.5061221326720399e-07, + "loss": 0.9932, + "step": 38870 + }, + { + "epoch": 3.012902475880507, + "grad_norm": 1.2943812868322453, + "learning_rate": 1.5065096094234348e-07, + "loss": 1.0256, + "step": 38880 + }, + { + "epoch": 3.0136773993568133, + "grad_norm": 1.344747475977107, + "learning_rate": 1.5068970861748298e-07, + "loss": 0.9905, + "step": 38890 + }, + { + "epoch": 3.01445232283312, + "grad_norm": 1.2634681737240128, + "learning_rate": 1.5072845629262245e-07, + "loss": 0.9923, + "step": 38900 + }, + { + "epoch": 3.015227246309427, + "grad_norm": 1.4071826161635315, + "learning_rate": 1.5076720396776194e-07, + "loss": 1.0359, + "step": 38910 + }, + { + "epoch": 3.0160021697857338, + "grad_norm": 1.4252662312912432, + "learning_rate": 1.5080595164290144e-07, + "loss": 1.0074, + "step": 38920 + }, + { + "epoch": 3.0167770932620406, + "grad_norm": 1.3183457558986642, + "learning_rate": 1.5084469931804093e-07, + "loss": 1.0409, + "step": 38930 + }, + { + "epoch": 3.017552016738347, + "grad_norm": 1.3317717419138273, + "learning_rate": 1.5088344699318043e-07, + "loss": 1.0064, + "step": 38940 + }, + { + "epoch": 3.0183269402146538, + "grad_norm": 1.3923370703535762, + "learning_rate": 1.5092219466831992e-07, + "loss": 1.0104, + "step": 38950 + }, + { + "epoch": 3.0191018636909606, + "grad_norm": 1.25087802812498, + "learning_rate": 1.5096094234345942e-07, + "loss": 0.9883, + "step": 38960 + }, + { + "epoch": 3.0198767871672674, + "grad_norm": 1.4027637998287381, + "learning_rate": 1.509996900185989e-07, + "loss": 1.0056, + "step": 38970 + }, + { + "epoch": 3.0206517106435737, + "grad_norm": 1.4010159193376068, + "learning_rate": 1.5103843769373838e-07, + "loss": 1.0219, + "step": 38980 + }, + { + "epoch": 3.0214266341198805, + "grad_norm": 1.3832599012773952, + "learning_rate": 1.5107718536887788e-07, + "loss": 1.0244, + "step": 38990 + }, + { + "epoch": 3.0222015575961874, + "grad_norm": 1.4329370254341616, + "learning_rate": 1.5111593304401737e-07, + "loss": 1.0088, + "step": 39000 + }, + { + "epoch": 3.0222015575961874, + "eval_loss": 1.0043965578079224, + "eval_runtime": 318.5817, + "eval_samples_per_second": 36.006, + "eval_steps_per_second": 9.002, + "step": 39000 + }, + { + "epoch": 3.022976481072494, + "grad_norm": 1.284333731537915, + "learning_rate": 1.5115468071915687e-07, + "loss": 1.0263, + "step": 39010 + }, + { + "epoch": 3.023751404548801, + "grad_norm": 1.257695283932333, + "learning_rate": 1.5119342839429636e-07, + "loss": 0.9917, + "step": 39020 + }, + { + "epoch": 3.0245263280251073, + "grad_norm": 1.2926632774694942, + "learning_rate": 1.5123217606943586e-07, + "loss": 1.0057, + "step": 39030 + }, + { + "epoch": 3.025301251501414, + "grad_norm": 1.3309442098820348, + "learning_rate": 1.5127092374457533e-07, + "loss": 1.0231, + "step": 39040 + }, + { + "epoch": 3.026076174977721, + "grad_norm": 1.3146452189978162, + "learning_rate": 1.5130967141971482e-07, + "loss": 0.9782, + "step": 39050 + }, + { + "epoch": 3.0268510984540278, + "grad_norm": 1.277909452403682, + "learning_rate": 1.5134841909485432e-07, + "loss": 0.9904, + "step": 39060 + }, + { + "epoch": 3.0276260219303346, + "grad_norm": 1.353833319862366, + "learning_rate": 1.5138716676999382e-07, + "loss": 1.0038, + "step": 39070 + }, + { + "epoch": 3.028400945406641, + "grad_norm": 1.2985437527877934, + "learning_rate": 1.514259144451333e-07, + "loss": 0.9919, + "step": 39080 + }, + { + "epoch": 3.0291758688829478, + "grad_norm": 1.3202242788670673, + "learning_rate": 1.514646621202728e-07, + "loss": 0.9924, + "step": 39090 + }, + { + "epoch": 3.0299507923592546, + "grad_norm": 1.4866350276749156, + "learning_rate": 1.5150340979541228e-07, + "loss": 0.9999, + "step": 39100 + }, + { + "epoch": 3.0307257158355614, + "grad_norm": 1.387953137597493, + "learning_rate": 1.5154215747055177e-07, + "loss": 1.0059, + "step": 39110 + }, + { + "epoch": 3.0315006393118678, + "grad_norm": 1.3138882149489586, + "learning_rate": 1.5158090514569127e-07, + "loss": 0.9839, + "step": 39120 + }, + { + "epoch": 3.0322755627881746, + "grad_norm": 1.2397288246301335, + "learning_rate": 1.5161965282083076e-07, + "loss": 1.0001, + "step": 39130 + }, + { + "epoch": 3.0330504862644814, + "grad_norm": 1.291302317998891, + "learning_rate": 1.5165840049597026e-07, + "loss": 0.9984, + "step": 39140 + }, + { + "epoch": 3.033825409740788, + "grad_norm": 1.5797282461376043, + "learning_rate": 1.5169714817110975e-07, + "loss": 1.0249, + "step": 39150 + }, + { + "epoch": 3.034600333217095, + "grad_norm": 1.350442481384607, + "learning_rate": 1.5173589584624925e-07, + "loss": 1.0164, + "step": 39160 + }, + { + "epoch": 3.0353752566934014, + "grad_norm": 1.4061590800706592, + "learning_rate": 1.5177464352138872e-07, + "loss": 1.0015, + "step": 39170 + }, + { + "epoch": 3.036150180169708, + "grad_norm": 1.3044803147695387, + "learning_rate": 1.5181339119652821e-07, + "loss": 0.9819, + "step": 39180 + }, + { + "epoch": 3.036925103646015, + "grad_norm": 1.2939422453890725, + "learning_rate": 1.518521388716677e-07, + "loss": 0.9788, + "step": 39190 + }, + { + "epoch": 3.037700027122322, + "grad_norm": 1.2995867802455767, + "learning_rate": 1.518908865468072e-07, + "loss": 0.9961, + "step": 39200 + }, + { + "epoch": 3.0384749505986286, + "grad_norm": 1.3176497648744774, + "learning_rate": 1.519296342219467e-07, + "loss": 1.0082, + "step": 39210 + }, + { + "epoch": 3.039249874074935, + "grad_norm": 1.3531686485881274, + "learning_rate": 1.519683818970862e-07, + "loss": 1.024, + "step": 39220 + }, + { + "epoch": 3.0400247975512418, + "grad_norm": 1.3536348224593633, + "learning_rate": 1.520071295722257e-07, + "loss": 1.0028, + "step": 39230 + }, + { + "epoch": 3.0407997210275486, + "grad_norm": 1.306193677093117, + "learning_rate": 1.5204587724736516e-07, + "loss": 1.0171, + "step": 39240 + }, + { + "epoch": 3.0415746445038554, + "grad_norm": 1.2455599565457245, + "learning_rate": 1.5208462492250466e-07, + "loss": 1.0355, + "step": 39250 + }, + { + "epoch": 3.0423495679801618, + "grad_norm": 1.2630118211031023, + "learning_rate": 1.5212337259764415e-07, + "loss": 0.9833, + "step": 39260 + }, + { + "epoch": 3.0431244914564686, + "grad_norm": 1.3782947223887076, + "learning_rate": 1.5216212027278365e-07, + "loss": 1.0211, + "step": 39270 + }, + { + "epoch": 3.0438994149327754, + "grad_norm": 1.2671448104775316, + "learning_rate": 1.5220086794792314e-07, + "loss": 1.0247, + "step": 39280 + }, + { + "epoch": 3.044674338409082, + "grad_norm": 1.300064500866691, + "learning_rate": 1.5223961562306264e-07, + "loss": 1.0269, + "step": 39290 + }, + { + "epoch": 3.045449261885389, + "grad_norm": 1.2654927050395086, + "learning_rate": 1.5227836329820213e-07, + "loss": 1.0014, + "step": 39300 + }, + { + "epoch": 3.0462241853616954, + "grad_norm": 1.269161046033525, + "learning_rate": 1.523171109733416e-07, + "loss": 0.9846, + "step": 39310 + }, + { + "epoch": 3.046999108838002, + "grad_norm": 1.2684403159770063, + "learning_rate": 1.523558586484811e-07, + "loss": 0.9995, + "step": 39320 + }, + { + "epoch": 3.047774032314309, + "grad_norm": 1.2731944189380358, + "learning_rate": 1.523946063236206e-07, + "loss": 0.9597, + "step": 39330 + }, + { + "epoch": 3.048548955790616, + "grad_norm": 1.2912022164582306, + "learning_rate": 1.524333539987601e-07, + "loss": 0.9915, + "step": 39340 + }, + { + "epoch": 3.0493238792669226, + "grad_norm": 1.3301434279339377, + "learning_rate": 1.5247210167389958e-07, + "loss": 1.0091, + "step": 39350 + }, + { + "epoch": 3.050098802743229, + "grad_norm": 1.2479160729937075, + "learning_rate": 1.5251084934903908e-07, + "loss": 1.0091, + "step": 39360 + }, + { + "epoch": 3.050873726219536, + "grad_norm": 1.3308062353653305, + "learning_rate": 1.5254959702417858e-07, + "loss": 0.9836, + "step": 39370 + }, + { + "epoch": 3.0516486496958426, + "grad_norm": 1.23433047053352, + "learning_rate": 1.5258834469931804e-07, + "loss": 0.9822, + "step": 39380 + }, + { + "epoch": 3.0524235731721494, + "grad_norm": 1.331839083697686, + "learning_rate": 1.5262709237445754e-07, + "loss": 1.0044, + "step": 39390 + }, + { + "epoch": 3.0531984966484558, + "grad_norm": 1.3428442018525741, + "learning_rate": 1.5266584004959704e-07, + "loss": 0.9705, + "step": 39400 + }, + { + "epoch": 3.0539734201247626, + "grad_norm": 1.2728149451214263, + "learning_rate": 1.5270458772473653e-07, + "loss": 0.9933, + "step": 39410 + }, + { + "epoch": 3.0547483436010694, + "grad_norm": 1.301418845097946, + "learning_rate": 1.5274333539987603e-07, + "loss": 1.0048, + "step": 39420 + }, + { + "epoch": 3.055523267077376, + "grad_norm": 1.3049034499160417, + "learning_rate": 1.5278208307501552e-07, + "loss": 1.0138, + "step": 39430 + }, + { + "epoch": 3.056298190553683, + "grad_norm": 1.3479148465722532, + "learning_rate": 1.52820830750155e-07, + "loss": 0.9901, + "step": 39440 + }, + { + "epoch": 3.0570731140299894, + "grad_norm": 1.3296049754367654, + "learning_rate": 1.5285957842529449e-07, + "loss": 1.0154, + "step": 39450 + }, + { + "epoch": 3.057848037506296, + "grad_norm": 1.2868549315886377, + "learning_rate": 1.5289832610043398e-07, + "loss": 0.9979, + "step": 39460 + }, + { + "epoch": 3.058622960982603, + "grad_norm": 1.3053877218127672, + "learning_rate": 1.5293707377557348e-07, + "loss": 0.995, + "step": 39470 + }, + { + "epoch": 3.05939788445891, + "grad_norm": 1.254024605378754, + "learning_rate": 1.5297582145071297e-07, + "loss": 0.9979, + "step": 39480 + }, + { + "epoch": 3.060172807935216, + "grad_norm": 1.2941056127188895, + "learning_rate": 1.5301456912585247e-07, + "loss": 0.9971, + "step": 39490 + }, + { + "epoch": 3.060947731411523, + "grad_norm": 1.3236420053913527, + "learning_rate": 1.5305331680099196e-07, + "loss": 0.9939, + "step": 39500 + }, + { + "epoch": 3.060947731411523, + "eval_loss": 1.002983570098877, + "eval_runtime": 320.8014, + "eval_samples_per_second": 35.757, + "eval_steps_per_second": 8.94, + "step": 39500 + }, + { + "epoch": 3.06172265488783, + "grad_norm": 1.2952042419154874, + "learning_rate": 1.5309206447613143e-07, + "loss": 1.0165, + "step": 39510 + }, + { + "epoch": 3.0624975783641366, + "grad_norm": 1.2923780403978447, + "learning_rate": 1.5313081215127093e-07, + "loss": 0.9991, + "step": 39520 + }, + { + "epoch": 3.0632725018404434, + "grad_norm": 1.3477447882142841, + "learning_rate": 1.5316955982641042e-07, + "loss": 1.0074, + "step": 39530 + }, + { + "epoch": 3.06404742531675, + "grad_norm": 1.3498298306402114, + "learning_rate": 1.5320830750154992e-07, + "loss": 0.9826, + "step": 39540 + }, + { + "epoch": 3.0648223487930566, + "grad_norm": 1.3736089320899496, + "learning_rate": 1.5324705517668942e-07, + "loss": 1.0295, + "step": 39550 + }, + { + "epoch": 3.0655972722693634, + "grad_norm": 1.3632652050244027, + "learning_rate": 1.532858028518289e-07, + "loss": 0.9984, + "step": 39560 + }, + { + "epoch": 3.06637219574567, + "grad_norm": 1.5745581239613582, + "learning_rate": 1.533245505269684e-07, + "loss": 1.038, + "step": 39570 + }, + { + "epoch": 3.067147119221977, + "grad_norm": 1.3453733828926906, + "learning_rate": 1.5336329820210788e-07, + "loss": 0.9945, + "step": 39580 + }, + { + "epoch": 3.0679220426982834, + "grad_norm": 1.3294566334821323, + "learning_rate": 1.5340204587724737e-07, + "loss": 0.991, + "step": 39590 + }, + { + "epoch": 3.06869696617459, + "grad_norm": 1.2346811573365604, + "learning_rate": 1.5344079355238687e-07, + "loss": 0.9815, + "step": 39600 + }, + { + "epoch": 3.069471889650897, + "grad_norm": 1.3772401668765668, + "learning_rate": 1.5347954122752636e-07, + "loss": 1.0137, + "step": 39610 + }, + { + "epoch": 3.070246813127204, + "grad_norm": 1.3818672401284362, + "learning_rate": 1.5351828890266586e-07, + "loss": 1.0135, + "step": 39620 + }, + { + "epoch": 3.07102173660351, + "grad_norm": 1.2732984362916955, + "learning_rate": 1.5355703657780535e-07, + "loss": 0.9965, + "step": 39630 + }, + { + "epoch": 3.071796660079817, + "grad_norm": 1.352697275113681, + "learning_rate": 1.5359578425294485e-07, + "loss": 1.0043, + "step": 39640 + }, + { + "epoch": 3.072571583556124, + "grad_norm": 1.4111194506549047, + "learning_rate": 1.5363453192808432e-07, + "loss": 1.0116, + "step": 39650 + }, + { + "epoch": 3.0733465070324306, + "grad_norm": 1.3675988991680141, + "learning_rate": 1.536732796032238e-07, + "loss": 1.018, + "step": 39660 + }, + { + "epoch": 3.0741214305087374, + "grad_norm": 1.3826509246925678, + "learning_rate": 1.537120272783633e-07, + "loss": 1.0195, + "step": 39670 + }, + { + "epoch": 3.074896353985044, + "grad_norm": 1.3493459305123974, + "learning_rate": 1.537507749535028e-07, + "loss": 0.967, + "step": 39680 + }, + { + "epoch": 3.0756712774613506, + "grad_norm": 1.3813872701665357, + "learning_rate": 1.537895226286423e-07, + "loss": 0.9987, + "step": 39690 + }, + { + "epoch": 3.0764462009376574, + "grad_norm": 1.369173851101332, + "learning_rate": 1.538282703037818e-07, + "loss": 0.9936, + "step": 39700 + }, + { + "epoch": 3.0772211244139642, + "grad_norm": 1.304197053207633, + "learning_rate": 1.538670179789213e-07, + "loss": 1.0109, + "step": 39710 + }, + { + "epoch": 3.077996047890271, + "grad_norm": 1.3909412562574288, + "learning_rate": 1.5390576565406076e-07, + "loss": 0.982, + "step": 39720 + }, + { + "epoch": 3.0787709713665774, + "grad_norm": 1.2824171814821406, + "learning_rate": 1.5394451332920025e-07, + "loss": 0.9924, + "step": 39730 + }, + { + "epoch": 3.079545894842884, + "grad_norm": 1.288852879075426, + "learning_rate": 1.5398326100433975e-07, + "loss": 0.995, + "step": 39740 + }, + { + "epoch": 3.080320818319191, + "grad_norm": 1.3558837723320727, + "learning_rate": 1.5402200867947925e-07, + "loss": 1.0118, + "step": 39750 + }, + { + "epoch": 3.081095741795498, + "grad_norm": 1.3217591523233092, + "learning_rate": 1.5406075635461874e-07, + "loss": 1.0017, + "step": 39760 + }, + { + "epoch": 3.081870665271804, + "grad_norm": 1.3259945324814264, + "learning_rate": 1.5409950402975824e-07, + "loss": 0.9702, + "step": 39770 + }, + { + "epoch": 3.082645588748111, + "grad_norm": 1.2797515996523336, + "learning_rate": 1.541382517048977e-07, + "loss": 0.9995, + "step": 39780 + }, + { + "epoch": 3.083420512224418, + "grad_norm": 1.3491958659369572, + "learning_rate": 1.541769993800372e-07, + "loss": 1.0017, + "step": 39790 + }, + { + "epoch": 3.0841954357007246, + "grad_norm": 1.3199912522802741, + "learning_rate": 1.542157470551767e-07, + "loss": 1.0119, + "step": 39800 + }, + { + "epoch": 3.0849703591770314, + "grad_norm": 1.3610855264582677, + "learning_rate": 1.542544947303162e-07, + "loss": 0.9908, + "step": 39810 + }, + { + "epoch": 3.085745282653338, + "grad_norm": 1.400503285626818, + "learning_rate": 1.542932424054557e-07, + "loss": 1.0181, + "step": 39820 + }, + { + "epoch": 3.0865202061296446, + "grad_norm": 1.2959192084371383, + "learning_rate": 1.5433199008059518e-07, + "loss": 0.991, + "step": 39830 + }, + { + "epoch": 3.0872951296059514, + "grad_norm": 1.242902963166855, + "learning_rate": 1.5437073775573468e-07, + "loss": 0.9866, + "step": 39840 + }, + { + "epoch": 3.0880700530822582, + "grad_norm": 1.3142171587393987, + "learning_rate": 1.5440948543087415e-07, + "loss": 1.0084, + "step": 39850 + }, + { + "epoch": 3.088844976558565, + "grad_norm": 1.3209046556045891, + "learning_rate": 1.5444823310601364e-07, + "loss": 1.0035, + "step": 39860 + }, + { + "epoch": 3.0896199000348714, + "grad_norm": 1.3328739894672292, + "learning_rate": 1.5448698078115314e-07, + "loss": 0.9897, + "step": 39870 + }, + { + "epoch": 3.0903948235111782, + "grad_norm": 1.395295131790268, + "learning_rate": 1.5452572845629263e-07, + "loss": 1.0012, + "step": 39880 + }, + { + "epoch": 3.091169746987485, + "grad_norm": 1.4003249665628357, + "learning_rate": 1.5456447613143213e-07, + "loss": 0.9989, + "step": 39890 + }, + { + "epoch": 3.091944670463792, + "grad_norm": 1.3055341890068513, + "learning_rate": 1.5460322380657163e-07, + "loss": 1.0003, + "step": 39900 + }, + { + "epoch": 3.092719593940098, + "grad_norm": 1.3667863882989795, + "learning_rate": 1.5464197148171112e-07, + "loss": 1.025, + "step": 39910 + }, + { + "epoch": 3.093494517416405, + "grad_norm": 1.3053868454835635, + "learning_rate": 1.546807191568506e-07, + "loss": 0.9943, + "step": 39920 + }, + { + "epoch": 3.094269440892712, + "grad_norm": 1.3613153925071186, + "learning_rate": 1.5471946683199009e-07, + "loss": 1.0194, + "step": 39930 + }, + { + "epoch": 3.0950443643690186, + "grad_norm": 1.2469454758454055, + "learning_rate": 1.5475821450712958e-07, + "loss": 0.9996, + "step": 39940 + }, + { + "epoch": 3.0958192878453255, + "grad_norm": 1.3376100731317078, + "learning_rate": 1.5479696218226908e-07, + "loss": 1.0058, + "step": 39950 + }, + { + "epoch": 3.096594211321632, + "grad_norm": 1.3146149972888006, + "learning_rate": 1.5483570985740857e-07, + "loss": 1.0227, + "step": 39960 + }, + { + "epoch": 3.0973691347979386, + "grad_norm": 1.441518032293385, + "learning_rate": 1.5487445753254807e-07, + "loss": 1.027, + "step": 39970 + }, + { + "epoch": 3.0981440582742454, + "grad_norm": 1.3377985944231379, + "learning_rate": 1.5491320520768756e-07, + "loss": 0.9987, + "step": 39980 + }, + { + "epoch": 3.0989189817505522, + "grad_norm": 1.2498296338003587, + "learning_rate": 1.5495195288282703e-07, + "loss": 0.9913, + "step": 39990 + }, + { + "epoch": 3.099693905226859, + "grad_norm": 1.3676917583555386, + "learning_rate": 1.5499070055796653e-07, + "loss": 0.9873, + "step": 40000 + }, + { + "epoch": 3.099693905226859, + "eval_loss": 1.0015895366668701, + "eval_runtime": 320.8498, + "eval_samples_per_second": 35.752, + "eval_steps_per_second": 8.939, + "step": 40000 + }, + { + "epoch": 3.1004688287031654, + "grad_norm": 1.3277329356614882, + "learning_rate": 1.5502944823310602e-07, + "loss": 0.9935, + "step": 40010 + }, + { + "epoch": 3.1012437521794722, + "grad_norm": 1.275257013153329, + "learning_rate": 1.5506819590824552e-07, + "loss": 1.0014, + "step": 40020 + }, + { + "epoch": 3.102018675655779, + "grad_norm": 1.2638054028363541, + "learning_rate": 1.5510694358338501e-07, + "loss": 0.9993, + "step": 40030 + }, + { + "epoch": 3.102793599132086, + "grad_norm": 1.2732318178625406, + "learning_rate": 1.551456912585245e-07, + "loss": 1.0116, + "step": 40040 + }, + { + "epoch": 3.103568522608392, + "grad_norm": 1.3980958576496587, + "learning_rate": 1.55184438933664e-07, + "loss": 0.9942, + "step": 40050 + }, + { + "epoch": 3.104343446084699, + "grad_norm": 1.368656064827309, + "learning_rate": 1.5522318660880347e-07, + "loss": 1.0281, + "step": 40060 + }, + { + "epoch": 3.105118369561006, + "grad_norm": 1.4093788131704679, + "learning_rate": 1.5526193428394297e-07, + "loss": 0.9997, + "step": 40070 + }, + { + "epoch": 3.1058932930373127, + "grad_norm": 1.3175743622510676, + "learning_rate": 1.5530068195908247e-07, + "loss": 1.0042, + "step": 40080 + }, + { + "epoch": 3.1066682165136195, + "grad_norm": 1.3065746288179165, + "learning_rate": 1.5533942963422196e-07, + "loss": 0.9906, + "step": 40090 + }, + { + "epoch": 3.107443139989926, + "grad_norm": 1.3122163281277206, + "learning_rate": 1.5537817730936146e-07, + "loss": 0.9878, + "step": 40100 + }, + { + "epoch": 3.1082180634662326, + "grad_norm": 1.305537884835301, + "learning_rate": 1.5541692498450095e-07, + "loss": 0.991, + "step": 40110 + }, + { + "epoch": 3.1089929869425394, + "grad_norm": 1.4195542871239069, + "learning_rate": 1.5545567265964045e-07, + "loss": 1.0128, + "step": 40120 + }, + { + "epoch": 3.1097679104188463, + "grad_norm": 1.2305504443897177, + "learning_rate": 1.5549442033477992e-07, + "loss": 0.9984, + "step": 40130 + }, + { + "epoch": 3.1105428338951526, + "grad_norm": 1.2869216341748129, + "learning_rate": 1.555331680099194e-07, + "loss": 0.9937, + "step": 40140 + }, + { + "epoch": 3.1113177573714594, + "grad_norm": 1.3549533358218127, + "learning_rate": 1.555719156850589e-07, + "loss": 0.9919, + "step": 40150 + }, + { + "epoch": 3.1120926808477662, + "grad_norm": 1.2684836534755424, + "learning_rate": 1.556106633601984e-07, + "loss": 0.9927, + "step": 40160 + }, + { + "epoch": 3.112867604324073, + "grad_norm": 1.3467123669343668, + "learning_rate": 1.556494110353379e-07, + "loss": 0.9928, + "step": 40170 + }, + { + "epoch": 3.11364252780038, + "grad_norm": 1.3799059896120367, + "learning_rate": 1.556881587104774e-07, + "loss": 1.0039, + "step": 40180 + }, + { + "epoch": 3.1144174512766862, + "grad_norm": 1.392825956225549, + "learning_rate": 1.5572690638561686e-07, + "loss": 0.9836, + "step": 40190 + }, + { + "epoch": 3.115192374752993, + "grad_norm": 1.339065878002435, + "learning_rate": 1.5576565406075636e-07, + "loss": 1.0039, + "step": 40200 + }, + { + "epoch": 3.1159672982293, + "grad_norm": 1.4044432246980798, + "learning_rate": 1.5580440173589585e-07, + "loss": 1.0026, + "step": 40210 + }, + { + "epoch": 3.1167422217056067, + "grad_norm": 1.3303526619101869, + "learning_rate": 1.5584314941103535e-07, + "loss": 0.9797, + "step": 40220 + }, + { + "epoch": 3.1175171451819135, + "grad_norm": 1.2774658050789618, + "learning_rate": 1.5588189708617485e-07, + "loss": 0.9862, + "step": 40230 + }, + { + "epoch": 3.11829206865822, + "grad_norm": 1.3440688901981381, + "learning_rate": 1.5592064476131434e-07, + "loss": 1.0085, + "step": 40240 + }, + { + "epoch": 3.1190669921345267, + "grad_norm": 1.287881246628204, + "learning_rate": 1.5595939243645384e-07, + "loss": 0.9985, + "step": 40250 + }, + { + "epoch": 3.1198419156108335, + "grad_norm": 1.328105399751015, + "learning_rate": 1.559981401115933e-07, + "loss": 0.982, + "step": 40260 + }, + { + "epoch": 3.1206168390871403, + "grad_norm": 1.2687574786369458, + "learning_rate": 1.560368877867328e-07, + "loss": 1.0417, + "step": 40270 + }, + { + "epoch": 3.121391762563447, + "grad_norm": 1.3515066180011754, + "learning_rate": 1.560756354618723e-07, + "loss": 1.0034, + "step": 40280 + }, + { + "epoch": 3.1221666860397534, + "grad_norm": 1.271654773419362, + "learning_rate": 1.561143831370118e-07, + "loss": 0.9868, + "step": 40290 + }, + { + "epoch": 3.1229416095160603, + "grad_norm": 1.3860125692339702, + "learning_rate": 1.561531308121513e-07, + "loss": 0.9979, + "step": 40300 + }, + { + "epoch": 3.123716532992367, + "grad_norm": 1.3163000455726184, + "learning_rate": 1.5619187848729078e-07, + "loss": 1.0163, + "step": 40310 + }, + { + "epoch": 3.124491456468674, + "grad_norm": 1.3834575653136294, + "learning_rate": 1.5623062616243028e-07, + "loss": 1.0031, + "step": 40320 + }, + { + "epoch": 3.1252663799449802, + "grad_norm": 1.2975336806184823, + "learning_rate": 1.5626937383756975e-07, + "loss": 0.9917, + "step": 40330 + }, + { + "epoch": 3.126041303421287, + "grad_norm": 1.3153947035970641, + "learning_rate": 1.5630812151270924e-07, + "loss": 0.9976, + "step": 40340 + }, + { + "epoch": 3.126816226897594, + "grad_norm": 1.4375250688560655, + "learning_rate": 1.5634686918784874e-07, + "loss": 0.9846, + "step": 40350 + }, + { + "epoch": 3.1275911503739007, + "grad_norm": 1.350291703402501, + "learning_rate": 1.5638561686298823e-07, + "loss": 1.0041, + "step": 40360 + }, + { + "epoch": 3.1283660738502075, + "grad_norm": 1.3837476216612405, + "learning_rate": 1.5642436453812773e-07, + "loss": 1.0009, + "step": 40370 + }, + { + "epoch": 3.129140997326514, + "grad_norm": 1.3940978027532362, + "learning_rate": 1.5646311221326722e-07, + "loss": 1.008, + "step": 40380 + }, + { + "epoch": 3.1299159208028207, + "grad_norm": 1.3369488037240709, + "learning_rate": 1.5650185988840672e-07, + "loss": 1.004, + "step": 40390 + }, + { + "epoch": 3.1306908442791275, + "grad_norm": 1.3641851440734898, + "learning_rate": 1.565406075635462e-07, + "loss": 0.9888, + "step": 40400 + }, + { + "epoch": 3.1314657677554343, + "grad_norm": 1.3170244700403084, + "learning_rate": 1.5657935523868568e-07, + "loss": 1.0075, + "step": 40410 + }, + { + "epoch": 3.1322406912317406, + "grad_norm": 1.3022606587344088, + "learning_rate": 1.5661810291382518e-07, + "loss": 0.9828, + "step": 40420 + }, + { + "epoch": 3.1330156147080475, + "grad_norm": 1.3227611661568377, + "learning_rate": 1.5665685058896468e-07, + "loss": 0.9957, + "step": 40430 + }, + { + "epoch": 3.1337905381843543, + "grad_norm": 1.3350429813617097, + "learning_rate": 1.5669559826410417e-07, + "loss": 0.9961, + "step": 40440 + }, + { + "epoch": 3.134565461660661, + "grad_norm": 1.4098166441171296, + "learning_rate": 1.5673434593924367e-07, + "loss": 0.9913, + "step": 40450 + }, + { + "epoch": 3.135340385136968, + "grad_norm": 1.3177977301730188, + "learning_rate": 1.5677309361438316e-07, + "loss": 0.985, + "step": 40460 + }, + { + "epoch": 3.1361153086132743, + "grad_norm": 1.397711061865154, + "learning_rate": 1.5681184128952263e-07, + "loss": 1.0171, + "step": 40470 + }, + { + "epoch": 3.136890232089581, + "grad_norm": 1.3166606594138983, + "learning_rate": 1.5685058896466213e-07, + "loss": 0.9652, + "step": 40480 + }, + { + "epoch": 3.137665155565888, + "grad_norm": 1.3782281901106086, + "learning_rate": 1.5688933663980162e-07, + "loss": 0.9878, + "step": 40490 + }, + { + "epoch": 3.1384400790421947, + "grad_norm": 1.340736228085376, + "learning_rate": 1.5692808431494112e-07, + "loss": 0.9998, + "step": 40500 + }, + { + "epoch": 3.1384400790421947, + "eval_loss": 1.0002923011779785, + "eval_runtime": 319.6777, + "eval_samples_per_second": 35.883, + "eval_steps_per_second": 8.972, + "step": 40500 + }, + { + "epoch": 3.1392150025185015, + "grad_norm": 1.3303340127435355, + "learning_rate": 1.5696683199008061e-07, + "loss": 1.0114, + "step": 40510 + }, + { + "epoch": 3.139989925994808, + "grad_norm": 1.429361813080295, + "learning_rate": 1.570055796652201e-07, + "loss": 1.0086, + "step": 40520 + }, + { + "epoch": 3.1407648494711147, + "grad_norm": 1.3247640858609697, + "learning_rate": 1.5704432734035958e-07, + "loss": 1.0107, + "step": 40530 + }, + { + "epoch": 3.1415397729474215, + "grad_norm": 1.280904063785761, + "learning_rate": 1.5708307501549907e-07, + "loss": 1.0026, + "step": 40540 + }, + { + "epoch": 3.1423146964237283, + "grad_norm": 1.3105443150019997, + "learning_rate": 1.5712182269063857e-07, + "loss": 0.9749, + "step": 40550 + }, + { + "epoch": 3.143089619900035, + "grad_norm": 1.2984609514615018, + "learning_rate": 1.5716057036577806e-07, + "loss": 1.0067, + "step": 40560 + }, + { + "epoch": 3.1438645433763415, + "grad_norm": 1.338763156917728, + "learning_rate": 1.5719931804091756e-07, + "loss": 1.0357, + "step": 40570 + }, + { + "epoch": 3.1446394668526483, + "grad_norm": 1.319481527245818, + "learning_rate": 1.5723806571605706e-07, + "loss": 0.9959, + "step": 40580 + }, + { + "epoch": 3.145414390328955, + "grad_norm": 1.3602845581990195, + "learning_rate": 1.5727681339119655e-07, + "loss": 0.9834, + "step": 40590 + }, + { + "epoch": 3.146189313805262, + "grad_norm": 1.307234465171929, + "learning_rate": 1.5731556106633602e-07, + "loss": 0.9852, + "step": 40600 + }, + { + "epoch": 3.1469642372815683, + "grad_norm": 1.4309364590581921, + "learning_rate": 1.5735430874147552e-07, + "loss": 0.9964, + "step": 40610 + }, + { + "epoch": 3.147739160757875, + "grad_norm": 1.374517087186801, + "learning_rate": 1.57393056416615e-07, + "loss": 0.9773, + "step": 40620 + }, + { + "epoch": 3.148514084234182, + "grad_norm": 1.2942844848961566, + "learning_rate": 1.574318040917545e-07, + "loss": 1.0122, + "step": 40630 + }, + { + "epoch": 3.1492890077104887, + "grad_norm": 1.420319722894449, + "learning_rate": 1.57470551766894e-07, + "loss": 1.0105, + "step": 40640 + }, + { + "epoch": 3.1500639311867955, + "grad_norm": 1.3135055120108938, + "learning_rate": 1.575092994420335e-07, + "loss": 1.0008, + "step": 40650 + }, + { + "epoch": 3.150838854663102, + "grad_norm": 1.322316750747239, + "learning_rate": 1.57548047117173e-07, + "loss": 1.0055, + "step": 40660 + }, + { + "epoch": 3.1516137781394087, + "grad_norm": 1.362301764050575, + "learning_rate": 1.5758679479231246e-07, + "loss": 0.9942, + "step": 40670 + }, + { + "epoch": 3.1523887016157155, + "grad_norm": 1.3878631691865198, + "learning_rate": 1.5762554246745196e-07, + "loss": 0.9988, + "step": 40680 + }, + { + "epoch": 3.1531636250920223, + "grad_norm": 1.4186351615830288, + "learning_rate": 1.5766429014259145e-07, + "loss": 0.9955, + "step": 40690 + }, + { + "epoch": 3.1539385485683287, + "grad_norm": 1.2913586900085086, + "learning_rate": 1.5770303781773095e-07, + "loss": 1.0061, + "step": 40700 + }, + { + "epoch": 3.1547134720446355, + "grad_norm": 1.4254664198622815, + "learning_rate": 1.5774178549287044e-07, + "loss": 0.9987, + "step": 40710 + }, + { + "epoch": 3.1554883955209423, + "grad_norm": 1.3284425822859625, + "learning_rate": 1.5778053316800994e-07, + "loss": 1.0234, + "step": 40720 + }, + { + "epoch": 3.156263318997249, + "grad_norm": 1.3300687032967167, + "learning_rate": 1.5781928084314944e-07, + "loss": 1.0276, + "step": 40730 + }, + { + "epoch": 3.157038242473556, + "grad_norm": 1.3305015573005308, + "learning_rate": 1.578580285182889e-07, + "loss": 1.0004, + "step": 40740 + }, + { + "epoch": 3.1578131659498623, + "grad_norm": 1.2442868542052947, + "learning_rate": 1.578967761934284e-07, + "loss": 0.993, + "step": 40750 + }, + { + "epoch": 3.158588089426169, + "grad_norm": 1.254980828341887, + "learning_rate": 1.579355238685679e-07, + "loss": 1.0071, + "step": 40760 + }, + { + "epoch": 3.159363012902476, + "grad_norm": 1.3317934396334625, + "learning_rate": 1.579742715437074e-07, + "loss": 1.0046, + "step": 40770 + }, + { + "epoch": 3.1601379363787827, + "grad_norm": 1.4252259430037688, + "learning_rate": 1.5801301921884689e-07, + "loss": 0.9954, + "step": 40780 + }, + { + "epoch": 3.160912859855089, + "grad_norm": 1.3209850353307986, + "learning_rate": 1.5805176689398638e-07, + "loss": 1.0333, + "step": 40790 + }, + { + "epoch": 3.161687783331396, + "grad_norm": 1.3082776770219053, + "learning_rate": 1.5809051456912588e-07, + "loss": 0.9886, + "step": 40800 + }, + { + "epoch": 3.1624627068077027, + "grad_norm": 1.2975318642239384, + "learning_rate": 1.5812926224426535e-07, + "loss": 0.998, + "step": 40810 + }, + { + "epoch": 3.1632376302840095, + "grad_norm": 1.3546881638893402, + "learning_rate": 1.5816800991940484e-07, + "loss": 0.9985, + "step": 40820 + }, + { + "epoch": 3.1640125537603163, + "grad_norm": 1.3537065412449958, + "learning_rate": 1.5820675759454434e-07, + "loss": 1.0068, + "step": 40830 + }, + { + "epoch": 3.1647874772366227, + "grad_norm": 1.2684161525584567, + "learning_rate": 1.5824550526968383e-07, + "loss": 1.009, + "step": 40840 + }, + { + "epoch": 3.1655624007129295, + "grad_norm": 1.3336757074274326, + "learning_rate": 1.5828425294482333e-07, + "loss": 0.9994, + "step": 40850 + }, + { + "epoch": 3.1663373241892363, + "grad_norm": 1.2164106144838307, + "learning_rate": 1.5832300061996282e-07, + "loss": 1.0068, + "step": 40860 + }, + { + "epoch": 3.167112247665543, + "grad_norm": 1.2683572218514627, + "learning_rate": 1.5836174829510232e-07, + "loss": 1.005, + "step": 40870 + }, + { + "epoch": 3.16788717114185, + "grad_norm": 1.3597944861709674, + "learning_rate": 1.584004959702418e-07, + "loss": 0.9904, + "step": 40880 + }, + { + "epoch": 3.1686620946181563, + "grad_norm": 1.3802786051620732, + "learning_rate": 1.5843924364538128e-07, + "loss": 1.0042, + "step": 40890 + }, + { + "epoch": 3.169437018094463, + "grad_norm": 1.3322673060761614, + "learning_rate": 1.5847799132052078e-07, + "loss": 0.9894, + "step": 40900 + }, + { + "epoch": 3.17021194157077, + "grad_norm": 1.2632562461080528, + "learning_rate": 1.5851673899566028e-07, + "loss": 0.9846, + "step": 40910 + }, + { + "epoch": 3.1709868650470767, + "grad_norm": 1.2852141868819036, + "learning_rate": 1.5855548667079977e-07, + "loss": 0.9771, + "step": 40920 + }, + { + "epoch": 3.1717617885233835, + "grad_norm": 1.346343566291815, + "learning_rate": 1.5859423434593927e-07, + "loss": 1.0244, + "step": 40930 + }, + { + "epoch": 3.17253671199969, + "grad_norm": 1.3373233889605478, + "learning_rate": 1.5863298202107874e-07, + "loss": 0.9953, + "step": 40940 + }, + { + "epoch": 3.1733116354759967, + "grad_norm": 1.3035247170987203, + "learning_rate": 1.5867172969621823e-07, + "loss": 0.9916, + "step": 40950 + }, + { + "epoch": 3.1740865589523035, + "grad_norm": 1.3510004687998651, + "learning_rate": 1.5871047737135773e-07, + "loss": 1.0049, + "step": 40960 + }, + { + "epoch": 3.1748614824286103, + "grad_norm": 1.3134206760541174, + "learning_rate": 1.5874922504649722e-07, + "loss": 1.0029, + "step": 40970 + }, + { + "epoch": 3.1756364059049167, + "grad_norm": 1.3315426285825374, + "learning_rate": 1.5878797272163672e-07, + "loss": 0.9984, + "step": 40980 + }, + { + "epoch": 3.1764113293812235, + "grad_norm": 1.4047691706396424, + "learning_rate": 1.588267203967762e-07, + "loss": 1.012, + "step": 40990 + }, + { + "epoch": 3.1771862528575303, + "grad_norm": 1.3088659244010732, + "learning_rate": 1.588654680719157e-07, + "loss": 0.9881, + "step": 41000 + }, + { + "epoch": 3.1771862528575303, + "eval_loss": 0.9989365339279175, + "eval_runtime": 319.972, + "eval_samples_per_second": 35.85, + "eval_steps_per_second": 8.963, + "step": 41000 + }, + { + "epoch": 3.177961176333837, + "grad_norm": 1.3237768802285836, + "learning_rate": 1.5890421574705518e-07, + "loss": 0.9692, + "step": 41010 + }, + { + "epoch": 3.178736099810144, + "grad_norm": 1.2923939622382394, + "learning_rate": 1.5894296342219467e-07, + "loss": 0.9977, + "step": 41020 + }, + { + "epoch": 3.1795110232864503, + "grad_norm": 1.2809328417134251, + "learning_rate": 1.5898171109733417e-07, + "loss": 0.9991, + "step": 41030 + }, + { + "epoch": 3.180285946762757, + "grad_norm": 1.354333732465566, + "learning_rate": 1.5902045877247366e-07, + "loss": 1.004, + "step": 41040 + }, + { + "epoch": 3.181060870239064, + "grad_norm": 1.290915197510589, + "learning_rate": 1.5905920644761316e-07, + "loss": 1.0164, + "step": 41050 + }, + { + "epoch": 3.1818357937153707, + "grad_norm": 1.3591002744018306, + "learning_rate": 1.5909795412275265e-07, + "loss": 0.9969, + "step": 41060 + }, + { + "epoch": 3.182610717191677, + "grad_norm": 1.4411227232295076, + "learning_rate": 1.5913670179789215e-07, + "loss": 1.0064, + "step": 41070 + }, + { + "epoch": 3.183385640667984, + "grad_norm": 1.2659477385263438, + "learning_rate": 1.5917544947303162e-07, + "loss": 0.9939, + "step": 41080 + }, + { + "epoch": 3.1841605641442907, + "grad_norm": 1.3173210440370289, + "learning_rate": 1.5921419714817111e-07, + "loss": 0.9889, + "step": 41090 + }, + { + "epoch": 3.1849354876205975, + "grad_norm": 1.365096593850652, + "learning_rate": 1.592529448233106e-07, + "loss": 1.0077, + "step": 41100 + }, + { + "epoch": 3.1857104110969043, + "grad_norm": 1.2975733493771182, + "learning_rate": 1.592916924984501e-07, + "loss": 0.9812, + "step": 41110 + }, + { + "epoch": 3.1864853345732107, + "grad_norm": 1.3101519989048276, + "learning_rate": 1.593304401735896e-07, + "loss": 0.9986, + "step": 41120 + }, + { + "epoch": 3.1872602580495175, + "grad_norm": 1.294683975544655, + "learning_rate": 1.593691878487291e-07, + "loss": 1.0011, + "step": 41130 + }, + { + "epoch": 3.1880351815258243, + "grad_norm": 1.4703170924067286, + "learning_rate": 1.594079355238686e-07, + "loss": 0.9938, + "step": 41140 + }, + { + "epoch": 3.188810105002131, + "grad_norm": 1.3533473357334367, + "learning_rate": 1.5944668319900806e-07, + "loss": 1.0095, + "step": 41150 + }, + { + "epoch": 3.189585028478438, + "grad_norm": 1.330399431811177, + "learning_rate": 1.5948543087414756e-07, + "loss": 0.9965, + "step": 41160 + }, + { + "epoch": 3.1903599519547443, + "grad_norm": 1.2662407800861892, + "learning_rate": 1.5952417854928705e-07, + "loss": 0.9925, + "step": 41170 + }, + { + "epoch": 3.191134875431051, + "grad_norm": 1.401017504341956, + "learning_rate": 1.5956292622442655e-07, + "loss": 0.9933, + "step": 41180 + }, + { + "epoch": 3.191909798907358, + "grad_norm": 1.3229618340375633, + "learning_rate": 1.5960167389956604e-07, + "loss": 1.0046, + "step": 41190 + }, + { + "epoch": 3.1926847223836647, + "grad_norm": 1.2801739650894899, + "learning_rate": 1.5964042157470554e-07, + "loss": 1.0054, + "step": 41200 + }, + { + "epoch": 3.1934596458599716, + "grad_norm": 1.332986861835143, + "learning_rate": 1.5967916924984503e-07, + "loss": 0.9926, + "step": 41210 + }, + { + "epoch": 3.194234569336278, + "grad_norm": 1.3792228345940878, + "learning_rate": 1.597179169249845e-07, + "loss": 0.9901, + "step": 41220 + }, + { + "epoch": 3.1950094928125847, + "grad_norm": 1.4866172974124896, + "learning_rate": 1.59756664600124e-07, + "loss": 0.9914, + "step": 41230 + }, + { + "epoch": 3.1957844162888915, + "grad_norm": 1.2667957710993323, + "learning_rate": 1.597954122752635e-07, + "loss": 0.9829, + "step": 41240 + }, + { + "epoch": 3.1965593397651983, + "grad_norm": 1.3032832536755503, + "learning_rate": 1.59834159950403e-07, + "loss": 1.0061, + "step": 41250 + }, + { + "epoch": 3.1973342632415047, + "grad_norm": 1.3571025143425306, + "learning_rate": 1.5987290762554249e-07, + "loss": 0.9931, + "step": 41260 + }, + { + "epoch": 3.1981091867178115, + "grad_norm": 1.3374485702753527, + "learning_rate": 1.5991165530068198e-07, + "loss": 0.9835, + "step": 41270 + }, + { + "epoch": 3.1988841101941183, + "grad_norm": 1.3453843370115424, + "learning_rate": 1.5995040297582145e-07, + "loss": 0.9984, + "step": 41280 + }, + { + "epoch": 3.199659033670425, + "grad_norm": 1.3667108686811091, + "learning_rate": 1.5998915065096095e-07, + "loss": 0.9986, + "step": 41290 + }, + { + "epoch": 3.200433957146732, + "grad_norm": 1.229530648843783, + "learning_rate": 1.6002789832610044e-07, + "loss": 0.9944, + "step": 41300 + }, + { + "epoch": 3.2012088806230383, + "grad_norm": 1.3839638725558168, + "learning_rate": 1.6006664600123994e-07, + "loss": 1.0026, + "step": 41310 + }, + { + "epoch": 3.201983804099345, + "grad_norm": 1.3428613562286122, + "learning_rate": 1.6010539367637943e-07, + "loss": 0.975, + "step": 41320 + }, + { + "epoch": 3.202758727575652, + "grad_norm": 1.4789819040894485, + "learning_rate": 1.6014414135151893e-07, + "loss": 0.981, + "step": 41330 + }, + { + "epoch": 3.2035336510519588, + "grad_norm": 1.3358891028958026, + "learning_rate": 1.6018288902665842e-07, + "loss": 1.0006, + "step": 41340 + }, + { + "epoch": 3.204308574528265, + "grad_norm": 1.3000503708377855, + "learning_rate": 1.602216367017979e-07, + "loss": 0.9951, + "step": 41350 + }, + { + "epoch": 3.205083498004572, + "grad_norm": 1.297216415711647, + "learning_rate": 1.602603843769374e-07, + "loss": 1.0093, + "step": 41360 + }, + { + "epoch": 3.2058584214808787, + "grad_norm": 1.364031487672096, + "learning_rate": 1.6029913205207688e-07, + "loss": 1.0091, + "step": 41370 + }, + { + "epoch": 3.2066333449571855, + "grad_norm": 1.3150800849462478, + "learning_rate": 1.6033787972721638e-07, + "loss": 0.9927, + "step": 41380 + }, + { + "epoch": 3.2074082684334924, + "grad_norm": 1.3006321701311585, + "learning_rate": 1.6037662740235587e-07, + "loss": 0.9765, + "step": 41390 + }, + { + "epoch": 3.2081831919097987, + "grad_norm": 1.2787430294173558, + "learning_rate": 1.6041537507749537e-07, + "loss": 0.9943, + "step": 41400 + }, + { + "epoch": 3.2089581153861055, + "grad_norm": 1.2713938039130535, + "learning_rate": 1.6045412275263487e-07, + "loss": 0.979, + "step": 41410 + }, + { + "epoch": 3.2097330388624123, + "grad_norm": 1.229140158184314, + "learning_rate": 1.6049287042777433e-07, + "loss": 0.9534, + "step": 41420 + }, + { + "epoch": 3.210507962338719, + "grad_norm": 1.2781057217925835, + "learning_rate": 1.6053161810291383e-07, + "loss": 0.9816, + "step": 41430 + }, + { + "epoch": 3.211282885815026, + "grad_norm": 1.321595749611002, + "learning_rate": 1.6057036577805333e-07, + "loss": 0.975, + "step": 41440 + }, + { + "epoch": 3.2120578092913323, + "grad_norm": 1.3493394983635398, + "learning_rate": 1.6060911345319282e-07, + "loss": 1.0117, + "step": 41450 + }, + { + "epoch": 3.212832732767639, + "grad_norm": 1.3160379792422572, + "learning_rate": 1.6064786112833232e-07, + "loss": 0.9869, + "step": 41460 + }, + { + "epoch": 3.213607656243946, + "grad_norm": 1.2936692767523321, + "learning_rate": 1.606866088034718e-07, + "loss": 0.9912, + "step": 41470 + }, + { + "epoch": 3.2143825797202528, + "grad_norm": 1.3000908219946528, + "learning_rate": 1.607253564786113e-07, + "loss": 0.9912, + "step": 41480 + }, + { + "epoch": 3.2151575031965596, + "grad_norm": 1.2870112839160157, + "learning_rate": 1.6076410415375078e-07, + "loss": 1.0021, + "step": 41490 + }, + { + "epoch": 3.215932426672866, + "grad_norm": 1.2857449248382715, + "learning_rate": 1.6080285182889027e-07, + "loss": 0.9819, + "step": 41500 + }, + { + "epoch": 3.215932426672866, + "eval_loss": 0.997684121131897, + "eval_runtime": 320.8279, + "eval_samples_per_second": 35.754, + "eval_steps_per_second": 8.939, + "step": 41500 + }, + { + "epoch": 3.2167073501491728, + "grad_norm": 1.3539832370878278, + "learning_rate": 1.6084159950402977e-07, + "loss": 0.997, + "step": 41510 + }, + { + "epoch": 3.2174822736254796, + "grad_norm": 1.3446740263057855, + "learning_rate": 1.6088034717916926e-07, + "loss": 0.9991, + "step": 41520 + }, + { + "epoch": 3.2182571971017864, + "grad_norm": 1.2922234634930214, + "learning_rate": 1.6091909485430876e-07, + "loss": 0.9935, + "step": 41530 + }, + { + "epoch": 3.2190321205780927, + "grad_norm": 1.6089290927692435, + "learning_rate": 1.6095784252944825e-07, + "loss": 1.0126, + "step": 41540 + }, + { + "epoch": 3.2198070440543995, + "grad_norm": 1.3717037168984252, + "learning_rate": 1.6099659020458775e-07, + "loss": 1.0068, + "step": 41550 + }, + { + "epoch": 3.2205819675307064, + "grad_norm": 1.345106374452256, + "learning_rate": 1.6103533787972722e-07, + "loss": 1.0007, + "step": 41560 + }, + { + "epoch": 3.221356891007013, + "grad_norm": 1.4329649061483447, + "learning_rate": 1.6107408555486671e-07, + "loss": 1.0201, + "step": 41570 + }, + { + "epoch": 3.22213181448332, + "grad_norm": 1.224279728686784, + "learning_rate": 1.611128332300062e-07, + "loss": 0.9958, + "step": 41580 + }, + { + "epoch": 3.2229067379596263, + "grad_norm": 1.4016032324716887, + "learning_rate": 1.611515809051457e-07, + "loss": 1.0138, + "step": 41590 + }, + { + "epoch": 3.223681661435933, + "grad_norm": 1.3845812720536428, + "learning_rate": 1.611903285802852e-07, + "loss": 0.984, + "step": 41600 + }, + { + "epoch": 3.22445658491224, + "grad_norm": 1.2475314885180013, + "learning_rate": 1.612290762554247e-07, + "loss": 0.9941, + "step": 41610 + }, + { + "epoch": 3.2252315083885468, + "grad_norm": 1.2792288124751745, + "learning_rate": 1.6126782393056417e-07, + "loss": 0.995, + "step": 41620 + }, + { + "epoch": 3.226006431864853, + "grad_norm": 1.3704181306358756, + "learning_rate": 1.6130657160570366e-07, + "loss": 1.0006, + "step": 41630 + }, + { + "epoch": 3.22678135534116, + "grad_norm": 1.3204548529019924, + "learning_rate": 1.6134531928084316e-07, + "loss": 1.0118, + "step": 41640 + }, + { + "epoch": 3.2275562788174668, + "grad_norm": 1.3001721841686336, + "learning_rate": 1.6138406695598265e-07, + "loss": 0.9892, + "step": 41650 + }, + { + "epoch": 3.2283312022937736, + "grad_norm": 1.2987084177205597, + "learning_rate": 1.6142281463112215e-07, + "loss": 1.003, + "step": 41660 + }, + { + "epoch": 3.2291061257700804, + "grad_norm": 1.2984281870743883, + "learning_rate": 1.6146156230626164e-07, + "loss": 1.0055, + "step": 41670 + }, + { + "epoch": 3.2298810492463867, + "grad_norm": 1.3360422538592844, + "learning_rate": 1.6150030998140114e-07, + "loss": 1.0196, + "step": 41680 + }, + { + "epoch": 3.2306559727226936, + "grad_norm": 1.2963902512312149, + "learning_rate": 1.615390576565406e-07, + "loss": 0.9865, + "step": 41690 + }, + { + "epoch": 3.2314308961990004, + "grad_norm": 1.3664127019763916, + "learning_rate": 1.615778053316801e-07, + "loss": 0.9829, + "step": 41700 + }, + { + "epoch": 3.232205819675307, + "grad_norm": 1.2959083475047681, + "learning_rate": 1.616165530068196e-07, + "loss": 0.9875, + "step": 41710 + }, + { + "epoch": 3.2329807431516135, + "grad_norm": 1.3609675507042216, + "learning_rate": 1.616553006819591e-07, + "loss": 0.9974, + "step": 41720 + }, + { + "epoch": 3.2337556666279204, + "grad_norm": 1.2955989098712406, + "learning_rate": 1.616940483570986e-07, + "loss": 0.9754, + "step": 41730 + }, + { + "epoch": 3.234530590104227, + "grad_norm": 1.255576941958963, + "learning_rate": 1.6173279603223808e-07, + "loss": 0.9955, + "step": 41740 + }, + { + "epoch": 3.235305513580534, + "grad_norm": 1.3366917369308131, + "learning_rate": 1.6177154370737758e-07, + "loss": 0.9847, + "step": 41750 + }, + { + "epoch": 3.236080437056841, + "grad_norm": 1.310175222147636, + "learning_rate": 1.6181029138251705e-07, + "loss": 0.9873, + "step": 41760 + }, + { + "epoch": 3.236855360533147, + "grad_norm": 1.3354495343638693, + "learning_rate": 1.6184903905765654e-07, + "loss": 0.996, + "step": 41770 + }, + { + "epoch": 3.237630284009454, + "grad_norm": 1.3080197831445721, + "learning_rate": 1.6188778673279604e-07, + "loss": 1.0006, + "step": 41780 + }, + { + "epoch": 3.2384052074857608, + "grad_norm": 1.2650706152838036, + "learning_rate": 1.6192653440793554e-07, + "loss": 0.9898, + "step": 41790 + }, + { + "epoch": 3.2391801309620676, + "grad_norm": 1.3407742348402394, + "learning_rate": 1.6196528208307503e-07, + "loss": 0.9854, + "step": 41800 + }, + { + "epoch": 3.2399550544383744, + "grad_norm": 1.3296820024252949, + "learning_rate": 1.6200402975821453e-07, + "loss": 0.9781, + "step": 41810 + }, + { + "epoch": 3.2407299779146808, + "grad_norm": 1.366287800617359, + "learning_rate": 1.6204277743335402e-07, + "loss": 1.0147, + "step": 41820 + }, + { + "epoch": 3.2415049013909876, + "grad_norm": 1.389326356848791, + "learning_rate": 1.620815251084935e-07, + "loss": 0.9761, + "step": 41830 + }, + { + "epoch": 3.2422798248672944, + "grad_norm": 1.3463430420290123, + "learning_rate": 1.62120272783633e-07, + "loss": 1.0077, + "step": 41840 + }, + { + "epoch": 3.243054748343601, + "grad_norm": 1.2933809047365623, + "learning_rate": 1.6215902045877248e-07, + "loss": 0.9955, + "step": 41850 + }, + { + "epoch": 3.243829671819908, + "grad_norm": 1.3674440812752924, + "learning_rate": 1.6219776813391198e-07, + "loss": 0.9873, + "step": 41860 + }, + { + "epoch": 3.2446045952962144, + "grad_norm": 1.338797300123868, + "learning_rate": 1.6223651580905147e-07, + "loss": 0.9834, + "step": 41870 + }, + { + "epoch": 3.245379518772521, + "grad_norm": 1.3487217844902555, + "learning_rate": 1.6227526348419097e-07, + "loss": 0.9874, + "step": 41880 + }, + { + "epoch": 3.246154442248828, + "grad_norm": 1.3954083616765505, + "learning_rate": 1.6231401115933046e-07, + "loss": 0.9827, + "step": 41890 + }, + { + "epoch": 3.246929365725135, + "grad_norm": 1.301305941523163, + "learning_rate": 1.6235275883446993e-07, + "loss": 0.9796, + "step": 41900 + }, + { + "epoch": 3.247704289201441, + "grad_norm": 1.2724210159095841, + "learning_rate": 1.6239150650960943e-07, + "loss": 0.997, + "step": 41910 + }, + { + "epoch": 3.248479212677748, + "grad_norm": 1.3750043420736506, + "learning_rate": 1.6243025418474892e-07, + "loss": 0.9774, + "step": 41920 + }, + { + "epoch": 3.249254136154055, + "grad_norm": 1.3942282189298612, + "learning_rate": 1.6246900185988842e-07, + "loss": 0.9813, + "step": 41930 + }, + { + "epoch": 3.2500290596303616, + "grad_norm": 1.300858658733121, + "learning_rate": 1.6250774953502792e-07, + "loss": 0.9925, + "step": 41940 + }, + { + "epoch": 3.2508039831066684, + "grad_norm": 1.3239768473724947, + "learning_rate": 1.625464972101674e-07, + "loss": 0.9998, + "step": 41950 + }, + { + "epoch": 3.2515789065829748, + "grad_norm": 1.2811257195894692, + "learning_rate": 1.625852448853069e-07, + "loss": 1.0051, + "step": 41960 + }, + { + "epoch": 3.2523538300592816, + "grad_norm": 1.3000547832585334, + "learning_rate": 1.6262399256044638e-07, + "loss": 1.0013, + "step": 41970 + }, + { + "epoch": 3.2531287535355884, + "grad_norm": 1.319682528650651, + "learning_rate": 1.6266274023558587e-07, + "loss": 1.0019, + "step": 41980 + }, + { + "epoch": 3.253903677011895, + "grad_norm": 1.4153731558748648, + "learning_rate": 1.6270148791072537e-07, + "loss": 1.0044, + "step": 41990 + }, + { + "epoch": 3.2546786004882016, + "grad_norm": 1.398394248893387, + "learning_rate": 1.6274023558586486e-07, + "loss": 1.0066, + "step": 42000 + }, + { + "epoch": 3.2546786004882016, + "eval_loss": 0.9964226484298706, + "eval_runtime": 319.2707, + "eval_samples_per_second": 35.929, + "eval_steps_per_second": 8.983, + "step": 42000 + }, + { + "epoch": 3.2554535239645084, + "grad_norm": 1.4772338890971808, + "learning_rate": 1.6277898326100436e-07, + "loss": 1.0262, + "step": 42010 + }, + { + "epoch": 3.256228447440815, + "grad_norm": 1.305409346038411, + "learning_rate": 1.6281773093614385e-07, + "loss": 0.9872, + "step": 42020 + }, + { + "epoch": 3.257003370917122, + "grad_norm": 1.4679244423085644, + "learning_rate": 1.6285647861128332e-07, + "loss": 0.9924, + "step": 42030 + }, + { + "epoch": 3.257778294393429, + "grad_norm": 1.2999191250427236, + "learning_rate": 1.6289522628642282e-07, + "loss": 0.9761, + "step": 42040 + }, + { + "epoch": 3.258553217869735, + "grad_norm": 1.3408872610377858, + "learning_rate": 1.6293397396156231e-07, + "loss": 0.963, + "step": 42050 + }, + { + "epoch": 3.259328141346042, + "grad_norm": 1.2995931873187017, + "learning_rate": 1.629727216367018e-07, + "loss": 0.9934, + "step": 42060 + }, + { + "epoch": 3.260103064822349, + "grad_norm": 1.3040322544655447, + "learning_rate": 1.630114693118413e-07, + "loss": 0.9974, + "step": 42070 + }, + { + "epoch": 3.2608779882986556, + "grad_norm": 1.3702060798625793, + "learning_rate": 1.630502169869808e-07, + "loss": 1.0216, + "step": 42080 + }, + { + "epoch": 3.2616529117749624, + "grad_norm": 1.4184644257709718, + "learning_rate": 1.630889646621203e-07, + "loss": 0.989, + "step": 42090 + }, + { + "epoch": 3.262427835251269, + "grad_norm": 1.281082872076117, + "learning_rate": 1.6312771233725976e-07, + "loss": 0.9988, + "step": 42100 + }, + { + "epoch": 3.2632027587275756, + "grad_norm": 1.307275124762446, + "learning_rate": 1.6316646001239926e-07, + "loss": 0.9875, + "step": 42110 + }, + { + "epoch": 3.2639776822038824, + "grad_norm": 1.3862225799894532, + "learning_rate": 1.6320520768753876e-07, + "loss": 0.9965, + "step": 42120 + }, + { + "epoch": 3.264752605680189, + "grad_norm": 1.4171737552057535, + "learning_rate": 1.6324395536267825e-07, + "loss": 0.986, + "step": 42130 + }, + { + "epoch": 3.265527529156496, + "grad_norm": 1.3423016056650192, + "learning_rate": 1.6328270303781775e-07, + "loss": 1.0087, + "step": 42140 + }, + { + "epoch": 3.2663024526328024, + "grad_norm": 1.3775045824135985, + "learning_rate": 1.6332145071295724e-07, + "loss": 1.0088, + "step": 42150 + }, + { + "epoch": 3.267077376109109, + "grad_norm": 1.2958319692398257, + "learning_rate": 1.6336019838809674e-07, + "loss": 0.9929, + "step": 42160 + }, + { + "epoch": 3.267852299585416, + "grad_norm": 1.3469416815160142, + "learning_rate": 1.633989460632362e-07, + "loss": 1.0033, + "step": 42170 + }, + { + "epoch": 3.268627223061723, + "grad_norm": 1.2811523751148028, + "learning_rate": 1.634376937383757e-07, + "loss": 0.9771, + "step": 42180 + }, + { + "epoch": 3.269402146538029, + "grad_norm": 1.287155184051436, + "learning_rate": 1.634764414135152e-07, + "loss": 0.9733, + "step": 42190 + }, + { + "epoch": 3.270177070014336, + "grad_norm": 1.3605603136617705, + "learning_rate": 1.635151890886547e-07, + "loss": 0.9912, + "step": 42200 + }, + { + "epoch": 3.270951993490643, + "grad_norm": 1.324634931026109, + "learning_rate": 1.635539367637942e-07, + "loss": 0.9882, + "step": 42210 + }, + { + "epoch": 3.2717269169669496, + "grad_norm": 1.321952870633865, + "learning_rate": 1.6359268443893368e-07, + "loss": 0.9826, + "step": 42220 + }, + { + "epoch": 3.2725018404432564, + "grad_norm": 1.3327485674165103, + "learning_rate": 1.6363143211407318e-07, + "loss": 1.0163, + "step": 42230 + }, + { + "epoch": 3.273276763919563, + "grad_norm": 1.3336827668408844, + "learning_rate": 1.6367017978921265e-07, + "loss": 1.0038, + "step": 42240 + }, + { + "epoch": 3.2740516873958696, + "grad_norm": 1.2478540280019101, + "learning_rate": 1.6370892746435214e-07, + "loss": 1.0133, + "step": 42250 + }, + { + "epoch": 3.2748266108721764, + "grad_norm": 1.4017164978770684, + "learning_rate": 1.6374767513949164e-07, + "loss": 0.985, + "step": 42260 + }, + { + "epoch": 3.2756015343484832, + "grad_norm": 1.3643005214633015, + "learning_rate": 1.6378642281463114e-07, + "loss": 1.0002, + "step": 42270 + }, + { + "epoch": 3.2763764578247896, + "grad_norm": 1.3451776974685588, + "learning_rate": 1.6382517048977063e-07, + "loss": 0.9968, + "step": 42280 + }, + { + "epoch": 3.2771513813010964, + "grad_norm": 1.342658294385972, + "learning_rate": 1.6386391816491013e-07, + "loss": 0.9872, + "step": 42290 + }, + { + "epoch": 3.277926304777403, + "grad_norm": 1.3027829932025576, + "learning_rate": 1.6390266584004962e-07, + "loss": 0.9795, + "step": 42300 + }, + { + "epoch": 3.27870122825371, + "grad_norm": 1.3624429189270724, + "learning_rate": 1.639414135151891e-07, + "loss": 1.0137, + "step": 42310 + }, + { + "epoch": 3.279476151730017, + "grad_norm": 1.345530389117878, + "learning_rate": 1.6398016119032859e-07, + "loss": 0.9896, + "step": 42320 + }, + { + "epoch": 3.280251075206323, + "grad_norm": 1.3540466937180125, + "learning_rate": 1.6401890886546808e-07, + "loss": 1.0204, + "step": 42330 + }, + { + "epoch": 3.28102599868263, + "grad_norm": 1.3544260058641966, + "learning_rate": 1.6405765654060758e-07, + "loss": 0.9751, + "step": 42340 + }, + { + "epoch": 3.281800922158937, + "grad_norm": 1.3462143589945996, + "learning_rate": 1.6409640421574707e-07, + "loss": 0.9986, + "step": 42350 + }, + { + "epoch": 3.2825758456352436, + "grad_norm": 1.2918692105729421, + "learning_rate": 1.6413515189088657e-07, + "loss": 1.0122, + "step": 42360 + }, + { + "epoch": 3.28335076911155, + "grad_norm": 1.3249364563229828, + "learning_rate": 1.6417389956602604e-07, + "loss": 1.0, + "step": 42370 + }, + { + "epoch": 3.284125692587857, + "grad_norm": 1.4016862132096184, + "learning_rate": 1.6421264724116553e-07, + "loss": 0.9874, + "step": 42380 + }, + { + "epoch": 3.2849006160641636, + "grad_norm": 1.353896754792388, + "learning_rate": 1.6425139491630503e-07, + "loss": 1.013, + "step": 42390 + }, + { + "epoch": 3.2856755395404704, + "grad_norm": 1.3446930319544559, + "learning_rate": 1.6429014259144452e-07, + "loss": 1.0062, + "step": 42400 + }, + { + "epoch": 3.2864504630167772, + "grad_norm": 1.2865484784975003, + "learning_rate": 1.6432889026658402e-07, + "loss": 0.9862, + "step": 42410 + }, + { + "epoch": 3.287225386493084, + "grad_norm": 1.3205243682457881, + "learning_rate": 1.6436763794172351e-07, + "loss": 0.997, + "step": 42420 + }, + { + "epoch": 3.2880003099693904, + "grad_norm": 1.3242736837868367, + "learning_rate": 1.64406385616863e-07, + "loss": 1.0098, + "step": 42430 + }, + { + "epoch": 3.288775233445697, + "grad_norm": 1.3016321042936343, + "learning_rate": 1.6444513329200248e-07, + "loss": 0.9892, + "step": 42440 + }, + { + "epoch": 3.289550156922004, + "grad_norm": 1.2655924477555582, + "learning_rate": 1.6448388096714197e-07, + "loss": 0.9855, + "step": 42450 + }, + { + "epoch": 3.290325080398311, + "grad_norm": 1.286301640028838, + "learning_rate": 1.6452262864228147e-07, + "loss": 1.0307, + "step": 42460 + }, + { + "epoch": 3.291100003874617, + "grad_norm": 1.3158642765028816, + "learning_rate": 1.6456137631742097e-07, + "loss": 1.009, + "step": 42470 + }, + { + "epoch": 3.291874927350924, + "grad_norm": 4.426145294938721, + "learning_rate": 1.6460012399256046e-07, + "loss": 0.9923, + "step": 42480 + }, + { + "epoch": 3.292649850827231, + "grad_norm": 1.2996664282184458, + "learning_rate": 1.6463887166769996e-07, + "loss": 1.003, + "step": 42490 + }, + { + "epoch": 3.2934247743035376, + "grad_norm": 1.430899740364466, + "learning_rate": 1.6467761934283945e-07, + "loss": 0.9879, + "step": 42500 + }, + { + "epoch": 3.2934247743035376, + "eval_loss": 0.9951682090759277, + "eval_runtime": 320.0193, + "eval_samples_per_second": 35.845, + "eval_steps_per_second": 8.962, + "step": 42500 + }, + { + "epoch": 3.2941996977798444, + "grad_norm": 1.295911244668933, + "learning_rate": 1.6471636701797892e-07, + "loss": 0.9848, + "step": 42510 + }, + { + "epoch": 3.294974621256151, + "grad_norm": 1.3980433838722401, + "learning_rate": 1.6475511469311842e-07, + "loss": 0.9896, + "step": 42520 + }, + { + "epoch": 3.2957495447324576, + "grad_norm": 1.8482707679143364, + "learning_rate": 1.647938623682579e-07, + "loss": 0.9915, + "step": 42530 + }, + { + "epoch": 3.2965244682087644, + "grad_norm": 1.2988335719179762, + "learning_rate": 1.648326100433974e-07, + "loss": 0.9961, + "step": 42540 + }, + { + "epoch": 3.2972993916850712, + "grad_norm": 1.393985718189735, + "learning_rate": 1.648713577185369e-07, + "loss": 0.9933, + "step": 42550 + }, + { + "epoch": 3.2980743151613776, + "grad_norm": 1.371635497708773, + "learning_rate": 1.649101053936764e-07, + "loss": 0.9844, + "step": 42560 + }, + { + "epoch": 3.2988492386376844, + "grad_norm": 1.313573162735245, + "learning_rate": 1.649488530688159e-07, + "loss": 1.0271, + "step": 42570 + }, + { + "epoch": 3.2996241621139912, + "grad_norm": 1.3629220895691383, + "learning_rate": 1.6498760074395536e-07, + "loss": 0.9859, + "step": 42580 + }, + { + "epoch": 3.300399085590298, + "grad_norm": 1.3671543207396046, + "learning_rate": 1.6502634841909486e-07, + "loss": 1.0036, + "step": 42590 + }, + { + "epoch": 3.301174009066605, + "grad_norm": 1.2686715674887667, + "learning_rate": 1.6506509609423435e-07, + "loss": 0.9786, + "step": 42600 + }, + { + "epoch": 3.301948932542911, + "grad_norm": 1.3799074260570647, + "learning_rate": 1.6510384376937385e-07, + "loss": 0.9787, + "step": 42610 + }, + { + "epoch": 3.302723856019218, + "grad_norm": 1.3260496718784183, + "learning_rate": 1.6514259144451335e-07, + "loss": 0.9828, + "step": 42620 + }, + { + "epoch": 3.303498779495525, + "grad_norm": 1.3599781468504344, + "learning_rate": 1.6518133911965284e-07, + "loss": 1.0155, + "step": 42630 + }, + { + "epoch": 3.3042737029718316, + "grad_norm": 1.4027966655773607, + "learning_rate": 1.6522008679479234e-07, + "loss": 0.9994, + "step": 42640 + }, + { + "epoch": 3.305048626448138, + "grad_norm": 1.3472883034313308, + "learning_rate": 1.652588344699318e-07, + "loss": 0.9975, + "step": 42650 + }, + { + "epoch": 3.305823549924445, + "grad_norm": 1.3808985233740447, + "learning_rate": 1.652975821450713e-07, + "loss": 1.0137, + "step": 42660 + }, + { + "epoch": 3.3065984734007516, + "grad_norm": 1.3519661306355348, + "learning_rate": 1.653363298202108e-07, + "loss": 1.0111, + "step": 42670 + }, + { + "epoch": 3.3073733968770584, + "grad_norm": 1.3257531455902545, + "learning_rate": 1.653750774953503e-07, + "loss": 0.9953, + "step": 42680 + }, + { + "epoch": 3.3081483203533653, + "grad_norm": 1.384666656812314, + "learning_rate": 1.654138251704898e-07, + "loss": 1.0081, + "step": 42690 + }, + { + "epoch": 3.308923243829672, + "grad_norm": 1.33554058269482, + "learning_rate": 1.6545257284562928e-07, + "loss": 0.9931, + "step": 42700 + }, + { + "epoch": 3.3096981673059784, + "grad_norm": 1.3032267332828187, + "learning_rate": 1.6549132052076875e-07, + "loss": 0.9793, + "step": 42710 + }, + { + "epoch": 3.3104730907822852, + "grad_norm": 1.357212918864612, + "learning_rate": 1.6553006819590825e-07, + "loss": 0.9741, + "step": 42720 + }, + { + "epoch": 3.311248014258592, + "grad_norm": 1.2977200927097694, + "learning_rate": 1.6556881587104774e-07, + "loss": 0.995, + "step": 42730 + }, + { + "epoch": 3.312022937734899, + "grad_norm": 1.3970080688773387, + "learning_rate": 1.6560756354618724e-07, + "loss": 0.9908, + "step": 42740 + }, + { + "epoch": 3.3127978612112052, + "grad_norm": 1.2946005085416394, + "learning_rate": 1.6564631122132673e-07, + "loss": 0.9981, + "step": 42750 + }, + { + "epoch": 3.313572784687512, + "grad_norm": 1.4269728636294077, + "learning_rate": 1.6568505889646623e-07, + "loss": 0.9765, + "step": 42760 + }, + { + "epoch": 3.314347708163819, + "grad_norm": 1.4192413628857705, + "learning_rate": 1.6572380657160573e-07, + "loss": 1.0124, + "step": 42770 + }, + { + "epoch": 3.3151226316401257, + "grad_norm": 1.3064690809610409, + "learning_rate": 1.657625542467452e-07, + "loss": 0.9887, + "step": 42780 + }, + { + "epoch": 3.3158975551164325, + "grad_norm": 1.2830375014247033, + "learning_rate": 1.658013019218847e-07, + "loss": 0.9784, + "step": 42790 + }, + { + "epoch": 3.316672478592739, + "grad_norm": 1.3376315776492966, + "learning_rate": 1.6584004959702419e-07, + "loss": 0.9939, + "step": 42800 + }, + { + "epoch": 3.3174474020690456, + "grad_norm": 1.3668779059978966, + "learning_rate": 1.6587879727216368e-07, + "loss": 0.9939, + "step": 42810 + }, + { + "epoch": 3.3182223255453525, + "grad_norm": 1.2867954281477638, + "learning_rate": 1.6591754494730318e-07, + "loss": 0.9961, + "step": 42820 + }, + { + "epoch": 3.3189972490216593, + "grad_norm": 1.2309620522058073, + "learning_rate": 1.6595629262244267e-07, + "loss": 1.0103, + "step": 42830 + }, + { + "epoch": 3.3197721724979656, + "grad_norm": 1.2670098949462372, + "learning_rate": 1.6599504029758217e-07, + "loss": 0.9962, + "step": 42840 + }, + { + "epoch": 3.3205470959742724, + "grad_norm": 1.351591823225391, + "learning_rate": 1.6603378797272164e-07, + "loss": 1.0146, + "step": 42850 + }, + { + "epoch": 3.3213220194505793, + "grad_norm": 1.279154868579172, + "learning_rate": 1.6607253564786113e-07, + "loss": 1.011, + "step": 42860 + }, + { + "epoch": 3.322096942926886, + "grad_norm": 1.294956320320337, + "learning_rate": 1.6611128332300063e-07, + "loss": 0.9872, + "step": 42870 + }, + { + "epoch": 3.322871866403193, + "grad_norm": 1.330406225139041, + "learning_rate": 1.6615003099814012e-07, + "loss": 0.9823, + "step": 42880 + }, + { + "epoch": 3.3236467898794992, + "grad_norm": 1.3238558934458167, + "learning_rate": 1.6618877867327962e-07, + "loss": 0.9908, + "step": 42890 + }, + { + "epoch": 3.324421713355806, + "grad_norm": 1.3074228140124617, + "learning_rate": 1.6622752634841911e-07, + "loss": 1.004, + "step": 42900 + }, + { + "epoch": 3.325196636832113, + "grad_norm": 1.2393820323009257, + "learning_rate": 1.662662740235586e-07, + "loss": 1.0029, + "step": 42910 + }, + { + "epoch": 3.3259715603084197, + "grad_norm": 1.2986595168724497, + "learning_rate": 1.6630502169869808e-07, + "loss": 1.0082, + "step": 42920 + }, + { + "epoch": 3.326746483784726, + "grad_norm": 1.3261538415245082, + "learning_rate": 1.6634376937383757e-07, + "loss": 1.0036, + "step": 42930 + }, + { + "epoch": 3.327521407261033, + "grad_norm": 1.432118489012082, + "learning_rate": 1.6638251704897707e-07, + "loss": 0.9988, + "step": 42940 + }, + { + "epoch": 3.3282963307373397, + "grad_norm": 1.324132807816656, + "learning_rate": 1.6642126472411657e-07, + "loss": 1.0034, + "step": 42950 + }, + { + "epoch": 3.3290712542136465, + "grad_norm": 1.2754330121360145, + "learning_rate": 1.6646001239925606e-07, + "loss": 1.0238, + "step": 42960 + }, + { + "epoch": 3.3298461776899533, + "grad_norm": 1.3784599759460905, + "learning_rate": 1.6649876007439556e-07, + "loss": 1.0058, + "step": 42970 + }, + { + "epoch": 3.3306211011662596, + "grad_norm": 1.2879745542821874, + "learning_rate": 1.6653750774953505e-07, + "loss": 0.9807, + "step": 42980 + }, + { + "epoch": 3.3313960246425665, + "grad_norm": 1.35642850722081, + "learning_rate": 1.6657625542467452e-07, + "loss": 0.9952, + "step": 42990 + }, + { + "epoch": 3.3321709481188733, + "grad_norm": 1.3765009731304505, + "learning_rate": 1.6661500309981402e-07, + "loss": 1.0042, + "step": 43000 + }, + { + "epoch": 3.3321709481188733, + "eval_loss": 0.9939212203025818, + "eval_runtime": 319.8175, + "eval_samples_per_second": 35.867, + "eval_steps_per_second": 8.968, + "step": 43000 + }, + { + "epoch": 3.33294587159518, + "grad_norm": 1.2672031693895522, + "learning_rate": 1.666537507749535e-07, + "loss": 0.9871, + "step": 43010 + }, + { + "epoch": 3.3337207950714864, + "grad_norm": 1.2967787518668852, + "learning_rate": 1.66692498450093e-07, + "loss": 0.9909, + "step": 43020 + }, + { + "epoch": 3.3344957185477933, + "grad_norm": 1.2969726381485762, + "learning_rate": 1.667312461252325e-07, + "loss": 0.982, + "step": 43030 + }, + { + "epoch": 3.3352706420241, + "grad_norm": 1.3795710407784982, + "learning_rate": 1.66769993800372e-07, + "loss": 0.9914, + "step": 43040 + }, + { + "epoch": 3.336045565500407, + "grad_norm": 1.302749567275662, + "learning_rate": 1.668087414755115e-07, + "loss": 0.9992, + "step": 43050 + }, + { + "epoch": 3.3368204889767137, + "grad_norm": 1.2674291638827788, + "learning_rate": 1.6684748915065096e-07, + "loss": 0.9804, + "step": 43060 + }, + { + "epoch": 3.3375954124530205, + "grad_norm": 1.4379845797732393, + "learning_rate": 1.6688623682579046e-07, + "loss": 0.9896, + "step": 43070 + }, + { + "epoch": 3.338370335929327, + "grad_norm": 1.3325129990836146, + "learning_rate": 1.6692498450092995e-07, + "loss": 0.9862, + "step": 43080 + }, + { + "epoch": 3.3391452594056337, + "grad_norm": 1.2509120739209576, + "learning_rate": 1.6696373217606945e-07, + "loss": 0.9972, + "step": 43090 + }, + { + "epoch": 3.3399201828819405, + "grad_norm": 1.331806285383791, + "learning_rate": 1.6700247985120894e-07, + "loss": 0.9692, + "step": 43100 + }, + { + "epoch": 3.3406951063582473, + "grad_norm": 1.2845613414767398, + "learning_rate": 1.6704122752634844e-07, + "loss": 0.9734, + "step": 43110 + }, + { + "epoch": 3.3414700298345537, + "grad_norm": 1.3702901063267579, + "learning_rate": 1.670799752014879e-07, + "loss": 0.992, + "step": 43120 + }, + { + "epoch": 3.3422449533108605, + "grad_norm": 1.268335691460769, + "learning_rate": 1.671187228766274e-07, + "loss": 0.9826, + "step": 43130 + }, + { + "epoch": 3.3430198767871673, + "grad_norm": 1.4948863763812141, + "learning_rate": 1.671574705517669e-07, + "loss": 0.9939, + "step": 43140 + }, + { + "epoch": 3.343794800263474, + "grad_norm": 1.336798597576474, + "learning_rate": 1.671962182269064e-07, + "loss": 1.0136, + "step": 43150 + }, + { + "epoch": 3.344569723739781, + "grad_norm": 1.2356377625127866, + "learning_rate": 1.672349659020459e-07, + "loss": 0.9946, + "step": 43160 + }, + { + "epoch": 3.3453446472160873, + "grad_norm": 1.3731119864840746, + "learning_rate": 1.672737135771854e-07, + "loss": 1.0016, + "step": 43170 + }, + { + "epoch": 3.346119570692394, + "grad_norm": 1.4268651372387053, + "learning_rate": 1.6731246125232488e-07, + "loss": 0.9974, + "step": 43180 + }, + { + "epoch": 3.346894494168701, + "grad_norm": 1.3766348250603058, + "learning_rate": 1.6735120892746435e-07, + "loss": 0.9999, + "step": 43190 + }, + { + "epoch": 3.3476694176450077, + "grad_norm": 1.3440785961407362, + "learning_rate": 1.6738995660260385e-07, + "loss": 0.9936, + "step": 43200 + }, + { + "epoch": 3.348444341121314, + "grad_norm": 1.278338959471139, + "learning_rate": 1.6742870427774334e-07, + "loss": 0.9902, + "step": 43210 + }, + { + "epoch": 3.349219264597621, + "grad_norm": 1.3903252877086822, + "learning_rate": 1.6746745195288284e-07, + "loss": 0.9937, + "step": 43220 + }, + { + "epoch": 3.3499941880739277, + "grad_norm": 1.4290567682670452, + "learning_rate": 1.6750619962802233e-07, + "loss": 0.9876, + "step": 43230 + }, + { + "epoch": 3.3507691115502345, + "grad_norm": 1.25262172665526, + "learning_rate": 1.6754494730316183e-07, + "loss": 0.9878, + "step": 43240 + }, + { + "epoch": 3.3515440350265413, + "grad_norm": 1.3336121616466332, + "learning_rate": 1.6758369497830132e-07, + "loss": 0.9892, + "step": 43250 + }, + { + "epoch": 3.3523189585028477, + "grad_norm": 1.2578779856556106, + "learning_rate": 1.676224426534408e-07, + "loss": 0.9716, + "step": 43260 + }, + { + "epoch": 3.3530938819791545, + "grad_norm": 1.3243849968126293, + "learning_rate": 1.676611903285803e-07, + "loss": 1.0177, + "step": 43270 + }, + { + "epoch": 3.3538688054554613, + "grad_norm": 1.364756332632128, + "learning_rate": 1.6769993800371978e-07, + "loss": 0.973, + "step": 43280 + }, + { + "epoch": 3.354643728931768, + "grad_norm": 1.3913552042609325, + "learning_rate": 1.6773868567885928e-07, + "loss": 1.0092, + "step": 43290 + }, + { + "epoch": 3.3554186524080745, + "grad_norm": 1.2726549064688915, + "learning_rate": 1.6777743335399878e-07, + "loss": 0.9944, + "step": 43300 + }, + { + "epoch": 3.3561935758843813, + "grad_norm": 1.3497743402847178, + "learning_rate": 1.6781618102913827e-07, + "loss": 0.9798, + "step": 43310 + }, + { + "epoch": 3.356968499360688, + "grad_norm": 1.266525099023501, + "learning_rate": 1.6785492870427777e-07, + "loss": 1.0087, + "step": 43320 + }, + { + "epoch": 3.357743422836995, + "grad_norm": 1.3460293562461045, + "learning_rate": 1.6789367637941724e-07, + "loss": 0.9838, + "step": 43330 + }, + { + "epoch": 3.3585183463133017, + "grad_norm": 1.3948666209551606, + "learning_rate": 1.6793242405455673e-07, + "loss": 1.0002, + "step": 43340 + }, + { + "epoch": 3.3592932697896085, + "grad_norm": 1.2761549048839766, + "learning_rate": 1.6797117172969623e-07, + "loss": 0.993, + "step": 43350 + }, + { + "epoch": 3.360068193265915, + "grad_norm": 1.3277364545997954, + "learning_rate": 1.6800991940483572e-07, + "loss": 0.9914, + "step": 43360 + }, + { + "epoch": 3.3608431167422217, + "grad_norm": 1.3187582015429598, + "learning_rate": 1.6804866707997522e-07, + "loss": 0.979, + "step": 43370 + }, + { + "epoch": 3.3616180402185285, + "grad_norm": 1.4148201937400007, + "learning_rate": 1.680874147551147e-07, + "loss": 0.9847, + "step": 43380 + }, + { + "epoch": 3.3623929636948353, + "grad_norm": 1.3418062771014014, + "learning_rate": 1.681261624302542e-07, + "loss": 1.0012, + "step": 43390 + }, + { + "epoch": 3.3631678871711417, + "grad_norm": 1.2959893541240075, + "learning_rate": 1.6816491010539368e-07, + "loss": 1.0018, + "step": 43400 + }, + { + "epoch": 3.3639428106474485, + "grad_norm": 1.2706419794602708, + "learning_rate": 1.6820365778053317e-07, + "loss": 0.9907, + "step": 43410 + }, + { + "epoch": 3.3647177341237553, + "grad_norm": 1.4500986536726241, + "learning_rate": 1.6824240545567267e-07, + "loss": 1.0116, + "step": 43420 + }, + { + "epoch": 3.365492657600062, + "grad_norm": 1.3650218348010978, + "learning_rate": 1.6828115313081216e-07, + "loss": 0.9696, + "step": 43430 + }, + { + "epoch": 3.366267581076369, + "grad_norm": 1.3973734562552116, + "learning_rate": 1.6831990080595166e-07, + "loss": 0.9776, + "step": 43440 + }, + { + "epoch": 3.3670425045526753, + "grad_norm": 1.323566769857483, + "learning_rate": 1.6835864848109116e-07, + "loss": 0.9853, + "step": 43450 + }, + { + "epoch": 3.367817428028982, + "grad_norm": 1.3348936489863075, + "learning_rate": 1.6839739615623062e-07, + "loss": 0.9889, + "step": 43460 + }, + { + "epoch": 3.368592351505289, + "grad_norm": 1.2936589144507056, + "learning_rate": 1.6843614383137012e-07, + "loss": 0.9829, + "step": 43470 + }, + { + "epoch": 3.3693672749815957, + "grad_norm": 1.3668061681925963, + "learning_rate": 1.6847489150650962e-07, + "loss": 0.9647, + "step": 43480 + }, + { + "epoch": 3.370142198457902, + "grad_norm": 1.3023470804710688, + "learning_rate": 1.685136391816491e-07, + "loss": 0.9812, + "step": 43490 + }, + { + "epoch": 3.370917121934209, + "grad_norm": 1.3841015581423313, + "learning_rate": 1.685523868567886e-07, + "loss": 0.9946, + "step": 43500 + }, + { + "epoch": 3.370917121934209, + "eval_loss": 0.9927462935447693, + "eval_runtime": 319.7113, + "eval_samples_per_second": 35.879, + "eval_steps_per_second": 8.971, + "step": 43500 + }, + { + "epoch": 3.3716920454105157, + "grad_norm": 1.3794814534918598, + "learning_rate": 1.685911345319281e-07, + "loss": 0.9769, + "step": 43510 + }, + { + "epoch": 3.3724669688868225, + "grad_norm": 1.322301245805011, + "learning_rate": 1.686298822070676e-07, + "loss": 0.9904, + "step": 43520 + }, + { + "epoch": 3.3732418923631293, + "grad_norm": 1.282285426769249, + "learning_rate": 1.6866862988220707e-07, + "loss": 0.9921, + "step": 43530 + }, + { + "epoch": 3.3740168158394357, + "grad_norm": 1.396566755731027, + "learning_rate": 1.6870737755734656e-07, + "loss": 0.9975, + "step": 43540 + }, + { + "epoch": 3.3747917393157425, + "grad_norm": 1.3282345315366049, + "learning_rate": 1.6874612523248606e-07, + "loss": 0.9637, + "step": 43550 + }, + { + "epoch": 3.3755666627920493, + "grad_norm": 1.264471574517996, + "learning_rate": 1.6878487290762555e-07, + "loss": 0.9813, + "step": 43560 + }, + { + "epoch": 3.376341586268356, + "grad_norm": 1.3186806298866034, + "learning_rate": 1.6882362058276505e-07, + "loss": 0.9969, + "step": 43570 + }, + { + "epoch": 3.3771165097446625, + "grad_norm": 1.3228591593247827, + "learning_rate": 1.6886236825790454e-07, + "loss": 1.0014, + "step": 43580 + }, + { + "epoch": 3.3778914332209693, + "grad_norm": 1.3165862051442228, + "learning_rate": 1.6890111593304404e-07, + "loss": 0.9756, + "step": 43590 + }, + { + "epoch": 3.378666356697276, + "grad_norm": 1.3159817059103842, + "learning_rate": 1.689398636081835e-07, + "loss": 0.9831, + "step": 43600 + }, + { + "epoch": 3.379441280173583, + "grad_norm": 1.4463458011244905, + "learning_rate": 1.68978611283323e-07, + "loss": 1.0028, + "step": 43610 + }, + { + "epoch": 3.3802162036498897, + "grad_norm": 1.3254570874424771, + "learning_rate": 1.690173589584625e-07, + "loss": 0.9787, + "step": 43620 + }, + { + "epoch": 3.380991127126196, + "grad_norm": 1.3819797639497278, + "learning_rate": 1.69056106633602e-07, + "loss": 0.9832, + "step": 43630 + }, + { + "epoch": 3.381766050602503, + "grad_norm": 1.3018851504797258, + "learning_rate": 1.690948543087415e-07, + "loss": 0.9943, + "step": 43640 + }, + { + "epoch": 3.3825409740788097, + "grad_norm": 1.372597649061883, + "learning_rate": 1.6913360198388099e-07, + "loss": 1.0028, + "step": 43650 + }, + { + "epoch": 3.3833158975551165, + "grad_norm": 1.3223327820114628, + "learning_rate": 1.6917234965902048e-07, + "loss": 0.9971, + "step": 43660 + }, + { + "epoch": 3.3840908210314233, + "grad_norm": 1.3059186132453515, + "learning_rate": 1.6921109733415995e-07, + "loss": 0.9829, + "step": 43670 + }, + { + "epoch": 3.3848657445077297, + "grad_norm": 1.3616288837194996, + "learning_rate": 1.6924984500929945e-07, + "loss": 1.015, + "step": 43680 + }, + { + "epoch": 3.3856406679840365, + "grad_norm": 1.3211531128116873, + "learning_rate": 1.6928859268443894e-07, + "loss": 0.9759, + "step": 43690 + }, + { + "epoch": 3.3864155914603433, + "grad_norm": 1.3358079173200141, + "learning_rate": 1.6932734035957844e-07, + "loss": 0.993, + "step": 43700 + }, + { + "epoch": 3.38719051493665, + "grad_norm": 1.2685792654761683, + "learning_rate": 1.6936608803471793e-07, + "loss": 1.0201, + "step": 43710 + }, + { + "epoch": 3.387965438412957, + "grad_norm": 1.3443554354969516, + "learning_rate": 1.6940483570985743e-07, + "loss": 0.9878, + "step": 43720 + }, + { + "epoch": 3.3887403618892633, + "grad_norm": 1.3159599332076553, + "learning_rate": 1.6944358338499692e-07, + "loss": 1.0039, + "step": 43730 + }, + { + "epoch": 3.38951528536557, + "grad_norm": 1.3305837743466618, + "learning_rate": 1.694823310601364e-07, + "loss": 1.0039, + "step": 43740 + }, + { + "epoch": 3.390290208841877, + "grad_norm": 1.2837919508748792, + "learning_rate": 1.695210787352759e-07, + "loss": 1.01, + "step": 43750 + }, + { + "epoch": 3.3910651323181837, + "grad_norm": 1.37401129943815, + "learning_rate": 1.6955982641041538e-07, + "loss": 0.994, + "step": 43760 + }, + { + "epoch": 3.39184005579449, + "grad_norm": 1.4595807117073052, + "learning_rate": 1.6959857408555488e-07, + "loss": 1.022, + "step": 43770 + }, + { + "epoch": 3.392614979270797, + "grad_norm": 1.2961083209654143, + "learning_rate": 1.6963732176069437e-07, + "loss": 0.9873, + "step": 43780 + }, + { + "epoch": 3.3933899027471037, + "grad_norm": 1.3972618656849176, + "learning_rate": 1.6967606943583387e-07, + "loss": 1.0245, + "step": 43790 + }, + { + "epoch": 3.3941648262234105, + "grad_norm": 1.2163884190292193, + "learning_rate": 1.6971481711097337e-07, + "loss": 0.9877, + "step": 43800 + }, + { + "epoch": 3.3949397496997173, + "grad_norm": 1.4247581000075709, + "learning_rate": 1.6975356478611283e-07, + "loss": 0.9823, + "step": 43810 + }, + { + "epoch": 3.3957146731760237, + "grad_norm": 1.3291829766261452, + "learning_rate": 1.6979231246125233e-07, + "loss": 0.9943, + "step": 43820 + }, + { + "epoch": 3.3964895966523305, + "grad_norm": 1.2923278753931748, + "learning_rate": 1.6983106013639183e-07, + "loss": 0.9834, + "step": 43830 + }, + { + "epoch": 3.3972645201286373, + "grad_norm": 1.3385722497305563, + "learning_rate": 1.6986980781153132e-07, + "loss": 1.0006, + "step": 43840 + }, + { + "epoch": 3.398039443604944, + "grad_norm": 1.1865719449129193, + "learning_rate": 1.6990855548667082e-07, + "loss": 0.9776, + "step": 43850 + }, + { + "epoch": 3.3988143670812505, + "grad_norm": 1.3111583624059635, + "learning_rate": 1.699473031618103e-07, + "loss": 0.9984, + "step": 43860 + }, + { + "epoch": 3.3995892905575573, + "grad_norm": 1.3650483464443737, + "learning_rate": 1.6998605083694978e-07, + "loss": 1.0109, + "step": 43870 + }, + { + "epoch": 3.400364214033864, + "grad_norm": 1.30034722745084, + "learning_rate": 1.7002479851208928e-07, + "loss": 1.0136, + "step": 43880 + }, + { + "epoch": 3.401139137510171, + "grad_norm": 1.2716873283991017, + "learning_rate": 1.7006354618722877e-07, + "loss": 0.9871, + "step": 43890 + }, + { + "epoch": 3.4019140609864778, + "grad_norm": 1.2910089268597764, + "learning_rate": 1.7010229386236827e-07, + "loss": 0.983, + "step": 43900 + }, + { + "epoch": 3.402688984462784, + "grad_norm": 1.3062094510000894, + "learning_rate": 1.7014104153750776e-07, + "loss": 0.9869, + "step": 43910 + }, + { + "epoch": 3.403463907939091, + "grad_norm": 1.3327205347475546, + "learning_rate": 1.7017978921264726e-07, + "loss": 0.9927, + "step": 43920 + }, + { + "epoch": 3.4042388314153977, + "grad_norm": 1.4998822750915686, + "learning_rate": 1.7021853688778675e-07, + "loss": 0.9982, + "step": 43930 + }, + { + "epoch": 3.4050137548917045, + "grad_norm": 1.371913102019869, + "learning_rate": 1.7025728456292622e-07, + "loss": 0.9851, + "step": 43940 + }, + { + "epoch": 3.405788678368011, + "grad_norm": 1.3803414524372128, + "learning_rate": 1.7029603223806572e-07, + "loss": 0.9937, + "step": 43950 + }, + { + "epoch": 3.4065636018443177, + "grad_norm": 1.3587355741793108, + "learning_rate": 1.7033477991320521e-07, + "loss": 0.9895, + "step": 43960 + }, + { + "epoch": 3.4073385253206245, + "grad_norm": 1.281646852419288, + "learning_rate": 1.703735275883447e-07, + "loss": 0.9754, + "step": 43970 + }, + { + "epoch": 3.4081134487969313, + "grad_norm": 1.338615304956928, + "learning_rate": 1.704122752634842e-07, + "loss": 0.9861, + "step": 43980 + }, + { + "epoch": 3.408888372273238, + "grad_norm": 1.270308237790733, + "learning_rate": 1.704510229386237e-07, + "loss": 0.9701, + "step": 43990 + }, + { + "epoch": 3.409663295749545, + "grad_norm": 1.3067266661135493, + "learning_rate": 1.704897706137632e-07, + "loss": 0.9791, + "step": 44000 + }, + { + "epoch": 3.409663295749545, + "eval_loss": 0.9915701150894165, + "eval_runtime": 321.0895, + "eval_samples_per_second": 35.725, + "eval_steps_per_second": 8.932, + "step": 44000 + }, + { + "epoch": 3.4104382192258513, + "grad_norm": 1.2877119838298878, + "learning_rate": 1.7052851828890267e-07, + "loss": 0.975, + "step": 44010 + }, + { + "epoch": 3.411213142702158, + "grad_norm": 1.3145749664045354, + "learning_rate": 1.7056726596404216e-07, + "loss": 0.9962, + "step": 44020 + }, + { + "epoch": 3.411988066178465, + "grad_norm": 1.356330905977799, + "learning_rate": 1.7060601363918166e-07, + "loss": 0.9812, + "step": 44030 + }, + { + "epoch": 3.4127629896547718, + "grad_norm": 1.304112503622047, + "learning_rate": 1.7064476131432115e-07, + "loss": 0.9746, + "step": 44040 + }, + { + "epoch": 3.413537913131078, + "grad_norm": 1.2922045847478878, + "learning_rate": 1.7068350898946065e-07, + "loss": 1.0043, + "step": 44050 + }, + { + "epoch": 3.414312836607385, + "grad_norm": 1.3226740011489706, + "learning_rate": 1.7072225666460014e-07, + "loss": 1.0038, + "step": 44060 + }, + { + "epoch": 3.4150877600836917, + "grad_norm": 1.31296199796552, + "learning_rate": 1.7076100433973964e-07, + "loss": 0.9893, + "step": 44070 + }, + { + "epoch": 3.4158626835599986, + "grad_norm": 1.269167979491225, + "learning_rate": 1.707997520148791e-07, + "loss": 0.9866, + "step": 44080 + }, + { + "epoch": 3.4166376070363054, + "grad_norm": 1.3156419550167808, + "learning_rate": 1.708384996900186e-07, + "loss": 0.978, + "step": 44090 + }, + { + "epoch": 3.4174125305126117, + "grad_norm": 1.2669778183408662, + "learning_rate": 1.708772473651581e-07, + "loss": 1.0067, + "step": 44100 + }, + { + "epoch": 3.4181874539889185, + "grad_norm": 1.9014786831735537, + "learning_rate": 1.709159950402976e-07, + "loss": 0.9838, + "step": 44110 + }, + { + "epoch": 3.4189623774652254, + "grad_norm": 1.4025918692578485, + "learning_rate": 1.709547427154371e-07, + "loss": 0.9914, + "step": 44120 + }, + { + "epoch": 3.419737300941532, + "grad_norm": 1.300383983188924, + "learning_rate": 1.7099349039057659e-07, + "loss": 1.0255, + "step": 44130 + }, + { + "epoch": 3.4205122244178385, + "grad_norm": 1.3579385022631352, + "learning_rate": 1.7103223806571608e-07, + "loss": 0.9795, + "step": 44140 + }, + { + "epoch": 3.4212871478941453, + "grad_norm": 1.2767249274826016, + "learning_rate": 1.7107098574085555e-07, + "loss": 1.0095, + "step": 44150 + }, + { + "epoch": 3.422062071370452, + "grad_norm": 1.3491698711415572, + "learning_rate": 1.7110973341599505e-07, + "loss": 1.0059, + "step": 44160 + }, + { + "epoch": 3.422836994846759, + "grad_norm": 1.2821365256495243, + "learning_rate": 1.7114848109113454e-07, + "loss": 0.9919, + "step": 44170 + }, + { + "epoch": 3.4236119183230658, + "grad_norm": 1.3351522460127787, + "learning_rate": 1.7118722876627404e-07, + "loss": 1.0155, + "step": 44180 + }, + { + "epoch": 3.424386841799372, + "grad_norm": 1.4279709276330972, + "learning_rate": 1.7122597644141353e-07, + "loss": 1.0264, + "step": 44190 + }, + { + "epoch": 3.425161765275679, + "grad_norm": 1.4049027396999212, + "learning_rate": 1.7126472411655303e-07, + "loss": 0.9934, + "step": 44200 + }, + { + "epoch": 3.4259366887519858, + "grad_norm": 1.3806477203434135, + "learning_rate": 1.713034717916925e-07, + "loss": 0.9798, + "step": 44210 + }, + { + "epoch": 3.4267116122282926, + "grad_norm": 1.3890119020408103, + "learning_rate": 1.71342219466832e-07, + "loss": 1.0091, + "step": 44220 + }, + { + "epoch": 3.427486535704599, + "grad_norm": 1.384010460207713, + "learning_rate": 1.713809671419715e-07, + "loss": 1.0005, + "step": 44230 + }, + { + "epoch": 3.4282614591809057, + "grad_norm": 1.3715771092813158, + "learning_rate": 1.7141971481711098e-07, + "loss": 0.9829, + "step": 44240 + }, + { + "epoch": 3.4290363826572126, + "grad_norm": 1.4236280477146972, + "learning_rate": 1.7145846249225048e-07, + "loss": 0.9764, + "step": 44250 + }, + { + "epoch": 3.4298113061335194, + "grad_norm": 1.3673434138405556, + "learning_rate": 1.7149721016738997e-07, + "loss": 0.9784, + "step": 44260 + }, + { + "epoch": 3.430586229609826, + "grad_norm": 1.3466678773477934, + "learning_rate": 1.7153595784252947e-07, + "loss": 0.9787, + "step": 44270 + }, + { + "epoch": 3.431361153086133, + "grad_norm": 1.3514067425489675, + "learning_rate": 1.7157470551766894e-07, + "loss": 0.9945, + "step": 44280 + }, + { + "epoch": 3.4321360765624394, + "grad_norm": 1.332686392697094, + "learning_rate": 1.7161345319280843e-07, + "loss": 0.9743, + "step": 44290 + }, + { + "epoch": 3.432911000038746, + "grad_norm": 1.4035025718751408, + "learning_rate": 1.7165220086794793e-07, + "loss": 0.9867, + "step": 44300 + }, + { + "epoch": 3.433685923515053, + "grad_norm": 1.340611030719314, + "learning_rate": 1.7169094854308742e-07, + "loss": 1.003, + "step": 44310 + }, + { + "epoch": 3.43446084699136, + "grad_norm": 1.3063074309972196, + "learning_rate": 1.7172969621822692e-07, + "loss": 1.0117, + "step": 44320 + }, + { + "epoch": 3.435235770467666, + "grad_norm": 1.3011055758773054, + "learning_rate": 1.7176844389336642e-07, + "loss": 0.9948, + "step": 44330 + }, + { + "epoch": 3.436010693943973, + "grad_norm": 1.3154403642673127, + "learning_rate": 1.718071915685059e-07, + "loss": 0.9994, + "step": 44340 + }, + { + "epoch": 3.4367856174202798, + "grad_norm": 1.3221966411117467, + "learning_rate": 1.7184593924364538e-07, + "loss": 0.9751, + "step": 44350 + }, + { + "epoch": 3.4375605408965866, + "grad_norm": 1.3798413214190155, + "learning_rate": 1.7188468691878488e-07, + "loss": 1.0058, + "step": 44360 + }, + { + "epoch": 3.4383354643728934, + "grad_norm": 1.3548172114915202, + "learning_rate": 1.7192343459392437e-07, + "loss": 0.9935, + "step": 44370 + }, + { + "epoch": 3.4391103878491998, + "grad_norm": 1.442888150475561, + "learning_rate": 1.7196218226906387e-07, + "loss": 0.983, + "step": 44380 + }, + { + "epoch": 3.4398853113255066, + "grad_norm": 1.4035047429895477, + "learning_rate": 1.7200092994420336e-07, + "loss": 1.0002, + "step": 44390 + }, + { + "epoch": 3.4406602348018134, + "grad_norm": 1.2882306134879302, + "learning_rate": 1.7203967761934286e-07, + "loss": 0.9648, + "step": 44400 + }, + { + "epoch": 3.44143515827812, + "grad_norm": 1.358629804311785, + "learning_rate": 1.7207842529448235e-07, + "loss": 0.9844, + "step": 44410 + }, + { + "epoch": 3.4422100817544266, + "grad_norm": 1.3190251835119722, + "learning_rate": 1.7211717296962182e-07, + "loss": 0.9805, + "step": 44420 + }, + { + "epoch": 3.4429850052307334, + "grad_norm": 1.3005555745071378, + "learning_rate": 1.7215592064476132e-07, + "loss": 0.9868, + "step": 44430 + }, + { + "epoch": 3.44375992870704, + "grad_norm": 1.2995129025608219, + "learning_rate": 1.7219466831990081e-07, + "loss": 1.0048, + "step": 44440 + }, + { + "epoch": 3.444534852183347, + "grad_norm": 1.4031754648084251, + "learning_rate": 1.722334159950403e-07, + "loss": 1.0046, + "step": 44450 + }, + { + "epoch": 3.445309775659654, + "grad_norm": 1.3828613452169678, + "learning_rate": 1.722721636701798e-07, + "loss": 1.0059, + "step": 44460 + }, + { + "epoch": 3.44608469913596, + "grad_norm": 1.2994669638875553, + "learning_rate": 1.723109113453193e-07, + "loss": 0.9803, + "step": 44470 + }, + { + "epoch": 3.446859622612267, + "grad_norm": 1.3488414745561754, + "learning_rate": 1.723496590204588e-07, + "loss": 0.9918, + "step": 44480 + }, + { + "epoch": 3.447634546088574, + "grad_norm": 1.3793329310402225, + "learning_rate": 1.7238840669559826e-07, + "loss": 1.0196, + "step": 44490 + }, + { + "epoch": 3.4484094695648806, + "grad_norm": 1.344517319340518, + "learning_rate": 1.7242715437073776e-07, + "loss": 0.9925, + "step": 44500 + }, + { + "epoch": 3.4484094695648806, + "eval_loss": 0.9904112815856934, + "eval_runtime": 320.9964, + "eval_samples_per_second": 35.736, + "eval_steps_per_second": 8.935, + "step": 44500 + }, + { + "epoch": 3.449184393041187, + "grad_norm": 1.297699721715757, + "learning_rate": 1.7246590204587726e-07, + "loss": 0.9793, + "step": 44510 + }, + { + "epoch": 3.4499593165174938, + "grad_norm": 1.353062503445691, + "learning_rate": 1.7250464972101675e-07, + "loss": 0.9909, + "step": 44520 + }, + { + "epoch": 3.4507342399938006, + "grad_norm": 1.2830171818004217, + "learning_rate": 1.7254339739615625e-07, + "loss": 0.9695, + "step": 44530 + }, + { + "epoch": 3.4515091634701074, + "grad_norm": 1.3300572626297247, + "learning_rate": 1.7258214507129574e-07, + "loss": 0.9875, + "step": 44540 + }, + { + "epoch": 3.452284086946414, + "grad_norm": 1.3443137016687488, + "learning_rate": 1.726208927464352e-07, + "loss": 0.9797, + "step": 44550 + }, + { + "epoch": 3.4530590104227206, + "grad_norm": 1.281155607584983, + "learning_rate": 1.726596404215747e-07, + "loss": 1.0002, + "step": 44560 + }, + { + "epoch": 3.4538339338990274, + "grad_norm": 1.342064151126603, + "learning_rate": 1.726983880967142e-07, + "loss": 0.9918, + "step": 44570 + }, + { + "epoch": 3.454608857375334, + "grad_norm": 1.3370546835942867, + "learning_rate": 1.727371357718537e-07, + "loss": 0.9917, + "step": 44580 + }, + { + "epoch": 3.455383780851641, + "grad_norm": 1.339475147305098, + "learning_rate": 1.727758834469932e-07, + "loss": 0.9823, + "step": 44590 + }, + { + "epoch": 3.456158704327948, + "grad_norm": 1.262781608100723, + "learning_rate": 1.728146311221327e-07, + "loss": 0.9974, + "step": 44600 + }, + { + "epoch": 3.456933627804254, + "grad_norm": 1.3605040049292483, + "learning_rate": 1.7285337879727218e-07, + "loss": 1.0193, + "step": 44610 + }, + { + "epoch": 3.457708551280561, + "grad_norm": 1.2778708152119607, + "learning_rate": 1.7289212647241165e-07, + "loss": 0.9861, + "step": 44620 + }, + { + "epoch": 3.458483474756868, + "grad_norm": 1.334548176767213, + "learning_rate": 1.7293087414755115e-07, + "loss": 0.9813, + "step": 44630 + }, + { + "epoch": 3.4592583982331746, + "grad_norm": 1.3651090376533905, + "learning_rate": 1.7296962182269064e-07, + "loss": 1.0065, + "step": 44640 + }, + { + "epoch": 3.4600333217094814, + "grad_norm": 1.3511599205637466, + "learning_rate": 1.7300836949783014e-07, + "loss": 1.0008, + "step": 44650 + }, + { + "epoch": 3.460808245185788, + "grad_norm": 1.3883256223864378, + "learning_rate": 1.7304711717296964e-07, + "loss": 0.9653, + "step": 44660 + }, + { + "epoch": 3.4615831686620946, + "grad_norm": 1.3519812104714524, + "learning_rate": 1.7308586484810913e-07, + "loss": 1.0011, + "step": 44670 + }, + { + "epoch": 3.4623580921384014, + "grad_norm": 1.2856042583529494, + "learning_rate": 1.7312461252324863e-07, + "loss": 0.9701, + "step": 44680 + }, + { + "epoch": 3.463133015614708, + "grad_norm": 1.265514274883475, + "learning_rate": 1.731633601983881e-07, + "loss": 0.9814, + "step": 44690 + }, + { + "epoch": 3.4639079390910146, + "grad_norm": 1.2920824766236483, + "learning_rate": 1.732021078735276e-07, + "loss": 0.9772, + "step": 44700 + }, + { + "epoch": 3.4646828625673214, + "grad_norm": 1.298354439257108, + "learning_rate": 1.7324085554866709e-07, + "loss": 0.9804, + "step": 44710 + }, + { + "epoch": 3.465457786043628, + "grad_norm": 1.3084854291860402, + "learning_rate": 1.7327960322380658e-07, + "loss": 0.9845, + "step": 44720 + }, + { + "epoch": 3.466232709519935, + "grad_norm": 1.3890202942577028, + "learning_rate": 1.7331835089894608e-07, + "loss": 0.9756, + "step": 44730 + }, + { + "epoch": 3.467007632996242, + "grad_norm": 1.2965246144747775, + "learning_rate": 1.7335709857408557e-07, + "loss": 1.0015, + "step": 44740 + }, + { + "epoch": 3.467782556472548, + "grad_norm": 1.2848578749408055, + "learning_rate": 1.7339584624922507e-07, + "loss": 0.9986, + "step": 44750 + }, + { + "epoch": 3.468557479948855, + "grad_norm": 1.244622071459419, + "learning_rate": 1.7343459392436454e-07, + "loss": 0.9644, + "step": 44760 + }, + { + "epoch": 3.469332403425162, + "grad_norm": 1.320660044558757, + "learning_rate": 1.7347334159950403e-07, + "loss": 1.0158, + "step": 44770 + }, + { + "epoch": 3.4701073269014686, + "grad_norm": 1.2911506619780286, + "learning_rate": 1.7351208927464353e-07, + "loss": 0.9801, + "step": 44780 + }, + { + "epoch": 3.470882250377775, + "grad_norm": 1.3313298382197434, + "learning_rate": 1.7355083694978302e-07, + "loss": 0.985, + "step": 44790 + }, + { + "epoch": 3.471657173854082, + "grad_norm": 1.354068431933999, + "learning_rate": 1.7358958462492252e-07, + "loss": 0.9773, + "step": 44800 + }, + { + "epoch": 3.4724320973303886, + "grad_norm": 1.313063455939057, + "learning_rate": 1.7362833230006202e-07, + "loss": 0.9863, + "step": 44810 + }, + { + "epoch": 3.4732070208066954, + "grad_norm": 1.305800413758639, + "learning_rate": 1.736670799752015e-07, + "loss": 0.9891, + "step": 44820 + }, + { + "epoch": 3.473981944283002, + "grad_norm": 1.3628083540444875, + "learning_rate": 1.7370582765034098e-07, + "loss": 0.9972, + "step": 44830 + }, + { + "epoch": 3.4747568677593086, + "grad_norm": 1.3630078177244762, + "learning_rate": 1.7374457532548048e-07, + "loss": 1.0115, + "step": 44840 + }, + { + "epoch": 3.4755317912356154, + "grad_norm": 1.2860166069907546, + "learning_rate": 1.7378332300061997e-07, + "loss": 0.986, + "step": 44850 + }, + { + "epoch": 3.476306714711922, + "grad_norm": 1.3277421395382396, + "learning_rate": 1.7382207067575947e-07, + "loss": 1.0026, + "step": 44860 + }, + { + "epoch": 3.477081638188229, + "grad_norm": 1.2781047758590363, + "learning_rate": 1.7386081835089896e-07, + "loss": 0.9779, + "step": 44870 + }, + { + "epoch": 3.4778565616645354, + "grad_norm": 1.3651178564256594, + "learning_rate": 1.7389956602603846e-07, + "loss": 0.9866, + "step": 44880 + }, + { + "epoch": 3.478631485140842, + "grad_norm": 1.3692367334871907, + "learning_rate": 1.7393831370117795e-07, + "loss": 1.002, + "step": 44890 + }, + { + "epoch": 3.479406408617149, + "grad_norm": 1.2973170691124685, + "learning_rate": 1.7397706137631742e-07, + "loss": 0.984, + "step": 44900 + }, + { + "epoch": 3.480181332093456, + "grad_norm": 1.2659032508078238, + "learning_rate": 1.7401580905145692e-07, + "loss": 0.9844, + "step": 44910 + }, + { + "epoch": 3.4809562555697626, + "grad_norm": 1.3496555254575089, + "learning_rate": 1.740545567265964e-07, + "loss": 0.978, + "step": 44920 + }, + { + "epoch": 3.4817311790460694, + "grad_norm": 1.390545007230666, + "learning_rate": 1.740933044017359e-07, + "loss": 0.9998, + "step": 44930 + }, + { + "epoch": 3.482506102522376, + "grad_norm": 1.2607703221827091, + "learning_rate": 1.741320520768754e-07, + "loss": 0.9776, + "step": 44940 + }, + { + "epoch": 3.4832810259986826, + "grad_norm": 1.2891819615244748, + "learning_rate": 1.741707997520149e-07, + "loss": 0.9844, + "step": 44950 + }, + { + "epoch": 3.4840559494749894, + "grad_norm": 1.2677276102690231, + "learning_rate": 1.7420954742715437e-07, + "loss": 0.9985, + "step": 44960 + }, + { + "epoch": 3.4848308729512962, + "grad_norm": 1.3186007910382533, + "learning_rate": 1.7424829510229386e-07, + "loss": 0.9875, + "step": 44970 + }, + { + "epoch": 3.4856057964276026, + "grad_norm": 1.3862710773235736, + "learning_rate": 1.7428704277743336e-07, + "loss": 0.9824, + "step": 44980 + }, + { + "epoch": 3.4863807199039094, + "grad_norm": 1.3821966343280903, + "learning_rate": 1.7432579045257285e-07, + "loss": 0.9862, + "step": 44990 + }, + { + "epoch": 3.487155643380216, + "grad_norm": 1.319201143923556, + "learning_rate": 1.7436453812771235e-07, + "loss": 0.9723, + "step": 45000 + }, + { + "epoch": 3.487155643380216, + "eval_loss": 0.9892686009407043, + "eval_runtime": 319.7346, + "eval_samples_per_second": 35.877, + "eval_steps_per_second": 8.97, + "step": 45000 + }, + { + "epoch": 3.487930566856523, + "grad_norm": 1.3012318831108096, + "learning_rate": 1.7440328580285185e-07, + "loss": 0.9961, + "step": 45010 + }, + { + "epoch": 3.48870549033283, + "grad_norm": 1.3827566235063087, + "learning_rate": 1.7444203347799134e-07, + "loss": 0.9823, + "step": 45020 + }, + { + "epoch": 3.489480413809136, + "grad_norm": 1.3002492415045692, + "learning_rate": 1.744807811531308e-07, + "loss": 1.0053, + "step": 45030 + }, + { + "epoch": 3.490255337285443, + "grad_norm": 1.397445514375352, + "learning_rate": 1.745195288282703e-07, + "loss": 0.9973, + "step": 45040 + }, + { + "epoch": 3.49103026076175, + "grad_norm": 1.2804819884946268, + "learning_rate": 1.745582765034098e-07, + "loss": 0.9936, + "step": 45050 + }, + { + "epoch": 3.4918051842380566, + "grad_norm": 1.3834790031883952, + "learning_rate": 1.745970241785493e-07, + "loss": 0.9763, + "step": 45060 + }, + { + "epoch": 3.492580107714363, + "grad_norm": 1.2997365763541653, + "learning_rate": 1.746357718536888e-07, + "loss": 0.9842, + "step": 45070 + }, + { + "epoch": 3.49335503119067, + "grad_norm": 1.286134752394335, + "learning_rate": 1.746745195288283e-07, + "loss": 0.9814, + "step": 45080 + }, + { + "epoch": 3.4941299546669766, + "grad_norm": 1.3589426205794257, + "learning_rate": 1.7471326720396778e-07, + "loss": 1.0024, + "step": 45090 + }, + { + "epoch": 3.4949048781432834, + "grad_norm": 1.2992736657549313, + "learning_rate": 1.7475201487910725e-07, + "loss": 0.9711, + "step": 45100 + }, + { + "epoch": 3.4956798016195902, + "grad_norm": 1.2803372225621361, + "learning_rate": 1.7479076255424675e-07, + "loss": 0.9604, + "step": 45110 + }, + { + "epoch": 3.4964547250958966, + "grad_norm": 1.3878706427529464, + "learning_rate": 1.7482951022938624e-07, + "loss": 0.9941, + "step": 45120 + }, + { + "epoch": 3.4972296485722034, + "grad_norm": 1.3001730296870704, + "learning_rate": 1.7486825790452574e-07, + "loss": 0.9727, + "step": 45130 + }, + { + "epoch": 3.4980045720485102, + "grad_norm": 1.260590098598763, + "learning_rate": 1.7490700557966523e-07, + "loss": 0.9926, + "step": 45140 + }, + { + "epoch": 3.498779495524817, + "grad_norm": 1.3056485375569995, + "learning_rate": 1.7494575325480473e-07, + "loss": 0.9888, + "step": 45150 + }, + { + "epoch": 3.4995544190011234, + "grad_norm": 1.2913118825583514, + "learning_rate": 1.7498450092994423e-07, + "loss": 0.9946, + "step": 45160 + }, + { + "epoch": 3.50032934247743, + "grad_norm": 1.2787593232231529, + "learning_rate": 1.750232486050837e-07, + "loss": 0.9862, + "step": 45170 + }, + { + "epoch": 3.501104265953737, + "grad_norm": 1.3338866490879917, + "learning_rate": 1.750619962802232e-07, + "loss": 1.0218, + "step": 45180 + }, + { + "epoch": 3.501879189430044, + "grad_norm": 1.3581286777081987, + "learning_rate": 1.7510074395536269e-07, + "loss": 0.9915, + "step": 45190 + }, + { + "epoch": 3.5026541129063506, + "grad_norm": 1.4071481971654172, + "learning_rate": 1.7513949163050218e-07, + "loss": 0.9966, + "step": 45200 + }, + { + "epoch": 3.5034290363826575, + "grad_norm": 1.2668831948887336, + "learning_rate": 1.7517823930564168e-07, + "loss": 0.9664, + "step": 45210 + }, + { + "epoch": 3.504203959858964, + "grad_norm": 1.363868826507466, + "learning_rate": 1.7521698698078117e-07, + "loss": 0.9828, + "step": 45220 + }, + { + "epoch": 3.5049788833352706, + "grad_norm": 1.3566249744804895, + "learning_rate": 1.7525573465592067e-07, + "loss": 0.9722, + "step": 45230 + }, + { + "epoch": 3.5057538068115774, + "grad_norm": 1.3145904385282958, + "learning_rate": 1.7529448233106014e-07, + "loss": 0.9974, + "step": 45240 + }, + { + "epoch": 3.506528730287884, + "grad_norm": 1.4879007754134363, + "learning_rate": 1.7533323000619963e-07, + "loss": 1.0106, + "step": 45250 + }, + { + "epoch": 3.5073036537641906, + "grad_norm": 1.4775711080292293, + "learning_rate": 1.7537197768133913e-07, + "loss": 0.9865, + "step": 45260 + }, + { + "epoch": 3.5080785772404974, + "grad_norm": 1.4178508833815824, + "learning_rate": 1.7541072535647862e-07, + "loss": 0.9802, + "step": 45270 + }, + { + "epoch": 3.5088535007168042, + "grad_norm": 1.2626401807991352, + "learning_rate": 1.7544947303161812e-07, + "loss": 0.9739, + "step": 45280 + }, + { + "epoch": 3.509628424193111, + "grad_norm": 1.3621135211479714, + "learning_rate": 1.7548822070675761e-07, + "loss": 0.9956, + "step": 45290 + }, + { + "epoch": 3.510403347669418, + "grad_norm": 1.3339154876389214, + "learning_rate": 1.7552696838189708e-07, + "loss": 0.9937, + "step": 45300 + }, + { + "epoch": 3.5111782711457242, + "grad_norm": 1.3178880055284619, + "learning_rate": 1.7556571605703658e-07, + "loss": 0.9952, + "step": 45310 + }, + { + "epoch": 3.511953194622031, + "grad_norm": 1.329758969873662, + "learning_rate": 1.7560446373217607e-07, + "loss": 1.0155, + "step": 45320 + }, + { + "epoch": 3.512728118098338, + "grad_norm": 1.3327348402732426, + "learning_rate": 1.7564321140731557e-07, + "loss": 0.9862, + "step": 45330 + }, + { + "epoch": 3.5135030415746447, + "grad_norm": 1.315848227282703, + "learning_rate": 1.7568195908245507e-07, + "loss": 1.0026, + "step": 45340 + }, + { + "epoch": 3.514277965050951, + "grad_norm": 1.3992279412573416, + "learning_rate": 1.7572070675759456e-07, + "loss": 0.9842, + "step": 45350 + }, + { + "epoch": 3.515052888527258, + "grad_norm": 1.3923036786786058, + "learning_rate": 1.7575945443273406e-07, + "loss": 1.0168, + "step": 45360 + }, + { + "epoch": 3.5158278120035646, + "grad_norm": 1.3927079908718198, + "learning_rate": 1.7579820210787353e-07, + "loss": 0.9924, + "step": 45370 + }, + { + "epoch": 3.5166027354798715, + "grad_norm": 1.3758371071568776, + "learning_rate": 1.7583694978301302e-07, + "loss": 0.978, + "step": 45380 + }, + { + "epoch": 3.5173776589561783, + "grad_norm": 1.3978529110367153, + "learning_rate": 1.7587569745815252e-07, + "loss": 1.033, + "step": 45390 + }, + { + "epoch": 3.5181525824324846, + "grad_norm": 1.4069965436926148, + "learning_rate": 1.75914445133292e-07, + "loss": 1.0129, + "step": 45400 + }, + { + "epoch": 3.5189275059087914, + "grad_norm": 1.2766452306622413, + "learning_rate": 1.759531928084315e-07, + "loss": 0.9797, + "step": 45410 + }, + { + "epoch": 3.5197024293850983, + "grad_norm": 1.2230697043727097, + "learning_rate": 1.75991940483571e-07, + "loss": 0.9798, + "step": 45420 + }, + { + "epoch": 3.520477352861405, + "grad_norm": 1.3806374802520858, + "learning_rate": 1.760306881587105e-07, + "loss": 0.9972, + "step": 45430 + }, + { + "epoch": 3.5212522763377114, + "grad_norm": 1.4271334215907743, + "learning_rate": 1.7606943583384997e-07, + "loss": 0.9633, + "step": 45440 + }, + { + "epoch": 3.5220271998140182, + "grad_norm": 1.293935656745657, + "learning_rate": 1.7610818350898946e-07, + "loss": 0.9951, + "step": 45450 + }, + { + "epoch": 3.522802123290325, + "grad_norm": 1.2802636429817988, + "learning_rate": 1.7614693118412896e-07, + "loss": 0.9831, + "step": 45460 + }, + { + "epoch": 3.523577046766632, + "grad_norm": 1.2676113655074772, + "learning_rate": 1.7618567885926845e-07, + "loss": 0.9888, + "step": 45470 + }, + { + "epoch": 3.5243519702429387, + "grad_norm": 1.387109832937975, + "learning_rate": 1.7622442653440795e-07, + "loss": 1.0028, + "step": 45480 + }, + { + "epoch": 3.5251268937192455, + "grad_norm": 1.3892529643254787, + "learning_rate": 1.7626317420954745e-07, + "loss": 0.9933, + "step": 45490 + }, + { + "epoch": 3.525901817195552, + "grad_norm": 1.3213859632879432, + "learning_rate": 1.7630192188468694e-07, + "loss": 0.9867, + "step": 45500 + }, + { + "epoch": 3.525901817195552, + "eval_loss": 0.9881995916366577, + "eval_runtime": 319.0602, + "eval_samples_per_second": 35.952, + "eval_steps_per_second": 8.989, + "step": 45500 + }, + { + "epoch": 3.5266767406718587, + "grad_norm": 1.3203792670426666, + "learning_rate": 1.763406695598264e-07, + "loss": 0.9691, + "step": 45510 + }, + { + "epoch": 3.5274516641481655, + "grad_norm": 1.3127137412637688, + "learning_rate": 1.763794172349659e-07, + "loss": 0.9882, + "step": 45520 + }, + { + "epoch": 3.528226587624472, + "grad_norm": 1.3843258988619467, + "learning_rate": 1.764181649101054e-07, + "loss": 0.9609, + "step": 45530 + }, + { + "epoch": 3.5290015111007786, + "grad_norm": 1.3324267785942652, + "learning_rate": 1.764569125852449e-07, + "loss": 0.9939, + "step": 45540 + }, + { + "epoch": 3.5297764345770855, + "grad_norm": 1.3481863655930828, + "learning_rate": 1.764956602603844e-07, + "loss": 0.9869, + "step": 45550 + }, + { + "epoch": 3.5305513580533923, + "grad_norm": 1.308578102475811, + "learning_rate": 1.765344079355239e-07, + "loss": 1.0267, + "step": 45560 + }, + { + "epoch": 3.531326281529699, + "grad_norm": 1.3144451869264986, + "learning_rate": 1.7657315561066338e-07, + "loss": 0.9869, + "step": 45570 + }, + { + "epoch": 3.532101205006006, + "grad_norm": 1.2484777554751745, + "learning_rate": 1.7661190328580285e-07, + "loss": 0.9636, + "step": 45580 + }, + { + "epoch": 3.5328761284823123, + "grad_norm": 1.218577853233205, + "learning_rate": 1.7665065096094235e-07, + "loss": 1.008, + "step": 45590 + }, + { + "epoch": 3.533651051958619, + "grad_norm": 1.441673309695735, + "learning_rate": 1.7668939863608184e-07, + "loss": 0.9893, + "step": 45600 + }, + { + "epoch": 3.534425975434926, + "grad_norm": 1.3881379912573457, + "learning_rate": 1.7672814631122134e-07, + "loss": 0.991, + "step": 45610 + }, + { + "epoch": 3.5352008989112327, + "grad_norm": 1.4146965623358227, + "learning_rate": 1.7676689398636083e-07, + "loss": 0.9917, + "step": 45620 + }, + { + "epoch": 3.535975822387539, + "grad_norm": 1.4051697146849425, + "learning_rate": 1.7680564166150033e-07, + "loss": 0.9892, + "step": 45630 + }, + { + "epoch": 3.536750745863846, + "grad_norm": 1.3270859217737458, + "learning_rate": 1.768443893366398e-07, + "loss": 0.9902, + "step": 45640 + }, + { + "epoch": 3.5375256693401527, + "grad_norm": 1.300626895915161, + "learning_rate": 1.768831370117793e-07, + "loss": 0.9786, + "step": 45650 + }, + { + "epoch": 3.5383005928164595, + "grad_norm": 1.2829350628669198, + "learning_rate": 1.769218846869188e-07, + "loss": 0.9814, + "step": 45660 + }, + { + "epoch": 3.5390755162927663, + "grad_norm": 1.3562777215268416, + "learning_rate": 1.7696063236205828e-07, + "loss": 1.001, + "step": 45670 + }, + { + "epoch": 3.5398504397690727, + "grad_norm": 1.3509680152773393, + "learning_rate": 1.7699938003719778e-07, + "loss": 0.9768, + "step": 45680 + }, + { + "epoch": 3.5406253632453795, + "grad_norm": 1.2599969935794337, + "learning_rate": 1.7703812771233728e-07, + "loss": 1.0033, + "step": 45690 + }, + { + "epoch": 3.5414002867216863, + "grad_norm": 1.3657639861042747, + "learning_rate": 1.7707687538747677e-07, + "loss": 0.9757, + "step": 45700 + }, + { + "epoch": 3.542175210197993, + "grad_norm": 1.25468281596526, + "learning_rate": 1.7711562306261624e-07, + "loss": 0.9646, + "step": 45710 + }, + { + "epoch": 3.5429501336742995, + "grad_norm": 1.3465985838335977, + "learning_rate": 1.7715437073775574e-07, + "loss": 1.0043, + "step": 45720 + }, + { + "epoch": 3.5437250571506063, + "grad_norm": 1.368978228857602, + "learning_rate": 1.7719311841289523e-07, + "loss": 0.9803, + "step": 45730 + }, + { + "epoch": 3.544499980626913, + "grad_norm": 1.2473862218369993, + "learning_rate": 1.7723186608803473e-07, + "loss": 0.983, + "step": 45740 + }, + { + "epoch": 3.54527490410322, + "grad_norm": 1.3697488569606289, + "learning_rate": 1.7727061376317422e-07, + "loss": 0.9972, + "step": 45750 + }, + { + "epoch": 3.5460498275795267, + "grad_norm": 1.3371533613695403, + "learning_rate": 1.7730936143831372e-07, + "loss": 0.9962, + "step": 45760 + }, + { + "epoch": 3.5468247510558335, + "grad_norm": 1.3546749110791447, + "learning_rate": 1.7734810911345321e-07, + "loss": 0.9739, + "step": 45770 + }, + { + "epoch": 3.54759967453214, + "grad_norm": 1.222636902425105, + "learning_rate": 1.7738685678859268e-07, + "loss": 0.9935, + "step": 45780 + }, + { + "epoch": 3.5483745980084467, + "grad_norm": 1.3777302132773934, + "learning_rate": 1.7742560446373218e-07, + "loss": 0.9788, + "step": 45790 + }, + { + "epoch": 3.5491495214847535, + "grad_norm": 1.5707504990122385, + "learning_rate": 1.7746435213887167e-07, + "loss": 0.9726, + "step": 45800 + }, + { + "epoch": 3.54992444496106, + "grad_norm": 1.275329120847897, + "learning_rate": 1.7750309981401117e-07, + "loss": 0.9916, + "step": 45810 + }, + { + "epoch": 3.5506993684373667, + "grad_norm": 1.3673245249543877, + "learning_rate": 1.7754184748915066e-07, + "loss": 1.0091, + "step": 45820 + }, + { + "epoch": 3.5514742919136735, + "grad_norm": 1.2417774733208298, + "learning_rate": 1.7758059516429016e-07, + "loss": 0.9949, + "step": 45830 + }, + { + "epoch": 3.5522492153899803, + "grad_norm": 1.3179258206389064, + "learning_rate": 1.7761934283942966e-07, + "loss": 0.9716, + "step": 45840 + }, + { + "epoch": 3.553024138866287, + "grad_norm": 1.369988672926415, + "learning_rate": 1.7765809051456912e-07, + "loss": 0.9865, + "step": 45850 + }, + { + "epoch": 3.553799062342594, + "grad_norm": 1.4266887602967913, + "learning_rate": 1.7769683818970862e-07, + "loss": 0.9893, + "step": 45860 + }, + { + "epoch": 3.5545739858189003, + "grad_norm": 1.379975660786687, + "learning_rate": 1.7773558586484812e-07, + "loss": 1.0071, + "step": 45870 + }, + { + "epoch": 3.555348909295207, + "grad_norm": 1.3458000837379858, + "learning_rate": 1.777743335399876e-07, + "loss": 0.9792, + "step": 45880 + }, + { + "epoch": 3.556123832771514, + "grad_norm": 1.3421657943652219, + "learning_rate": 1.778130812151271e-07, + "loss": 0.9919, + "step": 45890 + }, + { + "epoch": 3.5568987562478203, + "grad_norm": 1.321216617711226, + "learning_rate": 1.778518288902666e-07, + "loss": 0.989, + "step": 45900 + }, + { + "epoch": 3.557673679724127, + "grad_norm": 1.4106821482774754, + "learning_rate": 1.778905765654061e-07, + "loss": 0.9815, + "step": 45910 + }, + { + "epoch": 3.558448603200434, + "grad_norm": 1.4161215173467114, + "learning_rate": 1.7792932424054557e-07, + "loss": 0.968, + "step": 45920 + }, + { + "epoch": 3.5592235266767407, + "grad_norm": 1.3497460609536567, + "learning_rate": 1.7796807191568506e-07, + "loss": 0.982, + "step": 45930 + }, + { + "epoch": 3.5599984501530475, + "grad_norm": 1.657718796149656, + "learning_rate": 1.7800681959082456e-07, + "loss": 1.021, + "step": 45940 + }, + { + "epoch": 3.5607733736293543, + "grad_norm": 1.2777786390986738, + "learning_rate": 1.7804556726596405e-07, + "loss": 0.9714, + "step": 45950 + }, + { + "epoch": 3.5615482971056607, + "grad_norm": 1.3476652192817409, + "learning_rate": 1.7808431494110355e-07, + "loss": 0.9781, + "step": 45960 + }, + { + "epoch": 3.5623232205819675, + "grad_norm": 1.3169401515343742, + "learning_rate": 1.7812306261624304e-07, + "loss": 0.9945, + "step": 45970 + }, + { + "epoch": 3.5630981440582743, + "grad_norm": 1.6788211830261934, + "learning_rate": 1.7816181029138254e-07, + "loss": 1.0238, + "step": 45980 + }, + { + "epoch": 3.563873067534581, + "grad_norm": 1.2670467944811057, + "learning_rate": 1.78200557966522e-07, + "loss": 0.9817, + "step": 45990 + }, + { + "epoch": 3.5646479910108875, + "grad_norm": 1.446732621453407, + "learning_rate": 1.782393056416615e-07, + "loss": 0.9902, + "step": 46000 + }, + { + "epoch": 3.5646479910108875, + "eval_loss": 0.9871197938919067, + "eval_runtime": 318.8028, + "eval_samples_per_second": 35.981, + "eval_steps_per_second": 8.996, + "step": 46000 + }, + { + "epoch": 3.5654229144871943, + "grad_norm": 1.334287139579663, + "learning_rate": 1.78278053316801e-07, + "loss": 0.9824, + "step": 46010 + }, + { + "epoch": 3.566197837963501, + "grad_norm": 1.2761756347479245, + "learning_rate": 1.783168009919405e-07, + "loss": 0.9861, + "step": 46020 + }, + { + "epoch": 3.566972761439808, + "grad_norm": 1.4158859000002062, + "learning_rate": 1.7835554866708e-07, + "loss": 0.9908, + "step": 46030 + }, + { + "epoch": 3.5677476849161147, + "grad_norm": 1.3292672761863495, + "learning_rate": 1.7839429634221949e-07, + "loss": 0.9761, + "step": 46040 + }, + { + "epoch": 3.5685226083924215, + "grad_norm": 1.3454297500740413, + "learning_rate": 1.7843304401735896e-07, + "loss": 0.9632, + "step": 46050 + }, + { + "epoch": 3.569297531868728, + "grad_norm": 1.3926504883174395, + "learning_rate": 1.7847179169249845e-07, + "loss": 0.989, + "step": 46060 + }, + { + "epoch": 3.5700724553450347, + "grad_norm": 1.445858805077157, + "learning_rate": 1.7851053936763795e-07, + "loss": 0.9927, + "step": 46070 + }, + { + "epoch": 3.5708473788213415, + "grad_norm": 1.336284563542582, + "learning_rate": 1.7854928704277744e-07, + "loss": 0.9799, + "step": 46080 + }, + { + "epoch": 3.571622302297648, + "grad_norm": 1.32966792273405, + "learning_rate": 1.7858803471791694e-07, + "loss": 1.0086, + "step": 46090 + }, + { + "epoch": 3.5723972257739547, + "grad_norm": 1.3421992209346052, + "learning_rate": 1.7862678239305643e-07, + "loss": 0.9837, + "step": 46100 + }, + { + "epoch": 3.5731721492502615, + "grad_norm": 1.3053121911893257, + "learning_rate": 1.7866553006819593e-07, + "loss": 0.9779, + "step": 46110 + }, + { + "epoch": 3.5739470727265683, + "grad_norm": 1.4196146825262952, + "learning_rate": 1.787042777433354e-07, + "loss": 0.9807, + "step": 46120 + }, + { + "epoch": 3.574721996202875, + "grad_norm": 1.395545296795084, + "learning_rate": 1.787430254184749e-07, + "loss": 0.9818, + "step": 46130 + }, + { + "epoch": 3.575496919679182, + "grad_norm": 1.2773122062141755, + "learning_rate": 1.787817730936144e-07, + "loss": 0.9823, + "step": 46140 + }, + { + "epoch": 3.5762718431554883, + "grad_norm": 1.2908997779485498, + "learning_rate": 1.7882052076875388e-07, + "loss": 0.9737, + "step": 46150 + }, + { + "epoch": 3.577046766631795, + "grad_norm": 1.2516031438433186, + "learning_rate": 1.7885926844389338e-07, + "loss": 0.9949, + "step": 46160 + }, + { + "epoch": 3.577821690108102, + "grad_norm": 1.377706461724047, + "learning_rate": 1.7889801611903288e-07, + "loss": 0.9638, + "step": 46170 + }, + { + "epoch": 3.5785966135844083, + "grad_norm": 1.3318241707684948, + "learning_rate": 1.7893676379417237e-07, + "loss": 0.9696, + "step": 46180 + }, + { + "epoch": 3.579371537060715, + "grad_norm": 1.3188544920994698, + "learning_rate": 1.7897551146931184e-07, + "loss": 0.9774, + "step": 46190 + }, + { + "epoch": 3.580146460537022, + "grad_norm": 1.332243695830343, + "learning_rate": 1.7901425914445134e-07, + "loss": 0.9828, + "step": 46200 + }, + { + "epoch": 3.5809213840133287, + "grad_norm": 1.276768162772461, + "learning_rate": 1.7905300681959083e-07, + "loss": 0.9791, + "step": 46210 + }, + { + "epoch": 3.5816963074896355, + "grad_norm": 1.3281773966987136, + "learning_rate": 1.7909175449473033e-07, + "loss": 0.9944, + "step": 46220 + }, + { + "epoch": 3.5824712309659423, + "grad_norm": 1.368358346934314, + "learning_rate": 1.7913050216986982e-07, + "loss": 0.9863, + "step": 46230 + }, + { + "epoch": 3.5832461544422487, + "grad_norm": 1.380695910374209, + "learning_rate": 1.7916924984500932e-07, + "loss": 0.9626, + "step": 46240 + }, + { + "epoch": 3.5840210779185555, + "grad_norm": 1.2768129828109862, + "learning_rate": 1.792079975201488e-07, + "loss": 0.989, + "step": 46250 + }, + { + "epoch": 3.5847960013948623, + "grad_norm": 1.2969283526561035, + "learning_rate": 1.7924674519528828e-07, + "loss": 0.9838, + "step": 46260 + }, + { + "epoch": 3.585570924871169, + "grad_norm": 1.3628298844858457, + "learning_rate": 1.7928549287042778e-07, + "loss": 1.0084, + "step": 46270 + }, + { + "epoch": 3.5863458483474755, + "grad_norm": 1.3125542147893008, + "learning_rate": 1.7932424054556727e-07, + "loss": 0.9791, + "step": 46280 + }, + { + "epoch": 3.5871207718237823, + "grad_norm": 1.2846446699202203, + "learning_rate": 1.7936298822070677e-07, + "loss": 0.9843, + "step": 46290 + }, + { + "epoch": 3.587895695300089, + "grad_norm": 1.2446888576194737, + "learning_rate": 1.7940173589584626e-07, + "loss": 0.9718, + "step": 46300 + }, + { + "epoch": 3.588670618776396, + "grad_norm": 1.3161588446313286, + "learning_rate": 1.7944048357098576e-07, + "loss": 0.9994, + "step": 46310 + }, + { + "epoch": 3.5894455422527027, + "grad_norm": 1.4007418789570447, + "learning_rate": 1.7947923124612525e-07, + "loss": 0.982, + "step": 46320 + }, + { + "epoch": 3.590220465729009, + "grad_norm": 1.2951643921431164, + "learning_rate": 1.7951797892126472e-07, + "loss": 0.9911, + "step": 46330 + }, + { + "epoch": 3.590995389205316, + "grad_norm": 1.4094848247658234, + "learning_rate": 1.7955672659640422e-07, + "loss": 0.9985, + "step": 46340 + }, + { + "epoch": 3.5917703126816227, + "grad_norm": 1.3366553204866083, + "learning_rate": 1.7959547427154371e-07, + "loss": 0.9773, + "step": 46350 + }, + { + "epoch": 3.5925452361579295, + "grad_norm": 1.3706265921187923, + "learning_rate": 1.796342219466832e-07, + "loss": 0.9951, + "step": 46360 + }, + { + "epoch": 3.593320159634236, + "grad_norm": 1.3430480356583325, + "learning_rate": 1.796729696218227e-07, + "loss": 0.9776, + "step": 46370 + }, + { + "epoch": 3.5940950831105427, + "grad_norm": 1.4051215015752612, + "learning_rate": 1.797117172969622e-07, + "loss": 1.0032, + "step": 46380 + }, + { + "epoch": 3.5948700065868495, + "grad_norm": 1.3194546179036561, + "learning_rate": 1.7975046497210167e-07, + "loss": 0.9976, + "step": 46390 + }, + { + "epoch": 3.5956449300631563, + "grad_norm": 1.363728993613837, + "learning_rate": 1.7978921264724117e-07, + "loss": 0.9692, + "step": 46400 + }, + { + "epoch": 3.596419853539463, + "grad_norm": 1.3305878882644233, + "learning_rate": 1.7982796032238066e-07, + "loss": 0.9803, + "step": 46410 + }, + { + "epoch": 3.59719477701577, + "grad_norm": 1.369034089070708, + "learning_rate": 1.7986670799752016e-07, + "loss": 0.9727, + "step": 46420 + }, + { + "epoch": 3.5979697004920763, + "grad_norm": 1.3847539019623067, + "learning_rate": 1.7990545567265965e-07, + "loss": 0.964, + "step": 46430 + }, + { + "epoch": 3.598744623968383, + "grad_norm": 1.3277558324077707, + "learning_rate": 1.7994420334779915e-07, + "loss": 0.9746, + "step": 46440 + }, + { + "epoch": 3.59951954744469, + "grad_norm": 1.2776521195794104, + "learning_rate": 1.7998295102293864e-07, + "loss": 0.9825, + "step": 46450 + }, + { + "epoch": 3.6002944709209963, + "grad_norm": 1.3259240005544568, + "learning_rate": 1.800216986980781e-07, + "loss": 0.9901, + "step": 46460 + }, + { + "epoch": 3.601069394397303, + "grad_norm": 1.3302177020145747, + "learning_rate": 1.800604463732176e-07, + "loss": 0.9761, + "step": 46470 + }, + { + "epoch": 3.60184431787361, + "grad_norm": 1.3990984047272907, + "learning_rate": 1.800991940483571e-07, + "loss": 1.0122, + "step": 46480 + }, + { + "epoch": 3.6026192413499167, + "grad_norm": 1.3086049543139004, + "learning_rate": 1.801379417234966e-07, + "loss": 0.9772, + "step": 46490 + }, + { + "epoch": 3.6033941648262235, + "grad_norm": 1.350833084010442, + "learning_rate": 1.801766893986361e-07, + "loss": 0.9678, + "step": 46500 + }, + { + "epoch": 3.6033941648262235, + "eval_loss": 0.9860332012176514, + "eval_runtime": 319.8349, + "eval_samples_per_second": 35.865, + "eval_steps_per_second": 8.967, + "step": 46500 + }, + { + "epoch": 3.6041690883025304, + "grad_norm": 1.3628699943951972, + "learning_rate": 1.802154370737756e-07, + "loss": 0.9876, + "step": 46510 + }, + { + "epoch": 3.6049440117788367, + "grad_norm": 1.4548296506357368, + "learning_rate": 1.8025418474891509e-07, + "loss": 0.9807, + "step": 46520 + }, + { + "epoch": 3.6057189352551435, + "grad_norm": 1.3560788721006365, + "learning_rate": 1.8029293242405455e-07, + "loss": 0.9943, + "step": 46530 + }, + { + "epoch": 3.6064938587314503, + "grad_norm": 1.3499892488039593, + "learning_rate": 1.8033168009919405e-07, + "loss": 0.9901, + "step": 46540 + }, + { + "epoch": 3.6072687822077567, + "grad_norm": 1.2739294574111244, + "learning_rate": 1.8037042777433355e-07, + "loss": 0.9813, + "step": 46550 + }, + { + "epoch": 3.6080437056840635, + "grad_norm": 1.4372177096574335, + "learning_rate": 1.8040917544947304e-07, + "loss": 0.965, + "step": 46560 + }, + { + "epoch": 3.6088186291603703, + "grad_norm": 1.2955821622300245, + "learning_rate": 1.8044792312461254e-07, + "loss": 0.9697, + "step": 46570 + }, + { + "epoch": 3.609593552636677, + "grad_norm": 1.4542054088566274, + "learning_rate": 1.8048667079975203e-07, + "loss": 0.9718, + "step": 46580 + }, + { + "epoch": 3.610368476112984, + "grad_norm": 1.3056318881997329, + "learning_rate": 1.8052541847489153e-07, + "loss": 0.9823, + "step": 46590 + }, + { + "epoch": 3.6111433995892908, + "grad_norm": 1.3126663441258128, + "learning_rate": 1.80564166150031e-07, + "loss": 1.0002, + "step": 46600 + }, + { + "epoch": 3.611918323065597, + "grad_norm": 1.3170509067804528, + "learning_rate": 1.806029138251705e-07, + "loss": 0.9673, + "step": 46610 + }, + { + "epoch": 3.612693246541904, + "grad_norm": 1.2996709599198117, + "learning_rate": 1.8064166150031e-07, + "loss": 0.9923, + "step": 46620 + }, + { + "epoch": 3.6134681700182107, + "grad_norm": 1.3351140939937827, + "learning_rate": 1.8068040917544948e-07, + "loss": 0.9927, + "step": 46630 + }, + { + "epoch": 3.6142430934945176, + "grad_norm": 1.2987174962357872, + "learning_rate": 1.8071915685058898e-07, + "loss": 0.9997, + "step": 46640 + }, + { + "epoch": 3.615018016970824, + "grad_norm": 1.292366311721395, + "learning_rate": 1.8075790452572847e-07, + "loss": 0.9758, + "step": 46650 + }, + { + "epoch": 3.6157929404471307, + "grad_norm": 1.4208349270798777, + "learning_rate": 1.8079665220086797e-07, + "loss": 0.9884, + "step": 46660 + }, + { + "epoch": 3.6165678639234375, + "grad_norm": 1.3998217872443868, + "learning_rate": 1.8083539987600744e-07, + "loss": 0.9664, + "step": 46670 + }, + { + "epoch": 3.6173427873997444, + "grad_norm": 1.348194967658939, + "learning_rate": 1.8087414755114693e-07, + "loss": 0.9944, + "step": 46680 + }, + { + "epoch": 3.618117710876051, + "grad_norm": 1.3083602956225318, + "learning_rate": 1.8091289522628643e-07, + "loss": 1.0125, + "step": 46690 + }, + { + "epoch": 3.618892634352358, + "grad_norm": 1.4160746170550482, + "learning_rate": 1.8095164290142593e-07, + "loss": 0.9931, + "step": 46700 + }, + { + "epoch": 3.6196675578286643, + "grad_norm": 1.2997866018945115, + "learning_rate": 1.8099039057656542e-07, + "loss": 0.9892, + "step": 46710 + }, + { + "epoch": 3.620442481304971, + "grad_norm": 1.2827270721607706, + "learning_rate": 1.8102913825170492e-07, + "loss": 0.9898, + "step": 46720 + }, + { + "epoch": 3.621217404781278, + "grad_norm": 1.274035617777359, + "learning_rate": 1.810678859268444e-07, + "loss": 0.9688, + "step": 46730 + }, + { + "epoch": 3.6219923282575843, + "grad_norm": 1.3009073240598878, + "learning_rate": 1.8110663360198388e-07, + "loss": 1.0103, + "step": 46740 + }, + { + "epoch": 3.622767251733891, + "grad_norm": 1.431514168476024, + "learning_rate": 1.8114538127712338e-07, + "loss": 1.0, + "step": 46750 + }, + { + "epoch": 3.623542175210198, + "grad_norm": 1.3276683703558372, + "learning_rate": 1.8118412895226287e-07, + "loss": 0.9895, + "step": 46760 + }, + { + "epoch": 3.6243170986865048, + "grad_norm": 1.3714698172189062, + "learning_rate": 1.8122287662740237e-07, + "loss": 0.9955, + "step": 46770 + }, + { + "epoch": 3.6250920221628116, + "grad_norm": 1.3293070312745994, + "learning_rate": 1.8126162430254186e-07, + "loss": 0.9886, + "step": 46780 + }, + { + "epoch": 3.6258669456391184, + "grad_norm": 1.3369308857435491, + "learning_rate": 1.8130037197768136e-07, + "loss": 0.9854, + "step": 46790 + }, + { + "epoch": 3.6266418691154247, + "grad_norm": 1.33654632556845, + "learning_rate": 1.8133911965282083e-07, + "loss": 0.9742, + "step": 46800 + }, + { + "epoch": 3.6274167925917316, + "grad_norm": 1.3575039878349824, + "learning_rate": 1.8137786732796032e-07, + "loss": 0.9897, + "step": 46810 + }, + { + "epoch": 3.6281917160680384, + "grad_norm": 1.3375597838160205, + "learning_rate": 1.8141661500309982e-07, + "loss": 0.9669, + "step": 46820 + }, + { + "epoch": 3.6289666395443447, + "grad_norm": 1.3193313076456967, + "learning_rate": 1.8145536267823931e-07, + "loss": 0.9829, + "step": 46830 + }, + { + "epoch": 3.6297415630206515, + "grad_norm": 1.3626318320292548, + "learning_rate": 1.814941103533788e-07, + "loss": 0.9956, + "step": 46840 + }, + { + "epoch": 3.6305164864969584, + "grad_norm": 1.4024892214406453, + "learning_rate": 1.815328580285183e-07, + "loss": 0.9587, + "step": 46850 + }, + { + "epoch": 3.631291409973265, + "grad_norm": 1.3634020666217161, + "learning_rate": 1.815716057036578e-07, + "loss": 0.9859, + "step": 46860 + }, + { + "epoch": 3.632066333449572, + "grad_norm": 1.2959961337083454, + "learning_rate": 1.8161035337879727e-07, + "loss": 0.9834, + "step": 46870 + }, + { + "epoch": 3.632841256925879, + "grad_norm": 1.3137309056251105, + "learning_rate": 1.8164910105393677e-07, + "loss": 0.9891, + "step": 46880 + }, + { + "epoch": 3.633616180402185, + "grad_norm": 1.4182320961847912, + "learning_rate": 1.8168784872907626e-07, + "loss": 0.9632, + "step": 46890 + }, + { + "epoch": 3.634391103878492, + "grad_norm": 1.426650861971612, + "learning_rate": 1.8172659640421576e-07, + "loss": 0.9661, + "step": 46900 + }, + { + "epoch": 3.6351660273547988, + "grad_norm": 1.3462589928987003, + "learning_rate": 1.8176534407935525e-07, + "loss": 0.9771, + "step": 46910 + }, + { + "epoch": 3.6359409508311056, + "grad_norm": 1.4347426708686926, + "learning_rate": 1.8180409175449475e-07, + "loss": 0.9874, + "step": 46920 + }, + { + "epoch": 3.636715874307412, + "grad_norm": 1.293194699263866, + "learning_rate": 1.8184283942963424e-07, + "loss": 0.9927, + "step": 46930 + }, + { + "epoch": 3.6374907977837188, + "grad_norm": 1.3420312664235006, + "learning_rate": 1.818815871047737e-07, + "loss": 0.9987, + "step": 46940 + }, + { + "epoch": 3.6382657212600256, + "grad_norm": 1.3272907910392506, + "learning_rate": 1.819203347799132e-07, + "loss": 0.9875, + "step": 46950 + }, + { + "epoch": 3.6390406447363324, + "grad_norm": 1.3836431249900831, + "learning_rate": 1.819590824550527e-07, + "loss": 0.9822, + "step": 46960 + }, + { + "epoch": 3.639815568212639, + "grad_norm": 1.376876273207016, + "learning_rate": 1.819978301301922e-07, + "loss": 0.9972, + "step": 46970 + }, + { + "epoch": 3.6405904916889456, + "grad_norm": 1.3722033984699884, + "learning_rate": 1.820365778053317e-07, + "loss": 0.9915, + "step": 46980 + }, + { + "epoch": 3.6413654151652524, + "grad_norm": 1.3035133229686764, + "learning_rate": 1.820753254804712e-07, + "loss": 0.9736, + "step": 46990 + }, + { + "epoch": 3.642140338641559, + "grad_norm": 1.2760580080201263, + "learning_rate": 1.8211407315561068e-07, + "loss": 0.9736, + "step": 47000 + }, + { + "epoch": 3.642140338641559, + "eval_loss": 0.9849308133125305, + "eval_runtime": 318.8607, + "eval_samples_per_second": 35.975, + "eval_steps_per_second": 8.995, + "step": 47000 + }, + { + "epoch": 3.642915262117866, + "grad_norm": 1.2844867976728103, + "learning_rate": 1.8215282083075015e-07, + "loss": 0.9727, + "step": 47010 + }, + { + "epoch": 3.6436901855941723, + "grad_norm": 1.3426161610894103, + "learning_rate": 1.8219156850588965e-07, + "loss": 0.9926, + "step": 47020 + }, + { + "epoch": 3.644465109070479, + "grad_norm": 1.3953662250261778, + "learning_rate": 1.8223031618102914e-07, + "loss": 0.9721, + "step": 47030 + }, + { + "epoch": 3.645240032546786, + "grad_norm": 1.3664141749502565, + "learning_rate": 1.8226906385616864e-07, + "loss": 0.9819, + "step": 47040 + }, + { + "epoch": 3.646014956023093, + "grad_norm": 1.2603964196431041, + "learning_rate": 1.8230781153130814e-07, + "loss": 0.9895, + "step": 47050 + }, + { + "epoch": 3.6467898794993996, + "grad_norm": 1.3549858307179523, + "learning_rate": 1.8234655920644763e-07, + "loss": 1.0174, + "step": 47060 + }, + { + "epoch": 3.6475648029757064, + "grad_norm": 1.4401374813343095, + "learning_rate": 1.8238530688158713e-07, + "loss": 1.0008, + "step": 47070 + }, + { + "epoch": 3.6483397264520128, + "grad_norm": 1.264858529919664, + "learning_rate": 1.824240545567266e-07, + "loss": 0.9664, + "step": 47080 + }, + { + "epoch": 3.6491146499283196, + "grad_norm": 1.2630177655905686, + "learning_rate": 1.824628022318661e-07, + "loss": 0.9838, + "step": 47090 + }, + { + "epoch": 3.6498895734046264, + "grad_norm": 1.3353457223100669, + "learning_rate": 1.825015499070056e-07, + "loss": 0.9887, + "step": 47100 + }, + { + "epoch": 3.6506644968809328, + "grad_norm": 1.347044396319985, + "learning_rate": 1.8254029758214508e-07, + "loss": 0.984, + "step": 47110 + }, + { + "epoch": 3.6514394203572396, + "grad_norm": 1.3573181440293511, + "learning_rate": 1.8257904525728458e-07, + "loss": 0.9727, + "step": 47120 + }, + { + "epoch": 3.6522143438335464, + "grad_norm": 1.3633407869677932, + "learning_rate": 1.8261779293242407e-07, + "loss": 0.9666, + "step": 47130 + }, + { + "epoch": 3.652989267309853, + "grad_norm": 1.331740029101028, + "learning_rate": 1.8265654060756354e-07, + "loss": 0.9896, + "step": 47140 + }, + { + "epoch": 3.65376419078616, + "grad_norm": 1.403458553746562, + "learning_rate": 1.8269528828270304e-07, + "loss": 0.9784, + "step": 47150 + }, + { + "epoch": 3.654539114262467, + "grad_norm": 1.3392799196827754, + "learning_rate": 1.8273403595784253e-07, + "loss": 0.9674, + "step": 47160 + }, + { + "epoch": 3.655314037738773, + "grad_norm": 1.4218699036214095, + "learning_rate": 1.8277278363298203e-07, + "loss": 0.9842, + "step": 47170 + }, + { + "epoch": 3.65608896121508, + "grad_norm": 1.2707788878189352, + "learning_rate": 1.8281153130812152e-07, + "loss": 0.9949, + "step": 47180 + }, + { + "epoch": 3.656863884691387, + "grad_norm": 1.3506925778276242, + "learning_rate": 1.8285027898326102e-07, + "loss": 0.9933, + "step": 47190 + }, + { + "epoch": 3.6576388081676936, + "grad_norm": 1.5158486691049189, + "learning_rate": 1.8288902665840052e-07, + "loss": 0.9776, + "step": 47200 + }, + { + "epoch": 3.658413731644, + "grad_norm": 1.3339561636359565, + "learning_rate": 1.8292777433353998e-07, + "loss": 0.99, + "step": 47210 + }, + { + "epoch": 3.6591886551203068, + "grad_norm": 1.2814529013078215, + "learning_rate": 1.8296652200867948e-07, + "loss": 0.983, + "step": 47220 + }, + { + "epoch": 3.6599635785966136, + "grad_norm": 1.3282191521713607, + "learning_rate": 1.8300526968381898e-07, + "loss": 0.9735, + "step": 47230 + }, + { + "epoch": 3.6607385020729204, + "grad_norm": 1.3618310323169345, + "learning_rate": 1.8304401735895847e-07, + "loss": 0.9918, + "step": 47240 + }, + { + "epoch": 3.661513425549227, + "grad_norm": 1.274877265373535, + "learning_rate": 1.8308276503409797e-07, + "loss": 0.976, + "step": 47250 + }, + { + "epoch": 3.6622883490255336, + "grad_norm": 1.3340305168536177, + "learning_rate": 1.8312151270923746e-07, + "loss": 0.9686, + "step": 47260 + }, + { + "epoch": 3.6630632725018404, + "grad_norm": 1.3788616727192236, + "learning_rate": 1.8316026038437696e-07, + "loss": 0.9655, + "step": 47270 + }, + { + "epoch": 3.663838195978147, + "grad_norm": 1.3893371631316267, + "learning_rate": 1.8319900805951643e-07, + "loss": 0.9918, + "step": 47280 + }, + { + "epoch": 3.664613119454454, + "grad_norm": 1.3169698161439896, + "learning_rate": 1.8323775573465592e-07, + "loss": 0.9689, + "step": 47290 + }, + { + "epoch": 3.6653880429307604, + "grad_norm": 1.2871195975713547, + "learning_rate": 1.8327650340979542e-07, + "loss": 0.9665, + "step": 47300 + }, + { + "epoch": 3.666162966407067, + "grad_norm": 1.3549520806034785, + "learning_rate": 1.833152510849349e-07, + "loss": 1.0086, + "step": 47310 + }, + { + "epoch": 3.666937889883374, + "grad_norm": 1.28450660185871, + "learning_rate": 1.833539987600744e-07, + "loss": 0.9741, + "step": 47320 + }, + { + "epoch": 3.667712813359681, + "grad_norm": 1.2248527364479083, + "learning_rate": 1.833927464352139e-07, + "loss": 0.9868, + "step": 47330 + }, + { + "epoch": 3.6684877368359876, + "grad_norm": 1.3211522346583602, + "learning_rate": 1.834314941103534e-07, + "loss": 1.0041, + "step": 47340 + }, + { + "epoch": 3.6692626603122944, + "grad_norm": 1.465730853130628, + "learning_rate": 1.8347024178549287e-07, + "loss": 1.01, + "step": 47350 + }, + { + "epoch": 3.670037583788601, + "grad_norm": 1.3701175603277063, + "learning_rate": 1.8350898946063236e-07, + "loss": 0.9735, + "step": 47360 + }, + { + "epoch": 3.6708125072649076, + "grad_norm": 1.3687737158158801, + "learning_rate": 1.8354773713577186e-07, + "loss": 0.9882, + "step": 47370 + }, + { + "epoch": 3.6715874307412144, + "grad_norm": 1.2871209853730718, + "learning_rate": 1.8358648481091136e-07, + "loss": 0.9668, + "step": 47380 + }, + { + "epoch": 3.6723623542175208, + "grad_norm": 1.2938465057650876, + "learning_rate": 1.8362523248605085e-07, + "loss": 0.9837, + "step": 47390 + }, + { + "epoch": 3.6731372776938276, + "grad_norm": 1.2959246160374438, + "learning_rate": 1.8366398016119035e-07, + "loss": 0.9973, + "step": 47400 + }, + { + "epoch": 3.6739122011701344, + "grad_norm": 2.3457354301476783, + "learning_rate": 1.8370272783632984e-07, + "loss": 0.9932, + "step": 47410 + }, + { + "epoch": 3.674687124646441, + "grad_norm": 1.361748103496919, + "learning_rate": 1.837414755114693e-07, + "loss": 0.9729, + "step": 47420 + }, + { + "epoch": 3.675462048122748, + "grad_norm": 1.2789652831363003, + "learning_rate": 1.837802231866088e-07, + "loss": 0.9935, + "step": 47430 + }, + { + "epoch": 3.676236971599055, + "grad_norm": 1.3760347729430633, + "learning_rate": 1.838189708617483e-07, + "loss": 1.0028, + "step": 47440 + }, + { + "epoch": 3.677011895075361, + "grad_norm": 1.3749225300183754, + "learning_rate": 1.838577185368878e-07, + "loss": 0.9769, + "step": 47450 + }, + { + "epoch": 3.677786818551668, + "grad_norm": 1.3476373974632418, + "learning_rate": 1.838964662120273e-07, + "loss": 0.9795, + "step": 47460 + }, + { + "epoch": 3.678561742027975, + "grad_norm": 1.3484417825177286, + "learning_rate": 1.839352138871668e-07, + "loss": 0.9749, + "step": 47470 + }, + { + "epoch": 3.679336665504281, + "grad_norm": 1.297957381706856, + "learning_rate": 1.8397396156230626e-07, + "loss": 1.0068, + "step": 47480 + }, + { + "epoch": 3.680111588980588, + "grad_norm": 1.2980140248757743, + "learning_rate": 1.8401270923744575e-07, + "loss": 0.9913, + "step": 47490 + }, + { + "epoch": 3.680886512456895, + "grad_norm": 1.3971502216825173, + "learning_rate": 1.8405145691258525e-07, + "loss": 0.9826, + "step": 47500 + }, + { + "epoch": 3.680886512456895, + "eval_loss": 0.9839414954185486, + "eval_runtime": 319.5355, + "eval_samples_per_second": 35.899, + "eval_steps_per_second": 8.976, + "step": 47500 + }, + { + "epoch": 3.6816614359332016, + "grad_norm": 1.3358689908467483, + "learning_rate": 1.8409020458772474e-07, + "loss": 1.0017, + "step": 47510 + }, + { + "epoch": 3.6824363594095084, + "grad_norm": 1.3219148992757521, + "learning_rate": 1.8412895226286424e-07, + "loss": 0.9765, + "step": 47520 + }, + { + "epoch": 3.6832112828858152, + "grad_norm": 1.3791666821006532, + "learning_rate": 1.8416769993800374e-07, + "loss": 0.9565, + "step": 47530 + }, + { + "epoch": 3.6839862063621216, + "grad_norm": 1.3245977176392065, + "learning_rate": 1.8420644761314323e-07, + "loss": 0.9808, + "step": 47540 + }, + { + "epoch": 3.6847611298384284, + "grad_norm": 1.3965121715663098, + "learning_rate": 1.842451952882827e-07, + "loss": 0.988, + "step": 47550 + }, + { + "epoch": 3.685536053314735, + "grad_norm": 1.3550735602657327, + "learning_rate": 1.842839429634222e-07, + "loss": 0.9982, + "step": 47560 + }, + { + "epoch": 3.686310976791042, + "grad_norm": 1.3725397298531823, + "learning_rate": 1.843226906385617e-07, + "loss": 0.9751, + "step": 47570 + }, + { + "epoch": 3.6870859002673484, + "grad_norm": 1.2937547244021854, + "learning_rate": 1.8436143831370119e-07, + "loss": 0.9924, + "step": 47580 + }, + { + "epoch": 3.687860823743655, + "grad_norm": 1.3353481578430044, + "learning_rate": 1.8440018598884068e-07, + "loss": 0.9871, + "step": 47590 + }, + { + "epoch": 3.688635747219962, + "grad_norm": 1.3702102491771744, + "learning_rate": 1.8443893366398018e-07, + "loss": 1.0039, + "step": 47600 + }, + { + "epoch": 3.689410670696269, + "grad_norm": 1.3827728264128931, + "learning_rate": 1.8447768133911967e-07, + "loss": 0.9777, + "step": 47610 + }, + { + "epoch": 3.6901855941725756, + "grad_norm": 1.3213224082042498, + "learning_rate": 1.8451642901425914e-07, + "loss": 0.9825, + "step": 47620 + }, + { + "epoch": 3.6909605176488824, + "grad_norm": 1.350071174377005, + "learning_rate": 1.8455517668939864e-07, + "loss": 0.9818, + "step": 47630 + }, + { + "epoch": 3.691735441125189, + "grad_norm": 1.3984742389320624, + "learning_rate": 1.8459392436453813e-07, + "loss": 0.9931, + "step": 47640 + }, + { + "epoch": 3.6925103646014956, + "grad_norm": 1.346448037267727, + "learning_rate": 1.8463267203967763e-07, + "loss": 0.9691, + "step": 47650 + }, + { + "epoch": 3.6932852880778024, + "grad_norm": 1.327535409653118, + "learning_rate": 1.8467141971481712e-07, + "loss": 0.9795, + "step": 47660 + }, + { + "epoch": 3.694060211554109, + "grad_norm": 1.3830774965516037, + "learning_rate": 1.8471016738995662e-07, + "loss": 0.9915, + "step": 47670 + }, + { + "epoch": 3.6948351350304156, + "grad_norm": 1.2550700154050785, + "learning_rate": 1.8474891506509611e-07, + "loss": 0.9869, + "step": 47680 + }, + { + "epoch": 3.6956100585067224, + "grad_norm": 1.3189814129755355, + "learning_rate": 1.8478766274023558e-07, + "loss": 0.9855, + "step": 47690 + }, + { + "epoch": 3.6963849819830292, + "grad_norm": 1.3581680545625696, + "learning_rate": 1.8482641041537508e-07, + "loss": 0.9866, + "step": 47700 + }, + { + "epoch": 3.697159905459336, + "grad_norm": 1.3650669509980031, + "learning_rate": 1.8486515809051457e-07, + "loss": 0.9912, + "step": 47710 + }, + { + "epoch": 3.697934828935643, + "grad_norm": 1.361761821927054, + "learning_rate": 1.8490390576565407e-07, + "loss": 1.0217, + "step": 47720 + }, + { + "epoch": 3.698709752411949, + "grad_norm": 1.2857274866806367, + "learning_rate": 1.8494265344079357e-07, + "loss": 0.9775, + "step": 47730 + }, + { + "epoch": 3.699484675888256, + "grad_norm": 1.3656789476169389, + "learning_rate": 1.8498140111593306e-07, + "loss": 0.9926, + "step": 47740 + }, + { + "epoch": 3.700259599364563, + "grad_norm": 1.3270290017670592, + "learning_rate": 1.8502014879107256e-07, + "loss": 0.9606, + "step": 47750 + }, + { + "epoch": 3.701034522840869, + "grad_norm": 1.365164602855101, + "learning_rate": 1.8505889646621203e-07, + "loss": 1.0054, + "step": 47760 + }, + { + "epoch": 3.701809446317176, + "grad_norm": 1.3534870306015003, + "learning_rate": 1.8509764414135152e-07, + "loss": 0.9964, + "step": 47770 + }, + { + "epoch": 3.702584369793483, + "grad_norm": 1.333134030498148, + "learning_rate": 1.8513639181649102e-07, + "loss": 0.989, + "step": 47780 + }, + { + "epoch": 3.7033592932697896, + "grad_norm": 1.2581038097446846, + "learning_rate": 1.851751394916305e-07, + "loss": 0.9667, + "step": 47790 + }, + { + "epoch": 3.7041342167460964, + "grad_norm": 1.2595575118224245, + "learning_rate": 1.8521388716677e-07, + "loss": 0.9742, + "step": 47800 + }, + { + "epoch": 3.7049091402224033, + "grad_norm": 1.290172488718819, + "learning_rate": 1.852526348419095e-07, + "loss": 0.9748, + "step": 47810 + }, + { + "epoch": 3.7056840636987096, + "grad_norm": 1.4412732179439012, + "learning_rate": 1.85291382517049e-07, + "loss": 0.9791, + "step": 47820 + }, + { + "epoch": 3.7064589871750164, + "grad_norm": 1.3503869694167103, + "learning_rate": 1.8533013019218847e-07, + "loss": 0.9702, + "step": 47830 + }, + { + "epoch": 3.7072339106513232, + "grad_norm": 1.275801727249847, + "learning_rate": 1.8536887786732796e-07, + "loss": 0.9782, + "step": 47840 + }, + { + "epoch": 3.70800883412763, + "grad_norm": 1.3487205897274772, + "learning_rate": 1.8540762554246746e-07, + "loss": 0.971, + "step": 47850 + }, + { + "epoch": 3.7087837576039364, + "grad_norm": 1.3664074130731285, + "learning_rate": 1.8544637321760695e-07, + "loss": 0.9972, + "step": 47860 + }, + { + "epoch": 3.7095586810802432, + "grad_norm": 1.3742505344518787, + "learning_rate": 1.8548512089274645e-07, + "loss": 0.9629, + "step": 47870 + }, + { + "epoch": 3.71033360455655, + "grad_norm": 1.3247750614845673, + "learning_rate": 1.8552386856788595e-07, + "loss": 0.9631, + "step": 47880 + }, + { + "epoch": 3.711108528032857, + "grad_norm": 1.294547115006301, + "learning_rate": 1.8556261624302541e-07, + "loss": 0.9827, + "step": 47890 + }, + { + "epoch": 3.7118834515091637, + "grad_norm": 1.3412104884194385, + "learning_rate": 1.856013639181649e-07, + "loss": 0.9902, + "step": 47900 + }, + { + "epoch": 3.71265837498547, + "grad_norm": 1.3413709368857614, + "learning_rate": 1.856401115933044e-07, + "loss": 0.9746, + "step": 47910 + }, + { + "epoch": 3.713433298461777, + "grad_norm": 1.3004136898373202, + "learning_rate": 1.856788592684439e-07, + "loss": 0.9707, + "step": 47920 + }, + { + "epoch": 3.7142082219380836, + "grad_norm": 1.3380114554295075, + "learning_rate": 1.857176069435834e-07, + "loss": 0.9865, + "step": 47930 + }, + { + "epoch": 3.7149831454143905, + "grad_norm": 1.3385687135543538, + "learning_rate": 1.857563546187229e-07, + "loss": 0.9798, + "step": 47940 + }, + { + "epoch": 3.715758068890697, + "grad_norm": 1.3551133190939015, + "learning_rate": 1.857951022938624e-07, + "loss": 0.9814, + "step": 47950 + }, + { + "epoch": 3.7165329923670036, + "grad_norm": 1.3208020387428514, + "learning_rate": 1.8583384996900186e-07, + "loss": 0.9795, + "step": 47960 + }, + { + "epoch": 3.7173079158433104, + "grad_norm": 1.3842810114453896, + "learning_rate": 1.8587259764414135e-07, + "loss": 0.9646, + "step": 47970 + }, + { + "epoch": 3.7180828393196172, + "grad_norm": 1.44191513988947, + "learning_rate": 1.8591134531928085e-07, + "loss": 0.9909, + "step": 47980 + }, + { + "epoch": 3.718857762795924, + "grad_norm": 1.360464776310646, + "learning_rate": 1.8595009299442034e-07, + "loss": 0.9665, + "step": 47990 + }, + { + "epoch": 3.719632686272231, + "grad_norm": 1.3835313472870048, + "learning_rate": 1.8598884066955984e-07, + "loss": 1.0023, + "step": 48000 + }, + { + "epoch": 3.719632686272231, + "eval_loss": 0.9829105138778687, + "eval_runtime": 320.8333, + "eval_samples_per_second": 35.754, + "eval_steps_per_second": 8.939, + "step": 48000 + }, + { + "epoch": 3.7204076097485372, + "grad_norm": 1.3034226755349545, + "learning_rate": 1.8602758834469933e-07, + "loss": 0.9729, + "step": 48010 + }, + { + "epoch": 3.721182533224844, + "grad_norm": 1.2733537104875174, + "learning_rate": 1.8606633601983883e-07, + "loss": 0.9917, + "step": 48020 + }, + { + "epoch": 3.721957456701151, + "grad_norm": 1.3991633762865754, + "learning_rate": 1.861050836949783e-07, + "loss": 0.9697, + "step": 48030 + }, + { + "epoch": 3.7227323801774572, + "grad_norm": 1.3056476112267734, + "learning_rate": 1.861438313701178e-07, + "loss": 0.9831, + "step": 48040 + }, + { + "epoch": 3.723507303653764, + "grad_norm": 1.4491706516468466, + "learning_rate": 1.861825790452573e-07, + "loss": 0.9766, + "step": 48050 + }, + { + "epoch": 3.724282227130071, + "grad_norm": 1.4839076195663288, + "learning_rate": 1.8622132672039679e-07, + "loss": 0.9853, + "step": 48060 + }, + { + "epoch": 3.7250571506063777, + "grad_norm": 1.4230083667298175, + "learning_rate": 1.8626007439553628e-07, + "loss": 1.0084, + "step": 48070 + }, + { + "epoch": 3.7258320740826845, + "grad_norm": 1.3537061713164822, + "learning_rate": 1.8629882207067578e-07, + "loss": 0.9974, + "step": 48080 + }, + { + "epoch": 3.7266069975589913, + "grad_norm": 1.2852439002675387, + "learning_rate": 1.8633756974581527e-07, + "loss": 0.9773, + "step": 48090 + }, + { + "epoch": 3.7273819210352976, + "grad_norm": 1.3890558318177793, + "learning_rate": 1.8637631742095474e-07, + "loss": 0.9786, + "step": 48100 + }, + { + "epoch": 3.7281568445116045, + "grad_norm": 1.345196680577714, + "learning_rate": 1.8641506509609424e-07, + "loss": 0.975, + "step": 48110 + }, + { + "epoch": 3.7289317679879113, + "grad_norm": 1.4190627857857734, + "learning_rate": 1.8645381277123373e-07, + "loss": 1.0126, + "step": 48120 + }, + { + "epoch": 3.729706691464218, + "grad_norm": 1.3621807156834416, + "learning_rate": 1.8649256044637323e-07, + "loss": 0.9813, + "step": 48130 + }, + { + "epoch": 3.7304816149405244, + "grad_norm": 1.3092854716435463, + "learning_rate": 1.8653130812151272e-07, + "loss": 0.9858, + "step": 48140 + }, + { + "epoch": 3.7312565384168312, + "grad_norm": 1.2822701156053684, + "learning_rate": 1.8657005579665222e-07, + "loss": 1.0044, + "step": 48150 + }, + { + "epoch": 3.732031461893138, + "grad_norm": 1.4079114425630548, + "learning_rate": 1.8660880347179171e-07, + "loss": 0.9902, + "step": 48160 + }, + { + "epoch": 3.732806385369445, + "grad_norm": 1.3041056266315156, + "learning_rate": 1.8664755114693118e-07, + "loss": 0.9874, + "step": 48170 + }, + { + "epoch": 3.7335813088457517, + "grad_norm": 1.2790918892792051, + "learning_rate": 1.8668629882207068e-07, + "loss": 0.9631, + "step": 48180 + }, + { + "epoch": 3.734356232322058, + "grad_norm": 1.3397270061235227, + "learning_rate": 1.8672504649721017e-07, + "loss": 0.9764, + "step": 48190 + }, + { + "epoch": 3.735131155798365, + "grad_norm": 1.301059585179528, + "learning_rate": 1.8676379417234967e-07, + "loss": 0.9837, + "step": 48200 + }, + { + "epoch": 3.7359060792746717, + "grad_norm": 1.377266115049535, + "learning_rate": 1.8680254184748916e-07, + "loss": 0.9754, + "step": 48210 + }, + { + "epoch": 3.7366810027509785, + "grad_norm": 1.358641151823036, + "learning_rate": 1.8684128952262866e-07, + "loss": 1.007, + "step": 48220 + }, + { + "epoch": 3.737455926227285, + "grad_norm": 1.37500988032392, + "learning_rate": 1.8688003719776813e-07, + "loss": 0.9831, + "step": 48230 + }, + { + "epoch": 3.7382308497035917, + "grad_norm": 1.3513111710107628, + "learning_rate": 1.8691878487290763e-07, + "loss": 0.9881, + "step": 48240 + }, + { + "epoch": 3.7390057731798985, + "grad_norm": 1.3272261990690641, + "learning_rate": 1.8695753254804712e-07, + "loss": 0.9895, + "step": 48250 + }, + { + "epoch": 3.7397806966562053, + "grad_norm": 1.3844609694198848, + "learning_rate": 1.8699628022318662e-07, + "loss": 0.994, + "step": 48260 + }, + { + "epoch": 3.740555620132512, + "grad_norm": 1.3511596918576756, + "learning_rate": 1.870350278983261e-07, + "loss": 0.9792, + "step": 48270 + }, + { + "epoch": 3.741330543608819, + "grad_norm": 1.3283251343206153, + "learning_rate": 1.870737755734656e-07, + "loss": 0.9788, + "step": 48280 + }, + { + "epoch": 3.7421054670851253, + "grad_norm": 1.4512926997599223, + "learning_rate": 1.871125232486051e-07, + "loss": 0.9783, + "step": 48290 + }, + { + "epoch": 3.742880390561432, + "grad_norm": 1.3992089363806621, + "learning_rate": 1.8715127092374457e-07, + "loss": 0.9746, + "step": 48300 + }, + { + "epoch": 3.743655314037739, + "grad_norm": 1.4842746135110292, + "learning_rate": 1.8719001859888407e-07, + "loss": 0.979, + "step": 48310 + }, + { + "epoch": 3.7444302375140452, + "grad_norm": 1.2809047076817521, + "learning_rate": 1.8722876627402356e-07, + "loss": 0.9798, + "step": 48320 + }, + { + "epoch": 3.745205160990352, + "grad_norm": 1.3321090566150953, + "learning_rate": 1.8726751394916306e-07, + "loss": 0.9847, + "step": 48330 + }, + { + "epoch": 3.745980084466659, + "grad_norm": 1.334602834984618, + "learning_rate": 1.8730626162430255e-07, + "loss": 0.9886, + "step": 48340 + }, + { + "epoch": 3.7467550079429657, + "grad_norm": 1.43245531692451, + "learning_rate": 1.8734500929944205e-07, + "loss": 0.9818, + "step": 48350 + }, + { + "epoch": 3.7475299314192725, + "grad_norm": 1.3871665034613463, + "learning_rate": 1.8738375697458154e-07, + "loss": 0.9727, + "step": 48360 + }, + { + "epoch": 3.7483048548955793, + "grad_norm": 1.3347033679405376, + "learning_rate": 1.8742250464972101e-07, + "loss": 0.9677, + "step": 48370 + }, + { + "epoch": 3.7490797783718857, + "grad_norm": 1.4188405003025328, + "learning_rate": 1.874612523248605e-07, + "loss": 0.9895, + "step": 48380 + }, + { + "epoch": 3.7498547018481925, + "grad_norm": 1.3263445728384275, + "learning_rate": 1.875e-07, + "loss": 0.9902, + "step": 48390 + }, + { + "epoch": 3.7506296253244993, + "grad_norm": 1.3138935916626557, + "learning_rate": 1.875387476751395e-07, + "loss": 0.9597, + "step": 48400 + }, + { + "epoch": 3.7514045488008056, + "grad_norm": 1.3658369685674436, + "learning_rate": 1.87577495350279e-07, + "loss": 0.997, + "step": 48410 + }, + { + "epoch": 3.7521794722771125, + "grad_norm": 1.3223567794572255, + "learning_rate": 1.876162430254185e-07, + "loss": 0.9845, + "step": 48420 + }, + { + "epoch": 3.7529543957534193, + "grad_norm": 1.3265265967194997, + "learning_rate": 1.87654990700558e-07, + "loss": 0.9945, + "step": 48430 + }, + { + "epoch": 3.753729319229726, + "grad_norm": 1.2421235929972256, + "learning_rate": 1.8769373837569746e-07, + "loss": 0.9842, + "step": 48440 + }, + { + "epoch": 3.754504242706033, + "grad_norm": 1.4061142157998678, + "learning_rate": 1.8773248605083695e-07, + "loss": 0.9804, + "step": 48450 + }, + { + "epoch": 3.7552791661823397, + "grad_norm": 1.3444819710025808, + "learning_rate": 1.8777123372597645e-07, + "loss": 0.9824, + "step": 48460 + }, + { + "epoch": 3.756054089658646, + "grad_norm": 1.3940456709863447, + "learning_rate": 1.8780998140111594e-07, + "loss": 0.9808, + "step": 48470 + }, + { + "epoch": 3.756829013134953, + "grad_norm": 1.347571988099293, + "learning_rate": 1.8784872907625544e-07, + "loss": 0.9659, + "step": 48480 + }, + { + "epoch": 3.7576039366112597, + "grad_norm": 1.3383431221191346, + "learning_rate": 1.8788747675139493e-07, + "loss": 1.0059, + "step": 48490 + }, + { + "epoch": 3.7583788600875665, + "grad_norm": 1.3334191946703455, + "learning_rate": 1.8792622442653443e-07, + "loss": 0.9745, + "step": 48500 + }, + { + "epoch": 3.7583788600875665, + "eval_loss": 0.9818470478057861, + "eval_runtime": 319.4615, + "eval_samples_per_second": 35.907, + "eval_steps_per_second": 8.978, + "step": 48500 + }, + { + "epoch": 3.759153783563873, + "grad_norm": 1.3957238307780755, + "learning_rate": 1.879649721016739e-07, + "loss": 0.9948, + "step": 48510 + }, + { + "epoch": 3.7599287070401797, + "grad_norm": 1.3924479454806682, + "learning_rate": 1.880037197768134e-07, + "loss": 0.9847, + "step": 48520 + }, + { + "epoch": 3.7607036305164865, + "grad_norm": 1.3268364573720626, + "learning_rate": 1.880424674519529e-07, + "loss": 0.9969, + "step": 48530 + }, + { + "epoch": 3.7614785539927933, + "grad_norm": 1.2929238904516203, + "learning_rate": 1.8808121512709238e-07, + "loss": 0.9559, + "step": 48540 + }, + { + "epoch": 3.7622534774691, + "grad_norm": 1.2426311637921008, + "learning_rate": 1.8811996280223188e-07, + "loss": 0.9861, + "step": 48550 + }, + { + "epoch": 3.763028400945407, + "grad_norm": 1.3830471984325179, + "learning_rate": 1.8815871047737138e-07, + "loss": 0.9837, + "step": 48560 + }, + { + "epoch": 3.7638033244217133, + "grad_norm": 1.4337646289360817, + "learning_rate": 1.8819745815251084e-07, + "loss": 1.0387, + "step": 48570 + }, + { + "epoch": 3.76457824789802, + "grad_norm": 1.3713571280447443, + "learning_rate": 1.8823620582765034e-07, + "loss": 0.9954, + "step": 48580 + }, + { + "epoch": 3.765353171374327, + "grad_norm": 1.3424522369874063, + "learning_rate": 1.8827495350278984e-07, + "loss": 0.9822, + "step": 48590 + }, + { + "epoch": 3.7661280948506333, + "grad_norm": 1.3905611804920852, + "learning_rate": 1.8831370117792933e-07, + "loss": 0.9861, + "step": 48600 + }, + { + "epoch": 3.76690301832694, + "grad_norm": 1.2518825878876196, + "learning_rate": 1.8835244885306883e-07, + "loss": 0.9869, + "step": 48610 + }, + { + "epoch": 3.767677941803247, + "grad_norm": 1.3833955348125913, + "learning_rate": 1.8839119652820832e-07, + "loss": 0.9863, + "step": 48620 + }, + { + "epoch": 3.7684528652795537, + "grad_norm": 1.3149796421086275, + "learning_rate": 1.8842994420334782e-07, + "loss": 0.9819, + "step": 48630 + }, + { + "epoch": 3.7692277887558605, + "grad_norm": 1.4379069279171965, + "learning_rate": 1.8846869187848729e-07, + "loss": 0.9929, + "step": 48640 + }, + { + "epoch": 3.7700027122321673, + "grad_norm": 1.2692676446082822, + "learning_rate": 1.8850743955362678e-07, + "loss": 0.966, + "step": 48650 + }, + { + "epoch": 3.7707776357084737, + "grad_norm": 1.3561162077349131, + "learning_rate": 1.8854618722876628e-07, + "loss": 0.9886, + "step": 48660 + }, + { + "epoch": 3.7715525591847805, + "grad_norm": 1.3520631112876187, + "learning_rate": 1.8858493490390577e-07, + "loss": 0.971, + "step": 48670 + }, + { + "epoch": 3.7723274826610873, + "grad_norm": 1.422719560353682, + "learning_rate": 1.8862368257904527e-07, + "loss": 0.9906, + "step": 48680 + }, + { + "epoch": 3.7731024061373937, + "grad_norm": 1.3419484328119289, + "learning_rate": 1.8866243025418476e-07, + "loss": 0.9912, + "step": 48690 + }, + { + "epoch": 3.7738773296137005, + "grad_norm": 1.4157874152938659, + "learning_rate": 1.8870117792932426e-07, + "loss": 0.9854, + "step": 48700 + }, + { + "epoch": 3.7746522530900073, + "grad_norm": 1.3374113661195826, + "learning_rate": 1.8873992560446373e-07, + "loss": 0.9364, + "step": 48710 + }, + { + "epoch": 3.775427176566314, + "grad_norm": 1.2783678779365535, + "learning_rate": 1.8877867327960322e-07, + "loss": 0.969, + "step": 48720 + }, + { + "epoch": 3.776202100042621, + "grad_norm": 1.3071041764396185, + "learning_rate": 1.8881742095474272e-07, + "loss": 0.9611, + "step": 48730 + }, + { + "epoch": 3.7769770235189277, + "grad_norm": 1.374345790631067, + "learning_rate": 1.8885616862988222e-07, + "loss": 0.968, + "step": 48740 + }, + { + "epoch": 3.777751946995234, + "grad_norm": 1.314827961827049, + "learning_rate": 1.888949163050217e-07, + "loss": 0.9724, + "step": 48750 + }, + { + "epoch": 3.778526870471541, + "grad_norm": 1.2953528185320888, + "learning_rate": 1.889336639801612e-07, + "loss": 0.9807, + "step": 48760 + }, + { + "epoch": 3.7793017939478477, + "grad_norm": 1.3314989367178518, + "learning_rate": 1.889724116553007e-07, + "loss": 0.9717, + "step": 48770 + }, + { + "epoch": 3.7800767174241545, + "grad_norm": 1.4222421936076548, + "learning_rate": 1.8901115933044017e-07, + "loss": 0.9635, + "step": 48780 + }, + { + "epoch": 3.780851640900461, + "grad_norm": 1.3625816909376822, + "learning_rate": 1.8904990700557967e-07, + "loss": 0.9874, + "step": 48790 + }, + { + "epoch": 3.7816265643767677, + "grad_norm": 1.3404853488953252, + "learning_rate": 1.8908865468071916e-07, + "loss": 0.9912, + "step": 48800 + }, + { + "epoch": 3.7824014878530745, + "grad_norm": 1.329394208149085, + "learning_rate": 1.8912740235585866e-07, + "loss": 0.977, + "step": 48810 + }, + { + "epoch": 3.7831764113293813, + "grad_norm": 1.356771042835295, + "learning_rate": 1.8916615003099815e-07, + "loss": 0.978, + "step": 48820 + }, + { + "epoch": 3.783951334805688, + "grad_norm": 1.37639385458658, + "learning_rate": 1.8920489770613765e-07, + "loss": 0.9955, + "step": 48830 + }, + { + "epoch": 3.7847262582819945, + "grad_norm": 1.3840068301636352, + "learning_rate": 1.8924364538127714e-07, + "loss": 0.9734, + "step": 48840 + }, + { + "epoch": 3.7855011817583013, + "grad_norm": 1.2952328613060613, + "learning_rate": 1.892823930564166e-07, + "loss": 0.975, + "step": 48850 + }, + { + "epoch": 3.786276105234608, + "grad_norm": 1.40279413727796, + "learning_rate": 1.893211407315561e-07, + "loss": 0.9772, + "step": 48860 + }, + { + "epoch": 3.787051028710915, + "grad_norm": 1.3680192069358803, + "learning_rate": 1.893598884066956e-07, + "loss": 0.9822, + "step": 48870 + }, + { + "epoch": 3.7878259521872213, + "grad_norm": 1.3516869354976586, + "learning_rate": 1.893986360818351e-07, + "loss": 0.9698, + "step": 48880 + }, + { + "epoch": 3.788600875663528, + "grad_norm": 1.3495690468260444, + "learning_rate": 1.894373837569746e-07, + "loss": 0.9776, + "step": 48890 + }, + { + "epoch": 3.789375799139835, + "grad_norm": 1.2588303927531213, + "learning_rate": 1.894761314321141e-07, + "loss": 0.9806, + "step": 48900 + }, + { + "epoch": 3.7901507226161417, + "grad_norm": 1.3202245009164015, + "learning_rate": 1.8951487910725359e-07, + "loss": 0.9711, + "step": 48910 + }, + { + "epoch": 3.7909256460924485, + "grad_norm": 1.2570862769421487, + "learning_rate": 1.8955362678239306e-07, + "loss": 1.0096, + "step": 48920 + }, + { + "epoch": 3.7917005695687553, + "grad_norm": 1.2726366715265856, + "learning_rate": 1.8959237445753255e-07, + "loss": 0.9683, + "step": 48930 + }, + { + "epoch": 3.7924754930450617, + "grad_norm": 1.3506809209367907, + "learning_rate": 1.8963112213267205e-07, + "loss": 1.0142, + "step": 48940 + }, + { + "epoch": 3.7932504165213685, + "grad_norm": 1.3360416216367315, + "learning_rate": 1.8966986980781154e-07, + "loss": 0.9708, + "step": 48950 + }, + { + "epoch": 3.7940253399976753, + "grad_norm": 1.3686231175571497, + "learning_rate": 1.8970861748295104e-07, + "loss": 0.9665, + "step": 48960 + }, + { + "epoch": 3.7948002634739817, + "grad_norm": 1.361826987410514, + "learning_rate": 1.8974736515809053e-07, + "loss": 0.9776, + "step": 48970 + }, + { + "epoch": 3.7955751869502885, + "grad_norm": 1.3949075017935118, + "learning_rate": 1.8978611283323e-07, + "loss": 0.9862, + "step": 48980 + }, + { + "epoch": 3.7963501104265953, + "grad_norm": 1.380232830225534, + "learning_rate": 1.898248605083695e-07, + "loss": 0.9821, + "step": 48990 + }, + { + "epoch": 3.797125033902902, + "grad_norm": 1.3754830845678443, + "learning_rate": 1.89863608183509e-07, + "loss": 0.9669, + "step": 49000 + }, + { + "epoch": 3.797125033902902, + "eval_loss": 0.9808372855186462, + "eval_runtime": 318.3421, + "eval_samples_per_second": 36.034, + "eval_steps_per_second": 9.009, + "step": 49000 + }, + { + "epoch": 3.797899957379209, + "grad_norm": 1.4479563071115438, + "learning_rate": 1.899023558586485e-07, + "loss": 0.965, + "step": 49010 + }, + { + "epoch": 3.7986748808555157, + "grad_norm": 1.3829043183153098, + "learning_rate": 1.8994110353378798e-07, + "loss": 0.9735, + "step": 49020 + }, + { + "epoch": 3.799449804331822, + "grad_norm": 1.297742950183815, + "learning_rate": 1.8997985120892748e-07, + "loss": 0.9621, + "step": 49030 + }, + { + "epoch": 3.800224727808129, + "grad_norm": 1.323560353417475, + "learning_rate": 1.9001859888406697e-07, + "loss": 0.9605, + "step": 49040 + }, + { + "epoch": 3.8009996512844357, + "grad_norm": 1.3801606538540125, + "learning_rate": 1.9005734655920644e-07, + "loss": 0.9877, + "step": 49050 + }, + { + "epoch": 3.8017745747607425, + "grad_norm": 1.225538815478001, + "learning_rate": 1.9009609423434594e-07, + "loss": 0.982, + "step": 49060 + }, + { + "epoch": 3.802549498237049, + "grad_norm": 1.286286378848352, + "learning_rate": 1.9013484190948543e-07, + "loss": 0.9614, + "step": 49070 + }, + { + "epoch": 3.8033244217133557, + "grad_norm": 1.4082109980959892, + "learning_rate": 1.9017358958462493e-07, + "loss": 0.9805, + "step": 49080 + }, + { + "epoch": 3.8040993451896625, + "grad_norm": 1.39337673178103, + "learning_rate": 1.9021233725976443e-07, + "loss": 1.0031, + "step": 49090 + }, + { + "epoch": 3.8048742686659693, + "grad_norm": 1.3487196585103742, + "learning_rate": 1.9025108493490392e-07, + "loss": 0.9722, + "step": 49100 + }, + { + "epoch": 3.805649192142276, + "grad_norm": 1.3654787165960853, + "learning_rate": 1.9028983261004342e-07, + "loss": 0.9783, + "step": 49110 + }, + { + "epoch": 3.8064241156185825, + "grad_norm": 1.3082351862416133, + "learning_rate": 1.9032858028518289e-07, + "loss": 0.9838, + "step": 49120 + }, + { + "epoch": 3.8071990390948893, + "grad_norm": 1.3308866174248315, + "learning_rate": 1.9036732796032238e-07, + "loss": 0.96, + "step": 49130 + }, + { + "epoch": 3.807973962571196, + "grad_norm": 1.3156975136557765, + "learning_rate": 1.9040607563546188e-07, + "loss": 0.9928, + "step": 49140 + }, + { + "epoch": 3.808748886047503, + "grad_norm": 1.370003907180856, + "learning_rate": 1.9044482331060137e-07, + "loss": 0.9651, + "step": 49150 + }, + { + "epoch": 3.8095238095238093, + "grad_norm": 1.3945855199993926, + "learning_rate": 1.9048357098574087e-07, + "loss": 0.9827, + "step": 49160 + }, + { + "epoch": 3.810298733000116, + "grad_norm": 1.2280660209381573, + "learning_rate": 1.9052231866088036e-07, + "loss": 0.9959, + "step": 49170 + }, + { + "epoch": 3.811073656476423, + "grad_norm": 1.3424024368028098, + "learning_rate": 1.9056106633601986e-07, + "loss": 0.9873, + "step": 49180 + }, + { + "epoch": 3.8118485799527297, + "grad_norm": 1.2977154493451049, + "learning_rate": 1.9059981401115933e-07, + "loss": 0.9804, + "step": 49190 + }, + { + "epoch": 3.8126235034290366, + "grad_norm": 1.2963982273863077, + "learning_rate": 1.9063856168629882e-07, + "loss": 0.9657, + "step": 49200 + }, + { + "epoch": 3.8133984269053434, + "grad_norm": 1.316629895708269, + "learning_rate": 1.9067730936143832e-07, + "loss": 1.0016, + "step": 49210 + }, + { + "epoch": 3.8141733503816497, + "grad_norm": 1.362675902584929, + "learning_rate": 1.9071605703657781e-07, + "loss": 1.0036, + "step": 49220 + }, + { + "epoch": 3.8149482738579565, + "grad_norm": 1.2990343182670676, + "learning_rate": 1.907548047117173e-07, + "loss": 0.981, + "step": 49230 + }, + { + "epoch": 3.8157231973342634, + "grad_norm": 1.3297944908946226, + "learning_rate": 1.907935523868568e-07, + "loss": 0.9773, + "step": 49240 + }, + { + "epoch": 3.8164981208105697, + "grad_norm": 1.268981882303041, + "learning_rate": 1.908323000619963e-07, + "loss": 0.9656, + "step": 49250 + }, + { + "epoch": 3.8172730442868765, + "grad_norm": 1.3242103996345764, + "learning_rate": 1.9087104773713577e-07, + "loss": 0.9973, + "step": 49260 + }, + { + "epoch": 3.8180479677631833, + "grad_norm": 1.2896971374753143, + "learning_rate": 1.9090979541227527e-07, + "loss": 1.0219, + "step": 49270 + }, + { + "epoch": 3.81882289123949, + "grad_norm": 1.3052659494521766, + "learning_rate": 1.9094854308741476e-07, + "loss": 0.9906, + "step": 49280 + }, + { + "epoch": 3.819597814715797, + "grad_norm": 1.3133635448463556, + "learning_rate": 1.9098729076255426e-07, + "loss": 1.0014, + "step": 49290 + }, + { + "epoch": 3.8203727381921038, + "grad_norm": 1.3800782979863837, + "learning_rate": 1.9102603843769375e-07, + "loss": 0.9667, + "step": 49300 + }, + { + "epoch": 3.82114766166841, + "grad_norm": 1.371843656743846, + "learning_rate": 1.9106478611283325e-07, + "loss": 0.9905, + "step": 49310 + }, + { + "epoch": 3.821922585144717, + "grad_norm": 1.4072004080097236, + "learning_rate": 1.9110353378797272e-07, + "loss": 0.9975, + "step": 49320 + }, + { + "epoch": 3.8226975086210238, + "grad_norm": 1.378770603552566, + "learning_rate": 1.911422814631122e-07, + "loss": 0.9732, + "step": 49330 + }, + { + "epoch": 3.82347243209733, + "grad_norm": 1.3696130568338487, + "learning_rate": 1.911810291382517e-07, + "loss": 0.9837, + "step": 49340 + }, + { + "epoch": 3.824247355573637, + "grad_norm": 1.3920459072402025, + "learning_rate": 1.912197768133912e-07, + "loss": 0.9737, + "step": 49350 + }, + { + "epoch": 3.8250222790499437, + "grad_norm": 1.3834602358634391, + "learning_rate": 1.912585244885307e-07, + "loss": 0.9567, + "step": 49360 + }, + { + "epoch": 3.8257972025262506, + "grad_norm": 1.2542797006597777, + "learning_rate": 1.912972721636702e-07, + "loss": 0.9683, + "step": 49370 + }, + { + "epoch": 3.8265721260025574, + "grad_norm": 1.3865921664102592, + "learning_rate": 1.913360198388097e-07, + "loss": 0.993, + "step": 49380 + }, + { + "epoch": 3.827347049478864, + "grad_norm": 1.374761228509807, + "learning_rate": 1.9137476751394916e-07, + "loss": 0.9489, + "step": 49390 + }, + { + "epoch": 3.8281219729551705, + "grad_norm": 1.3313580910952736, + "learning_rate": 1.9141351518908865e-07, + "loss": 0.9843, + "step": 49400 + }, + { + "epoch": 3.8288968964314773, + "grad_norm": 1.2877778211267672, + "learning_rate": 1.9145226286422815e-07, + "loss": 0.9871, + "step": 49410 + }, + { + "epoch": 3.829671819907784, + "grad_norm": 1.362820805547502, + "learning_rate": 1.9149101053936765e-07, + "loss": 0.9753, + "step": 49420 + }, + { + "epoch": 3.830446743384091, + "grad_norm": 1.2974502225703817, + "learning_rate": 1.9152975821450714e-07, + "loss": 0.9796, + "step": 49430 + }, + { + "epoch": 3.8312216668603973, + "grad_norm": 1.4067263429789052, + "learning_rate": 1.9156850588964664e-07, + "loss": 0.9808, + "step": 49440 + }, + { + "epoch": 3.831996590336704, + "grad_norm": 1.2911947756609548, + "learning_rate": 1.9160725356478613e-07, + "loss": 0.9949, + "step": 49450 + }, + { + "epoch": 3.832771513813011, + "grad_norm": 1.4155127295675558, + "learning_rate": 1.916460012399256e-07, + "loss": 1.0162, + "step": 49460 + }, + { + "epoch": 3.8335464372893178, + "grad_norm": 1.3500558955484296, + "learning_rate": 1.916847489150651e-07, + "loss": 0.9924, + "step": 49470 + }, + { + "epoch": 3.8343213607656246, + "grad_norm": 1.313906955762164, + "learning_rate": 1.917234965902046e-07, + "loss": 0.9718, + "step": 49480 + }, + { + "epoch": 3.8350962842419314, + "grad_norm": 1.3972277273586309, + "learning_rate": 1.917622442653441e-07, + "loss": 0.9853, + "step": 49490 + }, + { + "epoch": 3.8358712077182378, + "grad_norm": 1.345429460368247, + "learning_rate": 1.9180099194048358e-07, + "loss": 0.9677, + "step": 49500 + }, + { + "epoch": 3.8358712077182378, + "eval_loss": 0.9799185991287231, + "eval_runtime": 321.1051, + "eval_samples_per_second": 35.724, + "eval_steps_per_second": 8.932, + "step": 49500 + }, + { + "epoch": 3.8366461311945446, + "grad_norm": 1.238367739272946, + "learning_rate": 1.9183973961562308e-07, + "loss": 0.959, + "step": 49510 + }, + { + "epoch": 3.8374210546708514, + "grad_norm": 1.3713932576162997, + "learning_rate": 1.9187848729076257e-07, + "loss": 0.9814, + "step": 49520 + }, + { + "epoch": 3.8381959781471577, + "grad_norm": 1.2988537624807888, + "learning_rate": 1.9191723496590204e-07, + "loss": 0.978, + "step": 49530 + }, + { + "epoch": 3.8389709016234645, + "grad_norm": 1.363631690323409, + "learning_rate": 1.9195598264104154e-07, + "loss": 0.9843, + "step": 49540 + }, + { + "epoch": 3.8397458250997714, + "grad_norm": 1.368913949295962, + "learning_rate": 1.9199473031618103e-07, + "loss": 0.9681, + "step": 49550 + }, + { + "epoch": 3.840520748576078, + "grad_norm": 1.3748880504512297, + "learning_rate": 1.9203347799132053e-07, + "loss": 0.9847, + "step": 49560 + }, + { + "epoch": 3.841295672052385, + "grad_norm": 1.369799260685543, + "learning_rate": 1.9207222566646002e-07, + "loss": 0.9695, + "step": 49570 + }, + { + "epoch": 3.842070595528692, + "grad_norm": 1.3905890296927677, + "learning_rate": 1.9211097334159952e-07, + "loss": 0.9675, + "step": 49580 + }, + { + "epoch": 3.842845519004998, + "grad_norm": 1.375686475391819, + "learning_rate": 1.9214972101673902e-07, + "loss": 0.9898, + "step": 49590 + }, + { + "epoch": 3.843620442481305, + "grad_norm": 1.3270555154108288, + "learning_rate": 1.9218846869187848e-07, + "loss": 0.9706, + "step": 49600 + }, + { + "epoch": 3.8443953659576118, + "grad_norm": 1.4241635804655144, + "learning_rate": 1.9222721636701798e-07, + "loss": 0.9802, + "step": 49610 + }, + { + "epoch": 3.845170289433918, + "grad_norm": 1.282509191802119, + "learning_rate": 1.9226596404215748e-07, + "loss": 0.9585, + "step": 49620 + }, + { + "epoch": 3.845945212910225, + "grad_norm": 1.378375613915107, + "learning_rate": 1.9230471171729697e-07, + "loss": 0.973, + "step": 49630 + }, + { + "epoch": 3.8467201363865318, + "grad_norm": 1.3762147231250732, + "learning_rate": 1.9234345939243647e-07, + "loss": 0.9595, + "step": 49640 + }, + { + "epoch": 3.8474950598628386, + "grad_norm": 1.3093944408940545, + "learning_rate": 1.9238220706757596e-07, + "loss": 0.9744, + "step": 49650 + }, + { + "epoch": 3.8482699833391454, + "grad_norm": 1.2822197908494921, + "learning_rate": 1.9242095474271543e-07, + "loss": 0.9851, + "step": 49660 + }, + { + "epoch": 3.849044906815452, + "grad_norm": 1.3824444672584137, + "learning_rate": 1.9245970241785493e-07, + "loss": 0.983, + "step": 49670 + }, + { + "epoch": 3.8498198302917586, + "grad_norm": 1.3318701109855473, + "learning_rate": 1.9249845009299442e-07, + "loss": 0.9856, + "step": 49680 + }, + { + "epoch": 3.8505947537680654, + "grad_norm": 1.2442334853303303, + "learning_rate": 1.9253719776813392e-07, + "loss": 0.9824, + "step": 49690 + }, + { + "epoch": 3.851369677244372, + "grad_norm": 1.3260036281086272, + "learning_rate": 1.9257594544327341e-07, + "loss": 0.9571, + "step": 49700 + }, + { + "epoch": 3.852144600720679, + "grad_norm": 1.343114397971746, + "learning_rate": 1.926146931184129e-07, + "loss": 0.9681, + "step": 49710 + }, + { + "epoch": 3.8529195241969854, + "grad_norm": 1.2872459309757844, + "learning_rate": 1.926534407935524e-07, + "loss": 1.003, + "step": 49720 + }, + { + "epoch": 3.853694447673292, + "grad_norm": 1.3883538077922484, + "learning_rate": 1.9269218846869187e-07, + "loss": 0.9977, + "step": 49730 + }, + { + "epoch": 3.854469371149599, + "grad_norm": 1.411724678170223, + "learning_rate": 1.9273093614383137e-07, + "loss": 0.9907, + "step": 49740 + }, + { + "epoch": 3.855244294625906, + "grad_norm": 1.4024818798698022, + "learning_rate": 1.9276968381897086e-07, + "loss": 0.9846, + "step": 49750 + }, + { + "epoch": 3.8560192181022126, + "grad_norm": 1.32593223350215, + "learning_rate": 1.9280843149411036e-07, + "loss": 0.9727, + "step": 49760 + }, + { + "epoch": 3.856794141578519, + "grad_norm": 1.4281345729221413, + "learning_rate": 1.9284717916924986e-07, + "loss": 0.9807, + "step": 49770 + }, + { + "epoch": 3.8575690650548258, + "grad_norm": 1.3426220230034704, + "learning_rate": 1.9288592684438935e-07, + "loss": 0.9497, + "step": 49780 + }, + { + "epoch": 3.8583439885311326, + "grad_norm": 1.3520170374127356, + "learning_rate": 1.9292467451952885e-07, + "loss": 0.9633, + "step": 49790 + }, + { + "epoch": 3.8591189120074394, + "grad_norm": 1.3462525448378588, + "learning_rate": 1.9296342219466832e-07, + "loss": 0.9638, + "step": 49800 + }, + { + "epoch": 3.8598938354837458, + "grad_norm": 1.317141052035205, + "learning_rate": 1.930021698698078e-07, + "loss": 0.9841, + "step": 49810 + }, + { + "epoch": 3.8606687589600526, + "grad_norm": 1.296305368466929, + "learning_rate": 1.930409175449473e-07, + "loss": 1.0126, + "step": 49820 + }, + { + "epoch": 3.8614436824363594, + "grad_norm": 1.323789644026646, + "learning_rate": 1.930796652200868e-07, + "loss": 1.0083, + "step": 49830 + }, + { + "epoch": 3.862218605912666, + "grad_norm": 1.3636728356527532, + "learning_rate": 1.931184128952263e-07, + "loss": 0.9906, + "step": 49840 + }, + { + "epoch": 3.862993529388973, + "grad_norm": 1.327046771072436, + "learning_rate": 1.931571605703658e-07, + "loss": 0.9841, + "step": 49850 + }, + { + "epoch": 3.86376845286528, + "grad_norm": 1.3020227983640527, + "learning_rate": 1.931959082455053e-07, + "loss": 0.9607, + "step": 49860 + }, + { + "epoch": 3.864543376341586, + "grad_norm": 1.4097522543872747, + "learning_rate": 1.9323465592064476e-07, + "loss": 0.9795, + "step": 49870 + }, + { + "epoch": 3.865318299817893, + "grad_norm": 1.3294958446532807, + "learning_rate": 1.9327340359578425e-07, + "loss": 0.9803, + "step": 49880 + }, + { + "epoch": 3.8660932232942, + "grad_norm": 1.3467841822533357, + "learning_rate": 1.9331215127092375e-07, + "loss": 0.966, + "step": 49890 + }, + { + "epoch": 3.866868146770506, + "grad_norm": 1.3018388382691857, + "learning_rate": 1.9335089894606324e-07, + "loss": 0.9549, + "step": 49900 + }, + { + "epoch": 3.867643070246813, + "grad_norm": 1.2689133528103136, + "learning_rate": 1.9338964662120274e-07, + "loss": 0.9879, + "step": 49910 + }, + { + "epoch": 3.86841799372312, + "grad_norm": 1.3219526169621945, + "learning_rate": 1.9342839429634224e-07, + "loss": 0.9779, + "step": 49920 + }, + { + "epoch": 3.8691929171994266, + "grad_norm": 1.353348590284723, + "learning_rate": 1.9346714197148173e-07, + "loss": 0.9746, + "step": 49930 + }, + { + "epoch": 3.8699678406757334, + "grad_norm": 1.385805400445425, + "learning_rate": 1.935058896466212e-07, + "loss": 0.976, + "step": 49940 + }, + { + "epoch": 3.87074276415204, + "grad_norm": 1.3589497049407964, + "learning_rate": 1.935446373217607e-07, + "loss": 0.9669, + "step": 49950 + }, + { + "epoch": 3.8715176876283466, + "grad_norm": 1.3453961683776259, + "learning_rate": 1.935833849969002e-07, + "loss": 1.002, + "step": 49960 + }, + { + "epoch": 3.8722926111046534, + "grad_norm": 1.3454153669874613, + "learning_rate": 1.9362213267203969e-07, + "loss": 0.9805, + "step": 49970 + }, + { + "epoch": 3.87306753458096, + "grad_norm": 1.3840078814151244, + "learning_rate": 1.9366088034717918e-07, + "loss": 0.9715, + "step": 49980 + }, + { + "epoch": 3.8738424580572666, + "grad_norm": 1.383777768093475, + "learning_rate": 1.9369962802231868e-07, + "loss": 0.9733, + "step": 49990 + }, + { + "epoch": 3.8746173815335734, + "grad_norm": 1.337111438223271, + "learning_rate": 1.9373837569745817e-07, + "loss": 0.978, + "step": 50000 + }, + { + "epoch": 3.8746173815335734, + "eval_loss": 0.9789366126060486, + "eval_runtime": 319.682, + "eval_samples_per_second": 35.883, + "eval_steps_per_second": 8.971, + "step": 50000 + }, + { + "epoch": 3.87539230500988, + "grad_norm": 1.3200967755014286, + "learning_rate": 1.9377712337259764e-07, + "loss": 0.9777, + "step": 50010 + }, + { + "epoch": 3.876167228486187, + "grad_norm": 1.2892440086038097, + "learning_rate": 1.9381587104773714e-07, + "loss": 0.9861, + "step": 50020 + }, + { + "epoch": 3.876942151962494, + "grad_norm": 1.3090451114434092, + "learning_rate": 1.9385461872287663e-07, + "loss": 0.9784, + "step": 50030 + }, + { + "epoch": 3.8777170754388006, + "grad_norm": 1.4505621761033525, + "learning_rate": 1.9389336639801613e-07, + "loss": 0.9785, + "step": 50040 + }, + { + "epoch": 3.878491998915107, + "grad_norm": 1.3075827162017104, + "learning_rate": 1.9393211407315562e-07, + "loss": 0.9969, + "step": 50050 + }, + { + "epoch": 3.879266922391414, + "grad_norm": 1.2258400284158104, + "learning_rate": 1.9397086174829512e-07, + "loss": 0.986, + "step": 50060 + }, + { + "epoch": 3.8800418458677206, + "grad_norm": 1.378478071875769, + "learning_rate": 1.940096094234346e-07, + "loss": 0.9846, + "step": 50070 + }, + { + "epoch": 3.8808167693440274, + "grad_norm": 1.3833220196855023, + "learning_rate": 1.9404835709857408e-07, + "loss": 0.9897, + "step": 50080 + }, + { + "epoch": 3.881591692820334, + "grad_norm": 1.371139823084928, + "learning_rate": 1.9408710477371358e-07, + "loss": 0.9759, + "step": 50090 + }, + { + "epoch": 3.8823666162966406, + "grad_norm": 1.3214518485563993, + "learning_rate": 1.9412585244885308e-07, + "loss": 0.9967, + "step": 50100 + }, + { + "epoch": 3.8831415397729474, + "grad_norm": 1.3488552259986777, + "learning_rate": 1.9416460012399257e-07, + "loss": 0.9742, + "step": 50110 + }, + { + "epoch": 3.883916463249254, + "grad_norm": 1.4331245173724392, + "learning_rate": 1.9420334779913207e-07, + "loss": 0.9868, + "step": 50120 + }, + { + "epoch": 3.884691386725561, + "grad_norm": 1.3296911773756022, + "learning_rate": 1.9424209547427156e-07, + "loss": 0.9937, + "step": 50130 + }, + { + "epoch": 3.885466310201868, + "grad_norm": 1.344221191074919, + "learning_rate": 1.9428084314941103e-07, + "loss": 1.0187, + "step": 50140 + }, + { + "epoch": 3.886241233678174, + "grad_norm": 1.319988954681864, + "learning_rate": 1.9431959082455053e-07, + "loss": 0.9761, + "step": 50150 + }, + { + "epoch": 3.887016157154481, + "grad_norm": 1.4382230674930947, + "learning_rate": 1.9435833849969002e-07, + "loss": 1.005, + "step": 50160 + }, + { + "epoch": 3.887791080630788, + "grad_norm": 1.368847483576113, + "learning_rate": 1.9439708617482952e-07, + "loss": 0.9875, + "step": 50170 + }, + { + "epoch": 3.888566004107094, + "grad_norm": 1.2440393211050709, + "learning_rate": 1.94435833849969e-07, + "loss": 0.9788, + "step": 50180 + }, + { + "epoch": 3.889340927583401, + "grad_norm": 1.4439470186084022, + "learning_rate": 1.944745815251085e-07, + "loss": 0.9881, + "step": 50190 + }, + { + "epoch": 3.890115851059708, + "grad_norm": 1.33066208090838, + "learning_rate": 1.94513329200248e-07, + "loss": 0.9634, + "step": 50200 + }, + { + "epoch": 3.8908907745360146, + "grad_norm": 1.3300023699434027, + "learning_rate": 1.9455207687538747e-07, + "loss": 0.9774, + "step": 50210 + }, + { + "epoch": 3.8916656980123214, + "grad_norm": 1.4607378194091505, + "learning_rate": 1.9459082455052697e-07, + "loss": 0.9807, + "step": 50220 + }, + { + "epoch": 3.8924406214886282, + "grad_norm": 1.3762633830118876, + "learning_rate": 1.9462957222566646e-07, + "loss": 0.9791, + "step": 50230 + }, + { + "epoch": 3.8932155449649346, + "grad_norm": 1.3660890901325222, + "learning_rate": 1.9466831990080596e-07, + "loss": 0.9733, + "step": 50240 + }, + { + "epoch": 3.8939904684412414, + "grad_norm": 1.3297483167819455, + "learning_rate": 1.9470706757594545e-07, + "loss": 0.9857, + "step": 50250 + }, + { + "epoch": 3.8947653919175482, + "grad_norm": 1.312051057676693, + "learning_rate": 1.9474581525108495e-07, + "loss": 0.986, + "step": 50260 + }, + { + "epoch": 3.8955403153938546, + "grad_norm": 1.3306131877199494, + "learning_rate": 1.9478456292622445e-07, + "loss": 0.9697, + "step": 50270 + }, + { + "epoch": 3.8963152388701614, + "grad_norm": 1.437248323448263, + "learning_rate": 1.9482331060136391e-07, + "loss": 0.9945, + "step": 50280 + }, + { + "epoch": 3.897090162346468, + "grad_norm": 1.338424331238992, + "learning_rate": 1.948620582765034e-07, + "loss": 0.9905, + "step": 50290 + }, + { + "epoch": 3.897865085822775, + "grad_norm": 1.3123723958978073, + "learning_rate": 1.949008059516429e-07, + "loss": 0.9535, + "step": 50300 + }, + { + "epoch": 3.898640009299082, + "grad_norm": 1.34366760812918, + "learning_rate": 1.949395536267824e-07, + "loss": 0.9945, + "step": 50310 + }, + { + "epoch": 3.8994149327753886, + "grad_norm": 1.380477681676846, + "learning_rate": 1.949783013019219e-07, + "loss": 0.9871, + "step": 50320 + }, + { + "epoch": 3.900189856251695, + "grad_norm": 1.3107660957536194, + "learning_rate": 1.950170489770614e-07, + "loss": 0.998, + "step": 50330 + }, + { + "epoch": 3.900964779728002, + "grad_norm": 1.2592308338572722, + "learning_rate": 1.950557966522009e-07, + "loss": 0.9722, + "step": 50340 + }, + { + "epoch": 3.9017397032043086, + "grad_norm": 1.3712729671399608, + "learning_rate": 1.9509454432734036e-07, + "loss": 0.9621, + "step": 50350 + }, + { + "epoch": 3.9025146266806154, + "grad_norm": 1.3195784062135403, + "learning_rate": 1.9513329200247985e-07, + "loss": 0.9704, + "step": 50360 + }, + { + "epoch": 3.903289550156922, + "grad_norm": 1.2881286416185587, + "learning_rate": 1.9517203967761935e-07, + "loss": 0.9643, + "step": 50370 + }, + { + "epoch": 3.9040644736332286, + "grad_norm": 1.2961549593400294, + "learning_rate": 1.9521078735275884e-07, + "loss": 0.9787, + "step": 50380 + }, + { + "epoch": 3.9048393971095354, + "grad_norm": 1.7619409389179048, + "learning_rate": 1.9524953502789834e-07, + "loss": 0.9991, + "step": 50390 + }, + { + "epoch": 3.9056143205858422, + "grad_norm": 1.3187661342687027, + "learning_rate": 1.9528828270303783e-07, + "loss": 0.9697, + "step": 50400 + }, + { + "epoch": 3.906389244062149, + "grad_norm": 1.3537663657092178, + "learning_rate": 1.953270303781773e-07, + "loss": 0.9757, + "step": 50410 + }, + { + "epoch": 3.9071641675384554, + "grad_norm": 1.3654218782469154, + "learning_rate": 1.953657780533168e-07, + "loss": 0.9758, + "step": 50420 + }, + { + "epoch": 3.9079390910147622, + "grad_norm": 1.2535325666756585, + "learning_rate": 1.954045257284563e-07, + "loss": 0.9819, + "step": 50430 + }, + { + "epoch": 3.908714014491069, + "grad_norm": 1.3935355848838664, + "learning_rate": 1.954432734035958e-07, + "loss": 0.9585, + "step": 50440 + }, + { + "epoch": 3.909488937967376, + "grad_norm": 1.3606800299259028, + "learning_rate": 1.9548202107873529e-07, + "loss": 0.9741, + "step": 50450 + }, + { + "epoch": 3.910263861443682, + "grad_norm": 1.3404262713970228, + "learning_rate": 1.9552076875387478e-07, + "loss": 0.9629, + "step": 50460 + }, + { + "epoch": 3.911038784919989, + "grad_norm": 1.2709238764996045, + "learning_rate": 1.9555951642901428e-07, + "loss": 0.9675, + "step": 50470 + }, + { + "epoch": 3.911813708396296, + "grad_norm": 1.269891846101217, + "learning_rate": 1.9559826410415375e-07, + "loss": 0.9595, + "step": 50480 + }, + { + "epoch": 3.9125886318726026, + "grad_norm": 1.3330841764050676, + "learning_rate": 1.9563701177929324e-07, + "loss": 0.9939, + "step": 50490 + }, + { + "epoch": 3.9133635553489095, + "grad_norm": 1.4260262653537839, + "learning_rate": 1.9567575945443274e-07, + "loss": 0.9881, + "step": 50500 + }, + { + "epoch": 3.9133635553489095, + "eval_loss": 0.9780346155166626, + "eval_runtime": 319.0042, + "eval_samples_per_second": 35.959, + "eval_steps_per_second": 8.99, + "step": 50500 + }, + { + "epoch": 3.9141384788252163, + "grad_norm": 1.2883404794393116, + "learning_rate": 1.9571450712957223e-07, + "loss": 0.9669, + "step": 50510 + }, + { + "epoch": 3.9149134023015226, + "grad_norm": 1.336608100607585, + "learning_rate": 1.9575325480471173e-07, + "loss": 0.9904, + "step": 50520 + }, + { + "epoch": 3.9156883257778294, + "grad_norm": 1.3429074282128184, + "learning_rate": 1.9579200247985122e-07, + "loss": 0.9697, + "step": 50530 + }, + { + "epoch": 3.9164632492541362, + "grad_norm": 1.3492323166368188, + "learning_rate": 1.9583075015499072e-07, + "loss": 0.9975, + "step": 50540 + }, + { + "epoch": 3.9172381727304426, + "grad_norm": 1.3373382933536193, + "learning_rate": 1.958694978301302e-07, + "loss": 0.9818, + "step": 50550 + }, + { + "epoch": 3.9180130962067494, + "grad_norm": 1.4640969102735253, + "learning_rate": 1.9590824550526968e-07, + "loss": 0.9523, + "step": 50560 + }, + { + "epoch": 3.9187880196830562, + "grad_norm": 1.2976188538859792, + "learning_rate": 1.9594699318040918e-07, + "loss": 0.9754, + "step": 50570 + }, + { + "epoch": 3.919562943159363, + "grad_norm": 1.3673481961926441, + "learning_rate": 1.9598574085554867e-07, + "loss": 0.9779, + "step": 50580 + }, + { + "epoch": 3.92033786663567, + "grad_norm": 1.3298955755361443, + "learning_rate": 1.9602448853068817e-07, + "loss": 0.98, + "step": 50590 + }, + { + "epoch": 3.9211127901119767, + "grad_norm": 1.345189165706451, + "learning_rate": 1.9606323620582767e-07, + "loss": 0.9652, + "step": 50600 + }, + { + "epoch": 3.921887713588283, + "grad_norm": 1.2555013270880313, + "learning_rate": 1.9610198388096716e-07, + "loss": 0.9459, + "step": 50610 + }, + { + "epoch": 3.92266263706459, + "grad_norm": 1.2649806277331244, + "learning_rate": 1.9614073155610663e-07, + "loss": 0.9874, + "step": 50620 + }, + { + "epoch": 3.9234375605408967, + "grad_norm": 1.4267951838623485, + "learning_rate": 1.9617947923124613e-07, + "loss": 0.9763, + "step": 50630 + }, + { + "epoch": 3.9242124840172035, + "grad_norm": 1.370986447273656, + "learning_rate": 1.9621822690638562e-07, + "loss": 0.9765, + "step": 50640 + }, + { + "epoch": 3.92498740749351, + "grad_norm": 1.2586005446064905, + "learning_rate": 1.9625697458152512e-07, + "loss": 0.9586, + "step": 50650 + }, + { + "epoch": 3.9257623309698166, + "grad_norm": 1.3784331623582051, + "learning_rate": 1.962957222566646e-07, + "loss": 0.979, + "step": 50660 + }, + { + "epoch": 3.9265372544461234, + "grad_norm": 1.321270079040112, + "learning_rate": 1.963344699318041e-07, + "loss": 0.965, + "step": 50670 + }, + { + "epoch": 3.9273121779224303, + "grad_norm": 1.3535126598272365, + "learning_rate": 1.963732176069436e-07, + "loss": 0.9669, + "step": 50680 + }, + { + "epoch": 3.928087101398737, + "grad_norm": 1.4888627226576843, + "learning_rate": 1.9641196528208307e-07, + "loss": 0.9816, + "step": 50690 + }, + { + "epoch": 3.9288620248750434, + "grad_norm": 1.3405542728563822, + "learning_rate": 1.9645071295722257e-07, + "loss": 0.9728, + "step": 50700 + }, + { + "epoch": 3.9296369483513502, + "grad_norm": 1.4361722593940087, + "learning_rate": 1.9648946063236206e-07, + "loss": 0.9757, + "step": 50710 + }, + { + "epoch": 3.930411871827657, + "grad_norm": 1.370008517021013, + "learning_rate": 1.9652820830750156e-07, + "loss": 0.9813, + "step": 50720 + }, + { + "epoch": 3.931186795303964, + "grad_norm": 1.405149473950075, + "learning_rate": 1.9656695598264105e-07, + "loss": 0.9994, + "step": 50730 + }, + { + "epoch": 3.9319617187802702, + "grad_norm": 1.322648155224663, + "learning_rate": 1.9660570365778055e-07, + "loss": 0.9883, + "step": 50740 + }, + { + "epoch": 3.932736642256577, + "grad_norm": 1.3093027892987106, + "learning_rate": 1.9664445133292005e-07, + "loss": 0.9584, + "step": 50750 + }, + { + "epoch": 3.933511565732884, + "grad_norm": 1.3125558890975346, + "learning_rate": 1.9668319900805951e-07, + "loss": 0.9794, + "step": 50760 + }, + { + "epoch": 3.9342864892091907, + "grad_norm": 1.3758369532449797, + "learning_rate": 1.96721946683199e-07, + "loss": 0.9607, + "step": 50770 + }, + { + "epoch": 3.9350614126854975, + "grad_norm": 1.3057389491959146, + "learning_rate": 1.967606943583385e-07, + "loss": 0.9817, + "step": 50780 + }, + { + "epoch": 3.9358363361618043, + "grad_norm": 1.3627592054929785, + "learning_rate": 1.96799442033478e-07, + "loss": 0.9985, + "step": 50790 + }, + { + "epoch": 3.9366112596381106, + "grad_norm": 1.3031731760316694, + "learning_rate": 1.968381897086175e-07, + "loss": 0.9691, + "step": 50800 + }, + { + "epoch": 3.9373861831144175, + "grad_norm": 1.3400307057997312, + "learning_rate": 1.96876937383757e-07, + "loss": 0.992, + "step": 50810 + }, + { + "epoch": 3.9381611065907243, + "grad_norm": 2.8793241311182327, + "learning_rate": 1.9691568505889646e-07, + "loss": 0.9662, + "step": 50820 + }, + { + "epoch": 3.9389360300670306, + "grad_norm": 1.30157207348781, + "learning_rate": 1.9695443273403596e-07, + "loss": 0.9809, + "step": 50830 + }, + { + "epoch": 3.9397109535433374, + "grad_norm": 1.3404165643517312, + "learning_rate": 1.9699318040917545e-07, + "loss": 0.9775, + "step": 50840 + }, + { + "epoch": 3.9404858770196443, + "grad_norm": 1.3935508185093104, + "learning_rate": 1.9703192808431495e-07, + "loss": 0.9775, + "step": 50850 + }, + { + "epoch": 3.941260800495951, + "grad_norm": 1.2613257967160436, + "learning_rate": 1.9707067575945444e-07, + "loss": 0.975, + "step": 50860 + }, + { + "epoch": 3.942035723972258, + "grad_norm": 1.3860285083308466, + "learning_rate": 1.9710942343459394e-07, + "loss": 0.9808, + "step": 50870 + }, + { + "epoch": 3.9428106474485647, + "grad_norm": 1.418860441804165, + "learning_rate": 1.9714817110973343e-07, + "loss": 0.9678, + "step": 50880 + }, + { + "epoch": 3.943585570924871, + "grad_norm": 1.316059866932129, + "learning_rate": 1.971869187848729e-07, + "loss": 0.9701, + "step": 50890 + }, + { + "epoch": 3.944360494401178, + "grad_norm": 1.3478345260324063, + "learning_rate": 1.972256664600124e-07, + "loss": 0.9948, + "step": 50900 + }, + { + "epoch": 3.9451354178774847, + "grad_norm": 1.3640266143633495, + "learning_rate": 1.972644141351519e-07, + "loss": 0.9651, + "step": 50910 + }, + { + "epoch": 3.945910341353791, + "grad_norm": 1.3157309523611465, + "learning_rate": 1.973031618102914e-07, + "loss": 0.9921, + "step": 50920 + }, + { + "epoch": 3.946685264830098, + "grad_norm": 1.3755770058628938, + "learning_rate": 1.9734190948543088e-07, + "loss": 0.9792, + "step": 50930 + }, + { + "epoch": 3.9474601883064047, + "grad_norm": 1.2905327084599332, + "learning_rate": 1.9738065716057038e-07, + "loss": 0.9983, + "step": 50940 + }, + { + "epoch": 3.9482351117827115, + "grad_norm": 1.3457444878770068, + "learning_rate": 1.9741940483570988e-07, + "loss": 0.9934, + "step": 50950 + }, + { + "epoch": 3.9490100352590183, + "grad_norm": 1.429804110162991, + "learning_rate": 1.9745815251084934e-07, + "loss": 0.9969, + "step": 50960 + }, + { + "epoch": 3.949784958735325, + "grad_norm": 1.2962639633318052, + "learning_rate": 1.9749690018598884e-07, + "loss": 0.9866, + "step": 50970 + }, + { + "epoch": 3.9505598822116315, + "grad_norm": 1.3222901034061942, + "learning_rate": 1.9753564786112834e-07, + "loss": 0.9588, + "step": 50980 + }, + { + "epoch": 3.9513348056879383, + "grad_norm": 1.2846048397817458, + "learning_rate": 1.9757439553626783e-07, + "loss": 0.9561, + "step": 50990 + }, + { + "epoch": 3.952109729164245, + "grad_norm": 1.3056531543052008, + "learning_rate": 1.9761314321140733e-07, + "loss": 0.9709, + "step": 51000 + }, + { + "epoch": 3.952109729164245, + "eval_loss": 0.9770253896713257, + "eval_runtime": 320.7197, + "eval_samples_per_second": 35.766, + "eval_steps_per_second": 8.942, + "step": 51000 + }, + { + "epoch": 3.952884652640552, + "grad_norm": 1.364394360902658, + "learning_rate": 1.9765189088654682e-07, + "loss": 0.9661, + "step": 51010 + }, + { + "epoch": 3.9536595761168583, + "grad_norm": 1.2655164903702385, + "learning_rate": 1.9769063856168632e-07, + "loss": 0.9682, + "step": 51020 + }, + { + "epoch": 3.954434499593165, + "grad_norm": 1.3598457467887684, + "learning_rate": 1.977293862368258e-07, + "loss": 0.9822, + "step": 51030 + }, + { + "epoch": 3.955209423069472, + "grad_norm": 1.266005795691614, + "learning_rate": 1.9776813391196528e-07, + "loss": 0.9711, + "step": 51040 + }, + { + "epoch": 3.9559843465457787, + "grad_norm": 1.456048062349759, + "learning_rate": 1.9780688158710478e-07, + "loss": 0.9779, + "step": 51050 + }, + { + "epoch": 3.9567592700220855, + "grad_norm": 1.3740284680162238, + "learning_rate": 1.9784562926224427e-07, + "loss": 0.9724, + "step": 51060 + }, + { + "epoch": 3.9575341934983923, + "grad_norm": 1.3608919334378382, + "learning_rate": 1.9788437693738377e-07, + "loss": 0.985, + "step": 51070 + }, + { + "epoch": 3.9583091169746987, + "grad_norm": 1.377731805416723, + "learning_rate": 1.9792312461252326e-07, + "loss": 0.9911, + "step": 51080 + }, + { + "epoch": 3.9590840404510055, + "grad_norm": 1.279900617169143, + "learning_rate": 1.9796187228766276e-07, + "loss": 0.963, + "step": 51090 + }, + { + "epoch": 3.9598589639273123, + "grad_norm": 1.426549675642248, + "learning_rate": 1.9800061996280223e-07, + "loss": 0.9773, + "step": 51100 + }, + { + "epoch": 3.9606338874036187, + "grad_norm": 1.3511495622197232, + "learning_rate": 1.9803936763794172e-07, + "loss": 0.9529, + "step": 51110 + }, + { + "epoch": 3.9614088108799255, + "grad_norm": 1.413314309866822, + "learning_rate": 1.9807811531308122e-07, + "loss": 0.9802, + "step": 51120 + }, + { + "epoch": 3.9621837343562323, + "grad_norm": 1.4974137975984794, + "learning_rate": 1.9811686298822072e-07, + "loss": 0.9821, + "step": 51130 + }, + { + "epoch": 3.962958657832539, + "grad_norm": 1.3193726627698263, + "learning_rate": 1.981556106633602e-07, + "loss": 0.9647, + "step": 51140 + }, + { + "epoch": 3.963733581308846, + "grad_norm": 1.3441055047429298, + "learning_rate": 1.981943583384997e-07, + "loss": 0.9726, + "step": 51150 + }, + { + "epoch": 3.9645085047851527, + "grad_norm": 1.3371982229231265, + "learning_rate": 1.9823310601363918e-07, + "loss": 0.9677, + "step": 51160 + }, + { + "epoch": 3.965283428261459, + "grad_norm": 1.3238082021755444, + "learning_rate": 1.9827185368877867e-07, + "loss": 0.9673, + "step": 51170 + }, + { + "epoch": 3.966058351737766, + "grad_norm": 1.3573422483408255, + "learning_rate": 1.9831060136391817e-07, + "loss": 0.9717, + "step": 51180 + }, + { + "epoch": 3.9668332752140727, + "grad_norm": 1.3427276070321905, + "learning_rate": 1.9834934903905766e-07, + "loss": 0.9959, + "step": 51190 + }, + { + "epoch": 3.967608198690379, + "grad_norm": 1.3514750192483527, + "learning_rate": 1.9838809671419716e-07, + "loss": 0.9803, + "step": 51200 + }, + { + "epoch": 3.968383122166686, + "grad_norm": 1.3381659689869192, + "learning_rate": 1.9842684438933665e-07, + "loss": 0.9811, + "step": 51210 + }, + { + "epoch": 3.9691580456429927, + "grad_norm": 1.3430760209764072, + "learning_rate": 1.9846559206447615e-07, + "loss": 0.9902, + "step": 51220 + }, + { + "epoch": 3.9699329691192995, + "grad_norm": 1.4225772545039181, + "learning_rate": 1.9850433973961562e-07, + "loss": 0.9909, + "step": 51230 + }, + { + "epoch": 3.9707078925956063, + "grad_norm": 1.2716652531063195, + "learning_rate": 1.9854308741475511e-07, + "loss": 0.9628, + "step": 51240 + }, + { + "epoch": 3.971482816071913, + "grad_norm": 1.3396705714217798, + "learning_rate": 1.985818350898946e-07, + "loss": 0.969, + "step": 51250 + }, + { + "epoch": 3.9722577395482195, + "grad_norm": 1.2924967993992795, + "learning_rate": 1.986205827650341e-07, + "loss": 0.9838, + "step": 51260 + }, + { + "epoch": 3.9730326630245263, + "grad_norm": 1.3488645711504876, + "learning_rate": 1.986593304401736e-07, + "loss": 0.9562, + "step": 51270 + }, + { + "epoch": 3.973807586500833, + "grad_norm": 1.416193466906704, + "learning_rate": 1.986980781153131e-07, + "loss": 0.9793, + "step": 51280 + }, + { + "epoch": 3.97458250997714, + "grad_norm": 1.4067594154092522, + "learning_rate": 1.987368257904526e-07, + "loss": 0.9697, + "step": 51290 + }, + { + "epoch": 3.9753574334534463, + "grad_norm": 1.3258649827047302, + "learning_rate": 1.9877557346559206e-07, + "loss": 1.0007, + "step": 51300 + }, + { + "epoch": 3.976132356929753, + "grad_norm": 1.347327752859723, + "learning_rate": 1.9881432114073156e-07, + "loss": 0.9947, + "step": 51310 + }, + { + "epoch": 3.97690728040606, + "grad_norm": 1.4482539852600071, + "learning_rate": 1.9885306881587105e-07, + "loss": 0.9877, + "step": 51320 + }, + { + "epoch": 3.9776822038823667, + "grad_norm": 1.3731317249363661, + "learning_rate": 1.9889181649101055e-07, + "loss": 0.9782, + "step": 51330 + }, + { + "epoch": 3.9784571273586735, + "grad_norm": 1.4071584145891725, + "learning_rate": 1.9893056416615004e-07, + "loss": 0.9789, + "step": 51340 + }, + { + "epoch": 3.97923205083498, + "grad_norm": 1.3201957178963242, + "learning_rate": 1.9896931184128954e-07, + "loss": 0.9828, + "step": 51350 + }, + { + "epoch": 3.9800069743112867, + "grad_norm": 1.353057257202535, + "learning_rate": 1.9900805951642903e-07, + "loss": 0.9517, + "step": 51360 + }, + { + "epoch": 3.9807818977875935, + "grad_norm": 1.2834322390987192, + "learning_rate": 1.990468071915685e-07, + "loss": 0.9663, + "step": 51370 + }, + { + "epoch": 3.9815568212639003, + "grad_norm": 1.3606080527517606, + "learning_rate": 1.99085554866708e-07, + "loss": 0.9807, + "step": 51380 + }, + { + "epoch": 3.9823317447402067, + "grad_norm": 1.2504683493166533, + "learning_rate": 1.991243025418475e-07, + "loss": 0.9603, + "step": 51390 + }, + { + "epoch": 3.9831066682165135, + "grad_norm": 1.3611422198977545, + "learning_rate": 1.99163050216987e-07, + "loss": 0.9702, + "step": 51400 + }, + { + "epoch": 3.9838815916928203, + "grad_norm": 1.3425401967975985, + "learning_rate": 1.9920179789212648e-07, + "loss": 0.9788, + "step": 51410 + }, + { + "epoch": 3.984656515169127, + "grad_norm": 1.2774869886515636, + "learning_rate": 1.9924054556726598e-07, + "loss": 0.9611, + "step": 51420 + }, + { + "epoch": 3.985431438645434, + "grad_norm": 1.3158944885376143, + "learning_rate": 1.9927929324240548e-07, + "loss": 0.9926, + "step": 51430 + }, + { + "epoch": 3.9862063621217407, + "grad_norm": 1.281638601524576, + "learning_rate": 1.9931804091754494e-07, + "loss": 0.9742, + "step": 51440 + }, + { + "epoch": 3.986981285598047, + "grad_norm": 1.3834738619019435, + "learning_rate": 1.9935678859268444e-07, + "loss": 0.9766, + "step": 51450 + }, + { + "epoch": 3.987756209074354, + "grad_norm": 1.4104079763608945, + "learning_rate": 1.9939553626782394e-07, + "loss": 0.988, + "step": 51460 + }, + { + "epoch": 3.9885311325506607, + "grad_norm": 1.3501720753393198, + "learning_rate": 1.9943428394296343e-07, + "loss": 0.9707, + "step": 51470 + }, + { + "epoch": 3.989306056026967, + "grad_norm": 1.2963028917130714, + "learning_rate": 1.9947303161810293e-07, + "loss": 0.951, + "step": 51480 + }, + { + "epoch": 3.990080979503274, + "grad_norm": 1.3196818411115527, + "learning_rate": 1.9951177929324242e-07, + "loss": 1.0009, + "step": 51490 + }, + { + "epoch": 3.9908559029795807, + "grad_norm": 1.3609942950381435, + "learning_rate": 1.995505269683819e-07, + "loss": 0.9978, + "step": 51500 + }, + { + "epoch": 3.9908559029795807, + "eval_loss": 0.9761800765991211, + "eval_runtime": 318.8503, + "eval_samples_per_second": 35.976, + "eval_steps_per_second": 8.995, + "step": 51500 + }, + { + "epoch": 3.9916308264558875, + "grad_norm": 1.3700731365581067, + "learning_rate": 1.9958927464352139e-07, + "loss": 0.9698, + "step": 51510 + }, + { + "epoch": 3.9924057499321943, + "grad_norm": 1.3423719017533837, + "learning_rate": 1.9962802231866088e-07, + "loss": 0.957, + "step": 51520 + }, + { + "epoch": 3.993180673408501, + "grad_norm": 1.4151602114293027, + "learning_rate": 1.9966676999380038e-07, + "loss": 0.9744, + "step": 51530 + }, + { + "epoch": 3.9939555968848075, + "grad_norm": 1.3494282951383028, + "learning_rate": 1.9970551766893987e-07, + "loss": 0.9882, + "step": 51540 + }, + { + "epoch": 3.9947305203611143, + "grad_norm": 1.3128934319351386, + "learning_rate": 1.9974426534407937e-07, + "loss": 0.9621, + "step": 51550 + }, + { + "epoch": 3.995505443837421, + "grad_norm": 1.4524443562754217, + "learning_rate": 1.9978301301921886e-07, + "loss": 0.9827, + "step": 51560 + }, + { + "epoch": 3.996280367313728, + "grad_norm": 1.3199028724975437, + "learning_rate": 1.9982176069435833e-07, + "loss": 1.0001, + "step": 51570 + }, + { + "epoch": 3.9970552907900343, + "grad_norm": 1.2935650869120845, + "learning_rate": 1.9986050836949783e-07, + "loss": 0.9704, + "step": 51580 + }, + { + "epoch": 3.997830214266341, + "grad_norm": 1.3311834338728838, + "learning_rate": 1.9989925604463732e-07, + "loss": 0.9738, + "step": 51590 + }, + { + "epoch": 3.998605137742648, + "grad_norm": 1.3400230847715007, + "learning_rate": 1.9993800371977682e-07, + "loss": 0.9655, + "step": 51600 + }, + { + "epoch": 3.9993800612189547, + "grad_norm": 1.3560295174520323, + "learning_rate": 1.9997675139491631e-07, + "loss": 0.9818, + "step": 51610 + }, + { + "epoch": 4.0001549846952615, + "grad_norm": 1.4328012786182942, + "learning_rate": 2.000154990700558e-07, + "loss": 0.9723, + "step": 51620 + }, + { + "epoch": 4.000929908171568, + "grad_norm": 1.3915328984858497, + "learning_rate": 2.000542467451953e-07, + "loss": 0.972, + "step": 51630 + }, + { + "epoch": 4.001704831647875, + "grad_norm": 1.2799308468596544, + "learning_rate": 2.0009299442033477e-07, + "loss": 0.971, + "step": 51640 + }, + { + "epoch": 4.002479755124181, + "grad_norm": 1.2630891096137973, + "learning_rate": 2.0013174209547427e-07, + "loss": 0.9742, + "step": 51650 + }, + { + "epoch": 4.003254678600488, + "grad_norm": 1.4546330300365657, + "learning_rate": 2.0017048977061377e-07, + "loss": 0.968, + "step": 51660 + }, + { + "epoch": 4.004029602076795, + "grad_norm": 1.323297983753045, + "learning_rate": 2.0020923744575326e-07, + "loss": 0.978, + "step": 51670 + }, + { + "epoch": 4.0048045255531015, + "grad_norm": 1.4092725142964595, + "learning_rate": 2.0024798512089276e-07, + "loss": 0.9702, + "step": 51680 + }, + { + "epoch": 4.005579449029408, + "grad_norm": 1.399773076403208, + "learning_rate": 2.0028673279603225e-07, + "loss": 0.9923, + "step": 51690 + }, + { + "epoch": 4.006354372505715, + "grad_norm": 1.2926734156653428, + "learning_rate": 2.0032548047117175e-07, + "loss": 0.9739, + "step": 51700 + }, + { + "epoch": 4.007129295982022, + "grad_norm": 1.3448486576093188, + "learning_rate": 2.0036422814631122e-07, + "loss": 0.9804, + "step": 51710 + }, + { + "epoch": 4.007904219458329, + "grad_norm": 1.2817621736947442, + "learning_rate": 2.004029758214507e-07, + "loss": 0.9667, + "step": 51720 + }, + { + "epoch": 4.008679142934636, + "grad_norm": 1.2963400420014362, + "learning_rate": 2.004417234965902e-07, + "loss": 0.9599, + "step": 51730 + }, + { + "epoch": 4.0094540664109415, + "grad_norm": 1.3755194388414498, + "learning_rate": 2.004804711717297e-07, + "loss": 0.9797, + "step": 51740 + }, + { + "epoch": 4.010228989887248, + "grad_norm": 1.3078302315351504, + "learning_rate": 2.005192188468692e-07, + "loss": 0.9582, + "step": 51750 + }, + { + "epoch": 4.011003913363555, + "grad_norm": 1.3968234770735808, + "learning_rate": 2.005579665220087e-07, + "loss": 0.9652, + "step": 51760 + }, + { + "epoch": 4.011778836839862, + "grad_norm": 1.287298420360258, + "learning_rate": 2.005967141971482e-07, + "loss": 0.9655, + "step": 51770 + }, + { + "epoch": 4.012553760316169, + "grad_norm": 1.4084531800389875, + "learning_rate": 2.0063546187228766e-07, + "loss": 0.979, + "step": 51780 + }, + { + "epoch": 4.0133286837924755, + "grad_norm": 1.3477219959396505, + "learning_rate": 2.0067420954742715e-07, + "loss": 0.9628, + "step": 51790 + }, + { + "epoch": 4.014103607268782, + "grad_norm": 1.299958548986427, + "learning_rate": 2.0071295722256665e-07, + "loss": 0.9671, + "step": 51800 + }, + { + "epoch": 4.014878530745089, + "grad_norm": 1.2955195320334039, + "learning_rate": 2.0075170489770615e-07, + "loss": 0.9716, + "step": 51810 + }, + { + "epoch": 4.015653454221396, + "grad_norm": 1.3177690270855242, + "learning_rate": 2.0079045257284564e-07, + "loss": 0.9405, + "step": 51820 + }, + { + "epoch": 4.016428377697703, + "grad_norm": 1.3271077812424297, + "learning_rate": 2.0082920024798514e-07, + "loss": 0.9496, + "step": 51830 + }, + { + "epoch": 4.017203301174009, + "grad_norm": 1.2992303387093518, + "learning_rate": 2.0086794792312463e-07, + "loss": 0.9488, + "step": 51840 + }, + { + "epoch": 4.0179782246503155, + "grad_norm": 1.3213531697898138, + "learning_rate": 2.009066955982641e-07, + "loss": 0.9697, + "step": 51850 + }, + { + "epoch": 4.018753148126622, + "grad_norm": 1.2398493486583284, + "learning_rate": 2.009454432734036e-07, + "loss": 0.9698, + "step": 51860 + }, + { + "epoch": 4.019528071602929, + "grad_norm": 1.33671829778806, + "learning_rate": 2.009841909485431e-07, + "loss": 0.9881, + "step": 51870 + }, + { + "epoch": 4.020302995079236, + "grad_norm": 1.3212634991428964, + "learning_rate": 2.010229386236826e-07, + "loss": 1.0005, + "step": 51880 + }, + { + "epoch": 4.021077918555543, + "grad_norm": 1.3247216221731517, + "learning_rate": 2.0106168629882208e-07, + "loss": 0.9814, + "step": 51890 + }, + { + "epoch": 4.02185284203185, + "grad_norm": 1.3266812821695704, + "learning_rate": 2.0110043397396158e-07, + "loss": 0.9342, + "step": 51900 + }, + { + "epoch": 4.022627765508156, + "grad_norm": 1.3696824571729753, + "learning_rate": 2.0113918164910105e-07, + "loss": 0.9885, + "step": 51910 + }, + { + "epoch": 4.023402688984463, + "grad_norm": 1.323515788910446, + "learning_rate": 2.0117792932424054e-07, + "loss": 0.9689, + "step": 51920 + }, + { + "epoch": 4.024177612460769, + "grad_norm": 1.319611844309557, + "learning_rate": 2.0121667699938004e-07, + "loss": 0.993, + "step": 51930 + }, + { + "epoch": 4.024952535937076, + "grad_norm": 1.230959877952575, + "learning_rate": 2.0125542467451953e-07, + "loss": 0.975, + "step": 51940 + }, + { + "epoch": 4.025727459413383, + "grad_norm": 1.304726460217917, + "learning_rate": 2.0129417234965903e-07, + "loss": 0.9691, + "step": 51950 + }, + { + "epoch": 4.0265023828896895, + "grad_norm": 1.3616244644683817, + "learning_rate": 2.0133292002479853e-07, + "loss": 0.948, + "step": 51960 + }, + { + "epoch": 4.027277306365996, + "grad_norm": 1.2764438888490846, + "learning_rate": 2.0137166769993802e-07, + "loss": 0.9811, + "step": 51970 + }, + { + "epoch": 4.028052229842303, + "grad_norm": 1.3573205467961968, + "learning_rate": 2.014104153750775e-07, + "loss": 0.9644, + "step": 51980 + }, + { + "epoch": 4.02882715331861, + "grad_norm": 1.3009720432871674, + "learning_rate": 2.0144916305021699e-07, + "loss": 0.9895, + "step": 51990 + }, + { + "epoch": 4.029602076794917, + "grad_norm": 1.3693556157361426, + "learning_rate": 2.0148791072535648e-07, + "loss": 0.9815, + "step": 52000 + }, + { + "epoch": 4.029602076794917, + "eval_loss": 0.9752237796783447, + "eval_runtime": 318.5759, + "eval_samples_per_second": 36.007, + "eval_steps_per_second": 9.003, + "step": 52000 + }, + { + "epoch": 4.030377000271224, + "grad_norm": 1.3457088131319526, + "learning_rate": 2.0152665840049598e-07, + "loss": 0.983, + "step": 52010 + }, + { + "epoch": 4.0311519237475295, + "grad_norm": 1.3451187644752667, + "learning_rate": 2.0156540607563547e-07, + "loss": 1.001, + "step": 52020 + }, + { + "epoch": 4.031926847223836, + "grad_norm": 1.3142079728475506, + "learning_rate": 2.0160415375077497e-07, + "loss": 0.9701, + "step": 52030 + }, + { + "epoch": 4.032701770700143, + "grad_norm": 1.3344851884727769, + "learning_rate": 2.0164290142591446e-07, + "loss": 0.9826, + "step": 52040 + }, + { + "epoch": 4.03347669417645, + "grad_norm": 1.2595215484598408, + "learning_rate": 2.0168164910105393e-07, + "loss": 0.9788, + "step": 52050 + }, + { + "epoch": 4.034251617652757, + "grad_norm": 1.3602199483260742, + "learning_rate": 2.0172039677619343e-07, + "loss": 0.9658, + "step": 52060 + }, + { + "epoch": 4.035026541129064, + "grad_norm": 1.2891789928798052, + "learning_rate": 2.0175914445133292e-07, + "loss": 0.9686, + "step": 52070 + }, + { + "epoch": 4.03580146460537, + "grad_norm": 1.3381795315736071, + "learning_rate": 2.0179789212647242e-07, + "loss": 0.9652, + "step": 52080 + }, + { + "epoch": 4.036576388081677, + "grad_norm": 1.3598629793427346, + "learning_rate": 2.0183663980161191e-07, + "loss": 0.9999, + "step": 52090 + }, + { + "epoch": 4.037351311557984, + "grad_norm": 1.304590371695782, + "learning_rate": 2.018753874767514e-07, + "loss": 0.9572, + "step": 52100 + }, + { + "epoch": 4.038126235034291, + "grad_norm": 1.3630214913139918, + "learning_rate": 2.019141351518909e-07, + "loss": 0.9623, + "step": 52110 + }, + { + "epoch": 4.038901158510597, + "grad_norm": 1.3101972753746782, + "learning_rate": 2.0195288282703037e-07, + "loss": 0.954, + "step": 52120 + }, + { + "epoch": 4.0396760819869035, + "grad_norm": 1.2596595732658697, + "learning_rate": 2.0199163050216987e-07, + "loss": 0.9754, + "step": 52130 + }, + { + "epoch": 4.04045100546321, + "grad_norm": 1.2672328299351179, + "learning_rate": 2.0203037817730937e-07, + "loss": 0.9639, + "step": 52140 + }, + { + "epoch": 4.041225928939517, + "grad_norm": 1.324316665764637, + "learning_rate": 2.0206912585244886e-07, + "loss": 0.9971, + "step": 52150 + }, + { + "epoch": 4.042000852415824, + "grad_norm": 1.3555308930016436, + "learning_rate": 2.0210787352758836e-07, + "loss": 0.9714, + "step": 52160 + }, + { + "epoch": 4.042775775892131, + "grad_norm": 1.4575846514086546, + "learning_rate": 2.0214662120272785e-07, + "loss": 1.0032, + "step": 52170 + }, + { + "epoch": 4.043550699368438, + "grad_norm": 1.3851943930216506, + "learning_rate": 2.0218536887786735e-07, + "loss": 0.9721, + "step": 52180 + }, + { + "epoch": 4.044325622844744, + "grad_norm": 2.491645554371904, + "learning_rate": 2.0222411655300682e-07, + "loss": 0.9706, + "step": 52190 + }, + { + "epoch": 4.045100546321051, + "grad_norm": 1.265376113453413, + "learning_rate": 2.022628642281463e-07, + "loss": 0.9623, + "step": 52200 + }, + { + "epoch": 4.045875469797357, + "grad_norm": 1.3678249439296537, + "learning_rate": 2.023016119032858e-07, + "loss": 0.958, + "step": 52210 + }, + { + "epoch": 4.046650393273664, + "grad_norm": 1.3377135188068345, + "learning_rate": 2.023403595784253e-07, + "loss": 0.9829, + "step": 52220 + }, + { + "epoch": 4.047425316749971, + "grad_norm": 1.3707726164653842, + "learning_rate": 2.023791072535648e-07, + "loss": 0.9729, + "step": 52230 + }, + { + "epoch": 4.048200240226278, + "grad_norm": 1.3264584277679163, + "learning_rate": 2.024178549287043e-07, + "loss": 0.9874, + "step": 52240 + }, + { + "epoch": 4.048975163702584, + "grad_norm": 1.271980296034054, + "learning_rate": 2.0245660260384376e-07, + "loss": 0.9614, + "step": 52250 + }, + { + "epoch": 4.049750087178891, + "grad_norm": 1.3517432444650173, + "learning_rate": 2.0249535027898326e-07, + "loss": 0.9798, + "step": 52260 + }, + { + "epoch": 4.050525010655198, + "grad_norm": 1.3685337184421849, + "learning_rate": 2.0253409795412275e-07, + "loss": 0.9616, + "step": 52270 + }, + { + "epoch": 4.051299934131505, + "grad_norm": 1.4490752002196043, + "learning_rate": 2.0257284562926225e-07, + "loss": 0.9692, + "step": 52280 + }, + { + "epoch": 4.052074857607812, + "grad_norm": 1.3227643885380684, + "learning_rate": 2.0261159330440174e-07, + "loss": 0.9713, + "step": 52290 + }, + { + "epoch": 4.0528497810841175, + "grad_norm": 1.3793726557318868, + "learning_rate": 2.0265034097954124e-07, + "loss": 0.9802, + "step": 52300 + }, + { + "epoch": 4.053624704560424, + "grad_norm": 1.304171924597438, + "learning_rate": 2.0268908865468074e-07, + "loss": 0.9802, + "step": 52310 + }, + { + "epoch": 4.054399628036731, + "grad_norm": 1.2683082546090305, + "learning_rate": 2.027278363298202e-07, + "loss": 0.9738, + "step": 52320 + }, + { + "epoch": 4.055174551513038, + "grad_norm": 1.4216655474167927, + "learning_rate": 2.027665840049597e-07, + "loss": 0.9694, + "step": 52330 + }, + { + "epoch": 4.055949474989345, + "grad_norm": 1.2487118250407863, + "learning_rate": 2.028053316800992e-07, + "loss": 0.9613, + "step": 52340 + }, + { + "epoch": 4.056724398465652, + "grad_norm": 1.3354621618101203, + "learning_rate": 2.028440793552387e-07, + "loss": 0.9728, + "step": 52350 + }, + { + "epoch": 4.057499321941958, + "grad_norm": 1.3039262493565045, + "learning_rate": 2.028828270303782e-07, + "loss": 0.9707, + "step": 52360 + }, + { + "epoch": 4.058274245418265, + "grad_norm": 1.3598428770901905, + "learning_rate": 2.0292157470551768e-07, + "loss": 0.9802, + "step": 52370 + }, + { + "epoch": 4.059049168894572, + "grad_norm": 1.355760166402162, + "learning_rate": 2.0296032238065718e-07, + "loss": 0.9911, + "step": 52380 + }, + { + "epoch": 4.059824092370878, + "grad_norm": 1.3565784752846044, + "learning_rate": 2.0299907005579665e-07, + "loss": 0.9439, + "step": 52390 + }, + { + "epoch": 4.060599015847185, + "grad_norm": 1.3502952365832048, + "learning_rate": 2.0303781773093614e-07, + "loss": 0.9788, + "step": 52400 + }, + { + "epoch": 4.0613739393234916, + "grad_norm": 1.474396249550665, + "learning_rate": 2.0307656540607564e-07, + "loss": 0.9847, + "step": 52410 + }, + { + "epoch": 4.062148862799798, + "grad_norm": 1.365490632958505, + "learning_rate": 2.0311531308121513e-07, + "loss": 0.9726, + "step": 52420 + }, + { + "epoch": 4.062923786276105, + "grad_norm": 1.3120011308873971, + "learning_rate": 2.0315406075635463e-07, + "loss": 0.9768, + "step": 52430 + }, + { + "epoch": 4.063698709752412, + "grad_norm": 1.3737837058048623, + "learning_rate": 2.0319280843149412e-07, + "loss": 0.9705, + "step": 52440 + }, + { + "epoch": 4.064473633228719, + "grad_norm": 1.4200423831881928, + "learning_rate": 2.0323155610663362e-07, + "loss": 0.9652, + "step": 52450 + }, + { + "epoch": 4.065248556705026, + "grad_norm": 1.3202504914027775, + "learning_rate": 2.032703037817731e-07, + "loss": 0.9613, + "step": 52460 + }, + { + "epoch": 4.066023480181332, + "grad_norm": 1.3069291721551524, + "learning_rate": 2.0330905145691258e-07, + "loss": 0.9616, + "step": 52470 + }, + { + "epoch": 4.066798403657639, + "grad_norm": 1.7657543857448712, + "learning_rate": 2.0334779913205208e-07, + "loss": 0.9483, + "step": 52480 + }, + { + "epoch": 4.067573327133945, + "grad_norm": 1.3372050054456546, + "learning_rate": 2.0338654680719158e-07, + "loss": 0.9668, + "step": 52490 + }, + { + "epoch": 4.068348250610252, + "grad_norm": 1.2940645259759722, + "learning_rate": 2.0342529448233107e-07, + "loss": 0.9561, + "step": 52500 + }, + { + "epoch": 4.068348250610252, + "eval_loss": 0.9743795394897461, + "eval_runtime": 319.5744, + "eval_samples_per_second": 35.895, + "eval_steps_per_second": 8.974, + "step": 52500 + }, + { + "epoch": 4.069123174086559, + "grad_norm": 1.2803749528177035, + "learning_rate": 2.0346404215747057e-07, + "loss": 0.9697, + "step": 52510 + }, + { + "epoch": 4.069898097562866, + "grad_norm": 1.2940350421392235, + "learning_rate": 2.0350278983261006e-07, + "loss": 0.952, + "step": 52520 + }, + { + "epoch": 4.070673021039172, + "grad_norm": 1.4045929284920893, + "learning_rate": 2.0354153750774953e-07, + "loss": 0.9845, + "step": 52530 + }, + { + "epoch": 4.071447944515479, + "grad_norm": 1.3342918754936306, + "learning_rate": 2.0358028518288903e-07, + "loss": 0.9784, + "step": 52540 + }, + { + "epoch": 4.072222867991786, + "grad_norm": 1.2969297144243266, + "learning_rate": 2.0361903285802852e-07, + "loss": 0.9823, + "step": 52550 + }, + { + "epoch": 4.072997791468093, + "grad_norm": 1.2835149763847473, + "learning_rate": 2.0365778053316802e-07, + "loss": 0.9874, + "step": 52560 + }, + { + "epoch": 4.0737727149444, + "grad_norm": 1.333901770053207, + "learning_rate": 2.036965282083075e-07, + "loss": 0.9615, + "step": 52570 + }, + { + "epoch": 4.0745476384207056, + "grad_norm": 1.4231957711088392, + "learning_rate": 2.03735275883447e-07, + "loss": 1.0002, + "step": 52580 + }, + { + "epoch": 4.075322561897012, + "grad_norm": 1.2922776686906663, + "learning_rate": 2.0377402355858648e-07, + "loss": 0.9838, + "step": 52590 + }, + { + "epoch": 4.076097485373319, + "grad_norm": 1.320406738572891, + "learning_rate": 2.0381277123372597e-07, + "loss": 0.9606, + "step": 52600 + }, + { + "epoch": 4.076872408849626, + "grad_norm": 1.3460349611529159, + "learning_rate": 2.0385151890886547e-07, + "loss": 0.9528, + "step": 52610 + }, + { + "epoch": 4.077647332325933, + "grad_norm": 1.3894334357852154, + "learning_rate": 2.0389026658400496e-07, + "loss": 0.9799, + "step": 52620 + }, + { + "epoch": 4.07842225580224, + "grad_norm": 1.3467882803649287, + "learning_rate": 2.0392901425914446e-07, + "loss": 0.971, + "step": 52630 + }, + { + "epoch": 4.079197179278546, + "grad_norm": 1.3897957447152898, + "learning_rate": 2.0396776193428396e-07, + "loss": 0.9522, + "step": 52640 + }, + { + "epoch": 4.079972102754853, + "grad_norm": 1.2857244338232168, + "learning_rate": 2.0400650960942345e-07, + "loss": 0.9827, + "step": 52650 + }, + { + "epoch": 4.08074702623116, + "grad_norm": 1.32151066909011, + "learning_rate": 2.0404525728456292e-07, + "loss": 0.9726, + "step": 52660 + }, + { + "epoch": 4.081521949707466, + "grad_norm": 1.3144741389521373, + "learning_rate": 2.0408400495970242e-07, + "loss": 0.9708, + "step": 52670 + }, + { + "epoch": 4.082296873183773, + "grad_norm": 1.3360102220546282, + "learning_rate": 2.041227526348419e-07, + "loss": 0.9535, + "step": 52680 + }, + { + "epoch": 4.08307179666008, + "grad_norm": 1.3644984252368761, + "learning_rate": 2.041615003099814e-07, + "loss": 0.9603, + "step": 52690 + }, + { + "epoch": 4.083846720136386, + "grad_norm": 1.3107677441491505, + "learning_rate": 2.042002479851209e-07, + "loss": 0.9509, + "step": 52700 + }, + { + "epoch": 4.084621643612693, + "grad_norm": 1.426591401092957, + "learning_rate": 2.042389956602604e-07, + "loss": 0.9754, + "step": 52710 + }, + { + "epoch": 4.085396567089, + "grad_norm": 1.2779526529050131, + "learning_rate": 2.042777433353999e-07, + "loss": 0.965, + "step": 52720 + }, + { + "epoch": 4.086171490565307, + "grad_norm": 1.4467422980389781, + "learning_rate": 2.0431649101053936e-07, + "loss": 0.9943, + "step": 52730 + }, + { + "epoch": 4.086946414041614, + "grad_norm": 1.372550937506374, + "learning_rate": 2.0435523868567886e-07, + "loss": 0.9748, + "step": 52740 + }, + { + "epoch": 4.08772133751792, + "grad_norm": 1.3180644132169606, + "learning_rate": 2.0439398636081835e-07, + "loss": 0.9985, + "step": 52750 + }, + { + "epoch": 4.088496260994227, + "grad_norm": 1.2540745447496, + "learning_rate": 2.0443273403595785e-07, + "loss": 0.9671, + "step": 52760 + }, + { + "epoch": 4.089271184470533, + "grad_norm": 1.3118903446064798, + "learning_rate": 2.0447148171109734e-07, + "loss": 0.9625, + "step": 52770 + }, + { + "epoch": 4.09004610794684, + "grad_norm": 1.3724302679469753, + "learning_rate": 2.0451022938623684e-07, + "loss": 0.9736, + "step": 52780 + }, + { + "epoch": 4.090821031423147, + "grad_norm": 1.3782478184527276, + "learning_rate": 2.0454897706137633e-07, + "loss": 0.9848, + "step": 52790 + }, + { + "epoch": 4.091595954899454, + "grad_norm": 1.318826445248133, + "learning_rate": 2.045877247365158e-07, + "loss": 0.9744, + "step": 52800 + }, + { + "epoch": 4.09237087837576, + "grad_norm": 1.4024299225887036, + "learning_rate": 2.046264724116553e-07, + "loss": 0.9616, + "step": 52810 + }, + { + "epoch": 4.093145801852067, + "grad_norm": 1.377712700995999, + "learning_rate": 2.046652200867948e-07, + "loss": 0.9792, + "step": 52820 + }, + { + "epoch": 4.093920725328374, + "grad_norm": 1.3678554390083744, + "learning_rate": 2.047039677619343e-07, + "loss": 1.0054, + "step": 52830 + }, + { + "epoch": 4.094695648804681, + "grad_norm": 1.3309529890527145, + "learning_rate": 2.0474271543707379e-07, + "loss": 0.9691, + "step": 52840 + }, + { + "epoch": 4.095470572280988, + "grad_norm": 1.4616812203216627, + "learning_rate": 2.0478146311221328e-07, + "loss": 0.9662, + "step": 52850 + }, + { + "epoch": 4.096245495757294, + "grad_norm": 1.3274135666856381, + "learning_rate": 2.0482021078735278e-07, + "loss": 0.9696, + "step": 52860 + }, + { + "epoch": 4.0970204192336, + "grad_norm": 1.3017125418591582, + "learning_rate": 2.0485895846249225e-07, + "loss": 0.9662, + "step": 52870 + }, + { + "epoch": 4.097795342709907, + "grad_norm": 1.3724128689952388, + "learning_rate": 2.0489770613763174e-07, + "loss": 0.9744, + "step": 52880 + }, + { + "epoch": 4.098570266186214, + "grad_norm": 1.3951773206977807, + "learning_rate": 2.0493645381277124e-07, + "loss": 0.9665, + "step": 52890 + }, + { + "epoch": 4.099345189662521, + "grad_norm": 1.3122881198548428, + "learning_rate": 2.0497520148791073e-07, + "loss": 0.9798, + "step": 52900 + }, + { + "epoch": 4.100120113138828, + "grad_norm": 1.407355968193031, + "learning_rate": 2.0501394916305023e-07, + "loss": 0.9851, + "step": 52910 + }, + { + "epoch": 4.100895036615134, + "grad_norm": 1.329492738005051, + "learning_rate": 2.0505269683818972e-07, + "loss": 0.972, + "step": 52920 + }, + { + "epoch": 4.101669960091441, + "grad_norm": 1.2967449380992726, + "learning_rate": 2.0509144451332922e-07, + "loss": 0.9618, + "step": 52930 + }, + { + "epoch": 4.102444883567748, + "grad_norm": 1.4078443781223016, + "learning_rate": 2.051301921884687e-07, + "loss": 0.9476, + "step": 52940 + }, + { + "epoch": 4.103219807044054, + "grad_norm": 1.3789632083422707, + "learning_rate": 2.0516893986360818e-07, + "loss": 0.9689, + "step": 52950 + }, + { + "epoch": 4.103994730520361, + "grad_norm": 1.344865664314363, + "learning_rate": 2.0520768753874768e-07, + "loss": 0.9949, + "step": 52960 + }, + { + "epoch": 4.104769653996668, + "grad_norm": 1.300253954036651, + "learning_rate": 2.0524643521388717e-07, + "loss": 0.9687, + "step": 52970 + }, + { + "epoch": 4.105544577472974, + "grad_norm": 1.3367950537285118, + "learning_rate": 2.0528518288902667e-07, + "loss": 0.9724, + "step": 52980 + }, + { + "epoch": 4.106319500949281, + "grad_norm": 1.368857418939864, + "learning_rate": 2.0532393056416617e-07, + "loss": 0.9801, + "step": 52990 + }, + { + "epoch": 4.107094424425588, + "grad_norm": 1.3676197988479004, + "learning_rate": 2.0536267823930563e-07, + "loss": 0.9436, + "step": 53000 + }, + { + "epoch": 4.107094424425588, + "eval_loss": 0.9735282063484192, + "eval_runtime": 319.0344, + "eval_samples_per_second": 35.955, + "eval_steps_per_second": 8.99, + "step": 53000 + }, + { + "epoch": 4.107869347901895, + "grad_norm": 1.3432793960988785, + "learning_rate": 2.0540142591444513e-07, + "loss": 0.9778, + "step": 53010 + }, + { + "epoch": 4.108644271378202, + "grad_norm": 1.3394844438804383, + "learning_rate": 2.0544017358958463e-07, + "loss": 0.9495, + "step": 53020 + }, + { + "epoch": 4.1094191948545085, + "grad_norm": 1.4525803917285454, + "learning_rate": 2.0547892126472412e-07, + "loss": 0.974, + "step": 53030 + }, + { + "epoch": 4.110194118330815, + "grad_norm": 1.3391078312705766, + "learning_rate": 2.0551766893986362e-07, + "loss": 0.9827, + "step": 53040 + }, + { + "epoch": 4.110969041807121, + "grad_norm": 1.3961435044542332, + "learning_rate": 2.055564166150031e-07, + "loss": 0.9631, + "step": 53050 + }, + { + "epoch": 4.111743965283428, + "grad_norm": 1.3795075653446367, + "learning_rate": 2.055951642901426e-07, + "loss": 0.9621, + "step": 53060 + }, + { + "epoch": 4.112518888759735, + "grad_norm": 1.306591891717236, + "learning_rate": 2.0563391196528208e-07, + "loss": 0.9397, + "step": 53070 + }, + { + "epoch": 4.113293812236042, + "grad_norm": 1.3016923384459298, + "learning_rate": 2.0567265964042157e-07, + "loss": 0.9642, + "step": 53080 + }, + { + "epoch": 4.114068735712348, + "grad_norm": 1.3289958483956883, + "learning_rate": 2.0571140731556107e-07, + "loss": 0.9559, + "step": 53090 + }, + { + "epoch": 4.114843659188655, + "grad_norm": 1.292555059467224, + "learning_rate": 2.0575015499070056e-07, + "loss": 0.9644, + "step": 53100 + }, + { + "epoch": 4.115618582664962, + "grad_norm": 1.3256196210998468, + "learning_rate": 2.0578890266584006e-07, + "loss": 0.9762, + "step": 53110 + }, + { + "epoch": 4.116393506141269, + "grad_norm": 1.4150082252478584, + "learning_rate": 2.0582765034097955e-07, + "loss": 0.9689, + "step": 53120 + }, + { + "epoch": 4.117168429617576, + "grad_norm": 1.3315953628796628, + "learning_rate": 2.0586639801611905e-07, + "loss": 0.9679, + "step": 53130 + }, + { + "epoch": 4.117943353093882, + "grad_norm": 1.3696188225648054, + "learning_rate": 2.0590514569125852e-07, + "loss": 0.9547, + "step": 53140 + }, + { + "epoch": 4.118718276570188, + "grad_norm": 1.300197393388421, + "learning_rate": 2.0594389336639801e-07, + "loss": 0.9565, + "step": 53150 + }, + { + "epoch": 4.119493200046495, + "grad_norm": 1.3977345310803562, + "learning_rate": 2.059826410415375e-07, + "loss": 0.9945, + "step": 53160 + }, + { + "epoch": 4.120268123522802, + "grad_norm": 1.3956091140813065, + "learning_rate": 2.06021388716677e-07, + "loss": 0.9972, + "step": 53170 + }, + { + "epoch": 4.121043046999109, + "grad_norm": 1.305151233522103, + "learning_rate": 2.060601363918165e-07, + "loss": 0.9658, + "step": 53180 + }, + { + "epoch": 4.121817970475416, + "grad_norm": 1.31743686012683, + "learning_rate": 2.06098884066956e-07, + "loss": 0.9611, + "step": 53190 + }, + { + "epoch": 4.1225928939517225, + "grad_norm": 1.3553442949602303, + "learning_rate": 2.061376317420955e-07, + "loss": 0.9896, + "step": 53200 + }, + { + "epoch": 4.123367817428029, + "grad_norm": 1.2697497785047371, + "learning_rate": 2.0617637941723496e-07, + "loss": 0.9797, + "step": 53210 + }, + { + "epoch": 4.124142740904336, + "grad_norm": 1.3755716948092762, + "learning_rate": 2.0621512709237446e-07, + "loss": 0.9621, + "step": 53220 + }, + { + "epoch": 4.124917664380642, + "grad_norm": 1.3163382838391928, + "learning_rate": 2.0625387476751395e-07, + "loss": 0.9518, + "step": 53230 + }, + { + "epoch": 4.125692587856949, + "grad_norm": 1.4678153867975872, + "learning_rate": 2.0629262244265345e-07, + "loss": 0.9882, + "step": 53240 + }, + { + "epoch": 4.126467511333256, + "grad_norm": 1.3222795314745357, + "learning_rate": 2.0633137011779294e-07, + "loss": 0.969, + "step": 53250 + }, + { + "epoch": 4.127242434809562, + "grad_norm": 1.2489005592288294, + "learning_rate": 2.0637011779293244e-07, + "loss": 0.9773, + "step": 53260 + }, + { + "epoch": 4.128017358285869, + "grad_norm": 1.344356250870135, + "learning_rate": 2.0640886546807193e-07, + "loss": 0.9823, + "step": 53270 + }, + { + "epoch": 4.128792281762176, + "grad_norm": 1.342584470138637, + "learning_rate": 2.064476131432114e-07, + "loss": 0.981, + "step": 53280 + }, + { + "epoch": 4.129567205238483, + "grad_norm": 1.3052380376789186, + "learning_rate": 2.064863608183509e-07, + "loss": 0.9702, + "step": 53290 + }, + { + "epoch": 4.13034212871479, + "grad_norm": 1.4163537758943026, + "learning_rate": 2.065251084934904e-07, + "loss": 0.9785, + "step": 53300 + }, + { + "epoch": 4.1311170521910965, + "grad_norm": 1.2828525838450684, + "learning_rate": 2.065638561686299e-07, + "loss": 0.9805, + "step": 53310 + }, + { + "epoch": 4.131891975667402, + "grad_norm": 1.3660483144193345, + "learning_rate": 2.0660260384376939e-07, + "loss": 0.9732, + "step": 53320 + }, + { + "epoch": 4.132666899143709, + "grad_norm": 1.368769515918898, + "learning_rate": 2.0664135151890888e-07, + "loss": 0.9704, + "step": 53330 + }, + { + "epoch": 4.133441822620016, + "grad_norm": 1.3736380864445605, + "learning_rate": 2.0668009919404835e-07, + "loss": 0.9639, + "step": 53340 + }, + { + "epoch": 4.134216746096323, + "grad_norm": 1.3622569177960877, + "learning_rate": 2.0671884686918785e-07, + "loss": 0.9769, + "step": 53350 + }, + { + "epoch": 4.13499166957263, + "grad_norm": 1.3707388401016034, + "learning_rate": 2.0675759454432734e-07, + "loss": 0.9741, + "step": 53360 + }, + { + "epoch": 4.1357665930489365, + "grad_norm": 1.2855524651255454, + "learning_rate": 2.0679634221946684e-07, + "loss": 0.9707, + "step": 53370 + }, + { + "epoch": 4.136541516525243, + "grad_norm": 1.3422489888815095, + "learning_rate": 2.0683508989460633e-07, + "loss": 0.9724, + "step": 53380 + }, + { + "epoch": 4.13731644000155, + "grad_norm": 1.3776271384691818, + "learning_rate": 2.0687383756974583e-07, + "loss": 0.9715, + "step": 53390 + }, + { + "epoch": 4.138091363477857, + "grad_norm": 1.3399172354110582, + "learning_rate": 2.0691258524488532e-07, + "loss": 0.9554, + "step": 53400 + }, + { + "epoch": 4.138866286954164, + "grad_norm": 1.340144148414859, + "learning_rate": 2.069513329200248e-07, + "loss": 0.9733, + "step": 53410 + }, + { + "epoch": 4.13964121043047, + "grad_norm": 1.3976181644155798, + "learning_rate": 2.069900805951643e-07, + "loss": 0.9651, + "step": 53420 + }, + { + "epoch": 4.140416133906776, + "grad_norm": 1.3363272685400278, + "learning_rate": 2.0702882827030378e-07, + "loss": 0.9966, + "step": 53430 + }, + { + "epoch": 4.141191057383083, + "grad_norm": 1.3518419667000896, + "learning_rate": 2.0706757594544328e-07, + "loss": 0.9987, + "step": 53440 + }, + { + "epoch": 4.14196598085939, + "grad_norm": 1.3884569970842735, + "learning_rate": 2.0710632362058277e-07, + "loss": 0.9907, + "step": 53450 + }, + { + "epoch": 4.142740904335697, + "grad_norm": 1.302721808003192, + "learning_rate": 2.0714507129572227e-07, + "loss": 0.9788, + "step": 53460 + }, + { + "epoch": 4.143515827812004, + "grad_norm": 1.3390244261300683, + "learning_rate": 2.0718381897086176e-07, + "loss": 0.9628, + "step": 53470 + }, + { + "epoch": 4.1442907512883105, + "grad_norm": 1.2611215773075695, + "learning_rate": 2.0722256664600123e-07, + "loss": 0.9622, + "step": 53480 + }, + { + "epoch": 4.145065674764617, + "grad_norm": 1.3998782511599956, + "learning_rate": 2.0726131432114073e-07, + "loss": 0.9776, + "step": 53490 + }, + { + "epoch": 4.145840598240924, + "grad_norm": 1.4003339943137267, + "learning_rate": 2.0730006199628023e-07, + "loss": 1.0009, + "step": 53500 + }, + { + "epoch": 4.145840598240924, + "eval_loss": 0.9727725386619568, + "eval_runtime": 317.8261, + "eval_samples_per_second": 36.092, + "eval_steps_per_second": 9.024, + "step": 53500 + }, + { + "epoch": 4.14661552171723, + "grad_norm": 1.297116134344629, + "learning_rate": 2.0733880967141972e-07, + "loss": 0.9693, + "step": 53510 + }, + { + "epoch": 4.147390445193537, + "grad_norm": 1.364749368497436, + "learning_rate": 2.0737755734655922e-07, + "loss": 0.9883, + "step": 53520 + }, + { + "epoch": 4.148165368669844, + "grad_norm": 1.3265075790737202, + "learning_rate": 2.074163050216987e-07, + "loss": 0.9639, + "step": 53530 + }, + { + "epoch": 4.1489402921461505, + "grad_norm": 1.3794719731299254, + "learning_rate": 2.074550526968382e-07, + "loss": 0.9871, + "step": 53540 + }, + { + "epoch": 4.149715215622457, + "grad_norm": 1.3319084052350618, + "learning_rate": 2.0749380037197768e-07, + "loss": 0.9688, + "step": 53550 + }, + { + "epoch": 4.150490139098764, + "grad_norm": 1.351057417512871, + "learning_rate": 2.0753254804711717e-07, + "loss": 0.9721, + "step": 53560 + }, + { + "epoch": 4.151265062575071, + "grad_norm": 1.3157127518736094, + "learning_rate": 2.0757129572225667e-07, + "loss": 0.9835, + "step": 53570 + }, + { + "epoch": 4.152039986051378, + "grad_norm": 1.342053207118848, + "learning_rate": 2.0761004339739616e-07, + "loss": 0.9812, + "step": 53580 + }, + { + "epoch": 4.1528149095276845, + "grad_norm": 1.3449406434202607, + "learning_rate": 2.0764879107253566e-07, + "loss": 0.9562, + "step": 53590 + }, + { + "epoch": 4.15358983300399, + "grad_norm": 1.3419671286946262, + "learning_rate": 2.0768753874767515e-07, + "loss": 0.9552, + "step": 53600 + }, + { + "epoch": 4.154364756480297, + "grad_norm": 1.4431080207521063, + "learning_rate": 2.0772628642281465e-07, + "loss": 0.965, + "step": 53610 + }, + { + "epoch": 4.155139679956604, + "grad_norm": 1.3747402235117478, + "learning_rate": 2.0776503409795412e-07, + "loss": 0.978, + "step": 53620 + }, + { + "epoch": 4.155914603432911, + "grad_norm": 1.3425007539411886, + "learning_rate": 2.0780378177309361e-07, + "loss": 0.9712, + "step": 53630 + }, + { + "epoch": 4.156689526909218, + "grad_norm": 1.3752901089293679, + "learning_rate": 2.078425294482331e-07, + "loss": 1.0012, + "step": 53640 + }, + { + "epoch": 4.1574644503855245, + "grad_norm": 1.3794446385930101, + "learning_rate": 2.078812771233726e-07, + "loss": 0.9699, + "step": 53650 + }, + { + "epoch": 4.158239373861831, + "grad_norm": 1.3626587655121705, + "learning_rate": 2.079200247985121e-07, + "loss": 0.9819, + "step": 53660 + }, + { + "epoch": 4.159014297338138, + "grad_norm": 1.290844334210044, + "learning_rate": 2.079587724736516e-07, + "loss": 0.9677, + "step": 53670 + }, + { + "epoch": 4.159789220814445, + "grad_norm": 1.24464247616254, + "learning_rate": 2.079975201487911e-07, + "loss": 0.9752, + "step": 53680 + }, + { + "epoch": 4.160564144290751, + "grad_norm": 1.3613128825085417, + "learning_rate": 2.0803626782393056e-07, + "loss": 0.9785, + "step": 53690 + }, + { + "epoch": 4.161339067767058, + "grad_norm": 1.3247268703518067, + "learning_rate": 2.0807501549907006e-07, + "loss": 0.9939, + "step": 53700 + }, + { + "epoch": 4.1621139912433645, + "grad_norm": 1.3060338484455225, + "learning_rate": 2.0811376317420955e-07, + "loss": 0.9711, + "step": 53710 + }, + { + "epoch": 4.162888914719671, + "grad_norm": 1.3049614076610065, + "learning_rate": 2.0815251084934905e-07, + "loss": 0.9722, + "step": 53720 + }, + { + "epoch": 4.163663838195978, + "grad_norm": 1.3268365110214717, + "learning_rate": 2.0819125852448854e-07, + "loss": 0.9636, + "step": 53730 + }, + { + "epoch": 4.164438761672285, + "grad_norm": 1.3345053635838857, + "learning_rate": 2.0823000619962804e-07, + "loss": 0.9714, + "step": 53740 + }, + { + "epoch": 4.165213685148592, + "grad_norm": 1.3087521739969468, + "learning_rate": 2.082687538747675e-07, + "loss": 0.953, + "step": 53750 + }, + { + "epoch": 4.1659886086248985, + "grad_norm": 1.355784226286285, + "learning_rate": 2.08307501549907e-07, + "loss": 0.9579, + "step": 53760 + }, + { + "epoch": 4.166763532101205, + "grad_norm": 1.322791144776379, + "learning_rate": 2.083462492250465e-07, + "loss": 0.9565, + "step": 53770 + }, + { + "epoch": 4.167538455577512, + "grad_norm": 1.395612293786158, + "learning_rate": 2.08384996900186e-07, + "loss": 0.9655, + "step": 53780 + }, + { + "epoch": 4.168313379053818, + "grad_norm": 1.352213895837102, + "learning_rate": 2.084237445753255e-07, + "loss": 0.9664, + "step": 53790 + }, + { + "epoch": 4.169088302530125, + "grad_norm": 1.2653640440135474, + "learning_rate": 2.0846249225046498e-07, + "loss": 0.9456, + "step": 53800 + }, + { + "epoch": 4.169863226006432, + "grad_norm": 1.3547794751599451, + "learning_rate": 2.0850123992560448e-07, + "loss": 0.9606, + "step": 53810 + }, + { + "epoch": 4.1706381494827385, + "grad_norm": 1.3497448627649224, + "learning_rate": 2.0853998760074395e-07, + "loss": 0.9814, + "step": 53820 + }, + { + "epoch": 4.171413072959045, + "grad_norm": 1.3586615120738825, + "learning_rate": 2.0857873527588344e-07, + "loss": 0.9701, + "step": 53830 + }, + { + "epoch": 4.172187996435352, + "grad_norm": 1.3201989264416305, + "learning_rate": 2.0861748295102294e-07, + "loss": 0.9663, + "step": 53840 + }, + { + "epoch": 4.172962919911659, + "grad_norm": 1.3655968039866244, + "learning_rate": 2.0865623062616244e-07, + "loss": 0.9647, + "step": 53850 + }, + { + "epoch": 4.173737843387966, + "grad_norm": 1.377028428353858, + "learning_rate": 2.0869497830130193e-07, + "loss": 0.9741, + "step": 53860 + }, + { + "epoch": 4.1745127668642725, + "grad_norm": 1.3962467642384981, + "learning_rate": 2.0873372597644143e-07, + "loss": 0.9786, + "step": 53870 + }, + { + "epoch": 4.1752876903405785, + "grad_norm": 1.3027893307281173, + "learning_rate": 2.0877247365158092e-07, + "loss": 0.9751, + "step": 53880 + }, + { + "epoch": 4.176062613816885, + "grad_norm": 1.2907939419342582, + "learning_rate": 2.088112213267204e-07, + "loss": 0.9875, + "step": 53890 + }, + { + "epoch": 4.176837537293192, + "grad_norm": 1.4180180466320127, + "learning_rate": 2.0884996900185989e-07, + "loss": 0.9764, + "step": 53900 + }, + { + "epoch": 4.177612460769499, + "grad_norm": 1.6105450539349422, + "learning_rate": 2.0888871667699938e-07, + "loss": 0.9857, + "step": 53910 + }, + { + "epoch": 4.178387384245806, + "grad_norm": 1.294098276258432, + "learning_rate": 2.0892746435213888e-07, + "loss": 0.9735, + "step": 53920 + }, + { + "epoch": 4.1791623077221125, + "grad_norm": 1.3243725759248288, + "learning_rate": 2.0896621202727837e-07, + "loss": 0.9645, + "step": 53930 + }, + { + "epoch": 4.179937231198419, + "grad_norm": 1.360199501643584, + "learning_rate": 2.0900495970241787e-07, + "loss": 0.9593, + "step": 53940 + }, + { + "epoch": 4.180712154674726, + "grad_norm": 1.3315270400919597, + "learning_rate": 2.0904370737755736e-07, + "loss": 0.9622, + "step": 53950 + }, + { + "epoch": 4.181487078151033, + "grad_norm": 1.3131529194706313, + "learning_rate": 2.0908245505269683e-07, + "loss": 0.9696, + "step": 53960 + }, + { + "epoch": 4.18226200162734, + "grad_norm": 1.33016129188647, + "learning_rate": 2.0912120272783633e-07, + "loss": 0.9617, + "step": 53970 + }, + { + "epoch": 4.183036925103646, + "grad_norm": 1.3171763164705468, + "learning_rate": 2.0915995040297582e-07, + "loss": 0.9654, + "step": 53980 + }, + { + "epoch": 4.1838118485799525, + "grad_norm": 1.3480458468752703, + "learning_rate": 2.0919869807811532e-07, + "loss": 0.9707, + "step": 53990 + }, + { + "epoch": 4.184586772056259, + "grad_norm": 1.3682334323565621, + "learning_rate": 2.0923744575325482e-07, + "loss": 0.9803, + "step": 54000 + }, + { + "epoch": 4.184586772056259, + "eval_loss": 0.9718737006187439, + "eval_runtime": 319.3829, + "eval_samples_per_second": 35.916, + "eval_steps_per_second": 8.98, + "step": 54000 + }, + { + "epoch": 4.185361695532566, + "grad_norm": 1.3636149665372623, + "learning_rate": 2.092761934283943e-07, + "loss": 0.9651, + "step": 54010 + }, + { + "epoch": 4.186136619008873, + "grad_norm": 1.3328075148464547, + "learning_rate": 2.093149411035338e-07, + "loss": 0.9631, + "step": 54020 + }, + { + "epoch": 4.18691154248518, + "grad_norm": 1.2685970582705974, + "learning_rate": 2.0935368877867328e-07, + "loss": 0.9558, + "step": 54030 + }, + { + "epoch": 4.1876864659614865, + "grad_norm": 1.3089629037762092, + "learning_rate": 2.0939243645381277e-07, + "loss": 0.9648, + "step": 54040 + }, + { + "epoch": 4.188461389437793, + "grad_norm": 1.4191455773368278, + "learning_rate": 2.0943118412895227e-07, + "loss": 1.0006, + "step": 54050 + }, + { + "epoch": 4.1892363129141, + "grad_norm": 1.3032264806262046, + "learning_rate": 2.0946993180409176e-07, + "loss": 0.9697, + "step": 54060 + }, + { + "epoch": 4.190011236390406, + "grad_norm": 1.348583355371512, + "learning_rate": 2.0950867947923126e-07, + "loss": 0.9655, + "step": 54070 + }, + { + "epoch": 4.190786159866713, + "grad_norm": 1.38172871321037, + "learning_rate": 2.0954742715437075e-07, + "loss": 0.983, + "step": 54080 + }, + { + "epoch": 4.19156108334302, + "grad_norm": 1.3802143318329816, + "learning_rate": 2.0958617482951022e-07, + "loss": 0.9557, + "step": 54090 + }, + { + "epoch": 4.1923360068193265, + "grad_norm": 1.3772943616801459, + "learning_rate": 2.0962492250464972e-07, + "loss": 0.9658, + "step": 54100 + }, + { + "epoch": 4.193110930295633, + "grad_norm": 1.429799074512773, + "learning_rate": 2.096636701797892e-07, + "loss": 0.9766, + "step": 54110 + }, + { + "epoch": 4.19388585377194, + "grad_norm": 1.3722535262640485, + "learning_rate": 2.097024178549287e-07, + "loss": 0.9608, + "step": 54120 + }, + { + "epoch": 4.194660777248247, + "grad_norm": 1.3187165603212738, + "learning_rate": 2.097411655300682e-07, + "loss": 0.9913, + "step": 54130 + }, + { + "epoch": 4.195435700724554, + "grad_norm": 1.342194432731296, + "learning_rate": 2.097799132052077e-07, + "loss": 0.9595, + "step": 54140 + }, + { + "epoch": 4.1962106242008606, + "grad_norm": 1.4542539505498786, + "learning_rate": 2.098186608803472e-07, + "loss": 0.9938, + "step": 54150 + }, + { + "epoch": 4.1969855476771665, + "grad_norm": 1.3443852470134081, + "learning_rate": 2.0985740855548666e-07, + "loss": 0.9564, + "step": 54160 + }, + { + "epoch": 4.197760471153473, + "grad_norm": 1.4213353364839705, + "learning_rate": 2.0989615623062616e-07, + "loss": 0.9765, + "step": 54170 + }, + { + "epoch": 4.19853539462978, + "grad_norm": 1.3366232813630614, + "learning_rate": 2.0993490390576565e-07, + "loss": 0.9552, + "step": 54180 + }, + { + "epoch": 4.199310318106087, + "grad_norm": 1.3652810375799407, + "learning_rate": 2.0997365158090515e-07, + "loss": 0.9703, + "step": 54190 + }, + { + "epoch": 4.200085241582394, + "grad_norm": 1.3203847604960695, + "learning_rate": 2.1001239925604465e-07, + "loss": 0.9591, + "step": 54200 + }, + { + "epoch": 4.2008601650587005, + "grad_norm": 1.272262587684419, + "learning_rate": 2.1005114693118414e-07, + "loss": 0.9652, + "step": 54210 + }, + { + "epoch": 4.201635088535007, + "grad_norm": 1.3183271781436738, + "learning_rate": 2.1008989460632364e-07, + "loss": 0.9767, + "step": 54220 + }, + { + "epoch": 4.202410012011314, + "grad_norm": 1.406233179189766, + "learning_rate": 2.101286422814631e-07, + "loss": 0.9803, + "step": 54230 + }, + { + "epoch": 4.203184935487621, + "grad_norm": 1.318573683529303, + "learning_rate": 2.101673899566026e-07, + "loss": 0.9703, + "step": 54240 + }, + { + "epoch": 4.203959858963927, + "grad_norm": 1.3771645440653455, + "learning_rate": 2.102061376317421e-07, + "loss": 0.9555, + "step": 54250 + }, + { + "epoch": 4.204734782440234, + "grad_norm": 1.304922715021307, + "learning_rate": 2.102448853068816e-07, + "loss": 0.9466, + "step": 54260 + }, + { + "epoch": 4.2055097059165405, + "grad_norm": 1.321538993161702, + "learning_rate": 2.102836329820211e-07, + "loss": 0.9926, + "step": 54270 + }, + { + "epoch": 4.206284629392847, + "grad_norm": 1.4194195536337881, + "learning_rate": 2.1032238065716058e-07, + "loss": 0.9572, + "step": 54280 + }, + { + "epoch": 4.207059552869154, + "grad_norm": 1.3536528919625264, + "learning_rate": 2.1036112833230008e-07, + "loss": 0.9666, + "step": 54290 + }, + { + "epoch": 4.207834476345461, + "grad_norm": 1.3647006867877027, + "learning_rate": 2.1039987600743955e-07, + "loss": 0.9821, + "step": 54300 + }, + { + "epoch": 4.208609399821768, + "grad_norm": 1.4068560221326785, + "learning_rate": 2.1043862368257904e-07, + "loss": 0.9707, + "step": 54310 + }, + { + "epoch": 4.2093843232980745, + "grad_norm": 1.6167682276433066, + "learning_rate": 2.1047737135771854e-07, + "loss": 0.9832, + "step": 54320 + }, + { + "epoch": 4.210159246774381, + "grad_norm": 1.332729052980719, + "learning_rate": 2.1051611903285803e-07, + "loss": 0.9718, + "step": 54330 + }, + { + "epoch": 4.210934170250688, + "grad_norm": 1.4567726597696156, + "learning_rate": 2.1055486670799753e-07, + "loss": 0.9736, + "step": 54340 + }, + { + "epoch": 4.211709093726994, + "grad_norm": 1.3618979997470706, + "learning_rate": 2.1059361438313703e-07, + "loss": 0.9783, + "step": 54350 + }, + { + "epoch": 4.212484017203301, + "grad_norm": 1.3283551105063853, + "learning_rate": 2.1063236205827652e-07, + "loss": 0.9729, + "step": 54360 + }, + { + "epoch": 4.213258940679608, + "grad_norm": 1.2684195556962239, + "learning_rate": 2.10671109733416e-07, + "loss": 0.9707, + "step": 54370 + }, + { + "epoch": 4.2140338641559145, + "grad_norm": 1.4879807858228205, + "learning_rate": 2.1070985740855549e-07, + "loss": 0.9751, + "step": 54380 + }, + { + "epoch": 4.214808787632221, + "grad_norm": 1.351755556599627, + "learning_rate": 2.1074860508369498e-07, + "loss": 0.9846, + "step": 54390 + }, + { + "epoch": 4.215583711108528, + "grad_norm": 1.2762992997026676, + "learning_rate": 2.1078735275883448e-07, + "loss": 0.9678, + "step": 54400 + }, + { + "epoch": 4.216358634584835, + "grad_norm": 1.3097110252631503, + "learning_rate": 2.1082610043397397e-07, + "loss": 0.9763, + "step": 54410 + }, + { + "epoch": 4.217133558061142, + "grad_norm": 1.2901364434233435, + "learning_rate": 2.1086484810911347e-07, + "loss": 0.9821, + "step": 54420 + }, + { + "epoch": 4.217908481537449, + "grad_norm": 1.3974679908783996, + "learning_rate": 2.1090359578425294e-07, + "loss": 1.0138, + "step": 54430 + }, + { + "epoch": 4.2186834050137545, + "grad_norm": 1.413145883131899, + "learning_rate": 2.1094234345939243e-07, + "loss": 0.9764, + "step": 54440 + }, + { + "epoch": 4.219458328490061, + "grad_norm": 1.2966269173930136, + "learning_rate": 2.1098109113453193e-07, + "loss": 0.9802, + "step": 54450 + }, + { + "epoch": 4.220233251966368, + "grad_norm": 1.3227101933278018, + "learning_rate": 2.1101983880967142e-07, + "loss": 0.9935, + "step": 54460 + }, + { + "epoch": 4.221008175442675, + "grad_norm": 1.25540235618745, + "learning_rate": 2.1105858648481092e-07, + "loss": 0.9616, + "step": 54470 + }, + { + "epoch": 4.221783098918982, + "grad_norm": 1.3509340796139169, + "learning_rate": 2.1109733415995041e-07, + "loss": 0.9688, + "step": 54480 + }, + { + "epoch": 4.2225580223952885, + "grad_norm": 1.4226174890544074, + "learning_rate": 2.111360818350899e-07, + "loss": 1.0112, + "step": 54490 + }, + { + "epoch": 4.223332945871595, + "grad_norm": 1.3703286786097724, + "learning_rate": 2.1117482951022938e-07, + "loss": 0.9677, + "step": 54500 + }, + { + "epoch": 4.223332945871595, + "eval_loss": 0.9710384607315063, + "eval_runtime": 319.4248, + "eval_samples_per_second": 35.911, + "eval_steps_per_second": 8.979, + "step": 54500 + }, + { + "epoch": 4.224107869347902, + "grad_norm": 1.5007490627211038, + "learning_rate": 2.1121357718536887e-07, + "loss": 0.962, + "step": 54510 + }, + { + "epoch": 4.224882792824209, + "grad_norm": 1.3238144271522294, + "learning_rate": 2.1125232486050837e-07, + "loss": 0.973, + "step": 54520 + }, + { + "epoch": 4.225657716300515, + "grad_norm": 1.394024314190949, + "learning_rate": 2.1129107253564787e-07, + "loss": 0.9812, + "step": 54530 + }, + { + "epoch": 4.226432639776822, + "grad_norm": 1.2202124294967827, + "learning_rate": 2.1132982021078736e-07, + "loss": 0.9943, + "step": 54540 + }, + { + "epoch": 4.2272075632531285, + "grad_norm": 1.2324671954173325, + "learning_rate": 2.1136856788592686e-07, + "loss": 0.9654, + "step": 54550 + }, + { + "epoch": 4.227982486729435, + "grad_norm": 1.3877049783311122, + "learning_rate": 2.1140731556106635e-07, + "loss": 0.9779, + "step": 54560 + }, + { + "epoch": 4.228757410205742, + "grad_norm": 1.379750172503275, + "learning_rate": 2.1144606323620582e-07, + "loss": 0.9678, + "step": 54570 + }, + { + "epoch": 4.229532333682049, + "grad_norm": 1.309785251948911, + "learning_rate": 2.1148481091134532e-07, + "loss": 0.968, + "step": 54580 + }, + { + "epoch": 4.230307257158356, + "grad_norm": 1.4135601046399315, + "learning_rate": 2.115235585864848e-07, + "loss": 0.9967, + "step": 54590 + }, + { + "epoch": 4.231082180634663, + "grad_norm": 1.3049817863043438, + "learning_rate": 2.115623062616243e-07, + "loss": 0.9694, + "step": 54600 + }, + { + "epoch": 4.231857104110969, + "grad_norm": 1.388165588231017, + "learning_rate": 2.116010539367638e-07, + "loss": 0.9468, + "step": 54610 + }, + { + "epoch": 4.232632027587275, + "grad_norm": 1.277094448693632, + "learning_rate": 2.116398016119033e-07, + "loss": 0.9689, + "step": 54620 + }, + { + "epoch": 4.233406951063582, + "grad_norm": 1.348534216453931, + "learning_rate": 2.116785492870428e-07, + "loss": 0.9641, + "step": 54630 + }, + { + "epoch": 4.234181874539889, + "grad_norm": 1.3042983285293546, + "learning_rate": 2.1171729696218226e-07, + "loss": 0.965, + "step": 54640 + }, + { + "epoch": 4.234956798016196, + "grad_norm": 1.3090232665878716, + "learning_rate": 2.1175604463732176e-07, + "loss": 0.993, + "step": 54650 + }, + { + "epoch": 4.2357317214925025, + "grad_norm": 1.283615456973346, + "learning_rate": 2.1179479231246125e-07, + "loss": 0.971, + "step": 54660 + }, + { + "epoch": 4.236506644968809, + "grad_norm": 1.4372611719023431, + "learning_rate": 2.1183353998760075e-07, + "loss": 0.9745, + "step": 54670 + }, + { + "epoch": 4.237281568445116, + "grad_norm": 1.3617401261089215, + "learning_rate": 2.1187228766274025e-07, + "loss": 0.9698, + "step": 54680 + }, + { + "epoch": 4.238056491921423, + "grad_norm": 1.4646681510276136, + "learning_rate": 2.1191103533787974e-07, + "loss": 0.9721, + "step": 54690 + }, + { + "epoch": 4.23883141539773, + "grad_norm": 1.3186798385442067, + "learning_rate": 2.1194978301301924e-07, + "loss": 0.96, + "step": 54700 + }, + { + "epoch": 4.239606338874037, + "grad_norm": 1.3621796099459473, + "learning_rate": 2.119885306881587e-07, + "loss": 0.9791, + "step": 54710 + }, + { + "epoch": 4.2403812623503425, + "grad_norm": 1.3829648843023565, + "learning_rate": 2.120272783632982e-07, + "loss": 0.9787, + "step": 54720 + }, + { + "epoch": 4.241156185826649, + "grad_norm": 1.381912554505093, + "learning_rate": 2.120660260384377e-07, + "loss": 1.0007, + "step": 54730 + }, + { + "epoch": 4.241931109302956, + "grad_norm": 1.3151004978446732, + "learning_rate": 2.121047737135772e-07, + "loss": 0.9713, + "step": 54740 + }, + { + "epoch": 4.242706032779263, + "grad_norm": 1.2997865796594157, + "learning_rate": 2.121435213887167e-07, + "loss": 0.9571, + "step": 54750 + }, + { + "epoch": 4.24348095625557, + "grad_norm": 1.3905811591844959, + "learning_rate": 2.1218226906385618e-07, + "loss": 0.9649, + "step": 54760 + }, + { + "epoch": 4.244255879731877, + "grad_norm": 1.3084901884289963, + "learning_rate": 2.1222101673899568e-07, + "loss": 0.9727, + "step": 54770 + }, + { + "epoch": 4.245030803208183, + "grad_norm": 1.3349876161133813, + "learning_rate": 2.1225976441413515e-07, + "loss": 0.9776, + "step": 54780 + }, + { + "epoch": 4.24580572668449, + "grad_norm": 1.3858561676238277, + "learning_rate": 2.1229851208927464e-07, + "loss": 0.9674, + "step": 54790 + }, + { + "epoch": 4.246580650160797, + "grad_norm": 1.2873545205203778, + "learning_rate": 2.1233725976441414e-07, + "loss": 0.9694, + "step": 54800 + }, + { + "epoch": 4.247355573637103, + "grad_norm": 1.3326218761618773, + "learning_rate": 2.1237600743955363e-07, + "loss": 0.9787, + "step": 54810 + }, + { + "epoch": 4.24813049711341, + "grad_norm": 1.314683107409564, + "learning_rate": 2.1241475511469313e-07, + "loss": 0.9697, + "step": 54820 + }, + { + "epoch": 4.2489054205897165, + "grad_norm": 1.398862290019703, + "learning_rate": 2.1245350278983262e-07, + "loss": 0.9953, + "step": 54830 + }, + { + "epoch": 4.249680344066023, + "grad_norm": 1.3212333273891466, + "learning_rate": 2.124922504649721e-07, + "loss": 0.9634, + "step": 54840 + }, + { + "epoch": 4.25045526754233, + "grad_norm": 1.3656258864746615, + "learning_rate": 2.125309981401116e-07, + "loss": 0.9607, + "step": 54850 + }, + { + "epoch": 4.251230191018637, + "grad_norm": 1.191099416805207, + "learning_rate": 2.1256974581525108e-07, + "loss": 0.9633, + "step": 54860 + }, + { + "epoch": 4.252005114494944, + "grad_norm": 1.2849909008655944, + "learning_rate": 2.1260849349039058e-07, + "loss": 0.9807, + "step": 54870 + }, + { + "epoch": 4.252780037971251, + "grad_norm": 1.3524982316425913, + "learning_rate": 2.1264724116553008e-07, + "loss": 0.9885, + "step": 54880 + }, + { + "epoch": 4.253554961447557, + "grad_norm": 1.3870765519316017, + "learning_rate": 2.1268598884066957e-07, + "loss": 0.9744, + "step": 54890 + }, + { + "epoch": 4.254329884923864, + "grad_norm": 1.359577713804436, + "learning_rate": 2.1272473651580907e-07, + "loss": 0.9769, + "step": 54900 + }, + { + "epoch": 4.25510480840017, + "grad_norm": 1.2976257975142151, + "learning_rate": 2.1276348419094854e-07, + "loss": 0.9695, + "step": 54910 + }, + { + "epoch": 4.255879731876477, + "grad_norm": 1.3507494797797968, + "learning_rate": 2.1280223186608803e-07, + "loss": 0.9737, + "step": 54920 + }, + { + "epoch": 4.256654655352784, + "grad_norm": 1.4670338058903005, + "learning_rate": 2.1284097954122753e-07, + "loss": 0.9588, + "step": 54930 + }, + { + "epoch": 4.257429578829091, + "grad_norm": 1.390678023133213, + "learning_rate": 2.1287972721636702e-07, + "loss": 0.9689, + "step": 54940 + }, + { + "epoch": 4.258204502305397, + "grad_norm": 1.2834086505333107, + "learning_rate": 2.1291847489150652e-07, + "loss": 0.9656, + "step": 54950 + }, + { + "epoch": 4.258979425781704, + "grad_norm": 1.326457262308873, + "learning_rate": 2.1295722256664601e-07, + "loss": 0.9725, + "step": 54960 + }, + { + "epoch": 4.259754349258011, + "grad_norm": 1.3300270158118543, + "learning_rate": 2.129959702417855e-07, + "loss": 0.97, + "step": 54970 + }, + { + "epoch": 4.260529272734318, + "grad_norm": 1.329380031557229, + "learning_rate": 2.1303471791692498e-07, + "loss": 0.9769, + "step": 54980 + }, + { + "epoch": 4.261304196210624, + "grad_norm": 1.3045009917913373, + "learning_rate": 2.1307346559206447e-07, + "loss": 0.9649, + "step": 54990 + }, + { + "epoch": 4.2620791196869305, + "grad_norm": 1.2989732688105653, + "learning_rate": 2.1311221326720397e-07, + "loss": 0.9614, + "step": 55000 + }, + { + "epoch": 4.2620791196869305, + "eval_loss": 0.9702572226524353, + "eval_runtime": 319.5469, + "eval_samples_per_second": 35.898, + "eval_steps_per_second": 8.975, + "step": 55000 + }, + { + "epoch": 4.262854043163237, + "grad_norm": 1.3424238832075241, + "learning_rate": 2.1315096094234346e-07, + "loss": 0.9826, + "step": 55010 + }, + { + "epoch": 4.263628966639544, + "grad_norm": 1.386064990457035, + "learning_rate": 2.1318970861748296e-07, + "loss": 0.9647, + "step": 55020 + }, + { + "epoch": 4.264403890115851, + "grad_norm": 1.3421536184326248, + "learning_rate": 2.1322845629262246e-07, + "loss": 0.9509, + "step": 55030 + }, + { + "epoch": 4.265178813592158, + "grad_norm": 1.3035004192976472, + "learning_rate": 2.1326720396776195e-07, + "loss": 0.9583, + "step": 55040 + }, + { + "epoch": 4.265953737068465, + "grad_norm": 1.338233515462902, + "learning_rate": 2.1330595164290142e-07, + "loss": 0.9509, + "step": 55050 + }, + { + "epoch": 4.266728660544771, + "grad_norm": 1.3492057460966456, + "learning_rate": 2.1334469931804092e-07, + "loss": 0.9698, + "step": 55060 + }, + { + "epoch": 4.267503584021078, + "grad_norm": 1.3775211752579992, + "learning_rate": 2.133834469931804e-07, + "loss": 0.9576, + "step": 55070 + }, + { + "epoch": 4.268278507497385, + "grad_norm": 1.4075084536545657, + "learning_rate": 2.134221946683199e-07, + "loss": 0.9722, + "step": 55080 + }, + { + "epoch": 4.269053430973691, + "grad_norm": 1.3615527400264331, + "learning_rate": 2.134609423434594e-07, + "loss": 0.9759, + "step": 55090 + }, + { + "epoch": 4.269828354449998, + "grad_norm": 1.2751443942304777, + "learning_rate": 2.134996900185989e-07, + "loss": 0.9618, + "step": 55100 + }, + { + "epoch": 4.270603277926305, + "grad_norm": 1.3924994184465787, + "learning_rate": 2.135384376937384e-07, + "loss": 0.9836, + "step": 55110 + }, + { + "epoch": 4.271378201402611, + "grad_norm": 1.331080657261875, + "learning_rate": 2.1357718536887786e-07, + "loss": 0.9573, + "step": 55120 + }, + { + "epoch": 4.272153124878918, + "grad_norm": 1.360773938413855, + "learning_rate": 2.1361593304401736e-07, + "loss": 0.9688, + "step": 55130 + }, + { + "epoch": 4.272928048355225, + "grad_norm": 1.291148995585787, + "learning_rate": 2.1365468071915685e-07, + "loss": 0.9346, + "step": 55140 + }, + { + "epoch": 4.273702971831532, + "grad_norm": 1.3461811722985284, + "learning_rate": 2.1369342839429635e-07, + "loss": 0.9684, + "step": 55150 + }, + { + "epoch": 4.274477895307839, + "grad_norm": 1.3938380696304724, + "learning_rate": 2.1373217606943584e-07, + "loss": 0.9581, + "step": 55160 + }, + { + "epoch": 4.275252818784145, + "grad_norm": 1.3419812309016124, + "learning_rate": 2.1377092374457534e-07, + "loss": 0.9721, + "step": 55170 + }, + { + "epoch": 4.276027742260451, + "grad_norm": 1.3408939932068842, + "learning_rate": 2.138096714197148e-07, + "loss": 0.9625, + "step": 55180 + }, + { + "epoch": 4.276802665736758, + "grad_norm": 1.323633279388071, + "learning_rate": 2.138484190948543e-07, + "loss": 0.9716, + "step": 55190 + }, + { + "epoch": 4.277577589213065, + "grad_norm": 1.3680720271079396, + "learning_rate": 2.138871667699938e-07, + "loss": 0.9432, + "step": 55200 + }, + { + "epoch": 4.278352512689372, + "grad_norm": 1.3749322797195838, + "learning_rate": 2.139259144451333e-07, + "loss": 0.9881, + "step": 55210 + }, + { + "epoch": 4.279127436165679, + "grad_norm": 1.3206735403109302, + "learning_rate": 2.139646621202728e-07, + "loss": 0.9593, + "step": 55220 + }, + { + "epoch": 4.279902359641985, + "grad_norm": 1.371845987794362, + "learning_rate": 2.1400340979541229e-07, + "loss": 0.9756, + "step": 55230 + }, + { + "epoch": 4.280677283118292, + "grad_norm": 1.2572133086749817, + "learning_rate": 2.1404215747055178e-07, + "loss": 0.9541, + "step": 55240 + }, + { + "epoch": 4.281452206594599, + "grad_norm": 1.4630888553458592, + "learning_rate": 2.1408090514569125e-07, + "loss": 0.9894, + "step": 55250 + }, + { + "epoch": 4.282227130070906, + "grad_norm": 1.3493886666839578, + "learning_rate": 2.1411965282083075e-07, + "loss": 0.954, + "step": 55260 + }, + { + "epoch": 4.283002053547213, + "grad_norm": 1.2966037859282575, + "learning_rate": 2.1415840049597024e-07, + "loss": 0.968, + "step": 55270 + }, + { + "epoch": 4.283776977023519, + "grad_norm": 1.3877353434626714, + "learning_rate": 2.1419714817110974e-07, + "loss": 0.9524, + "step": 55280 + }, + { + "epoch": 4.284551900499825, + "grad_norm": 1.3192045352582018, + "learning_rate": 2.1423589584624923e-07, + "loss": 0.966, + "step": 55290 + }, + { + "epoch": 4.285326823976132, + "grad_norm": 1.415300419489724, + "learning_rate": 2.1427464352138873e-07, + "loss": 0.9805, + "step": 55300 + }, + { + "epoch": 4.286101747452439, + "grad_norm": 1.4188334181344846, + "learning_rate": 2.1431339119652822e-07, + "loss": 0.9675, + "step": 55310 + }, + { + "epoch": 4.286876670928746, + "grad_norm": 1.309096316261201, + "learning_rate": 2.143521388716677e-07, + "loss": 0.9823, + "step": 55320 + }, + { + "epoch": 4.287651594405053, + "grad_norm": 1.3100854503637023, + "learning_rate": 2.143908865468072e-07, + "loss": 0.9578, + "step": 55330 + }, + { + "epoch": 4.288426517881359, + "grad_norm": 1.312248037243944, + "learning_rate": 2.1442963422194668e-07, + "loss": 0.9552, + "step": 55340 + }, + { + "epoch": 4.289201441357666, + "grad_norm": 1.368993809409875, + "learning_rate": 2.1446838189708618e-07, + "loss": 0.9596, + "step": 55350 + }, + { + "epoch": 4.289976364833973, + "grad_norm": 1.2794381558363686, + "learning_rate": 2.1450712957222568e-07, + "loss": 0.9663, + "step": 55360 + }, + { + "epoch": 4.290751288310279, + "grad_norm": 1.3179975703052316, + "learning_rate": 2.1454587724736517e-07, + "loss": 0.9598, + "step": 55370 + }, + { + "epoch": 4.291526211786586, + "grad_norm": 1.363369217321925, + "learning_rate": 2.1458462492250467e-07, + "loss": 0.9548, + "step": 55380 + }, + { + "epoch": 4.292301135262893, + "grad_norm": 1.3002777715215323, + "learning_rate": 2.1462337259764414e-07, + "loss": 0.9526, + "step": 55390 + }, + { + "epoch": 4.293076058739199, + "grad_norm": 1.3994446451451898, + "learning_rate": 2.1466212027278363e-07, + "loss": 0.9838, + "step": 55400 + }, + { + "epoch": 4.293850982215506, + "grad_norm": 1.2992295795650284, + "learning_rate": 2.1470086794792313e-07, + "loss": 0.9643, + "step": 55410 + }, + { + "epoch": 4.294625905691813, + "grad_norm": 1.3387171865822607, + "learning_rate": 2.1473961562306262e-07, + "loss": 0.9744, + "step": 55420 + }, + { + "epoch": 4.29540082916812, + "grad_norm": 1.3177556301572435, + "learning_rate": 2.1477836329820212e-07, + "loss": 0.9818, + "step": 55430 + }, + { + "epoch": 4.296175752644427, + "grad_norm": 1.3788505289532433, + "learning_rate": 2.148171109733416e-07, + "loss": 0.9586, + "step": 55440 + }, + { + "epoch": 4.2969506761207334, + "grad_norm": 1.2903256460488153, + "learning_rate": 2.148558586484811e-07, + "loss": 0.9448, + "step": 55450 + }, + { + "epoch": 4.29772559959704, + "grad_norm": 1.34862611669124, + "learning_rate": 2.1489460632362058e-07, + "loss": 0.9746, + "step": 55460 + }, + { + "epoch": 4.298500523073346, + "grad_norm": 1.3633625523530706, + "learning_rate": 2.1493335399876007e-07, + "loss": 0.9597, + "step": 55470 + }, + { + "epoch": 4.299275446549653, + "grad_norm": 1.4741447767001583, + "learning_rate": 2.1497210167389957e-07, + "loss": 0.9875, + "step": 55480 + }, + { + "epoch": 4.30005037002596, + "grad_norm": 1.4305699122761946, + "learning_rate": 2.1501084934903906e-07, + "loss": 0.96, + "step": 55490 + }, + { + "epoch": 4.300825293502267, + "grad_norm": 1.3097798178487572, + "learning_rate": 2.1504959702417856e-07, + "loss": 0.9686, + "step": 55500 + }, + { + "epoch": 4.300825293502267, + "eval_loss": 0.9693850874900818, + "eval_runtime": 320.057, + "eval_samples_per_second": 35.84, + "eval_steps_per_second": 8.961, + "step": 55500 + }, + { + "epoch": 4.301600216978573, + "grad_norm": 1.3038345869065686, + "learning_rate": 2.1508834469931805e-07, + "loss": 0.9675, + "step": 55510 + }, + { + "epoch": 4.30237514045488, + "grad_norm": 1.3483881048667308, + "learning_rate": 2.1512709237445752e-07, + "loss": 0.9633, + "step": 55520 + }, + { + "epoch": 4.303150063931187, + "grad_norm": 1.3129872643656353, + "learning_rate": 2.1516584004959702e-07, + "loss": 0.9729, + "step": 55530 + }, + { + "epoch": 4.303924987407494, + "grad_norm": 1.4247574115860058, + "learning_rate": 2.1520458772473651e-07, + "loss": 0.969, + "step": 55540 + }, + { + "epoch": 4.3046999108838, + "grad_norm": 1.3476619808696808, + "learning_rate": 2.15243335399876e-07, + "loss": 0.962, + "step": 55550 + }, + { + "epoch": 4.305474834360107, + "grad_norm": 1.315806392903147, + "learning_rate": 2.152820830750155e-07, + "loss": 0.9765, + "step": 55560 + }, + { + "epoch": 4.306249757836413, + "grad_norm": 1.3793857374207539, + "learning_rate": 2.15320830750155e-07, + "loss": 0.9638, + "step": 55570 + }, + { + "epoch": 4.30702468131272, + "grad_norm": 1.4329674226059943, + "learning_rate": 2.153595784252945e-07, + "loss": 0.995, + "step": 55580 + }, + { + "epoch": 4.307799604789027, + "grad_norm": 1.3508951595233893, + "learning_rate": 2.1539832610043397e-07, + "loss": 0.9892, + "step": 55590 + }, + { + "epoch": 4.308574528265334, + "grad_norm": 1.2713896804253308, + "learning_rate": 2.1543707377557346e-07, + "loss": 0.9698, + "step": 55600 + }, + { + "epoch": 4.309349451741641, + "grad_norm": 1.3797522250256504, + "learning_rate": 2.1547582145071296e-07, + "loss": 0.9688, + "step": 55610 + }, + { + "epoch": 4.3101243752179474, + "grad_norm": 1.3416790233245426, + "learning_rate": 2.1551456912585245e-07, + "loss": 0.9856, + "step": 55620 + }, + { + "epoch": 4.310899298694254, + "grad_norm": 1.3722578635219744, + "learning_rate": 2.1555331680099195e-07, + "loss": 0.9588, + "step": 55630 + }, + { + "epoch": 4.311674222170561, + "grad_norm": 1.4641153658764483, + "learning_rate": 2.1559206447613144e-07, + "loss": 0.9766, + "step": 55640 + }, + { + "epoch": 4.312449145646867, + "grad_norm": 1.3715716042166506, + "learning_rate": 2.1563081215127094e-07, + "loss": 0.9844, + "step": 55650 + }, + { + "epoch": 4.313224069123174, + "grad_norm": 1.2845540204013266, + "learning_rate": 2.156695598264104e-07, + "loss": 0.9741, + "step": 55660 + }, + { + "epoch": 4.313998992599481, + "grad_norm": 1.3544029630964107, + "learning_rate": 2.157083075015499e-07, + "loss": 0.9459, + "step": 55670 + }, + { + "epoch": 4.314773916075787, + "grad_norm": 1.2763451260232082, + "learning_rate": 2.157470551766894e-07, + "loss": 0.9613, + "step": 55680 + }, + { + "epoch": 4.315548839552094, + "grad_norm": 1.3235754079453776, + "learning_rate": 2.157858028518289e-07, + "loss": 0.9776, + "step": 55690 + }, + { + "epoch": 4.316323763028401, + "grad_norm": 1.3929061869038293, + "learning_rate": 2.158245505269684e-07, + "loss": 0.9623, + "step": 55700 + }, + { + "epoch": 4.317098686504708, + "grad_norm": 1.2748499167286789, + "learning_rate": 2.1586329820210789e-07, + "loss": 0.976, + "step": 55710 + }, + { + "epoch": 4.317873609981015, + "grad_norm": 1.4301763086108021, + "learning_rate": 2.1590204587724738e-07, + "loss": 0.9646, + "step": 55720 + }, + { + "epoch": 4.3186485334573215, + "grad_norm": 1.326789504730127, + "learning_rate": 2.1594079355238685e-07, + "loss": 0.9565, + "step": 55730 + }, + { + "epoch": 4.319423456933627, + "grad_norm": 1.3600703452129106, + "learning_rate": 2.1597954122752635e-07, + "loss": 0.9606, + "step": 55740 + }, + { + "epoch": 4.320198380409934, + "grad_norm": 1.3248658473045398, + "learning_rate": 2.1601828890266584e-07, + "loss": 0.9622, + "step": 55750 + }, + { + "epoch": 4.320973303886241, + "grad_norm": 1.293586184213967, + "learning_rate": 2.1605703657780534e-07, + "loss": 0.9606, + "step": 55760 + }, + { + "epoch": 4.321748227362548, + "grad_norm": 1.293273578339827, + "learning_rate": 2.1609578425294483e-07, + "loss": 0.974, + "step": 55770 + }, + { + "epoch": 4.322523150838855, + "grad_norm": 1.400869192240441, + "learning_rate": 2.1613453192808433e-07, + "loss": 0.9675, + "step": 55780 + }, + { + "epoch": 4.323298074315161, + "grad_norm": 1.342791810895466, + "learning_rate": 2.1617327960322382e-07, + "loss": 0.9567, + "step": 55790 + }, + { + "epoch": 4.324072997791468, + "grad_norm": 1.389579963662096, + "learning_rate": 2.162120272783633e-07, + "loss": 0.9531, + "step": 55800 + }, + { + "epoch": 4.324847921267775, + "grad_norm": 1.35613083787291, + "learning_rate": 2.162507749535028e-07, + "loss": 0.964, + "step": 55810 + }, + { + "epoch": 4.325622844744082, + "grad_norm": 1.2886969214643793, + "learning_rate": 2.1628952262864228e-07, + "loss": 0.9686, + "step": 55820 + }, + { + "epoch": 4.326397768220389, + "grad_norm": 1.325115665281215, + "learning_rate": 2.1632827030378178e-07, + "loss": 0.9462, + "step": 55830 + }, + { + "epoch": 4.327172691696695, + "grad_norm": 1.3080164706300703, + "learning_rate": 2.1636701797892127e-07, + "loss": 0.9564, + "step": 55840 + }, + { + "epoch": 4.327947615173001, + "grad_norm": 1.344140335512729, + "learning_rate": 2.1640576565406077e-07, + "loss": 0.9738, + "step": 55850 + }, + { + "epoch": 4.328722538649308, + "grad_norm": 1.3168821722393682, + "learning_rate": 2.1644451332920027e-07, + "loss": 0.9715, + "step": 55860 + }, + { + "epoch": 4.329497462125615, + "grad_norm": 1.3235654739576153, + "learning_rate": 2.1648326100433973e-07, + "loss": 0.9416, + "step": 55870 + }, + { + "epoch": 4.330272385601922, + "grad_norm": 1.317056001685752, + "learning_rate": 2.1652200867947923e-07, + "loss": 0.961, + "step": 55880 + }, + { + "epoch": 4.331047309078229, + "grad_norm": 1.217090733438901, + "learning_rate": 2.1656075635461873e-07, + "loss": 0.9596, + "step": 55890 + }, + { + "epoch": 4.3318222325545355, + "grad_norm": 1.3136813234456313, + "learning_rate": 2.1659950402975822e-07, + "loss": 0.9965, + "step": 55900 + }, + { + "epoch": 4.332597156030842, + "grad_norm": 1.339027662343281, + "learning_rate": 2.1663825170489772e-07, + "loss": 0.9955, + "step": 55910 + }, + { + "epoch": 4.333372079507148, + "grad_norm": 1.32020941691065, + "learning_rate": 2.166769993800372e-07, + "loss": 0.9724, + "step": 55920 + }, + { + "epoch": 4.334147002983455, + "grad_norm": 1.3539299622497196, + "learning_rate": 2.1671574705517668e-07, + "loss": 0.9522, + "step": 55930 + }, + { + "epoch": 4.334921926459762, + "grad_norm": 1.2261929118631951, + "learning_rate": 2.1675449473031618e-07, + "loss": 0.9777, + "step": 55940 + }, + { + "epoch": 4.335696849936069, + "grad_norm": 1.3666408122662101, + "learning_rate": 2.1679324240545567e-07, + "loss": 0.9793, + "step": 55950 + }, + { + "epoch": 4.336471773412375, + "grad_norm": 1.3877149276108525, + "learning_rate": 2.1683199008059517e-07, + "loss": 0.9635, + "step": 55960 + }, + { + "epoch": 4.337246696888682, + "grad_norm": 1.303202919281371, + "learning_rate": 2.1687073775573466e-07, + "loss": 0.9809, + "step": 55970 + }, + { + "epoch": 4.338021620364989, + "grad_norm": 1.324612818138615, + "learning_rate": 2.1690948543087416e-07, + "loss": 0.9716, + "step": 55980 + }, + { + "epoch": 4.338796543841296, + "grad_norm": 1.272220161244093, + "learning_rate": 2.1694823310601365e-07, + "loss": 0.9725, + "step": 55990 + }, + { + "epoch": 4.339571467317603, + "grad_norm": 1.3446109551530137, + "learning_rate": 2.1698698078115312e-07, + "loss": 0.9584, + "step": 56000 + }, + { + "epoch": 4.339571467317603, + "eval_loss": 0.9685981273651123, + "eval_runtime": 319.3372, + "eval_samples_per_second": 35.921, + "eval_steps_per_second": 8.981, + "step": 56000 + }, + { + "epoch": 4.3403463907939095, + "grad_norm": 1.3459201938765484, + "learning_rate": 2.1702572845629262e-07, + "loss": 0.965, + "step": 56010 + }, + { + "epoch": 4.341121314270215, + "grad_norm": 1.3435667833283569, + "learning_rate": 2.1706447613143211e-07, + "loss": 0.9832, + "step": 56020 + }, + { + "epoch": 4.341896237746522, + "grad_norm": 1.307345863017576, + "learning_rate": 2.171032238065716e-07, + "loss": 0.9712, + "step": 56030 + }, + { + "epoch": 4.342671161222829, + "grad_norm": 1.3218075112414749, + "learning_rate": 2.171419714817111e-07, + "loss": 0.9708, + "step": 56040 + }, + { + "epoch": 4.343446084699136, + "grad_norm": 1.2263336736044923, + "learning_rate": 2.171807191568506e-07, + "loss": 0.9439, + "step": 56050 + }, + { + "epoch": 4.344221008175443, + "grad_norm": 1.425464747363822, + "learning_rate": 2.172194668319901e-07, + "loss": 0.9711, + "step": 56060 + }, + { + "epoch": 4.3449959316517495, + "grad_norm": 1.4208394058001228, + "learning_rate": 2.1725821450712957e-07, + "loss": 0.9692, + "step": 56070 + }, + { + "epoch": 4.345770855128056, + "grad_norm": 1.3371718233276917, + "learning_rate": 2.1729696218226906e-07, + "loss": 0.9571, + "step": 56080 + }, + { + "epoch": 4.346545778604363, + "grad_norm": 1.2966055976774014, + "learning_rate": 2.1733570985740856e-07, + "loss": 0.97, + "step": 56090 + }, + { + "epoch": 4.34732070208067, + "grad_norm": 1.4853384432706842, + "learning_rate": 2.1737445753254805e-07, + "loss": 0.9612, + "step": 56100 + }, + { + "epoch": 4.348095625556976, + "grad_norm": 1.3892769938686598, + "learning_rate": 2.1741320520768755e-07, + "loss": 0.98, + "step": 56110 + }, + { + "epoch": 4.348870549033283, + "grad_norm": 1.372823330445538, + "learning_rate": 2.1745195288282704e-07, + "loss": 0.958, + "step": 56120 + }, + { + "epoch": 4.349645472509589, + "grad_norm": 1.3271720455638079, + "learning_rate": 2.1749070055796654e-07, + "loss": 0.9566, + "step": 56130 + }, + { + "epoch": 4.350420395985896, + "grad_norm": 1.412258967226294, + "learning_rate": 2.17529448233106e-07, + "loss": 0.9839, + "step": 56140 + }, + { + "epoch": 4.351195319462203, + "grad_norm": 1.3346881508108077, + "learning_rate": 2.175681959082455e-07, + "loss": 0.9706, + "step": 56150 + }, + { + "epoch": 4.35197024293851, + "grad_norm": 1.3978738980255714, + "learning_rate": 2.17606943583385e-07, + "loss": 0.9547, + "step": 56160 + }, + { + "epoch": 4.352745166414817, + "grad_norm": 1.324612605302438, + "learning_rate": 2.176456912585245e-07, + "loss": 0.9696, + "step": 56170 + }, + { + "epoch": 4.3535200898911235, + "grad_norm": 1.3656062699247618, + "learning_rate": 2.17684438933664e-07, + "loss": 0.969, + "step": 56180 + }, + { + "epoch": 4.35429501336743, + "grad_norm": 1.2618890833688032, + "learning_rate": 2.1772318660880348e-07, + "loss": 0.9663, + "step": 56190 + }, + { + "epoch": 4.355069936843737, + "grad_norm": 1.2337342834328209, + "learning_rate": 2.1776193428394298e-07, + "loss": 0.9651, + "step": 56200 + }, + { + "epoch": 4.355844860320043, + "grad_norm": 1.391349751887087, + "learning_rate": 2.1780068195908245e-07, + "loss": 0.9751, + "step": 56210 + }, + { + "epoch": 4.35661978379635, + "grad_norm": 1.3818890411633296, + "learning_rate": 2.1783942963422194e-07, + "loss": 1.004, + "step": 56220 + }, + { + "epoch": 4.357394707272657, + "grad_norm": 1.3944721582655482, + "learning_rate": 2.1787817730936144e-07, + "loss": 0.9791, + "step": 56230 + }, + { + "epoch": 4.3581696307489635, + "grad_norm": 1.3816291483211325, + "learning_rate": 2.1791692498450094e-07, + "loss": 0.9739, + "step": 56240 + }, + { + "epoch": 4.35894455422527, + "grad_norm": 1.296227862947385, + "learning_rate": 2.1795567265964043e-07, + "loss": 0.9526, + "step": 56250 + }, + { + "epoch": 4.359719477701577, + "grad_norm": 1.3135126848464076, + "learning_rate": 2.1799442033477993e-07, + "loss": 0.9619, + "step": 56260 + }, + { + "epoch": 4.360494401177884, + "grad_norm": 1.3411969302103992, + "learning_rate": 2.180331680099194e-07, + "loss": 0.9448, + "step": 56270 + }, + { + "epoch": 4.361269324654191, + "grad_norm": 1.3860877875310422, + "learning_rate": 2.180719156850589e-07, + "loss": 0.9641, + "step": 56280 + }, + { + "epoch": 4.3620442481304975, + "grad_norm": 1.3695522882249942, + "learning_rate": 2.181106633601984e-07, + "loss": 0.9608, + "step": 56290 + }, + { + "epoch": 4.362819171606803, + "grad_norm": 1.334275248559548, + "learning_rate": 2.1814941103533788e-07, + "loss": 0.9727, + "step": 56300 + }, + { + "epoch": 4.36359409508311, + "grad_norm": 1.3885229900225413, + "learning_rate": 2.1818815871047738e-07, + "loss": 0.9729, + "step": 56310 + }, + { + "epoch": 4.364369018559417, + "grad_norm": 1.3116013609162582, + "learning_rate": 2.1822690638561687e-07, + "loss": 0.9401, + "step": 56320 + }, + { + "epoch": 4.365143942035724, + "grad_norm": 1.3592824026444406, + "learning_rate": 2.1826565406075637e-07, + "loss": 0.9799, + "step": 56330 + }, + { + "epoch": 4.365918865512031, + "grad_norm": 1.3152363884219664, + "learning_rate": 2.1830440173589584e-07, + "loss": 0.9449, + "step": 56340 + }, + { + "epoch": 4.3666937889883375, + "grad_norm": 1.4634999452751936, + "learning_rate": 2.1834314941103533e-07, + "loss": 0.9678, + "step": 56350 + }, + { + "epoch": 4.367468712464644, + "grad_norm": 1.3554525719021875, + "learning_rate": 2.1838189708617483e-07, + "loss": 0.9599, + "step": 56360 + }, + { + "epoch": 4.368243635940951, + "grad_norm": 1.3149433443373764, + "learning_rate": 2.1842064476131432e-07, + "loss": 0.948, + "step": 56370 + }, + { + "epoch": 4.369018559417258, + "grad_norm": 1.2902250400977207, + "learning_rate": 2.1845939243645382e-07, + "loss": 0.9652, + "step": 56380 + }, + { + "epoch": 4.369793482893565, + "grad_norm": 1.453066913500542, + "learning_rate": 2.1849814011159332e-07, + "loss": 1.0098, + "step": 56390 + }, + { + "epoch": 4.370568406369871, + "grad_norm": 1.4273069526210551, + "learning_rate": 2.185368877867328e-07, + "loss": 0.9518, + "step": 56400 + }, + { + "epoch": 4.3713433298461775, + "grad_norm": 1.3595647113557623, + "learning_rate": 2.1857563546187228e-07, + "loss": 0.972, + "step": 56410 + }, + { + "epoch": 4.372118253322484, + "grad_norm": 1.3107847186618422, + "learning_rate": 2.1861438313701178e-07, + "loss": 0.9729, + "step": 56420 + }, + { + "epoch": 4.372893176798791, + "grad_norm": 1.3662730520171493, + "learning_rate": 2.1865313081215127e-07, + "loss": 1.0029, + "step": 56430 + }, + { + "epoch": 4.373668100275098, + "grad_norm": 1.443337475816391, + "learning_rate": 2.1869187848729077e-07, + "loss": 0.9964, + "step": 56440 + }, + { + "epoch": 4.374443023751405, + "grad_norm": 1.292537934510638, + "learning_rate": 2.1873062616243026e-07, + "loss": 0.9513, + "step": 56450 + }, + { + "epoch": 4.3752179472277115, + "grad_norm": 1.3929141521017008, + "learning_rate": 2.1876937383756976e-07, + "loss": 0.9942, + "step": 56460 + }, + { + "epoch": 4.375992870704018, + "grad_norm": 1.3593064267294916, + "learning_rate": 2.1880812151270925e-07, + "loss": 0.9641, + "step": 56470 + }, + { + "epoch": 4.376767794180324, + "grad_norm": 1.2857109332039163, + "learning_rate": 2.1884686918784872e-07, + "loss": 0.9614, + "step": 56480 + }, + { + "epoch": 4.377542717656631, + "grad_norm": 1.4402022228484868, + "learning_rate": 2.1888561686298822e-07, + "loss": 0.9583, + "step": 56490 + }, + { + "epoch": 4.378317641132938, + "grad_norm": 1.391130398309559, + "learning_rate": 2.1892436453812771e-07, + "loss": 0.9828, + "step": 56500 + }, + { + "epoch": 4.378317641132938, + "eval_loss": 0.9678364992141724, + "eval_runtime": 319.303, + "eval_samples_per_second": 35.925, + "eval_steps_per_second": 8.982, + "step": 56500 + }, + { + "epoch": 4.379092564609245, + "grad_norm": 1.386590224077471, + "learning_rate": 2.189631122132672e-07, + "loss": 0.9539, + "step": 56510 + }, + { + "epoch": 4.3798674880855515, + "grad_norm": 1.2527343481547548, + "learning_rate": 2.190018598884067e-07, + "loss": 0.9514, + "step": 56520 + }, + { + "epoch": 4.380642411561858, + "grad_norm": 1.3942009653629928, + "learning_rate": 2.190406075635462e-07, + "loss": 0.9684, + "step": 56530 + }, + { + "epoch": 4.381417335038165, + "grad_norm": 1.3006333002051051, + "learning_rate": 2.190793552386857e-07, + "loss": 0.9676, + "step": 56540 + }, + { + "epoch": 4.382192258514472, + "grad_norm": 1.3419417340262931, + "learning_rate": 2.1911810291382516e-07, + "loss": 0.9908, + "step": 56550 + }, + { + "epoch": 4.382967181990779, + "grad_norm": 1.384405296365162, + "learning_rate": 2.1915685058896466e-07, + "loss": 0.9572, + "step": 56560 + }, + { + "epoch": 4.3837421054670855, + "grad_norm": 1.352579461472999, + "learning_rate": 2.1919559826410416e-07, + "loss": 0.9629, + "step": 56570 + }, + { + "epoch": 4.3845170289433915, + "grad_norm": 1.319034296330599, + "learning_rate": 2.1923434593924365e-07, + "loss": 0.9704, + "step": 56580 + }, + { + "epoch": 4.385291952419698, + "grad_norm": 1.314865821948548, + "learning_rate": 2.1927309361438315e-07, + "loss": 0.9568, + "step": 56590 + }, + { + "epoch": 4.386066875896005, + "grad_norm": 1.250622720619022, + "learning_rate": 2.1931184128952264e-07, + "loss": 0.9514, + "step": 56600 + }, + { + "epoch": 4.386841799372312, + "grad_norm": 1.3365416232088088, + "learning_rate": 2.1935058896466214e-07, + "loss": 1.0058, + "step": 56610 + }, + { + "epoch": 4.387616722848619, + "grad_norm": 1.3403448872686068, + "learning_rate": 2.193893366398016e-07, + "loss": 0.9377, + "step": 56620 + }, + { + "epoch": 4.3883916463249255, + "grad_norm": 1.3838620193661577, + "learning_rate": 2.194280843149411e-07, + "loss": 0.9642, + "step": 56630 + }, + { + "epoch": 4.389166569801232, + "grad_norm": 1.3874814917479512, + "learning_rate": 2.194668319900806e-07, + "loss": 0.9703, + "step": 56640 + }, + { + "epoch": 4.389941493277539, + "grad_norm": 1.367382308995428, + "learning_rate": 2.195055796652201e-07, + "loss": 0.9724, + "step": 56650 + }, + { + "epoch": 4.390716416753846, + "grad_norm": 1.396818362623014, + "learning_rate": 2.195443273403596e-07, + "loss": 0.9568, + "step": 56660 + }, + { + "epoch": 4.391491340230152, + "grad_norm": 1.3785280134180924, + "learning_rate": 2.1958307501549908e-07, + "loss": 0.9457, + "step": 56670 + }, + { + "epoch": 4.392266263706459, + "grad_norm": 1.3527104989484442, + "learning_rate": 2.1962182269063855e-07, + "loss": 0.9433, + "step": 56680 + }, + { + "epoch": 4.3930411871827655, + "grad_norm": 1.2979261188975877, + "learning_rate": 2.1966057036577805e-07, + "loss": 0.962, + "step": 56690 + }, + { + "epoch": 4.393816110659072, + "grad_norm": 1.3295840998662194, + "learning_rate": 2.1969931804091754e-07, + "loss": 0.9654, + "step": 56700 + }, + { + "epoch": 4.394591034135379, + "grad_norm": 1.4207347968389932, + "learning_rate": 2.1973806571605704e-07, + "loss": 0.9662, + "step": 56710 + }, + { + "epoch": 4.395365957611686, + "grad_norm": 1.380745577368301, + "learning_rate": 2.1977681339119654e-07, + "loss": 0.9608, + "step": 56720 + }, + { + "epoch": 4.396140881087993, + "grad_norm": 1.337876348373645, + "learning_rate": 2.1981556106633603e-07, + "loss": 0.9648, + "step": 56730 + }, + { + "epoch": 4.3969158045642995, + "grad_norm": 1.285802053324713, + "learning_rate": 2.1985430874147553e-07, + "loss": 0.9713, + "step": 56740 + }, + { + "epoch": 4.397690728040606, + "grad_norm": 1.3114140284625337, + "learning_rate": 2.19893056416615e-07, + "loss": 0.9723, + "step": 56750 + }, + { + "epoch": 4.398465651516913, + "grad_norm": 1.3747337083929887, + "learning_rate": 2.199318040917545e-07, + "loss": 0.967, + "step": 56760 + }, + { + "epoch": 4.399240574993219, + "grad_norm": 1.3193323719251409, + "learning_rate": 2.1997055176689399e-07, + "loss": 0.9589, + "step": 56770 + }, + { + "epoch": 4.400015498469526, + "grad_norm": 1.3593109710967344, + "learning_rate": 2.2000929944203348e-07, + "loss": 0.9972, + "step": 56780 + }, + { + "epoch": 4.400790421945833, + "grad_norm": 1.3916116292601082, + "learning_rate": 2.2004804711717298e-07, + "loss": 0.9849, + "step": 56790 + }, + { + "epoch": 4.4015653454221395, + "grad_norm": 1.4279138203235595, + "learning_rate": 2.2008679479231247e-07, + "loss": 0.9577, + "step": 56800 + }, + { + "epoch": 4.402340268898446, + "grad_norm": 1.2600204716359635, + "learning_rate": 2.2012554246745197e-07, + "loss": 0.959, + "step": 56810 + }, + { + "epoch": 4.403115192374753, + "grad_norm": 1.342451167699987, + "learning_rate": 2.2016429014259144e-07, + "loss": 0.9753, + "step": 56820 + }, + { + "epoch": 4.40389011585106, + "grad_norm": 1.379904550548141, + "learning_rate": 2.2020303781773093e-07, + "loss": 0.9683, + "step": 56830 + }, + { + "epoch": 4.404665039327367, + "grad_norm": 1.3553001478420001, + "learning_rate": 2.2024178549287043e-07, + "loss": 0.976, + "step": 56840 + }, + { + "epoch": 4.405439962803673, + "grad_norm": 1.3285723802206202, + "learning_rate": 2.2028053316800992e-07, + "loss": 0.9531, + "step": 56850 + }, + { + "epoch": 4.4062148862799795, + "grad_norm": 1.2979144569999788, + "learning_rate": 2.2031928084314942e-07, + "loss": 0.9725, + "step": 56860 + }, + { + "epoch": 4.406989809756286, + "grad_norm": 1.2855092990157715, + "learning_rate": 2.2035802851828891e-07, + "loss": 0.9507, + "step": 56870 + }, + { + "epoch": 4.407764733232593, + "grad_norm": 1.342423993376262, + "learning_rate": 2.203967761934284e-07, + "loss": 0.9692, + "step": 56880 + }, + { + "epoch": 4.4085396567089, + "grad_norm": 1.3871224226566383, + "learning_rate": 2.2043552386856788e-07, + "loss": 0.9749, + "step": 56890 + }, + { + "epoch": 4.409314580185207, + "grad_norm": 1.3014962691175913, + "learning_rate": 2.2047427154370737e-07, + "loss": 0.9618, + "step": 56900 + }, + { + "epoch": 4.4100895036615135, + "grad_norm": 1.3478601833985544, + "learning_rate": 2.2051301921884687e-07, + "loss": 0.963, + "step": 56910 + }, + { + "epoch": 4.41086442713782, + "grad_norm": 1.3359391344859293, + "learning_rate": 2.2055176689398637e-07, + "loss": 0.9623, + "step": 56920 + }, + { + "epoch": 4.411639350614127, + "grad_norm": 1.3324160424856852, + "learning_rate": 2.2059051456912586e-07, + "loss": 0.9618, + "step": 56930 + }, + { + "epoch": 4.412414274090434, + "grad_norm": 1.3470844899129475, + "learning_rate": 2.2062926224426536e-07, + "loss": 0.9421, + "step": 56940 + }, + { + "epoch": 4.41318919756674, + "grad_norm": 1.292354809402336, + "learning_rate": 2.2066800991940485e-07, + "loss": 0.9439, + "step": 56950 + }, + { + "epoch": 4.413964121043047, + "grad_norm": 1.4527169703812324, + "learning_rate": 2.2070675759454432e-07, + "loss": 0.9633, + "step": 56960 + }, + { + "epoch": 4.4147390445193535, + "grad_norm": 1.415041816285688, + "learning_rate": 2.2074550526968382e-07, + "loss": 0.966, + "step": 56970 + }, + { + "epoch": 4.41551396799566, + "grad_norm": 1.4080891593892608, + "learning_rate": 2.207842529448233e-07, + "loss": 0.9639, + "step": 56980 + }, + { + "epoch": 4.416288891471967, + "grad_norm": 1.348344951583581, + "learning_rate": 2.208230006199628e-07, + "loss": 0.965, + "step": 56990 + }, + { + "epoch": 4.417063814948274, + "grad_norm": 1.3609273756327263, + "learning_rate": 2.208617482951023e-07, + "loss": 0.9623, + "step": 57000 + }, + { + "epoch": 4.417063814948274, + "eval_loss": 0.9669799208641052, + "eval_runtime": 321.4799, + "eval_samples_per_second": 35.682, + "eval_steps_per_second": 8.921, + "step": 57000 + }, + { + "epoch": 4.417838738424581, + "grad_norm": 1.4381488278791206, + "learning_rate": 2.209004959702418e-07, + "loss": 0.9615, + "step": 57010 + }, + { + "epoch": 4.418613661900888, + "grad_norm": 1.3420743384257097, + "learning_rate": 2.2093924364538127e-07, + "loss": 0.9843, + "step": 57020 + }, + { + "epoch": 4.419388585377194, + "grad_norm": 1.4286902467877285, + "learning_rate": 2.2097799132052076e-07, + "loss": 0.9699, + "step": 57030 + }, + { + "epoch": 4.4201635088535, + "grad_norm": 1.3527331793871553, + "learning_rate": 2.2101673899566026e-07, + "loss": 0.982, + "step": 57040 + }, + { + "epoch": 4.420938432329807, + "grad_norm": 1.2985966711478947, + "learning_rate": 2.2105548667079975e-07, + "loss": 0.9667, + "step": 57050 + }, + { + "epoch": 4.421713355806114, + "grad_norm": 1.380456561758659, + "learning_rate": 2.2109423434593925e-07, + "loss": 0.94, + "step": 57060 + }, + { + "epoch": 4.422488279282421, + "grad_norm": 1.328886589075594, + "learning_rate": 2.2113298202107875e-07, + "loss": 0.9818, + "step": 57070 + }, + { + "epoch": 4.4232632027587275, + "grad_norm": 1.2805015004936569, + "learning_rate": 2.2117172969621824e-07, + "loss": 0.9372, + "step": 57080 + }, + { + "epoch": 4.424038126235034, + "grad_norm": 1.3945947123356421, + "learning_rate": 2.212104773713577e-07, + "loss": 0.985, + "step": 57090 + }, + { + "epoch": 4.424813049711341, + "grad_norm": 1.374117961730431, + "learning_rate": 2.212492250464972e-07, + "loss": 0.9616, + "step": 57100 + }, + { + "epoch": 4.425587973187648, + "grad_norm": 1.4155395937108652, + "learning_rate": 2.212879727216367e-07, + "loss": 0.9658, + "step": 57110 + }, + { + "epoch": 4.426362896663955, + "grad_norm": 1.3133057071592722, + "learning_rate": 2.213267203967762e-07, + "loss": 0.9809, + "step": 57120 + }, + { + "epoch": 4.427137820140262, + "grad_norm": 1.3577401912511018, + "learning_rate": 2.213654680719157e-07, + "loss": 0.9704, + "step": 57130 + }, + { + "epoch": 4.4279127436165675, + "grad_norm": 1.288947492444333, + "learning_rate": 2.214042157470552e-07, + "loss": 0.9466, + "step": 57140 + }, + { + "epoch": 4.428687667092874, + "grad_norm": 1.3611496718178477, + "learning_rate": 2.2144296342219468e-07, + "loss": 0.9464, + "step": 57150 + }, + { + "epoch": 4.429462590569181, + "grad_norm": 1.3236119488049467, + "learning_rate": 2.2148171109733415e-07, + "loss": 0.9588, + "step": 57160 + }, + { + "epoch": 4.430237514045488, + "grad_norm": 1.288203407084608, + "learning_rate": 2.2152045877247365e-07, + "loss": 0.9677, + "step": 57170 + }, + { + "epoch": 4.431012437521795, + "grad_norm": 1.3380887648349455, + "learning_rate": 2.2155920644761314e-07, + "loss": 0.9475, + "step": 57180 + }, + { + "epoch": 4.4317873609981016, + "grad_norm": 1.3719659029835323, + "learning_rate": 2.2159795412275264e-07, + "loss": 0.9612, + "step": 57190 + }, + { + "epoch": 4.432562284474408, + "grad_norm": 1.3894146462238066, + "learning_rate": 2.2163670179789213e-07, + "loss": 0.9742, + "step": 57200 + }, + { + "epoch": 4.433337207950715, + "grad_norm": 1.3853193185569346, + "learning_rate": 2.2167544947303163e-07, + "loss": 0.9665, + "step": 57210 + }, + { + "epoch": 4.434112131427022, + "grad_norm": 1.4106159762591395, + "learning_rate": 2.2171419714817113e-07, + "loss": 0.9752, + "step": 57220 + }, + { + "epoch": 4.434887054903328, + "grad_norm": 1.3599847087239487, + "learning_rate": 2.217529448233106e-07, + "loss": 0.952, + "step": 57230 + }, + { + "epoch": 4.435661978379635, + "grad_norm": 1.3451800584443836, + "learning_rate": 2.217916924984501e-07, + "loss": 0.9514, + "step": 57240 + }, + { + "epoch": 4.4364369018559415, + "grad_norm": 1.3793063606696188, + "learning_rate": 2.2183044017358959e-07, + "loss": 0.9693, + "step": 57250 + }, + { + "epoch": 4.437211825332248, + "grad_norm": 1.2927694573925306, + "learning_rate": 2.2186918784872908e-07, + "loss": 0.9518, + "step": 57260 + }, + { + "epoch": 4.437986748808555, + "grad_norm": 1.2805359646453964, + "learning_rate": 2.2190793552386858e-07, + "loss": 0.9901, + "step": 57270 + }, + { + "epoch": 4.438761672284862, + "grad_norm": 1.2973987785731207, + "learning_rate": 2.2194668319900807e-07, + "loss": 0.9576, + "step": 57280 + }, + { + "epoch": 4.439536595761169, + "grad_norm": 1.341967315427913, + "learning_rate": 2.2198543087414757e-07, + "loss": 0.9443, + "step": 57290 + }, + { + "epoch": 4.440311519237476, + "grad_norm": 1.3260143516205904, + "learning_rate": 2.2202417854928704e-07, + "loss": 0.9698, + "step": 57300 + }, + { + "epoch": 4.441086442713782, + "grad_norm": 1.357354037492798, + "learning_rate": 2.2206292622442653e-07, + "loss": 0.9523, + "step": 57310 + }, + { + "epoch": 4.441861366190089, + "grad_norm": 1.4013981884051865, + "learning_rate": 2.2210167389956603e-07, + "loss": 0.9808, + "step": 57320 + }, + { + "epoch": 4.442636289666395, + "grad_norm": 1.3170210020188475, + "learning_rate": 2.2214042157470552e-07, + "loss": 0.9575, + "step": 57330 + }, + { + "epoch": 4.443411213142702, + "grad_norm": 1.3014717805651634, + "learning_rate": 2.2217916924984502e-07, + "loss": 0.9653, + "step": 57340 + }, + { + "epoch": 4.444186136619009, + "grad_norm": 1.3635525819259517, + "learning_rate": 2.2221791692498451e-07, + "loss": 0.9711, + "step": 57350 + }, + { + "epoch": 4.4449610600953156, + "grad_norm": 1.3456873226092831, + "learning_rate": 2.2225666460012398e-07, + "loss": 0.9571, + "step": 57360 + }, + { + "epoch": 4.445735983571622, + "grad_norm": 1.4029083703630723, + "learning_rate": 2.2229541227526348e-07, + "loss": 0.9706, + "step": 57370 + }, + { + "epoch": 4.446510907047929, + "grad_norm": 1.3683564500353742, + "learning_rate": 2.2233415995040297e-07, + "loss": 0.9723, + "step": 57380 + }, + { + "epoch": 4.447285830524236, + "grad_norm": 1.3704909138463826, + "learning_rate": 2.2237290762554247e-07, + "loss": 0.9852, + "step": 57390 + }, + { + "epoch": 4.448060754000543, + "grad_norm": 1.3155554582361033, + "learning_rate": 2.2241165530068197e-07, + "loss": 0.9937, + "step": 57400 + }, + { + "epoch": 4.448835677476849, + "grad_norm": 1.3654541139621428, + "learning_rate": 2.2245040297582146e-07, + "loss": 0.9649, + "step": 57410 + }, + { + "epoch": 4.4496106009531555, + "grad_norm": 1.3332112006373484, + "learning_rate": 2.2248915065096096e-07, + "loss": 0.9643, + "step": 57420 + }, + { + "epoch": 4.450385524429462, + "grad_norm": 1.3442880183473, + "learning_rate": 2.2252789832610043e-07, + "loss": 0.9569, + "step": 57430 + }, + { + "epoch": 4.451160447905769, + "grad_norm": 1.2958905699540428, + "learning_rate": 2.2256664600123992e-07, + "loss": 0.9776, + "step": 57440 + }, + { + "epoch": 4.451935371382076, + "grad_norm": 1.3754341841927553, + "learning_rate": 2.2260539367637942e-07, + "loss": 0.979, + "step": 57450 + }, + { + "epoch": 4.452710294858383, + "grad_norm": 1.361389474337367, + "learning_rate": 2.226441413515189e-07, + "loss": 0.9828, + "step": 57460 + }, + { + "epoch": 4.45348521833469, + "grad_norm": 1.3334472222976101, + "learning_rate": 2.226828890266584e-07, + "loss": 0.9524, + "step": 57470 + }, + { + "epoch": 4.454260141810996, + "grad_norm": 1.334819025668313, + "learning_rate": 2.227216367017979e-07, + "loss": 0.9683, + "step": 57480 + }, + { + "epoch": 4.455035065287303, + "grad_norm": 1.3658480091916938, + "learning_rate": 2.227603843769374e-07, + "loss": 0.9527, + "step": 57490 + }, + { + "epoch": 4.45580998876361, + "grad_norm": 1.4672869226881913, + "learning_rate": 2.2279913205207687e-07, + "loss": 0.9723, + "step": 57500 + }, + { + "epoch": 4.45580998876361, + "eval_loss": 0.9662472605705261, + "eval_runtime": 318.4606, + "eval_samples_per_second": 36.02, + "eval_steps_per_second": 9.006, + "step": 57500 + }, + { + "epoch": 4.456584912239916, + "grad_norm": 1.4001965992968783, + "learning_rate": 2.2283787972721636e-07, + "loss": 0.954, + "step": 57510 + }, + { + "epoch": 4.457359835716223, + "grad_norm": 1.3709659611580438, + "learning_rate": 2.2287662740235586e-07, + "loss": 0.9635, + "step": 57520 + }, + { + "epoch": 4.4581347591925296, + "grad_norm": 1.3441853611371795, + "learning_rate": 2.2291537507749535e-07, + "loss": 0.9517, + "step": 57530 + }, + { + "epoch": 4.458909682668836, + "grad_norm": 1.326595846756791, + "learning_rate": 2.2295412275263485e-07, + "loss": 0.965, + "step": 57540 + }, + { + "epoch": 4.459684606145143, + "grad_norm": 1.316875378811023, + "learning_rate": 2.2299287042777434e-07, + "loss": 0.9516, + "step": 57550 + }, + { + "epoch": 4.46045952962145, + "grad_norm": 1.3587418840404564, + "learning_rate": 2.2303161810291384e-07, + "loss": 0.9709, + "step": 57560 + }, + { + "epoch": 4.461234453097757, + "grad_norm": 1.37311469246136, + "learning_rate": 2.230703657780533e-07, + "loss": 0.9511, + "step": 57570 + }, + { + "epoch": 4.462009376574064, + "grad_norm": 1.2731855827768506, + "learning_rate": 2.231091134531928e-07, + "loss": 0.9605, + "step": 57580 + }, + { + "epoch": 4.46278430005037, + "grad_norm": 1.3917232179762675, + "learning_rate": 2.231478611283323e-07, + "loss": 0.9873, + "step": 57590 + }, + { + "epoch": 4.463559223526676, + "grad_norm": 1.4032075633971128, + "learning_rate": 2.231866088034718e-07, + "loss": 0.98, + "step": 57600 + }, + { + "epoch": 4.464334147002983, + "grad_norm": 1.3370883932092505, + "learning_rate": 2.232253564786113e-07, + "loss": 0.9609, + "step": 57610 + }, + { + "epoch": 4.46510907047929, + "grad_norm": 1.3307853922090245, + "learning_rate": 2.232641041537508e-07, + "loss": 0.9387, + "step": 57620 + }, + { + "epoch": 4.465883993955597, + "grad_norm": 1.3591372710889886, + "learning_rate": 2.2330285182889028e-07, + "loss": 0.9771, + "step": 57630 + }, + { + "epoch": 4.466658917431904, + "grad_norm": 1.3230693027977891, + "learning_rate": 2.2334159950402975e-07, + "loss": 0.9552, + "step": 57640 + }, + { + "epoch": 4.46743384090821, + "grad_norm": 1.3228123448100313, + "learning_rate": 2.2338034717916925e-07, + "loss": 0.9674, + "step": 57650 + }, + { + "epoch": 4.468208764384517, + "grad_norm": 1.2658262034099939, + "learning_rate": 2.2341909485430874e-07, + "loss": 0.9521, + "step": 57660 + }, + { + "epoch": 4.468983687860824, + "grad_norm": 1.2488806967884154, + "learning_rate": 2.2345784252944824e-07, + "loss": 0.9471, + "step": 57670 + }, + { + "epoch": 4.469758611337131, + "grad_norm": 1.318415323887905, + "learning_rate": 2.2349659020458773e-07, + "loss": 0.9586, + "step": 57680 + }, + { + "epoch": 4.470533534813438, + "grad_norm": 1.447396847177291, + "learning_rate": 2.2353533787972723e-07, + "loss": 0.9904, + "step": 57690 + }, + { + "epoch": 4.4713084582897435, + "grad_norm": 1.3702358219091153, + "learning_rate": 2.2357408555486672e-07, + "loss": 0.9858, + "step": 57700 + }, + { + "epoch": 4.47208338176605, + "grad_norm": 1.4086028604174576, + "learning_rate": 2.236128332300062e-07, + "loss": 0.9644, + "step": 57710 + }, + { + "epoch": 4.472858305242357, + "grad_norm": 1.3717031754394942, + "learning_rate": 2.236515809051457e-07, + "loss": 0.9633, + "step": 57720 + }, + { + "epoch": 4.473633228718664, + "grad_norm": 1.365920810530639, + "learning_rate": 2.2369032858028518e-07, + "loss": 0.952, + "step": 57730 + }, + { + "epoch": 4.474408152194971, + "grad_norm": 1.379511696722963, + "learning_rate": 2.2372907625542468e-07, + "loss": 1.014, + "step": 57740 + }, + { + "epoch": 4.475183075671278, + "grad_norm": 1.344941271977404, + "learning_rate": 2.2376782393056418e-07, + "loss": 0.953, + "step": 57750 + }, + { + "epoch": 4.475957999147584, + "grad_norm": 1.1875972710597187, + "learning_rate": 2.2380657160570367e-07, + "loss": 0.9902, + "step": 57760 + }, + { + "epoch": 4.476732922623891, + "grad_norm": 1.3485976627208385, + "learning_rate": 2.2384531928084314e-07, + "loss": 0.944, + "step": 57770 + }, + { + "epoch": 4.477507846100197, + "grad_norm": 1.3773595172740565, + "learning_rate": 2.2388406695598264e-07, + "loss": 0.9537, + "step": 57780 + }, + { + "epoch": 4.478282769576504, + "grad_norm": 1.3282089204375218, + "learning_rate": 2.2392281463112213e-07, + "loss": 0.9646, + "step": 57790 + }, + { + "epoch": 4.479057693052811, + "grad_norm": 1.3642553977293557, + "learning_rate": 2.2396156230626163e-07, + "loss": 0.9438, + "step": 57800 + }, + { + "epoch": 4.479832616529118, + "grad_norm": 1.3931578561926403, + "learning_rate": 2.2400030998140112e-07, + "loss": 0.974, + "step": 57810 + }, + { + "epoch": 4.480607540005424, + "grad_norm": 1.3204795115167118, + "learning_rate": 2.2403905765654062e-07, + "loss": 0.9515, + "step": 57820 + }, + { + "epoch": 4.481382463481731, + "grad_norm": 1.346718057073902, + "learning_rate": 2.240778053316801e-07, + "loss": 0.9816, + "step": 57830 + }, + { + "epoch": 4.482157386958038, + "grad_norm": 1.3679155265498493, + "learning_rate": 2.2411655300681958e-07, + "loss": 0.9659, + "step": 57840 + }, + { + "epoch": 4.482932310434345, + "grad_norm": 1.4033937124286475, + "learning_rate": 2.2415530068195908e-07, + "loss": 0.9735, + "step": 57850 + }, + { + "epoch": 4.483707233910652, + "grad_norm": 1.3071904059792747, + "learning_rate": 2.2419404835709857e-07, + "loss": 0.9771, + "step": 57860 + }, + { + "epoch": 4.484482157386958, + "grad_norm": 1.2976097166358531, + "learning_rate": 2.2423279603223807e-07, + "loss": 0.9609, + "step": 57870 + }, + { + "epoch": 4.485257080863264, + "grad_norm": 1.4834871040051232, + "learning_rate": 2.2427154370737756e-07, + "loss": 0.9506, + "step": 57880 + }, + { + "epoch": 4.486032004339571, + "grad_norm": 1.3560133794093967, + "learning_rate": 2.2431029138251706e-07, + "loss": 0.9809, + "step": 57890 + }, + { + "epoch": 4.486806927815878, + "grad_norm": 1.428786660109891, + "learning_rate": 2.2434903905765656e-07, + "loss": 0.9533, + "step": 57900 + }, + { + "epoch": 4.487581851292185, + "grad_norm": 1.3367749698855393, + "learning_rate": 2.2438778673279602e-07, + "loss": 0.9794, + "step": 57910 + }, + { + "epoch": 4.488356774768492, + "grad_norm": 1.3879200284561424, + "learning_rate": 2.2442653440793552e-07, + "loss": 0.955, + "step": 57920 + }, + { + "epoch": 4.489131698244798, + "grad_norm": 1.3140230167095974, + "learning_rate": 2.2446528208307502e-07, + "loss": 0.9612, + "step": 57930 + }, + { + "epoch": 4.489906621721105, + "grad_norm": 1.4639812241871868, + "learning_rate": 2.245040297582145e-07, + "loss": 0.9534, + "step": 57940 + }, + { + "epoch": 4.490681545197412, + "grad_norm": 1.3072914880519293, + "learning_rate": 2.24542777433354e-07, + "loss": 0.9682, + "step": 57950 + }, + { + "epoch": 4.491456468673719, + "grad_norm": 1.3477337280753237, + "learning_rate": 2.245815251084935e-07, + "loss": 0.9974, + "step": 57960 + }, + { + "epoch": 4.492231392150025, + "grad_norm": 1.3188804178269469, + "learning_rate": 2.24620272783633e-07, + "loss": 0.9574, + "step": 57970 + }, + { + "epoch": 4.493006315626332, + "grad_norm": 1.3033550432681624, + "learning_rate": 2.2465902045877247e-07, + "loss": 0.9683, + "step": 57980 + }, + { + "epoch": 4.493781239102638, + "grad_norm": 1.3016230338557073, + "learning_rate": 2.2469776813391196e-07, + "loss": 0.9544, + "step": 57990 + }, + { + "epoch": 4.494556162578945, + "grad_norm": 1.4413587244697494, + "learning_rate": 2.2473651580905146e-07, + "loss": 0.9646, + "step": 58000 + }, + { + "epoch": 4.494556162578945, + "eval_loss": 0.9655313491821289, + "eval_runtime": 320.296, + "eval_samples_per_second": 35.814, + "eval_steps_per_second": 8.954, + "step": 58000 + }, + { + "epoch": 4.495331086055252, + "grad_norm": 1.3639834234856252, + "learning_rate": 2.2477526348419095e-07, + "loss": 0.9766, + "step": 58010 + }, + { + "epoch": 4.496106009531559, + "grad_norm": 1.330267696494881, + "learning_rate": 2.2481401115933045e-07, + "loss": 0.9521, + "step": 58020 + }, + { + "epoch": 4.496880933007866, + "grad_norm": 1.3871392921503378, + "learning_rate": 2.2485275883446994e-07, + "loss": 0.9714, + "step": 58030 + }, + { + "epoch": 4.497655856484172, + "grad_norm": 1.3257735102940362, + "learning_rate": 2.2489150650960944e-07, + "loss": 0.9745, + "step": 58040 + }, + { + "epoch": 4.498430779960479, + "grad_norm": 1.3601482771060502, + "learning_rate": 2.249302541847489e-07, + "loss": 0.9549, + "step": 58050 + }, + { + "epoch": 4.499205703436786, + "grad_norm": 1.3248046472646893, + "learning_rate": 2.249690018598884e-07, + "loss": 0.9671, + "step": 58060 + }, + { + "epoch": 4.499980626913092, + "grad_norm": 1.2894352858204499, + "learning_rate": 2.250077495350279e-07, + "loss": 0.9584, + "step": 58070 + }, + { + "epoch": 4.500755550389399, + "grad_norm": 1.3392086694374137, + "learning_rate": 2.250464972101674e-07, + "loss": 0.9725, + "step": 58080 + }, + { + "epoch": 4.501530473865706, + "grad_norm": 1.381230217792166, + "learning_rate": 2.250852448853069e-07, + "loss": 0.9764, + "step": 58090 + }, + { + "epoch": 4.502305397342012, + "grad_norm": 1.3554976641195897, + "learning_rate": 2.2512399256044639e-07, + "loss": 0.9548, + "step": 58100 + }, + { + "epoch": 4.503080320818319, + "grad_norm": 1.3727731856584235, + "learning_rate": 2.2516274023558586e-07, + "loss": 0.9626, + "step": 58110 + }, + { + "epoch": 4.503855244294626, + "grad_norm": 1.3642018193084509, + "learning_rate": 2.2520148791072535e-07, + "loss": 0.9445, + "step": 58120 + }, + { + "epoch": 4.504630167770933, + "grad_norm": 1.37453853640821, + "learning_rate": 2.2524023558586485e-07, + "loss": 0.9602, + "step": 58130 + }, + { + "epoch": 4.50540509124724, + "grad_norm": 1.3485381554853966, + "learning_rate": 2.2527898326100434e-07, + "loss": 0.9496, + "step": 58140 + }, + { + "epoch": 4.506180014723546, + "grad_norm": 1.2797917570457584, + "learning_rate": 2.2531773093614384e-07, + "loss": 0.9785, + "step": 58150 + }, + { + "epoch": 4.506954938199852, + "grad_norm": 1.3032750843027698, + "learning_rate": 2.2535647861128333e-07, + "loss": 0.9796, + "step": 58160 + }, + { + "epoch": 4.507729861676159, + "grad_norm": 1.3148533130968845, + "learning_rate": 2.2539522628642283e-07, + "loss": 0.9553, + "step": 58170 + }, + { + "epoch": 4.508504785152466, + "grad_norm": 1.3918781037773327, + "learning_rate": 2.254339739615623e-07, + "loss": 0.9775, + "step": 58180 + }, + { + "epoch": 4.509279708628773, + "grad_norm": 1.35749811128433, + "learning_rate": 2.254727216367018e-07, + "loss": 0.9558, + "step": 58190 + }, + { + "epoch": 4.51005463210508, + "grad_norm": 1.3307593018817472, + "learning_rate": 2.255114693118413e-07, + "loss": 0.9861, + "step": 58200 + }, + { + "epoch": 4.510829555581386, + "grad_norm": 1.3551932454056568, + "learning_rate": 2.2555021698698078e-07, + "loss": 0.9796, + "step": 58210 + }, + { + "epoch": 4.511604479057693, + "grad_norm": 1.400674236671795, + "learning_rate": 2.2558896466212028e-07, + "loss": 0.9491, + "step": 58220 + }, + { + "epoch": 4.512379402534, + "grad_norm": 1.3915748765494493, + "learning_rate": 2.2562771233725977e-07, + "loss": 0.9625, + "step": 58230 + }, + { + "epoch": 4.513154326010307, + "grad_norm": 1.3550535585166603, + "learning_rate": 2.2566646001239927e-07, + "loss": 0.9699, + "step": 58240 + }, + { + "epoch": 4.513929249486614, + "grad_norm": 1.3582375623541003, + "learning_rate": 2.2570520768753874e-07, + "loss": 0.9457, + "step": 58250 + }, + { + "epoch": 4.51470417296292, + "grad_norm": 1.3206043391725768, + "learning_rate": 2.2574395536267823e-07, + "loss": 0.9509, + "step": 58260 + }, + { + "epoch": 4.515479096439226, + "grad_norm": 1.409963978080554, + "learning_rate": 2.2578270303781773e-07, + "loss": 0.9844, + "step": 58270 + }, + { + "epoch": 4.516254019915533, + "grad_norm": 1.3000247930744884, + "learning_rate": 2.2582145071295723e-07, + "loss": 0.9854, + "step": 58280 + }, + { + "epoch": 4.51702894339184, + "grad_norm": 1.362506533273476, + "learning_rate": 2.2586019838809672e-07, + "loss": 0.9756, + "step": 58290 + }, + { + "epoch": 4.517803866868147, + "grad_norm": 1.3279642198412716, + "learning_rate": 2.2589894606323622e-07, + "loss": 0.9841, + "step": 58300 + }, + { + "epoch": 4.518578790344454, + "grad_norm": 1.3110778202908828, + "learning_rate": 2.259376937383757e-07, + "loss": 0.969, + "step": 58310 + }, + { + "epoch": 4.5193537138207605, + "grad_norm": 1.4250154766676322, + "learning_rate": 2.2597644141351518e-07, + "loss": 0.9565, + "step": 58320 + }, + { + "epoch": 4.520128637297067, + "grad_norm": 1.3230199871184418, + "learning_rate": 2.2601518908865468e-07, + "loss": 0.9618, + "step": 58330 + }, + { + "epoch": 4.520903560773373, + "grad_norm": 1.3507170231389465, + "learning_rate": 2.2605393676379417e-07, + "loss": 0.9678, + "step": 58340 + }, + { + "epoch": 4.52167848424968, + "grad_norm": 1.3144003016466348, + "learning_rate": 2.2609268443893367e-07, + "loss": 0.9436, + "step": 58350 + }, + { + "epoch": 4.522453407725987, + "grad_norm": 1.439301730226901, + "learning_rate": 2.2613143211407316e-07, + "loss": 0.9692, + "step": 58360 + }, + { + "epoch": 4.523228331202294, + "grad_norm": 1.3092184102317608, + "learning_rate": 2.2617017978921266e-07, + "loss": 0.9375, + "step": 58370 + }, + { + "epoch": 4.5240032546786, + "grad_norm": 1.338516357078549, + "learning_rate": 2.2620892746435215e-07, + "loss": 0.9776, + "step": 58380 + }, + { + "epoch": 4.524778178154907, + "grad_norm": 1.3293509939899697, + "learning_rate": 2.2624767513949162e-07, + "loss": 0.9431, + "step": 58390 + }, + { + "epoch": 4.525553101631214, + "grad_norm": 1.3735505414870792, + "learning_rate": 2.2628642281463112e-07, + "loss": 0.9621, + "step": 58400 + }, + { + "epoch": 4.526328025107521, + "grad_norm": 1.4021445742957284, + "learning_rate": 2.2632517048977061e-07, + "loss": 0.9509, + "step": 58410 + }, + { + "epoch": 4.527102948583828, + "grad_norm": 1.3693352926300952, + "learning_rate": 2.263639181649101e-07, + "loss": 0.9709, + "step": 58420 + }, + { + "epoch": 4.5278778720601345, + "grad_norm": 1.325651547812006, + "learning_rate": 2.264026658400496e-07, + "loss": 0.9609, + "step": 58430 + }, + { + "epoch": 4.52865279553644, + "grad_norm": 1.3741206168535793, + "learning_rate": 2.264414135151891e-07, + "loss": 0.9469, + "step": 58440 + }, + { + "epoch": 4.529427719012747, + "grad_norm": 1.509601992588989, + "learning_rate": 2.2648016119032857e-07, + "loss": 0.9714, + "step": 58450 + }, + { + "epoch": 4.530202642489054, + "grad_norm": 1.3601843014738186, + "learning_rate": 2.2651890886546807e-07, + "loss": 0.9688, + "step": 58460 + }, + { + "epoch": 4.530977565965361, + "grad_norm": 1.3153458656316148, + "learning_rate": 2.2655765654060756e-07, + "loss": 0.9464, + "step": 58470 + }, + { + "epoch": 4.531752489441668, + "grad_norm": 1.3770403906322308, + "learning_rate": 2.2659640421574706e-07, + "loss": 0.9574, + "step": 58480 + }, + { + "epoch": 4.5325274129179745, + "grad_norm": 1.3816877486081667, + "learning_rate": 2.2663515189088655e-07, + "loss": 0.9708, + "step": 58490 + }, + { + "epoch": 4.533302336394281, + "grad_norm": 1.3638189820964044, + "learning_rate": 2.2667389956602605e-07, + "loss": 0.9642, + "step": 58500 + }, + { + "epoch": 4.533302336394281, + "eval_loss": 0.9647517800331116, + "eval_runtime": 320.4498, + "eval_samples_per_second": 35.797, + "eval_steps_per_second": 8.95, + "step": 58500 + }, + { + "epoch": 4.534077259870588, + "grad_norm": 1.3120244133969978, + "learning_rate": 2.2671264724116554e-07, + "loss": 0.9731, + "step": 58510 + }, + { + "epoch": 4.534852183346895, + "grad_norm": 1.3032452675917783, + "learning_rate": 2.26751394916305e-07, + "loss": 0.9451, + "step": 58520 + }, + { + "epoch": 4.535627106823201, + "grad_norm": 1.3826770843747092, + "learning_rate": 2.267901425914445e-07, + "loss": 0.962, + "step": 58530 + }, + { + "epoch": 4.536402030299508, + "grad_norm": 1.2934424037403691, + "learning_rate": 2.26828890266584e-07, + "loss": 0.9615, + "step": 58540 + }, + { + "epoch": 4.537176953775814, + "grad_norm": 1.3182899019555359, + "learning_rate": 2.268676379417235e-07, + "loss": 0.9513, + "step": 58550 + }, + { + "epoch": 4.537951877252121, + "grad_norm": 1.272311608707254, + "learning_rate": 2.26906385616863e-07, + "loss": 0.9687, + "step": 58560 + }, + { + "epoch": 4.538726800728428, + "grad_norm": 1.3646587126727048, + "learning_rate": 2.269451332920025e-07, + "loss": 0.9712, + "step": 58570 + }, + { + "epoch": 4.539501724204735, + "grad_norm": 1.3345653761216594, + "learning_rate": 2.2698388096714199e-07, + "loss": 0.9587, + "step": 58580 + }, + { + "epoch": 4.540276647681042, + "grad_norm": 1.3348202121853523, + "learning_rate": 2.2702262864228145e-07, + "loss": 0.9666, + "step": 58590 + }, + { + "epoch": 4.5410515711573485, + "grad_norm": 1.3641079164965781, + "learning_rate": 2.2706137631742095e-07, + "loss": 0.9694, + "step": 58600 + }, + { + "epoch": 4.541826494633655, + "grad_norm": 1.3647434757652566, + "learning_rate": 2.2710012399256045e-07, + "loss": 0.9688, + "step": 58610 + }, + { + "epoch": 4.542601418109962, + "grad_norm": 1.3113698810280454, + "learning_rate": 2.2713887166769994e-07, + "loss": 0.9509, + "step": 58620 + }, + { + "epoch": 4.543376341586268, + "grad_norm": 1.398600264977374, + "learning_rate": 2.2717761934283944e-07, + "loss": 0.9642, + "step": 58630 + }, + { + "epoch": 4.544151265062575, + "grad_norm": 1.3407371438837208, + "learning_rate": 2.2721636701797893e-07, + "loss": 0.9647, + "step": 58640 + }, + { + "epoch": 4.544926188538882, + "grad_norm": 1.331757479285741, + "learning_rate": 2.2725511469311843e-07, + "loss": 0.9764, + "step": 58650 + }, + { + "epoch": 4.5457011120151884, + "grad_norm": 1.3806812009978906, + "learning_rate": 2.272938623682579e-07, + "loss": 0.9556, + "step": 58660 + }, + { + "epoch": 4.546476035491495, + "grad_norm": 1.297135232750152, + "learning_rate": 2.273326100433974e-07, + "loss": 0.9314, + "step": 58670 + }, + { + "epoch": 4.547250958967802, + "grad_norm": 1.4080726127960062, + "learning_rate": 2.273713577185369e-07, + "loss": 0.9672, + "step": 58680 + }, + { + "epoch": 4.548025882444109, + "grad_norm": 1.3609379171734204, + "learning_rate": 2.2741010539367638e-07, + "loss": 0.9741, + "step": 58690 + }, + { + "epoch": 4.548800805920416, + "grad_norm": 1.3207996528723474, + "learning_rate": 2.2744885306881588e-07, + "loss": 0.9492, + "step": 58700 + }, + { + "epoch": 4.549575729396722, + "grad_norm": 1.4076793929996727, + "learning_rate": 2.2748760074395537e-07, + "loss": 0.9545, + "step": 58710 + }, + { + "epoch": 4.550350652873028, + "grad_norm": 1.358558202129014, + "learning_rate": 2.2752634841909487e-07, + "loss": 0.9625, + "step": 58720 + }, + { + "epoch": 4.551125576349335, + "grad_norm": 1.4164098540392276, + "learning_rate": 2.2756509609423434e-07, + "loss": 0.9381, + "step": 58730 + }, + { + "epoch": 4.551900499825642, + "grad_norm": 1.3585375169238816, + "learning_rate": 2.2760384376937383e-07, + "loss": 0.968, + "step": 58740 + }, + { + "epoch": 4.552675423301949, + "grad_norm": 1.3871988120849026, + "learning_rate": 2.2764259144451333e-07, + "loss": 0.9601, + "step": 58750 + }, + { + "epoch": 4.553450346778256, + "grad_norm": 1.3102875109553236, + "learning_rate": 2.2768133911965282e-07, + "loss": 0.9504, + "step": 58760 + }, + { + "epoch": 4.5542252702545625, + "grad_norm": 1.35426389958256, + "learning_rate": 2.2772008679479232e-07, + "loss": 0.9369, + "step": 58770 + }, + { + "epoch": 4.555000193730869, + "grad_norm": 1.3281971936955959, + "learning_rate": 2.2775883446993182e-07, + "loss": 0.9498, + "step": 58780 + }, + { + "epoch": 4.555775117207176, + "grad_norm": 1.4262659381311802, + "learning_rate": 2.277975821450713e-07, + "loss": 0.9611, + "step": 58790 + }, + { + "epoch": 4.556550040683483, + "grad_norm": 1.3610291895163489, + "learning_rate": 2.2783632982021078e-07, + "loss": 0.9634, + "step": 58800 + }, + { + "epoch": 4.55732496415979, + "grad_norm": 1.3565315188704188, + "learning_rate": 2.2787507749535028e-07, + "loss": 0.9978, + "step": 58810 + }, + { + "epoch": 4.558099887636096, + "grad_norm": 1.3079313120713467, + "learning_rate": 2.2791382517048977e-07, + "loss": 0.9618, + "step": 58820 + }, + { + "epoch": 4.5588748111124024, + "grad_norm": 1.3206125709354934, + "learning_rate": 2.2795257284562927e-07, + "loss": 0.9556, + "step": 58830 + }, + { + "epoch": 4.559649734588709, + "grad_norm": 1.327648170887637, + "learning_rate": 2.2799132052076876e-07, + "loss": 0.9696, + "step": 58840 + }, + { + "epoch": 4.560424658065016, + "grad_norm": 1.3639154303436458, + "learning_rate": 2.2803006819590826e-07, + "loss": 0.9508, + "step": 58850 + }, + { + "epoch": 4.561199581541323, + "grad_norm": 1.391148634148492, + "learning_rate": 2.2806881587104773e-07, + "loss": 0.9581, + "step": 58860 + }, + { + "epoch": 4.56197450501763, + "grad_norm": 1.3282157490297752, + "learning_rate": 2.2810756354618722e-07, + "loss": 0.9571, + "step": 58870 + }, + { + "epoch": 4.5627494284939365, + "grad_norm": 1.4134361606567556, + "learning_rate": 2.2814631122132672e-07, + "loss": 1.0164, + "step": 58880 + }, + { + "epoch": 4.563524351970243, + "grad_norm": 1.4293092474500861, + "learning_rate": 2.2818505889646621e-07, + "loss": 0.9664, + "step": 58890 + }, + { + "epoch": 4.564299275446549, + "grad_norm": 1.3692455923574864, + "learning_rate": 2.282238065716057e-07, + "loss": 0.9654, + "step": 58900 + }, + { + "epoch": 4.565074198922856, + "grad_norm": 1.3835747911274046, + "learning_rate": 2.282625542467452e-07, + "loss": 0.9771, + "step": 58910 + }, + { + "epoch": 4.565849122399163, + "grad_norm": 1.3535649963902252, + "learning_rate": 2.283013019218847e-07, + "loss": 0.955, + "step": 58920 + }, + { + "epoch": 4.56662404587547, + "grad_norm": 1.364530181979829, + "learning_rate": 2.2834004959702417e-07, + "loss": 0.9715, + "step": 58930 + }, + { + "epoch": 4.5673989693517765, + "grad_norm": 1.328611849401544, + "learning_rate": 2.2837879727216366e-07, + "loss": 0.9651, + "step": 58940 + }, + { + "epoch": 4.568173892828083, + "grad_norm": 1.363823070990215, + "learning_rate": 2.2841754494730316e-07, + "loss": 0.9723, + "step": 58950 + }, + { + "epoch": 4.56894881630439, + "grad_norm": 1.439491628207505, + "learning_rate": 2.2845629262244266e-07, + "loss": 0.9622, + "step": 58960 + }, + { + "epoch": 4.569723739780697, + "grad_norm": 1.3509891154455462, + "learning_rate": 2.2849504029758215e-07, + "loss": 0.9589, + "step": 58970 + }, + { + "epoch": 4.570498663257004, + "grad_norm": 1.3516991533959284, + "learning_rate": 2.2853378797272165e-07, + "loss": 0.9662, + "step": 58980 + }, + { + "epoch": 4.5712735867333105, + "grad_norm": 1.3550678423423566, + "learning_rate": 2.2857253564786114e-07, + "loss": 0.9769, + "step": 58990 + }, + { + "epoch": 4.5720485102096164, + "grad_norm": 1.361822548121793, + "learning_rate": 2.286112833230006e-07, + "loss": 0.9575, + "step": 59000 + }, + { + "epoch": 4.5720485102096164, + "eval_loss": 0.9639858603477478, + "eval_runtime": 321.6302, + "eval_samples_per_second": 35.665, + "eval_steps_per_second": 8.917, + "step": 59000 + }, + { + "epoch": 4.572823433685923, + "grad_norm": 1.3891142796743348, + "learning_rate": 2.286500309981401e-07, + "loss": 0.9637, + "step": 59010 + }, + { + "epoch": 4.57359835716223, + "grad_norm": 1.3324737268358684, + "learning_rate": 2.286887786732796e-07, + "loss": 0.9539, + "step": 59020 + }, + { + "epoch": 4.574373280638537, + "grad_norm": 1.32649083241806, + "learning_rate": 2.287275263484191e-07, + "loss": 0.9789, + "step": 59030 + }, + { + "epoch": 4.575148204114844, + "grad_norm": 1.4124655585622026, + "learning_rate": 2.287662740235586e-07, + "loss": 0.986, + "step": 59040 + }, + { + "epoch": 4.5759231275911505, + "grad_norm": 1.3565285741890836, + "learning_rate": 2.288050216986981e-07, + "loss": 0.9496, + "step": 59050 + }, + { + "epoch": 4.576698051067457, + "grad_norm": 1.4198385008249583, + "learning_rate": 2.2884376937383758e-07, + "loss": 0.9389, + "step": 59060 + }, + { + "epoch": 4.577472974543764, + "grad_norm": 1.3584756953397303, + "learning_rate": 2.2888251704897705e-07, + "loss": 0.943, + "step": 59070 + }, + { + "epoch": 4.57824789802007, + "grad_norm": 1.364152293114474, + "learning_rate": 2.2892126472411655e-07, + "loss": 0.947, + "step": 59080 + }, + { + "epoch": 4.579022821496377, + "grad_norm": 1.312167217550812, + "learning_rate": 2.2896001239925604e-07, + "loss": 0.9531, + "step": 59090 + }, + { + "epoch": 4.579797744972684, + "grad_norm": 1.3555133028092725, + "learning_rate": 2.2899876007439554e-07, + "loss": 0.9432, + "step": 59100 + }, + { + "epoch": 4.5805726684489905, + "grad_norm": 1.3143971945752229, + "learning_rate": 2.2903750774953504e-07, + "loss": 0.9858, + "step": 59110 + }, + { + "epoch": 4.581347591925297, + "grad_norm": 1.4003097429030282, + "learning_rate": 2.2907625542467453e-07, + "loss": 0.9635, + "step": 59120 + }, + { + "epoch": 4.582122515401604, + "grad_norm": 1.4021459001657877, + "learning_rate": 2.2911500309981403e-07, + "loss": 0.9759, + "step": 59130 + }, + { + "epoch": 4.582897438877911, + "grad_norm": 1.3513872638225335, + "learning_rate": 2.291537507749535e-07, + "loss": 0.956, + "step": 59140 + }, + { + "epoch": 4.583672362354218, + "grad_norm": 1.3771741782210805, + "learning_rate": 2.29192498450093e-07, + "loss": 0.96, + "step": 59150 + }, + { + "epoch": 4.5844472858305245, + "grad_norm": 1.388322003098797, + "learning_rate": 2.2923124612523249e-07, + "loss": 0.9323, + "step": 59160 + }, + { + "epoch": 4.585222209306831, + "grad_norm": 1.6875855210456001, + "learning_rate": 2.2926999380037198e-07, + "loss": 0.9816, + "step": 59170 + }, + { + "epoch": 4.585997132783138, + "grad_norm": 1.337957067547929, + "learning_rate": 2.2930874147551148e-07, + "loss": 0.9529, + "step": 59180 + }, + { + "epoch": 4.586772056259444, + "grad_norm": 1.4323258797628273, + "learning_rate": 2.2934748915065097e-07, + "loss": 0.9669, + "step": 59190 + }, + { + "epoch": 4.587546979735751, + "grad_norm": 1.362745051495008, + "learning_rate": 2.2938623682579044e-07, + "loss": 0.9537, + "step": 59200 + }, + { + "epoch": 4.588321903212058, + "grad_norm": 1.3549562642540127, + "learning_rate": 2.2942498450092994e-07, + "loss": 0.9598, + "step": 59210 + }, + { + "epoch": 4.5890968266883645, + "grad_norm": 1.3037199155409582, + "learning_rate": 2.2946373217606943e-07, + "loss": 0.9411, + "step": 59220 + }, + { + "epoch": 4.589871750164671, + "grad_norm": 1.2627948217604286, + "learning_rate": 2.2950247985120893e-07, + "loss": 0.9691, + "step": 59230 + }, + { + "epoch": 4.590646673640978, + "grad_norm": 1.3523733091569565, + "learning_rate": 2.2954122752634842e-07, + "loss": 0.9397, + "step": 59240 + }, + { + "epoch": 4.591421597117285, + "grad_norm": 1.3943549317627386, + "learning_rate": 2.2957997520148792e-07, + "loss": 0.9971, + "step": 59250 + }, + { + "epoch": 4.592196520593592, + "grad_norm": 1.397430321552797, + "learning_rate": 2.2961872287662742e-07, + "loss": 0.9569, + "step": 59260 + }, + { + "epoch": 4.592971444069898, + "grad_norm": 1.389099127727876, + "learning_rate": 2.2965747055176688e-07, + "loss": 0.9737, + "step": 59270 + }, + { + "epoch": 4.5937463675462045, + "grad_norm": 1.3814411287854456, + "learning_rate": 2.2969621822690638e-07, + "loss": 0.9585, + "step": 59280 + }, + { + "epoch": 4.594521291022511, + "grad_norm": 1.3150827357084485, + "learning_rate": 2.2973496590204588e-07, + "loss": 0.9385, + "step": 59290 + }, + { + "epoch": 4.595296214498818, + "grad_norm": 1.2658268532130563, + "learning_rate": 2.2977371357718537e-07, + "loss": 0.9461, + "step": 59300 + }, + { + "epoch": 4.596071137975125, + "grad_norm": 1.345481579947354, + "learning_rate": 2.2981246125232487e-07, + "loss": 0.9528, + "step": 59310 + }, + { + "epoch": 4.596846061451432, + "grad_norm": 1.3363636597657582, + "learning_rate": 2.2985120892746436e-07, + "loss": 0.9555, + "step": 59320 + }, + { + "epoch": 4.5976209849277385, + "grad_norm": 1.379810775676035, + "learning_rate": 2.2988995660260386e-07, + "loss": 0.971, + "step": 59330 + }, + { + "epoch": 4.598395908404045, + "grad_norm": 1.3571305980847117, + "learning_rate": 2.2992870427774333e-07, + "loss": 0.9644, + "step": 59340 + }, + { + "epoch": 4.599170831880352, + "grad_norm": 1.2882992113618763, + "learning_rate": 2.2996745195288282e-07, + "loss": 0.9386, + "step": 59350 + }, + { + "epoch": 4.599945755356659, + "grad_norm": 1.4722559234044374, + "learning_rate": 2.3000619962802232e-07, + "loss": 0.9349, + "step": 59360 + }, + { + "epoch": 4.600720678832965, + "grad_norm": 1.4178066679166867, + "learning_rate": 2.300449473031618e-07, + "loss": 0.9626, + "step": 59370 + }, + { + "epoch": 4.601495602309272, + "grad_norm": 1.3346009311979274, + "learning_rate": 2.300836949783013e-07, + "loss": 0.965, + "step": 59380 + }, + { + "epoch": 4.6022705257855785, + "grad_norm": 1.385253593787045, + "learning_rate": 2.301224426534408e-07, + "loss": 0.9475, + "step": 59390 + }, + { + "epoch": 4.603045449261885, + "grad_norm": 1.30807594691399, + "learning_rate": 2.301611903285803e-07, + "loss": 0.9496, + "step": 59400 + }, + { + "epoch": 4.603820372738192, + "grad_norm": 1.3644448767776158, + "learning_rate": 2.3019993800371977e-07, + "loss": 0.9751, + "step": 59410 + }, + { + "epoch": 4.604595296214499, + "grad_norm": 1.4071170549613776, + "learning_rate": 2.3023868567885926e-07, + "loss": 0.9662, + "step": 59420 + }, + { + "epoch": 4.605370219690806, + "grad_norm": 1.3885329853771264, + "learning_rate": 2.3027743335399876e-07, + "loss": 0.9598, + "step": 59430 + }, + { + "epoch": 4.6061451431671125, + "grad_norm": 1.2830984879681588, + "learning_rate": 2.3031618102913825e-07, + "loss": 0.9525, + "step": 59440 + }, + { + "epoch": 4.6069200666434185, + "grad_norm": 1.4062673153307728, + "learning_rate": 2.3035492870427775e-07, + "loss": 0.9752, + "step": 59450 + }, + { + "epoch": 4.607694990119725, + "grad_norm": 1.3333171704783093, + "learning_rate": 2.3039367637941725e-07, + "loss": 0.9391, + "step": 59460 + }, + { + "epoch": 4.608469913596032, + "grad_norm": 1.3895178137626425, + "learning_rate": 2.3043242405455674e-07, + "loss": 0.9638, + "step": 59470 + }, + { + "epoch": 4.609244837072339, + "grad_norm": 1.3057751015655927, + "learning_rate": 2.304711717296962e-07, + "loss": 0.948, + "step": 59480 + }, + { + "epoch": 4.610019760548646, + "grad_norm": 1.3500130305345728, + "learning_rate": 2.305099194048357e-07, + "loss": 0.9627, + "step": 59490 + }, + { + "epoch": 4.6107946840249525, + "grad_norm": 1.4835581882187772, + "learning_rate": 2.305486670799752e-07, + "loss": 0.9541, + "step": 59500 + }, + { + "epoch": 4.6107946840249525, + "eval_loss": 0.9632834196090698, + "eval_runtime": 320.4656, + "eval_samples_per_second": 35.795, + "eval_steps_per_second": 8.949, + "step": 59500 + }, + { + "epoch": 4.611569607501259, + "grad_norm": 1.3042052618056053, + "learning_rate": 2.305874147551147e-07, + "loss": 0.9616, + "step": 59510 + }, + { + "epoch": 4.612344530977566, + "grad_norm": 1.36247675617299, + "learning_rate": 2.306261624302542e-07, + "loss": 0.9773, + "step": 59520 + }, + { + "epoch": 4.613119454453873, + "grad_norm": 1.4117389794846271, + "learning_rate": 2.306649101053937e-07, + "loss": 0.9712, + "step": 59530 + }, + { + "epoch": 4.61389437793018, + "grad_norm": 1.3232045196324802, + "learning_rate": 2.3070365778053318e-07, + "loss": 0.9436, + "step": 59540 + }, + { + "epoch": 4.614669301406487, + "grad_norm": 1.3357553265877986, + "learning_rate": 2.3074240545567265e-07, + "loss": 0.9658, + "step": 59550 + }, + { + "epoch": 4.6154442248827925, + "grad_norm": 1.3530836039458416, + "learning_rate": 2.3078115313081215e-07, + "loss": 0.9471, + "step": 59560 + }, + { + "epoch": 4.616219148359099, + "grad_norm": 1.3652471099115775, + "learning_rate": 2.3081990080595164e-07, + "loss": 0.972, + "step": 59570 + }, + { + "epoch": 4.616994071835406, + "grad_norm": 1.3715767500270086, + "learning_rate": 2.3085864848109114e-07, + "loss": 0.9844, + "step": 59580 + }, + { + "epoch": 4.617768995311713, + "grad_norm": 1.3645285943596175, + "learning_rate": 2.3089739615623063e-07, + "loss": 0.9566, + "step": 59590 + }, + { + "epoch": 4.61854391878802, + "grad_norm": 1.374135964638765, + "learning_rate": 2.3093614383137013e-07, + "loss": 0.9545, + "step": 59600 + }, + { + "epoch": 4.6193188422643265, + "grad_norm": 1.3982545758765765, + "learning_rate": 2.309748915065096e-07, + "loss": 0.9291, + "step": 59610 + }, + { + "epoch": 4.620093765740633, + "grad_norm": 1.3438485937793332, + "learning_rate": 2.310136391816491e-07, + "loss": 0.9787, + "step": 59620 + }, + { + "epoch": 4.62086868921694, + "grad_norm": 1.2743243520892193, + "learning_rate": 2.310523868567886e-07, + "loss": 0.9496, + "step": 59630 + }, + { + "epoch": 4.621643612693246, + "grad_norm": 1.329237927329089, + "learning_rate": 2.3109113453192809e-07, + "loss": 0.9568, + "step": 59640 + }, + { + "epoch": 4.622418536169553, + "grad_norm": 1.5444006622212503, + "learning_rate": 2.3112988220706758e-07, + "loss": 0.9805, + "step": 59650 + }, + { + "epoch": 4.62319345964586, + "grad_norm": 1.290662242244179, + "learning_rate": 2.3116862988220708e-07, + "loss": 0.9675, + "step": 59660 + }, + { + "epoch": 4.6239683831221665, + "grad_norm": 1.3445565337620922, + "learning_rate": 2.3120737755734657e-07, + "loss": 0.9408, + "step": 59670 + }, + { + "epoch": 4.624743306598473, + "grad_norm": 1.2954469649673794, + "learning_rate": 2.3124612523248604e-07, + "loss": 0.9648, + "step": 59680 + }, + { + "epoch": 4.62551823007478, + "grad_norm": 1.4015343017406348, + "learning_rate": 2.3128487290762554e-07, + "loss": 0.9553, + "step": 59690 + }, + { + "epoch": 4.626293153551087, + "grad_norm": 1.3564978022940835, + "learning_rate": 2.3132362058276503e-07, + "loss": 0.9539, + "step": 59700 + }, + { + "epoch": 4.627068077027394, + "grad_norm": 1.3154154869923294, + "learning_rate": 2.3136236825790453e-07, + "loss": 0.9539, + "step": 59710 + }, + { + "epoch": 4.627843000503701, + "grad_norm": 1.4129429680697492, + "learning_rate": 2.3140111593304402e-07, + "loss": 0.9466, + "step": 59720 + }, + { + "epoch": 4.628617923980007, + "grad_norm": 1.3416864523868477, + "learning_rate": 2.3143986360818352e-07, + "loss": 0.9702, + "step": 59730 + }, + { + "epoch": 4.629392847456314, + "grad_norm": 1.4504333705552894, + "learning_rate": 2.3147861128332301e-07, + "loss": 0.9616, + "step": 59740 + }, + { + "epoch": 4.63016777093262, + "grad_norm": 1.3327300315421182, + "learning_rate": 2.3151735895846248e-07, + "loss": 0.9607, + "step": 59750 + }, + { + "epoch": 4.630942694408927, + "grad_norm": 1.3224854691926324, + "learning_rate": 2.3155610663360198e-07, + "loss": 0.9409, + "step": 59760 + }, + { + "epoch": 4.631717617885234, + "grad_norm": 1.374345316212306, + "learning_rate": 2.3159485430874147e-07, + "loss": 0.9482, + "step": 59770 + }, + { + "epoch": 4.6324925413615405, + "grad_norm": 1.2774186209671816, + "learning_rate": 2.3163360198388097e-07, + "loss": 0.947, + "step": 59780 + }, + { + "epoch": 4.633267464837847, + "grad_norm": 1.4056909488682454, + "learning_rate": 2.3167234965902047e-07, + "loss": 0.9604, + "step": 59790 + }, + { + "epoch": 4.634042388314154, + "grad_norm": 1.4283568308036347, + "learning_rate": 2.3171109733415996e-07, + "loss": 0.9589, + "step": 59800 + }, + { + "epoch": 4.634817311790461, + "grad_norm": 1.4099366307217711, + "learning_rate": 2.3174984500929946e-07, + "loss": 0.9701, + "step": 59810 + }, + { + "epoch": 4.635592235266768, + "grad_norm": 1.3508551162732685, + "learning_rate": 2.3178859268443893e-07, + "loss": 0.9332, + "step": 59820 + }, + { + "epoch": 4.636367158743074, + "grad_norm": 1.374474941542395, + "learning_rate": 2.3182734035957842e-07, + "loss": 0.9548, + "step": 59830 + }, + { + "epoch": 4.6371420822193805, + "grad_norm": 1.3873507321948046, + "learning_rate": 2.3186608803471792e-07, + "loss": 0.9467, + "step": 59840 + }, + { + "epoch": 4.637917005695687, + "grad_norm": 1.4414978275841879, + "learning_rate": 2.319048357098574e-07, + "loss": 0.947, + "step": 59850 + }, + { + "epoch": 4.638691929171994, + "grad_norm": 1.3626109808133398, + "learning_rate": 2.319435833849969e-07, + "loss": 0.958, + "step": 59860 + }, + { + "epoch": 4.639466852648301, + "grad_norm": 1.3777002028201317, + "learning_rate": 2.319823310601364e-07, + "loss": 0.9773, + "step": 59870 + }, + { + "epoch": 4.640241776124608, + "grad_norm": 1.324242049518554, + "learning_rate": 2.320210787352759e-07, + "loss": 0.9645, + "step": 59880 + }, + { + "epoch": 4.641016699600915, + "grad_norm": 1.3652840519072968, + "learning_rate": 2.3205982641041537e-07, + "loss": 0.9579, + "step": 59890 + }, + { + "epoch": 4.641791623077221, + "grad_norm": 1.343504354055945, + "learning_rate": 2.3209857408555486e-07, + "loss": 0.9559, + "step": 59900 + }, + { + "epoch": 4.642566546553528, + "grad_norm": 1.3158030517528887, + "learning_rate": 2.3213732176069436e-07, + "loss": 0.9684, + "step": 59910 + }, + { + "epoch": 4.643341470029835, + "grad_norm": 1.372068983982879, + "learning_rate": 2.3217606943583385e-07, + "loss": 0.9411, + "step": 59920 + }, + { + "epoch": 4.644116393506141, + "grad_norm": 1.3747010945329017, + "learning_rate": 2.3221481711097335e-07, + "loss": 0.9401, + "step": 59930 + }, + { + "epoch": 4.644891316982448, + "grad_norm": 1.3185859730305032, + "learning_rate": 2.3225356478611285e-07, + "loss": 0.9741, + "step": 59940 + }, + { + "epoch": 4.6456662404587545, + "grad_norm": 1.41347697258631, + "learning_rate": 2.3229231246125231e-07, + "loss": 0.9672, + "step": 59950 + }, + { + "epoch": 4.646441163935061, + "grad_norm": 1.4347113572220944, + "learning_rate": 2.323310601363918e-07, + "loss": 0.9882, + "step": 59960 + }, + { + "epoch": 4.647216087411368, + "grad_norm": 1.3207357659732442, + "learning_rate": 2.323698078115313e-07, + "loss": 0.9596, + "step": 59970 + }, + { + "epoch": 4.647991010887675, + "grad_norm": 1.3015038817348124, + "learning_rate": 2.324085554866708e-07, + "loss": 0.9566, + "step": 59980 + }, + { + "epoch": 4.648765934363982, + "grad_norm": 1.3293722286680494, + "learning_rate": 2.324473031618103e-07, + "loss": 0.9646, + "step": 59990 + }, + { + "epoch": 4.649540857840289, + "grad_norm": 1.3786524815143038, + "learning_rate": 2.324860508369498e-07, + "loss": 0.9534, + "step": 60000 + }, + { + "epoch": 4.649540857840289, + "eval_loss": 0.962502658367157, + "eval_runtime": 321.853, + "eval_samples_per_second": 35.64, + "eval_steps_per_second": 8.911, + "step": 60000 + }, + { + "epoch": 4.6503157813165945, + "grad_norm": 1.3530086158907186, + "learning_rate": 2.325247985120893e-07, + "loss": 0.95, + "step": 60010 + }, + { + "epoch": 4.651090704792901, + "grad_norm": 1.3280724761132767, + "learning_rate": 2.3256354618722876e-07, + "loss": 0.9699, + "step": 60020 + }, + { + "epoch": 4.651865628269208, + "grad_norm": 1.4209956889720676, + "learning_rate": 2.3260229386236825e-07, + "loss": 0.9729, + "step": 60030 + }, + { + "epoch": 4.652640551745515, + "grad_norm": 1.4452559422998132, + "learning_rate": 2.3264104153750775e-07, + "loss": 0.9601, + "step": 60040 + }, + { + "epoch": 4.653415475221822, + "grad_norm": 1.2853426734475777, + "learning_rate": 2.3267978921264724e-07, + "loss": 0.9651, + "step": 60050 + }, + { + "epoch": 4.654190398698129, + "grad_norm": 1.360729467054395, + "learning_rate": 2.3271853688778674e-07, + "loss": 0.9464, + "step": 60060 + }, + { + "epoch": 4.654965322174435, + "grad_norm": 1.347172115064523, + "learning_rate": 2.3275728456292623e-07, + "loss": 0.9627, + "step": 60070 + }, + { + "epoch": 4.655740245650742, + "grad_norm": 1.3317877526490527, + "learning_rate": 2.3279603223806573e-07, + "loss": 0.9928, + "step": 60080 + }, + { + "epoch": 4.656515169127049, + "grad_norm": 1.2797404161663144, + "learning_rate": 2.328347799132052e-07, + "loss": 0.9565, + "step": 60090 + }, + { + "epoch": 4.657290092603356, + "grad_norm": 1.3492600199903166, + "learning_rate": 2.328735275883447e-07, + "loss": 0.9734, + "step": 60100 + }, + { + "epoch": 4.658065016079663, + "grad_norm": 1.3079984300585794, + "learning_rate": 2.329122752634842e-07, + "loss": 0.9398, + "step": 60110 + }, + { + "epoch": 4.6588399395559685, + "grad_norm": 1.4068933511418347, + "learning_rate": 2.3295102293862368e-07, + "loss": 0.9581, + "step": 60120 + }, + { + "epoch": 4.659614863032275, + "grad_norm": 1.4149366797725669, + "learning_rate": 2.3298977061376318e-07, + "loss": 0.9691, + "step": 60130 + }, + { + "epoch": 4.660389786508582, + "grad_norm": 1.4614329853349013, + "learning_rate": 2.3302851828890268e-07, + "loss": 0.9598, + "step": 60140 + }, + { + "epoch": 4.661164709984889, + "grad_norm": 1.3606652896073503, + "learning_rate": 2.3306726596404217e-07, + "loss": 0.9425, + "step": 60150 + }, + { + "epoch": 4.661939633461196, + "grad_norm": 1.4210334758492094, + "learning_rate": 2.3310601363918164e-07, + "loss": 0.9626, + "step": 60160 + }, + { + "epoch": 4.662714556937503, + "grad_norm": 1.2655620588124423, + "learning_rate": 2.3314476131432114e-07, + "loss": 0.9791, + "step": 60170 + }, + { + "epoch": 4.663489480413809, + "grad_norm": 1.3169528091378981, + "learning_rate": 2.3318350898946063e-07, + "loss": 0.9517, + "step": 60180 + }, + { + "epoch": 4.664264403890116, + "grad_norm": 1.3898212400125347, + "learning_rate": 2.3322225666460013e-07, + "loss": 0.9618, + "step": 60190 + }, + { + "epoch": 4.665039327366422, + "grad_norm": 1.3653154829822882, + "learning_rate": 2.3326100433973962e-07, + "loss": 0.9715, + "step": 60200 + }, + { + "epoch": 4.665814250842729, + "grad_norm": 1.3115838988077695, + "learning_rate": 2.3329975201487912e-07, + "loss": 0.9537, + "step": 60210 + }, + { + "epoch": 4.666589174319036, + "grad_norm": 1.4077913929374373, + "learning_rate": 2.3333849969001861e-07, + "loss": 0.9575, + "step": 60220 + }, + { + "epoch": 4.667364097795343, + "grad_norm": 1.3197927735023203, + "learning_rate": 2.3337724736515808e-07, + "loss": 0.9473, + "step": 60230 + }, + { + "epoch": 4.668139021271649, + "grad_norm": 1.343977080956856, + "learning_rate": 2.3341599504029758e-07, + "loss": 0.9619, + "step": 60240 + }, + { + "epoch": 4.668913944747956, + "grad_norm": 1.3512789734367896, + "learning_rate": 2.3345474271543707e-07, + "loss": 0.9498, + "step": 60250 + }, + { + "epoch": 4.669688868224263, + "grad_norm": 1.3283464486342789, + "learning_rate": 2.3349349039057657e-07, + "loss": 0.9589, + "step": 60260 + }, + { + "epoch": 4.67046379170057, + "grad_norm": 1.3280181377878806, + "learning_rate": 2.3353223806571606e-07, + "loss": 0.942, + "step": 60270 + }, + { + "epoch": 4.671238715176877, + "grad_norm": 1.3657378267836133, + "learning_rate": 2.3357098574085556e-07, + "loss": 0.952, + "step": 60280 + }, + { + "epoch": 4.672013638653183, + "grad_norm": 1.408934773777593, + "learning_rate": 2.3360973341599503e-07, + "loss": 0.9515, + "step": 60290 + }, + { + "epoch": 4.672788562129489, + "grad_norm": 1.3788411191874375, + "learning_rate": 2.3364848109113452e-07, + "loss": 0.9645, + "step": 60300 + }, + { + "epoch": 4.673563485605796, + "grad_norm": 1.368461360045192, + "learning_rate": 2.3368722876627402e-07, + "loss": 0.9656, + "step": 60310 + }, + { + "epoch": 4.674338409082103, + "grad_norm": 1.364641955600341, + "learning_rate": 2.3372597644141352e-07, + "loss": 0.9542, + "step": 60320 + }, + { + "epoch": 4.67511333255841, + "grad_norm": 1.367099876222679, + "learning_rate": 2.33764724116553e-07, + "loss": 0.9644, + "step": 60330 + }, + { + "epoch": 4.675888256034717, + "grad_norm": 1.3966190503457498, + "learning_rate": 2.338034717916925e-07, + "loss": 0.9708, + "step": 60340 + }, + { + "epoch": 4.676663179511023, + "grad_norm": 1.3668491777752423, + "learning_rate": 2.33842219466832e-07, + "loss": 0.9621, + "step": 60350 + }, + { + "epoch": 4.67743810298733, + "grad_norm": 1.3062615754132951, + "learning_rate": 2.3388096714197147e-07, + "loss": 0.9764, + "step": 60360 + }, + { + "epoch": 4.678213026463637, + "grad_norm": 1.2747289882941442, + "learning_rate": 2.3391971481711097e-07, + "loss": 0.9445, + "step": 60370 + }, + { + "epoch": 4.678987949939943, + "grad_norm": 1.2836285868881774, + "learning_rate": 2.3395846249225046e-07, + "loss": 0.9688, + "step": 60380 + }, + { + "epoch": 4.67976287341625, + "grad_norm": 1.3473939511739144, + "learning_rate": 2.3399721016738996e-07, + "loss": 0.9567, + "step": 60390 + }, + { + "epoch": 4.680537796892557, + "grad_norm": 1.37054384964171, + "learning_rate": 2.3403595784252945e-07, + "loss": 0.9632, + "step": 60400 + }, + { + "epoch": 4.681312720368863, + "grad_norm": 1.3574450493324826, + "learning_rate": 2.3407470551766895e-07, + "loss": 0.9632, + "step": 60410 + }, + { + "epoch": 4.68208764384517, + "grad_norm": 1.3209768235771682, + "learning_rate": 2.3411345319280844e-07, + "loss": 0.9627, + "step": 60420 + }, + { + "epoch": 4.682862567321477, + "grad_norm": 1.677421519089842, + "learning_rate": 2.3415220086794791e-07, + "loss": 0.9696, + "step": 60430 + }, + { + "epoch": 4.683637490797784, + "grad_norm": 1.3710038324178713, + "learning_rate": 2.341909485430874e-07, + "loss": 0.9552, + "step": 60440 + }, + { + "epoch": 4.684412414274091, + "grad_norm": 1.3837150689274345, + "learning_rate": 2.342296962182269e-07, + "loss": 0.993, + "step": 60450 + }, + { + "epoch": 4.685187337750397, + "grad_norm": 1.2962152770477489, + "learning_rate": 2.342684438933664e-07, + "loss": 0.9415, + "step": 60460 + }, + { + "epoch": 4.685962261226704, + "grad_norm": 1.320525272684243, + "learning_rate": 2.343071915685059e-07, + "loss": 0.9543, + "step": 60470 + }, + { + "epoch": 4.686737184703011, + "grad_norm": 1.3446507253494875, + "learning_rate": 2.343459392436454e-07, + "loss": 0.9846, + "step": 60480 + }, + { + "epoch": 4.687512108179317, + "grad_norm": 1.4671296210495077, + "learning_rate": 2.343846869187849e-07, + "loss": 0.9907, + "step": 60490 + }, + { + "epoch": 4.688287031655624, + "grad_norm": 1.3627852338260842, + "learning_rate": 2.344234345939244e-07, + "loss": 0.9644, + "step": 60500 + }, + { + "epoch": 4.688287031655624, + "eval_loss": 0.9618179202079773, + "eval_runtime": 320.8035, + "eval_samples_per_second": 35.757, + "eval_steps_per_second": 8.94, + "step": 60500 + }, + { + "epoch": 4.689061955131931, + "grad_norm": 1.3416994928638821, + "learning_rate": 2.344621822690639e-07, + "loss": 0.9694, + "step": 60510 + }, + { + "epoch": 4.689836878608237, + "grad_norm": 1.3051505559398355, + "learning_rate": 2.3450092994420337e-07, + "loss": 0.9676, + "step": 60520 + }, + { + "epoch": 4.690611802084544, + "grad_norm": 1.3031174743490923, + "learning_rate": 2.3453967761934287e-07, + "loss": 0.9576, + "step": 60530 + }, + { + "epoch": 4.691386725560851, + "grad_norm": 1.3025760946706595, + "learning_rate": 2.3457842529448236e-07, + "loss": 0.9601, + "step": 60540 + }, + { + "epoch": 4.692161649037158, + "grad_norm": 1.326381415413032, + "learning_rate": 2.3461717296962186e-07, + "loss": 0.9541, + "step": 60550 + }, + { + "epoch": 4.692936572513465, + "grad_norm": 1.3447705112999724, + "learning_rate": 2.3465592064476136e-07, + "loss": 0.9432, + "step": 60560 + }, + { + "epoch": 4.6937114959897706, + "grad_norm": 1.3202760872394126, + "learning_rate": 2.3469466831990085e-07, + "loss": 0.9814, + "step": 60570 + }, + { + "epoch": 4.694486419466077, + "grad_norm": 1.2904901943140623, + "learning_rate": 2.3473341599504035e-07, + "loss": 0.9449, + "step": 60580 + }, + { + "epoch": 4.695261342942384, + "grad_norm": 1.443999954036201, + "learning_rate": 2.3477216367017982e-07, + "loss": 0.9723, + "step": 60590 + }, + { + "epoch": 4.696036266418691, + "grad_norm": 1.3511460708262855, + "learning_rate": 2.348109113453193e-07, + "loss": 0.9633, + "step": 60600 + }, + { + "epoch": 4.696811189894998, + "grad_norm": 1.4243886301001354, + "learning_rate": 2.348496590204588e-07, + "loss": 0.9689, + "step": 60610 + }, + { + "epoch": 4.697586113371305, + "grad_norm": 1.2669348176922433, + "learning_rate": 2.348884066955983e-07, + "loss": 0.9763, + "step": 60620 + }, + { + "epoch": 4.698361036847611, + "grad_norm": 1.3201160859067564, + "learning_rate": 2.349271543707378e-07, + "loss": 0.9413, + "step": 60630 + }, + { + "epoch": 4.699135960323918, + "grad_norm": 1.3540796008565794, + "learning_rate": 2.349659020458773e-07, + "loss": 0.9711, + "step": 60640 + }, + { + "epoch": 4.699910883800225, + "grad_norm": 1.3572923207852765, + "learning_rate": 2.350046497210168e-07, + "loss": 0.9667, + "step": 60650 + }, + { + "epoch": 4.700685807276532, + "grad_norm": 1.2977533940683612, + "learning_rate": 2.3504339739615626e-07, + "loss": 0.9877, + "step": 60660 + }, + { + "epoch": 4.701460730752838, + "grad_norm": 1.2909016474112636, + "learning_rate": 2.3508214507129575e-07, + "loss": 0.9249, + "step": 60670 + }, + { + "epoch": 4.702235654229145, + "grad_norm": 1.328021035846879, + "learning_rate": 2.3512089274643525e-07, + "loss": 0.9429, + "step": 60680 + }, + { + "epoch": 4.703010577705451, + "grad_norm": 1.329335450496829, + "learning_rate": 2.3515964042157474e-07, + "loss": 0.9444, + "step": 60690 + }, + { + "epoch": 4.703785501181758, + "grad_norm": 1.4723401412692674, + "learning_rate": 2.3519838809671424e-07, + "loss": 0.9766, + "step": 60700 + }, + { + "epoch": 4.704560424658065, + "grad_norm": 1.254078158341728, + "learning_rate": 2.3523713577185373e-07, + "loss": 0.9545, + "step": 60710 + }, + { + "epoch": 4.705335348134372, + "grad_norm": 1.431324946656076, + "learning_rate": 2.352758834469932e-07, + "loss": 0.9863, + "step": 60720 + }, + { + "epoch": 4.706110271610679, + "grad_norm": 1.2896900377147984, + "learning_rate": 2.353146311221327e-07, + "loss": 0.9578, + "step": 60730 + }, + { + "epoch": 4.706885195086985, + "grad_norm": 1.3629542491230475, + "learning_rate": 2.353533787972722e-07, + "loss": 0.9661, + "step": 60740 + }, + { + "epoch": 4.707660118563292, + "grad_norm": 1.4646818945116498, + "learning_rate": 2.353921264724117e-07, + "loss": 0.9489, + "step": 60750 + }, + { + "epoch": 4.708435042039598, + "grad_norm": 1.3717734056894642, + "learning_rate": 2.3543087414755119e-07, + "loss": 0.9616, + "step": 60760 + }, + { + "epoch": 4.709209965515905, + "grad_norm": 1.3912206721808846, + "learning_rate": 2.3546962182269068e-07, + "loss": 0.9776, + "step": 60770 + }, + { + "epoch": 4.709984888992212, + "grad_norm": 1.3483116377194, + "learning_rate": 2.3550836949783018e-07, + "loss": 0.9681, + "step": 60780 + }, + { + "epoch": 4.710759812468519, + "grad_norm": 1.3393926176779396, + "learning_rate": 2.3554711717296965e-07, + "loss": 0.9725, + "step": 60790 + }, + { + "epoch": 4.711534735944825, + "grad_norm": 1.4194576360064304, + "learning_rate": 2.3558586484810914e-07, + "loss": 0.9654, + "step": 60800 + }, + { + "epoch": 4.712309659421132, + "grad_norm": 1.4090448995056257, + "learning_rate": 2.3562461252324864e-07, + "loss": 0.9689, + "step": 60810 + }, + { + "epoch": 4.713084582897439, + "grad_norm": 1.3684139908482227, + "learning_rate": 2.3566336019838813e-07, + "loss": 0.9676, + "step": 60820 + }, + { + "epoch": 4.713859506373746, + "grad_norm": 1.4015891365959472, + "learning_rate": 2.3570210787352763e-07, + "loss": 0.9772, + "step": 60830 + }, + { + "epoch": 4.714634429850053, + "grad_norm": 1.3421436669419493, + "learning_rate": 2.3574085554866712e-07, + "loss": 0.9736, + "step": 60840 + }, + { + "epoch": 4.7154093533263595, + "grad_norm": 1.3729606418947446, + "learning_rate": 2.3577960322380662e-07, + "loss": 0.9701, + "step": 60850 + }, + { + "epoch": 4.716184276802665, + "grad_norm": 1.3658615308172573, + "learning_rate": 2.358183508989461e-07, + "loss": 0.9578, + "step": 60860 + }, + { + "epoch": 4.716959200278972, + "grad_norm": 1.3482212878744126, + "learning_rate": 2.3585709857408558e-07, + "loss": 0.9379, + "step": 60870 + }, + { + "epoch": 4.717734123755279, + "grad_norm": 1.390096898829496, + "learning_rate": 2.3589584624922508e-07, + "loss": 0.9455, + "step": 60880 + }, + { + "epoch": 4.718509047231586, + "grad_norm": 1.3851069090210277, + "learning_rate": 2.3593459392436457e-07, + "loss": 0.9514, + "step": 60890 + }, + { + "epoch": 4.719283970707893, + "grad_norm": 1.3998911733753385, + "learning_rate": 2.3597334159950407e-07, + "loss": 0.9592, + "step": 60900 + }, + { + "epoch": 4.720058894184199, + "grad_norm": 1.2931831854629054, + "learning_rate": 2.3601208927464357e-07, + "loss": 0.9392, + "step": 60910 + }, + { + "epoch": 4.720833817660506, + "grad_norm": 1.3585889102220141, + "learning_rate": 2.3605083694978306e-07, + "loss": 0.9819, + "step": 60920 + }, + { + "epoch": 4.721608741136813, + "grad_norm": 1.2988608360956433, + "learning_rate": 2.3608958462492253e-07, + "loss": 0.956, + "step": 60930 + }, + { + "epoch": 4.722383664613119, + "grad_norm": 1.3448076828413278, + "learning_rate": 2.3612833230006203e-07, + "loss": 0.9493, + "step": 60940 + }, + { + "epoch": 4.723158588089426, + "grad_norm": 1.4365211800158186, + "learning_rate": 2.3616707997520152e-07, + "loss": 0.9521, + "step": 60950 + }, + { + "epoch": 4.723933511565733, + "grad_norm": 1.4064115173521643, + "learning_rate": 2.3620582765034102e-07, + "loss": 0.9565, + "step": 60960 + }, + { + "epoch": 4.724708435042039, + "grad_norm": 1.403936116528265, + "learning_rate": 2.362445753254805e-07, + "loss": 0.9753, + "step": 60970 + }, + { + "epoch": 4.725483358518346, + "grad_norm": 1.4037872263443965, + "learning_rate": 2.3628332300062e-07, + "loss": 0.9661, + "step": 60980 + }, + { + "epoch": 4.726258281994653, + "grad_norm": 1.2803691962797432, + "learning_rate": 2.363220706757595e-07, + "loss": 0.9526, + "step": 60990 + }, + { + "epoch": 4.72703320547096, + "grad_norm": 1.300127711150459, + "learning_rate": 2.3636081835089897e-07, + "loss": 0.9313, + "step": 61000 + }, + { + "epoch": 4.72703320547096, + "eval_loss": 0.9610848426818848, + "eval_runtime": 319.891, + "eval_samples_per_second": 35.859, + "eval_steps_per_second": 8.966, + "step": 61000 + }, + { + "epoch": 4.727808128947267, + "grad_norm": 1.3351744059716038, + "learning_rate": 2.3639956602603847e-07, + "loss": 0.9772, + "step": 61010 + }, + { + "epoch": 4.7285830524235735, + "grad_norm": 1.249263005463536, + "learning_rate": 2.3643831370117796e-07, + "loss": 0.9441, + "step": 61020 + }, + { + "epoch": 4.72935797589988, + "grad_norm": 1.3994945752373913, + "learning_rate": 2.3647706137631746e-07, + "loss": 0.9426, + "step": 61030 + }, + { + "epoch": 4.730132899376187, + "grad_norm": 1.3913397004108343, + "learning_rate": 2.3651580905145695e-07, + "loss": 0.9628, + "step": 61040 + }, + { + "epoch": 4.730907822852493, + "grad_norm": 1.3634196313197917, + "learning_rate": 2.3655455672659645e-07, + "loss": 0.9495, + "step": 61050 + }, + { + "epoch": 4.7316827463288, + "grad_norm": 1.334836031917736, + "learning_rate": 2.3659330440173595e-07, + "loss": 0.9391, + "step": 61060 + }, + { + "epoch": 4.732457669805107, + "grad_norm": 1.362059003389491, + "learning_rate": 2.3663205207687541e-07, + "loss": 0.9489, + "step": 61070 + }, + { + "epoch": 4.733232593281413, + "grad_norm": 1.2827859714204095, + "learning_rate": 2.366707997520149e-07, + "loss": 0.9451, + "step": 61080 + }, + { + "epoch": 4.73400751675772, + "grad_norm": 1.3460413086345917, + "learning_rate": 2.367095474271544e-07, + "loss": 0.9839, + "step": 61090 + }, + { + "epoch": 4.734782440234027, + "grad_norm": 1.343141941848321, + "learning_rate": 2.367482951022939e-07, + "loss": 0.9827, + "step": 61100 + }, + { + "epoch": 4.735557363710334, + "grad_norm": 1.2946140623179447, + "learning_rate": 2.367870427774334e-07, + "loss": 0.9796, + "step": 61110 + }, + { + "epoch": 4.736332287186641, + "grad_norm": 1.331180438749924, + "learning_rate": 2.368257904525729e-07, + "loss": 0.9506, + "step": 61120 + }, + { + "epoch": 4.737107210662947, + "grad_norm": 1.2833296285691738, + "learning_rate": 2.3686453812771236e-07, + "loss": 0.9478, + "step": 61130 + }, + { + "epoch": 4.737882134139253, + "grad_norm": 1.343615270329413, + "learning_rate": 2.3690328580285186e-07, + "loss": 0.9737, + "step": 61140 + }, + { + "epoch": 4.73865705761556, + "grad_norm": 1.3310411723688444, + "learning_rate": 2.3694203347799135e-07, + "loss": 0.9589, + "step": 61150 + }, + { + "epoch": 4.739431981091867, + "grad_norm": 1.2915519331073886, + "learning_rate": 2.3698078115313085e-07, + "loss": 0.941, + "step": 61160 + }, + { + "epoch": 4.740206904568174, + "grad_norm": 1.3116834627441525, + "learning_rate": 2.3701952882827034e-07, + "loss": 0.9633, + "step": 61170 + }, + { + "epoch": 4.740981828044481, + "grad_norm": 1.3027933538006526, + "learning_rate": 2.3705827650340984e-07, + "loss": 0.9656, + "step": 61180 + }, + { + "epoch": 4.7417567515207875, + "grad_norm": 1.3990914960927292, + "learning_rate": 2.3709702417854933e-07, + "loss": 0.965, + "step": 61190 + }, + { + "epoch": 4.742531674997094, + "grad_norm": 1.3141360603915957, + "learning_rate": 2.371357718536888e-07, + "loss": 0.9597, + "step": 61200 + }, + { + "epoch": 4.743306598473401, + "grad_norm": 1.4351982591806165, + "learning_rate": 2.371745195288283e-07, + "loss": 0.9516, + "step": 61210 + }, + { + "epoch": 4.744081521949708, + "grad_norm": 1.4013736462336925, + "learning_rate": 2.372132672039678e-07, + "loss": 0.965, + "step": 61220 + }, + { + "epoch": 4.744856445426014, + "grad_norm": 1.3550025195251527, + "learning_rate": 2.372520148791073e-07, + "loss": 0.9622, + "step": 61230 + }, + { + "epoch": 4.745631368902321, + "grad_norm": 1.3699182360006203, + "learning_rate": 2.3729076255424678e-07, + "loss": 0.9474, + "step": 61240 + }, + { + "epoch": 4.746406292378627, + "grad_norm": 1.354202252469217, + "learning_rate": 2.3732951022938628e-07, + "loss": 0.9345, + "step": 61250 + }, + { + "epoch": 4.747181215854934, + "grad_norm": 1.4428550141162626, + "learning_rate": 2.3736825790452578e-07, + "loss": 0.9604, + "step": 61260 + }, + { + "epoch": 4.747956139331241, + "grad_norm": 1.2963382501074088, + "learning_rate": 2.3740700557966525e-07, + "loss": 0.9503, + "step": 61270 + }, + { + "epoch": 4.748731062807548, + "grad_norm": 1.2929583847280413, + "learning_rate": 2.3744575325480474e-07, + "loss": 0.9469, + "step": 61280 + }, + { + "epoch": 4.749505986283855, + "grad_norm": 1.3511378558138842, + "learning_rate": 2.3748450092994424e-07, + "loss": 0.9612, + "step": 61290 + }, + { + "epoch": 4.7502809097601615, + "grad_norm": 1.418992810237318, + "learning_rate": 2.3752324860508373e-07, + "loss": 0.9618, + "step": 61300 + }, + { + "epoch": 4.751055833236467, + "grad_norm": 1.2964433256008372, + "learning_rate": 2.3756199628022323e-07, + "loss": 0.9593, + "step": 61310 + }, + { + "epoch": 4.751830756712774, + "grad_norm": 1.3392955331252794, + "learning_rate": 2.3760074395536272e-07, + "loss": 0.9595, + "step": 61320 + }, + { + "epoch": 4.752605680189081, + "grad_norm": 1.3660210442505758, + "learning_rate": 2.3763949163050222e-07, + "loss": 0.9839, + "step": 61330 + }, + { + "epoch": 4.753380603665388, + "grad_norm": 1.2825573317578798, + "learning_rate": 2.376782393056417e-07, + "loss": 0.9978, + "step": 61340 + }, + { + "epoch": 4.754155527141695, + "grad_norm": 1.3135675444402786, + "learning_rate": 2.3771698698078118e-07, + "loss": 0.9771, + "step": 61350 + }, + { + "epoch": 4.7549304506180015, + "grad_norm": 1.365326566055974, + "learning_rate": 2.3775573465592068e-07, + "loss": 0.9402, + "step": 61360 + }, + { + "epoch": 4.755705374094308, + "grad_norm": 1.376105838872183, + "learning_rate": 2.3779448233106017e-07, + "loss": 0.954, + "step": 61370 + }, + { + "epoch": 4.756480297570615, + "grad_norm": 1.4494535245728912, + "learning_rate": 2.3783323000619967e-07, + "loss": 0.9527, + "step": 61380 + }, + { + "epoch": 4.757255221046922, + "grad_norm": 1.2834389067472864, + "learning_rate": 2.3787197768133916e-07, + "loss": 0.9513, + "step": 61390 + }, + { + "epoch": 4.758030144523229, + "grad_norm": 1.438694660592753, + "learning_rate": 2.3791072535647866e-07, + "loss": 0.9619, + "step": 61400 + }, + { + "epoch": 4.7588050679995355, + "grad_norm": 1.3590792328536598, + "learning_rate": 2.3794947303161813e-07, + "loss": 0.9538, + "step": 61410 + }, + { + "epoch": 4.759579991475841, + "grad_norm": 1.3758437630539595, + "learning_rate": 2.3798822070675762e-07, + "loss": 0.9724, + "step": 61420 + }, + { + "epoch": 4.760354914952148, + "grad_norm": 1.4073598032152115, + "learning_rate": 2.3802696838189712e-07, + "loss": 0.9571, + "step": 61430 + }, + { + "epoch": 4.761129838428455, + "grad_norm": 1.3127435389269573, + "learning_rate": 2.3806571605703662e-07, + "loss": 0.9464, + "step": 61440 + }, + { + "epoch": 4.761904761904762, + "grad_norm": 1.364498385289576, + "learning_rate": 2.381044637321761e-07, + "loss": 0.9677, + "step": 61450 + }, + { + "epoch": 4.762679685381069, + "grad_norm": 1.3699302164012637, + "learning_rate": 2.381432114073156e-07, + "loss": 0.9654, + "step": 61460 + }, + { + "epoch": 4.7634546088573755, + "grad_norm": 1.2955947403107968, + "learning_rate": 2.3818195908245508e-07, + "loss": 0.952, + "step": 61470 + }, + { + "epoch": 4.764229532333682, + "grad_norm": 1.3896239742831697, + "learning_rate": 2.3822070675759457e-07, + "loss": 0.9458, + "step": 61480 + }, + { + "epoch": 4.765004455809989, + "grad_norm": 1.699017739371599, + "learning_rate": 2.3825945443273407e-07, + "loss": 0.9859, + "step": 61490 + }, + { + "epoch": 4.765779379286295, + "grad_norm": 1.4118209738997625, + "learning_rate": 2.3829820210787356e-07, + "loss": 0.9578, + "step": 61500 + }, + { + "epoch": 4.765779379286295, + "eval_loss": 0.9603410363197327, + "eval_runtime": 319.7226, + "eval_samples_per_second": 35.878, + "eval_steps_per_second": 8.97, + "step": 61500 + }, + { + "epoch": 4.766554302762602, + "grad_norm": 1.3187454204763542, + "learning_rate": 2.3833694978301306e-07, + "loss": 0.962, + "step": 61510 + }, + { + "epoch": 4.767329226238909, + "grad_norm": 1.331220147956138, + "learning_rate": 2.3837569745815255e-07, + "loss": 0.9636, + "step": 61520 + }, + { + "epoch": 4.7681041497152155, + "grad_norm": 1.3083019682776338, + "learning_rate": 2.3841444513329205e-07, + "loss": 0.9518, + "step": 61530 + }, + { + "epoch": 4.768879073191522, + "grad_norm": 1.324203891652345, + "learning_rate": 2.3845319280843154e-07, + "loss": 0.9545, + "step": 61540 + }, + { + "epoch": 4.769653996667829, + "grad_norm": 1.3227739904981224, + "learning_rate": 2.38491940483571e-07, + "loss": 0.9407, + "step": 61550 + }, + { + "epoch": 4.770428920144136, + "grad_norm": 1.387828238229899, + "learning_rate": 2.3853068815871054e-07, + "loss": 0.9475, + "step": 61560 + }, + { + "epoch": 4.771203843620443, + "grad_norm": 1.3529680555800727, + "learning_rate": 2.3856943583385e-07, + "loss": 0.971, + "step": 61570 + }, + { + "epoch": 4.7719787670967495, + "grad_norm": 1.4058230304243144, + "learning_rate": 2.3860818350898947e-07, + "loss": 0.964, + "step": 61580 + }, + { + "epoch": 4.772753690573056, + "grad_norm": 1.354003892412098, + "learning_rate": 2.38646931184129e-07, + "loss": 0.9577, + "step": 61590 + }, + { + "epoch": 4.773528614049362, + "grad_norm": 1.301694111323529, + "learning_rate": 2.3868567885926846e-07, + "loss": 0.9603, + "step": 61600 + }, + { + "epoch": 4.774303537525669, + "grad_norm": 1.3055823337998944, + "learning_rate": 2.38724426534408e-07, + "loss": 0.9491, + "step": 61610 + }, + { + "epoch": 4.775078461001976, + "grad_norm": 1.3930416160850265, + "learning_rate": 2.3876317420954746e-07, + "loss": 0.9621, + "step": 61620 + }, + { + "epoch": 4.775853384478283, + "grad_norm": 1.480459130172294, + "learning_rate": 2.38801921884687e-07, + "loss": 0.9656, + "step": 61630 + }, + { + "epoch": 4.7766283079545895, + "grad_norm": 1.5930225905372295, + "learning_rate": 2.3884066955982645e-07, + "loss": 0.992, + "step": 61640 + }, + { + "epoch": 4.777403231430896, + "grad_norm": 1.3021340838681001, + "learning_rate": 2.388794172349659e-07, + "loss": 0.9766, + "step": 61650 + }, + { + "epoch": 4.778178154907203, + "grad_norm": 1.3735847756632438, + "learning_rate": 2.3891816491010544e-07, + "loss": 0.9552, + "step": 61660 + }, + { + "epoch": 4.77895307838351, + "grad_norm": 1.2672830633885113, + "learning_rate": 2.389569125852449e-07, + "loss": 0.9531, + "step": 61670 + }, + { + "epoch": 4.779728001859817, + "grad_norm": 1.4057411788512595, + "learning_rate": 2.3899566026038443e-07, + "loss": 0.9825, + "step": 61680 + }, + { + "epoch": 4.780502925336123, + "grad_norm": 1.339209676600012, + "learning_rate": 2.390344079355239e-07, + "loss": 0.9486, + "step": 61690 + }, + { + "epoch": 4.7812778488124295, + "grad_norm": 1.3440682487457165, + "learning_rate": 2.390731556106634e-07, + "loss": 0.9545, + "step": 61700 + }, + { + "epoch": 4.782052772288736, + "grad_norm": 1.3797181872671669, + "learning_rate": 2.391119032858029e-07, + "loss": 0.9376, + "step": 61710 + }, + { + "epoch": 4.782827695765043, + "grad_norm": 1.5106688143647498, + "learning_rate": 2.3915065096094236e-07, + "loss": 0.9484, + "step": 61720 + }, + { + "epoch": 4.78360261924135, + "grad_norm": 1.4326850707230554, + "learning_rate": 2.391893986360819e-07, + "loss": 0.9537, + "step": 61730 + }, + { + "epoch": 4.784377542717657, + "grad_norm": 1.3576309489166702, + "learning_rate": 2.3922814631122135e-07, + "loss": 0.9812, + "step": 61740 + }, + { + "epoch": 4.7851524661939635, + "grad_norm": 1.396281570171826, + "learning_rate": 2.3926689398636087e-07, + "loss": 0.9739, + "step": 61750 + }, + { + "epoch": 4.78592738967027, + "grad_norm": 1.2913173760409438, + "learning_rate": 2.3930564166150034e-07, + "loss": 0.9365, + "step": 61760 + }, + { + "epoch": 4.786702313146577, + "grad_norm": 1.3171001669301623, + "learning_rate": 2.3934438933663986e-07, + "loss": 0.9506, + "step": 61770 + }, + { + "epoch": 4.787477236622884, + "grad_norm": 1.3486314498482286, + "learning_rate": 2.3938313701177933e-07, + "loss": 0.9608, + "step": 61780 + }, + { + "epoch": 4.78825216009919, + "grad_norm": 1.3469689407680843, + "learning_rate": 2.394218846869188e-07, + "loss": 0.9734, + "step": 61790 + }, + { + "epoch": 4.789027083575497, + "grad_norm": 1.4733935092919714, + "learning_rate": 2.394606323620583e-07, + "loss": 0.9468, + "step": 61800 + }, + { + "epoch": 4.7898020070518035, + "grad_norm": 1.360096405056326, + "learning_rate": 2.394993800371978e-07, + "loss": 0.952, + "step": 61810 + }, + { + "epoch": 4.79057693052811, + "grad_norm": 1.2584507099586697, + "learning_rate": 2.395381277123373e-07, + "loss": 0.9579, + "step": 61820 + }, + { + "epoch": 4.791351854004417, + "grad_norm": 1.3580370307388703, + "learning_rate": 2.395768753874768e-07, + "loss": 0.9822, + "step": 61830 + }, + { + "epoch": 4.792126777480724, + "grad_norm": 1.3961482633865954, + "learning_rate": 2.396156230626163e-07, + "loss": 0.9522, + "step": 61840 + }, + { + "epoch": 4.792901700957031, + "grad_norm": 1.2640891822991194, + "learning_rate": 2.3965437073775577e-07, + "loss": 0.9688, + "step": 61850 + }, + { + "epoch": 4.7936766244333375, + "grad_norm": 1.2981262955519275, + "learning_rate": 2.3969311841289524e-07, + "loss": 0.9695, + "step": 61860 + }, + { + "epoch": 4.7944515479096435, + "grad_norm": 1.42130542004679, + "learning_rate": 2.3973186608803476e-07, + "loss": 0.9595, + "step": 61870 + }, + { + "epoch": 4.79522647138595, + "grad_norm": 1.3849156660701298, + "learning_rate": 2.3977061376317423e-07, + "loss": 0.9526, + "step": 61880 + }, + { + "epoch": 4.796001394862257, + "grad_norm": 1.396813851881217, + "learning_rate": 2.3980936143831375e-07, + "loss": 0.9671, + "step": 61890 + }, + { + "epoch": 4.796776318338564, + "grad_norm": 1.312528632397862, + "learning_rate": 2.398481091134532e-07, + "loss": 0.9441, + "step": 61900 + }, + { + "epoch": 4.797551241814871, + "grad_norm": 1.3767975609034866, + "learning_rate": 2.3988685678859275e-07, + "loss": 0.9826, + "step": 61910 + }, + { + "epoch": 4.7983261652911775, + "grad_norm": 1.3904208769988582, + "learning_rate": 2.399256044637322e-07, + "loss": 0.9554, + "step": 61920 + }, + { + "epoch": 4.799101088767484, + "grad_norm": 1.4097595390142905, + "learning_rate": 2.399643521388717e-07, + "loss": 0.9626, + "step": 61930 + }, + { + "epoch": 4.799876012243791, + "grad_norm": 1.2376990159741263, + "learning_rate": 2.400030998140112e-07, + "loss": 0.9523, + "step": 61940 + }, + { + "epoch": 4.800650935720098, + "grad_norm": 1.4208601515486816, + "learning_rate": 2.400418474891507e-07, + "loss": 0.949, + "step": 61950 + }, + { + "epoch": 4.801425859196405, + "grad_norm": 1.3598644450161856, + "learning_rate": 2.400805951642902e-07, + "loss": 0.9281, + "step": 61960 + }, + { + "epoch": 4.8022007826727116, + "grad_norm": 1.4775696503200437, + "learning_rate": 2.4011934283942967e-07, + "loss": 0.9705, + "step": 61970 + }, + { + "epoch": 4.8029757061490175, + "grad_norm": 1.2648042319319248, + "learning_rate": 2.401580905145692e-07, + "loss": 0.9692, + "step": 61980 + }, + { + "epoch": 4.803750629625324, + "grad_norm": 1.369803722622953, + "learning_rate": 2.4019683818970866e-07, + "loss": 0.9776, + "step": 61990 + }, + { + "epoch": 4.804525553101631, + "grad_norm": 1.329454149289168, + "learning_rate": 2.402355858648481e-07, + "loss": 0.9503, + "step": 62000 + }, + { + "epoch": 4.804525553101631, + "eval_loss": 0.959772527217865, + "eval_runtime": 319.2065, + "eval_samples_per_second": 35.936, + "eval_steps_per_second": 8.985, + "step": 62000 + }, + { + "epoch": 4.805300476577938, + "grad_norm": 1.347911064279192, + "learning_rate": 2.4027433353998765e-07, + "loss": 0.9373, + "step": 62010 + }, + { + "epoch": 4.806075400054245, + "grad_norm": 1.425465913883068, + "learning_rate": 2.403130812151271e-07, + "loss": 0.9621, + "step": 62020 + }, + { + "epoch": 4.8068503235305515, + "grad_norm": 1.3493356839666566, + "learning_rate": 2.4035182889026664e-07, + "loss": 0.9817, + "step": 62030 + }, + { + "epoch": 4.807625247006858, + "grad_norm": 1.452482634592881, + "learning_rate": 2.403905765654061e-07, + "loss": 0.9424, + "step": 62040 + }, + { + "epoch": 4.808400170483165, + "grad_norm": 1.3931929748512129, + "learning_rate": 2.404293242405456e-07, + "loss": 0.9649, + "step": 62050 + }, + { + "epoch": 4.809175093959471, + "grad_norm": 1.3386863498942845, + "learning_rate": 2.404680719156851e-07, + "loss": 0.9623, + "step": 62060 + }, + { + "epoch": 4.809950017435778, + "grad_norm": 1.338549506935902, + "learning_rate": 2.4050681959082457e-07, + "loss": 0.9555, + "step": 62070 + }, + { + "epoch": 4.810724940912085, + "grad_norm": 1.5272394280576191, + "learning_rate": 2.405455672659641e-07, + "loss": 0.9541, + "step": 62080 + }, + { + "epoch": 4.8114998643883915, + "grad_norm": 1.3500673177201141, + "learning_rate": 2.4058431494110356e-07, + "loss": 0.9734, + "step": 62090 + }, + { + "epoch": 4.812274787864698, + "grad_norm": 1.3796339832680953, + "learning_rate": 2.406230626162431e-07, + "loss": 0.9618, + "step": 62100 + }, + { + "epoch": 4.813049711341005, + "grad_norm": 1.7110869509125566, + "learning_rate": 2.4066181029138255e-07, + "loss": 0.9734, + "step": 62110 + }, + { + "epoch": 4.813824634817312, + "grad_norm": 1.3167244979601476, + "learning_rate": 2.40700557966522e-07, + "loss": 0.9514, + "step": 62120 + }, + { + "epoch": 4.814599558293619, + "grad_norm": 1.27515945440787, + "learning_rate": 2.4073930564166154e-07, + "loss": 0.9442, + "step": 62130 + }, + { + "epoch": 4.8153744817699256, + "grad_norm": 1.3392734131891066, + "learning_rate": 2.40778053316801e-07, + "loss": 0.9811, + "step": 62140 + }, + { + "epoch": 4.816149405246232, + "grad_norm": 1.3920807165218116, + "learning_rate": 2.4081680099194053e-07, + "loss": 0.9516, + "step": 62150 + }, + { + "epoch": 4.816924328722538, + "grad_norm": 1.4167849969358735, + "learning_rate": 2.4085554866708e-07, + "loss": 0.977, + "step": 62160 + }, + { + "epoch": 4.817699252198845, + "grad_norm": 1.3280940260259506, + "learning_rate": 2.408942963422195e-07, + "loss": 0.9535, + "step": 62170 + }, + { + "epoch": 4.818474175675152, + "grad_norm": 1.2983155059723235, + "learning_rate": 2.40933044017359e-07, + "loss": 0.9566, + "step": 62180 + }, + { + "epoch": 4.819249099151459, + "grad_norm": 1.3677463109672738, + "learning_rate": 2.4097179169249846e-07, + "loss": 0.9761, + "step": 62190 + }, + { + "epoch": 4.8200240226277655, + "grad_norm": 1.3702908435283296, + "learning_rate": 2.41010539367638e-07, + "loss": 0.9421, + "step": 62200 + }, + { + "epoch": 4.820798946104072, + "grad_norm": 1.3751969705530227, + "learning_rate": 2.4104928704277745e-07, + "loss": 0.9478, + "step": 62210 + }, + { + "epoch": 4.821573869580379, + "grad_norm": 1.4004157889337119, + "learning_rate": 2.41088034717917e-07, + "loss": 0.9678, + "step": 62220 + }, + { + "epoch": 4.822348793056686, + "grad_norm": 1.3110321800941143, + "learning_rate": 2.4112678239305644e-07, + "loss": 0.9287, + "step": 62230 + }, + { + "epoch": 4.823123716532992, + "grad_norm": 1.375399463797736, + "learning_rate": 2.4116553006819597e-07, + "loss": 0.9636, + "step": 62240 + }, + { + "epoch": 4.823898640009299, + "grad_norm": 1.3159531093972263, + "learning_rate": 2.4120427774333543e-07, + "loss": 0.9386, + "step": 62250 + }, + { + "epoch": 4.8246735634856055, + "grad_norm": 1.3642270606143005, + "learning_rate": 2.412430254184749e-07, + "loss": 0.9729, + "step": 62260 + }, + { + "epoch": 4.825448486961912, + "grad_norm": 1.3987359272698883, + "learning_rate": 2.412817730936144e-07, + "loss": 0.959, + "step": 62270 + }, + { + "epoch": 4.826223410438219, + "grad_norm": 1.385693425241169, + "learning_rate": 2.413205207687539e-07, + "loss": 0.9503, + "step": 62280 + }, + { + "epoch": 4.826998333914526, + "grad_norm": 1.2979003637482744, + "learning_rate": 2.413592684438934e-07, + "loss": 0.9723, + "step": 62290 + }, + { + "epoch": 4.827773257390833, + "grad_norm": 1.3746911247314486, + "learning_rate": 2.413980161190329e-07, + "loss": 0.9636, + "step": 62300 + }, + { + "epoch": 4.8285481808671395, + "grad_norm": 1.4299933509778007, + "learning_rate": 2.414367637941724e-07, + "loss": 0.9526, + "step": 62310 + }, + { + "epoch": 4.829323104343446, + "grad_norm": 1.3653932724466291, + "learning_rate": 2.414755114693119e-07, + "loss": 0.9464, + "step": 62320 + }, + { + "epoch": 4.830098027819753, + "grad_norm": 1.3536272436600496, + "learning_rate": 2.4151425914445135e-07, + "loss": 0.9815, + "step": 62330 + }, + { + "epoch": 4.83087295129606, + "grad_norm": 2.982147423371205, + "learning_rate": 2.4155300681959087e-07, + "loss": 0.9682, + "step": 62340 + }, + { + "epoch": 4.831647874772366, + "grad_norm": 1.4235331061129857, + "learning_rate": 2.4159175449473034e-07, + "loss": 0.9638, + "step": 62350 + }, + { + "epoch": 4.832422798248673, + "grad_norm": 1.355897582652019, + "learning_rate": 2.4163050216986986e-07, + "loss": 0.9639, + "step": 62360 + }, + { + "epoch": 4.8331977217249795, + "grad_norm": 1.3447549298678778, + "learning_rate": 2.4166924984500933e-07, + "loss": 0.9473, + "step": 62370 + }, + { + "epoch": 4.833972645201286, + "grad_norm": 1.3544176345754797, + "learning_rate": 2.4170799752014885e-07, + "loss": 0.9428, + "step": 62380 + }, + { + "epoch": 4.834747568677593, + "grad_norm": 1.4402352245933459, + "learning_rate": 2.417467451952883e-07, + "loss": 0.9544, + "step": 62390 + }, + { + "epoch": 4.8355224921539, + "grad_norm": 1.3768565722772923, + "learning_rate": 2.417854928704278e-07, + "loss": 0.9349, + "step": 62400 + }, + { + "epoch": 4.836297415630207, + "grad_norm": 1.3563375225565923, + "learning_rate": 2.418242405455673e-07, + "loss": 0.949, + "step": 62410 + }, + { + "epoch": 4.837072339106514, + "grad_norm": 1.3265249561856163, + "learning_rate": 2.418629882207068e-07, + "loss": 0.9621, + "step": 62420 + }, + { + "epoch": 4.8378472625828195, + "grad_norm": 1.4010121326425535, + "learning_rate": 2.419017358958463e-07, + "loss": 0.9516, + "step": 62430 + }, + { + "epoch": 4.838622186059126, + "grad_norm": 1.2778661611843778, + "learning_rate": 2.4194048357098577e-07, + "loss": 0.9758, + "step": 62440 + }, + { + "epoch": 4.839397109535433, + "grad_norm": 1.4088557085439561, + "learning_rate": 2.419792312461253e-07, + "loss": 0.9847, + "step": 62450 + }, + { + "epoch": 4.84017203301174, + "grad_norm": 1.331093762070099, + "learning_rate": 2.4201797892126476e-07, + "loss": 0.9661, + "step": 62460 + }, + { + "epoch": 4.840946956488047, + "grad_norm": 1.426858975325417, + "learning_rate": 2.4205672659640423e-07, + "loss": 0.952, + "step": 62470 + }, + { + "epoch": 4.8417218799643535, + "grad_norm": 1.4520925580788877, + "learning_rate": 2.4209547427154375e-07, + "loss": 0.9554, + "step": 62480 + }, + { + "epoch": 4.84249680344066, + "grad_norm": 1.5895594247956422, + "learning_rate": 2.421342219466832e-07, + "loss": 0.9609, + "step": 62490 + }, + { + "epoch": 4.843271726916967, + "grad_norm": 1.4756370760389985, + "learning_rate": 2.4217296962182274e-07, + "loss": 0.9567, + "step": 62500 + }, + { + "epoch": 4.843271726916967, + "eval_loss": 0.9590495228767395, + "eval_runtime": 320.477, + "eval_samples_per_second": 35.794, + "eval_steps_per_second": 8.949, + "step": 62500 + }, + { + "epoch": 4.844046650393274, + "grad_norm": 1.4431722648602163, + "learning_rate": 2.422117172969622e-07, + "loss": 0.9444, + "step": 62510 + }, + { + "epoch": 4.844821573869581, + "grad_norm": 1.2900932786751267, + "learning_rate": 2.4225046497210173e-07, + "loss": 0.9871, + "step": 62520 + }, + { + "epoch": 4.845596497345887, + "grad_norm": 1.3867620322106033, + "learning_rate": 2.422892126472412e-07, + "loss": 0.9569, + "step": 62530 + }, + { + "epoch": 4.8463714208221935, + "grad_norm": 1.365149680873058, + "learning_rate": 2.4232796032238067e-07, + "loss": 0.9573, + "step": 62540 + }, + { + "epoch": 4.8471463442985, + "grad_norm": 1.3746356849930277, + "learning_rate": 2.423667079975202e-07, + "loss": 0.9404, + "step": 62550 + }, + { + "epoch": 4.847921267774807, + "grad_norm": 1.3426255344704063, + "learning_rate": 2.4240545567265966e-07, + "loss": 0.9622, + "step": 62560 + }, + { + "epoch": 4.848696191251114, + "grad_norm": 1.4345829909741437, + "learning_rate": 2.424442033477992e-07, + "loss": 0.9713, + "step": 62570 + }, + { + "epoch": 4.849471114727421, + "grad_norm": 1.4281546136765886, + "learning_rate": 2.4248295102293865e-07, + "loss": 0.9605, + "step": 62580 + }, + { + "epoch": 4.850246038203728, + "grad_norm": 1.4267401036783767, + "learning_rate": 2.425216986980782e-07, + "loss": 0.9494, + "step": 62590 + }, + { + "epoch": 4.851020961680034, + "grad_norm": 1.322865284712322, + "learning_rate": 2.4256044637321764e-07, + "loss": 0.9691, + "step": 62600 + }, + { + "epoch": 4.851795885156341, + "grad_norm": 1.3785641146799434, + "learning_rate": 2.425991940483571e-07, + "loss": 0.9336, + "step": 62610 + }, + { + "epoch": 4.852570808632647, + "grad_norm": 1.3928382262292058, + "learning_rate": 2.4263794172349664e-07, + "loss": 0.9511, + "step": 62620 + }, + { + "epoch": 4.853345732108954, + "grad_norm": 1.378985500005626, + "learning_rate": 2.426766893986361e-07, + "loss": 0.9398, + "step": 62630 + }, + { + "epoch": 4.854120655585261, + "grad_norm": 1.3141787869840038, + "learning_rate": 2.4271543707377563e-07, + "loss": 1.0087, + "step": 62640 + }, + { + "epoch": 4.8548955790615675, + "grad_norm": 1.3589973326909845, + "learning_rate": 2.427541847489151e-07, + "loss": 0.9563, + "step": 62650 + }, + { + "epoch": 4.855670502537874, + "grad_norm": 1.4145704092063842, + "learning_rate": 2.427929324240546e-07, + "loss": 0.9651, + "step": 62660 + }, + { + "epoch": 4.856445426014181, + "grad_norm": 1.341047146070368, + "learning_rate": 2.428316800991941e-07, + "loss": 0.965, + "step": 62670 + }, + { + "epoch": 4.857220349490488, + "grad_norm": 1.3943268415470462, + "learning_rate": 2.4287042777433356e-07, + "loss": 0.9627, + "step": 62680 + }, + { + "epoch": 4.857995272966795, + "grad_norm": 1.3017087592298864, + "learning_rate": 2.429091754494731e-07, + "loss": 0.9532, + "step": 62690 + }, + { + "epoch": 4.858770196443102, + "grad_norm": 1.449653289341006, + "learning_rate": 2.4294792312461255e-07, + "loss": 0.9599, + "step": 62700 + }, + { + "epoch": 4.859545119919408, + "grad_norm": 1.4260552662276278, + "learning_rate": 2.4298667079975207e-07, + "loss": 0.9581, + "step": 62710 + }, + { + "epoch": 4.860320043395714, + "grad_norm": 1.4029695230358024, + "learning_rate": 2.4302541847489154e-07, + "loss": 0.9569, + "step": 62720 + }, + { + "epoch": 4.861094966872021, + "grad_norm": 1.3917575490048595, + "learning_rate": 2.43064166150031e-07, + "loss": 0.9655, + "step": 62730 + }, + { + "epoch": 4.861869890348328, + "grad_norm": 1.259699947860297, + "learning_rate": 2.4310291382517053e-07, + "loss": 0.9485, + "step": 62740 + }, + { + "epoch": 4.862644813824635, + "grad_norm": 1.3156758767764498, + "learning_rate": 2.4314166150031e-07, + "loss": 0.9529, + "step": 62750 + }, + { + "epoch": 4.863419737300942, + "grad_norm": 1.3614966140471627, + "learning_rate": 2.431804091754495e-07, + "loss": 0.9496, + "step": 62760 + }, + { + "epoch": 4.864194660777248, + "grad_norm": 1.3561639476022505, + "learning_rate": 2.43219156850589e-07, + "loss": 0.9508, + "step": 62770 + }, + { + "epoch": 4.864969584253555, + "grad_norm": 1.415507878599175, + "learning_rate": 2.432579045257285e-07, + "loss": 0.9649, + "step": 62780 + }, + { + "epoch": 4.865744507729862, + "grad_norm": 1.3414352949026387, + "learning_rate": 2.43296652200868e-07, + "loss": 0.9515, + "step": 62790 + }, + { + "epoch": 4.866519431206168, + "grad_norm": 1.3119034662066291, + "learning_rate": 2.4333539987600745e-07, + "loss": 0.9445, + "step": 62800 + }, + { + "epoch": 4.867294354682475, + "grad_norm": 1.3365452415321801, + "learning_rate": 2.4337414755114697e-07, + "loss": 0.9683, + "step": 62810 + }, + { + "epoch": 4.8680692781587815, + "grad_norm": 1.3408167058388343, + "learning_rate": 2.4341289522628644e-07, + "loss": 0.9868, + "step": 62820 + }, + { + "epoch": 4.868844201635088, + "grad_norm": 1.3788839837693205, + "learning_rate": 2.4345164290142596e-07, + "loss": 0.9566, + "step": 62830 + }, + { + "epoch": 4.869619125111395, + "grad_norm": 1.3371849744159543, + "learning_rate": 2.4349039057656543e-07, + "loss": 0.9509, + "step": 62840 + }, + { + "epoch": 4.870394048587702, + "grad_norm": 1.3932715373466216, + "learning_rate": 2.4352913825170495e-07, + "loss": 0.9912, + "step": 62850 + }, + { + "epoch": 4.871168972064009, + "grad_norm": 1.2872834659760257, + "learning_rate": 2.435678859268444e-07, + "loss": 0.9356, + "step": 62860 + }, + { + "epoch": 4.871943895540316, + "grad_norm": 1.4133041587218385, + "learning_rate": 2.436066336019839e-07, + "loss": 0.9594, + "step": 62870 + }, + { + "epoch": 4.872718819016622, + "grad_norm": 1.335786843086315, + "learning_rate": 2.436453812771234e-07, + "loss": 0.9289, + "step": 62880 + }, + { + "epoch": 4.873493742492929, + "grad_norm": 1.3342291230541006, + "learning_rate": 2.436841289522629e-07, + "loss": 0.9527, + "step": 62890 + }, + { + "epoch": 4.874268665969236, + "grad_norm": 1.2781470091416258, + "learning_rate": 2.437228766274024e-07, + "loss": 0.9566, + "step": 62900 + }, + { + "epoch": 4.875043589445542, + "grad_norm": 1.392944359441407, + "learning_rate": 2.4376162430254187e-07, + "loss": 0.9428, + "step": 62910 + }, + { + "epoch": 4.875818512921849, + "grad_norm": 1.3865391593843934, + "learning_rate": 2.438003719776814e-07, + "loss": 0.9547, + "step": 62920 + }, + { + "epoch": 4.876593436398156, + "grad_norm": 1.3405408928267173, + "learning_rate": 2.4383911965282086e-07, + "loss": 0.9504, + "step": 62930 + }, + { + "epoch": 4.877368359874462, + "grad_norm": 1.3029370651250984, + "learning_rate": 2.4387786732796033e-07, + "loss": 0.9638, + "step": 62940 + }, + { + "epoch": 4.878143283350769, + "grad_norm": 1.4082007311877391, + "learning_rate": 2.4391661500309986e-07, + "loss": 0.9572, + "step": 62950 + }, + { + "epoch": 4.878918206827076, + "grad_norm": 1.3394816733341957, + "learning_rate": 2.439553626782393e-07, + "loss": 0.9466, + "step": 62960 + }, + { + "epoch": 4.879693130303383, + "grad_norm": 1.3792498876135113, + "learning_rate": 2.4399411035337885e-07, + "loss": 0.9472, + "step": 62970 + }, + { + "epoch": 4.88046805377969, + "grad_norm": 1.3787234552913863, + "learning_rate": 2.440328580285183e-07, + "loss": 0.957, + "step": 62980 + }, + { + "epoch": 4.8812429772559955, + "grad_norm": 1.3502810129532956, + "learning_rate": 2.4407160570365784e-07, + "loss": 0.9505, + "step": 62990 + }, + { + "epoch": 4.882017900732302, + "grad_norm": 1.3684564808518962, + "learning_rate": 2.441103533787973e-07, + "loss": 0.9507, + "step": 63000 + }, + { + "epoch": 4.882017900732302, + "eval_loss": 0.9583190679550171, + "eval_runtime": 320.1307, + "eval_samples_per_second": 35.832, + "eval_steps_per_second": 8.959, + "step": 63000 + }, + { + "epoch": 4.882792824208609, + "grad_norm": 1.380459708308932, + "learning_rate": 2.441491010539368e-07, + "loss": 0.9766, + "step": 63010 + }, + { + "epoch": 4.883567747684916, + "grad_norm": 1.426166260124543, + "learning_rate": 2.441878487290763e-07, + "loss": 0.9718, + "step": 63020 + }, + { + "epoch": 4.884342671161223, + "grad_norm": 1.3229243832355497, + "learning_rate": 2.4422659640421577e-07, + "loss": 0.9492, + "step": 63030 + }, + { + "epoch": 4.88511759463753, + "grad_norm": 1.3206654773001487, + "learning_rate": 2.442653440793553e-07, + "loss": 0.9523, + "step": 63040 + }, + { + "epoch": 4.885892518113836, + "grad_norm": 1.356037859752426, + "learning_rate": 2.4430409175449476e-07, + "loss": 0.9858, + "step": 63050 + }, + { + "epoch": 4.886667441590143, + "grad_norm": 1.3820674671459965, + "learning_rate": 2.443428394296343e-07, + "loss": 0.9467, + "step": 63060 + }, + { + "epoch": 4.88744236506645, + "grad_norm": 1.3506074685579381, + "learning_rate": 2.4438158710477375e-07, + "loss": 0.9468, + "step": 63070 + }, + { + "epoch": 4.888217288542757, + "grad_norm": 1.3202036786378648, + "learning_rate": 2.444203347799132e-07, + "loss": 0.943, + "step": 63080 + }, + { + "epoch": 4.888992212019063, + "grad_norm": 1.3528181458787731, + "learning_rate": 2.4445908245505274e-07, + "loss": 0.959, + "step": 63090 + }, + { + "epoch": 4.88976713549537, + "grad_norm": 1.4428043831887292, + "learning_rate": 2.444978301301922e-07, + "loss": 0.9538, + "step": 63100 + }, + { + "epoch": 4.890542058971676, + "grad_norm": 1.405738078214579, + "learning_rate": 2.4453657780533173e-07, + "loss": 0.9568, + "step": 63110 + }, + { + "epoch": 4.891316982447983, + "grad_norm": 1.3172108394780804, + "learning_rate": 2.445753254804712e-07, + "loss": 0.9362, + "step": 63120 + }, + { + "epoch": 4.89209190592429, + "grad_norm": 1.3338358913675084, + "learning_rate": 2.446140731556107e-07, + "loss": 0.9397, + "step": 63130 + }, + { + "epoch": 4.892866829400597, + "grad_norm": 1.3100865381173372, + "learning_rate": 2.446528208307502e-07, + "loss": 0.9748, + "step": 63140 + }, + { + "epoch": 4.893641752876904, + "grad_norm": 1.288519183519175, + "learning_rate": 2.4469156850588966e-07, + "loss": 0.9312, + "step": 63150 + }, + { + "epoch": 4.89441667635321, + "grad_norm": 1.3189344982963025, + "learning_rate": 2.447303161810292e-07, + "loss": 0.9458, + "step": 63160 + }, + { + "epoch": 4.895191599829516, + "grad_norm": 1.3512708939399092, + "learning_rate": 2.4476906385616865e-07, + "loss": 0.9607, + "step": 63170 + }, + { + "epoch": 4.895966523305823, + "grad_norm": 1.3707115700625832, + "learning_rate": 2.4480781153130817e-07, + "loss": 0.9735, + "step": 63180 + }, + { + "epoch": 4.89674144678213, + "grad_norm": 1.3430621214947243, + "learning_rate": 2.4484655920644764e-07, + "loss": 0.9411, + "step": 63190 + }, + { + "epoch": 4.897516370258437, + "grad_norm": 1.3728146487614075, + "learning_rate": 2.4488530688158716e-07, + "loss": 0.9614, + "step": 63200 + }, + { + "epoch": 4.898291293734744, + "grad_norm": 1.2784148186558812, + "learning_rate": 2.4492405455672663e-07, + "loss": 0.9412, + "step": 63210 + }, + { + "epoch": 4.89906621721105, + "grad_norm": 1.4320322218754649, + "learning_rate": 2.449628022318661e-07, + "loss": 0.9535, + "step": 63220 + }, + { + "epoch": 4.899841140687357, + "grad_norm": 1.449827120731276, + "learning_rate": 2.450015499070056e-07, + "loss": 0.9493, + "step": 63230 + }, + { + "epoch": 4.900616064163664, + "grad_norm": 1.370272117911184, + "learning_rate": 2.450402975821451e-07, + "loss": 0.9426, + "step": 63240 + }, + { + "epoch": 4.901390987639971, + "grad_norm": 1.31945916778549, + "learning_rate": 2.450790452572846e-07, + "loss": 0.986, + "step": 63250 + }, + { + "epoch": 4.902165911116278, + "grad_norm": 1.3991501940419435, + "learning_rate": 2.451177929324241e-07, + "loss": 0.9694, + "step": 63260 + }, + { + "epoch": 4.9029408345925845, + "grad_norm": 1.3624889363879513, + "learning_rate": 2.451565406075636e-07, + "loss": 0.9581, + "step": 63270 + }, + { + "epoch": 4.90371575806889, + "grad_norm": 1.3246805257035519, + "learning_rate": 2.451952882827031e-07, + "loss": 0.9792, + "step": 63280 + }, + { + "epoch": 4.904490681545197, + "grad_norm": 1.4647044402004852, + "learning_rate": 2.4523403595784254e-07, + "loss": 0.9418, + "step": 63290 + }, + { + "epoch": 4.905265605021504, + "grad_norm": 1.5756618306215415, + "learning_rate": 2.4527278363298207e-07, + "loss": 0.9785, + "step": 63300 + }, + { + "epoch": 4.906040528497811, + "grad_norm": 1.3018023619602914, + "learning_rate": 2.4531153130812153e-07, + "loss": 0.947, + "step": 63310 + }, + { + "epoch": 4.906815451974118, + "grad_norm": 1.2979017508845165, + "learning_rate": 2.4535027898326106e-07, + "loss": 0.9587, + "step": 63320 + }, + { + "epoch": 4.907590375450424, + "grad_norm": 1.3452792799280078, + "learning_rate": 2.453890266584005e-07, + "loss": 0.9407, + "step": 63330 + }, + { + "epoch": 4.908365298926731, + "grad_norm": 1.3345717914242365, + "learning_rate": 2.4542777433354005e-07, + "loss": 0.9619, + "step": 63340 + }, + { + "epoch": 4.909140222403038, + "grad_norm": 1.3992977497698667, + "learning_rate": 2.454665220086795e-07, + "loss": 0.9462, + "step": 63350 + }, + { + "epoch": 4.909915145879344, + "grad_norm": 1.3727658385527892, + "learning_rate": 2.45505269683819e-07, + "loss": 0.9354, + "step": 63360 + }, + { + "epoch": 4.910690069355651, + "grad_norm": 1.2729580697422775, + "learning_rate": 2.455440173589585e-07, + "loss": 0.9514, + "step": 63370 + }, + { + "epoch": 4.911464992831958, + "grad_norm": 1.4575427913286323, + "learning_rate": 2.45582765034098e-07, + "loss": 0.9584, + "step": 63380 + }, + { + "epoch": 4.912239916308264, + "grad_norm": 1.3152952194686913, + "learning_rate": 2.456215127092375e-07, + "loss": 0.9468, + "step": 63390 + }, + { + "epoch": 4.913014839784571, + "grad_norm": 1.4495031574665462, + "learning_rate": 2.4566026038437697e-07, + "loss": 0.9567, + "step": 63400 + }, + { + "epoch": 4.913789763260878, + "grad_norm": 1.4111815492096917, + "learning_rate": 2.456990080595165e-07, + "loss": 0.9418, + "step": 63410 + }, + { + "epoch": 4.914564686737185, + "grad_norm": 1.3971584063043039, + "learning_rate": 2.4573775573465596e-07, + "loss": 0.9507, + "step": 63420 + }, + { + "epoch": 4.915339610213492, + "grad_norm": 1.3773788789413914, + "learning_rate": 2.4577650340979543e-07, + "loss": 0.9524, + "step": 63430 + }, + { + "epoch": 4.9161145336897984, + "grad_norm": 1.3254737060607542, + "learning_rate": 2.4581525108493495e-07, + "loss": 0.9548, + "step": 63440 + }, + { + "epoch": 4.916889457166105, + "grad_norm": 1.3652334256020153, + "learning_rate": 2.458539987600744e-07, + "loss": 0.9541, + "step": 63450 + }, + { + "epoch": 4.917664380642411, + "grad_norm": 1.35827854841842, + "learning_rate": 2.4589274643521394e-07, + "loss": 0.9546, + "step": 63460 + }, + { + "epoch": 4.918439304118718, + "grad_norm": 1.4695589614602935, + "learning_rate": 2.459314941103534e-07, + "loss": 0.9706, + "step": 63470 + }, + { + "epoch": 4.919214227595025, + "grad_norm": 1.2922002945333197, + "learning_rate": 2.459702417854929e-07, + "loss": 0.9577, + "step": 63480 + }, + { + "epoch": 4.919989151071332, + "grad_norm": 1.331351970452765, + "learning_rate": 2.460089894606324e-07, + "loss": 0.9541, + "step": 63490 + }, + { + "epoch": 4.920764074547638, + "grad_norm": 1.4248264601948355, + "learning_rate": 2.4604773713577187e-07, + "loss": 0.9571, + "step": 63500 + }, + { + "epoch": 4.920764074547638, + "eval_loss": 0.9575965404510498, + "eval_runtime": 318.8413, + "eval_samples_per_second": 35.977, + "eval_steps_per_second": 8.995, + "step": 63500 + }, + { + "epoch": 4.921538998023945, + "grad_norm": 1.3073980425048082, + "learning_rate": 2.460864848109114e-07, + "loss": 0.9289, + "step": 63510 + }, + { + "epoch": 4.922313921500252, + "grad_norm": 1.238008443605443, + "learning_rate": 2.4612523248605086e-07, + "loss": 0.9529, + "step": 63520 + }, + { + "epoch": 4.923088844976559, + "grad_norm": 1.4111675352331299, + "learning_rate": 2.461639801611904e-07, + "loss": 0.9456, + "step": 63530 + }, + { + "epoch": 4.923863768452866, + "grad_norm": 1.3521267665869805, + "learning_rate": 2.4620272783632985e-07, + "loss": 0.9636, + "step": 63540 + }, + { + "epoch": 4.924638691929172, + "grad_norm": 1.3399785793664116, + "learning_rate": 2.462414755114693e-07, + "loss": 0.9692, + "step": 63550 + }, + { + "epoch": 4.925413615405478, + "grad_norm": 1.3927369481519856, + "learning_rate": 2.4628022318660884e-07, + "loss": 0.9518, + "step": 63560 + }, + { + "epoch": 4.926188538881785, + "grad_norm": 1.359293253516432, + "learning_rate": 2.463189708617483e-07, + "loss": 0.9636, + "step": 63570 + }, + { + "epoch": 4.926963462358092, + "grad_norm": 1.3847821209926277, + "learning_rate": 2.4635771853688783e-07, + "loss": 0.944, + "step": 63580 + }, + { + "epoch": 4.927738385834399, + "grad_norm": 1.357415142737555, + "learning_rate": 2.463964662120273e-07, + "loss": 0.9515, + "step": 63590 + }, + { + "epoch": 4.928513309310706, + "grad_norm": 1.3421099827173957, + "learning_rate": 2.464352138871668e-07, + "loss": 0.9701, + "step": 63600 + }, + { + "epoch": 4.9292882327870124, + "grad_norm": 1.3657681024362307, + "learning_rate": 2.464739615623063e-07, + "loss": 0.9589, + "step": 63610 + }, + { + "epoch": 4.930063156263319, + "grad_norm": 1.3294555801649668, + "learning_rate": 2.4651270923744576e-07, + "loss": 0.9389, + "step": 63620 + }, + { + "epoch": 4.930838079739626, + "grad_norm": 1.3405373520242032, + "learning_rate": 2.465514569125853e-07, + "loss": 0.9574, + "step": 63630 + }, + { + "epoch": 4.931613003215933, + "grad_norm": 1.3572071196303779, + "learning_rate": 2.4659020458772475e-07, + "loss": 0.9672, + "step": 63640 + }, + { + "epoch": 4.932387926692239, + "grad_norm": 1.2717342089751784, + "learning_rate": 2.466289522628643e-07, + "loss": 0.9443, + "step": 63650 + }, + { + "epoch": 4.933162850168546, + "grad_norm": 1.3416625069595824, + "learning_rate": 2.4666769993800375e-07, + "loss": 0.9383, + "step": 63660 + }, + { + "epoch": 4.933937773644852, + "grad_norm": 1.410420256441669, + "learning_rate": 2.4670644761314327e-07, + "loss": 0.9613, + "step": 63670 + }, + { + "epoch": 4.934712697121159, + "grad_norm": 1.349664036761444, + "learning_rate": 2.4674519528828274e-07, + "loss": 0.9521, + "step": 63680 + }, + { + "epoch": 4.935487620597466, + "grad_norm": 1.2846023411474012, + "learning_rate": 2.467839429634222e-07, + "loss": 0.985, + "step": 63690 + }, + { + "epoch": 4.936262544073773, + "grad_norm": 1.4143006009198258, + "learning_rate": 2.4682269063856173e-07, + "loss": 0.9568, + "step": 63700 + }, + { + "epoch": 4.93703746755008, + "grad_norm": 1.2364235225625393, + "learning_rate": 2.468614383137012e-07, + "loss": 0.9479, + "step": 63710 + }, + { + "epoch": 4.9378123910263865, + "grad_norm": 1.3461924616544725, + "learning_rate": 2.469001859888407e-07, + "loss": 0.9673, + "step": 63720 + }, + { + "epoch": 4.938587314502692, + "grad_norm": 1.3899897526862919, + "learning_rate": 2.469389336639802e-07, + "loss": 0.9726, + "step": 63730 + }, + { + "epoch": 4.939362237978999, + "grad_norm": 1.3137559072000535, + "learning_rate": 2.469776813391197e-07, + "loss": 0.9828, + "step": 63740 + }, + { + "epoch": 4.940137161455306, + "grad_norm": 1.3729438928647777, + "learning_rate": 2.470164290142592e-07, + "loss": 0.9548, + "step": 63750 + }, + { + "epoch": 4.940912084931613, + "grad_norm": 1.409120490297797, + "learning_rate": 2.4705517668939865e-07, + "loss": 0.9633, + "step": 63760 + }, + { + "epoch": 4.94168700840792, + "grad_norm": 1.3710807300293182, + "learning_rate": 2.4709392436453817e-07, + "loss": 0.957, + "step": 63770 + }, + { + "epoch": 4.9424619318842264, + "grad_norm": 1.3805520815971415, + "learning_rate": 2.4713267203967764e-07, + "loss": 0.9554, + "step": 63780 + }, + { + "epoch": 4.943236855360533, + "grad_norm": 1.3753233792404909, + "learning_rate": 2.4717141971481716e-07, + "loss": 0.9506, + "step": 63790 + }, + { + "epoch": 4.94401177883684, + "grad_norm": 1.4026529670464283, + "learning_rate": 2.4721016738995663e-07, + "loss": 0.9631, + "step": 63800 + }, + { + "epoch": 4.944786702313147, + "grad_norm": 1.3524388519308683, + "learning_rate": 2.4724891506509615e-07, + "loss": 0.9564, + "step": 63810 + }, + { + "epoch": 4.945561625789454, + "grad_norm": 1.2974842493517147, + "learning_rate": 2.472876627402356e-07, + "loss": 0.9586, + "step": 63820 + }, + { + "epoch": 4.9463365492657605, + "grad_norm": 1.4019241781058163, + "learning_rate": 2.473264104153751e-07, + "loss": 0.9528, + "step": 63830 + }, + { + "epoch": 4.947111472742066, + "grad_norm": 1.3776385098254962, + "learning_rate": 2.473651580905146e-07, + "loss": 0.9606, + "step": 63840 + }, + { + "epoch": 4.947886396218373, + "grad_norm": 1.3737952211824611, + "learning_rate": 2.474039057656541e-07, + "loss": 0.9544, + "step": 63850 + }, + { + "epoch": 4.94866131969468, + "grad_norm": 1.2720380987096453, + "learning_rate": 2.474426534407936e-07, + "loss": 0.9681, + "step": 63860 + }, + { + "epoch": 4.949436243170987, + "grad_norm": 1.3704776082986099, + "learning_rate": 2.4748140111593307e-07, + "loss": 0.9568, + "step": 63870 + }, + { + "epoch": 4.950211166647294, + "grad_norm": 1.382030531988024, + "learning_rate": 2.475201487910726e-07, + "loss": 0.9476, + "step": 63880 + }, + { + "epoch": 4.9509860901236005, + "grad_norm": 1.4066577393547193, + "learning_rate": 2.4755889646621206e-07, + "loss": 0.9857, + "step": 63890 + }, + { + "epoch": 4.951761013599907, + "grad_norm": 1.4664112945367362, + "learning_rate": 2.4759764414135153e-07, + "loss": 0.9629, + "step": 63900 + }, + { + "epoch": 4.952535937076214, + "grad_norm": 1.3195007288550382, + "learning_rate": 2.4763639181649105e-07, + "loss": 0.9406, + "step": 63910 + }, + { + "epoch": 4.95331086055252, + "grad_norm": 1.3363156024712224, + "learning_rate": 2.476751394916305e-07, + "loss": 0.9552, + "step": 63920 + }, + { + "epoch": 4.954085784028827, + "grad_norm": 1.3772669598723515, + "learning_rate": 2.4771388716677004e-07, + "loss": 0.9533, + "step": 63930 + }, + { + "epoch": 4.954860707505134, + "grad_norm": 1.3084601592084466, + "learning_rate": 2.477526348419095e-07, + "loss": 0.9566, + "step": 63940 + }, + { + "epoch": 4.95563563098144, + "grad_norm": 1.3914628698210656, + "learning_rate": 2.4779138251704904e-07, + "loss": 0.9546, + "step": 63950 + }, + { + "epoch": 4.956410554457747, + "grad_norm": 1.5653036054395462, + "learning_rate": 2.478301301921885e-07, + "loss": 0.9768, + "step": 63960 + }, + { + "epoch": 4.957185477934054, + "grad_norm": 1.4100159736465332, + "learning_rate": 2.47868877867328e-07, + "loss": 0.949, + "step": 63970 + }, + { + "epoch": 4.957960401410361, + "grad_norm": 1.3225260778762733, + "learning_rate": 2.479076255424675e-07, + "loss": 0.957, + "step": 63980 + }, + { + "epoch": 4.958735324886668, + "grad_norm": 1.326548976539438, + "learning_rate": 2.4794637321760696e-07, + "loss": 0.953, + "step": 63990 + }, + { + "epoch": 4.9595102483629745, + "grad_norm": 1.3977946393268592, + "learning_rate": 2.479851208927465e-07, + "loss": 0.9856, + "step": 64000 + }, + { + "epoch": 4.9595102483629745, + "eval_loss": 0.956969678401947, + "eval_runtime": 319.3525, + "eval_samples_per_second": 35.92, + "eval_steps_per_second": 8.981, + "step": 64000 + }, + { + "epoch": 4.960285171839281, + "grad_norm": 1.398414046171792, + "learning_rate": 2.4802386856788596e-07, + "loss": 0.9563, + "step": 64010 + }, + { + "epoch": 4.961060095315587, + "grad_norm": 1.3694945046684146, + "learning_rate": 2.480626162430255e-07, + "loss": 0.9435, + "step": 64020 + }, + { + "epoch": 4.961835018791894, + "grad_norm": 1.3303553717987617, + "learning_rate": 2.4810136391816495e-07, + "loss": 0.9527, + "step": 64030 + }, + { + "epoch": 4.962609942268201, + "grad_norm": 1.281183683938377, + "learning_rate": 2.481401115933044e-07, + "loss": 0.9515, + "step": 64040 + }, + { + "epoch": 4.963384865744508, + "grad_norm": 1.3520608522465818, + "learning_rate": 2.4817885926844394e-07, + "loss": 0.9724, + "step": 64050 + }, + { + "epoch": 4.9641597892208145, + "grad_norm": 1.3708565038518974, + "learning_rate": 2.482176069435834e-07, + "loss": 0.9574, + "step": 64060 + }, + { + "epoch": 4.964934712697121, + "grad_norm": 1.3743442589675288, + "learning_rate": 2.4825635461872293e-07, + "loss": 0.9502, + "step": 64070 + }, + { + "epoch": 4.965709636173428, + "grad_norm": 1.3712465402484597, + "learning_rate": 2.482951022938624e-07, + "loss": 0.9696, + "step": 64080 + }, + { + "epoch": 4.966484559649735, + "grad_norm": 1.4008323531554705, + "learning_rate": 2.483338499690019e-07, + "loss": 0.9599, + "step": 64090 + }, + { + "epoch": 4.967259483126041, + "grad_norm": 1.3592426047309867, + "learning_rate": 2.483725976441414e-07, + "loss": 0.9584, + "step": 64100 + }, + { + "epoch": 4.968034406602348, + "grad_norm": 1.3284171920731955, + "learning_rate": 2.4841134531928086e-07, + "loss": 0.9546, + "step": 64110 + }, + { + "epoch": 4.968809330078654, + "grad_norm": 1.360872614394953, + "learning_rate": 2.484500929944204e-07, + "loss": 0.9469, + "step": 64120 + }, + { + "epoch": 4.969584253554961, + "grad_norm": 1.2948465438607948, + "learning_rate": 2.4848884066955985e-07, + "loss": 0.9545, + "step": 64130 + }, + { + "epoch": 4.970359177031268, + "grad_norm": 1.4341064357868056, + "learning_rate": 2.4852758834469937e-07, + "loss": 0.9682, + "step": 64140 + }, + { + "epoch": 4.971134100507575, + "grad_norm": 1.2937106412384678, + "learning_rate": 2.4856633601983884e-07, + "loss": 0.9594, + "step": 64150 + }, + { + "epoch": 4.971909023983882, + "grad_norm": 1.4422559174575136, + "learning_rate": 2.4860508369497836e-07, + "loss": 0.9547, + "step": 64160 + }, + { + "epoch": 4.9726839474601885, + "grad_norm": 1.3855864666456048, + "learning_rate": 2.4864383137011783e-07, + "loss": 0.9512, + "step": 64170 + }, + { + "epoch": 4.973458870936495, + "grad_norm": 1.420579667727262, + "learning_rate": 2.486825790452573e-07, + "loss": 0.9597, + "step": 64180 + }, + { + "epoch": 4.974233794412802, + "grad_norm": 1.4077402367963794, + "learning_rate": 2.487213267203968e-07, + "loss": 0.9713, + "step": 64190 + }, + { + "epoch": 4.975008717889109, + "grad_norm": 1.3240838133442898, + "learning_rate": 2.487600743955363e-07, + "loss": 0.9661, + "step": 64200 + }, + { + "epoch": 4.975783641365415, + "grad_norm": 1.4153017599799151, + "learning_rate": 2.487988220706758e-07, + "loss": 0.9494, + "step": 64210 + }, + { + "epoch": 4.976558564841722, + "grad_norm": 1.413447877488274, + "learning_rate": 2.488375697458153e-07, + "loss": 0.9707, + "step": 64220 + }, + { + "epoch": 4.9773334883180285, + "grad_norm": 1.3838744059062555, + "learning_rate": 2.4887631742095475e-07, + "loss": 0.9393, + "step": 64230 + }, + { + "epoch": 4.978108411794335, + "grad_norm": 1.3828053940969278, + "learning_rate": 2.4891506509609427e-07, + "loss": 0.9796, + "step": 64240 + }, + { + "epoch": 4.978883335270642, + "grad_norm": 1.3467612442207055, + "learning_rate": 2.4895381277123374e-07, + "loss": 0.9524, + "step": 64250 + }, + { + "epoch": 4.979658258746949, + "grad_norm": 1.4059942218311687, + "learning_rate": 2.4899256044637326e-07, + "loss": 0.9621, + "step": 64260 + }, + { + "epoch": 4.980433182223256, + "grad_norm": 1.3800600392144204, + "learning_rate": 2.4903130812151273e-07, + "loss": 0.9514, + "step": 64270 + }, + { + "epoch": 4.9812081056995625, + "grad_norm": 1.490588596113473, + "learning_rate": 2.4907005579665226e-07, + "loss": 0.9519, + "step": 64280 + }, + { + "epoch": 4.981983029175868, + "grad_norm": 1.4879178510252786, + "learning_rate": 2.491088034717917e-07, + "loss": 0.9743, + "step": 64290 + }, + { + "epoch": 4.982757952652175, + "grad_norm": 1.397864787552006, + "learning_rate": 2.491475511469312e-07, + "loss": 0.9382, + "step": 64300 + }, + { + "epoch": 4.983532876128482, + "grad_norm": 1.3729166244885742, + "learning_rate": 2.491862988220707e-07, + "loss": 0.9605, + "step": 64310 + }, + { + "epoch": 4.984307799604789, + "grad_norm": 1.3027066121084834, + "learning_rate": 2.492250464972102e-07, + "loss": 0.9814, + "step": 64320 + }, + { + "epoch": 4.985082723081096, + "grad_norm": 1.2978801833936107, + "learning_rate": 2.492637941723497e-07, + "loss": 0.9542, + "step": 64330 + }, + { + "epoch": 4.9858576465574025, + "grad_norm": 1.4154674898784079, + "learning_rate": 2.493025418474892e-07, + "loss": 0.9533, + "step": 64340 + }, + { + "epoch": 4.986632570033709, + "grad_norm": 1.3967826753012935, + "learning_rate": 2.493412895226287e-07, + "loss": 0.9572, + "step": 64350 + }, + { + "epoch": 4.987407493510016, + "grad_norm": 1.4311446175188658, + "learning_rate": 2.4938003719776817e-07, + "loss": 0.976, + "step": 64360 + }, + { + "epoch": 4.988182416986323, + "grad_norm": 1.5228130522681833, + "learning_rate": 2.4941878487290764e-07, + "loss": 0.9437, + "step": 64370 + }, + { + "epoch": 4.98895734046263, + "grad_norm": 1.3979975685713468, + "learning_rate": 2.4945753254804716e-07, + "loss": 0.9612, + "step": 64380 + }, + { + "epoch": 4.989732263938936, + "grad_norm": 1.4420106469408738, + "learning_rate": 2.494962802231866e-07, + "loss": 0.9439, + "step": 64390 + }, + { + "epoch": 4.9905071874152425, + "grad_norm": 1.3056273869787405, + "learning_rate": 2.4953502789832615e-07, + "loss": 0.9565, + "step": 64400 + }, + { + "epoch": 4.991282110891549, + "grad_norm": 1.3197003737259412, + "learning_rate": 2.495737755734656e-07, + "loss": 0.9568, + "step": 64410 + }, + { + "epoch": 4.992057034367856, + "grad_norm": 1.3379606344262152, + "learning_rate": 2.4961252324860514e-07, + "loss": 0.9509, + "step": 64420 + }, + { + "epoch": 4.992831957844163, + "grad_norm": 1.3540731874399001, + "learning_rate": 2.496512709237446e-07, + "loss": 0.9437, + "step": 64430 + }, + { + "epoch": 4.99360688132047, + "grad_norm": 1.262808546522824, + "learning_rate": 2.496900185988841e-07, + "loss": 0.9541, + "step": 64440 + }, + { + "epoch": 4.9943818047967765, + "grad_norm": 1.4305030898179403, + "learning_rate": 2.497287662740236e-07, + "loss": 0.9616, + "step": 64450 + }, + { + "epoch": 4.995156728273083, + "grad_norm": 1.3612700390209458, + "learning_rate": 2.4976751394916307e-07, + "loss": 0.9465, + "step": 64460 + }, + { + "epoch": 4.99593165174939, + "grad_norm": 1.334523384129756, + "learning_rate": 2.498062616243026e-07, + "loss": 0.9544, + "step": 64470 + }, + { + "epoch": 4.996706575225696, + "grad_norm": 1.380730731850538, + "learning_rate": 2.4984500929944206e-07, + "loss": 0.9444, + "step": 64480 + }, + { + "epoch": 4.997481498702003, + "grad_norm": 1.4086051528962462, + "learning_rate": 2.498837569745816e-07, + "loss": 0.9523, + "step": 64490 + }, + { + "epoch": 4.99825642217831, + "grad_norm": 1.3171896091139965, + "learning_rate": 2.4992250464972105e-07, + "loss": 0.9556, + "step": 64500 + }, + { + "epoch": 4.99825642217831, + "eval_loss": 0.9562982320785522, + "eval_runtime": 321.4441, + "eval_samples_per_second": 35.686, + "eval_steps_per_second": 8.922, + "step": 64500 + }, + { + "epoch": 4.9990313456546165, + "grad_norm": 1.4537542078889538, + "learning_rate": 2.499612523248605e-07, + "loss": 0.9522, + "step": 64510 + }, + { + "epoch": 4.999806269130923, + "grad_norm": 1.3935431654669392, + "learning_rate": 2.5000000000000004e-07, + "loss": 0.9627, + "step": 64520 + }, + { + "epoch": 5.00058119260723, + "grad_norm": 1.3165472966552176, + "learning_rate": 2.500387476751395e-07, + "loss": 0.9761, + "step": 64530 + }, + { + "epoch": 5.001356116083537, + "grad_norm": 1.3452913921699219, + "learning_rate": 2.5007749535027903e-07, + "loss": 0.9287, + "step": 64540 + }, + { + "epoch": 5.002131039559844, + "grad_norm": 1.3595377605405299, + "learning_rate": 2.501162430254185e-07, + "loss": 0.9835, + "step": 64550 + }, + { + "epoch": 5.0029059630361505, + "grad_norm": 1.3633014100925998, + "learning_rate": 2.50154990700558e-07, + "loss": 0.9596, + "step": 64560 + }, + { + "epoch": 5.0036808865124565, + "grad_norm": 1.436536589122772, + "learning_rate": 2.501937383756975e-07, + "loss": 0.9547, + "step": 64570 + }, + { + "epoch": 5.004455809988763, + "grad_norm": 1.3365622044563192, + "learning_rate": 2.5023248605083696e-07, + "loss": 0.938, + "step": 64580 + }, + { + "epoch": 5.00523073346507, + "grad_norm": 1.3220633047966361, + "learning_rate": 2.502712337259765e-07, + "loss": 0.9481, + "step": 64590 + }, + { + "epoch": 5.006005656941377, + "grad_norm": 1.410362037421605, + "learning_rate": 2.5030998140111595e-07, + "loss": 0.9601, + "step": 64600 + }, + { + "epoch": 5.006780580417684, + "grad_norm": 1.3398800024532347, + "learning_rate": 2.503487290762555e-07, + "loss": 0.9287, + "step": 64610 + }, + { + "epoch": 5.0075555038939905, + "grad_norm": 1.3098894961236889, + "learning_rate": 2.5038747675139494e-07, + "loss": 0.9508, + "step": 64620 + }, + { + "epoch": 5.008330427370297, + "grad_norm": 1.3488208417023666, + "learning_rate": 2.5042622442653447e-07, + "loss": 0.9627, + "step": 64630 + }, + { + "epoch": 5.009105350846604, + "grad_norm": 1.4139057474366437, + "learning_rate": 2.5046497210167393e-07, + "loss": 0.9776, + "step": 64640 + }, + { + "epoch": 5.009880274322911, + "grad_norm": 1.2876930462781535, + "learning_rate": 2.505037197768134e-07, + "loss": 0.95, + "step": 64650 + }, + { + "epoch": 5.010655197799218, + "grad_norm": 1.4841895871763682, + "learning_rate": 2.505424674519529e-07, + "loss": 0.9557, + "step": 64660 + }, + { + "epoch": 5.011430121275524, + "grad_norm": 1.3399220471956996, + "learning_rate": 2.505812151270924e-07, + "loss": 0.9564, + "step": 64670 + }, + { + "epoch": 5.0122050447518305, + "grad_norm": 1.4007617672575994, + "learning_rate": 2.506199628022319e-07, + "loss": 0.9381, + "step": 64680 + }, + { + "epoch": 5.012979968228137, + "grad_norm": 1.3550608344831514, + "learning_rate": 2.506587104773714e-07, + "loss": 0.9299, + "step": 64690 + }, + { + "epoch": 5.013754891704444, + "grad_norm": 1.2870991791026762, + "learning_rate": 2.506974581525109e-07, + "loss": 0.9582, + "step": 64700 + }, + { + "epoch": 5.014529815180751, + "grad_norm": 1.5379064147006078, + "learning_rate": 2.507362058276504e-07, + "loss": 0.9486, + "step": 64710 + }, + { + "epoch": 5.015304738657058, + "grad_norm": 1.359468919143637, + "learning_rate": 2.5077495350278985e-07, + "loss": 0.9535, + "step": 64720 + }, + { + "epoch": 5.0160796621333645, + "grad_norm": 1.401027420639109, + "learning_rate": 2.5081370117792937e-07, + "loss": 0.9561, + "step": 64730 + }, + { + "epoch": 5.016854585609671, + "grad_norm": 1.42097070506364, + "learning_rate": 2.5085244885306884e-07, + "loss": 0.9602, + "step": 64740 + }, + { + "epoch": 5.017629509085978, + "grad_norm": 1.3896633106553806, + "learning_rate": 2.5089119652820836e-07, + "loss": 0.9711, + "step": 64750 + }, + { + "epoch": 5.018404432562284, + "grad_norm": 1.3798434696984405, + "learning_rate": 2.5092994420334783e-07, + "loss": 0.9422, + "step": 64760 + }, + { + "epoch": 5.019179356038591, + "grad_norm": 1.2614510534578782, + "learning_rate": 2.5096869187848735e-07, + "loss": 0.9361, + "step": 64770 + }, + { + "epoch": 5.019954279514898, + "grad_norm": 1.347659829404573, + "learning_rate": 2.510074395536268e-07, + "loss": 0.9567, + "step": 64780 + }, + { + "epoch": 5.0207292029912045, + "grad_norm": 1.3651347491517778, + "learning_rate": 2.510461872287663e-07, + "loss": 0.9877, + "step": 64790 + }, + { + "epoch": 5.021504126467511, + "grad_norm": 1.3404701912640682, + "learning_rate": 2.510849349039058e-07, + "loss": 0.9715, + "step": 64800 + }, + { + "epoch": 5.022279049943818, + "grad_norm": 1.4255253210777132, + "learning_rate": 2.511236825790453e-07, + "loss": 0.9608, + "step": 64810 + }, + { + "epoch": 5.023053973420125, + "grad_norm": 1.400367966049376, + "learning_rate": 2.511624302541848e-07, + "loss": 0.9423, + "step": 64820 + }, + { + "epoch": 5.023828896896432, + "grad_norm": 1.274805408794963, + "learning_rate": 2.5120117792932427e-07, + "loss": 0.9553, + "step": 64830 + }, + { + "epoch": 5.024603820372739, + "grad_norm": 1.4155955579498092, + "learning_rate": 2.512399256044638e-07, + "loss": 0.9821, + "step": 64840 + }, + { + "epoch": 5.0253787438490445, + "grad_norm": 1.3861523946050585, + "learning_rate": 2.5127867327960326e-07, + "loss": 0.9447, + "step": 64850 + }, + { + "epoch": 5.026153667325351, + "grad_norm": 1.317943522645267, + "learning_rate": 2.5131742095474273e-07, + "loss": 0.961, + "step": 64860 + }, + { + "epoch": 5.026928590801658, + "grad_norm": 1.327759835513219, + "learning_rate": 2.5135616862988225e-07, + "loss": 0.9498, + "step": 64870 + }, + { + "epoch": 5.027703514277965, + "grad_norm": 1.3606697292350582, + "learning_rate": 2.513949163050217e-07, + "loss": 0.9408, + "step": 64880 + }, + { + "epoch": 5.028478437754272, + "grad_norm": 1.3500797743063822, + "learning_rate": 2.5143366398016124e-07, + "loss": 0.9402, + "step": 64890 + }, + { + "epoch": 5.0292533612305785, + "grad_norm": 1.3518629477862303, + "learning_rate": 2.514724116553007e-07, + "loss": 0.952, + "step": 64900 + }, + { + "epoch": 5.030028284706885, + "grad_norm": 1.3596355654141825, + "learning_rate": 2.5151115933044023e-07, + "loss": 0.9454, + "step": 64910 + }, + { + "epoch": 5.030803208183192, + "grad_norm": 1.4254424139613506, + "learning_rate": 2.515499070055797e-07, + "loss": 0.9453, + "step": 64920 + }, + { + "epoch": 5.031578131659499, + "grad_norm": 1.307979242291776, + "learning_rate": 2.5158865468071917e-07, + "loss": 0.948, + "step": 64930 + }, + { + "epoch": 5.032353055135806, + "grad_norm": 1.4266639267648142, + "learning_rate": 2.516274023558587e-07, + "loss": 0.9423, + "step": 64940 + }, + { + "epoch": 5.033127978612112, + "grad_norm": 1.33778997893355, + "learning_rate": 2.5166615003099816e-07, + "loss": 0.9511, + "step": 64950 + }, + { + "epoch": 5.0339029020884185, + "grad_norm": 1.3920868810638227, + "learning_rate": 2.517048977061377e-07, + "loss": 0.9635, + "step": 64960 + }, + { + "epoch": 5.034677825564725, + "grad_norm": 1.3504838253827014, + "learning_rate": 2.5174364538127715e-07, + "loss": 0.9511, + "step": 64970 + }, + { + "epoch": 5.035452749041032, + "grad_norm": 1.3456793665469147, + "learning_rate": 2.517823930564166e-07, + "loss": 0.9697, + "step": 64980 + }, + { + "epoch": 5.036227672517339, + "grad_norm": 1.3436126328454363, + "learning_rate": 2.5182114073155615e-07, + "loss": 0.9645, + "step": 64990 + }, + { + "epoch": 5.037002595993646, + "grad_norm": 1.4457192549016145, + "learning_rate": 2.518598884066956e-07, + "loss": 0.9486, + "step": 65000 + }, + { + "epoch": 5.037002595993646, + "eval_loss": 0.9557762742042542, + "eval_runtime": 319.5842, + "eval_samples_per_second": 35.894, + "eval_steps_per_second": 8.974, + "step": 65000 + }, + { + "epoch": 5.037777519469953, + "grad_norm": 1.3191453373077522, + "learning_rate": 2.5189863608183514e-07, + "loss": 0.9405, + "step": 65010 + }, + { + "epoch": 5.038552442946259, + "grad_norm": 1.375043957831858, + "learning_rate": 2.519373837569746e-07, + "loss": 0.9406, + "step": 65020 + }, + { + "epoch": 5.039327366422566, + "grad_norm": 1.4014751778428594, + "learning_rate": 2.5197613143211413e-07, + "loss": 0.9526, + "step": 65030 + }, + { + "epoch": 5.040102289898872, + "grad_norm": 1.3387950682249468, + "learning_rate": 2.520148791072536e-07, + "loss": 0.9503, + "step": 65040 + }, + { + "epoch": 5.040877213375179, + "grad_norm": 1.358499385142714, + "learning_rate": 2.5205362678239307e-07, + "loss": 0.9208, + "step": 65050 + }, + { + "epoch": 5.041652136851486, + "grad_norm": 1.2983208577925844, + "learning_rate": 2.520923744575326e-07, + "loss": 0.9477, + "step": 65060 + }, + { + "epoch": 5.0424270603277925, + "grad_norm": 1.3136304978756386, + "learning_rate": 2.5213112213267206e-07, + "loss": 0.9485, + "step": 65070 + }, + { + "epoch": 5.043201983804099, + "grad_norm": 1.4506017348122808, + "learning_rate": 2.521698698078116e-07, + "loss": 0.9427, + "step": 65080 + }, + { + "epoch": 5.043976907280406, + "grad_norm": 1.408612785763299, + "learning_rate": 2.5220861748295105e-07, + "loss": 0.9498, + "step": 65090 + }, + { + "epoch": 5.044751830756713, + "grad_norm": 1.3657360766522102, + "learning_rate": 2.5224736515809057e-07, + "loss": 0.9609, + "step": 65100 + }, + { + "epoch": 5.04552675423302, + "grad_norm": 1.3332076689563042, + "learning_rate": 2.5228611283323004e-07, + "loss": 0.9573, + "step": 65110 + }, + { + "epoch": 5.046301677709327, + "grad_norm": 1.4326677145493885, + "learning_rate": 2.523248605083695e-07, + "loss": 0.9432, + "step": 65120 + }, + { + "epoch": 5.0470766011856325, + "grad_norm": 1.313395214351576, + "learning_rate": 2.5236360818350903e-07, + "loss": 0.9597, + "step": 65130 + }, + { + "epoch": 5.047851524661939, + "grad_norm": 1.4295142275188009, + "learning_rate": 2.524023558586485e-07, + "loss": 0.9459, + "step": 65140 + }, + { + "epoch": 5.048626448138246, + "grad_norm": 1.4349426854963203, + "learning_rate": 2.52441103533788e-07, + "loss": 0.9558, + "step": 65150 + }, + { + "epoch": 5.049401371614553, + "grad_norm": 1.3921350574965468, + "learning_rate": 2.524798512089275e-07, + "loss": 0.9387, + "step": 65160 + }, + { + "epoch": 5.05017629509086, + "grad_norm": 1.390361125018697, + "learning_rate": 2.52518598884067e-07, + "loss": 0.957, + "step": 65170 + }, + { + "epoch": 5.050951218567167, + "grad_norm": 1.3609317803029544, + "learning_rate": 2.525573465592065e-07, + "loss": 0.954, + "step": 65180 + }, + { + "epoch": 5.051726142043473, + "grad_norm": 1.3191004915151625, + "learning_rate": 2.5259609423434595e-07, + "loss": 0.9396, + "step": 65190 + }, + { + "epoch": 5.05250106551978, + "grad_norm": 1.337814360720657, + "learning_rate": 2.5263484190948547e-07, + "loss": 0.9449, + "step": 65200 + }, + { + "epoch": 5.053275988996087, + "grad_norm": 1.394387162594155, + "learning_rate": 2.5267358958462494e-07, + "loss": 0.9482, + "step": 65210 + }, + { + "epoch": 5.054050912472393, + "grad_norm": 1.382215113567365, + "learning_rate": 2.5271233725976446e-07, + "loss": 0.9641, + "step": 65220 + }, + { + "epoch": 5.0548258359487, + "grad_norm": 1.421070368517389, + "learning_rate": 2.5275108493490393e-07, + "loss": 0.9419, + "step": 65230 + }, + { + "epoch": 5.0556007594250065, + "grad_norm": 1.3722199080093966, + "learning_rate": 2.5278983261004345e-07, + "loss": 0.9604, + "step": 65240 + }, + { + "epoch": 5.056375682901313, + "grad_norm": 1.3942857290772364, + "learning_rate": 2.528285802851829e-07, + "loss": 0.9562, + "step": 65250 + }, + { + "epoch": 5.05715060637762, + "grad_norm": 1.4269837451939662, + "learning_rate": 2.528673279603224e-07, + "loss": 0.9952, + "step": 65260 + }, + { + "epoch": 5.057925529853927, + "grad_norm": 1.396774354882333, + "learning_rate": 2.529060756354619e-07, + "loss": 0.9753, + "step": 65270 + }, + { + "epoch": 5.058700453330234, + "grad_norm": 1.3632334300935798, + "learning_rate": 2.529448233106014e-07, + "loss": 0.9565, + "step": 65280 + }, + { + "epoch": 5.059475376806541, + "grad_norm": 1.4032911295837138, + "learning_rate": 2.529835709857409e-07, + "loss": 0.9726, + "step": 65290 + }, + { + "epoch": 5.060250300282847, + "grad_norm": 1.3105443627497273, + "learning_rate": 2.530223186608804e-07, + "loss": 0.9503, + "step": 65300 + }, + { + "epoch": 5.061025223759154, + "grad_norm": 1.4180236951646306, + "learning_rate": 2.530610663360199e-07, + "loss": 0.9607, + "step": 65310 + }, + { + "epoch": 5.06180014723546, + "grad_norm": 1.3895342477883004, + "learning_rate": 2.5309981401115936e-07, + "loss": 0.9683, + "step": 65320 + }, + { + "epoch": 5.062575070711767, + "grad_norm": 1.3856559232257926, + "learning_rate": 2.5313856168629883e-07, + "loss": 0.9686, + "step": 65330 + }, + { + "epoch": 5.063349994188074, + "grad_norm": 1.37701982578318, + "learning_rate": 2.5317730936143836e-07, + "loss": 0.9439, + "step": 65340 + }, + { + "epoch": 5.0641249176643806, + "grad_norm": 1.287626747441775, + "learning_rate": 2.532160570365778e-07, + "loss": 0.9295, + "step": 65350 + }, + { + "epoch": 5.064899841140687, + "grad_norm": 1.3578815058019644, + "learning_rate": 2.5325480471171735e-07, + "loss": 0.9536, + "step": 65360 + }, + { + "epoch": 5.065674764616994, + "grad_norm": 1.3582314124835935, + "learning_rate": 2.532935523868568e-07, + "loss": 0.9377, + "step": 65370 + }, + { + "epoch": 5.066449688093301, + "grad_norm": 1.3209712822902713, + "learning_rate": 2.5333230006199634e-07, + "loss": 0.9465, + "step": 65380 + }, + { + "epoch": 5.067224611569608, + "grad_norm": 1.4025423643561472, + "learning_rate": 2.533710477371358e-07, + "loss": 0.9618, + "step": 65390 + }, + { + "epoch": 5.067999535045915, + "grad_norm": 1.3340403365366744, + "learning_rate": 2.534097954122753e-07, + "loss": 0.953, + "step": 65400 + }, + { + "epoch": 5.0687744585222205, + "grad_norm": 1.3545185892495548, + "learning_rate": 2.534485430874148e-07, + "loss": 0.9694, + "step": 65410 + }, + { + "epoch": 5.069549381998527, + "grad_norm": 1.3571251877794008, + "learning_rate": 2.5348729076255427e-07, + "loss": 0.9426, + "step": 65420 + }, + { + "epoch": 5.070324305474834, + "grad_norm": 1.4299945529994775, + "learning_rate": 2.535260384376938e-07, + "loss": 0.979, + "step": 65430 + }, + { + "epoch": 5.071099228951141, + "grad_norm": 1.4056893877251302, + "learning_rate": 2.5356478611283326e-07, + "loss": 0.9601, + "step": 65440 + }, + { + "epoch": 5.071874152427448, + "grad_norm": 1.3907082323449753, + "learning_rate": 2.536035337879728e-07, + "loss": 0.9648, + "step": 65450 + }, + { + "epoch": 5.072649075903755, + "grad_norm": 1.4206763992386386, + "learning_rate": 2.5364228146311225e-07, + "loss": 0.9349, + "step": 65460 + }, + { + "epoch": 5.073423999380061, + "grad_norm": 1.3759389749807678, + "learning_rate": 2.536810291382517e-07, + "loss": 0.9571, + "step": 65470 + }, + { + "epoch": 5.074198922856368, + "grad_norm": 1.3625067411022775, + "learning_rate": 2.5371977681339124e-07, + "loss": 0.9448, + "step": 65480 + }, + { + "epoch": 5.074973846332675, + "grad_norm": 1.4197046068843657, + "learning_rate": 2.537585244885307e-07, + "loss": 0.9452, + "step": 65490 + }, + { + "epoch": 5.075748769808981, + "grad_norm": 1.4187255672470553, + "learning_rate": 2.5379727216367023e-07, + "loss": 0.9418, + "step": 65500 + }, + { + "epoch": 5.075748769808981, + "eval_loss": 0.9552573561668396, + "eval_runtime": 318.9616, + "eval_samples_per_second": 35.964, + "eval_steps_per_second": 8.992, + "step": 65500 + }, + { + "epoch": 5.076523693285288, + "grad_norm": 1.3473688043537102, + "learning_rate": 2.538360198388097e-07, + "loss": 0.9566, + "step": 65510 + }, + { + "epoch": 5.0772986167615946, + "grad_norm": 1.4317131236235543, + "learning_rate": 2.538747675139492e-07, + "loss": 0.9537, + "step": 65520 + }, + { + "epoch": 5.078073540237901, + "grad_norm": 1.40902652417253, + "learning_rate": 2.539135151890887e-07, + "loss": 0.9587, + "step": 65530 + }, + { + "epoch": 5.078848463714208, + "grad_norm": 1.4722572011312882, + "learning_rate": 2.5395226286422816e-07, + "loss": 0.951, + "step": 65540 + }, + { + "epoch": 5.079623387190515, + "grad_norm": 1.4390885830238807, + "learning_rate": 2.539910105393677e-07, + "loss": 0.9712, + "step": 65550 + }, + { + "epoch": 5.080398310666822, + "grad_norm": 1.4322038592669082, + "learning_rate": 2.5402975821450715e-07, + "loss": 0.938, + "step": 65560 + }, + { + "epoch": 5.081173234143129, + "grad_norm": 1.3211945873785156, + "learning_rate": 2.5406850588964667e-07, + "loss": 0.9435, + "step": 65570 + }, + { + "epoch": 5.081948157619435, + "grad_norm": 1.2920016126874339, + "learning_rate": 2.5410725356478614e-07, + "loss": 0.9526, + "step": 65580 + }, + { + "epoch": 5.082723081095742, + "grad_norm": 1.3778435915240166, + "learning_rate": 2.5414600123992566e-07, + "loss": 0.9491, + "step": 65590 + }, + { + "epoch": 5.083498004572048, + "grad_norm": 1.4290394116848233, + "learning_rate": 2.5418474891506513e-07, + "loss": 0.9775, + "step": 65600 + }, + { + "epoch": 5.084272928048355, + "grad_norm": 1.3184480852238847, + "learning_rate": 2.542234965902046e-07, + "loss": 0.951, + "step": 65610 + }, + { + "epoch": 5.085047851524662, + "grad_norm": 1.3485006131166375, + "learning_rate": 2.542622442653441e-07, + "loss": 0.9419, + "step": 65620 + }, + { + "epoch": 5.085822775000969, + "grad_norm": 1.3772595649927273, + "learning_rate": 2.543009919404836e-07, + "loss": 0.9289, + "step": 65630 + }, + { + "epoch": 5.086597698477275, + "grad_norm": 1.3579215699825664, + "learning_rate": 2.543397396156231e-07, + "loss": 0.9307, + "step": 65640 + }, + { + "epoch": 5.087372621953582, + "grad_norm": 1.3303853773925183, + "learning_rate": 2.543784872907626e-07, + "loss": 0.9531, + "step": 65650 + }, + { + "epoch": 5.088147545429889, + "grad_norm": 1.4364776282585725, + "learning_rate": 2.5441723496590205e-07, + "loss": 0.9648, + "step": 65660 + }, + { + "epoch": 5.088922468906196, + "grad_norm": 1.3396862136753411, + "learning_rate": 2.544559826410416e-07, + "loss": 0.9494, + "step": 65670 + }, + { + "epoch": 5.089697392382503, + "grad_norm": 3.8024659628418345, + "learning_rate": 2.5449473031618104e-07, + "loss": 0.9516, + "step": 65680 + }, + { + "epoch": 5.0904723158588085, + "grad_norm": 1.4159121989081227, + "learning_rate": 2.5453347799132057e-07, + "loss": 0.9638, + "step": 65690 + }, + { + "epoch": 5.091247239335115, + "grad_norm": 1.3392835363577291, + "learning_rate": 2.5457222566646004e-07, + "loss": 0.966, + "step": 65700 + }, + { + "epoch": 5.092022162811422, + "grad_norm": 1.367302185550415, + "learning_rate": 2.5461097334159956e-07, + "loss": 0.9338, + "step": 65710 + }, + { + "epoch": 5.092797086287729, + "grad_norm": 1.4287875972545467, + "learning_rate": 2.54649721016739e-07, + "loss": 0.9592, + "step": 65720 + }, + { + "epoch": 5.093572009764036, + "grad_norm": 1.3313167486437483, + "learning_rate": 2.546884686918785e-07, + "loss": 0.9606, + "step": 65730 + }, + { + "epoch": 5.094346933240343, + "grad_norm": 1.307929211265144, + "learning_rate": 2.54727216367018e-07, + "loss": 0.9394, + "step": 65740 + }, + { + "epoch": 5.095121856716649, + "grad_norm": 1.3096323943520674, + "learning_rate": 2.547659640421575e-07, + "loss": 0.9347, + "step": 65750 + }, + { + "epoch": 5.095896780192956, + "grad_norm": 1.3413364924955968, + "learning_rate": 2.54804711717297e-07, + "loss": 0.9548, + "step": 65760 + }, + { + "epoch": 5.096671703669263, + "grad_norm": 1.3763819112886404, + "learning_rate": 2.548434593924365e-07, + "loss": 0.9705, + "step": 65770 + }, + { + "epoch": 5.097446627145569, + "grad_norm": 1.3310182845864837, + "learning_rate": 2.54882207067576e-07, + "loss": 0.9465, + "step": 65780 + }, + { + "epoch": 5.098221550621876, + "grad_norm": 1.3241286764462696, + "learning_rate": 2.5492095474271547e-07, + "loss": 0.9325, + "step": 65790 + }, + { + "epoch": 5.098996474098183, + "grad_norm": 1.3594983490852823, + "learning_rate": 2.5495970241785494e-07, + "loss": 0.9557, + "step": 65800 + }, + { + "epoch": 5.099771397574489, + "grad_norm": 1.4201636512037044, + "learning_rate": 2.5499845009299446e-07, + "loss": 0.9601, + "step": 65810 + }, + { + "epoch": 5.100546321050796, + "grad_norm": 1.3471533008541754, + "learning_rate": 2.5503719776813393e-07, + "loss": 0.9533, + "step": 65820 + }, + { + "epoch": 5.101321244527103, + "grad_norm": 1.3388597948340626, + "learning_rate": 2.5507594544327345e-07, + "loss": 0.9529, + "step": 65830 + }, + { + "epoch": 5.10209616800341, + "grad_norm": 1.4805687059772972, + "learning_rate": 2.551146931184129e-07, + "loss": 0.9418, + "step": 65840 + }, + { + "epoch": 5.102871091479717, + "grad_norm": 1.3684601221774142, + "learning_rate": 2.5515344079355244e-07, + "loss": 0.9366, + "step": 65850 + }, + { + "epoch": 5.103646014956023, + "grad_norm": 1.3848527571004616, + "learning_rate": 2.551921884686919e-07, + "loss": 0.976, + "step": 65860 + }, + { + "epoch": 5.10442093843233, + "grad_norm": 1.384060983346306, + "learning_rate": 2.552309361438314e-07, + "loss": 0.9546, + "step": 65870 + }, + { + "epoch": 5.105195861908636, + "grad_norm": 1.4674030996673055, + "learning_rate": 2.552696838189709e-07, + "loss": 0.9547, + "step": 65880 + }, + { + "epoch": 5.105970785384943, + "grad_norm": 1.3114768168902873, + "learning_rate": 2.5530843149411037e-07, + "loss": 0.9641, + "step": 65890 + }, + { + "epoch": 5.10674570886125, + "grad_norm": 1.364519526360212, + "learning_rate": 2.553471791692499e-07, + "loss": 0.9742, + "step": 65900 + }, + { + "epoch": 5.107520632337557, + "grad_norm": 1.389007259006637, + "learning_rate": 2.5538592684438936e-07, + "loss": 0.9525, + "step": 65910 + }, + { + "epoch": 5.108295555813863, + "grad_norm": 1.287126129132587, + "learning_rate": 2.554246745195289e-07, + "loss": 0.949, + "step": 65920 + }, + { + "epoch": 5.10907047929017, + "grad_norm": 1.414302368278361, + "learning_rate": 2.5546342219466835e-07, + "loss": 0.9319, + "step": 65930 + }, + { + "epoch": 5.109845402766477, + "grad_norm": 1.3302078770527297, + "learning_rate": 2.555021698698078e-07, + "loss": 0.9687, + "step": 65940 + }, + { + "epoch": 5.110620326242784, + "grad_norm": 1.3301491951884854, + "learning_rate": 2.5554091754494734e-07, + "loss": 0.9676, + "step": 65950 + }, + { + "epoch": 5.111395249719091, + "grad_norm": 1.3341686324324906, + "learning_rate": 2.555796652200868e-07, + "loss": 0.9441, + "step": 65960 + }, + { + "epoch": 5.112170173195397, + "grad_norm": 1.3377555212817624, + "learning_rate": 2.5561841289522633e-07, + "loss": 0.9641, + "step": 65970 + }, + { + "epoch": 5.112945096671703, + "grad_norm": 1.369229371652042, + "learning_rate": 2.556571605703658e-07, + "loss": 0.9429, + "step": 65980 + }, + { + "epoch": 5.11372002014801, + "grad_norm": 1.2582544455260325, + "learning_rate": 2.556959082455053e-07, + "loss": 0.96, + "step": 65990 + }, + { + "epoch": 5.114494943624317, + "grad_norm": 1.296641532826321, + "learning_rate": 2.557346559206448e-07, + "loss": 0.9527, + "step": 66000 + }, + { + "epoch": 5.114494943624317, + "eval_loss": 0.954459011554718, + "eval_runtime": 319.937, + "eval_samples_per_second": 35.854, + "eval_steps_per_second": 8.964, + "step": 66000 + }, + { + "epoch": 5.115269867100624, + "grad_norm": 1.3885852264952467, + "learning_rate": 2.5577340359578426e-07, + "loss": 0.9647, + "step": 66010 + }, + { + "epoch": 5.116044790576931, + "grad_norm": 1.3658937676753442, + "learning_rate": 2.558121512709238e-07, + "loss": 0.9571, + "step": 66020 + }, + { + "epoch": 5.116819714053237, + "grad_norm": 1.4262737999192865, + "learning_rate": 2.5585089894606325e-07, + "loss": 0.9387, + "step": 66030 + }, + { + "epoch": 5.117594637529544, + "grad_norm": 1.33701170340561, + "learning_rate": 2.558896466212028e-07, + "loss": 0.9483, + "step": 66040 + }, + { + "epoch": 5.118369561005851, + "grad_norm": 1.3146117750759252, + "learning_rate": 2.5592839429634225e-07, + "loss": 0.9386, + "step": 66050 + }, + { + "epoch": 5.119144484482157, + "grad_norm": 1.3369315172236373, + "learning_rate": 2.5596714197148177e-07, + "loss": 0.9519, + "step": 66060 + }, + { + "epoch": 5.119919407958464, + "grad_norm": 1.313733078937584, + "learning_rate": 2.5600588964662124e-07, + "loss": 0.9537, + "step": 66070 + }, + { + "epoch": 5.120694331434771, + "grad_norm": 1.4394756473285744, + "learning_rate": 2.560446373217607e-07, + "loss": 0.971, + "step": 66080 + }, + { + "epoch": 5.121469254911077, + "grad_norm": 1.4283909566014987, + "learning_rate": 2.5608338499690023e-07, + "loss": 0.9522, + "step": 66090 + }, + { + "epoch": 5.122244178387384, + "grad_norm": 1.3548349183560942, + "learning_rate": 2.561221326720397e-07, + "loss": 0.9348, + "step": 66100 + }, + { + "epoch": 5.123019101863691, + "grad_norm": 1.4222603384948447, + "learning_rate": 2.561608803471792e-07, + "loss": 0.9588, + "step": 66110 + }, + { + "epoch": 5.123794025339998, + "grad_norm": 1.3655543939202597, + "learning_rate": 2.561996280223187e-07, + "loss": 0.9621, + "step": 66120 + }, + { + "epoch": 5.124568948816305, + "grad_norm": 1.3509852448078443, + "learning_rate": 2.562383756974582e-07, + "loss": 0.9377, + "step": 66130 + }, + { + "epoch": 5.1253438722926115, + "grad_norm": 1.274385836253272, + "learning_rate": 2.562771233725977e-07, + "loss": 0.9367, + "step": 66140 + }, + { + "epoch": 5.126118795768917, + "grad_norm": 1.3953187447808204, + "learning_rate": 2.5631587104773715e-07, + "loss": 0.953, + "step": 66150 + }, + { + "epoch": 5.126893719245224, + "grad_norm": 1.4069829392649627, + "learning_rate": 2.5635461872287667e-07, + "loss": 0.9471, + "step": 66160 + }, + { + "epoch": 5.127668642721531, + "grad_norm": 1.312074381292686, + "learning_rate": 2.5639336639801614e-07, + "loss": 0.9576, + "step": 66170 + }, + { + "epoch": 5.128443566197838, + "grad_norm": 1.4857241886651367, + "learning_rate": 2.5643211407315566e-07, + "loss": 0.963, + "step": 66180 + }, + { + "epoch": 5.129218489674145, + "grad_norm": 1.3603046177426066, + "learning_rate": 2.5647086174829513e-07, + "loss": 0.9406, + "step": 66190 + }, + { + "epoch": 5.129993413150451, + "grad_norm": 1.3089364245621058, + "learning_rate": 2.5650960942343465e-07, + "loss": 0.9394, + "step": 66200 + }, + { + "epoch": 5.130768336626758, + "grad_norm": 1.4135851391474583, + "learning_rate": 2.565483570985741e-07, + "loss": 0.965, + "step": 66210 + }, + { + "epoch": 5.131543260103065, + "grad_norm": 1.336891397740609, + "learning_rate": 2.565871047737136e-07, + "loss": 0.9569, + "step": 66220 + }, + { + "epoch": 5.132318183579372, + "grad_norm": 1.3113145101855335, + "learning_rate": 2.566258524488531e-07, + "loss": 0.9635, + "step": 66230 + }, + { + "epoch": 5.133093107055679, + "grad_norm": 1.348906989813785, + "learning_rate": 2.566646001239926e-07, + "loss": 0.939, + "step": 66240 + }, + { + "epoch": 5.133868030531985, + "grad_norm": 1.3116257041186596, + "learning_rate": 2.567033477991321e-07, + "loss": 0.9482, + "step": 66250 + }, + { + "epoch": 5.134642954008291, + "grad_norm": 1.3418577192120045, + "learning_rate": 2.5674209547427157e-07, + "loss": 0.9349, + "step": 66260 + }, + { + "epoch": 5.135417877484598, + "grad_norm": 1.3661431585603556, + "learning_rate": 2.567808431494111e-07, + "loss": 0.9568, + "step": 66270 + }, + { + "epoch": 5.136192800960905, + "grad_norm": 1.3500965544261114, + "learning_rate": 2.5681959082455056e-07, + "loss": 0.9758, + "step": 66280 + }, + { + "epoch": 5.136967724437212, + "grad_norm": 1.3302954856831208, + "learning_rate": 2.5685833849969003e-07, + "loss": 0.9637, + "step": 66290 + }, + { + "epoch": 5.137742647913519, + "grad_norm": 1.352717445177647, + "learning_rate": 2.5689708617482955e-07, + "loss": 0.9246, + "step": 66300 + }, + { + "epoch": 5.1385175713898255, + "grad_norm": 1.3963388723252395, + "learning_rate": 2.56935833849969e-07, + "loss": 0.9746, + "step": 66310 + }, + { + "epoch": 5.139292494866132, + "grad_norm": 1.4200059022913667, + "learning_rate": 2.5697458152510855e-07, + "loss": 0.9547, + "step": 66320 + }, + { + "epoch": 5.140067418342439, + "grad_norm": 1.3860589473075133, + "learning_rate": 2.57013329200248e-07, + "loss": 0.9393, + "step": 66330 + }, + { + "epoch": 5.140842341818745, + "grad_norm": 1.3516452041144078, + "learning_rate": 2.5705207687538754e-07, + "loss": 0.9322, + "step": 66340 + }, + { + "epoch": 5.141617265295052, + "grad_norm": 1.3635033826315814, + "learning_rate": 2.57090824550527e-07, + "loss": 0.9698, + "step": 66350 + }, + { + "epoch": 5.142392188771359, + "grad_norm": 1.3494219169634103, + "learning_rate": 2.571295722256665e-07, + "loss": 0.9451, + "step": 66360 + }, + { + "epoch": 5.143167112247665, + "grad_norm": 1.343048184508696, + "learning_rate": 2.57168319900806e-07, + "loss": 0.9286, + "step": 66370 + }, + { + "epoch": 5.143942035723972, + "grad_norm": 1.297691774731146, + "learning_rate": 2.5720706757594547e-07, + "loss": 0.9371, + "step": 66380 + }, + { + "epoch": 5.144716959200279, + "grad_norm": 1.387622199757069, + "learning_rate": 2.57245815251085e-07, + "loss": 0.9506, + "step": 66390 + }, + { + "epoch": 5.145491882676586, + "grad_norm": 1.3384453411636268, + "learning_rate": 2.5728456292622446e-07, + "loss": 0.9653, + "step": 66400 + }, + { + "epoch": 5.146266806152893, + "grad_norm": 1.3164392308954296, + "learning_rate": 2.573233106013639e-07, + "loss": 0.9348, + "step": 66410 + }, + { + "epoch": 5.1470417296291995, + "grad_norm": 1.3636231512139534, + "learning_rate": 2.5736205827650345e-07, + "loss": 0.9475, + "step": 66420 + }, + { + "epoch": 5.147816653105505, + "grad_norm": 1.3716782165991872, + "learning_rate": 2.574008059516429e-07, + "loss": 0.9454, + "step": 66430 + }, + { + "epoch": 5.148591576581812, + "grad_norm": 1.419849409793542, + "learning_rate": 2.5743955362678244e-07, + "loss": 0.9565, + "step": 66440 + }, + { + "epoch": 5.149366500058119, + "grad_norm": 1.3401147207064728, + "learning_rate": 2.574783013019219e-07, + "loss": 0.9445, + "step": 66450 + }, + { + "epoch": 5.150141423534426, + "grad_norm": 1.3906717367378787, + "learning_rate": 2.5751704897706143e-07, + "loss": 0.9691, + "step": 66460 + }, + { + "epoch": 5.150916347010733, + "grad_norm": 1.3656261708405524, + "learning_rate": 2.575557966522009e-07, + "loss": 0.9503, + "step": 66470 + }, + { + "epoch": 5.1516912704870395, + "grad_norm": 1.35619536278368, + "learning_rate": 2.5759454432734037e-07, + "loss": 0.9387, + "step": 66480 + }, + { + "epoch": 5.152466193963346, + "grad_norm": 1.3962495816901206, + "learning_rate": 2.576332920024799e-07, + "loss": 0.9467, + "step": 66490 + }, + { + "epoch": 5.153241117439653, + "grad_norm": 1.3943620565853592, + "learning_rate": 2.5767203967761936e-07, + "loss": 0.9514, + "step": 66500 + }, + { + "epoch": 5.153241117439653, + "eval_loss": 0.9538718461990356, + "eval_runtime": 318.7358, + "eval_samples_per_second": 35.989, + "eval_steps_per_second": 8.998, + "step": 66500 + }, + { + "epoch": 5.15401604091596, + "grad_norm": 1.4630733631078086, + "learning_rate": 2.577107873527589e-07, + "loss": 0.9472, + "step": 66510 + }, + { + "epoch": 5.154790964392266, + "grad_norm": 1.3936559875118357, + "learning_rate": 2.5774953502789835e-07, + "loss": 0.9466, + "step": 66520 + }, + { + "epoch": 5.155565887868573, + "grad_norm": 1.3902090148353161, + "learning_rate": 2.5778828270303787e-07, + "loss": 0.9471, + "step": 66530 + }, + { + "epoch": 5.156340811344879, + "grad_norm": 1.3285296859211493, + "learning_rate": 2.5782703037817734e-07, + "loss": 0.9398, + "step": 66540 + }, + { + "epoch": 5.157115734821186, + "grad_norm": 1.4006612402869671, + "learning_rate": 2.578657780533168e-07, + "loss": 0.9405, + "step": 66550 + }, + { + "epoch": 5.157890658297493, + "grad_norm": 1.3740965942228331, + "learning_rate": 2.5790452572845633e-07, + "loss": 0.9798, + "step": 66560 + }, + { + "epoch": 5.1586655817738, + "grad_norm": 1.4012025778360577, + "learning_rate": 2.579432734035958e-07, + "loss": 0.9328, + "step": 66570 + }, + { + "epoch": 5.159440505250107, + "grad_norm": 1.3450667659169737, + "learning_rate": 2.579820210787353e-07, + "loss": 0.9389, + "step": 66580 + }, + { + "epoch": 5.1602154287264135, + "grad_norm": 1.3089083904087697, + "learning_rate": 2.580207687538748e-07, + "loss": 0.9711, + "step": 66590 + }, + { + "epoch": 5.16099035220272, + "grad_norm": 1.3065708555857178, + "learning_rate": 2.580595164290143e-07, + "loss": 0.9358, + "step": 66600 + }, + { + "epoch": 5.161765275679027, + "grad_norm": 1.4242035889808726, + "learning_rate": 2.580982641041538e-07, + "loss": 0.9472, + "step": 66610 + }, + { + "epoch": 5.162540199155333, + "grad_norm": 1.4564368130854646, + "learning_rate": 2.5813701177929325e-07, + "loss": 0.9368, + "step": 66620 + }, + { + "epoch": 5.16331512263164, + "grad_norm": 1.3631027026496183, + "learning_rate": 2.581757594544328e-07, + "loss": 0.9422, + "step": 66630 + }, + { + "epoch": 5.164090046107947, + "grad_norm": 1.3452682416311856, + "learning_rate": 2.5821450712957224e-07, + "loss": 0.9484, + "step": 66640 + }, + { + "epoch": 5.1648649695842535, + "grad_norm": 1.362990523730834, + "learning_rate": 2.5825325480471176e-07, + "loss": 0.9542, + "step": 66650 + }, + { + "epoch": 5.16563989306056, + "grad_norm": 1.3684120996518938, + "learning_rate": 2.5829200247985123e-07, + "loss": 0.9655, + "step": 66660 + }, + { + "epoch": 5.166414816536867, + "grad_norm": 1.4007190285697737, + "learning_rate": 2.5833075015499076e-07, + "loss": 0.9615, + "step": 66670 + }, + { + "epoch": 5.167189740013174, + "grad_norm": 1.343322464123273, + "learning_rate": 2.583694978301302e-07, + "loss": 0.9497, + "step": 66680 + }, + { + "epoch": 5.167964663489481, + "grad_norm": 1.3494746304643996, + "learning_rate": 2.584082455052697e-07, + "loss": 0.9468, + "step": 66690 + }, + { + "epoch": 5.1687395869657875, + "grad_norm": 1.4049670791601157, + "learning_rate": 2.584469931804092e-07, + "loss": 0.9411, + "step": 66700 + }, + { + "epoch": 5.169514510442093, + "grad_norm": 1.3109240294343332, + "learning_rate": 2.584857408555487e-07, + "loss": 0.9412, + "step": 66710 + }, + { + "epoch": 5.1702894339184, + "grad_norm": 1.4421685722427189, + "learning_rate": 2.585244885306882e-07, + "loss": 0.9697, + "step": 66720 + }, + { + "epoch": 5.171064357394707, + "grad_norm": 1.3158128033749685, + "learning_rate": 2.585632362058277e-07, + "loss": 0.9581, + "step": 66730 + }, + { + "epoch": 5.171839280871014, + "grad_norm": 1.4437290877883773, + "learning_rate": 2.586019838809672e-07, + "loss": 0.9623, + "step": 66740 + }, + { + "epoch": 5.172614204347321, + "grad_norm": 1.2604163423107755, + "learning_rate": 2.5864073155610667e-07, + "loss": 0.9737, + "step": 66750 + }, + { + "epoch": 5.1733891278236275, + "grad_norm": 1.3538736107676035, + "learning_rate": 2.5867947923124614e-07, + "loss": 0.9379, + "step": 66760 + }, + { + "epoch": 5.174164051299934, + "grad_norm": 1.4145826197086913, + "learning_rate": 2.5871822690638566e-07, + "loss": 0.9459, + "step": 66770 + }, + { + "epoch": 5.174938974776241, + "grad_norm": 1.4370651608925082, + "learning_rate": 2.5875697458152513e-07, + "loss": 0.9388, + "step": 66780 + }, + { + "epoch": 5.175713898252548, + "grad_norm": 1.3987606260763519, + "learning_rate": 2.5879572225666465e-07, + "loss": 0.9569, + "step": 66790 + }, + { + "epoch": 5.176488821728855, + "grad_norm": 1.3692388718041777, + "learning_rate": 2.588344699318041e-07, + "loss": 0.9469, + "step": 66800 + }, + { + "epoch": 5.177263745205161, + "grad_norm": 1.3410784131488853, + "learning_rate": 2.5887321760694364e-07, + "loss": 0.9399, + "step": 66810 + }, + { + "epoch": 5.1780386686814674, + "grad_norm": 1.37818857767433, + "learning_rate": 2.589119652820831e-07, + "loss": 0.9563, + "step": 66820 + }, + { + "epoch": 5.178813592157774, + "grad_norm": 1.3267636260033493, + "learning_rate": 2.589507129572226e-07, + "loss": 0.9397, + "step": 66830 + }, + { + "epoch": 5.179588515634081, + "grad_norm": 1.4486393175948833, + "learning_rate": 2.589894606323621e-07, + "loss": 0.9548, + "step": 66840 + }, + { + "epoch": 5.180363439110388, + "grad_norm": 1.423713622274608, + "learning_rate": 2.5902820830750157e-07, + "loss": 0.9369, + "step": 66850 + }, + { + "epoch": 5.181138362586695, + "grad_norm": 1.3645632105056362, + "learning_rate": 2.590669559826411e-07, + "loss": 0.9297, + "step": 66860 + }, + { + "epoch": 5.1819132860630015, + "grad_norm": 1.3706458742027219, + "learning_rate": 2.5910570365778056e-07, + "loss": 1.0009, + "step": 66870 + }, + { + "epoch": 5.182688209539308, + "grad_norm": 1.3908069769391243, + "learning_rate": 2.591444513329201e-07, + "loss": 0.9352, + "step": 66880 + }, + { + "epoch": 5.183463133015615, + "grad_norm": 1.3475719702113458, + "learning_rate": 2.5918319900805955e-07, + "loss": 0.964, + "step": 66890 + }, + { + "epoch": 5.184238056491921, + "grad_norm": 1.5128498220354145, + "learning_rate": 2.59221946683199e-07, + "loss": 0.9408, + "step": 66900 + }, + { + "epoch": 5.185012979968228, + "grad_norm": 1.4009541093803395, + "learning_rate": 2.5926069435833854e-07, + "loss": 0.961, + "step": 66910 + }, + { + "epoch": 5.185787903444535, + "grad_norm": 1.2846254079854662, + "learning_rate": 2.59299442033478e-07, + "loss": 0.9451, + "step": 66920 + }, + { + "epoch": 5.1865628269208415, + "grad_norm": 1.360459585693665, + "learning_rate": 2.5933818970861753e-07, + "loss": 0.9573, + "step": 66930 + }, + { + "epoch": 5.187337750397148, + "grad_norm": 1.3868365979429054, + "learning_rate": 2.59376937383757e-07, + "loss": 0.9711, + "step": 66940 + }, + { + "epoch": 5.188112673873455, + "grad_norm": 1.3703729126098687, + "learning_rate": 2.594156850588965e-07, + "loss": 0.9386, + "step": 66950 + }, + { + "epoch": 5.188887597349762, + "grad_norm": 1.4083407971920026, + "learning_rate": 2.59454432734036e-07, + "loss": 0.9347, + "step": 66960 + }, + { + "epoch": 5.189662520826069, + "grad_norm": 1.4325406603566324, + "learning_rate": 2.5949318040917546e-07, + "loss": 0.9509, + "step": 66970 + }, + { + "epoch": 5.1904374443023755, + "grad_norm": 1.4159508622733412, + "learning_rate": 2.59531928084315e-07, + "loss": 0.9495, + "step": 66980 + }, + { + "epoch": 5.1912123677786814, + "grad_norm": 1.328869149816522, + "learning_rate": 2.5957067575945445e-07, + "loss": 0.9571, + "step": 66990 + }, + { + "epoch": 5.191987291254988, + "grad_norm": 1.4361731358352914, + "learning_rate": 2.59609423434594e-07, + "loss": 0.9629, + "step": 67000 + }, + { + "epoch": 5.191987291254988, + "eval_loss": 0.9533229470252991, + "eval_runtime": 320.3159, + "eval_samples_per_second": 35.812, + "eval_steps_per_second": 8.954, + "step": 67000 + }, + { + "epoch": 5.192762214731295, + "grad_norm": 1.3637682899491828, + "learning_rate": 2.5964817110973344e-07, + "loss": 0.9498, + "step": 67010 + }, + { + "epoch": 5.193537138207602, + "grad_norm": 1.364049926346991, + "learning_rate": 2.5968691878487297e-07, + "loss": 0.9539, + "step": 67020 + }, + { + "epoch": 5.194312061683909, + "grad_norm": 1.355760312163681, + "learning_rate": 2.5972566646001244e-07, + "loss": 0.9256, + "step": 67030 + }, + { + "epoch": 5.1950869851602155, + "grad_norm": 1.2490238707075765, + "learning_rate": 2.597644141351519e-07, + "loss": 0.952, + "step": 67040 + }, + { + "epoch": 5.195861908636522, + "grad_norm": 1.3293879505969335, + "learning_rate": 2.598031618102914e-07, + "loss": 0.9584, + "step": 67050 + }, + { + "epoch": 5.196636832112829, + "grad_norm": 1.4202146777231714, + "learning_rate": 2.598419094854309e-07, + "loss": 0.9656, + "step": 67060 + }, + { + "epoch": 5.197411755589136, + "grad_norm": 1.4101139338190838, + "learning_rate": 2.598806571605704e-07, + "loss": 0.9552, + "step": 67070 + }, + { + "epoch": 5.198186679065442, + "grad_norm": 1.4410028601203309, + "learning_rate": 2.599194048357099e-07, + "loss": 0.9447, + "step": 67080 + }, + { + "epoch": 5.198961602541749, + "grad_norm": 1.5733991478850489, + "learning_rate": 2.599581525108494e-07, + "loss": 0.9477, + "step": 67090 + }, + { + "epoch": 5.1997365260180555, + "grad_norm": 1.365034891624098, + "learning_rate": 2.599969001859889e-07, + "loss": 0.939, + "step": 67100 + }, + { + "epoch": 5.200511449494362, + "grad_norm": 1.3189648315002207, + "learning_rate": 2.6003564786112835e-07, + "loss": 0.9357, + "step": 67110 + }, + { + "epoch": 5.201286372970669, + "grad_norm": 1.3985432173005612, + "learning_rate": 2.6007439553626787e-07, + "loss": 0.9712, + "step": 67120 + }, + { + "epoch": 5.202061296446976, + "grad_norm": 1.399101287509839, + "learning_rate": 2.6011314321140734e-07, + "loss": 0.9373, + "step": 67130 + }, + { + "epoch": 5.202836219923283, + "grad_norm": 1.2998581979235762, + "learning_rate": 2.6015189088654686e-07, + "loss": 0.9434, + "step": 67140 + }, + { + "epoch": 5.2036111433995895, + "grad_norm": 1.4301207899954131, + "learning_rate": 2.6019063856168633e-07, + "loss": 0.9555, + "step": 67150 + }, + { + "epoch": 5.204386066875896, + "grad_norm": 1.310616251098498, + "learning_rate": 2.602293862368258e-07, + "loss": 0.9524, + "step": 67160 + }, + { + "epoch": 5.205160990352203, + "grad_norm": 1.5071243857229304, + "learning_rate": 2.602681339119653e-07, + "loss": 0.9667, + "step": 67170 + }, + { + "epoch": 5.205935913828509, + "grad_norm": 1.3534688602277574, + "learning_rate": 2.603068815871048e-07, + "loss": 0.9666, + "step": 67180 + }, + { + "epoch": 5.206710837304816, + "grad_norm": 1.4386346777002437, + "learning_rate": 2.603456292622443e-07, + "loss": 0.9476, + "step": 67190 + }, + { + "epoch": 5.207485760781123, + "grad_norm": 1.3861113421949944, + "learning_rate": 2.603843769373838e-07, + "loss": 0.9411, + "step": 67200 + }, + { + "epoch": 5.2082606842574295, + "grad_norm": 1.381882263827802, + "learning_rate": 2.604231246125233e-07, + "loss": 0.9429, + "step": 67210 + }, + { + "epoch": 5.209035607733736, + "grad_norm": 1.3766024724152277, + "learning_rate": 2.6046187228766277e-07, + "loss": 0.9528, + "step": 67220 + }, + { + "epoch": 5.209810531210043, + "grad_norm": 1.4153466308781657, + "learning_rate": 2.6050061996280224e-07, + "loss": 0.9711, + "step": 67230 + }, + { + "epoch": 5.21058545468635, + "grad_norm": 1.2632040574150774, + "learning_rate": 2.6053936763794176e-07, + "loss": 0.9786, + "step": 67240 + }, + { + "epoch": 5.211360378162657, + "grad_norm": 1.3415435828135076, + "learning_rate": 2.6057811531308123e-07, + "loss": 0.9424, + "step": 67250 + }, + { + "epoch": 5.2121353016389635, + "grad_norm": 1.366937108173127, + "learning_rate": 2.6061686298822075e-07, + "loss": 0.95, + "step": 67260 + }, + { + "epoch": 5.2129102251152695, + "grad_norm": 1.4018131231564017, + "learning_rate": 2.606556106633602e-07, + "loss": 0.9617, + "step": 67270 + }, + { + "epoch": 5.213685148591576, + "grad_norm": 1.4403295401071856, + "learning_rate": 2.6069435833849974e-07, + "loss": 0.9381, + "step": 67280 + }, + { + "epoch": 5.214460072067883, + "grad_norm": 1.293669795100538, + "learning_rate": 2.607331060136392e-07, + "loss": 0.9542, + "step": 67290 + }, + { + "epoch": 5.21523499554419, + "grad_norm": 1.4229587510313464, + "learning_rate": 2.607718536887787e-07, + "loss": 0.9503, + "step": 67300 + }, + { + "epoch": 5.216009919020497, + "grad_norm": 1.4665299321185572, + "learning_rate": 2.608106013639182e-07, + "loss": 0.9789, + "step": 67310 + }, + { + "epoch": 5.2167848424968035, + "grad_norm": 1.4212906774397436, + "learning_rate": 2.6084934903905767e-07, + "loss": 0.9415, + "step": 67320 + }, + { + "epoch": 5.21755976597311, + "grad_norm": 1.3064210323843515, + "learning_rate": 2.608880967141972e-07, + "loss": 0.9272, + "step": 67330 + }, + { + "epoch": 5.218334689449417, + "grad_norm": 1.2980138317854135, + "learning_rate": 2.6092684438933666e-07, + "loss": 0.9274, + "step": 67340 + }, + { + "epoch": 5.219109612925724, + "grad_norm": 1.4837003395263964, + "learning_rate": 2.609655920644762e-07, + "loss": 0.9469, + "step": 67350 + }, + { + "epoch": 5.21988453640203, + "grad_norm": 1.4025285770783449, + "learning_rate": 2.6100433973961565e-07, + "loss": 0.9243, + "step": 67360 + }, + { + "epoch": 5.220659459878337, + "grad_norm": 1.4008715511552325, + "learning_rate": 2.610430874147551e-07, + "loss": 0.9381, + "step": 67370 + }, + { + "epoch": 5.2214343833546435, + "grad_norm": 1.3522163861778556, + "learning_rate": 2.6108183508989465e-07, + "loss": 0.9489, + "step": 67380 + }, + { + "epoch": 5.22220930683095, + "grad_norm": 1.3957997223263054, + "learning_rate": 2.611205827650341e-07, + "loss": 0.9281, + "step": 67390 + }, + { + "epoch": 5.222984230307257, + "grad_norm": 1.3670336760866206, + "learning_rate": 2.6115933044017364e-07, + "loss": 0.9511, + "step": 67400 + }, + { + "epoch": 5.223759153783564, + "grad_norm": 1.3879873121919093, + "learning_rate": 2.611980781153131e-07, + "loss": 0.9393, + "step": 67410 + }, + { + "epoch": 5.224534077259871, + "grad_norm": 1.4281762024518834, + "learning_rate": 2.6123682579045263e-07, + "loss": 0.9371, + "step": 67420 + }, + { + "epoch": 5.2253090007361775, + "grad_norm": 1.2882246416430378, + "learning_rate": 2.612755734655921e-07, + "loss": 0.9526, + "step": 67430 + }, + { + "epoch": 5.226083924212484, + "grad_norm": 1.3804050440537239, + "learning_rate": 2.6131432114073157e-07, + "loss": 0.9702, + "step": 67440 + }, + { + "epoch": 5.22685884768879, + "grad_norm": 1.341547152231443, + "learning_rate": 2.613530688158711e-07, + "loss": 0.9632, + "step": 67450 + }, + { + "epoch": 5.227633771165097, + "grad_norm": 1.3263566704588727, + "learning_rate": 2.6139181649101056e-07, + "loss": 0.9621, + "step": 67460 + }, + { + "epoch": 5.228408694641404, + "grad_norm": 1.3496595435168512, + "learning_rate": 2.614305641661501e-07, + "loss": 0.9536, + "step": 67470 + }, + { + "epoch": 5.229183618117711, + "grad_norm": 1.3449391126579466, + "learning_rate": 2.6146931184128955e-07, + "loss": 0.9773, + "step": 67480 + }, + { + "epoch": 5.2299585415940175, + "grad_norm": 1.3681397270708213, + "learning_rate": 2.6150805951642907e-07, + "loss": 0.9273, + "step": 67490 + }, + { + "epoch": 5.230733465070324, + "grad_norm": 1.4203226642681759, + "learning_rate": 2.6154680719156854e-07, + "loss": 0.958, + "step": 67500 + }, + { + "epoch": 5.230733465070324, + "eval_loss": 0.9526898264884949, + "eval_runtime": 320.2474, + "eval_samples_per_second": 35.819, + "eval_steps_per_second": 8.956, + "step": 67500 + }, + { + "epoch": 5.231508388546631, + "grad_norm": 1.4173208787211158, + "learning_rate": 2.61585554866708e-07, + "loss": 0.9393, + "step": 67510 + }, + { + "epoch": 5.232283312022938, + "grad_norm": 1.3724469002566475, + "learning_rate": 2.6162430254184753e-07, + "loss": 0.9426, + "step": 67520 + }, + { + "epoch": 5.233058235499245, + "grad_norm": 1.3707879623682515, + "learning_rate": 2.61663050216987e-07, + "loss": 0.9304, + "step": 67530 + }, + { + "epoch": 5.233833158975552, + "grad_norm": 1.4020386297019407, + "learning_rate": 2.617017978921265e-07, + "loss": 0.9366, + "step": 67540 + }, + { + "epoch": 5.2346080824518575, + "grad_norm": 1.3354448535838934, + "learning_rate": 2.61740545567266e-07, + "loss": 0.9421, + "step": 67550 + }, + { + "epoch": 5.235383005928164, + "grad_norm": 1.3577782020523288, + "learning_rate": 2.617792932424055e-07, + "loss": 0.9504, + "step": 67560 + }, + { + "epoch": 5.236157929404471, + "grad_norm": 1.3590073535024767, + "learning_rate": 2.61818040917545e-07, + "loss": 0.9477, + "step": 67570 + }, + { + "epoch": 5.236932852880778, + "grad_norm": 1.3368455854182908, + "learning_rate": 2.6185678859268445e-07, + "loss": 0.9351, + "step": 67580 + }, + { + "epoch": 5.237707776357085, + "grad_norm": 1.2952513567193198, + "learning_rate": 2.6189553626782397e-07, + "loss": 0.9596, + "step": 67590 + }, + { + "epoch": 5.2384826998333915, + "grad_norm": 1.3692967469676953, + "learning_rate": 2.6193428394296344e-07, + "loss": 0.9339, + "step": 67600 + }, + { + "epoch": 5.239257623309698, + "grad_norm": 1.4837569790997167, + "learning_rate": 2.6197303161810296e-07, + "loss": 0.9392, + "step": 67610 + }, + { + "epoch": 5.240032546786005, + "grad_norm": 1.4253908754392586, + "learning_rate": 2.6201177929324243e-07, + "loss": 0.9522, + "step": 67620 + }, + { + "epoch": 5.240807470262312, + "grad_norm": 1.3643203335812246, + "learning_rate": 2.6205052696838195e-07, + "loss": 0.9411, + "step": 67630 + }, + { + "epoch": 5.241582393738618, + "grad_norm": 1.3343513659569082, + "learning_rate": 2.620892746435214e-07, + "loss": 0.938, + "step": 67640 + }, + { + "epoch": 5.242357317214925, + "grad_norm": 1.431035050593983, + "learning_rate": 2.621280223186609e-07, + "loss": 0.9586, + "step": 67650 + }, + { + "epoch": 5.2431322406912315, + "grad_norm": 1.37795777124415, + "learning_rate": 2.621667699938004e-07, + "loss": 0.9731, + "step": 67660 + }, + { + "epoch": 5.243907164167538, + "grad_norm": 1.436272828772703, + "learning_rate": 2.622055176689399e-07, + "loss": 0.9491, + "step": 67670 + }, + { + "epoch": 5.244682087643845, + "grad_norm": 1.3832115760373362, + "learning_rate": 2.622442653440794e-07, + "loss": 0.9405, + "step": 67680 + }, + { + "epoch": 5.245457011120152, + "grad_norm": 1.2949191483041056, + "learning_rate": 2.622830130192189e-07, + "loss": 0.942, + "step": 67690 + }, + { + "epoch": 5.246231934596459, + "grad_norm": 1.4468578398213752, + "learning_rate": 2.623217606943584e-07, + "loss": 0.9492, + "step": 67700 + }, + { + "epoch": 5.247006858072766, + "grad_norm": 1.3959141638089712, + "learning_rate": 2.6236050836949787e-07, + "loss": 0.9757, + "step": 67710 + }, + { + "epoch": 5.247781781549072, + "grad_norm": 1.3560500782482194, + "learning_rate": 2.6239925604463733e-07, + "loss": 0.9489, + "step": 67720 + }, + { + "epoch": 5.248556705025379, + "grad_norm": 1.4201502916197086, + "learning_rate": 2.6243800371977686e-07, + "loss": 0.9341, + "step": 67730 + }, + { + "epoch": 5.249331628501685, + "grad_norm": 1.4783385311866541, + "learning_rate": 2.624767513949163e-07, + "loss": 0.9581, + "step": 67740 + }, + { + "epoch": 5.250106551977992, + "grad_norm": 1.5914765857153517, + "learning_rate": 2.6251549907005585e-07, + "loss": 0.9627, + "step": 67750 + }, + { + "epoch": 5.250881475454299, + "grad_norm": 1.3830528872202636, + "learning_rate": 2.625542467451953e-07, + "loss": 0.9497, + "step": 67760 + }, + { + "epoch": 5.2516563989306055, + "grad_norm": 1.3347375274882973, + "learning_rate": 2.6259299442033484e-07, + "loss": 0.9296, + "step": 67770 + }, + { + "epoch": 5.252431322406912, + "grad_norm": 1.3762412322004982, + "learning_rate": 2.626317420954743e-07, + "loss": 0.9551, + "step": 67780 + }, + { + "epoch": 5.253206245883219, + "grad_norm": 1.4616998698091648, + "learning_rate": 2.626704897706138e-07, + "loss": 0.9531, + "step": 67790 + }, + { + "epoch": 5.253981169359526, + "grad_norm": 1.4349196356131702, + "learning_rate": 2.627092374457533e-07, + "loss": 0.94, + "step": 67800 + }, + { + "epoch": 5.254756092835833, + "grad_norm": 1.3307653131250963, + "learning_rate": 2.6274798512089277e-07, + "loss": 0.9387, + "step": 67810 + }, + { + "epoch": 5.25553101631214, + "grad_norm": 1.4442945806302983, + "learning_rate": 2.627867327960323e-07, + "loss": 0.9452, + "step": 67820 + }, + { + "epoch": 5.2563059397884455, + "grad_norm": 1.372118124430672, + "learning_rate": 2.6282548047117176e-07, + "loss": 0.9427, + "step": 67830 + }, + { + "epoch": 5.257080863264752, + "grad_norm": 1.4562798456438393, + "learning_rate": 2.6286422814631123e-07, + "loss": 0.9435, + "step": 67840 + }, + { + "epoch": 5.257855786741059, + "grad_norm": 1.3510828078324009, + "learning_rate": 2.6290297582145075e-07, + "loss": 0.9541, + "step": 67850 + }, + { + "epoch": 5.258630710217366, + "grad_norm": 1.3424033629025374, + "learning_rate": 2.629417234965902e-07, + "loss": 0.929, + "step": 67860 + }, + { + "epoch": 5.259405633693673, + "grad_norm": 1.4468662624660351, + "learning_rate": 2.6298047117172974e-07, + "loss": 0.9635, + "step": 67870 + }, + { + "epoch": 5.26018055716998, + "grad_norm": 1.331006954364977, + "learning_rate": 2.630192188468692e-07, + "loss": 0.9377, + "step": 67880 + }, + { + "epoch": 5.260955480646286, + "grad_norm": 1.469188263384363, + "learning_rate": 2.6305796652200873e-07, + "loss": 0.9589, + "step": 67890 + }, + { + "epoch": 5.261730404122593, + "grad_norm": 1.3715286195057188, + "learning_rate": 2.630967141971482e-07, + "loss": 0.9509, + "step": 67900 + }, + { + "epoch": 5.2625053275989, + "grad_norm": 1.3451485514675974, + "learning_rate": 2.6313546187228767e-07, + "loss": 0.9325, + "step": 67910 + }, + { + "epoch": 5.263280251075206, + "grad_norm": 1.3919518557057018, + "learning_rate": 2.631742095474272e-07, + "loss": 0.9452, + "step": 67920 + }, + { + "epoch": 5.264055174551513, + "grad_norm": 1.4465289102417822, + "learning_rate": 2.6321295722256666e-07, + "loss": 0.9545, + "step": 67930 + }, + { + "epoch": 5.2648300980278195, + "grad_norm": 1.309920452789619, + "learning_rate": 2.632517048977062e-07, + "loss": 0.9455, + "step": 67940 + }, + { + "epoch": 5.265605021504126, + "grad_norm": 1.3187365926005061, + "learning_rate": 2.6329045257284565e-07, + "loss": 0.9461, + "step": 67950 + }, + { + "epoch": 5.266379944980433, + "grad_norm": 1.3955804960587557, + "learning_rate": 2.6332920024798517e-07, + "loss": 0.9454, + "step": 67960 + }, + { + "epoch": 5.26715486845674, + "grad_norm": 1.375221328416343, + "learning_rate": 2.6336794792312464e-07, + "loss": 0.9419, + "step": 67970 + }, + { + "epoch": 5.267929791933047, + "grad_norm": 1.3840292083788575, + "learning_rate": 2.634066955982641e-07, + "loss": 0.9634, + "step": 67980 + }, + { + "epoch": 5.268704715409354, + "grad_norm": 1.4137394824750218, + "learning_rate": 2.6344544327340363e-07, + "loss": 0.9717, + "step": 67990 + }, + { + "epoch": 5.26947963888566, + "grad_norm": 1.3974812864374482, + "learning_rate": 2.634841909485431e-07, + "loss": 0.9416, + "step": 68000 + }, + { + "epoch": 5.26947963888566, + "eval_loss": 0.9521579146385193, + "eval_runtime": 319.0759, + "eval_samples_per_second": 35.951, + "eval_steps_per_second": 8.988, + "step": 68000 + }, + { + "epoch": 5.270254562361966, + "grad_norm": 1.4140372879293583, + "learning_rate": 2.635229386236826e-07, + "loss": 0.939, + "step": 68010 + }, + { + "epoch": 5.271029485838273, + "grad_norm": 1.3351781903316238, + "learning_rate": 2.635616862988221e-07, + "loss": 0.9522, + "step": 68020 + }, + { + "epoch": 5.27180440931458, + "grad_norm": 1.3846135368014116, + "learning_rate": 2.636004339739616e-07, + "loss": 0.9493, + "step": 68030 + }, + { + "epoch": 5.272579332790887, + "grad_norm": 1.3447594286948938, + "learning_rate": 2.636391816491011e-07, + "loss": 0.9358, + "step": 68040 + }, + { + "epoch": 5.273354256267194, + "grad_norm": 1.3773986744994118, + "learning_rate": 2.6367792932424055e-07, + "loss": 0.9694, + "step": 68050 + }, + { + "epoch": 5.2741291797435, + "grad_norm": 1.379372063059439, + "learning_rate": 2.637166769993801e-07, + "loss": 0.9522, + "step": 68060 + }, + { + "epoch": 5.274904103219807, + "grad_norm": 1.4738991253845897, + "learning_rate": 2.6375542467451954e-07, + "loss": 0.9504, + "step": 68070 + }, + { + "epoch": 5.275679026696114, + "grad_norm": 1.3891343286469067, + "learning_rate": 2.6379417234965907e-07, + "loss": 0.9664, + "step": 68080 + }, + { + "epoch": 5.276453950172421, + "grad_norm": 1.329093274019755, + "learning_rate": 2.6383292002479854e-07, + "loss": 0.9695, + "step": 68090 + }, + { + "epoch": 5.277228873648728, + "grad_norm": 1.291437369887085, + "learning_rate": 2.6387166769993806e-07, + "loss": 0.9383, + "step": 68100 + }, + { + "epoch": 5.2780037971250335, + "grad_norm": 1.3516348073810742, + "learning_rate": 2.6391041537507753e-07, + "loss": 0.9608, + "step": 68110 + }, + { + "epoch": 5.27877872060134, + "grad_norm": 1.3717193366643168, + "learning_rate": 2.63949163050217e-07, + "loss": 0.9494, + "step": 68120 + }, + { + "epoch": 5.279553644077647, + "grad_norm": 1.4253591002011878, + "learning_rate": 2.639879107253565e-07, + "loss": 0.9255, + "step": 68130 + }, + { + "epoch": 5.280328567553954, + "grad_norm": 1.364434904062138, + "learning_rate": 2.64026658400496e-07, + "loss": 0.9731, + "step": 68140 + }, + { + "epoch": 5.281103491030261, + "grad_norm": 1.3283854036091718, + "learning_rate": 2.640654060756355e-07, + "loss": 0.9512, + "step": 68150 + }, + { + "epoch": 5.281878414506568, + "grad_norm": 1.31587173803338, + "learning_rate": 2.64104153750775e-07, + "loss": 0.9424, + "step": 68160 + }, + { + "epoch": 5.282653337982874, + "grad_norm": 1.3544831891066333, + "learning_rate": 2.641429014259145e-07, + "loss": 0.9701, + "step": 68170 + }, + { + "epoch": 5.283428261459181, + "grad_norm": 1.3573753310499184, + "learning_rate": 2.6418164910105397e-07, + "loss": 0.9345, + "step": 68180 + }, + { + "epoch": 5.284203184935488, + "grad_norm": 1.3096701487697087, + "learning_rate": 2.6422039677619344e-07, + "loss": 0.961, + "step": 68190 + }, + { + "epoch": 5.284978108411794, + "grad_norm": 1.41506104999005, + "learning_rate": 2.6425914445133296e-07, + "loss": 0.9473, + "step": 68200 + }, + { + "epoch": 5.285753031888101, + "grad_norm": 1.3457795409416444, + "learning_rate": 2.6429789212647243e-07, + "loss": 0.9405, + "step": 68210 + }, + { + "epoch": 5.286527955364408, + "grad_norm": 1.3616604471711597, + "learning_rate": 2.6433663980161195e-07, + "loss": 0.9431, + "step": 68220 + }, + { + "epoch": 5.287302878840714, + "grad_norm": 1.4080075443602484, + "learning_rate": 2.643753874767514e-07, + "loss": 0.969, + "step": 68230 + }, + { + "epoch": 5.288077802317021, + "grad_norm": 1.3826760964545406, + "learning_rate": 2.6441413515189094e-07, + "loss": 0.9401, + "step": 68240 + }, + { + "epoch": 5.288852725793328, + "grad_norm": 1.38724527598815, + "learning_rate": 2.644528828270304e-07, + "loss": 0.9324, + "step": 68250 + }, + { + "epoch": 5.289627649269635, + "grad_norm": 1.3415715936400947, + "learning_rate": 2.644916305021699e-07, + "loss": 0.9456, + "step": 68260 + }, + { + "epoch": 5.290402572745942, + "grad_norm": 1.356847592433711, + "learning_rate": 2.645303781773094e-07, + "loss": 0.9413, + "step": 68270 + }, + { + "epoch": 5.291177496222248, + "grad_norm": 1.3530697260543318, + "learning_rate": 2.6456912585244887e-07, + "loss": 0.969, + "step": 68280 + }, + { + "epoch": 5.291952419698555, + "grad_norm": 1.3454772362276264, + "learning_rate": 2.646078735275884e-07, + "loss": 0.9362, + "step": 68290 + }, + { + "epoch": 5.292727343174861, + "grad_norm": 1.399356393552507, + "learning_rate": 2.6464662120272786e-07, + "loss": 0.9545, + "step": 68300 + }, + { + "epoch": 5.293502266651168, + "grad_norm": 1.4057691741964886, + "learning_rate": 2.646853688778674e-07, + "loss": 0.9623, + "step": 68310 + }, + { + "epoch": 5.294277190127475, + "grad_norm": 1.4921093078428345, + "learning_rate": 2.6472411655300685e-07, + "loss": 0.966, + "step": 68320 + }, + { + "epoch": 5.295052113603782, + "grad_norm": 1.3804804740605874, + "learning_rate": 2.647628642281463e-07, + "loss": 0.9367, + "step": 68330 + }, + { + "epoch": 5.295827037080088, + "grad_norm": 1.4321460057106667, + "learning_rate": 2.6480161190328584e-07, + "loss": 0.9371, + "step": 68340 + }, + { + "epoch": 5.296601960556395, + "grad_norm": 1.3260491738046603, + "learning_rate": 2.648403595784253e-07, + "loss": 0.9416, + "step": 68350 + }, + { + "epoch": 5.297376884032702, + "grad_norm": 1.3447184421671132, + "learning_rate": 2.6487910725356484e-07, + "loss": 0.9573, + "step": 68360 + }, + { + "epoch": 5.298151807509009, + "grad_norm": 1.2974692953712346, + "learning_rate": 2.649178549287043e-07, + "loss": 0.9495, + "step": 68370 + }, + { + "epoch": 5.298926730985315, + "grad_norm": 1.273767838305077, + "learning_rate": 2.649566026038438e-07, + "loss": 0.9513, + "step": 68380 + }, + { + "epoch": 5.299701654461622, + "grad_norm": 1.3871844370475608, + "learning_rate": 2.649953502789833e-07, + "loss": 0.9569, + "step": 68390 + }, + { + "epoch": 5.300476577937928, + "grad_norm": 1.3111104430308957, + "learning_rate": 2.6503409795412276e-07, + "loss": 0.9199, + "step": 68400 + }, + { + "epoch": 5.301251501414235, + "grad_norm": 1.373736294343799, + "learning_rate": 2.650728456292623e-07, + "loss": 0.948, + "step": 68410 + }, + { + "epoch": 5.302026424890542, + "grad_norm": 1.2865945653430346, + "learning_rate": 2.6511159330440176e-07, + "loss": 0.9818, + "step": 68420 + }, + { + "epoch": 5.302801348366849, + "grad_norm": 1.3214867172749356, + "learning_rate": 2.651503409795413e-07, + "loss": 0.9714, + "step": 68430 + }, + { + "epoch": 5.303576271843156, + "grad_norm": 1.3567399872462422, + "learning_rate": 2.6518908865468075e-07, + "loss": 0.9496, + "step": 68440 + }, + { + "epoch": 5.304351195319462, + "grad_norm": 1.501997433246852, + "learning_rate": 2.6522783632982027e-07, + "loss": 0.948, + "step": 68450 + }, + { + "epoch": 5.305126118795769, + "grad_norm": 1.4619433492536125, + "learning_rate": 2.6526658400495974e-07, + "loss": 0.9601, + "step": 68460 + }, + { + "epoch": 5.305901042272076, + "grad_norm": 1.3299283624432818, + "learning_rate": 2.653053316800992e-07, + "loss": 0.9456, + "step": 68470 + }, + { + "epoch": 5.306675965748382, + "grad_norm": 1.2972047620592548, + "learning_rate": 2.6534407935523873e-07, + "loss": 0.9418, + "step": 68480 + }, + { + "epoch": 5.307450889224689, + "grad_norm": 1.3403341386975218, + "learning_rate": 2.653828270303782e-07, + "loss": 0.9669, + "step": 68490 + }, + { + "epoch": 5.308225812700996, + "grad_norm": 1.3798299158947918, + "learning_rate": 2.654215747055177e-07, + "loss": 0.9596, + "step": 68500 + }, + { + "epoch": 5.308225812700996, + "eval_loss": 0.9516459107398987, + "eval_runtime": 321.0776, + "eval_samples_per_second": 35.727, + "eval_steps_per_second": 8.932, + "step": 68500 + }, + { + "epoch": 5.309000736177302, + "grad_norm": 1.394990124416333, + "learning_rate": 2.654603223806572e-07, + "loss": 0.9676, + "step": 68510 + }, + { + "epoch": 5.309775659653609, + "grad_norm": 1.4337411464335894, + "learning_rate": 2.654990700557967e-07, + "loss": 0.9764, + "step": 68520 + }, + { + "epoch": 5.310550583129916, + "grad_norm": 1.3185587779672692, + "learning_rate": 2.655378177309362e-07, + "loss": 0.9365, + "step": 68530 + }, + { + "epoch": 5.311325506606223, + "grad_norm": 1.420908770920365, + "learning_rate": 2.6557656540607565e-07, + "loss": 0.9386, + "step": 68540 + }, + { + "epoch": 5.31210043008253, + "grad_norm": 1.3046600181725825, + "learning_rate": 2.6561531308121517e-07, + "loss": 0.9561, + "step": 68550 + }, + { + "epoch": 5.3128753535588364, + "grad_norm": 1.3836536656042375, + "learning_rate": 2.6565406075635464e-07, + "loss": 0.9221, + "step": 68560 + }, + { + "epoch": 5.313650277035142, + "grad_norm": 1.4885460212531951, + "learning_rate": 2.6569280843149416e-07, + "loss": 0.9686, + "step": 68570 + }, + { + "epoch": 5.314425200511449, + "grad_norm": 1.4098118504750425, + "learning_rate": 2.6573155610663363e-07, + "loss": 0.9459, + "step": 68580 + }, + { + "epoch": 5.315200123987756, + "grad_norm": 1.352574416670844, + "learning_rate": 2.657703037817731e-07, + "loss": 0.9572, + "step": 68590 + }, + { + "epoch": 5.315975047464063, + "grad_norm": 1.3525644141626008, + "learning_rate": 2.658090514569126e-07, + "loss": 0.9633, + "step": 68600 + }, + { + "epoch": 5.31674997094037, + "grad_norm": 1.3143331549333355, + "learning_rate": 2.658477991320521e-07, + "loss": 0.9343, + "step": 68610 + }, + { + "epoch": 5.317524894416676, + "grad_norm": 1.3533501763840983, + "learning_rate": 2.658865468071916e-07, + "loss": 0.95, + "step": 68620 + }, + { + "epoch": 5.318299817892983, + "grad_norm": 1.3301112825079713, + "learning_rate": 2.659252944823311e-07, + "loss": 0.9386, + "step": 68630 + }, + { + "epoch": 5.31907474136929, + "grad_norm": 1.379325800256456, + "learning_rate": 2.659640421574706e-07, + "loss": 0.934, + "step": 68640 + }, + { + "epoch": 5.319849664845597, + "grad_norm": 1.401630572518385, + "learning_rate": 2.6600278983261007e-07, + "loss": 0.9468, + "step": 68650 + }, + { + "epoch": 5.320624588321904, + "grad_norm": 1.410778345609761, + "learning_rate": 2.6604153750774954e-07, + "loss": 0.9573, + "step": 68660 + }, + { + "epoch": 5.32139951179821, + "grad_norm": 1.3407090410742595, + "learning_rate": 2.6608028518288906e-07, + "loss": 0.951, + "step": 68670 + }, + { + "epoch": 5.322174435274516, + "grad_norm": 1.4278431008423687, + "learning_rate": 2.6611903285802853e-07, + "loss": 0.952, + "step": 68680 + }, + { + "epoch": 5.322949358750823, + "grad_norm": 1.4163814632503215, + "learning_rate": 2.6615778053316805e-07, + "loss": 0.9574, + "step": 68690 + }, + { + "epoch": 5.32372428222713, + "grad_norm": 1.302308081604165, + "learning_rate": 2.661965282083075e-07, + "loss": 0.9361, + "step": 68700 + }, + { + "epoch": 5.324499205703437, + "grad_norm": 1.4689879721386114, + "learning_rate": 2.6623527588344705e-07, + "loss": 0.9594, + "step": 68710 + }, + { + "epoch": 5.325274129179744, + "grad_norm": 1.3266936155353684, + "learning_rate": 2.662740235585865e-07, + "loss": 0.9494, + "step": 68720 + }, + { + "epoch": 5.32604905265605, + "grad_norm": 1.4226786009311017, + "learning_rate": 2.66312771233726e-07, + "loss": 0.9401, + "step": 68730 + }, + { + "epoch": 5.326823976132357, + "grad_norm": 1.3093473210477817, + "learning_rate": 2.663515189088655e-07, + "loss": 0.9228, + "step": 68740 + }, + { + "epoch": 5.327598899608663, + "grad_norm": 1.316122537624217, + "learning_rate": 2.66390266584005e-07, + "loss": 0.9475, + "step": 68750 + }, + { + "epoch": 5.32837382308497, + "grad_norm": 1.250520010540527, + "learning_rate": 2.664290142591445e-07, + "loss": 0.9231, + "step": 68760 + }, + { + "epoch": 5.329148746561277, + "grad_norm": 1.3713241279132053, + "learning_rate": 2.6646776193428397e-07, + "loss": 0.9527, + "step": 68770 + }, + { + "epoch": 5.329923670037584, + "grad_norm": 1.371964846808839, + "learning_rate": 2.665065096094235e-07, + "loss": 0.9438, + "step": 68780 + }, + { + "epoch": 5.33069859351389, + "grad_norm": 1.3754203269613532, + "learning_rate": 2.6654525728456296e-07, + "loss": 0.9426, + "step": 68790 + }, + { + "epoch": 5.331473516990197, + "grad_norm": 1.3354726269645163, + "learning_rate": 2.665840049597024e-07, + "loss": 0.9316, + "step": 68800 + }, + { + "epoch": 5.332248440466504, + "grad_norm": 1.3230820656416167, + "learning_rate": 2.6662275263484195e-07, + "loss": 0.9549, + "step": 68810 + }, + { + "epoch": 5.333023363942811, + "grad_norm": 1.3813152042966486, + "learning_rate": 2.666615003099814e-07, + "loss": 0.9494, + "step": 68820 + }, + { + "epoch": 5.333798287419118, + "grad_norm": 1.355535782061546, + "learning_rate": 2.6670024798512094e-07, + "loss": 0.9366, + "step": 68830 + }, + { + "epoch": 5.3345732108954245, + "grad_norm": 1.5129556368101658, + "learning_rate": 2.667389956602604e-07, + "loss": 0.9297, + "step": 68840 + }, + { + "epoch": 5.33534813437173, + "grad_norm": 1.3869935409243046, + "learning_rate": 2.6677774333539993e-07, + "loss": 0.9678, + "step": 68850 + }, + { + "epoch": 5.336123057848037, + "grad_norm": 1.3609494398221642, + "learning_rate": 2.668164910105394e-07, + "loss": 0.9289, + "step": 68860 + }, + { + "epoch": 5.336897981324344, + "grad_norm": 1.3989385369464837, + "learning_rate": 2.6685523868567887e-07, + "loss": 0.9512, + "step": 68870 + }, + { + "epoch": 5.337672904800651, + "grad_norm": 1.3924827447084045, + "learning_rate": 2.668939863608184e-07, + "loss": 0.9457, + "step": 68880 + }, + { + "epoch": 5.338447828276958, + "grad_norm": 1.3689151887020212, + "learning_rate": 2.6693273403595786e-07, + "loss": 0.9413, + "step": 68890 + }, + { + "epoch": 5.339222751753264, + "grad_norm": 1.3775162693868608, + "learning_rate": 2.669714817110974e-07, + "loss": 0.9448, + "step": 68900 + }, + { + "epoch": 5.339997675229571, + "grad_norm": 1.326407486484731, + "learning_rate": 2.6701022938623685e-07, + "loss": 0.9465, + "step": 68910 + }, + { + "epoch": 5.340772598705878, + "grad_norm": 1.4332901175251227, + "learning_rate": 2.6704897706137637e-07, + "loss": 0.9532, + "step": 68920 + }, + { + "epoch": 5.341547522182185, + "grad_norm": 1.3752259108809586, + "learning_rate": 2.6708772473651584e-07, + "loss": 0.9399, + "step": 68930 + }, + { + "epoch": 5.342322445658491, + "grad_norm": 1.279383040765434, + "learning_rate": 2.671264724116553e-07, + "loss": 0.9388, + "step": 68940 + }, + { + "epoch": 5.343097369134798, + "grad_norm": 1.359937253375372, + "learning_rate": 2.6716522008679483e-07, + "loss": 0.9696, + "step": 68950 + }, + { + "epoch": 5.343872292611104, + "grad_norm": 1.4130457955563585, + "learning_rate": 2.672039677619343e-07, + "loss": 0.9548, + "step": 68960 + }, + { + "epoch": 5.344647216087411, + "grad_norm": 1.4299024612134559, + "learning_rate": 2.672427154370738e-07, + "loss": 0.9562, + "step": 68970 + }, + { + "epoch": 5.345422139563718, + "grad_norm": 1.3788955968279513, + "learning_rate": 2.672814631122133e-07, + "loss": 0.9558, + "step": 68980 + }, + { + "epoch": 5.346197063040025, + "grad_norm": 1.3389002990778678, + "learning_rate": 2.673202107873528e-07, + "loss": 0.9354, + "step": 68990 + }, + { + "epoch": 5.346971986516332, + "grad_norm": 1.3256612280819629, + "learning_rate": 2.673589584624923e-07, + "loss": 0.9696, + "step": 69000 + }, + { + "epoch": 5.346971986516332, + "eval_loss": 0.9509864449501038, + "eval_runtime": 320.2746, + "eval_samples_per_second": 35.816, + "eval_steps_per_second": 8.955, + "step": 69000 + }, + { + "epoch": 5.3477469099926385, + "grad_norm": 1.4079248695799902, + "learning_rate": 2.6739770613763175e-07, + "loss": 0.9585, + "step": 69010 + }, + { + "epoch": 5.348521833468945, + "grad_norm": 1.2995379902999413, + "learning_rate": 2.674364538127713e-07, + "loss": 0.9461, + "step": 69020 + }, + { + "epoch": 5.349296756945252, + "grad_norm": 1.3221476927840659, + "learning_rate": 2.6747520148791074e-07, + "loss": 0.9438, + "step": 69030 + }, + { + "epoch": 5.350071680421558, + "grad_norm": 1.4009055778269695, + "learning_rate": 2.6751394916305027e-07, + "loss": 0.9953, + "step": 69040 + }, + { + "epoch": 5.350846603897865, + "grad_norm": 1.4786082070774451, + "learning_rate": 2.6755269683818973e-07, + "loss": 0.9457, + "step": 69050 + }, + { + "epoch": 5.351621527374172, + "grad_norm": 1.3624216948082823, + "learning_rate": 2.6759144451332926e-07, + "loss": 0.9415, + "step": 69060 + }, + { + "epoch": 5.352396450850478, + "grad_norm": 1.3797817905374565, + "learning_rate": 2.676301921884687e-07, + "loss": 0.9266, + "step": 69070 + }, + { + "epoch": 5.353171374326785, + "grad_norm": 1.3430495298518406, + "learning_rate": 2.676689398636082e-07, + "loss": 0.9559, + "step": 69080 + }, + { + "epoch": 5.353946297803092, + "grad_norm": 1.323827244503646, + "learning_rate": 2.677076875387477e-07, + "loss": 0.9413, + "step": 69090 + }, + { + "epoch": 5.354721221279399, + "grad_norm": 1.3098286615750372, + "learning_rate": 2.677464352138872e-07, + "loss": 0.9283, + "step": 69100 + }, + { + "epoch": 5.355496144755706, + "grad_norm": 1.3983317571566347, + "learning_rate": 2.677851828890267e-07, + "loss": 0.9531, + "step": 69110 + }, + { + "epoch": 5.3562710682320125, + "grad_norm": 1.3431055721039635, + "learning_rate": 2.678239305641662e-07, + "loss": 0.9305, + "step": 69120 + }, + { + "epoch": 5.357045991708318, + "grad_norm": 1.3296763008486214, + "learning_rate": 2.678626782393057e-07, + "loss": 0.9433, + "step": 69130 + }, + { + "epoch": 5.357820915184625, + "grad_norm": 1.4208959365368132, + "learning_rate": 2.6790142591444517e-07, + "loss": 0.9362, + "step": 69140 + }, + { + "epoch": 5.358595838660932, + "grad_norm": 1.4130128597526517, + "learning_rate": 2.6794017358958464e-07, + "loss": 0.9499, + "step": 69150 + }, + { + "epoch": 5.359370762137239, + "grad_norm": 1.4082927055361805, + "learning_rate": 2.6797892126472416e-07, + "loss": 0.9495, + "step": 69160 + }, + { + "epoch": 5.360145685613546, + "grad_norm": 1.3481264955791372, + "learning_rate": 2.6801766893986363e-07, + "loss": 0.9564, + "step": 69170 + }, + { + "epoch": 5.3609206090898525, + "grad_norm": 1.349880387421848, + "learning_rate": 2.6805641661500315e-07, + "loss": 0.9286, + "step": 69180 + }, + { + "epoch": 5.361695532566159, + "grad_norm": 1.4253137762340704, + "learning_rate": 2.680951642901426e-07, + "loss": 0.9566, + "step": 69190 + }, + { + "epoch": 5.362470456042466, + "grad_norm": 1.4421004517004348, + "learning_rate": 2.6813391196528214e-07, + "loss": 0.9625, + "step": 69200 + }, + { + "epoch": 5.363245379518773, + "grad_norm": 1.4017819881710993, + "learning_rate": 2.681726596404216e-07, + "loss": 0.9516, + "step": 69210 + }, + { + "epoch": 5.36402030299508, + "grad_norm": 1.3767635415342252, + "learning_rate": 2.682114073155611e-07, + "loss": 0.9381, + "step": 69220 + }, + { + "epoch": 5.364795226471386, + "grad_norm": 1.3177325625655552, + "learning_rate": 2.682501549907006e-07, + "loss": 0.9515, + "step": 69230 + }, + { + "epoch": 5.365570149947692, + "grad_norm": 1.40290425792915, + "learning_rate": 2.6828890266584007e-07, + "loss": 0.9415, + "step": 69240 + }, + { + "epoch": 5.366345073423999, + "grad_norm": 1.3297715485830808, + "learning_rate": 2.683276503409796e-07, + "loss": 0.9685, + "step": 69250 + }, + { + "epoch": 5.367119996900306, + "grad_norm": 1.3449327110914993, + "learning_rate": 2.6836639801611906e-07, + "loss": 0.9496, + "step": 69260 + }, + { + "epoch": 5.367894920376613, + "grad_norm": 1.2892690755943845, + "learning_rate": 2.684051456912586e-07, + "loss": 0.9501, + "step": 69270 + }, + { + "epoch": 5.36866984385292, + "grad_norm": 1.298846873732871, + "learning_rate": 2.6844389336639805e-07, + "loss": 0.9149, + "step": 69280 + }, + { + "epoch": 5.3694447673292265, + "grad_norm": 1.4573109921583758, + "learning_rate": 2.684826410415375e-07, + "loss": 0.9315, + "step": 69290 + }, + { + "epoch": 5.370219690805533, + "grad_norm": 1.284258621741672, + "learning_rate": 2.6852138871667704e-07, + "loss": 0.9501, + "step": 69300 + }, + { + "epoch": 5.370994614281839, + "grad_norm": 1.3295003068313953, + "learning_rate": 2.685601363918165e-07, + "loss": 0.9397, + "step": 69310 + }, + { + "epoch": 5.371769537758146, + "grad_norm": 1.4142872432499576, + "learning_rate": 2.6859888406695603e-07, + "loss": 0.9551, + "step": 69320 + }, + { + "epoch": 5.372544461234453, + "grad_norm": 1.4246849631255607, + "learning_rate": 2.686376317420955e-07, + "loss": 0.9489, + "step": 69330 + }, + { + "epoch": 5.37331938471076, + "grad_norm": 1.4142013423291577, + "learning_rate": 2.6867637941723497e-07, + "loss": 0.9489, + "step": 69340 + }, + { + "epoch": 5.3740943081870665, + "grad_norm": 1.440672934036559, + "learning_rate": 2.687151270923745e-07, + "loss": 0.9338, + "step": 69350 + }, + { + "epoch": 5.374869231663373, + "grad_norm": 1.4512421639210031, + "learning_rate": 2.6875387476751396e-07, + "loss": 0.9215, + "step": 69360 + }, + { + "epoch": 5.37564415513968, + "grad_norm": 1.3569137276637822, + "learning_rate": 2.687926224426535e-07, + "loss": 0.9346, + "step": 69370 + }, + { + "epoch": 5.376419078615987, + "grad_norm": 1.390308615122034, + "learning_rate": 2.6883137011779295e-07, + "loss": 0.946, + "step": 69380 + }, + { + "epoch": 5.377194002092294, + "grad_norm": 1.4194759735001135, + "learning_rate": 2.688701177929325e-07, + "loss": 0.9645, + "step": 69390 + }, + { + "epoch": 5.3779689255686005, + "grad_norm": 1.3531055683706081, + "learning_rate": 2.6890886546807194e-07, + "loss": 0.9655, + "step": 69400 + }, + { + "epoch": 5.378743849044906, + "grad_norm": 1.7291543089155321, + "learning_rate": 2.689476131432114e-07, + "loss": 0.9682, + "step": 69410 + }, + { + "epoch": 5.379518772521213, + "grad_norm": 1.4392566020552637, + "learning_rate": 2.6898636081835094e-07, + "loss": 0.9513, + "step": 69420 + }, + { + "epoch": 5.38029369599752, + "grad_norm": 1.3209517385355316, + "learning_rate": 2.690251084934904e-07, + "loss": 0.9601, + "step": 69430 + }, + { + "epoch": 5.381068619473827, + "grad_norm": 1.3645702656645942, + "learning_rate": 2.6906385616862993e-07, + "loss": 0.9517, + "step": 69440 + }, + { + "epoch": 5.381843542950134, + "grad_norm": 1.3232810324414455, + "learning_rate": 2.691026038437694e-07, + "loss": 0.9602, + "step": 69450 + }, + { + "epoch": 5.3826184664264405, + "grad_norm": 1.3963757776739534, + "learning_rate": 2.691413515189089e-07, + "loss": 0.9397, + "step": 69460 + }, + { + "epoch": 5.383393389902747, + "grad_norm": 1.3884226667916357, + "learning_rate": 2.691800991940484e-07, + "loss": 0.9463, + "step": 69470 + }, + { + "epoch": 5.384168313379054, + "grad_norm": 1.3992335007863062, + "learning_rate": 2.6921884686918786e-07, + "loss": 0.9473, + "step": 69480 + }, + { + "epoch": 5.384943236855361, + "grad_norm": 1.4376784737431296, + "learning_rate": 2.692575945443274e-07, + "loss": 0.9508, + "step": 69490 + }, + { + "epoch": 5.385718160331667, + "grad_norm": 1.3632231172661815, + "learning_rate": 2.6929634221946685e-07, + "loss": 0.9381, + "step": 69500 + }, + { + "epoch": 5.385718160331667, + "eval_loss": 0.9503531455993652, + "eval_runtime": 320.1244, + "eval_samples_per_second": 35.833, + "eval_steps_per_second": 8.959, + "step": 69500 + }, + { + "epoch": 5.386493083807974, + "grad_norm": 1.4479023159729487, + "learning_rate": 2.6933508989460637e-07, + "loss": 0.9345, + "step": 69510 + }, + { + "epoch": 5.3872680072842805, + "grad_norm": 1.4030105952784684, + "learning_rate": 2.6937383756974584e-07, + "loss": 0.9483, + "step": 69520 + }, + { + "epoch": 5.388042930760587, + "grad_norm": 1.3519084604585732, + "learning_rate": 2.6941258524488536e-07, + "loss": 0.9228, + "step": 69530 + }, + { + "epoch": 5.388817854236894, + "grad_norm": 1.497255963311084, + "learning_rate": 2.6945133292002483e-07, + "loss": 0.9669, + "step": 69540 + }, + { + "epoch": 5.389592777713201, + "grad_norm": 1.4343508115284638, + "learning_rate": 2.694900805951643e-07, + "loss": 0.9379, + "step": 69550 + }, + { + "epoch": 5.390367701189508, + "grad_norm": 1.341240174812892, + "learning_rate": 2.695288282703038e-07, + "loss": 0.9412, + "step": 69560 + }, + { + "epoch": 5.3911426246658145, + "grad_norm": 1.317821889003307, + "learning_rate": 2.695675759454433e-07, + "loss": 0.9707, + "step": 69570 + }, + { + "epoch": 5.391917548142121, + "grad_norm": 1.3016157313588188, + "learning_rate": 2.696063236205828e-07, + "loss": 0.9399, + "step": 69580 + }, + { + "epoch": 5.392692471618428, + "grad_norm": 1.3736962586078523, + "learning_rate": 2.696450712957223e-07, + "loss": 0.9347, + "step": 69590 + }, + { + "epoch": 5.393467395094734, + "grad_norm": 1.3571214980596487, + "learning_rate": 2.696838189708618e-07, + "loss": 0.9497, + "step": 69600 + }, + { + "epoch": 5.394242318571041, + "grad_norm": 1.384907811848371, + "learning_rate": 2.6972256664600127e-07, + "loss": 0.9402, + "step": 69610 + }, + { + "epoch": 5.395017242047348, + "grad_norm": 1.3237919877920465, + "learning_rate": 2.6976131432114074e-07, + "loss": 0.944, + "step": 69620 + }, + { + "epoch": 5.3957921655236545, + "grad_norm": 1.359633380204749, + "learning_rate": 2.6980006199628026e-07, + "loss": 0.9518, + "step": 69630 + }, + { + "epoch": 5.396567088999961, + "grad_norm": 1.3630620216634102, + "learning_rate": 2.6983880967141973e-07, + "loss": 0.9486, + "step": 69640 + }, + { + "epoch": 5.397342012476268, + "grad_norm": 1.365407996319731, + "learning_rate": 2.6987755734655925e-07, + "loss": 0.9614, + "step": 69650 + }, + { + "epoch": 5.398116935952575, + "grad_norm": 1.3843108786580323, + "learning_rate": 2.699163050216987e-07, + "loss": 0.9382, + "step": 69660 + }, + { + "epoch": 5.398891859428882, + "grad_norm": 1.3773460662391164, + "learning_rate": 2.6995505269683824e-07, + "loss": 0.9335, + "step": 69670 + }, + { + "epoch": 5.399666782905188, + "grad_norm": 1.3380258236490556, + "learning_rate": 2.699938003719777e-07, + "loss": 0.9397, + "step": 69680 + }, + { + "epoch": 5.4004417063814945, + "grad_norm": 1.443952449213518, + "learning_rate": 2.700325480471172e-07, + "loss": 0.9387, + "step": 69690 + }, + { + "epoch": 5.401216629857801, + "grad_norm": 1.3342741058279448, + "learning_rate": 2.700712957222567e-07, + "loss": 0.9645, + "step": 69700 + }, + { + "epoch": 5.401991553334108, + "grad_norm": 1.4131341555263746, + "learning_rate": 2.7011004339739617e-07, + "loss": 0.9501, + "step": 69710 + }, + { + "epoch": 5.402766476810415, + "grad_norm": 1.381389477443203, + "learning_rate": 2.701487910725357e-07, + "loss": 0.9337, + "step": 69720 + }, + { + "epoch": 5.403541400286722, + "grad_norm": 1.3812096162677598, + "learning_rate": 2.7018753874767516e-07, + "loss": 0.925, + "step": 69730 + }, + { + "epoch": 5.4043163237630285, + "grad_norm": 1.3717781045360273, + "learning_rate": 2.702262864228147e-07, + "loss": 0.9425, + "step": 69740 + }, + { + "epoch": 5.405091247239335, + "grad_norm": 1.4116414113201334, + "learning_rate": 2.7026503409795416e-07, + "loss": 0.9375, + "step": 69750 + }, + { + "epoch": 5.405866170715642, + "grad_norm": 1.3108408307267192, + "learning_rate": 2.703037817730936e-07, + "loss": 0.9415, + "step": 69760 + }, + { + "epoch": 5.406641094191949, + "grad_norm": 1.320077970998006, + "learning_rate": 2.7034252944823315e-07, + "loss": 0.957, + "step": 69770 + }, + { + "epoch": 5.407416017668255, + "grad_norm": 1.3391913428886355, + "learning_rate": 2.703812771233726e-07, + "loss": 0.9545, + "step": 69780 + }, + { + "epoch": 5.408190941144562, + "grad_norm": 1.4200001192335843, + "learning_rate": 2.7042002479851214e-07, + "loss": 0.9575, + "step": 69790 + }, + { + "epoch": 5.4089658646208685, + "grad_norm": 1.4195305014808517, + "learning_rate": 2.704587724736516e-07, + "loss": 0.9568, + "step": 69800 + }, + { + "epoch": 5.409740788097175, + "grad_norm": 1.3556292098921274, + "learning_rate": 2.7049752014879113e-07, + "loss": 0.9446, + "step": 69810 + }, + { + "epoch": 5.410515711573482, + "grad_norm": 1.393534456721638, + "learning_rate": 2.705362678239306e-07, + "loss": 0.9441, + "step": 69820 + }, + { + "epoch": 5.411290635049789, + "grad_norm": 1.5359363104320145, + "learning_rate": 2.7057501549907007e-07, + "loss": 0.9438, + "step": 69830 + }, + { + "epoch": 5.412065558526096, + "grad_norm": 1.3746535826164146, + "learning_rate": 2.706137631742096e-07, + "loss": 0.9478, + "step": 69840 + }, + { + "epoch": 5.4128404820024025, + "grad_norm": 1.3552764905936612, + "learning_rate": 2.7065251084934906e-07, + "loss": 0.9423, + "step": 69850 + }, + { + "epoch": 5.413615405478709, + "grad_norm": 1.4039828641474752, + "learning_rate": 2.706912585244886e-07, + "loss": 0.9751, + "step": 69860 + }, + { + "epoch": 5.414390328955015, + "grad_norm": 1.3388994752862673, + "learning_rate": 2.7073000619962805e-07, + "loss": 0.9508, + "step": 69870 + }, + { + "epoch": 5.415165252431322, + "grad_norm": 1.4959279247575954, + "learning_rate": 2.7076875387476757e-07, + "loss": 0.9647, + "step": 69880 + }, + { + "epoch": 5.415940175907629, + "grad_norm": 1.3250258574183937, + "learning_rate": 2.7080750154990704e-07, + "loss": 0.955, + "step": 69890 + }, + { + "epoch": 5.416715099383936, + "grad_norm": 1.3169731162270364, + "learning_rate": 2.708462492250465e-07, + "loss": 0.9368, + "step": 69900 + }, + { + "epoch": 5.4174900228602425, + "grad_norm": 1.376502856341012, + "learning_rate": 2.7088499690018603e-07, + "loss": 0.9618, + "step": 69910 + }, + { + "epoch": 5.418264946336549, + "grad_norm": 1.4237964364751432, + "learning_rate": 2.709237445753255e-07, + "loss": 0.9442, + "step": 69920 + }, + { + "epoch": 5.419039869812856, + "grad_norm": 1.3508067460770221, + "learning_rate": 2.70962492250465e-07, + "loss": 0.9248, + "step": 69930 + }, + { + "epoch": 5.419814793289163, + "grad_norm": 1.34049736042798, + "learning_rate": 2.710012399256045e-07, + "loss": 0.9484, + "step": 69940 + }, + { + "epoch": 5.42058971676547, + "grad_norm": 1.282354428374563, + "learning_rate": 2.71039987600744e-07, + "loss": 0.9267, + "step": 69950 + }, + { + "epoch": 5.4213646402417766, + "grad_norm": 1.4591516246381506, + "learning_rate": 2.710787352758835e-07, + "loss": 0.941, + "step": 69960 + }, + { + "epoch": 5.4221395637180825, + "grad_norm": 1.3736988876404475, + "learning_rate": 2.7111748295102295e-07, + "loss": 0.9532, + "step": 69970 + }, + { + "epoch": 5.422914487194389, + "grad_norm": 1.3402187072746268, + "learning_rate": 2.7115623062616247e-07, + "loss": 0.9687, + "step": 69980 + }, + { + "epoch": 5.423689410670696, + "grad_norm": 1.3098216440531316, + "learning_rate": 2.7119497830130194e-07, + "loss": 0.9646, + "step": 69990 + }, + { + "epoch": 5.424464334147003, + "grad_norm": 1.4346017121245411, + "learning_rate": 2.7123372597644146e-07, + "loss": 0.9565, + "step": 70000 + }, + { + "epoch": 5.424464334147003, + "eval_loss": 0.9497991800308228, + "eval_runtime": 321.2008, + "eval_samples_per_second": 35.713, + "eval_steps_per_second": 8.929, + "step": 70000 + }, + { + "epoch": 5.42523925762331, + "grad_norm": 1.3888954083848288, + "learning_rate": 2.7127247365158093e-07, + "loss": 0.938, + "step": 70010 + }, + { + "epoch": 5.4260141810996165, + "grad_norm": 1.3376701271367388, + "learning_rate": 2.7131122132672045e-07, + "loss": 0.9332, + "step": 70020 + }, + { + "epoch": 5.426789104575923, + "grad_norm": 1.4385667449227306, + "learning_rate": 2.713499690018599e-07, + "loss": 0.9545, + "step": 70030 + }, + { + "epoch": 5.42756402805223, + "grad_norm": 1.4161495801715045, + "learning_rate": 2.713887166769994e-07, + "loss": 0.9588, + "step": 70040 + }, + { + "epoch": 5.428338951528537, + "grad_norm": 1.360237182816128, + "learning_rate": 2.714274643521389e-07, + "loss": 0.9378, + "step": 70050 + }, + { + "epoch": 5.429113875004843, + "grad_norm": 1.3086075744232104, + "learning_rate": 2.714662120272784e-07, + "loss": 0.93, + "step": 70060 + }, + { + "epoch": 5.42988879848115, + "grad_norm": 1.3881479636497143, + "learning_rate": 2.715049597024179e-07, + "loss": 0.9422, + "step": 70070 + }, + { + "epoch": 5.4306637219574565, + "grad_norm": 1.3660186788895, + "learning_rate": 2.715437073775574e-07, + "loss": 0.9741, + "step": 70080 + }, + { + "epoch": 5.431438645433763, + "grad_norm": 1.3550250569001787, + "learning_rate": 2.7158245505269684e-07, + "loss": 0.9392, + "step": 70090 + }, + { + "epoch": 5.43221356891007, + "grad_norm": 1.3379724240286521, + "learning_rate": 2.7162120272783637e-07, + "loss": 0.9501, + "step": 70100 + }, + { + "epoch": 5.432988492386377, + "grad_norm": 1.3920491843657101, + "learning_rate": 2.7165995040297583e-07, + "loss": 0.9196, + "step": 70110 + }, + { + "epoch": 5.433763415862684, + "grad_norm": 1.3996412786980412, + "learning_rate": 2.7169869807811536e-07, + "loss": 0.9548, + "step": 70120 + }, + { + "epoch": 5.4345383393389906, + "grad_norm": 1.3586799727973642, + "learning_rate": 2.717374457532548e-07, + "loss": 0.9464, + "step": 70130 + }, + { + "epoch": 5.435313262815297, + "grad_norm": 1.453597944492521, + "learning_rate": 2.7177619342839435e-07, + "loss": 0.9432, + "step": 70140 + }, + { + "epoch": 5.436088186291604, + "grad_norm": 1.4066082490531957, + "learning_rate": 2.718149411035338e-07, + "loss": 0.9516, + "step": 70150 + }, + { + "epoch": 5.43686310976791, + "grad_norm": 1.3905805976796553, + "learning_rate": 2.718536887786733e-07, + "loss": 0.9546, + "step": 70160 + }, + { + "epoch": 5.437638033244217, + "grad_norm": 1.3138651662241958, + "learning_rate": 2.718924364538128e-07, + "loss": 0.9377, + "step": 70170 + }, + { + "epoch": 5.438412956720524, + "grad_norm": 1.3317885608178668, + "learning_rate": 2.719311841289523e-07, + "loss": 0.9372, + "step": 70180 + }, + { + "epoch": 5.4391878801968305, + "grad_norm": 1.3448856282315862, + "learning_rate": 2.719699318040918e-07, + "loss": 0.9501, + "step": 70190 + }, + { + "epoch": 5.439962803673137, + "grad_norm": 1.3121859444212574, + "learning_rate": 2.7200867947923127e-07, + "loss": 0.9459, + "step": 70200 + }, + { + "epoch": 5.440737727149444, + "grad_norm": 1.3356987362835193, + "learning_rate": 2.720474271543708e-07, + "loss": 0.9527, + "step": 70210 + }, + { + "epoch": 5.441512650625751, + "grad_norm": 1.3180804414646203, + "learning_rate": 2.7208617482951026e-07, + "loss": 0.9466, + "step": 70220 + }, + { + "epoch": 5.442287574102058, + "grad_norm": 1.3419414418216982, + "learning_rate": 2.7212492250464973e-07, + "loss": 0.9408, + "step": 70230 + }, + { + "epoch": 5.443062497578364, + "grad_norm": 1.3892191766221265, + "learning_rate": 2.7216367017978925e-07, + "loss": 0.9602, + "step": 70240 + }, + { + "epoch": 5.4438374210546705, + "grad_norm": 1.4500775880444094, + "learning_rate": 2.722024178549287e-07, + "loss": 0.9584, + "step": 70250 + }, + { + "epoch": 5.444612344530977, + "grad_norm": 1.4393695771154276, + "learning_rate": 2.7224116553006824e-07, + "loss": 0.9539, + "step": 70260 + }, + { + "epoch": 5.445387268007284, + "grad_norm": 1.4680454709997048, + "learning_rate": 2.722799132052077e-07, + "loss": 0.9267, + "step": 70270 + }, + { + "epoch": 5.446162191483591, + "grad_norm": 1.3700549456816715, + "learning_rate": 2.7231866088034723e-07, + "loss": 0.9706, + "step": 70280 + }, + { + "epoch": 5.446937114959898, + "grad_norm": 1.4310252113450226, + "learning_rate": 2.723574085554867e-07, + "loss": 0.9381, + "step": 70290 + }, + { + "epoch": 5.4477120384362046, + "grad_norm": 1.4283138422632962, + "learning_rate": 2.7239615623062617e-07, + "loss": 0.9421, + "step": 70300 + }, + { + "epoch": 5.448486961912511, + "grad_norm": 1.340803238397744, + "learning_rate": 2.724349039057657e-07, + "loss": 0.9446, + "step": 70310 + }, + { + "epoch": 5.449261885388818, + "grad_norm": 1.4078564449649311, + "learning_rate": 2.7247365158090516e-07, + "loss": 0.937, + "step": 70320 + }, + { + "epoch": 5.450036808865125, + "grad_norm": 1.4147898982463956, + "learning_rate": 2.725123992560447e-07, + "loss": 0.9382, + "step": 70330 + }, + { + "epoch": 5.450811732341431, + "grad_norm": 1.4671541890294346, + "learning_rate": 2.7255114693118415e-07, + "loss": 0.972, + "step": 70340 + }, + { + "epoch": 5.451586655817738, + "grad_norm": 1.3353901192106277, + "learning_rate": 2.725898946063237e-07, + "loss": 0.9767, + "step": 70350 + }, + { + "epoch": 5.4523615792940445, + "grad_norm": 1.398581296120329, + "learning_rate": 2.7262864228146314e-07, + "loss": 0.9452, + "step": 70360 + }, + { + "epoch": 5.453136502770351, + "grad_norm": 1.5073504304665721, + "learning_rate": 2.726673899566026e-07, + "loss": 0.9557, + "step": 70370 + }, + { + "epoch": 5.453911426246658, + "grad_norm": 1.3264334197937389, + "learning_rate": 2.7270613763174213e-07, + "loss": 0.9563, + "step": 70380 + }, + { + "epoch": 5.454686349722965, + "grad_norm": 1.241027785665736, + "learning_rate": 2.727448853068816e-07, + "loss": 0.9397, + "step": 70390 + }, + { + "epoch": 5.455461273199272, + "grad_norm": 1.430143549057819, + "learning_rate": 2.727836329820211e-07, + "loss": 0.9598, + "step": 70400 + }, + { + "epoch": 5.456236196675579, + "grad_norm": 1.3670333621006026, + "learning_rate": 2.728223806571606e-07, + "loss": 0.9397, + "step": 70410 + }, + { + "epoch": 5.457011120151885, + "grad_norm": 1.3374456034902957, + "learning_rate": 2.728611283323001e-07, + "loss": 0.9383, + "step": 70420 + }, + { + "epoch": 5.457786043628191, + "grad_norm": 1.3426256225176245, + "learning_rate": 2.728998760074396e-07, + "loss": 0.9435, + "step": 70430 + }, + { + "epoch": 5.458560967104498, + "grad_norm": 1.4216060918659004, + "learning_rate": 2.7293862368257905e-07, + "loss": 0.962, + "step": 70440 + }, + { + "epoch": 5.459335890580805, + "grad_norm": 1.2427135832609786, + "learning_rate": 2.729773713577186e-07, + "loss": 0.9251, + "step": 70450 + }, + { + "epoch": 5.460110814057112, + "grad_norm": 1.3580514565778665, + "learning_rate": 2.7301611903285805e-07, + "loss": 0.9508, + "step": 70460 + }, + { + "epoch": 5.4608857375334185, + "grad_norm": 1.338816114151078, + "learning_rate": 2.7305486670799757e-07, + "loss": 0.9525, + "step": 70470 + }, + { + "epoch": 5.461660661009725, + "grad_norm": 1.317417900948398, + "learning_rate": 2.7309361438313704e-07, + "loss": 0.9469, + "step": 70480 + }, + { + "epoch": 5.462435584486032, + "grad_norm": 1.4411083131876992, + "learning_rate": 2.7313236205827656e-07, + "loss": 0.9569, + "step": 70490 + }, + { + "epoch": 5.463210507962339, + "grad_norm": 1.32856378491937, + "learning_rate": 2.7317110973341603e-07, + "loss": 0.9574, + "step": 70500 + }, + { + "epoch": 5.463210507962339, + "eval_loss": 0.9493421316146851, + "eval_runtime": 320.396, + "eval_samples_per_second": 35.803, + "eval_steps_per_second": 8.951, + "step": 70500 + }, + { + "epoch": 5.463985431438646, + "grad_norm": 1.3614491708347405, + "learning_rate": 2.732098574085555e-07, + "loss": 0.9259, + "step": 70510 + }, + { + "epoch": 5.464760354914953, + "grad_norm": 1.468527392637476, + "learning_rate": 2.73248605083695e-07, + "loss": 0.9644, + "step": 70520 + }, + { + "epoch": 5.4655352783912585, + "grad_norm": 1.3402679051318607, + "learning_rate": 2.732873527588345e-07, + "loss": 0.9355, + "step": 70530 + }, + { + "epoch": 5.466310201867565, + "grad_norm": 1.4193041727092865, + "learning_rate": 2.73326100433974e-07, + "loss": 0.9467, + "step": 70540 + }, + { + "epoch": 5.467085125343872, + "grad_norm": 1.3663851094821151, + "learning_rate": 2.733648481091135e-07, + "loss": 0.9398, + "step": 70550 + }, + { + "epoch": 5.467860048820179, + "grad_norm": 1.4000163999397663, + "learning_rate": 2.73403595784253e-07, + "loss": 0.94, + "step": 70560 + }, + { + "epoch": 5.468634972296486, + "grad_norm": 1.337875187207545, + "learning_rate": 2.7344234345939247e-07, + "loss": 0.9357, + "step": 70570 + }, + { + "epoch": 5.469409895772793, + "grad_norm": 1.3621840305061195, + "learning_rate": 2.7348109113453194e-07, + "loss": 0.9333, + "step": 70580 + }, + { + "epoch": 5.470184819249099, + "grad_norm": 1.4495657877573576, + "learning_rate": 2.7351983880967146e-07, + "loss": 0.9363, + "step": 70590 + }, + { + "epoch": 5.470959742725406, + "grad_norm": 1.3609879715490873, + "learning_rate": 2.7355858648481093e-07, + "loss": 0.9518, + "step": 70600 + }, + { + "epoch": 5.471734666201712, + "grad_norm": 1.2776144790238082, + "learning_rate": 2.7359733415995045e-07, + "loss": 0.9407, + "step": 70610 + }, + { + "epoch": 5.472509589678019, + "grad_norm": 1.5095164447701905, + "learning_rate": 2.736360818350899e-07, + "loss": 0.9606, + "step": 70620 + }, + { + "epoch": 5.473284513154326, + "grad_norm": 1.4700909950475314, + "learning_rate": 2.7367482951022944e-07, + "loss": 0.9666, + "step": 70630 + }, + { + "epoch": 5.4740594366306325, + "grad_norm": 1.385492695097446, + "learning_rate": 2.737135771853689e-07, + "loss": 0.9176, + "step": 70640 + }, + { + "epoch": 5.474834360106939, + "grad_norm": 1.3311276619157824, + "learning_rate": 2.737523248605084e-07, + "loss": 0.9222, + "step": 70650 + }, + { + "epoch": 5.475609283583246, + "grad_norm": 1.3618457212496995, + "learning_rate": 2.737910725356479e-07, + "loss": 0.9349, + "step": 70660 + }, + { + "epoch": 5.476384207059553, + "grad_norm": 1.396433524731578, + "learning_rate": 2.7382982021078737e-07, + "loss": 0.9375, + "step": 70670 + }, + { + "epoch": 5.47715913053586, + "grad_norm": 1.4235582721925317, + "learning_rate": 2.738685678859269e-07, + "loss": 0.9388, + "step": 70680 + }, + { + "epoch": 5.477934054012167, + "grad_norm": 1.3677032766618316, + "learning_rate": 2.7390731556106636e-07, + "loss": 0.9335, + "step": 70690 + }, + { + "epoch": 5.478708977488473, + "grad_norm": 1.4885361817117952, + "learning_rate": 2.739460632362059e-07, + "loss": 0.951, + "step": 70700 + }, + { + "epoch": 5.479483900964779, + "grad_norm": 1.4122804704128105, + "learning_rate": 2.7398481091134535e-07, + "loss": 0.9435, + "step": 70710 + }, + { + "epoch": 5.480258824441086, + "grad_norm": 1.4056144185550778, + "learning_rate": 2.740235585864848e-07, + "loss": 0.9493, + "step": 70720 + }, + { + "epoch": 5.481033747917393, + "grad_norm": 1.2518645409337266, + "learning_rate": 2.7406230626162434e-07, + "loss": 0.9565, + "step": 70730 + }, + { + "epoch": 5.4818086713937, + "grad_norm": 1.3894059797565579, + "learning_rate": 2.741010539367638e-07, + "loss": 0.9501, + "step": 70740 + }, + { + "epoch": 5.482583594870007, + "grad_norm": 1.3891686764221416, + "learning_rate": 2.7413980161190334e-07, + "loss": 0.9934, + "step": 70750 + }, + { + "epoch": 5.483358518346313, + "grad_norm": 1.3750185634939927, + "learning_rate": 2.741785492870428e-07, + "loss": 0.9503, + "step": 70760 + }, + { + "epoch": 5.48413344182262, + "grad_norm": 1.3871960217416288, + "learning_rate": 2.742172969621823e-07, + "loss": 0.9365, + "step": 70770 + }, + { + "epoch": 5.484908365298927, + "grad_norm": 1.3668330149024983, + "learning_rate": 2.742560446373218e-07, + "loss": 0.9705, + "step": 70780 + }, + { + "epoch": 5.485683288775234, + "grad_norm": 1.445930631182142, + "learning_rate": 2.7429479231246126e-07, + "loss": 0.9541, + "step": 70790 + }, + { + "epoch": 5.48645821225154, + "grad_norm": 1.359597194377496, + "learning_rate": 2.743335399876008e-07, + "loss": 0.9516, + "step": 70800 + }, + { + "epoch": 5.4872331357278465, + "grad_norm": 1.3177561035394347, + "learning_rate": 2.7437228766274026e-07, + "loss": 0.9372, + "step": 70810 + }, + { + "epoch": 5.488008059204153, + "grad_norm": 1.3778720190273435, + "learning_rate": 2.744110353378798e-07, + "loss": 0.955, + "step": 70820 + }, + { + "epoch": 5.48878298268046, + "grad_norm": 1.2977689953193374, + "learning_rate": 2.7444978301301925e-07, + "loss": 0.9473, + "step": 70830 + }, + { + "epoch": 5.489557906156767, + "grad_norm": 1.3438371288789477, + "learning_rate": 2.744885306881587e-07, + "loss": 0.9206, + "step": 70840 + }, + { + "epoch": 5.490332829633074, + "grad_norm": 1.444234729077937, + "learning_rate": 2.7452727836329824e-07, + "loss": 0.9399, + "step": 70850 + }, + { + "epoch": 5.491107753109381, + "grad_norm": 1.4006189650517842, + "learning_rate": 2.745660260384377e-07, + "loss": 0.959, + "step": 70860 + }, + { + "epoch": 5.491882676585687, + "grad_norm": 1.3390962800463357, + "learning_rate": 2.7460477371357723e-07, + "loss": 0.9211, + "step": 70870 + }, + { + "epoch": 5.492657600061994, + "grad_norm": 1.3573029916737351, + "learning_rate": 2.746435213887167e-07, + "loss": 0.9478, + "step": 70880 + }, + { + "epoch": 5.493432523538301, + "grad_norm": 1.4129323013065074, + "learning_rate": 2.746822690638562e-07, + "loss": 0.9425, + "step": 70890 + }, + { + "epoch": 5.494207447014607, + "grad_norm": 1.4082915421249174, + "learning_rate": 2.747210167389957e-07, + "loss": 0.9416, + "step": 70900 + }, + { + "epoch": 5.494982370490914, + "grad_norm": 1.3391324825426039, + "learning_rate": 2.7475976441413516e-07, + "loss": 0.9405, + "step": 70910 + }, + { + "epoch": 5.495757293967221, + "grad_norm": 1.4001726765491163, + "learning_rate": 2.747985120892747e-07, + "loss": 0.961, + "step": 70920 + }, + { + "epoch": 5.496532217443527, + "grad_norm": 1.3739678103432529, + "learning_rate": 2.7483725976441415e-07, + "loss": 0.9721, + "step": 70930 + }, + { + "epoch": 5.497307140919834, + "grad_norm": 1.4149448147915245, + "learning_rate": 2.7487600743955367e-07, + "loss": 0.9473, + "step": 70940 + }, + { + "epoch": 5.498082064396141, + "grad_norm": 1.57139392030274, + "learning_rate": 2.7491475511469314e-07, + "loss": 0.9301, + "step": 70950 + }, + { + "epoch": 5.498856987872448, + "grad_norm": 1.4281737415397286, + "learning_rate": 2.7495350278983266e-07, + "loss": 0.9405, + "step": 70960 + }, + { + "epoch": 5.499631911348755, + "grad_norm": 1.3435626030881762, + "learning_rate": 2.7499225046497213e-07, + "loss": 0.9296, + "step": 70970 + }, + { + "epoch": 5.5004068348250605, + "grad_norm": 1.2593212752425922, + "learning_rate": 2.750309981401116e-07, + "loss": 0.9374, + "step": 70980 + }, + { + "epoch": 5.501181758301367, + "grad_norm": 1.34453233556026, + "learning_rate": 2.750697458152511e-07, + "loss": 0.9742, + "step": 70990 + }, + { + "epoch": 5.501956681777674, + "grad_norm": 1.3597553193566352, + "learning_rate": 2.751084934903906e-07, + "loss": 0.9382, + "step": 71000 + }, + { + "epoch": 5.501956681777674, + "eval_loss": 0.9487263560295105, + "eval_runtime": 319.9948, + "eval_samples_per_second": 35.847, + "eval_steps_per_second": 8.963, + "step": 71000 + }, + { + "epoch": 5.502731605253981, + "grad_norm": 1.3163792917407504, + "learning_rate": 2.751472411655301e-07, + "loss": 0.9484, + "step": 71010 + }, + { + "epoch": 5.503506528730288, + "grad_norm": 1.3817661802318837, + "learning_rate": 2.751859888406696e-07, + "loss": 0.9643, + "step": 71020 + }, + { + "epoch": 5.504281452206595, + "grad_norm": 1.3264462137406248, + "learning_rate": 2.752247365158091e-07, + "loss": 0.9427, + "step": 71030 + }, + { + "epoch": 5.505056375682901, + "grad_norm": 1.326287653438055, + "learning_rate": 2.7526348419094857e-07, + "loss": 0.945, + "step": 71040 + }, + { + "epoch": 5.505831299159208, + "grad_norm": 1.376391532297046, + "learning_rate": 2.7530223186608804e-07, + "loss": 0.9377, + "step": 71050 + }, + { + "epoch": 5.506606222635515, + "grad_norm": 1.3835876950484727, + "learning_rate": 2.7534097954122756e-07, + "loss": 0.9522, + "step": 71060 + }, + { + "epoch": 5.507381146111822, + "grad_norm": 1.3685580184741029, + "learning_rate": 2.7537972721636703e-07, + "loss": 0.9576, + "step": 71070 + }, + { + "epoch": 5.508156069588129, + "grad_norm": 1.407741144110289, + "learning_rate": 2.7541847489150655e-07, + "loss": 0.9624, + "step": 71080 + }, + { + "epoch": 5.508930993064435, + "grad_norm": 1.3920118273971986, + "learning_rate": 2.75457222566646e-07, + "loss": 0.9596, + "step": 71090 + }, + { + "epoch": 5.509705916540741, + "grad_norm": 1.3518767332919759, + "learning_rate": 2.7549597024178555e-07, + "loss": 0.9453, + "step": 71100 + }, + { + "epoch": 5.510480840017048, + "grad_norm": 1.3602253139222837, + "learning_rate": 2.75534717916925e-07, + "loss": 0.9703, + "step": 71110 + }, + { + "epoch": 5.511255763493355, + "grad_norm": 1.3286992292481108, + "learning_rate": 2.755734655920645e-07, + "loss": 0.9365, + "step": 71120 + }, + { + "epoch": 5.512030686969662, + "grad_norm": 1.373900068781218, + "learning_rate": 2.75612213267204e-07, + "loss": 0.9523, + "step": 71130 + }, + { + "epoch": 5.512805610445969, + "grad_norm": 1.454324577306388, + "learning_rate": 2.756509609423435e-07, + "loss": 0.9434, + "step": 71140 + }, + { + "epoch": 5.513580533922275, + "grad_norm": 1.3302242910954498, + "learning_rate": 2.75689708617483e-07, + "loss": 0.9262, + "step": 71150 + }, + { + "epoch": 5.514355457398582, + "grad_norm": 1.3049715053562814, + "learning_rate": 2.7572845629262247e-07, + "loss": 0.9484, + "step": 71160 + }, + { + "epoch": 5.515130380874888, + "grad_norm": 1.315315319900584, + "learning_rate": 2.75767203967762e-07, + "loss": 0.9549, + "step": 71170 + }, + { + "epoch": 5.515905304351195, + "grad_norm": 1.3431826419454738, + "learning_rate": 2.7580595164290146e-07, + "loss": 0.9412, + "step": 71180 + }, + { + "epoch": 5.516680227827502, + "grad_norm": 1.406748806674564, + "learning_rate": 2.758446993180409e-07, + "loss": 0.9226, + "step": 71190 + }, + { + "epoch": 5.517455151303809, + "grad_norm": 1.3510181230192284, + "learning_rate": 2.7588344699318045e-07, + "loss": 0.9627, + "step": 71200 + }, + { + "epoch": 5.518230074780115, + "grad_norm": 1.3053570233441958, + "learning_rate": 2.759221946683199e-07, + "loss": 0.9293, + "step": 71210 + }, + { + "epoch": 5.519004998256422, + "grad_norm": 1.3728492813402695, + "learning_rate": 2.7596094234345944e-07, + "loss": 0.9524, + "step": 71220 + }, + { + "epoch": 5.519779921732729, + "grad_norm": 1.400349068020888, + "learning_rate": 2.759996900185989e-07, + "loss": 0.9533, + "step": 71230 + }, + { + "epoch": 5.520554845209036, + "grad_norm": 1.3895037222587756, + "learning_rate": 2.7603843769373843e-07, + "loss": 0.9435, + "step": 71240 + }, + { + "epoch": 5.521329768685343, + "grad_norm": 1.4564229025903441, + "learning_rate": 2.760771853688779e-07, + "loss": 0.9475, + "step": 71250 + }, + { + "epoch": 5.5221046921616495, + "grad_norm": 1.4463110831701878, + "learning_rate": 2.7611593304401737e-07, + "loss": 0.9655, + "step": 71260 + }, + { + "epoch": 5.522879615637955, + "grad_norm": 1.3706363768277143, + "learning_rate": 2.761546807191569e-07, + "loss": 0.9353, + "step": 71270 + }, + { + "epoch": 5.523654539114262, + "grad_norm": 1.4251679752924127, + "learning_rate": 2.7619342839429636e-07, + "loss": 0.9294, + "step": 71280 + }, + { + "epoch": 5.524429462590569, + "grad_norm": 1.425797642266892, + "learning_rate": 2.762321760694359e-07, + "loss": 0.9487, + "step": 71290 + }, + { + "epoch": 5.525204386066876, + "grad_norm": 1.400652927764556, + "learning_rate": 2.7627092374457535e-07, + "loss": 0.9303, + "step": 71300 + }, + { + "epoch": 5.525979309543183, + "grad_norm": 1.3317882705131185, + "learning_rate": 2.7630967141971487e-07, + "loss": 0.9608, + "step": 71310 + }, + { + "epoch": 5.526754233019489, + "grad_norm": 1.4332638366029846, + "learning_rate": 2.7634841909485434e-07, + "loss": 0.9577, + "step": 71320 + }, + { + "epoch": 5.527529156495796, + "grad_norm": 1.3084802596051464, + "learning_rate": 2.763871667699938e-07, + "loss": 0.9488, + "step": 71330 + }, + { + "epoch": 5.528304079972103, + "grad_norm": 1.3944593993343712, + "learning_rate": 2.7642591444513333e-07, + "loss": 0.9526, + "step": 71340 + }, + { + "epoch": 5.52907900344841, + "grad_norm": 1.368910008408681, + "learning_rate": 2.764646621202728e-07, + "loss": 0.9215, + "step": 71350 + }, + { + "epoch": 5.529853926924716, + "grad_norm": 1.3645362948474173, + "learning_rate": 2.765034097954123e-07, + "loss": 0.9329, + "step": 71360 + }, + { + "epoch": 5.530628850401023, + "grad_norm": 1.375278365775517, + "learning_rate": 2.765421574705518e-07, + "loss": 0.939, + "step": 71370 + }, + { + "epoch": 5.531403773877329, + "grad_norm": 1.2830688033955855, + "learning_rate": 2.765809051456913e-07, + "loss": 0.9483, + "step": 71380 + }, + { + "epoch": 5.532178697353636, + "grad_norm": 1.3821293327615496, + "learning_rate": 2.766196528208308e-07, + "loss": 0.9331, + "step": 71390 + }, + { + "epoch": 5.532953620829943, + "grad_norm": 1.4014439084351051, + "learning_rate": 2.7665840049597025e-07, + "loss": 0.9543, + "step": 71400 + }, + { + "epoch": 5.53372854430625, + "grad_norm": 1.4949246849211464, + "learning_rate": 2.766971481711098e-07, + "loss": 0.9402, + "step": 71410 + }, + { + "epoch": 5.534503467782557, + "grad_norm": 1.3689822567152177, + "learning_rate": 2.7673589584624924e-07, + "loss": 0.9321, + "step": 71420 + }, + { + "epoch": 5.5352783912588635, + "grad_norm": 1.4325501779357654, + "learning_rate": 2.7677464352138877e-07, + "loss": 0.9457, + "step": 71430 + }, + { + "epoch": 5.53605331473517, + "grad_norm": 1.3540893683229798, + "learning_rate": 2.7681339119652823e-07, + "loss": 0.9275, + "step": 71440 + }, + { + "epoch": 5.536828238211477, + "grad_norm": 1.442793046851756, + "learning_rate": 2.7685213887166776e-07, + "loss": 0.9862, + "step": 71450 + }, + { + "epoch": 5.537603161687783, + "grad_norm": 1.3232727716609831, + "learning_rate": 2.768908865468072e-07, + "loss": 0.9361, + "step": 71460 + }, + { + "epoch": 5.53837808516409, + "grad_norm": 1.4582230072262488, + "learning_rate": 2.769296342219467e-07, + "loss": 0.9405, + "step": 71470 + }, + { + "epoch": 5.539153008640397, + "grad_norm": 1.4078828095214686, + "learning_rate": 2.769683818970862e-07, + "loss": 0.9352, + "step": 71480 + }, + { + "epoch": 5.539927932116703, + "grad_norm": 1.370168327506737, + "learning_rate": 2.770071295722257e-07, + "loss": 0.9361, + "step": 71490 + }, + { + "epoch": 5.54070285559301, + "grad_norm": 1.366174788365024, + "learning_rate": 2.770458772473652e-07, + "loss": 0.9374, + "step": 71500 + }, + { + "epoch": 5.54070285559301, + "eval_loss": 0.9480811953544617, + "eval_runtime": 319.9672, + "eval_samples_per_second": 35.851, + "eval_steps_per_second": 8.963, + "step": 71500 + }, + { + "epoch": 5.541477779069317, + "grad_norm": 1.3377923298397532, + "learning_rate": 2.770846249225047e-07, + "loss": 0.9061, + "step": 71510 + }, + { + "epoch": 5.542252702545624, + "grad_norm": 1.3295290189101308, + "learning_rate": 2.7712337259764415e-07, + "loss": 0.9443, + "step": 71520 + }, + { + "epoch": 5.543027626021931, + "grad_norm": 1.3434218778043878, + "learning_rate": 2.7716212027278367e-07, + "loss": 0.9397, + "step": 71530 + }, + { + "epoch": 5.543802549498237, + "grad_norm": 1.3657338050407823, + "learning_rate": 2.7720086794792314e-07, + "loss": 0.9455, + "step": 71540 + }, + { + "epoch": 5.544577472974543, + "grad_norm": 1.313238155837475, + "learning_rate": 2.7723961562306266e-07, + "loss": 0.9303, + "step": 71550 + }, + { + "epoch": 5.54535239645085, + "grad_norm": 1.3385621918354478, + "learning_rate": 2.7727836329820213e-07, + "loss": 0.927, + "step": 71560 + }, + { + "epoch": 5.546127319927157, + "grad_norm": 1.4391264243904736, + "learning_rate": 2.7731711097334165e-07, + "loss": 0.9475, + "step": 71570 + }, + { + "epoch": 5.546902243403464, + "grad_norm": 1.3869998017226688, + "learning_rate": 2.773558586484811e-07, + "loss": 0.9517, + "step": 71580 + }, + { + "epoch": 5.547677166879771, + "grad_norm": 1.3185998474395904, + "learning_rate": 2.773946063236206e-07, + "loss": 0.9433, + "step": 71590 + }, + { + "epoch": 5.5484520903560774, + "grad_norm": 1.3580389832556432, + "learning_rate": 2.774333539987601e-07, + "loss": 0.9542, + "step": 71600 + }, + { + "epoch": 5.549227013832384, + "grad_norm": 1.3856654156450994, + "learning_rate": 2.774721016738996e-07, + "loss": 0.9287, + "step": 71610 + }, + { + "epoch": 5.550001937308691, + "grad_norm": 1.4398439290877023, + "learning_rate": 2.775108493490391e-07, + "loss": 0.916, + "step": 71620 + }, + { + "epoch": 5.550776860784998, + "grad_norm": 1.4567355619834916, + "learning_rate": 2.7754959702417857e-07, + "loss": 0.9607, + "step": 71630 + }, + { + "epoch": 5.551551784261305, + "grad_norm": 1.316811684441114, + "learning_rate": 2.775883446993181e-07, + "loss": 0.9618, + "step": 71640 + }, + { + "epoch": 5.552326707737611, + "grad_norm": 1.3481697877064729, + "learning_rate": 2.7762709237445756e-07, + "loss": 0.9203, + "step": 71650 + }, + { + "epoch": 5.553101631213917, + "grad_norm": 1.4318121599260911, + "learning_rate": 2.7766584004959703e-07, + "loss": 0.9438, + "step": 71660 + }, + { + "epoch": 5.553876554690224, + "grad_norm": 1.3905638871768904, + "learning_rate": 2.7770458772473655e-07, + "loss": 0.942, + "step": 71670 + }, + { + "epoch": 5.554651478166531, + "grad_norm": 1.3566514477829004, + "learning_rate": 2.77743335399876e-07, + "loss": 0.9322, + "step": 71680 + }, + { + "epoch": 5.555426401642838, + "grad_norm": 1.4239353460748487, + "learning_rate": 2.7778208307501554e-07, + "loss": 0.9443, + "step": 71690 + }, + { + "epoch": 5.556201325119145, + "grad_norm": 1.3461076332790687, + "learning_rate": 2.77820830750155e-07, + "loss": 0.9493, + "step": 71700 + }, + { + "epoch": 5.5569762485954515, + "grad_norm": 1.4800074379187633, + "learning_rate": 2.7785957842529453e-07, + "loss": 0.9695, + "step": 71710 + }, + { + "epoch": 5.557751172071758, + "grad_norm": 1.3928864235333285, + "learning_rate": 2.77898326100434e-07, + "loss": 0.9381, + "step": 71720 + }, + { + "epoch": 5.558526095548064, + "grad_norm": 1.417758604916665, + "learning_rate": 2.7793707377557347e-07, + "loss": 0.9628, + "step": 71730 + }, + { + "epoch": 5.559301019024371, + "grad_norm": 1.3770809987553927, + "learning_rate": 2.77975821450713e-07, + "loss": 0.9446, + "step": 71740 + }, + { + "epoch": 5.560075942500678, + "grad_norm": 1.4038508585717566, + "learning_rate": 2.7801456912585246e-07, + "loss": 0.9502, + "step": 71750 + }, + { + "epoch": 5.560850865976985, + "grad_norm": 1.4388264870067093, + "learning_rate": 2.78053316800992e-07, + "loss": 0.9429, + "step": 71760 + }, + { + "epoch": 5.5616257894532914, + "grad_norm": 1.297805045265354, + "learning_rate": 2.7809206447613145e-07, + "loss": 0.951, + "step": 71770 + }, + { + "epoch": 5.562400712929598, + "grad_norm": 1.3479724035140004, + "learning_rate": 2.78130812151271e-07, + "loss": 0.9427, + "step": 71780 + }, + { + "epoch": 5.563175636405905, + "grad_norm": 1.3803817561997844, + "learning_rate": 2.7816955982641044e-07, + "loss": 0.9841, + "step": 71790 + }, + { + "epoch": 5.563950559882212, + "grad_norm": 1.360656475123106, + "learning_rate": 2.782083075015499e-07, + "loss": 0.9471, + "step": 71800 + }, + { + "epoch": 5.564725483358519, + "grad_norm": 1.3037659310976137, + "learning_rate": 2.7824705517668944e-07, + "loss": 0.9387, + "step": 71810 + }, + { + "epoch": 5.5655004068348255, + "grad_norm": 1.454750982983289, + "learning_rate": 2.782858028518289e-07, + "loss": 0.9566, + "step": 71820 + }, + { + "epoch": 5.566275330311131, + "grad_norm": 1.4280535590737355, + "learning_rate": 2.7832455052696843e-07, + "loss": 0.9267, + "step": 71830 + }, + { + "epoch": 5.567050253787438, + "grad_norm": 1.4344548177733785, + "learning_rate": 2.783632982021079e-07, + "loss": 0.9711, + "step": 71840 + }, + { + "epoch": 5.567825177263745, + "grad_norm": 1.3142047496758016, + "learning_rate": 2.784020458772474e-07, + "loss": 0.964, + "step": 71850 + }, + { + "epoch": 5.568600100740052, + "grad_norm": 1.403543581490592, + "learning_rate": 2.784407935523869e-07, + "loss": 0.9181, + "step": 71860 + }, + { + "epoch": 5.569375024216359, + "grad_norm": 1.3205947180992768, + "learning_rate": 2.7847954122752636e-07, + "loss": 0.9457, + "step": 71870 + }, + { + "epoch": 5.5701499476926655, + "grad_norm": 1.3407488372978043, + "learning_rate": 2.785182889026659e-07, + "loss": 0.9397, + "step": 71880 + }, + { + "epoch": 5.570924871168972, + "grad_norm": 1.417534974493108, + "learning_rate": 2.7855703657780535e-07, + "loss": 0.9478, + "step": 71890 + }, + { + "epoch": 5.571699794645279, + "grad_norm": 1.3867938452471105, + "learning_rate": 2.7859578425294487e-07, + "loss": 0.962, + "step": 71900 + }, + { + "epoch": 5.572474718121585, + "grad_norm": 1.3699635081520174, + "learning_rate": 2.7863453192808434e-07, + "loss": 0.9302, + "step": 71910 + }, + { + "epoch": 5.573249641597892, + "grad_norm": 1.371524555660893, + "learning_rate": 2.7867327960322386e-07, + "loss": 0.9533, + "step": 71920 + }, + { + "epoch": 5.574024565074199, + "grad_norm": 1.3927787140680445, + "learning_rate": 2.7871202727836333e-07, + "loss": 0.9377, + "step": 71930 + }, + { + "epoch": 5.5747994885505054, + "grad_norm": 1.4143512373052904, + "learning_rate": 2.787507749535028e-07, + "loss": 0.9397, + "step": 71940 + }, + { + "epoch": 5.575574412026812, + "grad_norm": 1.3985609359559463, + "learning_rate": 2.787895226286423e-07, + "loss": 0.9504, + "step": 71950 + }, + { + "epoch": 5.576349335503119, + "grad_norm": 1.3107609391085289, + "learning_rate": 2.788282703037818e-07, + "loss": 0.9471, + "step": 71960 + }, + { + "epoch": 5.577124258979426, + "grad_norm": 1.3638748412427961, + "learning_rate": 2.788670179789213e-07, + "loss": 0.9384, + "step": 71970 + }, + { + "epoch": 5.577899182455733, + "grad_norm": 1.3860773494726024, + "learning_rate": 2.789057656540608e-07, + "loss": 0.949, + "step": 71980 + }, + { + "epoch": 5.5786741059320395, + "grad_norm": 1.3653605924435386, + "learning_rate": 2.789445133292003e-07, + "loss": 0.937, + "step": 71990 + }, + { + "epoch": 5.579449029408346, + "grad_norm": 1.3677996746950438, + "learning_rate": 2.7898326100433977e-07, + "loss": 0.957, + "step": 72000 + }, + { + "epoch": 5.579449029408346, + "eval_loss": 0.9476259350776672, + "eval_runtime": 319.5033, + "eval_samples_per_second": 35.903, + "eval_steps_per_second": 8.976, + "step": 72000 + }, + { + "epoch": 5.580223952884653, + "grad_norm": 1.3980446331742302, + "learning_rate": 2.7902200867947924e-07, + "loss": 0.9347, + "step": 72010 + }, + { + "epoch": 5.580998876360959, + "grad_norm": 1.4576066659604106, + "learning_rate": 2.7906075635461876e-07, + "loss": 0.941, + "step": 72020 + }, + { + "epoch": 5.581773799837266, + "grad_norm": 1.305714288991595, + "learning_rate": 2.7909950402975823e-07, + "loss": 0.9527, + "step": 72030 + }, + { + "epoch": 5.582548723313573, + "grad_norm": 1.4276697992680711, + "learning_rate": 2.7913825170489775e-07, + "loss": 0.9325, + "step": 72040 + }, + { + "epoch": 5.5833236467898795, + "grad_norm": 1.296603908698595, + "learning_rate": 2.791769993800372e-07, + "loss": 0.9399, + "step": 72050 + }, + { + "epoch": 5.584098570266186, + "grad_norm": 1.3883152570499258, + "learning_rate": 2.7921574705517674e-07, + "loss": 0.9511, + "step": 72060 + }, + { + "epoch": 5.584873493742493, + "grad_norm": 1.4394073835346708, + "learning_rate": 2.792544947303162e-07, + "loss": 0.9588, + "step": 72070 + }, + { + "epoch": 5.5856484172188, + "grad_norm": 1.4168169048020784, + "learning_rate": 2.792932424054557e-07, + "loss": 0.9445, + "step": 72080 + }, + { + "epoch": 5.586423340695107, + "grad_norm": 1.3001120228322038, + "learning_rate": 2.793319900805952e-07, + "loss": 0.9505, + "step": 72090 + }, + { + "epoch": 5.587198264171413, + "grad_norm": 1.3127158457708987, + "learning_rate": 2.7937073775573467e-07, + "loss": 0.9464, + "step": 72100 + }, + { + "epoch": 5.587973187647719, + "grad_norm": 1.4199737663680478, + "learning_rate": 2.794094854308742e-07, + "loss": 0.9382, + "step": 72110 + }, + { + "epoch": 5.588748111124026, + "grad_norm": 1.3948920849462558, + "learning_rate": 2.7944823310601366e-07, + "loss": 0.9457, + "step": 72120 + }, + { + "epoch": 5.589523034600333, + "grad_norm": 1.2769812239197684, + "learning_rate": 2.794869807811532e-07, + "loss": 0.9394, + "step": 72130 + }, + { + "epoch": 5.59029795807664, + "grad_norm": 1.3905211006904656, + "learning_rate": 2.7952572845629266e-07, + "loss": 0.9389, + "step": 72140 + }, + { + "epoch": 5.591072881552947, + "grad_norm": 1.381339932404278, + "learning_rate": 2.795644761314321e-07, + "loss": 0.9346, + "step": 72150 + }, + { + "epoch": 5.5918478050292535, + "grad_norm": 1.3834467475441934, + "learning_rate": 2.7960322380657165e-07, + "loss": 0.9504, + "step": 72160 + }, + { + "epoch": 5.59262272850556, + "grad_norm": 1.3585860870256314, + "learning_rate": 2.796419714817111e-07, + "loss": 0.9509, + "step": 72170 + }, + { + "epoch": 5.593397651981867, + "grad_norm": 1.3190631291148471, + "learning_rate": 2.7968071915685064e-07, + "loss": 0.9417, + "step": 72180 + }, + { + "epoch": 5.594172575458174, + "grad_norm": 1.3833762885592562, + "learning_rate": 2.797194668319901e-07, + "loss": 0.9263, + "step": 72190 + }, + { + "epoch": 5.59494749893448, + "grad_norm": 1.3425063964520079, + "learning_rate": 2.7975821450712963e-07, + "loss": 0.9406, + "step": 72200 + }, + { + "epoch": 5.595722422410787, + "grad_norm": 1.3227965430839048, + "learning_rate": 2.797969621822691e-07, + "loss": 0.9342, + "step": 72210 + }, + { + "epoch": 5.5964973458870935, + "grad_norm": 1.3279902907761627, + "learning_rate": 2.7983570985740857e-07, + "loss": 0.9359, + "step": 72220 + }, + { + "epoch": 5.5972722693634, + "grad_norm": 1.378441091835342, + "learning_rate": 2.798744575325481e-07, + "loss": 0.9414, + "step": 72230 + }, + { + "epoch": 5.598047192839707, + "grad_norm": 1.365576795385847, + "learning_rate": 2.7991320520768756e-07, + "loss": 0.9652, + "step": 72240 + }, + { + "epoch": 5.598822116316014, + "grad_norm": 1.2848556690230881, + "learning_rate": 2.799519528828271e-07, + "loss": 0.9575, + "step": 72250 + }, + { + "epoch": 5.599597039792321, + "grad_norm": 1.3826863267715979, + "learning_rate": 2.7999070055796655e-07, + "loss": 0.9378, + "step": 72260 + }, + { + "epoch": 5.6003719632686275, + "grad_norm": 1.318420141700875, + "learning_rate": 2.80029448233106e-07, + "loss": 0.9577, + "step": 72270 + }, + { + "epoch": 5.601146886744933, + "grad_norm": 1.3337455195176608, + "learning_rate": 2.8006819590824554e-07, + "loss": 0.9332, + "step": 72280 + }, + { + "epoch": 5.60192181022124, + "grad_norm": 1.3590434953480792, + "learning_rate": 2.80106943583385e-07, + "loss": 0.9566, + "step": 72290 + }, + { + "epoch": 5.602696733697547, + "grad_norm": 1.377950731385284, + "learning_rate": 2.8014569125852453e-07, + "loss": 0.9419, + "step": 72300 + }, + { + "epoch": 5.603471657173854, + "grad_norm": 1.3672234759092463, + "learning_rate": 2.80184438933664e-07, + "loss": 0.9427, + "step": 72310 + }, + { + "epoch": 5.604246580650161, + "grad_norm": 1.320401539217909, + "learning_rate": 2.802231866088035e-07, + "loss": 0.9402, + "step": 72320 + }, + { + "epoch": 5.6050215041264675, + "grad_norm": 1.4428112385840917, + "learning_rate": 2.80261934283943e-07, + "loss": 0.952, + "step": 72330 + }, + { + "epoch": 5.605796427602774, + "grad_norm": 1.3946755612134765, + "learning_rate": 2.8030068195908246e-07, + "loss": 0.9302, + "step": 72340 + }, + { + "epoch": 5.606571351079081, + "grad_norm": 1.3660015605664155, + "learning_rate": 2.80339429634222e-07, + "loss": 0.9524, + "step": 72350 + }, + { + "epoch": 5.607346274555388, + "grad_norm": 1.3681488426591484, + "learning_rate": 2.8037817730936145e-07, + "loss": 0.9292, + "step": 72360 + }, + { + "epoch": 5.608121198031695, + "grad_norm": 1.3890164251641655, + "learning_rate": 2.8041692498450097e-07, + "loss": 0.9524, + "step": 72370 + }, + { + "epoch": 5.6088961215080015, + "grad_norm": 1.4029501919540415, + "learning_rate": 2.8045567265964044e-07, + "loss": 0.9544, + "step": 72380 + }, + { + "epoch": 5.6096710449843075, + "grad_norm": 1.3602728337945078, + "learning_rate": 2.8049442033477996e-07, + "loss": 0.9606, + "step": 72390 + }, + { + "epoch": 5.610445968460614, + "grad_norm": 1.3758698601187271, + "learning_rate": 2.8053316800991943e-07, + "loss": 0.9491, + "step": 72400 + }, + { + "epoch": 5.611220891936921, + "grad_norm": 1.3334930795840705, + "learning_rate": 2.805719156850589e-07, + "loss": 0.9321, + "step": 72410 + }, + { + "epoch": 5.611995815413228, + "grad_norm": 1.4156669508950381, + "learning_rate": 2.806106633601984e-07, + "loss": 0.927, + "step": 72420 + }, + { + "epoch": 5.612770738889535, + "grad_norm": 1.3685152002080938, + "learning_rate": 2.806494110353379e-07, + "loss": 0.9702, + "step": 72430 + }, + { + "epoch": 5.6135456623658415, + "grad_norm": 1.3708940357563966, + "learning_rate": 2.806881587104774e-07, + "loss": 0.9334, + "step": 72440 + }, + { + "epoch": 5.614320585842148, + "grad_norm": 1.3170141583789357, + "learning_rate": 2.807269063856169e-07, + "loss": 0.9496, + "step": 72450 + }, + { + "epoch": 5.615095509318455, + "grad_norm": 1.4343630573623234, + "learning_rate": 2.807656540607564e-07, + "loss": 0.9562, + "step": 72460 + }, + { + "epoch": 5.615870432794761, + "grad_norm": 1.3496751900605966, + "learning_rate": 2.808044017358959e-07, + "loss": 0.9342, + "step": 72470 + }, + { + "epoch": 5.616645356271068, + "grad_norm": 1.3677622976926243, + "learning_rate": 2.8084314941103534e-07, + "loss": 0.9378, + "step": 72480 + }, + { + "epoch": 5.617420279747375, + "grad_norm": 1.3967263987859753, + "learning_rate": 2.8088189708617487e-07, + "loss": 0.9658, + "step": 72490 + }, + { + "epoch": 5.6181952032236815, + "grad_norm": 1.384222605464729, + "learning_rate": 2.8092064476131433e-07, + "loss": 0.9281, + "step": 72500 + }, + { + "epoch": 5.6181952032236815, + "eval_loss": 0.9470317363739014, + "eval_runtime": 319.4787, + "eval_samples_per_second": 35.905, + "eval_steps_per_second": 8.977, + "step": 72500 + }, + { + "epoch": 5.618970126699988, + "grad_norm": 1.3611143482717096, + "learning_rate": 2.8095939243645386e-07, + "loss": 0.9658, + "step": 72510 + }, + { + "epoch": 5.619745050176295, + "grad_norm": 1.411866035233656, + "learning_rate": 2.809981401115933e-07, + "loss": 0.9571, + "step": 72520 + }, + { + "epoch": 5.620519973652602, + "grad_norm": 1.357155629799515, + "learning_rate": 2.8103688778673285e-07, + "loss": 0.9398, + "step": 72530 + }, + { + "epoch": 5.621294897128909, + "grad_norm": 1.47722105887275, + "learning_rate": 2.810756354618723e-07, + "loss": 0.951, + "step": 72540 + }, + { + "epoch": 5.6220698206052155, + "grad_norm": 1.4308314007975877, + "learning_rate": 2.811143831370118e-07, + "loss": 0.929, + "step": 72550 + }, + { + "epoch": 5.622844744081522, + "grad_norm": 1.4627750272592845, + "learning_rate": 2.811531308121513e-07, + "loss": 0.9405, + "step": 72560 + }, + { + "epoch": 5.623619667557829, + "grad_norm": 1.3853926100056722, + "learning_rate": 2.811918784872908e-07, + "loss": 0.9456, + "step": 72570 + }, + { + "epoch": 5.624394591034135, + "grad_norm": 1.4262573519162225, + "learning_rate": 2.812306261624303e-07, + "loss": 0.9569, + "step": 72580 + }, + { + "epoch": 5.625169514510442, + "grad_norm": 1.4181496223149563, + "learning_rate": 2.8126937383756977e-07, + "loss": 0.9666, + "step": 72590 + }, + { + "epoch": 5.625944437986749, + "grad_norm": 1.3956157881231797, + "learning_rate": 2.813081215127093e-07, + "loss": 0.9532, + "step": 72600 + }, + { + "epoch": 5.6267193614630555, + "grad_norm": 1.4888953931776368, + "learning_rate": 2.8134686918784876e-07, + "loss": 0.9386, + "step": 72610 + }, + { + "epoch": 5.627494284939362, + "grad_norm": 1.3160747471118377, + "learning_rate": 2.8138561686298823e-07, + "loss": 0.9411, + "step": 72620 + }, + { + "epoch": 5.628269208415669, + "grad_norm": 1.4485319461781998, + "learning_rate": 2.8142436453812775e-07, + "loss": 0.9402, + "step": 72630 + }, + { + "epoch": 5.629044131891976, + "grad_norm": 1.489456038055572, + "learning_rate": 2.814631122132672e-07, + "loss": 0.9689, + "step": 72640 + }, + { + "epoch": 5.629819055368283, + "grad_norm": 1.340928798609577, + "learning_rate": 2.8150185988840674e-07, + "loss": 0.9305, + "step": 72650 + }, + { + "epoch": 5.630593978844589, + "grad_norm": 1.3985919430864968, + "learning_rate": 2.815406075635462e-07, + "loss": 0.9389, + "step": 72660 + }, + { + "epoch": 5.6313689023208955, + "grad_norm": 1.3784886129855558, + "learning_rate": 2.8157935523868573e-07, + "loss": 0.932, + "step": 72670 + }, + { + "epoch": 5.632143825797202, + "grad_norm": 1.416447756818304, + "learning_rate": 2.816181029138252e-07, + "loss": 0.9644, + "step": 72680 + }, + { + "epoch": 5.632918749273509, + "grad_norm": 1.5232223542863574, + "learning_rate": 2.8165685058896467e-07, + "loss": 0.9524, + "step": 72690 + }, + { + "epoch": 5.633693672749816, + "grad_norm": 1.3849724898046822, + "learning_rate": 2.816955982641042e-07, + "loss": 0.9345, + "step": 72700 + }, + { + "epoch": 5.634468596226123, + "grad_norm": 1.4144921205852448, + "learning_rate": 2.8173434593924366e-07, + "loss": 0.9484, + "step": 72710 + }, + { + "epoch": 5.6352435197024295, + "grad_norm": 1.3932534730774349, + "learning_rate": 2.817730936143832e-07, + "loss": 0.9441, + "step": 72720 + }, + { + "epoch": 5.636018443178736, + "grad_norm": 1.372778877045302, + "learning_rate": 2.8181184128952265e-07, + "loss": 0.9667, + "step": 72730 + }, + { + "epoch": 5.636793366655043, + "grad_norm": 1.3454445456779094, + "learning_rate": 2.818505889646622e-07, + "loss": 0.9289, + "step": 72740 + }, + { + "epoch": 5.63756829013135, + "grad_norm": 1.3675295587577432, + "learning_rate": 2.8188933663980164e-07, + "loss": 0.9205, + "step": 72750 + }, + { + "epoch": 5.638343213607656, + "grad_norm": 1.4047836028011444, + "learning_rate": 2.819280843149411e-07, + "loss": 0.9231, + "step": 72760 + }, + { + "epoch": 5.639118137083963, + "grad_norm": 1.444748332965644, + "learning_rate": 2.8196683199008063e-07, + "loss": 0.9508, + "step": 72770 + }, + { + "epoch": 5.6398930605602695, + "grad_norm": 1.3424787957688429, + "learning_rate": 2.820055796652201e-07, + "loss": 0.9526, + "step": 72780 + }, + { + "epoch": 5.640667984036576, + "grad_norm": 1.354009518452506, + "learning_rate": 2.820443273403596e-07, + "loss": 0.9379, + "step": 72790 + }, + { + "epoch": 5.641442907512883, + "grad_norm": 1.4194115051009883, + "learning_rate": 2.820830750154991e-07, + "loss": 0.9469, + "step": 72800 + }, + { + "epoch": 5.64221783098919, + "grad_norm": 1.3738342877919, + "learning_rate": 2.821218226906386e-07, + "loss": 0.9309, + "step": 72810 + }, + { + "epoch": 5.642992754465497, + "grad_norm": 1.3505160557453004, + "learning_rate": 2.821605703657781e-07, + "loss": 0.9468, + "step": 72820 + }, + { + "epoch": 5.643767677941804, + "grad_norm": 1.308701639771387, + "learning_rate": 2.8219931804091755e-07, + "loss": 0.9456, + "step": 72830 + }, + { + "epoch": 5.6445426014181095, + "grad_norm": 1.3835133499612662, + "learning_rate": 2.822380657160571e-07, + "loss": 0.9447, + "step": 72840 + }, + { + "epoch": 5.645317524894416, + "grad_norm": 1.450918012731429, + "learning_rate": 2.8227681339119655e-07, + "loss": 0.9321, + "step": 72850 + }, + { + "epoch": 5.646092448370723, + "grad_norm": 1.415093526154684, + "learning_rate": 2.8231556106633607e-07, + "loss": 0.9505, + "step": 72860 + }, + { + "epoch": 5.64686737184703, + "grad_norm": 1.387984611038559, + "learning_rate": 2.8235430874147554e-07, + "loss": 0.9457, + "step": 72870 + }, + { + "epoch": 5.647642295323337, + "grad_norm": 1.3420112960252581, + "learning_rate": 2.8239305641661506e-07, + "loss": 0.9571, + "step": 72880 + }, + { + "epoch": 5.6484172187996435, + "grad_norm": 1.4254037235746173, + "learning_rate": 2.8243180409175453e-07, + "loss": 0.9626, + "step": 72890 + }, + { + "epoch": 5.64919214227595, + "grad_norm": 1.4166301743619243, + "learning_rate": 2.82470551766894e-07, + "loss": 0.9915, + "step": 72900 + }, + { + "epoch": 5.649967065752257, + "grad_norm": 1.280068589199105, + "learning_rate": 2.825092994420335e-07, + "loss": 0.9231, + "step": 72910 + }, + { + "epoch": 5.650741989228564, + "grad_norm": 1.356926230168506, + "learning_rate": 2.82548047117173e-07, + "loss": 0.9372, + "step": 72920 + }, + { + "epoch": 5.651516912704871, + "grad_norm": 1.386099658888063, + "learning_rate": 2.825867947923125e-07, + "loss": 0.9655, + "step": 72930 + }, + { + "epoch": 5.652291836181178, + "grad_norm": 1.3464755322562258, + "learning_rate": 2.82625542467452e-07, + "loss": 0.9432, + "step": 72940 + }, + { + "epoch": 5.6530667596574835, + "grad_norm": 1.3569240565892782, + "learning_rate": 2.826642901425915e-07, + "loss": 0.9421, + "step": 72950 + }, + { + "epoch": 5.65384168313379, + "grad_norm": 1.4225594968489743, + "learning_rate": 2.8270303781773097e-07, + "loss": 0.9411, + "step": 72960 + }, + { + "epoch": 5.654616606610097, + "grad_norm": 1.4375856491808034, + "learning_rate": 2.8274178549287044e-07, + "loss": 0.9444, + "step": 72970 + }, + { + "epoch": 5.655391530086404, + "grad_norm": 1.3233487518995526, + "learning_rate": 2.8278053316800996e-07, + "loss": 0.9374, + "step": 72980 + }, + { + "epoch": 5.656166453562711, + "grad_norm": 1.3641288023607407, + "learning_rate": 2.8281928084314943e-07, + "loss": 0.9336, + "step": 72990 + }, + { + "epoch": 5.656941377039018, + "grad_norm": 1.402586902020945, + "learning_rate": 2.8285802851828895e-07, + "loss": 0.9456, + "step": 73000 + }, + { + "epoch": 5.656941377039018, + "eval_loss": 0.9464043974876404, + "eval_runtime": 319.6634, + "eval_samples_per_second": 35.885, + "eval_steps_per_second": 8.972, + "step": 73000 + }, + { + "epoch": 5.657716300515324, + "grad_norm": 1.3490550661691658, + "learning_rate": 2.828967761934284e-07, + "loss": 0.9464, + "step": 73010 + }, + { + "epoch": 5.658491223991631, + "grad_norm": 1.3919274094236715, + "learning_rate": 2.829355238685679e-07, + "loss": 0.9173, + "step": 73020 + }, + { + "epoch": 5.659266147467937, + "grad_norm": 1.40322284107105, + "learning_rate": 2.829742715437074e-07, + "loss": 0.9236, + "step": 73030 + }, + { + "epoch": 5.660041070944244, + "grad_norm": 1.4076827791879476, + "learning_rate": 2.830130192188469e-07, + "loss": 0.9327, + "step": 73040 + }, + { + "epoch": 5.660815994420551, + "grad_norm": 1.426486885494023, + "learning_rate": 2.830517668939864e-07, + "loss": 0.9766, + "step": 73050 + }, + { + "epoch": 5.6615909178968575, + "grad_norm": 1.326724746627052, + "learning_rate": 2.8309051456912587e-07, + "loss": 0.942, + "step": 73060 + }, + { + "epoch": 5.662365841373164, + "grad_norm": 1.3649859822056898, + "learning_rate": 2.831292622442654e-07, + "loss": 0.9437, + "step": 73070 + }, + { + "epoch": 5.663140764849471, + "grad_norm": 1.4048420887588204, + "learning_rate": 2.8316800991940486e-07, + "loss": 0.9487, + "step": 73080 + }, + { + "epoch": 5.663915688325778, + "grad_norm": 1.3854025296308115, + "learning_rate": 2.8320675759454433e-07, + "loss": 0.9382, + "step": 73090 + }, + { + "epoch": 5.664690611802085, + "grad_norm": 1.3780130425781363, + "learning_rate": 2.8324550526968385e-07, + "loss": 0.9364, + "step": 73100 + }, + { + "epoch": 5.665465535278392, + "grad_norm": 1.475949474450272, + "learning_rate": 2.832842529448233e-07, + "loss": 0.9497, + "step": 73110 + }, + { + "epoch": 5.666240458754698, + "grad_norm": 1.3084647656851602, + "learning_rate": 2.8332300061996284e-07, + "loss": 0.9237, + "step": 73120 + }, + { + "epoch": 5.667015382231004, + "grad_norm": 1.4393111918562134, + "learning_rate": 2.833617482951023e-07, + "loss": 0.939, + "step": 73130 + }, + { + "epoch": 5.667790305707311, + "grad_norm": 1.2944377195461096, + "learning_rate": 2.8340049597024184e-07, + "loss": 0.9219, + "step": 73140 + }, + { + "epoch": 5.668565229183618, + "grad_norm": 1.3693414436796378, + "learning_rate": 2.834392436453813e-07, + "loss": 0.9221, + "step": 73150 + }, + { + "epoch": 5.669340152659925, + "grad_norm": 1.4113760394560806, + "learning_rate": 2.834779913205208e-07, + "loss": 0.9412, + "step": 73160 + }, + { + "epoch": 5.670115076136232, + "grad_norm": 1.3950232825896356, + "learning_rate": 2.835167389956603e-07, + "loss": 0.9533, + "step": 73170 + }, + { + "epoch": 5.670889999612538, + "grad_norm": 1.3452721880076195, + "learning_rate": 2.8355548667079976e-07, + "loss": 0.9338, + "step": 73180 + }, + { + "epoch": 5.671664923088845, + "grad_norm": 1.3421430369881615, + "learning_rate": 2.835942343459393e-07, + "loss": 0.9291, + "step": 73190 + }, + { + "epoch": 5.672439846565152, + "grad_norm": 1.4368749036614512, + "learning_rate": 2.8363298202107876e-07, + "loss": 0.9425, + "step": 73200 + }, + { + "epoch": 5.673214770041458, + "grad_norm": 1.4457668118977829, + "learning_rate": 2.836717296962183e-07, + "loss": 0.9458, + "step": 73210 + }, + { + "epoch": 5.673989693517765, + "grad_norm": 1.3688336392935174, + "learning_rate": 2.8371047737135775e-07, + "loss": 0.9708, + "step": 73220 + }, + { + "epoch": 5.6747646169940715, + "grad_norm": 1.4232552809187167, + "learning_rate": 2.837492250464972e-07, + "loss": 0.9806, + "step": 73230 + }, + { + "epoch": 5.675539540470378, + "grad_norm": 1.3347181562273724, + "learning_rate": 2.8378797272163674e-07, + "loss": 0.9384, + "step": 73240 + }, + { + "epoch": 5.676314463946685, + "grad_norm": 1.4762050598024263, + "learning_rate": 2.838267203967762e-07, + "loss": 0.9312, + "step": 73250 + }, + { + "epoch": 5.677089387422992, + "grad_norm": 1.34648527011074, + "learning_rate": 2.8386546807191573e-07, + "loss": 0.9656, + "step": 73260 + }, + { + "epoch": 5.677864310899299, + "grad_norm": 1.4103316968616428, + "learning_rate": 2.839042157470552e-07, + "loss": 0.9375, + "step": 73270 + }, + { + "epoch": 5.678639234375606, + "grad_norm": 1.3025344772636813, + "learning_rate": 2.839429634221947e-07, + "loss": 0.9366, + "step": 73280 + }, + { + "epoch": 5.679414157851912, + "grad_norm": 1.4197811094404278, + "learning_rate": 2.839817110973342e-07, + "loss": 0.9413, + "step": 73290 + }, + { + "epoch": 5.680189081328219, + "grad_norm": 1.4584521354286981, + "learning_rate": 2.8402045877247366e-07, + "loss": 0.9544, + "step": 73300 + }, + { + "epoch": 5.680964004804526, + "grad_norm": 1.3810880597854254, + "learning_rate": 2.840592064476132e-07, + "loss": 0.9343, + "step": 73310 + }, + { + "epoch": 5.681738928280832, + "grad_norm": 1.4064252498071053, + "learning_rate": 2.8409795412275265e-07, + "loss": 0.9726, + "step": 73320 + }, + { + "epoch": 5.682513851757139, + "grad_norm": 1.7700111394904305, + "learning_rate": 2.8413670179789217e-07, + "loss": 0.9806, + "step": 73330 + }, + { + "epoch": 5.6832887752334456, + "grad_norm": 1.410251578972013, + "learning_rate": 2.8417544947303164e-07, + "loss": 0.9372, + "step": 73340 + }, + { + "epoch": 5.684063698709752, + "grad_norm": 1.464076717758312, + "learning_rate": 2.8421419714817116e-07, + "loss": 0.955, + "step": 73350 + }, + { + "epoch": 5.684838622186059, + "grad_norm": 1.4031592433240165, + "learning_rate": 2.8425294482331063e-07, + "loss": 0.925, + "step": 73360 + }, + { + "epoch": 5.685613545662366, + "grad_norm": 1.4033366519420027, + "learning_rate": 2.842916924984501e-07, + "loss": 0.9498, + "step": 73370 + }, + { + "epoch": 5.686388469138673, + "grad_norm": 1.4543901336446392, + "learning_rate": 2.843304401735896e-07, + "loss": 0.9638, + "step": 73380 + }, + { + "epoch": 5.68716339261498, + "grad_norm": 1.3933877791059888, + "learning_rate": 2.843691878487291e-07, + "loss": 0.971, + "step": 73390 + }, + { + "epoch": 5.6879383160912855, + "grad_norm": 1.3547430784526109, + "learning_rate": 2.844079355238686e-07, + "loss": 0.9579, + "step": 73400 + }, + { + "epoch": 5.688713239567592, + "grad_norm": 1.3617450710522767, + "learning_rate": 2.844466831990081e-07, + "loss": 0.9417, + "step": 73410 + }, + { + "epoch": 5.689488163043899, + "grad_norm": 1.3503249210871622, + "learning_rate": 2.844854308741476e-07, + "loss": 0.9957, + "step": 73420 + }, + { + "epoch": 5.690263086520206, + "grad_norm": 1.3939388010056477, + "learning_rate": 2.8452417854928707e-07, + "loss": 0.9397, + "step": 73430 + }, + { + "epoch": 5.691038009996513, + "grad_norm": 1.3688840754933056, + "learning_rate": 2.8456292622442654e-07, + "loss": 0.93, + "step": 73440 + }, + { + "epoch": 5.69181293347282, + "grad_norm": 1.3174104340086925, + "learning_rate": 2.8460167389956606e-07, + "loss": 0.9307, + "step": 73450 + }, + { + "epoch": 5.692587856949126, + "grad_norm": 1.3862637905139916, + "learning_rate": 2.8464042157470553e-07, + "loss": 0.9459, + "step": 73460 + }, + { + "epoch": 5.693362780425433, + "grad_norm": 1.3268880306356163, + "learning_rate": 2.8467916924984506e-07, + "loss": 0.9341, + "step": 73470 + }, + { + "epoch": 5.69413770390174, + "grad_norm": 1.2851561301405323, + "learning_rate": 2.847179169249845e-07, + "loss": 0.9366, + "step": 73480 + }, + { + "epoch": 5.694912627378047, + "grad_norm": 1.3540341633331732, + "learning_rate": 2.8475666460012405e-07, + "loss": 0.9294, + "step": 73490 + }, + { + "epoch": 5.695687550854354, + "grad_norm": 1.5198649562594129, + "learning_rate": 2.847954122752635e-07, + "loss": 0.9436, + "step": 73500 + }, + { + "epoch": 5.695687550854354, + "eval_loss": 0.9458404779434204, + "eval_runtime": 320.128, + "eval_samples_per_second": 35.833, + "eval_steps_per_second": 8.959, + "step": 73500 + }, + { + "epoch": 5.6964624743306596, + "grad_norm": 1.3797244291104858, + "learning_rate": 2.84834159950403e-07, + "loss": 0.9288, + "step": 73510 + }, + { + "epoch": 5.697237397806966, + "grad_norm": 1.4125722124258333, + "learning_rate": 2.848729076255425e-07, + "loss": 0.9305, + "step": 73520 + }, + { + "epoch": 5.698012321283273, + "grad_norm": 1.3805077096628409, + "learning_rate": 2.84911655300682e-07, + "loss": 0.949, + "step": 73530 + }, + { + "epoch": 5.69878724475958, + "grad_norm": 1.437251026536953, + "learning_rate": 2.849504029758215e-07, + "loss": 0.9334, + "step": 73540 + }, + { + "epoch": 5.699562168235887, + "grad_norm": 1.3901068553014064, + "learning_rate": 2.8498915065096097e-07, + "loss": 0.9222, + "step": 73550 + }, + { + "epoch": 5.700337091712194, + "grad_norm": 1.4150311362569092, + "learning_rate": 2.850278983261005e-07, + "loss": 0.9406, + "step": 73560 + }, + { + "epoch": 5.7011120151885, + "grad_norm": 1.3815630412627582, + "learning_rate": 2.8506664600123996e-07, + "loss": 0.9441, + "step": 73570 + }, + { + "epoch": 5.701886938664807, + "grad_norm": 1.4500463047249958, + "learning_rate": 2.8510539367637943e-07, + "loss": 0.9434, + "step": 73580 + }, + { + "epoch": 5.702661862141113, + "grad_norm": 1.4206079532374534, + "learning_rate": 2.8514414135151895e-07, + "loss": 0.9309, + "step": 73590 + }, + { + "epoch": 5.70343678561742, + "grad_norm": 1.3195492655246344, + "learning_rate": 2.851828890266584e-07, + "loss": 0.9336, + "step": 73600 + }, + { + "epoch": 5.704211709093727, + "grad_norm": 1.4450889330025296, + "learning_rate": 2.8522163670179794e-07, + "loss": 0.9501, + "step": 73610 + }, + { + "epoch": 5.704986632570034, + "grad_norm": 1.4118542652368, + "learning_rate": 2.852603843769374e-07, + "loss": 0.9471, + "step": 73620 + }, + { + "epoch": 5.70576155604634, + "grad_norm": 1.4511331879226128, + "learning_rate": 2.8529913205207693e-07, + "loss": 0.9695, + "step": 73630 + }, + { + "epoch": 5.706536479522647, + "grad_norm": 1.3232492129561555, + "learning_rate": 2.853378797272164e-07, + "loss": 0.9298, + "step": 73640 + }, + { + "epoch": 5.707311402998954, + "grad_norm": 1.5049848951905416, + "learning_rate": 2.8537662740235587e-07, + "loss": 0.9356, + "step": 73650 + }, + { + "epoch": 5.708086326475261, + "grad_norm": 1.2888977969946558, + "learning_rate": 2.854153750774954e-07, + "loss": 0.9234, + "step": 73660 + }, + { + "epoch": 5.708861249951568, + "grad_norm": 1.3939915715910793, + "learning_rate": 2.8545412275263486e-07, + "loss": 0.9229, + "step": 73670 + }, + { + "epoch": 5.709636173427874, + "grad_norm": 1.442529990200742, + "learning_rate": 2.854928704277744e-07, + "loss": 0.932, + "step": 73680 + }, + { + "epoch": 5.71041109690418, + "grad_norm": 1.4377013801227154, + "learning_rate": 2.8553161810291385e-07, + "loss": 0.9678, + "step": 73690 + }, + { + "epoch": 5.711186020380487, + "grad_norm": 1.3783897646383523, + "learning_rate": 2.855703657780533e-07, + "loss": 0.9349, + "step": 73700 + }, + { + "epoch": 5.711960943856794, + "grad_norm": 1.3362313390055254, + "learning_rate": 2.8560911345319284e-07, + "loss": 0.9244, + "step": 73710 + }, + { + "epoch": 5.712735867333101, + "grad_norm": 1.4292168269840753, + "learning_rate": 2.856478611283323e-07, + "loss": 0.942, + "step": 73720 + }, + { + "epoch": 5.713510790809408, + "grad_norm": 1.438678923585079, + "learning_rate": 2.8568660880347183e-07, + "loss": 0.9447, + "step": 73730 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 1.4024865340891277, + "learning_rate": 2.857253564786113e-07, + "loss": 0.9313, + "step": 73740 + }, + { + "epoch": 5.715060637762021, + "grad_norm": 1.3650147615886559, + "learning_rate": 2.857641041537508e-07, + "loss": 0.9458, + "step": 73750 + }, + { + "epoch": 5.715835561238328, + "grad_norm": 1.350704085740583, + "learning_rate": 2.858028518288903e-07, + "loss": 0.9417, + "step": 73760 + }, + { + "epoch": 5.716610484714634, + "grad_norm": 1.4289888643312536, + "learning_rate": 2.8584159950402976e-07, + "loss": 0.9313, + "step": 73770 + }, + { + "epoch": 5.717385408190941, + "grad_norm": 1.3834248496049324, + "learning_rate": 2.858803471791693e-07, + "loss": 0.9411, + "step": 73780 + }, + { + "epoch": 5.718160331667248, + "grad_norm": 1.3907935271974914, + "learning_rate": 2.8591909485430875e-07, + "loss": 0.9626, + "step": 73790 + }, + { + "epoch": 5.718935255143554, + "grad_norm": 1.342175082412199, + "learning_rate": 2.859578425294483e-07, + "loss": 0.9321, + "step": 73800 + }, + { + "epoch": 5.719710178619861, + "grad_norm": 1.4255146914387609, + "learning_rate": 2.8599659020458774e-07, + "loss": 0.9385, + "step": 73810 + }, + { + "epoch": 5.720485102096168, + "grad_norm": 1.4604668426448983, + "learning_rate": 2.8603533787972727e-07, + "loss": 0.9161, + "step": 73820 + }, + { + "epoch": 5.721260025572475, + "grad_norm": 1.3314286405392162, + "learning_rate": 2.8607408555486673e-07, + "loss": 0.9788, + "step": 73830 + }, + { + "epoch": 5.722034949048782, + "grad_norm": 1.3392666989621067, + "learning_rate": 2.861128332300062e-07, + "loss": 0.9365, + "step": 73840 + }, + { + "epoch": 5.722809872525088, + "grad_norm": 1.265583331711839, + "learning_rate": 2.861515809051457e-07, + "loss": 0.9384, + "step": 73850 + }, + { + "epoch": 5.723584796001395, + "grad_norm": 1.359739858231947, + "learning_rate": 2.861903285802852e-07, + "loss": 0.9509, + "step": 73860 + }, + { + "epoch": 5.724359719477702, + "grad_norm": 1.4647248019015269, + "learning_rate": 2.862290762554247e-07, + "loss": 0.9208, + "step": 73870 + }, + { + "epoch": 5.725134642954008, + "grad_norm": 1.379338866634779, + "learning_rate": 2.862678239305642e-07, + "loss": 0.9656, + "step": 73880 + }, + { + "epoch": 5.725909566430315, + "grad_norm": 1.4273056952476437, + "learning_rate": 2.863065716057037e-07, + "loss": 0.9446, + "step": 73890 + }, + { + "epoch": 5.726684489906622, + "grad_norm": 1.379210047554168, + "learning_rate": 2.863453192808432e-07, + "loss": 0.9444, + "step": 73900 + }, + { + "epoch": 5.727459413382928, + "grad_norm": 1.4508151598611996, + "learning_rate": 2.8638406695598265e-07, + "loss": 0.9604, + "step": 73910 + }, + { + "epoch": 5.728234336859235, + "grad_norm": 1.4283276857858012, + "learning_rate": 2.8642281463112217e-07, + "loss": 0.9313, + "step": 73920 + }, + { + "epoch": 5.729009260335542, + "grad_norm": 4.657822881615142, + "learning_rate": 2.8646156230626164e-07, + "loss": 0.9456, + "step": 73930 + }, + { + "epoch": 5.729784183811849, + "grad_norm": 1.37978673626654, + "learning_rate": 2.8650030998140116e-07, + "loss": 0.9354, + "step": 73940 + }, + { + "epoch": 5.730559107288156, + "grad_norm": 1.36838758750212, + "learning_rate": 2.8653905765654063e-07, + "loss": 0.9461, + "step": 73950 + }, + { + "epoch": 5.731334030764462, + "grad_norm": 1.365928889124298, + "learning_rate": 2.8657780533168015e-07, + "loss": 0.9331, + "step": 73960 + }, + { + "epoch": 5.732108954240768, + "grad_norm": 1.3774893094131455, + "learning_rate": 2.866165530068196e-07, + "loss": 0.9477, + "step": 73970 + }, + { + "epoch": 5.732883877717075, + "grad_norm": 1.376235809936045, + "learning_rate": 2.866553006819591e-07, + "loss": 0.935, + "step": 73980 + }, + { + "epoch": 5.733658801193382, + "grad_norm": 1.401645865280591, + "learning_rate": 2.866940483570986e-07, + "loss": 0.9459, + "step": 73990 + }, + { + "epoch": 5.734433724669689, + "grad_norm": 1.309842328658824, + "learning_rate": 2.867327960322381e-07, + "loss": 0.936, + "step": 74000 + }, + { + "epoch": 5.734433724669689, + "eval_loss": 0.9454536437988281, + "eval_runtime": 318.3399, + "eval_samples_per_second": 36.034, + "eval_steps_per_second": 9.009, + "step": 74000 + }, + { + "epoch": 5.735208648145996, + "grad_norm": 1.5133257154030786, + "learning_rate": 2.867715437073776e-07, + "loss": 0.953, + "step": 74010 + }, + { + "epoch": 5.735983571622302, + "grad_norm": 1.4121421362554618, + "learning_rate": 2.8681029138251707e-07, + "loss": 0.9388, + "step": 74020 + }, + { + "epoch": 5.736758495098609, + "grad_norm": 1.3371279062508037, + "learning_rate": 2.868490390576566e-07, + "loss": 0.9205, + "step": 74030 + }, + { + "epoch": 5.737533418574916, + "grad_norm": 1.6304455886297449, + "learning_rate": 2.8688778673279606e-07, + "loss": 0.9597, + "step": 74040 + }, + { + "epoch": 5.738308342051223, + "grad_norm": 1.3851475423926929, + "learning_rate": 2.8692653440793553e-07, + "loss": 0.9296, + "step": 74050 + }, + { + "epoch": 5.739083265527529, + "grad_norm": 1.3761691746861022, + "learning_rate": 2.8696528208307505e-07, + "loss": 0.952, + "step": 74060 + }, + { + "epoch": 5.739858189003836, + "grad_norm": 1.4091340140353463, + "learning_rate": 2.870040297582145e-07, + "loss": 0.9534, + "step": 74070 + }, + { + "epoch": 5.740633112480142, + "grad_norm": 1.458202853682235, + "learning_rate": 2.8704277743335404e-07, + "loss": 0.942, + "step": 74080 + }, + { + "epoch": 5.741408035956449, + "grad_norm": 1.3808874896544328, + "learning_rate": 2.870815251084935e-07, + "loss": 0.9288, + "step": 74090 + }, + { + "epoch": 5.742182959432756, + "grad_norm": 1.4225501174815613, + "learning_rate": 2.8712027278363303e-07, + "loss": 0.9448, + "step": 74100 + }, + { + "epoch": 5.742957882909063, + "grad_norm": 1.5014459538598046, + "learning_rate": 2.871590204587725e-07, + "loss": 0.9626, + "step": 74110 + }, + { + "epoch": 5.74373280638537, + "grad_norm": 1.25306087912228, + "learning_rate": 2.8719776813391197e-07, + "loss": 0.9064, + "step": 74120 + }, + { + "epoch": 5.7445077298616765, + "grad_norm": 1.4438011189957187, + "learning_rate": 2.872365158090515e-07, + "loss": 0.9315, + "step": 74130 + }, + { + "epoch": 5.745282653337982, + "grad_norm": 1.3872551393912536, + "learning_rate": 2.8727526348419096e-07, + "loss": 0.9531, + "step": 74140 + }, + { + "epoch": 5.746057576814289, + "grad_norm": 1.3661104426250008, + "learning_rate": 2.873140111593305e-07, + "loss": 0.9478, + "step": 74150 + }, + { + "epoch": 5.746832500290596, + "grad_norm": 1.29736789788835, + "learning_rate": 2.8735275883446995e-07, + "loss": 0.9451, + "step": 74160 + }, + { + "epoch": 5.747607423766903, + "grad_norm": 1.393669750555065, + "learning_rate": 2.873915065096095e-07, + "loss": 0.9607, + "step": 74170 + }, + { + "epoch": 5.74838234724321, + "grad_norm": 1.3631385334583292, + "learning_rate": 2.8743025418474895e-07, + "loss": 0.9377, + "step": 74180 + }, + { + "epoch": 5.749157270719516, + "grad_norm": 1.4201731564038194, + "learning_rate": 2.874690018598884e-07, + "loss": 0.9431, + "step": 74190 + }, + { + "epoch": 5.749932194195823, + "grad_norm": 1.3760565511671574, + "learning_rate": 2.8750774953502794e-07, + "loss": 0.968, + "step": 74200 + }, + { + "epoch": 5.75070711767213, + "grad_norm": 1.4546235556158953, + "learning_rate": 2.875464972101674e-07, + "loss": 0.9492, + "step": 74210 + }, + { + "epoch": 5.751482041148437, + "grad_norm": 1.4161674516839824, + "learning_rate": 2.8758524488530693e-07, + "loss": 0.9482, + "step": 74220 + }, + { + "epoch": 5.752256964624744, + "grad_norm": 1.435046509570317, + "learning_rate": 2.876239925604464e-07, + "loss": 0.9695, + "step": 74230 + }, + { + "epoch": 5.7530318881010505, + "grad_norm": 1.4338455707938056, + "learning_rate": 2.876627402355859e-07, + "loss": 0.9345, + "step": 74240 + }, + { + "epoch": 5.753806811577356, + "grad_norm": 1.4216746419855288, + "learning_rate": 2.877014879107254e-07, + "loss": 0.9452, + "step": 74250 + }, + { + "epoch": 5.754581735053663, + "grad_norm": 1.3947445517158426, + "learning_rate": 2.8774023558586486e-07, + "loss": 0.9655, + "step": 74260 + }, + { + "epoch": 5.75535665852997, + "grad_norm": 1.429790675707361, + "learning_rate": 2.877789832610044e-07, + "loss": 0.92, + "step": 74270 + }, + { + "epoch": 5.756131582006277, + "grad_norm": 1.43284730947888, + "learning_rate": 2.8781773093614385e-07, + "loss": 0.9232, + "step": 74280 + }, + { + "epoch": 5.756906505482584, + "grad_norm": 1.3195902092546936, + "learning_rate": 2.8785647861128337e-07, + "loss": 0.9454, + "step": 74290 + }, + { + "epoch": 5.7576814289588905, + "grad_norm": 1.4840280488543605, + "learning_rate": 2.8789522628642284e-07, + "loss": 0.9274, + "step": 74300 + }, + { + "epoch": 5.758456352435197, + "grad_norm": 1.4104230099002562, + "learning_rate": 2.8793397396156236e-07, + "loss": 0.9334, + "step": 74310 + }, + { + "epoch": 5.759231275911504, + "grad_norm": 1.2806917150780317, + "learning_rate": 2.8797272163670183e-07, + "loss": 0.9345, + "step": 74320 + }, + { + "epoch": 5.76000619938781, + "grad_norm": 1.3574763005319244, + "learning_rate": 2.880114693118413e-07, + "loss": 0.9482, + "step": 74330 + }, + { + "epoch": 5.760781122864117, + "grad_norm": 1.3980187961772177, + "learning_rate": 2.880502169869808e-07, + "loss": 0.9346, + "step": 74340 + }, + { + "epoch": 5.761556046340424, + "grad_norm": 1.4360854084600894, + "learning_rate": 2.880889646621203e-07, + "loss": 0.9487, + "step": 74350 + }, + { + "epoch": 5.76233096981673, + "grad_norm": 1.39624027422077, + "learning_rate": 2.881277123372598e-07, + "loss": 0.9447, + "step": 74360 + }, + { + "epoch": 5.763105893293037, + "grad_norm": 1.3867256596452302, + "learning_rate": 2.881664600123993e-07, + "loss": 0.9409, + "step": 74370 + }, + { + "epoch": 5.763880816769344, + "grad_norm": 1.330480003914678, + "learning_rate": 2.882052076875388e-07, + "loss": 0.9327, + "step": 74380 + }, + { + "epoch": 5.764655740245651, + "grad_norm": 1.3833269065325893, + "learning_rate": 2.8824395536267827e-07, + "loss": 0.9467, + "step": 74390 + }, + { + "epoch": 5.765430663721958, + "grad_norm": 1.3580104369616475, + "learning_rate": 2.8828270303781774e-07, + "loss": 0.9293, + "step": 74400 + }, + { + "epoch": 5.7662055871982645, + "grad_norm": 1.3428866373145005, + "learning_rate": 2.8832145071295726e-07, + "loss": 0.927, + "step": 74410 + }, + { + "epoch": 5.766980510674571, + "grad_norm": 1.4449311317549676, + "learning_rate": 2.8836019838809673e-07, + "loss": 0.9164, + "step": 74420 + }, + { + "epoch": 5.767755434150877, + "grad_norm": 1.4906240100909476, + "learning_rate": 2.8839894606323625e-07, + "loss": 0.938, + "step": 74430 + }, + { + "epoch": 5.768530357627184, + "grad_norm": 1.4303274592663424, + "learning_rate": 2.884376937383757e-07, + "loss": 0.9292, + "step": 74440 + }, + { + "epoch": 5.769305281103491, + "grad_norm": 1.4611928170273656, + "learning_rate": 2.884764414135152e-07, + "loss": 0.9455, + "step": 74450 + }, + { + "epoch": 5.770080204579798, + "grad_norm": 1.4646141455654955, + "learning_rate": 2.885151890886547e-07, + "loss": 0.945, + "step": 74460 + }, + { + "epoch": 5.7708551280561045, + "grad_norm": 1.436784382087901, + "learning_rate": 2.885539367637942e-07, + "loss": 0.9335, + "step": 74470 + }, + { + "epoch": 5.771630051532411, + "grad_norm": 1.4162104923299492, + "learning_rate": 2.885926844389337e-07, + "loss": 0.9378, + "step": 74480 + }, + { + "epoch": 5.772404975008718, + "grad_norm": 1.344481308833764, + "learning_rate": 2.886314321140732e-07, + "loss": 0.9574, + "step": 74490 + }, + { + "epoch": 5.773179898485025, + "grad_norm": 1.4344362485301188, + "learning_rate": 2.886701797892127e-07, + "loss": 0.956, + "step": 74500 + }, + { + "epoch": 5.773179898485025, + "eval_loss": 0.944819986820221, + "eval_runtime": 319.2401, + "eval_samples_per_second": 35.932, + "eval_steps_per_second": 8.984, + "step": 74500 + }, + { + "epoch": 5.773954821961332, + "grad_norm": 1.3407494471690815, + "learning_rate": 2.8870892746435216e-07, + "loss": 0.9654, + "step": 74510 + }, + { + "epoch": 5.774729745437638, + "grad_norm": 1.387879583278785, + "learning_rate": 2.8874767513949163e-07, + "loss": 0.9472, + "step": 74520 + }, + { + "epoch": 5.775504668913944, + "grad_norm": 1.3917770559977074, + "learning_rate": 2.8878642281463116e-07, + "loss": 0.9434, + "step": 74530 + }, + { + "epoch": 5.776279592390251, + "grad_norm": 1.350004439790283, + "learning_rate": 2.888251704897706e-07, + "loss": 0.938, + "step": 74540 + }, + { + "epoch": 5.777054515866558, + "grad_norm": 1.2984663069481304, + "learning_rate": 2.8886391816491015e-07, + "loss": 0.9392, + "step": 74550 + }, + { + "epoch": 5.777829439342865, + "grad_norm": 1.4182019412683002, + "learning_rate": 2.889026658400496e-07, + "loss": 0.9673, + "step": 74560 + }, + { + "epoch": 5.778604362819172, + "grad_norm": 1.412898340089769, + "learning_rate": 2.8894141351518914e-07, + "loss": 0.9361, + "step": 74570 + }, + { + "epoch": 5.7793792862954785, + "grad_norm": 1.3628653963201918, + "learning_rate": 2.889801611903286e-07, + "loss": 0.9492, + "step": 74580 + }, + { + "epoch": 5.780154209771785, + "grad_norm": 1.359369502992319, + "learning_rate": 2.890189088654681e-07, + "loss": 0.9387, + "step": 74590 + }, + { + "epoch": 5.780929133248092, + "grad_norm": 1.400187992356998, + "learning_rate": 2.890576565406076e-07, + "loss": 0.9228, + "step": 74600 + }, + { + "epoch": 5.781704056724399, + "grad_norm": 1.3570832407314277, + "learning_rate": 2.8909640421574707e-07, + "loss": 0.9578, + "step": 74610 + }, + { + "epoch": 5.782478980200705, + "grad_norm": 1.3507691559893538, + "learning_rate": 2.891351518908866e-07, + "loss": 0.9422, + "step": 74620 + }, + { + "epoch": 5.783253903677012, + "grad_norm": 1.3686821064974466, + "learning_rate": 2.8917389956602606e-07, + "loss": 0.9732, + "step": 74630 + }, + { + "epoch": 5.7840288271533185, + "grad_norm": 1.4671763555243162, + "learning_rate": 2.892126472411656e-07, + "loss": 0.9468, + "step": 74640 + }, + { + "epoch": 5.784803750629625, + "grad_norm": 1.300436224958161, + "learning_rate": 2.8925139491630505e-07, + "loss": 0.9364, + "step": 74650 + }, + { + "epoch": 5.785578674105932, + "grad_norm": 1.3008747320851353, + "learning_rate": 2.892901425914445e-07, + "loss": 0.9337, + "step": 74660 + }, + { + "epoch": 5.786353597582239, + "grad_norm": 1.4114379789994473, + "learning_rate": 2.8932889026658404e-07, + "loss": 0.9411, + "step": 74670 + }, + { + "epoch": 5.787128521058546, + "grad_norm": 1.3624951323932553, + "learning_rate": 2.893676379417235e-07, + "loss": 0.9366, + "step": 74680 + }, + { + "epoch": 5.7879034445348525, + "grad_norm": 1.4578034157989976, + "learning_rate": 2.8940638561686303e-07, + "loss": 0.9566, + "step": 74690 + }, + { + "epoch": 5.788678368011158, + "grad_norm": 1.4675588132746697, + "learning_rate": 2.894451332920025e-07, + "loss": 0.9132, + "step": 74700 + }, + { + "epoch": 5.789453291487465, + "grad_norm": 1.4023449961733856, + "learning_rate": 2.89483880967142e-07, + "loss": 0.9663, + "step": 74710 + }, + { + "epoch": 5.790228214963772, + "grad_norm": 1.3694000656073715, + "learning_rate": 2.895226286422815e-07, + "loss": 0.9472, + "step": 74720 + }, + { + "epoch": 5.791003138440079, + "grad_norm": 1.4443470968299772, + "learning_rate": 2.8956137631742096e-07, + "loss": 0.9927, + "step": 74730 + }, + { + "epoch": 5.791778061916386, + "grad_norm": 1.4076060234440326, + "learning_rate": 2.896001239925605e-07, + "loss": 0.9541, + "step": 74740 + }, + { + "epoch": 5.7925529853926925, + "grad_norm": 1.358309546351276, + "learning_rate": 2.8963887166769995e-07, + "loss": 0.9308, + "step": 74750 + }, + { + "epoch": 5.793327908868999, + "grad_norm": 1.3481824793740564, + "learning_rate": 2.8967761934283947e-07, + "loss": 0.9407, + "step": 74760 + }, + { + "epoch": 5.794102832345306, + "grad_norm": 1.3445320941929448, + "learning_rate": 2.8971636701797894e-07, + "loss": 0.9193, + "step": 74770 + }, + { + "epoch": 5.794877755821613, + "grad_norm": 1.609012311743734, + "learning_rate": 2.8975511469311846e-07, + "loss": 0.9549, + "step": 74780 + }, + { + "epoch": 5.79565267929792, + "grad_norm": 1.4371881714773969, + "learning_rate": 2.8979386236825793e-07, + "loss": 0.936, + "step": 74790 + }, + { + "epoch": 5.7964276027742265, + "grad_norm": 1.4091979614974077, + "learning_rate": 2.898326100433974e-07, + "loss": 0.9437, + "step": 74800 + }, + { + "epoch": 5.7972025262505325, + "grad_norm": 1.4553149765647868, + "learning_rate": 2.898713577185369e-07, + "loss": 0.9377, + "step": 74810 + }, + { + "epoch": 5.797977449726839, + "grad_norm": 1.36649643578499, + "learning_rate": 2.899101053936764e-07, + "loss": 0.9373, + "step": 74820 + }, + { + "epoch": 5.798752373203146, + "grad_norm": 1.3742149767232323, + "learning_rate": 2.899488530688159e-07, + "loss": 0.9513, + "step": 74830 + }, + { + "epoch": 5.799527296679453, + "grad_norm": 1.3349091307244396, + "learning_rate": 2.899876007439554e-07, + "loss": 0.9364, + "step": 74840 + }, + { + "epoch": 5.80030222015576, + "grad_norm": 1.3807184524397884, + "learning_rate": 2.900263484190949e-07, + "loss": 0.9293, + "step": 74850 + }, + { + "epoch": 5.8010771436320665, + "grad_norm": 1.3445422157839866, + "learning_rate": 2.900650960942344e-07, + "loss": 0.9252, + "step": 74860 + }, + { + "epoch": 5.801852067108373, + "grad_norm": 1.4634658077763425, + "learning_rate": 2.9010384376937384e-07, + "loss": 0.9387, + "step": 74870 + }, + { + "epoch": 5.80262699058468, + "grad_norm": 1.4090544927520663, + "learning_rate": 2.9014259144451337e-07, + "loss": 0.9574, + "step": 74880 + }, + { + "epoch": 5.803401914060986, + "grad_norm": 1.486615149566606, + "learning_rate": 2.9018133911965284e-07, + "loss": 0.9382, + "step": 74890 + }, + { + "epoch": 5.804176837537293, + "grad_norm": 1.3625426561953047, + "learning_rate": 2.9022008679479236e-07, + "loss": 0.9376, + "step": 74900 + }, + { + "epoch": 5.8049517610136, + "grad_norm": 1.2882247658364263, + "learning_rate": 2.902588344699318e-07, + "loss": 0.9394, + "step": 74910 + }, + { + "epoch": 5.8057266844899065, + "grad_norm": 1.4222411317881296, + "learning_rate": 2.9029758214507135e-07, + "loss": 0.9393, + "step": 74920 + }, + { + "epoch": 5.806501607966213, + "grad_norm": 1.388539527786237, + "learning_rate": 2.903363298202108e-07, + "loss": 0.9365, + "step": 74930 + }, + { + "epoch": 5.80727653144252, + "grad_norm": 1.4306368872761348, + "learning_rate": 2.903750774953503e-07, + "loss": 0.949, + "step": 74940 + }, + { + "epoch": 5.808051454918827, + "grad_norm": 1.3556311894984032, + "learning_rate": 2.904138251704898e-07, + "loss": 0.9471, + "step": 74950 + }, + { + "epoch": 5.808826378395134, + "grad_norm": 1.458544751131571, + "learning_rate": 2.904525728456293e-07, + "loss": 0.929, + "step": 74960 + }, + { + "epoch": 5.8096013018714405, + "grad_norm": 1.4568051253512175, + "learning_rate": 2.904913205207688e-07, + "loss": 0.9274, + "step": 74970 + }, + { + "epoch": 5.810376225347747, + "grad_norm": 1.4068818474081728, + "learning_rate": 2.9053006819590827e-07, + "loss": 0.9308, + "step": 74980 + }, + { + "epoch": 5.811151148824053, + "grad_norm": 1.3465835019629722, + "learning_rate": 2.905688158710478e-07, + "loss": 0.9618, + "step": 74990 + }, + { + "epoch": 5.81192607230036, + "grad_norm": 1.4017038474371557, + "learning_rate": 2.9060756354618726e-07, + "loss": 0.9316, + "step": 75000 + }, + { + "epoch": 5.81192607230036, + "eval_loss": 0.9442824125289917, + "eval_runtime": 319.8541, + "eval_samples_per_second": 35.863, + "eval_steps_per_second": 8.967, + "step": 75000 + }, + { + "epoch": 5.812700995776667, + "grad_norm": 1.510923932814381, + "learning_rate": 2.9064631122132673e-07, + "loss": 0.9378, + "step": 75010 + }, + { + "epoch": 5.813475919252974, + "grad_norm": 1.4071359433602497, + "learning_rate": 2.9068505889646625e-07, + "loss": 0.9492, + "step": 75020 + }, + { + "epoch": 5.8142508427292805, + "grad_norm": 1.4998039864789494, + "learning_rate": 2.907238065716057e-07, + "loss": 0.9506, + "step": 75030 + }, + { + "epoch": 5.815025766205587, + "grad_norm": 1.3255036358093668, + "learning_rate": 2.9076255424674524e-07, + "loss": 0.9389, + "step": 75040 + }, + { + "epoch": 5.815800689681894, + "grad_norm": 1.3961484312729295, + "learning_rate": 2.908013019218847e-07, + "loss": 0.9338, + "step": 75050 + }, + { + "epoch": 5.816575613158201, + "grad_norm": 1.384611348031449, + "learning_rate": 2.9084004959702423e-07, + "loss": 0.9418, + "step": 75060 + }, + { + "epoch": 5.817350536634507, + "grad_norm": 1.3876117620527648, + "learning_rate": 2.908787972721637e-07, + "loss": 0.9206, + "step": 75070 + }, + { + "epoch": 5.818125460110814, + "grad_norm": 1.3741501178430442, + "learning_rate": 2.9091754494730317e-07, + "loss": 0.9469, + "step": 75080 + }, + { + "epoch": 5.8189003835871205, + "grad_norm": 1.4352922551469205, + "learning_rate": 2.909562926224427e-07, + "loss": 0.9512, + "step": 75090 + }, + { + "epoch": 5.819675307063427, + "grad_norm": 1.4067417854598314, + "learning_rate": 2.9099504029758216e-07, + "loss": 0.9694, + "step": 75100 + }, + { + "epoch": 5.820450230539734, + "grad_norm": 1.4060890824115442, + "learning_rate": 2.910337879727217e-07, + "loss": 0.9455, + "step": 75110 + }, + { + "epoch": 5.821225154016041, + "grad_norm": 1.357682932334724, + "learning_rate": 2.9107253564786115e-07, + "loss": 0.9199, + "step": 75120 + }, + { + "epoch": 5.822000077492348, + "grad_norm": 1.4473280433809712, + "learning_rate": 2.911112833230007e-07, + "loss": 0.9523, + "step": 75130 + }, + { + "epoch": 5.8227750009686545, + "grad_norm": 1.4198609483292617, + "learning_rate": 2.9115003099814014e-07, + "loss": 0.97, + "step": 75140 + }, + { + "epoch": 5.823549924444961, + "grad_norm": 1.3060116670734487, + "learning_rate": 2.911887786732796e-07, + "loss": 0.9461, + "step": 75150 + }, + { + "epoch": 5.824324847921268, + "grad_norm": 1.3050863157305954, + "learning_rate": 2.9122752634841913e-07, + "loss": 0.9342, + "step": 75160 + }, + { + "epoch": 5.825099771397575, + "grad_norm": 1.3648379977301022, + "learning_rate": 2.912662740235586e-07, + "loss": 0.9512, + "step": 75170 + }, + { + "epoch": 5.825874694873881, + "grad_norm": 1.3196449864072357, + "learning_rate": 2.913050216986981e-07, + "loss": 0.9544, + "step": 75180 + }, + { + "epoch": 5.826649618350188, + "grad_norm": 1.3652493440586868, + "learning_rate": 2.913437693738376e-07, + "loss": 0.9357, + "step": 75190 + }, + { + "epoch": 5.8274245418264945, + "grad_norm": 1.467009067580174, + "learning_rate": 2.9138251704897706e-07, + "loss": 0.9428, + "step": 75200 + }, + { + "epoch": 5.828199465302801, + "grad_norm": 1.3585366478517855, + "learning_rate": 2.914212647241166e-07, + "loss": 0.9173, + "step": 75210 + }, + { + "epoch": 5.828974388779108, + "grad_norm": 1.3744093559877333, + "learning_rate": 2.9146001239925605e-07, + "loss": 0.9319, + "step": 75220 + }, + { + "epoch": 5.829749312255415, + "grad_norm": 1.3625569235947106, + "learning_rate": 2.914987600743956e-07, + "loss": 0.9552, + "step": 75230 + }, + { + "epoch": 5.830524235731722, + "grad_norm": 1.3960034694424703, + "learning_rate": 2.9153750774953505e-07, + "loss": 0.9533, + "step": 75240 + }, + { + "epoch": 5.8312991592080285, + "grad_norm": 1.4635459390566965, + "learning_rate": 2.9157625542467457e-07, + "loss": 0.9395, + "step": 75250 + }, + { + "epoch": 5.8320740826843345, + "grad_norm": 1.4238892966211325, + "learning_rate": 2.9161500309981404e-07, + "loss": 0.9275, + "step": 75260 + }, + { + "epoch": 5.832849006160641, + "grad_norm": 1.3540943867452524, + "learning_rate": 2.916537507749535e-07, + "loss": 0.9211, + "step": 75270 + }, + { + "epoch": 5.833623929636948, + "grad_norm": 1.3955965631822134, + "learning_rate": 2.9169249845009303e-07, + "loss": 0.9561, + "step": 75280 + }, + { + "epoch": 5.834398853113255, + "grad_norm": 1.3528787743235657, + "learning_rate": 2.917312461252325e-07, + "loss": 0.9584, + "step": 75290 + }, + { + "epoch": 5.835173776589562, + "grad_norm": 1.310020367627835, + "learning_rate": 2.91769993800372e-07, + "loss": 0.9411, + "step": 75300 + }, + { + "epoch": 5.8359487000658685, + "grad_norm": 1.4097146500091753, + "learning_rate": 2.918087414755115e-07, + "loss": 0.9295, + "step": 75310 + }, + { + "epoch": 5.836723623542175, + "grad_norm": 1.372587791902365, + "learning_rate": 2.91847489150651e-07, + "loss": 0.9412, + "step": 75320 + }, + { + "epoch": 5.837498547018482, + "grad_norm": 1.4014869208875826, + "learning_rate": 2.918862368257905e-07, + "loss": 0.9468, + "step": 75330 + }, + { + "epoch": 5.838273470494789, + "grad_norm": 1.380480483929296, + "learning_rate": 2.9192498450092995e-07, + "loss": 0.9387, + "step": 75340 + }, + { + "epoch": 5.839048393971096, + "grad_norm": 1.432914308258221, + "learning_rate": 2.9196373217606947e-07, + "loss": 0.9544, + "step": 75350 + }, + { + "epoch": 5.839823317447402, + "grad_norm": 1.4047633586692927, + "learning_rate": 2.9200247985120894e-07, + "loss": 0.9395, + "step": 75360 + }, + { + "epoch": 5.8405982409237085, + "grad_norm": 1.3285891608242864, + "learning_rate": 2.9204122752634846e-07, + "loss": 0.9455, + "step": 75370 + }, + { + "epoch": 5.841373164400015, + "grad_norm": 1.3823022431944063, + "learning_rate": 2.9207997520148793e-07, + "loss": 0.935, + "step": 75380 + }, + { + "epoch": 5.842148087876322, + "grad_norm": 1.3539644001986526, + "learning_rate": 2.9211872287662745e-07, + "loss": 0.9368, + "step": 75390 + }, + { + "epoch": 5.842923011352629, + "grad_norm": 1.36254428515551, + "learning_rate": 2.921574705517669e-07, + "loss": 0.925, + "step": 75400 + }, + { + "epoch": 5.843697934828936, + "grad_norm": 1.399453070013584, + "learning_rate": 2.921962182269064e-07, + "loss": 0.9541, + "step": 75410 + }, + { + "epoch": 5.8444728583052425, + "grad_norm": 1.4635294916997206, + "learning_rate": 2.922349659020459e-07, + "loss": 0.936, + "step": 75420 + }, + { + "epoch": 5.845247781781549, + "grad_norm": 1.3916574068512038, + "learning_rate": 2.922737135771854e-07, + "loss": 0.9456, + "step": 75430 + }, + { + "epoch": 5.846022705257856, + "grad_norm": 1.6460164758909623, + "learning_rate": 2.923124612523249e-07, + "loss": 0.9473, + "step": 75440 + }, + { + "epoch": 5.846797628734162, + "grad_norm": 1.460406823211728, + "learning_rate": 2.9235120892746437e-07, + "loss": 0.9551, + "step": 75450 + }, + { + "epoch": 5.847572552210469, + "grad_norm": 1.44200367957501, + "learning_rate": 2.923899566026039e-07, + "loss": 0.9629, + "step": 75460 + }, + { + "epoch": 5.848347475686776, + "grad_norm": 1.4075899683836586, + "learning_rate": 2.9242870427774336e-07, + "loss": 0.9337, + "step": 75470 + }, + { + "epoch": 5.8491223991630825, + "grad_norm": 1.3359279326665836, + "learning_rate": 2.9246745195288283e-07, + "loss": 0.9337, + "step": 75480 + }, + { + "epoch": 5.849897322639389, + "grad_norm": 1.372929720568063, + "learning_rate": 2.9250619962802235e-07, + "loss": 0.9244, + "step": 75490 + }, + { + "epoch": 5.850672246115696, + "grad_norm": 1.4137050038898216, + "learning_rate": 2.925449473031618e-07, + "loss": 0.9457, + "step": 75500 + }, + { + "epoch": 5.850672246115696, + "eval_loss": 0.9438233375549316, + "eval_runtime": 318.0162, + "eval_samples_per_second": 36.07, + "eval_steps_per_second": 9.018, + "step": 75500 + }, + { + "epoch": 5.851447169592003, + "grad_norm": 1.3838122415233003, + "learning_rate": 2.9258369497830135e-07, + "loss": 0.9559, + "step": 75510 + }, + { + "epoch": 5.85222209306831, + "grad_norm": 1.4028222049721917, + "learning_rate": 2.926224426534408e-07, + "loss": 0.9424, + "step": 75520 + }, + { + "epoch": 5.852997016544617, + "grad_norm": 1.4051812529986094, + "learning_rate": 2.9266119032858034e-07, + "loss": 0.9439, + "step": 75530 + }, + { + "epoch": 5.853771940020923, + "grad_norm": 1.4153342207615305, + "learning_rate": 2.926999380037198e-07, + "loss": 0.9167, + "step": 75540 + }, + { + "epoch": 5.854546863497229, + "grad_norm": 1.3493322661417515, + "learning_rate": 2.927386856788593e-07, + "loss": 0.9373, + "step": 75550 + }, + { + "epoch": 5.855321786973536, + "grad_norm": 1.3661030993822734, + "learning_rate": 2.927774333539988e-07, + "loss": 0.9426, + "step": 75560 + }, + { + "epoch": 5.856096710449843, + "grad_norm": 1.396235382815523, + "learning_rate": 2.9281618102913827e-07, + "loss": 0.9426, + "step": 75570 + }, + { + "epoch": 5.85687163392615, + "grad_norm": 1.3698577798694882, + "learning_rate": 2.928549287042778e-07, + "loss": 0.9329, + "step": 75580 + }, + { + "epoch": 5.8576465574024565, + "grad_norm": 1.377268103779046, + "learning_rate": 2.9289367637941726e-07, + "loss": 0.9618, + "step": 75590 + }, + { + "epoch": 5.858421480878763, + "grad_norm": 1.3993178789704472, + "learning_rate": 2.929324240545568e-07, + "loss": 0.9334, + "step": 75600 + }, + { + "epoch": 5.85919640435507, + "grad_norm": 1.3632811422419238, + "learning_rate": 2.9297117172969625e-07, + "loss": 0.9425, + "step": 75610 + }, + { + "epoch": 5.859971327831377, + "grad_norm": 1.3134600427032144, + "learning_rate": 2.930099194048357e-07, + "loss": 0.9325, + "step": 75620 + }, + { + "epoch": 5.860746251307683, + "grad_norm": 1.4812118074435925, + "learning_rate": 2.9304866707997524e-07, + "loss": 0.9275, + "step": 75630 + }, + { + "epoch": 5.86152117478399, + "grad_norm": 1.4712775963667084, + "learning_rate": 2.930874147551147e-07, + "loss": 0.9405, + "step": 75640 + }, + { + "epoch": 5.8622960982602965, + "grad_norm": 1.4449706808076266, + "learning_rate": 2.9312616243025423e-07, + "loss": 0.951, + "step": 75650 + }, + { + "epoch": 5.863071021736603, + "grad_norm": 1.3265598251576356, + "learning_rate": 2.931649101053937e-07, + "loss": 0.9461, + "step": 75660 + }, + { + "epoch": 5.86384594521291, + "grad_norm": 1.3819333464884762, + "learning_rate": 2.932036577805332e-07, + "loss": 0.9584, + "step": 75670 + }, + { + "epoch": 5.864620868689217, + "grad_norm": 1.3607091708357337, + "learning_rate": 2.932424054556727e-07, + "loss": 0.9501, + "step": 75680 + }, + { + "epoch": 5.865395792165524, + "grad_norm": 1.3790465602319706, + "learning_rate": 2.9328115313081216e-07, + "loss": 0.9413, + "step": 75690 + }, + { + "epoch": 5.866170715641831, + "grad_norm": 1.4010828421102326, + "learning_rate": 2.933199008059517e-07, + "loss": 0.9418, + "step": 75700 + }, + { + "epoch": 5.866945639118137, + "grad_norm": 1.4851505694136204, + "learning_rate": 2.9335864848109115e-07, + "loss": 0.9329, + "step": 75710 + }, + { + "epoch": 5.867720562594444, + "grad_norm": 1.3774500304959043, + "learning_rate": 2.9339739615623067e-07, + "loss": 0.9503, + "step": 75720 + }, + { + "epoch": 5.868495486070751, + "grad_norm": 1.4172066030188242, + "learning_rate": 2.9343614383137014e-07, + "loss": 0.9152, + "step": 75730 + }, + { + "epoch": 5.869270409547057, + "grad_norm": 1.4146969432653285, + "learning_rate": 2.9347489150650966e-07, + "loss": 0.938, + "step": 75740 + }, + { + "epoch": 5.870045333023364, + "grad_norm": 1.3351375013432405, + "learning_rate": 2.9351363918164913e-07, + "loss": 0.924, + "step": 75750 + }, + { + "epoch": 5.8708202564996705, + "grad_norm": 1.2648986659095227, + "learning_rate": 2.935523868567886e-07, + "loss": 0.9459, + "step": 75760 + }, + { + "epoch": 5.871595179975977, + "grad_norm": 1.3760455335942623, + "learning_rate": 2.935911345319281e-07, + "loss": 0.9604, + "step": 75770 + }, + { + "epoch": 5.872370103452284, + "grad_norm": 1.370121279254465, + "learning_rate": 2.936298822070676e-07, + "loss": 0.9162, + "step": 75780 + }, + { + "epoch": 5.873145026928591, + "grad_norm": 1.4466290274656086, + "learning_rate": 2.936686298822071e-07, + "loss": 0.9351, + "step": 75790 + }, + { + "epoch": 5.873919950404898, + "grad_norm": 1.395110883268488, + "learning_rate": 2.937073775573466e-07, + "loss": 0.9391, + "step": 75800 + }, + { + "epoch": 5.874694873881205, + "grad_norm": 1.3217666547611524, + "learning_rate": 2.937461252324861e-07, + "loss": 0.935, + "step": 75810 + }, + { + "epoch": 5.8754697973575105, + "grad_norm": 1.4205197833635528, + "learning_rate": 2.937848729076256e-07, + "loss": 0.9409, + "step": 75820 + }, + { + "epoch": 5.876244720833817, + "grad_norm": 1.3689655998040882, + "learning_rate": 2.9382362058276504e-07, + "loss": 0.9427, + "step": 75830 + }, + { + "epoch": 5.877019644310124, + "grad_norm": 1.508026828350527, + "learning_rate": 2.9386236825790456e-07, + "loss": 0.9261, + "step": 75840 + }, + { + "epoch": 5.877794567786431, + "grad_norm": 1.3576191029785316, + "learning_rate": 2.9390111593304403e-07, + "loss": 0.9496, + "step": 75850 + }, + { + "epoch": 5.878569491262738, + "grad_norm": 1.4362669835608473, + "learning_rate": 2.9393986360818356e-07, + "loss": 0.9266, + "step": 75860 + }, + { + "epoch": 5.879344414739045, + "grad_norm": 1.3575000254303924, + "learning_rate": 2.93978611283323e-07, + "loss": 0.961, + "step": 75870 + }, + { + "epoch": 5.880119338215351, + "grad_norm": 1.3170957459494392, + "learning_rate": 2.9401735895846255e-07, + "loss": 0.9605, + "step": 75880 + }, + { + "epoch": 5.880894261691658, + "grad_norm": 1.352888442369137, + "learning_rate": 2.94056106633602e-07, + "loss": 0.9367, + "step": 75890 + }, + { + "epoch": 5.881669185167965, + "grad_norm": 1.3367492791772677, + "learning_rate": 2.940948543087415e-07, + "loss": 0.9228, + "step": 75900 + }, + { + "epoch": 5.882444108644272, + "grad_norm": 1.485642243523856, + "learning_rate": 2.94133601983881e-07, + "loss": 0.9547, + "step": 75910 + }, + { + "epoch": 5.883219032120578, + "grad_norm": 1.3539528143695911, + "learning_rate": 2.941723496590205e-07, + "loss": 0.9323, + "step": 75920 + }, + { + "epoch": 5.8839939555968845, + "grad_norm": 1.3274536897670144, + "learning_rate": 2.9421109733416e-07, + "loss": 0.9549, + "step": 75930 + }, + { + "epoch": 5.884768879073191, + "grad_norm": 1.3496144943809596, + "learning_rate": 2.9424984500929947e-07, + "loss": 0.9335, + "step": 75940 + }, + { + "epoch": 5.885543802549498, + "grad_norm": 1.384261770521727, + "learning_rate": 2.9428859268443894e-07, + "loss": 0.9434, + "step": 75950 + }, + { + "epoch": 5.886318726025805, + "grad_norm": 1.4176930056822896, + "learning_rate": 2.9432734035957846e-07, + "loss": 0.9249, + "step": 75960 + }, + { + "epoch": 5.887093649502112, + "grad_norm": 1.4183995182106162, + "learning_rate": 2.9436608803471793e-07, + "loss": 0.9293, + "step": 75970 + }, + { + "epoch": 5.887868572978419, + "grad_norm": 1.3746992885553715, + "learning_rate": 2.9440483570985745e-07, + "loss": 0.9274, + "step": 75980 + }, + { + "epoch": 5.888643496454725, + "grad_norm": 1.3672622731156243, + "learning_rate": 2.944435833849969e-07, + "loss": 0.9302, + "step": 75990 + }, + { + "epoch": 5.889418419931031, + "grad_norm": 1.359434018665275, + "learning_rate": 2.9448233106013644e-07, + "loss": 0.9274, + "step": 76000 + }, + { + "epoch": 5.889418419931031, + "eval_loss": 0.9432269334793091, + "eval_runtime": 319.436, + "eval_samples_per_second": 35.91, + "eval_steps_per_second": 8.978, + "step": 76000 + }, + { + "epoch": 5.890193343407338, + "grad_norm": 1.4101296401387764, + "learning_rate": 2.945210787352759e-07, + "loss": 0.9578, + "step": 76010 + }, + { + "epoch": 5.890968266883645, + "grad_norm": 1.4287529143268236, + "learning_rate": 2.945598264104154e-07, + "loss": 0.9331, + "step": 76020 + }, + { + "epoch": 5.891743190359952, + "grad_norm": 1.4259874277416351, + "learning_rate": 2.945985740855549e-07, + "loss": 0.9586, + "step": 76030 + }, + { + "epoch": 5.892518113836259, + "grad_norm": 1.360418141846679, + "learning_rate": 2.9463732176069437e-07, + "loss": 0.9382, + "step": 76040 + }, + { + "epoch": 5.893293037312565, + "grad_norm": 1.391766021679472, + "learning_rate": 2.946760694358339e-07, + "loss": 0.9458, + "step": 76050 + }, + { + "epoch": 5.894067960788872, + "grad_norm": 1.3262338376293037, + "learning_rate": 2.9471481711097336e-07, + "loss": 0.9443, + "step": 76060 + }, + { + "epoch": 5.894842884265179, + "grad_norm": 1.3453592036609001, + "learning_rate": 2.947535647861129e-07, + "loss": 0.9482, + "step": 76070 + }, + { + "epoch": 5.895617807741486, + "grad_norm": 1.3927466814307061, + "learning_rate": 2.9479231246125235e-07, + "loss": 0.9586, + "step": 76080 + }, + { + "epoch": 5.896392731217793, + "grad_norm": 1.4686310752584024, + "learning_rate": 2.948310601363918e-07, + "loss": 0.9606, + "step": 76090 + }, + { + "epoch": 5.897167654694099, + "grad_norm": 1.2777185429893565, + "learning_rate": 2.9486980781153134e-07, + "loss": 0.9479, + "step": 76100 + }, + { + "epoch": 5.897942578170405, + "grad_norm": 1.4666543775814918, + "learning_rate": 2.949085554866708e-07, + "loss": 0.9467, + "step": 76110 + }, + { + "epoch": 5.898717501646712, + "grad_norm": 1.3984084115434898, + "learning_rate": 2.9494730316181033e-07, + "loss": 0.9416, + "step": 76120 + }, + { + "epoch": 5.899492425123019, + "grad_norm": 1.4183197778432681, + "learning_rate": 2.949860508369498e-07, + "loss": 0.957, + "step": 76130 + }, + { + "epoch": 5.900267348599326, + "grad_norm": 1.3553470060489163, + "learning_rate": 2.950247985120893e-07, + "loss": 0.9371, + "step": 76140 + }, + { + "epoch": 5.901042272075633, + "grad_norm": 1.3713181985761511, + "learning_rate": 2.950635461872288e-07, + "loss": 0.9383, + "step": 76150 + }, + { + "epoch": 5.901817195551939, + "grad_norm": 1.4868778302150008, + "learning_rate": 2.9510229386236826e-07, + "loss": 0.9411, + "step": 76160 + }, + { + "epoch": 5.902592119028246, + "grad_norm": 1.433033895889541, + "learning_rate": 2.951410415375078e-07, + "loss": 0.9203, + "step": 76170 + }, + { + "epoch": 5.903367042504553, + "grad_norm": 1.3827589106540776, + "learning_rate": 2.9517978921264725e-07, + "loss": 0.9167, + "step": 76180 + }, + { + "epoch": 5.904141965980859, + "grad_norm": 1.403012023230783, + "learning_rate": 2.952185368877868e-07, + "loss": 0.9534, + "step": 76190 + }, + { + "epoch": 5.904916889457166, + "grad_norm": 1.2983070073067429, + "learning_rate": 2.9525728456292624e-07, + "loss": 0.9406, + "step": 76200 + }, + { + "epoch": 5.905691812933473, + "grad_norm": 1.2742648000305052, + "learning_rate": 2.9529603223806577e-07, + "loss": 0.9128, + "step": 76210 + }, + { + "epoch": 5.906466736409779, + "grad_norm": 1.3629954928719406, + "learning_rate": 2.9533477991320524e-07, + "loss": 0.9377, + "step": 76220 + }, + { + "epoch": 5.907241659886086, + "grad_norm": 1.3479059240429832, + "learning_rate": 2.953735275883447e-07, + "loss": 0.924, + "step": 76230 + }, + { + "epoch": 5.908016583362393, + "grad_norm": 1.3981225725785207, + "learning_rate": 2.954122752634842e-07, + "loss": 0.9556, + "step": 76240 + }, + { + "epoch": 5.9087915068387, + "grad_norm": 1.4013622407734654, + "learning_rate": 2.954510229386237e-07, + "loss": 0.951, + "step": 76250 + }, + { + "epoch": 5.909566430315007, + "grad_norm": 1.4181971295031026, + "learning_rate": 2.954897706137632e-07, + "loss": 0.9538, + "step": 76260 + }, + { + "epoch": 5.910341353791313, + "grad_norm": 1.370170734067697, + "learning_rate": 2.955285182889027e-07, + "loss": 0.9668, + "step": 76270 + }, + { + "epoch": 5.91111627726762, + "grad_norm": 1.3285558472426853, + "learning_rate": 2.955672659640422e-07, + "loss": 0.9674, + "step": 76280 + }, + { + "epoch": 5.911891200743926, + "grad_norm": 1.501791485765219, + "learning_rate": 2.956060136391817e-07, + "loss": 0.957, + "step": 76290 + }, + { + "epoch": 5.912666124220233, + "grad_norm": 1.3959630985916025, + "learning_rate": 2.9564476131432115e-07, + "loss": 0.9551, + "step": 76300 + }, + { + "epoch": 5.91344104769654, + "grad_norm": 1.4861966598952328, + "learning_rate": 2.9568350898946067e-07, + "loss": 0.9313, + "step": 76310 + }, + { + "epoch": 5.914215971172847, + "grad_norm": 1.3602945849525114, + "learning_rate": 2.9572225666460014e-07, + "loss": 0.9224, + "step": 76320 + }, + { + "epoch": 5.914990894649153, + "grad_norm": 1.4540821595677884, + "learning_rate": 2.9576100433973966e-07, + "loss": 0.9472, + "step": 76330 + }, + { + "epoch": 5.91576581812546, + "grad_norm": 1.355450140776883, + "learning_rate": 2.9579975201487913e-07, + "loss": 0.9454, + "step": 76340 + }, + { + "epoch": 5.916540741601767, + "grad_norm": 1.423446559162916, + "learning_rate": 2.9583849969001865e-07, + "loss": 0.941, + "step": 76350 + }, + { + "epoch": 5.917315665078074, + "grad_norm": 1.3085685937000109, + "learning_rate": 2.958772473651581e-07, + "loss": 0.9282, + "step": 76360 + }, + { + "epoch": 5.918090588554381, + "grad_norm": 1.450455142307877, + "learning_rate": 2.959159950402976e-07, + "loss": 0.9554, + "step": 76370 + }, + { + "epoch": 5.918865512030687, + "grad_norm": 1.4014003566363722, + "learning_rate": 2.959547427154371e-07, + "loss": 0.9346, + "step": 76380 + }, + { + "epoch": 5.919640435506993, + "grad_norm": 1.3902313413896523, + "learning_rate": 2.959934903905766e-07, + "loss": 0.9578, + "step": 76390 + }, + { + "epoch": 5.9204153589833, + "grad_norm": 1.4455139903333807, + "learning_rate": 2.960322380657161e-07, + "loss": 0.9423, + "step": 76400 + }, + { + "epoch": 5.921190282459607, + "grad_norm": 1.3929023079708671, + "learning_rate": 2.9607098574085557e-07, + "loss": 0.9473, + "step": 76410 + }, + { + "epoch": 5.921965205935914, + "grad_norm": 1.3698956671589784, + "learning_rate": 2.961097334159951e-07, + "loss": 0.9323, + "step": 76420 + }, + { + "epoch": 5.922740129412221, + "grad_norm": 1.4186906592302946, + "learning_rate": 2.9614848109113456e-07, + "loss": 0.9555, + "step": 76430 + }, + { + "epoch": 5.923515052888527, + "grad_norm": 1.4213794021007662, + "learning_rate": 2.9618722876627403e-07, + "loss": 0.9407, + "step": 76440 + }, + { + "epoch": 5.924289976364834, + "grad_norm": 1.4209216453241467, + "learning_rate": 2.9622597644141355e-07, + "loss": 0.9495, + "step": 76450 + }, + { + "epoch": 5.925064899841141, + "grad_norm": 1.3845008553771891, + "learning_rate": 2.96264724116553e-07, + "loss": 0.9413, + "step": 76460 + }, + { + "epoch": 5.925839823317448, + "grad_norm": 1.3941059042350976, + "learning_rate": 2.9630347179169254e-07, + "loss": 0.935, + "step": 76470 + }, + { + "epoch": 5.926614746793754, + "grad_norm": 1.4257668995955077, + "learning_rate": 2.96342219466832e-07, + "loss": 0.9363, + "step": 76480 + }, + { + "epoch": 5.927389670270061, + "grad_norm": 1.439740844591869, + "learning_rate": 2.9638096714197153e-07, + "loss": 0.9211, + "step": 76490 + }, + { + "epoch": 5.928164593746367, + "grad_norm": 1.3612223274667983, + "learning_rate": 2.96419714817111e-07, + "loss": 0.935, + "step": 76500 + }, + { + "epoch": 5.928164593746367, + "eval_loss": 0.9427737593650818, + "eval_runtime": 336.3, + "eval_samples_per_second": 34.109, + "eval_steps_per_second": 8.528, + "step": 76500 + }, + { + "epoch": 5.928939517222674, + "grad_norm": 1.3123767867864127, + "learning_rate": 2.9645846249225047e-07, + "loss": 0.9341, + "step": 76510 + }, + { + "epoch": 5.929714440698981, + "grad_norm": 1.3413668022863605, + "learning_rate": 2.9649721016739e-07, + "loss": 0.9327, + "step": 76520 + }, + { + "epoch": 5.930489364175288, + "grad_norm": 1.301661030664625, + "learning_rate": 2.9653595784252946e-07, + "loss": 0.9373, + "step": 76530 + }, + { + "epoch": 5.931264287651595, + "grad_norm": 1.3420602854186954, + "learning_rate": 2.96574705517669e-07, + "loss": 0.9255, + "step": 76540 + }, + { + "epoch": 5.9320392111279014, + "grad_norm": 1.3975071967578372, + "learning_rate": 2.9661345319280845e-07, + "loss": 0.9507, + "step": 76550 + }, + { + "epoch": 5.932814134604207, + "grad_norm": 1.452849253829167, + "learning_rate": 2.96652200867948e-07, + "loss": 0.914, + "step": 76560 + }, + { + "epoch": 5.933589058080514, + "grad_norm": 1.3598728844439907, + "learning_rate": 2.9669094854308745e-07, + "loss": 0.9467, + "step": 76570 + }, + { + "epoch": 5.934363981556821, + "grad_norm": 1.3522023854657206, + "learning_rate": 2.967296962182269e-07, + "loss": 0.9329, + "step": 76580 + }, + { + "epoch": 5.935138905033128, + "grad_norm": 1.3575079060768818, + "learning_rate": 2.9676844389336644e-07, + "loss": 0.9389, + "step": 76590 + }, + { + "epoch": 5.935913828509435, + "grad_norm": 1.3501375554284463, + "learning_rate": 2.968071915685059e-07, + "loss": 0.9386, + "step": 76600 + }, + { + "epoch": 5.936688751985741, + "grad_norm": 1.415579630854541, + "learning_rate": 2.9684593924364543e-07, + "loss": 0.9271, + "step": 76610 + }, + { + "epoch": 5.937463675462048, + "grad_norm": 1.4166864863521045, + "learning_rate": 2.968846869187849e-07, + "loss": 0.9167, + "step": 76620 + }, + { + "epoch": 5.938238598938355, + "grad_norm": 1.3530857551332516, + "learning_rate": 2.9692343459392437e-07, + "loss": 0.9198, + "step": 76630 + }, + { + "epoch": 5.939013522414662, + "grad_norm": 1.418752556800572, + "learning_rate": 2.969621822690639e-07, + "loss": 0.9285, + "step": 76640 + }, + { + "epoch": 5.939788445890969, + "grad_norm": 1.347090482143804, + "learning_rate": 2.9700092994420336e-07, + "loss": 0.9422, + "step": 76650 + }, + { + "epoch": 5.9405633693672755, + "grad_norm": 1.4237688331274951, + "learning_rate": 2.970396776193429e-07, + "loss": 0.9319, + "step": 76660 + }, + { + "epoch": 5.941338292843581, + "grad_norm": 1.3266339494020611, + "learning_rate": 2.9707842529448235e-07, + "loss": 0.9227, + "step": 76670 + }, + { + "epoch": 5.942113216319888, + "grad_norm": 1.3339345706720875, + "learning_rate": 2.9711717296962187e-07, + "loss": 0.932, + "step": 76680 + }, + { + "epoch": 5.942888139796195, + "grad_norm": 1.3744831594383216, + "learning_rate": 2.9715592064476134e-07, + "loss": 0.9318, + "step": 76690 + }, + { + "epoch": 5.943663063272502, + "grad_norm": 1.3928208521573695, + "learning_rate": 2.971946683199008e-07, + "loss": 0.9276, + "step": 76700 + }, + { + "epoch": 5.944437986748809, + "grad_norm": 1.3171547236628403, + "learning_rate": 2.9723341599504033e-07, + "loss": 0.9315, + "step": 76710 + }, + { + "epoch": 5.945212910225115, + "grad_norm": 1.5191811421163586, + "learning_rate": 2.972721636701798e-07, + "loss": 0.9433, + "step": 76720 + }, + { + "epoch": 5.945987833701422, + "grad_norm": 1.3360702831654918, + "learning_rate": 2.973109113453193e-07, + "loss": 0.9254, + "step": 76730 + }, + { + "epoch": 5.946762757177729, + "grad_norm": 1.36557184051128, + "learning_rate": 2.973496590204588e-07, + "loss": 0.9689, + "step": 76740 + }, + { + "epoch": 5.947537680654035, + "grad_norm": 1.5406317421982243, + "learning_rate": 2.973884066955983e-07, + "loss": 0.9291, + "step": 76750 + }, + { + "epoch": 5.948312604130342, + "grad_norm": 1.4238212289511092, + "learning_rate": 2.974271543707378e-07, + "loss": 0.948, + "step": 76760 + }, + { + "epoch": 5.949087527606649, + "grad_norm": 1.3251247516888767, + "learning_rate": 2.9746590204587725e-07, + "loss": 0.9477, + "step": 76770 + }, + { + "epoch": 5.949862451082955, + "grad_norm": 1.3598812099613107, + "learning_rate": 2.9750464972101677e-07, + "loss": 0.9454, + "step": 76780 + }, + { + "epoch": 5.950637374559262, + "grad_norm": 1.3576590433745142, + "learning_rate": 2.9754339739615624e-07, + "loss": 0.9531, + "step": 76790 + }, + { + "epoch": 5.951412298035569, + "grad_norm": 1.3551869189878618, + "learning_rate": 2.9758214507129576e-07, + "loss": 0.934, + "step": 76800 + }, + { + "epoch": 5.952187221511876, + "grad_norm": 1.3728503638597547, + "learning_rate": 2.9762089274643523e-07, + "loss": 0.9363, + "step": 76810 + }, + { + "epoch": 5.952962144988183, + "grad_norm": 1.4074021607652003, + "learning_rate": 2.9765964042157475e-07, + "loss": 0.9348, + "step": 76820 + }, + { + "epoch": 5.9537370684644895, + "grad_norm": 1.3508069390058621, + "learning_rate": 2.976983880967142e-07, + "loss": 0.9243, + "step": 76830 + }, + { + "epoch": 5.954511991940796, + "grad_norm": 1.4839395530413089, + "learning_rate": 2.977371357718537e-07, + "loss": 0.9249, + "step": 76840 + }, + { + "epoch": 5.955286915417102, + "grad_norm": 1.4397410857900688, + "learning_rate": 2.977758834469932e-07, + "loss": 0.9416, + "step": 76850 + }, + { + "epoch": 5.956061838893409, + "grad_norm": 1.4497639083275873, + "learning_rate": 2.978146311221327e-07, + "loss": 0.952, + "step": 76860 + }, + { + "epoch": 5.956836762369716, + "grad_norm": 1.409711809384205, + "learning_rate": 2.978533787972722e-07, + "loss": 0.9318, + "step": 76870 + }, + { + "epoch": 5.957611685846023, + "grad_norm": 1.3794977346479276, + "learning_rate": 2.978921264724117e-07, + "loss": 0.9469, + "step": 76880 + }, + { + "epoch": 5.958386609322329, + "grad_norm": 1.410367787851897, + "learning_rate": 2.979308741475512e-07, + "loss": 0.9478, + "step": 76890 + }, + { + "epoch": 5.959161532798636, + "grad_norm": 1.4064344147318881, + "learning_rate": 2.9796962182269067e-07, + "loss": 0.9449, + "step": 76900 + }, + { + "epoch": 5.959936456274943, + "grad_norm": 1.319816252679746, + "learning_rate": 2.9800836949783013e-07, + "loss": 0.965, + "step": 76910 + }, + { + "epoch": 5.96071137975125, + "grad_norm": 1.3809180546886675, + "learning_rate": 2.9804711717296966e-07, + "loss": 0.9464, + "step": 76920 + }, + { + "epoch": 5.961486303227556, + "grad_norm": 1.3422407481269734, + "learning_rate": 2.980858648481091e-07, + "loss": 0.9327, + "step": 76930 + }, + { + "epoch": 5.962261226703863, + "grad_norm": 1.3702569101493818, + "learning_rate": 2.9812461252324865e-07, + "loss": 0.9336, + "step": 76940 + }, + { + "epoch": 5.963036150180169, + "grad_norm": 1.32680985748268, + "learning_rate": 2.981633601983881e-07, + "loss": 0.9256, + "step": 76950 + }, + { + "epoch": 5.963811073656476, + "grad_norm": 1.4222331281105491, + "learning_rate": 2.9820210787352764e-07, + "loss": 0.9354, + "step": 76960 + }, + { + "epoch": 5.964585997132783, + "grad_norm": 1.4763646848522698, + "learning_rate": 2.982408555486671e-07, + "loss": 0.96, + "step": 76970 + }, + { + "epoch": 5.96536092060909, + "grad_norm": 1.3983326428195593, + "learning_rate": 2.982796032238066e-07, + "loss": 0.9465, + "step": 76980 + }, + { + "epoch": 5.966135844085397, + "grad_norm": 1.4007890872982176, + "learning_rate": 2.983183508989461e-07, + "loss": 0.9379, + "step": 76990 + }, + { + "epoch": 5.9669107675617035, + "grad_norm": 1.4160315114587751, + "learning_rate": 2.9835709857408557e-07, + "loss": 0.9401, + "step": 77000 + }, + { + "epoch": 5.9669107675617035, + "eval_loss": 0.9422782063484192, + "eval_runtime": 330.7926, + "eval_samples_per_second": 34.677, + "eval_steps_per_second": 8.67, + "step": 77000 + }, + { + "epoch": 5.96768569103801, + "grad_norm": 1.370199375719337, + "learning_rate": 2.983958462492251e-07, + "loss": 0.9434, + "step": 77010 + }, + { + "epoch": 5.968460614514317, + "grad_norm": 1.43952837138412, + "learning_rate": 2.9843459392436456e-07, + "loss": 0.9463, + "step": 77020 + }, + { + "epoch": 5.969235537990624, + "grad_norm": 1.4086725949388585, + "learning_rate": 2.984733415995041e-07, + "loss": 0.9382, + "step": 77030 + }, + { + "epoch": 5.97001046146693, + "grad_norm": 1.3961871078326202, + "learning_rate": 2.9851208927464355e-07, + "loss": 0.9397, + "step": 77040 + }, + { + "epoch": 5.970785384943237, + "grad_norm": 1.4355663201567694, + "learning_rate": 2.98550836949783e-07, + "loss": 0.942, + "step": 77050 + }, + { + "epoch": 5.971560308419543, + "grad_norm": 1.473513733014111, + "learning_rate": 2.9858958462492254e-07, + "loss": 0.9799, + "step": 77060 + }, + { + "epoch": 5.97233523189585, + "grad_norm": 1.359683828790495, + "learning_rate": 2.98628332300062e-07, + "loss": 0.929, + "step": 77070 + }, + { + "epoch": 5.973110155372157, + "grad_norm": 1.3744881958967543, + "learning_rate": 2.9866707997520153e-07, + "loss": 0.9302, + "step": 77080 + }, + { + "epoch": 5.973885078848464, + "grad_norm": 1.3119574601657877, + "learning_rate": 2.98705827650341e-07, + "loss": 0.9539, + "step": 77090 + }, + { + "epoch": 5.974660002324771, + "grad_norm": 1.3399977311101015, + "learning_rate": 2.987445753254805e-07, + "loss": 0.9543, + "step": 77100 + }, + { + "epoch": 5.9754349258010775, + "grad_norm": 1.3462566305870274, + "learning_rate": 2.9878332300062e-07, + "loss": 0.9473, + "step": 77110 + }, + { + "epoch": 5.976209849277383, + "grad_norm": 1.269967908558885, + "learning_rate": 2.9882207067575946e-07, + "loss": 0.9331, + "step": 77120 + }, + { + "epoch": 5.97698477275369, + "grad_norm": 1.394733835443936, + "learning_rate": 2.98860818350899e-07, + "loss": 0.9645, + "step": 77130 + }, + { + "epoch": 5.977759696229997, + "grad_norm": 1.3139621653947813, + "learning_rate": 2.9889956602603845e-07, + "loss": 0.9599, + "step": 77140 + }, + { + "epoch": 5.978534619706304, + "grad_norm": 1.3095603359187875, + "learning_rate": 2.98938313701178e-07, + "loss": 0.9219, + "step": 77150 + }, + { + "epoch": 5.979309543182611, + "grad_norm": 1.3623968289470711, + "learning_rate": 2.9897706137631744e-07, + "loss": 0.9468, + "step": 77160 + }, + { + "epoch": 5.9800844666589175, + "grad_norm": 1.4093436678237952, + "learning_rate": 2.9901580905145696e-07, + "loss": 0.9352, + "step": 77170 + }, + { + "epoch": 5.980859390135224, + "grad_norm": 1.3600739853389796, + "learning_rate": 2.9905455672659643e-07, + "loss": 0.9423, + "step": 77180 + }, + { + "epoch": 5.981634313611531, + "grad_norm": 1.3517868703097178, + "learning_rate": 2.990933044017359e-07, + "loss": 0.9271, + "step": 77190 + }, + { + "epoch": 5.982409237087838, + "grad_norm": 1.4585865860931362, + "learning_rate": 2.991320520768754e-07, + "loss": 0.9445, + "step": 77200 + }, + { + "epoch": 5.983184160564145, + "grad_norm": 1.327113320769345, + "learning_rate": 2.991707997520149e-07, + "loss": 0.9572, + "step": 77210 + }, + { + "epoch": 5.983959084040451, + "grad_norm": 1.3693558770360723, + "learning_rate": 2.992095474271544e-07, + "loss": 0.9508, + "step": 77220 + }, + { + "epoch": 5.984734007516757, + "grad_norm": 1.386413765838366, + "learning_rate": 2.992482951022939e-07, + "loss": 0.9555, + "step": 77230 + }, + { + "epoch": 5.985508930993064, + "grad_norm": 1.449131050674395, + "learning_rate": 2.992870427774334e-07, + "loss": 0.9525, + "step": 77240 + }, + { + "epoch": 5.986283854469371, + "grad_norm": 1.3634904659374583, + "learning_rate": 2.993257904525729e-07, + "loss": 0.926, + "step": 77250 + }, + { + "epoch": 5.987058777945678, + "grad_norm": 1.3809717721031491, + "learning_rate": 2.9936453812771234e-07, + "loss": 0.9356, + "step": 77260 + }, + { + "epoch": 5.987833701421985, + "grad_norm": 1.3460251034787487, + "learning_rate": 2.9940328580285187e-07, + "loss": 0.9588, + "step": 77270 + }, + { + "epoch": 5.9886086248982915, + "grad_norm": 1.3888276022882848, + "learning_rate": 2.9944203347799134e-07, + "loss": 0.9321, + "step": 77280 + }, + { + "epoch": 5.989383548374598, + "grad_norm": 1.4258588391873146, + "learning_rate": 2.9948078115313086e-07, + "loss": 0.9315, + "step": 77290 + }, + { + "epoch": 5.990158471850905, + "grad_norm": 1.3365610157650576, + "learning_rate": 2.9951952882827033e-07, + "loss": 0.9319, + "step": 77300 + }, + { + "epoch": 5.990933395327211, + "grad_norm": 1.3977799926111394, + "learning_rate": 2.9955827650340985e-07, + "loss": 0.91, + "step": 77310 + }, + { + "epoch": 5.991708318803518, + "grad_norm": 1.3314874794860299, + "learning_rate": 2.995970241785493e-07, + "loss": 0.9316, + "step": 77320 + }, + { + "epoch": 5.992483242279825, + "grad_norm": 1.297521958890903, + "learning_rate": 2.996357718536888e-07, + "loss": 0.9531, + "step": 77330 + }, + { + "epoch": 5.9932581657561315, + "grad_norm": 1.3020956138600197, + "learning_rate": 2.996745195288283e-07, + "loss": 0.9497, + "step": 77340 + }, + { + "epoch": 5.994033089232438, + "grad_norm": 1.3754880146848902, + "learning_rate": 2.997132672039678e-07, + "loss": 0.9433, + "step": 77350 + }, + { + "epoch": 5.994808012708745, + "grad_norm": 1.3520346174514508, + "learning_rate": 2.997520148791073e-07, + "loss": 0.9382, + "step": 77360 + }, + { + "epoch": 5.995582936185052, + "grad_norm": 1.417258104201398, + "learning_rate": 2.9979076255424677e-07, + "loss": 0.9336, + "step": 77370 + }, + { + "epoch": 5.996357859661359, + "grad_norm": 1.4423932834557611, + "learning_rate": 2.9982951022938624e-07, + "loss": 0.9413, + "step": 77380 + }, + { + "epoch": 5.9971327831376655, + "grad_norm": 1.5039121962558286, + "learning_rate": 2.9986825790452576e-07, + "loss": 0.9647, + "step": 77390 + }, + { + "epoch": 5.997907706613972, + "grad_norm": 1.442505519741371, + "learning_rate": 2.9990700557966523e-07, + "loss": 0.9474, + "step": 77400 + }, + { + "epoch": 5.998682630090278, + "grad_norm": 1.4051364928953505, + "learning_rate": 2.9994575325480475e-07, + "loss": 0.9494, + "step": 77410 + }, + { + "epoch": 5.999457553566585, + "grad_norm": 1.3096246226769497, + "learning_rate": 2.999845009299442e-07, + "loss": 0.9363, + "step": 77420 + }, + { + "epoch": 6.000232477042892, + "grad_norm": 1.393753896618301, + "learning_rate": 3.0002324860508374e-07, + "loss": 0.9414, + "step": 77430 + }, + { + "epoch": 6.001007400519199, + "grad_norm": 1.4237569015564457, + "learning_rate": 3.000619962802232e-07, + "loss": 0.9497, + "step": 77440 + }, + { + "epoch": 6.0017823239955055, + "grad_norm": 1.4393853790004867, + "learning_rate": 3.001007439553627e-07, + "loss": 0.9398, + "step": 77450 + }, + { + "epoch": 6.002557247471812, + "grad_norm": 1.3867572903952616, + "learning_rate": 3.001394916305022e-07, + "loss": 0.9174, + "step": 77460 + }, + { + "epoch": 6.003332170948119, + "grad_norm": 1.362984025488032, + "learning_rate": 3.0017823930564167e-07, + "loss": 0.9367, + "step": 77470 + }, + { + "epoch": 6.004107094424426, + "grad_norm": 1.3595482295199015, + "learning_rate": 3.002169869807812e-07, + "loss": 0.9271, + "step": 77480 + }, + { + "epoch": 6.004882017900733, + "grad_norm": 1.3443018951657708, + "learning_rate": 3.0025573465592066e-07, + "loss": 0.9301, + "step": 77490 + }, + { + "epoch": 6.005656941377039, + "grad_norm": 1.4005313322834636, + "learning_rate": 3.002944823310602e-07, + "loss": 0.9247, + "step": 77500 + }, + { + "epoch": 6.005656941377039, + "eval_loss": 0.9418132901191711, + "eval_runtime": 331.3387, + "eval_samples_per_second": 34.62, + "eval_steps_per_second": 8.656, + "step": 77500 + }, + { + "epoch": 6.0064318648533455, + "grad_norm": 1.3778789961132978, + "learning_rate": 3.0033323000619965e-07, + "loss": 0.9238, + "step": 77510 + }, + { + "epoch": 6.007206788329652, + "grad_norm": 1.258704402430241, + "learning_rate": 3.003719776813391e-07, + "loss": 0.9095, + "step": 77520 + }, + { + "epoch": 6.007981711805959, + "grad_norm": 1.3564571583007037, + "learning_rate": 3.0041072535647864e-07, + "loss": 0.927, + "step": 77530 + }, + { + "epoch": 6.008756635282266, + "grad_norm": 1.3763366678293711, + "learning_rate": 3.004494730316181e-07, + "loss": 0.9405, + "step": 77540 + }, + { + "epoch": 6.009531558758573, + "grad_norm": 1.4055284548567641, + "learning_rate": 3.0048822070675764e-07, + "loss": 0.943, + "step": 77550 + }, + { + "epoch": 6.0103064822348795, + "grad_norm": 1.4305814260931131, + "learning_rate": 3.005269683818971e-07, + "loss": 0.9364, + "step": 77560 + }, + { + "epoch": 6.011081405711186, + "grad_norm": 1.4001303888595618, + "learning_rate": 3.005657160570366e-07, + "loss": 0.9307, + "step": 77570 + }, + { + "epoch": 6.011856329187493, + "grad_norm": 1.4505052032167949, + "learning_rate": 3.006044637321761e-07, + "loss": 0.9421, + "step": 77580 + }, + { + "epoch": 6.012631252663799, + "grad_norm": 1.320672607003554, + "learning_rate": 3.0064321140731556e-07, + "loss": 0.9429, + "step": 77590 + }, + { + "epoch": 6.013406176140106, + "grad_norm": 1.3625955041793658, + "learning_rate": 3.006819590824551e-07, + "loss": 0.9288, + "step": 77600 + }, + { + "epoch": 6.014181099616413, + "grad_norm": 1.283031987173071, + "learning_rate": 3.0072070675759456e-07, + "loss": 0.9226, + "step": 77610 + }, + { + "epoch": 6.0149560230927195, + "grad_norm": 1.3793026138679703, + "learning_rate": 3.007594544327341e-07, + "loss": 0.9239, + "step": 77620 + }, + { + "epoch": 6.015730946569026, + "grad_norm": 1.3950210147651854, + "learning_rate": 3.0079820210787355e-07, + "loss": 0.948, + "step": 77630 + }, + { + "epoch": 6.016505870045333, + "grad_norm": 1.3594625427292029, + "learning_rate": 3.0083694978301307e-07, + "loss": 0.9427, + "step": 77640 + }, + { + "epoch": 6.01728079352164, + "grad_norm": 1.330338479418566, + "learning_rate": 3.0087569745815254e-07, + "loss": 0.9518, + "step": 77650 + }, + { + "epoch": 6.018055716997947, + "grad_norm": 1.4803381885234785, + "learning_rate": 3.00914445133292e-07, + "loss": 0.9425, + "step": 77660 + }, + { + "epoch": 6.0188306404742535, + "grad_norm": 1.3526001309623013, + "learning_rate": 3.0095319280843153e-07, + "loss": 0.9484, + "step": 77670 + }, + { + "epoch": 6.0196055639505595, + "grad_norm": 1.3788422096248525, + "learning_rate": 3.00991940483571e-07, + "loss": 0.9371, + "step": 77680 + }, + { + "epoch": 6.020380487426866, + "grad_norm": 1.377521335568205, + "learning_rate": 3.010306881587105e-07, + "loss": 0.9178, + "step": 77690 + }, + { + "epoch": 6.021155410903173, + "grad_norm": 1.3713662501504926, + "learning_rate": 3.0106943583385e-07, + "loss": 0.9485, + "step": 77700 + }, + { + "epoch": 6.02193033437948, + "grad_norm": 1.358486234872434, + "learning_rate": 3.011081835089895e-07, + "loss": 0.9294, + "step": 77710 + }, + { + "epoch": 6.022705257855787, + "grad_norm": 1.3602064821734985, + "learning_rate": 3.01146931184129e-07, + "loss": 0.9296, + "step": 77720 + }, + { + "epoch": 6.0234801813320935, + "grad_norm": 1.3241392916573158, + "learning_rate": 3.0118567885926845e-07, + "loss": 0.9298, + "step": 77730 + }, + { + "epoch": 6.0242551048084, + "grad_norm": 1.3482582988259808, + "learning_rate": 3.0122442653440797e-07, + "loss": 0.9315, + "step": 77740 + }, + { + "epoch": 6.025030028284707, + "grad_norm": 1.4285459091320254, + "learning_rate": 3.0126317420954744e-07, + "loss": 0.9236, + "step": 77750 + }, + { + "epoch": 6.025804951761014, + "grad_norm": 1.311844669246895, + "learning_rate": 3.0130192188468696e-07, + "loss": 0.9486, + "step": 77760 + }, + { + "epoch": 6.026579875237321, + "grad_norm": 1.3529629058918857, + "learning_rate": 3.0134066955982643e-07, + "loss": 0.9276, + "step": 77770 + }, + { + "epoch": 6.027354798713627, + "grad_norm": 1.459588530071261, + "learning_rate": 3.0137941723496595e-07, + "loss": 0.9382, + "step": 77780 + }, + { + "epoch": 6.0281297221899335, + "grad_norm": 1.4014896102525651, + "learning_rate": 3.014181649101054e-07, + "loss": 0.9355, + "step": 77790 + }, + { + "epoch": 6.02890464566624, + "grad_norm": 1.4389469638816883, + "learning_rate": 3.014569125852449e-07, + "loss": 0.9383, + "step": 77800 + }, + { + "epoch": 6.029679569142547, + "grad_norm": 1.3573130613127933, + "learning_rate": 3.014956602603844e-07, + "loss": 0.9357, + "step": 77810 + }, + { + "epoch": 6.030454492618854, + "grad_norm": 1.4596547748882658, + "learning_rate": 3.015344079355239e-07, + "loss": 0.9242, + "step": 77820 + }, + { + "epoch": 6.031229416095161, + "grad_norm": 1.3801258379399661, + "learning_rate": 3.015731556106634e-07, + "loss": 0.9374, + "step": 77830 + }, + { + "epoch": 6.0320043395714675, + "grad_norm": 1.3324951088669243, + "learning_rate": 3.0161190328580287e-07, + "loss": 0.9177, + "step": 77840 + }, + { + "epoch": 6.032779263047774, + "grad_norm": 1.4437528920587006, + "learning_rate": 3.016506509609424e-07, + "loss": 0.9193, + "step": 77850 + }, + { + "epoch": 6.033554186524081, + "grad_norm": 1.3422772414614499, + "learning_rate": 3.0168939863608186e-07, + "loss": 0.924, + "step": 77860 + }, + { + "epoch": 6.034329110000387, + "grad_norm": 1.4527862073766704, + "learning_rate": 3.0172814631122133e-07, + "loss": 0.9641, + "step": 77870 + }, + { + "epoch": 6.035104033476694, + "grad_norm": 1.4855678334484486, + "learning_rate": 3.0176689398636085e-07, + "loss": 0.9159, + "step": 77880 + }, + { + "epoch": 6.035878956953001, + "grad_norm": 1.3460984384776273, + "learning_rate": 3.018056416615003e-07, + "loss": 0.9453, + "step": 77890 + }, + { + "epoch": 6.0366538804293075, + "grad_norm": 1.4402650427826131, + "learning_rate": 3.0184438933663985e-07, + "loss": 0.922, + "step": 77900 + }, + { + "epoch": 6.037428803905614, + "grad_norm": 1.3658838618560607, + "learning_rate": 3.018831370117793e-07, + "loss": 0.9421, + "step": 77910 + }, + { + "epoch": 6.038203727381921, + "grad_norm": 1.4105280380070209, + "learning_rate": 3.0192188468691884e-07, + "loss": 0.9402, + "step": 77920 + }, + { + "epoch": 6.038978650858228, + "grad_norm": 1.3947529681778994, + "learning_rate": 3.019606323620583e-07, + "loss": 0.9221, + "step": 77930 + }, + { + "epoch": 6.039753574334535, + "grad_norm": 1.3595984817154247, + "learning_rate": 3.019993800371978e-07, + "loss": 0.9352, + "step": 77940 + }, + { + "epoch": 6.040528497810842, + "grad_norm": 1.3887921382680046, + "learning_rate": 3.020381277123373e-07, + "loss": 0.9237, + "step": 77950 + }, + { + "epoch": 6.0413034212871475, + "grad_norm": 1.3335734452238566, + "learning_rate": 3.0207687538747677e-07, + "loss": 0.9158, + "step": 77960 + }, + { + "epoch": 6.042078344763454, + "grad_norm": 1.4559262203133752, + "learning_rate": 3.021156230626163e-07, + "loss": 0.9222, + "step": 77970 + }, + { + "epoch": 6.042853268239761, + "grad_norm": 1.317694874526139, + "learning_rate": 3.0215437073775576e-07, + "loss": 0.9431, + "step": 77980 + }, + { + "epoch": 6.043628191716068, + "grad_norm": 1.3622081187282427, + "learning_rate": 3.021931184128953e-07, + "loss": 0.9223, + "step": 77990 + }, + { + "epoch": 6.044403115192375, + "grad_norm": 1.4711364165584724, + "learning_rate": 3.0223186608803475e-07, + "loss": 0.9413, + "step": 78000 + }, + { + "epoch": 6.044403115192375, + "eval_loss": 0.9412979483604431, + "eval_runtime": 329.6783, + "eval_samples_per_second": 34.795, + "eval_steps_per_second": 8.699, + "step": 78000 + }, + { + "epoch": 6.0451780386686815, + "grad_norm": 1.363442918788406, + "learning_rate": 3.022706137631742e-07, + "loss": 0.915, + "step": 78010 + }, + { + "epoch": 6.045952962144988, + "grad_norm": 1.3737877795724642, + "learning_rate": 3.0230936143831374e-07, + "loss": 0.9468, + "step": 78020 + }, + { + "epoch": 6.046727885621295, + "grad_norm": 1.3767773182598262, + "learning_rate": 3.023481091134532e-07, + "loss": 0.95, + "step": 78030 + }, + { + "epoch": 6.047502809097602, + "grad_norm": 1.267474891970462, + "learning_rate": 3.0238685678859273e-07, + "loss": 0.9242, + "step": 78040 + }, + { + "epoch": 6.048277732573908, + "grad_norm": 1.3754598151466035, + "learning_rate": 3.024256044637322e-07, + "loss": 0.9374, + "step": 78050 + }, + { + "epoch": 6.049052656050215, + "grad_norm": 1.3592836409839797, + "learning_rate": 3.024643521388717e-07, + "loss": 0.9452, + "step": 78060 + }, + { + "epoch": 6.0498275795265215, + "grad_norm": 1.4009481437812377, + "learning_rate": 3.025030998140112e-07, + "loss": 0.9248, + "step": 78070 + }, + { + "epoch": 6.050602503002828, + "grad_norm": 1.3839450086083471, + "learning_rate": 3.0254184748915066e-07, + "loss": 0.9377, + "step": 78080 + }, + { + "epoch": 6.051377426479135, + "grad_norm": 1.3738198353297664, + "learning_rate": 3.025805951642902e-07, + "loss": 0.9479, + "step": 78090 + }, + { + "epoch": 6.052152349955442, + "grad_norm": 1.375891800233639, + "learning_rate": 3.0261934283942965e-07, + "loss": 0.9656, + "step": 78100 + }, + { + "epoch": 6.052927273431749, + "grad_norm": 1.4638288997843647, + "learning_rate": 3.0265809051456917e-07, + "loss": 0.9373, + "step": 78110 + }, + { + "epoch": 6.0537021969080556, + "grad_norm": 1.605303129495517, + "learning_rate": 3.0269683818970864e-07, + "loss": 0.9412, + "step": 78120 + }, + { + "epoch": 6.054477120384362, + "grad_norm": 1.3609769570911217, + "learning_rate": 3.027355858648481e-07, + "loss": 0.9518, + "step": 78130 + }, + { + "epoch": 6.055252043860669, + "grad_norm": 1.3440634846544497, + "learning_rate": 3.0277433353998763e-07, + "loss": 0.9375, + "step": 78140 + }, + { + "epoch": 6.056026967336975, + "grad_norm": 1.343247224924933, + "learning_rate": 3.028130812151271e-07, + "loss": 0.9329, + "step": 78150 + }, + { + "epoch": 6.056801890813282, + "grad_norm": 1.3777987080001426, + "learning_rate": 3.028518288902666e-07, + "loss": 0.9243, + "step": 78160 + }, + { + "epoch": 6.057576814289589, + "grad_norm": 1.349286486899116, + "learning_rate": 3.028905765654061e-07, + "loss": 0.9042, + "step": 78170 + }, + { + "epoch": 6.0583517377658955, + "grad_norm": 1.4236243583597663, + "learning_rate": 3.029293242405456e-07, + "loss": 0.9406, + "step": 78180 + }, + { + "epoch": 6.059126661242202, + "grad_norm": 1.3432041242755965, + "learning_rate": 3.029680719156851e-07, + "loss": 0.9272, + "step": 78190 + }, + { + "epoch": 6.059901584718509, + "grad_norm": 1.4228459556844153, + "learning_rate": 3.0300681959082455e-07, + "loss": 0.9813, + "step": 78200 + }, + { + "epoch": 6.060676508194816, + "grad_norm": 1.370064210896743, + "learning_rate": 3.030455672659641e-07, + "loss": 0.9324, + "step": 78210 + }, + { + "epoch": 6.061451431671123, + "grad_norm": 1.306421401823494, + "learning_rate": 3.0308431494110354e-07, + "loss": 0.9375, + "step": 78220 + }, + { + "epoch": 6.06222635514743, + "grad_norm": 1.3695237232094737, + "learning_rate": 3.0312306261624307e-07, + "loss": 0.9451, + "step": 78230 + }, + { + "epoch": 6.0630012786237355, + "grad_norm": 1.4375553417171913, + "learning_rate": 3.0316181029138253e-07, + "loss": 0.9376, + "step": 78240 + }, + { + "epoch": 6.063776202100042, + "grad_norm": 1.3438271011321383, + "learning_rate": 3.0320055796652206e-07, + "loss": 0.9403, + "step": 78250 + }, + { + "epoch": 6.064551125576349, + "grad_norm": 1.476531454644511, + "learning_rate": 3.032393056416615e-07, + "loss": 0.9436, + "step": 78260 + }, + { + "epoch": 6.065326049052656, + "grad_norm": 1.4661542803513372, + "learning_rate": 3.03278053316801e-07, + "loss": 0.9177, + "step": 78270 + }, + { + "epoch": 6.066100972528963, + "grad_norm": 1.4330761842390445, + "learning_rate": 3.033168009919405e-07, + "loss": 0.9226, + "step": 78280 + }, + { + "epoch": 6.0668758960052696, + "grad_norm": 1.511809818504738, + "learning_rate": 3.0335554866708e-07, + "loss": 0.924, + "step": 78290 + }, + { + "epoch": 6.067650819481576, + "grad_norm": 1.415189334895935, + "learning_rate": 3.033942963422195e-07, + "loss": 0.9331, + "step": 78300 + }, + { + "epoch": 6.068425742957883, + "grad_norm": 1.339863104785121, + "learning_rate": 3.03433044017359e-07, + "loss": 0.9211, + "step": 78310 + }, + { + "epoch": 6.06920066643419, + "grad_norm": 1.3536527413766297, + "learning_rate": 3.034717916924985e-07, + "loss": 0.9257, + "step": 78320 + }, + { + "epoch": 6.069975589910496, + "grad_norm": 1.4390433445497768, + "learning_rate": 3.0351053936763797e-07, + "loss": 0.9578, + "step": 78330 + }, + { + "epoch": 6.070750513386803, + "grad_norm": 1.3884069561896355, + "learning_rate": 3.0354928704277744e-07, + "loss": 0.9403, + "step": 78340 + }, + { + "epoch": 6.0715254368631095, + "grad_norm": 1.386030882075188, + "learning_rate": 3.0358803471791696e-07, + "loss": 0.9373, + "step": 78350 + }, + { + "epoch": 6.072300360339416, + "grad_norm": 1.3387187036255432, + "learning_rate": 3.0362678239305643e-07, + "loss": 0.9416, + "step": 78360 + }, + { + "epoch": 6.073075283815723, + "grad_norm": 1.4454349356261236, + "learning_rate": 3.0366553006819595e-07, + "loss": 0.9525, + "step": 78370 + }, + { + "epoch": 6.07385020729203, + "grad_norm": 1.3687963819198614, + "learning_rate": 3.037042777433354e-07, + "loss": 0.9479, + "step": 78380 + }, + { + "epoch": 6.074625130768337, + "grad_norm": 1.4251223560131243, + "learning_rate": 3.0374302541847494e-07, + "loss": 0.9549, + "step": 78390 + }, + { + "epoch": 6.075400054244644, + "grad_norm": 1.329320656264589, + "learning_rate": 3.037817730936144e-07, + "loss": 0.9574, + "step": 78400 + }, + { + "epoch": 6.07617497772095, + "grad_norm": 1.3491292174264153, + "learning_rate": 3.038205207687539e-07, + "loss": 0.9329, + "step": 78410 + }, + { + "epoch": 6.076949901197257, + "grad_norm": 1.3274255186412416, + "learning_rate": 3.038592684438934e-07, + "loss": 0.9276, + "step": 78420 + }, + { + "epoch": 6.077724824673563, + "grad_norm": 1.4099562211517191, + "learning_rate": 3.0389801611903287e-07, + "loss": 0.9447, + "step": 78430 + }, + { + "epoch": 6.07849974814987, + "grad_norm": 1.3937636438215755, + "learning_rate": 3.039367637941724e-07, + "loss": 0.9208, + "step": 78440 + }, + { + "epoch": 6.079274671626177, + "grad_norm": 1.3838359654316317, + "learning_rate": 3.0397551146931186e-07, + "loss": 0.9201, + "step": 78450 + }, + { + "epoch": 6.0800495951024836, + "grad_norm": 1.3828263617272591, + "learning_rate": 3.040142591444514e-07, + "loss": 0.9476, + "step": 78460 + }, + { + "epoch": 6.08082451857879, + "grad_norm": 1.5115089779315438, + "learning_rate": 3.0405300681959085e-07, + "loss": 0.9498, + "step": 78470 + }, + { + "epoch": 6.081599442055097, + "grad_norm": 1.3821918358992857, + "learning_rate": 3.040917544947303e-07, + "loss": 0.9267, + "step": 78480 + }, + { + "epoch": 6.082374365531404, + "grad_norm": 1.4216786773540668, + "learning_rate": 3.0413050216986984e-07, + "loss": 0.9314, + "step": 78490 + }, + { + "epoch": 6.083149289007711, + "grad_norm": 1.4305883021943984, + "learning_rate": 3.041692498450093e-07, + "loss": 0.9263, + "step": 78500 + }, + { + "epoch": 6.083149289007711, + "eval_loss": 0.9409106969833374, + "eval_runtime": 328.3881, + "eval_samples_per_second": 34.931, + "eval_steps_per_second": 8.734, + "step": 78500 + }, + { + "epoch": 6.083924212484018, + "grad_norm": 1.3858026310829832, + "learning_rate": 3.0420799752014883e-07, + "loss": 0.9338, + "step": 78510 + }, + { + "epoch": 6.0846991359603235, + "grad_norm": 1.496621228639605, + "learning_rate": 3.042467451952883e-07, + "loss": 0.9444, + "step": 78520 + }, + { + "epoch": 6.08547405943663, + "grad_norm": 1.3584368127923612, + "learning_rate": 3.042854928704278e-07, + "loss": 0.9319, + "step": 78530 + }, + { + "epoch": 6.086248982912937, + "grad_norm": 1.3895599714535707, + "learning_rate": 3.043242405455673e-07, + "loss": 0.9613, + "step": 78540 + }, + { + "epoch": 6.087023906389244, + "grad_norm": 1.424043979537738, + "learning_rate": 3.0436298822070676e-07, + "loss": 0.9603, + "step": 78550 + }, + { + "epoch": 6.087798829865551, + "grad_norm": 1.3410214519189685, + "learning_rate": 3.044017358958463e-07, + "loss": 0.9466, + "step": 78560 + }, + { + "epoch": 6.088573753341858, + "grad_norm": 1.3710127590304695, + "learning_rate": 3.0444048357098575e-07, + "loss": 0.939, + "step": 78570 + }, + { + "epoch": 6.089348676818164, + "grad_norm": 1.4085501611821147, + "learning_rate": 3.044792312461253e-07, + "loss": 0.9413, + "step": 78580 + }, + { + "epoch": 6.090123600294471, + "grad_norm": 1.3403377458518142, + "learning_rate": 3.0451797892126474e-07, + "loss": 0.9234, + "step": 78590 + }, + { + "epoch": 6.090898523770778, + "grad_norm": 1.4219073638796922, + "learning_rate": 3.0455672659640427e-07, + "loss": 0.918, + "step": 78600 + }, + { + "epoch": 6.091673447247084, + "grad_norm": 1.4074239454959252, + "learning_rate": 3.0459547427154374e-07, + "loss": 0.9446, + "step": 78610 + }, + { + "epoch": 6.092448370723391, + "grad_norm": 1.4131813665362558, + "learning_rate": 3.046342219466832e-07, + "loss": 0.9363, + "step": 78620 + }, + { + "epoch": 6.0932232941996975, + "grad_norm": 1.371103575166693, + "learning_rate": 3.0467296962182273e-07, + "loss": 0.945, + "step": 78630 + }, + { + "epoch": 6.093998217676004, + "grad_norm": 1.3913812906198284, + "learning_rate": 3.047117172969622e-07, + "loss": 0.9323, + "step": 78640 + }, + { + "epoch": 6.094773141152311, + "grad_norm": 1.4045927633389443, + "learning_rate": 3.047504649721017e-07, + "loss": 0.9132, + "step": 78650 + }, + { + "epoch": 6.095548064628618, + "grad_norm": 1.4435850767179328, + "learning_rate": 3.047892126472412e-07, + "loss": 0.9573, + "step": 78660 + }, + { + "epoch": 6.096322988104925, + "grad_norm": 1.4468581806653633, + "learning_rate": 3.048279603223807e-07, + "loss": 0.9345, + "step": 78670 + }, + { + "epoch": 6.097097911581232, + "grad_norm": 1.4104853543672249, + "learning_rate": 3.048667079975202e-07, + "loss": 0.9452, + "step": 78680 + }, + { + "epoch": 6.097872835057538, + "grad_norm": 1.33125374974484, + "learning_rate": 3.0490545567265965e-07, + "loss": 0.9229, + "step": 78690 + }, + { + "epoch": 6.098647758533845, + "grad_norm": 1.3978709745953741, + "learning_rate": 3.0494420334779917e-07, + "loss": 0.9448, + "step": 78700 + }, + { + "epoch": 6.099422682010151, + "grad_norm": 1.389061911638437, + "learning_rate": 3.0498295102293864e-07, + "loss": 0.9208, + "step": 78710 + }, + { + "epoch": 6.100197605486458, + "grad_norm": 1.400815456157256, + "learning_rate": 3.0502169869807816e-07, + "loss": 0.9288, + "step": 78720 + }, + { + "epoch": 6.100972528962765, + "grad_norm": 1.3772562217631332, + "learning_rate": 3.0506044637321763e-07, + "loss": 0.9345, + "step": 78730 + }, + { + "epoch": 6.101747452439072, + "grad_norm": 1.383146911383278, + "learning_rate": 3.0509919404835715e-07, + "loss": 0.9316, + "step": 78740 + }, + { + "epoch": 6.102522375915378, + "grad_norm": 1.4113072522463859, + "learning_rate": 3.051379417234966e-07, + "loss": 0.9426, + "step": 78750 + }, + { + "epoch": 6.103297299391685, + "grad_norm": 1.394570698432847, + "learning_rate": 3.051766893986361e-07, + "loss": 0.9473, + "step": 78760 + }, + { + "epoch": 6.104072222867992, + "grad_norm": 1.3947215685238556, + "learning_rate": 3.052154370737756e-07, + "loss": 0.9368, + "step": 78770 + }, + { + "epoch": 6.104847146344299, + "grad_norm": 1.4184000932651664, + "learning_rate": 3.052541847489151e-07, + "loss": 0.9456, + "step": 78780 + }, + { + "epoch": 6.105622069820606, + "grad_norm": 1.4146818029123882, + "learning_rate": 3.052929324240546e-07, + "loss": 0.9296, + "step": 78790 + }, + { + "epoch": 6.1063969932969115, + "grad_norm": 1.3940361585215668, + "learning_rate": 3.0533168009919407e-07, + "loss": 0.9473, + "step": 78800 + }, + { + "epoch": 6.107171916773218, + "grad_norm": 1.4269439401767625, + "learning_rate": 3.053704277743336e-07, + "loss": 0.9321, + "step": 78810 + }, + { + "epoch": 6.107946840249525, + "grad_norm": 1.3865244427261425, + "learning_rate": 3.0540917544947306e-07, + "loss": 0.9167, + "step": 78820 + }, + { + "epoch": 6.108721763725832, + "grad_norm": 1.419999924451691, + "learning_rate": 3.0544792312461253e-07, + "loss": 0.9473, + "step": 78830 + }, + { + "epoch": 6.109496687202139, + "grad_norm": 1.4606471408022772, + "learning_rate": 3.0548667079975205e-07, + "loss": 0.9276, + "step": 78840 + }, + { + "epoch": 6.110271610678446, + "grad_norm": 1.2901675416162495, + "learning_rate": 3.055254184748915e-07, + "loss": 0.9296, + "step": 78850 + }, + { + "epoch": 6.111046534154752, + "grad_norm": 1.3189595855907446, + "learning_rate": 3.0556416615003104e-07, + "loss": 0.9494, + "step": 78860 + }, + { + "epoch": 6.111821457631059, + "grad_norm": 1.3970179947761137, + "learning_rate": 3.056029138251705e-07, + "loss": 0.9335, + "step": 78870 + }, + { + "epoch": 6.112596381107366, + "grad_norm": 1.3765009647022803, + "learning_rate": 3.0564166150031e-07, + "loss": 0.9296, + "step": 78880 + }, + { + "epoch": 6.113371304583672, + "grad_norm": 1.4308544685487636, + "learning_rate": 3.056804091754495e-07, + "loss": 0.9237, + "step": 78890 + }, + { + "epoch": 6.114146228059979, + "grad_norm": 1.4520463785682203, + "learning_rate": 3.0571915685058897e-07, + "loss": 0.9468, + "step": 78900 + }, + { + "epoch": 6.114921151536286, + "grad_norm": 1.4123014055742389, + "learning_rate": 3.057579045257285e-07, + "loss": 0.9341, + "step": 78910 + }, + { + "epoch": 6.115696075012592, + "grad_norm": 1.3423957824021018, + "learning_rate": 3.0579665220086796e-07, + "loss": 0.9365, + "step": 78920 + }, + { + "epoch": 6.116470998488899, + "grad_norm": 1.4601366570719247, + "learning_rate": 3.058353998760075e-07, + "loss": 0.9299, + "step": 78930 + }, + { + "epoch": 6.117245921965206, + "grad_norm": 1.4324213338804508, + "learning_rate": 3.0587414755114696e-07, + "loss": 0.9333, + "step": 78940 + }, + { + "epoch": 6.118020845441513, + "grad_norm": 1.3676443413113302, + "learning_rate": 3.059128952262864e-07, + "loss": 0.9362, + "step": 78950 + }, + { + "epoch": 6.11879576891782, + "grad_norm": 1.4078512016822065, + "learning_rate": 3.0595164290142595e-07, + "loss": 0.9316, + "step": 78960 + }, + { + "epoch": 6.119570692394126, + "grad_norm": 1.4562968860173189, + "learning_rate": 3.059903905765654e-07, + "loss": 0.9474, + "step": 78970 + }, + { + "epoch": 6.120345615870432, + "grad_norm": 1.3236353224403072, + "learning_rate": 3.0602913825170494e-07, + "loss": 0.9391, + "step": 78980 + }, + { + "epoch": 6.121120539346739, + "grad_norm": 1.3963734402946515, + "learning_rate": 3.060678859268444e-07, + "loss": 0.9383, + "step": 78990 + }, + { + "epoch": 6.121895462823046, + "grad_norm": 1.3407011935284485, + "learning_rate": 3.0610663360198393e-07, + "loss": 0.9561, + "step": 79000 + }, + { + "epoch": 6.121895462823046, + "eval_loss": 0.940485954284668, + "eval_runtime": 333.3961, + "eval_samples_per_second": 34.407, + "eval_steps_per_second": 8.602, + "step": 79000 + }, + { + "epoch": 6.122670386299353, + "grad_norm": 1.407919266924269, + "learning_rate": 3.061453812771234e-07, + "loss": 0.9502, + "step": 79010 + }, + { + "epoch": 6.12344530977566, + "grad_norm": 1.3615847886744903, + "learning_rate": 3.0618412895226287e-07, + "loss": 0.9467, + "step": 79020 + }, + { + "epoch": 6.124220233251966, + "grad_norm": 1.3524488192482274, + "learning_rate": 3.062228766274024e-07, + "loss": 0.9348, + "step": 79030 + }, + { + "epoch": 6.124995156728273, + "grad_norm": 1.4401535318202958, + "learning_rate": 3.0626162430254186e-07, + "loss": 0.9304, + "step": 79040 + }, + { + "epoch": 6.12577008020458, + "grad_norm": 1.4389836045351687, + "learning_rate": 3.063003719776814e-07, + "loss": 0.9495, + "step": 79050 + }, + { + "epoch": 6.126545003680887, + "grad_norm": 1.3450238910601322, + "learning_rate": 3.0633911965282085e-07, + "loss": 0.9396, + "step": 79060 + }, + { + "epoch": 6.127319927157194, + "grad_norm": 1.3355818632536687, + "learning_rate": 3.0637786732796037e-07, + "loss": 0.9388, + "step": 79070 + }, + { + "epoch": 6.1280948506335, + "grad_norm": 1.3497498041783216, + "learning_rate": 3.0641661500309984e-07, + "loss": 0.9328, + "step": 79080 + }, + { + "epoch": 6.128869774109806, + "grad_norm": 1.3578450627232128, + "learning_rate": 3.064553626782393e-07, + "loss": 0.9672, + "step": 79090 + }, + { + "epoch": 6.129644697586113, + "grad_norm": 1.3135682707773055, + "learning_rate": 3.0649411035337883e-07, + "loss": 0.9229, + "step": 79100 + }, + { + "epoch": 6.13041962106242, + "grad_norm": 1.391513787984874, + "learning_rate": 3.065328580285183e-07, + "loss": 0.9258, + "step": 79110 + }, + { + "epoch": 6.131194544538727, + "grad_norm": 1.2933071083385173, + "learning_rate": 3.065716057036578e-07, + "loss": 0.9357, + "step": 79120 + }, + { + "epoch": 6.131969468015034, + "grad_norm": 1.3353077946675165, + "learning_rate": 3.066103533787973e-07, + "loss": 0.941, + "step": 79130 + }, + { + "epoch": 6.13274439149134, + "grad_norm": 1.424310543195098, + "learning_rate": 3.066491010539368e-07, + "loss": 0.9341, + "step": 79140 + }, + { + "epoch": 6.133519314967647, + "grad_norm": 1.3647339819963904, + "learning_rate": 3.066878487290763e-07, + "loss": 0.9387, + "step": 79150 + }, + { + "epoch": 6.134294238443954, + "grad_norm": 1.449987856310907, + "learning_rate": 3.0672659640421575e-07, + "loss": 0.9485, + "step": 79160 + }, + { + "epoch": 6.13506916192026, + "grad_norm": 1.4310393514199466, + "learning_rate": 3.0676534407935527e-07, + "loss": 0.9404, + "step": 79170 + }, + { + "epoch": 6.135844085396567, + "grad_norm": 1.4058451591493248, + "learning_rate": 3.0680409175449474e-07, + "loss": 0.94, + "step": 79180 + }, + { + "epoch": 6.136619008872874, + "grad_norm": 1.3855561280780975, + "learning_rate": 3.0684283942963426e-07, + "loss": 0.9334, + "step": 79190 + }, + { + "epoch": 6.13739393234918, + "grad_norm": 1.366179779582365, + "learning_rate": 3.0688158710477373e-07, + "loss": 0.9316, + "step": 79200 + }, + { + "epoch": 6.138168855825487, + "grad_norm": 1.3393414832102688, + "learning_rate": 3.0692033477991325e-07, + "loss": 0.9485, + "step": 79210 + }, + { + "epoch": 6.138943779301794, + "grad_norm": 1.4496533039268922, + "learning_rate": 3.069590824550527e-07, + "loss": 0.9352, + "step": 79220 + }, + { + "epoch": 6.139718702778101, + "grad_norm": 1.4935184737248692, + "learning_rate": 3.069978301301922e-07, + "loss": 0.9366, + "step": 79230 + }, + { + "epoch": 6.140493626254408, + "grad_norm": 1.3945302686594963, + "learning_rate": 3.070365778053317e-07, + "loss": 0.9257, + "step": 79240 + }, + { + "epoch": 6.1412685497307145, + "grad_norm": 1.3678722080013568, + "learning_rate": 3.070753254804712e-07, + "loss": 0.9042, + "step": 79250 + }, + { + "epoch": 6.14204347320702, + "grad_norm": 1.3558994655136125, + "learning_rate": 3.071140731556107e-07, + "loss": 0.9384, + "step": 79260 + }, + { + "epoch": 6.142818396683327, + "grad_norm": 1.3732718476425871, + "learning_rate": 3.071528208307502e-07, + "loss": 0.9327, + "step": 79270 + }, + { + "epoch": 6.143593320159634, + "grad_norm": 1.3553763763998736, + "learning_rate": 3.071915685058897e-07, + "loss": 0.9266, + "step": 79280 + }, + { + "epoch": 6.144368243635941, + "grad_norm": 1.3536233358458822, + "learning_rate": 3.0723031618102917e-07, + "loss": 0.9061, + "step": 79290 + }, + { + "epoch": 6.145143167112248, + "grad_norm": 1.4130189072352042, + "learning_rate": 3.0726906385616863e-07, + "loss": 0.944, + "step": 79300 + }, + { + "epoch": 6.145918090588554, + "grad_norm": 1.4509178798870677, + "learning_rate": 3.0730781153130816e-07, + "loss": 0.9556, + "step": 79310 + }, + { + "epoch": 6.146693014064861, + "grad_norm": 1.433451592187412, + "learning_rate": 3.073465592064476e-07, + "loss": 0.9154, + "step": 79320 + }, + { + "epoch": 6.147467937541168, + "grad_norm": 1.3967826952832618, + "learning_rate": 3.0738530688158715e-07, + "loss": 0.9473, + "step": 79330 + }, + { + "epoch": 6.148242861017475, + "grad_norm": 1.3445334061016392, + "learning_rate": 3.074240545567266e-07, + "loss": 0.948, + "step": 79340 + }, + { + "epoch": 6.149017784493782, + "grad_norm": 1.434668131843469, + "learning_rate": 3.0746280223186614e-07, + "loss": 0.9355, + "step": 79350 + }, + { + "epoch": 6.149792707970088, + "grad_norm": 1.4123457762092626, + "learning_rate": 3.075015499070056e-07, + "loss": 0.9311, + "step": 79360 + }, + { + "epoch": 6.150567631446394, + "grad_norm": 1.46529622448992, + "learning_rate": 3.075402975821451e-07, + "loss": 0.9275, + "step": 79370 + }, + { + "epoch": 6.151342554922701, + "grad_norm": 1.3502345286652293, + "learning_rate": 3.075790452572846e-07, + "loss": 0.9216, + "step": 79380 + }, + { + "epoch": 6.152117478399008, + "grad_norm": 1.3709148852509614, + "learning_rate": 3.0761779293242407e-07, + "loss": 0.9295, + "step": 79390 + }, + { + "epoch": 6.152892401875315, + "grad_norm": 1.419819069787292, + "learning_rate": 3.076565406075636e-07, + "loss": 0.941, + "step": 79400 + }, + { + "epoch": 6.153667325351622, + "grad_norm": 1.4697623119857601, + "learning_rate": 3.0769528828270306e-07, + "loss": 0.9489, + "step": 79410 + }, + { + "epoch": 6.1544422488279285, + "grad_norm": 1.4164643716071137, + "learning_rate": 3.077340359578426e-07, + "loss": 0.9194, + "step": 79420 + }, + { + "epoch": 6.155217172304235, + "grad_norm": 1.4195578185309012, + "learning_rate": 3.0777278363298205e-07, + "loss": 0.9407, + "step": 79430 + }, + { + "epoch": 6.155992095780542, + "grad_norm": 1.4540850050193652, + "learning_rate": 3.078115313081215e-07, + "loss": 0.9299, + "step": 79440 + }, + { + "epoch": 6.156767019256848, + "grad_norm": 1.370626238087714, + "learning_rate": 3.0785027898326104e-07, + "loss": 0.9438, + "step": 79450 + }, + { + "epoch": 6.157541942733155, + "grad_norm": 1.4035993686454504, + "learning_rate": 3.078890266584005e-07, + "loss": 0.9402, + "step": 79460 + }, + { + "epoch": 6.158316866209462, + "grad_norm": 1.3741630348335034, + "learning_rate": 3.0792777433354003e-07, + "loss": 0.9467, + "step": 79470 + }, + { + "epoch": 6.159091789685768, + "grad_norm": 1.3700530916385247, + "learning_rate": 3.079665220086795e-07, + "loss": 0.9293, + "step": 79480 + }, + { + "epoch": 6.159866713162075, + "grad_norm": 1.4845351489658196, + "learning_rate": 3.08005269683819e-07, + "loss": 0.9309, + "step": 79490 + }, + { + "epoch": 6.160641636638382, + "grad_norm": 1.387208876882309, + "learning_rate": 3.080440173589585e-07, + "loss": 0.9174, + "step": 79500 + }, + { + "epoch": 6.160641636638382, + "eval_loss": 0.9400010704994202, + "eval_runtime": 332.2626, + "eval_samples_per_second": 34.524, + "eval_steps_per_second": 8.632, + "step": 79500 + }, + { + "epoch": 6.161416560114689, + "grad_norm": 1.3542556168397102, + "learning_rate": 3.0808276503409796e-07, + "loss": 0.9291, + "step": 79510 + }, + { + "epoch": 6.162191483590996, + "grad_norm": 1.3394667437976135, + "learning_rate": 3.081215127092375e-07, + "loss": 0.936, + "step": 79520 + }, + { + "epoch": 6.1629664070673025, + "grad_norm": 1.327874367844396, + "learning_rate": 3.0816026038437695e-07, + "loss": 0.9463, + "step": 79530 + }, + { + "epoch": 6.163741330543608, + "grad_norm": 1.4273086681327025, + "learning_rate": 3.081990080595165e-07, + "loss": 0.9238, + "step": 79540 + }, + { + "epoch": 6.164516254019915, + "grad_norm": 1.3912437141177736, + "learning_rate": 3.0823775573465594e-07, + "loss": 0.9412, + "step": 79550 + }, + { + "epoch": 6.165291177496222, + "grad_norm": 1.4190471235183109, + "learning_rate": 3.082765034097954e-07, + "loss": 0.9569, + "step": 79560 + }, + { + "epoch": 6.166066100972529, + "grad_norm": 1.4078906515448544, + "learning_rate": 3.0831525108493493e-07, + "loss": 0.9344, + "step": 79570 + }, + { + "epoch": 6.166841024448836, + "grad_norm": 1.41776107779419, + "learning_rate": 3.083539987600744e-07, + "loss": 0.9316, + "step": 79580 + }, + { + "epoch": 6.1676159479251425, + "grad_norm": 1.3951052634128223, + "learning_rate": 3.083927464352139e-07, + "loss": 0.9462, + "step": 79590 + }, + { + "epoch": 6.168390871401449, + "grad_norm": 1.4138818591751872, + "learning_rate": 3.084314941103534e-07, + "loss": 0.9413, + "step": 79600 + }, + { + "epoch": 6.169165794877756, + "grad_norm": 1.632354182229796, + "learning_rate": 3.084702417854929e-07, + "loss": 0.9594, + "step": 79610 + }, + { + "epoch": 6.169940718354063, + "grad_norm": 1.4047157624622117, + "learning_rate": 3.085089894606324e-07, + "loss": 0.9345, + "step": 79620 + }, + { + "epoch": 6.17071564183037, + "grad_norm": 1.3730130689587579, + "learning_rate": 3.0854773713577185e-07, + "loss": 0.9221, + "step": 79630 + }, + { + "epoch": 6.171490565306676, + "grad_norm": 1.361801766529219, + "learning_rate": 3.085864848109114e-07, + "loss": 0.9297, + "step": 79640 + }, + { + "epoch": 6.172265488782982, + "grad_norm": 1.3914245262287335, + "learning_rate": 3.0862523248605085e-07, + "loss": 0.9377, + "step": 79650 + }, + { + "epoch": 6.173040412259289, + "grad_norm": 1.3923622631431771, + "learning_rate": 3.0866398016119037e-07, + "loss": 0.9472, + "step": 79660 + }, + { + "epoch": 6.173815335735596, + "grad_norm": 1.4151613936091207, + "learning_rate": 3.0870272783632984e-07, + "loss": 0.9424, + "step": 79670 + }, + { + "epoch": 6.174590259211903, + "grad_norm": 1.379292274050935, + "learning_rate": 3.0874147551146936e-07, + "loss": 0.9309, + "step": 79680 + }, + { + "epoch": 6.17536518268821, + "grad_norm": 1.4172682670085488, + "learning_rate": 3.0878022318660883e-07, + "loss": 0.9621, + "step": 79690 + }, + { + "epoch": 6.1761401061645165, + "grad_norm": 1.3829031136271717, + "learning_rate": 3.088189708617483e-07, + "loss": 0.9373, + "step": 79700 + }, + { + "epoch": 6.176915029640823, + "grad_norm": 1.445038771086869, + "learning_rate": 3.088577185368878e-07, + "loss": 0.9327, + "step": 79710 + }, + { + "epoch": 6.17768995311713, + "grad_norm": 1.3598476301721891, + "learning_rate": 3.088964662120273e-07, + "loss": 0.9234, + "step": 79720 + }, + { + "epoch": 6.178464876593436, + "grad_norm": 1.4148825981150515, + "learning_rate": 3.089352138871668e-07, + "loss": 0.9484, + "step": 79730 + }, + { + "epoch": 6.179239800069743, + "grad_norm": 1.4474667964198251, + "learning_rate": 3.089739615623063e-07, + "loss": 0.9177, + "step": 79740 + }, + { + "epoch": 6.18001472354605, + "grad_norm": 1.4098187660662371, + "learning_rate": 3.090127092374458e-07, + "loss": 0.9526, + "step": 79750 + }, + { + "epoch": 6.1807896470223564, + "grad_norm": 1.3521922165339224, + "learning_rate": 3.0905145691258527e-07, + "loss": 0.939, + "step": 79760 + }, + { + "epoch": 6.181564570498663, + "grad_norm": 1.4679706267074157, + "learning_rate": 3.0909020458772474e-07, + "loss": 0.9153, + "step": 79770 + }, + { + "epoch": 6.18233949397497, + "grad_norm": 1.432728370393358, + "learning_rate": 3.0912895226286426e-07, + "loss": 0.9312, + "step": 79780 + }, + { + "epoch": 6.183114417451277, + "grad_norm": 1.4631688111898018, + "learning_rate": 3.0916769993800373e-07, + "loss": 0.9237, + "step": 79790 + }, + { + "epoch": 6.183889340927584, + "grad_norm": 1.36202871990533, + "learning_rate": 3.0920644761314325e-07, + "loss": 0.9166, + "step": 79800 + }, + { + "epoch": 6.1846642644038905, + "grad_norm": 1.4412232613047338, + "learning_rate": 3.092451952882827e-07, + "loss": 0.9292, + "step": 79810 + }, + { + "epoch": 6.185439187880196, + "grad_norm": 1.3844391167414214, + "learning_rate": 3.0928394296342224e-07, + "loss": 0.9353, + "step": 79820 + }, + { + "epoch": 6.186214111356503, + "grad_norm": 1.3741120412820824, + "learning_rate": 3.093226906385617e-07, + "loss": 0.9282, + "step": 79830 + }, + { + "epoch": 6.18698903483281, + "grad_norm": 1.3978188892478243, + "learning_rate": 3.093614383137012e-07, + "loss": 0.9513, + "step": 79840 + }, + { + "epoch": 6.187763958309117, + "grad_norm": 1.4172862923680651, + "learning_rate": 3.094001859888407e-07, + "loss": 0.936, + "step": 79850 + }, + { + "epoch": 6.188538881785424, + "grad_norm": 1.4152912605373686, + "learning_rate": 3.0943893366398017e-07, + "loss": 0.9336, + "step": 79860 + }, + { + "epoch": 6.1893138052617305, + "grad_norm": 1.4109702130007626, + "learning_rate": 3.094776813391197e-07, + "loss": 0.9277, + "step": 79870 + }, + { + "epoch": 6.190088728738037, + "grad_norm": 1.374601264072896, + "learning_rate": 3.0951642901425916e-07, + "loss": 0.9368, + "step": 79880 + }, + { + "epoch": 6.190863652214344, + "grad_norm": 1.3326639910797728, + "learning_rate": 3.095551766893987e-07, + "loss": 0.9624, + "step": 79890 + }, + { + "epoch": 6.191638575690651, + "grad_norm": 1.3825089878361414, + "learning_rate": 3.0959392436453815e-07, + "loss": 0.966, + "step": 79900 + }, + { + "epoch": 6.192413499166957, + "grad_norm": 1.3952781002957861, + "learning_rate": 3.096326720396776e-07, + "loss": 0.9178, + "step": 79910 + }, + { + "epoch": 6.193188422643264, + "grad_norm": 1.3721009558397097, + "learning_rate": 3.0967141971481714e-07, + "loss": 0.9258, + "step": 79920 + }, + { + "epoch": 6.1939633461195704, + "grad_norm": 1.553678306582538, + "learning_rate": 3.097101673899566e-07, + "loss": 0.9375, + "step": 79930 + }, + { + "epoch": 6.194738269595877, + "grad_norm": 1.4547373611024568, + "learning_rate": 3.0974891506509614e-07, + "loss": 0.9381, + "step": 79940 + }, + { + "epoch": 6.195513193072184, + "grad_norm": 1.4057458597575756, + "learning_rate": 3.097876627402356e-07, + "loss": 0.9293, + "step": 79950 + }, + { + "epoch": 6.196288116548491, + "grad_norm": 1.369516474296517, + "learning_rate": 3.0982641041537513e-07, + "loss": 0.9126, + "step": 79960 + }, + { + "epoch": 6.197063040024798, + "grad_norm": 1.4520158289124545, + "learning_rate": 3.098651580905146e-07, + "loss": 0.9206, + "step": 79970 + }, + { + "epoch": 6.1978379635011045, + "grad_norm": 1.411248814109717, + "learning_rate": 3.0990390576565406e-07, + "loss": 0.9184, + "step": 79980 + }, + { + "epoch": 6.198612886977411, + "grad_norm": 1.3557348039252395, + "learning_rate": 3.099426534407936e-07, + "loss": 0.9387, + "step": 79990 + }, + { + "epoch": 6.199387810453718, + "grad_norm": 1.3510725117148985, + "learning_rate": 3.0998140111593306e-07, + "loss": 0.9484, + "step": 80000 + }, + { + "epoch": 6.199387810453718, + "eval_loss": 0.9395511746406555, + "eval_runtime": 329.4437, + "eval_samples_per_second": 34.819, + "eval_steps_per_second": 8.706, + "step": 80000 + }, + { + "epoch": 6.200162733930024, + "grad_norm": 1.4133907350289339, + "learning_rate": 3.100201487910726e-07, + "loss": 0.9545, + "step": 80010 + }, + { + "epoch": 6.200937657406331, + "grad_norm": 1.3747125725227254, + "learning_rate": 3.1005889646621205e-07, + "loss": 0.9477, + "step": 80020 + }, + { + "epoch": 6.201712580882638, + "grad_norm": 1.5149252497785686, + "learning_rate": 3.1009764414135157e-07, + "loss": 0.9268, + "step": 80030 + }, + { + "epoch": 6.2024875043589445, + "grad_norm": 1.3888575674437078, + "learning_rate": 3.1013639181649104e-07, + "loss": 0.949, + "step": 80040 + }, + { + "epoch": 6.203262427835251, + "grad_norm": 1.373488550859658, + "learning_rate": 3.101751394916305e-07, + "loss": 0.9249, + "step": 80050 + }, + { + "epoch": 6.204037351311558, + "grad_norm": 1.416763526539567, + "learning_rate": 3.1021388716677003e-07, + "loss": 0.9381, + "step": 80060 + }, + { + "epoch": 6.204812274787865, + "grad_norm": 1.4032871440433088, + "learning_rate": 3.102526348419095e-07, + "loss": 0.9248, + "step": 80070 + }, + { + "epoch": 6.205587198264172, + "grad_norm": 1.3212855798360401, + "learning_rate": 3.10291382517049e-07, + "loss": 0.9408, + "step": 80080 + }, + { + "epoch": 6.2063621217404785, + "grad_norm": 1.4771568250609883, + "learning_rate": 3.103301301921885e-07, + "loss": 0.9214, + "step": 80090 + }, + { + "epoch": 6.207137045216784, + "grad_norm": 1.3926923741391177, + "learning_rate": 3.10368877867328e-07, + "loss": 0.9386, + "step": 80100 + }, + { + "epoch": 6.207911968693091, + "grad_norm": 1.3945541314955159, + "learning_rate": 3.104076255424675e-07, + "loss": 0.9491, + "step": 80110 + }, + { + "epoch": 6.208686892169398, + "grad_norm": 1.3696271282301309, + "learning_rate": 3.1044637321760695e-07, + "loss": 0.9202, + "step": 80120 + }, + { + "epoch": 6.209461815645705, + "grad_norm": 1.425367645032651, + "learning_rate": 3.1048512089274647e-07, + "loss": 0.9832, + "step": 80130 + }, + { + "epoch": 6.210236739122012, + "grad_norm": 1.3978377148656644, + "learning_rate": 3.1052386856788594e-07, + "loss": 0.9262, + "step": 80140 + }, + { + "epoch": 6.2110116625983185, + "grad_norm": 1.3633500916568357, + "learning_rate": 3.1056261624302546e-07, + "loss": 0.9181, + "step": 80150 + }, + { + "epoch": 6.211786586074625, + "grad_norm": 1.4039769364429404, + "learning_rate": 3.1060136391816493e-07, + "loss": 0.922, + "step": 80160 + }, + { + "epoch": 6.212561509550932, + "grad_norm": 1.3653099585448436, + "learning_rate": 3.1064011159330445e-07, + "loss": 0.9335, + "step": 80170 + }, + { + "epoch": 6.213336433027239, + "grad_norm": 1.3540186620734067, + "learning_rate": 3.106788592684439e-07, + "loss": 0.9536, + "step": 80180 + }, + { + "epoch": 6.214111356503545, + "grad_norm": 1.350435272908183, + "learning_rate": 3.107176069435834e-07, + "loss": 0.9291, + "step": 80190 + }, + { + "epoch": 6.214886279979852, + "grad_norm": 1.4119349918688409, + "learning_rate": 3.107563546187229e-07, + "loss": 0.9381, + "step": 80200 + }, + { + "epoch": 6.2156612034561585, + "grad_norm": 1.3578549830106648, + "learning_rate": 3.107951022938624e-07, + "loss": 0.9184, + "step": 80210 + }, + { + "epoch": 6.216436126932465, + "grad_norm": 1.443009846127988, + "learning_rate": 3.108338499690019e-07, + "loss": 0.925, + "step": 80220 + }, + { + "epoch": 6.217211050408772, + "grad_norm": 1.4100488506114937, + "learning_rate": 3.1087259764414137e-07, + "loss": 0.9332, + "step": 80230 + }, + { + "epoch": 6.217985973885079, + "grad_norm": 1.5100400757915244, + "learning_rate": 3.109113453192809e-07, + "loss": 0.9432, + "step": 80240 + }, + { + "epoch": 6.218760897361386, + "grad_norm": 1.4049412950627136, + "learning_rate": 3.1095009299442036e-07, + "loss": 0.9357, + "step": 80250 + }, + { + "epoch": 6.2195358208376925, + "grad_norm": 1.2843694686524088, + "learning_rate": 3.1098884066955983e-07, + "loss": 0.918, + "step": 80260 + }, + { + "epoch": 6.220310744313999, + "grad_norm": 1.4386132808759824, + "learning_rate": 3.1102758834469936e-07, + "loss": 0.9455, + "step": 80270 + }, + { + "epoch": 6.221085667790305, + "grad_norm": 1.3446800897629778, + "learning_rate": 3.110663360198388e-07, + "loss": 0.9284, + "step": 80280 + }, + { + "epoch": 6.221860591266612, + "grad_norm": 1.3966131728314077, + "learning_rate": 3.1110508369497835e-07, + "loss": 0.9489, + "step": 80290 + }, + { + "epoch": 6.222635514742919, + "grad_norm": 1.4699828232084724, + "learning_rate": 3.111438313701178e-07, + "loss": 0.9333, + "step": 80300 + }, + { + "epoch": 6.223410438219226, + "grad_norm": 1.4026990243098851, + "learning_rate": 3.111825790452573e-07, + "loss": 0.929, + "step": 80310 + }, + { + "epoch": 6.2241853616955325, + "grad_norm": 1.3428520174799767, + "learning_rate": 3.112213267203968e-07, + "loss": 0.9454, + "step": 80320 + }, + { + "epoch": 6.224960285171839, + "grad_norm": 1.4279175912118627, + "learning_rate": 3.112600743955363e-07, + "loss": 0.9502, + "step": 80330 + }, + { + "epoch": 6.225735208648146, + "grad_norm": 1.3405215827521346, + "learning_rate": 3.112988220706758e-07, + "loss": 0.9345, + "step": 80340 + }, + { + "epoch": 6.226510132124453, + "grad_norm": 1.43954219017349, + "learning_rate": 3.1133756974581527e-07, + "loss": 0.9406, + "step": 80350 + }, + { + "epoch": 6.22728505560076, + "grad_norm": 1.4659459963706278, + "learning_rate": 3.113763174209548e-07, + "loss": 0.9471, + "step": 80360 + }, + { + "epoch": 6.2280599790770665, + "grad_norm": 1.4087329302766298, + "learning_rate": 3.1141506509609426e-07, + "loss": 0.9383, + "step": 80370 + }, + { + "epoch": 6.2288349025533725, + "grad_norm": 1.3339988714658246, + "learning_rate": 3.114538127712337e-07, + "loss": 0.9395, + "step": 80380 + }, + { + "epoch": 6.229609826029679, + "grad_norm": 1.362698150738611, + "learning_rate": 3.1149256044637325e-07, + "loss": 0.9312, + "step": 80390 + }, + { + "epoch": 6.230384749505986, + "grad_norm": 1.5012471576767439, + "learning_rate": 3.115313081215127e-07, + "loss": 0.9329, + "step": 80400 + }, + { + "epoch": 6.231159672982293, + "grad_norm": 1.4332999600505556, + "learning_rate": 3.1157005579665224e-07, + "loss": 0.9306, + "step": 80410 + }, + { + "epoch": 6.2319345964586, + "grad_norm": 1.5174100666593262, + "learning_rate": 3.116088034717917e-07, + "loss": 0.935, + "step": 80420 + }, + { + "epoch": 6.2327095199349065, + "grad_norm": 1.3609424221627275, + "learning_rate": 3.1164755114693123e-07, + "loss": 0.9433, + "step": 80430 + }, + { + "epoch": 6.233484443411213, + "grad_norm": 1.4235422014181165, + "learning_rate": 3.116862988220707e-07, + "loss": 0.9414, + "step": 80440 + }, + { + "epoch": 6.23425936688752, + "grad_norm": 1.3667618785491977, + "learning_rate": 3.1172504649721017e-07, + "loss": 0.9339, + "step": 80450 + }, + { + "epoch": 6.235034290363827, + "grad_norm": 1.3998804881358056, + "learning_rate": 3.117637941723497e-07, + "loss": 0.9334, + "step": 80460 + }, + { + "epoch": 6.235809213840133, + "grad_norm": 1.3293076294892083, + "learning_rate": 3.1180254184748916e-07, + "loss": 0.9233, + "step": 80470 + }, + { + "epoch": 6.23658413731644, + "grad_norm": 1.402080569924954, + "learning_rate": 3.118412895226287e-07, + "loss": 0.9291, + "step": 80480 + }, + { + "epoch": 6.2373590607927465, + "grad_norm": 1.3437331030415867, + "learning_rate": 3.1188003719776815e-07, + "loss": 0.9146, + "step": 80490 + }, + { + "epoch": 6.238133984269053, + "grad_norm": 1.4159100731629677, + "learning_rate": 3.1191878487290767e-07, + "loss": 0.9291, + "step": 80500 + }, + { + "epoch": 6.238133984269053, + "eval_loss": 0.9391106963157654, + "eval_runtime": 327.0584, + "eval_samples_per_second": 35.073, + "eval_steps_per_second": 8.769, + "step": 80500 + }, + { + "epoch": 6.23890890774536, + "grad_norm": 1.4312989163209904, + "learning_rate": 3.1195753254804714e-07, + "loss": 0.9151, + "step": 80510 + }, + { + "epoch": 6.239683831221667, + "grad_norm": 1.413106070709289, + "learning_rate": 3.119962802231866e-07, + "loss": 0.9455, + "step": 80520 + }, + { + "epoch": 6.240458754697974, + "grad_norm": 1.3578117325811117, + "learning_rate": 3.1203502789832613e-07, + "loss": 0.937, + "step": 80530 + }, + { + "epoch": 6.2412336781742805, + "grad_norm": 1.3844855173721555, + "learning_rate": 3.120737755734656e-07, + "loss": 0.9306, + "step": 80540 + }, + { + "epoch": 6.242008601650587, + "grad_norm": 1.4665052434563794, + "learning_rate": 3.121125232486051e-07, + "loss": 0.9279, + "step": 80550 + }, + { + "epoch": 6.242783525126894, + "grad_norm": 1.3891469725207226, + "learning_rate": 3.121512709237446e-07, + "loss": 0.9096, + "step": 80560 + }, + { + "epoch": 6.2435584486032, + "grad_norm": 1.4445033491629475, + "learning_rate": 3.121900185988841e-07, + "loss": 0.9414, + "step": 80570 + }, + { + "epoch": 6.244333372079507, + "grad_norm": 1.3634479190751476, + "learning_rate": 3.122287662740236e-07, + "loss": 0.9368, + "step": 80580 + }, + { + "epoch": 6.245108295555814, + "grad_norm": 1.3511662885827682, + "learning_rate": 3.1226751394916305e-07, + "loss": 0.9415, + "step": 80590 + }, + { + "epoch": 6.2458832190321205, + "grad_norm": 1.3872301307513712, + "learning_rate": 3.123062616243026e-07, + "loss": 0.9333, + "step": 80600 + }, + { + "epoch": 6.246658142508427, + "grad_norm": 1.414436747958874, + "learning_rate": 3.1234500929944204e-07, + "loss": 0.9358, + "step": 80610 + }, + { + "epoch": 6.247433065984734, + "grad_norm": 1.4085760511610619, + "learning_rate": 3.1238375697458157e-07, + "loss": 0.9481, + "step": 80620 + }, + { + "epoch": 6.248207989461041, + "grad_norm": 1.4054989476004938, + "learning_rate": 3.1242250464972103e-07, + "loss": 0.9277, + "step": 80630 + }, + { + "epoch": 6.248982912937348, + "grad_norm": 1.4389005743041048, + "learning_rate": 3.1246125232486056e-07, + "loss": 0.945, + "step": 80640 + }, + { + "epoch": 6.249757836413655, + "grad_norm": 1.4786073213511377, + "learning_rate": 3.125e-07, + "loss": 0.9089, + "step": 80650 + }, + { + "epoch": 6.2505327598899605, + "grad_norm": 1.4260282456792956, + "learning_rate": 3.125387476751395e-07, + "loss": 0.9256, + "step": 80660 + }, + { + "epoch": 6.251307683366267, + "grad_norm": 1.3392872561642184, + "learning_rate": 3.12577495350279e-07, + "loss": 0.9357, + "step": 80670 + }, + { + "epoch": 6.252082606842574, + "grad_norm": 1.3388165183831606, + "learning_rate": 3.126162430254185e-07, + "loss": 0.9398, + "step": 80680 + }, + { + "epoch": 6.252857530318881, + "grad_norm": 1.4799668053655568, + "learning_rate": 3.12654990700558e-07, + "loss": 0.9415, + "step": 80690 + }, + { + "epoch": 6.253632453795188, + "grad_norm": 1.3785464278011395, + "learning_rate": 3.126937383756975e-07, + "loss": 0.9261, + "step": 80700 + }, + { + "epoch": 6.2544073772714945, + "grad_norm": 1.3608394907592325, + "learning_rate": 3.12732486050837e-07, + "loss": 0.9384, + "step": 80710 + }, + { + "epoch": 6.255182300747801, + "grad_norm": 1.364371462486148, + "learning_rate": 3.1277123372597647e-07, + "loss": 0.9605, + "step": 80720 + }, + { + "epoch": 6.255957224224108, + "grad_norm": 1.3784670779171246, + "learning_rate": 3.1280998140111594e-07, + "loss": 0.9221, + "step": 80730 + }, + { + "epoch": 6.256732147700415, + "grad_norm": 1.422234675863988, + "learning_rate": 3.1284872907625546e-07, + "loss": 0.9221, + "step": 80740 + }, + { + "epoch": 6.257507071176721, + "grad_norm": 1.461754019364941, + "learning_rate": 3.1288747675139493e-07, + "loss": 0.9189, + "step": 80750 + }, + { + "epoch": 6.258281994653028, + "grad_norm": 1.3532855416157066, + "learning_rate": 3.1292622442653445e-07, + "loss": 0.9326, + "step": 80760 + }, + { + "epoch": 6.2590569181293345, + "grad_norm": 1.3154139811754781, + "learning_rate": 3.129649721016739e-07, + "loss": 0.9276, + "step": 80770 + }, + { + "epoch": 6.259831841605641, + "grad_norm": 1.335337834408408, + "learning_rate": 3.1300371977681344e-07, + "loss": 0.928, + "step": 80780 + }, + { + "epoch": 6.260606765081948, + "grad_norm": 1.3783974061924298, + "learning_rate": 3.130424674519529e-07, + "loss": 0.9179, + "step": 80790 + }, + { + "epoch": 6.261381688558255, + "grad_norm": 1.4401521694051824, + "learning_rate": 3.130812151270924e-07, + "loss": 0.92, + "step": 80800 + }, + { + "epoch": 6.262156612034562, + "grad_norm": 1.3342116250003637, + "learning_rate": 3.131199628022319e-07, + "loss": 0.9212, + "step": 80810 + }, + { + "epoch": 6.262931535510869, + "grad_norm": 1.3406993071194238, + "learning_rate": 3.1315871047737137e-07, + "loss": 0.9514, + "step": 80820 + }, + { + "epoch": 6.263706458987175, + "grad_norm": 1.4048832650763952, + "learning_rate": 3.131974581525109e-07, + "loss": 0.9415, + "step": 80830 + }, + { + "epoch": 6.264481382463481, + "grad_norm": 1.3908216854267774, + "learning_rate": 3.1323620582765036e-07, + "loss": 0.9452, + "step": 80840 + }, + { + "epoch": 6.265256305939788, + "grad_norm": 1.371774399700595, + "learning_rate": 3.132749535027899e-07, + "loss": 0.9297, + "step": 80850 + }, + { + "epoch": 6.266031229416095, + "grad_norm": 1.37245165052669, + "learning_rate": 3.1331370117792935e-07, + "loss": 0.9053, + "step": 80860 + }, + { + "epoch": 6.266806152892402, + "grad_norm": 1.37529375738448, + "learning_rate": 3.133524488530688e-07, + "loss": 0.9315, + "step": 80870 + }, + { + "epoch": 6.2675810763687085, + "grad_norm": 1.266075929199596, + "learning_rate": 3.1339119652820834e-07, + "loss": 0.9563, + "step": 80880 + }, + { + "epoch": 6.268355999845015, + "grad_norm": 1.332902884465778, + "learning_rate": 3.134299442033478e-07, + "loss": 0.9227, + "step": 80890 + }, + { + "epoch": 6.269130923321322, + "grad_norm": 1.422187751480393, + "learning_rate": 3.1346869187848733e-07, + "loss": 0.9224, + "step": 80900 + }, + { + "epoch": 6.269905846797629, + "grad_norm": 1.2942716845694484, + "learning_rate": 3.135074395536268e-07, + "loss": 0.9445, + "step": 80910 + }, + { + "epoch": 6.270680770273936, + "grad_norm": 1.4555051583502239, + "learning_rate": 3.135461872287663e-07, + "loss": 0.9358, + "step": 80920 + }, + { + "epoch": 6.271455693750243, + "grad_norm": 1.37385347186141, + "learning_rate": 3.135849349039058e-07, + "loss": 0.9484, + "step": 80930 + }, + { + "epoch": 6.2722306172265485, + "grad_norm": 1.431713289756122, + "learning_rate": 3.1362368257904526e-07, + "loss": 0.9477, + "step": 80940 + }, + { + "epoch": 6.273005540702855, + "grad_norm": 1.3115544772936878, + "learning_rate": 3.136624302541848e-07, + "loss": 0.937, + "step": 80950 + }, + { + "epoch": 6.273780464179162, + "grad_norm": 1.4123154411215915, + "learning_rate": 3.1370117792932425e-07, + "loss": 0.9412, + "step": 80960 + }, + { + "epoch": 6.274555387655469, + "grad_norm": 1.4894882653371135, + "learning_rate": 3.137399256044638e-07, + "loss": 0.9385, + "step": 80970 + }, + { + "epoch": 6.275330311131776, + "grad_norm": 1.4035742410916712, + "learning_rate": 3.1377867327960325e-07, + "loss": 0.9079, + "step": 80980 + }, + { + "epoch": 6.276105234608083, + "grad_norm": 1.409388131409916, + "learning_rate": 3.1381742095474277e-07, + "loss": 0.9371, + "step": 80990 + }, + { + "epoch": 6.276880158084389, + "grad_norm": 1.3643122459600543, + "learning_rate": 3.1385616862988224e-07, + "loss": 0.9265, + "step": 81000 + }, + { + "epoch": 6.276880158084389, + "eval_loss": 0.9384918212890625, + "eval_runtime": 326.7773, + "eval_samples_per_second": 35.103, + "eval_steps_per_second": 8.777, + "step": 81000 + }, + { + "epoch": 6.277655081560696, + "grad_norm": 1.4631166091782433, + "learning_rate": 3.138949163050217e-07, + "loss": 0.9317, + "step": 81010 + }, + { + "epoch": 6.278430005037003, + "grad_norm": 1.373343830574834, + "learning_rate": 3.1393366398016123e-07, + "loss": 0.9486, + "step": 81020 + }, + { + "epoch": 6.279204928513309, + "grad_norm": 1.4457689320176117, + "learning_rate": 3.139724116553007e-07, + "loss": 0.9379, + "step": 81030 + }, + { + "epoch": 6.279979851989616, + "grad_norm": 1.4413516351348779, + "learning_rate": 3.140111593304402e-07, + "loss": 0.9505, + "step": 81040 + }, + { + "epoch": 6.2807547754659225, + "grad_norm": 1.4576608127919422, + "learning_rate": 3.140499070055797e-07, + "loss": 0.9272, + "step": 81050 + }, + { + "epoch": 6.281529698942229, + "grad_norm": 1.37083983126543, + "learning_rate": 3.1408865468071916e-07, + "loss": 0.936, + "step": 81060 + }, + { + "epoch": 6.282304622418536, + "grad_norm": 1.4301045974547135, + "learning_rate": 3.141274023558587e-07, + "loss": 0.9458, + "step": 81070 + }, + { + "epoch": 6.283079545894843, + "grad_norm": 1.4784955732294378, + "learning_rate": 3.1416615003099815e-07, + "loss": 0.9201, + "step": 81080 + }, + { + "epoch": 6.28385446937115, + "grad_norm": 1.355481405676295, + "learning_rate": 3.1420489770613767e-07, + "loss": 0.9646, + "step": 81090 + }, + { + "epoch": 6.284629392847457, + "grad_norm": 1.371742451488965, + "learning_rate": 3.1424364538127714e-07, + "loss": 0.9292, + "step": 81100 + }, + { + "epoch": 6.285404316323763, + "grad_norm": 1.4670261416638166, + "learning_rate": 3.1428239305641666e-07, + "loss": 0.9454, + "step": 81110 + }, + { + "epoch": 6.28617923980007, + "grad_norm": 1.4293168247046522, + "learning_rate": 3.1432114073155613e-07, + "loss": 0.9154, + "step": 81120 + }, + { + "epoch": 6.286954163276376, + "grad_norm": 1.395643562695283, + "learning_rate": 3.143598884066956e-07, + "loss": 0.9337, + "step": 81130 + }, + { + "epoch": 6.287729086752683, + "grad_norm": 1.4552936940666794, + "learning_rate": 3.143986360818351e-07, + "loss": 0.926, + "step": 81140 + }, + { + "epoch": 6.28850401022899, + "grad_norm": 1.410438299719802, + "learning_rate": 3.144373837569746e-07, + "loss": 0.933, + "step": 81150 + }, + { + "epoch": 6.289278933705297, + "grad_norm": 1.366965571777811, + "learning_rate": 3.144761314321141e-07, + "loss": 0.886, + "step": 81160 + }, + { + "epoch": 6.290053857181603, + "grad_norm": 1.3653677152587087, + "learning_rate": 3.145148791072536e-07, + "loss": 0.9484, + "step": 81170 + }, + { + "epoch": 6.29082878065791, + "grad_norm": 1.494399514555708, + "learning_rate": 3.145536267823931e-07, + "loss": 0.9565, + "step": 81180 + }, + { + "epoch": 6.291603704134217, + "grad_norm": 1.378051032472058, + "learning_rate": 3.1459237445753257e-07, + "loss": 0.9647, + "step": 81190 + }, + { + "epoch": 6.292378627610524, + "grad_norm": 1.3804900607819532, + "learning_rate": 3.1463112213267204e-07, + "loss": 0.9282, + "step": 81200 + }, + { + "epoch": 6.29315355108683, + "grad_norm": 1.3290065874704713, + "learning_rate": 3.1466986980781156e-07, + "loss": 0.9264, + "step": 81210 + }, + { + "epoch": 6.2939284745631365, + "grad_norm": 1.3130330582689842, + "learning_rate": 3.1470861748295103e-07, + "loss": 0.9489, + "step": 81220 + }, + { + "epoch": 6.294703398039443, + "grad_norm": 1.3850293847192896, + "learning_rate": 3.1474736515809055e-07, + "loss": 0.9426, + "step": 81230 + }, + { + "epoch": 6.29547832151575, + "grad_norm": 1.385196233324913, + "learning_rate": 3.1478611283323e-07, + "loss": 0.9288, + "step": 81240 + }, + { + "epoch": 6.296253244992057, + "grad_norm": 1.363820985358171, + "learning_rate": 3.1482486050836954e-07, + "loss": 0.9261, + "step": 81250 + }, + { + "epoch": 6.297028168468364, + "grad_norm": 1.318755895972342, + "learning_rate": 3.14863608183509e-07, + "loss": 0.9255, + "step": 81260 + }, + { + "epoch": 6.297803091944671, + "grad_norm": 1.3464232442135198, + "learning_rate": 3.149023558586485e-07, + "loss": 0.9304, + "step": 81270 + }, + { + "epoch": 6.298578015420977, + "grad_norm": 1.500998219372699, + "learning_rate": 3.14941103533788e-07, + "loss": 0.9199, + "step": 81280 + }, + { + "epoch": 6.299352938897284, + "grad_norm": 1.3914250893990219, + "learning_rate": 3.1497985120892747e-07, + "loss": 0.932, + "step": 81290 + }, + { + "epoch": 6.300127862373591, + "grad_norm": 1.3855579194309395, + "learning_rate": 3.15018598884067e-07, + "loss": 0.9473, + "step": 81300 + }, + { + "epoch": 6.300902785849897, + "grad_norm": 1.4357646159746855, + "learning_rate": 3.1505734655920646e-07, + "loss": 0.9246, + "step": 81310 + }, + { + "epoch": 6.301677709326204, + "grad_norm": 1.4418790229931038, + "learning_rate": 3.15096094234346e-07, + "loss": 0.9338, + "step": 81320 + }, + { + "epoch": 6.302452632802511, + "grad_norm": 1.420347827505461, + "learning_rate": 3.1513484190948546e-07, + "loss": 0.931, + "step": 81330 + }, + { + "epoch": 6.303227556278817, + "grad_norm": 1.3795444183644407, + "learning_rate": 3.151735895846249e-07, + "loss": 0.9537, + "step": 81340 + }, + { + "epoch": 6.304002479755124, + "grad_norm": 1.4503165379741716, + "learning_rate": 3.1521233725976445e-07, + "loss": 0.9285, + "step": 81350 + }, + { + "epoch": 6.304777403231431, + "grad_norm": 1.3643983815200504, + "learning_rate": 3.152510849349039e-07, + "loss": 0.9293, + "step": 81360 + }, + { + "epoch": 6.305552326707738, + "grad_norm": 1.3665139690354182, + "learning_rate": 3.1528983261004344e-07, + "loss": 0.9555, + "step": 81370 + }, + { + "epoch": 6.306327250184045, + "grad_norm": 1.4938785888589667, + "learning_rate": 3.153285802851829e-07, + "loss": 0.9293, + "step": 81380 + }, + { + "epoch": 6.307102173660351, + "grad_norm": 1.5873663335781791, + "learning_rate": 3.1536732796032243e-07, + "loss": 0.9467, + "step": 81390 + }, + { + "epoch": 6.307877097136657, + "grad_norm": 1.3092549387321344, + "learning_rate": 3.154060756354619e-07, + "loss": 0.9382, + "step": 81400 + }, + { + "epoch": 6.308652020612964, + "grad_norm": 1.4028250456302889, + "learning_rate": 3.1544482331060137e-07, + "loss": 0.9428, + "step": 81410 + }, + { + "epoch": 6.309426944089271, + "grad_norm": 1.350291231100983, + "learning_rate": 3.154835709857409e-07, + "loss": 0.9162, + "step": 81420 + }, + { + "epoch": 6.310201867565578, + "grad_norm": 1.3889445343414324, + "learning_rate": 3.1552231866088036e-07, + "loss": 0.9335, + "step": 81430 + }, + { + "epoch": 6.310976791041885, + "grad_norm": 1.3093621869421215, + "learning_rate": 3.155610663360199e-07, + "loss": 0.9201, + "step": 81440 + }, + { + "epoch": 6.311751714518191, + "grad_norm": 1.401084695727321, + "learning_rate": 3.1559981401115935e-07, + "loss": 0.9165, + "step": 81450 + }, + { + "epoch": 6.312526637994498, + "grad_norm": 1.532631053975801, + "learning_rate": 3.1563856168629887e-07, + "loss": 0.9326, + "step": 81460 + }, + { + "epoch": 6.313301561470805, + "grad_norm": 1.3834630299735742, + "learning_rate": 3.1567730936143834e-07, + "loss": 0.9335, + "step": 81470 + }, + { + "epoch": 6.314076484947112, + "grad_norm": 1.494247255671808, + "learning_rate": 3.157160570365778e-07, + "loss": 0.9323, + "step": 81480 + }, + { + "epoch": 6.314851408423419, + "grad_norm": 1.3489372185069366, + "learning_rate": 3.1575480471171733e-07, + "loss": 0.9425, + "step": 81490 + }, + { + "epoch": 6.3156263318997246, + "grad_norm": 1.3976722698118536, + "learning_rate": 3.157935523868568e-07, + "loss": 0.9251, + "step": 81500 + }, + { + "epoch": 6.3156263318997246, + "eval_loss": 0.93812096118927, + "eval_runtime": 332.7564, + "eval_samples_per_second": 34.473, + "eval_steps_per_second": 8.619, + "step": 81500 + }, + { + "epoch": 6.316401255376031, + "grad_norm": 1.3067146718399474, + "learning_rate": 3.158323000619963e-07, + "loss": 0.9326, + "step": 81510 + }, + { + "epoch": 6.317176178852338, + "grad_norm": 1.4534898809414545, + "learning_rate": 3.158710477371358e-07, + "loss": 0.9302, + "step": 81520 + }, + { + "epoch": 6.317951102328645, + "grad_norm": 1.519079435172341, + "learning_rate": 3.159097954122753e-07, + "loss": 0.9296, + "step": 81530 + }, + { + "epoch": 6.318726025804952, + "grad_norm": 1.4681399333940368, + "learning_rate": 3.159485430874148e-07, + "loss": 0.9351, + "step": 81540 + }, + { + "epoch": 6.319500949281259, + "grad_norm": 1.3017742631969618, + "learning_rate": 3.1598729076255425e-07, + "loss": 0.9372, + "step": 81550 + }, + { + "epoch": 6.320275872757565, + "grad_norm": 1.2686123730956886, + "learning_rate": 3.1602603843769377e-07, + "loss": 0.942, + "step": 81560 + }, + { + "epoch": 6.321050796233872, + "grad_norm": 1.4146810107484387, + "learning_rate": 3.1606478611283324e-07, + "loss": 0.9254, + "step": 81570 + }, + { + "epoch": 6.321825719710178, + "grad_norm": 1.4562089215665102, + "learning_rate": 3.1610353378797276e-07, + "loss": 0.9293, + "step": 81580 + }, + { + "epoch": 6.322600643186485, + "grad_norm": 1.3633742072680766, + "learning_rate": 3.1614228146311223e-07, + "loss": 0.9237, + "step": 81590 + }, + { + "epoch": 6.323375566662792, + "grad_norm": 1.39382810385913, + "learning_rate": 3.1618102913825175e-07, + "loss": 0.9292, + "step": 81600 + }, + { + "epoch": 6.324150490139099, + "grad_norm": 1.403916548414985, + "learning_rate": 3.162197768133912e-07, + "loss": 0.9268, + "step": 81610 + }, + { + "epoch": 6.324925413615405, + "grad_norm": 1.3757182139965085, + "learning_rate": 3.162585244885307e-07, + "loss": 0.9388, + "step": 81620 + }, + { + "epoch": 6.325700337091712, + "grad_norm": 1.3516585197519626, + "learning_rate": 3.162972721636702e-07, + "loss": 0.9257, + "step": 81630 + }, + { + "epoch": 6.326475260568019, + "grad_norm": 1.3508174327438116, + "learning_rate": 3.163360198388097e-07, + "loss": 0.9425, + "step": 81640 + }, + { + "epoch": 6.327250184044326, + "grad_norm": 1.3765712753456474, + "learning_rate": 3.163747675139492e-07, + "loss": 0.915, + "step": 81650 + }, + { + "epoch": 6.328025107520633, + "grad_norm": 1.459042328075577, + "learning_rate": 3.164135151890887e-07, + "loss": 0.9617, + "step": 81660 + }, + { + "epoch": 6.328800030996939, + "grad_norm": 1.3222198555360065, + "learning_rate": 3.164522628642282e-07, + "loss": 0.9179, + "step": 81670 + }, + { + "epoch": 6.329574954473245, + "grad_norm": 1.428677823341214, + "learning_rate": 3.1649101053936767e-07, + "loss": 0.9295, + "step": 81680 + }, + { + "epoch": 6.330349877949552, + "grad_norm": 1.3925198529332037, + "learning_rate": 3.1652975821450714e-07, + "loss": 0.919, + "step": 81690 + }, + { + "epoch": 6.331124801425859, + "grad_norm": 1.412924255757063, + "learning_rate": 3.1656850588964666e-07, + "loss": 0.9214, + "step": 81700 + }, + { + "epoch": 6.331899724902166, + "grad_norm": 1.5208203016484902, + "learning_rate": 3.166072535647861e-07, + "loss": 0.9295, + "step": 81710 + }, + { + "epoch": 6.332674648378473, + "grad_norm": 1.4315530536118295, + "learning_rate": 3.1664600123992565e-07, + "loss": 0.9314, + "step": 81720 + }, + { + "epoch": 6.333449571854779, + "grad_norm": 1.429170718686307, + "learning_rate": 3.166847489150651e-07, + "loss": 0.9402, + "step": 81730 + }, + { + "epoch": 6.334224495331086, + "grad_norm": 1.4281379972664447, + "learning_rate": 3.1672349659020464e-07, + "loss": 0.9248, + "step": 81740 + }, + { + "epoch": 6.334999418807393, + "grad_norm": 1.4397043956697795, + "learning_rate": 3.167622442653441e-07, + "loss": 0.9336, + "step": 81750 + }, + { + "epoch": 6.3357743422837, + "grad_norm": 1.3993267773364446, + "learning_rate": 3.168009919404836e-07, + "loss": 0.9176, + "step": 81760 + }, + { + "epoch": 6.336549265760006, + "grad_norm": 1.504204657439478, + "learning_rate": 3.168397396156231e-07, + "loss": 0.935, + "step": 81770 + }, + { + "epoch": 6.337324189236313, + "grad_norm": 1.3486300132080205, + "learning_rate": 3.1687848729076257e-07, + "loss": 0.9396, + "step": 81780 + }, + { + "epoch": 6.338099112712619, + "grad_norm": 1.435645542308251, + "learning_rate": 3.169172349659021e-07, + "loss": 0.9175, + "step": 81790 + }, + { + "epoch": 6.338874036188926, + "grad_norm": 1.402325762259632, + "learning_rate": 3.1695598264104156e-07, + "loss": 0.9127, + "step": 81800 + }, + { + "epoch": 6.339648959665233, + "grad_norm": 1.3574400016779253, + "learning_rate": 3.1699473031618103e-07, + "loss": 0.9623, + "step": 81810 + }, + { + "epoch": 6.34042388314154, + "grad_norm": 1.4296199926033701, + "learning_rate": 3.1703347799132055e-07, + "loss": 0.9521, + "step": 81820 + }, + { + "epoch": 6.341198806617847, + "grad_norm": 1.341911613909418, + "learning_rate": 3.1707222566646e-07, + "loss": 0.9293, + "step": 81830 + }, + { + "epoch": 6.341973730094153, + "grad_norm": 1.4242609261901282, + "learning_rate": 3.1711097334159954e-07, + "loss": 0.9187, + "step": 81840 + }, + { + "epoch": 6.34274865357046, + "grad_norm": 1.3450771927276974, + "learning_rate": 3.17149721016739e-07, + "loss": 0.942, + "step": 81850 + }, + { + "epoch": 6.343523577046767, + "grad_norm": 1.4526612683293043, + "learning_rate": 3.1718846869187853e-07, + "loss": 0.9416, + "step": 81860 + }, + { + "epoch": 6.344298500523073, + "grad_norm": 1.472350675413444, + "learning_rate": 3.17227216367018e-07, + "loss": 0.9121, + "step": 81870 + }, + { + "epoch": 6.34507342399938, + "grad_norm": 1.5183591583329543, + "learning_rate": 3.1726596404215747e-07, + "loss": 0.9215, + "step": 81880 + }, + { + "epoch": 6.345848347475687, + "grad_norm": 1.3505317759204885, + "learning_rate": 3.17304711717297e-07, + "loss": 0.9408, + "step": 81890 + }, + { + "epoch": 6.346623270951993, + "grad_norm": 1.390950428023204, + "learning_rate": 3.1734345939243646e-07, + "loss": 0.9099, + "step": 81900 + }, + { + "epoch": 6.3473981944283, + "grad_norm": 1.393955848182178, + "learning_rate": 3.17382207067576e-07, + "loss": 0.9171, + "step": 81910 + }, + { + "epoch": 6.348173117904607, + "grad_norm": 1.381885721062394, + "learning_rate": 3.1742095474271545e-07, + "loss": 0.9051, + "step": 81920 + }, + { + "epoch": 6.348948041380914, + "grad_norm": 1.4098058929291295, + "learning_rate": 3.17459702417855e-07, + "loss": 0.9214, + "step": 81930 + }, + { + "epoch": 6.349722964857221, + "grad_norm": 1.4558505433910658, + "learning_rate": 3.1749845009299444e-07, + "loss": 0.9284, + "step": 81940 + }, + { + "epoch": 6.3504978883335275, + "grad_norm": 1.4625345543055945, + "learning_rate": 3.175371977681339e-07, + "loss": 0.9519, + "step": 81950 + }, + { + "epoch": 6.351272811809833, + "grad_norm": 1.4704771410968671, + "learning_rate": 3.1757594544327343e-07, + "loss": 0.9307, + "step": 81960 + }, + { + "epoch": 6.35204773528614, + "grad_norm": 1.4391407392434474, + "learning_rate": 3.176146931184129e-07, + "loss": 0.9196, + "step": 81970 + }, + { + "epoch": 6.352822658762447, + "grad_norm": 1.453068765150201, + "learning_rate": 3.176534407935524e-07, + "loss": 0.9476, + "step": 81980 + }, + { + "epoch": 6.353597582238754, + "grad_norm": 1.4094104169249544, + "learning_rate": 3.176921884686919e-07, + "loss": 0.9215, + "step": 81990 + }, + { + "epoch": 6.354372505715061, + "grad_norm": 1.3759464345566064, + "learning_rate": 3.177309361438314e-07, + "loss": 0.9472, + "step": 82000 + }, + { + "epoch": 6.354372505715061, + "eval_loss": 0.9378209710121155, + "eval_runtime": 332.1844, + "eval_samples_per_second": 34.532, + "eval_steps_per_second": 8.634, + "step": 82000 + }, + { + "epoch": 6.355147429191367, + "grad_norm": 1.3806446393506835, + "learning_rate": 3.177696838189709e-07, + "loss": 0.9354, + "step": 82010 + }, + { + "epoch": 6.355922352667674, + "grad_norm": 1.394898768562573, + "learning_rate": 3.1780843149411035e-07, + "loss": 0.9483, + "step": 82020 + }, + { + "epoch": 6.356697276143981, + "grad_norm": 1.3958924452360255, + "learning_rate": 3.178471791692499e-07, + "loss": 0.9333, + "step": 82030 + }, + { + "epoch": 6.357472199620288, + "grad_norm": 1.3510821569725096, + "learning_rate": 3.1788592684438935e-07, + "loss": 0.932, + "step": 82040 + }, + { + "epoch": 6.358247123096595, + "grad_norm": 1.4087037836095104, + "learning_rate": 3.1792467451952887e-07, + "loss": 0.9485, + "step": 82050 + }, + { + "epoch": 6.359022046572901, + "grad_norm": 1.4816794392616541, + "learning_rate": 3.1796342219466834e-07, + "loss": 0.9376, + "step": 82060 + }, + { + "epoch": 6.359796970049207, + "grad_norm": 1.4016252654478767, + "learning_rate": 3.1800216986980786e-07, + "loss": 0.9228, + "step": 82070 + }, + { + "epoch": 6.360571893525514, + "grad_norm": 1.388201434916441, + "learning_rate": 3.1804091754494733e-07, + "loss": 0.9449, + "step": 82080 + }, + { + "epoch": 6.361346817001821, + "grad_norm": 1.3880648602909444, + "learning_rate": 3.180796652200868e-07, + "loss": 0.9234, + "step": 82090 + }, + { + "epoch": 6.362121740478128, + "grad_norm": 1.3908455444154648, + "learning_rate": 3.181184128952263e-07, + "loss": 0.9343, + "step": 82100 + }, + { + "epoch": 6.362896663954435, + "grad_norm": 1.405846675310091, + "learning_rate": 3.181571605703658e-07, + "loss": 0.908, + "step": 82110 + }, + { + "epoch": 6.3636715874307415, + "grad_norm": 1.3335453581318333, + "learning_rate": 3.181959082455053e-07, + "loss": 0.9348, + "step": 82120 + }, + { + "epoch": 6.364446510907048, + "grad_norm": 1.3637523535381177, + "learning_rate": 3.182346559206448e-07, + "loss": 0.9408, + "step": 82130 + }, + { + "epoch": 6.365221434383354, + "grad_norm": 1.4901606265810219, + "learning_rate": 3.182734035957843e-07, + "loss": 0.9518, + "step": 82140 + }, + { + "epoch": 6.365996357859661, + "grad_norm": 1.3488890915180902, + "learning_rate": 3.1831215127092377e-07, + "loss": 0.9198, + "step": 82150 + }, + { + "epoch": 6.366771281335968, + "grad_norm": 1.3966136747265963, + "learning_rate": 3.1835089894606324e-07, + "loss": 0.9323, + "step": 82160 + }, + { + "epoch": 6.367546204812275, + "grad_norm": 1.495156363049676, + "learning_rate": 3.1838964662120276e-07, + "loss": 0.9387, + "step": 82170 + }, + { + "epoch": 6.368321128288581, + "grad_norm": 1.4165871317319907, + "learning_rate": 3.1842839429634223e-07, + "loss": 0.9398, + "step": 82180 + }, + { + "epoch": 6.369096051764888, + "grad_norm": 1.4195242487976787, + "learning_rate": 3.1846714197148175e-07, + "loss": 0.925, + "step": 82190 + }, + { + "epoch": 6.369870975241195, + "grad_norm": 1.4136295557326883, + "learning_rate": 3.185058896466212e-07, + "loss": 0.9348, + "step": 82200 + }, + { + "epoch": 6.370645898717502, + "grad_norm": 1.4819977546436987, + "learning_rate": 3.1854463732176074e-07, + "loss": 0.9473, + "step": 82210 + }, + { + "epoch": 6.371420822193809, + "grad_norm": 1.4766050159626216, + "learning_rate": 3.185833849969002e-07, + "loss": 0.9418, + "step": 82220 + }, + { + "epoch": 6.3721957456701155, + "grad_norm": 1.3608259217056722, + "learning_rate": 3.186221326720397e-07, + "loss": 0.9153, + "step": 82230 + }, + { + "epoch": 6.372970669146421, + "grad_norm": 1.3879794065972824, + "learning_rate": 3.186608803471792e-07, + "loss": 0.9382, + "step": 82240 + }, + { + "epoch": 6.373745592622728, + "grad_norm": 1.341461803314864, + "learning_rate": 3.1869962802231867e-07, + "loss": 0.9271, + "step": 82250 + }, + { + "epoch": 6.374520516099035, + "grad_norm": 1.3759594968363287, + "learning_rate": 3.187383756974582e-07, + "loss": 0.9258, + "step": 82260 + }, + { + "epoch": 6.375295439575342, + "grad_norm": 1.3412438008491692, + "learning_rate": 3.1877712337259766e-07, + "loss": 0.9217, + "step": 82270 + }, + { + "epoch": 6.376070363051649, + "grad_norm": 1.387573204659477, + "learning_rate": 3.188158710477372e-07, + "loss": 0.9477, + "step": 82280 + }, + { + "epoch": 6.3768452865279555, + "grad_norm": 1.4450917425559444, + "learning_rate": 3.1885461872287665e-07, + "loss": 0.9325, + "step": 82290 + }, + { + "epoch": 6.377620210004262, + "grad_norm": 1.3429680155502406, + "learning_rate": 3.188933663980161e-07, + "loss": 0.9468, + "step": 82300 + }, + { + "epoch": 6.378395133480569, + "grad_norm": 1.3430916646461508, + "learning_rate": 3.1893211407315564e-07, + "loss": 0.9224, + "step": 82310 + }, + { + "epoch": 6.379170056956876, + "grad_norm": 1.4606047596899951, + "learning_rate": 3.189708617482951e-07, + "loss": 0.9175, + "step": 82320 + }, + { + "epoch": 6.379944980433182, + "grad_norm": 1.3539523257324761, + "learning_rate": 3.1900960942343464e-07, + "loss": 0.9312, + "step": 82330 + }, + { + "epoch": 6.380719903909489, + "grad_norm": 1.3665430532381784, + "learning_rate": 3.190483570985741e-07, + "loss": 0.928, + "step": 82340 + }, + { + "epoch": 6.381494827385795, + "grad_norm": 1.4762058025629452, + "learning_rate": 3.1908710477371363e-07, + "loss": 0.9263, + "step": 82350 + }, + { + "epoch": 6.382269750862102, + "grad_norm": 1.4534927331408432, + "learning_rate": 3.191258524488531e-07, + "loss": 0.9268, + "step": 82360 + }, + { + "epoch": 6.383044674338409, + "grad_norm": 1.330924036097082, + "learning_rate": 3.1916460012399257e-07, + "loss": 0.944, + "step": 82370 + }, + { + "epoch": 6.383819597814716, + "grad_norm": 1.4535587132327752, + "learning_rate": 3.192033477991321e-07, + "loss": 0.9307, + "step": 82380 + }, + { + "epoch": 6.384594521291023, + "grad_norm": 1.3536674870220626, + "learning_rate": 3.1924209547427156e-07, + "loss": 0.9275, + "step": 82390 + }, + { + "epoch": 6.3853694447673295, + "grad_norm": 1.3969822416450746, + "learning_rate": 3.192808431494111e-07, + "loss": 0.9228, + "step": 82400 + }, + { + "epoch": 6.386144368243636, + "grad_norm": 1.343508767365189, + "learning_rate": 3.1931959082455055e-07, + "loss": 0.9309, + "step": 82410 + }, + { + "epoch": 6.386919291719943, + "grad_norm": 1.4871296383559522, + "learning_rate": 3.1935833849969007e-07, + "loss": 0.9338, + "step": 82420 + }, + { + "epoch": 6.387694215196249, + "grad_norm": 1.4190513403206109, + "learning_rate": 3.1939708617482954e-07, + "loss": 0.9289, + "step": 82430 + }, + { + "epoch": 6.388469138672556, + "grad_norm": 1.4989050370854877, + "learning_rate": 3.19435833849969e-07, + "loss": 0.9315, + "step": 82440 + }, + { + "epoch": 6.389244062148863, + "grad_norm": 1.3884142107283557, + "learning_rate": 3.1947458152510853e-07, + "loss": 0.9265, + "step": 82450 + }, + { + "epoch": 6.3900189856251695, + "grad_norm": 1.3029100864664105, + "learning_rate": 3.19513329200248e-07, + "loss": 0.912, + "step": 82460 + }, + { + "epoch": 6.390793909101476, + "grad_norm": 1.5258190043428854, + "learning_rate": 3.195520768753875e-07, + "loss": 0.9719, + "step": 82470 + }, + { + "epoch": 6.391568832577783, + "grad_norm": 1.3699496220476883, + "learning_rate": 3.19590824550527e-07, + "loss": 0.9295, + "step": 82480 + }, + { + "epoch": 6.39234375605409, + "grad_norm": 1.455029706124365, + "learning_rate": 3.1962957222566646e-07, + "loss": 0.9527, + "step": 82490 + }, + { + "epoch": 6.393118679530397, + "grad_norm": 1.4612774265122357, + "learning_rate": 3.19668319900806e-07, + "loss": 0.9585, + "step": 82500 + }, + { + "epoch": 6.393118679530397, + "eval_loss": 0.9372374415397644, + "eval_runtime": 330.9832, + "eval_samples_per_second": 34.657, + "eval_steps_per_second": 8.665, + "step": 82500 + }, + { + "epoch": 6.393893603006703, + "grad_norm": 1.4170380707570926, + "learning_rate": 3.1970706757594545e-07, + "loss": 0.9339, + "step": 82510 + }, + { + "epoch": 6.394668526483009, + "grad_norm": 1.4689691653921302, + "learning_rate": 3.1974581525108497e-07, + "loss": 0.9307, + "step": 82520 + }, + { + "epoch": 6.395443449959316, + "grad_norm": 1.4030333084204465, + "learning_rate": 3.1978456292622444e-07, + "loss": 0.9032, + "step": 82530 + }, + { + "epoch": 6.396218373435623, + "grad_norm": 1.3680658618323218, + "learning_rate": 3.1982331060136396e-07, + "loss": 0.9386, + "step": 82540 + }, + { + "epoch": 6.39699329691193, + "grad_norm": 1.408347155970064, + "learning_rate": 3.1986205827650343e-07, + "loss": 0.9278, + "step": 82550 + }, + { + "epoch": 6.397768220388237, + "grad_norm": 1.4686962437487932, + "learning_rate": 3.199008059516429e-07, + "loss": 0.9129, + "step": 82560 + }, + { + "epoch": 6.3985431438645435, + "grad_norm": 1.3924159710186663, + "learning_rate": 3.199395536267824e-07, + "loss": 0.9174, + "step": 82570 + }, + { + "epoch": 6.39931806734085, + "grad_norm": 1.4103067876699704, + "learning_rate": 3.199783013019219e-07, + "loss": 0.9217, + "step": 82580 + }, + { + "epoch": 6.400092990817157, + "grad_norm": 1.4207900918314929, + "learning_rate": 3.200170489770614e-07, + "loss": 0.921, + "step": 82590 + }, + { + "epoch": 6.400867914293464, + "grad_norm": 1.4231863001344662, + "learning_rate": 3.200557966522009e-07, + "loss": 0.924, + "step": 82600 + }, + { + "epoch": 6.40164283776977, + "grad_norm": 1.3394640651679977, + "learning_rate": 3.200945443273404e-07, + "loss": 0.9288, + "step": 82610 + }, + { + "epoch": 6.402417761246077, + "grad_norm": 1.4073165136641146, + "learning_rate": 3.2013329200247987e-07, + "loss": 0.9338, + "step": 82620 + }, + { + "epoch": 6.4031926847223835, + "grad_norm": 1.3859129598111595, + "learning_rate": 3.2017203967761934e-07, + "loss": 0.919, + "step": 82630 + }, + { + "epoch": 6.40396760819869, + "grad_norm": 1.3755741788652607, + "learning_rate": 3.2021078735275886e-07, + "loss": 0.9569, + "step": 82640 + }, + { + "epoch": 6.404742531674997, + "grad_norm": 1.4140134653965766, + "learning_rate": 3.2024953502789833e-07, + "loss": 0.9396, + "step": 82650 + }, + { + "epoch": 6.405517455151304, + "grad_norm": 1.508785419294078, + "learning_rate": 3.2028828270303786e-07, + "loss": 0.9228, + "step": 82660 + }, + { + "epoch": 6.406292378627611, + "grad_norm": 1.411790135444443, + "learning_rate": 3.203270303781773e-07, + "loss": 0.9573, + "step": 82670 + }, + { + "epoch": 6.4070673021039175, + "grad_norm": 1.4027072023205274, + "learning_rate": 3.2036577805331685e-07, + "loss": 0.945, + "step": 82680 + }, + { + "epoch": 6.407842225580224, + "grad_norm": 1.3600832234336182, + "learning_rate": 3.204045257284563e-07, + "loss": 0.9248, + "step": 82690 + }, + { + "epoch": 6.40861714905653, + "grad_norm": 1.4434551749272169, + "learning_rate": 3.204432734035958e-07, + "loss": 0.9166, + "step": 82700 + }, + { + "epoch": 6.409392072532837, + "grad_norm": 1.3466792817408761, + "learning_rate": 3.204820210787353e-07, + "loss": 0.9428, + "step": 82710 + }, + { + "epoch": 6.410166996009144, + "grad_norm": 1.4005574834733705, + "learning_rate": 3.205207687538748e-07, + "loss": 0.9406, + "step": 82720 + }, + { + "epoch": 6.410941919485451, + "grad_norm": 1.3995491767911694, + "learning_rate": 3.205595164290143e-07, + "loss": 0.9138, + "step": 82730 + }, + { + "epoch": 6.4117168429617575, + "grad_norm": 1.4067125405283893, + "learning_rate": 3.2059826410415377e-07, + "loss": 0.9384, + "step": 82740 + }, + { + "epoch": 6.412491766438064, + "grad_norm": 1.3568867515788583, + "learning_rate": 3.206370117792933e-07, + "loss": 0.9356, + "step": 82750 + }, + { + "epoch": 6.413266689914371, + "grad_norm": 1.3751464334437138, + "learning_rate": 3.2067575945443276e-07, + "loss": 0.9005, + "step": 82760 + }, + { + "epoch": 6.414041613390678, + "grad_norm": 1.4461866327546582, + "learning_rate": 3.2071450712957223e-07, + "loss": 0.9354, + "step": 82770 + }, + { + "epoch": 6.414816536866985, + "grad_norm": 1.4169427506851267, + "learning_rate": 3.2075325480471175e-07, + "loss": 0.9282, + "step": 82780 + }, + { + "epoch": 6.4155914603432915, + "grad_norm": 1.3817017120155797, + "learning_rate": 3.207920024798512e-07, + "loss": 0.9204, + "step": 82790 + }, + { + "epoch": 6.4163663838195975, + "grad_norm": 1.561730841755664, + "learning_rate": 3.2083075015499074e-07, + "loss": 0.9473, + "step": 82800 + }, + { + "epoch": 6.417141307295904, + "grad_norm": 1.4272611170504346, + "learning_rate": 3.208694978301302e-07, + "loss": 0.9274, + "step": 82810 + }, + { + "epoch": 6.417916230772211, + "grad_norm": 1.4281625651006882, + "learning_rate": 3.2090824550526973e-07, + "loss": 0.9279, + "step": 82820 + }, + { + "epoch": 6.418691154248518, + "grad_norm": 1.3948148170154364, + "learning_rate": 3.209469931804092e-07, + "loss": 0.9245, + "step": 82830 + }, + { + "epoch": 6.419466077724825, + "grad_norm": 1.4019589814899343, + "learning_rate": 3.2098574085554867e-07, + "loss": 0.9333, + "step": 82840 + }, + { + "epoch": 6.4202410012011315, + "grad_norm": 1.3557196918159964, + "learning_rate": 3.210244885306882e-07, + "loss": 0.9724, + "step": 82850 + }, + { + "epoch": 6.421015924677438, + "grad_norm": 1.3475285825373855, + "learning_rate": 3.2106323620582766e-07, + "loss": 0.9342, + "step": 82860 + }, + { + "epoch": 6.421790848153745, + "grad_norm": 1.4826916268926356, + "learning_rate": 3.211019838809672e-07, + "loss": 0.9421, + "step": 82870 + }, + { + "epoch": 6.422565771630052, + "grad_norm": 1.4002205353076895, + "learning_rate": 3.2114073155610665e-07, + "loss": 0.9556, + "step": 82880 + }, + { + "epoch": 6.423340695106358, + "grad_norm": 1.3390066081593945, + "learning_rate": 3.2117947923124617e-07, + "loss": 0.9328, + "step": 82890 + }, + { + "epoch": 6.424115618582665, + "grad_norm": 1.3981661166693338, + "learning_rate": 3.2121822690638564e-07, + "loss": 0.9218, + "step": 82900 + }, + { + "epoch": 6.4248905420589715, + "grad_norm": 1.3662793332877976, + "learning_rate": 3.212569745815251e-07, + "loss": 0.935, + "step": 82910 + }, + { + "epoch": 6.425665465535278, + "grad_norm": 1.462066471672983, + "learning_rate": 3.2129572225666463e-07, + "loss": 0.9319, + "step": 82920 + }, + { + "epoch": 6.426440389011585, + "grad_norm": 1.3977582787719045, + "learning_rate": 3.213344699318041e-07, + "loss": 0.9306, + "step": 82930 + }, + { + "epoch": 6.427215312487892, + "grad_norm": 1.4094894792157613, + "learning_rate": 3.213732176069436e-07, + "loss": 0.9434, + "step": 82940 + }, + { + "epoch": 6.427990235964199, + "grad_norm": 1.489314561933835, + "learning_rate": 3.214119652820831e-07, + "loss": 0.9343, + "step": 82950 + }, + { + "epoch": 6.4287651594405055, + "grad_norm": 1.4415881124326044, + "learning_rate": 3.214507129572226e-07, + "loss": 0.9404, + "step": 82960 + }, + { + "epoch": 6.429540082916812, + "grad_norm": 1.3951295779024704, + "learning_rate": 3.214894606323621e-07, + "loss": 0.9224, + "step": 82970 + }, + { + "epoch": 6.430315006393119, + "grad_norm": 1.469571233094426, + "learning_rate": 3.2152820830750155e-07, + "loss": 0.9462, + "step": 82980 + }, + { + "epoch": 6.431089929869425, + "grad_norm": 1.4100334049542644, + "learning_rate": 3.215669559826411e-07, + "loss": 0.9188, + "step": 82990 + }, + { + "epoch": 6.431864853345732, + "grad_norm": 1.3994355926437143, + "learning_rate": 3.2160570365778054e-07, + "loss": 0.9155, + "step": 83000 + }, + { + "epoch": 6.431864853345732, + "eval_loss": 0.9367989897727966, + "eval_runtime": 329.7571, + "eval_samples_per_second": 34.786, + "eval_steps_per_second": 8.697, + "step": 83000 + }, + { + "epoch": 6.432639776822039, + "grad_norm": 1.3702052541296095, + "learning_rate": 3.2164445133292007e-07, + "loss": 0.9386, + "step": 83010 + }, + { + "epoch": 6.4334147002983455, + "grad_norm": 1.3797209538740394, + "learning_rate": 3.2168319900805953e-07, + "loss": 0.9335, + "step": 83020 + }, + { + "epoch": 6.434189623774652, + "grad_norm": 1.4587487286447935, + "learning_rate": 3.2172194668319906e-07, + "loss": 0.9388, + "step": 83030 + }, + { + "epoch": 6.434964547250959, + "grad_norm": 1.4051322027513256, + "learning_rate": 3.217606943583385e-07, + "loss": 0.9595, + "step": 83040 + }, + { + "epoch": 6.435739470727266, + "grad_norm": 1.3948947877213234, + "learning_rate": 3.21799442033478e-07, + "loss": 0.9356, + "step": 83050 + }, + { + "epoch": 6.436514394203573, + "grad_norm": 1.4378356553166958, + "learning_rate": 3.218381897086175e-07, + "loss": 0.9381, + "step": 83060 + }, + { + "epoch": 6.437289317679879, + "grad_norm": 1.4249477848824135, + "learning_rate": 3.21876937383757e-07, + "loss": 0.9373, + "step": 83070 + }, + { + "epoch": 6.4380642411561855, + "grad_norm": 1.3790267895592443, + "learning_rate": 3.219156850588965e-07, + "loss": 0.9183, + "step": 83080 + }, + { + "epoch": 6.438839164632492, + "grad_norm": 1.3707640505408112, + "learning_rate": 3.21954432734036e-07, + "loss": 0.9177, + "step": 83090 + }, + { + "epoch": 6.439614088108799, + "grad_norm": 1.345709842042711, + "learning_rate": 3.219931804091755e-07, + "loss": 0.9513, + "step": 83100 + }, + { + "epoch": 6.440389011585106, + "grad_norm": 1.345148445497061, + "learning_rate": 3.2203192808431497e-07, + "loss": 0.9283, + "step": 83110 + }, + { + "epoch": 6.441163935061413, + "grad_norm": 1.3875292307971345, + "learning_rate": 3.2207067575945444e-07, + "loss": 0.9097, + "step": 83120 + }, + { + "epoch": 6.4419388585377195, + "grad_norm": 1.4177915050503795, + "learning_rate": 3.2210942343459396e-07, + "loss": 0.943, + "step": 83130 + }, + { + "epoch": 6.442713782014026, + "grad_norm": 1.3453214902520316, + "learning_rate": 3.2214817110973343e-07, + "loss": 0.9297, + "step": 83140 + }, + { + "epoch": 6.443488705490333, + "grad_norm": 1.3895414688910959, + "learning_rate": 3.2218691878487295e-07, + "loss": 0.9194, + "step": 83150 + }, + { + "epoch": 6.44426362896664, + "grad_norm": 1.4262156453626198, + "learning_rate": 3.222256664600124e-07, + "loss": 0.933, + "step": 83160 + }, + { + "epoch": 6.445038552442946, + "grad_norm": 1.4946456723357942, + "learning_rate": 3.2226441413515194e-07, + "loss": 0.9295, + "step": 83170 + }, + { + "epoch": 6.445813475919253, + "grad_norm": 1.5293667465239362, + "learning_rate": 3.223031618102914e-07, + "loss": 0.9238, + "step": 83180 + }, + { + "epoch": 6.4465883993955595, + "grad_norm": 1.4059613957822394, + "learning_rate": 3.223419094854309e-07, + "loss": 0.9436, + "step": 83190 + }, + { + "epoch": 6.447363322871866, + "grad_norm": 1.5143614210883762, + "learning_rate": 3.223806571605704e-07, + "loss": 0.9521, + "step": 83200 + }, + { + "epoch": 6.448138246348173, + "grad_norm": 1.434042811222748, + "learning_rate": 3.2241940483570987e-07, + "loss": 0.933, + "step": 83210 + }, + { + "epoch": 6.44891316982448, + "grad_norm": 1.3721413047732067, + "learning_rate": 3.224581525108494e-07, + "loss": 0.9324, + "step": 83220 + }, + { + "epoch": 6.449688093300787, + "grad_norm": 1.4594914363187212, + "learning_rate": 3.2249690018598886e-07, + "loss": 0.9413, + "step": 83230 + }, + { + "epoch": 6.4504630167770936, + "grad_norm": 1.4917061304782422, + "learning_rate": 3.2253564786112833e-07, + "loss": 0.9427, + "step": 83240 + }, + { + "epoch": 6.4512379402534, + "grad_norm": 1.3624865500964463, + "learning_rate": 3.2257439553626785e-07, + "loss": 0.9206, + "step": 83250 + }, + { + "epoch": 6.452012863729706, + "grad_norm": 1.3673260328054317, + "learning_rate": 3.226131432114073e-07, + "loss": 0.9183, + "step": 83260 + }, + { + "epoch": 6.452787787206013, + "grad_norm": 1.463554780178774, + "learning_rate": 3.2265189088654684e-07, + "loss": 0.9349, + "step": 83270 + }, + { + "epoch": 6.45356271068232, + "grad_norm": 1.4880952298032468, + "learning_rate": 3.226906385616863e-07, + "loss": 0.9412, + "step": 83280 + }, + { + "epoch": 6.454337634158627, + "grad_norm": 1.3614825813217115, + "learning_rate": 3.2272938623682583e-07, + "loss": 0.9551, + "step": 83290 + }, + { + "epoch": 6.4551125576349335, + "grad_norm": 1.3683084667212708, + "learning_rate": 3.227681339119653e-07, + "loss": 0.9255, + "step": 83300 + }, + { + "epoch": 6.45588748111124, + "grad_norm": 1.4659229284320991, + "learning_rate": 3.2280688158710477e-07, + "loss": 0.9468, + "step": 83310 + }, + { + "epoch": 6.456662404587547, + "grad_norm": 1.4260564862383651, + "learning_rate": 3.228456292622443e-07, + "loss": 0.9322, + "step": 83320 + }, + { + "epoch": 6.457437328063854, + "grad_norm": 1.3899150533306373, + "learning_rate": 3.2288437693738376e-07, + "loss": 0.9314, + "step": 83330 + }, + { + "epoch": 6.458212251540161, + "grad_norm": 1.4254333363394989, + "learning_rate": 3.229231246125233e-07, + "loss": 0.9243, + "step": 83340 + }, + { + "epoch": 6.458987175016468, + "grad_norm": 1.394417780760872, + "learning_rate": 3.2296187228766275e-07, + "loss": 0.9404, + "step": 83350 + }, + { + "epoch": 6.4597620984927735, + "grad_norm": 1.4454627512051408, + "learning_rate": 3.230006199628023e-07, + "loss": 0.9466, + "step": 83360 + }, + { + "epoch": 6.46053702196908, + "grad_norm": 1.3533108977683639, + "learning_rate": 3.2303936763794175e-07, + "loss": 0.9318, + "step": 83370 + }, + { + "epoch": 6.461311945445387, + "grad_norm": 1.328255580180587, + "learning_rate": 3.230781153130812e-07, + "loss": 0.923, + "step": 83380 + }, + { + "epoch": 6.462086868921694, + "grad_norm": 1.38189833488743, + "learning_rate": 3.2311686298822074e-07, + "loss": 0.9368, + "step": 83390 + }, + { + "epoch": 6.462861792398001, + "grad_norm": 1.399449237752047, + "learning_rate": 3.231556106633602e-07, + "loss": 0.922, + "step": 83400 + }, + { + "epoch": 6.4636367158743075, + "grad_norm": 1.4028205856937506, + "learning_rate": 3.2319435833849973e-07, + "loss": 0.9507, + "step": 83410 + }, + { + "epoch": 6.464411639350614, + "grad_norm": 1.402335959271809, + "learning_rate": 3.232331060136392e-07, + "loss": 0.9261, + "step": 83420 + }, + { + "epoch": 6.465186562826921, + "grad_norm": 1.4230775296880454, + "learning_rate": 3.232718536887787e-07, + "loss": 0.9364, + "step": 83430 + }, + { + "epoch": 6.465961486303227, + "grad_norm": 1.3486868870414785, + "learning_rate": 3.233106013639182e-07, + "loss": 0.9311, + "step": 83440 + }, + { + "epoch": 6.466736409779534, + "grad_norm": 1.3700669786638746, + "learning_rate": 3.2334934903905766e-07, + "loss": 0.9161, + "step": 83450 + }, + { + "epoch": 6.467511333255841, + "grad_norm": 1.522747336249541, + "learning_rate": 3.233880967141972e-07, + "loss": 0.9305, + "step": 83460 + }, + { + "epoch": 6.4682862567321475, + "grad_norm": 1.4573388104440457, + "learning_rate": 3.2342684438933665e-07, + "loss": 0.9287, + "step": 83470 + }, + { + "epoch": 6.469061180208454, + "grad_norm": 1.4402432322681522, + "learning_rate": 3.2346559206447617e-07, + "loss": 0.9352, + "step": 83480 + }, + { + "epoch": 6.469836103684761, + "grad_norm": 1.3780064310921591, + "learning_rate": 3.2350433973961564e-07, + "loss": 0.9472, + "step": 83490 + }, + { + "epoch": 6.470611027161068, + "grad_norm": 1.3132197546968478, + "learning_rate": 3.2354308741475516e-07, + "loss": 0.935, + "step": 83500 + }, + { + "epoch": 6.470611027161068, + "eval_loss": 0.9362419843673706, + "eval_runtime": 332.927, + "eval_samples_per_second": 34.455, + "eval_steps_per_second": 8.615, + "step": 83500 + }, + { + "epoch": 6.471385950637375, + "grad_norm": 1.5060270977711114, + "learning_rate": 3.2358183508989463e-07, + "loss": 0.9476, + "step": 83510 + }, + { + "epoch": 6.472160874113682, + "grad_norm": 1.4585639932736667, + "learning_rate": 3.236205827650341e-07, + "loss": 0.9319, + "step": 83520 + }, + { + "epoch": 6.472935797589988, + "grad_norm": 1.3336185705003205, + "learning_rate": 3.236593304401736e-07, + "loss": 0.95, + "step": 83530 + }, + { + "epoch": 6.473710721066294, + "grad_norm": 1.408351159056393, + "learning_rate": 3.236980781153131e-07, + "loss": 0.9425, + "step": 83540 + }, + { + "epoch": 6.474485644542601, + "grad_norm": 1.4088414143396912, + "learning_rate": 3.237368257904526e-07, + "loss": 0.9301, + "step": 83550 + }, + { + "epoch": 6.475260568018908, + "grad_norm": 1.3443778385749618, + "learning_rate": 3.237755734655921e-07, + "loss": 0.9284, + "step": 83560 + }, + { + "epoch": 6.476035491495215, + "grad_norm": 1.396665670072089, + "learning_rate": 3.238143211407316e-07, + "loss": 0.9293, + "step": 83570 + }, + { + "epoch": 6.4768104149715215, + "grad_norm": 1.4779457012012163, + "learning_rate": 3.2385306881587107e-07, + "loss": 0.9507, + "step": 83580 + }, + { + "epoch": 6.477585338447828, + "grad_norm": 1.5303188759961421, + "learning_rate": 3.2389181649101054e-07, + "loss": 0.9348, + "step": 83590 + }, + { + "epoch": 6.478360261924135, + "grad_norm": 1.4169350342940996, + "learning_rate": 3.2393056416615006e-07, + "loss": 0.937, + "step": 83600 + }, + { + "epoch": 6.479135185400442, + "grad_norm": 1.4075349815819886, + "learning_rate": 3.2396931184128953e-07, + "loss": 0.9214, + "step": 83610 + }, + { + "epoch": 6.479910108876749, + "grad_norm": 1.4594667092911202, + "learning_rate": 3.2400805951642905e-07, + "loss": 0.9226, + "step": 83620 + }, + { + "epoch": 6.480685032353055, + "grad_norm": 1.3975799233478003, + "learning_rate": 3.240468071915685e-07, + "loss": 0.9228, + "step": 83630 + }, + { + "epoch": 6.4814599558293615, + "grad_norm": 1.5486402756510824, + "learning_rate": 3.2408555486670804e-07, + "loss": 0.9695, + "step": 83640 + }, + { + "epoch": 6.482234879305668, + "grad_norm": 1.4528864811995652, + "learning_rate": 3.241243025418475e-07, + "loss": 0.9346, + "step": 83650 + }, + { + "epoch": 6.483009802781975, + "grad_norm": 1.4091563512205845, + "learning_rate": 3.24163050216987e-07, + "loss": 0.9257, + "step": 83660 + }, + { + "epoch": 6.483784726258282, + "grad_norm": 1.4826360898730249, + "learning_rate": 3.242017978921265e-07, + "loss": 0.9378, + "step": 83670 + }, + { + "epoch": 6.484559649734589, + "grad_norm": 1.4471050972014372, + "learning_rate": 3.24240545567266e-07, + "loss": 0.8954, + "step": 83680 + }, + { + "epoch": 6.485334573210896, + "grad_norm": 1.4748068153920455, + "learning_rate": 3.242792932424055e-07, + "loss": 0.9701, + "step": 83690 + }, + { + "epoch": 6.486109496687202, + "grad_norm": 1.3247735296474266, + "learning_rate": 3.2431804091754496e-07, + "loss": 0.943, + "step": 83700 + }, + { + "epoch": 6.486884420163509, + "grad_norm": 2.0491655571238576, + "learning_rate": 3.243567885926845e-07, + "loss": 0.9331, + "step": 83710 + }, + { + "epoch": 6.487659343639816, + "grad_norm": 1.3317837943408817, + "learning_rate": 3.2439553626782396e-07, + "loss": 0.9178, + "step": 83720 + }, + { + "epoch": 6.488434267116122, + "grad_norm": 1.373022664816514, + "learning_rate": 3.244342839429634e-07, + "loss": 0.93, + "step": 83730 + }, + { + "epoch": 6.489209190592429, + "grad_norm": 1.3915486762267075, + "learning_rate": 3.2447303161810295e-07, + "loss": 0.9516, + "step": 83740 + }, + { + "epoch": 6.4899841140687355, + "grad_norm": 1.3402749566480958, + "learning_rate": 3.245117792932424e-07, + "loss": 0.9485, + "step": 83750 + }, + { + "epoch": 6.490759037545042, + "grad_norm": 1.3356646319852463, + "learning_rate": 3.2455052696838194e-07, + "loss": 0.9275, + "step": 83760 + }, + { + "epoch": 6.491533961021349, + "grad_norm": 1.4282215375216847, + "learning_rate": 3.245892746435214e-07, + "loss": 0.947, + "step": 83770 + }, + { + "epoch": 6.492308884497656, + "grad_norm": 1.4523344672981664, + "learning_rate": 3.2462802231866093e-07, + "loss": 0.9251, + "step": 83780 + }, + { + "epoch": 6.493083807973963, + "grad_norm": 1.4749999076422189, + "learning_rate": 3.246667699938004e-07, + "loss": 0.9335, + "step": 83790 + }, + { + "epoch": 6.49385873145027, + "grad_norm": 1.5074384173340147, + "learning_rate": 3.2470551766893987e-07, + "loss": 0.9347, + "step": 83800 + }, + { + "epoch": 6.494633654926576, + "grad_norm": 1.4583752333629971, + "learning_rate": 3.247442653440794e-07, + "loss": 0.9321, + "step": 83810 + }, + { + "epoch": 6.495408578402882, + "grad_norm": 1.4633076412629695, + "learning_rate": 3.2478301301921886e-07, + "loss": 0.9226, + "step": 83820 + }, + { + "epoch": 6.496183501879189, + "grad_norm": 1.3430283081465781, + "learning_rate": 3.248217606943584e-07, + "loss": 0.9207, + "step": 83830 + }, + { + "epoch": 6.496958425355496, + "grad_norm": 1.472328921434914, + "learning_rate": 3.2486050836949785e-07, + "loss": 0.9657, + "step": 83840 + }, + { + "epoch": 6.497733348831803, + "grad_norm": 1.3507648370514862, + "learning_rate": 3.2489925604463737e-07, + "loss": 0.9355, + "step": 83850 + }, + { + "epoch": 6.49850827230811, + "grad_norm": 1.362311278682436, + "learning_rate": 3.2493800371977684e-07, + "loss": 0.9222, + "step": 83860 + }, + { + "epoch": 6.499283195784416, + "grad_norm": 1.495925013339561, + "learning_rate": 3.249767513949163e-07, + "loss": 0.9328, + "step": 83870 + }, + { + "epoch": 6.500058119260723, + "grad_norm": 1.416483245466835, + "learning_rate": 3.2501549907005583e-07, + "loss": 0.9497, + "step": 83880 + }, + { + "epoch": 6.50083304273703, + "grad_norm": 1.3875811017355717, + "learning_rate": 3.250542467451953e-07, + "loss": 0.9217, + "step": 83890 + }, + { + "epoch": 6.501607966213337, + "grad_norm": 1.4237414973541571, + "learning_rate": 3.250929944203348e-07, + "loss": 0.9358, + "step": 83900 + }, + { + "epoch": 6.502382889689644, + "grad_norm": 1.318226639587706, + "learning_rate": 3.251317420954743e-07, + "loss": 0.9274, + "step": 83910 + }, + { + "epoch": 6.5031578131659495, + "grad_norm": 1.3791991302905857, + "learning_rate": 3.251704897706138e-07, + "loss": 0.934, + "step": 83920 + }, + { + "epoch": 6.503932736642256, + "grad_norm": 1.4516386782638602, + "learning_rate": 3.252092374457533e-07, + "loss": 0.9286, + "step": 83930 + }, + { + "epoch": 6.504707660118563, + "grad_norm": 1.3839204789197543, + "learning_rate": 3.2524798512089275e-07, + "loss": 0.935, + "step": 83940 + }, + { + "epoch": 6.50548258359487, + "grad_norm": 1.3761251516328064, + "learning_rate": 3.2528673279603227e-07, + "loss": 0.9474, + "step": 83950 + }, + { + "epoch": 6.506257507071177, + "grad_norm": 1.3817107410193268, + "learning_rate": 3.2532548047117174e-07, + "loss": 0.9393, + "step": 83960 + }, + { + "epoch": 6.507032430547484, + "grad_norm": 1.3828591002014388, + "learning_rate": 3.2536422814631126e-07, + "loss": 0.9204, + "step": 83970 + }, + { + "epoch": 6.50780735402379, + "grad_norm": 1.4608744178047126, + "learning_rate": 3.2540297582145073e-07, + "loss": 0.9251, + "step": 83980 + }, + { + "epoch": 6.508582277500097, + "grad_norm": 1.3850170245946403, + "learning_rate": 3.254417234965902e-07, + "loss": 0.9214, + "step": 83990 + }, + { + "epoch": 6.509357200976403, + "grad_norm": 1.4201013564725895, + "learning_rate": 3.254804711717297e-07, + "loss": 0.9237, + "step": 84000 + }, + { + "epoch": 6.509357200976403, + "eval_loss": 0.935834527015686, + "eval_runtime": 332.7145, + "eval_samples_per_second": 34.477, + "eval_steps_per_second": 8.62, + "step": 84000 + }, + { + "epoch": 6.51013212445271, + "grad_norm": 1.4338409763222162, + "learning_rate": 3.255192188468692e-07, + "loss": 0.9334, + "step": 84010 + }, + { + "epoch": 6.510907047929017, + "grad_norm": 1.330867749893648, + "learning_rate": 3.255579665220087e-07, + "loss": 0.9355, + "step": 84020 + }, + { + "epoch": 6.511681971405324, + "grad_norm": 1.4223595523609898, + "learning_rate": 3.255967141971482e-07, + "loss": 0.9269, + "step": 84030 + }, + { + "epoch": 6.51245689488163, + "grad_norm": 1.4146776286265508, + "learning_rate": 3.256354618722877e-07, + "loss": 0.9496, + "step": 84040 + }, + { + "epoch": 6.513231818357937, + "grad_norm": 1.423979945885184, + "learning_rate": 3.256742095474272e-07, + "loss": 0.9074, + "step": 84050 + }, + { + "epoch": 6.514006741834244, + "grad_norm": 1.3658206314748484, + "learning_rate": 3.2571295722256664e-07, + "loss": 0.9458, + "step": 84060 + }, + { + "epoch": 6.514781665310551, + "grad_norm": 1.3733928432226923, + "learning_rate": 3.2575170489770617e-07, + "loss": 0.9387, + "step": 84070 + }, + { + "epoch": 6.515556588786858, + "grad_norm": 1.411504083429672, + "learning_rate": 3.2579045257284564e-07, + "loss": 0.9092, + "step": 84080 + }, + { + "epoch": 6.516331512263164, + "grad_norm": 1.4072915297424846, + "learning_rate": 3.2582920024798516e-07, + "loss": 0.9588, + "step": 84090 + }, + { + "epoch": 6.51710643573947, + "grad_norm": 1.4731580795385129, + "learning_rate": 3.2586794792312463e-07, + "loss": 0.9502, + "step": 84100 + }, + { + "epoch": 6.517881359215777, + "grad_norm": 1.4854907284277075, + "learning_rate": 3.2590669559826415e-07, + "loss": 0.925, + "step": 84110 + }, + { + "epoch": 6.518656282692084, + "grad_norm": 1.4494519860074615, + "learning_rate": 3.259454432734036e-07, + "loss": 0.929, + "step": 84120 + }, + { + "epoch": 6.519431206168391, + "grad_norm": 1.4640693307988335, + "learning_rate": 3.259841909485431e-07, + "loss": 0.9321, + "step": 84130 + }, + { + "epoch": 6.520206129644698, + "grad_norm": 1.3672203405147596, + "learning_rate": 3.260229386236826e-07, + "loss": 0.9166, + "step": 84140 + }, + { + "epoch": 6.520981053121004, + "grad_norm": 1.4322642127865366, + "learning_rate": 3.260616862988221e-07, + "loss": 0.9446, + "step": 84150 + }, + { + "epoch": 6.521755976597311, + "grad_norm": 1.4351477008993105, + "learning_rate": 3.261004339739616e-07, + "loss": 0.931, + "step": 84160 + }, + { + "epoch": 6.522530900073618, + "grad_norm": 1.4658455733516649, + "learning_rate": 3.2613918164910107e-07, + "loss": 0.9243, + "step": 84170 + }, + { + "epoch": 6.523305823549925, + "grad_norm": 1.3512428672363366, + "learning_rate": 3.261779293242406e-07, + "loss": 0.919, + "step": 84180 + }, + { + "epoch": 6.524080747026231, + "grad_norm": 1.3478407265000971, + "learning_rate": 3.2621667699938006e-07, + "loss": 0.9574, + "step": 84190 + }, + { + "epoch": 6.524855670502538, + "grad_norm": 1.378652232542485, + "learning_rate": 3.2625542467451953e-07, + "loss": 0.9279, + "step": 84200 + }, + { + "epoch": 6.525630593978844, + "grad_norm": 1.4911828273780625, + "learning_rate": 3.2629417234965905e-07, + "loss": 0.9238, + "step": 84210 + }, + { + "epoch": 6.526405517455151, + "grad_norm": 1.3980825549979727, + "learning_rate": 3.263329200247985e-07, + "loss": 0.9196, + "step": 84220 + }, + { + "epoch": 6.527180440931458, + "grad_norm": 1.4578457156320743, + "learning_rate": 3.2637166769993804e-07, + "loss": 0.9199, + "step": 84230 + }, + { + "epoch": 6.527955364407765, + "grad_norm": 1.4258297381289984, + "learning_rate": 3.264104153750775e-07, + "loss": 0.9587, + "step": 84240 + }, + { + "epoch": 6.528730287884072, + "grad_norm": 1.4064947721416814, + "learning_rate": 3.2644916305021703e-07, + "loss": 0.9263, + "step": 84250 + }, + { + "epoch": 6.529505211360378, + "grad_norm": 1.301333996649877, + "learning_rate": 3.264879107253565e-07, + "loss": 0.9339, + "step": 84260 + }, + { + "epoch": 6.530280134836685, + "grad_norm": 1.3540060615432177, + "learning_rate": 3.2652665840049597e-07, + "loss": 0.9509, + "step": 84270 + }, + { + "epoch": 6.531055058312992, + "grad_norm": 1.4020996109616155, + "learning_rate": 3.265654060756355e-07, + "loss": 0.9125, + "step": 84280 + }, + { + "epoch": 6.531829981789298, + "grad_norm": 1.4234663418890108, + "learning_rate": 3.2660415375077496e-07, + "loss": 0.9296, + "step": 84290 + }, + { + "epoch": 6.532604905265605, + "grad_norm": 1.4524946545198663, + "learning_rate": 3.266429014259145e-07, + "loss": 0.9365, + "step": 84300 + }, + { + "epoch": 6.533379828741912, + "grad_norm": 1.3819295393010773, + "learning_rate": 3.2668164910105395e-07, + "loss": 0.9309, + "step": 84310 + }, + { + "epoch": 6.534154752218218, + "grad_norm": 1.4029110671953444, + "learning_rate": 3.267203967761935e-07, + "loss": 0.9271, + "step": 84320 + }, + { + "epoch": 6.534929675694525, + "grad_norm": 1.4654081456335528, + "learning_rate": 3.2675914445133294e-07, + "loss": 0.9478, + "step": 84330 + }, + { + "epoch": 6.535704599170832, + "grad_norm": 1.40590275261802, + "learning_rate": 3.267978921264724e-07, + "loss": 0.9229, + "step": 84340 + }, + { + "epoch": 6.536479522647139, + "grad_norm": 1.4773374892805051, + "learning_rate": 3.2683663980161193e-07, + "loss": 0.9177, + "step": 84350 + }, + { + "epoch": 6.537254446123446, + "grad_norm": 1.5005966620741358, + "learning_rate": 3.268753874767514e-07, + "loss": 0.9134, + "step": 84360 + }, + { + "epoch": 6.538029369599752, + "grad_norm": 1.4179089673930692, + "learning_rate": 3.269141351518909e-07, + "loss": 0.9644, + "step": 84370 + }, + { + "epoch": 6.538804293076058, + "grad_norm": 1.3406082820985432, + "learning_rate": 3.269528828270304e-07, + "loss": 0.9396, + "step": 84380 + }, + { + "epoch": 6.539579216552365, + "grad_norm": 1.339052823086556, + "learning_rate": 3.269916305021699e-07, + "loss": 0.9065, + "step": 84390 + }, + { + "epoch": 6.540354140028672, + "grad_norm": 1.3438985843359121, + "learning_rate": 3.270303781773094e-07, + "loss": 0.9069, + "step": 84400 + }, + { + "epoch": 6.541129063504979, + "grad_norm": 1.390338064192136, + "learning_rate": 3.2706912585244885e-07, + "loss": 0.9253, + "step": 84410 + }, + { + "epoch": 6.541903986981286, + "grad_norm": 1.4451481413347886, + "learning_rate": 3.271078735275884e-07, + "loss": 0.9437, + "step": 84420 + }, + { + "epoch": 6.542678910457592, + "grad_norm": 1.4072220970026363, + "learning_rate": 3.2714662120272785e-07, + "loss": 0.9459, + "step": 84430 + }, + { + "epoch": 6.543453833933899, + "grad_norm": 1.4635736007203421, + "learning_rate": 3.2718536887786737e-07, + "loss": 0.949, + "step": 84440 + }, + { + "epoch": 6.544228757410206, + "grad_norm": 1.4438603766174418, + "learning_rate": 3.2722411655300684e-07, + "loss": 0.9171, + "step": 84450 + }, + { + "epoch": 6.545003680886513, + "grad_norm": 1.4553971751214523, + "learning_rate": 3.2726286422814636e-07, + "loss": 0.9277, + "step": 84460 + }, + { + "epoch": 6.54577860436282, + "grad_norm": 1.37563180327903, + "learning_rate": 3.2730161190328583e-07, + "loss": 0.9386, + "step": 84470 + }, + { + "epoch": 6.546553527839126, + "grad_norm": 1.3149658812297669, + "learning_rate": 3.273403595784253e-07, + "loss": 0.9393, + "step": 84480 + }, + { + "epoch": 6.547328451315432, + "grad_norm": 1.343433862038656, + "learning_rate": 3.273791072535648e-07, + "loss": 0.9319, + "step": 84490 + }, + { + "epoch": 6.548103374791739, + "grad_norm": 1.3347208937970088, + "learning_rate": 3.274178549287043e-07, + "loss": 0.9255, + "step": 84500 + }, + { + "epoch": 6.548103374791739, + "eval_loss": 0.9353747963905334, + "eval_runtime": 333.7281, + "eval_samples_per_second": 34.372, + "eval_steps_per_second": 8.594, + "step": 84500 + }, + { + "epoch": 6.548878298268046, + "grad_norm": 1.3770218731724035, + "learning_rate": 3.274566026038438e-07, + "loss": 0.9124, + "step": 84510 + }, + { + "epoch": 6.549653221744353, + "grad_norm": 1.3762302584390544, + "learning_rate": 3.274953502789833e-07, + "loss": 0.9455, + "step": 84520 + }, + { + "epoch": 6.55042814522066, + "grad_norm": 1.3911979690625351, + "learning_rate": 3.275340979541228e-07, + "loss": 0.9184, + "step": 84530 + }, + { + "epoch": 6.5512030686969664, + "grad_norm": 1.472502351832089, + "learning_rate": 3.2757284562926227e-07, + "loss": 0.9443, + "step": 84540 + }, + { + "epoch": 6.551977992173273, + "grad_norm": 1.371659195330664, + "learning_rate": 3.2761159330440174e-07, + "loss": 0.9282, + "step": 84550 + }, + { + "epoch": 6.552752915649579, + "grad_norm": 1.2500178149786347, + "learning_rate": 3.2765034097954126e-07, + "loss": 0.9397, + "step": 84560 + }, + { + "epoch": 6.553527839125886, + "grad_norm": 1.434190349029732, + "learning_rate": 3.2768908865468073e-07, + "loss": 0.9309, + "step": 84570 + }, + { + "epoch": 6.554302762602193, + "grad_norm": 1.4717163662155248, + "learning_rate": 3.2772783632982025e-07, + "loss": 0.9293, + "step": 84580 + }, + { + "epoch": 6.5550776860785, + "grad_norm": 1.4278366023668319, + "learning_rate": 3.277665840049597e-07, + "loss": 0.9283, + "step": 84590 + }, + { + "epoch": 6.555852609554806, + "grad_norm": 1.4013947161250082, + "learning_rate": 3.2780533168009924e-07, + "loss": 0.9242, + "step": 84600 + }, + { + "epoch": 6.556627533031113, + "grad_norm": 1.5366681526902362, + "learning_rate": 3.278440793552387e-07, + "loss": 0.9336, + "step": 84610 + }, + { + "epoch": 6.55740245650742, + "grad_norm": 1.3270866697183377, + "learning_rate": 3.278828270303782e-07, + "loss": 0.9243, + "step": 84620 + }, + { + "epoch": 6.558177379983727, + "grad_norm": 1.3770910560169554, + "learning_rate": 3.279215747055177e-07, + "loss": 0.9215, + "step": 84630 + }, + { + "epoch": 6.558952303460034, + "grad_norm": 1.3508568542151036, + "learning_rate": 3.2796032238065717e-07, + "loss": 0.9478, + "step": 84640 + }, + { + "epoch": 6.5597272269363405, + "grad_norm": 1.3432406961427192, + "learning_rate": 3.279990700557967e-07, + "loss": 0.9216, + "step": 84650 + }, + { + "epoch": 6.560502150412646, + "grad_norm": 1.3704452310420452, + "learning_rate": 3.2803781773093616e-07, + "loss": 0.9427, + "step": 84660 + }, + { + "epoch": 6.561277073888953, + "grad_norm": 1.3441388709829054, + "learning_rate": 3.280765654060757e-07, + "loss": 0.9248, + "step": 84670 + }, + { + "epoch": 6.56205199736526, + "grad_norm": 1.3046855457743693, + "learning_rate": 3.2811531308121515e-07, + "loss": 0.9292, + "step": 84680 + }, + { + "epoch": 6.562826920841567, + "grad_norm": 1.357857377535192, + "learning_rate": 3.281540607563546e-07, + "loss": 0.9308, + "step": 84690 + }, + { + "epoch": 6.563601844317874, + "grad_norm": 1.4061982932636385, + "learning_rate": 3.2819280843149415e-07, + "loss": 0.9033, + "step": 84700 + }, + { + "epoch": 6.5643767677941804, + "grad_norm": 1.395602990264258, + "learning_rate": 3.282315561066336e-07, + "loss": 0.9475, + "step": 84710 + }, + { + "epoch": 6.565151691270487, + "grad_norm": 1.3138634561696076, + "learning_rate": 3.2827030378177314e-07, + "loss": 0.9284, + "step": 84720 + }, + { + "epoch": 6.565926614746794, + "grad_norm": 1.345711934126876, + "learning_rate": 3.283090514569126e-07, + "loss": 0.9103, + "step": 84730 + }, + { + "epoch": 6.5667015382231, + "grad_norm": 1.413306959710907, + "learning_rate": 3.283477991320521e-07, + "loss": 0.9475, + "step": 84740 + }, + { + "epoch": 6.567476461699407, + "grad_norm": 1.4131125207245068, + "learning_rate": 3.283865468071916e-07, + "loss": 0.9398, + "step": 84750 + }, + { + "epoch": 6.568251385175714, + "grad_norm": 1.3814159636798768, + "learning_rate": 3.2842529448233107e-07, + "loss": 0.9408, + "step": 84760 + }, + { + "epoch": 6.56902630865202, + "grad_norm": 1.420687936307888, + "learning_rate": 3.284640421574706e-07, + "loss": 0.9537, + "step": 84770 + }, + { + "epoch": 6.569801232128327, + "grad_norm": 1.3777605215516644, + "learning_rate": 3.2850278983261006e-07, + "loss": 0.9163, + "step": 84780 + }, + { + "epoch": 6.570576155604634, + "grad_norm": 1.3464837453067744, + "learning_rate": 3.285415375077496e-07, + "loss": 0.9341, + "step": 84790 + }, + { + "epoch": 6.571351079080941, + "grad_norm": 1.4546909801501104, + "learning_rate": 3.2858028518288905e-07, + "loss": 0.9647, + "step": 84800 + }, + { + "epoch": 6.572126002557248, + "grad_norm": 1.3960385497633112, + "learning_rate": 3.286190328580285e-07, + "loss": 0.9241, + "step": 84810 + }, + { + "epoch": 6.5729009260335545, + "grad_norm": 1.432690335513437, + "learning_rate": 3.2865778053316804e-07, + "loss": 0.9412, + "step": 84820 + }, + { + "epoch": 6.573675849509861, + "grad_norm": 1.3968771760277667, + "learning_rate": 3.286965282083075e-07, + "loss": 0.9263, + "step": 84830 + }, + { + "epoch": 6.574450772986168, + "grad_norm": 1.4886937792854564, + "learning_rate": 3.2873527588344703e-07, + "loss": 0.9303, + "step": 84840 + }, + { + "epoch": 6.575225696462474, + "grad_norm": 1.4854842837898434, + "learning_rate": 3.287740235585865e-07, + "loss": 0.9351, + "step": 84850 + }, + { + "epoch": 6.576000619938781, + "grad_norm": 1.466015014187737, + "learning_rate": 3.28812771233726e-07, + "loss": 0.9269, + "step": 84860 + }, + { + "epoch": 6.576775543415088, + "grad_norm": 1.323262706090754, + "learning_rate": 3.288515189088655e-07, + "loss": 0.9157, + "step": 84870 + }, + { + "epoch": 6.577550466891394, + "grad_norm": 1.3912557136872634, + "learning_rate": 3.2889026658400496e-07, + "loss": 0.9663, + "step": 84880 + }, + { + "epoch": 6.578325390367701, + "grad_norm": 1.3903701716211132, + "learning_rate": 3.289290142591445e-07, + "loss": 0.9423, + "step": 84890 + }, + { + "epoch": 6.579100313844008, + "grad_norm": 1.3972556955481816, + "learning_rate": 3.2896776193428395e-07, + "loss": 0.9248, + "step": 84900 + }, + { + "epoch": 6.579875237320315, + "grad_norm": 1.3203122839578478, + "learning_rate": 3.2900650960942347e-07, + "loss": 0.9389, + "step": 84910 + }, + { + "epoch": 6.580650160796622, + "grad_norm": 1.4767840493820061, + "learning_rate": 3.2904525728456294e-07, + "loss": 0.902, + "step": 84920 + }, + { + "epoch": 6.581425084272928, + "grad_norm": 1.3720941381762672, + "learning_rate": 3.2908400495970246e-07, + "loss": 0.9304, + "step": 84930 + }, + { + "epoch": 6.582200007749234, + "grad_norm": 1.3893723461436094, + "learning_rate": 3.2912275263484193e-07, + "loss": 0.9326, + "step": 84940 + }, + { + "epoch": 6.582974931225541, + "grad_norm": 1.286947947979054, + "learning_rate": 3.291615003099814e-07, + "loss": 0.9194, + "step": 84950 + }, + { + "epoch": 6.583749854701848, + "grad_norm": 1.3443345024185127, + "learning_rate": 3.292002479851209e-07, + "loss": 0.9098, + "step": 84960 + }, + { + "epoch": 6.584524778178155, + "grad_norm": 1.4303055919893763, + "learning_rate": 3.292389956602604e-07, + "loss": 0.9349, + "step": 84970 + }, + { + "epoch": 6.585299701654462, + "grad_norm": 1.4457124117156335, + "learning_rate": 3.292777433353999e-07, + "loss": 0.9218, + "step": 84980 + }, + { + "epoch": 6.5860746251307685, + "grad_norm": 1.35580567971086, + "learning_rate": 3.293164910105394e-07, + "loss": 0.9429, + "step": 84990 + }, + { + "epoch": 6.586849548607075, + "grad_norm": 1.342590003742134, + "learning_rate": 3.293552386856789e-07, + "loss": 0.9195, + "step": 85000 + }, + { + "epoch": 6.586849548607075, + "eval_loss": 0.9349644184112549, + "eval_runtime": 333.9199, + "eval_samples_per_second": 34.353, + "eval_steps_per_second": 8.589, + "step": 85000 + }, + { + "epoch": 6.587624472083382, + "grad_norm": 1.4767855632327294, + "learning_rate": 3.293939863608184e-07, + "loss": 0.9317, + "step": 85010 + }, + { + "epoch": 6.588399395559689, + "grad_norm": 1.4780573668494899, + "learning_rate": 3.2943273403595784e-07, + "loss": 0.9535, + "step": 85020 + }, + { + "epoch": 6.589174319035995, + "grad_norm": 1.3604562297976428, + "learning_rate": 3.2947148171109736e-07, + "loss": 0.9292, + "step": 85030 + }, + { + "epoch": 6.589949242512302, + "grad_norm": 1.3268383342760381, + "learning_rate": 3.2951022938623683e-07, + "loss": 0.9164, + "step": 85040 + }, + { + "epoch": 6.590724165988608, + "grad_norm": 1.4526929278891598, + "learning_rate": 3.2954897706137636e-07, + "loss": 0.9535, + "step": 85050 + }, + { + "epoch": 6.591499089464915, + "grad_norm": 1.4006640725464274, + "learning_rate": 3.295877247365158e-07, + "loss": 0.9408, + "step": 85060 + }, + { + "epoch": 6.592274012941222, + "grad_norm": 1.367350519499839, + "learning_rate": 3.2962647241165535e-07, + "loss": 0.9407, + "step": 85070 + }, + { + "epoch": 6.593048936417529, + "grad_norm": 1.4089567972600812, + "learning_rate": 3.296652200867948e-07, + "loss": 0.937, + "step": 85080 + }, + { + "epoch": 6.593823859893836, + "grad_norm": 1.3501277677422343, + "learning_rate": 3.297039677619343e-07, + "loss": 0.9783, + "step": 85090 + }, + { + "epoch": 6.5945987833701425, + "grad_norm": 1.3902646064193807, + "learning_rate": 3.297427154370738e-07, + "loss": 0.926, + "step": 85100 + }, + { + "epoch": 6.595373706846448, + "grad_norm": 1.3702778024643567, + "learning_rate": 3.297814631122133e-07, + "loss": 0.9378, + "step": 85110 + }, + { + "epoch": 6.596148630322755, + "grad_norm": 1.3576570879089174, + "learning_rate": 3.298202107873528e-07, + "loss": 0.9252, + "step": 85120 + }, + { + "epoch": 6.596923553799062, + "grad_norm": 1.4185063228198367, + "learning_rate": 3.2985895846249227e-07, + "loss": 0.9483, + "step": 85130 + }, + { + "epoch": 6.597698477275369, + "grad_norm": 1.4075308619207256, + "learning_rate": 3.298977061376318e-07, + "loss": 0.9222, + "step": 85140 + }, + { + "epoch": 6.598473400751676, + "grad_norm": 1.37110275407927, + "learning_rate": 3.2993645381277126e-07, + "loss": 0.9341, + "step": 85150 + }, + { + "epoch": 6.5992483242279825, + "grad_norm": 1.3652715808293048, + "learning_rate": 3.2997520148791073e-07, + "loss": 0.9209, + "step": 85160 + }, + { + "epoch": 6.600023247704289, + "grad_norm": 1.4431077484987744, + "learning_rate": 3.3001394916305025e-07, + "loss": 0.9571, + "step": 85170 + }, + { + "epoch": 6.600798171180596, + "grad_norm": 1.3792524618977178, + "learning_rate": 3.300526968381897e-07, + "loss": 0.9237, + "step": 85180 + }, + { + "epoch": 6.601573094656903, + "grad_norm": 1.343046988432937, + "learning_rate": 3.3009144451332924e-07, + "loss": 0.9396, + "step": 85190 + }, + { + "epoch": 6.60234801813321, + "grad_norm": 1.4598397878337512, + "learning_rate": 3.301301921884687e-07, + "loss": 0.9416, + "step": 85200 + }, + { + "epoch": 6.6031229416095165, + "grad_norm": 1.3923692093136015, + "learning_rate": 3.3016893986360823e-07, + "loss": 0.9171, + "step": 85210 + }, + { + "epoch": 6.603897865085822, + "grad_norm": 1.4379529160055562, + "learning_rate": 3.302076875387477e-07, + "loss": 0.9431, + "step": 85220 + }, + { + "epoch": 6.604672788562129, + "grad_norm": 1.3605562115984464, + "learning_rate": 3.3024643521388717e-07, + "loss": 0.9591, + "step": 85230 + }, + { + "epoch": 6.605447712038436, + "grad_norm": 1.5284280837149837, + "learning_rate": 3.302851828890267e-07, + "loss": 0.9407, + "step": 85240 + }, + { + "epoch": 6.606222635514743, + "grad_norm": 1.3627633653221793, + "learning_rate": 3.3032393056416616e-07, + "loss": 0.9147, + "step": 85250 + }, + { + "epoch": 6.60699755899105, + "grad_norm": 1.4360154259958071, + "learning_rate": 3.303626782393057e-07, + "loss": 0.9375, + "step": 85260 + }, + { + "epoch": 6.6077724824673565, + "grad_norm": 1.3886201928110644, + "learning_rate": 3.3040142591444515e-07, + "loss": 0.9465, + "step": 85270 + }, + { + "epoch": 6.608547405943663, + "grad_norm": 1.4016056225803981, + "learning_rate": 3.3044017358958467e-07, + "loss": 0.9508, + "step": 85280 + }, + { + "epoch": 6.60932232941997, + "grad_norm": 1.3226915236261385, + "learning_rate": 3.3047892126472414e-07, + "loss": 0.9151, + "step": 85290 + }, + { + "epoch": 6.610097252896276, + "grad_norm": 1.4683219970057821, + "learning_rate": 3.305176689398636e-07, + "loss": 0.9284, + "step": 85300 + }, + { + "epoch": 6.610872176372583, + "grad_norm": 1.4369333435124918, + "learning_rate": 3.3055641661500313e-07, + "loss": 0.9449, + "step": 85310 + }, + { + "epoch": 6.61164709984889, + "grad_norm": 1.465647173976451, + "learning_rate": 3.305951642901426e-07, + "loss": 0.9361, + "step": 85320 + }, + { + "epoch": 6.6124220233251965, + "grad_norm": 1.4622928104919448, + "learning_rate": 3.306339119652821e-07, + "loss": 0.9215, + "step": 85330 + }, + { + "epoch": 6.613196946801503, + "grad_norm": 1.4585616315452, + "learning_rate": 3.306726596404216e-07, + "loss": 0.9215, + "step": 85340 + }, + { + "epoch": 6.61397187027781, + "grad_norm": 1.4311893635942616, + "learning_rate": 3.307114073155611e-07, + "loss": 0.9189, + "step": 85350 + }, + { + "epoch": 6.614746793754117, + "grad_norm": 1.4735767967230655, + "learning_rate": 3.307501549907006e-07, + "loss": 0.929, + "step": 85360 + }, + { + "epoch": 6.615521717230424, + "grad_norm": 1.4546658329569622, + "learning_rate": 3.3078890266584005e-07, + "loss": 0.9367, + "step": 85370 + }, + { + "epoch": 6.6162966407067305, + "grad_norm": 1.44470922050982, + "learning_rate": 3.308276503409796e-07, + "loss": 0.919, + "step": 85380 + }, + { + "epoch": 6.617071564183037, + "grad_norm": 1.3406208315663157, + "learning_rate": 3.3086639801611904e-07, + "loss": 0.9293, + "step": 85390 + }, + { + "epoch": 6.617846487659344, + "grad_norm": 1.4790525581189675, + "learning_rate": 3.3090514569125857e-07, + "loss": 0.9429, + "step": 85400 + }, + { + "epoch": 6.61862141113565, + "grad_norm": 1.5479610814749982, + "learning_rate": 3.3094389336639804e-07, + "loss": 0.9333, + "step": 85410 + }, + { + "epoch": 6.619396334611957, + "grad_norm": 1.3798242706326564, + "learning_rate": 3.309826410415375e-07, + "loss": 0.9367, + "step": 85420 + }, + { + "epoch": 6.620171258088264, + "grad_norm": 1.412745085118755, + "learning_rate": 3.31021388716677e-07, + "loss": 0.9423, + "step": 85430 + }, + { + "epoch": 6.6209461815645705, + "grad_norm": 1.3872945713584082, + "learning_rate": 3.310601363918165e-07, + "loss": 0.9654, + "step": 85440 + }, + { + "epoch": 6.621721105040877, + "grad_norm": 1.3596932021354922, + "learning_rate": 3.31098884066956e-07, + "loss": 0.9735, + "step": 85450 + }, + { + "epoch": 6.622496028517184, + "grad_norm": 1.481992126193367, + "learning_rate": 3.311376317420955e-07, + "loss": 0.9415, + "step": 85460 + }, + { + "epoch": 6.623270951993491, + "grad_norm": 1.347384299718468, + "learning_rate": 3.31176379417235e-07, + "loss": 0.9194, + "step": 85470 + }, + { + "epoch": 6.624045875469798, + "grad_norm": 1.4342498049732766, + "learning_rate": 3.312151270923745e-07, + "loss": 0.9372, + "step": 85480 + }, + { + "epoch": 6.624820798946104, + "grad_norm": 1.4444708802555557, + "learning_rate": 3.3125387476751395e-07, + "loss": 0.9359, + "step": 85490 + }, + { + "epoch": 6.6255957224224105, + "grad_norm": 1.4463618995727927, + "learning_rate": 3.3129262244265347e-07, + "loss": 0.9061, + "step": 85500 + }, + { + "epoch": 6.6255957224224105, + "eval_loss": 0.9344653487205505, + "eval_runtime": 332.8882, + "eval_samples_per_second": 34.459, + "eval_steps_per_second": 8.616, + "step": 85500 + }, + { + "epoch": 6.626370645898717, + "grad_norm": 1.3501719764175093, + "learning_rate": 3.3133137011779294e-07, + "loss": 0.9247, + "step": 85510 + }, + { + "epoch": 6.627145569375024, + "grad_norm": 1.4741011482088904, + "learning_rate": 3.3137011779293246e-07, + "loss": 0.9267, + "step": 85520 + }, + { + "epoch": 6.627920492851331, + "grad_norm": 1.4126939129591372, + "learning_rate": 3.3140886546807193e-07, + "loss": 0.9164, + "step": 85530 + }, + { + "epoch": 6.628695416327638, + "grad_norm": 1.3272000237979968, + "learning_rate": 3.3144761314321145e-07, + "loss": 0.9236, + "step": 85540 + }, + { + "epoch": 6.6294703398039445, + "grad_norm": 1.4042899362095351, + "learning_rate": 3.314863608183509e-07, + "loss": 0.9504, + "step": 85550 + }, + { + "epoch": 6.630245263280251, + "grad_norm": 1.430398848543462, + "learning_rate": 3.315251084934904e-07, + "loss": 0.9169, + "step": 85560 + }, + { + "epoch": 6.631020186756558, + "grad_norm": 1.4138279604193456, + "learning_rate": 3.315638561686299e-07, + "loss": 0.9329, + "step": 85570 + }, + { + "epoch": 6.631795110232865, + "grad_norm": 1.4061586465730813, + "learning_rate": 3.316026038437694e-07, + "loss": 0.9232, + "step": 85580 + }, + { + "epoch": 6.632570033709171, + "grad_norm": 1.4193696424064486, + "learning_rate": 3.316413515189089e-07, + "loss": 0.9193, + "step": 85590 + }, + { + "epoch": 6.633344957185478, + "grad_norm": 1.3740538570086442, + "learning_rate": 3.3168009919404837e-07, + "loss": 0.921, + "step": 85600 + }, + { + "epoch": 6.6341198806617845, + "grad_norm": 1.4048488319719508, + "learning_rate": 3.317188468691879e-07, + "loss": 0.9282, + "step": 85610 + }, + { + "epoch": 6.634894804138091, + "grad_norm": 1.3998650096358118, + "learning_rate": 3.3175759454432736e-07, + "loss": 0.9547, + "step": 85620 + }, + { + "epoch": 6.635669727614398, + "grad_norm": 1.4104563281959959, + "learning_rate": 3.3179634221946683e-07, + "loss": 0.9273, + "step": 85630 + }, + { + "epoch": 6.636444651090705, + "grad_norm": 1.4074571074934252, + "learning_rate": 3.3183508989460635e-07, + "loss": 0.9245, + "step": 85640 + }, + { + "epoch": 6.637219574567012, + "grad_norm": 1.4111686691291587, + "learning_rate": 3.318738375697458e-07, + "loss": 0.9114, + "step": 85650 + }, + { + "epoch": 6.6379944980433185, + "grad_norm": 1.4210736965609152, + "learning_rate": 3.3191258524488534e-07, + "loss": 0.941, + "step": 85660 + }, + { + "epoch": 6.6387694215196245, + "grad_norm": 1.4860969665332988, + "learning_rate": 3.319513329200248e-07, + "loss": 0.9363, + "step": 85670 + }, + { + "epoch": 6.639544344995931, + "grad_norm": 1.4529518354018784, + "learning_rate": 3.3199008059516433e-07, + "loss": 0.9236, + "step": 85680 + }, + { + "epoch": 6.640319268472238, + "grad_norm": 1.4380911261572633, + "learning_rate": 3.320288282703038e-07, + "loss": 0.9206, + "step": 85690 + }, + { + "epoch": 6.641094191948545, + "grad_norm": 1.5024469848742064, + "learning_rate": 3.3206757594544327e-07, + "loss": 0.9417, + "step": 85700 + }, + { + "epoch": 6.641869115424852, + "grad_norm": 1.408076032457801, + "learning_rate": 3.321063236205828e-07, + "loss": 0.9387, + "step": 85710 + }, + { + "epoch": 6.6426440389011585, + "grad_norm": 1.3645086044639962, + "learning_rate": 3.3214507129572226e-07, + "loss": 0.9392, + "step": 85720 + }, + { + "epoch": 6.643418962377465, + "grad_norm": 1.4017719679793221, + "learning_rate": 3.321838189708618e-07, + "loss": 0.9375, + "step": 85730 + }, + { + "epoch": 6.644193885853772, + "grad_norm": 1.4404236324431494, + "learning_rate": 3.3222256664600125e-07, + "loss": 0.9373, + "step": 85740 + }, + { + "epoch": 6.644968809330079, + "grad_norm": 1.4618202086435026, + "learning_rate": 3.322613143211408e-07, + "loss": 0.946, + "step": 85750 + }, + { + "epoch": 6.645743732806386, + "grad_norm": 1.3910951622634455, + "learning_rate": 3.3230006199628025e-07, + "loss": 0.9306, + "step": 85760 + }, + { + "epoch": 6.646518656282693, + "grad_norm": 1.363238845782967, + "learning_rate": 3.323388096714197e-07, + "loss": 0.9343, + "step": 85770 + }, + { + "epoch": 6.6472935797589985, + "grad_norm": 1.3968021112866935, + "learning_rate": 3.3237755734655924e-07, + "loss": 0.9166, + "step": 85780 + }, + { + "epoch": 6.648068503235305, + "grad_norm": 1.4913903797353818, + "learning_rate": 3.324163050216987e-07, + "loss": 0.9295, + "step": 85790 + }, + { + "epoch": 6.648843426711612, + "grad_norm": 1.330563279137635, + "learning_rate": 3.3245505269683823e-07, + "loss": 0.9574, + "step": 85800 + }, + { + "epoch": 6.649618350187919, + "grad_norm": 1.387326067615639, + "learning_rate": 3.324938003719777e-07, + "loss": 0.9249, + "step": 85810 + }, + { + "epoch": 6.650393273664226, + "grad_norm": 1.3898273190040016, + "learning_rate": 3.325325480471172e-07, + "loss": 0.9239, + "step": 85820 + }, + { + "epoch": 6.6511681971405325, + "grad_norm": 1.4178651241528313, + "learning_rate": 3.325712957222567e-07, + "loss": 0.931, + "step": 85830 + }, + { + "epoch": 6.651943120616839, + "grad_norm": 1.4488871502298337, + "learning_rate": 3.3261004339739616e-07, + "loss": 0.9266, + "step": 85840 + }, + { + "epoch": 6.652718044093146, + "grad_norm": 1.33283515593049, + "learning_rate": 3.326487910725357e-07, + "loss": 0.9122, + "step": 85850 + }, + { + "epoch": 6.653492967569452, + "grad_norm": 1.3845413000455382, + "learning_rate": 3.3268753874767515e-07, + "loss": 0.9292, + "step": 85860 + }, + { + "epoch": 6.654267891045759, + "grad_norm": 1.349345663144317, + "learning_rate": 3.3272628642281467e-07, + "loss": 0.9153, + "step": 85870 + }, + { + "epoch": 6.655042814522066, + "grad_norm": 1.3609909180119557, + "learning_rate": 3.3276503409795414e-07, + "loss": 0.9198, + "step": 85880 + }, + { + "epoch": 6.6558177379983725, + "grad_norm": 1.363318917261223, + "learning_rate": 3.3280378177309366e-07, + "loss": 0.9268, + "step": 85890 + }, + { + "epoch": 6.656592661474679, + "grad_norm": 1.3594464946478892, + "learning_rate": 3.3284252944823313e-07, + "loss": 0.9107, + "step": 85900 + }, + { + "epoch": 6.657367584950986, + "grad_norm": 1.4456600905654917, + "learning_rate": 3.328812771233726e-07, + "loss": 0.933, + "step": 85910 + }, + { + "epoch": 6.658142508427293, + "grad_norm": 1.3449106471879633, + "learning_rate": 3.329200247985121e-07, + "loss": 0.9123, + "step": 85920 + }, + { + "epoch": 6.6589174319036, + "grad_norm": 1.3573308300475335, + "learning_rate": 3.329587724736516e-07, + "loss": 0.9354, + "step": 85930 + }, + { + "epoch": 6.659692355379907, + "grad_norm": 1.3982550095792463, + "learning_rate": 3.329975201487911e-07, + "loss": 0.9253, + "step": 85940 + }, + { + "epoch": 6.660467278856213, + "grad_norm": 1.4177001544538703, + "learning_rate": 3.330362678239306e-07, + "loss": 0.9198, + "step": 85950 + }, + { + "epoch": 6.661242202332519, + "grad_norm": 1.6622426702594253, + "learning_rate": 3.330750154990701e-07, + "loss": 0.9692, + "step": 85960 + }, + { + "epoch": 6.662017125808826, + "grad_norm": 1.3662864448442822, + "learning_rate": 3.3311376317420957e-07, + "loss": 0.9436, + "step": 85970 + }, + { + "epoch": 6.662792049285133, + "grad_norm": 1.338237069380746, + "learning_rate": 3.3315251084934904e-07, + "loss": 0.918, + "step": 85980 + }, + { + "epoch": 6.66356697276144, + "grad_norm": 1.3846819514519864, + "learning_rate": 3.3319125852448856e-07, + "loss": 0.9238, + "step": 85990 + }, + { + "epoch": 6.6643418962377465, + "grad_norm": 1.4183979348330884, + "learning_rate": 3.3323000619962803e-07, + "loss": 0.9486, + "step": 86000 + }, + { + "epoch": 6.6643418962377465, + "eval_loss": 0.9341169595718384, + "eval_runtime": 332.3859, + "eval_samples_per_second": 34.511, + "eval_steps_per_second": 8.629, + "step": 86000 + }, + { + "epoch": 6.665116819714053, + "grad_norm": 1.4320113798968273, + "learning_rate": 3.3326875387476755e-07, + "loss": 0.936, + "step": 86010 + }, + { + "epoch": 6.66589174319036, + "grad_norm": 1.3895133838919416, + "learning_rate": 3.33307501549907e-07, + "loss": 0.9326, + "step": 86020 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 1.4046124607017003, + "learning_rate": 3.3334624922504655e-07, + "loss": 0.9194, + "step": 86030 + }, + { + "epoch": 6.667441590142973, + "grad_norm": 1.444759150940625, + "learning_rate": 3.33384996900186e-07, + "loss": 0.9333, + "step": 86040 + }, + { + "epoch": 6.66821651361928, + "grad_norm": 1.4611131116597205, + "learning_rate": 3.334237445753255e-07, + "loss": 0.9219, + "step": 86050 + }, + { + "epoch": 6.6689914370955865, + "grad_norm": 1.5070454070025925, + "learning_rate": 3.33462492250465e-07, + "loss": 0.9357, + "step": 86060 + }, + { + "epoch": 6.669766360571893, + "grad_norm": 1.3482172874236793, + "learning_rate": 3.335012399256045e-07, + "loss": 0.9305, + "step": 86070 + }, + { + "epoch": 6.6705412840482, + "grad_norm": 1.4756330321956639, + "learning_rate": 3.33539987600744e-07, + "loss": 0.9472, + "step": 86080 + }, + { + "epoch": 6.671316207524507, + "grad_norm": 1.4170536206995528, + "learning_rate": 3.3357873527588347e-07, + "loss": 0.9329, + "step": 86090 + }, + { + "epoch": 6.672091131000814, + "grad_norm": 1.4252550249886635, + "learning_rate": 3.33617482951023e-07, + "loss": 0.9242, + "step": 86100 + }, + { + "epoch": 6.672866054477121, + "grad_norm": 1.4263749279394884, + "learning_rate": 3.3365623062616246e-07, + "loss": 0.9334, + "step": 86110 + }, + { + "epoch": 6.673640977953427, + "grad_norm": 1.3036383732993637, + "learning_rate": 3.336949783013019e-07, + "loss": 0.9128, + "step": 86120 + }, + { + "epoch": 6.674415901429734, + "grad_norm": 1.4441589465099613, + "learning_rate": 3.3373372597644145e-07, + "loss": 0.9207, + "step": 86130 + }, + { + "epoch": 6.675190824906041, + "grad_norm": 1.4705945092188502, + "learning_rate": 3.337724736515809e-07, + "loss": 0.9271, + "step": 86140 + }, + { + "epoch": 6.675965748382347, + "grad_norm": 1.40977738164605, + "learning_rate": 3.3381122132672044e-07, + "loss": 0.9208, + "step": 86150 + }, + { + "epoch": 6.676740671858654, + "grad_norm": 1.3899328817063543, + "learning_rate": 3.338499690018599e-07, + "loss": 0.9313, + "step": 86160 + }, + { + "epoch": 6.6775155953349605, + "grad_norm": 1.4060746438191145, + "learning_rate": 3.338887166769994e-07, + "loss": 0.9094, + "step": 86170 + }, + { + "epoch": 6.678290518811267, + "grad_norm": 1.4182173000348999, + "learning_rate": 3.339274643521389e-07, + "loss": 0.9346, + "step": 86180 + }, + { + "epoch": 6.679065442287574, + "grad_norm": 1.3977735616403806, + "learning_rate": 3.3396621202727837e-07, + "loss": 0.9435, + "step": 86190 + }, + { + "epoch": 6.679840365763881, + "grad_norm": 1.419977953601567, + "learning_rate": 3.340049597024179e-07, + "loss": 0.9143, + "step": 86200 + }, + { + "epoch": 6.680615289240188, + "grad_norm": 1.3851439311532185, + "learning_rate": 3.3404370737755736e-07, + "loss": 0.9369, + "step": 86210 + }, + { + "epoch": 6.681390212716495, + "grad_norm": 1.4385957850798636, + "learning_rate": 3.340824550526969e-07, + "loss": 0.9095, + "step": 86220 + }, + { + "epoch": 6.6821651361928005, + "grad_norm": 1.3826006675286182, + "learning_rate": 3.3412120272783635e-07, + "loss": 0.9172, + "step": 86230 + }, + { + "epoch": 6.682940059669107, + "grad_norm": 1.4867326851593023, + "learning_rate": 3.341599504029758e-07, + "loss": 0.9442, + "step": 86240 + }, + { + "epoch": 6.683714983145414, + "grad_norm": 1.317967573442897, + "learning_rate": 3.3419869807811534e-07, + "loss": 0.946, + "step": 86250 + }, + { + "epoch": 6.684489906621721, + "grad_norm": 1.4512299473351447, + "learning_rate": 3.342374457532548e-07, + "loss": 0.9434, + "step": 86260 + }, + { + "epoch": 6.685264830098028, + "grad_norm": 1.3172623123969684, + "learning_rate": 3.3427619342839433e-07, + "loss": 0.9186, + "step": 86270 + }, + { + "epoch": 6.6860397535743346, + "grad_norm": 1.4861665657531795, + "learning_rate": 3.343149411035338e-07, + "loss": 0.9298, + "step": 86280 + }, + { + "epoch": 6.686814677050641, + "grad_norm": 1.3539400906153283, + "learning_rate": 3.343536887786733e-07, + "loss": 0.9253, + "step": 86290 + }, + { + "epoch": 6.687589600526948, + "grad_norm": 1.3527941978746127, + "learning_rate": 3.343924364538128e-07, + "loss": 0.9352, + "step": 86300 + }, + { + "epoch": 6.688364524003255, + "grad_norm": 1.3682311141306767, + "learning_rate": 3.3443118412895226e-07, + "loss": 0.9211, + "step": 86310 + }, + { + "epoch": 6.689139447479562, + "grad_norm": 1.3912262788506407, + "learning_rate": 3.344699318040918e-07, + "loss": 0.9488, + "step": 86320 + }, + { + "epoch": 6.689914370955869, + "grad_norm": 1.4449981486529018, + "learning_rate": 3.3450867947923125e-07, + "loss": 0.9382, + "step": 86330 + }, + { + "epoch": 6.6906892944321745, + "grad_norm": 1.3831066731669293, + "learning_rate": 3.345474271543708e-07, + "loss": 0.9293, + "step": 86340 + }, + { + "epoch": 6.691464217908481, + "grad_norm": 1.4535535942269318, + "learning_rate": 3.3458617482951024e-07, + "loss": 0.9537, + "step": 86350 + }, + { + "epoch": 6.692239141384788, + "grad_norm": 1.385940276303113, + "learning_rate": 3.3462492250464976e-07, + "loss": 0.9067, + "step": 86360 + }, + { + "epoch": 6.693014064861095, + "grad_norm": 1.4144031531639834, + "learning_rate": 3.3466367017978923e-07, + "loss": 0.941, + "step": 86370 + }, + { + "epoch": 6.693788988337402, + "grad_norm": 1.43890352522574, + "learning_rate": 3.347024178549287e-07, + "loss": 0.9461, + "step": 86380 + }, + { + "epoch": 6.694563911813709, + "grad_norm": 1.4175333039484121, + "learning_rate": 3.347411655300682e-07, + "loss": 0.9287, + "step": 86390 + }, + { + "epoch": 6.695338835290015, + "grad_norm": 1.4326050442684661, + "learning_rate": 3.347799132052077e-07, + "loss": 0.9356, + "step": 86400 + }, + { + "epoch": 6.696113758766322, + "grad_norm": 1.4216048884693349, + "learning_rate": 3.348186608803472e-07, + "loss": 0.9617, + "step": 86410 + }, + { + "epoch": 6.696888682242628, + "grad_norm": 1.492751919090847, + "learning_rate": 3.348574085554867e-07, + "loss": 0.9159, + "step": 86420 + }, + { + "epoch": 6.697663605718935, + "grad_norm": 1.3102891132546723, + "learning_rate": 3.348961562306262e-07, + "loss": 0.9368, + "step": 86430 + }, + { + "epoch": 6.698438529195242, + "grad_norm": 1.4188476015078857, + "learning_rate": 3.349349039057657e-07, + "loss": 0.9378, + "step": 86440 + }, + { + "epoch": 6.6992134526715486, + "grad_norm": 1.3457555517349766, + "learning_rate": 3.3497365158090514e-07, + "loss": 0.9274, + "step": 86450 + }, + { + "epoch": 6.699988376147855, + "grad_norm": 1.521331690422673, + "learning_rate": 3.3501239925604467e-07, + "loss": 0.9402, + "step": 86460 + }, + { + "epoch": 6.700763299624162, + "grad_norm": 1.3750879635894657, + "learning_rate": 3.3505114693118414e-07, + "loss": 0.9287, + "step": 86470 + }, + { + "epoch": 6.701538223100469, + "grad_norm": 1.3679050869329714, + "learning_rate": 3.3508989460632366e-07, + "loss": 0.9167, + "step": 86480 + }, + { + "epoch": 6.702313146576776, + "grad_norm": 1.465219974163689, + "learning_rate": 3.3512864228146313e-07, + "loss": 0.929, + "step": 86490 + }, + { + "epoch": 6.703088070053083, + "grad_norm": 1.324128080984547, + "learning_rate": 3.3516738995660265e-07, + "loss": 0.9265, + "step": 86500 + }, + { + "epoch": 6.703088070053083, + "eval_loss": 0.9337286949157715, + "eval_runtime": 334.3587, + "eval_samples_per_second": 34.307, + "eval_steps_per_second": 8.578, + "step": 86500 + }, + { + "epoch": 6.703862993529389, + "grad_norm": 1.3887625083374737, + "learning_rate": 3.352061376317421e-07, + "loss": 0.9258, + "step": 86510 + }, + { + "epoch": 6.704637917005695, + "grad_norm": 1.3543319272764691, + "learning_rate": 3.352448853068816e-07, + "loss": 0.9514, + "step": 86520 + }, + { + "epoch": 6.705412840482002, + "grad_norm": 1.3903187012849185, + "learning_rate": 3.352836329820211e-07, + "loss": 0.9299, + "step": 86530 + }, + { + "epoch": 6.706187763958309, + "grad_norm": 1.512027097321786, + "learning_rate": 3.353223806571606e-07, + "loss": 0.9673, + "step": 86540 + }, + { + "epoch": 6.706962687434616, + "grad_norm": 1.4399944358548722, + "learning_rate": 3.353611283323001e-07, + "loss": 0.9178, + "step": 86550 + }, + { + "epoch": 6.707737610910923, + "grad_norm": 1.5338791368056728, + "learning_rate": 3.3539987600743957e-07, + "loss": 0.9301, + "step": 86560 + }, + { + "epoch": 6.708512534387229, + "grad_norm": 1.3386022641698103, + "learning_rate": 3.354386236825791e-07, + "loss": 0.9183, + "step": 86570 + }, + { + "epoch": 6.709287457863536, + "grad_norm": 1.466041486021641, + "learning_rate": 3.3547737135771856e-07, + "loss": 0.9361, + "step": 86580 + }, + { + "epoch": 6.710062381339843, + "grad_norm": 1.329941884453547, + "learning_rate": 3.3551611903285803e-07, + "loss": 0.9298, + "step": 86590 + }, + { + "epoch": 6.710837304816149, + "grad_norm": 1.449332158155999, + "learning_rate": 3.3555486670799755e-07, + "loss": 0.9252, + "step": 86600 + }, + { + "epoch": 6.711612228292456, + "grad_norm": 1.446458114784115, + "learning_rate": 3.35593614383137e-07, + "loss": 0.9413, + "step": 86610 + }, + { + "epoch": 6.7123871517687625, + "grad_norm": 1.4910785167319365, + "learning_rate": 3.3563236205827654e-07, + "loss": 0.9438, + "step": 86620 + }, + { + "epoch": 6.713162075245069, + "grad_norm": 1.4208689577033002, + "learning_rate": 3.35671109733416e-07, + "loss": 0.9342, + "step": 86630 + }, + { + "epoch": 6.713936998721376, + "grad_norm": 1.4657690712297378, + "learning_rate": 3.3570985740855553e-07, + "loss": 0.9462, + "step": 86640 + }, + { + "epoch": 6.714711922197683, + "grad_norm": 1.450065341302538, + "learning_rate": 3.35748605083695e-07, + "loss": 0.9393, + "step": 86650 + }, + { + "epoch": 6.71548684567399, + "grad_norm": 1.3666699270919913, + "learning_rate": 3.3578735275883447e-07, + "loss": 0.9235, + "step": 86660 + }, + { + "epoch": 6.716261769150297, + "grad_norm": 1.339107647171398, + "learning_rate": 3.35826100433974e-07, + "loss": 0.9196, + "step": 86670 + }, + { + "epoch": 6.717036692626603, + "grad_norm": 1.4495997801513099, + "learning_rate": 3.3586484810911346e-07, + "loss": 0.9237, + "step": 86680 + }, + { + "epoch": 6.71781161610291, + "grad_norm": 1.4278275207054254, + "learning_rate": 3.35903595784253e-07, + "loss": 0.9318, + "step": 86690 + }, + { + "epoch": 6.718586539579217, + "grad_norm": 1.4390681527953346, + "learning_rate": 3.3594234345939245e-07, + "loss": 0.9273, + "step": 86700 + }, + { + "epoch": 6.719361463055523, + "grad_norm": 1.437479199376107, + "learning_rate": 3.35981091134532e-07, + "loss": 0.93, + "step": 86710 + }, + { + "epoch": 6.72013638653183, + "grad_norm": 1.4010483277002952, + "learning_rate": 3.3601983880967144e-07, + "loss": 0.9428, + "step": 86720 + }, + { + "epoch": 6.720911310008137, + "grad_norm": 1.4683628865651044, + "learning_rate": 3.360585864848109e-07, + "loss": 0.9404, + "step": 86730 + }, + { + "epoch": 6.721686233484443, + "grad_norm": 1.550823927962686, + "learning_rate": 3.3609733415995044e-07, + "loss": 0.9213, + "step": 86740 + }, + { + "epoch": 6.72246115696075, + "grad_norm": 1.41320428575273, + "learning_rate": 3.361360818350899e-07, + "loss": 0.9454, + "step": 86750 + }, + { + "epoch": 6.723236080437057, + "grad_norm": 1.4423865380046075, + "learning_rate": 3.361748295102294e-07, + "loss": 0.9422, + "step": 86760 + }, + { + "epoch": 6.724011003913364, + "grad_norm": 1.416452367401916, + "learning_rate": 3.362135771853689e-07, + "loss": 0.9362, + "step": 86770 + }, + { + "epoch": 6.724785927389671, + "grad_norm": 1.4284202876745296, + "learning_rate": 3.362523248605084e-07, + "loss": 0.9006, + "step": 86780 + }, + { + "epoch": 6.7255608508659765, + "grad_norm": 1.4362081686244443, + "learning_rate": 3.362910725356479e-07, + "loss": 0.9132, + "step": 86790 + }, + { + "epoch": 6.726335774342283, + "grad_norm": 1.4242313250710354, + "learning_rate": 3.3632982021078736e-07, + "loss": 0.9158, + "step": 86800 + }, + { + "epoch": 6.72711069781859, + "grad_norm": 1.3684348778010216, + "learning_rate": 3.363685678859269e-07, + "loss": 0.9313, + "step": 86810 + }, + { + "epoch": 6.727885621294897, + "grad_norm": 1.430502662004704, + "learning_rate": 3.3640731556106635e-07, + "loss": 0.9071, + "step": 86820 + }, + { + "epoch": 6.728660544771204, + "grad_norm": 1.4096757652227838, + "learning_rate": 3.3644606323620587e-07, + "loss": 0.9172, + "step": 86830 + }, + { + "epoch": 6.729435468247511, + "grad_norm": 1.396328084286977, + "learning_rate": 3.3648481091134534e-07, + "loss": 0.9337, + "step": 86840 + }, + { + "epoch": 6.730210391723817, + "grad_norm": 1.3585232603536952, + "learning_rate": 3.3652355858648486e-07, + "loss": 0.9132, + "step": 86850 + }, + { + "epoch": 6.730985315200124, + "grad_norm": 1.3908388259165472, + "learning_rate": 3.3656230626162433e-07, + "loss": 0.9502, + "step": 86860 + }, + { + "epoch": 6.731760238676431, + "grad_norm": 1.4064834745654464, + "learning_rate": 3.366010539367638e-07, + "loss": 0.9319, + "step": 86870 + }, + { + "epoch": 6.732535162152738, + "grad_norm": 1.3153854567979644, + "learning_rate": 3.366398016119033e-07, + "loss": 0.9155, + "step": 86880 + }, + { + "epoch": 6.733310085629044, + "grad_norm": 1.4172508874451395, + "learning_rate": 3.366785492870428e-07, + "loss": 0.9489, + "step": 86890 + }, + { + "epoch": 6.734085009105351, + "grad_norm": 1.3967450970762003, + "learning_rate": 3.367172969621823e-07, + "loss": 0.9198, + "step": 86900 + }, + { + "epoch": 6.734859932581657, + "grad_norm": 1.4384876340893806, + "learning_rate": 3.367560446373218e-07, + "loss": 0.9426, + "step": 86910 + }, + { + "epoch": 6.735634856057964, + "grad_norm": 1.4044501798905202, + "learning_rate": 3.3679479231246125e-07, + "loss": 0.9385, + "step": 86920 + }, + { + "epoch": 6.736409779534271, + "grad_norm": 1.374996465258152, + "learning_rate": 3.3683353998760077e-07, + "loss": 0.9353, + "step": 86930 + }, + { + "epoch": 6.737184703010578, + "grad_norm": 1.3863139827413313, + "learning_rate": 3.3687228766274024e-07, + "loss": 0.9428, + "step": 86940 + }, + { + "epoch": 6.737959626486885, + "grad_norm": 1.3982483061281594, + "learning_rate": 3.3691103533787976e-07, + "loss": 0.9271, + "step": 86950 + }, + { + "epoch": 6.738734549963191, + "grad_norm": 1.3592153405599698, + "learning_rate": 3.3694978301301923e-07, + "loss": 0.9204, + "step": 86960 + }, + { + "epoch": 6.739509473439497, + "grad_norm": 1.464454360948952, + "learning_rate": 3.3698853068815875e-07, + "loss": 0.9184, + "step": 86970 + }, + { + "epoch": 6.740284396915804, + "grad_norm": 1.3125864845645256, + "learning_rate": 3.370272783632982e-07, + "loss": 0.9246, + "step": 86980 + }, + { + "epoch": 6.741059320392111, + "grad_norm": 1.3382114654048782, + "learning_rate": 3.370660260384377e-07, + "loss": 0.9396, + "step": 86990 + }, + { + "epoch": 6.741834243868418, + "grad_norm": 1.3275169736500383, + "learning_rate": 3.371047737135772e-07, + "loss": 0.9258, + "step": 87000 + }, + { + "epoch": 6.741834243868418, + "eval_loss": 0.9333247542381287, + "eval_runtime": 332.6145, + "eval_samples_per_second": 34.487, + "eval_steps_per_second": 8.623, + "step": 87000 + }, + { + "epoch": 6.742609167344725, + "grad_norm": 1.3878475113757593, + "learning_rate": 3.371435213887167e-07, + "loss": 0.9254, + "step": 87010 + }, + { + "epoch": 6.743384090821031, + "grad_norm": 1.3676649051950212, + "learning_rate": 3.371822690638562e-07, + "loss": 0.9323, + "step": 87020 + }, + { + "epoch": 6.744159014297338, + "grad_norm": 1.3943917369134136, + "learning_rate": 3.3722101673899567e-07, + "loss": 0.9297, + "step": 87030 + }, + { + "epoch": 6.744933937773645, + "grad_norm": 1.4558975631086482, + "learning_rate": 3.372597644141352e-07, + "loss": 0.9337, + "step": 87040 + }, + { + "epoch": 6.745708861249952, + "grad_norm": 1.445852947404434, + "learning_rate": 3.3729851208927466e-07, + "loss": 0.932, + "step": 87050 + }, + { + "epoch": 6.746483784726259, + "grad_norm": 1.4953007133696854, + "learning_rate": 3.3733725976441413e-07, + "loss": 0.9175, + "step": 87060 + }, + { + "epoch": 6.7472587082025655, + "grad_norm": 1.4457669927594332, + "learning_rate": 3.3737600743955365e-07, + "loss": 0.9414, + "step": 87070 + }, + { + "epoch": 6.748033631678871, + "grad_norm": 1.3642371532349786, + "learning_rate": 3.374147551146931e-07, + "loss": 0.9427, + "step": 87080 + }, + { + "epoch": 6.748808555155178, + "grad_norm": 1.4047113666932987, + "learning_rate": 3.3745350278983265e-07, + "loss": 0.9527, + "step": 87090 + }, + { + "epoch": 6.749583478631485, + "grad_norm": 1.3685403804538399, + "learning_rate": 3.374922504649721e-07, + "loss": 0.9145, + "step": 87100 + }, + { + "epoch": 6.750358402107792, + "grad_norm": 1.54149604982235, + "learning_rate": 3.3753099814011164e-07, + "loss": 0.9221, + "step": 87110 + }, + { + "epoch": 6.751133325584099, + "grad_norm": 1.4575316484848442, + "learning_rate": 3.375697458152511e-07, + "loss": 0.926, + "step": 87120 + }, + { + "epoch": 6.751908249060405, + "grad_norm": 1.411333959898642, + "learning_rate": 3.376084934903906e-07, + "loss": 0.93, + "step": 87130 + }, + { + "epoch": 6.752683172536712, + "grad_norm": 1.4030315552041208, + "learning_rate": 3.376472411655301e-07, + "loss": 0.9285, + "step": 87140 + }, + { + "epoch": 6.753458096013019, + "grad_norm": 1.4536208434634053, + "learning_rate": 3.3768598884066957e-07, + "loss": 0.924, + "step": 87150 + }, + { + "epoch": 6.754233019489325, + "grad_norm": 1.4375749587490543, + "learning_rate": 3.377247365158091e-07, + "loss": 0.9373, + "step": 87160 + }, + { + "epoch": 6.755007942965632, + "grad_norm": 1.4387188962095259, + "learning_rate": 3.3776348419094856e-07, + "loss": 0.9578, + "step": 87170 + }, + { + "epoch": 6.755782866441939, + "grad_norm": 1.472175729136519, + "learning_rate": 3.378022318660881e-07, + "loss": 0.9327, + "step": 87180 + }, + { + "epoch": 6.756557789918245, + "grad_norm": 1.4455563658088617, + "learning_rate": 3.3784097954122755e-07, + "loss": 0.9389, + "step": 87190 + }, + { + "epoch": 6.757332713394552, + "grad_norm": 1.3851690623504334, + "learning_rate": 3.37879727216367e-07, + "loss": 0.9298, + "step": 87200 + }, + { + "epoch": 6.758107636870859, + "grad_norm": 1.4138242137255235, + "learning_rate": 3.3791847489150654e-07, + "loss": 0.9372, + "step": 87210 + }, + { + "epoch": 6.758882560347166, + "grad_norm": 1.3787204282432473, + "learning_rate": 3.37957222566646e-07, + "loss": 0.9322, + "step": 87220 + }, + { + "epoch": 6.759657483823473, + "grad_norm": 1.3472496738373152, + "learning_rate": 3.3799597024178553e-07, + "loss": 0.929, + "step": 87230 + }, + { + "epoch": 6.7604324072997795, + "grad_norm": 1.378432710933356, + "learning_rate": 3.38034717916925e-07, + "loss": 0.9299, + "step": 87240 + }, + { + "epoch": 6.761207330776086, + "grad_norm": 1.4114122838102663, + "learning_rate": 3.380734655920645e-07, + "loss": 0.923, + "step": 87250 + }, + { + "epoch": 6.761982254252392, + "grad_norm": 1.3951133065780605, + "learning_rate": 3.38112213267204e-07, + "loss": 0.9193, + "step": 87260 + }, + { + "epoch": 6.762757177728699, + "grad_norm": 1.4465579001578333, + "learning_rate": 3.3815096094234346e-07, + "loss": 0.9314, + "step": 87270 + }, + { + "epoch": 6.763532101205006, + "grad_norm": 1.3867634524360186, + "learning_rate": 3.38189708617483e-07, + "loss": 0.9142, + "step": 87280 + }, + { + "epoch": 6.764307024681313, + "grad_norm": 1.3742538702754141, + "learning_rate": 3.3822845629262245e-07, + "loss": 0.947, + "step": 87290 + }, + { + "epoch": 6.765081948157619, + "grad_norm": 1.3886124308306969, + "learning_rate": 3.3826720396776197e-07, + "loss": 0.9401, + "step": 87300 + }, + { + "epoch": 6.765856871633926, + "grad_norm": 1.3142614633937177, + "learning_rate": 3.3830595164290144e-07, + "loss": 0.9304, + "step": 87310 + }, + { + "epoch": 6.766631795110233, + "grad_norm": 1.457999312600238, + "learning_rate": 3.3834469931804096e-07, + "loss": 0.9329, + "step": 87320 + }, + { + "epoch": 6.76740671858654, + "grad_norm": 1.3650297566096377, + "learning_rate": 3.3838344699318043e-07, + "loss": 0.9241, + "step": 87330 + }, + { + "epoch": 6.768181642062847, + "grad_norm": 1.2982553815855054, + "learning_rate": 3.384221946683199e-07, + "loss": 0.905, + "step": 87340 + }, + { + "epoch": 6.768956565539153, + "grad_norm": 1.4687003558533145, + "learning_rate": 3.384609423434594e-07, + "loss": 0.933, + "step": 87350 + }, + { + "epoch": 6.769731489015459, + "grad_norm": 1.413755999843272, + "learning_rate": 3.384996900185989e-07, + "loss": 0.9471, + "step": 87360 + }, + { + "epoch": 6.770506412491766, + "grad_norm": 1.3978821809717767, + "learning_rate": 3.385384376937384e-07, + "loss": 0.9292, + "step": 87370 + }, + { + "epoch": 6.771281335968073, + "grad_norm": 1.467842897390095, + "learning_rate": 3.385771853688779e-07, + "loss": 0.9317, + "step": 87380 + }, + { + "epoch": 6.77205625944438, + "grad_norm": 1.3649860672976566, + "learning_rate": 3.386159330440174e-07, + "loss": 0.9092, + "step": 87390 + }, + { + "epoch": 6.772831182920687, + "grad_norm": 1.473474105390267, + "learning_rate": 3.386546807191569e-07, + "loss": 0.9192, + "step": 87400 + }, + { + "epoch": 6.7736061063969935, + "grad_norm": 1.3993408405022907, + "learning_rate": 3.3869342839429634e-07, + "loss": 0.9186, + "step": 87410 + }, + { + "epoch": 6.7743810298733, + "grad_norm": 1.4058610190906353, + "learning_rate": 3.3873217606943587e-07, + "loss": 0.9122, + "step": 87420 + }, + { + "epoch": 6.775155953349607, + "grad_norm": 1.4317682975670298, + "learning_rate": 3.3877092374457533e-07, + "loss": 0.9252, + "step": 87430 + }, + { + "epoch": 6.775930876825914, + "grad_norm": 1.5245968604822617, + "learning_rate": 3.3880967141971486e-07, + "loss": 0.9175, + "step": 87440 + }, + { + "epoch": 6.77670580030222, + "grad_norm": 1.390804014464169, + "learning_rate": 3.388484190948543e-07, + "loss": 0.9377, + "step": 87450 + }, + { + "epoch": 6.777480723778527, + "grad_norm": 1.3745891395880265, + "learning_rate": 3.3888716676999385e-07, + "loss": 0.9265, + "step": 87460 + }, + { + "epoch": 6.778255647254833, + "grad_norm": 1.400068678277934, + "learning_rate": 3.389259144451333e-07, + "loss": 0.9131, + "step": 87470 + }, + { + "epoch": 6.77903057073114, + "grad_norm": 1.4141810094910934, + "learning_rate": 3.389646621202728e-07, + "loss": 0.9297, + "step": 87480 + }, + { + "epoch": 6.779805494207447, + "grad_norm": 1.416876050126825, + "learning_rate": 3.390034097954123e-07, + "loss": 0.9165, + "step": 87490 + }, + { + "epoch": 6.780580417683754, + "grad_norm": 1.4029026419883928, + "learning_rate": 3.390421574705518e-07, + "loss": 0.9296, + "step": 87500 + }, + { + "epoch": 6.780580417683754, + "eval_loss": 0.9328203797340393, + "eval_runtime": 332.9118, + "eval_samples_per_second": 34.457, + "eval_steps_per_second": 8.615, + "step": 87500 + }, + { + "epoch": 6.781355341160061, + "grad_norm": 1.4384528726846304, + "learning_rate": 3.390809051456913e-07, + "loss": 0.9168, + "step": 87510 + }, + { + "epoch": 6.7821302646363675, + "grad_norm": 1.4051918338621905, + "learning_rate": 3.3911965282083077e-07, + "loss": 0.9339, + "step": 87520 + }, + { + "epoch": 6.782905188112673, + "grad_norm": 1.4080883103399449, + "learning_rate": 3.391584004959703e-07, + "loss": 0.9294, + "step": 87530 + }, + { + "epoch": 6.78368011158898, + "grad_norm": 1.3775418542056497, + "learning_rate": 3.3919714817110976e-07, + "loss": 0.9662, + "step": 87540 + }, + { + "epoch": 6.784455035065287, + "grad_norm": 1.357349948672864, + "learning_rate": 3.3923589584624923e-07, + "loss": 0.9187, + "step": 87550 + }, + { + "epoch": 6.785229958541594, + "grad_norm": 1.4441030630966436, + "learning_rate": 3.3927464352138875e-07, + "loss": 0.9384, + "step": 87560 + }, + { + "epoch": 6.786004882017901, + "grad_norm": 1.3896578150593917, + "learning_rate": 3.393133911965282e-07, + "loss": 0.9275, + "step": 87570 + }, + { + "epoch": 6.7867798054942075, + "grad_norm": 1.4506875127072352, + "learning_rate": 3.3935213887166774e-07, + "loss": 0.954, + "step": 87580 + }, + { + "epoch": 6.787554728970514, + "grad_norm": 1.3370447368600102, + "learning_rate": 3.393908865468072e-07, + "loss": 0.9147, + "step": 87590 + }, + { + "epoch": 6.788329652446821, + "grad_norm": 1.5153588988282463, + "learning_rate": 3.3942963422194673e-07, + "loss": 0.9403, + "step": 87600 + }, + { + "epoch": 6.789104575923128, + "grad_norm": 1.3671052842750275, + "learning_rate": 3.394683818970862e-07, + "loss": 0.9436, + "step": 87610 + }, + { + "epoch": 6.789879499399435, + "grad_norm": 1.3987789353262745, + "learning_rate": 3.3950712957222567e-07, + "loss": 0.9333, + "step": 87620 + }, + { + "epoch": 6.7906544228757415, + "grad_norm": 1.450673598426636, + "learning_rate": 3.395458772473652e-07, + "loss": 0.9277, + "step": 87630 + }, + { + "epoch": 6.791429346352047, + "grad_norm": 1.404502102731107, + "learning_rate": 3.3958462492250466e-07, + "loss": 0.9123, + "step": 87640 + }, + { + "epoch": 6.792204269828354, + "grad_norm": 1.4363240503007482, + "learning_rate": 3.396233725976442e-07, + "loss": 0.907, + "step": 87650 + }, + { + "epoch": 6.792979193304661, + "grad_norm": 1.42889764394495, + "learning_rate": 3.3966212027278365e-07, + "loss": 0.9186, + "step": 87660 + }, + { + "epoch": 6.793754116780968, + "grad_norm": 1.4058215933976705, + "learning_rate": 3.397008679479231e-07, + "loss": 0.9334, + "step": 87670 + }, + { + "epoch": 6.794529040257275, + "grad_norm": 1.3820651611846015, + "learning_rate": 3.3973961562306264e-07, + "loss": 0.9325, + "step": 87680 + }, + { + "epoch": 6.7953039637335815, + "grad_norm": 1.5578037625426926, + "learning_rate": 3.397783632982021e-07, + "loss": 0.9271, + "step": 87690 + }, + { + "epoch": 6.796078887209888, + "grad_norm": 1.4339365249838132, + "learning_rate": 3.3981711097334163e-07, + "loss": 0.9141, + "step": 87700 + }, + { + "epoch": 6.796853810686195, + "grad_norm": 1.3605697933119536, + "learning_rate": 3.398558586484811e-07, + "loss": 0.9375, + "step": 87710 + }, + { + "epoch": 6.797628734162501, + "grad_norm": 1.4543772996194897, + "learning_rate": 3.398946063236206e-07, + "loss": 0.9262, + "step": 87720 + }, + { + "epoch": 6.798403657638808, + "grad_norm": 1.3677431134277545, + "learning_rate": 3.399333539987601e-07, + "loss": 0.9325, + "step": 87730 + }, + { + "epoch": 6.799178581115115, + "grad_norm": 1.429023826599957, + "learning_rate": 3.3997210167389956e-07, + "loss": 0.926, + "step": 87740 + }, + { + "epoch": 6.7999535045914214, + "grad_norm": 1.4541695901677965, + "learning_rate": 3.400108493490391e-07, + "loss": 0.9543, + "step": 87750 + }, + { + "epoch": 6.800728428067728, + "grad_norm": 1.304178209922658, + "learning_rate": 3.4004959702417855e-07, + "loss": 0.91, + "step": 87760 + }, + { + "epoch": 6.801503351544035, + "grad_norm": 1.425670533380329, + "learning_rate": 3.400883446993181e-07, + "loss": 0.9107, + "step": 87770 + }, + { + "epoch": 6.802278275020342, + "grad_norm": 1.3527544322363747, + "learning_rate": 3.4012709237445754e-07, + "loss": 0.919, + "step": 87780 + }, + { + "epoch": 6.803053198496649, + "grad_norm": 1.3660933255905798, + "learning_rate": 3.4016584004959707e-07, + "loss": 0.9354, + "step": 87790 + }, + { + "epoch": 6.8038281219729555, + "grad_norm": 1.4486958451910479, + "learning_rate": 3.4020458772473654e-07, + "loss": 0.9452, + "step": 87800 + }, + { + "epoch": 6.804603045449262, + "grad_norm": 1.4197805306314144, + "learning_rate": 3.40243335399876e-07, + "loss": 0.9341, + "step": 87810 + }, + { + "epoch": 6.805377968925568, + "grad_norm": 1.4308979324872255, + "learning_rate": 3.4028208307501553e-07, + "loss": 0.928, + "step": 87820 + }, + { + "epoch": 6.806152892401875, + "grad_norm": 1.3782082789736276, + "learning_rate": 3.40320830750155e-07, + "loss": 0.9176, + "step": 87830 + }, + { + "epoch": 6.806927815878182, + "grad_norm": 1.3829087210734257, + "learning_rate": 3.403595784252945e-07, + "loss": 0.9396, + "step": 87840 + }, + { + "epoch": 6.807702739354489, + "grad_norm": 1.3772059872107, + "learning_rate": 3.40398326100434e-07, + "loss": 0.908, + "step": 87850 + }, + { + "epoch": 6.8084776628307955, + "grad_norm": 1.3802951568560722, + "learning_rate": 3.404370737755735e-07, + "loss": 0.9305, + "step": 87860 + }, + { + "epoch": 6.809252586307102, + "grad_norm": 1.3917802680681106, + "learning_rate": 3.40475821450713e-07, + "loss": 0.9292, + "step": 87870 + }, + { + "epoch": 6.810027509783409, + "grad_norm": 1.476232958234437, + "learning_rate": 3.4051456912585245e-07, + "loss": 0.9244, + "step": 87880 + }, + { + "epoch": 6.810802433259716, + "grad_norm": 1.7004647920431621, + "learning_rate": 3.4055331680099197e-07, + "loss": 0.9209, + "step": 87890 + }, + { + "epoch": 6.811577356736022, + "grad_norm": 1.3264931616372744, + "learning_rate": 3.4059206447613144e-07, + "loss": 0.9179, + "step": 87900 + }, + { + "epoch": 6.812352280212329, + "grad_norm": 1.3614078247307975, + "learning_rate": 3.4063081215127096e-07, + "loss": 0.9202, + "step": 87910 + }, + { + "epoch": 6.8131272036886354, + "grad_norm": 1.3483206807439891, + "learning_rate": 3.4066955982641043e-07, + "loss": 0.9297, + "step": 87920 + }, + { + "epoch": 6.813902127164942, + "grad_norm": 1.3904354878079608, + "learning_rate": 3.4070830750154995e-07, + "loss": 0.9316, + "step": 87930 + }, + { + "epoch": 6.814677050641249, + "grad_norm": 1.391400923130751, + "learning_rate": 3.407470551766894e-07, + "loss": 0.9329, + "step": 87940 + }, + { + "epoch": 6.815451974117556, + "grad_norm": 1.347376808497649, + "learning_rate": 3.407858028518289e-07, + "loss": 0.9192, + "step": 87950 + }, + { + "epoch": 6.816226897593863, + "grad_norm": 1.384479494607721, + "learning_rate": 3.408245505269684e-07, + "loss": 0.9255, + "step": 87960 + }, + { + "epoch": 6.8170018210701695, + "grad_norm": 1.3879603340966875, + "learning_rate": 3.408632982021079e-07, + "loss": 0.9191, + "step": 87970 + }, + { + "epoch": 6.817776744546476, + "grad_norm": 1.4109184703185234, + "learning_rate": 3.409020458772474e-07, + "loss": 0.9209, + "step": 87980 + }, + { + "epoch": 6.818551668022783, + "grad_norm": 1.3277879474055334, + "learning_rate": 3.4094079355238687e-07, + "loss": 0.9354, + "step": 87990 + }, + { + "epoch": 6.81932659149909, + "grad_norm": 1.3411637754207797, + "learning_rate": 3.409795412275264e-07, + "loss": 0.9191, + "step": 88000 + }, + { + "epoch": 6.81932659149909, + "eval_loss": 0.9324710965156555, + "eval_runtime": 334.9419, + "eval_samples_per_second": 34.248, + "eval_steps_per_second": 8.563, + "step": 88000 + }, + { + "epoch": 6.820101514975396, + "grad_norm": 1.4145870283562327, + "learning_rate": 3.4101828890266586e-07, + "loss": 0.9097, + "step": 88010 + }, + { + "epoch": 6.820876438451703, + "grad_norm": 1.4216581525377012, + "learning_rate": 3.4105703657780533e-07, + "loss": 0.9233, + "step": 88020 + }, + { + "epoch": 6.8216513619280095, + "grad_norm": 1.4453382592136692, + "learning_rate": 3.4109578425294485e-07, + "loss": 0.9383, + "step": 88030 + }, + { + "epoch": 6.822426285404316, + "grad_norm": 1.422009450654189, + "learning_rate": 3.411345319280843e-07, + "loss": 0.959, + "step": 88040 + }, + { + "epoch": 6.823201208880623, + "grad_norm": 1.334280942113302, + "learning_rate": 3.4117327960322384e-07, + "loss": 0.9165, + "step": 88050 + }, + { + "epoch": 6.82397613235693, + "grad_norm": 1.3771687240979738, + "learning_rate": 3.412120272783633e-07, + "loss": 0.9251, + "step": 88060 + }, + { + "epoch": 6.824751055833237, + "grad_norm": 1.3443335087742039, + "learning_rate": 3.4125077495350284e-07, + "loss": 0.9629, + "step": 88070 + }, + { + "epoch": 6.8255259793095435, + "grad_norm": 1.3778937318740074, + "learning_rate": 3.412895226286423e-07, + "loss": 0.9173, + "step": 88080 + }, + { + "epoch": 6.8263009027858494, + "grad_norm": 1.351797747301875, + "learning_rate": 3.4132827030378177e-07, + "loss": 0.9227, + "step": 88090 + }, + { + "epoch": 6.827075826262156, + "grad_norm": 1.4380243529378258, + "learning_rate": 3.413670179789213e-07, + "loss": 0.9256, + "step": 88100 + }, + { + "epoch": 6.827850749738463, + "grad_norm": 1.4066008874233276, + "learning_rate": 3.4140576565406076e-07, + "loss": 0.9205, + "step": 88110 + }, + { + "epoch": 6.82862567321477, + "grad_norm": 1.406978922831281, + "learning_rate": 3.414445133292003e-07, + "loss": 0.9213, + "step": 88120 + }, + { + "epoch": 6.829400596691077, + "grad_norm": 1.5523829423388928, + "learning_rate": 3.4148326100433976e-07, + "loss": 0.9301, + "step": 88130 + }, + { + "epoch": 6.8301755201673835, + "grad_norm": 1.4013392150599588, + "learning_rate": 3.415220086794793e-07, + "loss": 0.924, + "step": 88140 + }, + { + "epoch": 6.83095044364369, + "grad_norm": 1.353448741695987, + "learning_rate": 3.4156075635461875e-07, + "loss": 0.9301, + "step": 88150 + }, + { + "epoch": 6.831725367119997, + "grad_norm": 1.3499359316306596, + "learning_rate": 3.415995040297582e-07, + "loss": 0.9075, + "step": 88160 + }, + { + "epoch": 6.832500290596304, + "grad_norm": 1.4390800715861314, + "learning_rate": 3.4163825170489774e-07, + "loss": 0.9155, + "step": 88170 + }, + { + "epoch": 6.833275214072611, + "grad_norm": 1.3853260056777803, + "learning_rate": 3.416769993800372e-07, + "loss": 0.9163, + "step": 88180 + }, + { + "epoch": 6.834050137548917, + "grad_norm": 1.4044281155301568, + "learning_rate": 3.4171574705517673e-07, + "loss": 0.9352, + "step": 88190 + }, + { + "epoch": 6.8348250610252235, + "grad_norm": 1.3990873147592582, + "learning_rate": 3.417544947303162e-07, + "loss": 0.9265, + "step": 88200 + }, + { + "epoch": 6.83559998450153, + "grad_norm": 1.3644375510666265, + "learning_rate": 3.417932424054557e-07, + "loss": 0.9055, + "step": 88210 + }, + { + "epoch": 6.836374907977837, + "grad_norm": 1.332108948122843, + "learning_rate": 3.418319900805952e-07, + "loss": 0.9098, + "step": 88220 + }, + { + "epoch": 6.837149831454144, + "grad_norm": 1.4984413119392481, + "learning_rate": 3.4187073775573466e-07, + "loss": 0.92, + "step": 88230 + }, + { + "epoch": 6.837924754930451, + "grad_norm": 1.4167877740981487, + "learning_rate": 3.419094854308742e-07, + "loss": 0.9446, + "step": 88240 + }, + { + "epoch": 6.8386996784067575, + "grad_norm": 1.4086744094599533, + "learning_rate": 3.4194823310601365e-07, + "loss": 0.9328, + "step": 88250 + }, + { + "epoch": 6.839474601883064, + "grad_norm": 1.3984109338262432, + "learning_rate": 3.4198698078115317e-07, + "loss": 0.9216, + "step": 88260 + }, + { + "epoch": 6.840249525359371, + "grad_norm": 1.4073639960512814, + "learning_rate": 3.4202572845629264e-07, + "loss": 0.9457, + "step": 88270 + }, + { + "epoch": 6.841024448835677, + "grad_norm": 1.397338227996627, + "learning_rate": 3.4206447613143216e-07, + "loss": 0.9365, + "step": 88280 + }, + { + "epoch": 6.841799372311984, + "grad_norm": 1.4481006390133642, + "learning_rate": 3.4210322380657163e-07, + "loss": 0.9376, + "step": 88290 + }, + { + "epoch": 6.842574295788291, + "grad_norm": 1.3680143227559254, + "learning_rate": 3.421419714817111e-07, + "loss": 0.9228, + "step": 88300 + }, + { + "epoch": 6.8433492192645975, + "grad_norm": 1.3574388410318114, + "learning_rate": 3.421807191568506e-07, + "loss": 0.9572, + "step": 88310 + }, + { + "epoch": 6.844124142740904, + "grad_norm": 1.343167234396865, + "learning_rate": 3.422194668319901e-07, + "loss": 0.923, + "step": 88320 + }, + { + "epoch": 6.844899066217211, + "grad_norm": 1.4827435264484816, + "learning_rate": 3.422582145071296e-07, + "loss": 0.9354, + "step": 88330 + }, + { + "epoch": 6.845673989693518, + "grad_norm": 1.4948983650724632, + "learning_rate": 3.422969621822691e-07, + "loss": 0.9415, + "step": 88340 + }, + { + "epoch": 6.846448913169825, + "grad_norm": 1.438840457260777, + "learning_rate": 3.4233570985740855e-07, + "loss": 0.9298, + "step": 88350 + }, + { + "epoch": 6.8472238366461315, + "grad_norm": 1.367951860883122, + "learning_rate": 3.4237445753254807e-07, + "loss": 0.9245, + "step": 88360 + }, + { + "epoch": 6.847998760122438, + "grad_norm": 1.3899428302823402, + "learning_rate": 3.4241320520768754e-07, + "loss": 0.93, + "step": 88370 + }, + { + "epoch": 6.848773683598744, + "grad_norm": 1.3761883375340394, + "learning_rate": 3.4245195288282706e-07, + "loss": 0.9384, + "step": 88380 + }, + { + "epoch": 6.849548607075051, + "grad_norm": 1.4109178737323393, + "learning_rate": 3.4249070055796653e-07, + "loss": 0.9354, + "step": 88390 + }, + { + "epoch": 6.850323530551358, + "grad_norm": 1.3738429255509306, + "learning_rate": 3.4252944823310605e-07, + "loss": 0.9387, + "step": 88400 + }, + { + "epoch": 6.851098454027665, + "grad_norm": 1.4029858875194143, + "learning_rate": 3.425681959082455e-07, + "loss": 0.9203, + "step": 88410 + }, + { + "epoch": 6.8518733775039715, + "grad_norm": 1.3963520653723855, + "learning_rate": 3.42606943583385e-07, + "loss": 0.9326, + "step": 88420 + }, + { + "epoch": 6.852648300980278, + "grad_norm": 1.3109383599764155, + "learning_rate": 3.426456912585245e-07, + "loss": 0.9224, + "step": 88430 + }, + { + "epoch": 6.853423224456585, + "grad_norm": 1.4121950685215914, + "learning_rate": 3.42684438933664e-07, + "loss": 0.9395, + "step": 88440 + }, + { + "epoch": 6.854198147932892, + "grad_norm": 1.4853254716777207, + "learning_rate": 3.427231866088035e-07, + "loss": 0.9505, + "step": 88450 + }, + { + "epoch": 6.854973071409198, + "grad_norm": 1.4735204291579533, + "learning_rate": 3.42761934283943e-07, + "loss": 0.9271, + "step": 88460 + }, + { + "epoch": 6.855747994885505, + "grad_norm": 1.389342666759834, + "learning_rate": 3.428006819590825e-07, + "loss": 0.9364, + "step": 88470 + }, + { + "epoch": 6.8565229183618115, + "grad_norm": 1.3621272979649077, + "learning_rate": 3.4283942963422197e-07, + "loss": 0.917, + "step": 88480 + }, + { + "epoch": 6.857297841838118, + "grad_norm": 1.5265804980897892, + "learning_rate": 3.4287817730936143e-07, + "loss": 0.9312, + "step": 88490 + }, + { + "epoch": 6.858072765314425, + "grad_norm": 1.4981466549884281, + "learning_rate": 3.4291692498450096e-07, + "loss": 0.9285, + "step": 88500 + }, + { + "epoch": 6.858072765314425, + "eval_loss": 0.9319877028465271, + "eval_runtime": 332.8406, + "eval_samples_per_second": 34.464, + "eval_steps_per_second": 8.617, + "step": 88500 + }, + { + "epoch": 6.858847688790732, + "grad_norm": 1.3639621773909387, + "learning_rate": 3.429556726596404e-07, + "loss": 0.9144, + "step": 88510 + }, + { + "epoch": 6.859622612267039, + "grad_norm": 1.4935597202903415, + "learning_rate": 3.4299442033477995e-07, + "loss": 0.9418, + "step": 88520 + }, + { + "epoch": 6.8603975357433455, + "grad_norm": 1.3250562009489955, + "learning_rate": 3.430331680099194e-07, + "loss": 0.9433, + "step": 88530 + }, + { + "epoch": 6.861172459219652, + "grad_norm": 1.4169076687317637, + "learning_rate": 3.4307191568505894e-07, + "loss": 0.9143, + "step": 88540 + }, + { + "epoch": 6.861947382695959, + "grad_norm": 1.4082604661195002, + "learning_rate": 3.431106633601984e-07, + "loss": 0.9196, + "step": 88550 + }, + { + "epoch": 6.862722306172266, + "grad_norm": 1.412655751537818, + "learning_rate": 3.431494110353379e-07, + "loss": 0.926, + "step": 88560 + }, + { + "epoch": 6.863497229648572, + "grad_norm": 1.4325841912582682, + "learning_rate": 3.431881587104774e-07, + "loss": 0.9503, + "step": 88570 + }, + { + "epoch": 6.864272153124879, + "grad_norm": 1.4116690815918993, + "learning_rate": 3.4322690638561687e-07, + "loss": 0.9246, + "step": 88580 + }, + { + "epoch": 6.8650470766011855, + "grad_norm": 1.4035754288062612, + "learning_rate": 3.432656540607564e-07, + "loss": 0.9227, + "step": 88590 + }, + { + "epoch": 6.865822000077492, + "grad_norm": 1.370552364059342, + "learning_rate": 3.4330440173589586e-07, + "loss": 0.9308, + "step": 88600 + }, + { + "epoch": 6.866596923553799, + "grad_norm": 1.4065497494431567, + "learning_rate": 3.433431494110354e-07, + "loss": 0.9261, + "step": 88610 + }, + { + "epoch": 6.867371847030106, + "grad_norm": 1.4180116564310554, + "learning_rate": 3.4338189708617485e-07, + "loss": 0.9562, + "step": 88620 + }, + { + "epoch": 6.868146770506413, + "grad_norm": 1.3987339751350718, + "learning_rate": 3.434206447613143e-07, + "loss": 0.9208, + "step": 88630 + }, + { + "epoch": 6.86892169398272, + "grad_norm": 1.3797687066681263, + "learning_rate": 3.4345939243645384e-07, + "loss": 0.9294, + "step": 88640 + }, + { + "epoch": 6.8696966174590255, + "grad_norm": 1.5631723255918757, + "learning_rate": 3.434981401115933e-07, + "loss": 0.9335, + "step": 88650 + }, + { + "epoch": 6.870471540935332, + "grad_norm": 1.352721451134985, + "learning_rate": 3.4353688778673283e-07, + "loss": 0.9181, + "step": 88660 + }, + { + "epoch": 6.871246464411639, + "grad_norm": 1.416327549007748, + "learning_rate": 3.435756354618723e-07, + "loss": 0.9322, + "step": 88670 + }, + { + "epoch": 6.872021387887946, + "grad_norm": 1.499012366978986, + "learning_rate": 3.436143831370118e-07, + "loss": 0.9094, + "step": 88680 + }, + { + "epoch": 6.872796311364253, + "grad_norm": 1.3435957274441561, + "learning_rate": 3.436531308121513e-07, + "loss": 0.9215, + "step": 88690 + }, + { + "epoch": 6.8735712348405595, + "grad_norm": 1.3597087629991755, + "learning_rate": 3.4369187848729076e-07, + "loss": 0.9125, + "step": 88700 + }, + { + "epoch": 6.874346158316866, + "grad_norm": 1.3963512897285288, + "learning_rate": 3.437306261624303e-07, + "loss": 0.9076, + "step": 88710 + }, + { + "epoch": 6.875121081793173, + "grad_norm": 1.3666966804251626, + "learning_rate": 3.4376937383756975e-07, + "loss": 0.9438, + "step": 88720 + }, + { + "epoch": 6.87589600526948, + "grad_norm": 1.459981515307032, + "learning_rate": 3.438081215127093e-07, + "loss": 0.9372, + "step": 88730 + }, + { + "epoch": 6.876670928745787, + "grad_norm": 1.421835802288763, + "learning_rate": 3.4384686918784874e-07, + "loss": 0.9151, + "step": 88740 + }, + { + "epoch": 6.877445852222093, + "grad_norm": 1.5337253711760446, + "learning_rate": 3.4388561686298827e-07, + "loss": 0.947, + "step": 88750 + }, + { + "epoch": 6.8782207756983995, + "grad_norm": 1.4266143619354108, + "learning_rate": 3.4392436453812773e-07, + "loss": 0.9342, + "step": 88760 + }, + { + "epoch": 6.878995699174706, + "grad_norm": 1.379256314107907, + "learning_rate": 3.439631122132672e-07, + "loss": 0.9188, + "step": 88770 + }, + { + "epoch": 6.879770622651013, + "grad_norm": 1.3589995006535112, + "learning_rate": 3.440018598884067e-07, + "loss": 0.9295, + "step": 88780 + }, + { + "epoch": 6.88054554612732, + "grad_norm": 1.4552266867281283, + "learning_rate": 3.440406075635462e-07, + "loss": 0.9108, + "step": 88790 + }, + { + "epoch": 6.881320469603627, + "grad_norm": 1.3974662924581862, + "learning_rate": 3.440793552386857e-07, + "loss": 0.9231, + "step": 88800 + }, + { + "epoch": 6.882095393079934, + "grad_norm": 1.3390522973993308, + "learning_rate": 3.441181029138252e-07, + "loss": 0.9301, + "step": 88810 + }, + { + "epoch": 6.88287031655624, + "grad_norm": 1.404334307744321, + "learning_rate": 3.441568505889647e-07, + "loss": 0.9267, + "step": 88820 + }, + { + "epoch": 6.883645240032546, + "grad_norm": 1.48606023077832, + "learning_rate": 3.441955982641042e-07, + "loss": 0.9019, + "step": 88830 + }, + { + "epoch": 6.884420163508853, + "grad_norm": 1.4071385218783055, + "learning_rate": 3.4423434593924365e-07, + "loss": 0.9278, + "step": 88840 + }, + { + "epoch": 6.88519508698516, + "grad_norm": 1.4315249416153364, + "learning_rate": 3.4427309361438317e-07, + "loss": 0.9377, + "step": 88850 + }, + { + "epoch": 6.885970010461467, + "grad_norm": 1.450662431673407, + "learning_rate": 3.4431184128952264e-07, + "loss": 0.9322, + "step": 88860 + }, + { + "epoch": 6.8867449339377735, + "grad_norm": 1.3900232568634274, + "learning_rate": 3.4435058896466216e-07, + "loss": 0.916, + "step": 88870 + }, + { + "epoch": 6.88751985741408, + "grad_norm": 1.358268802656375, + "learning_rate": 3.4438933663980163e-07, + "loss": 0.9141, + "step": 88880 + }, + { + "epoch": 6.888294780890387, + "grad_norm": 1.404657363178517, + "learning_rate": 3.4442808431494115e-07, + "loss": 0.9095, + "step": 88890 + }, + { + "epoch": 6.889069704366694, + "grad_norm": 1.378550899168716, + "learning_rate": 3.444668319900806e-07, + "loss": 0.9144, + "step": 88900 + }, + { + "epoch": 6.889844627843001, + "grad_norm": 1.4736346720580693, + "learning_rate": 3.445055796652201e-07, + "loss": 0.9114, + "step": 88910 + }, + { + "epoch": 6.890619551319308, + "grad_norm": 1.3361809477456401, + "learning_rate": 3.445443273403596e-07, + "loss": 0.9457, + "step": 88920 + }, + { + "epoch": 6.891394474795614, + "grad_norm": 1.3857077446642465, + "learning_rate": 3.445830750154991e-07, + "loss": 0.927, + "step": 88930 + }, + { + "epoch": 6.89216939827192, + "grad_norm": 1.510390197967375, + "learning_rate": 3.446218226906386e-07, + "loss": 0.946, + "step": 88940 + }, + { + "epoch": 6.892944321748227, + "grad_norm": 1.4247174775491078, + "learning_rate": 3.4466057036577807e-07, + "loss": 0.924, + "step": 88950 + }, + { + "epoch": 6.893719245224534, + "grad_norm": 1.3392391852385357, + "learning_rate": 3.446993180409176e-07, + "loss": 0.9199, + "step": 88960 + }, + { + "epoch": 6.894494168700841, + "grad_norm": 1.3626937702128905, + "learning_rate": 3.4473806571605706e-07, + "loss": 0.9186, + "step": 88970 + }, + { + "epoch": 6.895269092177148, + "grad_norm": 1.4738577670747615, + "learning_rate": 3.4477681339119653e-07, + "loss": 0.9212, + "step": 88980 + }, + { + "epoch": 6.896044015653454, + "grad_norm": 1.4716953191919884, + "learning_rate": 3.4481556106633605e-07, + "loss": 0.9344, + "step": 88990 + }, + { + "epoch": 6.896818939129761, + "grad_norm": 1.3655563362987013, + "learning_rate": 3.448543087414755e-07, + "loss": 0.933, + "step": 89000 + }, + { + "epoch": 6.896818939129761, + "eval_loss": 0.9317952394485474, + "eval_runtime": 332.786, + "eval_samples_per_second": 34.47, + "eval_steps_per_second": 8.618, + "step": 89000 + }, + { + "epoch": 6.897593862606068, + "grad_norm": 1.416932935695682, + "learning_rate": 3.4489305641661504e-07, + "loss": 0.9316, + "step": 89010 + }, + { + "epoch": 6.898368786082374, + "grad_norm": 1.3782012310164697, + "learning_rate": 3.449318040917545e-07, + "loss": 0.9124, + "step": 89020 + }, + { + "epoch": 6.899143709558681, + "grad_norm": 1.4492393414897318, + "learning_rate": 3.4497055176689403e-07, + "loss": 0.9294, + "step": 89030 + }, + { + "epoch": 6.8999186330349875, + "grad_norm": 1.38952768949678, + "learning_rate": 3.450092994420335e-07, + "loss": 0.9162, + "step": 89040 + }, + { + "epoch": 6.900693556511294, + "grad_norm": 1.4183120666294675, + "learning_rate": 3.4504804711717297e-07, + "loss": 0.9277, + "step": 89050 + }, + { + "epoch": 6.901468479987601, + "grad_norm": 1.3820847836918995, + "learning_rate": 3.450867947923125e-07, + "loss": 0.9219, + "step": 89060 + }, + { + "epoch": 6.902243403463908, + "grad_norm": 1.373254013630881, + "learning_rate": 3.4512554246745196e-07, + "loss": 0.9007, + "step": 89070 + }, + { + "epoch": 6.903018326940215, + "grad_norm": 1.349868532465127, + "learning_rate": 3.451642901425915e-07, + "loss": 0.9148, + "step": 89080 + }, + { + "epoch": 6.903793250416522, + "grad_norm": 1.4171577299938036, + "learning_rate": 3.4520303781773095e-07, + "loss": 0.912, + "step": 89090 + }, + { + "epoch": 6.904568173892828, + "grad_norm": 1.3696302933783306, + "learning_rate": 3.452417854928704e-07, + "loss": 0.9324, + "step": 89100 + }, + { + "epoch": 6.905343097369135, + "grad_norm": 1.3251357711853313, + "learning_rate": 3.4528053316800994e-07, + "loss": 0.9228, + "step": 89110 + }, + { + "epoch": 6.906118020845441, + "grad_norm": 1.584692598349844, + "learning_rate": 3.453192808431494e-07, + "loss": 0.9479, + "step": 89120 + }, + { + "epoch": 6.906892944321748, + "grad_norm": 1.4016925172764947, + "learning_rate": 3.4535802851828894e-07, + "loss": 0.9265, + "step": 89130 + }, + { + "epoch": 6.907667867798055, + "grad_norm": 1.33051908524767, + "learning_rate": 3.453967761934284e-07, + "loss": 0.9315, + "step": 89140 + }, + { + "epoch": 6.908442791274362, + "grad_norm": 1.6730895261048047, + "learning_rate": 3.4543552386856793e-07, + "loss": 0.9586, + "step": 89150 + }, + { + "epoch": 6.909217714750668, + "grad_norm": 1.4342442317825033, + "learning_rate": 3.454742715437074e-07, + "loss": 0.9326, + "step": 89160 + }, + { + "epoch": 6.909992638226975, + "grad_norm": 1.3719774906193305, + "learning_rate": 3.4551301921884686e-07, + "loss": 0.9219, + "step": 89170 + }, + { + "epoch": 6.910767561703282, + "grad_norm": 1.3799618671054679, + "learning_rate": 3.455517668939864e-07, + "loss": 0.9224, + "step": 89180 + }, + { + "epoch": 6.911542485179589, + "grad_norm": 1.3514693851167943, + "learning_rate": 3.4559051456912586e-07, + "loss": 0.9235, + "step": 89190 + }, + { + "epoch": 6.912317408655896, + "grad_norm": 1.3898849617856326, + "learning_rate": 3.456292622442654e-07, + "loss": 0.9263, + "step": 89200 + }, + { + "epoch": 6.9130923321322015, + "grad_norm": 1.331443718095853, + "learning_rate": 3.4566800991940485e-07, + "loss": 0.9276, + "step": 89210 + }, + { + "epoch": 6.913867255608508, + "grad_norm": 1.4096292506424808, + "learning_rate": 3.4570675759454437e-07, + "loss": 0.9194, + "step": 89220 + }, + { + "epoch": 6.914642179084815, + "grad_norm": 1.4140768301781408, + "learning_rate": 3.4574550526968384e-07, + "loss": 0.9003, + "step": 89230 + }, + { + "epoch": 6.915417102561122, + "grad_norm": 1.3981592951399116, + "learning_rate": 3.457842529448233e-07, + "loss": 0.9209, + "step": 89240 + }, + { + "epoch": 6.916192026037429, + "grad_norm": 1.4103239250046888, + "learning_rate": 3.4582300061996283e-07, + "loss": 0.9265, + "step": 89250 + }, + { + "epoch": 6.916966949513736, + "grad_norm": 1.448842730529226, + "learning_rate": 3.458617482951023e-07, + "loss": 0.9358, + "step": 89260 + }, + { + "epoch": 6.917741872990042, + "grad_norm": 1.4234029467284575, + "learning_rate": 3.459004959702418e-07, + "loss": 0.9278, + "step": 89270 + }, + { + "epoch": 6.918516796466349, + "grad_norm": 1.3764163337062068, + "learning_rate": 3.459392436453813e-07, + "loss": 0.9284, + "step": 89280 + }, + { + "epoch": 6.919291719942656, + "grad_norm": 1.4580249321884502, + "learning_rate": 3.459779913205208e-07, + "loss": 0.9467, + "step": 89290 + }, + { + "epoch": 6.920066643418963, + "grad_norm": 1.3330108877729534, + "learning_rate": 3.460167389956603e-07, + "loss": 0.9236, + "step": 89300 + }, + { + "epoch": 6.920841566895269, + "grad_norm": 1.3521563224663526, + "learning_rate": 3.4605548667079975e-07, + "loss": 0.9352, + "step": 89310 + }, + { + "epoch": 6.921616490371576, + "grad_norm": 1.4188333889236515, + "learning_rate": 3.4609423434593927e-07, + "loss": 0.9305, + "step": 89320 + }, + { + "epoch": 6.922391413847882, + "grad_norm": 1.3905882581389881, + "learning_rate": 3.4613298202107874e-07, + "loss": 0.9331, + "step": 89330 + }, + { + "epoch": 6.923166337324189, + "grad_norm": 1.3556090662609506, + "learning_rate": 3.4617172969621826e-07, + "loss": 0.9272, + "step": 89340 + }, + { + "epoch": 6.923941260800496, + "grad_norm": 1.3610503361282857, + "learning_rate": 3.4621047737135773e-07, + "loss": 0.943, + "step": 89350 + }, + { + "epoch": 6.924716184276803, + "grad_norm": 1.4090824476849984, + "learning_rate": 3.4624922504649725e-07, + "loss": 0.9375, + "step": 89360 + }, + { + "epoch": 6.92549110775311, + "grad_norm": 1.4085305252961065, + "learning_rate": 3.462879727216367e-07, + "loss": 0.919, + "step": 89370 + }, + { + "epoch": 6.926266031229416, + "grad_norm": 1.4128155991568783, + "learning_rate": 3.463267203967762e-07, + "loss": 0.9195, + "step": 89380 + }, + { + "epoch": 6.927040954705722, + "grad_norm": 1.4215945327306978, + "learning_rate": 3.463654680719157e-07, + "loss": 0.9244, + "step": 89390 + }, + { + "epoch": 6.927815878182029, + "grad_norm": 1.4072404634299807, + "learning_rate": 3.464042157470552e-07, + "loss": 0.9263, + "step": 89400 + }, + { + "epoch": 6.928590801658336, + "grad_norm": 1.3853113708124305, + "learning_rate": 3.464429634221947e-07, + "loss": 0.9344, + "step": 89410 + }, + { + "epoch": 6.929365725134643, + "grad_norm": 1.3926744534256201, + "learning_rate": 3.4648171109733417e-07, + "loss": 0.9266, + "step": 89420 + }, + { + "epoch": 6.93014064861095, + "grad_norm": 1.4457135783789303, + "learning_rate": 3.465204587724737e-07, + "loss": 0.9369, + "step": 89430 + }, + { + "epoch": 6.930915572087256, + "grad_norm": 1.3982537667183512, + "learning_rate": 3.4655920644761316e-07, + "loss": 0.9235, + "step": 89440 + }, + { + "epoch": 6.931690495563563, + "grad_norm": 1.3997265599207773, + "learning_rate": 3.4659795412275263e-07, + "loss": 0.9395, + "step": 89450 + }, + { + "epoch": 6.93246541903987, + "grad_norm": 1.3370540595745233, + "learning_rate": 3.4663670179789216e-07, + "loss": 0.9375, + "step": 89460 + }, + { + "epoch": 6.933240342516177, + "grad_norm": 1.3825910901283205, + "learning_rate": 3.466754494730316e-07, + "loss": 0.928, + "step": 89470 + }, + { + "epoch": 6.934015265992484, + "grad_norm": 1.4042764306295634, + "learning_rate": 3.4671419714817115e-07, + "loss": 0.9297, + "step": 89480 + }, + { + "epoch": 6.9347901894687904, + "grad_norm": 1.4106105411725733, + "learning_rate": 3.467529448233106e-07, + "loss": 0.9278, + "step": 89490 + }, + { + "epoch": 6.935565112945096, + "grad_norm": 1.3288718867886333, + "learning_rate": 3.4679169249845014e-07, + "loss": 0.9262, + "step": 89500 + }, + { + "epoch": 6.935565112945096, + "eval_loss": 0.9311829209327698, + "eval_runtime": 332.595, + "eval_samples_per_second": 34.489, + "eval_steps_per_second": 8.623, + "step": 89500 + }, + { + "epoch": 6.936340036421403, + "grad_norm": 1.3692540633489985, + "learning_rate": 3.468304401735896e-07, + "loss": 0.9009, + "step": 89510 + }, + { + "epoch": 6.93711495989771, + "grad_norm": 1.3525787663804756, + "learning_rate": 3.468691878487291e-07, + "loss": 0.9332, + "step": 89520 + }, + { + "epoch": 6.937889883374017, + "grad_norm": 1.342754768092904, + "learning_rate": 3.469079355238686e-07, + "loss": 0.9327, + "step": 89530 + }, + { + "epoch": 6.938664806850324, + "grad_norm": 1.3755952487973386, + "learning_rate": 3.4694668319900807e-07, + "loss": 0.9441, + "step": 89540 + }, + { + "epoch": 6.93943973032663, + "grad_norm": 1.3368670918615557, + "learning_rate": 3.469854308741476e-07, + "loss": 0.9308, + "step": 89550 + }, + { + "epoch": 6.940214653802937, + "grad_norm": 1.4383774056357896, + "learning_rate": 3.4702417854928706e-07, + "loss": 0.9356, + "step": 89560 + }, + { + "epoch": 6.940989577279244, + "grad_norm": 1.3599200203345416, + "learning_rate": 3.470629262244266e-07, + "loss": 0.9272, + "step": 89570 + }, + { + "epoch": 6.94176450075555, + "grad_norm": 1.4424583775915847, + "learning_rate": 3.4710167389956605e-07, + "loss": 0.921, + "step": 89580 + }, + { + "epoch": 6.942539424231857, + "grad_norm": 1.3481895732495504, + "learning_rate": 3.471404215747055e-07, + "loss": 0.931, + "step": 89590 + }, + { + "epoch": 6.943314347708164, + "grad_norm": 1.3863587745441397, + "learning_rate": 3.4717916924984504e-07, + "loss": 0.9331, + "step": 89600 + }, + { + "epoch": 6.94408927118447, + "grad_norm": 1.3736407144519092, + "learning_rate": 3.472179169249845e-07, + "loss": 0.9297, + "step": 89610 + }, + { + "epoch": 6.944864194660777, + "grad_norm": 1.4407596582115636, + "learning_rate": 3.4725666460012403e-07, + "loss": 0.9326, + "step": 89620 + }, + { + "epoch": 6.945639118137084, + "grad_norm": 1.3596168183211523, + "learning_rate": 3.472954122752635e-07, + "loss": 0.9085, + "step": 89630 + }, + { + "epoch": 6.946414041613391, + "grad_norm": 1.458786520197417, + "learning_rate": 3.47334159950403e-07, + "loss": 0.9394, + "step": 89640 + }, + { + "epoch": 6.947188965089698, + "grad_norm": 1.3991024886814978, + "learning_rate": 3.473729076255425e-07, + "loss": 0.9116, + "step": 89650 + }, + { + "epoch": 6.947963888566004, + "grad_norm": 1.4722201807143065, + "learning_rate": 3.4741165530068196e-07, + "loss": 0.9228, + "step": 89660 + }, + { + "epoch": 6.948738812042311, + "grad_norm": 1.470989301563433, + "learning_rate": 3.474504029758215e-07, + "loss": 0.9299, + "step": 89670 + }, + { + "epoch": 6.949513735518617, + "grad_norm": 1.3737449509631885, + "learning_rate": 3.4748915065096095e-07, + "loss": 0.9099, + "step": 89680 + }, + { + "epoch": 6.950288658994924, + "grad_norm": 1.3727650743599291, + "learning_rate": 3.4752789832610047e-07, + "loss": 0.9303, + "step": 89690 + }, + { + "epoch": 6.951063582471231, + "grad_norm": 1.3289604363514989, + "learning_rate": 3.4756664600123994e-07, + "loss": 0.9223, + "step": 89700 + }, + { + "epoch": 6.951838505947538, + "grad_norm": 1.4250379079086049, + "learning_rate": 3.4760539367637946e-07, + "loss": 0.9155, + "step": 89710 + }, + { + "epoch": 6.952613429423844, + "grad_norm": 1.356757896488273, + "learning_rate": 3.4764414135151893e-07, + "loss": 0.9505, + "step": 89720 + }, + { + "epoch": 6.953388352900151, + "grad_norm": 1.419434563435655, + "learning_rate": 3.476828890266584e-07, + "loss": 0.9315, + "step": 89730 + }, + { + "epoch": 6.954163276376458, + "grad_norm": 1.4039025108185288, + "learning_rate": 3.477216367017979e-07, + "loss": 0.9443, + "step": 89740 + }, + { + "epoch": 6.954938199852765, + "grad_norm": 1.348678873148326, + "learning_rate": 3.477603843769374e-07, + "loss": 0.9418, + "step": 89750 + }, + { + "epoch": 6.955713123329071, + "grad_norm": 1.4227690502660635, + "learning_rate": 3.477991320520769e-07, + "loss": 0.9348, + "step": 89760 + }, + { + "epoch": 6.956488046805378, + "grad_norm": 1.436266852364258, + "learning_rate": 3.478378797272164e-07, + "loss": 0.9565, + "step": 89770 + }, + { + "epoch": 6.957262970281684, + "grad_norm": 1.401250929478064, + "learning_rate": 3.478766274023559e-07, + "loss": 0.9459, + "step": 89780 + }, + { + "epoch": 6.958037893757991, + "grad_norm": 1.3836529813037441, + "learning_rate": 3.479153750774954e-07, + "loss": 0.928, + "step": 89790 + }, + { + "epoch": 6.958812817234298, + "grad_norm": 1.3236730135756078, + "learning_rate": 3.4795412275263484e-07, + "loss": 0.9351, + "step": 89800 + }, + { + "epoch": 6.959587740710605, + "grad_norm": 1.3260435660819991, + "learning_rate": 3.4799287042777437e-07, + "loss": 0.9034, + "step": 89810 + }, + { + "epoch": 6.960362664186912, + "grad_norm": 1.3750924405135652, + "learning_rate": 3.4803161810291383e-07, + "loss": 0.939, + "step": 89820 + }, + { + "epoch": 6.961137587663218, + "grad_norm": 1.4556779318345194, + "learning_rate": 3.4807036577805336e-07, + "loss": 0.925, + "step": 89830 + }, + { + "epoch": 6.961912511139525, + "grad_norm": 1.3633383831701609, + "learning_rate": 3.481091134531928e-07, + "loss": 0.9141, + "step": 89840 + }, + { + "epoch": 6.962687434615832, + "grad_norm": 1.44682726665675, + "learning_rate": 3.481478611283323e-07, + "loss": 0.9312, + "step": 89850 + }, + { + "epoch": 6.963462358092139, + "grad_norm": 1.4492584910548272, + "learning_rate": 3.481866088034718e-07, + "loss": 0.9293, + "step": 89860 + }, + { + "epoch": 6.964237281568445, + "grad_norm": 1.401708371747481, + "learning_rate": 3.482253564786113e-07, + "loss": 0.9244, + "step": 89870 + }, + { + "epoch": 6.965012205044752, + "grad_norm": 1.368593653253678, + "learning_rate": 3.482641041537508e-07, + "loss": 0.9315, + "step": 89880 + }, + { + "epoch": 6.965787128521058, + "grad_norm": 1.4096451019905623, + "learning_rate": 3.483028518288903e-07, + "loss": 0.9132, + "step": 89890 + }, + { + "epoch": 6.966562051997365, + "grad_norm": 1.3601482787072068, + "learning_rate": 3.483415995040298e-07, + "loss": 0.921, + "step": 89900 + }, + { + "epoch": 6.967336975473672, + "grad_norm": 1.3991128295806108, + "learning_rate": 3.4838034717916927e-07, + "loss": 0.9222, + "step": 89910 + }, + { + "epoch": 6.968111898949979, + "grad_norm": 1.3044173911174617, + "learning_rate": 3.4841909485430874e-07, + "loss": 0.9057, + "step": 89920 + }, + { + "epoch": 6.968886822426286, + "grad_norm": 1.3346709372922434, + "learning_rate": 3.4845784252944826e-07, + "loss": 0.9342, + "step": 89930 + }, + { + "epoch": 6.9696617459025925, + "grad_norm": 1.3346359944925363, + "learning_rate": 3.4849659020458773e-07, + "loss": 0.9166, + "step": 89940 + }, + { + "epoch": 6.970436669378898, + "grad_norm": 1.3514500556849274, + "learning_rate": 3.4853533787972725e-07, + "loss": 0.9131, + "step": 89950 + }, + { + "epoch": 6.971211592855205, + "grad_norm": 1.480555125576763, + "learning_rate": 3.485740855548667e-07, + "loss": 0.9578, + "step": 89960 + }, + { + "epoch": 6.971986516331512, + "grad_norm": 1.4211079861769176, + "learning_rate": 3.4861283323000624e-07, + "loss": 0.9192, + "step": 89970 + }, + { + "epoch": 6.972761439807819, + "grad_norm": 1.3365671299394066, + "learning_rate": 3.486515809051457e-07, + "loss": 0.9148, + "step": 89980 + }, + { + "epoch": 6.973536363284126, + "grad_norm": 1.353178850638665, + "learning_rate": 3.486903285802852e-07, + "loss": 0.918, + "step": 89990 + }, + { + "epoch": 6.974311286760432, + "grad_norm": 1.3714388065181782, + "learning_rate": 3.487290762554247e-07, + "loss": 0.9185, + "step": 90000 + }, + { + "epoch": 6.974311286760432, + "eval_loss": 0.9307225346565247, + "eval_runtime": 331.7231, + "eval_samples_per_second": 34.58, + "eval_steps_per_second": 8.646, + "step": 90000 + }, + { + "epoch": 6.975086210236739, + "grad_norm": 1.4306468842927496, + "learning_rate": 3.4876782393056417e-07, + "loss": 0.9331, + "step": 90010 + }, + { + "epoch": 6.975861133713046, + "grad_norm": 1.3932310372849526, + "learning_rate": 3.488065716057037e-07, + "loss": 0.9177, + "step": 90020 + }, + { + "epoch": 6.976636057189353, + "grad_norm": 1.4139440949088642, + "learning_rate": 3.4884531928084316e-07, + "loss": 0.9281, + "step": 90030 + }, + { + "epoch": 6.97741098066566, + "grad_norm": 1.4435170319938193, + "learning_rate": 3.488840669559827e-07, + "loss": 0.9128, + "step": 90040 + }, + { + "epoch": 6.978185904141966, + "grad_norm": 1.3693387947107456, + "learning_rate": 3.4892281463112215e-07, + "loss": 0.9102, + "step": 90050 + }, + { + "epoch": 6.978960827618272, + "grad_norm": 1.4338448056123867, + "learning_rate": 3.489615623062616e-07, + "loss": 0.9252, + "step": 90060 + }, + { + "epoch": 6.979735751094579, + "grad_norm": 1.4281783486539987, + "learning_rate": 3.4900030998140114e-07, + "loss": 0.9211, + "step": 90070 + }, + { + "epoch": 6.980510674570886, + "grad_norm": 1.444654925974396, + "learning_rate": 3.490390576565406e-07, + "loss": 0.9161, + "step": 90080 + }, + { + "epoch": 6.981285598047193, + "grad_norm": 1.4203461194875853, + "learning_rate": 3.4907780533168013e-07, + "loss": 0.9225, + "step": 90090 + }, + { + "epoch": 6.9820605215235, + "grad_norm": 1.5820584192753382, + "learning_rate": 3.491165530068196e-07, + "loss": 0.9359, + "step": 90100 + }, + { + "epoch": 6.9828354449998065, + "grad_norm": 1.427076607166875, + "learning_rate": 3.491553006819591e-07, + "loss": 0.97, + "step": 90110 + }, + { + "epoch": 6.983610368476113, + "grad_norm": 1.3698689655918606, + "learning_rate": 3.491940483570986e-07, + "loss": 0.9259, + "step": 90120 + }, + { + "epoch": 6.98438529195242, + "grad_norm": 1.3930409124997827, + "learning_rate": 3.4923279603223806e-07, + "loss": 0.933, + "step": 90130 + }, + { + "epoch": 6.985160215428726, + "grad_norm": 1.4078068635319403, + "learning_rate": 3.492715437073776e-07, + "loss": 0.9434, + "step": 90140 + }, + { + "epoch": 6.985935138905033, + "grad_norm": 1.4527495517783757, + "learning_rate": 3.4931029138251705e-07, + "loss": 0.9139, + "step": 90150 + }, + { + "epoch": 6.98671006238134, + "grad_norm": 1.3953046685330628, + "learning_rate": 3.493490390576566e-07, + "loss": 0.9047, + "step": 90160 + }, + { + "epoch": 6.987484985857646, + "grad_norm": 1.3773482274410316, + "learning_rate": 3.4938778673279605e-07, + "loss": 0.9294, + "step": 90170 + }, + { + "epoch": 6.988259909333953, + "grad_norm": 1.469033151740036, + "learning_rate": 3.4942653440793557e-07, + "loss": 0.9222, + "step": 90180 + }, + { + "epoch": 6.98903483281026, + "grad_norm": 1.481672180751661, + "learning_rate": 3.4946528208307504e-07, + "loss": 0.9126, + "step": 90190 + }, + { + "epoch": 6.989809756286567, + "grad_norm": 1.3991876691450682, + "learning_rate": 3.495040297582145e-07, + "loss": 0.9225, + "step": 90200 + }, + { + "epoch": 6.990584679762874, + "grad_norm": 1.3616847517747015, + "learning_rate": 3.4954277743335403e-07, + "loss": 0.9215, + "step": 90210 + }, + { + "epoch": 6.9913596032391805, + "grad_norm": 1.4163029932126512, + "learning_rate": 3.495815251084935e-07, + "loss": 0.9288, + "step": 90220 + }, + { + "epoch": 6.992134526715487, + "grad_norm": 1.4074301631916957, + "learning_rate": 3.49620272783633e-07, + "loss": 0.9318, + "step": 90230 + }, + { + "epoch": 6.992909450191793, + "grad_norm": 1.4074387742515513, + "learning_rate": 3.496590204587725e-07, + "loss": 0.9269, + "step": 90240 + }, + { + "epoch": 6.9936843736681, + "grad_norm": 1.4913825770569453, + "learning_rate": 3.49697768133912e-07, + "loss": 0.918, + "step": 90250 + }, + { + "epoch": 6.994459297144407, + "grad_norm": 1.3086222771634504, + "learning_rate": 3.497365158090515e-07, + "loss": 0.929, + "step": 90260 + }, + { + "epoch": 6.995234220620714, + "grad_norm": 1.4705296222697717, + "learning_rate": 3.4977526348419095e-07, + "loss": 0.9263, + "step": 90270 + }, + { + "epoch": 6.9960091440970205, + "grad_norm": 1.3770805746827828, + "learning_rate": 3.4981401115933047e-07, + "loss": 0.919, + "step": 90280 + }, + { + "epoch": 6.996784067573327, + "grad_norm": 1.386570479368802, + "learning_rate": 3.4985275883446994e-07, + "loss": 0.9119, + "step": 90290 + }, + { + "epoch": 6.997558991049634, + "grad_norm": 1.3578832783337258, + "learning_rate": 3.4989150650960946e-07, + "loss": 0.9209, + "step": 90300 + }, + { + "epoch": 6.998333914525941, + "grad_norm": 1.4052195848246578, + "learning_rate": 3.4993025418474893e-07, + "loss": 0.9307, + "step": 90310 + }, + { + "epoch": 6.999108838002247, + "grad_norm": 1.3683274979572755, + "learning_rate": 3.4996900185988845e-07, + "loss": 0.9268, + "step": 90320 + }, + { + "epoch": 6.999883761478554, + "grad_norm": 1.3497962665910064, + "learning_rate": 3.500077495350279e-07, + "loss": 0.9398, + "step": 90330 + }, + { + "epoch": 7.00065868495486, + "grad_norm": 1.4627504918009324, + "learning_rate": 3.500464972101674e-07, + "loss": 0.916, + "step": 90340 + }, + { + "epoch": 7.001433608431167, + "grad_norm": 1.3530649997479045, + "learning_rate": 3.500852448853069e-07, + "loss": 0.9274, + "step": 90350 + }, + { + "epoch": 7.002208531907474, + "grad_norm": 1.4396327704756329, + "learning_rate": 3.501239925604464e-07, + "loss": 0.9254, + "step": 90360 + }, + { + "epoch": 7.002983455383781, + "grad_norm": 1.4231923733686371, + "learning_rate": 3.501627402355859e-07, + "loss": 0.9284, + "step": 90370 + }, + { + "epoch": 7.003758378860088, + "grad_norm": 1.4395680033856395, + "learning_rate": 3.5020148791072537e-07, + "loss": 0.9236, + "step": 90380 + }, + { + "epoch": 7.0045333023363945, + "grad_norm": 1.4465686325606018, + "learning_rate": 3.502402355858649e-07, + "loss": 0.9484, + "step": 90390 + }, + { + "epoch": 7.005308225812701, + "grad_norm": 1.496208415699939, + "learning_rate": 3.5027898326100436e-07, + "loss": 0.9183, + "step": 90400 + }, + { + "epoch": 7.006083149289008, + "grad_norm": 1.4179220418059695, + "learning_rate": 3.5031773093614383e-07, + "loss": 0.9355, + "step": 90410 + }, + { + "epoch": 7.006858072765314, + "grad_norm": 1.477674936768774, + "learning_rate": 3.5035647861128335e-07, + "loss": 0.9295, + "step": 90420 + }, + { + "epoch": 7.007632996241621, + "grad_norm": 1.4966721202293258, + "learning_rate": 3.503952262864228e-07, + "loss": 0.9175, + "step": 90430 + }, + { + "epoch": 7.008407919717928, + "grad_norm": 1.3480413889207825, + "learning_rate": 3.5043397396156234e-07, + "loss": 0.931, + "step": 90440 + }, + { + "epoch": 7.0091828431942345, + "grad_norm": 1.4959795423376274, + "learning_rate": 3.504727216367018e-07, + "loss": 0.9165, + "step": 90450 + }, + { + "epoch": 7.009957766670541, + "grad_norm": 1.3889876411791227, + "learning_rate": 3.5051146931184134e-07, + "loss": 0.9138, + "step": 90460 + }, + { + "epoch": 7.010732690146848, + "grad_norm": 1.4341655464003198, + "learning_rate": 3.505502169869808e-07, + "loss": 0.933, + "step": 90470 + }, + { + "epoch": 7.011507613623155, + "grad_norm": 1.363985765365713, + "learning_rate": 3.505889646621203e-07, + "loss": 0.9293, + "step": 90480 + }, + { + "epoch": 7.012282537099462, + "grad_norm": 1.4488529750089543, + "learning_rate": 3.506277123372598e-07, + "loss": 0.9263, + "step": 90490 + }, + { + "epoch": 7.0130574605757685, + "grad_norm": 1.4621268486427703, + "learning_rate": 3.5066646001239926e-07, + "loss": 0.9141, + "step": 90500 + }, + { + "epoch": 7.0130574605757685, + "eval_loss": 0.9305116534233093, + "eval_runtime": 332.4852, + "eval_samples_per_second": 34.501, + "eval_steps_per_second": 8.626, + "step": 90500 + }, + { + "epoch": 7.013832384052074, + "grad_norm": 1.3529268132495598, + "learning_rate": 3.507052076875388e-07, + "loss": 0.9142, + "step": 90510 + }, + { + "epoch": 7.014607307528381, + "grad_norm": 1.3561522895011746, + "learning_rate": 3.5074395536267826e-07, + "loss": 0.9286, + "step": 90520 + }, + { + "epoch": 7.015382231004688, + "grad_norm": 1.403890122437333, + "learning_rate": 3.507827030378178e-07, + "loss": 0.9174, + "step": 90530 + }, + { + "epoch": 7.016157154480995, + "grad_norm": 1.4178504703310182, + "learning_rate": 3.5082145071295725e-07, + "loss": 0.927, + "step": 90540 + }, + { + "epoch": 7.016932077957302, + "grad_norm": 1.4765926474186073, + "learning_rate": 3.508601983880967e-07, + "loss": 0.9097, + "step": 90550 + }, + { + "epoch": 7.0177070014336085, + "grad_norm": 1.3494542855394671, + "learning_rate": 3.5089894606323624e-07, + "loss": 0.9357, + "step": 90560 + }, + { + "epoch": 7.018481924909915, + "grad_norm": 1.453198097486533, + "learning_rate": 3.509376937383757e-07, + "loss": 0.9213, + "step": 90570 + }, + { + "epoch": 7.019256848386222, + "grad_norm": 1.4282181033088648, + "learning_rate": 3.5097644141351523e-07, + "loss": 0.9211, + "step": 90580 + }, + { + "epoch": 7.020031771862529, + "grad_norm": 1.440251359974603, + "learning_rate": 3.510151890886547e-07, + "loss": 0.922, + "step": 90590 + }, + { + "epoch": 7.020806695338836, + "grad_norm": 1.3910482150260421, + "learning_rate": 3.5105393676379417e-07, + "loss": 0.9142, + "step": 90600 + }, + { + "epoch": 7.021581618815142, + "grad_norm": 1.3517371418426452, + "learning_rate": 3.510926844389337e-07, + "loss": 0.9338, + "step": 90610 + }, + { + "epoch": 7.0223565422914485, + "grad_norm": 1.4752820771692063, + "learning_rate": 3.5113143211407316e-07, + "loss": 0.9224, + "step": 90620 + }, + { + "epoch": 7.023131465767755, + "grad_norm": 1.344843834975169, + "learning_rate": 3.511701797892127e-07, + "loss": 0.9415, + "step": 90630 + }, + { + "epoch": 7.023906389244062, + "grad_norm": 1.4645301067268766, + "learning_rate": 3.5120892746435215e-07, + "loss": 0.9051, + "step": 90640 + }, + { + "epoch": 7.024681312720369, + "grad_norm": 1.524704062108091, + "learning_rate": 3.5124767513949167e-07, + "loss": 0.9117, + "step": 90650 + }, + { + "epoch": 7.025456236196676, + "grad_norm": 1.4414169165006845, + "learning_rate": 3.5128642281463114e-07, + "loss": 0.9417, + "step": 90660 + }, + { + "epoch": 7.0262311596729825, + "grad_norm": 1.334894617359948, + "learning_rate": 3.513251704897706e-07, + "loss": 0.939, + "step": 90670 + }, + { + "epoch": 7.027006083149289, + "grad_norm": 1.4528402830841605, + "learning_rate": 3.5136391816491013e-07, + "loss": 0.9323, + "step": 90680 + }, + { + "epoch": 7.027781006625596, + "grad_norm": 1.402494208238766, + "learning_rate": 3.514026658400496e-07, + "loss": 0.9278, + "step": 90690 + }, + { + "epoch": 7.028555930101902, + "grad_norm": 1.3986460869839576, + "learning_rate": 3.514414135151891e-07, + "loss": 0.9274, + "step": 90700 + }, + { + "epoch": 7.029330853578209, + "grad_norm": 1.4413250827865416, + "learning_rate": 3.514801611903286e-07, + "loss": 0.9071, + "step": 90710 + }, + { + "epoch": 7.030105777054516, + "grad_norm": 1.4592178923537358, + "learning_rate": 3.515189088654681e-07, + "loss": 0.9244, + "step": 90720 + }, + { + "epoch": 7.0308807005308225, + "grad_norm": 1.432319859232653, + "learning_rate": 3.515576565406076e-07, + "loss": 0.9276, + "step": 90730 + }, + { + "epoch": 7.031655624007129, + "grad_norm": 1.3951028142228143, + "learning_rate": 3.5159640421574705e-07, + "loss": 0.9043, + "step": 90740 + }, + { + "epoch": 7.032430547483436, + "grad_norm": 1.5095747536177837, + "learning_rate": 3.5163515189088657e-07, + "loss": 0.9281, + "step": 90750 + }, + { + "epoch": 7.033205470959743, + "grad_norm": 1.4236374339127635, + "learning_rate": 3.5167389956602604e-07, + "loss": 0.9107, + "step": 90760 + }, + { + "epoch": 7.03398039443605, + "grad_norm": 1.4164614543882967, + "learning_rate": 3.5171264724116556e-07, + "loss": 0.9142, + "step": 90770 + }, + { + "epoch": 7.0347553179123565, + "grad_norm": 1.4388347569047766, + "learning_rate": 3.5175139491630503e-07, + "loss": 0.919, + "step": 90780 + }, + { + "epoch": 7.0355302413886625, + "grad_norm": 1.3250943954561885, + "learning_rate": 3.5179014259144455e-07, + "loss": 0.9222, + "step": 90790 + }, + { + "epoch": 7.036305164864969, + "grad_norm": 1.330605649283535, + "learning_rate": 3.51828890266584e-07, + "loss": 0.9273, + "step": 90800 + }, + { + "epoch": 7.037080088341276, + "grad_norm": 1.402179736843307, + "learning_rate": 3.518676379417235e-07, + "loss": 0.9213, + "step": 90810 + }, + { + "epoch": 7.037855011817583, + "grad_norm": 1.4001176600472849, + "learning_rate": 3.51906385616863e-07, + "loss": 0.9357, + "step": 90820 + }, + { + "epoch": 7.03862993529389, + "grad_norm": 1.3172362505866935, + "learning_rate": 3.519451332920025e-07, + "loss": 0.9183, + "step": 90830 + }, + { + "epoch": 7.0394048587701965, + "grad_norm": 1.4168653869288033, + "learning_rate": 3.51983880967142e-07, + "loss": 0.9199, + "step": 90840 + }, + { + "epoch": 7.040179782246503, + "grad_norm": 1.4076022363444871, + "learning_rate": 3.520226286422815e-07, + "loss": 0.9154, + "step": 90850 + }, + { + "epoch": 7.04095470572281, + "grad_norm": 1.3957109136409276, + "learning_rate": 3.52061376317421e-07, + "loss": 0.9204, + "step": 90860 + }, + { + "epoch": 7.041729629199117, + "grad_norm": 1.3516037600675836, + "learning_rate": 3.5210012399256047e-07, + "loss": 0.9379, + "step": 90870 + }, + { + "epoch": 7.042504552675423, + "grad_norm": 1.3780914316429174, + "learning_rate": 3.5213887166769994e-07, + "loss": 0.9255, + "step": 90880 + }, + { + "epoch": 7.04327947615173, + "grad_norm": 1.4448264157843906, + "learning_rate": 3.5217761934283946e-07, + "loss": 0.946, + "step": 90890 + }, + { + "epoch": 7.0440543996280365, + "grad_norm": 1.3804224697890306, + "learning_rate": 3.522163670179789e-07, + "loss": 0.9347, + "step": 90900 + }, + { + "epoch": 7.044829323104343, + "grad_norm": 1.30618901464158, + "learning_rate": 3.5225511469311845e-07, + "loss": 0.9279, + "step": 90910 + }, + { + "epoch": 7.04560424658065, + "grad_norm": 1.4475105733717868, + "learning_rate": 3.522938623682579e-07, + "loss": 0.9352, + "step": 90920 + }, + { + "epoch": 7.046379170056957, + "grad_norm": 1.474142502002257, + "learning_rate": 3.5233261004339744e-07, + "loss": 0.9099, + "step": 90930 + }, + { + "epoch": 7.047154093533264, + "grad_norm": 1.4328617194310003, + "learning_rate": 3.523713577185369e-07, + "loss": 0.9191, + "step": 90940 + }, + { + "epoch": 7.0479290170095705, + "grad_norm": 1.369544490186052, + "learning_rate": 3.524101053936764e-07, + "loss": 0.9443, + "step": 90950 + }, + { + "epoch": 7.048703940485877, + "grad_norm": 1.3344823340232679, + "learning_rate": 3.524488530688159e-07, + "loss": 0.9122, + "step": 90960 + }, + { + "epoch": 7.049478863962184, + "grad_norm": 1.521133204063488, + "learning_rate": 3.5248760074395537e-07, + "loss": 0.9004, + "step": 90970 + }, + { + "epoch": 7.05025378743849, + "grad_norm": 1.3612363049582505, + "learning_rate": 3.525263484190949e-07, + "loss": 0.9244, + "step": 90980 + }, + { + "epoch": 7.051028710914797, + "grad_norm": 1.379718728178043, + "learning_rate": 3.5256509609423436e-07, + "loss": 0.9201, + "step": 90990 + }, + { + "epoch": 7.051803634391104, + "grad_norm": 1.46258760799513, + "learning_rate": 3.526038437693739e-07, + "loss": 0.9309, + "step": 91000 + }, + { + "epoch": 7.051803634391104, + "eval_loss": 0.9301605820655823, + "eval_runtime": 332.6563, + "eval_samples_per_second": 34.483, + "eval_steps_per_second": 8.622, + "step": 91000 + }, + { + "epoch": 7.0525785578674105, + "grad_norm": 1.4374155523003977, + "learning_rate": 3.5264259144451335e-07, + "loss": 0.92, + "step": 91010 + }, + { + "epoch": 7.053353481343717, + "grad_norm": 1.38993688802413, + "learning_rate": 3.526813391196528e-07, + "loss": 0.9296, + "step": 91020 + }, + { + "epoch": 7.054128404820024, + "grad_norm": 1.2794770006737521, + "learning_rate": 3.5272008679479234e-07, + "loss": 0.9026, + "step": 91030 + }, + { + "epoch": 7.054903328296331, + "grad_norm": 1.4393368857408804, + "learning_rate": 3.527588344699318e-07, + "loss": 0.9302, + "step": 91040 + }, + { + "epoch": 7.055678251772638, + "grad_norm": 1.3678904093255289, + "learning_rate": 3.5279758214507133e-07, + "loss": 0.9285, + "step": 91050 + }, + { + "epoch": 7.0564531752489446, + "grad_norm": 1.3959305453847775, + "learning_rate": 3.528363298202108e-07, + "loss": 0.9333, + "step": 91060 + }, + { + "epoch": 7.0572280987252505, + "grad_norm": 1.3948874865829022, + "learning_rate": 3.528750774953503e-07, + "loss": 0.9267, + "step": 91070 + }, + { + "epoch": 7.058003022201557, + "grad_norm": 1.4127078409264564, + "learning_rate": 3.529138251704898e-07, + "loss": 0.9396, + "step": 91080 + }, + { + "epoch": 7.058777945677864, + "grad_norm": 1.3745177291242499, + "learning_rate": 3.5295257284562926e-07, + "loss": 0.9225, + "step": 91090 + }, + { + "epoch": 7.059552869154171, + "grad_norm": 1.4696464198108785, + "learning_rate": 3.529913205207688e-07, + "loss": 0.9283, + "step": 91100 + }, + { + "epoch": 7.060327792630478, + "grad_norm": 1.3858024072724902, + "learning_rate": 3.5303006819590825e-07, + "loss": 0.911, + "step": 91110 + }, + { + "epoch": 7.0611027161067845, + "grad_norm": 1.383099348922824, + "learning_rate": 3.530688158710478e-07, + "loss": 0.9104, + "step": 91120 + }, + { + "epoch": 7.061877639583091, + "grad_norm": 1.403409405774458, + "learning_rate": 3.5310756354618724e-07, + "loss": 0.9167, + "step": 91130 + }, + { + "epoch": 7.062652563059398, + "grad_norm": 1.3490954279857916, + "learning_rate": 3.5314631122132677e-07, + "loss": 0.9276, + "step": 91140 + }, + { + "epoch": 7.063427486535705, + "grad_norm": 1.3141292481061726, + "learning_rate": 3.5318505889646623e-07, + "loss": 0.9208, + "step": 91150 + }, + { + "epoch": 7.064202410012011, + "grad_norm": 1.4186581611899292, + "learning_rate": 3.532238065716057e-07, + "loss": 0.9156, + "step": 91160 + }, + { + "epoch": 7.064977333488318, + "grad_norm": 1.437207272884943, + "learning_rate": 3.532625542467452e-07, + "loss": 0.9438, + "step": 91170 + }, + { + "epoch": 7.0657522569646245, + "grad_norm": 1.4331527476169692, + "learning_rate": 3.533013019218847e-07, + "loss": 0.9327, + "step": 91180 + }, + { + "epoch": 7.066527180440931, + "grad_norm": 1.3904996940142407, + "learning_rate": 3.533400495970242e-07, + "loss": 0.942, + "step": 91190 + }, + { + "epoch": 7.067302103917238, + "grad_norm": 1.368707269036405, + "learning_rate": 3.533787972721637e-07, + "loss": 0.9435, + "step": 91200 + }, + { + "epoch": 7.068077027393545, + "grad_norm": 1.3694482945449185, + "learning_rate": 3.534175449473032e-07, + "loss": 0.9043, + "step": 91210 + }, + { + "epoch": 7.068851950869852, + "grad_norm": 1.4606713244079246, + "learning_rate": 3.534562926224427e-07, + "loss": 0.9341, + "step": 91220 + }, + { + "epoch": 7.0696268743461586, + "grad_norm": 1.4100142156564373, + "learning_rate": 3.5349504029758215e-07, + "loss": 0.9189, + "step": 91230 + }, + { + "epoch": 7.070401797822465, + "grad_norm": 1.4526235990374239, + "learning_rate": 3.5353378797272167e-07, + "loss": 0.9271, + "step": 91240 + }, + { + "epoch": 7.071176721298772, + "grad_norm": 1.459522360752838, + "learning_rate": 3.5357253564786114e-07, + "loss": 0.9344, + "step": 91250 + }, + { + "epoch": 7.071951644775078, + "grad_norm": 1.4307740723645685, + "learning_rate": 3.5361128332300066e-07, + "loss": 0.9146, + "step": 91260 + }, + { + "epoch": 7.072726568251385, + "grad_norm": 1.3149017461245456, + "learning_rate": 3.5365003099814013e-07, + "loss": 0.8954, + "step": 91270 + }, + { + "epoch": 7.073501491727692, + "grad_norm": 1.421808331574074, + "learning_rate": 3.536887786732796e-07, + "loss": 0.93, + "step": 91280 + }, + { + "epoch": 7.0742764152039985, + "grad_norm": 1.4122684568000903, + "learning_rate": 3.537275263484191e-07, + "loss": 0.9273, + "step": 91290 + }, + { + "epoch": 7.075051338680305, + "grad_norm": 1.488809178957385, + "learning_rate": 3.537662740235586e-07, + "loss": 0.9095, + "step": 91300 + }, + { + "epoch": 7.075826262156612, + "grad_norm": 1.3222187103675083, + "learning_rate": 3.538050216986981e-07, + "loss": 0.916, + "step": 91310 + }, + { + "epoch": 7.076601185632919, + "grad_norm": 1.479978566619864, + "learning_rate": 3.538437693738376e-07, + "loss": 0.9326, + "step": 91320 + }, + { + "epoch": 7.077376109109226, + "grad_norm": 1.4022639407632496, + "learning_rate": 3.538825170489771e-07, + "loss": 0.9214, + "step": 91330 + }, + { + "epoch": 7.078151032585533, + "grad_norm": 1.3749342890315701, + "learning_rate": 3.5392126472411657e-07, + "loss": 0.9047, + "step": 91340 + }, + { + "epoch": 7.0789259560618385, + "grad_norm": 1.4138744331507425, + "learning_rate": 3.5396001239925604e-07, + "loss": 0.93, + "step": 91350 + }, + { + "epoch": 7.079700879538145, + "grad_norm": 1.4305054346401518, + "learning_rate": 3.5399876007439556e-07, + "loss": 0.9376, + "step": 91360 + }, + { + "epoch": 7.080475803014452, + "grad_norm": 1.4874472953656068, + "learning_rate": 3.5403750774953503e-07, + "loss": 0.9239, + "step": 91370 + }, + { + "epoch": 7.081250726490759, + "grad_norm": 1.3920969607974576, + "learning_rate": 3.5407625542467455e-07, + "loss": 0.9272, + "step": 91380 + }, + { + "epoch": 7.082025649967066, + "grad_norm": 1.3940756458553665, + "learning_rate": 3.54115003099814e-07, + "loss": 0.93, + "step": 91390 + }, + { + "epoch": 7.0828005734433725, + "grad_norm": 1.4099612037870868, + "learning_rate": 3.5415375077495354e-07, + "loss": 0.9166, + "step": 91400 + }, + { + "epoch": 7.083575496919679, + "grad_norm": 1.3493684751450945, + "learning_rate": 3.54192498450093e-07, + "loss": 0.9288, + "step": 91410 + }, + { + "epoch": 7.084350420395986, + "grad_norm": 1.336766530471701, + "learning_rate": 3.542312461252325e-07, + "loss": 0.9333, + "step": 91420 + }, + { + "epoch": 7.085125343872293, + "grad_norm": 1.423229021811624, + "learning_rate": 3.54269993800372e-07, + "loss": 0.9086, + "step": 91430 + }, + { + "epoch": 7.085900267348599, + "grad_norm": 1.3279848756015573, + "learning_rate": 3.5430874147551147e-07, + "loss": 0.9142, + "step": 91440 + }, + { + "epoch": 7.086675190824906, + "grad_norm": 1.4622589201073795, + "learning_rate": 3.54347489150651e-07, + "loss": 0.9283, + "step": 91450 + }, + { + "epoch": 7.0874501143012125, + "grad_norm": 1.4267831349892028, + "learning_rate": 3.5438623682579046e-07, + "loss": 0.9218, + "step": 91460 + }, + { + "epoch": 7.088225037777519, + "grad_norm": 1.4565780975825193, + "learning_rate": 3.5442498450093e-07, + "loss": 0.8955, + "step": 91470 + }, + { + "epoch": 7.088999961253826, + "grad_norm": 1.528820654557171, + "learning_rate": 3.5446373217606945e-07, + "loss": 0.9104, + "step": 91480 + }, + { + "epoch": 7.089774884730133, + "grad_norm": 1.4034883539076672, + "learning_rate": 3.545024798512089e-07, + "loss": 0.9271, + "step": 91490 + }, + { + "epoch": 7.09054980820644, + "grad_norm": 1.3618597960079983, + "learning_rate": 3.5454122752634844e-07, + "loss": 0.8985, + "step": 91500 + }, + { + "epoch": 7.09054980820644, + "eval_loss": 0.9296744465827942, + "eval_runtime": 331.6294, + "eval_samples_per_second": 34.59, + "eval_steps_per_second": 8.648, + "step": 91500 + }, + { + "epoch": 7.091324731682747, + "grad_norm": 1.376276324419691, + "learning_rate": 3.545799752014879e-07, + "loss": 0.9119, + "step": 91510 + }, + { + "epoch": 7.092099655159053, + "grad_norm": 1.416363988330987, + "learning_rate": 3.5461872287662744e-07, + "loss": 0.9422, + "step": 91520 + }, + { + "epoch": 7.09287457863536, + "grad_norm": 1.4193229295904959, + "learning_rate": 3.546574705517669e-07, + "loss": 0.9208, + "step": 91530 + }, + { + "epoch": 7.093649502111666, + "grad_norm": 1.3932237777326504, + "learning_rate": 3.5469621822690643e-07, + "loss": 0.9228, + "step": 91540 + }, + { + "epoch": 7.094424425587973, + "grad_norm": 1.4880989639522637, + "learning_rate": 3.547349659020459e-07, + "loss": 0.935, + "step": 91550 + }, + { + "epoch": 7.09519934906428, + "grad_norm": 1.360942166696605, + "learning_rate": 3.5477371357718537e-07, + "loss": 0.9183, + "step": 91560 + }, + { + "epoch": 7.0959742725405865, + "grad_norm": 1.4326821916918764, + "learning_rate": 3.548124612523249e-07, + "loss": 0.9196, + "step": 91570 + }, + { + "epoch": 7.096749196016893, + "grad_norm": 1.3843401704354368, + "learning_rate": 3.5485120892746436e-07, + "loss": 0.921, + "step": 91580 + }, + { + "epoch": 7.0975241194932, + "grad_norm": 1.5623785198678721, + "learning_rate": 3.548899566026039e-07, + "loss": 0.921, + "step": 91590 + }, + { + "epoch": 7.098299042969507, + "grad_norm": 1.3905169198992482, + "learning_rate": 3.5492870427774335e-07, + "loss": 0.9155, + "step": 91600 + }, + { + "epoch": 7.099073966445814, + "grad_norm": 1.383639202715308, + "learning_rate": 3.5496745195288287e-07, + "loss": 0.9101, + "step": 91610 + }, + { + "epoch": 7.099848889922121, + "grad_norm": 1.3697172659151868, + "learning_rate": 3.5500619962802234e-07, + "loss": 0.9123, + "step": 91620 + }, + { + "epoch": 7.1006238133984265, + "grad_norm": 1.663096668852582, + "learning_rate": 3.550449473031618e-07, + "loss": 0.9348, + "step": 91630 + }, + { + "epoch": 7.101398736874733, + "grad_norm": 1.4240499045742487, + "learning_rate": 3.5508369497830133e-07, + "loss": 0.9328, + "step": 91640 + }, + { + "epoch": 7.10217366035104, + "grad_norm": 1.3368454689745297, + "learning_rate": 3.551224426534408e-07, + "loss": 0.9151, + "step": 91650 + }, + { + "epoch": 7.102948583827347, + "grad_norm": 1.412834276708446, + "learning_rate": 3.551611903285803e-07, + "loss": 0.9201, + "step": 91660 + }, + { + "epoch": 7.103723507303654, + "grad_norm": 1.4148846273180993, + "learning_rate": 3.551999380037198e-07, + "loss": 0.9223, + "step": 91670 + }, + { + "epoch": 7.104498430779961, + "grad_norm": 1.3357901348896715, + "learning_rate": 3.552386856788593e-07, + "loss": 0.901, + "step": 91680 + }, + { + "epoch": 7.105273354256267, + "grad_norm": 1.4661660242823127, + "learning_rate": 3.552774333539988e-07, + "loss": 0.8988, + "step": 91690 + }, + { + "epoch": 7.106048277732574, + "grad_norm": 1.378333767566334, + "learning_rate": 3.5531618102913825e-07, + "loss": 0.9244, + "step": 91700 + }, + { + "epoch": 7.106823201208881, + "grad_norm": 1.3891800609351768, + "learning_rate": 3.5535492870427777e-07, + "loss": 0.9178, + "step": 91710 + }, + { + "epoch": 7.107598124685187, + "grad_norm": 1.4543134735341976, + "learning_rate": 3.5539367637941724e-07, + "loss": 0.9307, + "step": 91720 + }, + { + "epoch": 7.108373048161494, + "grad_norm": 1.435244776797363, + "learning_rate": 3.5543242405455676e-07, + "loss": 0.9248, + "step": 91730 + }, + { + "epoch": 7.1091479716378005, + "grad_norm": 1.3478223196092887, + "learning_rate": 3.5547117172969623e-07, + "loss": 0.9222, + "step": 91740 + }, + { + "epoch": 7.109922895114107, + "grad_norm": 1.4458410346596415, + "learning_rate": 3.5550991940483575e-07, + "loss": 0.9206, + "step": 91750 + }, + { + "epoch": 7.110697818590414, + "grad_norm": 1.3258891418892884, + "learning_rate": 3.555486670799752e-07, + "loss": 0.9127, + "step": 91760 + }, + { + "epoch": 7.111472742066721, + "grad_norm": 1.4862595472690106, + "learning_rate": 3.555874147551147e-07, + "loss": 0.9303, + "step": 91770 + }, + { + "epoch": 7.112247665543028, + "grad_norm": 1.4066637125356976, + "learning_rate": 3.556261624302542e-07, + "loss": 0.9636, + "step": 91780 + }, + { + "epoch": 7.113022589019335, + "grad_norm": 1.4139348487762555, + "learning_rate": 3.556649101053937e-07, + "loss": 0.9229, + "step": 91790 + }, + { + "epoch": 7.113797512495641, + "grad_norm": 1.3991161119826234, + "learning_rate": 3.557036577805332e-07, + "loss": 0.9396, + "step": 91800 + }, + { + "epoch": 7.114572435971947, + "grad_norm": 1.388756969554802, + "learning_rate": 3.5574240545567267e-07, + "loss": 0.9196, + "step": 91810 + }, + { + "epoch": 7.115347359448254, + "grad_norm": 1.4014981472233203, + "learning_rate": 3.557811531308122e-07, + "loss": 0.9226, + "step": 91820 + }, + { + "epoch": 7.116122282924561, + "grad_norm": 1.4560325023963516, + "learning_rate": 3.5581990080595166e-07, + "loss": 0.9309, + "step": 91830 + }, + { + "epoch": 7.116897206400868, + "grad_norm": 1.430694827742422, + "learning_rate": 3.5585864848109113e-07, + "loss": 0.928, + "step": 91840 + }, + { + "epoch": 7.117672129877175, + "grad_norm": 1.438102291334274, + "learning_rate": 3.5589739615623066e-07, + "loss": 0.9466, + "step": 91850 + }, + { + "epoch": 7.118447053353481, + "grad_norm": 1.2918959815424897, + "learning_rate": 3.559361438313701e-07, + "loss": 0.9046, + "step": 91860 + }, + { + "epoch": 7.119221976829788, + "grad_norm": 1.345415340471792, + "learning_rate": 3.5597489150650965e-07, + "loss": 0.9272, + "step": 91870 + }, + { + "epoch": 7.119996900306095, + "grad_norm": 1.3850529004728793, + "learning_rate": 3.560136391816491e-07, + "loss": 0.9263, + "step": 91880 + }, + { + "epoch": 7.120771823782402, + "grad_norm": 1.4903651834802423, + "learning_rate": 3.5605238685678864e-07, + "loss": 0.9438, + "step": 91890 + }, + { + "epoch": 7.121546747258709, + "grad_norm": 1.447769557281898, + "learning_rate": 3.560911345319281e-07, + "loss": 0.9295, + "step": 91900 + }, + { + "epoch": 7.1223216707350145, + "grad_norm": 1.457688631958372, + "learning_rate": 3.561298822070676e-07, + "loss": 0.922, + "step": 91910 + }, + { + "epoch": 7.123096594211321, + "grad_norm": 1.5275223895321453, + "learning_rate": 3.561686298822071e-07, + "loss": 0.924, + "step": 91920 + }, + { + "epoch": 7.123871517687628, + "grad_norm": 1.4826081586666318, + "learning_rate": 3.5620737755734657e-07, + "loss": 0.9164, + "step": 91930 + }, + { + "epoch": 7.124646441163935, + "grad_norm": 1.4150913534611107, + "learning_rate": 3.562461252324861e-07, + "loss": 0.912, + "step": 91940 + }, + { + "epoch": 7.125421364640242, + "grad_norm": 1.3906942626537553, + "learning_rate": 3.5628487290762556e-07, + "loss": 0.9328, + "step": 91950 + }, + { + "epoch": 7.126196288116549, + "grad_norm": 1.3957547058739896, + "learning_rate": 3.563236205827651e-07, + "loss": 0.9229, + "step": 91960 + }, + { + "epoch": 7.126971211592855, + "grad_norm": 1.383523124011446, + "learning_rate": 3.5636236825790455e-07, + "loss": 0.9005, + "step": 91970 + }, + { + "epoch": 7.127746135069162, + "grad_norm": 1.4176065860906053, + "learning_rate": 3.56401115933044e-07, + "loss": 0.9216, + "step": 91980 + }, + { + "epoch": 7.128521058545469, + "grad_norm": 1.3730405767360765, + "learning_rate": 3.5643986360818354e-07, + "loss": 0.9235, + "step": 91990 + }, + { + "epoch": 7.129295982021775, + "grad_norm": 1.4922714424889394, + "learning_rate": 3.56478611283323e-07, + "loss": 0.9208, + "step": 92000 + }, + { + "epoch": 7.129295982021775, + "eval_loss": 0.9295350909233093, + "eval_runtime": 332.6572, + "eval_samples_per_second": 34.483, + "eval_steps_per_second": 8.621, + "step": 92000 + }, + { + "epoch": 7.130070905498082, + "grad_norm": 1.3652567577346877, + "learning_rate": 3.5651735895846253e-07, + "loss": 0.9309, + "step": 92010 + }, + { + "epoch": 7.130845828974389, + "grad_norm": 1.4593588402855782, + "learning_rate": 3.56556106633602e-07, + "loss": 0.9233, + "step": 92020 + }, + { + "epoch": 7.131620752450695, + "grad_norm": 1.4889136395088867, + "learning_rate": 3.5659485430874147e-07, + "loss": 0.9264, + "step": 92030 + }, + { + "epoch": 7.132395675927002, + "grad_norm": 1.444479739631116, + "learning_rate": 3.56633601983881e-07, + "loss": 0.9294, + "step": 92040 + }, + { + "epoch": 7.133170599403309, + "grad_norm": 1.433817081533382, + "learning_rate": 3.5667234965902046e-07, + "loss": 0.9125, + "step": 92050 + }, + { + "epoch": 7.133945522879616, + "grad_norm": 1.3995273293801345, + "learning_rate": 3.5671109733416e-07, + "loss": 0.9059, + "step": 92060 + }, + { + "epoch": 7.134720446355923, + "grad_norm": 1.355274195018594, + "learning_rate": 3.5674984500929945e-07, + "loss": 0.9365, + "step": 92070 + }, + { + "epoch": 7.135495369832229, + "grad_norm": 1.3573366787790457, + "learning_rate": 3.5678859268443897e-07, + "loss": 0.917, + "step": 92080 + }, + { + "epoch": 7.136270293308536, + "grad_norm": 1.3935350527504926, + "learning_rate": 3.5682734035957844e-07, + "loss": 0.9194, + "step": 92090 + }, + { + "epoch": 7.137045216784842, + "grad_norm": 1.416818852113532, + "learning_rate": 3.568660880347179e-07, + "loss": 0.9282, + "step": 92100 + }, + { + "epoch": 7.137820140261149, + "grad_norm": 1.4709623219642054, + "learning_rate": 3.5690483570985743e-07, + "loss": 0.9276, + "step": 92110 + }, + { + "epoch": 7.138595063737456, + "grad_norm": 1.3889143447291847, + "learning_rate": 3.569435833849969e-07, + "loss": 0.9161, + "step": 92120 + }, + { + "epoch": 7.139369987213763, + "grad_norm": 1.4495597654639154, + "learning_rate": 3.569823310601364e-07, + "loss": 0.9007, + "step": 92130 + }, + { + "epoch": 7.140144910690069, + "grad_norm": 1.4435312027525251, + "learning_rate": 3.570210787352759e-07, + "loss": 0.9206, + "step": 92140 + }, + { + "epoch": 7.140919834166376, + "grad_norm": 1.4273670022754017, + "learning_rate": 3.570598264104154e-07, + "loss": 0.9237, + "step": 92150 + }, + { + "epoch": 7.141694757642683, + "grad_norm": 1.4876639695041523, + "learning_rate": 3.570985740855549e-07, + "loss": 0.9298, + "step": 92160 + }, + { + "epoch": 7.14246968111899, + "grad_norm": 1.3731534009770443, + "learning_rate": 3.5713732176069435e-07, + "loss": 0.9214, + "step": 92170 + }, + { + "epoch": 7.143244604595297, + "grad_norm": 1.4204222992695532, + "learning_rate": 3.571760694358339e-07, + "loss": 0.9268, + "step": 92180 + }, + { + "epoch": 7.144019528071603, + "grad_norm": 1.6469518727893282, + "learning_rate": 3.5721481711097334e-07, + "loss": 0.9284, + "step": 92190 + }, + { + "epoch": 7.144794451547909, + "grad_norm": 1.3473805277096684, + "learning_rate": 3.5725356478611287e-07, + "loss": 0.9146, + "step": 92200 + }, + { + "epoch": 7.145569375024216, + "grad_norm": 1.4441359923519697, + "learning_rate": 3.5729231246125234e-07, + "loss": 0.9217, + "step": 92210 + }, + { + "epoch": 7.146344298500523, + "grad_norm": 1.4574590396824365, + "learning_rate": 3.5733106013639186e-07, + "loss": 0.9395, + "step": 92220 + }, + { + "epoch": 7.14711922197683, + "grad_norm": 1.445395042934528, + "learning_rate": 3.573698078115313e-07, + "loss": 0.9039, + "step": 92230 + }, + { + "epoch": 7.147894145453137, + "grad_norm": 1.455638687634206, + "learning_rate": 3.574085554866708e-07, + "loss": 0.916, + "step": 92240 + }, + { + "epoch": 7.148669068929443, + "grad_norm": 1.4432187509401988, + "learning_rate": 3.574473031618103e-07, + "loss": 0.9487, + "step": 92250 + }, + { + "epoch": 7.14944399240575, + "grad_norm": 1.3707580024879729, + "learning_rate": 3.574860508369498e-07, + "loss": 0.9027, + "step": 92260 + }, + { + "epoch": 7.150218915882057, + "grad_norm": 1.2913855732006836, + "learning_rate": 3.575247985120893e-07, + "loss": 0.9118, + "step": 92270 + }, + { + "epoch": 7.150993839358363, + "grad_norm": 1.3950731453723386, + "learning_rate": 3.575635461872288e-07, + "loss": 0.9214, + "step": 92280 + }, + { + "epoch": 7.15176876283467, + "grad_norm": 1.3943160782991566, + "learning_rate": 3.576022938623683e-07, + "loss": 0.9638, + "step": 92290 + }, + { + "epoch": 7.152543686310977, + "grad_norm": 1.5268667438189125, + "learning_rate": 3.5764104153750777e-07, + "loss": 0.9368, + "step": 92300 + }, + { + "epoch": 7.153318609787283, + "grad_norm": 1.369231507708813, + "learning_rate": 3.5767978921264724e-07, + "loss": 0.9181, + "step": 92310 + }, + { + "epoch": 7.15409353326359, + "grad_norm": 1.4122003660953968, + "learning_rate": 3.5771853688778676e-07, + "loss": 0.9322, + "step": 92320 + }, + { + "epoch": 7.154868456739897, + "grad_norm": 1.387665052174028, + "learning_rate": 3.5775728456292623e-07, + "loss": 0.9476, + "step": 92330 + }, + { + "epoch": 7.155643380216204, + "grad_norm": 1.397904391255524, + "learning_rate": 3.5779603223806575e-07, + "loss": 0.9383, + "step": 92340 + }, + { + "epoch": 7.156418303692511, + "grad_norm": 1.4075791614540192, + "learning_rate": 3.578347799132052e-07, + "loss": 0.9274, + "step": 92350 + }, + { + "epoch": 7.1571932271688175, + "grad_norm": 1.4144127653961056, + "learning_rate": 3.5787352758834474e-07, + "loss": 0.9706, + "step": 92360 + }, + { + "epoch": 7.157968150645123, + "grad_norm": 1.4603356373784249, + "learning_rate": 3.579122752634842e-07, + "loss": 0.942, + "step": 92370 + }, + { + "epoch": 7.15874307412143, + "grad_norm": 1.4429429833313172, + "learning_rate": 3.579510229386237e-07, + "loss": 0.918, + "step": 92380 + }, + { + "epoch": 7.159517997597737, + "grad_norm": 1.4076643830364446, + "learning_rate": 3.579897706137632e-07, + "loss": 0.915, + "step": 92390 + }, + { + "epoch": 7.160292921074044, + "grad_norm": 1.4412319497949573, + "learning_rate": 3.5802851828890267e-07, + "loss": 0.8922, + "step": 92400 + }, + { + "epoch": 7.161067844550351, + "grad_norm": 1.4316540400463027, + "learning_rate": 3.580672659640422e-07, + "loss": 0.9316, + "step": 92410 + }, + { + "epoch": 7.161842768026657, + "grad_norm": 1.4666292312164035, + "learning_rate": 3.5810601363918166e-07, + "loss": 0.9341, + "step": 92420 + }, + { + "epoch": 7.162617691502964, + "grad_norm": 1.3841092853367285, + "learning_rate": 3.581447613143212e-07, + "loss": 0.9221, + "step": 92430 + }, + { + "epoch": 7.163392614979271, + "grad_norm": 1.4438430280876733, + "learning_rate": 3.5818350898946065e-07, + "loss": 0.9289, + "step": 92440 + }, + { + "epoch": 7.164167538455578, + "grad_norm": 1.3723241884614525, + "learning_rate": 3.582222566646001e-07, + "loss": 0.9099, + "step": 92450 + }, + { + "epoch": 7.164942461931885, + "grad_norm": 1.4002285701846704, + "learning_rate": 3.5826100433973964e-07, + "loss": 0.9814, + "step": 92460 + }, + { + "epoch": 7.165717385408191, + "grad_norm": 1.491771416883063, + "learning_rate": 3.582997520148791e-07, + "loss": 0.9318, + "step": 92470 + }, + { + "epoch": 7.166492308884497, + "grad_norm": 1.4673180507993837, + "learning_rate": 3.5833849969001863e-07, + "loss": 0.9271, + "step": 92480 + }, + { + "epoch": 7.167267232360804, + "grad_norm": 1.4215182004327163, + "learning_rate": 3.583772473651581e-07, + "loss": 0.9109, + "step": 92490 + }, + { + "epoch": 7.168042155837111, + "grad_norm": 1.433950770836674, + "learning_rate": 3.584159950402976e-07, + "loss": 0.9077, + "step": 92500 + }, + { + "epoch": 7.168042155837111, + "eval_loss": 0.9292341470718384, + "eval_runtime": 331.2049, + "eval_samples_per_second": 34.634, + "eval_steps_per_second": 8.659, + "step": 92500 + }, + { + "epoch": 7.168817079313418, + "grad_norm": 1.4032146725544972, + "learning_rate": 3.584547427154371e-07, + "loss": 0.9107, + "step": 92510 + }, + { + "epoch": 7.169592002789725, + "grad_norm": 1.4359002277766362, + "learning_rate": 3.5849349039057656e-07, + "loss": 0.9068, + "step": 92520 + }, + { + "epoch": 7.1703669262660314, + "grad_norm": 1.4525232504029646, + "learning_rate": 3.585322380657161e-07, + "loss": 0.9212, + "step": 92530 + }, + { + "epoch": 7.171141849742338, + "grad_norm": 1.4680049163685711, + "learning_rate": 3.5857098574085555e-07, + "loss": 0.9183, + "step": 92540 + }, + { + "epoch": 7.171916773218645, + "grad_norm": 1.4440205730671076, + "learning_rate": 3.586097334159951e-07, + "loss": 0.9246, + "step": 92550 + }, + { + "epoch": 7.172691696694951, + "grad_norm": 1.401018793243371, + "learning_rate": 3.5864848109113455e-07, + "loss": 0.9306, + "step": 92560 + }, + { + "epoch": 7.173466620171258, + "grad_norm": 1.3952769764585224, + "learning_rate": 3.5868722876627407e-07, + "loss": 0.9328, + "step": 92570 + }, + { + "epoch": 7.174241543647565, + "grad_norm": 1.3931149446205142, + "learning_rate": 3.5872597644141354e-07, + "loss": 0.9353, + "step": 92580 + }, + { + "epoch": 7.175016467123871, + "grad_norm": 1.4854898547904152, + "learning_rate": 3.58764724116553e-07, + "loss": 0.9387, + "step": 92590 + }, + { + "epoch": 7.175791390600178, + "grad_norm": 1.4457391114877745, + "learning_rate": 3.5880347179169253e-07, + "loss": 0.9182, + "step": 92600 + }, + { + "epoch": 7.176566314076485, + "grad_norm": 1.490590258758892, + "learning_rate": 3.58842219466832e-07, + "loss": 0.9134, + "step": 92610 + }, + { + "epoch": 7.177341237552792, + "grad_norm": 1.3826186065119832, + "learning_rate": 3.588809671419715e-07, + "loss": 0.9121, + "step": 92620 + }, + { + "epoch": 7.178116161029099, + "grad_norm": 1.4118777885879632, + "learning_rate": 3.58919714817111e-07, + "loss": 0.9212, + "step": 92630 + }, + { + "epoch": 7.1788910845054055, + "grad_norm": 1.394744973486328, + "learning_rate": 3.589584624922505e-07, + "loss": 0.9231, + "step": 92640 + }, + { + "epoch": 7.179666007981711, + "grad_norm": 1.4262750217568876, + "learning_rate": 3.5899721016739e-07, + "loss": 0.9537, + "step": 92650 + }, + { + "epoch": 7.180440931458018, + "grad_norm": 1.378382443639765, + "learning_rate": 3.5903595784252945e-07, + "loss": 0.9181, + "step": 92660 + }, + { + "epoch": 7.181215854934325, + "grad_norm": 1.3766485979451328, + "learning_rate": 3.5907470551766897e-07, + "loss": 0.9103, + "step": 92670 + }, + { + "epoch": 7.181990778410632, + "grad_norm": 1.4156378023120433, + "learning_rate": 3.5911345319280844e-07, + "loss": 0.9035, + "step": 92680 + }, + { + "epoch": 7.182765701886939, + "grad_norm": 1.4840992411949716, + "learning_rate": 3.5915220086794796e-07, + "loss": 0.9373, + "step": 92690 + }, + { + "epoch": 7.1835406253632454, + "grad_norm": 1.4362370026737128, + "learning_rate": 3.5919094854308743e-07, + "loss": 0.9221, + "step": 92700 + }, + { + "epoch": 7.184315548839552, + "grad_norm": 1.4056161478261935, + "learning_rate": 3.5922969621822695e-07, + "loss": 0.9328, + "step": 92710 + }, + { + "epoch": 7.185090472315859, + "grad_norm": 1.3732179928003494, + "learning_rate": 3.592684438933664e-07, + "loss": 0.9274, + "step": 92720 + }, + { + "epoch": 7.185865395792166, + "grad_norm": 1.3712539832423518, + "learning_rate": 3.593071915685059e-07, + "loss": 0.9192, + "step": 92730 + }, + { + "epoch": 7.186640319268472, + "grad_norm": 1.3453511434545298, + "learning_rate": 3.593459392436454e-07, + "loss": 0.9373, + "step": 92740 + }, + { + "epoch": 7.187415242744779, + "grad_norm": 1.3524397328683848, + "learning_rate": 3.593846869187849e-07, + "loss": 0.9315, + "step": 92750 + }, + { + "epoch": 7.188190166221085, + "grad_norm": 1.3447383872196956, + "learning_rate": 3.594234345939244e-07, + "loss": 0.9489, + "step": 92760 + }, + { + "epoch": 7.188965089697392, + "grad_norm": 1.383974718153584, + "learning_rate": 3.5946218226906387e-07, + "loss": 0.9087, + "step": 92770 + }, + { + "epoch": 7.189740013173699, + "grad_norm": 1.3712756537310138, + "learning_rate": 3.5950092994420334e-07, + "loss": 0.9142, + "step": 92780 + }, + { + "epoch": 7.190514936650006, + "grad_norm": 1.4277272593371908, + "learning_rate": 3.5953967761934286e-07, + "loss": 0.9105, + "step": 92790 + }, + { + "epoch": 7.191289860126313, + "grad_norm": 1.4424435646560323, + "learning_rate": 3.5957842529448233e-07, + "loss": 0.9154, + "step": 92800 + }, + { + "epoch": 7.1920647836026195, + "grad_norm": 1.3648488253554607, + "learning_rate": 3.5961717296962185e-07, + "loss": 0.9173, + "step": 92810 + }, + { + "epoch": 7.192839707078926, + "grad_norm": 1.4258598450827693, + "learning_rate": 3.596559206447613e-07, + "loss": 0.9305, + "step": 92820 + }, + { + "epoch": 7.193614630555233, + "grad_norm": 1.3934807362384682, + "learning_rate": 3.5969466831990084e-07, + "loss": 0.913, + "step": 92830 + }, + { + "epoch": 7.194389554031539, + "grad_norm": 1.5467906810520429, + "learning_rate": 3.597334159950403e-07, + "loss": 0.9323, + "step": 92840 + }, + { + "epoch": 7.195164477507846, + "grad_norm": 1.4784241651410117, + "learning_rate": 3.597721636701798e-07, + "loss": 0.9363, + "step": 92850 + }, + { + "epoch": 7.195939400984153, + "grad_norm": 1.5003672518033884, + "learning_rate": 3.598109113453193e-07, + "loss": 0.9166, + "step": 92860 + }, + { + "epoch": 7.1967143244604594, + "grad_norm": 1.3572789712236126, + "learning_rate": 3.598496590204588e-07, + "loss": 0.9044, + "step": 92870 + }, + { + "epoch": 7.197489247936766, + "grad_norm": 1.496317894812849, + "learning_rate": 3.598884066955983e-07, + "loss": 0.9148, + "step": 92880 + }, + { + "epoch": 7.198264171413073, + "grad_norm": 1.5105969895458073, + "learning_rate": 3.5992715437073776e-07, + "loss": 0.9237, + "step": 92890 + }, + { + "epoch": 7.19903909488938, + "grad_norm": 1.4197933712387798, + "learning_rate": 3.599659020458773e-07, + "loss": 0.9321, + "step": 92900 + }, + { + "epoch": 7.199814018365687, + "grad_norm": 1.4061154560307683, + "learning_rate": 3.6000464972101676e-07, + "loss": 0.9176, + "step": 92910 + }, + { + "epoch": 7.2005889418419935, + "grad_norm": 1.4274010496280776, + "learning_rate": 3.600433973961562e-07, + "loss": 0.9255, + "step": 92920 + }, + { + "epoch": 7.201363865318299, + "grad_norm": 1.4070257297671895, + "learning_rate": 3.6008214507129575e-07, + "loss": 0.9155, + "step": 92930 + }, + { + "epoch": 7.202138788794606, + "grad_norm": 1.4396719882677396, + "learning_rate": 3.601208927464352e-07, + "loss": 0.9233, + "step": 92940 + }, + { + "epoch": 7.202913712270913, + "grad_norm": 1.3952819222662842, + "learning_rate": 3.6015964042157474e-07, + "loss": 0.9395, + "step": 92950 + }, + { + "epoch": 7.20368863574722, + "grad_norm": 1.4306957984907898, + "learning_rate": 3.601983880967142e-07, + "loss": 0.9131, + "step": 92960 + }, + { + "epoch": 7.204463559223527, + "grad_norm": 1.4576573355613693, + "learning_rate": 3.6023713577185373e-07, + "loss": 0.9227, + "step": 92970 + }, + { + "epoch": 7.2052384826998335, + "grad_norm": 1.5300381385536161, + "learning_rate": 3.602758834469932e-07, + "loss": 0.9295, + "step": 92980 + }, + { + "epoch": 7.20601340617614, + "grad_norm": 1.4340506715362633, + "learning_rate": 3.6031463112213267e-07, + "loss": 0.9197, + "step": 92990 + }, + { + "epoch": 7.206788329652447, + "grad_norm": 1.407609689548193, + "learning_rate": 3.603533787972722e-07, + "loss": 0.927, + "step": 93000 + }, + { + "epoch": 7.206788329652447, + "eval_loss": 0.9288294315338135, + "eval_runtime": 331.1765, + "eval_samples_per_second": 34.637, + "eval_steps_per_second": 8.66, + "step": 93000 + }, + { + "epoch": 7.207563253128754, + "grad_norm": 1.390004312486143, + "learning_rate": 3.6039212647241166e-07, + "loss": 0.9143, + "step": 93010 + }, + { + "epoch": 7.20833817660506, + "grad_norm": 1.367046526585238, + "learning_rate": 3.604308741475512e-07, + "loss": 0.9267, + "step": 93020 + }, + { + "epoch": 7.209113100081367, + "grad_norm": 1.4555536350012777, + "learning_rate": 3.6046962182269065e-07, + "loss": 0.9154, + "step": 93030 + }, + { + "epoch": 7.209888023557673, + "grad_norm": 1.4149165400942645, + "learning_rate": 3.6050836949783017e-07, + "loss": 0.9268, + "step": 93040 + }, + { + "epoch": 7.21066294703398, + "grad_norm": 1.38557731962723, + "learning_rate": 3.6054711717296964e-07, + "loss": 0.9241, + "step": 93050 + }, + { + "epoch": 7.211437870510287, + "grad_norm": 1.3625269595918807, + "learning_rate": 3.605858648481091e-07, + "loss": 0.8964, + "step": 93060 + }, + { + "epoch": 7.212212793986594, + "grad_norm": 1.40051544937346, + "learning_rate": 3.6062461252324863e-07, + "loss": 0.9101, + "step": 93070 + }, + { + "epoch": 7.212987717462901, + "grad_norm": 1.3859694750184475, + "learning_rate": 3.606633601983881e-07, + "loss": 0.9229, + "step": 93080 + }, + { + "epoch": 7.2137626409392075, + "grad_norm": 1.4228329685823773, + "learning_rate": 3.607021078735276e-07, + "loss": 0.9514, + "step": 93090 + }, + { + "epoch": 7.214537564415514, + "grad_norm": 1.4444432472724948, + "learning_rate": 3.607408555486671e-07, + "loss": 0.8948, + "step": 93100 + }, + { + "epoch": 7.21531248789182, + "grad_norm": 1.4630456107212435, + "learning_rate": 3.607796032238066e-07, + "loss": 0.9346, + "step": 93110 + }, + { + "epoch": 7.216087411368127, + "grad_norm": 1.4605588940482814, + "learning_rate": 3.608183508989461e-07, + "loss": 0.9271, + "step": 93120 + }, + { + "epoch": 7.216862334844434, + "grad_norm": 1.4525455187870506, + "learning_rate": 3.6085709857408555e-07, + "loss": 0.916, + "step": 93130 + }, + { + "epoch": 7.217637258320741, + "grad_norm": 1.400451456313422, + "learning_rate": 3.6089584624922507e-07, + "loss": 0.9037, + "step": 93140 + }, + { + "epoch": 7.2184121817970475, + "grad_norm": 1.4477304531160051, + "learning_rate": 3.6093459392436454e-07, + "loss": 0.9179, + "step": 93150 + }, + { + "epoch": 7.219187105273354, + "grad_norm": 1.4245101278711096, + "learning_rate": 3.6097334159950406e-07, + "loss": 0.9195, + "step": 93160 + }, + { + "epoch": 7.219962028749661, + "grad_norm": 1.4079340737049564, + "learning_rate": 3.6101208927464353e-07, + "loss": 0.9054, + "step": 93170 + }, + { + "epoch": 7.220736952225968, + "grad_norm": 1.528076482820635, + "learning_rate": 3.6105083694978306e-07, + "loss": 0.9418, + "step": 93180 + }, + { + "epoch": 7.221511875702275, + "grad_norm": 1.4566382112699703, + "learning_rate": 3.610895846249225e-07, + "loss": 0.9193, + "step": 93190 + }, + { + "epoch": 7.2222867991785815, + "grad_norm": 1.429878942738889, + "learning_rate": 3.61128332300062e-07, + "loss": 0.9433, + "step": 93200 + }, + { + "epoch": 7.223061722654887, + "grad_norm": 1.3582830595828115, + "learning_rate": 3.611670799752015e-07, + "loss": 0.9287, + "step": 93210 + }, + { + "epoch": 7.223836646131194, + "grad_norm": 1.4406058945863445, + "learning_rate": 3.61205827650341e-07, + "loss": 0.9136, + "step": 93220 + }, + { + "epoch": 7.224611569607501, + "grad_norm": 1.536124061860983, + "learning_rate": 3.612445753254805e-07, + "loss": 0.9366, + "step": 93230 + }, + { + "epoch": 7.225386493083808, + "grad_norm": 1.3973253118443705, + "learning_rate": 3.6128332300062e-07, + "loss": 0.9384, + "step": 93240 + }, + { + "epoch": 7.226161416560115, + "grad_norm": 1.478153132158645, + "learning_rate": 3.613220706757595e-07, + "loss": 0.9309, + "step": 93250 + }, + { + "epoch": 7.2269363400364215, + "grad_norm": 1.398918723039517, + "learning_rate": 3.6136081835089897e-07, + "loss": 0.9126, + "step": 93260 + }, + { + "epoch": 7.227711263512728, + "grad_norm": 1.4874431939493213, + "learning_rate": 3.6139956602603844e-07, + "loss": 0.9036, + "step": 93270 + }, + { + "epoch": 7.228486186989035, + "grad_norm": 1.4968519234073714, + "learning_rate": 3.6143831370117796e-07, + "loss": 0.9059, + "step": 93280 + }, + { + "epoch": 7.229261110465342, + "grad_norm": 1.3995845135532887, + "learning_rate": 3.6147706137631743e-07, + "loss": 0.9132, + "step": 93290 + }, + { + "epoch": 7.230036033941648, + "grad_norm": 1.478408577832552, + "learning_rate": 3.6151580905145695e-07, + "loss": 0.9315, + "step": 93300 + }, + { + "epoch": 7.230810957417955, + "grad_norm": 1.372479382247515, + "learning_rate": 3.615545567265964e-07, + "loss": 0.9224, + "step": 93310 + }, + { + "epoch": 7.2315858808942615, + "grad_norm": 1.3297909904411798, + "learning_rate": 3.6159330440173594e-07, + "loss": 0.9292, + "step": 93320 + }, + { + "epoch": 7.232360804370568, + "grad_norm": 1.354156365214573, + "learning_rate": 3.616320520768754e-07, + "loss": 0.9129, + "step": 93330 + }, + { + "epoch": 7.233135727846875, + "grad_norm": 1.4439035684003654, + "learning_rate": 3.616707997520149e-07, + "loss": 0.9244, + "step": 93340 + }, + { + "epoch": 7.233910651323182, + "grad_norm": 1.3885318230439767, + "learning_rate": 3.617095474271544e-07, + "loss": 0.9193, + "step": 93350 + }, + { + "epoch": 7.234685574799489, + "grad_norm": 1.309435128270209, + "learning_rate": 3.6174829510229387e-07, + "loss": 0.9137, + "step": 93360 + }, + { + "epoch": 7.2354604982757955, + "grad_norm": 1.501088173936756, + "learning_rate": 3.617870427774334e-07, + "loss": 0.9275, + "step": 93370 + }, + { + "epoch": 7.236235421752102, + "grad_norm": 1.3734978726933131, + "learning_rate": 3.6182579045257286e-07, + "loss": 0.9292, + "step": 93380 + }, + { + "epoch": 7.237010345228409, + "grad_norm": 1.412893762023084, + "learning_rate": 3.618645381277124e-07, + "loss": 0.9327, + "step": 93390 + }, + { + "epoch": 7.237785268704715, + "grad_norm": 1.4431011240429459, + "learning_rate": 3.6190328580285185e-07, + "loss": 0.901, + "step": 93400 + }, + { + "epoch": 7.238560192181022, + "grad_norm": 1.3683590664036978, + "learning_rate": 3.619420334779913e-07, + "loss": 0.9299, + "step": 93410 + }, + { + "epoch": 7.239335115657329, + "grad_norm": 1.447274570907586, + "learning_rate": 3.6198078115313084e-07, + "loss": 0.9317, + "step": 93420 + }, + { + "epoch": 7.2401100391336355, + "grad_norm": 1.4696818435036616, + "learning_rate": 3.620195288282703e-07, + "loss": 0.9299, + "step": 93430 + }, + { + "epoch": 7.240884962609942, + "grad_norm": 1.4504482668054366, + "learning_rate": 3.6205827650340983e-07, + "loss": 0.8958, + "step": 93440 + }, + { + "epoch": 7.241659886086249, + "grad_norm": 1.491041704101943, + "learning_rate": 3.620970241785493e-07, + "loss": 0.9316, + "step": 93450 + }, + { + "epoch": 7.242434809562556, + "grad_norm": 1.5033796877780243, + "learning_rate": 3.621357718536888e-07, + "loss": 0.9196, + "step": 93460 + }, + { + "epoch": 7.243209733038863, + "grad_norm": 1.4581401081892773, + "learning_rate": 3.621745195288283e-07, + "loss": 0.9212, + "step": 93470 + }, + { + "epoch": 7.2439846565151695, + "grad_norm": 1.3439447527306083, + "learning_rate": 3.6221326720396776e-07, + "loss": 0.9186, + "step": 93480 + }, + { + "epoch": 7.2447595799914755, + "grad_norm": 1.3734872429675677, + "learning_rate": 3.622520148791073e-07, + "loss": 0.903, + "step": 93490 + }, + { + "epoch": 7.245534503467782, + "grad_norm": 1.5075099153929208, + "learning_rate": 3.6229076255424675e-07, + "loss": 0.9266, + "step": 93500 + }, + { + "epoch": 7.245534503467782, + "eval_loss": 0.928355872631073, + "eval_runtime": 331.1877, + "eval_samples_per_second": 34.636, + "eval_steps_per_second": 8.66, + "step": 93500 + }, + { + "epoch": 7.246309426944089, + "grad_norm": 1.4344805738757018, + "learning_rate": 3.623295102293863e-07, + "loss": 0.9048, + "step": 93510 + }, + { + "epoch": 7.247084350420396, + "grad_norm": 1.42955281421723, + "learning_rate": 3.6236825790452574e-07, + "loss": 0.9029, + "step": 93520 + }, + { + "epoch": 7.247859273896703, + "grad_norm": 1.423534834881543, + "learning_rate": 3.624070055796652e-07, + "loss": 0.9134, + "step": 93530 + }, + { + "epoch": 7.2486341973730095, + "grad_norm": 1.392214148475707, + "learning_rate": 3.6244575325480473e-07, + "loss": 0.9074, + "step": 93540 + }, + { + "epoch": 7.249409120849316, + "grad_norm": 1.4607988277196289, + "learning_rate": 3.624845009299442e-07, + "loss": 0.9193, + "step": 93550 + }, + { + "epoch": 7.250184044325623, + "grad_norm": 1.3712600346648003, + "learning_rate": 3.625232486050837e-07, + "loss": 0.9238, + "step": 93560 + }, + { + "epoch": 7.25095896780193, + "grad_norm": 1.3869682923589757, + "learning_rate": 3.625619962802232e-07, + "loss": 0.9355, + "step": 93570 + }, + { + "epoch": 7.251733891278236, + "grad_norm": 1.4042771510353418, + "learning_rate": 3.626007439553627e-07, + "loss": 0.9128, + "step": 93580 + }, + { + "epoch": 7.252508814754543, + "grad_norm": 1.392918411393212, + "learning_rate": 3.626394916305022e-07, + "loss": 0.9173, + "step": 93590 + }, + { + "epoch": 7.2532837382308495, + "grad_norm": 1.3655022063113116, + "learning_rate": 3.6267823930564166e-07, + "loss": 0.9144, + "step": 93600 + }, + { + "epoch": 7.254058661707156, + "grad_norm": 1.4414075739438397, + "learning_rate": 3.627169869807812e-07, + "loss": 0.9201, + "step": 93610 + }, + { + "epoch": 7.254833585183463, + "grad_norm": 1.427363005254532, + "learning_rate": 3.6275573465592065e-07, + "loss": 0.9237, + "step": 93620 + }, + { + "epoch": 7.25560850865977, + "grad_norm": 1.3609648274655959, + "learning_rate": 3.6279448233106017e-07, + "loss": 0.9221, + "step": 93630 + }, + { + "epoch": 7.256383432136077, + "grad_norm": 1.3710737986303978, + "learning_rate": 3.6283323000619964e-07, + "loss": 0.9113, + "step": 93640 + }, + { + "epoch": 7.2571583556123835, + "grad_norm": 1.4894969584290847, + "learning_rate": 3.6287197768133916e-07, + "loss": 0.9295, + "step": 93650 + }, + { + "epoch": 7.25793327908869, + "grad_norm": 1.3328770079846688, + "learning_rate": 3.6291072535647863e-07, + "loss": 0.9252, + "step": 93660 + }, + { + "epoch": 7.258708202564996, + "grad_norm": 1.4158190355173683, + "learning_rate": 3.629494730316181e-07, + "loss": 0.9058, + "step": 93670 + }, + { + "epoch": 7.259483126041303, + "grad_norm": 1.335796632873897, + "learning_rate": 3.629882207067576e-07, + "loss": 0.9196, + "step": 93680 + }, + { + "epoch": 7.26025804951761, + "grad_norm": 1.4128137349619672, + "learning_rate": 3.630269683818971e-07, + "loss": 0.9195, + "step": 93690 + }, + { + "epoch": 7.261032972993917, + "grad_norm": 1.4647447501228976, + "learning_rate": 3.630657160570366e-07, + "loss": 0.9312, + "step": 93700 + }, + { + "epoch": 7.2618078964702235, + "grad_norm": 1.4038183786371892, + "learning_rate": 3.631044637321761e-07, + "loss": 0.9358, + "step": 93710 + }, + { + "epoch": 7.26258281994653, + "grad_norm": 1.3687489731984506, + "learning_rate": 3.631432114073156e-07, + "loss": 0.914, + "step": 93720 + }, + { + "epoch": 7.263357743422837, + "grad_norm": 1.468423482006845, + "learning_rate": 3.6318195908245507e-07, + "loss": 0.9128, + "step": 93730 + }, + { + "epoch": 7.264132666899144, + "grad_norm": 1.4496659041234687, + "learning_rate": 3.6322070675759454e-07, + "loss": 0.9196, + "step": 93740 + }, + { + "epoch": 7.264907590375451, + "grad_norm": 1.3226562340202945, + "learning_rate": 3.6325945443273406e-07, + "loss": 0.9009, + "step": 93750 + }, + { + "epoch": 7.265682513851758, + "grad_norm": 1.4340092083697986, + "learning_rate": 3.6329820210787353e-07, + "loss": 0.9181, + "step": 93760 + }, + { + "epoch": 7.2664574373280635, + "grad_norm": 1.3748852086918475, + "learning_rate": 3.6333694978301305e-07, + "loss": 0.9146, + "step": 93770 + }, + { + "epoch": 7.26723236080437, + "grad_norm": 1.5119565557725627, + "learning_rate": 3.633756974581525e-07, + "loss": 0.9367, + "step": 93780 + }, + { + "epoch": 7.268007284280677, + "grad_norm": 1.389082838480025, + "learning_rate": 3.6341444513329204e-07, + "loss": 0.9142, + "step": 93790 + }, + { + "epoch": 7.268782207756984, + "grad_norm": 1.3880865067712231, + "learning_rate": 3.634531928084315e-07, + "loss": 0.9274, + "step": 93800 + }, + { + "epoch": 7.269557131233291, + "grad_norm": 1.3836331044065273, + "learning_rate": 3.63491940483571e-07, + "loss": 0.9304, + "step": 93810 + }, + { + "epoch": 7.2703320547095975, + "grad_norm": 1.3672621703656545, + "learning_rate": 3.635306881587105e-07, + "loss": 0.9299, + "step": 93820 + }, + { + "epoch": 7.271106978185904, + "grad_norm": 1.4163903491987428, + "learning_rate": 3.6356943583384997e-07, + "loss": 0.9137, + "step": 93830 + }, + { + "epoch": 7.271881901662211, + "grad_norm": 1.4537305146258446, + "learning_rate": 3.636081835089895e-07, + "loss": 0.9117, + "step": 93840 + }, + { + "epoch": 7.272656825138518, + "grad_norm": 1.4745800574572587, + "learning_rate": 3.6364693118412896e-07, + "loss": 0.9258, + "step": 93850 + }, + { + "epoch": 7.273431748614824, + "grad_norm": 1.427449438331265, + "learning_rate": 3.636856788592685e-07, + "loss": 0.9152, + "step": 93860 + }, + { + "epoch": 7.274206672091131, + "grad_norm": 1.3799544615293364, + "learning_rate": 3.6372442653440795e-07, + "loss": 0.9247, + "step": 93870 + }, + { + "epoch": 7.2749815955674375, + "grad_norm": 1.3943720049603787, + "learning_rate": 3.637631742095474e-07, + "loss": 0.9064, + "step": 93880 + }, + { + "epoch": 7.275756519043744, + "grad_norm": 1.4679240436110146, + "learning_rate": 3.6380192188468695e-07, + "loss": 0.9386, + "step": 93890 + }, + { + "epoch": 7.276531442520051, + "grad_norm": 1.4658894546279069, + "learning_rate": 3.638406695598264e-07, + "loss": 0.9264, + "step": 93900 + }, + { + "epoch": 7.277306365996358, + "grad_norm": 1.4207367510605795, + "learning_rate": 3.6387941723496594e-07, + "loss": 0.946, + "step": 93910 + }, + { + "epoch": 7.278081289472665, + "grad_norm": 1.3482996047484026, + "learning_rate": 3.639181649101054e-07, + "loss": 0.8984, + "step": 93920 + }, + { + "epoch": 7.278856212948972, + "grad_norm": 1.392344106703212, + "learning_rate": 3.6395691258524493e-07, + "loss": 0.9563, + "step": 93930 + }, + { + "epoch": 7.279631136425278, + "grad_norm": 1.336194327873086, + "learning_rate": 3.639956602603844e-07, + "loss": 0.9298, + "step": 93940 + }, + { + "epoch": 7.280406059901585, + "grad_norm": 1.4743450786033627, + "learning_rate": 3.6403440793552387e-07, + "loss": 0.9, + "step": 93950 + }, + { + "epoch": 7.281180983377891, + "grad_norm": 1.4355578506285827, + "learning_rate": 3.640731556106634e-07, + "loss": 0.9186, + "step": 93960 + }, + { + "epoch": 7.281955906854198, + "grad_norm": 1.4019633567086676, + "learning_rate": 3.6411190328580286e-07, + "loss": 0.9025, + "step": 93970 + }, + { + "epoch": 7.282730830330505, + "grad_norm": 1.390815916177662, + "learning_rate": 3.641506509609424e-07, + "loss": 0.9208, + "step": 93980 + }, + { + "epoch": 7.2835057538068115, + "grad_norm": 1.4434101652856903, + "learning_rate": 3.6418939863608185e-07, + "loss": 0.9024, + "step": 93990 + }, + { + "epoch": 7.284280677283118, + "grad_norm": 1.3890040145669211, + "learning_rate": 3.6422814631122137e-07, + "loss": 0.9195, + "step": 94000 + }, + { + "epoch": 7.284280677283118, + "eval_loss": 0.9281030297279358, + "eval_runtime": 329.849, + "eval_samples_per_second": 34.777, + "eval_steps_per_second": 8.695, + "step": 94000 + }, + { + "epoch": 7.285055600759425, + "grad_norm": 1.38687476754188, + "learning_rate": 3.6426689398636084e-07, + "loss": 0.9264, + "step": 94010 + }, + { + "epoch": 7.285830524235732, + "grad_norm": 1.4482777931461845, + "learning_rate": 3.643056416615003e-07, + "loss": 0.9199, + "step": 94020 + }, + { + "epoch": 7.286605447712039, + "grad_norm": 1.4814167031784862, + "learning_rate": 3.6434438933663983e-07, + "loss": 0.9155, + "step": 94030 + }, + { + "epoch": 7.287380371188345, + "grad_norm": 1.5199671537727586, + "learning_rate": 3.643831370117793e-07, + "loss": 0.9141, + "step": 94040 + }, + { + "epoch": 7.2881552946646515, + "grad_norm": 1.4302435101603228, + "learning_rate": 3.644218846869188e-07, + "loss": 0.9279, + "step": 94050 + }, + { + "epoch": 7.288930218140958, + "grad_norm": 1.6203190531758946, + "learning_rate": 3.644606323620583e-07, + "loss": 0.9555, + "step": 94060 + }, + { + "epoch": 7.289705141617265, + "grad_norm": 1.440458790048054, + "learning_rate": 3.644993800371978e-07, + "loss": 0.925, + "step": 94070 + }, + { + "epoch": 7.290480065093572, + "grad_norm": 1.4837373685474706, + "learning_rate": 3.645381277123373e-07, + "loss": 0.9121, + "step": 94080 + }, + { + "epoch": 7.291254988569879, + "grad_norm": 1.3778413448623283, + "learning_rate": 3.6457687538747675e-07, + "loss": 0.9324, + "step": 94090 + }, + { + "epoch": 7.292029912046186, + "grad_norm": 1.45597395903262, + "learning_rate": 3.6461562306261627e-07, + "loss": 0.9538, + "step": 94100 + }, + { + "epoch": 7.292804835522492, + "grad_norm": 1.369240265687564, + "learning_rate": 3.6465437073775574e-07, + "loss": 0.9507, + "step": 94110 + }, + { + "epoch": 7.293579758998799, + "grad_norm": 1.4524729228678017, + "learning_rate": 3.6469311841289526e-07, + "loss": 0.9379, + "step": 94120 + }, + { + "epoch": 7.294354682475106, + "grad_norm": 1.5308310445681512, + "learning_rate": 3.6473186608803473e-07, + "loss": 0.9109, + "step": 94130 + }, + { + "epoch": 7.295129605951412, + "grad_norm": 1.4150788293606729, + "learning_rate": 3.6477061376317425e-07, + "loss": 0.9096, + "step": 94140 + }, + { + "epoch": 7.295904529427719, + "grad_norm": 1.3807637149601912, + "learning_rate": 3.648093614383137e-07, + "loss": 0.9287, + "step": 94150 + }, + { + "epoch": 7.2966794529040255, + "grad_norm": 1.4231504454509945, + "learning_rate": 3.648481091134532e-07, + "loss": 0.9294, + "step": 94160 + }, + { + "epoch": 7.297454376380332, + "grad_norm": 1.4678553412116697, + "learning_rate": 3.648868567885927e-07, + "loss": 0.9289, + "step": 94170 + }, + { + "epoch": 7.298229299856639, + "grad_norm": 1.4141658073100016, + "learning_rate": 3.649256044637322e-07, + "loss": 0.9281, + "step": 94180 + }, + { + "epoch": 7.299004223332946, + "grad_norm": 1.418586378170272, + "learning_rate": 3.649643521388717e-07, + "loss": 0.9065, + "step": 94190 + }, + { + "epoch": 7.299779146809253, + "grad_norm": 1.6455240832719886, + "learning_rate": 3.650030998140112e-07, + "loss": 0.9294, + "step": 94200 + }, + { + "epoch": 7.30055407028556, + "grad_norm": 1.4174077058549954, + "learning_rate": 3.6504184748915064e-07, + "loss": 0.9151, + "step": 94210 + }, + { + "epoch": 7.301328993761866, + "grad_norm": 1.3580305143579188, + "learning_rate": 3.6508059516429016e-07, + "loss": 0.925, + "step": 94220 + }, + { + "epoch": 7.302103917238172, + "grad_norm": 1.3649351378237293, + "learning_rate": 3.6511934283942963e-07, + "loss": 0.9242, + "step": 94230 + }, + { + "epoch": 7.302878840714479, + "grad_norm": 1.4188226520843257, + "learning_rate": 3.6515809051456916e-07, + "loss": 0.9158, + "step": 94240 + }, + { + "epoch": 7.303653764190786, + "grad_norm": 1.3830957290226291, + "learning_rate": 3.651968381897086e-07, + "loss": 0.9369, + "step": 94250 + }, + { + "epoch": 7.304428687667093, + "grad_norm": 1.4721527972880246, + "learning_rate": 3.6523558586484815e-07, + "loss": 0.9196, + "step": 94260 + }, + { + "epoch": 7.3052036111433996, + "grad_norm": 1.4002133746970493, + "learning_rate": 3.652743335399876e-07, + "loss": 0.9195, + "step": 94270 + }, + { + "epoch": 7.305978534619706, + "grad_norm": 1.434589435775023, + "learning_rate": 3.653130812151271e-07, + "loss": 0.9257, + "step": 94280 + }, + { + "epoch": 7.306753458096013, + "grad_norm": 1.4579127973786052, + "learning_rate": 3.653518288902666e-07, + "loss": 0.9141, + "step": 94290 + }, + { + "epoch": 7.30752838157232, + "grad_norm": 1.5074097241730045, + "learning_rate": 3.653905765654061e-07, + "loss": 0.9269, + "step": 94300 + }, + { + "epoch": 7.308303305048627, + "grad_norm": 1.4487513492741402, + "learning_rate": 3.654293242405456e-07, + "loss": 0.9559, + "step": 94310 + }, + { + "epoch": 7.309078228524934, + "grad_norm": 1.4693360305490524, + "learning_rate": 3.6546807191568507e-07, + "loss": 0.9132, + "step": 94320 + }, + { + "epoch": 7.3098531520012395, + "grad_norm": 1.4379984147087284, + "learning_rate": 3.655068195908246e-07, + "loss": 0.9304, + "step": 94330 + }, + { + "epoch": 7.310628075477546, + "grad_norm": 1.4647502124522946, + "learning_rate": 3.6554556726596406e-07, + "loss": 0.9263, + "step": 94340 + }, + { + "epoch": 7.311402998953853, + "grad_norm": 1.3672468221236516, + "learning_rate": 3.6558431494110353e-07, + "loss": 0.9111, + "step": 94350 + }, + { + "epoch": 7.31217792243016, + "grad_norm": 1.5131830120657477, + "learning_rate": 3.6562306261624305e-07, + "loss": 0.9083, + "step": 94360 + }, + { + "epoch": 7.312952845906467, + "grad_norm": 1.397942989594124, + "learning_rate": 3.656618102913825e-07, + "loss": 0.9345, + "step": 94370 + }, + { + "epoch": 7.313727769382774, + "grad_norm": 1.4805364160738672, + "learning_rate": 3.6570055796652204e-07, + "loss": 0.951, + "step": 94380 + }, + { + "epoch": 7.31450269285908, + "grad_norm": 1.4686441568057345, + "learning_rate": 3.657393056416615e-07, + "loss": 0.9346, + "step": 94390 + }, + { + "epoch": 7.315277616335387, + "grad_norm": 1.4032455229365246, + "learning_rate": 3.6577805331680103e-07, + "loss": 0.9232, + "step": 94400 + }, + { + "epoch": 7.316052539811694, + "grad_norm": 1.4048681024727185, + "learning_rate": 3.658168009919405e-07, + "loss": 0.9246, + "step": 94410 + }, + { + "epoch": 7.316827463288, + "grad_norm": 1.3987204772346458, + "learning_rate": 3.6585554866707997e-07, + "loss": 0.9184, + "step": 94420 + }, + { + "epoch": 7.317602386764307, + "grad_norm": 1.4059925652644782, + "learning_rate": 3.658942963422195e-07, + "loss": 0.9329, + "step": 94430 + }, + { + "epoch": 7.3183773102406136, + "grad_norm": 1.4734004770217393, + "learning_rate": 3.6593304401735896e-07, + "loss": 0.949, + "step": 94440 + }, + { + "epoch": 7.31915223371692, + "grad_norm": 1.3501392930723968, + "learning_rate": 3.659717916924985e-07, + "loss": 0.9272, + "step": 94450 + }, + { + "epoch": 7.319927157193227, + "grad_norm": 1.3884774517250964, + "learning_rate": 3.6601053936763795e-07, + "loss": 0.9532, + "step": 94460 + }, + { + "epoch": 7.320702080669534, + "grad_norm": 1.5212999530162292, + "learning_rate": 3.6604928704277747e-07, + "loss": 0.9174, + "step": 94470 + }, + { + "epoch": 7.321477004145841, + "grad_norm": 1.4360092023902244, + "learning_rate": 3.6608803471791694e-07, + "loss": 0.9156, + "step": 94480 + }, + { + "epoch": 7.322251927622148, + "grad_norm": 1.4215231389783753, + "learning_rate": 3.661267823930564e-07, + "loss": 0.9215, + "step": 94490 + }, + { + "epoch": 7.323026851098454, + "grad_norm": 1.267752453598864, + "learning_rate": 3.6616553006819593e-07, + "loss": 0.9146, + "step": 94500 + }, + { + "epoch": 7.323026851098454, + "eval_loss": 0.9276667833328247, + "eval_runtime": 329.5296, + "eval_samples_per_second": 34.81, + "eval_steps_per_second": 8.703, + "step": 94500 + }, + { + "epoch": 7.32380177457476, + "grad_norm": 1.5444030454514985, + "learning_rate": 3.662042777433354e-07, + "loss": 0.9119, + "step": 94510 + }, + { + "epoch": 7.324576698051067, + "grad_norm": 1.4723067196527961, + "learning_rate": 3.662430254184749e-07, + "loss": 0.9151, + "step": 94520 + }, + { + "epoch": 7.325351621527374, + "grad_norm": 1.4876456636040911, + "learning_rate": 3.662817730936144e-07, + "loss": 0.922, + "step": 94530 + }, + { + "epoch": 7.326126545003681, + "grad_norm": 1.3975133165332627, + "learning_rate": 3.663205207687539e-07, + "loss": 0.9063, + "step": 94540 + }, + { + "epoch": 7.326901468479988, + "grad_norm": 1.4887376302802606, + "learning_rate": 3.663592684438934e-07, + "loss": 0.9309, + "step": 94550 + }, + { + "epoch": 7.327676391956294, + "grad_norm": 1.4449865240617532, + "learning_rate": 3.6639801611903285e-07, + "loss": 0.8939, + "step": 94560 + }, + { + "epoch": 7.328451315432601, + "grad_norm": 1.403368627684336, + "learning_rate": 3.664367637941724e-07, + "loss": 0.9007, + "step": 94570 + }, + { + "epoch": 7.329226238908908, + "grad_norm": 1.3653912350540662, + "learning_rate": 3.6647551146931184e-07, + "loss": 0.9106, + "step": 94580 + }, + { + "epoch": 7.330001162385215, + "grad_norm": 1.4437724553870994, + "learning_rate": 3.6651425914445137e-07, + "loss": 0.9245, + "step": 94590 + }, + { + "epoch": 7.330776085861521, + "grad_norm": 1.3427298007073605, + "learning_rate": 3.6655300681959084e-07, + "loss": 0.9095, + "step": 94600 + }, + { + "epoch": 7.3315510093378276, + "grad_norm": 1.4207949977981489, + "learning_rate": 3.6659175449473036e-07, + "loss": 0.9321, + "step": 94610 + }, + { + "epoch": 7.332325932814134, + "grad_norm": 1.320282087146438, + "learning_rate": 3.666305021698698e-07, + "loss": 0.9313, + "step": 94620 + }, + { + "epoch": 7.333100856290441, + "grad_norm": 1.525648631753782, + "learning_rate": 3.666692498450093e-07, + "loss": 0.9453, + "step": 94630 + }, + { + "epoch": 7.333875779766748, + "grad_norm": 1.47491875198302, + "learning_rate": 3.667079975201488e-07, + "loss": 0.909, + "step": 94640 + }, + { + "epoch": 7.334650703243055, + "grad_norm": 1.4410353358616823, + "learning_rate": 3.667467451952883e-07, + "loss": 0.9084, + "step": 94650 + }, + { + "epoch": 7.335425626719362, + "grad_norm": 1.5089476611667052, + "learning_rate": 3.667854928704278e-07, + "loss": 0.9326, + "step": 94660 + }, + { + "epoch": 7.336200550195668, + "grad_norm": 1.417225659120552, + "learning_rate": 3.668242405455673e-07, + "loss": 0.8986, + "step": 94670 + }, + { + "epoch": 7.336975473671975, + "grad_norm": 1.473633881492033, + "learning_rate": 3.668629882207068e-07, + "loss": 0.9384, + "step": 94680 + }, + { + "epoch": 7.337750397148282, + "grad_norm": 1.3645298827279269, + "learning_rate": 3.6690173589584627e-07, + "loss": 0.9183, + "step": 94690 + }, + { + "epoch": 7.338525320624588, + "grad_norm": 1.4738301925027821, + "learning_rate": 3.6694048357098574e-07, + "loss": 0.9171, + "step": 94700 + }, + { + "epoch": 7.339300244100895, + "grad_norm": 1.4688748202531137, + "learning_rate": 3.6697923124612526e-07, + "loss": 0.9122, + "step": 94710 + }, + { + "epoch": 7.340075167577202, + "grad_norm": 1.65170026249568, + "learning_rate": 3.6701797892126473e-07, + "loss": 0.9624, + "step": 94720 + }, + { + "epoch": 7.340850091053508, + "grad_norm": 1.3411052100269099, + "learning_rate": 3.6705672659640425e-07, + "loss": 0.9092, + "step": 94730 + }, + { + "epoch": 7.341625014529815, + "grad_norm": 1.3801980837790306, + "learning_rate": 3.670954742715437e-07, + "loss": 0.9191, + "step": 94740 + }, + { + "epoch": 7.342399938006122, + "grad_norm": 1.4303995746898024, + "learning_rate": 3.6713422194668324e-07, + "loss": 0.914, + "step": 94750 + }, + { + "epoch": 7.343174861482429, + "grad_norm": 1.3924155815160764, + "learning_rate": 3.671729696218227e-07, + "loss": 0.9229, + "step": 94760 + }, + { + "epoch": 7.343949784958736, + "grad_norm": 1.431123681261732, + "learning_rate": 3.672117172969622e-07, + "loss": 0.9155, + "step": 94770 + }, + { + "epoch": 7.344724708435042, + "grad_norm": 1.4040870893497748, + "learning_rate": 3.672504649721017e-07, + "loss": 0.9213, + "step": 94780 + }, + { + "epoch": 7.345499631911348, + "grad_norm": 1.4112590612943516, + "learning_rate": 3.6728921264724117e-07, + "loss": 0.9145, + "step": 94790 + }, + { + "epoch": 7.346274555387655, + "grad_norm": 1.354170749444122, + "learning_rate": 3.673279603223807e-07, + "loss": 0.9094, + "step": 94800 + }, + { + "epoch": 7.347049478863962, + "grad_norm": 1.3738722997255919, + "learning_rate": 3.6736670799752016e-07, + "loss": 0.9088, + "step": 94810 + }, + { + "epoch": 7.347824402340269, + "grad_norm": 1.6224311532299651, + "learning_rate": 3.674054556726597e-07, + "loss": 0.9426, + "step": 94820 + }, + { + "epoch": 7.348599325816576, + "grad_norm": 1.3929084558522977, + "learning_rate": 3.6744420334779915e-07, + "loss": 0.93, + "step": 94830 + }, + { + "epoch": 7.349374249292882, + "grad_norm": 1.4289590261768008, + "learning_rate": 3.674829510229386e-07, + "loss": 0.9204, + "step": 94840 + }, + { + "epoch": 7.350149172769189, + "grad_norm": 1.3515995172257924, + "learning_rate": 3.6752169869807814e-07, + "loss": 0.919, + "step": 94850 + }, + { + "epoch": 7.350924096245496, + "grad_norm": 1.4212368422367267, + "learning_rate": 3.675604463732176e-07, + "loss": 0.9268, + "step": 94860 + }, + { + "epoch": 7.351699019721803, + "grad_norm": 1.4799408945728383, + "learning_rate": 3.6759919404835713e-07, + "loss": 0.9157, + "step": 94870 + }, + { + "epoch": 7.35247394319811, + "grad_norm": 1.474821912883223, + "learning_rate": 3.676379417234966e-07, + "loss": 0.9373, + "step": 94880 + }, + { + "epoch": 7.353248866674416, + "grad_norm": 1.4093269181379386, + "learning_rate": 3.676766893986361e-07, + "loss": 0.9227, + "step": 94890 + }, + { + "epoch": 7.354023790150722, + "grad_norm": 1.3991777362566664, + "learning_rate": 3.677154370737756e-07, + "loss": 0.9281, + "step": 94900 + }, + { + "epoch": 7.354798713627029, + "grad_norm": 1.5368158791653772, + "learning_rate": 3.6775418474891506e-07, + "loss": 0.9769, + "step": 94910 + }, + { + "epoch": 7.355573637103336, + "grad_norm": 1.4090080639428064, + "learning_rate": 3.677929324240546e-07, + "loss": 0.9042, + "step": 94920 + }, + { + "epoch": 7.356348560579643, + "grad_norm": 1.3512639849871195, + "learning_rate": 3.6783168009919405e-07, + "loss": 0.9131, + "step": 94930 + }, + { + "epoch": 7.35712348405595, + "grad_norm": 1.3157348832873708, + "learning_rate": 3.678704277743336e-07, + "loss": 0.9312, + "step": 94940 + }, + { + "epoch": 7.357898407532256, + "grad_norm": 1.3388630555926118, + "learning_rate": 3.6790917544947305e-07, + "loss": 0.9076, + "step": 94950 + }, + { + "epoch": 7.358673331008563, + "grad_norm": 1.3761995283271757, + "learning_rate": 3.679479231246125e-07, + "loss": 0.9166, + "step": 94960 + }, + { + "epoch": 7.359448254484869, + "grad_norm": 1.4648744194060601, + "learning_rate": 3.6798667079975204e-07, + "loss": 0.9331, + "step": 94970 + }, + { + "epoch": 7.360223177961176, + "grad_norm": 1.4514178894110634, + "learning_rate": 3.680254184748915e-07, + "loss": 0.916, + "step": 94980 + }, + { + "epoch": 7.360998101437483, + "grad_norm": 1.4775613642301075, + "learning_rate": 3.6806416615003103e-07, + "loss": 0.8997, + "step": 94990 + }, + { + "epoch": 7.36177302491379, + "grad_norm": 1.3792301060898566, + "learning_rate": 3.681029138251705e-07, + "loss": 0.9219, + "step": 95000 + }, + { + "epoch": 7.36177302491379, + "eval_loss": 0.9273276329040527, + "eval_runtime": 332.3484, + "eval_samples_per_second": 34.515, + "eval_steps_per_second": 8.629, + "step": 95000 + }, + { + "epoch": 7.362547948390096, + "grad_norm": 1.4788890305813005, + "learning_rate": 3.6814166150031e-07, + "loss": 0.9035, + "step": 95010 + }, + { + "epoch": 7.363322871866403, + "grad_norm": 1.4385201747154397, + "learning_rate": 3.681804091754495e-07, + "loss": 0.9296, + "step": 95020 + }, + { + "epoch": 7.36409779534271, + "grad_norm": 1.4227118164140635, + "learning_rate": 3.6821915685058896e-07, + "loss": 0.9101, + "step": 95030 + }, + { + "epoch": 7.364872718819017, + "grad_norm": 1.4358006909915135, + "learning_rate": 3.682579045257285e-07, + "loss": 0.905, + "step": 95040 + }, + { + "epoch": 7.365647642295324, + "grad_norm": 1.437860122176234, + "learning_rate": 3.6829665220086795e-07, + "loss": 0.9224, + "step": 95050 + }, + { + "epoch": 7.3664225657716305, + "grad_norm": 1.4986251014644174, + "learning_rate": 3.6833539987600747e-07, + "loss": 0.926, + "step": 95060 + }, + { + "epoch": 7.367197489247936, + "grad_norm": 1.4308827985650112, + "learning_rate": 3.6837414755114694e-07, + "loss": 0.9327, + "step": 95070 + }, + { + "epoch": 7.367972412724243, + "grad_norm": 1.4698767355724376, + "learning_rate": 3.6841289522628646e-07, + "loss": 0.9266, + "step": 95080 + }, + { + "epoch": 7.36874733620055, + "grad_norm": 1.3736271153964204, + "learning_rate": 3.6845164290142593e-07, + "loss": 0.89, + "step": 95090 + }, + { + "epoch": 7.369522259676857, + "grad_norm": 1.5147174865024353, + "learning_rate": 3.684903905765654e-07, + "loss": 0.9151, + "step": 95100 + }, + { + "epoch": 7.370297183153164, + "grad_norm": 1.3696787298745479, + "learning_rate": 3.685291382517049e-07, + "loss": 0.9301, + "step": 95110 + }, + { + "epoch": 7.37107210662947, + "grad_norm": 1.404901057005184, + "learning_rate": 3.685678859268444e-07, + "loss": 0.9349, + "step": 95120 + }, + { + "epoch": 7.371847030105777, + "grad_norm": 1.3981892186935023, + "learning_rate": 3.686066336019839e-07, + "loss": 0.9198, + "step": 95130 + }, + { + "epoch": 7.372621953582084, + "grad_norm": 1.3437925673355955, + "learning_rate": 3.686453812771234e-07, + "loss": 0.925, + "step": 95140 + }, + { + "epoch": 7.373396877058391, + "grad_norm": 1.52088161579201, + "learning_rate": 3.686841289522629e-07, + "loss": 0.9095, + "step": 95150 + }, + { + "epoch": 7.374171800534697, + "grad_norm": 1.4993278169943824, + "learning_rate": 3.6872287662740237e-07, + "loss": 0.9154, + "step": 95160 + }, + { + "epoch": 7.374946724011004, + "grad_norm": 1.4697934468206437, + "learning_rate": 3.6876162430254184e-07, + "loss": 0.9389, + "step": 95170 + }, + { + "epoch": 7.37572164748731, + "grad_norm": 1.5109970582999153, + "learning_rate": 3.6880037197768136e-07, + "loss": 0.9186, + "step": 95180 + }, + { + "epoch": 7.376496570963617, + "grad_norm": 1.468310417426922, + "learning_rate": 3.6883911965282083e-07, + "loss": 0.9396, + "step": 95190 + }, + { + "epoch": 7.377271494439924, + "grad_norm": 1.4502567198452008, + "learning_rate": 3.6887786732796035e-07, + "loss": 0.9283, + "step": 95200 + }, + { + "epoch": 7.378046417916231, + "grad_norm": 1.567521845234971, + "learning_rate": 3.689166150030998e-07, + "loss": 0.9037, + "step": 95210 + }, + { + "epoch": 7.378821341392538, + "grad_norm": 1.3672967790573696, + "learning_rate": 3.6895536267823935e-07, + "loss": 0.9112, + "step": 95220 + }, + { + "epoch": 7.3795962648688445, + "grad_norm": 1.4901762192248653, + "learning_rate": 3.689941103533788e-07, + "loss": 0.9287, + "step": 95230 + }, + { + "epoch": 7.380371188345151, + "grad_norm": 1.3967220403320058, + "learning_rate": 3.690328580285183e-07, + "loss": 0.92, + "step": 95240 + }, + { + "epoch": 7.381146111821458, + "grad_norm": 1.4307287195338116, + "learning_rate": 3.690716057036578e-07, + "loss": 0.9145, + "step": 95250 + }, + { + "epoch": 7.381921035297764, + "grad_norm": 1.3684950011120993, + "learning_rate": 3.691103533787973e-07, + "loss": 0.9255, + "step": 95260 + }, + { + "epoch": 7.382695958774071, + "grad_norm": 1.3935819310180333, + "learning_rate": 3.691491010539368e-07, + "loss": 0.9389, + "step": 95270 + }, + { + "epoch": 7.383470882250378, + "grad_norm": 1.4288523989522854, + "learning_rate": 3.6918784872907627e-07, + "loss": 0.9249, + "step": 95280 + }, + { + "epoch": 7.384245805726684, + "grad_norm": 1.3633407461751264, + "learning_rate": 3.692265964042158e-07, + "loss": 0.9204, + "step": 95290 + }, + { + "epoch": 7.385020729202991, + "grad_norm": 1.4624729505314997, + "learning_rate": 3.6926534407935526e-07, + "loss": 0.9226, + "step": 95300 + }, + { + "epoch": 7.385795652679298, + "grad_norm": 1.4158895537496776, + "learning_rate": 3.693040917544947e-07, + "loss": 0.9268, + "step": 95310 + }, + { + "epoch": 7.386570576155605, + "grad_norm": 1.3925829920639197, + "learning_rate": 3.6934283942963425e-07, + "loss": 0.9058, + "step": 95320 + }, + { + "epoch": 7.387345499631912, + "grad_norm": 1.4352851390275638, + "learning_rate": 3.693815871047737e-07, + "loss": 0.9293, + "step": 95330 + }, + { + "epoch": 7.388120423108218, + "grad_norm": 1.3589963218823637, + "learning_rate": 3.6942033477991324e-07, + "loss": 0.9131, + "step": 95340 + }, + { + "epoch": 7.388895346584524, + "grad_norm": 1.4732667431680433, + "learning_rate": 3.694590824550527e-07, + "loss": 0.9141, + "step": 95350 + }, + { + "epoch": 7.389670270060831, + "grad_norm": 1.4254134027508083, + "learning_rate": 3.6949783013019223e-07, + "loss": 0.9281, + "step": 95360 + }, + { + "epoch": 7.390445193537138, + "grad_norm": 1.3911022975442595, + "learning_rate": 3.695365778053317e-07, + "loss": 0.9053, + "step": 95370 + }, + { + "epoch": 7.391220117013445, + "grad_norm": 1.3976173248275205, + "learning_rate": 3.6957532548047117e-07, + "loss": 0.9002, + "step": 95380 + }, + { + "epoch": 7.391995040489752, + "grad_norm": 1.3888365937621194, + "learning_rate": 3.696140731556107e-07, + "loss": 0.9091, + "step": 95390 + }, + { + "epoch": 7.3927699639660585, + "grad_norm": 1.389978800021285, + "learning_rate": 3.6965282083075016e-07, + "loss": 0.9182, + "step": 95400 + }, + { + "epoch": 7.393544887442365, + "grad_norm": 1.464396474515483, + "learning_rate": 3.696915685058897e-07, + "loss": 0.9312, + "step": 95410 + }, + { + "epoch": 7.394319810918672, + "grad_norm": 1.4480000366921257, + "learning_rate": 3.6973031618102915e-07, + "loss": 0.9138, + "step": 95420 + }, + { + "epoch": 7.395094734394979, + "grad_norm": 1.3555707725265649, + "learning_rate": 3.6976906385616867e-07, + "loss": 0.9217, + "step": 95430 + }, + { + "epoch": 7.395869657871285, + "grad_norm": 1.4389538176978058, + "learning_rate": 3.6980781153130814e-07, + "loss": 0.9099, + "step": 95440 + }, + { + "epoch": 7.396644581347592, + "grad_norm": 1.4107998963125792, + "learning_rate": 3.698465592064476e-07, + "loss": 0.9145, + "step": 95450 + }, + { + "epoch": 7.397419504823898, + "grad_norm": 1.339932556845184, + "learning_rate": 3.6988530688158713e-07, + "loss": 0.9296, + "step": 95460 + }, + { + "epoch": 7.398194428300205, + "grad_norm": 1.3850597245443015, + "learning_rate": 3.699240545567266e-07, + "loss": 0.9216, + "step": 95470 + }, + { + "epoch": 7.398969351776512, + "grad_norm": 1.4133073343194662, + "learning_rate": 3.699628022318661e-07, + "loss": 0.9179, + "step": 95480 + }, + { + "epoch": 7.399744275252819, + "grad_norm": 1.3660935278500959, + "learning_rate": 3.700015499070056e-07, + "loss": 0.9255, + "step": 95490 + }, + { + "epoch": 7.400519198729126, + "grad_norm": 1.360277234143227, + "learning_rate": 3.700402975821451e-07, + "loss": 0.9222, + "step": 95500 + }, + { + "epoch": 7.400519198729126, + "eval_loss": 0.9268714785575867, + "eval_runtime": 335.0303, + "eval_samples_per_second": 34.239, + "eval_steps_per_second": 8.56, + "step": 95500 + }, + { + "epoch": 7.4012941222054325, + "grad_norm": 1.4160051203998996, + "learning_rate": 3.700790452572846e-07, + "loss": 0.9169, + "step": 95510 + }, + { + "epoch": 7.402069045681739, + "grad_norm": 1.4365000724290913, + "learning_rate": 3.7011779293242405e-07, + "loss": 0.9501, + "step": 95520 + }, + { + "epoch": 7.402843969158045, + "grad_norm": 1.4372535391431995, + "learning_rate": 3.701565406075636e-07, + "loss": 0.8979, + "step": 95530 + }, + { + "epoch": 7.403618892634352, + "grad_norm": 1.4245689374374912, + "learning_rate": 3.7019528828270304e-07, + "loss": 0.9272, + "step": 95540 + }, + { + "epoch": 7.404393816110659, + "grad_norm": 1.3498746786111793, + "learning_rate": 3.7023403595784256e-07, + "loss": 0.9209, + "step": 95550 + }, + { + "epoch": 7.405168739586966, + "grad_norm": 1.4714450586156065, + "learning_rate": 3.7027278363298203e-07, + "loss": 0.904, + "step": 95560 + }, + { + "epoch": 7.4059436630632725, + "grad_norm": 1.475212285347925, + "learning_rate": 3.7031153130812156e-07, + "loss": 0.9098, + "step": 95570 + }, + { + "epoch": 7.406718586539579, + "grad_norm": 1.4117186257248024, + "learning_rate": 3.70350278983261e-07, + "loss": 0.9344, + "step": 95580 + }, + { + "epoch": 7.407493510015886, + "grad_norm": 1.3928600912898466, + "learning_rate": 3.703890266584005e-07, + "loss": 0.9113, + "step": 95590 + }, + { + "epoch": 7.408268433492193, + "grad_norm": 1.342135502807894, + "learning_rate": 3.7042777433354e-07, + "loss": 0.9096, + "step": 95600 + }, + { + "epoch": 7.4090433569685, + "grad_norm": 1.3417388917501445, + "learning_rate": 3.704665220086795e-07, + "loss": 0.9177, + "step": 95610 + }, + { + "epoch": 7.4098182804448065, + "grad_norm": 1.4929250550696775, + "learning_rate": 3.70505269683819e-07, + "loss": 0.9536, + "step": 95620 + }, + { + "epoch": 7.410593203921112, + "grad_norm": 1.5172557790305499, + "learning_rate": 3.705440173589585e-07, + "loss": 0.9195, + "step": 95630 + }, + { + "epoch": 7.411368127397419, + "grad_norm": 1.3832473229973274, + "learning_rate": 3.70582765034098e-07, + "loss": 0.9247, + "step": 95640 + }, + { + "epoch": 7.412143050873726, + "grad_norm": 1.3435411758435454, + "learning_rate": 3.7062151270923747e-07, + "loss": 0.9274, + "step": 95650 + }, + { + "epoch": 7.412917974350033, + "grad_norm": 1.4385248011961966, + "learning_rate": 3.7066026038437694e-07, + "loss": 0.9397, + "step": 95660 + }, + { + "epoch": 7.41369289782634, + "grad_norm": 1.3740000739685476, + "learning_rate": 3.7069900805951646e-07, + "loss": 0.9182, + "step": 95670 + }, + { + "epoch": 7.4144678213026465, + "grad_norm": 1.4021897165700994, + "learning_rate": 3.7073775573465593e-07, + "loss": 0.9353, + "step": 95680 + }, + { + "epoch": 7.415242744778953, + "grad_norm": 1.3500847284292643, + "learning_rate": 3.7077650340979545e-07, + "loss": 0.9252, + "step": 95690 + }, + { + "epoch": 7.41601766825526, + "grad_norm": 1.437122810774689, + "learning_rate": 3.708152510849349e-07, + "loss": 0.8996, + "step": 95700 + }, + { + "epoch": 7.416792591731567, + "grad_norm": 1.353265557775667, + "learning_rate": 3.708539987600744e-07, + "loss": 0.9348, + "step": 95710 + }, + { + "epoch": 7.417567515207873, + "grad_norm": 1.3267516678607618, + "learning_rate": 3.708927464352139e-07, + "loss": 0.9078, + "step": 95720 + }, + { + "epoch": 7.41834243868418, + "grad_norm": 1.4044633195576193, + "learning_rate": 3.709314941103534e-07, + "loss": 0.9289, + "step": 95730 + }, + { + "epoch": 7.4191173621604865, + "grad_norm": 1.3353553124027873, + "learning_rate": 3.709702417854929e-07, + "loss": 0.9173, + "step": 95740 + }, + { + "epoch": 7.419892285636793, + "grad_norm": 1.4210966987427167, + "learning_rate": 3.7100898946063237e-07, + "loss": 0.9002, + "step": 95750 + }, + { + "epoch": 7.4206672091131, + "grad_norm": 1.691199271457612, + "learning_rate": 3.710477371357719e-07, + "loss": 0.915, + "step": 95760 + }, + { + "epoch": 7.421442132589407, + "grad_norm": 1.4806789306262844, + "learning_rate": 3.7108648481091136e-07, + "loss": 0.9343, + "step": 95770 + }, + { + "epoch": 7.422217056065714, + "grad_norm": 1.4134845621417735, + "learning_rate": 3.7112523248605083e-07, + "loss": 0.9288, + "step": 95780 + }, + { + "epoch": 7.4229919795420205, + "grad_norm": 1.3661664857044558, + "learning_rate": 3.7116398016119035e-07, + "loss": 0.9502, + "step": 95790 + }, + { + "epoch": 7.423766903018327, + "grad_norm": 1.3830916963505833, + "learning_rate": 3.712027278363298e-07, + "loss": 0.9111, + "step": 95800 + }, + { + "epoch": 7.424541826494634, + "grad_norm": 1.4125797817061512, + "learning_rate": 3.7124147551146934e-07, + "loss": 0.9243, + "step": 95810 + }, + { + "epoch": 7.42531674997094, + "grad_norm": 1.3748533707903443, + "learning_rate": 3.712802231866088e-07, + "loss": 0.9247, + "step": 95820 + }, + { + "epoch": 7.426091673447247, + "grad_norm": 1.4153099646442422, + "learning_rate": 3.7131897086174833e-07, + "loss": 0.9314, + "step": 95830 + }, + { + "epoch": 7.426866596923554, + "grad_norm": 1.3669762575870879, + "learning_rate": 3.713577185368878e-07, + "loss": 0.9286, + "step": 95840 + }, + { + "epoch": 7.4276415203998605, + "grad_norm": 1.451180851812576, + "learning_rate": 3.7139646621202727e-07, + "loss": 0.9264, + "step": 95850 + }, + { + "epoch": 7.428416443876167, + "grad_norm": 1.41635808936522, + "learning_rate": 3.714352138871668e-07, + "loss": 0.9277, + "step": 95860 + }, + { + "epoch": 7.429191367352474, + "grad_norm": 1.468739794578764, + "learning_rate": 3.7147396156230626e-07, + "loss": 0.9305, + "step": 95870 + }, + { + "epoch": 7.429966290828781, + "grad_norm": 1.388222281675358, + "learning_rate": 3.715127092374458e-07, + "loss": 0.9054, + "step": 95880 + }, + { + "epoch": 7.430741214305088, + "grad_norm": 1.4178476592374427, + "learning_rate": 3.7155145691258525e-07, + "loss": 0.9123, + "step": 95890 + }, + { + "epoch": 7.431516137781394, + "grad_norm": 1.419031735946374, + "learning_rate": 3.715902045877248e-07, + "loss": 0.9325, + "step": 95900 + }, + { + "epoch": 7.4322910612577004, + "grad_norm": 1.4139897098847825, + "learning_rate": 3.7162895226286424e-07, + "loss": 0.9369, + "step": 95910 + }, + { + "epoch": 7.433065984734007, + "grad_norm": 1.4587096920796243, + "learning_rate": 3.716676999380037e-07, + "loss": 0.9209, + "step": 95920 + }, + { + "epoch": 7.433840908210314, + "grad_norm": 1.4173268834602022, + "learning_rate": 3.7170644761314324e-07, + "loss": 0.9097, + "step": 95930 + }, + { + "epoch": 7.434615831686621, + "grad_norm": 1.4069959139853117, + "learning_rate": 3.717451952882827e-07, + "loss": 0.9238, + "step": 95940 + }, + { + "epoch": 7.435390755162928, + "grad_norm": 1.4777442804878074, + "learning_rate": 3.717839429634222e-07, + "loss": 0.916, + "step": 95950 + }, + { + "epoch": 7.4361656786392345, + "grad_norm": 1.5656466581887944, + "learning_rate": 3.718226906385617e-07, + "loss": 0.935, + "step": 95960 + }, + { + "epoch": 7.436940602115541, + "grad_norm": 1.353528820883029, + "learning_rate": 3.718614383137012e-07, + "loss": 0.9151, + "step": 95970 + }, + { + "epoch": 7.437715525591848, + "grad_norm": 1.411718896884153, + "learning_rate": 3.719001859888407e-07, + "loss": 0.9228, + "step": 95980 + }, + { + "epoch": 7.438490449068155, + "grad_norm": 1.5331410660870028, + "learning_rate": 3.7193893366398016e-07, + "loss": 0.9308, + "step": 95990 + }, + { + "epoch": 7.439265372544461, + "grad_norm": 1.440480655748141, + "learning_rate": 3.719776813391197e-07, + "loss": 0.926, + "step": 96000 + }, + { + "epoch": 7.439265372544461, + "eval_loss": 0.9265397787094116, + "eval_runtime": 331.736, + "eval_samples_per_second": 34.579, + "eval_steps_per_second": 8.645, + "step": 96000 + }, + { + "epoch": 7.440040296020768, + "grad_norm": 1.4503689205657084, + "learning_rate": 3.7201642901425915e-07, + "loss": 0.9202, + "step": 96010 + }, + { + "epoch": 7.4408152194970745, + "grad_norm": 1.4504442538563318, + "learning_rate": 3.7205517668939867e-07, + "loss": 0.9066, + "step": 96020 + }, + { + "epoch": 7.441590142973381, + "grad_norm": 1.3757571848833186, + "learning_rate": 3.7209392436453814e-07, + "loss": 0.9147, + "step": 96030 + }, + { + "epoch": 7.442365066449688, + "grad_norm": 1.4665922788515744, + "learning_rate": 3.7213267203967766e-07, + "loss": 0.9413, + "step": 96040 + }, + { + "epoch": 7.443139989925995, + "grad_norm": 1.429587755084545, + "learning_rate": 3.7217141971481713e-07, + "loss": 0.9, + "step": 96050 + }, + { + "epoch": 7.443914913402302, + "grad_norm": 1.4046811651443043, + "learning_rate": 3.722101673899566e-07, + "loss": 0.92, + "step": 96060 + }, + { + "epoch": 7.4446898368786085, + "grad_norm": 1.51761798000461, + "learning_rate": 3.722489150650961e-07, + "loss": 0.9222, + "step": 96070 + }, + { + "epoch": 7.445464760354915, + "grad_norm": 1.4505937199790964, + "learning_rate": 3.722876627402356e-07, + "loss": 0.9164, + "step": 96080 + }, + { + "epoch": 7.446239683831221, + "grad_norm": 1.4344720838847032, + "learning_rate": 3.723264104153751e-07, + "loss": 0.9443, + "step": 96090 + }, + { + "epoch": 7.447014607307528, + "grad_norm": 1.4261920409180673, + "learning_rate": 3.723651580905146e-07, + "loss": 0.9181, + "step": 96100 + }, + { + "epoch": 7.447789530783835, + "grad_norm": 1.431177481435054, + "learning_rate": 3.724039057656541e-07, + "loss": 0.9178, + "step": 96110 + }, + { + "epoch": 7.448564454260142, + "grad_norm": 1.4178110364208443, + "learning_rate": 3.7244265344079357e-07, + "loss": 0.9182, + "step": 96120 + }, + { + "epoch": 7.4493393777364485, + "grad_norm": 1.4138267991277702, + "learning_rate": 3.7248140111593304e-07, + "loss": 0.9231, + "step": 96130 + }, + { + "epoch": 7.450114301212755, + "grad_norm": 1.4190917176687092, + "learning_rate": 3.7252014879107256e-07, + "loss": 0.9266, + "step": 96140 + }, + { + "epoch": 7.450889224689062, + "grad_norm": 1.4221786915775476, + "learning_rate": 3.7255889646621203e-07, + "loss": 0.9421, + "step": 96150 + }, + { + "epoch": 7.451664148165369, + "grad_norm": 1.4384654274060615, + "learning_rate": 3.7259764414135155e-07, + "loss": 0.9103, + "step": 96160 + }, + { + "epoch": 7.452439071641676, + "grad_norm": 1.456972472190561, + "learning_rate": 3.72636391816491e-07, + "loss": 0.9142, + "step": 96170 + }, + { + "epoch": 7.4532139951179825, + "grad_norm": 1.3966073841970013, + "learning_rate": 3.7267513949163054e-07, + "loss": 0.909, + "step": 96180 + }, + { + "epoch": 7.4539889185942885, + "grad_norm": 1.4543243328846136, + "learning_rate": 3.7271388716677e-07, + "loss": 0.9138, + "step": 96190 + }, + { + "epoch": 7.454763842070595, + "grad_norm": 1.337574319293032, + "learning_rate": 3.727526348419095e-07, + "loss": 0.9211, + "step": 96200 + }, + { + "epoch": 7.455538765546902, + "grad_norm": 1.3461138091875027, + "learning_rate": 3.72791382517049e-07, + "loss": 0.9255, + "step": 96210 + }, + { + "epoch": 7.456313689023209, + "grad_norm": 1.3665091968796919, + "learning_rate": 3.7283013019218847e-07, + "loss": 0.9167, + "step": 96220 + }, + { + "epoch": 7.457088612499516, + "grad_norm": 1.4484094845916538, + "learning_rate": 3.72868877867328e-07, + "loss": 0.9411, + "step": 96230 + }, + { + "epoch": 7.4578635359758225, + "grad_norm": 1.420822534848705, + "learning_rate": 3.7290762554246746e-07, + "loss": 0.9143, + "step": 96240 + }, + { + "epoch": 7.458638459452129, + "grad_norm": 1.455613101089179, + "learning_rate": 3.72946373217607e-07, + "loss": 0.9212, + "step": 96250 + }, + { + "epoch": 7.459413382928436, + "grad_norm": 1.430935717834097, + "learning_rate": 3.7298512089274645e-07, + "loss": 0.9497, + "step": 96260 + }, + { + "epoch": 7.460188306404742, + "grad_norm": 1.3966157175298286, + "learning_rate": 3.730238685678859e-07, + "loss": 0.9179, + "step": 96270 + }, + { + "epoch": 7.460963229881049, + "grad_norm": 1.437942346532814, + "learning_rate": 3.7306261624302545e-07, + "loss": 0.9243, + "step": 96280 + }, + { + "epoch": 7.461738153357356, + "grad_norm": 1.3998086012399853, + "learning_rate": 3.731013639181649e-07, + "loss": 0.9042, + "step": 96290 + }, + { + "epoch": 7.4625130768336625, + "grad_norm": 1.3616191400626847, + "learning_rate": 3.7314011159330444e-07, + "loss": 0.918, + "step": 96300 + }, + { + "epoch": 7.463288000309969, + "grad_norm": 1.4098407736948486, + "learning_rate": 3.731788592684439e-07, + "loss": 0.9101, + "step": 96310 + }, + { + "epoch": 7.464062923786276, + "grad_norm": 1.4934587448427343, + "learning_rate": 3.7321760694358343e-07, + "loss": 0.8932, + "step": 96320 + }, + { + "epoch": 7.464837847262583, + "grad_norm": 1.3470470713448943, + "learning_rate": 3.732563546187229e-07, + "loss": 0.8933, + "step": 96330 + }, + { + "epoch": 7.46561277073889, + "grad_norm": 1.4531268523917578, + "learning_rate": 3.7329510229386237e-07, + "loss": 0.9119, + "step": 96340 + }, + { + "epoch": 7.4663876942151965, + "grad_norm": 1.5141963806359553, + "learning_rate": 3.733338499690019e-07, + "loss": 0.912, + "step": 96350 + }, + { + "epoch": 7.467162617691503, + "grad_norm": 1.3772660581871077, + "learning_rate": 3.7337259764414136e-07, + "loss": 0.9026, + "step": 96360 + }, + { + "epoch": 7.467937541167809, + "grad_norm": 1.4211643579011048, + "learning_rate": 3.734113453192809e-07, + "loss": 0.9604, + "step": 96370 + }, + { + "epoch": 7.468712464644116, + "grad_norm": 1.495277590126251, + "learning_rate": 3.7345009299442035e-07, + "loss": 0.9495, + "step": 96380 + }, + { + "epoch": 7.469487388120423, + "grad_norm": 1.4699620549360175, + "learning_rate": 3.734888406695598e-07, + "loss": 0.9469, + "step": 96390 + }, + { + "epoch": 7.47026231159673, + "grad_norm": 1.4200002999522494, + "learning_rate": 3.7352758834469934e-07, + "loss": 0.9413, + "step": 96400 + }, + { + "epoch": 7.4710372350730365, + "grad_norm": 1.4260481086600783, + "learning_rate": 3.735663360198388e-07, + "loss": 0.935, + "step": 96410 + }, + { + "epoch": 7.471812158549343, + "grad_norm": 1.4341065920416354, + "learning_rate": 3.7360508369497833e-07, + "loss": 0.9198, + "step": 96420 + }, + { + "epoch": 7.47258708202565, + "grad_norm": 1.43838214252549, + "learning_rate": 3.736438313701178e-07, + "loss": 0.9277, + "step": 96430 + }, + { + "epoch": 7.473362005501957, + "grad_norm": 1.3903285900157034, + "learning_rate": 3.736825790452573e-07, + "loss": 0.9072, + "step": 96440 + }, + { + "epoch": 7.474136928978264, + "grad_norm": 1.39488987032655, + "learning_rate": 3.737213267203968e-07, + "loss": 0.9153, + "step": 96450 + }, + { + "epoch": 7.47491185245457, + "grad_norm": 1.4523272381758896, + "learning_rate": 3.7376007439553626e-07, + "loss": 0.9131, + "step": 96460 + }, + { + "epoch": 7.4756867759308765, + "grad_norm": 1.4456156721130187, + "learning_rate": 3.737988220706758e-07, + "loss": 0.9369, + "step": 96470 + }, + { + "epoch": 7.476461699407183, + "grad_norm": 1.4190337214687943, + "learning_rate": 3.7383756974581525e-07, + "loss": 0.8994, + "step": 96480 + }, + { + "epoch": 7.47723662288349, + "grad_norm": 1.4969663388176735, + "learning_rate": 3.7387631742095477e-07, + "loss": 0.9162, + "step": 96490 + }, + { + "epoch": 7.478011546359797, + "grad_norm": 1.404681772823158, + "learning_rate": 3.7391506509609424e-07, + "loss": 0.9117, + "step": 96500 + }, + { + "epoch": 7.478011546359797, + "eval_loss": 0.9262690544128418, + "eval_runtime": 334.4572, + "eval_samples_per_second": 34.297, + "eval_steps_per_second": 8.575, + "step": 96500 + }, + { + "epoch": 7.478786469836104, + "grad_norm": 1.4502542582263898, + "learning_rate": 3.7395381277123376e-07, + "loss": 0.9277, + "step": 96510 + }, + { + "epoch": 7.4795613933124105, + "grad_norm": 1.4650197937459488, + "learning_rate": 3.7399256044637323e-07, + "loss": 0.9263, + "step": 96520 + }, + { + "epoch": 7.480336316788717, + "grad_norm": 1.3412362952454955, + "learning_rate": 3.740313081215127e-07, + "loss": 0.9095, + "step": 96530 + }, + { + "epoch": 7.481111240265024, + "grad_norm": 1.4160771562273489, + "learning_rate": 3.740700557966522e-07, + "loss": 0.898, + "step": 96540 + }, + { + "epoch": 7.481886163741331, + "grad_norm": 1.4208538325477211, + "learning_rate": 3.741088034717917e-07, + "loss": 0.9219, + "step": 96550 + }, + { + "epoch": 7.482661087217637, + "grad_norm": 1.3881899209060293, + "learning_rate": 3.741475511469312e-07, + "loss": 0.924, + "step": 96560 + }, + { + "epoch": 7.483436010693944, + "grad_norm": 1.3971565832471162, + "learning_rate": 3.741862988220707e-07, + "loss": 0.8961, + "step": 96570 + }, + { + "epoch": 7.4842109341702505, + "grad_norm": 1.3197076102337195, + "learning_rate": 3.742250464972102e-07, + "loss": 0.9107, + "step": 96580 + }, + { + "epoch": 7.484985857646557, + "grad_norm": 1.4239107545445273, + "learning_rate": 3.742637941723497e-07, + "loss": 0.935, + "step": 96590 + }, + { + "epoch": 7.485760781122864, + "grad_norm": 1.3797044640381921, + "learning_rate": 3.7430254184748914e-07, + "loss": 0.9375, + "step": 96600 + }, + { + "epoch": 7.486535704599171, + "grad_norm": 1.439491181066273, + "learning_rate": 3.7434128952262867e-07, + "loss": 0.9427, + "step": 96610 + }, + { + "epoch": 7.487310628075478, + "grad_norm": 1.365760610078266, + "learning_rate": 3.7438003719776813e-07, + "loss": 0.9082, + "step": 96620 + }, + { + "epoch": 7.488085551551785, + "grad_norm": 1.3757790934009018, + "learning_rate": 3.7441878487290766e-07, + "loss": 0.9102, + "step": 96630 + }, + { + "epoch": 7.488860475028091, + "grad_norm": 1.4080531179296338, + "learning_rate": 3.744575325480471e-07, + "loss": 0.9306, + "step": 96640 + }, + { + "epoch": 7.489635398504397, + "grad_norm": 1.478472344285011, + "learning_rate": 3.7449628022318665e-07, + "loss": 0.9135, + "step": 96650 + }, + { + "epoch": 7.490410321980704, + "grad_norm": 1.4307083522415993, + "learning_rate": 3.745350278983261e-07, + "loss": 0.9061, + "step": 96660 + }, + { + "epoch": 7.491185245457011, + "grad_norm": 1.36912482623289, + "learning_rate": 3.745737755734656e-07, + "loss": 0.9143, + "step": 96670 + }, + { + "epoch": 7.491960168933318, + "grad_norm": 1.3946286847939922, + "learning_rate": 3.746125232486051e-07, + "loss": 0.9211, + "step": 96680 + }, + { + "epoch": 7.4927350924096245, + "grad_norm": 1.457742677142209, + "learning_rate": 3.746512709237446e-07, + "loss": 0.9449, + "step": 96690 + }, + { + "epoch": 7.493510015885931, + "grad_norm": 1.4593714176317993, + "learning_rate": 3.746900185988841e-07, + "loss": 0.9262, + "step": 96700 + }, + { + "epoch": 7.494284939362238, + "grad_norm": 1.3229775103665438, + "learning_rate": 3.7472876627402357e-07, + "loss": 0.9175, + "step": 96710 + }, + { + "epoch": 7.495059862838545, + "grad_norm": 1.4750762060547626, + "learning_rate": 3.747675139491631e-07, + "loss": 0.9479, + "step": 96720 + }, + { + "epoch": 7.495834786314852, + "grad_norm": 1.4269645852881097, + "learning_rate": 3.7480626162430256e-07, + "loss": 0.912, + "step": 96730 + }, + { + "epoch": 7.496609709791159, + "grad_norm": 1.4292438874141309, + "learning_rate": 3.7484500929944203e-07, + "loss": 0.9123, + "step": 96740 + }, + { + "epoch": 7.4973846332674645, + "grad_norm": 1.3720671084167144, + "learning_rate": 3.7488375697458155e-07, + "loss": 0.9325, + "step": 96750 + }, + { + "epoch": 7.498159556743771, + "grad_norm": 1.4307115404090438, + "learning_rate": 3.74922504649721e-07, + "loss": 0.9033, + "step": 96760 + }, + { + "epoch": 7.498934480220078, + "grad_norm": 1.3641656452572921, + "learning_rate": 3.7496125232486054e-07, + "loss": 0.9073, + "step": 96770 + }, + { + "epoch": 7.499709403696385, + "grad_norm": 1.426461275933315, + "learning_rate": 3.75e-07, + "loss": 0.912, + "step": 96780 + }, + { + "epoch": 7.500484327172692, + "grad_norm": 1.4682226430252527, + "learning_rate": 3.7503874767513953e-07, + "loss": 0.9119, + "step": 96790 + }, + { + "epoch": 7.501259250648999, + "grad_norm": 1.4358974150726367, + "learning_rate": 3.75077495350279e-07, + "loss": 0.9435, + "step": 96800 + }, + { + "epoch": 7.502034174125305, + "grad_norm": 1.3437256933993975, + "learning_rate": 3.7511624302541847e-07, + "loss": 0.9385, + "step": 96810 + }, + { + "epoch": 7.502809097601612, + "grad_norm": 1.4506126569920195, + "learning_rate": 3.75154990700558e-07, + "loss": 0.9074, + "step": 96820 + }, + { + "epoch": 7.503584021077918, + "grad_norm": 1.3993758823924092, + "learning_rate": 3.7519373837569746e-07, + "loss": 0.9052, + "step": 96830 + }, + { + "epoch": 7.504358944554225, + "grad_norm": 1.4431054666345693, + "learning_rate": 3.75232486050837e-07, + "loss": 0.9041, + "step": 96840 + }, + { + "epoch": 7.505133868030532, + "grad_norm": 1.3242607649757931, + "learning_rate": 3.7527123372597645e-07, + "loss": 0.9294, + "step": 96850 + }, + { + "epoch": 7.5059087915068385, + "grad_norm": 1.437697692949813, + "learning_rate": 3.75309981401116e-07, + "loss": 0.941, + "step": 96860 + }, + { + "epoch": 7.506683714983145, + "grad_norm": 1.4674365057367469, + "learning_rate": 3.7534872907625544e-07, + "loss": 0.9172, + "step": 96870 + }, + { + "epoch": 7.507458638459452, + "grad_norm": 1.431778581540718, + "learning_rate": 3.753874767513949e-07, + "loss": 0.9351, + "step": 96880 + }, + { + "epoch": 7.508233561935759, + "grad_norm": 1.4096713582690907, + "learning_rate": 3.7542622442653443e-07, + "loss": 0.9387, + "step": 96890 + }, + { + "epoch": 7.509008485412066, + "grad_norm": 1.4140532722350425, + "learning_rate": 3.754649721016739e-07, + "loss": 0.9187, + "step": 96900 + }, + { + "epoch": 7.509783408888373, + "grad_norm": 1.4480795553175936, + "learning_rate": 3.755037197768134e-07, + "loss": 0.8942, + "step": 96910 + }, + { + "epoch": 7.510558332364679, + "grad_norm": 1.3551421300108533, + "learning_rate": 3.755424674519529e-07, + "loss": 0.928, + "step": 96920 + }, + { + "epoch": 7.511333255840985, + "grad_norm": 1.449208590086081, + "learning_rate": 3.755812151270924e-07, + "loss": 0.9027, + "step": 96930 + }, + { + "epoch": 7.512108179317292, + "grad_norm": 1.4690427824274819, + "learning_rate": 3.756199628022319e-07, + "loss": 0.9153, + "step": 96940 + }, + { + "epoch": 7.512883102793599, + "grad_norm": 1.4478596902725092, + "learning_rate": 3.7565871047737135e-07, + "loss": 0.9296, + "step": 96950 + }, + { + "epoch": 7.513658026269906, + "grad_norm": 1.4129158132157664, + "learning_rate": 3.756974581525109e-07, + "loss": 0.9243, + "step": 96960 + }, + { + "epoch": 7.514432949746213, + "grad_norm": 1.4389223156617512, + "learning_rate": 3.7573620582765034e-07, + "loss": 0.9193, + "step": 96970 + }, + { + "epoch": 7.515207873222519, + "grad_norm": 1.4414283350526425, + "learning_rate": 3.7577495350278987e-07, + "loss": 0.9048, + "step": 96980 + }, + { + "epoch": 7.515982796698826, + "grad_norm": 1.4014616536567068, + "learning_rate": 3.7581370117792934e-07, + "loss": 0.9261, + "step": 96990 + }, + { + "epoch": 7.516757720175133, + "grad_norm": 1.4072243681917738, + "learning_rate": 3.7585244885306886e-07, + "loss": 0.8943, + "step": 97000 + }, + { + "epoch": 7.516757720175133, + "eval_loss": 0.9260082244873047, + "eval_runtime": 331.4693, + "eval_samples_per_second": 34.607, + "eval_steps_per_second": 8.652, + "step": 97000 + }, + { + "epoch": 7.51753264365144, + "grad_norm": 1.4640860641085773, + "learning_rate": 3.7589119652820833e-07, + "loss": 0.9367, + "step": 97010 + }, + { + "epoch": 7.518307567127746, + "grad_norm": 1.4117236473564054, + "learning_rate": 3.759299442033478e-07, + "loss": 0.9162, + "step": 97020 + }, + { + "epoch": 7.5190824906040525, + "grad_norm": 1.4049599321406712, + "learning_rate": 3.759686918784873e-07, + "loss": 0.9271, + "step": 97030 + }, + { + "epoch": 7.519857414080359, + "grad_norm": 1.4472196356672422, + "learning_rate": 3.760074395536268e-07, + "loss": 0.9257, + "step": 97040 + }, + { + "epoch": 7.520632337556666, + "grad_norm": 1.4265310843161676, + "learning_rate": 3.760461872287663e-07, + "loss": 0.9243, + "step": 97050 + }, + { + "epoch": 7.521407261032973, + "grad_norm": 1.4481906327686296, + "learning_rate": 3.760849349039058e-07, + "loss": 0.9217, + "step": 97060 + }, + { + "epoch": 7.52218218450928, + "grad_norm": 1.395274290470503, + "learning_rate": 3.761236825790453e-07, + "loss": 0.9239, + "step": 97070 + }, + { + "epoch": 7.522957107985587, + "grad_norm": 1.365756900711663, + "learning_rate": 3.7616243025418477e-07, + "loss": 0.9154, + "step": 97080 + }, + { + "epoch": 7.523732031461893, + "grad_norm": 1.3416584779072824, + "learning_rate": 3.7620117792932424e-07, + "loss": 0.9193, + "step": 97090 + }, + { + "epoch": 7.5245069549382, + "grad_norm": 1.3615445993091926, + "learning_rate": 3.7623992560446376e-07, + "loss": 0.915, + "step": 97100 + }, + { + "epoch": 7.525281878414507, + "grad_norm": 1.4073878861147342, + "learning_rate": 3.7627867327960323e-07, + "loss": 0.9459, + "step": 97110 + }, + { + "epoch": 7.526056801890813, + "grad_norm": 1.4911039271738138, + "learning_rate": 3.7631742095474275e-07, + "loss": 0.9264, + "step": 97120 + }, + { + "epoch": 7.52683172536712, + "grad_norm": 1.3324735629668267, + "learning_rate": 3.763561686298822e-07, + "loss": 0.9009, + "step": 97130 + }, + { + "epoch": 7.527606648843427, + "grad_norm": 1.3736113583785903, + "learning_rate": 3.763949163050217e-07, + "loss": 0.9248, + "step": 97140 + }, + { + "epoch": 7.528381572319733, + "grad_norm": 1.4374993949456463, + "learning_rate": 3.764336639801612e-07, + "loss": 0.9172, + "step": 97150 + }, + { + "epoch": 7.52915649579604, + "grad_norm": 1.409186183657067, + "learning_rate": 3.764724116553007e-07, + "loss": 0.9163, + "step": 97160 + }, + { + "epoch": 7.529931419272347, + "grad_norm": 1.4364804689039947, + "learning_rate": 3.765111593304402e-07, + "loss": 0.9118, + "step": 97170 + }, + { + "epoch": 7.530706342748654, + "grad_norm": 1.358598578520454, + "learning_rate": 3.7654990700557967e-07, + "loss": 0.9234, + "step": 97180 + }, + { + "epoch": 7.531481266224961, + "grad_norm": 1.5566529586688262, + "learning_rate": 3.765886546807192e-07, + "loss": 0.9147, + "step": 97190 + }, + { + "epoch": 7.5322561897012665, + "grad_norm": 1.4254935045967578, + "learning_rate": 3.7662740235585866e-07, + "loss": 0.9475, + "step": 97200 + }, + { + "epoch": 7.533031113177573, + "grad_norm": 1.3744573452856121, + "learning_rate": 3.7666615003099813e-07, + "loss": 0.9186, + "step": 97210 + }, + { + "epoch": 7.53380603665388, + "grad_norm": 1.4112853914556642, + "learning_rate": 3.7670489770613765e-07, + "loss": 0.9299, + "step": 97220 + }, + { + "epoch": 7.534580960130187, + "grad_norm": 1.4088254317137876, + "learning_rate": 3.767436453812771e-07, + "loss": 0.9207, + "step": 97230 + }, + { + "epoch": 7.535355883606494, + "grad_norm": 1.4423483735983964, + "learning_rate": 3.7678239305641664e-07, + "loss": 0.9054, + "step": 97240 + }, + { + "epoch": 7.536130807082801, + "grad_norm": 1.355840273633307, + "learning_rate": 3.768211407315561e-07, + "loss": 0.9335, + "step": 97250 + }, + { + "epoch": 7.536905730559107, + "grad_norm": 1.3625365746853084, + "learning_rate": 3.7685988840669564e-07, + "loss": 0.8963, + "step": 97260 + }, + { + "epoch": 7.537680654035414, + "grad_norm": 1.3549280924308111, + "learning_rate": 3.768986360818351e-07, + "loss": 0.9276, + "step": 97270 + }, + { + "epoch": 7.538455577511721, + "grad_norm": 1.4349535742518902, + "learning_rate": 3.7693738375697457e-07, + "loss": 0.9411, + "step": 97280 + }, + { + "epoch": 7.539230500988028, + "grad_norm": 1.404536638671184, + "learning_rate": 3.769761314321141e-07, + "loss": 0.9137, + "step": 97290 + }, + { + "epoch": 7.540005424464335, + "grad_norm": 1.4109652356482987, + "learning_rate": 3.7701487910725356e-07, + "loss": 0.9205, + "step": 97300 + }, + { + "epoch": 7.540780347940641, + "grad_norm": 1.400868176128099, + "learning_rate": 3.770536267823931e-07, + "loss": 0.9064, + "step": 97310 + }, + { + "epoch": 7.541555271416947, + "grad_norm": 1.4472645470441046, + "learning_rate": 3.7709237445753256e-07, + "loss": 0.903, + "step": 97320 + }, + { + "epoch": 7.542330194893254, + "grad_norm": 1.4196645863803155, + "learning_rate": 3.771311221326721e-07, + "loss": 0.9384, + "step": 97330 + }, + { + "epoch": 7.543105118369561, + "grad_norm": 1.412233923351347, + "learning_rate": 3.7716986980781155e-07, + "loss": 0.9169, + "step": 97340 + }, + { + "epoch": 7.543880041845868, + "grad_norm": 1.3751497685120873, + "learning_rate": 3.77208617482951e-07, + "loss": 0.91, + "step": 97350 + }, + { + "epoch": 7.544654965322175, + "grad_norm": 1.4362386376643803, + "learning_rate": 3.7724736515809054e-07, + "loss": 0.9189, + "step": 97360 + }, + { + "epoch": 7.545429888798481, + "grad_norm": 1.4088737743054633, + "learning_rate": 3.7728611283323e-07, + "loss": 0.92, + "step": 97370 + }, + { + "epoch": 7.546204812274788, + "grad_norm": 1.4386047801820028, + "learning_rate": 3.7732486050836953e-07, + "loss": 0.9285, + "step": 97380 + }, + { + "epoch": 7.546979735751094, + "grad_norm": 1.3643525925908415, + "learning_rate": 3.77363608183509e-07, + "loss": 0.9198, + "step": 97390 + }, + { + "epoch": 7.547754659227401, + "grad_norm": 1.452077010819401, + "learning_rate": 3.774023558586485e-07, + "loss": 0.929, + "step": 97400 + }, + { + "epoch": 7.548529582703708, + "grad_norm": 1.43031026888859, + "learning_rate": 3.77441103533788e-07, + "loss": 0.9219, + "step": 97410 + }, + { + "epoch": 7.549304506180015, + "grad_norm": 1.4732352760001939, + "learning_rate": 3.7747985120892746e-07, + "loss": 0.9501, + "step": 97420 + }, + { + "epoch": 7.550079429656321, + "grad_norm": 1.470781553295747, + "learning_rate": 3.77518598884067e-07, + "loss": 0.9118, + "step": 97430 + }, + { + "epoch": 7.550854353132628, + "grad_norm": 1.4762810989253603, + "learning_rate": 3.7755734655920645e-07, + "loss": 0.9191, + "step": 97440 + }, + { + "epoch": 7.551629276608935, + "grad_norm": 1.4688845344216708, + "learning_rate": 3.7759609423434597e-07, + "loss": 0.9186, + "step": 97450 + }, + { + "epoch": 7.552404200085242, + "grad_norm": 1.5441529347123577, + "learning_rate": 3.7763484190948544e-07, + "loss": 0.9305, + "step": 97460 + }, + { + "epoch": 7.553179123561549, + "grad_norm": 1.4057218187931038, + "learning_rate": 3.7767358958462496e-07, + "loss": 0.9269, + "step": 97470 + }, + { + "epoch": 7.5539540470378554, + "grad_norm": 1.3305082511826802, + "learning_rate": 3.7771233725976443e-07, + "loss": 0.9173, + "step": 97480 + }, + { + "epoch": 7.554728970514161, + "grad_norm": 1.4371993450833456, + "learning_rate": 3.777510849349039e-07, + "loss": 0.8953, + "step": 97490 + }, + { + "epoch": 7.555503893990468, + "grad_norm": 1.4640829007673817, + "learning_rate": 3.777898326100434e-07, + "loss": 0.9398, + "step": 97500 + }, + { + "epoch": 7.555503893990468, + "eval_loss": 0.9254491329193115, + "eval_runtime": 333.0101, + "eval_samples_per_second": 34.446, + "eval_steps_per_second": 8.612, + "step": 97500 + }, + { + "epoch": 7.556278817466775, + "grad_norm": 1.4361551612193613, + "learning_rate": 3.778285802851829e-07, + "loss": 0.9229, + "step": 97510 + }, + { + "epoch": 7.557053740943082, + "grad_norm": 1.3910360303922176, + "learning_rate": 3.778673279603224e-07, + "loss": 0.9277, + "step": 97520 + }, + { + "epoch": 7.557828664419389, + "grad_norm": 1.4321065604381609, + "learning_rate": 3.779060756354619e-07, + "loss": 0.9201, + "step": 97530 + }, + { + "epoch": 7.558603587895695, + "grad_norm": 1.5057361324771115, + "learning_rate": 3.779448233106014e-07, + "loss": 0.912, + "step": 97540 + }, + { + "epoch": 7.559378511372002, + "grad_norm": 1.405908760045715, + "learning_rate": 3.7798357098574087e-07, + "loss": 0.9138, + "step": 97550 + }, + { + "epoch": 7.560153434848309, + "grad_norm": 1.395951616601458, + "learning_rate": 3.7802231866088034e-07, + "loss": 0.9412, + "step": 97560 + }, + { + "epoch": 7.560928358324615, + "grad_norm": 1.4368865043427066, + "learning_rate": 3.7806106633601986e-07, + "loss": 0.9178, + "step": 97570 + }, + { + "epoch": 7.561703281800922, + "grad_norm": 1.5566656956926506, + "learning_rate": 3.7809981401115933e-07, + "loss": 0.9026, + "step": 97580 + }, + { + "epoch": 7.562478205277229, + "grad_norm": 1.3874292098370358, + "learning_rate": 3.7813856168629885e-07, + "loss": 0.9365, + "step": 97590 + }, + { + "epoch": 7.563253128753535, + "grad_norm": 1.3298154120864982, + "learning_rate": 3.781773093614383e-07, + "loss": 0.9009, + "step": 97600 + }, + { + "epoch": 7.564028052229842, + "grad_norm": 1.352766728476646, + "learning_rate": 3.7821605703657785e-07, + "loss": 0.908, + "step": 97610 + }, + { + "epoch": 7.564802975706149, + "grad_norm": 1.3678083878631737, + "learning_rate": 3.782548047117173e-07, + "loss": 0.9395, + "step": 97620 + }, + { + "epoch": 7.565577899182456, + "grad_norm": 1.3134094471290754, + "learning_rate": 3.782935523868568e-07, + "loss": 0.9151, + "step": 97630 + }, + { + "epoch": 7.566352822658763, + "grad_norm": 1.4072298052013306, + "learning_rate": 3.783323000619963e-07, + "loss": 0.9144, + "step": 97640 + }, + { + "epoch": 7.567127746135069, + "grad_norm": 1.42838158314226, + "learning_rate": 3.783710477371358e-07, + "loss": 0.9144, + "step": 97650 + }, + { + "epoch": 7.567902669611376, + "grad_norm": 1.4137766632236837, + "learning_rate": 3.784097954122753e-07, + "loss": 0.9011, + "step": 97660 + }, + { + "epoch": 7.568677593087683, + "grad_norm": 1.3255736742664994, + "learning_rate": 3.7844854308741477e-07, + "loss": 0.9214, + "step": 97670 + }, + { + "epoch": 7.569452516563989, + "grad_norm": 1.4482052797593001, + "learning_rate": 3.784872907625543e-07, + "loss": 0.9277, + "step": 97680 + }, + { + "epoch": 7.570227440040296, + "grad_norm": 1.3525426699789103, + "learning_rate": 3.7852603843769376e-07, + "loss": 0.9144, + "step": 97690 + }, + { + "epoch": 7.571002363516603, + "grad_norm": 1.3740336335409096, + "learning_rate": 3.785647861128332e-07, + "loss": 0.9439, + "step": 97700 + }, + { + "epoch": 7.571777286992909, + "grad_norm": 1.455334567930754, + "learning_rate": 3.7860353378797275e-07, + "loss": 0.9175, + "step": 97710 + }, + { + "epoch": 7.572552210469216, + "grad_norm": 1.4149905935422296, + "learning_rate": 3.786422814631122e-07, + "loss": 0.8955, + "step": 97720 + }, + { + "epoch": 7.573327133945523, + "grad_norm": 1.4532522442422862, + "learning_rate": 3.7868102913825174e-07, + "loss": 0.922, + "step": 97730 + }, + { + "epoch": 7.57410205742183, + "grad_norm": 1.3906538243371815, + "learning_rate": 3.787197768133912e-07, + "loss": 0.9107, + "step": 97740 + }, + { + "epoch": 7.574876980898137, + "grad_norm": 1.3702843528458217, + "learning_rate": 3.7875852448853073e-07, + "loss": 0.9253, + "step": 97750 + }, + { + "epoch": 7.575651904374443, + "grad_norm": 1.380140761829786, + "learning_rate": 3.787972721636702e-07, + "loss": 0.8969, + "step": 97760 + }, + { + "epoch": 7.576426827850749, + "grad_norm": 1.4591642788017964, + "learning_rate": 3.7883601983880967e-07, + "loss": 0.9091, + "step": 97770 + }, + { + "epoch": 7.577201751327056, + "grad_norm": 1.3644081914297508, + "learning_rate": 3.788747675139492e-07, + "loss": 0.9317, + "step": 97780 + }, + { + "epoch": 7.577976674803363, + "grad_norm": 1.3552426827704402, + "learning_rate": 3.7891351518908866e-07, + "loss": 0.9105, + "step": 97790 + }, + { + "epoch": 7.57875159827967, + "grad_norm": 1.3556199196453844, + "learning_rate": 3.789522628642282e-07, + "loss": 0.929, + "step": 97800 + }, + { + "epoch": 7.579526521755977, + "grad_norm": 1.445457208296888, + "learning_rate": 3.7899101053936765e-07, + "loss": 0.9118, + "step": 97810 + }, + { + "epoch": 7.580301445232283, + "grad_norm": 1.446344912625199, + "learning_rate": 3.7902975821450717e-07, + "loss": 0.9191, + "step": 97820 + }, + { + "epoch": 7.58107636870859, + "grad_norm": 1.4048971432106154, + "learning_rate": 3.7906850588964664e-07, + "loss": 0.9121, + "step": 97830 + }, + { + "epoch": 7.581851292184897, + "grad_norm": 1.3866204765141132, + "learning_rate": 3.791072535647861e-07, + "loss": 0.896, + "step": 97840 + }, + { + "epoch": 7.582626215661204, + "grad_norm": 1.3859631939888541, + "learning_rate": 3.7914600123992563e-07, + "loss": 0.9136, + "step": 97850 + }, + { + "epoch": 7.58340113913751, + "grad_norm": 1.4514443971878488, + "learning_rate": 3.791847489150651e-07, + "loss": 0.8995, + "step": 97860 + }, + { + "epoch": 7.584176062613817, + "grad_norm": 1.3887199587045047, + "learning_rate": 3.792234965902046e-07, + "loss": 0.9045, + "step": 97870 + }, + { + "epoch": 7.584950986090123, + "grad_norm": 1.4842768990595177, + "learning_rate": 3.792622442653441e-07, + "loss": 0.9206, + "step": 97880 + }, + { + "epoch": 7.58572590956643, + "grad_norm": 1.4599835028476051, + "learning_rate": 3.7930099194048356e-07, + "loss": 0.9317, + "step": 97890 + }, + { + "epoch": 7.586500833042737, + "grad_norm": 1.456570145154029, + "learning_rate": 3.793397396156231e-07, + "loss": 0.9377, + "step": 97900 + }, + { + "epoch": 7.587275756519044, + "grad_norm": 1.4316790748417068, + "learning_rate": 3.7937848729076255e-07, + "loss": 0.9134, + "step": 97910 + }, + { + "epoch": 7.588050679995351, + "grad_norm": 1.3931860675840055, + "learning_rate": 3.794172349659021e-07, + "loss": 0.9251, + "step": 97920 + }, + { + "epoch": 7.5888256034716575, + "grad_norm": 1.4012401807014392, + "learning_rate": 3.7945598264104154e-07, + "loss": 0.9221, + "step": 97930 + }, + { + "epoch": 7.589600526947964, + "grad_norm": 1.4168478280621544, + "learning_rate": 3.7949473031618107e-07, + "loss": 0.93, + "step": 97940 + }, + { + "epoch": 7.59037545042427, + "grad_norm": 1.4086069103669945, + "learning_rate": 3.7953347799132053e-07, + "loss": 0.9281, + "step": 97950 + }, + { + "epoch": 7.591150373900577, + "grad_norm": 1.3763591302587361, + "learning_rate": 3.7957222566646e-07, + "loss": 0.9173, + "step": 97960 + }, + { + "epoch": 7.591925297376884, + "grad_norm": 1.4461192091752813, + "learning_rate": 3.796109733415995e-07, + "loss": 0.9156, + "step": 97970 + }, + { + "epoch": 7.592700220853191, + "grad_norm": 1.3759305788946035, + "learning_rate": 3.79649721016739e-07, + "loss": 0.9248, + "step": 97980 + }, + { + "epoch": 7.593475144329497, + "grad_norm": 1.4280869783624124, + "learning_rate": 3.796884686918785e-07, + "loss": 0.9055, + "step": 97990 + }, + { + "epoch": 7.594250067805804, + "grad_norm": 1.4104161722909028, + "learning_rate": 3.79727216367018e-07, + "loss": 0.9196, + "step": 98000 + }, + { + "epoch": 7.594250067805804, + "eval_loss": 0.9251598715782166, + "eval_runtime": 331.7073, + "eval_samples_per_second": 34.582, + "eval_steps_per_second": 8.646, + "step": 98000 + }, + { + "epoch": 7.595024991282111, + "grad_norm": 1.3499425899463802, + "learning_rate": 3.797659640421575e-07, + "loss": 0.9125, + "step": 98010 + }, + { + "epoch": 7.595799914758418, + "grad_norm": 1.3730294032314312, + "learning_rate": 3.79804711717297e-07, + "loss": 0.9184, + "step": 98020 + }, + { + "epoch": 7.596574838234725, + "grad_norm": 1.4122360857872835, + "learning_rate": 3.7984345939243645e-07, + "loss": 0.9226, + "step": 98030 + }, + { + "epoch": 7.5973497617110315, + "grad_norm": 1.4276249845242637, + "learning_rate": 3.7988220706757597e-07, + "loss": 0.9128, + "step": 98040 + }, + { + "epoch": 7.598124685187337, + "grad_norm": 1.4242298939896632, + "learning_rate": 3.7992095474271544e-07, + "loss": 0.9076, + "step": 98050 + }, + { + "epoch": 7.598899608663644, + "grad_norm": 1.3824286670373325, + "learning_rate": 3.7995970241785496e-07, + "loss": 0.9271, + "step": 98060 + }, + { + "epoch": 7.599674532139951, + "grad_norm": 1.4468304869205026, + "learning_rate": 3.7999845009299443e-07, + "loss": 0.9282, + "step": 98070 + }, + { + "epoch": 7.600449455616258, + "grad_norm": 1.4614320982255893, + "learning_rate": 3.8003719776813395e-07, + "loss": 0.9306, + "step": 98080 + }, + { + "epoch": 7.601224379092565, + "grad_norm": 1.5101118932453428, + "learning_rate": 3.800759454432734e-07, + "loss": 0.9121, + "step": 98090 + }, + { + "epoch": 7.6019993025688715, + "grad_norm": 1.4329021422643147, + "learning_rate": 3.801146931184129e-07, + "loss": 0.9139, + "step": 98100 + }, + { + "epoch": 7.602774226045178, + "grad_norm": 1.4225902924453848, + "learning_rate": 3.801534407935524e-07, + "loss": 0.9191, + "step": 98110 + }, + { + "epoch": 7.603549149521485, + "grad_norm": 1.4595064210774464, + "learning_rate": 3.801921884686919e-07, + "loss": 0.9153, + "step": 98120 + }, + { + "epoch": 7.604324072997791, + "grad_norm": 1.5050202304898272, + "learning_rate": 3.802309361438314e-07, + "loss": 0.9275, + "step": 98130 + }, + { + "epoch": 7.605098996474098, + "grad_norm": 1.466343527256689, + "learning_rate": 3.8026968381897087e-07, + "loss": 0.9376, + "step": 98140 + }, + { + "epoch": 7.605873919950405, + "grad_norm": 1.5299760632177712, + "learning_rate": 3.803084314941104e-07, + "loss": 0.9079, + "step": 98150 + }, + { + "epoch": 7.606648843426711, + "grad_norm": 1.5290603966761513, + "learning_rate": 3.8034717916924986e-07, + "loss": 0.9179, + "step": 98160 + }, + { + "epoch": 7.607423766903018, + "grad_norm": 1.376258735316635, + "learning_rate": 3.8038592684438933e-07, + "loss": 0.9187, + "step": 98170 + }, + { + "epoch": 7.608198690379325, + "grad_norm": 1.3821615963637008, + "learning_rate": 3.8042467451952885e-07, + "loss": 0.915, + "step": 98180 + }, + { + "epoch": 7.608973613855632, + "grad_norm": 1.3739623631258175, + "learning_rate": 3.804634221946683e-07, + "loss": 0.9087, + "step": 98190 + }, + { + "epoch": 7.609748537331939, + "grad_norm": 1.4568968502831465, + "learning_rate": 3.8050216986980784e-07, + "loss": 0.9222, + "step": 98200 + }, + { + "epoch": 7.6105234608082455, + "grad_norm": 1.378512403242265, + "learning_rate": 3.805409175449473e-07, + "loss": 0.9189, + "step": 98210 + }, + { + "epoch": 7.611298384284552, + "grad_norm": 1.4706127204006023, + "learning_rate": 3.8057966522008683e-07, + "loss": 0.9316, + "step": 98220 + }, + { + "epoch": 7.612073307760859, + "grad_norm": 1.4452612577121944, + "learning_rate": 3.806184128952263e-07, + "loss": 0.9027, + "step": 98230 + }, + { + "epoch": 7.612848231237165, + "grad_norm": 1.3402561231135885, + "learning_rate": 3.8065716057036577e-07, + "loss": 0.9013, + "step": 98240 + }, + { + "epoch": 7.613623154713472, + "grad_norm": 1.4208244354012194, + "learning_rate": 3.806959082455053e-07, + "loss": 0.918, + "step": 98250 + }, + { + "epoch": 7.614398078189779, + "grad_norm": 1.411881423503122, + "learning_rate": 3.8073465592064476e-07, + "loss": 0.9176, + "step": 98260 + }, + { + "epoch": 7.6151730016660855, + "grad_norm": 1.3862364231412119, + "learning_rate": 3.807734035957843e-07, + "loss": 0.9321, + "step": 98270 + }, + { + "epoch": 7.615947925142392, + "grad_norm": 1.4693281529692945, + "learning_rate": 3.8081215127092375e-07, + "loss": 0.9236, + "step": 98280 + }, + { + "epoch": 7.616722848618699, + "grad_norm": 1.4360883786350924, + "learning_rate": 3.808508989460633e-07, + "loss": 0.9278, + "step": 98290 + }, + { + "epoch": 7.617497772095006, + "grad_norm": 1.3934585823667902, + "learning_rate": 3.8088964662120274e-07, + "loss": 0.9216, + "step": 98300 + }, + { + "epoch": 7.618272695571313, + "grad_norm": 1.358452004904236, + "learning_rate": 3.809283942963422e-07, + "loss": 0.9086, + "step": 98310 + }, + { + "epoch": 7.619047619047619, + "grad_norm": 1.419566854778883, + "learning_rate": 3.8096714197148174e-07, + "loss": 0.9485, + "step": 98320 + }, + { + "epoch": 7.619822542523925, + "grad_norm": 1.4405691361731365, + "learning_rate": 3.810058896466212e-07, + "loss": 0.941, + "step": 98330 + }, + { + "epoch": 7.620597466000232, + "grad_norm": 1.4469167975638244, + "learning_rate": 3.8104463732176073e-07, + "loss": 0.9188, + "step": 98340 + }, + { + "epoch": 7.621372389476539, + "grad_norm": 1.4095885639956682, + "learning_rate": 3.810833849969002e-07, + "loss": 0.9174, + "step": 98350 + }, + { + "epoch": 7.622147312952846, + "grad_norm": 1.4188020198090021, + "learning_rate": 3.811221326720397e-07, + "loss": 0.9261, + "step": 98360 + }, + { + "epoch": 7.622922236429153, + "grad_norm": 1.430984388376839, + "learning_rate": 3.811608803471792e-07, + "loss": 0.9234, + "step": 98370 + }, + { + "epoch": 7.6236971599054595, + "grad_norm": 1.4571621429651656, + "learning_rate": 3.8119962802231866e-07, + "loss": 0.9253, + "step": 98380 + }, + { + "epoch": 7.624472083381766, + "grad_norm": 1.4537392039469852, + "learning_rate": 3.812383756974582e-07, + "loss": 0.9335, + "step": 98390 + }, + { + "epoch": 7.625247006858073, + "grad_norm": 1.2972086253367123, + "learning_rate": 3.8127712337259765e-07, + "loss": 0.9169, + "step": 98400 + }, + { + "epoch": 7.62602193033438, + "grad_norm": 1.3776268400454097, + "learning_rate": 3.8131587104773717e-07, + "loss": 0.9015, + "step": 98410 + }, + { + "epoch": 7.626796853810686, + "grad_norm": 1.4116334965789146, + "learning_rate": 3.8135461872287664e-07, + "loss": 0.8933, + "step": 98420 + }, + { + "epoch": 7.627571777286993, + "grad_norm": 1.45257365003587, + "learning_rate": 3.8139336639801616e-07, + "loss": 0.8967, + "step": 98430 + }, + { + "epoch": 7.6283467007632995, + "grad_norm": 1.5376212089478614, + "learning_rate": 3.8143211407315563e-07, + "loss": 0.9298, + "step": 98440 + }, + { + "epoch": 7.629121624239606, + "grad_norm": 1.4685024839980094, + "learning_rate": 3.814708617482951e-07, + "loss": 0.9051, + "step": 98450 + }, + { + "epoch": 7.629896547715913, + "grad_norm": 1.345308906158007, + "learning_rate": 3.815096094234346e-07, + "loss": 0.928, + "step": 98460 + }, + { + "epoch": 7.63067147119222, + "grad_norm": 1.4472449119088246, + "learning_rate": 3.815483570985741e-07, + "loss": 0.9192, + "step": 98470 + }, + { + "epoch": 7.631446394668527, + "grad_norm": 1.4140735526865096, + "learning_rate": 3.815871047737136e-07, + "loss": 0.9181, + "step": 98480 + }, + { + "epoch": 7.6322213181448335, + "grad_norm": 1.3912714655807545, + "learning_rate": 3.816258524488531e-07, + "loss": 0.9102, + "step": 98490 + }, + { + "epoch": 7.632996241621139, + "grad_norm": 1.450947239306177, + "learning_rate": 3.816646001239926e-07, + "loss": 0.9232, + "step": 98500 + }, + { + "epoch": 7.632996241621139, + "eval_loss": 0.924842894077301, + "eval_runtime": 331.6845, + "eval_samples_per_second": 34.584, + "eval_steps_per_second": 8.647, + "step": 98500 + }, + { + "epoch": 7.633771165097446, + "grad_norm": 1.4551583181041485, + "learning_rate": 3.8170334779913207e-07, + "loss": 0.9136, + "step": 98510 + }, + { + "epoch": 7.634546088573753, + "grad_norm": 1.3867771599686425, + "learning_rate": 3.8174209547427154e-07, + "loss": 0.9102, + "step": 98520 + }, + { + "epoch": 7.63532101205006, + "grad_norm": 1.3624384346163452, + "learning_rate": 3.8178084314941106e-07, + "loss": 0.8966, + "step": 98530 + }, + { + "epoch": 7.636095935526367, + "grad_norm": 1.4695035004529058, + "learning_rate": 3.8181959082455053e-07, + "loss": 0.9154, + "step": 98540 + }, + { + "epoch": 7.6368708590026735, + "grad_norm": 1.3802822433087474, + "learning_rate": 3.8185833849969005e-07, + "loss": 0.9251, + "step": 98550 + }, + { + "epoch": 7.63764578247898, + "grad_norm": 1.4604379553984466, + "learning_rate": 3.818970861748295e-07, + "loss": 0.9106, + "step": 98560 + }, + { + "epoch": 7.638420705955287, + "grad_norm": 1.3955432038896722, + "learning_rate": 3.8193583384996904e-07, + "loss": 0.9139, + "step": 98570 + }, + { + "epoch": 7.639195629431594, + "grad_norm": 1.370624796231205, + "learning_rate": 3.819745815251085e-07, + "loss": 0.9005, + "step": 98580 + }, + { + "epoch": 7.639970552907901, + "grad_norm": 1.4338814535611795, + "learning_rate": 3.82013329200248e-07, + "loss": 0.9094, + "step": 98590 + }, + { + "epoch": 7.6407454763842075, + "grad_norm": 1.4325921816188736, + "learning_rate": 3.820520768753875e-07, + "loss": 0.9062, + "step": 98600 + }, + { + "epoch": 7.6415203998605135, + "grad_norm": 1.4240995167605859, + "learning_rate": 3.8209082455052697e-07, + "loss": 0.9103, + "step": 98610 + }, + { + "epoch": 7.64229532333682, + "grad_norm": 1.3864431528320236, + "learning_rate": 3.821295722256665e-07, + "loss": 0.9049, + "step": 98620 + }, + { + "epoch": 7.643070246813127, + "grad_norm": 1.4326545114390872, + "learning_rate": 3.8216831990080596e-07, + "loss": 0.9131, + "step": 98630 + }, + { + "epoch": 7.643845170289434, + "grad_norm": 1.5693423254831915, + "learning_rate": 3.8220706757594543e-07, + "loss": 0.9043, + "step": 98640 + }, + { + "epoch": 7.644620093765741, + "grad_norm": 3.2046046790196856, + "learning_rate": 3.8224581525108496e-07, + "loss": 0.9247, + "step": 98650 + }, + { + "epoch": 7.6453950172420475, + "grad_norm": 1.4250800543770588, + "learning_rate": 3.822845629262244e-07, + "loss": 0.9236, + "step": 98660 + }, + { + "epoch": 7.646169940718354, + "grad_norm": 1.4190312856918554, + "learning_rate": 3.8232331060136395e-07, + "loss": 0.9187, + "step": 98670 + }, + { + "epoch": 7.646944864194661, + "grad_norm": 1.4062804370429238, + "learning_rate": 3.823620582765034e-07, + "loss": 0.9074, + "step": 98680 + }, + { + "epoch": 7.647719787670967, + "grad_norm": 1.3920245543820375, + "learning_rate": 3.8240080595164294e-07, + "loss": 0.9016, + "step": 98690 + }, + { + "epoch": 7.648494711147274, + "grad_norm": 1.3152581404107189, + "learning_rate": 3.824395536267824e-07, + "loss": 0.8994, + "step": 98700 + }, + { + "epoch": 7.649269634623581, + "grad_norm": 1.3791857998758437, + "learning_rate": 3.824783013019219e-07, + "loss": 0.9041, + "step": 98710 + }, + { + "epoch": 7.6500445580998875, + "grad_norm": 1.3849403327600192, + "learning_rate": 3.825170489770614e-07, + "loss": 0.9227, + "step": 98720 + }, + { + "epoch": 7.650819481576194, + "grad_norm": 1.3996669735571818, + "learning_rate": 3.8255579665220087e-07, + "loss": 0.9157, + "step": 98730 + }, + { + "epoch": 7.651594405052501, + "grad_norm": 1.4512107772377418, + "learning_rate": 3.825945443273404e-07, + "loss": 0.9124, + "step": 98740 + }, + { + "epoch": 7.652369328528808, + "grad_norm": 1.4131273756324538, + "learning_rate": 3.8263329200247986e-07, + "loss": 0.9255, + "step": 98750 + }, + { + "epoch": 7.653144252005115, + "grad_norm": 1.4791641590286093, + "learning_rate": 3.826720396776194e-07, + "loss": 0.9163, + "step": 98760 + }, + { + "epoch": 7.6539191754814215, + "grad_norm": 1.373292919302659, + "learning_rate": 3.8271078735275885e-07, + "loss": 0.913, + "step": 98770 + }, + { + "epoch": 7.654694098957728, + "grad_norm": 1.368879968188134, + "learning_rate": 3.827495350278983e-07, + "loss": 0.8999, + "step": 98780 + }, + { + "epoch": 7.655469022434034, + "grad_norm": 1.3582255932784373, + "learning_rate": 3.8278828270303784e-07, + "loss": 0.894, + "step": 98790 + }, + { + "epoch": 7.656243945910341, + "grad_norm": 1.405975763199879, + "learning_rate": 3.828270303781773e-07, + "loss": 0.891, + "step": 98800 + }, + { + "epoch": 7.657018869386648, + "grad_norm": 1.4392366482497732, + "learning_rate": 3.8286577805331683e-07, + "loss": 0.9042, + "step": 98810 + }, + { + "epoch": 7.657793792862955, + "grad_norm": 1.3713975957613713, + "learning_rate": 3.829045257284563e-07, + "loss": 0.9244, + "step": 98820 + }, + { + "epoch": 7.6585687163392615, + "grad_norm": 1.3389581799427297, + "learning_rate": 3.829432734035958e-07, + "loss": 0.9095, + "step": 98830 + }, + { + "epoch": 7.659343639815568, + "grad_norm": 1.4376053800456605, + "learning_rate": 3.829820210787353e-07, + "loss": 0.9215, + "step": 98840 + }, + { + "epoch": 7.660118563291875, + "grad_norm": 1.4477809068567125, + "learning_rate": 3.8302076875387476e-07, + "loss": 0.9227, + "step": 98850 + }, + { + "epoch": 7.660893486768182, + "grad_norm": 1.4345562539386514, + "learning_rate": 3.830595164290143e-07, + "loss": 0.9227, + "step": 98860 + }, + { + "epoch": 7.661668410244488, + "grad_norm": 1.3986303774776412, + "learning_rate": 3.8309826410415375e-07, + "loss": 0.9362, + "step": 98870 + }, + { + "epoch": 7.662443333720795, + "grad_norm": 1.49551316478248, + "learning_rate": 3.8313701177929327e-07, + "loss": 0.9276, + "step": 98880 + }, + { + "epoch": 7.6632182571971015, + "grad_norm": 1.469331857498979, + "learning_rate": 3.8317575945443274e-07, + "loss": 0.8997, + "step": 98890 + }, + { + "epoch": 7.663993180673408, + "grad_norm": 1.4301350463237712, + "learning_rate": 3.8321450712957226e-07, + "loss": 0.9086, + "step": 98900 + }, + { + "epoch": 7.664768104149715, + "grad_norm": 1.2923127002690808, + "learning_rate": 3.8325325480471173e-07, + "loss": 0.9085, + "step": 98910 + }, + { + "epoch": 7.665543027626022, + "grad_norm": 1.566965751292593, + "learning_rate": 3.832920024798512e-07, + "loss": 0.9286, + "step": 98920 + }, + { + "epoch": 7.666317951102329, + "grad_norm": 1.5254829119474276, + "learning_rate": 3.833307501549907e-07, + "loss": 0.9263, + "step": 98930 + }, + { + "epoch": 7.6670928745786355, + "grad_norm": 1.401761500827861, + "learning_rate": 3.833694978301302e-07, + "loss": 0.8897, + "step": 98940 + }, + { + "epoch": 7.667867798054942, + "grad_norm": 1.4427344656883463, + "learning_rate": 3.834082455052697e-07, + "loss": 0.9065, + "step": 98950 + }, + { + "epoch": 7.668642721531249, + "grad_norm": 1.4400331435000444, + "learning_rate": 3.834469931804092e-07, + "loss": 0.9325, + "step": 98960 + }, + { + "epoch": 7.669417645007556, + "grad_norm": 1.4438161796486058, + "learning_rate": 3.834857408555487e-07, + "loss": 0.9142, + "step": 98970 + }, + { + "epoch": 7.670192568483862, + "grad_norm": 1.5145468785517922, + "learning_rate": 3.835244885306882e-07, + "loss": 0.919, + "step": 98980 + }, + { + "epoch": 7.670967491960169, + "grad_norm": 1.4899035318956664, + "learning_rate": 3.8356323620582764e-07, + "loss": 0.8956, + "step": 98990 + }, + { + "epoch": 7.6717424154364755, + "grad_norm": 1.3806973248263203, + "learning_rate": 3.8360198388096717e-07, + "loss": 0.9116, + "step": 99000 + }, + { + "epoch": 7.6717424154364755, + "eval_loss": 0.9244319796562195, + "eval_runtime": 330.8347, + "eval_samples_per_second": 34.673, + "eval_steps_per_second": 8.669, + "step": 99000 + }, + { + "epoch": 7.672517338912782, + "grad_norm": 1.4820787918592875, + "learning_rate": 3.8364073155610663e-07, + "loss": 0.9247, + "step": 99010 + }, + { + "epoch": 7.673292262389089, + "grad_norm": 1.4111731855651795, + "learning_rate": 3.8367947923124616e-07, + "loss": 0.9167, + "step": 99020 + }, + { + "epoch": 7.674067185865396, + "grad_norm": 1.3837734636654566, + "learning_rate": 3.837182269063856e-07, + "loss": 0.9011, + "step": 99030 + }, + { + "epoch": 7.674842109341703, + "grad_norm": 1.3865782336218462, + "learning_rate": 3.8375697458152515e-07, + "loss": 0.9017, + "step": 99040 + }, + { + "epoch": 7.6756170328180096, + "grad_norm": 1.5175590811152948, + "learning_rate": 3.837957222566646e-07, + "loss": 0.9268, + "step": 99050 + }, + { + "epoch": 7.6763919562943155, + "grad_norm": 1.3845379059430127, + "learning_rate": 3.838344699318041e-07, + "loss": 0.8943, + "step": 99060 + }, + { + "epoch": 7.677166879770622, + "grad_norm": 1.4305859319689662, + "learning_rate": 3.838732176069436e-07, + "loss": 0.9063, + "step": 99070 + }, + { + "epoch": 7.677941803246929, + "grad_norm": 1.4541922703531454, + "learning_rate": 3.839119652820831e-07, + "loss": 0.9369, + "step": 99080 + }, + { + "epoch": 7.678716726723236, + "grad_norm": 1.3821382314315285, + "learning_rate": 3.839507129572226e-07, + "loss": 0.9105, + "step": 99090 + }, + { + "epoch": 7.679491650199543, + "grad_norm": 1.478421351877657, + "learning_rate": 3.8398946063236207e-07, + "loss": 0.9117, + "step": 99100 + }, + { + "epoch": 7.6802665736758495, + "grad_norm": 1.4466499584742112, + "learning_rate": 3.840282083075016e-07, + "loss": 0.9269, + "step": 99110 + }, + { + "epoch": 7.681041497152156, + "grad_norm": 1.4335547621566929, + "learning_rate": 3.8406695598264106e-07, + "loss": 0.9326, + "step": 99120 + }, + { + "epoch": 7.681816420628463, + "grad_norm": 1.5204796495819195, + "learning_rate": 3.8410570365778053e-07, + "loss": 0.9248, + "step": 99130 + }, + { + "epoch": 7.68259134410477, + "grad_norm": 1.3740739631405297, + "learning_rate": 3.8414445133292005e-07, + "loss": 0.9347, + "step": 99140 + }, + { + "epoch": 7.683366267581077, + "grad_norm": 1.366375598218737, + "learning_rate": 3.841831990080595e-07, + "loss": 0.9239, + "step": 99150 + }, + { + "epoch": 7.684141191057384, + "grad_norm": 1.4602807920914027, + "learning_rate": 3.8422194668319904e-07, + "loss": 0.932, + "step": 99160 + }, + { + "epoch": 7.6849161145336895, + "grad_norm": 1.3048653694698842, + "learning_rate": 3.842606943583385e-07, + "loss": 0.925, + "step": 99170 + }, + { + "epoch": 7.685691038009996, + "grad_norm": 1.4969406190150032, + "learning_rate": 3.8429944203347803e-07, + "loss": 0.9377, + "step": 99180 + }, + { + "epoch": 7.686465961486303, + "grad_norm": 1.4321037547823354, + "learning_rate": 3.843381897086175e-07, + "loss": 0.9112, + "step": 99190 + }, + { + "epoch": 7.68724088496261, + "grad_norm": 1.4389279020106152, + "learning_rate": 3.8437693738375697e-07, + "loss": 0.9234, + "step": 99200 + }, + { + "epoch": 7.688015808438917, + "grad_norm": 1.431403778169388, + "learning_rate": 3.844156850588965e-07, + "loss": 0.9266, + "step": 99210 + }, + { + "epoch": 7.6887907319152236, + "grad_norm": 1.4612421316081088, + "learning_rate": 3.8445443273403596e-07, + "loss": 0.9073, + "step": 99220 + }, + { + "epoch": 7.68956565539153, + "grad_norm": 1.4258179044556591, + "learning_rate": 3.844931804091755e-07, + "loss": 0.9206, + "step": 99230 + }, + { + "epoch": 7.690340578867837, + "grad_norm": 1.4510943273976138, + "learning_rate": 3.8453192808431495e-07, + "loss": 0.9166, + "step": 99240 + }, + { + "epoch": 7.691115502344143, + "grad_norm": 1.3410675975953996, + "learning_rate": 3.845706757594545e-07, + "loss": 0.9061, + "step": 99250 + }, + { + "epoch": 7.69189042582045, + "grad_norm": 1.3316422711086415, + "learning_rate": 3.8460942343459394e-07, + "loss": 0.94, + "step": 99260 + }, + { + "epoch": 7.692665349296757, + "grad_norm": 1.3912010024218244, + "learning_rate": 3.846481711097334e-07, + "loss": 0.9305, + "step": 99270 + }, + { + "epoch": 7.6934402727730635, + "grad_norm": 1.4579671969090073, + "learning_rate": 3.8468691878487293e-07, + "loss": 0.9271, + "step": 99280 + }, + { + "epoch": 7.69421519624937, + "grad_norm": 1.4851149092857803, + "learning_rate": 3.847256664600124e-07, + "loss": 0.9205, + "step": 99290 + }, + { + "epoch": 7.694990119725677, + "grad_norm": 1.38407363558421, + "learning_rate": 3.847644141351519e-07, + "loss": 0.8995, + "step": 99300 + }, + { + "epoch": 7.695765043201984, + "grad_norm": 1.3762629352618692, + "learning_rate": 3.848031618102914e-07, + "loss": 0.9098, + "step": 99310 + }, + { + "epoch": 7.696539966678291, + "grad_norm": 1.4553160669231522, + "learning_rate": 3.8484190948543086e-07, + "loss": 0.9249, + "step": 99320 + }, + { + "epoch": 7.697314890154598, + "grad_norm": 1.4625337440724067, + "learning_rate": 3.848806571605704e-07, + "loss": 0.9283, + "step": 99330 + }, + { + "epoch": 7.698089813630904, + "grad_norm": 1.356243375524094, + "learning_rate": 3.8491940483570985e-07, + "loss": 0.9119, + "step": 99340 + }, + { + "epoch": 7.69886473710721, + "grad_norm": 1.4182065880439034, + "learning_rate": 3.849581525108494e-07, + "loss": 0.9266, + "step": 99350 + }, + { + "epoch": 7.699639660583517, + "grad_norm": 1.572643709333692, + "learning_rate": 3.8499690018598885e-07, + "loss": 0.9097, + "step": 99360 + }, + { + "epoch": 7.700414584059824, + "grad_norm": 1.4154523142504463, + "learning_rate": 3.8503564786112837e-07, + "loss": 0.9354, + "step": 99370 + }, + { + "epoch": 7.701189507536131, + "grad_norm": 1.368515538523704, + "learning_rate": 3.8507439553626784e-07, + "loss": 0.9407, + "step": 99380 + }, + { + "epoch": 7.7019644310124376, + "grad_norm": 1.3999624619152211, + "learning_rate": 3.851131432114073e-07, + "loss": 0.909, + "step": 99390 + }, + { + "epoch": 7.702739354488744, + "grad_norm": 1.460497551461028, + "learning_rate": 3.8515189088654683e-07, + "loss": 0.9245, + "step": 99400 + }, + { + "epoch": 7.703514277965051, + "grad_norm": 1.426458821512506, + "learning_rate": 3.851906385616863e-07, + "loss": 0.9283, + "step": 99410 + }, + { + "epoch": 7.704289201441358, + "grad_norm": 1.3860595388159132, + "learning_rate": 3.852293862368258e-07, + "loss": 0.9138, + "step": 99420 + }, + { + "epoch": 7.705064124917664, + "grad_norm": 1.4276850804613292, + "learning_rate": 3.852681339119653e-07, + "loss": 0.9125, + "step": 99430 + }, + { + "epoch": 7.705839048393971, + "grad_norm": 1.5089198890984097, + "learning_rate": 3.853068815871048e-07, + "loss": 0.9345, + "step": 99440 + }, + { + "epoch": 7.7066139718702775, + "grad_norm": 1.446697361149277, + "learning_rate": 3.853456292622443e-07, + "loss": 0.9438, + "step": 99450 + }, + { + "epoch": 7.707388895346584, + "grad_norm": 1.4866200548609012, + "learning_rate": 3.8538437693738375e-07, + "loss": 0.9462, + "step": 99460 + }, + { + "epoch": 7.708163818822891, + "grad_norm": 1.3940720350748401, + "learning_rate": 3.8542312461252327e-07, + "loss": 0.9176, + "step": 99470 + }, + { + "epoch": 7.708938742299198, + "grad_norm": 1.3849388514235217, + "learning_rate": 3.8546187228766274e-07, + "loss": 0.9058, + "step": 99480 + }, + { + "epoch": 7.709713665775505, + "grad_norm": 1.3981161170275616, + "learning_rate": 3.8550061996280226e-07, + "loss": 0.9546, + "step": 99490 + }, + { + "epoch": 7.710488589251812, + "grad_norm": 1.4352613533164666, + "learning_rate": 3.8553936763794173e-07, + "loss": 0.9333, + "step": 99500 + }, + { + "epoch": 7.710488589251812, + "eval_loss": 0.9241409301757812, + "eval_runtime": 332.4572, + "eval_samples_per_second": 34.504, + "eval_steps_per_second": 8.627, + "step": 99500 + }, + { + "epoch": 7.711263512728118, + "grad_norm": 1.4068668231274597, + "learning_rate": 3.8557811531308125e-07, + "loss": 0.9446, + "step": 99510 + }, + { + "epoch": 7.712038436204425, + "grad_norm": 1.3696080520414222, + "learning_rate": 3.856168629882207e-07, + "loss": 0.9273, + "step": 99520 + }, + { + "epoch": 7.712813359680732, + "grad_norm": 1.4446914296987041, + "learning_rate": 3.856556106633602e-07, + "loss": 0.9223, + "step": 99530 + }, + { + "epoch": 7.713588283157038, + "grad_norm": 1.427581078367664, + "learning_rate": 3.856943583384997e-07, + "loss": 0.9349, + "step": 99540 + }, + { + "epoch": 7.714363206633345, + "grad_norm": 1.370643461777344, + "learning_rate": 3.857331060136392e-07, + "loss": 0.9144, + "step": 99550 + }, + { + "epoch": 7.7151381301096515, + "grad_norm": 1.3530991874703255, + "learning_rate": 3.857718536887787e-07, + "loss": 0.9217, + "step": 99560 + }, + { + "epoch": 7.715913053585958, + "grad_norm": 1.498757623421468, + "learning_rate": 3.8581060136391817e-07, + "loss": 0.9183, + "step": 99570 + }, + { + "epoch": 7.716687977062265, + "grad_norm": 1.5193029478280982, + "learning_rate": 3.858493490390577e-07, + "loss": 0.9324, + "step": 99580 + }, + { + "epoch": 7.717462900538572, + "grad_norm": 1.4132044457437065, + "learning_rate": 3.8588809671419716e-07, + "loss": 0.9169, + "step": 99590 + }, + { + "epoch": 7.718237824014879, + "grad_norm": 1.4747206048266897, + "learning_rate": 3.8592684438933663e-07, + "loss": 0.9186, + "step": 99600 + }, + { + "epoch": 7.719012747491186, + "grad_norm": 1.3244496199911602, + "learning_rate": 3.8596559206447615e-07, + "loss": 0.922, + "step": 99610 + }, + { + "epoch": 7.7197876709674915, + "grad_norm": 1.4934470469608456, + "learning_rate": 3.860043397396156e-07, + "loss": 0.9304, + "step": 99620 + }, + { + "epoch": 7.720562594443798, + "grad_norm": 1.407774883416455, + "learning_rate": 3.8604308741475514e-07, + "loss": 0.9187, + "step": 99630 + }, + { + "epoch": 7.721337517920105, + "grad_norm": 1.5398485339192207, + "learning_rate": 3.860818350898946e-07, + "loss": 0.9052, + "step": 99640 + }, + { + "epoch": 7.722112441396412, + "grad_norm": 1.4605235117487985, + "learning_rate": 3.8612058276503414e-07, + "loss": 0.9191, + "step": 99650 + }, + { + "epoch": 7.722887364872719, + "grad_norm": 1.3673123355237926, + "learning_rate": 3.861593304401736e-07, + "loss": 0.9245, + "step": 99660 + }, + { + "epoch": 7.723662288349026, + "grad_norm": 1.3941851925156674, + "learning_rate": 3.861980781153131e-07, + "loss": 0.915, + "step": 99670 + }, + { + "epoch": 7.724437211825332, + "grad_norm": 1.4193670180013185, + "learning_rate": 3.862368257904526e-07, + "loss": 0.9103, + "step": 99680 + }, + { + "epoch": 7.725212135301639, + "grad_norm": 1.4479823087542345, + "learning_rate": 3.8627557346559206e-07, + "loss": 0.9158, + "step": 99690 + }, + { + "epoch": 7.725987058777946, + "grad_norm": 1.418380694347081, + "learning_rate": 3.863143211407316e-07, + "loss": 0.9255, + "step": 99700 + }, + { + "epoch": 7.726761982254253, + "grad_norm": 1.4392931151137622, + "learning_rate": 3.8635306881587106e-07, + "loss": 0.9044, + "step": 99710 + }, + { + "epoch": 7.727536905730559, + "grad_norm": 1.389475369691568, + "learning_rate": 3.863918164910106e-07, + "loss": 0.934, + "step": 99720 + }, + { + "epoch": 7.7283118292068655, + "grad_norm": 1.4511518062149495, + "learning_rate": 3.8643056416615005e-07, + "loss": 0.9009, + "step": 99730 + }, + { + "epoch": 7.729086752683172, + "grad_norm": 1.4637618167805297, + "learning_rate": 3.864693118412895e-07, + "loss": 0.897, + "step": 99740 + }, + { + "epoch": 7.729861676159479, + "grad_norm": 1.4432050026025118, + "learning_rate": 3.8650805951642904e-07, + "loss": 0.9191, + "step": 99750 + }, + { + "epoch": 7.730636599635786, + "grad_norm": 1.381133035548192, + "learning_rate": 3.865468071915685e-07, + "loss": 0.916, + "step": 99760 + }, + { + "epoch": 7.731411523112093, + "grad_norm": 1.5172186552278055, + "learning_rate": 3.8658555486670803e-07, + "loss": 0.9114, + "step": 99770 + }, + { + "epoch": 7.7321864465884, + "grad_norm": 1.5157924618503873, + "learning_rate": 3.866243025418475e-07, + "loss": 0.9206, + "step": 99780 + }, + { + "epoch": 7.732961370064706, + "grad_norm": 1.3573435913841372, + "learning_rate": 3.86663050216987e-07, + "loss": 0.9219, + "step": 99790 + }, + { + "epoch": 7.733736293541012, + "grad_norm": 1.4249521738102924, + "learning_rate": 3.867017978921265e-07, + "loss": 0.9162, + "step": 99800 + }, + { + "epoch": 7.734511217017319, + "grad_norm": 1.4249225116230582, + "learning_rate": 3.8674054556726596e-07, + "loss": 0.9112, + "step": 99810 + }, + { + "epoch": 7.735286140493626, + "grad_norm": 1.4207594678005264, + "learning_rate": 3.867792932424055e-07, + "loss": 0.9209, + "step": 99820 + }, + { + "epoch": 7.736061063969933, + "grad_norm": 1.4259539570080644, + "learning_rate": 3.8681804091754495e-07, + "loss": 0.9306, + "step": 99830 + }, + { + "epoch": 7.73683598744624, + "grad_norm": 1.4749282263694066, + "learning_rate": 3.8685678859268447e-07, + "loss": 0.9095, + "step": 99840 + }, + { + "epoch": 7.737610910922546, + "grad_norm": 1.4410304037412907, + "learning_rate": 3.8689553626782394e-07, + "loss": 0.9323, + "step": 99850 + }, + { + "epoch": 7.738385834398853, + "grad_norm": 1.5124432111654718, + "learning_rate": 3.8693428394296346e-07, + "loss": 0.9225, + "step": 99860 + }, + { + "epoch": 7.73916075787516, + "grad_norm": 1.5681078252935239, + "learning_rate": 3.8697303161810293e-07, + "loss": 0.9058, + "step": 99870 + }, + { + "epoch": 7.739935681351467, + "grad_norm": 1.476368717445595, + "learning_rate": 3.870117792932424e-07, + "loss": 0.916, + "step": 99880 + }, + { + "epoch": 7.740710604827774, + "grad_norm": 1.415307065869652, + "learning_rate": 3.870505269683819e-07, + "loss": 0.921, + "step": 99890 + }, + { + "epoch": 7.74148552830408, + "grad_norm": 1.4005564578061755, + "learning_rate": 3.870892746435214e-07, + "loss": 0.919, + "step": 99900 + }, + { + "epoch": 7.742260451780386, + "grad_norm": 1.398094184075863, + "learning_rate": 3.871280223186609e-07, + "loss": 0.9034, + "step": 99910 + }, + { + "epoch": 7.743035375256693, + "grad_norm": 1.376228354320085, + "learning_rate": 3.871667699938004e-07, + "loss": 0.9249, + "step": 99920 + }, + { + "epoch": 7.743810298733, + "grad_norm": 1.4396417791766074, + "learning_rate": 3.872055176689399e-07, + "loss": 0.9163, + "step": 99930 + }, + { + "epoch": 7.744585222209307, + "grad_norm": 1.4227351195191267, + "learning_rate": 3.8724426534407937e-07, + "loss": 0.925, + "step": 99940 + }, + { + "epoch": 7.745360145685614, + "grad_norm": 1.3969660571981979, + "learning_rate": 3.8728301301921884e-07, + "loss": 0.9285, + "step": 99950 + }, + { + "epoch": 7.74613506916192, + "grad_norm": 1.4300180910759934, + "learning_rate": 3.8732176069435836e-07, + "loss": 0.9244, + "step": 99960 + }, + { + "epoch": 7.746909992638227, + "grad_norm": 1.4437355833966912, + "learning_rate": 3.8736050836949783e-07, + "loss": 0.9243, + "step": 99970 + }, + { + "epoch": 7.747684916114534, + "grad_norm": 1.4277422292016777, + "learning_rate": 3.8739925604463736e-07, + "loss": 0.9307, + "step": 99980 + }, + { + "epoch": 7.74845983959084, + "grad_norm": 1.4991707037537012, + "learning_rate": 3.874380037197768e-07, + "loss": 0.9163, + "step": 99990 + }, + { + "epoch": 7.749234763067147, + "grad_norm": 1.4437580817997087, + "learning_rate": 3.8747675139491635e-07, + "loss": 0.918, + "step": 100000 + }, + { + "epoch": 7.749234763067147, + "eval_loss": 0.9236516356468201, + "eval_runtime": 328.6417, + "eval_samples_per_second": 34.904, + "eval_steps_per_second": 8.727, + "step": 100000 + }, + { + "epoch": 7.750009686543454, + "grad_norm": 1.4393308582566369, + "learning_rate": 3.875154990700558e-07, + "loss": 0.9504, + "step": 100010 + }, + { + "epoch": 7.75078461001976, + "grad_norm": 1.5407185515997013, + "learning_rate": 3.875542467451953e-07, + "loss": 0.9109, + "step": 100020 + }, + { + "epoch": 7.751559533496067, + "grad_norm": 1.3977733091799516, + "learning_rate": 3.875929944203348e-07, + "loss": 0.9258, + "step": 100030 + }, + { + "epoch": 7.752334456972374, + "grad_norm": 1.3735654117599334, + "learning_rate": 3.876317420954743e-07, + "loss": 0.911, + "step": 100040 + }, + { + "epoch": 7.753109380448681, + "grad_norm": 1.4705434399633799, + "learning_rate": 3.876704897706138e-07, + "loss": 0.9034, + "step": 100050 + }, + { + "epoch": 7.753884303924988, + "grad_norm": 1.5372364504536895, + "learning_rate": 3.8770923744575327e-07, + "loss": 0.9186, + "step": 100060 + }, + { + "epoch": 7.754659227401294, + "grad_norm": 1.4218626537211536, + "learning_rate": 3.8774798512089274e-07, + "loss": 0.9097, + "step": 100070 + }, + { + "epoch": 7.755434150877601, + "grad_norm": 1.471191901499318, + "learning_rate": 3.8778673279603226e-07, + "loss": 0.9372, + "step": 100080 + }, + { + "epoch": 7.756209074353908, + "grad_norm": 1.4495020382608832, + "learning_rate": 3.878254804711717e-07, + "loss": 0.927, + "step": 100090 + }, + { + "epoch": 7.756983997830214, + "grad_norm": 1.4371464115027677, + "learning_rate": 3.8786422814631125e-07, + "loss": 0.9151, + "step": 100100 + }, + { + "epoch": 7.757758921306521, + "grad_norm": 1.3941595601570602, + "learning_rate": 3.879029758214507e-07, + "loss": 0.9106, + "step": 100110 + }, + { + "epoch": 7.758533844782828, + "grad_norm": 1.3872242627022668, + "learning_rate": 3.8794172349659024e-07, + "loss": 0.9231, + "step": 100120 + }, + { + "epoch": 7.759308768259134, + "grad_norm": 1.357460942365618, + "learning_rate": 3.879804711717297e-07, + "loss": 0.9225, + "step": 100130 + }, + { + "epoch": 7.760083691735441, + "grad_norm": 1.4226751766261845, + "learning_rate": 3.880192188468692e-07, + "loss": 0.9071, + "step": 100140 + }, + { + "epoch": 7.760858615211748, + "grad_norm": 1.3716616550283538, + "learning_rate": 3.880579665220087e-07, + "loss": 0.9222, + "step": 100150 + }, + { + "epoch": 7.761633538688055, + "grad_norm": 1.4202437996097417, + "learning_rate": 3.8809671419714817e-07, + "loss": 0.9182, + "step": 100160 + }, + { + "epoch": 7.762408462164362, + "grad_norm": 1.3888084575900832, + "learning_rate": 3.881354618722877e-07, + "loss": 0.9128, + "step": 100170 + }, + { + "epoch": 7.763183385640668, + "grad_norm": 1.319082856915432, + "learning_rate": 3.8817420954742716e-07, + "loss": 0.902, + "step": 100180 + }, + { + "epoch": 7.763958309116974, + "grad_norm": 1.4125053147053814, + "learning_rate": 3.882129572225667e-07, + "loss": 0.9169, + "step": 100190 + }, + { + "epoch": 7.764733232593281, + "grad_norm": 1.5000651686281568, + "learning_rate": 3.8825170489770615e-07, + "loss": 0.9319, + "step": 100200 + }, + { + "epoch": 7.765508156069588, + "grad_norm": 1.458669933919486, + "learning_rate": 3.882904525728456e-07, + "loss": 0.9436, + "step": 100210 + }, + { + "epoch": 7.766283079545895, + "grad_norm": 1.4071449923009365, + "learning_rate": 3.8832920024798514e-07, + "loss": 0.9201, + "step": 100220 + }, + { + "epoch": 7.767058003022202, + "grad_norm": 1.4437861806788224, + "learning_rate": 3.883679479231246e-07, + "loss": 0.9278, + "step": 100230 + }, + { + "epoch": 7.767832926498508, + "grad_norm": 1.4144683034734291, + "learning_rate": 3.8840669559826413e-07, + "loss": 0.928, + "step": 100240 + }, + { + "epoch": 7.768607849974815, + "grad_norm": 1.4238916472432386, + "learning_rate": 3.884454432734036e-07, + "loss": 0.924, + "step": 100250 + }, + { + "epoch": 7.769382773451122, + "grad_norm": 1.360531439578694, + "learning_rate": 3.884841909485431e-07, + "loss": 0.9133, + "step": 100260 + }, + { + "epoch": 7.770157696927429, + "grad_norm": 1.4991117512388368, + "learning_rate": 3.885229386236826e-07, + "loss": 0.9172, + "step": 100270 + }, + { + "epoch": 7.770932620403735, + "grad_norm": 1.3834220846428344, + "learning_rate": 3.8856168629882206e-07, + "loss": 0.924, + "step": 100280 + }, + { + "epoch": 7.771707543880042, + "grad_norm": 1.4545674970620364, + "learning_rate": 3.886004339739616e-07, + "loss": 0.9228, + "step": 100290 + }, + { + "epoch": 7.772482467356348, + "grad_norm": 1.4686312968184871, + "learning_rate": 3.8863918164910105e-07, + "loss": 0.9217, + "step": 100300 + }, + { + "epoch": 7.773257390832655, + "grad_norm": 1.804582314155496, + "learning_rate": 3.886779293242406e-07, + "loss": 0.9232, + "step": 100310 + }, + { + "epoch": 7.774032314308962, + "grad_norm": 1.3726559836429408, + "learning_rate": 3.8871667699938004e-07, + "loss": 0.9156, + "step": 100320 + }, + { + "epoch": 7.774807237785269, + "grad_norm": 1.4384620628400726, + "learning_rate": 3.8875542467451957e-07, + "loss": 0.9274, + "step": 100330 + }, + { + "epoch": 7.775582161261576, + "grad_norm": 1.4419718838465032, + "learning_rate": 3.8879417234965903e-07, + "loss": 0.9313, + "step": 100340 + }, + { + "epoch": 7.7763570847378825, + "grad_norm": 1.499806607575103, + "learning_rate": 3.888329200247985e-07, + "loss": 0.9136, + "step": 100350 + }, + { + "epoch": 7.777132008214188, + "grad_norm": 1.4454754536850305, + "learning_rate": 3.88871667699938e-07, + "loss": 0.9182, + "step": 100360 + }, + { + "epoch": 7.777906931690495, + "grad_norm": 1.4026312305118738, + "learning_rate": 3.889104153750775e-07, + "loss": 0.9103, + "step": 100370 + }, + { + "epoch": 7.778681855166802, + "grad_norm": 1.4513146180357153, + "learning_rate": 3.88949163050217e-07, + "loss": 0.9097, + "step": 100380 + }, + { + "epoch": 7.779456778643109, + "grad_norm": 1.456100045661663, + "learning_rate": 3.889879107253565e-07, + "loss": 0.9203, + "step": 100390 + }, + { + "epoch": 7.780231702119416, + "grad_norm": 1.4980419628290809, + "learning_rate": 3.89026658400496e-07, + "loss": 0.9154, + "step": 100400 + }, + { + "epoch": 7.781006625595722, + "grad_norm": 1.4419290035504697, + "learning_rate": 3.890654060756355e-07, + "loss": 0.9041, + "step": 100410 + }, + { + "epoch": 7.781781549072029, + "grad_norm": 1.4617250303581768, + "learning_rate": 3.8910415375077495e-07, + "loss": 0.9263, + "step": 100420 + }, + { + "epoch": 7.782556472548336, + "grad_norm": 1.4413691666584576, + "learning_rate": 3.8914290142591447e-07, + "loss": 0.9254, + "step": 100430 + }, + { + "epoch": 7.783331396024643, + "grad_norm": 1.5097876971734026, + "learning_rate": 3.8918164910105394e-07, + "loss": 0.9008, + "step": 100440 + }, + { + "epoch": 7.78410631950095, + "grad_norm": 1.3593494312891086, + "learning_rate": 3.8922039677619346e-07, + "loss": 0.9151, + "step": 100450 + }, + { + "epoch": 7.7848812429772565, + "grad_norm": 1.3698494271633, + "learning_rate": 3.8925914445133293e-07, + "loss": 0.9115, + "step": 100460 + }, + { + "epoch": 7.785656166453562, + "grad_norm": 1.4176566970897648, + "learning_rate": 3.8929789212647245e-07, + "loss": 0.9196, + "step": 100470 + }, + { + "epoch": 7.786431089929869, + "grad_norm": 1.3736412299546685, + "learning_rate": 3.893366398016119e-07, + "loss": 0.9126, + "step": 100480 + }, + { + "epoch": 7.787206013406176, + "grad_norm": 1.4013231224012754, + "learning_rate": 3.893753874767514e-07, + "loss": 0.9058, + "step": 100490 + }, + { + "epoch": 7.787980936882483, + "grad_norm": 1.4153339100783064, + "learning_rate": 3.894141351518909e-07, + "loss": 0.9608, + "step": 100500 + }, + { + "epoch": 7.787980936882483, + "eval_loss": 0.923370897769928, + "eval_runtime": 329.8557, + "eval_samples_per_second": 34.776, + "eval_steps_per_second": 8.695, + "step": 100500 + }, + { + "epoch": 7.78875586035879, + "grad_norm": 1.4493316973872672, + "learning_rate": 3.894528828270304e-07, + "loss": 0.9243, + "step": 100510 + }, + { + "epoch": 7.7895307838350965, + "grad_norm": 1.3683761729597483, + "learning_rate": 3.894916305021699e-07, + "loss": 0.8993, + "step": 100520 + }, + { + "epoch": 7.790305707311403, + "grad_norm": 1.3656522730283716, + "learning_rate": 3.8953037817730937e-07, + "loss": 0.9229, + "step": 100530 + }, + { + "epoch": 7.79108063078771, + "grad_norm": 1.5085600174303049, + "learning_rate": 3.895691258524489e-07, + "loss": 0.9317, + "step": 100540 + }, + { + "epoch": 7.791855554264016, + "grad_norm": 1.4501802057525712, + "learning_rate": 3.8960787352758836e-07, + "loss": 0.9439, + "step": 100550 + }, + { + "epoch": 7.792630477740323, + "grad_norm": 1.3497801299781718, + "learning_rate": 3.8964662120272783e-07, + "loss": 0.926, + "step": 100560 + }, + { + "epoch": 7.79340540121663, + "grad_norm": 1.3943433058822017, + "learning_rate": 3.8968536887786735e-07, + "loss": 0.9156, + "step": 100570 + }, + { + "epoch": 7.794180324692936, + "grad_norm": 1.4848797793047073, + "learning_rate": 3.897241165530068e-07, + "loss": 0.9073, + "step": 100580 + }, + { + "epoch": 7.794955248169243, + "grad_norm": 1.4834614450126253, + "learning_rate": 3.8976286422814634e-07, + "loss": 0.9248, + "step": 100590 + }, + { + "epoch": 7.79573017164555, + "grad_norm": 1.42260658606557, + "learning_rate": 3.898016119032858e-07, + "loss": 0.9255, + "step": 100600 + }, + { + "epoch": 7.796505095121857, + "grad_norm": 1.420301037645688, + "learning_rate": 3.8984035957842533e-07, + "loss": 0.9203, + "step": 100610 + }, + { + "epoch": 7.797280018598164, + "grad_norm": 1.427454786258808, + "learning_rate": 3.898791072535648e-07, + "loss": 0.9079, + "step": 100620 + }, + { + "epoch": 7.7980549420744705, + "grad_norm": 1.4820477075194474, + "learning_rate": 3.8991785492870427e-07, + "loss": 0.9142, + "step": 100630 + }, + { + "epoch": 7.798829865550777, + "grad_norm": 1.4077526508225835, + "learning_rate": 3.899566026038438e-07, + "loss": 0.9088, + "step": 100640 + }, + { + "epoch": 7.799604789027083, + "grad_norm": 1.4374527791982266, + "learning_rate": 3.8999535027898326e-07, + "loss": 0.9102, + "step": 100650 + }, + { + "epoch": 7.80037971250339, + "grad_norm": 1.3995085001438468, + "learning_rate": 3.900340979541228e-07, + "loss": 0.9036, + "step": 100660 + }, + { + "epoch": 7.801154635979697, + "grad_norm": 1.3864087575809396, + "learning_rate": 3.9007284562926225e-07, + "loss": 0.8989, + "step": 100670 + }, + { + "epoch": 7.801929559456004, + "grad_norm": 1.446020526701007, + "learning_rate": 3.901115933044018e-07, + "loss": 0.9131, + "step": 100680 + }, + { + "epoch": 7.8027044829323104, + "grad_norm": 1.4364371824456212, + "learning_rate": 3.9015034097954125e-07, + "loss": 0.9151, + "step": 100690 + }, + { + "epoch": 7.803479406408617, + "grad_norm": 1.3802218278291425, + "learning_rate": 3.901890886546807e-07, + "loss": 0.9241, + "step": 100700 + }, + { + "epoch": 7.804254329884924, + "grad_norm": 1.4540383107917647, + "learning_rate": 3.9022783632982024e-07, + "loss": 0.917, + "step": 100710 + }, + { + "epoch": 7.805029253361231, + "grad_norm": 1.4361961084327481, + "learning_rate": 3.902665840049597e-07, + "loss": 0.9124, + "step": 100720 + }, + { + "epoch": 7.805804176837537, + "grad_norm": 1.4015752165928428, + "learning_rate": 3.9030533168009923e-07, + "loss": 0.9013, + "step": 100730 + }, + { + "epoch": 7.806579100313844, + "grad_norm": 1.4108398741024484, + "learning_rate": 3.903440793552387e-07, + "loss": 0.9157, + "step": 100740 + }, + { + "epoch": 7.80735402379015, + "grad_norm": 1.4773176989129384, + "learning_rate": 3.903828270303782e-07, + "loss": 0.9181, + "step": 100750 + }, + { + "epoch": 7.808128947266457, + "grad_norm": 1.4227429603494142, + "learning_rate": 3.904215747055177e-07, + "loss": 0.9237, + "step": 100760 + }, + { + "epoch": 7.808903870742764, + "grad_norm": 1.45488702202112, + "learning_rate": 3.9046032238065716e-07, + "loss": 0.9186, + "step": 100770 + }, + { + "epoch": 7.809678794219071, + "grad_norm": 1.4050879482765122, + "learning_rate": 3.904990700557967e-07, + "loss": 0.9054, + "step": 100780 + }, + { + "epoch": 7.810453717695378, + "grad_norm": 1.50546718674229, + "learning_rate": 3.9053781773093615e-07, + "loss": 0.923, + "step": 100790 + }, + { + "epoch": 7.8112286411716845, + "grad_norm": 1.3745490306097166, + "learning_rate": 3.9057656540607567e-07, + "loss": 0.8892, + "step": 100800 + }, + { + "epoch": 7.812003564647991, + "grad_norm": 1.3874583767120487, + "learning_rate": 3.9061531308121514e-07, + "loss": 0.9097, + "step": 100810 + }, + { + "epoch": 7.812778488124298, + "grad_norm": 1.4373274415280255, + "learning_rate": 3.906540607563546e-07, + "loss": 0.918, + "step": 100820 + }, + { + "epoch": 7.813553411600605, + "grad_norm": 1.422058086275576, + "learning_rate": 3.9069280843149413e-07, + "loss": 0.9182, + "step": 100830 + }, + { + "epoch": 7.814328335076911, + "grad_norm": 1.5097413625751686, + "learning_rate": 3.907315561066336e-07, + "loss": 0.9198, + "step": 100840 + }, + { + "epoch": 7.815103258553218, + "grad_norm": 1.3507254822237593, + "learning_rate": 3.907703037817731e-07, + "loss": 0.9194, + "step": 100850 + }, + { + "epoch": 7.8158781820295244, + "grad_norm": 1.4188501298851266, + "learning_rate": 3.908090514569126e-07, + "loss": 0.9182, + "step": 100860 + }, + { + "epoch": 7.816653105505831, + "grad_norm": 1.3945074065203944, + "learning_rate": 3.908477991320521e-07, + "loss": 0.9154, + "step": 100870 + }, + { + "epoch": 7.817428028982138, + "grad_norm": 1.5124340325361034, + "learning_rate": 3.908865468071916e-07, + "loss": 0.9333, + "step": 100880 + }, + { + "epoch": 7.818202952458445, + "grad_norm": 1.4614101818106997, + "learning_rate": 3.9092529448233105e-07, + "loss": 0.9066, + "step": 100890 + }, + { + "epoch": 7.818977875934752, + "grad_norm": 1.3943396096918625, + "learning_rate": 3.9096404215747057e-07, + "loss": 0.9254, + "step": 100900 + }, + { + "epoch": 7.8197527994110585, + "grad_norm": 1.48230463712425, + "learning_rate": 3.9100278983261004e-07, + "loss": 0.9041, + "step": 100910 + }, + { + "epoch": 7.820527722887364, + "grad_norm": 1.3484592570949634, + "learning_rate": 3.9104153750774956e-07, + "loss": 0.9239, + "step": 100920 + }, + { + "epoch": 7.821302646363671, + "grad_norm": 1.3980495497828422, + "learning_rate": 3.9108028518288903e-07, + "loss": 0.9096, + "step": 100930 + }, + { + "epoch": 7.822077569839978, + "grad_norm": 1.3970120669134458, + "learning_rate": 3.9111903285802855e-07, + "loss": 0.9113, + "step": 100940 + }, + { + "epoch": 7.822852493316285, + "grad_norm": 1.3694364924503624, + "learning_rate": 3.91157780533168e-07, + "loss": 0.9303, + "step": 100950 + }, + { + "epoch": 7.823627416792592, + "grad_norm": 1.4144035036193452, + "learning_rate": 3.911965282083075e-07, + "loss": 0.9256, + "step": 100960 + }, + { + "epoch": 7.8244023402688985, + "grad_norm": 1.3230360492994602, + "learning_rate": 3.91235275883447e-07, + "loss": 0.9029, + "step": 100970 + }, + { + "epoch": 7.825177263745205, + "grad_norm": 1.5130113944978931, + "learning_rate": 3.912740235585865e-07, + "loss": 0.9206, + "step": 100980 + }, + { + "epoch": 7.825952187221512, + "grad_norm": 1.4383310878714792, + "learning_rate": 3.91312771233726e-07, + "loss": 0.9107, + "step": 100990 + }, + { + "epoch": 7.826727110697819, + "grad_norm": 1.3807541230196958, + "learning_rate": 3.9135151890886547e-07, + "loss": 0.9143, + "step": 101000 + }, + { + "epoch": 7.826727110697819, + "eval_loss": 0.9230188727378845, + "eval_runtime": 330.0074, + "eval_samples_per_second": 34.76, + "eval_steps_per_second": 8.691, + "step": 101000 + }, + { + "epoch": 7.827502034174126, + "grad_norm": 1.4078053759573763, + "learning_rate": 3.91390266584005e-07, + "loss": 0.9401, + "step": 101010 + }, + { + "epoch": 7.828276957650432, + "grad_norm": 1.4806584412998898, + "learning_rate": 3.9142901425914446e-07, + "loss": 0.9093, + "step": 101020 + }, + { + "epoch": 7.829051881126738, + "grad_norm": 1.5141102848225918, + "learning_rate": 3.9146776193428393e-07, + "loss": 0.9397, + "step": 101030 + }, + { + "epoch": 7.829826804603045, + "grad_norm": 1.4662679746801874, + "learning_rate": 3.9150650960942346e-07, + "loss": 0.9356, + "step": 101040 + }, + { + "epoch": 7.830601728079352, + "grad_norm": 2.1171534736319626, + "learning_rate": 3.915452572845629e-07, + "loss": 0.9213, + "step": 101050 + }, + { + "epoch": 7.831376651555659, + "grad_norm": 1.2904646781602227, + "learning_rate": 3.9158400495970245e-07, + "loss": 0.9136, + "step": 101060 + }, + { + "epoch": 7.832151575031966, + "grad_norm": 1.3291307038661015, + "learning_rate": 3.916227526348419e-07, + "loss": 0.9251, + "step": 101070 + }, + { + "epoch": 7.8329264985082725, + "grad_norm": 1.4937925804250172, + "learning_rate": 3.9166150030998144e-07, + "loss": 0.9032, + "step": 101080 + }, + { + "epoch": 7.833701421984579, + "grad_norm": 1.4550370607930108, + "learning_rate": 3.917002479851209e-07, + "loss": 0.9376, + "step": 101090 + }, + { + "epoch": 7.834476345460886, + "grad_norm": 1.403042535148221, + "learning_rate": 3.917389956602604e-07, + "loss": 0.9185, + "step": 101100 + }, + { + "epoch": 7.835251268937192, + "grad_norm": 1.3784286679656754, + "learning_rate": 3.917777433353999e-07, + "loss": 0.9372, + "step": 101110 + }, + { + "epoch": 7.836026192413499, + "grad_norm": 1.4214382778858414, + "learning_rate": 3.9181649101053937e-07, + "loss": 0.9264, + "step": 101120 + }, + { + "epoch": 7.836801115889806, + "grad_norm": 1.4027728858241195, + "learning_rate": 3.918552386856789e-07, + "loss": 0.9186, + "step": 101130 + }, + { + "epoch": 7.8375760393661125, + "grad_norm": 1.4510408703171518, + "learning_rate": 3.9189398636081836e-07, + "loss": 0.9172, + "step": 101140 + }, + { + "epoch": 7.838350962842419, + "grad_norm": 1.4642237849006448, + "learning_rate": 3.919327340359579e-07, + "loss": 0.9012, + "step": 101150 + }, + { + "epoch": 7.839125886318726, + "grad_norm": 1.443527635098953, + "learning_rate": 3.9197148171109735e-07, + "loss": 0.9076, + "step": 101160 + }, + { + "epoch": 7.839900809795033, + "grad_norm": 1.470475319363217, + "learning_rate": 3.920102293862368e-07, + "loss": 0.9065, + "step": 101170 + }, + { + "epoch": 7.84067573327134, + "grad_norm": 1.4458587582122737, + "learning_rate": 3.9204897706137634e-07, + "loss": 0.9234, + "step": 101180 + }, + { + "epoch": 7.8414506567476465, + "grad_norm": 1.4657858121574063, + "learning_rate": 3.920877247365158e-07, + "loss": 0.9058, + "step": 101190 + }, + { + "epoch": 7.842225580223953, + "grad_norm": 1.4828044898896866, + "learning_rate": 3.9212647241165533e-07, + "loss": 0.9438, + "step": 101200 + }, + { + "epoch": 7.843000503700259, + "grad_norm": 1.481900383863437, + "learning_rate": 3.921652200867948e-07, + "loss": 0.9306, + "step": 101210 + }, + { + "epoch": 7.843775427176566, + "grad_norm": 1.4580569417028861, + "learning_rate": 3.922039677619343e-07, + "loss": 0.9122, + "step": 101220 + }, + { + "epoch": 7.844550350652873, + "grad_norm": 1.3890231001120446, + "learning_rate": 3.922427154370738e-07, + "loss": 0.9111, + "step": 101230 + }, + { + "epoch": 7.84532527412918, + "grad_norm": 1.451073672588674, + "learning_rate": 3.9228146311221326e-07, + "loss": 0.9102, + "step": 101240 + }, + { + "epoch": 7.8461001976054865, + "grad_norm": 1.4132607000770243, + "learning_rate": 3.923202107873528e-07, + "loss": 0.9131, + "step": 101250 + }, + { + "epoch": 7.846875121081793, + "grad_norm": 1.3800563393345253, + "learning_rate": 3.9235895846249225e-07, + "loss": 0.9075, + "step": 101260 + }, + { + "epoch": 7.8476500445581, + "grad_norm": 1.4568229536439345, + "learning_rate": 3.9239770613763177e-07, + "loss": 0.8967, + "step": 101270 + }, + { + "epoch": 7.848424968034407, + "grad_norm": 1.3845138592061021, + "learning_rate": 3.9243645381277124e-07, + "loss": 0.9267, + "step": 101280 + }, + { + "epoch": 7.849199891510713, + "grad_norm": 1.4763558154026293, + "learning_rate": 3.9247520148791076e-07, + "loss": 0.9108, + "step": 101290 + }, + { + "epoch": 7.84997481498702, + "grad_norm": 1.3466743159927725, + "learning_rate": 3.9251394916305023e-07, + "loss": 0.9166, + "step": 101300 + }, + { + "epoch": 7.8507497384633265, + "grad_norm": 1.435020105076459, + "learning_rate": 3.925526968381897e-07, + "loss": 0.9259, + "step": 101310 + }, + { + "epoch": 7.851524661939633, + "grad_norm": 1.3941107460380524, + "learning_rate": 3.925914445133292e-07, + "loss": 0.903, + "step": 101320 + }, + { + "epoch": 7.85229958541594, + "grad_norm": 1.4179483622610134, + "learning_rate": 3.926301921884687e-07, + "loss": 0.9114, + "step": 101330 + }, + { + "epoch": 7.853074508892247, + "grad_norm": 1.366062831166217, + "learning_rate": 3.926689398636082e-07, + "loss": 0.9218, + "step": 101340 + }, + { + "epoch": 7.853849432368554, + "grad_norm": 1.3908671922599367, + "learning_rate": 3.927076875387477e-07, + "loss": 0.9323, + "step": 101350 + }, + { + "epoch": 7.8546243558448605, + "grad_norm": 1.436800780894689, + "learning_rate": 3.927464352138872e-07, + "loss": 0.902, + "step": 101360 + }, + { + "epoch": 7.855399279321167, + "grad_norm": 1.461604098271006, + "learning_rate": 3.927851828890267e-07, + "loss": 0.9263, + "step": 101370 + }, + { + "epoch": 7.856174202797474, + "grad_norm": 1.382082925448635, + "learning_rate": 3.9282393056416614e-07, + "loss": 0.925, + "step": 101380 + }, + { + "epoch": 7.856949126273781, + "grad_norm": 1.3784744871103523, + "learning_rate": 3.9286267823930567e-07, + "loss": 0.8993, + "step": 101390 + }, + { + "epoch": 7.857724049750087, + "grad_norm": 1.5433583805959756, + "learning_rate": 3.9290142591444514e-07, + "loss": 0.9412, + "step": 101400 + }, + { + "epoch": 7.858498973226394, + "grad_norm": 1.4538162332348048, + "learning_rate": 3.9294017358958466e-07, + "loss": 0.9154, + "step": 101410 + }, + { + "epoch": 7.8592738967027005, + "grad_norm": 1.3410649644587171, + "learning_rate": 3.929789212647241e-07, + "loss": 0.9176, + "step": 101420 + }, + { + "epoch": 7.860048820179007, + "grad_norm": 1.5730257153777203, + "learning_rate": 3.9301766893986365e-07, + "loss": 0.9145, + "step": 101430 + }, + { + "epoch": 7.860823743655314, + "grad_norm": 1.3761986402558306, + "learning_rate": 3.930564166150031e-07, + "loss": 0.9514, + "step": 101440 + }, + { + "epoch": 7.861598667131621, + "grad_norm": 1.4368960081799829, + "learning_rate": 3.930951642901426e-07, + "loss": 0.9341, + "step": 101450 + }, + { + "epoch": 7.862373590607928, + "grad_norm": 1.3988981564012593, + "learning_rate": 3.931339119652821e-07, + "loss": 0.9067, + "step": 101460 + }, + { + "epoch": 7.8631485140842345, + "grad_norm": 1.49296571002898, + "learning_rate": 3.931726596404216e-07, + "loss": 0.9338, + "step": 101470 + }, + { + "epoch": 7.8639234375605405, + "grad_norm": 1.3775824087410442, + "learning_rate": 3.932114073155611e-07, + "loss": 0.9086, + "step": 101480 + }, + { + "epoch": 7.864698361036847, + "grad_norm": 1.3603029014397778, + "learning_rate": 3.9325015499070057e-07, + "loss": 0.9083, + "step": 101490 + }, + { + "epoch": 7.865473284513154, + "grad_norm": 1.4764604160559156, + "learning_rate": 3.932889026658401e-07, + "loss": 0.8956, + "step": 101500 + }, + { + "epoch": 7.865473284513154, + "eval_loss": 0.9227323532104492, + "eval_runtime": 331.6618, + "eval_samples_per_second": 34.586, + "eval_steps_per_second": 8.647, + "step": 101500 + }, + { + "epoch": 7.866248207989461, + "grad_norm": 1.3842508273633962, + "learning_rate": 3.9332765034097956e-07, + "loss": 0.9119, + "step": 101510 + }, + { + "epoch": 7.867023131465768, + "grad_norm": 1.440722199422545, + "learning_rate": 3.9336639801611903e-07, + "loss": 0.9212, + "step": 101520 + }, + { + "epoch": 7.8677980549420745, + "grad_norm": 1.5123072690198123, + "learning_rate": 3.9340514569125855e-07, + "loss": 0.8985, + "step": 101530 + }, + { + "epoch": 7.868572978418381, + "grad_norm": 1.3808038392081459, + "learning_rate": 3.93443893366398e-07, + "loss": 0.9081, + "step": 101540 + }, + { + "epoch": 7.869347901894688, + "grad_norm": 1.5317170840828698, + "learning_rate": 3.9348264104153754e-07, + "loss": 0.9319, + "step": 101550 + }, + { + "epoch": 7.870122825370995, + "grad_norm": 1.3679846539896636, + "learning_rate": 3.93521388716677e-07, + "loss": 0.9275, + "step": 101560 + }, + { + "epoch": 7.870897748847302, + "grad_norm": 1.3988396549031301, + "learning_rate": 3.935601363918165e-07, + "loss": 0.9105, + "step": 101570 + }, + { + "epoch": 7.871672672323608, + "grad_norm": 1.375917993735132, + "learning_rate": 3.93598884066956e-07, + "loss": 0.9078, + "step": 101580 + }, + { + "epoch": 7.8724475957999145, + "grad_norm": 1.4085996294503282, + "learning_rate": 3.9363763174209547e-07, + "loss": 0.9131, + "step": 101590 + }, + { + "epoch": 7.873222519276221, + "grad_norm": 1.3779527376873075, + "learning_rate": 3.93676379417235e-07, + "loss": 0.9375, + "step": 101600 + }, + { + "epoch": 7.873997442752528, + "grad_norm": 1.4771550320293207, + "learning_rate": 3.9371512709237446e-07, + "loss": 0.9076, + "step": 101610 + }, + { + "epoch": 7.874772366228835, + "grad_norm": 1.3910726366338046, + "learning_rate": 3.93753874767514e-07, + "loss": 0.9131, + "step": 101620 + }, + { + "epoch": 7.875547289705142, + "grad_norm": 1.4065173565706912, + "learning_rate": 3.9379262244265345e-07, + "loss": 0.8982, + "step": 101630 + }, + { + "epoch": 7.8763222131814485, + "grad_norm": 1.5166983666932945, + "learning_rate": 3.938313701177929e-07, + "loss": 0.9271, + "step": 101640 + }, + { + "epoch": 7.877097136657755, + "grad_norm": 1.4474188526411131, + "learning_rate": 3.9387011779293244e-07, + "loss": 0.9034, + "step": 101650 + }, + { + "epoch": 7.877872060134061, + "grad_norm": 1.3817090656575612, + "learning_rate": 3.939088654680719e-07, + "loss": 0.935, + "step": 101660 + }, + { + "epoch": 7.878646983610368, + "grad_norm": 1.3984698464440635, + "learning_rate": 3.9394761314321143e-07, + "loss": 0.9148, + "step": 101670 + }, + { + "epoch": 7.879421907086675, + "grad_norm": 1.353988740250469, + "learning_rate": 3.939863608183509e-07, + "loss": 0.9334, + "step": 101680 + }, + { + "epoch": 7.880196830562982, + "grad_norm": 1.4659891193987222, + "learning_rate": 3.940251084934904e-07, + "loss": 0.8951, + "step": 101690 + }, + { + "epoch": 7.8809717540392885, + "grad_norm": 1.479294726256704, + "learning_rate": 3.940638561686299e-07, + "loss": 0.9259, + "step": 101700 + }, + { + "epoch": 7.881746677515595, + "grad_norm": 1.5089352288490396, + "learning_rate": 3.9410260384376936e-07, + "loss": 0.918, + "step": 101710 + }, + { + "epoch": 7.882521600991902, + "grad_norm": 1.3773259268636024, + "learning_rate": 3.941413515189089e-07, + "loss": 0.9294, + "step": 101720 + }, + { + "epoch": 7.883296524468209, + "grad_norm": 1.4455962793650137, + "learning_rate": 3.9418009919404835e-07, + "loss": 0.8929, + "step": 101730 + }, + { + "epoch": 7.884071447944516, + "grad_norm": 1.3699916549838953, + "learning_rate": 3.942188468691879e-07, + "loss": 0.9088, + "step": 101740 + }, + { + "epoch": 7.884846371420823, + "grad_norm": 1.4122301635849295, + "learning_rate": 3.9425759454432735e-07, + "loss": 0.9246, + "step": 101750 + }, + { + "epoch": 7.885621294897129, + "grad_norm": 1.3895602732668315, + "learning_rate": 3.9429634221946687e-07, + "loss": 0.9092, + "step": 101760 + }, + { + "epoch": 7.886396218373435, + "grad_norm": 1.3736978435337859, + "learning_rate": 3.9433508989460634e-07, + "loss": 0.9094, + "step": 101770 + }, + { + "epoch": 7.887171141849742, + "grad_norm": 1.361501663715722, + "learning_rate": 3.943738375697458e-07, + "loss": 0.9104, + "step": 101780 + }, + { + "epoch": 7.887946065326049, + "grad_norm": 1.3026631516290046, + "learning_rate": 3.9441258524488533e-07, + "loss": 0.9074, + "step": 101790 + }, + { + "epoch": 7.888720988802356, + "grad_norm": 1.4454496947673492, + "learning_rate": 3.944513329200248e-07, + "loss": 0.8984, + "step": 101800 + }, + { + "epoch": 7.8894959122786625, + "grad_norm": 1.392430339908352, + "learning_rate": 3.944900805951643e-07, + "loss": 0.9023, + "step": 101810 + }, + { + "epoch": 7.890270835754969, + "grad_norm": 1.543844309765938, + "learning_rate": 3.945288282703038e-07, + "loss": 0.9283, + "step": 101820 + }, + { + "epoch": 7.891045759231276, + "grad_norm": 1.4152676153189687, + "learning_rate": 3.945675759454433e-07, + "loss": 0.9041, + "step": 101830 + }, + { + "epoch": 7.891820682707583, + "grad_norm": 1.4766714792253417, + "learning_rate": 3.946063236205828e-07, + "loss": 0.9493, + "step": 101840 + }, + { + "epoch": 7.892595606183889, + "grad_norm": 1.4271295582319565, + "learning_rate": 3.9464507129572225e-07, + "loss": 0.9173, + "step": 101850 + }, + { + "epoch": 7.893370529660196, + "grad_norm": 1.4131848176363488, + "learning_rate": 3.9468381897086177e-07, + "loss": 0.9244, + "step": 101860 + }, + { + "epoch": 7.8941454531365025, + "grad_norm": 1.3425647864850472, + "learning_rate": 3.9472256664600124e-07, + "loss": 0.9075, + "step": 101870 + }, + { + "epoch": 7.894920376612809, + "grad_norm": 1.519407581590417, + "learning_rate": 3.9476131432114076e-07, + "loss": 0.9226, + "step": 101880 + }, + { + "epoch": 7.895695300089116, + "grad_norm": 1.5136267044952458, + "learning_rate": 3.9480006199628023e-07, + "loss": 0.9068, + "step": 101890 + }, + { + "epoch": 7.896470223565423, + "grad_norm": 1.5025456751968258, + "learning_rate": 3.9483880967141975e-07, + "loss": 0.9238, + "step": 101900 + }, + { + "epoch": 7.89724514704173, + "grad_norm": 1.4747587831827698, + "learning_rate": 3.948775573465592e-07, + "loss": 0.9103, + "step": 101910 + }, + { + "epoch": 7.898020070518037, + "grad_norm": 1.4805376127274985, + "learning_rate": 3.949163050216987e-07, + "loss": 0.9137, + "step": 101920 + }, + { + "epoch": 7.898794993994343, + "grad_norm": 1.4614995019758963, + "learning_rate": 3.949550526968382e-07, + "loss": 0.9247, + "step": 101930 + }, + { + "epoch": 7.89956991747065, + "grad_norm": 1.3982935796281806, + "learning_rate": 3.949938003719777e-07, + "loss": 0.9225, + "step": 101940 + }, + { + "epoch": 7.900344840946956, + "grad_norm": 1.4730697044884524, + "learning_rate": 3.950325480471172e-07, + "loss": 0.9426, + "step": 101950 + }, + { + "epoch": 7.901119764423263, + "grad_norm": 1.4464086182178812, + "learning_rate": 3.9507129572225667e-07, + "loss": 0.9081, + "step": 101960 + }, + { + "epoch": 7.90189468789957, + "grad_norm": 1.3865468044338738, + "learning_rate": 3.951100433973962e-07, + "loss": 0.9125, + "step": 101970 + }, + { + "epoch": 7.9026696113758765, + "grad_norm": 1.4171775642448, + "learning_rate": 3.9514879107253566e-07, + "loss": 0.92, + "step": 101980 + }, + { + "epoch": 7.903444534852183, + "grad_norm": 1.3567795963596159, + "learning_rate": 3.9518753874767513e-07, + "loss": 0.9203, + "step": 101990 + }, + { + "epoch": 7.90421945832849, + "grad_norm": 1.4216116070471685, + "learning_rate": 3.9522628642281465e-07, + "loss": 0.9152, + "step": 102000 + }, + { + "epoch": 7.90421945832849, + "eval_loss": 0.9223456382751465, + "eval_runtime": 331.1149, + "eval_samples_per_second": 34.644, + "eval_steps_per_second": 8.662, + "step": 102000 + }, + { + "epoch": 7.904994381804797, + "grad_norm": 1.3999860982997527, + "learning_rate": 3.952650340979541e-07, + "loss": 0.9015, + "step": 102010 + }, + { + "epoch": 7.905769305281104, + "grad_norm": 1.47441656273902, + "learning_rate": 3.9530378177309364e-07, + "loss": 0.9088, + "step": 102020 + }, + { + "epoch": 7.906544228757411, + "grad_norm": 1.3527481815218114, + "learning_rate": 3.953425294482331e-07, + "loss": 0.9113, + "step": 102030 + }, + { + "epoch": 7.9073191522337165, + "grad_norm": 1.3372968208858138, + "learning_rate": 3.9538127712337264e-07, + "loss": 0.9131, + "step": 102040 + }, + { + "epoch": 7.908094075710023, + "grad_norm": 1.4150945879576236, + "learning_rate": 3.954200247985121e-07, + "loss": 0.9334, + "step": 102050 + }, + { + "epoch": 7.90886899918633, + "grad_norm": 1.3594551816151865, + "learning_rate": 3.954587724736516e-07, + "loss": 0.9235, + "step": 102060 + }, + { + "epoch": 7.909643922662637, + "grad_norm": 1.464214864357011, + "learning_rate": 3.954975201487911e-07, + "loss": 0.8944, + "step": 102070 + }, + { + "epoch": 7.910418846138944, + "grad_norm": 1.443718010524709, + "learning_rate": 3.9553626782393057e-07, + "loss": 0.9298, + "step": 102080 + }, + { + "epoch": 7.911193769615251, + "grad_norm": 1.5131571519146039, + "learning_rate": 3.955750154990701e-07, + "loss": 0.9415, + "step": 102090 + }, + { + "epoch": 7.911968693091557, + "grad_norm": 1.491789007970597, + "learning_rate": 3.9561376317420956e-07, + "loss": 0.9378, + "step": 102100 + }, + { + "epoch": 7.912743616567864, + "grad_norm": 1.4159505223107727, + "learning_rate": 3.956525108493491e-07, + "loss": 0.924, + "step": 102110 + }, + { + "epoch": 7.913518540044171, + "grad_norm": 1.3941615214685232, + "learning_rate": 3.9569125852448855e-07, + "loss": 0.905, + "step": 102120 + }, + { + "epoch": 7.914293463520478, + "grad_norm": 1.4899409597656141, + "learning_rate": 3.95730006199628e-07, + "loss": 0.9179, + "step": 102130 + }, + { + "epoch": 7.915068386996784, + "grad_norm": 1.4487242752506957, + "learning_rate": 3.9576875387476754e-07, + "loss": 0.9226, + "step": 102140 + }, + { + "epoch": 7.9158433104730905, + "grad_norm": 1.4083048329296248, + "learning_rate": 3.95807501549907e-07, + "loss": 0.942, + "step": 102150 + }, + { + "epoch": 7.916618233949397, + "grad_norm": 1.4144104054737283, + "learning_rate": 3.9584624922504653e-07, + "loss": 0.9046, + "step": 102160 + }, + { + "epoch": 7.917393157425704, + "grad_norm": 1.4721439912792562, + "learning_rate": 3.95884996900186e-07, + "loss": 0.932, + "step": 102170 + }, + { + "epoch": 7.918168080902011, + "grad_norm": 1.4984857202143973, + "learning_rate": 3.959237445753255e-07, + "loss": 0.9014, + "step": 102180 + }, + { + "epoch": 7.918943004378318, + "grad_norm": 1.4243747963747737, + "learning_rate": 3.95962492250465e-07, + "loss": 0.8937, + "step": 102190 + }, + { + "epoch": 7.919717927854625, + "grad_norm": 1.3703962576030395, + "learning_rate": 3.9600123992560446e-07, + "loss": 0.93, + "step": 102200 + }, + { + "epoch": 7.920492851330931, + "grad_norm": 1.4307959013758274, + "learning_rate": 3.96039987600744e-07, + "loss": 0.9167, + "step": 102210 + }, + { + "epoch": 7.921267774807237, + "grad_norm": 1.3063551401318843, + "learning_rate": 3.9607873527588345e-07, + "loss": 0.8922, + "step": 102220 + }, + { + "epoch": 7.922042698283544, + "grad_norm": 1.4025099132930448, + "learning_rate": 3.9611748295102297e-07, + "loss": 0.9063, + "step": 102230 + }, + { + "epoch": 7.922817621759851, + "grad_norm": 1.4800078513784518, + "learning_rate": 3.9615623062616244e-07, + "loss": 0.9078, + "step": 102240 + }, + { + "epoch": 7.923592545236158, + "grad_norm": 1.4673327998261398, + "learning_rate": 3.961949783013019e-07, + "loss": 0.8988, + "step": 102250 + }, + { + "epoch": 7.924367468712465, + "grad_norm": 1.4410420084087723, + "learning_rate": 3.9623372597644143e-07, + "loss": 0.9292, + "step": 102260 + }, + { + "epoch": 7.925142392188771, + "grad_norm": 1.4775128691567896, + "learning_rate": 3.962724736515809e-07, + "loss": 0.9166, + "step": 102270 + }, + { + "epoch": 7.925917315665078, + "grad_norm": 1.3693310980615414, + "learning_rate": 3.963112213267204e-07, + "loss": 0.9278, + "step": 102280 + }, + { + "epoch": 7.926692239141385, + "grad_norm": 1.4300869283658584, + "learning_rate": 3.963499690018599e-07, + "loss": 0.9138, + "step": 102290 + }, + { + "epoch": 7.927467162617692, + "grad_norm": 1.3514209515878393, + "learning_rate": 3.963887166769994e-07, + "loss": 0.9286, + "step": 102300 + }, + { + "epoch": 7.928242086093999, + "grad_norm": 1.4042277833806573, + "learning_rate": 3.964274643521389e-07, + "loss": 0.924, + "step": 102310 + }, + { + "epoch": 7.929017009570305, + "grad_norm": 1.463166390414999, + "learning_rate": 3.9646621202727835e-07, + "loss": 0.8946, + "step": 102320 + }, + { + "epoch": 7.929791933046611, + "grad_norm": 1.4034175997967762, + "learning_rate": 3.9650495970241787e-07, + "loss": 0.9107, + "step": 102330 + }, + { + "epoch": 7.930566856522918, + "grad_norm": 1.4464770434811496, + "learning_rate": 3.9654370737755734e-07, + "loss": 0.9219, + "step": 102340 + }, + { + "epoch": 7.931341779999225, + "grad_norm": 1.4339927695355204, + "learning_rate": 3.9658245505269686e-07, + "loss": 0.8995, + "step": 102350 + }, + { + "epoch": 7.932116703475532, + "grad_norm": 1.514390819548932, + "learning_rate": 3.9662120272783633e-07, + "loss": 0.9096, + "step": 102360 + }, + { + "epoch": 7.932891626951839, + "grad_norm": 1.4594245126686156, + "learning_rate": 3.9665995040297586e-07, + "loss": 0.9276, + "step": 102370 + }, + { + "epoch": 7.933666550428145, + "grad_norm": 1.41938200676656, + "learning_rate": 3.966986980781153e-07, + "loss": 0.9161, + "step": 102380 + }, + { + "epoch": 7.934441473904452, + "grad_norm": 1.4919035316746863, + "learning_rate": 3.967374457532548e-07, + "loss": 0.9016, + "step": 102390 + }, + { + "epoch": 7.935216397380759, + "grad_norm": 1.5049238855529878, + "learning_rate": 3.967761934283943e-07, + "loss": 0.9482, + "step": 102400 + }, + { + "epoch": 7.935991320857065, + "grad_norm": 1.426191461651717, + "learning_rate": 3.968149411035338e-07, + "loss": 0.9312, + "step": 102410 + }, + { + "epoch": 7.936766244333372, + "grad_norm": 1.3644987080955677, + "learning_rate": 3.968536887786733e-07, + "loss": 0.9218, + "step": 102420 + }, + { + "epoch": 7.9375411678096786, + "grad_norm": 1.4694205378155396, + "learning_rate": 3.968924364538128e-07, + "loss": 0.8996, + "step": 102430 + }, + { + "epoch": 7.938316091285985, + "grad_norm": 1.4199451906889093, + "learning_rate": 3.969311841289523e-07, + "loss": 0.923, + "step": 102440 + }, + { + "epoch": 7.939091014762292, + "grad_norm": 1.3950307420689765, + "learning_rate": 3.9696993180409177e-07, + "loss": 0.9145, + "step": 102450 + }, + { + "epoch": 7.939865938238599, + "grad_norm": 1.4137037885031598, + "learning_rate": 3.9700867947923124e-07, + "loss": 0.9161, + "step": 102460 + }, + { + "epoch": 7.940640861714906, + "grad_norm": 1.4759679640993708, + "learning_rate": 3.9704742715437076e-07, + "loss": 0.8964, + "step": 102470 + }, + { + "epoch": 7.941415785191213, + "grad_norm": 1.3900382010201977, + "learning_rate": 3.9708617482951023e-07, + "loss": 0.9154, + "step": 102480 + }, + { + "epoch": 7.942190708667519, + "grad_norm": 1.4441042353873017, + "learning_rate": 3.9712492250464975e-07, + "loss": 0.9067, + "step": 102490 + }, + { + "epoch": 7.942965632143826, + "grad_norm": 1.46810799858033, + "learning_rate": 3.971636701797892e-07, + "loss": 0.9197, + "step": 102500 + }, + { + "epoch": 7.942965632143826, + "eval_loss": 0.921995997428894, + "eval_runtime": 331.7653, + "eval_samples_per_second": 34.576, + "eval_steps_per_second": 8.645, + "step": 102500 + }, + { + "epoch": 7.943740555620132, + "grad_norm": 1.3955422704658913, + "learning_rate": 3.9720241785492874e-07, + "loss": 0.9184, + "step": 102510 + }, + { + "epoch": 7.944515479096439, + "grad_norm": 1.3135588280667272, + "learning_rate": 3.972411655300682e-07, + "loss": 0.9217, + "step": 102520 + }, + { + "epoch": 7.945290402572746, + "grad_norm": 1.426771140124903, + "learning_rate": 3.972799132052077e-07, + "loss": 0.9078, + "step": 102530 + }, + { + "epoch": 7.946065326049053, + "grad_norm": 1.3890318774730728, + "learning_rate": 3.973186608803472e-07, + "loss": 0.9275, + "step": 102540 + }, + { + "epoch": 7.946840249525359, + "grad_norm": 1.4307263785096125, + "learning_rate": 3.9735740855548667e-07, + "loss": 0.9371, + "step": 102550 + }, + { + "epoch": 7.947615173001666, + "grad_norm": 1.3877628604747532, + "learning_rate": 3.973961562306262e-07, + "loss": 0.8979, + "step": 102560 + }, + { + "epoch": 7.948390096477973, + "grad_norm": 1.4332844512614946, + "learning_rate": 3.9743490390576566e-07, + "loss": 0.9205, + "step": 102570 + }, + { + "epoch": 7.94916501995428, + "grad_norm": 1.4817014775917126, + "learning_rate": 3.974736515809052e-07, + "loss": 0.8991, + "step": 102580 + }, + { + "epoch": 7.949939943430586, + "grad_norm": 1.4239241958402886, + "learning_rate": 3.9751239925604465e-07, + "loss": 0.9181, + "step": 102590 + }, + { + "epoch": 7.9507148669068926, + "grad_norm": 1.4052806810123013, + "learning_rate": 3.975511469311841e-07, + "loss": 0.9132, + "step": 102600 + }, + { + "epoch": 7.951489790383199, + "grad_norm": 1.3802101002338547, + "learning_rate": 3.9758989460632364e-07, + "loss": 0.9077, + "step": 102610 + }, + { + "epoch": 7.952264713859506, + "grad_norm": 1.3880482076781768, + "learning_rate": 3.976286422814631e-07, + "loss": 0.9148, + "step": 102620 + }, + { + "epoch": 7.953039637335813, + "grad_norm": 1.3944177138774643, + "learning_rate": 3.9766738995660263e-07, + "loss": 0.9295, + "step": 102630 + }, + { + "epoch": 7.95381456081212, + "grad_norm": 1.3931592128732015, + "learning_rate": 3.977061376317421e-07, + "loss": 0.9289, + "step": 102640 + }, + { + "epoch": 7.954589484288427, + "grad_norm": 1.4763052475960754, + "learning_rate": 3.977448853068816e-07, + "loss": 0.9127, + "step": 102650 + }, + { + "epoch": 7.955364407764733, + "grad_norm": 1.405113258798703, + "learning_rate": 3.977836329820211e-07, + "loss": 0.9143, + "step": 102660 + }, + { + "epoch": 7.95613933124104, + "grad_norm": 1.4160970228173215, + "learning_rate": 3.9782238065716056e-07, + "loss": 0.9103, + "step": 102670 + }, + { + "epoch": 7.956914254717347, + "grad_norm": 1.4449013471087597, + "learning_rate": 3.978611283323001e-07, + "loss": 0.901, + "step": 102680 + }, + { + "epoch": 7.957689178193654, + "grad_norm": 1.4740145975610877, + "learning_rate": 3.9789987600743955e-07, + "loss": 0.945, + "step": 102690 + }, + { + "epoch": 7.95846410166996, + "grad_norm": 1.4183167682724398, + "learning_rate": 3.979386236825791e-07, + "loss": 0.9028, + "step": 102700 + }, + { + "epoch": 7.959239025146267, + "grad_norm": 1.4258326530926184, + "learning_rate": 3.9797737135771854e-07, + "loss": 0.9186, + "step": 102710 + }, + { + "epoch": 7.960013948622573, + "grad_norm": 1.418425836348066, + "learning_rate": 3.9801611903285807e-07, + "loss": 0.914, + "step": 102720 + }, + { + "epoch": 7.96078887209888, + "grad_norm": 1.347933745599566, + "learning_rate": 3.9805486670799753e-07, + "loss": 0.9115, + "step": 102730 + }, + { + "epoch": 7.961563795575187, + "grad_norm": 1.4435050462644758, + "learning_rate": 3.98093614383137e-07, + "loss": 0.9155, + "step": 102740 + }, + { + "epoch": 7.962338719051494, + "grad_norm": 1.3996161761594015, + "learning_rate": 3.981323620582765e-07, + "loss": 0.9278, + "step": 102750 + }, + { + "epoch": 7.963113642527801, + "grad_norm": 1.507313037553486, + "learning_rate": 3.98171109733416e-07, + "loss": 0.9034, + "step": 102760 + }, + { + "epoch": 7.963888566004107, + "grad_norm": 1.4218620721193613, + "learning_rate": 3.982098574085555e-07, + "loss": 0.9054, + "step": 102770 + }, + { + "epoch": 7.964663489480413, + "grad_norm": 1.4926890743544505, + "learning_rate": 3.98248605083695e-07, + "loss": 0.9051, + "step": 102780 + }, + { + "epoch": 7.96543841295672, + "grad_norm": 1.4485570522077182, + "learning_rate": 3.982873527588345e-07, + "loss": 0.9389, + "step": 102790 + }, + { + "epoch": 7.966213336433027, + "grad_norm": 1.4080428109085321, + "learning_rate": 3.98326100433974e-07, + "loss": 0.9152, + "step": 102800 + }, + { + "epoch": 7.966988259909334, + "grad_norm": 1.4131833347433587, + "learning_rate": 3.9836484810911345e-07, + "loss": 0.9141, + "step": 102810 + }, + { + "epoch": 7.967763183385641, + "grad_norm": 1.4565993246784403, + "learning_rate": 3.9840359578425297e-07, + "loss": 0.9113, + "step": 102820 + }, + { + "epoch": 7.968538106861947, + "grad_norm": 1.4493478732920677, + "learning_rate": 3.9844234345939244e-07, + "loss": 0.9376, + "step": 102830 + }, + { + "epoch": 7.969313030338254, + "grad_norm": 1.4404653262595684, + "learning_rate": 3.9848109113453196e-07, + "loss": 0.9297, + "step": 102840 + }, + { + "epoch": 7.970087953814561, + "grad_norm": 1.4065610475559804, + "learning_rate": 3.9851983880967143e-07, + "loss": 0.9145, + "step": 102850 + }, + { + "epoch": 7.970862877290868, + "grad_norm": 1.387527152602133, + "learning_rate": 3.9855858648481095e-07, + "loss": 0.924, + "step": 102860 + }, + { + "epoch": 7.971637800767175, + "grad_norm": 1.477180511021425, + "learning_rate": 3.985973341599504e-07, + "loss": 0.9295, + "step": 102870 + }, + { + "epoch": 7.972412724243481, + "grad_norm": 1.4384468635827763, + "learning_rate": 3.986360818350899e-07, + "loss": 0.9319, + "step": 102880 + }, + { + "epoch": 7.973187647719787, + "grad_norm": 1.3369016957148268, + "learning_rate": 3.986748295102294e-07, + "loss": 0.9147, + "step": 102890 + }, + { + "epoch": 7.973962571196094, + "grad_norm": 1.4202657897603692, + "learning_rate": 3.987135771853689e-07, + "loss": 0.9094, + "step": 102900 + }, + { + "epoch": 7.974737494672401, + "grad_norm": 1.3445257966785136, + "learning_rate": 3.987523248605084e-07, + "loss": 0.9122, + "step": 102910 + }, + { + "epoch": 7.975512418148708, + "grad_norm": 1.4483657772530762, + "learning_rate": 3.9879107253564787e-07, + "loss": 0.8988, + "step": 102920 + }, + { + "epoch": 7.976287341625015, + "grad_norm": 1.4015099515399956, + "learning_rate": 3.988298202107874e-07, + "loss": 0.91, + "step": 102930 + }, + { + "epoch": 7.977062265101321, + "grad_norm": 1.420381248933574, + "learning_rate": 3.9886856788592686e-07, + "loss": 0.9155, + "step": 102940 + }, + { + "epoch": 7.977837188577628, + "grad_norm": 1.3900600490213335, + "learning_rate": 3.9890731556106633e-07, + "loss": 0.9249, + "step": 102950 + }, + { + "epoch": 7.978612112053935, + "grad_norm": 1.4133655736328112, + "learning_rate": 3.9894606323620585e-07, + "loss": 0.902, + "step": 102960 + }, + { + "epoch": 7.979387035530241, + "grad_norm": 1.3912163077265522, + "learning_rate": 3.989848109113453e-07, + "loss": 0.9209, + "step": 102970 + }, + { + "epoch": 7.980161959006548, + "grad_norm": 1.4234371869383877, + "learning_rate": 3.9902355858648484e-07, + "loss": 0.9272, + "step": 102980 + }, + { + "epoch": 7.980936882482855, + "grad_norm": 1.4702125105126638, + "learning_rate": 3.990623062616243e-07, + "loss": 0.9159, + "step": 102990 + }, + { + "epoch": 7.981711805959161, + "grad_norm": 1.39428115109348, + "learning_rate": 3.991010539367638e-07, + "loss": 0.9134, + "step": 103000 + }, + { + "epoch": 7.981711805959161, + "eval_loss": 0.9216727018356323, + "eval_runtime": 331.1402, + "eval_samples_per_second": 34.641, + "eval_steps_per_second": 8.661, + "step": 103000 + }, + { + "epoch": 7.982486729435468, + "grad_norm": 1.335051029576431, + "learning_rate": 3.991398016119033e-07, + "loss": 0.9221, + "step": 103010 + }, + { + "epoch": 7.983261652911775, + "grad_norm": 1.4939237389892983, + "learning_rate": 3.9917854928704277e-07, + "loss": 0.9202, + "step": 103020 + }, + { + "epoch": 7.984036576388082, + "grad_norm": 1.3969016630451436, + "learning_rate": 3.992172969621823e-07, + "loss": 0.9331, + "step": 103030 + }, + { + "epoch": 7.984811499864389, + "grad_norm": 1.4541639090356158, + "learning_rate": 3.9925604463732176e-07, + "loss": 0.9039, + "step": 103040 + }, + { + "epoch": 7.9855864233406955, + "grad_norm": 1.5316627228361204, + "learning_rate": 3.992947923124613e-07, + "loss": 0.912, + "step": 103050 + }, + { + "epoch": 7.986361346817002, + "grad_norm": 1.4195837167624337, + "learning_rate": 3.9933353998760075e-07, + "loss": 0.9185, + "step": 103060 + }, + { + "epoch": 7.987136270293308, + "grad_norm": 1.369670076359388, + "learning_rate": 3.993722876627402e-07, + "loss": 0.9171, + "step": 103070 + }, + { + "epoch": 7.987911193769615, + "grad_norm": 1.479669613159765, + "learning_rate": 3.9941103533787975e-07, + "loss": 0.9208, + "step": 103080 + }, + { + "epoch": 7.988686117245922, + "grad_norm": 1.4931581194673809, + "learning_rate": 3.994497830130192e-07, + "loss": 0.9153, + "step": 103090 + }, + { + "epoch": 7.989461040722229, + "grad_norm": 1.4564165202140666, + "learning_rate": 3.9948853068815874e-07, + "loss": 0.922, + "step": 103100 + }, + { + "epoch": 7.990235964198535, + "grad_norm": 1.41607317184917, + "learning_rate": 3.995272783632982e-07, + "loss": 0.9158, + "step": 103110 + }, + { + "epoch": 7.991010887674842, + "grad_norm": 1.3987287293423425, + "learning_rate": 3.9956602603843773e-07, + "loss": 0.9142, + "step": 103120 + }, + { + "epoch": 7.991785811151149, + "grad_norm": 1.475314502900762, + "learning_rate": 3.996047737135772e-07, + "loss": 0.9056, + "step": 103130 + }, + { + "epoch": 7.992560734627456, + "grad_norm": 1.4368966675632424, + "learning_rate": 3.9964352138871667e-07, + "loss": 0.9334, + "step": 103140 + }, + { + "epoch": 7.993335658103762, + "grad_norm": 1.4905580183601872, + "learning_rate": 3.996822690638562e-07, + "loss": 0.9232, + "step": 103150 + }, + { + "epoch": 7.994110581580069, + "grad_norm": 1.4410160864823003, + "learning_rate": 3.9972101673899566e-07, + "loss": 0.9059, + "step": 103160 + }, + { + "epoch": 7.994885505056375, + "grad_norm": 1.3813219054760364, + "learning_rate": 3.997597644141352e-07, + "loss": 0.9082, + "step": 103170 + }, + { + "epoch": 7.995660428532682, + "grad_norm": 1.450796327202022, + "learning_rate": 3.9979851208927465e-07, + "loss": 0.9268, + "step": 103180 + }, + { + "epoch": 7.996435352008989, + "grad_norm": 1.3723850759976877, + "learning_rate": 3.9983725976441417e-07, + "loss": 0.9013, + "step": 103190 + }, + { + "epoch": 7.997210275485296, + "grad_norm": 1.3973953938931323, + "learning_rate": 3.9987600743955364e-07, + "loss": 0.9109, + "step": 103200 + }, + { + "epoch": 7.997985198961603, + "grad_norm": 1.346906396355174, + "learning_rate": 3.999147551146931e-07, + "loss": 0.8927, + "step": 103210 + }, + { + "epoch": 7.9987601224379095, + "grad_norm": 1.4926089635166029, + "learning_rate": 3.9995350278983263e-07, + "loss": 0.921, + "step": 103220 + }, + { + "epoch": 7.999535045914216, + "grad_norm": 1.4697402302057265, + "learning_rate": 3.999922504649721e-07, + "loss": 0.9034, + "step": 103230 + }, + { + "epoch": 8.000309969390523, + "grad_norm": 1.3917078614352716, + "learning_rate": 4.000309981401116e-07, + "loss": 0.9065, + "step": 103240 + }, + { + "epoch": 8.00108489286683, + "grad_norm": 1.4140553938621698, + "learning_rate": 4.000697458152511e-07, + "loss": 0.9091, + "step": 103250 + }, + { + "epoch": 8.001859816343137, + "grad_norm": 1.3635260549360166, + "learning_rate": 4.001084934903906e-07, + "loss": 0.8838, + "step": 103260 + }, + { + "epoch": 8.002634739819444, + "grad_norm": 1.3927671302017155, + "learning_rate": 4.001472411655301e-07, + "loss": 0.9146, + "step": 103270 + }, + { + "epoch": 8.00340966329575, + "grad_norm": 1.4069406319319813, + "learning_rate": 4.0018598884066955e-07, + "loss": 0.9104, + "step": 103280 + }, + { + "epoch": 8.004184586772057, + "grad_norm": 1.337804827732656, + "learning_rate": 4.0022473651580907e-07, + "loss": 0.9294, + "step": 103290 + }, + { + "epoch": 8.004959510248362, + "grad_norm": 1.3786362922455504, + "learning_rate": 4.0026348419094854e-07, + "loss": 0.9196, + "step": 103300 + }, + { + "epoch": 8.005734433724669, + "grad_norm": 1.4339246401887475, + "learning_rate": 4.0030223186608806e-07, + "loss": 0.9175, + "step": 103310 + }, + { + "epoch": 8.006509357200976, + "grad_norm": 1.4106000221245283, + "learning_rate": 4.0034097954122753e-07, + "loss": 0.904, + "step": 103320 + }, + { + "epoch": 8.007284280677283, + "grad_norm": 1.4050257433131148, + "learning_rate": 4.0037972721636705e-07, + "loss": 0.9285, + "step": 103330 + }, + { + "epoch": 8.00805920415359, + "grad_norm": 1.4587250669743828, + "learning_rate": 4.004184748915065e-07, + "loss": 0.9241, + "step": 103340 + }, + { + "epoch": 8.008834127629896, + "grad_norm": 1.397962927549508, + "learning_rate": 4.00457222566646e-07, + "loss": 0.9155, + "step": 103350 + }, + { + "epoch": 8.009609051106203, + "grad_norm": 1.3569818056473435, + "learning_rate": 4.004959702417855e-07, + "loss": 0.9083, + "step": 103360 + }, + { + "epoch": 8.01038397458251, + "grad_norm": 1.5101083767364378, + "learning_rate": 4.00534717916925e-07, + "loss": 0.9082, + "step": 103370 + }, + { + "epoch": 8.011158898058817, + "grad_norm": 1.4905596548229076, + "learning_rate": 4.005734655920645e-07, + "loss": 0.9353, + "step": 103380 + }, + { + "epoch": 8.011933821535123, + "grad_norm": 1.4642154173705855, + "learning_rate": 4.00612213267204e-07, + "loss": 0.914, + "step": 103390 + }, + { + "epoch": 8.01270874501143, + "grad_norm": 1.4080065913932995, + "learning_rate": 4.006509609423435e-07, + "loss": 0.9019, + "step": 103400 + }, + { + "epoch": 8.013483668487737, + "grad_norm": 1.5204603645067598, + "learning_rate": 4.0068970861748296e-07, + "loss": 0.9032, + "step": 103410 + }, + { + "epoch": 8.014258591964044, + "grad_norm": 1.5088392182126193, + "learning_rate": 4.0072845629262243e-07, + "loss": 0.9029, + "step": 103420 + }, + { + "epoch": 8.01503351544035, + "grad_norm": 1.4192638378365714, + "learning_rate": 4.0076720396776196e-07, + "loss": 0.9203, + "step": 103430 + }, + { + "epoch": 8.015808438916658, + "grad_norm": 1.481593711083548, + "learning_rate": 4.008059516429014e-07, + "loss": 0.9117, + "step": 103440 + }, + { + "epoch": 8.016583362392964, + "grad_norm": 1.456692765450323, + "learning_rate": 4.0084469931804095e-07, + "loss": 0.9235, + "step": 103450 + }, + { + "epoch": 8.017358285869271, + "grad_norm": 1.4812286096706422, + "learning_rate": 4.008834469931804e-07, + "loss": 0.9216, + "step": 103460 + }, + { + "epoch": 8.018133209345578, + "grad_norm": 1.4737236504230318, + "learning_rate": 4.0092219466831994e-07, + "loss": 0.9153, + "step": 103470 + }, + { + "epoch": 8.018908132821883, + "grad_norm": 1.3610691378320012, + "learning_rate": 4.009609423434594e-07, + "loss": 0.8968, + "step": 103480 + }, + { + "epoch": 8.01968305629819, + "grad_norm": 1.3850336575781137, + "learning_rate": 4.009996900185989e-07, + "loss": 0.9154, + "step": 103490 + }, + { + "epoch": 8.020457979774497, + "grad_norm": 1.4752250633262154, + "learning_rate": 4.010384376937384e-07, + "loss": 0.9196, + "step": 103500 + }, + { + "epoch": 8.020457979774497, + "eval_loss": 0.9215549230575562, + "eval_runtime": 331.1775, + "eval_samples_per_second": 34.637, + "eval_steps_per_second": 8.66, + "step": 103500 + }, + { + "epoch": 8.021232903250803, + "grad_norm": 1.3901114889654242, + "learning_rate": 4.0107718536887787e-07, + "loss": 0.9011, + "step": 103510 + }, + { + "epoch": 8.02200782672711, + "grad_norm": 1.5126510742333683, + "learning_rate": 4.011159330440174e-07, + "loss": 0.908, + "step": 103520 + }, + { + "epoch": 8.022782750203417, + "grad_norm": 1.4689951558218712, + "learning_rate": 4.0115468071915686e-07, + "loss": 0.9273, + "step": 103530 + }, + { + "epoch": 8.023557673679724, + "grad_norm": 1.362813010569426, + "learning_rate": 4.011934283942964e-07, + "loss": 0.913, + "step": 103540 + }, + { + "epoch": 8.02433259715603, + "grad_norm": 1.3526988409894947, + "learning_rate": 4.0123217606943585e-07, + "loss": 0.8908, + "step": 103550 + }, + { + "epoch": 8.025107520632337, + "grad_norm": 1.443638704946477, + "learning_rate": 4.012709237445753e-07, + "loss": 0.8987, + "step": 103560 + }, + { + "epoch": 8.025882444108644, + "grad_norm": 1.3969505797358805, + "learning_rate": 4.0130967141971484e-07, + "loss": 0.8993, + "step": 103570 + }, + { + "epoch": 8.026657367584951, + "grad_norm": 1.4064361644162198, + "learning_rate": 4.013484190948543e-07, + "loss": 0.8931, + "step": 103580 + }, + { + "epoch": 8.027432291061258, + "grad_norm": 1.471584150796977, + "learning_rate": 4.0138716676999383e-07, + "loss": 0.9271, + "step": 103590 + }, + { + "epoch": 8.028207214537565, + "grad_norm": 1.4689271360375238, + "learning_rate": 4.014259144451333e-07, + "loss": 0.9301, + "step": 103600 + }, + { + "epoch": 8.028982138013872, + "grad_norm": 1.3970869463882731, + "learning_rate": 4.014646621202728e-07, + "loss": 0.9189, + "step": 103610 + }, + { + "epoch": 8.029757061490178, + "grad_norm": 1.3616638505798835, + "learning_rate": 4.015034097954123e-07, + "loss": 0.9091, + "step": 103620 + }, + { + "epoch": 8.030531984966485, + "grad_norm": 1.4371062928592926, + "learning_rate": 4.0154215747055176e-07, + "loss": 0.9084, + "step": 103630 + }, + { + "epoch": 8.031306908442792, + "grad_norm": 1.4811742698303716, + "learning_rate": 4.015809051456913e-07, + "loss": 0.9084, + "step": 103640 + }, + { + "epoch": 8.032081831919099, + "grad_norm": 1.374778544430721, + "learning_rate": 4.0161965282083075e-07, + "loss": 0.9179, + "step": 103650 + }, + { + "epoch": 8.032856755395406, + "grad_norm": 1.4490332696434314, + "learning_rate": 4.0165840049597027e-07, + "loss": 0.904, + "step": 103660 + }, + { + "epoch": 8.03363167887171, + "grad_norm": 1.3000816734132883, + "learning_rate": 4.0169714817110974e-07, + "loss": 0.9175, + "step": 103670 + }, + { + "epoch": 8.034406602348017, + "grad_norm": 1.4235105399842622, + "learning_rate": 4.0173589584624926e-07, + "loss": 0.9196, + "step": 103680 + }, + { + "epoch": 8.035181525824324, + "grad_norm": 1.4229696808656602, + "learning_rate": 4.0177464352138873e-07, + "loss": 0.9049, + "step": 103690 + }, + { + "epoch": 8.035956449300631, + "grad_norm": 1.415792362242932, + "learning_rate": 4.018133911965282e-07, + "loss": 0.9425, + "step": 103700 + }, + { + "epoch": 8.036731372776938, + "grad_norm": 1.470251640491661, + "learning_rate": 4.018521388716677e-07, + "loss": 0.9078, + "step": 103710 + }, + { + "epoch": 8.037506296253245, + "grad_norm": 1.450222210445949, + "learning_rate": 4.018908865468072e-07, + "loss": 0.902, + "step": 103720 + }, + { + "epoch": 8.038281219729551, + "grad_norm": 1.4269974660088098, + "learning_rate": 4.019296342219467e-07, + "loss": 0.9301, + "step": 103730 + }, + { + "epoch": 8.039056143205858, + "grad_norm": 1.4129501950177665, + "learning_rate": 4.019683818970862e-07, + "loss": 0.9306, + "step": 103740 + }, + { + "epoch": 8.039831066682165, + "grad_norm": 1.4433331257532398, + "learning_rate": 4.0200712957222565e-07, + "loss": 0.9187, + "step": 103750 + }, + { + "epoch": 8.040605990158472, + "grad_norm": 1.3587194084178675, + "learning_rate": 4.020458772473652e-07, + "loss": 0.8972, + "step": 103760 + }, + { + "epoch": 8.041380913634779, + "grad_norm": 1.439532356920215, + "learning_rate": 4.0208462492250464e-07, + "loss": 0.9096, + "step": 103770 + }, + { + "epoch": 8.042155837111086, + "grad_norm": 1.4291375389891812, + "learning_rate": 4.0212337259764417e-07, + "loss": 0.9185, + "step": 103780 + }, + { + "epoch": 8.042930760587392, + "grad_norm": 1.527742909708359, + "learning_rate": 4.0216212027278364e-07, + "loss": 0.9215, + "step": 103790 + }, + { + "epoch": 8.0437056840637, + "grad_norm": 1.4338806776993311, + "learning_rate": 4.0220086794792316e-07, + "loss": 0.9051, + "step": 103800 + }, + { + "epoch": 8.044480607540006, + "grad_norm": 1.3950230396978454, + "learning_rate": 4.0223961562306263e-07, + "loss": 0.9244, + "step": 103810 + }, + { + "epoch": 8.045255531016313, + "grad_norm": 1.3376698257028503, + "learning_rate": 4.022783632982021e-07, + "loss": 0.9293, + "step": 103820 + }, + { + "epoch": 8.04603045449262, + "grad_norm": 1.4634105556003119, + "learning_rate": 4.023171109733416e-07, + "loss": 0.9463, + "step": 103830 + }, + { + "epoch": 8.046805377968926, + "grad_norm": 1.3882635970819188, + "learning_rate": 4.023558586484811e-07, + "loss": 0.8985, + "step": 103840 + }, + { + "epoch": 8.047580301445231, + "grad_norm": 1.4761393798693982, + "learning_rate": 4.023946063236206e-07, + "loss": 0.9293, + "step": 103850 + }, + { + "epoch": 8.048355224921538, + "grad_norm": 1.3967308102214868, + "learning_rate": 4.024333539987601e-07, + "loss": 0.922, + "step": 103860 + }, + { + "epoch": 8.049130148397845, + "grad_norm": 1.4417383606927794, + "learning_rate": 4.024721016738996e-07, + "loss": 0.9005, + "step": 103870 + }, + { + "epoch": 8.049905071874152, + "grad_norm": 1.3731817728299136, + "learning_rate": 4.0251084934903907e-07, + "loss": 0.9377, + "step": 103880 + }, + { + "epoch": 8.050679995350459, + "grad_norm": 1.3698607542591767, + "learning_rate": 4.0254959702417854e-07, + "loss": 0.8976, + "step": 103890 + }, + { + "epoch": 8.051454918826765, + "grad_norm": 1.3850610996633952, + "learning_rate": 4.0258834469931806e-07, + "loss": 0.9043, + "step": 103900 + }, + { + "epoch": 8.052229842303072, + "grad_norm": 1.385269160914953, + "learning_rate": 4.0262709237445753e-07, + "loss": 0.9078, + "step": 103910 + }, + { + "epoch": 8.053004765779379, + "grad_norm": 1.4969333468058235, + "learning_rate": 4.0266584004959705e-07, + "loss": 0.9068, + "step": 103920 + }, + { + "epoch": 8.053779689255686, + "grad_norm": 1.3454646601608495, + "learning_rate": 4.027045877247365e-07, + "loss": 0.9022, + "step": 103930 + }, + { + "epoch": 8.054554612731993, + "grad_norm": 1.3798490600574849, + "learning_rate": 4.0274333539987604e-07, + "loss": 0.9098, + "step": 103940 + }, + { + "epoch": 8.0553295362083, + "grad_norm": 1.4229542294500126, + "learning_rate": 4.027820830750155e-07, + "loss": 0.9453, + "step": 103950 + }, + { + "epoch": 8.056104459684606, + "grad_norm": 1.397727986313095, + "learning_rate": 4.02820830750155e-07, + "loss": 0.9168, + "step": 103960 + }, + { + "epoch": 8.056879383160913, + "grad_norm": 1.5317770558981099, + "learning_rate": 4.028595784252945e-07, + "loss": 0.9169, + "step": 103970 + }, + { + "epoch": 8.05765430663722, + "grad_norm": 1.3390363452167875, + "learning_rate": 4.0289832610043397e-07, + "loss": 0.9101, + "step": 103980 + }, + { + "epoch": 8.058429230113527, + "grad_norm": 1.4711205882779521, + "learning_rate": 4.029370737755735e-07, + "loss": 0.9082, + "step": 103990 + }, + { + "epoch": 8.059204153589834, + "grad_norm": 1.4317414784285516, + "learning_rate": 4.0297582145071296e-07, + "loss": 0.9166, + "step": 104000 + }, + { + "epoch": 8.059204153589834, + "eval_loss": 0.9212583899497986, + "eval_runtime": 331.9257, + "eval_samples_per_second": 34.559, + "eval_steps_per_second": 8.64, + "step": 104000 + }, + { + "epoch": 8.05997907706614, + "grad_norm": 1.4393020395090865, + "learning_rate": 4.030145691258525e-07, + "loss": 0.923, + "step": 104010 + }, + { + "epoch": 8.060754000542447, + "grad_norm": 1.387215926889405, + "learning_rate": 4.0305331680099195e-07, + "loss": 0.9168, + "step": 104020 + }, + { + "epoch": 8.061528924018754, + "grad_norm": 1.3900719775228936, + "learning_rate": 4.030920644761314e-07, + "loss": 0.9248, + "step": 104030 + }, + { + "epoch": 8.062303847495059, + "grad_norm": 1.4856411571715789, + "learning_rate": 4.0313081215127094e-07, + "loss": 0.9105, + "step": 104040 + }, + { + "epoch": 8.063078770971366, + "grad_norm": 1.483099118010514, + "learning_rate": 4.031695598264104e-07, + "loss": 0.9554, + "step": 104050 + }, + { + "epoch": 8.063853694447673, + "grad_norm": 1.3688877319826573, + "learning_rate": 4.0320830750154993e-07, + "loss": 0.9172, + "step": 104060 + }, + { + "epoch": 8.06462861792398, + "grad_norm": 1.4030089092678288, + "learning_rate": 4.032470551766894e-07, + "loss": 0.9163, + "step": 104070 + }, + { + "epoch": 8.065403541400286, + "grad_norm": 1.5436773262051418, + "learning_rate": 4.032858028518289e-07, + "loss": 0.8942, + "step": 104080 + }, + { + "epoch": 8.066178464876593, + "grad_norm": 1.3863170689549862, + "learning_rate": 4.033245505269684e-07, + "loss": 0.9081, + "step": 104090 + }, + { + "epoch": 8.0669533883529, + "grad_norm": 1.4965142546563657, + "learning_rate": 4.0336329820210786e-07, + "loss": 0.9024, + "step": 104100 + }, + { + "epoch": 8.067728311829207, + "grad_norm": 1.3929128908226796, + "learning_rate": 4.034020458772474e-07, + "loss": 0.9215, + "step": 104110 + }, + { + "epoch": 8.068503235305513, + "grad_norm": 1.468242969245847, + "learning_rate": 4.0344079355238685e-07, + "loss": 0.9329, + "step": 104120 + }, + { + "epoch": 8.06927815878182, + "grad_norm": 1.4076930414230193, + "learning_rate": 4.034795412275264e-07, + "loss": 0.8758, + "step": 104130 + }, + { + "epoch": 8.070053082258127, + "grad_norm": 1.5251328728936264, + "learning_rate": 4.0351828890266585e-07, + "loss": 0.9185, + "step": 104140 + }, + { + "epoch": 8.070828005734434, + "grad_norm": 1.5106401461780292, + "learning_rate": 4.0355703657780537e-07, + "loss": 0.9332, + "step": 104150 + }, + { + "epoch": 8.07160292921074, + "grad_norm": 1.4492722177895019, + "learning_rate": 4.0359578425294484e-07, + "loss": 0.917, + "step": 104160 + }, + { + "epoch": 8.072377852687048, + "grad_norm": 1.4688110909064167, + "learning_rate": 4.036345319280843e-07, + "loss": 0.9203, + "step": 104170 + }, + { + "epoch": 8.073152776163354, + "grad_norm": 1.3795129945246316, + "learning_rate": 4.0367327960322383e-07, + "loss": 0.9064, + "step": 104180 + }, + { + "epoch": 8.073927699639661, + "grad_norm": 1.521315221645112, + "learning_rate": 4.037120272783633e-07, + "loss": 0.9, + "step": 104190 + }, + { + "epoch": 8.074702623115968, + "grad_norm": 1.412056592096577, + "learning_rate": 4.037507749535028e-07, + "loss": 0.9056, + "step": 104200 + }, + { + "epoch": 8.075477546592275, + "grad_norm": 1.5368329448386395, + "learning_rate": 4.037895226286423e-07, + "loss": 0.9274, + "step": 104210 + }, + { + "epoch": 8.076252470068582, + "grad_norm": 1.4169634890520693, + "learning_rate": 4.038282703037818e-07, + "loss": 0.9176, + "step": 104220 + }, + { + "epoch": 8.077027393544887, + "grad_norm": 1.5560994722407802, + "learning_rate": 4.038670179789213e-07, + "loss": 0.925, + "step": 104230 + }, + { + "epoch": 8.077802317021193, + "grad_norm": 1.3995172078348959, + "learning_rate": 4.0390576565406075e-07, + "loss": 0.9146, + "step": 104240 + }, + { + "epoch": 8.0785772404975, + "grad_norm": 1.4287557199664431, + "learning_rate": 4.0394451332920027e-07, + "loss": 0.901, + "step": 104250 + }, + { + "epoch": 8.079352163973807, + "grad_norm": 1.3975212626500275, + "learning_rate": 4.0398326100433974e-07, + "loss": 0.9036, + "step": 104260 + }, + { + "epoch": 8.080127087450114, + "grad_norm": 1.3684683009653715, + "learning_rate": 4.0402200867947926e-07, + "loss": 0.8751, + "step": 104270 + }, + { + "epoch": 8.08090201092642, + "grad_norm": 1.3761401050791022, + "learning_rate": 4.0406075635461873e-07, + "loss": 0.8993, + "step": 104280 + }, + { + "epoch": 8.081676934402727, + "grad_norm": 1.4258746385157093, + "learning_rate": 4.0409950402975825e-07, + "loss": 0.9187, + "step": 104290 + }, + { + "epoch": 8.082451857879034, + "grad_norm": 1.4064299589816358, + "learning_rate": 4.041382517048977e-07, + "loss": 0.9139, + "step": 104300 + }, + { + "epoch": 8.083226781355341, + "grad_norm": 1.4110545696638699, + "learning_rate": 4.041769993800372e-07, + "loss": 0.9015, + "step": 104310 + }, + { + "epoch": 8.084001704831648, + "grad_norm": 1.4456486378643756, + "learning_rate": 4.042157470551767e-07, + "loss": 0.8961, + "step": 104320 + }, + { + "epoch": 8.084776628307955, + "grad_norm": 1.4633048598630485, + "learning_rate": 4.042544947303162e-07, + "loss": 0.9213, + "step": 104330 + }, + { + "epoch": 8.085551551784262, + "grad_norm": 1.3603215724762734, + "learning_rate": 4.042932424054557e-07, + "loss": 0.9241, + "step": 104340 + }, + { + "epoch": 8.086326475260568, + "grad_norm": 1.5002588869995122, + "learning_rate": 4.0433199008059517e-07, + "loss": 0.8845, + "step": 104350 + }, + { + "epoch": 8.087101398736875, + "grad_norm": 1.4163624254365559, + "learning_rate": 4.043707377557347e-07, + "loss": 0.9013, + "step": 104360 + }, + { + "epoch": 8.087876322213182, + "grad_norm": 1.461776351199422, + "learning_rate": 4.0440948543087416e-07, + "loss": 0.9284, + "step": 104370 + }, + { + "epoch": 8.088651245689489, + "grad_norm": 1.4632816959332393, + "learning_rate": 4.0444823310601363e-07, + "loss": 0.9216, + "step": 104380 + }, + { + "epoch": 8.089426169165796, + "grad_norm": 1.4731616207351044, + "learning_rate": 4.0448698078115315e-07, + "loss": 0.929, + "step": 104390 + }, + { + "epoch": 8.090201092642102, + "grad_norm": 1.462810772910437, + "learning_rate": 4.045257284562926e-07, + "loss": 0.9065, + "step": 104400 + }, + { + "epoch": 8.090976016118407, + "grad_norm": 1.3794863167501015, + "learning_rate": 4.0456447613143215e-07, + "loss": 0.9188, + "step": 104410 + }, + { + "epoch": 8.091750939594714, + "grad_norm": 1.3864372439858417, + "learning_rate": 4.046032238065716e-07, + "loss": 0.8922, + "step": 104420 + }, + { + "epoch": 8.092525863071021, + "grad_norm": 1.3523371062565523, + "learning_rate": 4.0464197148171114e-07, + "loss": 0.9063, + "step": 104430 + }, + { + "epoch": 8.093300786547328, + "grad_norm": 1.46857811603032, + "learning_rate": 4.046807191568506e-07, + "loss": 0.9212, + "step": 104440 + }, + { + "epoch": 8.094075710023635, + "grad_norm": 1.4130489691777424, + "learning_rate": 4.047194668319901e-07, + "loss": 0.8887, + "step": 104450 + }, + { + "epoch": 8.094850633499941, + "grad_norm": 1.4587389233481736, + "learning_rate": 4.047582145071296e-07, + "loss": 0.9049, + "step": 104460 + }, + { + "epoch": 8.095625556976248, + "grad_norm": 1.4592484918086108, + "learning_rate": 4.0479696218226907e-07, + "loss": 0.9022, + "step": 104470 + }, + { + "epoch": 8.096400480452555, + "grad_norm": 1.48041932955101, + "learning_rate": 4.048357098574086e-07, + "loss": 0.9022, + "step": 104480 + }, + { + "epoch": 8.097175403928862, + "grad_norm": 1.4243709418714372, + "learning_rate": 4.0487445753254806e-07, + "loss": 0.9413, + "step": 104490 + }, + { + "epoch": 8.097950327405169, + "grad_norm": 1.3201922557909347, + "learning_rate": 4.049132052076875e-07, + "loss": 0.8978, + "step": 104500 + }, + { + "epoch": 8.097950327405169, + "eval_loss": 0.9210622906684875, + "eval_runtime": 327.8346, + "eval_samples_per_second": 34.99, + "eval_steps_per_second": 8.748, + "step": 104500 + }, + { + "epoch": 8.098725250881476, + "grad_norm": 1.453880615466816, + "learning_rate": 4.0495195288282705e-07, + "loss": 0.9174, + "step": 104510 + }, + { + "epoch": 8.099500174357782, + "grad_norm": 1.378700737617905, + "learning_rate": 4.049907005579665e-07, + "loss": 0.9008, + "step": 104520 + }, + { + "epoch": 8.10027509783409, + "grad_norm": 1.4712438929194553, + "learning_rate": 4.0502944823310604e-07, + "loss": 0.9306, + "step": 104530 + }, + { + "epoch": 8.101050021310396, + "grad_norm": 1.48986123288013, + "learning_rate": 4.050681959082455e-07, + "loss": 0.902, + "step": 104540 + }, + { + "epoch": 8.101824944786703, + "grad_norm": 1.427190322841918, + "learning_rate": 4.0510694358338503e-07, + "loss": 0.9429, + "step": 104550 + }, + { + "epoch": 8.10259986826301, + "grad_norm": 1.4521774528405271, + "learning_rate": 4.051456912585245e-07, + "loss": 0.9112, + "step": 104560 + }, + { + "epoch": 8.103374791739316, + "grad_norm": 1.4111053074646096, + "learning_rate": 4.0518443893366397e-07, + "loss": 0.912, + "step": 104570 + }, + { + "epoch": 8.104149715215623, + "grad_norm": 1.4848621954380985, + "learning_rate": 4.052231866088035e-07, + "loss": 0.9352, + "step": 104580 + }, + { + "epoch": 8.10492463869193, + "grad_norm": 1.4776734226539487, + "learning_rate": 4.0526193428394296e-07, + "loss": 0.9188, + "step": 104590 + }, + { + "epoch": 8.105699562168235, + "grad_norm": 1.444741426691392, + "learning_rate": 4.053006819590825e-07, + "loss": 0.9057, + "step": 104600 + }, + { + "epoch": 8.106474485644542, + "grad_norm": 1.343066712814432, + "learning_rate": 4.0533942963422195e-07, + "loss": 0.8876, + "step": 104610 + }, + { + "epoch": 8.107249409120849, + "grad_norm": 1.5237591404841602, + "learning_rate": 4.0537817730936147e-07, + "loss": 0.9162, + "step": 104620 + }, + { + "epoch": 8.108024332597155, + "grad_norm": 1.4800999738586818, + "learning_rate": 4.0541692498450094e-07, + "loss": 0.9242, + "step": 104630 + }, + { + "epoch": 8.108799256073462, + "grad_norm": 1.443396539438176, + "learning_rate": 4.054556726596404e-07, + "loss": 0.9335, + "step": 104640 + }, + { + "epoch": 8.10957417954977, + "grad_norm": 1.443427386679186, + "learning_rate": 4.0549442033477993e-07, + "loss": 0.9176, + "step": 104650 + }, + { + "epoch": 8.110349103026076, + "grad_norm": 1.5540951534365806, + "learning_rate": 4.055331680099194e-07, + "loss": 0.9275, + "step": 104660 + }, + { + "epoch": 8.111124026502383, + "grad_norm": 1.3818813960354623, + "learning_rate": 4.055719156850589e-07, + "loss": 0.894, + "step": 104670 + }, + { + "epoch": 8.11189894997869, + "grad_norm": 1.4341956068847177, + "learning_rate": 4.056106633601984e-07, + "loss": 0.9229, + "step": 104680 + }, + { + "epoch": 8.112673873454996, + "grad_norm": 1.4062923188544183, + "learning_rate": 4.056494110353379e-07, + "loss": 0.9204, + "step": 104690 + }, + { + "epoch": 8.113448796931303, + "grad_norm": 1.48846818811775, + "learning_rate": 4.056881587104774e-07, + "loss": 0.9467, + "step": 104700 + }, + { + "epoch": 8.11422372040761, + "grad_norm": 1.5581588673218316, + "learning_rate": 4.0572690638561685e-07, + "loss": 0.892, + "step": 104710 + }, + { + "epoch": 8.114998643883917, + "grad_norm": 1.4294430372915667, + "learning_rate": 4.057656540607564e-07, + "loss": 0.9163, + "step": 104720 + }, + { + "epoch": 8.115773567360224, + "grad_norm": 1.3565525005963668, + "learning_rate": 4.0580440173589584e-07, + "loss": 0.9006, + "step": 104730 + }, + { + "epoch": 8.11654849083653, + "grad_norm": 1.5161454487170312, + "learning_rate": 4.0584314941103536e-07, + "loss": 0.9116, + "step": 104740 + }, + { + "epoch": 8.117323414312837, + "grad_norm": 1.4881738240718294, + "learning_rate": 4.0588189708617483e-07, + "loss": 0.9142, + "step": 104750 + }, + { + "epoch": 8.118098337789144, + "grad_norm": 1.445350500580828, + "learning_rate": 4.0592064476131436e-07, + "loss": 0.9187, + "step": 104760 + }, + { + "epoch": 8.11887326126545, + "grad_norm": 1.5204960681281954, + "learning_rate": 4.059593924364538e-07, + "loss": 0.9129, + "step": 104770 + }, + { + "epoch": 8.119648184741756, + "grad_norm": 1.4271190551346693, + "learning_rate": 4.059981401115933e-07, + "loss": 0.9033, + "step": 104780 + }, + { + "epoch": 8.120423108218063, + "grad_norm": 1.4979315443271912, + "learning_rate": 4.060368877867328e-07, + "loss": 0.9213, + "step": 104790 + }, + { + "epoch": 8.12119803169437, + "grad_norm": 1.4766900739792814, + "learning_rate": 4.060756354618723e-07, + "loss": 0.9051, + "step": 104800 + }, + { + "epoch": 8.121972955170676, + "grad_norm": 1.4667684377447214, + "learning_rate": 4.061143831370118e-07, + "loss": 0.9341, + "step": 104810 + }, + { + "epoch": 8.122747878646983, + "grad_norm": 1.2998616494225779, + "learning_rate": 4.061531308121513e-07, + "loss": 0.8935, + "step": 104820 + }, + { + "epoch": 8.12352280212329, + "grad_norm": 1.4375309323404875, + "learning_rate": 4.061918784872908e-07, + "loss": 0.9326, + "step": 104830 + }, + { + "epoch": 8.124297725599597, + "grad_norm": 1.4727871853112033, + "learning_rate": 4.0623062616243027e-07, + "loss": 0.9084, + "step": 104840 + }, + { + "epoch": 8.125072649075904, + "grad_norm": 1.4561577045976375, + "learning_rate": 4.0626937383756974e-07, + "loss": 0.906, + "step": 104850 + }, + { + "epoch": 8.12584757255221, + "grad_norm": 1.4135539267065367, + "learning_rate": 4.0630812151270926e-07, + "loss": 0.9057, + "step": 104860 + }, + { + "epoch": 8.126622496028517, + "grad_norm": 1.4696131953809666, + "learning_rate": 4.0634686918784873e-07, + "loss": 0.9196, + "step": 104870 + }, + { + "epoch": 8.127397419504824, + "grad_norm": 1.4144905004731942, + "learning_rate": 4.0638561686298825e-07, + "loss": 0.9066, + "step": 104880 + }, + { + "epoch": 8.12817234298113, + "grad_norm": 1.2971846343050135, + "learning_rate": 4.064243645381277e-07, + "loss": 0.9179, + "step": 104890 + }, + { + "epoch": 8.128947266457438, + "grad_norm": 1.473497087113831, + "learning_rate": 4.0646311221326724e-07, + "loss": 0.9078, + "step": 104900 + }, + { + "epoch": 8.129722189933744, + "grad_norm": 1.427306704592014, + "learning_rate": 4.065018598884067e-07, + "loss": 0.9211, + "step": 104910 + }, + { + "epoch": 8.130497113410051, + "grad_norm": 1.39724480563401, + "learning_rate": 4.065406075635462e-07, + "loss": 0.9094, + "step": 104920 + }, + { + "epoch": 8.131272036886358, + "grad_norm": 1.4982877144297022, + "learning_rate": 4.065793552386857e-07, + "loss": 0.9032, + "step": 104930 + }, + { + "epoch": 8.132046960362665, + "grad_norm": 1.4059837162919626, + "learning_rate": 4.0661810291382517e-07, + "loss": 0.9144, + "step": 104940 + }, + { + "epoch": 8.132821883838972, + "grad_norm": 1.4349636835012725, + "learning_rate": 4.066568505889647e-07, + "loss": 0.8989, + "step": 104950 + }, + { + "epoch": 8.133596807315278, + "grad_norm": 1.434050342924566, + "learning_rate": 4.0669559826410416e-07, + "loss": 0.9038, + "step": 104960 + }, + { + "epoch": 8.134371730791583, + "grad_norm": 1.521382499775324, + "learning_rate": 4.067343459392437e-07, + "loss": 0.9062, + "step": 104970 + }, + { + "epoch": 8.13514665426789, + "grad_norm": 1.4166926181857926, + "learning_rate": 4.0677309361438315e-07, + "loss": 0.9099, + "step": 104980 + }, + { + "epoch": 8.135921577744197, + "grad_norm": 1.5342601961414397, + "learning_rate": 4.068118412895226e-07, + "loss": 0.9178, + "step": 104990 + }, + { + "epoch": 8.136696501220504, + "grad_norm": 1.4000961365354052, + "learning_rate": 4.0685058896466214e-07, + "loss": 0.9188, + "step": 105000 + }, + { + "epoch": 8.136696501220504, + "eval_loss": 0.9207261204719543, + "eval_runtime": 326.7738, + "eval_samples_per_second": 35.104, + "eval_steps_per_second": 8.777, + "step": 105000 + }, + { + "epoch": 8.13747142469681, + "grad_norm": 1.3842590646028312, + "learning_rate": 4.068893366398016e-07, + "loss": 0.9294, + "step": 105010 + }, + { + "epoch": 8.138246348173118, + "grad_norm": 1.4598834867025903, + "learning_rate": 4.0692808431494113e-07, + "loss": 0.9016, + "step": 105020 + }, + { + "epoch": 8.139021271649424, + "grad_norm": 1.477997084981107, + "learning_rate": 4.069668319900806e-07, + "loss": 0.916, + "step": 105030 + }, + { + "epoch": 8.139796195125731, + "grad_norm": 1.4493287885198392, + "learning_rate": 4.070055796652201e-07, + "loss": 0.919, + "step": 105040 + }, + { + "epoch": 8.140571118602038, + "grad_norm": 1.3661305716034733, + "learning_rate": 4.070443273403596e-07, + "loss": 0.9174, + "step": 105050 + }, + { + "epoch": 8.141346042078345, + "grad_norm": 1.444187625339828, + "learning_rate": 4.0708307501549906e-07, + "loss": 0.9225, + "step": 105060 + }, + { + "epoch": 8.142120965554652, + "grad_norm": 1.42941802004654, + "learning_rate": 4.071218226906386e-07, + "loss": 0.9013, + "step": 105070 + }, + { + "epoch": 8.142895889030958, + "grad_norm": 1.4037409676971717, + "learning_rate": 4.0716057036577805e-07, + "loss": 0.9079, + "step": 105080 + }, + { + "epoch": 8.143670812507265, + "grad_norm": 1.362802229241616, + "learning_rate": 4.071993180409176e-07, + "loss": 0.9246, + "step": 105090 + }, + { + "epoch": 8.144445735983572, + "grad_norm": 1.4860361547922436, + "learning_rate": 4.0723806571605704e-07, + "loss": 0.9085, + "step": 105100 + }, + { + "epoch": 8.145220659459879, + "grad_norm": 1.5005293057098932, + "learning_rate": 4.0727681339119657e-07, + "loss": 0.9251, + "step": 105110 + }, + { + "epoch": 8.145995582936186, + "grad_norm": 1.4719249268093995, + "learning_rate": 4.0731556106633604e-07, + "loss": 0.9206, + "step": 105120 + }, + { + "epoch": 8.146770506412492, + "grad_norm": 1.4921192976092947, + "learning_rate": 4.073543087414755e-07, + "loss": 0.9193, + "step": 105130 + }, + { + "epoch": 8.1475454298888, + "grad_norm": 1.4084067151075432, + "learning_rate": 4.07393056416615e-07, + "loss": 0.9225, + "step": 105140 + }, + { + "epoch": 8.148320353365104, + "grad_norm": 1.4139393763376125, + "learning_rate": 4.074318040917545e-07, + "loss": 0.9111, + "step": 105150 + }, + { + "epoch": 8.149095276841411, + "grad_norm": 1.4029890594507333, + "learning_rate": 4.07470551766894e-07, + "loss": 0.9338, + "step": 105160 + }, + { + "epoch": 8.149870200317718, + "grad_norm": 1.5038243249455234, + "learning_rate": 4.075092994420335e-07, + "loss": 0.9074, + "step": 105170 + }, + { + "epoch": 8.150645123794025, + "grad_norm": 1.3965222775641686, + "learning_rate": 4.0754804711717296e-07, + "loss": 0.9055, + "step": 105180 + }, + { + "epoch": 8.151420047270332, + "grad_norm": 1.50262485413544, + "learning_rate": 4.075867947923125e-07, + "loss": 0.9198, + "step": 105190 + }, + { + "epoch": 8.152194970746638, + "grad_norm": 1.3701642945980101, + "learning_rate": 4.0762554246745195e-07, + "loss": 0.9013, + "step": 105200 + }, + { + "epoch": 8.152969894222945, + "grad_norm": 1.3810670206375502, + "learning_rate": 4.0766429014259147e-07, + "loss": 0.9023, + "step": 105210 + }, + { + "epoch": 8.153744817699252, + "grad_norm": 1.5014450057290243, + "learning_rate": 4.0770303781773094e-07, + "loss": 0.9171, + "step": 105220 + }, + { + "epoch": 8.154519741175559, + "grad_norm": 1.3465775124473394, + "learning_rate": 4.0774178549287046e-07, + "loss": 0.8952, + "step": 105230 + }, + { + "epoch": 8.155294664651866, + "grad_norm": 1.3961221642317554, + "learning_rate": 4.0778053316800993e-07, + "loss": 0.8945, + "step": 105240 + }, + { + "epoch": 8.156069588128172, + "grad_norm": 1.3749480685576396, + "learning_rate": 4.078192808431494e-07, + "loss": 0.9255, + "step": 105250 + }, + { + "epoch": 8.15684451160448, + "grad_norm": 1.4439989233181338, + "learning_rate": 4.078580285182889e-07, + "loss": 0.912, + "step": 105260 + }, + { + "epoch": 8.157619435080786, + "grad_norm": 1.4831037302495063, + "learning_rate": 4.078967761934284e-07, + "loss": 0.9218, + "step": 105270 + }, + { + "epoch": 8.158394358557093, + "grad_norm": 1.4310111836469064, + "learning_rate": 4.079355238685679e-07, + "loss": 0.9344, + "step": 105280 + }, + { + "epoch": 8.1591692820334, + "grad_norm": 1.4421270450132548, + "learning_rate": 4.079742715437074e-07, + "loss": 0.934, + "step": 105290 + }, + { + "epoch": 8.159944205509706, + "grad_norm": 1.4581926903154223, + "learning_rate": 4.080130192188469e-07, + "loss": 0.9214, + "step": 105300 + }, + { + "epoch": 8.160719128986013, + "grad_norm": 1.4764480972347365, + "learning_rate": 4.0805176689398637e-07, + "loss": 0.8984, + "step": 105310 + }, + { + "epoch": 8.16149405246232, + "grad_norm": 1.4218252960977809, + "learning_rate": 4.0809051456912584e-07, + "loss": 0.9108, + "step": 105320 + }, + { + "epoch": 8.162268975938627, + "grad_norm": 1.3955076102252972, + "learning_rate": 4.0812926224426536e-07, + "loss": 0.894, + "step": 105330 + }, + { + "epoch": 8.163043899414932, + "grad_norm": 1.3468410083672928, + "learning_rate": 4.0816800991940483e-07, + "loss": 0.9178, + "step": 105340 + }, + { + "epoch": 8.163818822891239, + "grad_norm": 1.4457171548536405, + "learning_rate": 4.0820675759454435e-07, + "loss": 0.9291, + "step": 105350 + }, + { + "epoch": 8.164593746367546, + "grad_norm": 1.470211318493687, + "learning_rate": 4.082455052696838e-07, + "loss": 0.9141, + "step": 105360 + }, + { + "epoch": 8.165368669843852, + "grad_norm": 1.4507001816361436, + "learning_rate": 4.0828425294482334e-07, + "loss": 0.9168, + "step": 105370 + }, + { + "epoch": 8.16614359332016, + "grad_norm": 1.498820070886096, + "learning_rate": 4.083230006199628e-07, + "loss": 0.9335, + "step": 105380 + }, + { + "epoch": 8.166918516796466, + "grad_norm": 1.4649969573878896, + "learning_rate": 4.083617482951023e-07, + "loss": 0.8938, + "step": 105390 + }, + { + "epoch": 8.167693440272773, + "grad_norm": 1.4856786071553951, + "learning_rate": 4.084004959702418e-07, + "loss": 0.8978, + "step": 105400 + }, + { + "epoch": 8.16846836374908, + "grad_norm": 1.42505526855303, + "learning_rate": 4.0843924364538127e-07, + "loss": 0.9166, + "step": 105410 + }, + { + "epoch": 8.169243287225386, + "grad_norm": 1.4465754722674036, + "learning_rate": 4.084779913205208e-07, + "loss": 0.9025, + "step": 105420 + }, + { + "epoch": 8.170018210701693, + "grad_norm": 1.354330689269429, + "learning_rate": 4.0851673899566026e-07, + "loss": 0.9091, + "step": 105430 + }, + { + "epoch": 8.170793134178, + "grad_norm": 1.4901658639920101, + "learning_rate": 4.085554866707998e-07, + "loss": 0.9033, + "step": 105440 + }, + { + "epoch": 8.171568057654307, + "grad_norm": 1.4496510253098753, + "learning_rate": 4.0859423434593925e-07, + "loss": 0.912, + "step": 105450 + }, + { + "epoch": 8.172342981130614, + "grad_norm": 1.4303896874095798, + "learning_rate": 4.086329820210787e-07, + "loss": 0.9127, + "step": 105460 + }, + { + "epoch": 8.17311790460692, + "grad_norm": 1.4559250507117067, + "learning_rate": 4.0867172969621825e-07, + "loss": 0.9255, + "step": 105470 + }, + { + "epoch": 8.173892828083227, + "grad_norm": 1.389693273951282, + "learning_rate": 4.087104773713577e-07, + "loss": 0.901, + "step": 105480 + }, + { + "epoch": 8.174667751559534, + "grad_norm": 1.4364760537598134, + "learning_rate": 4.0874922504649724e-07, + "loss": 0.9206, + "step": 105490 + }, + { + "epoch": 8.17544267503584, + "grad_norm": 1.4667468247149997, + "learning_rate": 4.087879727216367e-07, + "loss": 0.9076, + "step": 105500 + }, + { + "epoch": 8.17544267503584, + "eval_loss": 0.9204335808753967, + "eval_runtime": 327.8741, + "eval_samples_per_second": 34.986, + "eval_steps_per_second": 8.747, + "step": 105500 + }, + { + "epoch": 8.176217598512148, + "grad_norm": 1.3791438070927244, + "learning_rate": 4.0882672039677623e-07, + "loss": 0.9282, + "step": 105510 + }, + { + "epoch": 8.176992521988454, + "grad_norm": 1.4109499423190204, + "learning_rate": 4.088654680719157e-07, + "loss": 0.9284, + "step": 105520 + }, + { + "epoch": 8.17776744546476, + "grad_norm": 1.5151432075670739, + "learning_rate": 4.0890421574705517e-07, + "loss": 0.9051, + "step": 105530 + }, + { + "epoch": 8.178542368941066, + "grad_norm": 1.443951278459268, + "learning_rate": 4.089429634221947e-07, + "loss": 0.9196, + "step": 105540 + }, + { + "epoch": 8.179317292417373, + "grad_norm": 1.4299939368119996, + "learning_rate": 4.0898171109733416e-07, + "loss": 0.9013, + "step": 105550 + }, + { + "epoch": 8.18009221589368, + "grad_norm": 1.5431173419371471, + "learning_rate": 4.090204587724737e-07, + "loss": 0.9178, + "step": 105560 + }, + { + "epoch": 8.180867139369987, + "grad_norm": 1.4397022750772674, + "learning_rate": 4.0905920644761315e-07, + "loss": 0.9122, + "step": 105570 + }, + { + "epoch": 8.181642062846294, + "grad_norm": 1.4459993172073222, + "learning_rate": 4.0909795412275267e-07, + "loss": 0.8942, + "step": 105580 + }, + { + "epoch": 8.1824169863226, + "grad_norm": 1.4778748396729582, + "learning_rate": 4.0913670179789214e-07, + "loss": 0.9125, + "step": 105590 + }, + { + "epoch": 8.183191909798907, + "grad_norm": 1.3724813322251634, + "learning_rate": 4.091754494730316e-07, + "loss": 0.9142, + "step": 105600 + }, + { + "epoch": 8.183966833275214, + "grad_norm": 1.421645676546598, + "learning_rate": 4.0921419714817113e-07, + "loss": 0.9046, + "step": 105610 + }, + { + "epoch": 8.18474175675152, + "grad_norm": 1.4295644694265217, + "learning_rate": 4.092529448233106e-07, + "loss": 0.9048, + "step": 105620 + }, + { + "epoch": 8.185516680227828, + "grad_norm": 1.4728172084078097, + "learning_rate": 4.092916924984501e-07, + "loss": 0.9147, + "step": 105630 + }, + { + "epoch": 8.186291603704134, + "grad_norm": 1.4564068933196688, + "learning_rate": 4.093304401735896e-07, + "loss": 0.9105, + "step": 105640 + }, + { + "epoch": 8.187066527180441, + "grad_norm": 1.4403511245348903, + "learning_rate": 4.093691878487291e-07, + "loss": 0.9162, + "step": 105650 + }, + { + "epoch": 8.187841450656748, + "grad_norm": 1.5807613295283114, + "learning_rate": 4.094079355238686e-07, + "loss": 0.9081, + "step": 105660 + }, + { + "epoch": 8.188616374133055, + "grad_norm": 1.3519190011100792, + "learning_rate": 4.0944668319900805e-07, + "loss": 0.9076, + "step": 105670 + }, + { + "epoch": 8.189391297609362, + "grad_norm": 1.477854312718591, + "learning_rate": 4.0948543087414757e-07, + "loss": 0.9034, + "step": 105680 + }, + { + "epoch": 8.190166221085668, + "grad_norm": 1.388260531988575, + "learning_rate": 4.0952417854928704e-07, + "loss": 0.8895, + "step": 105690 + }, + { + "epoch": 8.190941144561975, + "grad_norm": 1.4639076804758866, + "learning_rate": 4.0956292622442656e-07, + "loss": 0.9188, + "step": 105700 + }, + { + "epoch": 8.191716068038282, + "grad_norm": 1.3831049906021882, + "learning_rate": 4.0960167389956603e-07, + "loss": 0.9345, + "step": 105710 + }, + { + "epoch": 8.192490991514587, + "grad_norm": 1.439053693219489, + "learning_rate": 4.0964042157470555e-07, + "loss": 0.9094, + "step": 105720 + }, + { + "epoch": 8.193265914990894, + "grad_norm": 1.480726782060119, + "learning_rate": 4.09679169249845e-07, + "loss": 0.9309, + "step": 105730 + }, + { + "epoch": 8.1940408384672, + "grad_norm": 1.4638674793022766, + "learning_rate": 4.097179169249845e-07, + "loss": 0.9202, + "step": 105740 + }, + { + "epoch": 8.194815761943508, + "grad_norm": 1.3727957656548007, + "learning_rate": 4.09756664600124e-07, + "loss": 0.9211, + "step": 105750 + }, + { + "epoch": 8.195590685419814, + "grad_norm": 1.453485728088525, + "learning_rate": 4.097954122752635e-07, + "loss": 0.9446, + "step": 105760 + }, + { + "epoch": 8.196365608896121, + "grad_norm": 1.389865357946805, + "learning_rate": 4.09834159950403e-07, + "loss": 0.9146, + "step": 105770 + }, + { + "epoch": 8.197140532372428, + "grad_norm": 1.4421682952172734, + "learning_rate": 4.098729076255425e-07, + "loss": 0.9138, + "step": 105780 + }, + { + "epoch": 8.197915455848735, + "grad_norm": 1.371962652621803, + "learning_rate": 4.09911655300682e-07, + "loss": 0.9116, + "step": 105790 + }, + { + "epoch": 8.198690379325042, + "grad_norm": 1.3903306941739126, + "learning_rate": 4.0995040297582147e-07, + "loss": 0.9043, + "step": 105800 + }, + { + "epoch": 8.199465302801348, + "grad_norm": 1.4817707782820606, + "learning_rate": 4.0998915065096093e-07, + "loss": 0.8938, + "step": 105810 + }, + { + "epoch": 8.200240226277655, + "grad_norm": 1.4419878177481196, + "learning_rate": 4.1002789832610046e-07, + "loss": 0.918, + "step": 105820 + }, + { + "epoch": 8.201015149753962, + "grad_norm": 1.4210845282048203, + "learning_rate": 4.100666460012399e-07, + "loss": 0.9149, + "step": 105830 + }, + { + "epoch": 8.201790073230269, + "grad_norm": 1.4615629910123449, + "learning_rate": 4.1010539367637945e-07, + "loss": 0.8863, + "step": 105840 + }, + { + "epoch": 8.202564996706576, + "grad_norm": 1.552601866006786, + "learning_rate": 4.101441413515189e-07, + "loss": 0.9186, + "step": 105850 + }, + { + "epoch": 8.203339920182882, + "grad_norm": 1.4479330534560797, + "learning_rate": 4.1018288902665844e-07, + "loss": 0.899, + "step": 105860 + }, + { + "epoch": 8.20411484365919, + "grad_norm": 1.3867673002566163, + "learning_rate": 4.102216367017979e-07, + "loss": 0.9177, + "step": 105870 + }, + { + "epoch": 8.204889767135496, + "grad_norm": 1.4233369584953197, + "learning_rate": 4.102603843769374e-07, + "loss": 0.9109, + "step": 105880 + }, + { + "epoch": 8.205664690611803, + "grad_norm": 1.3644543791100021, + "learning_rate": 4.102991320520769e-07, + "loss": 0.8989, + "step": 105890 + }, + { + "epoch": 8.206439614088108, + "grad_norm": 1.5245542581604008, + "learning_rate": 4.1033787972721637e-07, + "loss": 0.9089, + "step": 105900 + }, + { + "epoch": 8.207214537564415, + "grad_norm": 1.4887949598324426, + "learning_rate": 4.103766274023559e-07, + "loss": 0.9225, + "step": 105910 + }, + { + "epoch": 8.207989461040722, + "grad_norm": 1.4077787148814536, + "learning_rate": 4.1041537507749536e-07, + "loss": 0.9255, + "step": 105920 + }, + { + "epoch": 8.208764384517028, + "grad_norm": 1.3665540227455004, + "learning_rate": 4.1045412275263483e-07, + "loss": 0.9074, + "step": 105930 + }, + { + "epoch": 8.209539307993335, + "grad_norm": 1.4666734221239335, + "learning_rate": 4.1049287042777435e-07, + "loss": 0.902, + "step": 105940 + }, + { + "epoch": 8.210314231469642, + "grad_norm": 1.4093050106455245, + "learning_rate": 4.105316181029138e-07, + "loss": 0.9288, + "step": 105950 + }, + { + "epoch": 8.211089154945949, + "grad_norm": 1.404269354817556, + "learning_rate": 4.1057036577805334e-07, + "loss": 0.9142, + "step": 105960 + }, + { + "epoch": 8.211864078422256, + "grad_norm": 1.420816938221865, + "learning_rate": 4.106091134531928e-07, + "loss": 0.9156, + "step": 105970 + }, + { + "epoch": 8.212639001898562, + "grad_norm": 1.3892202333613268, + "learning_rate": 4.1064786112833233e-07, + "loss": 0.9091, + "step": 105980 + }, + { + "epoch": 8.21341392537487, + "grad_norm": 1.5435094681712858, + "learning_rate": 4.106866088034718e-07, + "loss": 0.9209, + "step": 105990 + }, + { + "epoch": 8.214188848851176, + "grad_norm": 1.3700277763537363, + "learning_rate": 4.1072535647861127e-07, + "loss": 0.8978, + "step": 106000 + }, + { + "epoch": 8.214188848851176, + "eval_loss": 0.920173168182373, + "eval_runtime": 327.7583, + "eval_samples_per_second": 34.998, + "eval_steps_per_second": 8.75, + "step": 106000 + }, + { + "epoch": 8.214963772327483, + "grad_norm": 1.6342184723482365, + "learning_rate": 4.107641041537508e-07, + "loss": 0.9322, + "step": 106010 + }, + { + "epoch": 8.21573869580379, + "grad_norm": 1.5191733824204448, + "learning_rate": 4.1080285182889026e-07, + "loss": 0.9073, + "step": 106020 + }, + { + "epoch": 8.216513619280096, + "grad_norm": 1.430922525715692, + "learning_rate": 4.108415995040298e-07, + "loss": 0.8987, + "step": 106030 + }, + { + "epoch": 8.217288542756403, + "grad_norm": 1.4266227418837705, + "learning_rate": 4.1088034717916925e-07, + "loss": 0.9124, + "step": 106040 + }, + { + "epoch": 8.21806346623271, + "grad_norm": 1.4151099232912059, + "learning_rate": 4.109190948543088e-07, + "loss": 0.896, + "step": 106050 + }, + { + "epoch": 8.218838389709017, + "grad_norm": 1.3746404209773138, + "learning_rate": 4.1095784252944824e-07, + "loss": 0.9106, + "step": 106060 + }, + { + "epoch": 8.219613313185324, + "grad_norm": 1.3525129987689282, + "learning_rate": 4.109965902045877e-07, + "loss": 0.914, + "step": 106070 + }, + { + "epoch": 8.22038823666163, + "grad_norm": 1.4436760148454149, + "learning_rate": 4.1103533787972723e-07, + "loss": 0.8994, + "step": 106080 + }, + { + "epoch": 8.221163160137936, + "grad_norm": 1.5920244273595172, + "learning_rate": 4.110740855548667e-07, + "loss": 0.9057, + "step": 106090 + }, + { + "epoch": 8.221938083614242, + "grad_norm": 1.506099729090253, + "learning_rate": 4.111128332300062e-07, + "loss": 0.8982, + "step": 106100 + }, + { + "epoch": 8.22271300709055, + "grad_norm": 1.4317355382092791, + "learning_rate": 4.111515809051457e-07, + "loss": 0.92, + "step": 106110 + }, + { + "epoch": 8.223487930566856, + "grad_norm": 1.3806539797415327, + "learning_rate": 4.111903285802852e-07, + "loss": 0.8957, + "step": 106120 + }, + { + "epoch": 8.224262854043163, + "grad_norm": 1.5289246508752818, + "learning_rate": 4.112290762554247e-07, + "loss": 0.9224, + "step": 106130 + }, + { + "epoch": 8.22503777751947, + "grad_norm": 1.415069370545511, + "learning_rate": 4.1126782393056415e-07, + "loss": 0.9118, + "step": 106140 + }, + { + "epoch": 8.225812700995776, + "grad_norm": 1.42157692774059, + "learning_rate": 4.113065716057037e-07, + "loss": 0.9079, + "step": 106150 + }, + { + "epoch": 8.226587624472083, + "grad_norm": 1.4255127829841492, + "learning_rate": 4.1134531928084314e-07, + "loss": 0.9216, + "step": 106160 + }, + { + "epoch": 8.22736254794839, + "grad_norm": 1.340555870564239, + "learning_rate": 4.1138406695598267e-07, + "loss": 0.9029, + "step": 106170 + }, + { + "epoch": 8.228137471424697, + "grad_norm": 1.4490731763859324, + "learning_rate": 4.1142281463112214e-07, + "loss": 0.903, + "step": 106180 + }, + { + "epoch": 8.228912394901004, + "grad_norm": 1.5316054684477969, + "learning_rate": 4.1146156230626166e-07, + "loss": 0.9053, + "step": 106190 + }, + { + "epoch": 8.22968731837731, + "grad_norm": 1.426872761769062, + "learning_rate": 4.1150030998140113e-07, + "loss": 0.9051, + "step": 106200 + }, + { + "epoch": 8.230462241853617, + "grad_norm": 1.4359333712646238, + "learning_rate": 4.115390576565406e-07, + "loss": 0.9169, + "step": 106210 + }, + { + "epoch": 8.231237165329924, + "grad_norm": 1.4057567957547523, + "learning_rate": 4.115778053316801e-07, + "loss": 0.9086, + "step": 106220 + }, + { + "epoch": 8.232012088806231, + "grad_norm": 1.4963145295376108, + "learning_rate": 4.116165530068196e-07, + "loss": 0.9249, + "step": 106230 + }, + { + "epoch": 8.232787012282538, + "grad_norm": 1.4672562174157802, + "learning_rate": 4.116553006819591e-07, + "loss": 0.8962, + "step": 106240 + }, + { + "epoch": 8.233561935758845, + "grad_norm": 1.4541279223779688, + "learning_rate": 4.116940483570986e-07, + "loss": 0.9205, + "step": 106250 + }, + { + "epoch": 8.234336859235151, + "grad_norm": 1.4401812775895226, + "learning_rate": 4.117327960322381e-07, + "loss": 0.8903, + "step": 106260 + }, + { + "epoch": 8.235111782711456, + "grad_norm": 1.4040404686453052, + "learning_rate": 4.1177154370737757e-07, + "loss": 0.9, + "step": 106270 + }, + { + "epoch": 8.235886706187763, + "grad_norm": 1.4645476538900513, + "learning_rate": 4.1181029138251704e-07, + "loss": 0.9023, + "step": 106280 + }, + { + "epoch": 8.23666162966407, + "grad_norm": 1.3998528228392066, + "learning_rate": 4.1184903905765656e-07, + "loss": 0.9273, + "step": 106290 + }, + { + "epoch": 8.237436553140377, + "grad_norm": 1.4295467144883542, + "learning_rate": 4.1188778673279603e-07, + "loss": 0.9153, + "step": 106300 + }, + { + "epoch": 8.238211476616684, + "grad_norm": 1.4182004860485515, + "learning_rate": 4.1192653440793555e-07, + "loss": 0.8935, + "step": 106310 + }, + { + "epoch": 8.23898640009299, + "grad_norm": 1.4272341666029325, + "learning_rate": 4.11965282083075e-07, + "loss": 0.9102, + "step": 106320 + }, + { + "epoch": 8.239761323569297, + "grad_norm": 1.3995004493816263, + "learning_rate": 4.1200402975821454e-07, + "loss": 0.9123, + "step": 106330 + }, + { + "epoch": 8.240536247045604, + "grad_norm": 1.424023150898539, + "learning_rate": 4.12042777433354e-07, + "loss": 0.8995, + "step": 106340 + }, + { + "epoch": 8.24131117052191, + "grad_norm": 1.3639805074903513, + "learning_rate": 4.120815251084935e-07, + "loss": 0.9166, + "step": 106350 + }, + { + "epoch": 8.242086093998218, + "grad_norm": 1.4826383002439818, + "learning_rate": 4.12120272783633e-07, + "loss": 0.9174, + "step": 106360 + }, + { + "epoch": 8.242861017474524, + "grad_norm": 1.413008920756146, + "learning_rate": 4.1215902045877247e-07, + "loss": 0.8961, + "step": 106370 + }, + { + "epoch": 8.243635940950831, + "grad_norm": 1.3629930548100702, + "learning_rate": 4.12197768133912e-07, + "loss": 0.9078, + "step": 106380 + }, + { + "epoch": 8.244410864427138, + "grad_norm": 1.5310837671509534, + "learning_rate": 4.1223651580905146e-07, + "loss": 0.8995, + "step": 106390 + }, + { + "epoch": 8.245185787903445, + "grad_norm": 1.4502025053137935, + "learning_rate": 4.12275263484191e-07, + "loss": 0.9051, + "step": 106400 + }, + { + "epoch": 8.245960711379752, + "grad_norm": 1.454637991841698, + "learning_rate": 4.1231401115933045e-07, + "loss": 0.9085, + "step": 106410 + }, + { + "epoch": 8.246735634856059, + "grad_norm": 1.3555656460860361, + "learning_rate": 4.123527588344699e-07, + "loss": 0.8958, + "step": 106420 + }, + { + "epoch": 8.247510558332365, + "grad_norm": 1.5234803269865314, + "learning_rate": 4.1239150650960944e-07, + "loss": 0.9158, + "step": 106430 + }, + { + "epoch": 8.248285481808672, + "grad_norm": 1.510338118984459, + "learning_rate": 4.124302541847489e-07, + "loss": 0.9158, + "step": 106440 + }, + { + "epoch": 8.249060405284979, + "grad_norm": 1.3747862282290475, + "learning_rate": 4.1246900185988844e-07, + "loss": 0.923, + "step": 106450 + }, + { + "epoch": 8.249835328761284, + "grad_norm": 1.5280120067694454, + "learning_rate": 4.125077495350279e-07, + "loss": 0.9035, + "step": 106460 + }, + { + "epoch": 8.25061025223759, + "grad_norm": 1.4025203994586517, + "learning_rate": 4.125464972101674e-07, + "loss": 0.9184, + "step": 106470 + }, + { + "epoch": 8.251385175713898, + "grad_norm": 1.3800718603816648, + "learning_rate": 4.125852448853069e-07, + "loss": 0.9112, + "step": 106480 + }, + { + "epoch": 8.252160099190204, + "grad_norm": 1.4030260637263936, + "learning_rate": 4.1262399256044636e-07, + "loss": 0.934, + "step": 106490 + }, + { + "epoch": 8.252935022666511, + "grad_norm": 1.4808472076412524, + "learning_rate": 4.126627402355859e-07, + "loss": 0.9364, + "step": 106500 + }, + { + "epoch": 8.252935022666511, + "eval_loss": 0.919685959815979, + "eval_runtime": 327.2928, + "eval_samples_per_second": 35.048, + "eval_steps_per_second": 8.763, + "step": 106500 + }, + { + "epoch": 8.253709946142818, + "grad_norm": 1.3878062187575806, + "learning_rate": 4.1270148791072536e-07, + "loss": 0.9232, + "step": 106510 + }, + { + "epoch": 8.254484869619125, + "grad_norm": 1.4043666990716677, + "learning_rate": 4.127402355858649e-07, + "loss": 0.9056, + "step": 106520 + }, + { + "epoch": 8.255259793095432, + "grad_norm": 1.4150006446486307, + "learning_rate": 4.1277898326100435e-07, + "loss": 0.8847, + "step": 106530 + }, + { + "epoch": 8.256034716571738, + "grad_norm": 1.4351576933314922, + "learning_rate": 4.1281773093614387e-07, + "loss": 0.9012, + "step": 106540 + }, + { + "epoch": 8.256809640048045, + "grad_norm": 1.511459574496661, + "learning_rate": 4.1285647861128334e-07, + "loss": 0.9123, + "step": 106550 + }, + { + "epoch": 8.257584563524352, + "grad_norm": 1.3998496965622482, + "learning_rate": 4.128952262864228e-07, + "loss": 0.9016, + "step": 106560 + }, + { + "epoch": 8.258359487000659, + "grad_norm": 1.4183405440527657, + "learning_rate": 4.1293397396156233e-07, + "loss": 0.9093, + "step": 106570 + }, + { + "epoch": 8.259134410476966, + "grad_norm": 1.4078156702730802, + "learning_rate": 4.129727216367018e-07, + "loss": 0.909, + "step": 106580 + }, + { + "epoch": 8.259909333953273, + "grad_norm": 1.4706271072967003, + "learning_rate": 4.130114693118413e-07, + "loss": 0.9323, + "step": 106590 + }, + { + "epoch": 8.26068425742958, + "grad_norm": 1.4541744924201663, + "learning_rate": 4.130502169869808e-07, + "loss": 0.9072, + "step": 106600 + }, + { + "epoch": 8.261459180905886, + "grad_norm": 1.477164237941501, + "learning_rate": 4.130889646621203e-07, + "loss": 0.927, + "step": 106610 + }, + { + "epoch": 8.262234104382193, + "grad_norm": 1.4168539136381222, + "learning_rate": 4.131277123372598e-07, + "loss": 0.9046, + "step": 106620 + }, + { + "epoch": 8.2630090278585, + "grad_norm": 1.4910824976097452, + "learning_rate": 4.1316646001239925e-07, + "loss": 0.9121, + "step": 106630 + }, + { + "epoch": 8.263783951334805, + "grad_norm": 1.4286005860183115, + "learning_rate": 4.1320520768753877e-07, + "loss": 0.9273, + "step": 106640 + }, + { + "epoch": 8.264558874811112, + "grad_norm": 1.517767226044818, + "learning_rate": 4.1324395536267824e-07, + "loss": 0.9177, + "step": 106650 + }, + { + "epoch": 8.265333798287418, + "grad_norm": 1.4276088157656837, + "learning_rate": 4.1328270303781776e-07, + "loss": 0.8841, + "step": 106660 + }, + { + "epoch": 8.266108721763725, + "grad_norm": 1.4932925257626226, + "learning_rate": 4.1332145071295723e-07, + "loss": 0.9184, + "step": 106670 + }, + { + "epoch": 8.266883645240032, + "grad_norm": 1.3858898588606505, + "learning_rate": 4.133601983880967e-07, + "loss": 0.9189, + "step": 106680 + }, + { + "epoch": 8.267658568716339, + "grad_norm": 1.449967129731411, + "learning_rate": 4.133989460632362e-07, + "loss": 0.9111, + "step": 106690 + }, + { + "epoch": 8.268433492192646, + "grad_norm": 1.5214805975998946, + "learning_rate": 4.134376937383757e-07, + "loss": 0.8997, + "step": 106700 + }, + { + "epoch": 8.269208415668952, + "grad_norm": 1.4016451251214581, + "learning_rate": 4.134764414135152e-07, + "loss": 0.9067, + "step": 106710 + }, + { + "epoch": 8.26998333914526, + "grad_norm": 1.4546885180324884, + "learning_rate": 4.135151890886547e-07, + "loss": 0.9178, + "step": 106720 + }, + { + "epoch": 8.270758262621566, + "grad_norm": 1.4786789947490862, + "learning_rate": 4.135539367637942e-07, + "loss": 0.9181, + "step": 106730 + }, + { + "epoch": 8.271533186097873, + "grad_norm": 1.497798511548784, + "learning_rate": 4.1359268443893367e-07, + "loss": 0.9201, + "step": 106740 + }, + { + "epoch": 8.27230810957418, + "grad_norm": 1.431008815737167, + "learning_rate": 4.1363143211407314e-07, + "loss": 0.9091, + "step": 106750 + }, + { + "epoch": 8.273083033050487, + "grad_norm": 1.4953095989852243, + "learning_rate": 4.1367017978921266e-07, + "loss": 0.9048, + "step": 106760 + }, + { + "epoch": 8.273857956526793, + "grad_norm": 1.4378803659273998, + "learning_rate": 4.1370892746435213e-07, + "loss": 0.9521, + "step": 106770 + }, + { + "epoch": 8.2746328800031, + "grad_norm": 1.4409705083437214, + "learning_rate": 4.1374767513949165e-07, + "loss": 0.9197, + "step": 106780 + }, + { + "epoch": 8.275407803479407, + "grad_norm": 1.4299414584565888, + "learning_rate": 4.137864228146311e-07, + "loss": 0.9075, + "step": 106790 + }, + { + "epoch": 8.276182726955714, + "grad_norm": 1.4496561038935731, + "learning_rate": 4.1382517048977065e-07, + "loss": 0.9038, + "step": 106800 + }, + { + "epoch": 8.27695765043202, + "grad_norm": 1.4277300015916738, + "learning_rate": 4.138639181649101e-07, + "loss": 0.8987, + "step": 106810 + }, + { + "epoch": 8.277732573908327, + "grad_norm": 1.4225011044040925, + "learning_rate": 4.139026658400496e-07, + "loss": 0.8969, + "step": 106820 + }, + { + "epoch": 8.278507497384632, + "grad_norm": 1.467457913568716, + "learning_rate": 4.139414135151891e-07, + "loss": 0.8987, + "step": 106830 + }, + { + "epoch": 8.27928242086094, + "grad_norm": 1.4495283910883154, + "learning_rate": 4.139801611903286e-07, + "loss": 0.9329, + "step": 106840 + }, + { + "epoch": 8.280057344337246, + "grad_norm": 1.430304891249494, + "learning_rate": 4.140189088654681e-07, + "loss": 0.9114, + "step": 106850 + }, + { + "epoch": 8.280832267813553, + "grad_norm": 1.4862722691138341, + "learning_rate": 4.1405765654060757e-07, + "loss": 0.8951, + "step": 106860 + }, + { + "epoch": 8.28160719128986, + "grad_norm": 1.484908715596452, + "learning_rate": 4.140964042157471e-07, + "loss": 0.91, + "step": 106870 + }, + { + "epoch": 8.282382114766166, + "grad_norm": 1.4602791695001913, + "learning_rate": 4.1413515189088656e-07, + "loss": 0.8927, + "step": 106880 + }, + { + "epoch": 8.283157038242473, + "grad_norm": 1.3858166776817387, + "learning_rate": 4.14173899566026e-07, + "loss": 0.8957, + "step": 106890 + }, + { + "epoch": 8.28393196171878, + "grad_norm": 1.3769552428953717, + "learning_rate": 4.1421264724116555e-07, + "loss": 0.9208, + "step": 106900 + }, + { + "epoch": 8.284706885195087, + "grad_norm": 1.3481701375219988, + "learning_rate": 4.14251394916305e-07, + "loss": 0.9334, + "step": 106910 + }, + { + "epoch": 8.285481808671394, + "grad_norm": 1.447192481886258, + "learning_rate": 4.1429014259144454e-07, + "loss": 0.9096, + "step": 106920 + }, + { + "epoch": 8.2862567321477, + "grad_norm": 1.4097840878283396, + "learning_rate": 4.14328890266584e-07, + "loss": 0.9046, + "step": 106930 + }, + { + "epoch": 8.287031655624007, + "grad_norm": 1.4515238110959274, + "learning_rate": 4.1436763794172353e-07, + "loss": 0.9147, + "step": 106940 + }, + { + "epoch": 8.287806579100314, + "grad_norm": 1.4125192677721505, + "learning_rate": 4.14406385616863e-07, + "loss": 0.9146, + "step": 106950 + }, + { + "epoch": 8.288581502576621, + "grad_norm": 1.42753368477872, + "learning_rate": 4.1444513329200247e-07, + "loss": 0.917, + "step": 106960 + }, + { + "epoch": 8.289356426052928, + "grad_norm": 1.4512460372067726, + "learning_rate": 4.14483880967142e-07, + "loss": 0.9089, + "step": 106970 + }, + { + "epoch": 8.290131349529235, + "grad_norm": 1.4964050763195973, + "learning_rate": 4.1452262864228146e-07, + "loss": 0.9282, + "step": 106980 + }, + { + "epoch": 8.290906273005541, + "grad_norm": 1.4931023296166723, + "learning_rate": 4.14561376317421e-07, + "loss": 0.9187, + "step": 106990 + }, + { + "epoch": 8.291681196481848, + "grad_norm": 1.4966006689497573, + "learning_rate": 4.1460012399256045e-07, + "loss": 0.8966, + "step": 107000 + }, + { + "epoch": 8.291681196481848, + "eval_loss": 0.919613778591156, + "eval_runtime": 332.2165, + "eval_samples_per_second": 34.529, + "eval_steps_per_second": 8.633, + "step": 107000 + }, + { + "epoch": 8.292456119958153, + "grad_norm": 1.4350638935646542, + "learning_rate": 4.1463887166769997e-07, + "loss": 0.8967, + "step": 107010 + }, + { + "epoch": 8.29323104343446, + "grad_norm": 1.4705822112053164, + "learning_rate": 4.1467761934283944e-07, + "loss": 0.911, + "step": 107020 + }, + { + "epoch": 8.294005966910767, + "grad_norm": 1.3593542902025266, + "learning_rate": 4.147163670179789e-07, + "loss": 0.8889, + "step": 107030 + }, + { + "epoch": 8.294780890387074, + "grad_norm": 1.5685655276752883, + "learning_rate": 4.1475511469311843e-07, + "loss": 0.9067, + "step": 107040 + }, + { + "epoch": 8.29555581386338, + "grad_norm": 1.4381831633417934, + "learning_rate": 4.147938623682579e-07, + "loss": 0.9132, + "step": 107050 + }, + { + "epoch": 8.296330737339687, + "grad_norm": 1.5319033288508976, + "learning_rate": 4.148326100433974e-07, + "loss": 0.9349, + "step": 107060 + }, + { + "epoch": 8.297105660815994, + "grad_norm": 1.4230935889042162, + "learning_rate": 4.148713577185369e-07, + "loss": 0.9075, + "step": 107070 + }, + { + "epoch": 8.297880584292301, + "grad_norm": 1.3472002575535396, + "learning_rate": 4.149101053936764e-07, + "loss": 0.9067, + "step": 107080 + }, + { + "epoch": 8.298655507768608, + "grad_norm": 1.374527599211136, + "learning_rate": 4.149488530688159e-07, + "loss": 0.8922, + "step": 107090 + }, + { + "epoch": 8.299430431244915, + "grad_norm": 1.4815446939053452, + "learning_rate": 4.1498760074395535e-07, + "loss": 0.897, + "step": 107100 + }, + { + "epoch": 8.300205354721221, + "grad_norm": 1.48695086132137, + "learning_rate": 4.150263484190949e-07, + "loss": 0.9152, + "step": 107110 + }, + { + "epoch": 8.300980278197528, + "grad_norm": 1.373217192325355, + "learning_rate": 4.1506509609423434e-07, + "loss": 0.9188, + "step": 107120 + }, + { + "epoch": 8.301755201673835, + "grad_norm": 1.4105245292801967, + "learning_rate": 4.1510384376937387e-07, + "loss": 0.8981, + "step": 107130 + }, + { + "epoch": 8.302530125150142, + "grad_norm": 1.4605392846341172, + "learning_rate": 4.1514259144451333e-07, + "loss": 0.904, + "step": 107140 + }, + { + "epoch": 8.303305048626449, + "grad_norm": 1.4327807844092535, + "learning_rate": 4.1518133911965286e-07, + "loss": 0.9202, + "step": 107150 + }, + { + "epoch": 8.304079972102755, + "grad_norm": 1.3869080395277122, + "learning_rate": 4.152200867947923e-07, + "loss": 0.9203, + "step": 107160 + }, + { + "epoch": 8.304854895579062, + "grad_norm": 1.424377329631921, + "learning_rate": 4.152588344699318e-07, + "loss": 0.9206, + "step": 107170 + }, + { + "epoch": 8.305629819055369, + "grad_norm": 1.3987768769711273, + "learning_rate": 4.152975821450713e-07, + "loss": 0.9005, + "step": 107180 + }, + { + "epoch": 8.306404742531676, + "grad_norm": 1.3927206373913716, + "learning_rate": 4.153363298202108e-07, + "loss": 0.9122, + "step": 107190 + }, + { + "epoch": 8.30717966600798, + "grad_norm": 1.5465934782376383, + "learning_rate": 4.153750774953503e-07, + "loss": 0.8982, + "step": 107200 + }, + { + "epoch": 8.307954589484288, + "grad_norm": 1.4070531993545181, + "learning_rate": 4.154138251704898e-07, + "loss": 0.9144, + "step": 107210 + }, + { + "epoch": 8.308729512960594, + "grad_norm": 1.380695871027453, + "learning_rate": 4.154525728456293e-07, + "loss": 0.918, + "step": 107220 + }, + { + "epoch": 8.309504436436901, + "grad_norm": 1.397883022617702, + "learning_rate": 4.1549132052076877e-07, + "loss": 0.922, + "step": 107230 + }, + { + "epoch": 8.310279359913208, + "grad_norm": 1.5149708057706888, + "learning_rate": 4.1553006819590824e-07, + "loss": 0.9012, + "step": 107240 + }, + { + "epoch": 8.311054283389515, + "grad_norm": 1.426347207762535, + "learning_rate": 4.1556881587104776e-07, + "loss": 0.9031, + "step": 107250 + }, + { + "epoch": 8.311829206865822, + "grad_norm": 1.387719029707473, + "learning_rate": 4.1560756354618723e-07, + "loss": 0.8957, + "step": 107260 + }, + { + "epoch": 8.312604130342129, + "grad_norm": 1.4567536244466193, + "learning_rate": 4.1564631122132675e-07, + "loss": 0.9254, + "step": 107270 + }, + { + "epoch": 8.313379053818435, + "grad_norm": 1.4783349334731004, + "learning_rate": 4.156850588964662e-07, + "loss": 0.8956, + "step": 107280 + }, + { + "epoch": 8.314153977294742, + "grad_norm": 1.4506958710975286, + "learning_rate": 4.1572380657160574e-07, + "loss": 0.9367, + "step": 107290 + }, + { + "epoch": 8.314928900771049, + "grad_norm": 1.345663230375982, + "learning_rate": 4.157625542467452e-07, + "loss": 0.9048, + "step": 107300 + }, + { + "epoch": 8.315703824247356, + "grad_norm": 1.4622100427518625, + "learning_rate": 4.158013019218847e-07, + "loss": 0.9165, + "step": 107310 + }, + { + "epoch": 8.316478747723663, + "grad_norm": 1.5614426233678735, + "learning_rate": 4.158400495970242e-07, + "loss": 0.9316, + "step": 107320 + }, + { + "epoch": 8.31725367119997, + "grad_norm": 1.4239137311890466, + "learning_rate": 4.1587879727216367e-07, + "loss": 0.8829, + "step": 107330 + }, + { + "epoch": 8.318028594676276, + "grad_norm": 1.4132520748450914, + "learning_rate": 4.159175449473032e-07, + "loss": 0.9264, + "step": 107340 + }, + { + "epoch": 8.318803518152583, + "grad_norm": 1.3815998960937064, + "learning_rate": 4.1595629262244266e-07, + "loss": 0.9073, + "step": 107350 + }, + { + "epoch": 8.31957844162889, + "grad_norm": 1.4560397994144438, + "learning_rate": 4.159950402975822e-07, + "loss": 0.9105, + "step": 107360 + }, + { + "epoch": 8.320353365105197, + "grad_norm": 1.4964273375216142, + "learning_rate": 4.1603378797272165e-07, + "loss": 0.9146, + "step": 107370 + }, + { + "epoch": 8.321128288581502, + "grad_norm": 1.3935589001137183, + "learning_rate": 4.160725356478611e-07, + "loss": 0.9063, + "step": 107380 + }, + { + "epoch": 8.321903212057808, + "grad_norm": 1.4426108081973419, + "learning_rate": 4.1611128332300064e-07, + "loss": 0.9338, + "step": 107390 + }, + { + "epoch": 8.322678135534115, + "grad_norm": 1.4224273909223433, + "learning_rate": 4.161500309981401e-07, + "loss": 0.8978, + "step": 107400 + }, + { + "epoch": 8.323453059010422, + "grad_norm": 1.3652734132079392, + "learning_rate": 4.1618877867327963e-07, + "loss": 0.8928, + "step": 107410 + }, + { + "epoch": 8.324227982486729, + "grad_norm": 1.3431029913904708, + "learning_rate": 4.162275263484191e-07, + "loss": 0.9144, + "step": 107420 + }, + { + "epoch": 8.325002905963036, + "grad_norm": 1.458588236256767, + "learning_rate": 4.1626627402355857e-07, + "loss": 0.9363, + "step": 107430 + }, + { + "epoch": 8.325777829439343, + "grad_norm": 1.4158394999607637, + "learning_rate": 4.163050216986981e-07, + "loss": 0.912, + "step": 107440 + }, + { + "epoch": 8.32655275291565, + "grad_norm": 1.4354755401097061, + "learning_rate": 4.1634376937383756e-07, + "loss": 0.9313, + "step": 107450 + }, + { + "epoch": 8.327327676391956, + "grad_norm": 1.4948108937021785, + "learning_rate": 4.163825170489771e-07, + "loss": 0.9039, + "step": 107460 + }, + { + "epoch": 8.328102599868263, + "grad_norm": 1.4353657835379647, + "learning_rate": 4.1642126472411655e-07, + "loss": 0.9085, + "step": 107470 + }, + { + "epoch": 8.32887752334457, + "grad_norm": 1.5123472658213701, + "learning_rate": 4.164600123992561e-07, + "loss": 0.9178, + "step": 107480 + }, + { + "epoch": 8.329652446820877, + "grad_norm": 1.34608269202101, + "learning_rate": 4.1649876007439554e-07, + "loss": 0.9382, + "step": 107490 + }, + { + "epoch": 8.330427370297183, + "grad_norm": 1.4293831457049917, + "learning_rate": 4.16537507749535e-07, + "loss": 0.9132, + "step": 107500 + }, + { + "epoch": 8.330427370297183, + "eval_loss": 0.9192724823951721, + "eval_runtime": 331.3709, + "eval_samples_per_second": 34.617, + "eval_steps_per_second": 8.655, + "step": 107500 + }, + { + "epoch": 8.33120229377349, + "grad_norm": 1.52206333232987, + "learning_rate": 4.1657625542467454e-07, + "loss": 0.9117, + "step": 107510 + }, + { + "epoch": 8.331977217249797, + "grad_norm": 1.419820154055633, + "learning_rate": 4.16615003099814e-07, + "loss": 0.8952, + "step": 107520 + }, + { + "epoch": 8.332752140726104, + "grad_norm": 1.4494998435426871, + "learning_rate": 4.1665375077495353e-07, + "loss": 0.9218, + "step": 107530 + }, + { + "epoch": 8.33352706420241, + "grad_norm": 1.4213096274012524, + "learning_rate": 4.16692498450093e-07, + "loss": 0.9308, + "step": 107540 + }, + { + "epoch": 8.334301987678717, + "grad_norm": 1.4795397334057154, + "learning_rate": 4.167312461252325e-07, + "loss": 0.9437, + "step": 107550 + }, + { + "epoch": 8.335076911155024, + "grad_norm": 1.4199490394561507, + "learning_rate": 4.16769993800372e-07, + "loss": 0.891, + "step": 107560 + }, + { + "epoch": 8.335851834631331, + "grad_norm": 1.4728828152840594, + "learning_rate": 4.1680874147551146e-07, + "loss": 0.9204, + "step": 107570 + }, + { + "epoch": 8.336626758107636, + "grad_norm": 1.499391931190088, + "learning_rate": 4.16847489150651e-07, + "loss": 0.928, + "step": 107580 + }, + { + "epoch": 8.337401681583943, + "grad_norm": 1.4116859568260003, + "learning_rate": 4.1688623682579045e-07, + "loss": 0.9083, + "step": 107590 + }, + { + "epoch": 8.33817660506025, + "grad_norm": 1.3347324602070476, + "learning_rate": 4.1692498450092997e-07, + "loss": 0.8978, + "step": 107600 + }, + { + "epoch": 8.338951528536557, + "grad_norm": 1.5125427189798517, + "learning_rate": 4.1696373217606944e-07, + "loss": 0.8913, + "step": 107610 + }, + { + "epoch": 8.339726452012863, + "grad_norm": 1.4692849485699597, + "learning_rate": 4.1700247985120896e-07, + "loss": 0.9057, + "step": 107620 + }, + { + "epoch": 8.34050137548917, + "grad_norm": 1.501319180667496, + "learning_rate": 4.1704122752634843e-07, + "loss": 0.9006, + "step": 107630 + }, + { + "epoch": 8.341276298965477, + "grad_norm": 1.4976913364935627, + "learning_rate": 4.170799752014879e-07, + "loss": 0.9134, + "step": 107640 + }, + { + "epoch": 8.342051222441784, + "grad_norm": 1.4190962410646375, + "learning_rate": 4.171187228766274e-07, + "loss": 0.9367, + "step": 107650 + }, + { + "epoch": 8.34282614591809, + "grad_norm": 1.3971853739123548, + "learning_rate": 4.171574705517669e-07, + "loss": 0.8874, + "step": 107660 + }, + { + "epoch": 8.343601069394397, + "grad_norm": 1.4349122231255789, + "learning_rate": 4.171962182269064e-07, + "loss": 0.9036, + "step": 107670 + }, + { + "epoch": 8.344375992870704, + "grad_norm": 1.5195373911797119, + "learning_rate": 4.172349659020459e-07, + "loss": 0.9062, + "step": 107680 + }, + { + "epoch": 8.345150916347011, + "grad_norm": 1.4596814723691613, + "learning_rate": 4.172737135771854e-07, + "loss": 0.9084, + "step": 107690 + }, + { + "epoch": 8.345925839823318, + "grad_norm": 1.4525130144383576, + "learning_rate": 4.1731246125232487e-07, + "loss": 0.9073, + "step": 107700 + }, + { + "epoch": 8.346700763299625, + "grad_norm": 1.4760050435685568, + "learning_rate": 4.1735120892746434e-07, + "loss": 0.9269, + "step": 107710 + }, + { + "epoch": 8.347475686775931, + "grad_norm": 1.4365349665651184, + "learning_rate": 4.1738995660260386e-07, + "loss": 0.9037, + "step": 107720 + }, + { + "epoch": 8.348250610252238, + "grad_norm": 1.4516165916781374, + "learning_rate": 4.1742870427774333e-07, + "loss": 0.9177, + "step": 107730 + }, + { + "epoch": 8.349025533728545, + "grad_norm": 1.4973573465540861, + "learning_rate": 4.1746745195288285e-07, + "loss": 0.9129, + "step": 107740 + }, + { + "epoch": 8.349800457204852, + "grad_norm": 1.6122866652561405, + "learning_rate": 4.175061996280223e-07, + "loss": 0.9321, + "step": 107750 + }, + { + "epoch": 8.350575380681157, + "grad_norm": 1.4001431504173858, + "learning_rate": 4.1754494730316184e-07, + "loss": 0.9032, + "step": 107760 + }, + { + "epoch": 8.351350304157464, + "grad_norm": 1.4073488981794504, + "learning_rate": 4.175836949783013e-07, + "loss": 0.9083, + "step": 107770 + }, + { + "epoch": 8.35212522763377, + "grad_norm": 1.4219150000480574, + "learning_rate": 4.176224426534408e-07, + "loss": 0.9226, + "step": 107780 + }, + { + "epoch": 8.352900151110077, + "grad_norm": 1.3846134807508559, + "learning_rate": 4.176611903285803e-07, + "loss": 0.9242, + "step": 107790 + }, + { + "epoch": 8.353675074586384, + "grad_norm": 1.4073733886874558, + "learning_rate": 4.1769993800371977e-07, + "loss": 0.9427, + "step": 107800 + }, + { + "epoch": 8.354449998062691, + "grad_norm": 1.3701440735684292, + "learning_rate": 4.177386856788593e-07, + "loss": 0.8969, + "step": 107810 + }, + { + "epoch": 8.355224921538998, + "grad_norm": 1.5185923052339492, + "learning_rate": 4.1777743335399876e-07, + "loss": 0.9344, + "step": 107820 + }, + { + "epoch": 8.355999845015305, + "grad_norm": 1.427290996624503, + "learning_rate": 4.178161810291383e-07, + "loss": 0.9156, + "step": 107830 + }, + { + "epoch": 8.356774768491611, + "grad_norm": 1.4487561925699632, + "learning_rate": 4.1785492870427776e-07, + "loss": 0.9018, + "step": 107840 + }, + { + "epoch": 8.357549691967918, + "grad_norm": 1.4140215854828173, + "learning_rate": 4.178936763794172e-07, + "loss": 0.898, + "step": 107850 + }, + { + "epoch": 8.358324615444225, + "grad_norm": 1.4589199038052667, + "learning_rate": 4.1793242405455675e-07, + "loss": 0.8999, + "step": 107860 + }, + { + "epoch": 8.359099538920532, + "grad_norm": 1.51791871722056, + "learning_rate": 4.179711717296962e-07, + "loss": 0.9121, + "step": 107870 + }, + { + "epoch": 8.359874462396839, + "grad_norm": 1.3904517696672203, + "learning_rate": 4.1800991940483574e-07, + "loss": 0.9092, + "step": 107880 + }, + { + "epoch": 8.360649385873145, + "grad_norm": 1.4092498714968806, + "learning_rate": 4.180486670799752e-07, + "loss": 0.9012, + "step": 107890 + }, + { + "epoch": 8.361424309349452, + "grad_norm": 1.483386983330434, + "learning_rate": 4.1808741475511473e-07, + "loss": 0.9298, + "step": 107900 + }, + { + "epoch": 8.362199232825759, + "grad_norm": 1.4511144361041552, + "learning_rate": 4.181261624302542e-07, + "loss": 0.9037, + "step": 107910 + }, + { + "epoch": 8.362974156302066, + "grad_norm": 1.3945478576184112, + "learning_rate": 4.1816491010539367e-07, + "loss": 0.9312, + "step": 107920 + }, + { + "epoch": 8.363749079778373, + "grad_norm": 1.420349750546058, + "learning_rate": 4.182036577805332e-07, + "loss": 0.9115, + "step": 107930 + }, + { + "epoch": 8.36452400325468, + "grad_norm": 1.4893642258174236, + "learning_rate": 4.1824240545567266e-07, + "loss": 0.8902, + "step": 107940 + }, + { + "epoch": 8.365298926730985, + "grad_norm": 1.3828293505451734, + "learning_rate": 4.182811531308122e-07, + "loss": 0.9187, + "step": 107950 + }, + { + "epoch": 8.366073850207291, + "grad_norm": 1.4238013107197243, + "learning_rate": 4.1831990080595165e-07, + "loss": 0.9319, + "step": 107960 + }, + { + "epoch": 8.366848773683598, + "grad_norm": 1.3993434810267413, + "learning_rate": 4.1835864848109117e-07, + "loss": 0.8898, + "step": 107970 + }, + { + "epoch": 8.367623697159905, + "grad_norm": 1.4442522123369341, + "learning_rate": 4.1839739615623064e-07, + "loss": 0.898, + "step": 107980 + }, + { + "epoch": 8.368398620636212, + "grad_norm": 1.4867858749722995, + "learning_rate": 4.184361438313701e-07, + "loss": 0.8959, + "step": 107990 + }, + { + "epoch": 8.369173544112519, + "grad_norm": 1.4216964849294722, + "learning_rate": 4.1847489150650963e-07, + "loss": 0.9318, + "step": 108000 + }, + { + "epoch": 8.369173544112519, + "eval_loss": 0.9189316630363464, + "eval_runtime": 331.202, + "eval_samples_per_second": 34.634, + "eval_steps_per_second": 8.659, + "step": 108000 + }, + { + "epoch": 8.369948467588825, + "grad_norm": 1.4590904743946673, + "learning_rate": 4.185136391816491e-07, + "loss": 0.9152, + "step": 108010 + }, + { + "epoch": 8.370723391065132, + "grad_norm": 1.4184082202251802, + "learning_rate": 4.185523868567886e-07, + "loss": 0.9111, + "step": 108020 + }, + { + "epoch": 8.371498314541439, + "grad_norm": 1.4848493750498641, + "learning_rate": 4.185911345319281e-07, + "loss": 0.8982, + "step": 108030 + }, + { + "epoch": 8.372273238017746, + "grad_norm": 1.5544898927697937, + "learning_rate": 4.186298822070676e-07, + "loss": 0.8981, + "step": 108040 + }, + { + "epoch": 8.373048161494053, + "grad_norm": 1.4385303641967029, + "learning_rate": 4.186686298822071e-07, + "loss": 0.9069, + "step": 108050 + }, + { + "epoch": 8.37382308497036, + "grad_norm": 1.4438873993962056, + "learning_rate": 4.1870737755734655e-07, + "loss": 0.9201, + "step": 108060 + }, + { + "epoch": 8.374598008446666, + "grad_norm": 1.4378772530303072, + "learning_rate": 4.1874612523248607e-07, + "loss": 0.9265, + "step": 108070 + }, + { + "epoch": 8.375372931922973, + "grad_norm": 1.4559871965731055, + "learning_rate": 4.1878487290762554e-07, + "loss": 0.9104, + "step": 108080 + }, + { + "epoch": 8.37614785539928, + "grad_norm": 1.5280181040899448, + "learning_rate": 4.1882362058276506e-07, + "loss": 0.907, + "step": 108090 + }, + { + "epoch": 8.376922778875587, + "grad_norm": 1.4282541188185274, + "learning_rate": 4.1886236825790453e-07, + "loss": 0.904, + "step": 108100 + }, + { + "epoch": 8.377697702351893, + "grad_norm": 1.4152680041103536, + "learning_rate": 4.18901115933044e-07, + "loss": 0.8962, + "step": 108110 + }, + { + "epoch": 8.3784726258282, + "grad_norm": 1.4155955843009949, + "learning_rate": 4.189398636081835e-07, + "loss": 0.8812, + "step": 108120 + }, + { + "epoch": 8.379247549304505, + "grad_norm": 1.4322691722836467, + "learning_rate": 4.18978611283323e-07, + "loss": 0.9093, + "step": 108130 + }, + { + "epoch": 8.380022472780812, + "grad_norm": 1.4508546583799915, + "learning_rate": 4.190173589584625e-07, + "loss": 0.8976, + "step": 108140 + }, + { + "epoch": 8.380797396257119, + "grad_norm": 1.4414493572319995, + "learning_rate": 4.19056106633602e-07, + "loss": 0.9086, + "step": 108150 + }, + { + "epoch": 8.381572319733426, + "grad_norm": 1.429837628591562, + "learning_rate": 4.190948543087415e-07, + "loss": 0.8924, + "step": 108160 + }, + { + "epoch": 8.382347243209733, + "grad_norm": 1.4725269756788366, + "learning_rate": 4.19133601983881e-07, + "loss": 0.8999, + "step": 108170 + }, + { + "epoch": 8.38312216668604, + "grad_norm": 1.5004626476151708, + "learning_rate": 4.1917234965902044e-07, + "loss": 0.9113, + "step": 108180 + }, + { + "epoch": 8.383897090162346, + "grad_norm": 1.527705875328745, + "learning_rate": 4.1921109733415997e-07, + "loss": 0.9435, + "step": 108190 + }, + { + "epoch": 8.384672013638653, + "grad_norm": 1.3834676108313229, + "learning_rate": 4.1924984500929943e-07, + "loss": 0.9121, + "step": 108200 + }, + { + "epoch": 8.38544693711496, + "grad_norm": 1.4050286777637366, + "learning_rate": 4.1928859268443896e-07, + "loss": 0.9057, + "step": 108210 + }, + { + "epoch": 8.386221860591267, + "grad_norm": 1.459477289299454, + "learning_rate": 4.193273403595784e-07, + "loss": 0.9118, + "step": 108220 + }, + { + "epoch": 8.386996784067573, + "grad_norm": 1.4501219822310085, + "learning_rate": 4.1936608803471795e-07, + "loss": 0.9037, + "step": 108230 + }, + { + "epoch": 8.38777170754388, + "grad_norm": 1.4152469026765877, + "learning_rate": 4.194048357098574e-07, + "loss": 0.9068, + "step": 108240 + }, + { + "epoch": 8.388546631020187, + "grad_norm": 1.456851418704138, + "learning_rate": 4.194435833849969e-07, + "loss": 0.9272, + "step": 108250 + }, + { + "epoch": 8.389321554496494, + "grad_norm": 1.4197666156148463, + "learning_rate": 4.194823310601364e-07, + "loss": 0.9026, + "step": 108260 + }, + { + "epoch": 8.3900964779728, + "grad_norm": 1.4877406147439505, + "learning_rate": 4.195210787352759e-07, + "loss": 0.9284, + "step": 108270 + }, + { + "epoch": 8.390871401449107, + "grad_norm": 1.3483553847143657, + "learning_rate": 4.195598264104154e-07, + "loss": 0.9087, + "step": 108280 + }, + { + "epoch": 8.391646324925414, + "grad_norm": 1.421855697988011, + "learning_rate": 4.1959857408555487e-07, + "loss": 0.9104, + "step": 108290 + }, + { + "epoch": 8.392421248401721, + "grad_norm": 1.3801279276203702, + "learning_rate": 4.196373217606944e-07, + "loss": 0.9259, + "step": 108300 + }, + { + "epoch": 8.393196171878028, + "grad_norm": 1.5270676805346897, + "learning_rate": 4.1967606943583386e-07, + "loss": 0.9122, + "step": 108310 + }, + { + "epoch": 8.393971095354333, + "grad_norm": 1.5074980472739468, + "learning_rate": 4.1971481711097333e-07, + "loss": 0.9103, + "step": 108320 + }, + { + "epoch": 8.39474601883064, + "grad_norm": 1.4408924878959473, + "learning_rate": 4.1975356478611285e-07, + "loss": 0.9063, + "step": 108330 + }, + { + "epoch": 8.395520942306947, + "grad_norm": 1.4588056207484656, + "learning_rate": 4.197923124612523e-07, + "loss": 0.9024, + "step": 108340 + }, + { + "epoch": 8.396295865783253, + "grad_norm": 1.4209020091097875, + "learning_rate": 4.1983106013639184e-07, + "loss": 0.9083, + "step": 108350 + }, + { + "epoch": 8.39707078925956, + "grad_norm": 1.487979226557547, + "learning_rate": 4.198698078115313e-07, + "loss": 0.9198, + "step": 108360 + }, + { + "epoch": 8.397845712735867, + "grad_norm": 1.5151667529970108, + "learning_rate": 4.1990855548667083e-07, + "loss": 0.9088, + "step": 108370 + }, + { + "epoch": 8.398620636212174, + "grad_norm": 1.4354014052519435, + "learning_rate": 4.199473031618103e-07, + "loss": 0.92, + "step": 108380 + }, + { + "epoch": 8.39939555968848, + "grad_norm": 1.4787259103842334, + "learning_rate": 4.1998605083694977e-07, + "loss": 0.9115, + "step": 108390 + }, + { + "epoch": 8.400170483164787, + "grad_norm": 1.407292317857806, + "learning_rate": 4.200247985120893e-07, + "loss": 0.9117, + "step": 108400 + }, + { + "epoch": 8.400945406641094, + "grad_norm": 1.4074523124627552, + "learning_rate": 4.2006354618722876e-07, + "loss": 0.8975, + "step": 108410 + }, + { + "epoch": 8.401720330117401, + "grad_norm": 1.357315808871205, + "learning_rate": 4.201022938623683e-07, + "loss": 0.9237, + "step": 108420 + }, + { + "epoch": 8.402495253593708, + "grad_norm": 1.4273303574598017, + "learning_rate": 4.2014104153750775e-07, + "loss": 0.9046, + "step": 108430 + }, + { + "epoch": 8.403270177070015, + "grad_norm": 1.4852628218300346, + "learning_rate": 4.201797892126473e-07, + "loss": 0.8968, + "step": 108440 + }, + { + "epoch": 8.404045100546321, + "grad_norm": 1.3942730729594197, + "learning_rate": 4.2021853688778674e-07, + "loss": 0.9479, + "step": 108450 + }, + { + "epoch": 8.404820024022628, + "grad_norm": 1.3611515799689822, + "learning_rate": 4.202572845629262e-07, + "loss": 0.9343, + "step": 108460 + }, + { + "epoch": 8.405594947498935, + "grad_norm": 1.3103886775827733, + "learning_rate": 4.2029603223806573e-07, + "loss": 0.9163, + "step": 108470 + }, + { + "epoch": 8.406369870975242, + "grad_norm": 1.415319626272107, + "learning_rate": 4.203347799132052e-07, + "loss": 0.9114, + "step": 108480 + }, + { + "epoch": 8.407144794451549, + "grad_norm": 1.3996538850215938, + "learning_rate": 4.203735275883447e-07, + "loss": 0.8914, + "step": 108490 + }, + { + "epoch": 8.407919717927854, + "grad_norm": 1.4129737395536284, + "learning_rate": 4.204122752634842e-07, + "loss": 0.9208, + "step": 108500 + }, + { + "epoch": 8.407919717927854, + "eval_loss": 0.9185624122619629, + "eval_runtime": 333.6708, + "eval_samples_per_second": 34.378, + "eval_steps_per_second": 8.595, + "step": 108500 + }, + { + "epoch": 8.40869464140416, + "grad_norm": 1.3594852301869644, + "learning_rate": 4.204510229386237e-07, + "loss": 0.918, + "step": 108510 + }, + { + "epoch": 8.409469564880467, + "grad_norm": 1.4025476239017007, + "learning_rate": 4.204897706137632e-07, + "loss": 0.8957, + "step": 108520 + }, + { + "epoch": 8.410244488356774, + "grad_norm": 1.4031783789102852, + "learning_rate": 4.2052851828890265e-07, + "loss": 0.9115, + "step": 108530 + }, + { + "epoch": 8.411019411833081, + "grad_norm": 1.5134240072808696, + "learning_rate": 4.205672659640422e-07, + "loss": 0.9099, + "step": 108540 + }, + { + "epoch": 8.411794335309388, + "grad_norm": 1.3231486236765284, + "learning_rate": 4.2060601363918165e-07, + "loss": 0.8924, + "step": 108550 + }, + { + "epoch": 8.412569258785695, + "grad_norm": 1.4616022372455786, + "learning_rate": 4.2064476131432117e-07, + "loss": 0.9068, + "step": 108560 + }, + { + "epoch": 8.413344182262001, + "grad_norm": 1.4532011912819114, + "learning_rate": 4.2068350898946064e-07, + "loss": 0.9199, + "step": 108570 + }, + { + "epoch": 8.414119105738308, + "grad_norm": 1.4023532071869045, + "learning_rate": 4.2072225666460016e-07, + "loss": 0.906, + "step": 108580 + }, + { + "epoch": 8.414894029214615, + "grad_norm": 1.4654796281380391, + "learning_rate": 4.2076100433973963e-07, + "loss": 0.9333, + "step": 108590 + }, + { + "epoch": 8.415668952690922, + "grad_norm": 1.4647660150394113, + "learning_rate": 4.207997520148791e-07, + "loss": 0.9353, + "step": 108600 + }, + { + "epoch": 8.416443876167229, + "grad_norm": 1.4899137003931309, + "learning_rate": 4.208384996900186e-07, + "loss": 0.9113, + "step": 108610 + }, + { + "epoch": 8.417218799643535, + "grad_norm": 1.4332597148688557, + "learning_rate": 4.208772473651581e-07, + "loss": 0.9205, + "step": 108620 + }, + { + "epoch": 8.417993723119842, + "grad_norm": 1.498307629494525, + "learning_rate": 4.209159950402976e-07, + "loss": 0.8978, + "step": 108630 + }, + { + "epoch": 8.418768646596149, + "grad_norm": 1.4818233418060849, + "learning_rate": 4.209547427154371e-07, + "loss": 0.9165, + "step": 108640 + }, + { + "epoch": 8.419543570072456, + "grad_norm": 1.4388463508668745, + "learning_rate": 4.209934903905766e-07, + "loss": 0.9085, + "step": 108650 + }, + { + "epoch": 8.420318493548763, + "grad_norm": 1.3699032072360597, + "learning_rate": 4.2103223806571607e-07, + "loss": 0.9216, + "step": 108660 + }, + { + "epoch": 8.42109341702507, + "grad_norm": 1.433582061368556, + "learning_rate": 4.2107098574085554e-07, + "loss": 0.8759, + "step": 108670 + }, + { + "epoch": 8.421868340501376, + "grad_norm": 1.393624966529744, + "learning_rate": 4.2110973341599506e-07, + "loss": 0.9375, + "step": 108680 + }, + { + "epoch": 8.422643263977681, + "grad_norm": 1.4236267715120683, + "learning_rate": 4.2114848109113453e-07, + "loss": 0.9149, + "step": 108690 + }, + { + "epoch": 8.423418187453988, + "grad_norm": 1.3983140250008836, + "learning_rate": 4.2118722876627405e-07, + "loss": 0.9027, + "step": 108700 + }, + { + "epoch": 8.424193110930295, + "grad_norm": 1.357584218073007, + "learning_rate": 4.212259764414135e-07, + "loss": 0.9076, + "step": 108710 + }, + { + "epoch": 8.424968034406602, + "grad_norm": 1.5177747902852383, + "learning_rate": 4.2126472411655304e-07, + "loss": 0.9027, + "step": 108720 + }, + { + "epoch": 8.425742957882909, + "grad_norm": 1.377576890407189, + "learning_rate": 4.213034717916925e-07, + "loss": 0.9026, + "step": 108730 + }, + { + "epoch": 8.426517881359215, + "grad_norm": 1.51894285394777, + "learning_rate": 4.21342219466832e-07, + "loss": 0.9267, + "step": 108740 + }, + { + "epoch": 8.427292804835522, + "grad_norm": 1.4363750698226134, + "learning_rate": 4.213809671419715e-07, + "loss": 0.9244, + "step": 108750 + }, + { + "epoch": 8.428067728311829, + "grad_norm": 1.3938207655007995, + "learning_rate": 4.2141971481711097e-07, + "loss": 0.9206, + "step": 108760 + }, + { + "epoch": 8.428842651788136, + "grad_norm": 1.3239837331609408, + "learning_rate": 4.214584624922505e-07, + "loss": 0.8973, + "step": 108770 + }, + { + "epoch": 8.429617575264443, + "grad_norm": 1.4626078561588982, + "learning_rate": 4.2149721016738996e-07, + "loss": 0.9276, + "step": 108780 + }, + { + "epoch": 8.43039249874075, + "grad_norm": 1.4448593791578563, + "learning_rate": 4.215359578425295e-07, + "loss": 0.9053, + "step": 108790 + }, + { + "epoch": 8.431167422217056, + "grad_norm": 1.399416976165917, + "learning_rate": 4.2157470551766895e-07, + "loss": 0.9118, + "step": 108800 + }, + { + "epoch": 8.431942345693363, + "grad_norm": 1.4217797163630075, + "learning_rate": 4.216134531928084e-07, + "loss": 0.9067, + "step": 108810 + }, + { + "epoch": 8.43271726916967, + "grad_norm": 1.4554573153928192, + "learning_rate": 4.2165220086794794e-07, + "loss": 0.9064, + "step": 108820 + }, + { + "epoch": 8.433492192645977, + "grad_norm": 1.4145976166574679, + "learning_rate": 4.216909485430874e-07, + "loss": 0.9082, + "step": 108830 + }, + { + "epoch": 8.434267116122284, + "grad_norm": 1.3848668122367414, + "learning_rate": 4.2172969621822694e-07, + "loss": 0.9086, + "step": 108840 + }, + { + "epoch": 8.43504203959859, + "grad_norm": 1.4147130944825446, + "learning_rate": 4.217684438933664e-07, + "loss": 0.9161, + "step": 108850 + }, + { + "epoch": 8.435816963074897, + "grad_norm": 1.4149633563313069, + "learning_rate": 4.218071915685059e-07, + "loss": 0.9111, + "step": 108860 + }, + { + "epoch": 8.436591886551202, + "grad_norm": 1.4701768194109535, + "learning_rate": 4.218459392436454e-07, + "loss": 0.9018, + "step": 108870 + }, + { + "epoch": 8.437366810027509, + "grad_norm": 1.3678368192735177, + "learning_rate": 4.2188468691878486e-07, + "loss": 0.9001, + "step": 108880 + }, + { + "epoch": 8.438141733503816, + "grad_norm": 1.4480219245991202, + "learning_rate": 4.219234345939244e-07, + "loss": 0.9137, + "step": 108890 + }, + { + "epoch": 8.438916656980123, + "grad_norm": 1.4464799931432706, + "learning_rate": 4.2196218226906386e-07, + "loss": 0.8864, + "step": 108900 + }, + { + "epoch": 8.43969158045643, + "grad_norm": 1.5092809165057068, + "learning_rate": 4.220009299442034e-07, + "loss": 0.9104, + "step": 108910 + }, + { + "epoch": 8.440466503932736, + "grad_norm": 1.445617835858367, + "learning_rate": 4.2203967761934285e-07, + "loss": 0.9083, + "step": 108920 + }, + { + "epoch": 8.441241427409043, + "grad_norm": 1.458426632909525, + "learning_rate": 4.220784252944823e-07, + "loss": 0.9152, + "step": 108930 + }, + { + "epoch": 8.44201635088535, + "grad_norm": 1.4598602475911955, + "learning_rate": 4.2211717296962184e-07, + "loss": 0.8914, + "step": 108940 + }, + { + "epoch": 8.442791274361657, + "grad_norm": 1.4191441872671127, + "learning_rate": 4.221559206447613e-07, + "loss": 0.9268, + "step": 108950 + }, + { + "epoch": 8.443566197837963, + "grad_norm": 1.4461210284637525, + "learning_rate": 4.2219466831990083e-07, + "loss": 0.9438, + "step": 108960 + }, + { + "epoch": 8.44434112131427, + "grad_norm": 1.548427058821278, + "learning_rate": 4.222334159950403e-07, + "loss": 0.9091, + "step": 108970 + }, + { + "epoch": 8.445116044790577, + "grad_norm": 1.4953678177254621, + "learning_rate": 4.222721636701798e-07, + "loss": 0.905, + "step": 108980 + }, + { + "epoch": 8.445890968266884, + "grad_norm": 1.4243720690252855, + "learning_rate": 4.223109113453193e-07, + "loss": 0.8971, + "step": 108990 + }, + { + "epoch": 8.44666589174319, + "grad_norm": 1.451401756216443, + "learning_rate": 4.2234965902045876e-07, + "loss": 0.9162, + "step": 109000 + }, + { + "epoch": 8.44666589174319, + "eval_loss": 0.9183058738708496, + "eval_runtime": 331.5244, + "eval_samples_per_second": 34.601, + "eval_steps_per_second": 8.651, + "step": 109000 + }, + { + "epoch": 8.447440815219498, + "grad_norm": 1.3660603303655874, + "learning_rate": 4.223884066955983e-07, + "loss": 0.9253, + "step": 109010 + }, + { + "epoch": 8.448215738695804, + "grad_norm": 1.4759132292717911, + "learning_rate": 4.2242715437073775e-07, + "loss": 0.9042, + "step": 109020 + }, + { + "epoch": 8.448990662172111, + "grad_norm": 1.4347430639211733, + "learning_rate": 4.2246590204587727e-07, + "loss": 0.9352, + "step": 109030 + }, + { + "epoch": 8.449765585648418, + "grad_norm": 1.4158582478153523, + "learning_rate": 4.2250464972101674e-07, + "loss": 0.9108, + "step": 109040 + }, + { + "epoch": 8.450540509124725, + "grad_norm": 1.56974804884767, + "learning_rate": 4.2254339739615626e-07, + "loss": 0.8903, + "step": 109050 + }, + { + "epoch": 8.45131543260103, + "grad_norm": 1.4016605364479633, + "learning_rate": 4.2258214507129573e-07, + "loss": 0.914, + "step": 109060 + }, + { + "epoch": 8.452090356077337, + "grad_norm": 1.343503134153369, + "learning_rate": 4.226208927464352e-07, + "loss": 0.9067, + "step": 109070 + }, + { + "epoch": 8.452865279553643, + "grad_norm": 1.386882950583847, + "learning_rate": 4.226596404215747e-07, + "loss": 0.9051, + "step": 109080 + }, + { + "epoch": 8.45364020302995, + "grad_norm": 1.4445437676725161, + "learning_rate": 4.226983880967142e-07, + "loss": 0.9247, + "step": 109090 + }, + { + "epoch": 8.454415126506257, + "grad_norm": 1.4388960969117568, + "learning_rate": 4.227371357718537e-07, + "loss": 0.9306, + "step": 109100 + }, + { + "epoch": 8.455190049982564, + "grad_norm": 1.4581272301805566, + "learning_rate": 4.227758834469932e-07, + "loss": 0.8975, + "step": 109110 + }, + { + "epoch": 8.45596497345887, + "grad_norm": 1.3793263400505995, + "learning_rate": 4.228146311221327e-07, + "loss": 0.9129, + "step": 109120 + }, + { + "epoch": 8.456739896935177, + "grad_norm": 1.4901056845262124, + "learning_rate": 4.2285337879727217e-07, + "loss": 0.9153, + "step": 109130 + }, + { + "epoch": 8.457514820411484, + "grad_norm": 1.4868048658107764, + "learning_rate": 4.2289212647241164e-07, + "loss": 0.9273, + "step": 109140 + }, + { + "epoch": 8.458289743887791, + "grad_norm": 1.446382806055701, + "learning_rate": 4.2293087414755116e-07, + "loss": 0.8876, + "step": 109150 + }, + { + "epoch": 8.459064667364098, + "grad_norm": 1.4098179558497568, + "learning_rate": 4.2296962182269063e-07, + "loss": 0.9178, + "step": 109160 + }, + { + "epoch": 8.459839590840405, + "grad_norm": 1.3692026889742364, + "learning_rate": 4.2300836949783016e-07, + "loss": 0.9084, + "step": 109170 + }, + { + "epoch": 8.460614514316712, + "grad_norm": 1.4557779447263275, + "learning_rate": 4.230471171729696e-07, + "loss": 0.9029, + "step": 109180 + }, + { + "epoch": 8.461389437793018, + "grad_norm": 1.3938564940537588, + "learning_rate": 4.2308586484810915e-07, + "loss": 0.9272, + "step": 109190 + }, + { + "epoch": 8.462164361269325, + "grad_norm": 1.4132032776944765, + "learning_rate": 4.231246125232486e-07, + "loss": 0.914, + "step": 109200 + }, + { + "epoch": 8.462939284745632, + "grad_norm": 1.333384393603618, + "learning_rate": 4.231633601983881e-07, + "loss": 0.9042, + "step": 109210 + }, + { + "epoch": 8.463714208221939, + "grad_norm": 1.4640873789950115, + "learning_rate": 4.232021078735276e-07, + "loss": 0.9213, + "step": 109220 + }, + { + "epoch": 8.464489131698246, + "grad_norm": 1.482135779118379, + "learning_rate": 4.232408555486671e-07, + "loss": 0.9136, + "step": 109230 + }, + { + "epoch": 8.46526405517455, + "grad_norm": 1.4140797965284186, + "learning_rate": 4.232796032238066e-07, + "loss": 0.9041, + "step": 109240 + }, + { + "epoch": 8.466038978650857, + "grad_norm": 1.4184856866757831, + "learning_rate": 4.2331835089894607e-07, + "loss": 0.8911, + "step": 109250 + }, + { + "epoch": 8.466813902127164, + "grad_norm": 1.486588842495142, + "learning_rate": 4.233570985740856e-07, + "loss": 0.9328, + "step": 109260 + }, + { + "epoch": 8.467588825603471, + "grad_norm": 1.401400295700497, + "learning_rate": 4.2339584624922506e-07, + "loss": 0.8999, + "step": 109270 + }, + { + "epoch": 8.468363749079778, + "grad_norm": 1.4031340041172646, + "learning_rate": 4.234345939243645e-07, + "loss": 0.9398, + "step": 109280 + }, + { + "epoch": 8.469138672556085, + "grad_norm": 1.433734568329988, + "learning_rate": 4.2347334159950405e-07, + "loss": 0.8978, + "step": 109290 + }, + { + "epoch": 8.469913596032391, + "grad_norm": 1.494484330109707, + "learning_rate": 4.235120892746435e-07, + "loss": 0.9063, + "step": 109300 + }, + { + "epoch": 8.470688519508698, + "grad_norm": 1.5133328328102935, + "learning_rate": 4.2355083694978304e-07, + "loss": 0.9267, + "step": 109310 + }, + { + "epoch": 8.471463442985005, + "grad_norm": 1.4956035514376402, + "learning_rate": 4.235895846249225e-07, + "loss": 0.9171, + "step": 109320 + }, + { + "epoch": 8.472238366461312, + "grad_norm": 1.416146608198462, + "learning_rate": 4.2362833230006203e-07, + "loss": 0.9107, + "step": 109330 + }, + { + "epoch": 8.473013289937619, + "grad_norm": 1.3069841175875598, + "learning_rate": 4.236670799752015e-07, + "loss": 0.9109, + "step": 109340 + }, + { + "epoch": 8.473788213413926, + "grad_norm": 1.4336391042450418, + "learning_rate": 4.2370582765034097e-07, + "loss": 0.897, + "step": 109350 + }, + { + "epoch": 8.474563136890232, + "grad_norm": 1.4346067435288645, + "learning_rate": 4.237445753254805e-07, + "loss": 0.8969, + "step": 109360 + }, + { + "epoch": 8.47533806036654, + "grad_norm": 1.3850239154911075, + "learning_rate": 4.2378332300061996e-07, + "loss": 0.9037, + "step": 109370 + }, + { + "epoch": 8.476112983842846, + "grad_norm": 1.3946460072803135, + "learning_rate": 4.238220706757595e-07, + "loss": 0.9414, + "step": 109380 + }, + { + "epoch": 8.476887907319153, + "grad_norm": 1.403994665112896, + "learning_rate": 4.2386081835089895e-07, + "loss": 0.923, + "step": 109390 + }, + { + "epoch": 8.47766283079546, + "grad_norm": 1.4308796027995498, + "learning_rate": 4.2389956602603847e-07, + "loss": 0.9162, + "step": 109400 + }, + { + "epoch": 8.478437754271766, + "grad_norm": 1.3916884473560553, + "learning_rate": 4.2393831370117794e-07, + "loss": 0.92, + "step": 109410 + }, + { + "epoch": 8.479212677748073, + "grad_norm": 1.5150210766783068, + "learning_rate": 4.239770613763174e-07, + "loss": 0.9071, + "step": 109420 + }, + { + "epoch": 8.47998760122438, + "grad_norm": 1.480742608413463, + "learning_rate": 4.2401580905145693e-07, + "loss": 0.9303, + "step": 109430 + }, + { + "epoch": 8.480762524700685, + "grad_norm": 1.4099974321677875, + "learning_rate": 4.240545567265964e-07, + "loss": 0.9017, + "step": 109440 + }, + { + "epoch": 8.481537448176992, + "grad_norm": 1.3824847916087235, + "learning_rate": 4.240933044017359e-07, + "loss": 0.9128, + "step": 109450 + }, + { + "epoch": 8.482312371653299, + "grad_norm": 1.3875121220366349, + "learning_rate": 4.241320520768754e-07, + "loss": 0.9023, + "step": 109460 + }, + { + "epoch": 8.483087295129605, + "grad_norm": 1.4025098266264455, + "learning_rate": 4.241707997520149e-07, + "loss": 0.9184, + "step": 109470 + }, + { + "epoch": 8.483862218605912, + "grad_norm": 1.4343887791461685, + "learning_rate": 4.242095474271544e-07, + "loss": 0.9195, + "step": 109480 + }, + { + "epoch": 8.484637142082219, + "grad_norm": 1.420786621430813, + "learning_rate": 4.2424829510229385e-07, + "loss": 0.8957, + "step": 109490 + }, + { + "epoch": 8.485412065558526, + "grad_norm": 1.44891942292856, + "learning_rate": 4.242870427774334e-07, + "loss": 0.922, + "step": 109500 + }, + { + "epoch": 8.485412065558526, + "eval_loss": 0.9179683923721313, + "eval_runtime": 331.8786, + "eval_samples_per_second": 34.564, + "eval_steps_per_second": 8.642, + "step": 109500 + }, + { + "epoch": 8.486186989034833, + "grad_norm": 1.406875623687257, + "learning_rate": 4.2432579045257284e-07, + "loss": 0.9086, + "step": 109510 + }, + { + "epoch": 8.48696191251114, + "grad_norm": 1.468250550967385, + "learning_rate": 4.2436453812771237e-07, + "loss": 0.9128, + "step": 109520 + }, + { + "epoch": 8.487736835987446, + "grad_norm": 1.3821789885254983, + "learning_rate": 4.2440328580285183e-07, + "loss": 0.914, + "step": 109530 + }, + { + "epoch": 8.488511759463753, + "grad_norm": 1.3493847958560066, + "learning_rate": 4.2444203347799136e-07, + "loss": 0.8993, + "step": 109540 + }, + { + "epoch": 8.48928668294006, + "grad_norm": 1.4679896160068788, + "learning_rate": 4.244807811531308e-07, + "loss": 0.9266, + "step": 109550 + }, + { + "epoch": 8.490061606416367, + "grad_norm": 1.5638898105247874, + "learning_rate": 4.245195288282703e-07, + "loss": 0.9039, + "step": 109560 + }, + { + "epoch": 8.490836529892674, + "grad_norm": 1.4067131946354712, + "learning_rate": 4.245582765034098e-07, + "loss": 0.9054, + "step": 109570 + }, + { + "epoch": 8.49161145336898, + "grad_norm": 1.509775739060308, + "learning_rate": 4.245970241785493e-07, + "loss": 0.9047, + "step": 109580 + }, + { + "epoch": 8.492386376845287, + "grad_norm": 1.5438285439983037, + "learning_rate": 4.246357718536888e-07, + "loss": 0.9225, + "step": 109590 + }, + { + "epoch": 8.493161300321594, + "grad_norm": 1.394493407768057, + "learning_rate": 4.246745195288283e-07, + "loss": 0.9036, + "step": 109600 + }, + { + "epoch": 8.4939362237979, + "grad_norm": 1.4618639561343116, + "learning_rate": 4.2471326720396775e-07, + "loss": 0.9054, + "step": 109610 + }, + { + "epoch": 8.494711147274206, + "grad_norm": 1.48561312524979, + "learning_rate": 4.2475201487910727e-07, + "loss": 0.8965, + "step": 109620 + }, + { + "epoch": 8.495486070750513, + "grad_norm": 1.4496317290930645, + "learning_rate": 4.2479076255424674e-07, + "loss": 0.9054, + "step": 109630 + }, + { + "epoch": 8.49626099422682, + "grad_norm": 1.4831452726433214, + "learning_rate": 4.2482951022938626e-07, + "loss": 0.8802, + "step": 109640 + }, + { + "epoch": 8.497035917703126, + "grad_norm": 1.532299802493044, + "learning_rate": 4.2486825790452573e-07, + "loss": 0.9117, + "step": 109650 + }, + { + "epoch": 8.497810841179433, + "grad_norm": 1.492535064674564, + "learning_rate": 4.2490700557966525e-07, + "loss": 0.9035, + "step": 109660 + }, + { + "epoch": 8.49858576465574, + "grad_norm": 1.4458159948068319, + "learning_rate": 4.249457532548047e-07, + "loss": 0.9159, + "step": 109670 + }, + { + "epoch": 8.499360688132047, + "grad_norm": 1.3837308945018392, + "learning_rate": 4.249845009299442e-07, + "loss": 0.8877, + "step": 109680 + }, + { + "epoch": 8.500135611608354, + "grad_norm": 1.373054047351347, + "learning_rate": 4.250232486050837e-07, + "loss": 0.8991, + "step": 109690 + }, + { + "epoch": 8.50091053508466, + "grad_norm": 1.4950184415615881, + "learning_rate": 4.250619962802232e-07, + "loss": 0.911, + "step": 109700 + }, + { + "epoch": 8.501685458560967, + "grad_norm": 1.4335651063958554, + "learning_rate": 4.251007439553627e-07, + "loss": 0.9038, + "step": 109710 + }, + { + "epoch": 8.502460382037274, + "grad_norm": 1.4263626483509915, + "learning_rate": 4.2513949163050217e-07, + "loss": 0.8991, + "step": 109720 + }, + { + "epoch": 8.50323530551358, + "grad_norm": 1.4912419230576948, + "learning_rate": 4.251782393056417e-07, + "loss": 0.9061, + "step": 109730 + }, + { + "epoch": 8.504010228989888, + "grad_norm": 1.4032337392048713, + "learning_rate": 4.2521698698078116e-07, + "loss": 0.9149, + "step": 109740 + }, + { + "epoch": 8.504785152466194, + "grad_norm": 1.4591956285048362, + "learning_rate": 4.2525573465592063e-07, + "loss": 0.8913, + "step": 109750 + }, + { + "epoch": 8.505560075942501, + "grad_norm": 1.4330804296714517, + "learning_rate": 4.2529448233106015e-07, + "loss": 0.9037, + "step": 109760 + }, + { + "epoch": 8.506334999418808, + "grad_norm": 1.4310155537724485, + "learning_rate": 4.253332300061996e-07, + "loss": 0.9032, + "step": 109770 + }, + { + "epoch": 8.507109922895115, + "grad_norm": 1.3967698156092783, + "learning_rate": 4.2537197768133914e-07, + "loss": 0.9078, + "step": 109780 + }, + { + "epoch": 8.507884846371422, + "grad_norm": 1.4647054221677585, + "learning_rate": 4.254107253564786e-07, + "loss": 0.9059, + "step": 109790 + }, + { + "epoch": 8.508659769847728, + "grad_norm": 1.4543314769740587, + "learning_rate": 4.2544947303161813e-07, + "loss": 0.9009, + "step": 109800 + }, + { + "epoch": 8.509434693324033, + "grad_norm": 1.3731545210456955, + "learning_rate": 4.254882207067576e-07, + "loss": 0.9149, + "step": 109810 + }, + { + "epoch": 8.51020961680034, + "grad_norm": 1.4479927933689696, + "learning_rate": 4.2552696838189707e-07, + "loss": 0.8825, + "step": 109820 + }, + { + "epoch": 8.510984540276647, + "grad_norm": 1.4184881842649764, + "learning_rate": 4.255657160570366e-07, + "loss": 0.9167, + "step": 109830 + }, + { + "epoch": 8.511759463752954, + "grad_norm": 1.3477035146480592, + "learning_rate": 4.2560446373217606e-07, + "loss": 0.9145, + "step": 109840 + }, + { + "epoch": 8.51253438722926, + "grad_norm": 1.383681852609345, + "learning_rate": 4.256432114073156e-07, + "loss": 0.9016, + "step": 109850 + }, + { + "epoch": 8.513309310705568, + "grad_norm": 1.5474357559443292, + "learning_rate": 4.2568195908245505e-07, + "loss": 0.9016, + "step": 109860 + }, + { + "epoch": 8.514084234181874, + "grad_norm": 1.4604191185200568, + "learning_rate": 4.257207067575946e-07, + "loss": 0.907, + "step": 109870 + }, + { + "epoch": 8.514859157658181, + "grad_norm": 1.4165637292291666, + "learning_rate": 4.2575945443273405e-07, + "loss": 0.9139, + "step": 109880 + }, + { + "epoch": 8.515634081134488, + "grad_norm": 1.463517780750364, + "learning_rate": 4.257982021078735e-07, + "loss": 0.8943, + "step": 109890 + }, + { + "epoch": 8.516409004610795, + "grad_norm": 1.3403528624722079, + "learning_rate": 4.2583694978301304e-07, + "loss": 0.9099, + "step": 109900 + }, + { + "epoch": 8.517183928087102, + "grad_norm": 1.4598789176866875, + "learning_rate": 4.258756974581525e-07, + "loss": 0.9114, + "step": 109910 + }, + { + "epoch": 8.517958851563408, + "grad_norm": 1.4734667556643284, + "learning_rate": 4.2591444513329203e-07, + "loss": 0.9074, + "step": 109920 + }, + { + "epoch": 8.518733775039715, + "grad_norm": 1.4550587459869824, + "learning_rate": 4.259531928084315e-07, + "loss": 0.911, + "step": 109930 + }, + { + "epoch": 8.519508698516022, + "grad_norm": 1.422324507344771, + "learning_rate": 4.25991940483571e-07, + "loss": 0.9072, + "step": 109940 + }, + { + "epoch": 8.520283621992329, + "grad_norm": 1.4211018855321373, + "learning_rate": 4.260306881587105e-07, + "loss": 0.9164, + "step": 109950 + }, + { + "epoch": 8.521058545468636, + "grad_norm": 1.4563738029730946, + "learning_rate": 4.2606943583384996e-07, + "loss": 0.9048, + "step": 109960 + }, + { + "epoch": 8.521833468944942, + "grad_norm": 1.3999432642484562, + "learning_rate": 4.261081835089895e-07, + "loss": 0.8946, + "step": 109970 + }, + { + "epoch": 8.522608392421247, + "grad_norm": 1.4812841640417356, + "learning_rate": 4.2614693118412895e-07, + "loss": 0.9041, + "step": 109980 + }, + { + "epoch": 8.523383315897554, + "grad_norm": 1.4479349315834504, + "learning_rate": 4.2618567885926847e-07, + "loss": 0.9267, + "step": 109990 + }, + { + "epoch": 8.524158239373861, + "grad_norm": 1.416838337500505, + "learning_rate": 4.2622442653440794e-07, + "loss": 0.9118, + "step": 110000 + }, + { + "epoch": 8.524158239373861, + "eval_loss": 0.9177433848381042, + "eval_runtime": 333.0873, + "eval_samples_per_second": 34.438, + "eval_steps_per_second": 8.61, + "step": 110000 + }, + { + "epoch": 8.524933162850168, + "grad_norm": 1.5058583396657583, + "learning_rate": 4.2626317420954746e-07, + "loss": 0.8931, + "step": 110010 + }, + { + "epoch": 8.525708086326475, + "grad_norm": 1.5599388983734195, + "learning_rate": 4.2630192188468693e-07, + "loss": 0.9094, + "step": 110020 + }, + { + "epoch": 8.526483009802782, + "grad_norm": 1.4676048547217877, + "learning_rate": 4.263406695598264e-07, + "loss": 0.9107, + "step": 110030 + }, + { + "epoch": 8.527257933279088, + "grad_norm": 1.4501441152134569, + "learning_rate": 4.263794172349659e-07, + "loss": 0.871, + "step": 110040 + }, + { + "epoch": 8.528032856755395, + "grad_norm": 1.4337808182142213, + "learning_rate": 4.264181649101054e-07, + "loss": 0.8965, + "step": 110050 + }, + { + "epoch": 8.528807780231702, + "grad_norm": 1.398035939456046, + "learning_rate": 4.264569125852449e-07, + "loss": 0.9037, + "step": 110060 + }, + { + "epoch": 8.529582703708009, + "grad_norm": 1.5243057663339077, + "learning_rate": 4.264956602603844e-07, + "loss": 0.9135, + "step": 110070 + }, + { + "epoch": 8.530357627184316, + "grad_norm": 1.4331776676680905, + "learning_rate": 4.265344079355239e-07, + "loss": 0.8975, + "step": 110080 + }, + { + "epoch": 8.531132550660622, + "grad_norm": 1.4667427056327946, + "learning_rate": 4.2657315561066337e-07, + "loss": 0.9042, + "step": 110090 + }, + { + "epoch": 8.53190747413693, + "grad_norm": 1.3609339052047356, + "learning_rate": 4.2661190328580284e-07, + "loss": 0.8918, + "step": 110100 + }, + { + "epoch": 8.532682397613236, + "grad_norm": 1.4883036259471762, + "learning_rate": 4.2665065096094236e-07, + "loss": 0.9276, + "step": 110110 + }, + { + "epoch": 8.533457321089543, + "grad_norm": 1.442163164042806, + "learning_rate": 4.2668939863608183e-07, + "loss": 0.909, + "step": 110120 + }, + { + "epoch": 8.53423224456585, + "grad_norm": 1.4166516188119034, + "learning_rate": 4.2672814631122135e-07, + "loss": 0.937, + "step": 110130 + }, + { + "epoch": 8.535007168042156, + "grad_norm": 1.4197956379241097, + "learning_rate": 4.267668939863608e-07, + "loss": 0.928, + "step": 110140 + }, + { + "epoch": 8.535782091518463, + "grad_norm": 1.4250919661220909, + "learning_rate": 4.2680564166150034e-07, + "loss": 0.9089, + "step": 110150 + }, + { + "epoch": 8.53655701499477, + "grad_norm": 1.4820731254127477, + "learning_rate": 4.268443893366398e-07, + "loss": 0.9253, + "step": 110160 + }, + { + "epoch": 8.537331938471077, + "grad_norm": 1.5576406768077602, + "learning_rate": 4.268831370117793e-07, + "loss": 0.9106, + "step": 110170 + }, + { + "epoch": 8.538106861947382, + "grad_norm": 1.5015304259500588, + "learning_rate": 4.269218846869188e-07, + "loss": 0.9119, + "step": 110180 + }, + { + "epoch": 8.538881785423689, + "grad_norm": 1.4650752647501002, + "learning_rate": 4.269606323620583e-07, + "loss": 0.9107, + "step": 110190 + }, + { + "epoch": 8.539656708899996, + "grad_norm": 1.4766690509058031, + "learning_rate": 4.269993800371978e-07, + "loss": 0.9372, + "step": 110200 + }, + { + "epoch": 8.540431632376302, + "grad_norm": 1.4864489308368651, + "learning_rate": 4.2703812771233726e-07, + "loss": 0.9049, + "step": 110210 + }, + { + "epoch": 8.54120655585261, + "grad_norm": 1.4433599390609102, + "learning_rate": 4.270768753874768e-07, + "loss": 0.9247, + "step": 110220 + }, + { + "epoch": 8.541981479328916, + "grad_norm": 1.4890538499945798, + "learning_rate": 4.2711562306261626e-07, + "loss": 0.9013, + "step": 110230 + }, + { + "epoch": 8.542756402805223, + "grad_norm": 1.3515852537487119, + "learning_rate": 4.271543707377557e-07, + "loss": 0.9068, + "step": 110240 + }, + { + "epoch": 8.54353132628153, + "grad_norm": 1.4926660109539012, + "learning_rate": 4.2719311841289525e-07, + "loss": 0.9042, + "step": 110250 + }, + { + "epoch": 8.544306249757836, + "grad_norm": 1.3718985160819732, + "learning_rate": 4.272318660880347e-07, + "loss": 0.9125, + "step": 110260 + }, + { + "epoch": 8.545081173234143, + "grad_norm": 1.4081475357628328, + "learning_rate": 4.2727061376317424e-07, + "loss": 0.9248, + "step": 110270 + }, + { + "epoch": 8.54585609671045, + "grad_norm": 1.47037227135713, + "learning_rate": 4.273093614383137e-07, + "loss": 0.927, + "step": 110280 + }, + { + "epoch": 8.546631020186757, + "grad_norm": 1.4684942097093256, + "learning_rate": 4.2734810911345323e-07, + "loss": 0.9322, + "step": 110290 + }, + { + "epoch": 8.547405943663064, + "grad_norm": 1.4251826101992031, + "learning_rate": 4.273868567885927e-07, + "loss": 0.9, + "step": 110300 + }, + { + "epoch": 8.54818086713937, + "grad_norm": 1.4433475653738463, + "learning_rate": 4.2742560446373217e-07, + "loss": 0.9169, + "step": 110310 + }, + { + "epoch": 8.548955790615677, + "grad_norm": 1.5796100024975601, + "learning_rate": 4.274643521388717e-07, + "loss": 0.924, + "step": 110320 + }, + { + "epoch": 8.549730714091984, + "grad_norm": 1.4260963545878718, + "learning_rate": 4.2750309981401116e-07, + "loss": 0.9116, + "step": 110330 + }, + { + "epoch": 8.55050563756829, + "grad_norm": 1.4725848362296432, + "learning_rate": 4.275418474891507e-07, + "loss": 0.9088, + "step": 110340 + }, + { + "epoch": 8.551280561044598, + "grad_norm": 1.47360607810359, + "learning_rate": 4.2758059516429015e-07, + "loss": 0.901, + "step": 110350 + }, + { + "epoch": 8.552055484520903, + "grad_norm": 1.4427053469245903, + "learning_rate": 4.276193428394296e-07, + "loss": 0.9062, + "step": 110360 + }, + { + "epoch": 8.55283040799721, + "grad_norm": 1.4323438480585076, + "learning_rate": 4.2765809051456914e-07, + "loss": 0.9142, + "step": 110370 + }, + { + "epoch": 8.553605331473516, + "grad_norm": 1.51148194378017, + "learning_rate": 4.276968381897086e-07, + "loss": 0.9058, + "step": 110380 + }, + { + "epoch": 8.554380254949823, + "grad_norm": 1.4709685996365025, + "learning_rate": 4.2773558586484813e-07, + "loss": 0.9014, + "step": 110390 + }, + { + "epoch": 8.55515517842613, + "grad_norm": 1.5342517430111022, + "learning_rate": 4.277743335399876e-07, + "loss": 0.9215, + "step": 110400 + }, + { + "epoch": 8.555930101902437, + "grad_norm": 1.44774728462536, + "learning_rate": 4.278130812151271e-07, + "loss": 0.8953, + "step": 110410 + }, + { + "epoch": 8.556705025378744, + "grad_norm": 1.4270912529556063, + "learning_rate": 4.278518288902666e-07, + "loss": 0.9079, + "step": 110420 + }, + { + "epoch": 8.55747994885505, + "grad_norm": 1.4602471028373287, + "learning_rate": 4.2789057656540606e-07, + "loss": 0.9131, + "step": 110430 + }, + { + "epoch": 8.558254872331357, + "grad_norm": 1.4622949020782088, + "learning_rate": 4.279293242405456e-07, + "loss": 0.9218, + "step": 110440 + }, + { + "epoch": 8.559029795807664, + "grad_norm": 1.399113804419539, + "learning_rate": 4.2796807191568505e-07, + "loss": 0.9119, + "step": 110450 + }, + { + "epoch": 8.55980471928397, + "grad_norm": 1.509190322982427, + "learning_rate": 4.2800681959082457e-07, + "loss": 0.8917, + "step": 110460 + }, + { + "epoch": 8.560579642760278, + "grad_norm": 1.3823232440130806, + "learning_rate": 4.2804556726596404e-07, + "loss": 0.9031, + "step": 110470 + }, + { + "epoch": 8.561354566236584, + "grad_norm": 1.479427649017713, + "learning_rate": 4.2808431494110356e-07, + "loss": 0.9053, + "step": 110480 + }, + { + "epoch": 8.562129489712891, + "grad_norm": 1.355954157980268, + "learning_rate": 4.2812306261624303e-07, + "loss": 0.8945, + "step": 110490 + }, + { + "epoch": 8.562904413189198, + "grad_norm": 1.4106241836469566, + "learning_rate": 4.281618102913825e-07, + "loss": 0.8951, + "step": 110500 + }, + { + "epoch": 8.562904413189198, + "eval_loss": 0.9173483848571777, + "eval_runtime": 332.8744, + "eval_samples_per_second": 34.46, + "eval_steps_per_second": 8.616, + "step": 110500 + }, + { + "epoch": 8.563679336665505, + "grad_norm": 1.3793939990461637, + "learning_rate": 4.28200557966522e-07, + "loss": 0.9029, + "step": 110510 + }, + { + "epoch": 8.564454260141812, + "grad_norm": 1.4487544049606014, + "learning_rate": 4.282393056416615e-07, + "loss": 0.898, + "step": 110520 + }, + { + "epoch": 8.565229183618118, + "grad_norm": 1.4584187916030207, + "learning_rate": 4.28278053316801e-07, + "loss": 0.9003, + "step": 110530 + }, + { + "epoch": 8.566004107094425, + "grad_norm": 1.5166669385932452, + "learning_rate": 4.283168009919405e-07, + "loss": 0.9379, + "step": 110540 + }, + { + "epoch": 8.56677903057073, + "grad_norm": 1.5004677659172132, + "learning_rate": 4.2835554866708e-07, + "loss": 0.9334, + "step": 110550 + }, + { + "epoch": 8.567553954047037, + "grad_norm": 1.4264686488400042, + "learning_rate": 4.283942963422195e-07, + "loss": 0.9034, + "step": 110560 + }, + { + "epoch": 8.568328877523344, + "grad_norm": 1.436584689269412, + "learning_rate": 4.2843304401735894e-07, + "loss": 0.8966, + "step": 110570 + }, + { + "epoch": 8.56910380099965, + "grad_norm": 1.4229309029101396, + "learning_rate": 4.2847179169249847e-07, + "loss": 0.8976, + "step": 110580 + }, + { + "epoch": 8.569878724475958, + "grad_norm": 1.5366565119874624, + "learning_rate": 4.2851053936763794e-07, + "loss": 0.9197, + "step": 110590 + }, + { + "epoch": 8.570653647952264, + "grad_norm": 1.3811303501506413, + "learning_rate": 4.2854928704277746e-07, + "loss": 0.905, + "step": 110600 + }, + { + "epoch": 8.571428571428571, + "grad_norm": 1.4433730849311561, + "learning_rate": 4.285880347179169e-07, + "loss": 0.8913, + "step": 110610 + }, + { + "epoch": 8.572203494904878, + "grad_norm": 1.4765432255839612, + "learning_rate": 4.2862678239305645e-07, + "loss": 0.9362, + "step": 110620 + }, + { + "epoch": 8.572978418381185, + "grad_norm": 1.3656264414838328, + "learning_rate": 4.286655300681959e-07, + "loss": 0.9252, + "step": 110630 + }, + { + "epoch": 8.573753341857492, + "grad_norm": 1.4147367873728909, + "learning_rate": 4.287042777433354e-07, + "loss": 0.9189, + "step": 110640 + }, + { + "epoch": 8.574528265333798, + "grad_norm": 1.4558518525823259, + "learning_rate": 4.287430254184749e-07, + "loss": 0.9133, + "step": 110650 + }, + { + "epoch": 8.575303188810105, + "grad_norm": 1.4003637421335888, + "learning_rate": 4.287817730936144e-07, + "loss": 0.9077, + "step": 110660 + }, + { + "epoch": 8.576078112286412, + "grad_norm": 1.4776673138529732, + "learning_rate": 4.288205207687539e-07, + "loss": 0.897, + "step": 110670 + }, + { + "epoch": 8.576853035762719, + "grad_norm": 1.4832973602286332, + "learning_rate": 4.2885926844389337e-07, + "loss": 0.9227, + "step": 110680 + }, + { + "epoch": 8.577627959239026, + "grad_norm": 1.41622341666132, + "learning_rate": 4.288980161190329e-07, + "loss": 0.9165, + "step": 110690 + }, + { + "epoch": 8.578402882715332, + "grad_norm": 1.4346487011806717, + "learning_rate": 4.2893676379417236e-07, + "loss": 0.9263, + "step": 110700 + }, + { + "epoch": 8.57917780619164, + "grad_norm": 1.4429568470486518, + "learning_rate": 4.2897551146931183e-07, + "loss": 0.9262, + "step": 110710 + }, + { + "epoch": 8.579952729667946, + "grad_norm": 1.4605177593543432, + "learning_rate": 4.2901425914445135e-07, + "loss": 0.9234, + "step": 110720 + }, + { + "epoch": 8.580727653144251, + "grad_norm": 1.4204927230444795, + "learning_rate": 4.290530068195908e-07, + "loss": 0.8995, + "step": 110730 + }, + { + "epoch": 8.581502576620558, + "grad_norm": 1.467145590962962, + "learning_rate": 4.2909175449473034e-07, + "loss": 0.9398, + "step": 110740 + }, + { + "epoch": 8.582277500096865, + "grad_norm": 1.4618751683218507, + "learning_rate": 4.291305021698698e-07, + "loss": 0.8962, + "step": 110750 + }, + { + "epoch": 8.583052423573172, + "grad_norm": 1.466912706109744, + "learning_rate": 4.2916924984500933e-07, + "loss": 0.912, + "step": 110760 + }, + { + "epoch": 8.583827347049478, + "grad_norm": 1.4536747677550022, + "learning_rate": 4.292079975201488e-07, + "loss": 0.8986, + "step": 110770 + }, + { + "epoch": 8.584602270525785, + "grad_norm": 1.4044019331064062, + "learning_rate": 4.2924674519528827e-07, + "loss": 0.9069, + "step": 110780 + }, + { + "epoch": 8.585377194002092, + "grad_norm": 1.422379849352862, + "learning_rate": 4.292854928704278e-07, + "loss": 0.921, + "step": 110790 + }, + { + "epoch": 8.586152117478399, + "grad_norm": 1.4179568998527607, + "learning_rate": 4.2932424054556726e-07, + "loss": 0.9084, + "step": 110800 + }, + { + "epoch": 8.586927040954706, + "grad_norm": 1.3678894259306722, + "learning_rate": 4.293629882207068e-07, + "loss": 0.928, + "step": 110810 + }, + { + "epoch": 8.587701964431012, + "grad_norm": 1.4758311050551154, + "learning_rate": 4.2940173589584625e-07, + "loss": 0.895, + "step": 110820 + }, + { + "epoch": 8.58847688790732, + "grad_norm": 1.482260079417221, + "learning_rate": 4.294404835709858e-07, + "loss": 0.9009, + "step": 110830 + }, + { + "epoch": 8.589251811383626, + "grad_norm": 1.402316401473655, + "learning_rate": 4.2947923124612524e-07, + "loss": 0.9214, + "step": 110840 + }, + { + "epoch": 8.590026734859933, + "grad_norm": 1.4377342181543051, + "learning_rate": 4.295179789212647e-07, + "loss": 0.9123, + "step": 110850 + }, + { + "epoch": 8.59080165833624, + "grad_norm": 1.45876541538053, + "learning_rate": 4.2955672659640423e-07, + "loss": 0.9013, + "step": 110860 + }, + { + "epoch": 8.591576581812546, + "grad_norm": 1.403976989479545, + "learning_rate": 4.295954742715437e-07, + "loss": 0.9052, + "step": 110870 + }, + { + "epoch": 8.592351505288853, + "grad_norm": 1.441727651076373, + "learning_rate": 4.296342219466832e-07, + "loss": 0.9027, + "step": 110880 + }, + { + "epoch": 8.59312642876516, + "grad_norm": 1.403079565416468, + "learning_rate": 4.296729696218227e-07, + "loss": 0.914, + "step": 110890 + }, + { + "epoch": 8.593901352241467, + "grad_norm": 1.4607030878873848, + "learning_rate": 4.297117172969622e-07, + "loss": 0.9043, + "step": 110900 + }, + { + "epoch": 8.594676275717774, + "grad_norm": 1.3156880000929314, + "learning_rate": 4.297504649721017e-07, + "loss": 0.8994, + "step": 110910 + }, + { + "epoch": 8.59545119919408, + "grad_norm": 1.4591559418487274, + "learning_rate": 4.2978921264724115e-07, + "loss": 0.8907, + "step": 110920 + }, + { + "epoch": 8.596226122670386, + "grad_norm": 1.4874388534720928, + "learning_rate": 4.298279603223807e-07, + "loss": 0.8807, + "step": 110930 + }, + { + "epoch": 8.597001046146692, + "grad_norm": 1.4266781899360352, + "learning_rate": 4.2986670799752015e-07, + "loss": 0.9094, + "step": 110940 + }, + { + "epoch": 8.597775969623, + "grad_norm": 1.4786989738124168, + "learning_rate": 4.2990545567265967e-07, + "loss": 0.8916, + "step": 110950 + }, + { + "epoch": 8.598550893099306, + "grad_norm": 1.420583873437336, + "learning_rate": 4.2994420334779914e-07, + "loss": 0.9132, + "step": 110960 + }, + { + "epoch": 8.599325816575613, + "grad_norm": 1.3948015750757887, + "learning_rate": 4.2998295102293866e-07, + "loss": 0.8935, + "step": 110970 + }, + { + "epoch": 8.60010074005192, + "grad_norm": 1.4531015599015087, + "learning_rate": 4.3002169869807813e-07, + "loss": 0.9233, + "step": 110980 + }, + { + "epoch": 8.600875663528226, + "grad_norm": 1.4690044889878722, + "learning_rate": 4.300604463732176e-07, + "loss": 0.9197, + "step": 110990 + }, + { + "epoch": 8.601650587004533, + "grad_norm": 1.4474607355572946, + "learning_rate": 4.300991940483571e-07, + "loss": 0.9064, + "step": 111000 + }, + { + "epoch": 8.601650587004533, + "eval_loss": 0.917207658290863, + "eval_runtime": 335.8401, + "eval_samples_per_second": 34.156, + "eval_steps_per_second": 8.54, + "step": 111000 + }, + { + "epoch": 8.60242551048084, + "grad_norm": 1.443000923620717, + "learning_rate": 4.301379417234966e-07, + "loss": 0.9129, + "step": 111010 + }, + { + "epoch": 8.603200433957147, + "grad_norm": 1.3459779977313746, + "learning_rate": 4.301766893986361e-07, + "loss": 0.9106, + "step": 111020 + }, + { + "epoch": 8.603975357433454, + "grad_norm": 1.3680656773624194, + "learning_rate": 4.302154370737756e-07, + "loss": 0.9297, + "step": 111030 + }, + { + "epoch": 8.60475028090976, + "grad_norm": 1.4029059232356729, + "learning_rate": 4.3025418474891505e-07, + "loss": 0.9054, + "step": 111040 + }, + { + "epoch": 8.605525204386067, + "grad_norm": 1.469298247486628, + "learning_rate": 4.3029293242405457e-07, + "loss": 0.8906, + "step": 111050 + }, + { + "epoch": 8.606300127862374, + "grad_norm": 1.4226498964528471, + "learning_rate": 4.3033168009919404e-07, + "loss": 0.9154, + "step": 111060 + }, + { + "epoch": 8.607075051338681, + "grad_norm": 1.4088774668326172, + "learning_rate": 4.3037042777433356e-07, + "loss": 0.9278, + "step": 111070 + }, + { + "epoch": 8.607849974814988, + "grad_norm": 1.4005968014675505, + "learning_rate": 4.3040917544947303e-07, + "loss": 0.8842, + "step": 111080 + }, + { + "epoch": 8.608624898291295, + "grad_norm": 1.5006883258297563, + "learning_rate": 4.3044792312461255e-07, + "loss": 0.9326, + "step": 111090 + }, + { + "epoch": 8.6093998217676, + "grad_norm": 1.4206092905152652, + "learning_rate": 4.30486670799752e-07, + "loss": 0.8928, + "step": 111100 + }, + { + "epoch": 8.610174745243906, + "grad_norm": 1.4844310792378763, + "learning_rate": 4.305254184748915e-07, + "loss": 0.9068, + "step": 111110 + }, + { + "epoch": 8.610949668720213, + "grad_norm": 1.4417771575743796, + "learning_rate": 4.30564166150031e-07, + "loss": 0.9078, + "step": 111120 + }, + { + "epoch": 8.61172459219652, + "grad_norm": 1.487800132347442, + "learning_rate": 4.306029138251705e-07, + "loss": 0.9104, + "step": 111130 + }, + { + "epoch": 8.612499515672827, + "grad_norm": 1.458557431895574, + "learning_rate": 4.3064166150031e-07, + "loss": 0.9086, + "step": 111140 + }, + { + "epoch": 8.613274439149134, + "grad_norm": 1.3893801281387683, + "learning_rate": 4.3068040917544947e-07, + "loss": 0.8955, + "step": 111150 + }, + { + "epoch": 8.61404936262544, + "grad_norm": 1.4534008837445092, + "learning_rate": 4.30719156850589e-07, + "loss": 0.9018, + "step": 111160 + }, + { + "epoch": 8.614824286101747, + "grad_norm": 1.4188877482726856, + "learning_rate": 4.3075790452572846e-07, + "loss": 0.9107, + "step": 111170 + }, + { + "epoch": 8.615599209578054, + "grad_norm": 1.349054058668224, + "learning_rate": 4.3079665220086793e-07, + "loss": 0.914, + "step": 111180 + }, + { + "epoch": 8.61637413305436, + "grad_norm": 1.414024376960444, + "learning_rate": 4.3083539987600745e-07, + "loss": 0.9036, + "step": 111190 + }, + { + "epoch": 8.617149056530668, + "grad_norm": 1.3889743961721324, + "learning_rate": 4.308741475511469e-07, + "loss": 0.9144, + "step": 111200 + }, + { + "epoch": 8.617923980006974, + "grad_norm": 1.4767337062601078, + "learning_rate": 4.3091289522628644e-07, + "loss": 0.9155, + "step": 111210 + }, + { + "epoch": 8.618698903483281, + "grad_norm": 1.5133351856218276, + "learning_rate": 4.309516429014259e-07, + "loss": 0.9051, + "step": 111220 + }, + { + "epoch": 8.619473826959588, + "grad_norm": 1.4728207463098244, + "learning_rate": 4.3099039057656544e-07, + "loss": 0.9071, + "step": 111230 + }, + { + "epoch": 8.620248750435895, + "grad_norm": 1.4720627519739025, + "learning_rate": 4.310291382517049e-07, + "loss": 0.9048, + "step": 111240 + }, + { + "epoch": 8.621023673912202, + "grad_norm": 1.4569910148622998, + "learning_rate": 4.310678859268444e-07, + "loss": 0.9121, + "step": 111250 + }, + { + "epoch": 8.621798597388509, + "grad_norm": 1.3825241019502321, + "learning_rate": 4.311066336019839e-07, + "loss": 0.9024, + "step": 111260 + }, + { + "epoch": 8.622573520864815, + "grad_norm": 1.3325897743934498, + "learning_rate": 4.3114538127712337e-07, + "loss": 0.8878, + "step": 111270 + }, + { + "epoch": 8.623348444341122, + "grad_norm": 1.4488757835531791, + "learning_rate": 4.311841289522629e-07, + "loss": 0.9223, + "step": 111280 + }, + { + "epoch": 8.624123367817429, + "grad_norm": 1.4460941728768635, + "learning_rate": 4.3122287662740236e-07, + "loss": 0.8832, + "step": 111290 + }, + { + "epoch": 8.624898291293734, + "grad_norm": 1.378034480022561, + "learning_rate": 4.312616243025419e-07, + "loss": 0.9021, + "step": 111300 + }, + { + "epoch": 8.62567321477004, + "grad_norm": 1.4458463034425622, + "learning_rate": 4.3130037197768135e-07, + "loss": 0.8974, + "step": 111310 + }, + { + "epoch": 8.626448138246348, + "grad_norm": 1.4893426242636518, + "learning_rate": 4.313391196528208e-07, + "loss": 0.9359, + "step": 111320 + }, + { + "epoch": 8.627223061722654, + "grad_norm": 1.413997130754136, + "learning_rate": 4.3137786732796034e-07, + "loss": 0.8941, + "step": 111330 + }, + { + "epoch": 8.627997985198961, + "grad_norm": 1.3838115194000686, + "learning_rate": 4.314166150030998e-07, + "loss": 0.955, + "step": 111340 + }, + { + "epoch": 8.628772908675268, + "grad_norm": 1.3983678737306464, + "learning_rate": 4.3145536267823933e-07, + "loss": 0.8999, + "step": 111350 + }, + { + "epoch": 8.629547832151575, + "grad_norm": 1.4079920615371244, + "learning_rate": 4.314941103533788e-07, + "loss": 0.9204, + "step": 111360 + }, + { + "epoch": 8.630322755627882, + "grad_norm": 1.4440290284607598, + "learning_rate": 4.315328580285183e-07, + "loss": 0.8975, + "step": 111370 + }, + { + "epoch": 8.631097679104188, + "grad_norm": 1.4007322116309635, + "learning_rate": 4.315716057036578e-07, + "loss": 0.9396, + "step": 111380 + }, + { + "epoch": 8.631872602580495, + "grad_norm": 1.4231691052578928, + "learning_rate": 4.3161035337879726e-07, + "loss": 0.916, + "step": 111390 + }, + { + "epoch": 8.632647526056802, + "grad_norm": 1.383779892482784, + "learning_rate": 4.316491010539368e-07, + "loss": 0.9072, + "step": 111400 + }, + { + "epoch": 8.633422449533109, + "grad_norm": 1.4289732947171423, + "learning_rate": 4.3168784872907625e-07, + "loss": 0.9129, + "step": 111410 + }, + { + "epoch": 8.634197373009416, + "grad_norm": 1.439565011633324, + "learning_rate": 4.3172659640421577e-07, + "loss": 0.9122, + "step": 111420 + }, + { + "epoch": 8.634972296485723, + "grad_norm": 1.4632884754703477, + "learning_rate": 4.3176534407935524e-07, + "loss": 0.8981, + "step": 111430 + }, + { + "epoch": 8.63574721996203, + "grad_norm": 1.3525477709095752, + "learning_rate": 4.3180409175449476e-07, + "loss": 0.8851, + "step": 111440 + }, + { + "epoch": 8.636522143438336, + "grad_norm": 1.4605198111291955, + "learning_rate": 4.3184283942963423e-07, + "loss": 0.903, + "step": 111450 + }, + { + "epoch": 8.637297066914643, + "grad_norm": 1.560372286524763, + "learning_rate": 4.318815871047737e-07, + "loss": 0.9095, + "step": 111460 + }, + { + "epoch": 8.638071990390948, + "grad_norm": 1.3896183150514647, + "learning_rate": 4.319203347799132e-07, + "loss": 0.8864, + "step": 111470 + }, + { + "epoch": 8.638846913867255, + "grad_norm": 1.3706283249192246, + "learning_rate": 4.319590824550527e-07, + "loss": 0.9288, + "step": 111480 + }, + { + "epoch": 8.639621837343562, + "grad_norm": 1.4179904571545874, + "learning_rate": 4.319978301301922e-07, + "loss": 0.8936, + "step": 111490 + }, + { + "epoch": 8.640396760819868, + "grad_norm": 1.4443787442822553, + "learning_rate": 4.320365778053317e-07, + "loss": 0.9233, + "step": 111500 + }, + { + "epoch": 8.640396760819868, + "eval_loss": 0.9168842434883118, + "eval_runtime": 330.0616, + "eval_samples_per_second": 34.754, + "eval_steps_per_second": 8.689, + "step": 111500 + }, + { + "epoch": 8.641171684296175, + "grad_norm": 1.332191142873064, + "learning_rate": 4.320753254804712e-07, + "loss": 0.9032, + "step": 111510 + }, + { + "epoch": 8.641946607772482, + "grad_norm": 1.4968414359059203, + "learning_rate": 4.3211407315561067e-07, + "loss": 0.9109, + "step": 111520 + }, + { + "epoch": 8.642721531248789, + "grad_norm": 1.3711109450524503, + "learning_rate": 4.3215282083075014e-07, + "loss": 0.9001, + "step": 111530 + }, + { + "epoch": 8.643496454725096, + "grad_norm": 1.4508543007293149, + "learning_rate": 4.3219156850588966e-07, + "loss": 0.8946, + "step": 111540 + }, + { + "epoch": 8.644271378201402, + "grad_norm": 1.433302999960814, + "learning_rate": 4.3223031618102913e-07, + "loss": 0.9051, + "step": 111550 + }, + { + "epoch": 8.64504630167771, + "grad_norm": 1.4612469836601156, + "learning_rate": 4.3226906385616866e-07, + "loss": 0.9042, + "step": 111560 + }, + { + "epoch": 8.645821225154016, + "grad_norm": 1.4182660540913965, + "learning_rate": 4.323078115313081e-07, + "loss": 0.9005, + "step": 111570 + }, + { + "epoch": 8.646596148630323, + "grad_norm": 1.4386499404765005, + "learning_rate": 4.3234655920644765e-07, + "loss": 0.9222, + "step": 111580 + }, + { + "epoch": 8.64737107210663, + "grad_norm": 1.5233940306563654, + "learning_rate": 4.323853068815871e-07, + "loss": 0.9024, + "step": 111590 + }, + { + "epoch": 8.648145995582937, + "grad_norm": 1.4469649465725956, + "learning_rate": 4.324240545567266e-07, + "loss": 0.9183, + "step": 111600 + }, + { + "epoch": 8.648920919059243, + "grad_norm": 1.3757758367765618, + "learning_rate": 4.324628022318661e-07, + "loss": 0.9007, + "step": 111610 + }, + { + "epoch": 8.64969584253555, + "grad_norm": 1.4372857516198134, + "learning_rate": 4.325015499070056e-07, + "loss": 0.9076, + "step": 111620 + }, + { + "epoch": 8.650470766011857, + "grad_norm": 1.4955785792488325, + "learning_rate": 4.325402975821451e-07, + "loss": 0.907, + "step": 111630 + }, + { + "epoch": 8.651245689488164, + "grad_norm": 1.4124695088136041, + "learning_rate": 4.3257904525728457e-07, + "loss": 0.9055, + "step": 111640 + }, + { + "epoch": 8.65202061296447, + "grad_norm": 1.435004004246368, + "learning_rate": 4.326177929324241e-07, + "loss": 0.9003, + "step": 111650 + }, + { + "epoch": 8.652795536440777, + "grad_norm": 1.4130065613449687, + "learning_rate": 4.3265654060756356e-07, + "loss": 0.9266, + "step": 111660 + }, + { + "epoch": 8.653570459917082, + "grad_norm": 1.442420192124818, + "learning_rate": 4.3269528828270303e-07, + "loss": 0.9186, + "step": 111670 + }, + { + "epoch": 8.65434538339339, + "grad_norm": 1.3593497455109478, + "learning_rate": 4.3273403595784255e-07, + "loss": 0.9321, + "step": 111680 + }, + { + "epoch": 8.655120306869696, + "grad_norm": 1.5411314396178106, + "learning_rate": 4.32772783632982e-07, + "loss": 0.9086, + "step": 111690 + }, + { + "epoch": 8.655895230346003, + "grad_norm": 1.5718934440827972, + "learning_rate": 4.3281153130812154e-07, + "loss": 0.9109, + "step": 111700 + }, + { + "epoch": 8.65667015382231, + "grad_norm": 1.4049252379990416, + "learning_rate": 4.32850278983261e-07, + "loss": 0.8913, + "step": 111710 + }, + { + "epoch": 8.657445077298616, + "grad_norm": 1.353955428854551, + "learning_rate": 4.3288902665840053e-07, + "loss": 0.9144, + "step": 111720 + }, + { + "epoch": 8.658220000774923, + "grad_norm": 1.4385180200940777, + "learning_rate": 4.3292777433354e-07, + "loss": 0.9087, + "step": 111730 + }, + { + "epoch": 8.65899492425123, + "grad_norm": 1.4220534624109098, + "learning_rate": 4.3296652200867947e-07, + "loss": 0.8816, + "step": 111740 + }, + { + "epoch": 8.659769847727537, + "grad_norm": 1.4517415813209262, + "learning_rate": 4.33005269683819e-07, + "loss": 0.9208, + "step": 111750 + }, + { + "epoch": 8.660544771203844, + "grad_norm": 1.411255850043063, + "learning_rate": 4.3304401735895846e-07, + "loss": 0.8884, + "step": 111760 + }, + { + "epoch": 8.66131969468015, + "grad_norm": 1.4927377840583032, + "learning_rate": 4.33082765034098e-07, + "loss": 0.8957, + "step": 111770 + }, + { + "epoch": 8.662094618156457, + "grad_norm": 1.472164243195394, + "learning_rate": 4.3312151270923745e-07, + "loss": 0.9089, + "step": 111780 + }, + { + "epoch": 8.662869541632764, + "grad_norm": 1.4269803168271946, + "learning_rate": 4.331602603843769e-07, + "loss": 0.9188, + "step": 111790 + }, + { + "epoch": 8.663644465109071, + "grad_norm": 1.4392184442418456, + "learning_rate": 4.3319900805951644e-07, + "loss": 0.8782, + "step": 111800 + }, + { + "epoch": 8.664419388585378, + "grad_norm": 1.4411634833827383, + "learning_rate": 4.332377557346559e-07, + "loss": 0.9097, + "step": 111810 + }, + { + "epoch": 8.665194312061685, + "grad_norm": 1.4544902359816032, + "learning_rate": 4.3327650340979543e-07, + "loss": 0.8948, + "step": 111820 + }, + { + "epoch": 8.665969235537991, + "grad_norm": 1.4317571031470053, + "learning_rate": 4.333152510849349e-07, + "loss": 0.8952, + "step": 111830 + }, + { + "epoch": 8.666744159014296, + "grad_norm": 1.5381207562383477, + "learning_rate": 4.333539987600744e-07, + "loss": 0.8894, + "step": 111840 + }, + { + "epoch": 8.667519082490603, + "grad_norm": 1.4807010055845333, + "learning_rate": 4.333927464352139e-07, + "loss": 0.9094, + "step": 111850 + }, + { + "epoch": 8.66829400596691, + "grad_norm": 1.4597651146727668, + "learning_rate": 4.3343149411035336e-07, + "loss": 0.9233, + "step": 111860 + }, + { + "epoch": 8.669068929443217, + "grad_norm": 1.41284776417951, + "learning_rate": 4.334702417854929e-07, + "loss": 0.9168, + "step": 111870 + }, + { + "epoch": 8.669843852919524, + "grad_norm": 1.4055426557856678, + "learning_rate": 4.3350898946063235e-07, + "loss": 0.9194, + "step": 111880 + }, + { + "epoch": 8.67061877639583, + "grad_norm": 1.3526236222413899, + "learning_rate": 4.335477371357719e-07, + "loss": 0.8992, + "step": 111890 + }, + { + "epoch": 8.671393699872137, + "grad_norm": 1.3927654930478868, + "learning_rate": 4.3358648481091134e-07, + "loss": 0.9119, + "step": 111900 + }, + { + "epoch": 8.672168623348444, + "grad_norm": 1.3949683061470999, + "learning_rate": 4.3362523248605087e-07, + "loss": 0.9051, + "step": 111910 + }, + { + "epoch": 8.67294354682475, + "grad_norm": 1.4240954019381529, + "learning_rate": 4.3366398016119034e-07, + "loss": 0.8976, + "step": 111920 + }, + { + "epoch": 8.673718470301058, + "grad_norm": 1.369969588320093, + "learning_rate": 4.337027278363298e-07, + "loss": 0.8977, + "step": 111930 + }, + { + "epoch": 8.674493393777365, + "grad_norm": 1.3716650375046977, + "learning_rate": 4.337414755114693e-07, + "loss": 0.9123, + "step": 111940 + }, + { + "epoch": 8.675268317253671, + "grad_norm": 1.5188975191484269, + "learning_rate": 4.337802231866088e-07, + "loss": 0.9207, + "step": 111950 + }, + { + "epoch": 8.676043240729978, + "grad_norm": 1.3316264448264303, + "learning_rate": 4.338189708617483e-07, + "loss": 0.8981, + "step": 111960 + }, + { + "epoch": 8.676818164206285, + "grad_norm": 1.3963276032248402, + "learning_rate": 4.338577185368878e-07, + "loss": 0.902, + "step": 111970 + }, + { + "epoch": 8.677593087682592, + "grad_norm": 1.4794863189086045, + "learning_rate": 4.338964662120273e-07, + "loss": 0.9096, + "step": 111980 + }, + { + "epoch": 8.678368011158899, + "grad_norm": 1.357565512649293, + "learning_rate": 4.339352138871668e-07, + "loss": 0.8952, + "step": 111990 + }, + { + "epoch": 8.679142934635205, + "grad_norm": 1.4843473682951633, + "learning_rate": 4.3397396156230625e-07, + "loss": 0.9282, + "step": 112000 + }, + { + "epoch": 8.679142934635205, + "eval_loss": 0.9164881706237793, + "eval_runtime": 330.8156, + "eval_samples_per_second": 34.675, + "eval_steps_per_second": 8.669, + "step": 112000 + }, + { + "epoch": 8.679917858111512, + "grad_norm": 1.458946985621132, + "learning_rate": 4.3401270923744577e-07, + "loss": 0.9106, + "step": 112010 + }, + { + "epoch": 8.680692781587819, + "grad_norm": 1.3934166424788856, + "learning_rate": 4.3405145691258524e-07, + "loss": 0.9457, + "step": 112020 + }, + { + "epoch": 8.681467705064126, + "grad_norm": 1.3285305170741448, + "learning_rate": 4.3409020458772476e-07, + "loss": 0.9095, + "step": 112030 + }, + { + "epoch": 8.68224262854043, + "grad_norm": 1.393290258308061, + "learning_rate": 4.3412895226286423e-07, + "loss": 0.9203, + "step": 112040 + }, + { + "epoch": 8.683017552016738, + "grad_norm": 1.5308272830655623, + "learning_rate": 4.3416769993800375e-07, + "loss": 0.9252, + "step": 112050 + }, + { + "epoch": 8.683792475493044, + "grad_norm": 1.3996702640091863, + "learning_rate": 4.342064476131432e-07, + "loss": 0.8988, + "step": 112060 + }, + { + "epoch": 8.684567398969351, + "grad_norm": 1.4010551858851357, + "learning_rate": 4.342451952882827e-07, + "loss": 0.9184, + "step": 112070 + }, + { + "epoch": 8.685342322445658, + "grad_norm": 1.4678868133899374, + "learning_rate": 4.342839429634222e-07, + "loss": 0.9119, + "step": 112080 + }, + { + "epoch": 8.686117245921965, + "grad_norm": 1.4644452462278739, + "learning_rate": 4.343226906385617e-07, + "loss": 0.8878, + "step": 112090 + }, + { + "epoch": 8.686892169398272, + "grad_norm": 1.4248870075695723, + "learning_rate": 4.343614383137012e-07, + "loss": 0.9095, + "step": 112100 + }, + { + "epoch": 8.687667092874579, + "grad_norm": 1.4576486470515806, + "learning_rate": 4.3440018598884067e-07, + "loss": 0.9235, + "step": 112110 + }, + { + "epoch": 8.688442016350885, + "grad_norm": 1.5180845604842932, + "learning_rate": 4.344389336639802e-07, + "loss": 0.9096, + "step": 112120 + }, + { + "epoch": 8.689216939827192, + "grad_norm": 1.3979205954007357, + "learning_rate": 4.3447768133911966e-07, + "loss": 0.9074, + "step": 112130 + }, + { + "epoch": 8.689991863303499, + "grad_norm": 1.4684178500048637, + "learning_rate": 4.3451642901425913e-07, + "loss": 0.898, + "step": 112140 + }, + { + "epoch": 8.690766786779806, + "grad_norm": 1.4309158672971227, + "learning_rate": 4.3455517668939865e-07, + "loss": 0.9013, + "step": 112150 + }, + { + "epoch": 8.691541710256113, + "grad_norm": 1.5533713329033112, + "learning_rate": 4.345939243645381e-07, + "loss": 0.9304, + "step": 112160 + }, + { + "epoch": 8.69231663373242, + "grad_norm": 1.412863076366388, + "learning_rate": 4.3463267203967764e-07, + "loss": 0.8966, + "step": 112170 + }, + { + "epoch": 8.693091557208726, + "grad_norm": 1.4346591438560463, + "learning_rate": 4.346714197148171e-07, + "loss": 0.92, + "step": 112180 + }, + { + "epoch": 8.693866480685033, + "grad_norm": 1.403876253212299, + "learning_rate": 4.3471016738995663e-07, + "loss": 0.8868, + "step": 112190 + }, + { + "epoch": 8.69464140416134, + "grad_norm": 1.505260604367872, + "learning_rate": 4.347489150650961e-07, + "loss": 0.9323, + "step": 112200 + }, + { + "epoch": 8.695416327637647, + "grad_norm": 1.4728548185900086, + "learning_rate": 4.3478766274023557e-07, + "loss": 0.9212, + "step": 112210 + }, + { + "epoch": 8.696191251113952, + "grad_norm": 1.49685888823644, + "learning_rate": 4.348264104153751e-07, + "loss": 0.9005, + "step": 112220 + }, + { + "epoch": 8.696966174590258, + "grad_norm": 1.3838080257693037, + "learning_rate": 4.3486515809051456e-07, + "loss": 0.9035, + "step": 112230 + }, + { + "epoch": 8.697741098066565, + "grad_norm": 1.4069443013031921, + "learning_rate": 4.349039057656541e-07, + "loss": 0.8961, + "step": 112240 + }, + { + "epoch": 8.698516021542872, + "grad_norm": 1.3672189956609708, + "learning_rate": 4.3494265344079355e-07, + "loss": 0.8989, + "step": 112250 + }, + { + "epoch": 8.699290945019179, + "grad_norm": 1.5053066153480823, + "learning_rate": 4.349814011159331e-07, + "loss": 0.919, + "step": 112260 + }, + { + "epoch": 8.700065868495486, + "grad_norm": 1.4089963294544692, + "learning_rate": 4.3502014879107255e-07, + "loss": 0.9151, + "step": 112270 + }, + { + "epoch": 8.700840791971792, + "grad_norm": 1.5218383663165935, + "learning_rate": 4.35058896466212e-07, + "loss": 0.8944, + "step": 112280 + }, + { + "epoch": 8.7016157154481, + "grad_norm": 1.5346055412804067, + "learning_rate": 4.3509764414135154e-07, + "loss": 0.8949, + "step": 112290 + }, + { + "epoch": 8.702390638924406, + "grad_norm": 1.3918059415119741, + "learning_rate": 4.35136391816491e-07, + "loss": 0.8848, + "step": 112300 + }, + { + "epoch": 8.703165562400713, + "grad_norm": 1.3825081369120034, + "learning_rate": 4.3517513949163053e-07, + "loss": 0.8976, + "step": 112310 + }, + { + "epoch": 8.70394048587702, + "grad_norm": 1.5211240522250016, + "learning_rate": 4.3521388716677e-07, + "loss": 0.9422, + "step": 112320 + }, + { + "epoch": 8.704715409353327, + "grad_norm": 1.4542931440572202, + "learning_rate": 4.352526348419095e-07, + "loss": 0.9064, + "step": 112330 + }, + { + "epoch": 8.705490332829633, + "grad_norm": 1.461398800518242, + "learning_rate": 4.35291382517049e-07, + "loss": 0.8957, + "step": 112340 + }, + { + "epoch": 8.70626525630594, + "grad_norm": 1.4832676449841509, + "learning_rate": 4.3533013019218846e-07, + "loss": 0.9411, + "step": 112350 + }, + { + "epoch": 8.707040179782247, + "grad_norm": 1.4155304830083093, + "learning_rate": 4.35368877867328e-07, + "loss": 0.9099, + "step": 112360 + }, + { + "epoch": 8.707815103258554, + "grad_norm": 1.4852445980428939, + "learning_rate": 4.3540762554246745e-07, + "loss": 0.9213, + "step": 112370 + }, + { + "epoch": 8.70859002673486, + "grad_norm": 1.4763955057917413, + "learning_rate": 4.3544637321760697e-07, + "loss": 0.8992, + "step": 112380 + }, + { + "epoch": 8.709364950211167, + "grad_norm": 1.434953855856962, + "learning_rate": 4.3548512089274644e-07, + "loss": 0.8992, + "step": 112390 + }, + { + "epoch": 8.710139873687474, + "grad_norm": 1.4799821737936802, + "learning_rate": 4.3552386856788596e-07, + "loss": 0.9261, + "step": 112400 + }, + { + "epoch": 8.71091479716378, + "grad_norm": 1.4480419759950756, + "learning_rate": 4.3556261624302543e-07, + "loss": 0.9162, + "step": 112410 + }, + { + "epoch": 8.711689720640086, + "grad_norm": 1.3979488227971306, + "learning_rate": 4.356013639181649e-07, + "loss": 0.9133, + "step": 112420 + }, + { + "epoch": 8.712464644116393, + "grad_norm": 1.4269159603051493, + "learning_rate": 4.356401115933044e-07, + "loss": 0.8881, + "step": 112430 + }, + { + "epoch": 8.7132395675927, + "grad_norm": 1.3448905305402206, + "learning_rate": 4.356788592684439e-07, + "loss": 0.9001, + "step": 112440 + }, + { + "epoch": 8.714014491069006, + "grad_norm": 1.4769017157952342, + "learning_rate": 4.357176069435834e-07, + "loss": 0.9048, + "step": 112450 + }, + { + "epoch": 8.714789414545313, + "grad_norm": 1.487498895183625, + "learning_rate": 4.357563546187229e-07, + "loss": 0.8929, + "step": 112460 + }, + { + "epoch": 8.71556433802162, + "grad_norm": 1.527269262615834, + "learning_rate": 4.357951022938624e-07, + "loss": 0.9194, + "step": 112470 + }, + { + "epoch": 8.716339261497927, + "grad_norm": 1.5033873162801383, + "learning_rate": 4.3583384996900187e-07, + "loss": 0.9174, + "step": 112480 + }, + { + "epoch": 8.717114184974234, + "grad_norm": 1.440792913442083, + "learning_rate": 4.3587259764414134e-07, + "loss": 0.9094, + "step": 112490 + }, + { + "epoch": 8.71788910845054, + "grad_norm": 1.380385519068456, + "learning_rate": 4.3591134531928086e-07, + "loss": 0.9018, + "step": 112500 + }, + { + "epoch": 8.71788910845054, + "eval_loss": 0.9162014126777649, + "eval_runtime": 329.8078, + "eval_samples_per_second": 34.781, + "eval_steps_per_second": 8.696, + "step": 112500 + }, + { + "epoch": 8.718664031926847, + "grad_norm": 1.4225383877964306, + "learning_rate": 4.3595009299442033e-07, + "loss": 0.8994, + "step": 112510 + }, + { + "epoch": 8.719438955403154, + "grad_norm": 1.387276710017055, + "learning_rate": 4.3598884066955985e-07, + "loss": 0.9131, + "step": 112520 + }, + { + "epoch": 8.720213878879461, + "grad_norm": 1.5027863911545027, + "learning_rate": 4.360275883446993e-07, + "loss": 0.9004, + "step": 112530 + }, + { + "epoch": 8.720988802355768, + "grad_norm": 1.4008719194332804, + "learning_rate": 4.360663360198388e-07, + "loss": 0.8977, + "step": 112540 + }, + { + "epoch": 8.721763725832075, + "grad_norm": 1.4509571956827547, + "learning_rate": 4.361050836949783e-07, + "loss": 0.9214, + "step": 112550 + }, + { + "epoch": 8.722538649308381, + "grad_norm": 1.4343662754900302, + "learning_rate": 4.361438313701178e-07, + "loss": 0.9216, + "step": 112560 + }, + { + "epoch": 8.723313572784688, + "grad_norm": 1.450312263241741, + "learning_rate": 4.361825790452573e-07, + "loss": 0.9215, + "step": 112570 + }, + { + "epoch": 8.724088496260995, + "grad_norm": 1.4343608329905921, + "learning_rate": 4.362213267203968e-07, + "loss": 0.9034, + "step": 112580 + }, + { + "epoch": 8.7248634197373, + "grad_norm": 1.4318105201277793, + "learning_rate": 4.362600743955363e-07, + "loss": 0.8845, + "step": 112590 + }, + { + "epoch": 8.725638343213607, + "grad_norm": 1.4337144575855185, + "learning_rate": 4.3629882207067576e-07, + "loss": 0.9063, + "step": 112600 + }, + { + "epoch": 8.726413266689914, + "grad_norm": 1.456891843102342, + "learning_rate": 4.3633756974581523e-07, + "loss": 0.9014, + "step": 112610 + }, + { + "epoch": 8.72718819016622, + "grad_norm": 1.3783923053248686, + "learning_rate": 4.3637631742095476e-07, + "loss": 0.9093, + "step": 112620 + }, + { + "epoch": 8.727963113642527, + "grad_norm": 1.4480865877025002, + "learning_rate": 4.364150650960942e-07, + "loss": 0.9117, + "step": 112630 + }, + { + "epoch": 8.728738037118834, + "grad_norm": 1.4315310709406754, + "learning_rate": 4.3645381277123375e-07, + "loss": 0.9032, + "step": 112640 + }, + { + "epoch": 8.729512960595141, + "grad_norm": 1.4567160249640667, + "learning_rate": 4.364925604463732e-07, + "loss": 0.9158, + "step": 112650 + }, + { + "epoch": 8.730287884071448, + "grad_norm": 1.349107217688794, + "learning_rate": 4.3653130812151274e-07, + "loss": 0.8983, + "step": 112660 + }, + { + "epoch": 8.731062807547755, + "grad_norm": 1.3981226729291996, + "learning_rate": 4.365700557966522e-07, + "loss": 0.9331, + "step": 112670 + }, + { + "epoch": 8.731837731024061, + "grad_norm": 1.3992110594523637, + "learning_rate": 4.366088034717917e-07, + "loss": 0.9194, + "step": 112680 + }, + { + "epoch": 8.732612654500368, + "grad_norm": 1.5137747627255382, + "learning_rate": 4.366475511469312e-07, + "loss": 0.9166, + "step": 112690 + }, + { + "epoch": 8.733387577976675, + "grad_norm": 1.4942127640059741, + "learning_rate": 4.3668629882207067e-07, + "loss": 0.9019, + "step": 112700 + }, + { + "epoch": 8.734162501452982, + "grad_norm": 1.437089263813211, + "learning_rate": 4.367250464972102e-07, + "loss": 0.894, + "step": 112710 + }, + { + "epoch": 8.734937424929289, + "grad_norm": 1.4629987876754176, + "learning_rate": 4.3676379417234966e-07, + "loss": 0.9047, + "step": 112720 + }, + { + "epoch": 8.735712348405595, + "grad_norm": 1.4011143770977765, + "learning_rate": 4.368025418474892e-07, + "loss": 0.9027, + "step": 112730 + }, + { + "epoch": 8.736487271881902, + "grad_norm": 1.3362690264479502, + "learning_rate": 4.3684128952262865e-07, + "loss": 0.9016, + "step": 112740 + }, + { + "epoch": 8.737262195358209, + "grad_norm": 1.4699916216244227, + "learning_rate": 4.368800371977681e-07, + "loss": 0.8958, + "step": 112750 + }, + { + "epoch": 8.738037118834516, + "grad_norm": 1.397435610376759, + "learning_rate": 4.3691878487290764e-07, + "loss": 0.912, + "step": 112760 + }, + { + "epoch": 8.738812042310823, + "grad_norm": 1.45415075409882, + "learning_rate": 4.369575325480471e-07, + "loss": 0.9196, + "step": 112770 + }, + { + "epoch": 8.73958696578713, + "grad_norm": 1.407895116499108, + "learning_rate": 4.3699628022318663e-07, + "loss": 0.903, + "step": 112780 + }, + { + "epoch": 8.740361889263434, + "grad_norm": 1.4622480469695693, + "learning_rate": 4.370350278983261e-07, + "loss": 0.8856, + "step": 112790 + }, + { + "epoch": 8.741136812739741, + "grad_norm": 1.453408805904414, + "learning_rate": 4.370737755734656e-07, + "loss": 0.9143, + "step": 112800 + }, + { + "epoch": 8.741911736216048, + "grad_norm": 1.5575171719333871, + "learning_rate": 4.371125232486051e-07, + "loss": 0.9044, + "step": 112810 + }, + { + "epoch": 8.742686659692355, + "grad_norm": 1.4233937134777104, + "learning_rate": 4.3715127092374456e-07, + "loss": 0.9136, + "step": 112820 + }, + { + "epoch": 8.743461583168662, + "grad_norm": 1.4471889667093285, + "learning_rate": 4.371900185988841e-07, + "loss": 0.8879, + "step": 112830 + }, + { + "epoch": 8.744236506644969, + "grad_norm": 1.3902731425736197, + "learning_rate": 4.3722876627402355e-07, + "loss": 0.9101, + "step": 112840 + }, + { + "epoch": 8.745011430121275, + "grad_norm": 1.3953624879638222, + "learning_rate": 4.3726751394916307e-07, + "loss": 0.914, + "step": 112850 + }, + { + "epoch": 8.745786353597582, + "grad_norm": 1.4167985187052172, + "learning_rate": 4.3730626162430254e-07, + "loss": 0.9093, + "step": 112860 + }, + { + "epoch": 8.746561277073889, + "grad_norm": 1.479218900012138, + "learning_rate": 4.3734500929944206e-07, + "loss": 0.9138, + "step": 112870 + }, + { + "epoch": 8.747336200550196, + "grad_norm": 1.4567578399906274, + "learning_rate": 4.3738375697458153e-07, + "loss": 0.9089, + "step": 112880 + }, + { + "epoch": 8.748111124026503, + "grad_norm": 1.4274661215195155, + "learning_rate": 4.37422504649721e-07, + "loss": 0.8928, + "step": 112890 + }, + { + "epoch": 8.74888604750281, + "grad_norm": 1.4290287908502564, + "learning_rate": 4.374612523248605e-07, + "loss": 0.9057, + "step": 112900 + }, + { + "epoch": 8.749660970979116, + "grad_norm": 1.4471103645180714, + "learning_rate": 4.375e-07, + "loss": 0.898, + "step": 112910 + }, + { + "epoch": 8.750435894455423, + "grad_norm": 1.583409591761211, + "learning_rate": 4.375387476751395e-07, + "loss": 0.9056, + "step": 112920 + }, + { + "epoch": 8.75121081793173, + "grad_norm": 1.520793282105864, + "learning_rate": 4.37577495350279e-07, + "loss": 0.9155, + "step": 112930 + }, + { + "epoch": 8.751985741408037, + "grad_norm": 1.487126005969643, + "learning_rate": 4.376162430254185e-07, + "loss": 0.9154, + "step": 112940 + }, + { + "epoch": 8.752760664884343, + "grad_norm": 1.4438570712651944, + "learning_rate": 4.37654990700558e-07, + "loss": 0.9043, + "step": 112950 + }, + { + "epoch": 8.753535588360648, + "grad_norm": 1.4226410653586385, + "learning_rate": 4.3769373837569744e-07, + "loss": 0.9122, + "step": 112960 + }, + { + "epoch": 8.754310511836955, + "grad_norm": 1.3633959710977244, + "learning_rate": 4.3773248605083697e-07, + "loss": 0.8895, + "step": 112970 + }, + { + "epoch": 8.755085435313262, + "grad_norm": 1.44738062375348, + "learning_rate": 4.3777123372597644e-07, + "loss": 0.9002, + "step": 112980 + }, + { + "epoch": 8.755860358789569, + "grad_norm": 1.4349669230959863, + "learning_rate": 4.3780998140111596e-07, + "loss": 0.9076, + "step": 112990 + }, + { + "epoch": 8.756635282265876, + "grad_norm": 1.438591700657714, + "learning_rate": 4.3784872907625543e-07, + "loss": 0.9361, + "step": 113000 + }, + { + "epoch": 8.756635282265876, + "eval_loss": 0.9160436987876892, + "eval_runtime": 327.698, + "eval_samples_per_second": 35.005, + "eval_steps_per_second": 8.752, + "step": 113000 + }, + { + "epoch": 8.757410205742183, + "grad_norm": 1.4406252590870507, + "learning_rate": 4.3788747675139495e-07, + "loss": 0.9113, + "step": 113010 + }, + { + "epoch": 8.75818512921849, + "grad_norm": 1.4720361725137128, + "learning_rate": 4.379262244265344e-07, + "loss": 0.9105, + "step": 113020 + }, + { + "epoch": 8.758960052694796, + "grad_norm": 1.5053910065588219, + "learning_rate": 4.379649721016739e-07, + "loss": 0.9027, + "step": 113030 + }, + { + "epoch": 8.759734976171103, + "grad_norm": 1.4328153625373996, + "learning_rate": 4.380037197768134e-07, + "loss": 0.9053, + "step": 113040 + }, + { + "epoch": 8.76050989964741, + "grad_norm": 1.4704588749938161, + "learning_rate": 4.380424674519529e-07, + "loss": 0.9108, + "step": 113050 + }, + { + "epoch": 8.761284823123717, + "grad_norm": 1.4640110754324442, + "learning_rate": 4.380812151270924e-07, + "loss": 0.9025, + "step": 113060 + }, + { + "epoch": 8.762059746600023, + "grad_norm": 1.5348189869285007, + "learning_rate": 4.3811996280223187e-07, + "loss": 0.9059, + "step": 113070 + }, + { + "epoch": 8.76283467007633, + "grad_norm": 1.4869550508737723, + "learning_rate": 4.381587104773714e-07, + "loss": 0.9002, + "step": 113080 + }, + { + "epoch": 8.763609593552637, + "grad_norm": 1.3755081311395971, + "learning_rate": 4.3819745815251086e-07, + "loss": 0.908, + "step": 113090 + }, + { + "epoch": 8.764384517028944, + "grad_norm": 1.4849169633484205, + "learning_rate": 4.3823620582765033e-07, + "loss": 0.9315, + "step": 113100 + }, + { + "epoch": 8.76515944050525, + "grad_norm": 1.4756880191636128, + "learning_rate": 4.3827495350278985e-07, + "loss": 0.8949, + "step": 113110 + }, + { + "epoch": 8.765934363981557, + "grad_norm": 1.3694855115824982, + "learning_rate": 4.383137011779293e-07, + "loss": 0.8985, + "step": 113120 + }, + { + "epoch": 8.766709287457864, + "grad_norm": 1.457614239273947, + "learning_rate": 4.3835244885306884e-07, + "loss": 0.9139, + "step": 113130 + }, + { + "epoch": 8.767484210934171, + "grad_norm": 1.4224746201911225, + "learning_rate": 4.383911965282083e-07, + "loss": 0.905, + "step": 113140 + }, + { + "epoch": 8.768259134410478, + "grad_norm": 1.4941508559738792, + "learning_rate": 4.3842994420334783e-07, + "loss": 0.906, + "step": 113150 + }, + { + "epoch": 8.769034057886783, + "grad_norm": 1.3593313517070582, + "learning_rate": 4.384686918784873e-07, + "loss": 0.8938, + "step": 113160 + }, + { + "epoch": 8.76980898136309, + "grad_norm": 1.5433002810890355, + "learning_rate": 4.3850743955362677e-07, + "loss": 0.906, + "step": 113170 + }, + { + "epoch": 8.770583904839397, + "grad_norm": 1.4285723217288113, + "learning_rate": 4.385461872287663e-07, + "loss": 0.916, + "step": 113180 + }, + { + "epoch": 8.771358828315703, + "grad_norm": 1.4231398279689718, + "learning_rate": 4.3858493490390576e-07, + "loss": 0.8848, + "step": 113190 + }, + { + "epoch": 8.77213375179201, + "grad_norm": 1.412258679788284, + "learning_rate": 4.386236825790453e-07, + "loss": 0.927, + "step": 113200 + }, + { + "epoch": 8.772908675268317, + "grad_norm": 1.5371369011532028, + "learning_rate": 4.3866243025418475e-07, + "loss": 0.9249, + "step": 113210 + }, + { + "epoch": 8.773683598744624, + "grad_norm": 1.585453708442985, + "learning_rate": 4.387011779293243e-07, + "loss": 0.925, + "step": 113220 + }, + { + "epoch": 8.77445852222093, + "grad_norm": 1.4063720691489532, + "learning_rate": 4.3873992560446374e-07, + "loss": 0.9073, + "step": 113230 + }, + { + "epoch": 8.775233445697237, + "grad_norm": 1.4460514201891634, + "learning_rate": 4.387786732796032e-07, + "loss": 0.9025, + "step": 113240 + }, + { + "epoch": 8.776008369173544, + "grad_norm": 1.4190862437085188, + "learning_rate": 4.3881742095474273e-07, + "loss": 0.914, + "step": 113250 + }, + { + "epoch": 8.776783292649851, + "grad_norm": 1.3915839127471936, + "learning_rate": 4.388561686298822e-07, + "loss": 0.9277, + "step": 113260 + }, + { + "epoch": 8.777558216126158, + "grad_norm": 1.3919748415090285, + "learning_rate": 4.388949163050217e-07, + "loss": 0.9178, + "step": 113270 + }, + { + "epoch": 8.778333139602465, + "grad_norm": 1.5311357362345623, + "learning_rate": 4.389336639801612e-07, + "loss": 0.9089, + "step": 113280 + }, + { + "epoch": 8.779108063078771, + "grad_norm": 1.4672917281961582, + "learning_rate": 4.3897241165530066e-07, + "loss": 0.9002, + "step": 113290 + }, + { + "epoch": 8.779882986555078, + "grad_norm": 1.54519883076201, + "learning_rate": 4.390111593304402e-07, + "loss": 0.8985, + "step": 113300 + }, + { + "epoch": 8.780657910031385, + "grad_norm": 1.4993277009628174, + "learning_rate": 4.3904990700557966e-07, + "loss": 0.8962, + "step": 113310 + }, + { + "epoch": 8.781432833507692, + "grad_norm": 1.4165224994212204, + "learning_rate": 4.390886546807192e-07, + "loss": 0.9074, + "step": 113320 + }, + { + "epoch": 8.782207756983997, + "grad_norm": 1.4919604933035553, + "learning_rate": 4.3912740235585865e-07, + "loss": 0.9382, + "step": 113330 + }, + { + "epoch": 8.782982680460304, + "grad_norm": 1.3947048084901836, + "learning_rate": 4.3916615003099817e-07, + "loss": 0.9108, + "step": 113340 + }, + { + "epoch": 8.78375760393661, + "grad_norm": 1.535434675064109, + "learning_rate": 4.3920489770613764e-07, + "loss": 0.9347, + "step": 113350 + }, + { + "epoch": 8.784532527412917, + "grad_norm": 1.402053557984181, + "learning_rate": 4.392436453812771e-07, + "loss": 0.9055, + "step": 113360 + }, + { + "epoch": 8.785307450889224, + "grad_norm": 1.423042317638839, + "learning_rate": 4.3928239305641663e-07, + "loss": 0.8927, + "step": 113370 + }, + { + "epoch": 8.786082374365531, + "grad_norm": 1.427242341005402, + "learning_rate": 4.393211407315561e-07, + "loss": 0.9264, + "step": 113380 + }, + { + "epoch": 8.786857297841838, + "grad_norm": 1.5499331501675178, + "learning_rate": 4.393598884066956e-07, + "loss": 0.924, + "step": 113390 + }, + { + "epoch": 8.787632221318145, + "grad_norm": 1.4165531052896168, + "learning_rate": 4.393986360818351e-07, + "loss": 0.9035, + "step": 113400 + }, + { + "epoch": 8.788407144794451, + "grad_norm": 1.4551468490164712, + "learning_rate": 4.394373837569746e-07, + "loss": 0.9023, + "step": 113410 + }, + { + "epoch": 8.789182068270758, + "grad_norm": 1.4104688584287344, + "learning_rate": 4.394761314321141e-07, + "loss": 0.9083, + "step": 113420 + }, + { + "epoch": 8.789956991747065, + "grad_norm": 1.4617249931087395, + "learning_rate": 4.3951487910725355e-07, + "loss": 0.8971, + "step": 113430 + }, + { + "epoch": 8.790731915223372, + "grad_norm": 1.4914634167619065, + "learning_rate": 4.3955362678239307e-07, + "loss": 0.9218, + "step": 113440 + }, + { + "epoch": 8.791506838699679, + "grad_norm": 1.4335215155574477, + "learning_rate": 4.3959237445753254e-07, + "loss": 0.9039, + "step": 113450 + }, + { + "epoch": 8.792281762175985, + "grad_norm": 1.45430032895748, + "learning_rate": 4.3963112213267206e-07, + "loss": 0.907, + "step": 113460 + }, + { + "epoch": 8.793056685652292, + "grad_norm": 1.3879281082855972, + "learning_rate": 4.3966986980781153e-07, + "loss": 0.9037, + "step": 113470 + }, + { + "epoch": 8.793831609128599, + "grad_norm": 1.4673406297544889, + "learning_rate": 4.3970861748295105e-07, + "loss": 0.8771, + "step": 113480 + }, + { + "epoch": 8.794606532604906, + "grad_norm": 1.5049489018386957, + "learning_rate": 4.397473651580905e-07, + "loss": 0.914, + "step": 113490 + }, + { + "epoch": 8.795381456081213, + "grad_norm": 1.3676116385764292, + "learning_rate": 4.3978611283323e-07, + "loss": 0.902, + "step": 113500 + }, + { + "epoch": 8.795381456081213, + "eval_loss": 0.9154987931251526, + "eval_runtime": 328.0361, + "eval_samples_per_second": 34.969, + "eval_steps_per_second": 8.743, + "step": 113500 + }, + { + "epoch": 8.79615637955752, + "grad_norm": 1.4461001084742107, + "learning_rate": 4.398248605083695e-07, + "loss": 0.9002, + "step": 113510 + }, + { + "epoch": 8.796931303033826, + "grad_norm": 1.4749499441764435, + "learning_rate": 4.39863608183509e-07, + "loss": 0.9191, + "step": 113520 + }, + { + "epoch": 8.797706226510131, + "grad_norm": 1.4662873664482936, + "learning_rate": 4.399023558586485e-07, + "loss": 0.8888, + "step": 113530 + }, + { + "epoch": 8.798481149986438, + "grad_norm": 1.4085022218178653, + "learning_rate": 4.3994110353378797e-07, + "loss": 0.9104, + "step": 113540 + }, + { + "epoch": 8.799256073462745, + "grad_norm": 1.4405021255043686, + "learning_rate": 4.399798512089275e-07, + "loss": 0.914, + "step": 113550 + }, + { + "epoch": 8.800030996939052, + "grad_norm": 1.4684402882542313, + "learning_rate": 4.4001859888406696e-07, + "loss": 0.9103, + "step": 113560 + }, + { + "epoch": 8.800805920415359, + "grad_norm": 1.390341199575908, + "learning_rate": 4.4005734655920643e-07, + "loss": 0.9222, + "step": 113570 + }, + { + "epoch": 8.801580843891665, + "grad_norm": 1.4260836193043116, + "learning_rate": 4.4009609423434595e-07, + "loss": 0.8976, + "step": 113580 + }, + { + "epoch": 8.802355767367972, + "grad_norm": 1.4645002416257837, + "learning_rate": 4.401348419094854e-07, + "loss": 0.8839, + "step": 113590 + }, + { + "epoch": 8.803130690844279, + "grad_norm": 1.467678679200007, + "learning_rate": 4.4017358958462495e-07, + "loss": 0.9013, + "step": 113600 + }, + { + "epoch": 8.803905614320586, + "grad_norm": 1.5193654457258352, + "learning_rate": 4.402123372597644e-07, + "loss": 0.911, + "step": 113610 + }, + { + "epoch": 8.804680537796893, + "grad_norm": 1.4795607739641026, + "learning_rate": 4.4025108493490394e-07, + "loss": 0.9039, + "step": 113620 + }, + { + "epoch": 8.8054554612732, + "grad_norm": 1.4910170416882926, + "learning_rate": 4.402898326100434e-07, + "loss": 0.8893, + "step": 113630 + }, + { + "epoch": 8.806230384749506, + "grad_norm": 1.4508109218696577, + "learning_rate": 4.403285802851829e-07, + "loss": 0.9164, + "step": 113640 + }, + { + "epoch": 8.807005308225813, + "grad_norm": 1.541106446431563, + "learning_rate": 4.403673279603224e-07, + "loss": 0.9139, + "step": 113650 + }, + { + "epoch": 8.80778023170212, + "grad_norm": 1.4351305241137229, + "learning_rate": 4.4040607563546187e-07, + "loss": 0.9002, + "step": 113660 + }, + { + "epoch": 8.808555155178427, + "grad_norm": 1.4541046468485124, + "learning_rate": 4.404448233106014e-07, + "loss": 0.9113, + "step": 113670 + }, + { + "epoch": 8.809330078654733, + "grad_norm": 1.4226694729772957, + "learning_rate": 4.4048357098574086e-07, + "loss": 0.9394, + "step": 113680 + }, + { + "epoch": 8.81010500213104, + "grad_norm": 1.4318653933689736, + "learning_rate": 4.405223186608804e-07, + "loss": 0.9155, + "step": 113690 + }, + { + "epoch": 8.810879925607345, + "grad_norm": 1.5223472655474952, + "learning_rate": 4.4056106633601985e-07, + "loss": 0.9192, + "step": 113700 + }, + { + "epoch": 8.811654849083652, + "grad_norm": 1.4942405048915717, + "learning_rate": 4.405998140111593e-07, + "loss": 0.9178, + "step": 113710 + }, + { + "epoch": 8.812429772559959, + "grad_norm": 1.4868092980012122, + "learning_rate": 4.4063856168629884e-07, + "loss": 0.9104, + "step": 113720 + }, + { + "epoch": 8.813204696036266, + "grad_norm": 1.428863788437385, + "learning_rate": 4.406773093614383e-07, + "loss": 0.9092, + "step": 113730 + }, + { + "epoch": 8.813979619512573, + "grad_norm": 1.4504841753936335, + "learning_rate": 4.4071605703657783e-07, + "loss": 0.9042, + "step": 113740 + }, + { + "epoch": 8.81475454298888, + "grad_norm": 1.4525429275191772, + "learning_rate": 4.407548047117173e-07, + "loss": 0.9176, + "step": 113750 + }, + { + "epoch": 8.815529466465186, + "grad_norm": 1.3333436069564812, + "learning_rate": 4.407935523868568e-07, + "loss": 0.9109, + "step": 113760 + }, + { + "epoch": 8.816304389941493, + "grad_norm": 1.441907214026499, + "learning_rate": 4.408323000619963e-07, + "loss": 0.9296, + "step": 113770 + }, + { + "epoch": 8.8170793134178, + "grad_norm": 1.5019324866994779, + "learning_rate": 4.4087104773713576e-07, + "loss": 0.9295, + "step": 113780 + }, + { + "epoch": 8.817854236894107, + "grad_norm": 1.467063943112749, + "learning_rate": 4.409097954122753e-07, + "loss": 0.9029, + "step": 113790 + }, + { + "epoch": 8.818629160370413, + "grad_norm": 1.4996031779243686, + "learning_rate": 4.4094854308741475e-07, + "loss": 0.9247, + "step": 113800 + }, + { + "epoch": 8.81940408384672, + "grad_norm": 1.430715253454376, + "learning_rate": 4.4098729076255427e-07, + "loss": 0.9182, + "step": 113810 + }, + { + "epoch": 8.820179007323027, + "grad_norm": 1.4570960573788214, + "learning_rate": 4.4102603843769374e-07, + "loss": 0.9198, + "step": 113820 + }, + { + "epoch": 8.820953930799334, + "grad_norm": 1.4809026030560684, + "learning_rate": 4.4106478611283326e-07, + "loss": 0.9088, + "step": 113830 + }, + { + "epoch": 8.82172885427564, + "grad_norm": 1.349658344373874, + "learning_rate": 4.4110353378797273e-07, + "loss": 0.9108, + "step": 113840 + }, + { + "epoch": 8.822503777751947, + "grad_norm": 1.3947396529360392, + "learning_rate": 4.411422814631122e-07, + "loss": 0.895, + "step": 113850 + }, + { + "epoch": 8.823278701228254, + "grad_norm": 1.4770388635611136, + "learning_rate": 4.411810291382517e-07, + "loss": 0.9243, + "step": 113860 + }, + { + "epoch": 8.824053624704561, + "grad_norm": 1.3813648354884818, + "learning_rate": 4.412197768133912e-07, + "loss": 0.8959, + "step": 113870 + }, + { + "epoch": 8.824828548180868, + "grad_norm": 1.4474465971611121, + "learning_rate": 4.412585244885307e-07, + "loss": 0.9204, + "step": 113880 + }, + { + "epoch": 8.825603471657175, + "grad_norm": 1.4309987752505466, + "learning_rate": 4.412972721636702e-07, + "loss": 0.9273, + "step": 113890 + }, + { + "epoch": 8.82637839513348, + "grad_norm": 1.372544988267538, + "learning_rate": 4.413360198388097e-07, + "loss": 0.9046, + "step": 113900 + }, + { + "epoch": 8.827153318609787, + "grad_norm": 1.4050268502870553, + "learning_rate": 4.413747675139492e-07, + "loss": 0.8993, + "step": 113910 + }, + { + "epoch": 8.827928242086093, + "grad_norm": 1.42996973304847, + "learning_rate": 4.4141351518908864e-07, + "loss": 0.908, + "step": 113920 + }, + { + "epoch": 8.8287031655624, + "grad_norm": 1.3765092376077066, + "learning_rate": 4.4145226286422816e-07, + "loss": 0.9133, + "step": 113930 + }, + { + "epoch": 8.829478089038707, + "grad_norm": 1.4066408921243319, + "learning_rate": 4.4149101053936763e-07, + "loss": 0.8874, + "step": 113940 + }, + { + "epoch": 8.830253012515014, + "grad_norm": 1.4223654920044098, + "learning_rate": 4.4152975821450716e-07, + "loss": 0.908, + "step": 113950 + }, + { + "epoch": 8.83102793599132, + "grad_norm": 1.3874210927887145, + "learning_rate": 4.415685058896466e-07, + "loss": 0.913, + "step": 113960 + }, + { + "epoch": 8.831802859467627, + "grad_norm": 1.4451754237028793, + "learning_rate": 4.416072535647861e-07, + "loss": 0.9159, + "step": 113970 + }, + { + "epoch": 8.832577782943934, + "grad_norm": 1.3955728249903183, + "learning_rate": 4.416460012399256e-07, + "loss": 0.9058, + "step": 113980 + }, + { + "epoch": 8.833352706420241, + "grad_norm": 1.381197224689921, + "learning_rate": 4.416847489150651e-07, + "loss": 0.9016, + "step": 113990 + }, + { + "epoch": 8.834127629896548, + "grad_norm": 1.4231491925203916, + "learning_rate": 4.417234965902046e-07, + "loss": 0.902, + "step": 114000 + }, + { + "epoch": 8.834127629896548, + "eval_loss": 0.9153403639793396, + "eval_runtime": 327.6654, + "eval_samples_per_second": 35.008, + "eval_steps_per_second": 8.753, + "step": 114000 + }, + { + "epoch": 8.834902553372855, + "grad_norm": 1.4716855754911844, + "learning_rate": 4.417622442653441e-07, + "loss": 0.9137, + "step": 114010 + }, + { + "epoch": 8.835677476849161, + "grad_norm": 1.284839263149165, + "learning_rate": 4.418009919404836e-07, + "loss": 0.8996, + "step": 114020 + }, + { + "epoch": 8.836452400325468, + "grad_norm": 1.3415520069698017, + "learning_rate": 4.4183973961562307e-07, + "loss": 0.9181, + "step": 114030 + }, + { + "epoch": 8.837227323801775, + "grad_norm": 1.561501029568925, + "learning_rate": 4.4187848729076254e-07, + "loss": 0.9205, + "step": 114040 + }, + { + "epoch": 8.838002247278082, + "grad_norm": 1.5053096838072417, + "learning_rate": 4.4191723496590206e-07, + "loss": 0.9066, + "step": 114050 + }, + { + "epoch": 8.838777170754389, + "grad_norm": 1.4604804739602142, + "learning_rate": 4.4195598264104153e-07, + "loss": 0.9219, + "step": 114060 + }, + { + "epoch": 8.839552094230696, + "grad_norm": 1.4135372440096559, + "learning_rate": 4.4199473031618105e-07, + "loss": 0.9014, + "step": 114070 + }, + { + "epoch": 8.840327017707, + "grad_norm": 1.4391689546365258, + "learning_rate": 4.420334779913205e-07, + "loss": 0.9251, + "step": 114080 + }, + { + "epoch": 8.841101941183307, + "grad_norm": 1.4510141598612207, + "learning_rate": 4.4207222566646004e-07, + "loss": 0.9154, + "step": 114090 + }, + { + "epoch": 8.841876864659614, + "grad_norm": 1.5694028176913546, + "learning_rate": 4.421109733415995e-07, + "loss": 0.9222, + "step": 114100 + }, + { + "epoch": 8.842651788135921, + "grad_norm": 1.3954552780300817, + "learning_rate": 4.42149721016739e-07, + "loss": 0.9063, + "step": 114110 + }, + { + "epoch": 8.843426711612228, + "grad_norm": 1.4467274795691298, + "learning_rate": 4.421884686918785e-07, + "loss": 0.8895, + "step": 114120 + }, + { + "epoch": 8.844201635088535, + "grad_norm": 1.4846463764134707, + "learning_rate": 4.4222721636701797e-07, + "loss": 0.8951, + "step": 114130 + }, + { + "epoch": 8.844976558564841, + "grad_norm": 1.4276172945665468, + "learning_rate": 4.422659640421575e-07, + "loss": 0.8888, + "step": 114140 + }, + { + "epoch": 8.845751482041148, + "grad_norm": 1.5422352412877582, + "learning_rate": 4.4230471171729696e-07, + "loss": 0.9216, + "step": 114150 + }, + { + "epoch": 8.846526405517455, + "grad_norm": 1.5113555344264378, + "learning_rate": 4.423434593924365e-07, + "loss": 0.9048, + "step": 114160 + }, + { + "epoch": 8.847301328993762, + "grad_norm": 1.4473243311008666, + "learning_rate": 4.4238220706757595e-07, + "loss": 0.9052, + "step": 114170 + }, + { + "epoch": 8.848076252470069, + "grad_norm": 1.4130581318148918, + "learning_rate": 4.424209547427154e-07, + "loss": 0.9078, + "step": 114180 + }, + { + "epoch": 8.848851175946375, + "grad_norm": 1.4451768390252322, + "learning_rate": 4.4245970241785494e-07, + "loss": 0.9059, + "step": 114190 + }, + { + "epoch": 8.849626099422682, + "grad_norm": 1.606928671989654, + "learning_rate": 4.424984500929944e-07, + "loss": 0.8961, + "step": 114200 + }, + { + "epoch": 8.850401022898989, + "grad_norm": 1.722982213379894, + "learning_rate": 4.4253719776813393e-07, + "loss": 0.9326, + "step": 114210 + }, + { + "epoch": 8.851175946375296, + "grad_norm": 1.5017957251289566, + "learning_rate": 4.425759454432734e-07, + "loss": 0.9189, + "step": 114220 + }, + { + "epoch": 8.851950869851603, + "grad_norm": 1.3804496720477808, + "learning_rate": 4.426146931184129e-07, + "loss": 0.9117, + "step": 114230 + }, + { + "epoch": 8.85272579332791, + "grad_norm": 1.4778143787376237, + "learning_rate": 4.426534407935524e-07, + "loss": 0.8956, + "step": 114240 + }, + { + "epoch": 8.853500716804216, + "grad_norm": 1.6131475677611233, + "learning_rate": 4.4269218846869186e-07, + "loss": 0.9143, + "step": 114250 + }, + { + "epoch": 8.854275640280523, + "grad_norm": 1.576189830437625, + "learning_rate": 4.427309361438314e-07, + "loss": 0.9072, + "step": 114260 + }, + { + "epoch": 8.855050563756828, + "grad_norm": 1.4141119002356322, + "learning_rate": 4.4276968381897085e-07, + "loss": 0.9201, + "step": 114270 + }, + { + "epoch": 8.855825487233135, + "grad_norm": 1.4664492049384554, + "learning_rate": 4.428084314941104e-07, + "loss": 0.9089, + "step": 114280 + }, + { + "epoch": 8.856600410709442, + "grad_norm": 1.483237088018491, + "learning_rate": 4.4284717916924984e-07, + "loss": 0.8975, + "step": 114290 + }, + { + "epoch": 8.857375334185749, + "grad_norm": 1.4125736119841554, + "learning_rate": 4.4288592684438937e-07, + "loss": 0.9274, + "step": 114300 + }, + { + "epoch": 8.858150257662055, + "grad_norm": 1.4493196948630949, + "learning_rate": 4.4292467451952884e-07, + "loss": 0.9023, + "step": 114310 + }, + { + "epoch": 8.858925181138362, + "grad_norm": 1.3819996580899006, + "learning_rate": 4.429634221946683e-07, + "loss": 0.9034, + "step": 114320 + }, + { + "epoch": 8.859700104614669, + "grad_norm": 1.4589938391759825, + "learning_rate": 4.430021698698078e-07, + "loss": 0.9208, + "step": 114330 + }, + { + "epoch": 8.860475028090976, + "grad_norm": 1.3873938460163673, + "learning_rate": 4.430409175449473e-07, + "loss": 0.9017, + "step": 114340 + }, + { + "epoch": 8.861249951567283, + "grad_norm": 1.4585363391904884, + "learning_rate": 4.430796652200868e-07, + "loss": 0.9176, + "step": 114350 + }, + { + "epoch": 8.86202487504359, + "grad_norm": 1.4062810693527248, + "learning_rate": 4.431184128952263e-07, + "loss": 0.904, + "step": 114360 + }, + { + "epoch": 8.862799798519896, + "grad_norm": 1.501854478812361, + "learning_rate": 4.431571605703658e-07, + "loss": 0.9029, + "step": 114370 + }, + { + "epoch": 8.863574721996203, + "grad_norm": 1.4126614917836116, + "learning_rate": 4.431959082455053e-07, + "loss": 0.906, + "step": 114380 + }, + { + "epoch": 8.86434964547251, + "grad_norm": 1.3862857414929217, + "learning_rate": 4.4323465592064475e-07, + "loss": 0.9135, + "step": 114390 + }, + { + "epoch": 8.865124568948817, + "grad_norm": 1.4397066230910152, + "learning_rate": 4.4327340359578427e-07, + "loss": 0.8978, + "step": 114400 + }, + { + "epoch": 8.865899492425124, + "grad_norm": 1.438322555073989, + "learning_rate": 4.4331215127092374e-07, + "loss": 0.9081, + "step": 114410 + }, + { + "epoch": 8.86667441590143, + "grad_norm": 1.3018738512645442, + "learning_rate": 4.4335089894606326e-07, + "loss": 0.8813, + "step": 114420 + }, + { + "epoch": 8.867449339377737, + "grad_norm": 1.4750389256228935, + "learning_rate": 4.4338964662120273e-07, + "loss": 0.9282, + "step": 114430 + }, + { + "epoch": 8.868224262854044, + "grad_norm": 1.4402422957440253, + "learning_rate": 4.4342839429634225e-07, + "loss": 0.8933, + "step": 114440 + }, + { + "epoch": 8.868999186330349, + "grad_norm": 1.4039123547953944, + "learning_rate": 4.434671419714817e-07, + "loss": 0.9141, + "step": 114450 + }, + { + "epoch": 8.869774109806656, + "grad_norm": 1.3944649260109687, + "learning_rate": 4.435058896466212e-07, + "loss": 0.911, + "step": 114460 + }, + { + "epoch": 8.870549033282963, + "grad_norm": 1.3777604086135393, + "learning_rate": 4.435446373217607e-07, + "loss": 0.9123, + "step": 114470 + }, + { + "epoch": 8.87132395675927, + "grad_norm": 1.4064838754386393, + "learning_rate": 4.435833849969002e-07, + "loss": 0.9165, + "step": 114480 + }, + { + "epoch": 8.872098880235576, + "grad_norm": 1.5216805855050841, + "learning_rate": 4.436221326720397e-07, + "loss": 0.9295, + "step": 114490 + }, + { + "epoch": 8.872873803711883, + "grad_norm": 1.434849416873383, + "learning_rate": 4.4366088034717917e-07, + "loss": 0.8964, + "step": 114500 + }, + { + "epoch": 8.872873803711883, + "eval_loss": 0.9151089787483215, + "eval_runtime": 327.5698, + "eval_samples_per_second": 35.018, + "eval_steps_per_second": 8.755, + "step": 114500 + }, + { + "epoch": 8.87364872718819, + "grad_norm": 1.429234249569384, + "learning_rate": 4.436996280223187e-07, + "loss": 0.9079, + "step": 114510 + }, + { + "epoch": 8.874423650664497, + "grad_norm": 1.4074596596745952, + "learning_rate": 4.4373837569745816e-07, + "loss": 0.8986, + "step": 114520 + }, + { + "epoch": 8.875198574140803, + "grad_norm": 1.3880320720525725, + "learning_rate": 4.4377712337259763e-07, + "loss": 0.9163, + "step": 114530 + }, + { + "epoch": 8.87597349761711, + "grad_norm": 1.4011551165415466, + "learning_rate": 4.4381587104773715e-07, + "loss": 0.8883, + "step": 114540 + }, + { + "epoch": 8.876748421093417, + "grad_norm": 1.427359517738672, + "learning_rate": 4.438546187228766e-07, + "loss": 0.916, + "step": 114550 + }, + { + "epoch": 8.877523344569724, + "grad_norm": 1.3983671426092121, + "learning_rate": 4.4389336639801614e-07, + "loss": 0.9105, + "step": 114560 + }, + { + "epoch": 8.87829826804603, + "grad_norm": 1.4358614550895101, + "learning_rate": 4.439321140731556e-07, + "loss": 0.9297, + "step": 114570 + }, + { + "epoch": 8.879073191522338, + "grad_norm": 1.4250877881784887, + "learning_rate": 4.4397086174829513e-07, + "loss": 0.9227, + "step": 114580 + }, + { + "epoch": 8.879848114998644, + "grad_norm": 1.4308038577623896, + "learning_rate": 4.440096094234346e-07, + "loss": 0.9016, + "step": 114590 + }, + { + "epoch": 8.880623038474951, + "grad_norm": 1.5028477667844016, + "learning_rate": 4.4404835709857407e-07, + "loss": 0.9256, + "step": 114600 + }, + { + "epoch": 8.881397961951258, + "grad_norm": 1.438827841298675, + "learning_rate": 4.440871047737136e-07, + "loss": 0.9031, + "step": 114610 + }, + { + "epoch": 8.882172885427565, + "grad_norm": 1.4952619556753146, + "learning_rate": 4.4412585244885306e-07, + "loss": 0.8997, + "step": 114620 + }, + { + "epoch": 8.882947808903872, + "grad_norm": 1.4143386827168911, + "learning_rate": 4.441646001239926e-07, + "loss": 0.8979, + "step": 114630 + }, + { + "epoch": 8.883722732380178, + "grad_norm": 1.3601332268805522, + "learning_rate": 4.4420334779913205e-07, + "loss": 0.9163, + "step": 114640 + }, + { + "epoch": 8.884497655856483, + "grad_norm": 1.4065475277994572, + "learning_rate": 4.442420954742716e-07, + "loss": 0.8856, + "step": 114650 + }, + { + "epoch": 8.88527257933279, + "grad_norm": 1.444758020308265, + "learning_rate": 4.4428084314941105e-07, + "loss": 0.9279, + "step": 114660 + }, + { + "epoch": 8.886047502809097, + "grad_norm": 1.467496843511529, + "learning_rate": 4.443195908245505e-07, + "loss": 0.8952, + "step": 114670 + }, + { + "epoch": 8.886822426285404, + "grad_norm": 1.3963393181953974, + "learning_rate": 4.4435833849969004e-07, + "loss": 0.9223, + "step": 114680 + }, + { + "epoch": 8.88759734976171, + "grad_norm": 1.5356976193905059, + "learning_rate": 4.443970861748295e-07, + "loss": 0.9261, + "step": 114690 + }, + { + "epoch": 8.888372273238017, + "grad_norm": 1.545182924528815, + "learning_rate": 4.4443583384996903e-07, + "loss": 0.8964, + "step": 114700 + }, + { + "epoch": 8.889147196714324, + "grad_norm": 1.379933900397908, + "learning_rate": 4.444745815251085e-07, + "loss": 0.9116, + "step": 114710 + }, + { + "epoch": 8.889922120190631, + "grad_norm": 1.443638773946937, + "learning_rate": 4.4451332920024797e-07, + "loss": 0.918, + "step": 114720 + }, + { + "epoch": 8.890697043666938, + "grad_norm": 1.4738523853324164, + "learning_rate": 4.445520768753875e-07, + "loss": 0.8865, + "step": 114730 + }, + { + "epoch": 8.891471967143245, + "grad_norm": 1.463478872511849, + "learning_rate": 4.4459082455052696e-07, + "loss": 0.93, + "step": 114740 + }, + { + "epoch": 8.892246890619552, + "grad_norm": 1.5256408099590641, + "learning_rate": 4.446295722256665e-07, + "loss": 0.917, + "step": 114750 + }, + { + "epoch": 8.893021814095858, + "grad_norm": 1.3624462713351222, + "learning_rate": 4.4466831990080595e-07, + "loss": 0.9178, + "step": 114760 + }, + { + "epoch": 8.893796737572165, + "grad_norm": 1.3158615229862425, + "learning_rate": 4.4470706757594547e-07, + "loss": 0.914, + "step": 114770 + }, + { + "epoch": 8.894571661048472, + "grad_norm": 1.4266685303701845, + "learning_rate": 4.4474581525108494e-07, + "loss": 0.8916, + "step": 114780 + }, + { + "epoch": 8.895346584524779, + "grad_norm": 1.5094912156474005, + "learning_rate": 4.447845629262244e-07, + "loss": 0.9003, + "step": 114790 + }, + { + "epoch": 8.896121508001086, + "grad_norm": 1.4413286160998477, + "learning_rate": 4.4482331060136393e-07, + "loss": 0.9269, + "step": 114800 + }, + { + "epoch": 8.896896431477392, + "grad_norm": 1.4471121900123671, + "learning_rate": 4.448620582765034e-07, + "loss": 0.8911, + "step": 114810 + }, + { + "epoch": 8.897671354953697, + "grad_norm": 1.3469646009025964, + "learning_rate": 4.449008059516429e-07, + "loss": 0.9164, + "step": 114820 + }, + { + "epoch": 8.898446278430004, + "grad_norm": 1.4996899977609521, + "learning_rate": 4.449395536267824e-07, + "loss": 0.8973, + "step": 114830 + }, + { + "epoch": 8.899221201906311, + "grad_norm": 1.5526211902277356, + "learning_rate": 4.449783013019219e-07, + "loss": 0.8922, + "step": 114840 + }, + { + "epoch": 8.899996125382618, + "grad_norm": 1.4708115528824897, + "learning_rate": 4.450170489770614e-07, + "loss": 0.9303, + "step": 114850 + }, + { + "epoch": 8.900771048858925, + "grad_norm": 1.4854131613432096, + "learning_rate": 4.4505579665220085e-07, + "loss": 0.9133, + "step": 114860 + }, + { + "epoch": 8.901545972335231, + "grad_norm": 1.4983653884572126, + "learning_rate": 4.4509454432734037e-07, + "loss": 0.9009, + "step": 114870 + }, + { + "epoch": 8.902320895811538, + "grad_norm": 1.4315569438089288, + "learning_rate": 4.4513329200247984e-07, + "loss": 0.9025, + "step": 114880 + }, + { + "epoch": 8.903095819287845, + "grad_norm": 1.4368909893499464, + "learning_rate": 4.4517203967761936e-07, + "loss": 0.9372, + "step": 114890 + }, + { + "epoch": 8.903870742764152, + "grad_norm": 1.376454290002246, + "learning_rate": 4.4521078735275883e-07, + "loss": 0.875, + "step": 114900 + }, + { + "epoch": 8.904645666240459, + "grad_norm": 1.4799379159787478, + "learning_rate": 4.4524953502789835e-07, + "loss": 0.8985, + "step": 114910 + }, + { + "epoch": 8.905420589716766, + "grad_norm": 1.475629778438026, + "learning_rate": 4.452882827030378e-07, + "loss": 0.9082, + "step": 114920 + }, + { + "epoch": 8.906195513193072, + "grad_norm": 1.4932599432821931, + "learning_rate": 4.453270303781773e-07, + "loss": 0.8948, + "step": 114930 + }, + { + "epoch": 8.90697043666938, + "grad_norm": 1.4134032728983363, + "learning_rate": 4.453657780533168e-07, + "loss": 0.8915, + "step": 114940 + }, + { + "epoch": 8.907745360145686, + "grad_norm": 1.3556496011453802, + "learning_rate": 4.454045257284563e-07, + "loss": 0.8929, + "step": 114950 + }, + { + "epoch": 8.908520283621993, + "grad_norm": 1.427243424844765, + "learning_rate": 4.454432734035958e-07, + "loss": 0.917, + "step": 114960 + }, + { + "epoch": 8.9092952070983, + "grad_norm": 1.3800233969387652, + "learning_rate": 4.454820210787353e-07, + "loss": 0.9207, + "step": 114970 + }, + { + "epoch": 8.910070130574606, + "grad_norm": 1.4246617388559548, + "learning_rate": 4.455207687538748e-07, + "loss": 0.8907, + "step": 114980 + }, + { + "epoch": 8.910845054050913, + "grad_norm": 1.5292376048132343, + "learning_rate": 4.4555951642901427e-07, + "loss": 0.8996, + "step": 114990 + }, + { + "epoch": 8.91161997752722, + "grad_norm": 1.3962819929988235, + "learning_rate": 4.4559826410415373e-07, + "loss": 0.9087, + "step": 115000 + }, + { + "epoch": 8.91161997752722, + "eval_loss": 0.9147964119911194, + "eval_runtime": 327.8726, + "eval_samples_per_second": 34.986, + "eval_steps_per_second": 8.747, + "step": 115000 + }, + { + "epoch": 8.912394901003527, + "grad_norm": 1.4299434465247653, + "learning_rate": 4.4563701177929326e-07, + "loss": 0.9265, + "step": 115010 + }, + { + "epoch": 8.913169824479832, + "grad_norm": 1.4740011573391032, + "learning_rate": 4.456757594544327e-07, + "loss": 0.8916, + "step": 115020 + }, + { + "epoch": 8.913944747956139, + "grad_norm": 1.526129577052168, + "learning_rate": 4.4571450712957225e-07, + "loss": 0.9199, + "step": 115030 + }, + { + "epoch": 8.914719671432445, + "grad_norm": 1.4180744322715002, + "learning_rate": 4.457532548047117e-07, + "loss": 0.9222, + "step": 115040 + }, + { + "epoch": 8.915494594908752, + "grad_norm": 1.4640531401319838, + "learning_rate": 4.4579200247985124e-07, + "loss": 0.9079, + "step": 115050 + }, + { + "epoch": 8.916269518385059, + "grad_norm": 1.3769361410863639, + "learning_rate": 4.458307501549907e-07, + "loss": 0.9232, + "step": 115060 + }, + { + "epoch": 8.917044441861366, + "grad_norm": 1.4677560787216954, + "learning_rate": 4.458694978301302e-07, + "loss": 0.9116, + "step": 115070 + }, + { + "epoch": 8.917819365337673, + "grad_norm": 1.376484033071169, + "learning_rate": 4.459082455052697e-07, + "loss": 0.8989, + "step": 115080 + }, + { + "epoch": 8.91859428881398, + "grad_norm": 1.4919015110960683, + "learning_rate": 4.4594699318040917e-07, + "loss": 0.8965, + "step": 115090 + }, + { + "epoch": 8.919369212290286, + "grad_norm": 1.4157776313270607, + "learning_rate": 4.459857408555487e-07, + "loss": 0.9025, + "step": 115100 + }, + { + "epoch": 8.920144135766593, + "grad_norm": 1.390080031799455, + "learning_rate": 4.4602448853068816e-07, + "loss": 0.8953, + "step": 115110 + }, + { + "epoch": 8.9209190592429, + "grad_norm": 1.529872608815007, + "learning_rate": 4.460632362058277e-07, + "loss": 0.9092, + "step": 115120 + }, + { + "epoch": 8.921693982719207, + "grad_norm": 1.47564892612783, + "learning_rate": 4.4610198388096715e-07, + "loss": 0.9225, + "step": 115130 + }, + { + "epoch": 8.922468906195514, + "grad_norm": 1.402789639982149, + "learning_rate": 4.461407315561066e-07, + "loss": 0.9003, + "step": 115140 + }, + { + "epoch": 8.92324382967182, + "grad_norm": 1.3618583866692282, + "learning_rate": 4.4617947923124614e-07, + "loss": 0.8932, + "step": 115150 + }, + { + "epoch": 8.924018753148127, + "grad_norm": 1.4128237002985762, + "learning_rate": 4.462182269063856e-07, + "loss": 0.8855, + "step": 115160 + }, + { + "epoch": 8.924793676624434, + "grad_norm": 1.384034997352597, + "learning_rate": 4.4625697458152513e-07, + "loss": 0.9128, + "step": 115170 + }, + { + "epoch": 8.92556860010074, + "grad_norm": 1.4913585521783848, + "learning_rate": 4.462957222566646e-07, + "loss": 0.9132, + "step": 115180 + }, + { + "epoch": 8.926343523577046, + "grad_norm": 1.3927498018088516, + "learning_rate": 4.463344699318041e-07, + "loss": 0.8933, + "step": 115190 + }, + { + "epoch": 8.927118447053353, + "grad_norm": 1.467165109436078, + "learning_rate": 4.463732176069436e-07, + "loss": 0.931, + "step": 115200 + }, + { + "epoch": 8.92789337052966, + "grad_norm": 1.393933349962808, + "learning_rate": 4.4641196528208306e-07, + "loss": 0.8919, + "step": 115210 + }, + { + "epoch": 8.928668294005966, + "grad_norm": 1.434341143574282, + "learning_rate": 4.464507129572226e-07, + "loss": 0.9221, + "step": 115220 + }, + { + "epoch": 8.929443217482273, + "grad_norm": 1.4547019689991307, + "learning_rate": 4.4648946063236205e-07, + "loss": 0.9216, + "step": 115230 + }, + { + "epoch": 8.93021814095858, + "grad_norm": 1.4027791710628403, + "learning_rate": 4.465282083075016e-07, + "loss": 0.9099, + "step": 115240 + }, + { + "epoch": 8.930993064434887, + "grad_norm": 1.4839286926266024, + "learning_rate": 4.4656695598264104e-07, + "loss": 0.8994, + "step": 115250 + }, + { + "epoch": 8.931767987911194, + "grad_norm": 1.43884291135401, + "learning_rate": 4.4660570365778056e-07, + "loss": 0.9085, + "step": 115260 + }, + { + "epoch": 8.9325429113875, + "grad_norm": 1.442658785157403, + "learning_rate": 4.4664445133292003e-07, + "loss": 0.8979, + "step": 115270 + }, + { + "epoch": 8.933317834863807, + "grad_norm": 1.472464690774143, + "learning_rate": 4.466831990080595e-07, + "loss": 0.9156, + "step": 115280 + }, + { + "epoch": 8.934092758340114, + "grad_norm": 1.4112542024249677, + "learning_rate": 4.46721946683199e-07, + "loss": 0.9099, + "step": 115290 + }, + { + "epoch": 8.93486768181642, + "grad_norm": 1.4435542023875705, + "learning_rate": 4.467606943583385e-07, + "loss": 0.9013, + "step": 115300 + }, + { + "epoch": 8.935642605292728, + "grad_norm": 1.547751502341918, + "learning_rate": 4.46799442033478e-07, + "loss": 0.9057, + "step": 115310 + }, + { + "epoch": 8.936417528769034, + "grad_norm": 1.4172072365229684, + "learning_rate": 4.468381897086175e-07, + "loss": 0.8913, + "step": 115320 + }, + { + "epoch": 8.937192452245341, + "grad_norm": 1.413658740198392, + "learning_rate": 4.46876937383757e-07, + "loss": 0.9148, + "step": 115330 + }, + { + "epoch": 8.937967375721648, + "grad_norm": 1.459785960044392, + "learning_rate": 4.469156850588965e-07, + "loss": 0.9098, + "step": 115340 + }, + { + "epoch": 8.938742299197955, + "grad_norm": 1.4759778161004775, + "learning_rate": 4.4695443273403594e-07, + "loss": 0.9116, + "step": 115350 + }, + { + "epoch": 8.939517222674262, + "grad_norm": 1.4517610091315198, + "learning_rate": 4.4699318040917547e-07, + "loss": 0.899, + "step": 115360 + }, + { + "epoch": 8.940292146150568, + "grad_norm": 1.4037530718758906, + "learning_rate": 4.4703192808431494e-07, + "loss": 0.9099, + "step": 115370 + }, + { + "epoch": 8.941067069626875, + "grad_norm": 1.4838175009408308, + "learning_rate": 4.4707067575945446e-07, + "loss": 0.9322, + "step": 115380 + }, + { + "epoch": 8.94184199310318, + "grad_norm": 1.412804811255057, + "learning_rate": 4.4710942343459393e-07, + "loss": 0.923, + "step": 115390 + }, + { + "epoch": 8.942616916579487, + "grad_norm": 1.4455932407990282, + "learning_rate": 4.4714817110973345e-07, + "loss": 0.9086, + "step": 115400 + }, + { + "epoch": 8.943391840055794, + "grad_norm": 1.5064326716609218, + "learning_rate": 4.471869187848729e-07, + "loss": 0.8944, + "step": 115410 + }, + { + "epoch": 8.9441667635321, + "grad_norm": 1.4104330165360421, + "learning_rate": 4.472256664600124e-07, + "loss": 0.9388, + "step": 115420 + }, + { + "epoch": 8.944941687008408, + "grad_norm": 1.4685503784602194, + "learning_rate": 4.472644141351519e-07, + "loss": 0.9144, + "step": 115430 + }, + { + "epoch": 8.945716610484714, + "grad_norm": 1.389349759928285, + "learning_rate": 4.473031618102914e-07, + "loss": 0.8894, + "step": 115440 + }, + { + "epoch": 8.946491533961021, + "grad_norm": 1.380637271911574, + "learning_rate": 4.473419094854309e-07, + "loss": 0.9092, + "step": 115450 + }, + { + "epoch": 8.947266457437328, + "grad_norm": 1.561869701262999, + "learning_rate": 4.4738065716057037e-07, + "loss": 0.916, + "step": 115460 + }, + { + "epoch": 8.948041380913635, + "grad_norm": 1.4198513468012883, + "learning_rate": 4.4741940483570984e-07, + "loss": 0.9068, + "step": 115470 + }, + { + "epoch": 8.948816304389942, + "grad_norm": 1.4164284552334323, + "learning_rate": 4.4745815251084936e-07, + "loss": 0.8985, + "step": 115480 + }, + { + "epoch": 8.949591227866248, + "grad_norm": 1.4967241684151837, + "learning_rate": 4.4749690018598883e-07, + "loss": 0.8928, + "step": 115490 + }, + { + "epoch": 8.950366151342555, + "grad_norm": 1.5140069505360643, + "learning_rate": 4.4753564786112835e-07, + "loss": 0.9059, + "step": 115500 + }, + { + "epoch": 8.950366151342555, + "eval_loss": 0.9145816564559937, + "eval_runtime": 329.0869, + "eval_samples_per_second": 34.857, + "eval_steps_per_second": 8.715, + "step": 115500 + }, + { + "epoch": 8.951141074818862, + "grad_norm": 2.002895827880215, + "learning_rate": 4.475743955362678e-07, + "loss": 0.9223, + "step": 115510 + }, + { + "epoch": 8.951915998295169, + "grad_norm": 1.4088403183361726, + "learning_rate": 4.4761314321140734e-07, + "loss": 0.9191, + "step": 115520 + }, + { + "epoch": 8.952690921771476, + "grad_norm": 1.4685135266983869, + "learning_rate": 4.476518908865468e-07, + "loss": 0.9055, + "step": 115530 + }, + { + "epoch": 8.953465845247782, + "grad_norm": 1.5209114072207188, + "learning_rate": 4.476906385616863e-07, + "loss": 0.9106, + "step": 115540 + }, + { + "epoch": 8.95424076872409, + "grad_norm": 1.4978552819913016, + "learning_rate": 4.477293862368258e-07, + "loss": 0.9308, + "step": 115550 + }, + { + "epoch": 8.955015692200394, + "grad_norm": 1.387954870679663, + "learning_rate": 4.4776813391196527e-07, + "loss": 0.9182, + "step": 115560 + }, + { + "epoch": 8.955790615676701, + "grad_norm": 1.3617799216661615, + "learning_rate": 4.478068815871048e-07, + "loss": 0.9184, + "step": 115570 + }, + { + "epoch": 8.956565539153008, + "grad_norm": 1.4990845623039135, + "learning_rate": 4.4784562926224426e-07, + "loss": 0.921, + "step": 115580 + }, + { + "epoch": 8.957340462629315, + "grad_norm": 1.4609649048418125, + "learning_rate": 4.478843769373838e-07, + "loss": 0.8894, + "step": 115590 + }, + { + "epoch": 8.958115386105622, + "grad_norm": 1.4478176362177986, + "learning_rate": 4.4792312461252325e-07, + "loss": 0.899, + "step": 115600 + }, + { + "epoch": 8.958890309581928, + "grad_norm": 1.371298553372575, + "learning_rate": 4.479618722876627e-07, + "loss": 0.9057, + "step": 115610 + }, + { + "epoch": 8.959665233058235, + "grad_norm": 1.4209409316706658, + "learning_rate": 4.4800061996280224e-07, + "loss": 0.9136, + "step": 115620 + }, + { + "epoch": 8.960440156534542, + "grad_norm": 1.4745778020110976, + "learning_rate": 4.480393676379417e-07, + "loss": 0.91, + "step": 115630 + }, + { + "epoch": 8.961215080010849, + "grad_norm": 1.4229901732719055, + "learning_rate": 4.4807811531308124e-07, + "loss": 0.9055, + "step": 115640 + }, + { + "epoch": 8.961990003487156, + "grad_norm": 1.3828316491918553, + "learning_rate": 4.481168629882207e-07, + "loss": 0.9036, + "step": 115650 + }, + { + "epoch": 8.962764926963462, + "grad_norm": 1.4306165928897123, + "learning_rate": 4.481556106633602e-07, + "loss": 0.904, + "step": 115660 + }, + { + "epoch": 8.96353985043977, + "grad_norm": 1.4227723918364719, + "learning_rate": 4.481943583384997e-07, + "loss": 0.9369, + "step": 115670 + }, + { + "epoch": 8.964314773916076, + "grad_norm": 1.3748976184408972, + "learning_rate": 4.4823310601363916e-07, + "loss": 0.9309, + "step": 115680 + }, + { + "epoch": 8.965089697392383, + "grad_norm": 1.402405932952367, + "learning_rate": 4.482718536887787e-07, + "loss": 0.8931, + "step": 115690 + }, + { + "epoch": 8.96586462086869, + "grad_norm": 1.4562388873532242, + "learning_rate": 4.4831060136391816e-07, + "loss": 0.9068, + "step": 115700 + }, + { + "epoch": 8.966639544344996, + "grad_norm": 1.4551510014139437, + "learning_rate": 4.483493490390577e-07, + "loss": 0.8991, + "step": 115710 + }, + { + "epoch": 8.967414467821303, + "grad_norm": 1.4728327485764703, + "learning_rate": 4.4838809671419715e-07, + "loss": 0.9059, + "step": 115720 + }, + { + "epoch": 8.96818939129761, + "grad_norm": 1.3592361444664232, + "learning_rate": 4.4842684438933667e-07, + "loss": 0.9071, + "step": 115730 + }, + { + "epoch": 8.968964314773917, + "grad_norm": 1.4605373185124235, + "learning_rate": 4.4846559206447614e-07, + "loss": 0.9253, + "step": 115740 + }, + { + "epoch": 8.969739238250224, + "grad_norm": 1.383183404238102, + "learning_rate": 4.485043397396156e-07, + "loss": 0.9027, + "step": 115750 + }, + { + "epoch": 8.970514161726529, + "grad_norm": 1.5118161549700566, + "learning_rate": 4.4854308741475513e-07, + "loss": 0.889, + "step": 115760 + }, + { + "epoch": 8.971289085202836, + "grad_norm": 1.3975615757175415, + "learning_rate": 4.485818350898946e-07, + "loss": 0.8843, + "step": 115770 + }, + { + "epoch": 8.972064008679142, + "grad_norm": 1.4740694019014433, + "learning_rate": 4.486205827650341e-07, + "loss": 0.8909, + "step": 115780 + }, + { + "epoch": 8.97283893215545, + "grad_norm": 1.4148394319392992, + "learning_rate": 4.486593304401736e-07, + "loss": 0.8973, + "step": 115790 + }, + { + "epoch": 8.973613855631756, + "grad_norm": 1.4235856837054566, + "learning_rate": 4.486980781153131e-07, + "loss": 0.8907, + "step": 115800 + }, + { + "epoch": 8.974388779108063, + "grad_norm": 1.4683508872502584, + "learning_rate": 4.487368257904526e-07, + "loss": 0.9021, + "step": 115810 + }, + { + "epoch": 8.97516370258437, + "grad_norm": 1.490466955427777, + "learning_rate": 4.4877557346559205e-07, + "loss": 0.9078, + "step": 115820 + }, + { + "epoch": 8.975938626060676, + "grad_norm": 1.5025345185518886, + "learning_rate": 4.4881432114073157e-07, + "loss": 0.9108, + "step": 115830 + }, + { + "epoch": 8.976713549536983, + "grad_norm": 1.3818046999418647, + "learning_rate": 4.4885306881587104e-07, + "loss": 0.8948, + "step": 115840 + }, + { + "epoch": 8.97748847301329, + "grad_norm": 1.3951808625442432, + "learning_rate": 4.4889181649101056e-07, + "loss": 0.9201, + "step": 115850 + }, + { + "epoch": 8.978263396489597, + "grad_norm": 1.441368052721329, + "learning_rate": 4.4893056416615003e-07, + "loss": 0.9268, + "step": 115860 + }, + { + "epoch": 8.979038319965904, + "grad_norm": 1.4021572696623077, + "learning_rate": 4.4896931184128955e-07, + "loss": 0.8988, + "step": 115870 + }, + { + "epoch": 8.97981324344221, + "grad_norm": 1.4778556944653298, + "learning_rate": 4.49008059516429e-07, + "loss": 0.9174, + "step": 115880 + }, + { + "epoch": 8.980588166918517, + "grad_norm": 1.462026682517415, + "learning_rate": 4.490468071915685e-07, + "loss": 0.9165, + "step": 115890 + }, + { + "epoch": 8.981363090394824, + "grad_norm": 1.4014503269891418, + "learning_rate": 4.49085554866708e-07, + "loss": 0.9114, + "step": 115900 + }, + { + "epoch": 8.98213801387113, + "grad_norm": 1.471088876045274, + "learning_rate": 4.491243025418475e-07, + "loss": 0.888, + "step": 115910 + }, + { + "epoch": 8.982912937347438, + "grad_norm": 1.3808388792787143, + "learning_rate": 4.49163050216987e-07, + "loss": 0.9159, + "step": 115920 + }, + { + "epoch": 8.983687860823744, + "grad_norm": 1.4833499209225904, + "learning_rate": 4.4920179789212647e-07, + "loss": 0.9043, + "step": 115930 + }, + { + "epoch": 8.98446278430005, + "grad_norm": 1.4326934933717954, + "learning_rate": 4.49240545567266e-07, + "loss": 0.9165, + "step": 115940 + }, + { + "epoch": 8.985237707776356, + "grad_norm": 1.4511034972335182, + "learning_rate": 4.4927929324240546e-07, + "loss": 0.9102, + "step": 115950 + }, + { + "epoch": 8.986012631252663, + "grad_norm": 1.3696955244758153, + "learning_rate": 4.4931804091754493e-07, + "loss": 0.895, + "step": 115960 + }, + { + "epoch": 8.98678755472897, + "grad_norm": 1.5371214275494285, + "learning_rate": 4.4935678859268445e-07, + "loss": 0.9038, + "step": 115970 + }, + { + "epoch": 8.987562478205277, + "grad_norm": 1.369053982307693, + "learning_rate": 4.493955362678239e-07, + "loss": 0.9135, + "step": 115980 + }, + { + "epoch": 8.988337401681584, + "grad_norm": 1.3452263569432112, + "learning_rate": 4.4943428394296345e-07, + "loss": 0.9037, + "step": 115990 + }, + { + "epoch": 8.98911232515789, + "grad_norm": 1.4877087576308852, + "learning_rate": 4.494730316181029e-07, + "loss": 0.9017, + "step": 116000 + }, + { + "epoch": 8.98911232515789, + "eval_loss": 0.9143242239952087, + "eval_runtime": 329.4856, + "eval_samples_per_second": 34.815, + "eval_steps_per_second": 8.704, + "step": 116000 + }, + { + "epoch": 8.989887248634197, + "grad_norm": 1.418112794026667, + "learning_rate": 4.4951177929324244e-07, + "loss": 0.8876, + "step": 116010 + }, + { + "epoch": 8.990662172110504, + "grad_norm": 1.5607566678226839, + "learning_rate": 4.495505269683819e-07, + "loss": 0.8928, + "step": 116020 + }, + { + "epoch": 8.99143709558681, + "grad_norm": 1.5246432312746268, + "learning_rate": 4.495892746435214e-07, + "loss": 0.9272, + "step": 116030 + }, + { + "epoch": 8.992212019063118, + "grad_norm": 1.4416959489263395, + "learning_rate": 4.496280223186609e-07, + "loss": 0.8792, + "step": 116040 + }, + { + "epoch": 8.992986942539424, + "grad_norm": 1.4800717291299648, + "learning_rate": 4.4966676999380037e-07, + "loss": 0.9221, + "step": 116050 + }, + { + "epoch": 8.993761866015731, + "grad_norm": 1.4270880433578443, + "learning_rate": 4.497055176689399e-07, + "loss": 0.8878, + "step": 116060 + }, + { + "epoch": 8.994536789492038, + "grad_norm": 1.5014650413016757, + "learning_rate": 4.4974426534407936e-07, + "loss": 0.9066, + "step": 116070 + }, + { + "epoch": 8.995311712968345, + "grad_norm": 1.5200294788408903, + "learning_rate": 4.497830130192189e-07, + "loss": 0.9076, + "step": 116080 + }, + { + "epoch": 8.996086636444652, + "grad_norm": 1.3744252603877471, + "learning_rate": 4.4982176069435835e-07, + "loss": 0.9238, + "step": 116090 + }, + { + "epoch": 8.996861559920958, + "grad_norm": 1.401242274566386, + "learning_rate": 4.498605083694978e-07, + "loss": 0.9254, + "step": 116100 + }, + { + "epoch": 8.997636483397265, + "grad_norm": 1.4577446964571117, + "learning_rate": 4.4989925604463734e-07, + "loss": 0.8923, + "step": 116110 + }, + { + "epoch": 8.998411406873572, + "grad_norm": 1.5333982080500947, + "learning_rate": 4.499380037197768e-07, + "loss": 0.9186, + "step": 116120 + }, + { + "epoch": 8.999186330349877, + "grad_norm": 1.4624317512728373, + "learning_rate": 4.4997675139491633e-07, + "loss": 0.9037, + "step": 116130 + }, + { + "epoch": 8.999961253826184, + "grad_norm": 1.5054249563861615, + "learning_rate": 4.500154990700558e-07, + "loss": 0.9059, + "step": 116140 + }, + { + "epoch": 9.00073617730249, + "grad_norm": 1.4011865615930752, + "learning_rate": 4.500542467451953e-07, + "loss": 0.9164, + "step": 116150 + }, + { + "epoch": 9.001511100778798, + "grad_norm": 1.3574606030889484, + "learning_rate": 4.500929944203348e-07, + "loss": 0.8935, + "step": 116160 + }, + { + "epoch": 9.002286024255104, + "grad_norm": 1.4164760031798802, + "learning_rate": 4.5013174209547426e-07, + "loss": 0.8962, + "step": 116170 + }, + { + "epoch": 9.003060947731411, + "grad_norm": 1.4408451161999305, + "learning_rate": 4.501704897706138e-07, + "loss": 0.9073, + "step": 116180 + }, + { + "epoch": 9.003835871207718, + "grad_norm": 1.3555892583128455, + "learning_rate": 4.5020923744575325e-07, + "loss": 0.9026, + "step": 116190 + }, + { + "epoch": 9.004610794684025, + "grad_norm": 1.4953196660955503, + "learning_rate": 4.5024798512089277e-07, + "loss": 0.9191, + "step": 116200 + }, + { + "epoch": 9.005385718160332, + "grad_norm": 1.3995025467610782, + "learning_rate": 4.5028673279603224e-07, + "loss": 0.8976, + "step": 116210 + }, + { + "epoch": 9.006160641636638, + "grad_norm": 1.4405683694780598, + "learning_rate": 4.503254804711717e-07, + "loss": 0.8972, + "step": 116220 + }, + { + "epoch": 9.006935565112945, + "grad_norm": 1.4640570591311668, + "learning_rate": 4.5036422814631123e-07, + "loss": 0.9046, + "step": 116230 + }, + { + "epoch": 9.007710488589252, + "grad_norm": 1.462813150178437, + "learning_rate": 4.504029758214507e-07, + "loss": 0.9029, + "step": 116240 + }, + { + "epoch": 9.008485412065559, + "grad_norm": 1.4635599336909306, + "learning_rate": 4.504417234965902e-07, + "loss": 0.8961, + "step": 116250 + }, + { + "epoch": 9.009260335541866, + "grad_norm": 1.3509949209680574, + "learning_rate": 4.504804711717297e-07, + "loss": 0.9048, + "step": 116260 + }, + { + "epoch": 9.010035259018172, + "grad_norm": 1.3958888439099153, + "learning_rate": 4.505192188468692e-07, + "loss": 0.9159, + "step": 116270 + }, + { + "epoch": 9.01081018249448, + "grad_norm": 1.425095914146034, + "learning_rate": 4.505579665220087e-07, + "loss": 0.9079, + "step": 116280 + }, + { + "epoch": 9.011585105970786, + "grad_norm": 1.4989852482476498, + "learning_rate": 4.5059671419714815e-07, + "loss": 0.8897, + "step": 116290 + }, + { + "epoch": 9.012360029447093, + "grad_norm": 1.4593100846449623, + "learning_rate": 4.506354618722877e-07, + "loss": 0.9179, + "step": 116300 + }, + { + "epoch": 9.013134952923398, + "grad_norm": 1.5368698621717547, + "learning_rate": 4.5067420954742714e-07, + "loss": 0.9305, + "step": 116310 + }, + { + "epoch": 9.013909876399705, + "grad_norm": 1.4342935050906906, + "learning_rate": 4.5071295722256667e-07, + "loss": 0.9247, + "step": 116320 + }, + { + "epoch": 9.014684799876012, + "grad_norm": 1.4786601797700694, + "learning_rate": 4.5075170489770613e-07, + "loss": 0.9087, + "step": 116330 + }, + { + "epoch": 9.015459723352318, + "grad_norm": 1.5092615166922958, + "learning_rate": 4.5079045257284566e-07, + "loss": 0.9111, + "step": 116340 + }, + { + "epoch": 9.016234646828625, + "grad_norm": 1.5389260309324893, + "learning_rate": 4.508292002479851e-07, + "loss": 0.8978, + "step": 116350 + }, + { + "epoch": 9.017009570304932, + "grad_norm": 1.461240464187108, + "learning_rate": 4.508679479231246e-07, + "loss": 0.8913, + "step": 116360 + }, + { + "epoch": 9.017784493781239, + "grad_norm": 1.5152359157731807, + "learning_rate": 4.509066955982641e-07, + "loss": 0.8978, + "step": 116370 + }, + { + "epoch": 9.018559417257546, + "grad_norm": 1.4909306687524837, + "learning_rate": 4.509454432734036e-07, + "loss": 0.9005, + "step": 116380 + }, + { + "epoch": 9.019334340733852, + "grad_norm": 1.4727717967750367, + "learning_rate": 4.509841909485431e-07, + "loss": 0.9097, + "step": 116390 + }, + { + "epoch": 9.02010926421016, + "grad_norm": 1.4170882735253862, + "learning_rate": 4.510229386236826e-07, + "loss": 0.9024, + "step": 116400 + }, + { + "epoch": 9.020884187686466, + "grad_norm": 1.3721888802776796, + "learning_rate": 4.510616862988221e-07, + "loss": 0.9003, + "step": 116410 + }, + { + "epoch": 9.021659111162773, + "grad_norm": 1.7176008185344152, + "learning_rate": 4.5110043397396157e-07, + "loss": 0.9162, + "step": 116420 + }, + { + "epoch": 9.02243403463908, + "grad_norm": 1.5091443724142273, + "learning_rate": 4.5113918164910104e-07, + "loss": 0.9072, + "step": 116430 + }, + { + "epoch": 9.023208958115386, + "grad_norm": 1.4579526524642392, + "learning_rate": 4.5117792932424056e-07, + "loss": 0.9104, + "step": 116440 + }, + { + "epoch": 9.023983881591693, + "grad_norm": 1.4563743921878778, + "learning_rate": 4.5121667699938003e-07, + "loss": 0.896, + "step": 116450 + }, + { + "epoch": 9.024758805068, + "grad_norm": 1.444046519397117, + "learning_rate": 4.5125542467451955e-07, + "loss": 0.905, + "step": 116460 + }, + { + "epoch": 9.025533728544307, + "grad_norm": 1.4580244442860189, + "learning_rate": 4.51294172349659e-07, + "loss": 0.9115, + "step": 116470 + }, + { + "epoch": 9.026308652020614, + "grad_norm": 1.3999492559098643, + "learning_rate": 4.5133292002479854e-07, + "loss": 0.8859, + "step": 116480 + }, + { + "epoch": 9.02708357549692, + "grad_norm": 1.4042069628276304, + "learning_rate": 4.51371667699938e-07, + "loss": 0.8884, + "step": 116490 + }, + { + "epoch": 9.027858498973226, + "grad_norm": 1.4441611735863618, + "learning_rate": 4.514104153750775e-07, + "loss": 0.8944, + "step": 116500 + }, + { + "epoch": 9.027858498973226, + "eval_loss": 0.9141332507133484, + "eval_runtime": 328.4188, + "eval_samples_per_second": 34.928, + "eval_steps_per_second": 8.733, + "step": 116500 + }, + { + "epoch": 9.028633422449532, + "grad_norm": 1.413722812806332, + "learning_rate": 4.51449163050217e-07, + "loss": 0.9138, + "step": 116510 + }, + { + "epoch": 9.02940834592584, + "grad_norm": 1.4873790604338804, + "learning_rate": 4.5148791072535647e-07, + "loss": 0.9145, + "step": 116520 + }, + { + "epoch": 9.030183269402146, + "grad_norm": 1.4440807915421607, + "learning_rate": 4.51526658400496e-07, + "loss": 0.894, + "step": 116530 + }, + { + "epoch": 9.030958192878453, + "grad_norm": 1.45398430549384, + "learning_rate": 4.5156540607563546e-07, + "loss": 0.9033, + "step": 116540 + }, + { + "epoch": 9.03173311635476, + "grad_norm": 1.4382438318061523, + "learning_rate": 4.51604153750775e-07, + "loss": 0.9236, + "step": 116550 + }, + { + "epoch": 9.032508039831066, + "grad_norm": 1.4990385587275448, + "learning_rate": 4.5164290142591445e-07, + "loss": 0.8949, + "step": 116560 + }, + { + "epoch": 9.033282963307373, + "grad_norm": 1.3844903217708648, + "learning_rate": 4.516816491010539e-07, + "loss": 0.9079, + "step": 116570 + }, + { + "epoch": 9.03405788678368, + "grad_norm": 1.3918139095753808, + "learning_rate": 4.5172039677619344e-07, + "loss": 0.8941, + "step": 116580 + }, + { + "epoch": 9.034832810259987, + "grad_norm": 1.4822315153993977, + "learning_rate": 4.517591444513329e-07, + "loss": 0.924, + "step": 116590 + }, + { + "epoch": 9.035607733736294, + "grad_norm": 1.473542345283719, + "learning_rate": 4.5179789212647243e-07, + "loss": 0.9039, + "step": 116600 + }, + { + "epoch": 9.0363826572126, + "grad_norm": 1.4814292070158575, + "learning_rate": 4.518366398016119e-07, + "loss": 0.8958, + "step": 116610 + }, + { + "epoch": 9.037157580688907, + "grad_norm": 1.4391253737873952, + "learning_rate": 4.518753874767514e-07, + "loss": 0.9082, + "step": 116620 + }, + { + "epoch": 9.037932504165214, + "grad_norm": 1.5346070975211523, + "learning_rate": 4.519141351518909e-07, + "loss": 0.8935, + "step": 116630 + }, + { + "epoch": 9.038707427641521, + "grad_norm": 1.5031110682879973, + "learning_rate": 4.5195288282703036e-07, + "loss": 0.9162, + "step": 116640 + }, + { + "epoch": 9.039482351117828, + "grad_norm": 1.486460171069224, + "learning_rate": 4.519916305021699e-07, + "loss": 0.8804, + "step": 116650 + }, + { + "epoch": 9.040257274594135, + "grad_norm": 1.4164202684923843, + "learning_rate": 4.5203037817730935e-07, + "loss": 0.9003, + "step": 116660 + }, + { + "epoch": 9.041032198070441, + "grad_norm": 1.442237923953168, + "learning_rate": 4.520691258524489e-07, + "loss": 0.9145, + "step": 116670 + }, + { + "epoch": 9.041807121546746, + "grad_norm": 1.4377542984498441, + "learning_rate": 4.5210787352758834e-07, + "loss": 0.911, + "step": 116680 + }, + { + "epoch": 9.042582045023053, + "grad_norm": 1.4557787468173082, + "learning_rate": 4.5214662120272787e-07, + "loss": 0.9081, + "step": 116690 + }, + { + "epoch": 9.04335696849936, + "grad_norm": 1.4802349116643456, + "learning_rate": 4.5218536887786734e-07, + "loss": 0.9102, + "step": 116700 + }, + { + "epoch": 9.044131891975667, + "grad_norm": 1.4279642962610728, + "learning_rate": 4.522241165530068e-07, + "loss": 0.899, + "step": 116710 + }, + { + "epoch": 9.044906815451974, + "grad_norm": 1.4325209033971789, + "learning_rate": 4.5226286422814633e-07, + "loss": 0.8795, + "step": 116720 + }, + { + "epoch": 9.04568173892828, + "grad_norm": 1.451134403244577, + "learning_rate": 4.523016119032858e-07, + "loss": 0.9002, + "step": 116730 + }, + { + "epoch": 9.046456662404587, + "grad_norm": 1.4027586236505982, + "learning_rate": 4.523403595784253e-07, + "loss": 0.9115, + "step": 116740 + }, + { + "epoch": 9.047231585880894, + "grad_norm": 1.4552125626480679, + "learning_rate": 4.523791072535648e-07, + "loss": 0.9139, + "step": 116750 + }, + { + "epoch": 9.0480065093572, + "grad_norm": 1.4646061143481466, + "learning_rate": 4.524178549287043e-07, + "loss": 0.9037, + "step": 116760 + }, + { + "epoch": 9.048781432833508, + "grad_norm": 1.4387074513770237, + "learning_rate": 4.524566026038438e-07, + "loss": 0.9007, + "step": 116770 + }, + { + "epoch": 9.049556356309814, + "grad_norm": 1.368232159381742, + "learning_rate": 4.5249535027898325e-07, + "loss": 0.8794, + "step": 116780 + }, + { + "epoch": 9.050331279786121, + "grad_norm": 1.489009868462191, + "learning_rate": 4.5253409795412277e-07, + "loss": 0.8922, + "step": 116790 + }, + { + "epoch": 9.051106203262428, + "grad_norm": 1.5284854993791344, + "learning_rate": 4.5257284562926224e-07, + "loss": 0.9267, + "step": 116800 + }, + { + "epoch": 9.051881126738735, + "grad_norm": 1.424291395602763, + "learning_rate": 4.5261159330440176e-07, + "loss": 0.8871, + "step": 116810 + }, + { + "epoch": 9.052656050215042, + "grad_norm": 1.4394764315362345, + "learning_rate": 4.5265034097954123e-07, + "loss": 0.9103, + "step": 116820 + }, + { + "epoch": 9.053430973691349, + "grad_norm": 1.5621583933237229, + "learning_rate": 4.5268908865468075e-07, + "loss": 0.9028, + "step": 116830 + }, + { + "epoch": 9.054205897167655, + "grad_norm": 1.5021847034772933, + "learning_rate": 4.527278363298202e-07, + "loss": 0.8895, + "step": 116840 + }, + { + "epoch": 9.054980820643962, + "grad_norm": 1.4397958542616072, + "learning_rate": 4.527665840049597e-07, + "loss": 0.8998, + "step": 116850 + }, + { + "epoch": 9.055755744120269, + "grad_norm": 1.4157493136033026, + "learning_rate": 4.528053316800992e-07, + "loss": 0.8986, + "step": 116860 + }, + { + "epoch": 9.056530667596574, + "grad_norm": 1.3542662806829904, + "learning_rate": 4.528440793552387e-07, + "loss": 0.9139, + "step": 116870 + }, + { + "epoch": 9.05730559107288, + "grad_norm": 1.338970915929697, + "learning_rate": 4.528828270303782e-07, + "loss": 0.896, + "step": 116880 + }, + { + "epoch": 9.058080514549188, + "grad_norm": 1.4130592737405334, + "learning_rate": 4.5292157470551767e-07, + "loss": 0.9238, + "step": 116890 + }, + { + "epoch": 9.058855438025494, + "grad_norm": 1.4355427517819745, + "learning_rate": 4.5296032238065714e-07, + "loss": 0.8916, + "step": 116900 + }, + { + "epoch": 9.059630361501801, + "grad_norm": 1.4008621461255437, + "learning_rate": 4.5299907005579666e-07, + "loss": 0.8832, + "step": 116910 + }, + { + "epoch": 9.060405284978108, + "grad_norm": 1.45652885404514, + "learning_rate": 4.5303781773093613e-07, + "loss": 0.9087, + "step": 116920 + }, + { + "epoch": 9.061180208454415, + "grad_norm": 1.503761203443475, + "learning_rate": 4.5307656540607565e-07, + "loss": 0.9037, + "step": 116930 + }, + { + "epoch": 9.061955131930722, + "grad_norm": 1.4475105201963412, + "learning_rate": 4.531153130812151e-07, + "loss": 0.9036, + "step": 116940 + }, + { + "epoch": 9.062730055407028, + "grad_norm": 1.4370758015437592, + "learning_rate": 4.5315406075635464e-07, + "loss": 0.9224, + "step": 116950 + }, + { + "epoch": 9.063504978883335, + "grad_norm": 1.5112274597983857, + "learning_rate": 4.531928084314941e-07, + "loss": 0.9009, + "step": 116960 + }, + { + "epoch": 9.064279902359642, + "grad_norm": 1.4370201389952761, + "learning_rate": 4.532315561066336e-07, + "loss": 0.909, + "step": 116970 + }, + { + "epoch": 9.065054825835949, + "grad_norm": 1.3804114589814216, + "learning_rate": 4.532703037817731e-07, + "loss": 0.8916, + "step": 116980 + }, + { + "epoch": 9.065829749312256, + "grad_norm": 1.5151994554939587, + "learning_rate": 4.5330905145691257e-07, + "loss": 0.9109, + "step": 116990 + }, + { + "epoch": 9.066604672788563, + "grad_norm": 1.4860933756804795, + "learning_rate": 4.533477991320521e-07, + "loss": 0.9073, + "step": 117000 + }, + { + "epoch": 9.066604672788563, + "eval_loss": 0.9140987396240234, + "eval_runtime": 328.3114, + "eval_samples_per_second": 34.939, + "eval_steps_per_second": 8.736, + "step": 117000 + }, + { + "epoch": 9.06737959626487, + "grad_norm": 1.3575315971815791, + "learning_rate": 4.5338654680719156e-07, + "loss": 0.8885, + "step": 117010 + }, + { + "epoch": 9.068154519741176, + "grad_norm": 1.344777229007518, + "learning_rate": 4.534252944823311e-07, + "loss": 0.9106, + "step": 117020 + }, + { + "epoch": 9.068929443217483, + "grad_norm": 1.5144761281095902, + "learning_rate": 4.5346404215747056e-07, + "loss": 0.8907, + "step": 117030 + }, + { + "epoch": 9.06970436669379, + "grad_norm": 1.4412069473886453, + "learning_rate": 4.5350278983261e-07, + "loss": 0.8948, + "step": 117040 + }, + { + "epoch": 9.070479290170097, + "grad_norm": 1.388667726341221, + "learning_rate": 4.5354153750774955e-07, + "loss": 0.8966, + "step": 117050 + }, + { + "epoch": 9.071254213646402, + "grad_norm": 1.4758117733984992, + "learning_rate": 4.53580285182889e-07, + "loss": 0.9052, + "step": 117060 + }, + { + "epoch": 9.072029137122708, + "grad_norm": 1.4011267906543183, + "learning_rate": 4.5361903285802854e-07, + "loss": 0.896, + "step": 117070 + }, + { + "epoch": 9.072804060599015, + "grad_norm": 1.429693841739987, + "learning_rate": 4.53657780533168e-07, + "loss": 0.9229, + "step": 117080 + }, + { + "epoch": 9.073578984075322, + "grad_norm": 1.4398896793277822, + "learning_rate": 4.5369652820830753e-07, + "loss": 0.9114, + "step": 117090 + }, + { + "epoch": 9.074353907551629, + "grad_norm": 1.4943876278567045, + "learning_rate": 4.53735275883447e-07, + "loss": 0.92, + "step": 117100 + }, + { + "epoch": 9.075128831027936, + "grad_norm": 1.4976226444180416, + "learning_rate": 4.5377402355858647e-07, + "loss": 0.9052, + "step": 117110 + }, + { + "epoch": 9.075903754504242, + "grad_norm": 1.423048611041308, + "learning_rate": 4.53812771233726e-07, + "loss": 0.9031, + "step": 117120 + }, + { + "epoch": 9.07667867798055, + "grad_norm": 1.3910279689565874, + "learning_rate": 4.5385151890886546e-07, + "loss": 0.8969, + "step": 117130 + }, + { + "epoch": 9.077453601456856, + "grad_norm": 1.469812836857358, + "learning_rate": 4.53890266584005e-07, + "loss": 0.8983, + "step": 117140 + }, + { + "epoch": 9.078228524933163, + "grad_norm": 1.4907658275455178, + "learning_rate": 4.5392901425914445e-07, + "loss": 0.912, + "step": 117150 + }, + { + "epoch": 9.07900344840947, + "grad_norm": 1.4417818769503998, + "learning_rate": 4.5396776193428397e-07, + "loss": 0.9151, + "step": 117160 + }, + { + "epoch": 9.079778371885777, + "grad_norm": 1.3434653858636305, + "learning_rate": 4.5400650960942344e-07, + "loss": 0.9008, + "step": 117170 + }, + { + "epoch": 9.080553295362083, + "grad_norm": 1.5197261447603732, + "learning_rate": 4.540452572845629e-07, + "loss": 0.8959, + "step": 117180 + }, + { + "epoch": 9.08132821883839, + "grad_norm": 1.4064574083775017, + "learning_rate": 4.5408400495970243e-07, + "loss": 0.8889, + "step": 117190 + }, + { + "epoch": 9.082103142314697, + "grad_norm": 1.35491826386487, + "learning_rate": 4.541227526348419e-07, + "loss": 0.8803, + "step": 117200 + }, + { + "epoch": 9.082878065791004, + "grad_norm": 1.433867783554959, + "learning_rate": 4.541615003099814e-07, + "loss": 0.8932, + "step": 117210 + }, + { + "epoch": 9.08365298926731, + "grad_norm": 1.4372523083875677, + "learning_rate": 4.542002479851209e-07, + "loss": 0.8929, + "step": 117220 + }, + { + "epoch": 9.084427912743617, + "grad_norm": 1.4786822217355482, + "learning_rate": 4.542389956602604e-07, + "loss": 0.9409, + "step": 117230 + }, + { + "epoch": 9.085202836219922, + "grad_norm": 1.3723807378870192, + "learning_rate": 4.542777433353999e-07, + "loss": 0.8905, + "step": 117240 + }, + { + "epoch": 9.08597775969623, + "grad_norm": 1.386074103731609, + "learning_rate": 4.5431649101053935e-07, + "loss": 0.884, + "step": 117250 + }, + { + "epoch": 9.086752683172536, + "grad_norm": 1.4809367174098065, + "learning_rate": 4.5435523868567887e-07, + "loss": 0.8957, + "step": 117260 + }, + { + "epoch": 9.087527606648843, + "grad_norm": 1.5937854742431843, + "learning_rate": 4.5439398636081834e-07, + "loss": 0.9096, + "step": 117270 + }, + { + "epoch": 9.08830253012515, + "grad_norm": 1.4483947266977124, + "learning_rate": 4.5443273403595786e-07, + "loss": 0.9008, + "step": 117280 + }, + { + "epoch": 9.089077453601456, + "grad_norm": 1.5210947284835297, + "learning_rate": 4.5447148171109733e-07, + "loss": 0.8937, + "step": 117290 + }, + { + "epoch": 9.089852377077763, + "grad_norm": 1.5084526028866883, + "learning_rate": 4.5451022938623685e-07, + "loss": 0.8933, + "step": 117300 + }, + { + "epoch": 9.09062730055407, + "grad_norm": 1.3934315039834684, + "learning_rate": 4.545489770613763e-07, + "loss": 0.8885, + "step": 117310 + }, + { + "epoch": 9.091402224030377, + "grad_norm": 1.5113286176157277, + "learning_rate": 4.545877247365158e-07, + "loss": 0.9019, + "step": 117320 + }, + { + "epoch": 9.092177147506684, + "grad_norm": 1.551606106312903, + "learning_rate": 4.546264724116553e-07, + "loss": 0.8923, + "step": 117330 + }, + { + "epoch": 9.09295207098299, + "grad_norm": 1.3705488929775216, + "learning_rate": 4.546652200867948e-07, + "loss": 0.8863, + "step": 117340 + }, + { + "epoch": 9.093726994459297, + "grad_norm": 1.5282183402470164, + "learning_rate": 4.547039677619343e-07, + "loss": 0.9047, + "step": 117350 + }, + { + "epoch": 9.094501917935604, + "grad_norm": 1.4181422675313737, + "learning_rate": 4.547427154370738e-07, + "loss": 0.8852, + "step": 117360 + }, + { + "epoch": 9.095276841411911, + "grad_norm": 1.455760741097648, + "learning_rate": 4.547814631122133e-07, + "loss": 0.9112, + "step": 117370 + }, + { + "epoch": 9.096051764888218, + "grad_norm": 1.4755431798146466, + "learning_rate": 4.5482021078735277e-07, + "loss": 0.8982, + "step": 117380 + }, + { + "epoch": 9.096826688364525, + "grad_norm": 1.4239492939364586, + "learning_rate": 4.5485895846249223e-07, + "loss": 0.9027, + "step": 117390 + }, + { + "epoch": 9.097601611840831, + "grad_norm": 1.4635454060249782, + "learning_rate": 4.5489770613763176e-07, + "loss": 0.8998, + "step": 117400 + }, + { + "epoch": 9.098376535317138, + "grad_norm": 1.4491157836329331, + "learning_rate": 4.549364538127712e-07, + "loss": 0.8909, + "step": 117410 + }, + { + "epoch": 9.099151458793445, + "grad_norm": 1.4168475415686652, + "learning_rate": 4.5497520148791075e-07, + "loss": 0.9012, + "step": 117420 + }, + { + "epoch": 9.09992638226975, + "grad_norm": 1.5234479865052863, + "learning_rate": 4.550139491630502e-07, + "loss": 0.8943, + "step": 117430 + }, + { + "epoch": 9.100701305746057, + "grad_norm": 1.364092681707852, + "learning_rate": 4.5505269683818974e-07, + "loss": 0.9095, + "step": 117440 + }, + { + "epoch": 9.101476229222364, + "grad_norm": 1.47011359164185, + "learning_rate": 4.550914445133292e-07, + "loss": 0.8847, + "step": 117450 + }, + { + "epoch": 9.10225115269867, + "grad_norm": 1.4126037892011596, + "learning_rate": 4.551301921884687e-07, + "loss": 0.8928, + "step": 117460 + }, + { + "epoch": 9.103026076174977, + "grad_norm": 1.5465904722475858, + "learning_rate": 4.551689398636082e-07, + "loss": 0.9052, + "step": 117470 + }, + { + "epoch": 9.103800999651284, + "grad_norm": 1.4813190543778865, + "learning_rate": 4.5520768753874767e-07, + "loss": 0.9226, + "step": 117480 + }, + { + "epoch": 9.104575923127591, + "grad_norm": 1.490096485401648, + "learning_rate": 4.552464352138872e-07, + "loss": 0.8912, + "step": 117490 + }, + { + "epoch": 9.105350846603898, + "grad_norm": 1.461733024558414, + "learning_rate": 4.5528518288902666e-07, + "loss": 0.8944, + "step": 117500 + }, + { + "epoch": 9.105350846603898, + "eval_loss": 0.9139140844345093, + "eval_runtime": 326.2959, + "eval_samples_per_second": 35.155, + "eval_steps_per_second": 8.79, + "step": 117500 + }, + { + "epoch": 9.106125770080205, + "grad_norm": 1.4437817353012303, + "learning_rate": 4.553239305641662e-07, + "loss": 0.9082, + "step": 117510 + }, + { + "epoch": 9.106900693556511, + "grad_norm": 1.3917343014205028, + "learning_rate": 4.5536267823930565e-07, + "loss": 0.904, + "step": 117520 + }, + { + "epoch": 9.107675617032818, + "grad_norm": 1.4014348737489235, + "learning_rate": 4.554014259144451e-07, + "loss": 0.9093, + "step": 117530 + }, + { + "epoch": 9.108450540509125, + "grad_norm": 1.5130636881556314, + "learning_rate": 4.5544017358958464e-07, + "loss": 0.8968, + "step": 117540 + }, + { + "epoch": 9.109225463985432, + "grad_norm": 1.4350776595034658, + "learning_rate": 4.554789212647241e-07, + "loss": 0.8828, + "step": 117550 + }, + { + "epoch": 9.110000387461739, + "grad_norm": 1.4607956533281155, + "learning_rate": 4.5551766893986363e-07, + "loss": 0.8946, + "step": 117560 + }, + { + "epoch": 9.110775310938045, + "grad_norm": 1.3644863947187775, + "learning_rate": 4.555564166150031e-07, + "loss": 0.8883, + "step": 117570 + }, + { + "epoch": 9.111550234414352, + "grad_norm": 1.4432130156675251, + "learning_rate": 4.555951642901426e-07, + "loss": 0.8975, + "step": 117580 + }, + { + "epoch": 9.112325157890659, + "grad_norm": 1.4244148234279976, + "learning_rate": 4.556339119652821e-07, + "loss": 0.8965, + "step": 117590 + }, + { + "epoch": 9.113100081366966, + "grad_norm": 1.5337953503671713, + "learning_rate": 4.5567265964042156e-07, + "loss": 0.9213, + "step": 117600 + }, + { + "epoch": 9.113875004843273, + "grad_norm": 1.4160051423991404, + "learning_rate": 4.557114073155611e-07, + "loss": 0.9047, + "step": 117610 + }, + { + "epoch": 9.114649928319578, + "grad_norm": 1.4357393303750337, + "learning_rate": 4.5575015499070055e-07, + "loss": 0.9072, + "step": 117620 + }, + { + "epoch": 9.115424851795884, + "grad_norm": 1.4584355379857192, + "learning_rate": 4.557889026658401e-07, + "loss": 0.8967, + "step": 117630 + }, + { + "epoch": 9.116199775272191, + "grad_norm": 1.4100555012592928, + "learning_rate": 4.5582765034097954e-07, + "loss": 0.9137, + "step": 117640 + }, + { + "epoch": 9.116974698748498, + "grad_norm": 1.4264463446611237, + "learning_rate": 4.55866398016119e-07, + "loss": 0.9028, + "step": 117650 + }, + { + "epoch": 9.117749622224805, + "grad_norm": 1.4858372992043367, + "learning_rate": 4.5590514569125853e-07, + "loss": 0.8975, + "step": 117660 + }, + { + "epoch": 9.118524545701112, + "grad_norm": 1.4404552164805065, + "learning_rate": 4.55943893366398e-07, + "loss": 0.9016, + "step": 117670 + }, + { + "epoch": 9.119299469177419, + "grad_norm": 1.3317464640812573, + "learning_rate": 4.559826410415375e-07, + "loss": 0.8937, + "step": 117680 + }, + { + "epoch": 9.120074392653725, + "grad_norm": 1.4044146736221492, + "learning_rate": 4.56021388716677e-07, + "loss": 0.8848, + "step": 117690 + }, + { + "epoch": 9.120849316130032, + "grad_norm": 1.4804646705211117, + "learning_rate": 4.560601363918165e-07, + "loss": 0.8843, + "step": 117700 + }, + { + "epoch": 9.121624239606339, + "grad_norm": 1.3934962923502658, + "learning_rate": 4.56098884066956e-07, + "loss": 0.8943, + "step": 117710 + }, + { + "epoch": 9.122399163082646, + "grad_norm": 1.4779123850837879, + "learning_rate": 4.5613763174209545e-07, + "loss": 0.9386, + "step": 117720 + }, + { + "epoch": 9.123174086558953, + "grad_norm": 1.4762060804196626, + "learning_rate": 4.56176379417235e-07, + "loss": 0.902, + "step": 117730 + }, + { + "epoch": 9.12394901003526, + "grad_norm": 1.357790493749103, + "learning_rate": 4.5621512709237445e-07, + "loss": 0.898, + "step": 117740 + }, + { + "epoch": 9.124723933511566, + "grad_norm": 1.6847731514125268, + "learning_rate": 4.5625387476751397e-07, + "loss": 0.8825, + "step": 117750 + }, + { + "epoch": 9.125498856987873, + "grad_norm": 1.3920840562230294, + "learning_rate": 4.5629262244265344e-07, + "loss": 0.8893, + "step": 117760 + }, + { + "epoch": 9.12627378046418, + "grad_norm": 1.4906155176307876, + "learning_rate": 4.5633137011779296e-07, + "loss": 0.9144, + "step": 117770 + }, + { + "epoch": 9.127048703940487, + "grad_norm": 1.3646398088265237, + "learning_rate": 4.5637011779293243e-07, + "loss": 0.9123, + "step": 117780 + }, + { + "epoch": 9.127823627416793, + "grad_norm": 1.3882725397402524, + "learning_rate": 4.564088654680719e-07, + "loss": 0.8995, + "step": 117790 + }, + { + "epoch": 9.128598550893098, + "grad_norm": 1.5320598766003237, + "learning_rate": 4.564476131432114e-07, + "loss": 0.9133, + "step": 117800 + }, + { + "epoch": 9.129373474369405, + "grad_norm": 1.5675662716287757, + "learning_rate": 4.564863608183509e-07, + "loss": 0.9184, + "step": 117810 + }, + { + "epoch": 9.130148397845712, + "grad_norm": 1.4236519374396719, + "learning_rate": 4.565251084934904e-07, + "loss": 0.8969, + "step": 117820 + }, + { + "epoch": 9.130923321322019, + "grad_norm": 1.4544470594174117, + "learning_rate": 4.565638561686299e-07, + "loss": 0.9077, + "step": 117830 + }, + { + "epoch": 9.131698244798326, + "grad_norm": 1.4887790520983335, + "learning_rate": 4.566026038437694e-07, + "loss": 0.8932, + "step": 117840 + }, + { + "epoch": 9.132473168274633, + "grad_norm": 1.4492006059164892, + "learning_rate": 4.5664135151890887e-07, + "loss": 0.9062, + "step": 117850 + }, + { + "epoch": 9.13324809175094, + "grad_norm": 1.4046170385108054, + "learning_rate": 4.5668009919404834e-07, + "loss": 0.9009, + "step": 117860 + }, + { + "epoch": 9.134023015227246, + "grad_norm": 1.444189632216435, + "learning_rate": 4.5671884686918786e-07, + "loss": 0.9116, + "step": 117870 + }, + { + "epoch": 9.134797938703553, + "grad_norm": 1.4472311768653368, + "learning_rate": 4.5675759454432733e-07, + "loss": 0.9249, + "step": 117880 + }, + { + "epoch": 9.13557286217986, + "grad_norm": 1.4058891023730842, + "learning_rate": 4.5679634221946685e-07, + "loss": 0.8942, + "step": 117890 + }, + { + "epoch": 9.136347785656167, + "grad_norm": 1.3340640348192379, + "learning_rate": 4.568350898946063e-07, + "loss": 0.8894, + "step": 117900 + }, + { + "epoch": 9.137122709132473, + "grad_norm": 1.3869543718156647, + "learning_rate": 4.5687383756974584e-07, + "loss": 0.8987, + "step": 117910 + }, + { + "epoch": 9.13789763260878, + "grad_norm": 1.4191716149884372, + "learning_rate": 4.569125852448853e-07, + "loss": 0.8932, + "step": 117920 + }, + { + "epoch": 9.138672556085087, + "grad_norm": 1.6277067590642937, + "learning_rate": 4.569513329200248e-07, + "loss": 0.9058, + "step": 117930 + }, + { + "epoch": 9.139447479561394, + "grad_norm": 1.4144821597025354, + "learning_rate": 4.569900805951643e-07, + "loss": 0.8897, + "step": 117940 + }, + { + "epoch": 9.1402224030377, + "grad_norm": 1.548648506476816, + "learning_rate": 4.5702882827030377e-07, + "loss": 0.9, + "step": 117950 + }, + { + "epoch": 9.140997326514007, + "grad_norm": 1.5447968798709992, + "learning_rate": 4.570675759454433e-07, + "loss": 0.8954, + "step": 117960 + }, + { + "epoch": 9.141772249990314, + "grad_norm": 1.526977394407352, + "learning_rate": 4.5710632362058276e-07, + "loss": 0.9143, + "step": 117970 + }, + { + "epoch": 9.142547173466621, + "grad_norm": 1.4610004082939307, + "learning_rate": 4.571450712957223e-07, + "loss": 0.9197, + "step": 117980 + }, + { + "epoch": 9.143322096942926, + "grad_norm": 1.4670633063130043, + "learning_rate": 4.5718381897086175e-07, + "loss": 0.902, + "step": 117990 + }, + { + "epoch": 9.144097020419233, + "grad_norm": 1.4807468156600816, + "learning_rate": 4.572225666460012e-07, + "loss": 0.9015, + "step": 118000 + }, + { + "epoch": 9.144097020419233, + "eval_loss": 0.9136245846748352, + "eval_runtime": 328.0996, + "eval_samples_per_second": 34.962, + "eval_steps_per_second": 8.741, + "step": 118000 + }, + { + "epoch": 9.14487194389554, + "grad_norm": 1.4653882616777774, + "learning_rate": 4.5726131432114074e-07, + "loss": 0.9234, + "step": 118010 + }, + { + "epoch": 9.145646867371847, + "grad_norm": 1.5264922684042315, + "learning_rate": 4.573000619962802e-07, + "loss": 0.9146, + "step": 118020 + }, + { + "epoch": 9.146421790848153, + "grad_norm": 1.5581857302201079, + "learning_rate": 4.5733880967141974e-07, + "loss": 0.9052, + "step": 118030 + }, + { + "epoch": 9.14719671432446, + "grad_norm": 1.5510094288037142, + "learning_rate": 4.573775573465592e-07, + "loss": 0.909, + "step": 118040 + }, + { + "epoch": 9.147971637800767, + "grad_norm": 1.4590058369588745, + "learning_rate": 4.5741630502169873e-07, + "loss": 0.8934, + "step": 118050 + }, + { + "epoch": 9.148746561277074, + "grad_norm": 1.5517903382255478, + "learning_rate": 4.574550526968382e-07, + "loss": 0.9184, + "step": 118060 + }, + { + "epoch": 9.14952148475338, + "grad_norm": 1.4983908876067935, + "learning_rate": 4.5749380037197766e-07, + "loss": 0.9015, + "step": 118070 + }, + { + "epoch": 9.150296408229687, + "grad_norm": 1.4691030536903034, + "learning_rate": 4.575325480471172e-07, + "loss": 0.9134, + "step": 118080 + }, + { + "epoch": 9.151071331705994, + "grad_norm": 1.4181730829479822, + "learning_rate": 4.5757129572225666e-07, + "loss": 0.8969, + "step": 118090 + }, + { + "epoch": 9.151846255182301, + "grad_norm": 1.4380525821154855, + "learning_rate": 4.576100433973962e-07, + "loss": 0.9005, + "step": 118100 + }, + { + "epoch": 9.152621178658608, + "grad_norm": 1.4399458626236528, + "learning_rate": 4.5764879107253565e-07, + "loss": 0.9212, + "step": 118110 + }, + { + "epoch": 9.153396102134915, + "grad_norm": 1.428959401786637, + "learning_rate": 4.5768753874767517e-07, + "loss": 0.9137, + "step": 118120 + }, + { + "epoch": 9.154171025611221, + "grad_norm": 1.450675676523256, + "learning_rate": 4.5772628642281464e-07, + "loss": 0.912, + "step": 118130 + }, + { + "epoch": 9.154945949087528, + "grad_norm": 1.4796367024508144, + "learning_rate": 4.577650340979541e-07, + "loss": 0.9129, + "step": 118140 + }, + { + "epoch": 9.155720872563835, + "grad_norm": 1.3960980812593067, + "learning_rate": 4.5780378177309363e-07, + "loss": 0.9036, + "step": 118150 + }, + { + "epoch": 9.156495796040142, + "grad_norm": 1.4196113186361234, + "learning_rate": 4.578425294482331e-07, + "loss": 0.8974, + "step": 118160 + }, + { + "epoch": 9.157270719516447, + "grad_norm": 1.5191893954562434, + "learning_rate": 4.578812771233726e-07, + "loss": 0.9067, + "step": 118170 + }, + { + "epoch": 9.158045642992754, + "grad_norm": 1.403187157925034, + "learning_rate": 4.579200247985121e-07, + "loss": 0.865, + "step": 118180 + }, + { + "epoch": 9.15882056646906, + "grad_norm": 1.4208968156753952, + "learning_rate": 4.579587724736516e-07, + "loss": 0.9019, + "step": 118190 + }, + { + "epoch": 9.159595489945367, + "grad_norm": 1.4334611584719008, + "learning_rate": 4.579975201487911e-07, + "loss": 0.9008, + "step": 118200 + }, + { + "epoch": 9.160370413421674, + "grad_norm": 1.3893771721122428, + "learning_rate": 4.5803626782393055e-07, + "loss": 0.8972, + "step": 118210 + }, + { + "epoch": 9.161145336897981, + "grad_norm": 1.4207365480552119, + "learning_rate": 4.5807501549907007e-07, + "loss": 0.9023, + "step": 118220 + }, + { + "epoch": 9.161920260374288, + "grad_norm": 1.3294879070162835, + "learning_rate": 4.5811376317420954e-07, + "loss": 0.8946, + "step": 118230 + }, + { + "epoch": 9.162695183850595, + "grad_norm": 1.4499718542710176, + "learning_rate": 4.5815251084934906e-07, + "loss": 0.9002, + "step": 118240 + }, + { + "epoch": 9.163470107326901, + "grad_norm": 1.4844381833709626, + "learning_rate": 4.5819125852448853e-07, + "loss": 0.9222, + "step": 118250 + }, + { + "epoch": 9.164245030803208, + "grad_norm": 1.5240957034455118, + "learning_rate": 4.5823000619962805e-07, + "loss": 0.9181, + "step": 118260 + }, + { + "epoch": 9.165019954279515, + "grad_norm": 1.4567085271470797, + "learning_rate": 4.582687538747675e-07, + "loss": 0.9145, + "step": 118270 + }, + { + "epoch": 9.165794877755822, + "grad_norm": 1.436369319828335, + "learning_rate": 4.58307501549907e-07, + "loss": 0.8971, + "step": 118280 + }, + { + "epoch": 9.166569801232129, + "grad_norm": 1.4373815879067688, + "learning_rate": 4.583462492250465e-07, + "loss": 0.899, + "step": 118290 + }, + { + "epoch": 9.167344724708435, + "grad_norm": 1.4860710043991245, + "learning_rate": 4.58384996900186e-07, + "loss": 0.8974, + "step": 118300 + }, + { + "epoch": 9.168119648184742, + "grad_norm": 1.3850382195755078, + "learning_rate": 4.584237445753255e-07, + "loss": 0.8764, + "step": 118310 + }, + { + "epoch": 9.168894571661049, + "grad_norm": 1.406794368840438, + "learning_rate": 4.5846249225046497e-07, + "loss": 0.9056, + "step": 118320 + }, + { + "epoch": 9.169669495137356, + "grad_norm": 1.4476107240184595, + "learning_rate": 4.585012399256045e-07, + "loss": 0.9201, + "step": 118330 + }, + { + "epoch": 9.170444418613663, + "grad_norm": 1.5722478856398494, + "learning_rate": 4.5853998760074396e-07, + "loss": 0.906, + "step": 118340 + }, + { + "epoch": 9.17121934208997, + "grad_norm": 1.3941328016831744, + "learning_rate": 4.5857873527588343e-07, + "loss": 0.903, + "step": 118350 + }, + { + "epoch": 9.171994265566275, + "grad_norm": 1.4055462805618903, + "learning_rate": 4.5861748295102296e-07, + "loss": 0.9149, + "step": 118360 + }, + { + "epoch": 9.172769189042581, + "grad_norm": 1.3974806588325546, + "learning_rate": 4.586562306261624e-07, + "loss": 0.8964, + "step": 118370 + }, + { + "epoch": 9.173544112518888, + "grad_norm": 1.3695837102110722, + "learning_rate": 4.5869497830130195e-07, + "loss": 0.8945, + "step": 118380 + }, + { + "epoch": 9.174319035995195, + "grad_norm": 1.4233563748390068, + "learning_rate": 4.587337259764414e-07, + "loss": 0.9002, + "step": 118390 + }, + { + "epoch": 9.175093959471502, + "grad_norm": 1.4156329440265258, + "learning_rate": 4.587724736515809e-07, + "loss": 0.9223, + "step": 118400 + }, + { + "epoch": 9.175868882947809, + "grad_norm": 1.4796654775600209, + "learning_rate": 4.588112213267204e-07, + "loss": 0.9089, + "step": 118410 + }, + { + "epoch": 9.176643806424115, + "grad_norm": 1.425123369164596, + "learning_rate": 4.588499690018599e-07, + "loss": 0.8968, + "step": 118420 + }, + { + "epoch": 9.177418729900422, + "grad_norm": 1.4144291877362685, + "learning_rate": 4.588887166769994e-07, + "loss": 0.8917, + "step": 118430 + }, + { + "epoch": 9.178193653376729, + "grad_norm": 1.416257237492478, + "learning_rate": 4.5892746435213887e-07, + "loss": 0.8933, + "step": 118440 + }, + { + "epoch": 9.178968576853036, + "grad_norm": 1.5235097125880144, + "learning_rate": 4.589662120272784e-07, + "loss": 0.8977, + "step": 118450 + }, + { + "epoch": 9.179743500329343, + "grad_norm": 1.4022106924726134, + "learning_rate": 4.5900495970241786e-07, + "loss": 0.875, + "step": 118460 + }, + { + "epoch": 9.18051842380565, + "grad_norm": 1.411474855001366, + "learning_rate": 4.590437073775573e-07, + "loss": 0.8981, + "step": 118470 + }, + { + "epoch": 9.181293347281956, + "grad_norm": 1.5163875740028914, + "learning_rate": 4.5908245505269685e-07, + "loss": 0.9264, + "step": 118480 + }, + { + "epoch": 9.182068270758263, + "grad_norm": 1.3730882460771434, + "learning_rate": 4.591212027278363e-07, + "loss": 0.8983, + "step": 118490 + }, + { + "epoch": 9.18284319423457, + "grad_norm": 1.5109030660768092, + "learning_rate": 4.5915995040297584e-07, + "loss": 0.9076, + "step": 118500 + }, + { + "epoch": 9.18284319423457, + "eval_loss": 0.9133142828941345, + "eval_runtime": 328.1237, + "eval_samples_per_second": 34.959, + "eval_steps_per_second": 8.741, + "step": 118500 + }, + { + "epoch": 9.183618117710877, + "grad_norm": 1.4741352987323646, + "learning_rate": 4.591986980781153e-07, + "loss": 0.879, + "step": 118510 + }, + { + "epoch": 9.184393041187183, + "grad_norm": 1.4029915835157123, + "learning_rate": 4.5923744575325483e-07, + "loss": 0.8916, + "step": 118520 + }, + { + "epoch": 9.18516796466349, + "grad_norm": 1.4613359999428068, + "learning_rate": 4.592761934283943e-07, + "loss": 0.9027, + "step": 118530 + }, + { + "epoch": 9.185942888139795, + "grad_norm": 1.4846067346506016, + "learning_rate": 4.5931494110353377e-07, + "loss": 0.8891, + "step": 118540 + }, + { + "epoch": 9.186717811616102, + "grad_norm": 1.4291737904505104, + "learning_rate": 4.593536887786733e-07, + "loss": 0.9113, + "step": 118550 + }, + { + "epoch": 9.187492735092409, + "grad_norm": 1.4089826888326604, + "learning_rate": 4.5939243645381276e-07, + "loss": 0.8826, + "step": 118560 + }, + { + "epoch": 9.188267658568716, + "grad_norm": 1.483194340215217, + "learning_rate": 4.594311841289523e-07, + "loss": 0.8853, + "step": 118570 + }, + { + "epoch": 9.189042582045023, + "grad_norm": 1.4731696697218637, + "learning_rate": 4.5946993180409175e-07, + "loss": 0.8892, + "step": 118580 + }, + { + "epoch": 9.18981750552133, + "grad_norm": 1.4585179616102901, + "learning_rate": 4.5950867947923127e-07, + "loss": 0.9303, + "step": 118590 + }, + { + "epoch": 9.190592428997636, + "grad_norm": 1.420479483039271, + "learning_rate": 4.5954742715437074e-07, + "loss": 0.8881, + "step": 118600 + }, + { + "epoch": 9.191367352473943, + "grad_norm": 1.4746752399443532, + "learning_rate": 4.595861748295102e-07, + "loss": 0.892, + "step": 118610 + }, + { + "epoch": 9.19214227595025, + "grad_norm": 1.4869774432004472, + "learning_rate": 4.5962492250464973e-07, + "loss": 0.894, + "step": 118620 + }, + { + "epoch": 9.192917199426557, + "grad_norm": 1.4814516644368976, + "learning_rate": 4.596636701797892e-07, + "loss": 0.9095, + "step": 118630 + }, + { + "epoch": 9.193692122902863, + "grad_norm": 1.5040021334226272, + "learning_rate": 4.597024178549287e-07, + "loss": 0.9003, + "step": 118640 + }, + { + "epoch": 9.19446704637917, + "grad_norm": 1.5164272193729824, + "learning_rate": 4.597411655300682e-07, + "loss": 0.9003, + "step": 118650 + }, + { + "epoch": 9.195241969855477, + "grad_norm": 1.3891188766268803, + "learning_rate": 4.597799132052077e-07, + "loss": 0.9124, + "step": 118660 + }, + { + "epoch": 9.196016893331784, + "grad_norm": 1.4041420679982872, + "learning_rate": 4.598186608803472e-07, + "loss": 0.8883, + "step": 118670 + }, + { + "epoch": 9.19679181680809, + "grad_norm": 1.5021242712698413, + "learning_rate": 4.5985740855548665e-07, + "loss": 0.9124, + "step": 118680 + }, + { + "epoch": 9.197566740284397, + "grad_norm": 1.4358623289881367, + "learning_rate": 4.598961562306262e-07, + "loss": 0.8968, + "step": 118690 + }, + { + "epoch": 9.198341663760704, + "grad_norm": 1.4913787611426195, + "learning_rate": 4.5993490390576564e-07, + "loss": 0.8897, + "step": 118700 + }, + { + "epoch": 9.199116587237011, + "grad_norm": 1.4971572320516087, + "learning_rate": 4.5997365158090517e-07, + "loss": 0.9065, + "step": 118710 + }, + { + "epoch": 9.199891510713318, + "grad_norm": 1.3891460658072965, + "learning_rate": 4.6001239925604463e-07, + "loss": 0.9348, + "step": 118720 + }, + { + "epoch": 9.200666434189623, + "grad_norm": 1.4629207483486546, + "learning_rate": 4.6005114693118416e-07, + "loss": 0.8987, + "step": 118730 + }, + { + "epoch": 9.20144135766593, + "grad_norm": 1.5568679963211818, + "learning_rate": 4.600898946063236e-07, + "loss": 0.8934, + "step": 118740 + }, + { + "epoch": 9.202216281142237, + "grad_norm": 1.4390752023840874, + "learning_rate": 4.601286422814631e-07, + "loss": 0.9013, + "step": 118750 + }, + { + "epoch": 9.202991204618543, + "grad_norm": 1.4223227325533996, + "learning_rate": 4.601673899566026e-07, + "loss": 0.8915, + "step": 118760 + }, + { + "epoch": 9.20376612809485, + "grad_norm": 1.4767352899629969, + "learning_rate": 4.602061376317421e-07, + "loss": 0.9024, + "step": 118770 + }, + { + "epoch": 9.204541051571157, + "grad_norm": 1.5389887578315764, + "learning_rate": 4.602448853068816e-07, + "loss": 0.9002, + "step": 118780 + }, + { + "epoch": 9.205315975047464, + "grad_norm": 1.4894071726489326, + "learning_rate": 4.602836329820211e-07, + "loss": 0.9123, + "step": 118790 + }, + { + "epoch": 9.20609089852377, + "grad_norm": 1.512738553701421, + "learning_rate": 4.603223806571606e-07, + "loss": 0.9021, + "step": 118800 + }, + { + "epoch": 9.206865822000077, + "grad_norm": 1.4804196936297607, + "learning_rate": 4.6036112833230007e-07, + "loss": 0.8962, + "step": 118810 + }, + { + "epoch": 9.207640745476384, + "grad_norm": 1.4750617881644443, + "learning_rate": 4.6039987600743954e-07, + "loss": 0.9022, + "step": 118820 + }, + { + "epoch": 9.208415668952691, + "grad_norm": 1.4756246956555104, + "learning_rate": 4.6043862368257906e-07, + "loss": 0.9182, + "step": 118830 + }, + { + "epoch": 9.209190592428998, + "grad_norm": 1.3137903742974322, + "learning_rate": 4.6047737135771853e-07, + "loss": 0.881, + "step": 118840 + }, + { + "epoch": 9.209965515905305, + "grad_norm": 1.4405931383632877, + "learning_rate": 4.6051611903285805e-07, + "loss": 0.8936, + "step": 118850 + }, + { + "epoch": 9.210740439381611, + "grad_norm": 1.4535534856334944, + "learning_rate": 4.605548667079975e-07, + "loss": 0.9281, + "step": 118860 + }, + { + "epoch": 9.211515362857918, + "grad_norm": 1.4920949971029984, + "learning_rate": 4.6059361438313704e-07, + "loss": 0.888, + "step": 118870 + }, + { + "epoch": 9.212290286334225, + "grad_norm": 1.393073432980561, + "learning_rate": 4.606323620582765e-07, + "loss": 0.8852, + "step": 118880 + }, + { + "epoch": 9.213065209810532, + "grad_norm": 1.5857438428366355, + "learning_rate": 4.60671109733416e-07, + "loss": 0.8969, + "step": 118890 + }, + { + "epoch": 9.213840133286839, + "grad_norm": 1.395049617169159, + "learning_rate": 4.607098574085555e-07, + "loss": 0.8872, + "step": 118900 + }, + { + "epoch": 9.214615056763144, + "grad_norm": 1.40963135233559, + "learning_rate": 4.6074860508369497e-07, + "loss": 0.8922, + "step": 118910 + }, + { + "epoch": 9.21538998023945, + "grad_norm": 1.4134408140409584, + "learning_rate": 4.607873527588345e-07, + "loss": 0.907, + "step": 118920 + }, + { + "epoch": 9.216164903715757, + "grad_norm": 1.4868070423883144, + "learning_rate": 4.6082610043397396e-07, + "loss": 0.9176, + "step": 118930 + }, + { + "epoch": 9.216939827192064, + "grad_norm": 1.4204703703893409, + "learning_rate": 4.608648481091135e-07, + "loss": 0.9083, + "step": 118940 + }, + { + "epoch": 9.217714750668371, + "grad_norm": 1.4821065382787253, + "learning_rate": 4.6090359578425295e-07, + "loss": 0.8961, + "step": 118950 + }, + { + "epoch": 9.218489674144678, + "grad_norm": 1.4869336674364224, + "learning_rate": 4.609423434593924e-07, + "loss": 0.9024, + "step": 118960 + }, + { + "epoch": 9.219264597620985, + "grad_norm": 1.4499621428302094, + "learning_rate": 4.6098109113453194e-07, + "loss": 0.8847, + "step": 118970 + }, + { + "epoch": 9.220039521097291, + "grad_norm": 1.5023665293017414, + "learning_rate": 4.610198388096714e-07, + "loss": 0.8772, + "step": 118980 + }, + { + "epoch": 9.220814444573598, + "grad_norm": 1.4860318350630446, + "learning_rate": 4.6105858648481093e-07, + "loss": 0.9101, + "step": 118990 + }, + { + "epoch": 9.221589368049905, + "grad_norm": 1.4482445365611534, + "learning_rate": 4.610973341599504e-07, + "loss": 0.9168, + "step": 119000 + }, + { + "epoch": 9.221589368049905, + "eval_loss": 0.9131125211715698, + "eval_runtime": 327.9662, + "eval_samples_per_second": 34.976, + "eval_steps_per_second": 8.745, + "step": 119000 + }, + { + "epoch": 9.222364291526212, + "grad_norm": 1.5086722558224341, + "learning_rate": 4.611360818350899e-07, + "loss": 0.8877, + "step": 119010 + }, + { + "epoch": 9.223139215002519, + "grad_norm": 1.5381030698943408, + "learning_rate": 4.611748295102294e-07, + "loss": 0.8929, + "step": 119020 + }, + { + "epoch": 9.223914138478825, + "grad_norm": 1.4578398593582245, + "learning_rate": 4.6121357718536886e-07, + "loss": 0.8923, + "step": 119030 + }, + { + "epoch": 9.224689061955132, + "grad_norm": 1.5060237255423534, + "learning_rate": 4.612523248605084e-07, + "loss": 0.9042, + "step": 119040 + }, + { + "epoch": 9.225463985431439, + "grad_norm": 1.419401558013138, + "learning_rate": 4.6129107253564785e-07, + "loss": 0.8867, + "step": 119050 + }, + { + "epoch": 9.226238908907746, + "grad_norm": 1.43489135009635, + "learning_rate": 4.613298202107874e-07, + "loss": 0.8898, + "step": 119060 + }, + { + "epoch": 9.227013832384053, + "grad_norm": 1.406150755390335, + "learning_rate": 4.6136856788592685e-07, + "loss": 0.8896, + "step": 119070 + }, + { + "epoch": 9.22778875586036, + "grad_norm": 1.4428958582315647, + "learning_rate": 4.6140731556106637e-07, + "loss": 0.891, + "step": 119080 + }, + { + "epoch": 9.228563679336666, + "grad_norm": 1.3818800517350036, + "learning_rate": 4.6144606323620584e-07, + "loss": 0.9028, + "step": 119090 + }, + { + "epoch": 9.229338602812971, + "grad_norm": 1.3571058265014038, + "learning_rate": 4.614848109113453e-07, + "loss": 0.8975, + "step": 119100 + }, + { + "epoch": 9.230113526289278, + "grad_norm": 1.4782791967531812, + "learning_rate": 4.6152355858648483e-07, + "loss": 0.8842, + "step": 119110 + }, + { + "epoch": 9.230888449765585, + "grad_norm": 1.4267364434409924, + "learning_rate": 4.615623062616243e-07, + "loss": 0.9009, + "step": 119120 + }, + { + "epoch": 9.231663373241892, + "grad_norm": 1.398857251984431, + "learning_rate": 4.616010539367638e-07, + "loss": 0.8918, + "step": 119130 + }, + { + "epoch": 9.232438296718199, + "grad_norm": 1.4079537817239807, + "learning_rate": 4.616398016119033e-07, + "loss": 0.9093, + "step": 119140 + }, + { + "epoch": 9.233213220194505, + "grad_norm": 1.396399041432103, + "learning_rate": 4.6167854928704276e-07, + "loss": 0.9027, + "step": 119150 + }, + { + "epoch": 9.233988143670812, + "grad_norm": 1.4324447521374346, + "learning_rate": 4.617172969621823e-07, + "loss": 0.8956, + "step": 119160 + }, + { + "epoch": 9.234763067147119, + "grad_norm": 1.3851510230353603, + "learning_rate": 4.6175604463732175e-07, + "loss": 0.9004, + "step": 119170 + }, + { + "epoch": 9.235537990623426, + "grad_norm": 1.4102780193435518, + "learning_rate": 4.6179479231246127e-07, + "loss": 0.9072, + "step": 119180 + }, + { + "epoch": 9.236312914099733, + "grad_norm": 1.4369620739495303, + "learning_rate": 4.6183353998760074e-07, + "loss": 0.8949, + "step": 119190 + }, + { + "epoch": 9.23708783757604, + "grad_norm": 1.4153164270185055, + "learning_rate": 4.6187228766274026e-07, + "loss": 0.9, + "step": 119200 + }, + { + "epoch": 9.237862761052346, + "grad_norm": 1.4357811691602627, + "learning_rate": 4.6191103533787973e-07, + "loss": 0.8943, + "step": 119210 + }, + { + "epoch": 9.238637684528653, + "grad_norm": 1.512977274897079, + "learning_rate": 4.619497830130192e-07, + "loss": 0.9099, + "step": 119220 + }, + { + "epoch": 9.23941260800496, + "grad_norm": 1.5266075488881983, + "learning_rate": 4.619885306881587e-07, + "loss": 0.9053, + "step": 119230 + }, + { + "epoch": 9.240187531481267, + "grad_norm": 1.3872518345995988, + "learning_rate": 4.620272783632982e-07, + "loss": 0.9266, + "step": 119240 + }, + { + "epoch": 9.240962454957574, + "grad_norm": 1.4692823844875011, + "learning_rate": 4.620660260384377e-07, + "loss": 0.9148, + "step": 119250 + }, + { + "epoch": 9.24173737843388, + "grad_norm": 1.402989531656814, + "learning_rate": 4.621047737135772e-07, + "loss": 0.9148, + "step": 119260 + }, + { + "epoch": 9.242512301910187, + "grad_norm": 1.543620305608034, + "learning_rate": 4.621435213887167e-07, + "loss": 0.9016, + "step": 119270 + }, + { + "epoch": 9.243287225386494, + "grad_norm": 1.438034838070567, + "learning_rate": 4.6218226906385617e-07, + "loss": 0.9056, + "step": 119280 + }, + { + "epoch": 9.244062148862799, + "grad_norm": 1.5031448071826286, + "learning_rate": 4.6222101673899564e-07, + "loss": 0.9245, + "step": 119290 + }, + { + "epoch": 9.244837072339106, + "grad_norm": 1.4367090982340724, + "learning_rate": 4.6225976441413516e-07, + "loss": 0.8998, + "step": 119300 + }, + { + "epoch": 9.245611995815413, + "grad_norm": 1.4153497799387973, + "learning_rate": 4.6229851208927463e-07, + "loss": 0.9136, + "step": 119310 + }, + { + "epoch": 9.24638691929172, + "grad_norm": 1.4051611315131256, + "learning_rate": 4.6233725976441415e-07, + "loss": 0.9029, + "step": 119320 + }, + { + "epoch": 9.247161842768026, + "grad_norm": 1.4307593990343215, + "learning_rate": 4.623760074395536e-07, + "loss": 0.9058, + "step": 119330 + }, + { + "epoch": 9.247936766244333, + "grad_norm": 1.4818031450919273, + "learning_rate": 4.6241475511469314e-07, + "loss": 0.9129, + "step": 119340 + }, + { + "epoch": 9.24871168972064, + "grad_norm": 1.4035880828568157, + "learning_rate": 4.624535027898326e-07, + "loss": 0.8832, + "step": 119350 + }, + { + "epoch": 9.249486613196947, + "grad_norm": 1.4763878804451471, + "learning_rate": 4.624922504649721e-07, + "loss": 0.8761, + "step": 119360 + }, + { + "epoch": 9.250261536673253, + "grad_norm": 1.515262554678899, + "learning_rate": 4.625309981401116e-07, + "loss": 0.9034, + "step": 119370 + }, + { + "epoch": 9.25103646014956, + "grad_norm": 1.4423629320872335, + "learning_rate": 4.625697458152511e-07, + "loss": 0.894, + "step": 119380 + }, + { + "epoch": 9.251811383625867, + "grad_norm": 1.5096747756323088, + "learning_rate": 4.626084934903906e-07, + "loss": 0.9085, + "step": 119390 + }, + { + "epoch": 9.252586307102174, + "grad_norm": 1.448955301037478, + "learning_rate": 4.6264724116553006e-07, + "loss": 0.9132, + "step": 119400 + }, + { + "epoch": 9.25336123057848, + "grad_norm": 1.5372745821544125, + "learning_rate": 4.626859888406696e-07, + "loss": 0.9024, + "step": 119410 + }, + { + "epoch": 9.254136154054788, + "grad_norm": 1.3921364941055598, + "learning_rate": 4.6272473651580906e-07, + "loss": 0.8968, + "step": 119420 + }, + { + "epoch": 9.254911077531094, + "grad_norm": 1.480037522937947, + "learning_rate": 4.627634841909485e-07, + "loss": 0.9371, + "step": 119430 + }, + { + "epoch": 9.255686001007401, + "grad_norm": 1.3934792758078491, + "learning_rate": 4.6280223186608805e-07, + "loss": 0.9118, + "step": 119440 + }, + { + "epoch": 9.256460924483708, + "grad_norm": 1.3590592056201096, + "learning_rate": 4.628409795412275e-07, + "loss": 0.8887, + "step": 119450 + }, + { + "epoch": 9.257235847960015, + "grad_norm": 1.5147381953377372, + "learning_rate": 4.6287972721636704e-07, + "loss": 0.9084, + "step": 119460 + }, + { + "epoch": 9.258010771436322, + "grad_norm": 1.500031894423829, + "learning_rate": 4.629184748915065e-07, + "loss": 0.8924, + "step": 119470 + }, + { + "epoch": 9.258785694912627, + "grad_norm": 1.3916111763184944, + "learning_rate": 4.6295722256664603e-07, + "loss": 0.8856, + "step": 119480 + }, + { + "epoch": 9.259560618388933, + "grad_norm": 1.4383115451756225, + "learning_rate": 4.629959702417855e-07, + "loss": 0.9069, + "step": 119490 + }, + { + "epoch": 9.26033554186524, + "grad_norm": 1.46684822294065, + "learning_rate": 4.6303471791692497e-07, + "loss": 0.8936, + "step": 119500 + }, + { + "epoch": 9.26033554186524, + "eval_loss": 0.9128202199935913, + "eval_runtime": 325.8036, + "eval_samples_per_second": 35.208, + "eval_steps_per_second": 8.803, + "step": 119500 + }, + { + "epoch": 9.261110465341547, + "grad_norm": 1.4218935972709372, + "learning_rate": 4.630734655920645e-07, + "loss": 0.9033, + "step": 119510 + }, + { + "epoch": 9.261885388817854, + "grad_norm": 1.4512842120027583, + "learning_rate": 4.6311221326720396e-07, + "loss": 0.8945, + "step": 119520 + }, + { + "epoch": 9.26266031229416, + "grad_norm": 1.4610044392981414, + "learning_rate": 4.631509609423435e-07, + "loss": 0.9033, + "step": 119530 + }, + { + "epoch": 9.263435235770467, + "grad_norm": 1.4648396686432472, + "learning_rate": 4.6318970861748295e-07, + "loss": 0.9329, + "step": 119540 + }, + { + "epoch": 9.264210159246774, + "grad_norm": 1.4533516058542875, + "learning_rate": 4.6322845629262247e-07, + "loss": 0.9, + "step": 119550 + }, + { + "epoch": 9.264985082723081, + "grad_norm": 1.485358274995953, + "learning_rate": 4.6326720396776194e-07, + "loss": 0.897, + "step": 119560 + }, + { + "epoch": 9.265760006199388, + "grad_norm": 1.5051383038223412, + "learning_rate": 4.633059516429014e-07, + "loss": 0.93, + "step": 119570 + }, + { + "epoch": 9.266534929675695, + "grad_norm": 1.4832697158630164, + "learning_rate": 4.6334469931804093e-07, + "loss": 0.9189, + "step": 119580 + }, + { + "epoch": 9.267309853152002, + "grad_norm": 1.5686560198775468, + "learning_rate": 4.633834469931804e-07, + "loss": 0.9038, + "step": 119590 + }, + { + "epoch": 9.268084776628308, + "grad_norm": 1.459968543461006, + "learning_rate": 4.634221946683199e-07, + "loss": 0.8956, + "step": 119600 + }, + { + "epoch": 9.268859700104615, + "grad_norm": 1.451891536191317, + "learning_rate": 4.634609423434594e-07, + "loss": 0.9054, + "step": 119610 + }, + { + "epoch": 9.269634623580922, + "grad_norm": 1.6710272917709992, + "learning_rate": 4.634996900185989e-07, + "loss": 0.8974, + "step": 119620 + }, + { + "epoch": 9.270409547057229, + "grad_norm": 1.4470117549206511, + "learning_rate": 4.635384376937384e-07, + "loss": 0.8923, + "step": 119630 + }, + { + "epoch": 9.271184470533536, + "grad_norm": 1.4126635330300334, + "learning_rate": 4.6357718536887785e-07, + "loss": 0.9193, + "step": 119640 + }, + { + "epoch": 9.271959394009842, + "grad_norm": 1.3500091546786295, + "learning_rate": 4.6361593304401737e-07, + "loss": 0.886, + "step": 119650 + }, + { + "epoch": 9.272734317486147, + "grad_norm": 1.4028865488821196, + "learning_rate": 4.6365468071915684e-07, + "loss": 0.9083, + "step": 119660 + }, + { + "epoch": 9.273509240962454, + "grad_norm": 1.431761674184103, + "learning_rate": 4.6369342839429636e-07, + "loss": 0.885, + "step": 119670 + }, + { + "epoch": 9.274284164438761, + "grad_norm": 1.434869573142459, + "learning_rate": 4.6373217606943583e-07, + "loss": 0.8967, + "step": 119680 + }, + { + "epoch": 9.275059087915068, + "grad_norm": 1.4062754176874679, + "learning_rate": 4.6377092374457536e-07, + "loss": 0.8933, + "step": 119690 + }, + { + "epoch": 9.275834011391375, + "grad_norm": 1.43362761219826, + "learning_rate": 4.638096714197148e-07, + "loss": 0.8945, + "step": 119700 + }, + { + "epoch": 9.276608934867681, + "grad_norm": 1.4943424600829698, + "learning_rate": 4.638484190948543e-07, + "loss": 0.9055, + "step": 119710 + }, + { + "epoch": 9.277383858343988, + "grad_norm": 1.480972724046568, + "learning_rate": 4.638871667699938e-07, + "loss": 0.9165, + "step": 119720 + }, + { + "epoch": 9.278158781820295, + "grad_norm": 1.4488063189891942, + "learning_rate": 4.639259144451333e-07, + "loss": 0.9042, + "step": 119730 + }, + { + "epoch": 9.278933705296602, + "grad_norm": 1.361228229602148, + "learning_rate": 4.639646621202728e-07, + "loss": 0.8869, + "step": 119740 + }, + { + "epoch": 9.279708628772909, + "grad_norm": 1.5085505542623592, + "learning_rate": 4.640034097954123e-07, + "loss": 0.9071, + "step": 119750 + }, + { + "epoch": 9.280483552249216, + "grad_norm": 1.4435481382638122, + "learning_rate": 4.640421574705518e-07, + "loss": 0.9184, + "step": 119760 + }, + { + "epoch": 9.281258475725522, + "grad_norm": 1.4567372766823068, + "learning_rate": 4.6408090514569127e-07, + "loss": 0.9047, + "step": 119770 + }, + { + "epoch": 9.28203339920183, + "grad_norm": 1.5402615324565512, + "learning_rate": 4.6411965282083074e-07, + "loss": 0.8977, + "step": 119780 + }, + { + "epoch": 9.282808322678136, + "grad_norm": 1.5551220311877485, + "learning_rate": 4.6415840049597026e-07, + "loss": 0.885, + "step": 119790 + }, + { + "epoch": 9.283583246154443, + "grad_norm": 1.5535451213189437, + "learning_rate": 4.641971481711097e-07, + "loss": 0.9018, + "step": 119800 + }, + { + "epoch": 9.28435816963075, + "grad_norm": 1.4722677742754307, + "learning_rate": 4.6423589584624925e-07, + "loss": 0.9045, + "step": 119810 + }, + { + "epoch": 9.285133093107056, + "grad_norm": 1.54744347776107, + "learning_rate": 4.642746435213887e-07, + "loss": 0.9064, + "step": 119820 + }, + { + "epoch": 9.285908016583363, + "grad_norm": 1.4471971463972224, + "learning_rate": 4.643133911965282e-07, + "loss": 0.91, + "step": 119830 + }, + { + "epoch": 9.28668294005967, + "grad_norm": 1.3833052592127912, + "learning_rate": 4.643521388716677e-07, + "loss": 0.9079, + "step": 119840 + }, + { + "epoch": 9.287457863535975, + "grad_norm": 1.5190428192248844, + "learning_rate": 4.643908865468072e-07, + "loss": 0.9143, + "step": 119850 + }, + { + "epoch": 9.288232787012282, + "grad_norm": 1.4115411394742132, + "learning_rate": 4.644296342219467e-07, + "loss": 0.9026, + "step": 119860 + }, + { + "epoch": 9.289007710488589, + "grad_norm": 1.43025465322763, + "learning_rate": 4.6446838189708617e-07, + "loss": 0.9184, + "step": 119870 + }, + { + "epoch": 9.289782633964895, + "grad_norm": 1.5212377107643438, + "learning_rate": 4.645071295722257e-07, + "loss": 0.9192, + "step": 119880 + }, + { + "epoch": 9.290557557441202, + "grad_norm": 1.5136848201504975, + "learning_rate": 4.6454587724736516e-07, + "loss": 0.8907, + "step": 119890 + }, + { + "epoch": 9.291332480917509, + "grad_norm": 1.3347772314402777, + "learning_rate": 4.6458462492250463e-07, + "loss": 0.8935, + "step": 119900 + }, + { + "epoch": 9.292107404393816, + "grad_norm": 1.413397171449619, + "learning_rate": 4.6462337259764415e-07, + "loss": 0.8949, + "step": 119910 + }, + { + "epoch": 9.292882327870123, + "grad_norm": 1.3954045831561646, + "learning_rate": 4.646621202727836e-07, + "loss": 0.8937, + "step": 119920 + }, + { + "epoch": 9.29365725134643, + "grad_norm": 1.4061328001826425, + "learning_rate": 4.6470086794792314e-07, + "loss": 0.9, + "step": 119930 + }, + { + "epoch": 9.294432174822736, + "grad_norm": 1.4935342685267379, + "learning_rate": 4.647396156230626e-07, + "loss": 0.9244, + "step": 119940 + }, + { + "epoch": 9.295207098299043, + "grad_norm": 1.4153836871837309, + "learning_rate": 4.6477836329820213e-07, + "loss": 0.8893, + "step": 119950 + }, + { + "epoch": 9.29598202177535, + "grad_norm": 1.4724877846463376, + "learning_rate": 4.648171109733416e-07, + "loss": 0.8869, + "step": 119960 + }, + { + "epoch": 9.296756945251657, + "grad_norm": 1.4344999143229002, + "learning_rate": 4.6485585864848107e-07, + "loss": 0.8944, + "step": 119970 + }, + { + "epoch": 9.297531868727964, + "grad_norm": 1.5060894797391093, + "learning_rate": 4.648946063236206e-07, + "loss": 0.9025, + "step": 119980 + }, + { + "epoch": 9.29830679220427, + "grad_norm": 1.462010653442788, + "learning_rate": 4.6493335399876006e-07, + "loss": 0.9007, + "step": 119990 + }, + { + "epoch": 9.299081715680577, + "grad_norm": 1.384809251683566, + "learning_rate": 4.649721016738996e-07, + "loss": 0.9267, + "step": 120000 + }, + { + "epoch": 9.299081715680577, + "eval_loss": 0.9126284122467041, + "eval_runtime": 331.7165, + "eval_samples_per_second": 34.581, + "eval_steps_per_second": 8.646, + "step": 120000 + }, + { + "epoch": 9.299856639156884, + "grad_norm": 1.445549247776976, + "learning_rate": 4.6501084934903905e-07, + "loss": 0.9236, + "step": 120010 + }, + { + "epoch": 9.30063156263319, + "grad_norm": 1.4640668720552916, + "learning_rate": 4.650495970241786e-07, + "loss": 0.9018, + "step": 120020 + }, + { + "epoch": 9.301406486109496, + "grad_norm": 1.474224652679683, + "learning_rate": 4.6508834469931804e-07, + "loss": 0.8808, + "step": 120030 + }, + { + "epoch": 9.302181409585803, + "grad_norm": 1.5414153352202569, + "learning_rate": 4.651270923744575e-07, + "loss": 0.9, + "step": 120040 + }, + { + "epoch": 9.30295633306211, + "grad_norm": 1.5048432508228256, + "learning_rate": 4.6516584004959703e-07, + "loss": 0.9088, + "step": 120050 + }, + { + "epoch": 9.303731256538416, + "grad_norm": 1.4422870219122954, + "learning_rate": 4.652045877247365e-07, + "loss": 0.89, + "step": 120060 + }, + { + "epoch": 9.304506180014723, + "grad_norm": 1.4833356101848147, + "learning_rate": 4.65243335399876e-07, + "loss": 0.9295, + "step": 120070 + }, + { + "epoch": 9.30528110349103, + "grad_norm": 1.4299540683113423, + "learning_rate": 4.652820830750155e-07, + "loss": 0.8894, + "step": 120080 + }, + { + "epoch": 9.306056026967337, + "grad_norm": 1.4332077946102044, + "learning_rate": 4.65320830750155e-07, + "loss": 0.8997, + "step": 120090 + }, + { + "epoch": 9.306830950443644, + "grad_norm": 1.46527386074773, + "learning_rate": 4.653595784252945e-07, + "loss": 0.9095, + "step": 120100 + }, + { + "epoch": 9.30760587391995, + "grad_norm": 1.3868316679909367, + "learning_rate": 4.6539832610043395e-07, + "loss": 0.9276, + "step": 120110 + }, + { + "epoch": 9.308380797396257, + "grad_norm": 1.4665203646484046, + "learning_rate": 4.654370737755735e-07, + "loss": 0.91, + "step": 120120 + }, + { + "epoch": 9.309155720872564, + "grad_norm": 1.4398027690686235, + "learning_rate": 4.6547582145071295e-07, + "loss": 0.9051, + "step": 120130 + }, + { + "epoch": 9.30993064434887, + "grad_norm": 1.3662550176267045, + "learning_rate": 4.6551456912585247e-07, + "loss": 0.8892, + "step": 120140 + }, + { + "epoch": 9.310705567825178, + "grad_norm": 1.4035246908551906, + "learning_rate": 4.6555331680099194e-07, + "loss": 0.8788, + "step": 120150 + }, + { + "epoch": 9.311480491301484, + "grad_norm": 1.3918740214371352, + "learning_rate": 4.6559206447613146e-07, + "loss": 0.8884, + "step": 120160 + }, + { + "epoch": 9.312255414777791, + "grad_norm": 1.4420749357116986, + "learning_rate": 4.6563081215127093e-07, + "loss": 0.9117, + "step": 120170 + }, + { + "epoch": 9.313030338254098, + "grad_norm": 1.411935971168189, + "learning_rate": 4.656695598264104e-07, + "loss": 0.9196, + "step": 120180 + }, + { + "epoch": 9.313805261730405, + "grad_norm": 1.510866985257258, + "learning_rate": 4.657083075015499e-07, + "loss": 0.8977, + "step": 120190 + }, + { + "epoch": 9.314580185206712, + "grad_norm": 1.4946786516933608, + "learning_rate": 4.657470551766894e-07, + "loss": 0.9031, + "step": 120200 + }, + { + "epoch": 9.315355108683018, + "grad_norm": 1.4830552053316088, + "learning_rate": 4.657858028518289e-07, + "loss": 0.8996, + "step": 120210 + }, + { + "epoch": 9.316130032159323, + "grad_norm": 1.4744583954226593, + "learning_rate": 4.658245505269684e-07, + "loss": 0.9017, + "step": 120220 + }, + { + "epoch": 9.31690495563563, + "grad_norm": 1.5830504449504785, + "learning_rate": 4.658632982021079e-07, + "loss": 0.929, + "step": 120230 + }, + { + "epoch": 9.317679879111937, + "grad_norm": 1.4875267143969282, + "learning_rate": 4.6590204587724737e-07, + "loss": 0.9225, + "step": 120240 + }, + { + "epoch": 9.318454802588244, + "grad_norm": 1.5519928896462587, + "learning_rate": 4.6594079355238684e-07, + "loss": 0.9074, + "step": 120250 + }, + { + "epoch": 9.31922972606455, + "grad_norm": 1.6401893851458396, + "learning_rate": 4.6597954122752636e-07, + "loss": 0.905, + "step": 120260 + }, + { + "epoch": 9.320004649540857, + "grad_norm": 1.394800796561424, + "learning_rate": 4.6601828890266583e-07, + "loss": 0.9202, + "step": 120270 + }, + { + "epoch": 9.320779573017164, + "grad_norm": 1.4128402393451045, + "learning_rate": 4.6605703657780535e-07, + "loss": 0.9059, + "step": 120280 + }, + { + "epoch": 9.321554496493471, + "grad_norm": 1.4250494715841522, + "learning_rate": 4.660957842529448e-07, + "loss": 0.919, + "step": 120290 + }, + { + "epoch": 9.322329419969778, + "grad_norm": 1.5184187662524922, + "learning_rate": 4.6613453192808434e-07, + "loss": 0.9107, + "step": 120300 + }, + { + "epoch": 9.323104343446085, + "grad_norm": 1.5109676019418867, + "learning_rate": 4.661732796032238e-07, + "loss": 0.8794, + "step": 120310 + }, + { + "epoch": 9.323879266922392, + "grad_norm": 1.3779270782628605, + "learning_rate": 4.662120272783633e-07, + "loss": 0.9118, + "step": 120320 + }, + { + "epoch": 9.324654190398698, + "grad_norm": 1.4831651040252243, + "learning_rate": 4.662507749535028e-07, + "loss": 0.8962, + "step": 120330 + }, + { + "epoch": 9.325429113875005, + "grad_norm": 1.4379867768382761, + "learning_rate": 4.6628952262864227e-07, + "loss": 0.9347, + "step": 120340 + }, + { + "epoch": 9.326204037351312, + "grad_norm": 1.4591925537508232, + "learning_rate": 4.663282703037818e-07, + "loss": 0.8784, + "step": 120350 + }, + { + "epoch": 9.326978960827619, + "grad_norm": 1.4768584197058385, + "learning_rate": 4.6636701797892126e-07, + "loss": 0.9046, + "step": 120360 + }, + { + "epoch": 9.327753884303926, + "grad_norm": 1.4359988844431248, + "learning_rate": 4.664057656540608e-07, + "loss": 0.8975, + "step": 120370 + }, + { + "epoch": 9.328528807780232, + "grad_norm": 1.538699381300763, + "learning_rate": 4.6644451332920025e-07, + "loss": 0.9281, + "step": 120380 + }, + { + "epoch": 9.32930373125654, + "grad_norm": 1.529763300904556, + "learning_rate": 4.664832610043397e-07, + "loss": 0.9108, + "step": 120390 + }, + { + "epoch": 9.330078654732844, + "grad_norm": 1.5147785380800076, + "learning_rate": 4.6652200867947925e-07, + "loss": 0.9147, + "step": 120400 + }, + { + "epoch": 9.330853578209151, + "grad_norm": 1.5156702460694782, + "learning_rate": 4.665607563546187e-07, + "loss": 0.9034, + "step": 120410 + }, + { + "epoch": 9.331628501685458, + "grad_norm": 1.5494104884834377, + "learning_rate": 4.6659950402975824e-07, + "loss": 0.896, + "step": 120420 + }, + { + "epoch": 9.332403425161765, + "grad_norm": 1.4193581373917008, + "learning_rate": 4.666382517048977e-07, + "loss": 0.8847, + "step": 120430 + }, + { + "epoch": 9.333178348638071, + "grad_norm": 1.4268985056323933, + "learning_rate": 4.6667699938003723e-07, + "loss": 0.9141, + "step": 120440 + }, + { + "epoch": 9.333953272114378, + "grad_norm": 1.4546669519116118, + "learning_rate": 4.667157470551767e-07, + "loss": 0.9053, + "step": 120450 + }, + { + "epoch": 9.334728195590685, + "grad_norm": 1.4297600954475864, + "learning_rate": 4.6675449473031617e-07, + "loss": 0.9212, + "step": 120460 + }, + { + "epoch": 9.335503119066992, + "grad_norm": 1.3531531720185346, + "learning_rate": 4.667932424054557e-07, + "loss": 0.8983, + "step": 120470 + }, + { + "epoch": 9.336278042543299, + "grad_norm": 1.451551297677128, + "learning_rate": 4.6683199008059516e-07, + "loss": 0.8799, + "step": 120480 + }, + { + "epoch": 9.337052966019606, + "grad_norm": 1.4472276574737308, + "learning_rate": 4.668707377557347e-07, + "loss": 0.8765, + "step": 120490 + }, + { + "epoch": 9.337827889495912, + "grad_norm": 1.3824042151206832, + "learning_rate": 4.6690948543087415e-07, + "loss": 0.9044, + "step": 120500 + }, + { + "epoch": 9.337827889495912, + "eval_loss": 0.9123533964157104, + "eval_runtime": 331.8982, + "eval_samples_per_second": 34.562, + "eval_steps_per_second": 8.641, + "step": 120500 + }, + { + "epoch": 9.33860281297222, + "grad_norm": 1.4524293456401252, + "learning_rate": 4.6694823310601367e-07, + "loss": 0.8943, + "step": 120510 + }, + { + "epoch": 9.339377736448526, + "grad_norm": 1.3805302037793703, + "learning_rate": 4.6698698078115314e-07, + "loss": 0.911, + "step": 120520 + }, + { + "epoch": 9.340152659924833, + "grad_norm": 1.484731611679219, + "learning_rate": 4.670257284562926e-07, + "loss": 0.8861, + "step": 120530 + }, + { + "epoch": 9.34092758340114, + "grad_norm": 1.462950011545921, + "learning_rate": 4.6706447613143213e-07, + "loss": 0.9119, + "step": 120540 + }, + { + "epoch": 9.341702506877446, + "grad_norm": 1.4963726351639661, + "learning_rate": 4.671032238065716e-07, + "loss": 0.9013, + "step": 120550 + }, + { + "epoch": 9.342477430353753, + "grad_norm": 1.4449219697630589, + "learning_rate": 4.671419714817111e-07, + "loss": 0.9184, + "step": 120560 + }, + { + "epoch": 9.34325235383006, + "grad_norm": 1.4039830619623268, + "learning_rate": 4.671807191568506e-07, + "loss": 0.8916, + "step": 120570 + }, + { + "epoch": 9.344027277306367, + "grad_norm": 1.5370362214701876, + "learning_rate": 4.6721946683199006e-07, + "loss": 0.8809, + "step": 120580 + }, + { + "epoch": 9.344802200782672, + "grad_norm": 1.453848720359158, + "learning_rate": 4.672582145071296e-07, + "loss": 0.9097, + "step": 120590 + }, + { + "epoch": 9.345577124258979, + "grad_norm": 1.5677988816048642, + "learning_rate": 4.6729696218226905e-07, + "loss": 0.9256, + "step": 120600 + }, + { + "epoch": 9.346352047735285, + "grad_norm": 1.4255420095501565, + "learning_rate": 4.6733570985740857e-07, + "loss": 0.903, + "step": 120610 + }, + { + "epoch": 9.347126971211592, + "grad_norm": 1.4481583958747593, + "learning_rate": 4.6737445753254804e-07, + "loss": 0.8901, + "step": 120620 + }, + { + "epoch": 9.3479018946879, + "grad_norm": 1.4427568940757052, + "learning_rate": 4.6741320520768756e-07, + "loss": 0.9059, + "step": 120630 + }, + { + "epoch": 9.348676818164206, + "grad_norm": 1.4575102987084905, + "learning_rate": 4.6745195288282703e-07, + "loss": 0.914, + "step": 120640 + }, + { + "epoch": 9.349451741640513, + "grad_norm": 1.39405103773567, + "learning_rate": 4.674907005579665e-07, + "loss": 0.9038, + "step": 120650 + }, + { + "epoch": 9.35022666511682, + "grad_norm": 1.4349587873516065, + "learning_rate": 4.67529448233106e-07, + "loss": 0.9002, + "step": 120660 + }, + { + "epoch": 9.351001588593126, + "grad_norm": 1.424804293221819, + "learning_rate": 4.675681959082455e-07, + "loss": 0.915, + "step": 120670 + }, + { + "epoch": 9.351776512069433, + "grad_norm": 1.4792086295402709, + "learning_rate": 4.67606943583385e-07, + "loss": 0.9094, + "step": 120680 + }, + { + "epoch": 9.35255143554574, + "grad_norm": 1.5133198097416116, + "learning_rate": 4.676456912585245e-07, + "loss": 0.9107, + "step": 120690 + }, + { + "epoch": 9.353326359022047, + "grad_norm": 1.4373851408678802, + "learning_rate": 4.67684438933664e-07, + "loss": 0.9, + "step": 120700 + }, + { + "epoch": 9.354101282498354, + "grad_norm": 1.4558718019397965, + "learning_rate": 4.6772318660880347e-07, + "loss": 0.9101, + "step": 120710 + }, + { + "epoch": 9.35487620597466, + "grad_norm": 1.465424525443859, + "learning_rate": 4.6776193428394294e-07, + "loss": 0.9101, + "step": 120720 + }, + { + "epoch": 9.355651129450967, + "grad_norm": 1.4249152419066935, + "learning_rate": 4.6780068195908246e-07, + "loss": 0.9015, + "step": 120730 + }, + { + "epoch": 9.356426052927274, + "grad_norm": 1.5270740618168404, + "learning_rate": 4.6783942963422193e-07, + "loss": 0.8976, + "step": 120740 + }, + { + "epoch": 9.35720097640358, + "grad_norm": 1.4731891413377252, + "learning_rate": 4.6787817730936146e-07, + "loss": 0.8727, + "step": 120750 + }, + { + "epoch": 9.357975899879888, + "grad_norm": 1.5203781431539873, + "learning_rate": 4.679169249845009e-07, + "loss": 0.9059, + "step": 120760 + }, + { + "epoch": 9.358750823356193, + "grad_norm": 1.3793406550613379, + "learning_rate": 4.6795567265964045e-07, + "loss": 0.8794, + "step": 120770 + }, + { + "epoch": 9.3595257468325, + "grad_norm": 1.5151562098517004, + "learning_rate": 4.679944203347799e-07, + "loss": 0.9, + "step": 120780 + }, + { + "epoch": 9.360300670308806, + "grad_norm": 1.3900506929501555, + "learning_rate": 4.680331680099194e-07, + "loss": 0.8893, + "step": 120790 + }, + { + "epoch": 9.361075593785113, + "grad_norm": 1.738083538088619, + "learning_rate": 4.680719156850589e-07, + "loss": 0.9238, + "step": 120800 + }, + { + "epoch": 9.36185051726142, + "grad_norm": 1.4619305152518696, + "learning_rate": 4.681106633601984e-07, + "loss": 0.8952, + "step": 120810 + }, + { + "epoch": 9.362625440737727, + "grad_norm": 1.5022832712451146, + "learning_rate": 4.681494110353379e-07, + "loss": 0.9114, + "step": 120820 + }, + { + "epoch": 9.363400364214034, + "grad_norm": 1.5376277902772544, + "learning_rate": 4.6818815871047737e-07, + "loss": 0.916, + "step": 120830 + }, + { + "epoch": 9.36417528769034, + "grad_norm": 1.4010224616365372, + "learning_rate": 4.682269063856169e-07, + "loss": 0.9011, + "step": 120840 + }, + { + "epoch": 9.364950211166647, + "grad_norm": 1.5800574483782348, + "learning_rate": 4.6826565406075636e-07, + "loss": 0.907, + "step": 120850 + }, + { + "epoch": 9.365725134642954, + "grad_norm": 1.4564586851456127, + "learning_rate": 4.6830440173589583e-07, + "loss": 0.9321, + "step": 120860 + }, + { + "epoch": 9.36650005811926, + "grad_norm": 1.432958727597782, + "learning_rate": 4.6834314941103535e-07, + "loss": 0.9148, + "step": 120870 + }, + { + "epoch": 9.367274981595568, + "grad_norm": 1.3963149943281778, + "learning_rate": 4.683818970861748e-07, + "loss": 0.8961, + "step": 120880 + }, + { + "epoch": 9.368049905071874, + "grad_norm": 1.5622032125597194, + "learning_rate": 4.6842064476131434e-07, + "loss": 0.9349, + "step": 120890 + }, + { + "epoch": 9.368824828548181, + "grad_norm": 1.486228411032533, + "learning_rate": 4.684593924364538e-07, + "loss": 0.8951, + "step": 120900 + }, + { + "epoch": 9.369599752024488, + "grad_norm": 1.5268247657713543, + "learning_rate": 4.6849814011159333e-07, + "loss": 0.8966, + "step": 120910 + }, + { + "epoch": 9.370374675500795, + "grad_norm": 1.485974767415363, + "learning_rate": 4.685368877867328e-07, + "loss": 0.8961, + "step": 120920 + }, + { + "epoch": 9.371149598977102, + "grad_norm": 1.511533731662106, + "learning_rate": 4.6857563546187227e-07, + "loss": 0.9226, + "step": 120930 + }, + { + "epoch": 9.371924522453408, + "grad_norm": 1.4726593634437182, + "learning_rate": 4.686143831370118e-07, + "loss": 0.8869, + "step": 120940 + }, + { + "epoch": 9.372699445929715, + "grad_norm": 1.4748406078031568, + "learning_rate": 4.6865313081215126e-07, + "loss": 0.9022, + "step": 120950 + }, + { + "epoch": 9.37347436940602, + "grad_norm": 1.4951107709802205, + "learning_rate": 4.686918784872908e-07, + "loss": 0.9011, + "step": 120960 + }, + { + "epoch": 9.374249292882327, + "grad_norm": 1.4340613705440521, + "learning_rate": 4.6873062616243025e-07, + "loss": 0.8921, + "step": 120970 + }, + { + "epoch": 9.375024216358634, + "grad_norm": 1.456558776476652, + "learning_rate": 4.687693738375698e-07, + "loss": 0.9049, + "step": 120980 + }, + { + "epoch": 9.37579913983494, + "grad_norm": 1.4596318503739203, + "learning_rate": 4.688081215127093e-07, + "loss": 0.9154, + "step": 120990 + }, + { + "epoch": 9.376574063311248, + "grad_norm": 1.4239662873922285, + "learning_rate": 4.688468691878488e-07, + "loss": 0.8916, + "step": 121000 + }, + { + "epoch": 9.376574063311248, + "eval_loss": 0.9121843576431274, + "eval_runtime": 325.5814, + "eval_samples_per_second": 35.232, + "eval_steps_per_second": 8.809, + "step": 121000 + }, + { + "epoch": 9.377348986787554, + "grad_norm": 1.3954844136628455, + "learning_rate": 4.688856168629883e-07, + "loss": 0.9093, + "step": 121010 + }, + { + "epoch": 9.378123910263861, + "grad_norm": 1.5190873408394803, + "learning_rate": 4.689243645381278e-07, + "loss": 0.9117, + "step": 121020 + }, + { + "epoch": 9.378898833740168, + "grad_norm": 1.4488802058299683, + "learning_rate": 4.689631122132673e-07, + "loss": 0.9191, + "step": 121030 + }, + { + "epoch": 9.379673757216475, + "grad_norm": 1.399383775539556, + "learning_rate": 4.6900185988840675e-07, + "loss": 0.8842, + "step": 121040 + }, + { + "epoch": 9.380448680692782, + "grad_norm": 1.461648809481994, + "learning_rate": 4.6904060756354627e-07, + "loss": 0.9141, + "step": 121050 + }, + { + "epoch": 9.381223604169088, + "grad_norm": 1.452973075815023, + "learning_rate": 4.6907935523868574e-07, + "loss": 0.8837, + "step": 121060 + }, + { + "epoch": 9.381998527645395, + "grad_norm": 1.4742147854692698, + "learning_rate": 4.6911810291382526e-07, + "loss": 0.8921, + "step": 121070 + }, + { + "epoch": 9.382773451121702, + "grad_norm": 1.4556225974796466, + "learning_rate": 4.6915685058896473e-07, + "loss": 0.9256, + "step": 121080 + }, + { + "epoch": 9.383548374598009, + "grad_norm": 1.373308326322467, + "learning_rate": 4.6919559826410425e-07, + "loss": 0.8733, + "step": 121090 + }, + { + "epoch": 9.384323298074316, + "grad_norm": 1.4191650988315585, + "learning_rate": 4.692343459392437e-07, + "loss": 0.8949, + "step": 121100 + }, + { + "epoch": 9.385098221550622, + "grad_norm": 1.4885195027471725, + "learning_rate": 4.692730936143832e-07, + "loss": 0.899, + "step": 121110 + }, + { + "epoch": 9.38587314502693, + "grad_norm": 1.4333947346859124, + "learning_rate": 4.693118412895227e-07, + "loss": 0.8981, + "step": 121120 + }, + { + "epoch": 9.386648068503236, + "grad_norm": 1.490765448263973, + "learning_rate": 4.693505889646622e-07, + "loss": 0.8912, + "step": 121130 + }, + { + "epoch": 9.387422991979541, + "grad_norm": 1.5234346505635938, + "learning_rate": 4.693893366398017e-07, + "loss": 0.8938, + "step": 121140 + }, + { + "epoch": 9.388197915455848, + "grad_norm": 1.413383662263604, + "learning_rate": 4.6942808431494117e-07, + "loss": 0.9037, + "step": 121150 + }, + { + "epoch": 9.388972838932155, + "grad_norm": 1.3866171244663787, + "learning_rate": 4.694668319900807e-07, + "loss": 0.9002, + "step": 121160 + }, + { + "epoch": 9.389747762408462, + "grad_norm": 1.3977151815322566, + "learning_rate": 4.6950557966522016e-07, + "loss": 0.9051, + "step": 121170 + }, + { + "epoch": 9.390522685884768, + "grad_norm": 1.5264986237516311, + "learning_rate": 4.6954432734035963e-07, + "loss": 0.9145, + "step": 121180 + }, + { + "epoch": 9.391297609361075, + "grad_norm": 1.4035452455959074, + "learning_rate": 4.6958307501549915e-07, + "loss": 0.908, + "step": 121190 + }, + { + "epoch": 9.392072532837382, + "grad_norm": 1.4469097351000562, + "learning_rate": 4.696218226906386e-07, + "loss": 0.9037, + "step": 121200 + }, + { + "epoch": 9.392847456313689, + "grad_norm": 1.5589823777436198, + "learning_rate": 4.6966057036577814e-07, + "loss": 0.9245, + "step": 121210 + }, + { + "epoch": 9.393622379789996, + "grad_norm": 1.4418640035707866, + "learning_rate": 4.696993180409176e-07, + "loss": 0.9018, + "step": 121220 + }, + { + "epoch": 9.394397303266302, + "grad_norm": 1.4509378803339816, + "learning_rate": 4.6973806571605713e-07, + "loss": 0.9044, + "step": 121230 + }, + { + "epoch": 9.39517222674261, + "grad_norm": 1.378550504250077, + "learning_rate": 4.697768133911966e-07, + "loss": 0.9106, + "step": 121240 + }, + { + "epoch": 9.395947150218916, + "grad_norm": 1.4505852732434548, + "learning_rate": 4.6981556106633607e-07, + "loss": 0.8911, + "step": 121250 + }, + { + "epoch": 9.396722073695223, + "grad_norm": 1.511225873197802, + "learning_rate": 4.698543087414756e-07, + "loss": 0.9048, + "step": 121260 + }, + { + "epoch": 9.39749699717153, + "grad_norm": 1.5057273420875792, + "learning_rate": 4.6989305641661506e-07, + "loss": 0.9252, + "step": 121270 + }, + { + "epoch": 9.398271920647836, + "grad_norm": 1.5868253184182404, + "learning_rate": 4.699318040917546e-07, + "loss": 0.9005, + "step": 121280 + }, + { + "epoch": 9.399046844124143, + "grad_norm": 1.4587220939681802, + "learning_rate": 4.6997055176689405e-07, + "loss": 0.9102, + "step": 121290 + }, + { + "epoch": 9.39982176760045, + "grad_norm": 1.4186198613485947, + "learning_rate": 4.700092994420336e-07, + "loss": 0.9073, + "step": 121300 + }, + { + "epoch": 9.400596691076757, + "grad_norm": 1.4094225265463984, + "learning_rate": 4.7004804711717305e-07, + "loss": 0.8987, + "step": 121310 + }, + { + "epoch": 9.401371614553064, + "grad_norm": 1.4802752310914646, + "learning_rate": 4.700867947923125e-07, + "loss": 0.888, + "step": 121320 + }, + { + "epoch": 9.40214653802937, + "grad_norm": 1.4534627235977537, + "learning_rate": 4.7012554246745204e-07, + "loss": 0.9032, + "step": 121330 + }, + { + "epoch": 9.402921461505676, + "grad_norm": 1.4305081944957938, + "learning_rate": 4.701642901425915e-07, + "loss": 0.9114, + "step": 121340 + }, + { + "epoch": 9.403696384981982, + "grad_norm": 1.3932529009812908, + "learning_rate": 4.7020303781773103e-07, + "loss": 0.8913, + "step": 121350 + }, + { + "epoch": 9.40447130845829, + "grad_norm": 1.4417031203281583, + "learning_rate": 4.702417854928705e-07, + "loss": 0.8882, + "step": 121360 + }, + { + "epoch": 9.405246231934596, + "grad_norm": 1.4706490469685942, + "learning_rate": 4.7028053316801e-07, + "loss": 0.89, + "step": 121370 + }, + { + "epoch": 9.406021155410903, + "grad_norm": 1.4418124488856732, + "learning_rate": 4.703192808431495e-07, + "loss": 0.8954, + "step": 121380 + }, + { + "epoch": 9.40679607888721, + "grad_norm": 1.4312939985338549, + "learning_rate": 4.7035802851828896e-07, + "loss": 0.8903, + "step": 121390 + }, + { + "epoch": 9.407571002363516, + "grad_norm": 1.4043249744115087, + "learning_rate": 4.703967761934285e-07, + "loss": 0.8946, + "step": 121400 + }, + { + "epoch": 9.408345925839823, + "grad_norm": 1.396115977957902, + "learning_rate": 4.7043552386856795e-07, + "loss": 0.9069, + "step": 121410 + }, + { + "epoch": 9.40912084931613, + "grad_norm": 1.4086366929682703, + "learning_rate": 4.7047427154370747e-07, + "loss": 0.8877, + "step": 121420 + }, + { + "epoch": 9.409895772792437, + "grad_norm": 1.4756723207141471, + "learning_rate": 4.7051301921884694e-07, + "loss": 0.8956, + "step": 121430 + }, + { + "epoch": 9.410670696268744, + "grad_norm": 1.4741632705824808, + "learning_rate": 4.705517668939864e-07, + "loss": 0.8955, + "step": 121440 + }, + { + "epoch": 9.41144561974505, + "grad_norm": 1.4606380531142424, + "learning_rate": 4.7059051456912593e-07, + "loss": 0.9322, + "step": 121450 + }, + { + "epoch": 9.412220543221357, + "grad_norm": 1.4393423881791787, + "learning_rate": 4.706292622442654e-07, + "loss": 0.8994, + "step": 121460 + }, + { + "epoch": 9.412995466697664, + "grad_norm": 1.4341940318776492, + "learning_rate": 4.706680099194049e-07, + "loss": 0.8948, + "step": 121470 + }, + { + "epoch": 9.41377039017397, + "grad_norm": 1.3772026207530748, + "learning_rate": 4.707067575945444e-07, + "loss": 0.8921, + "step": 121480 + }, + { + "epoch": 9.414545313650278, + "grad_norm": 1.5083178857540263, + "learning_rate": 4.707455052696839e-07, + "loss": 0.8624, + "step": 121490 + }, + { + "epoch": 9.415320237126585, + "grad_norm": 1.6843165684648354, + "learning_rate": 4.707842529448234e-07, + "loss": 0.9082, + "step": 121500 + }, + { + "epoch": 9.415320237126585, + "eval_loss": 0.9118934869766235, + "eval_runtime": 328.6787, + "eval_samples_per_second": 34.9, + "eval_steps_per_second": 8.726, + "step": 121500 + }, + { + "epoch": 9.416095160602891, + "grad_norm": 1.4237490058342739, + "learning_rate": 4.7082300061996285e-07, + "loss": 0.916, + "step": 121510 + }, + { + "epoch": 9.416870084079196, + "grad_norm": 1.4676641622839446, + "learning_rate": 4.7086174829510237e-07, + "loss": 0.892, + "step": 121520 + }, + { + "epoch": 9.417645007555503, + "grad_norm": 1.487302305676805, + "learning_rate": 4.7090049597024184e-07, + "loss": 0.9045, + "step": 121530 + }, + { + "epoch": 9.41841993103181, + "grad_norm": 1.4769577403864027, + "learning_rate": 4.7093924364538136e-07, + "loss": 0.9361, + "step": 121540 + }, + { + "epoch": 9.419194854508117, + "grad_norm": 1.4323298640632023, + "learning_rate": 4.7097799132052083e-07, + "loss": 0.9008, + "step": 121550 + }, + { + "epoch": 9.419969777984424, + "grad_norm": 1.4213975147527402, + "learning_rate": 4.7101673899566035e-07, + "loss": 0.8966, + "step": 121560 + }, + { + "epoch": 9.42074470146073, + "grad_norm": 1.472999006326999, + "learning_rate": 4.710554866707998e-07, + "loss": 0.907, + "step": 121570 + }, + { + "epoch": 9.421519624937037, + "grad_norm": 1.3404895073339358, + "learning_rate": 4.710942343459393e-07, + "loss": 0.9008, + "step": 121580 + }, + { + "epoch": 9.422294548413344, + "grad_norm": 1.471171006109118, + "learning_rate": 4.711329820210788e-07, + "loss": 0.9028, + "step": 121590 + }, + { + "epoch": 9.42306947188965, + "grad_norm": 1.4495847850312102, + "learning_rate": 4.711717296962183e-07, + "loss": 0.896, + "step": 121600 + }, + { + "epoch": 9.423844395365958, + "grad_norm": 1.52215934302115, + "learning_rate": 4.712104773713578e-07, + "loss": 0.891, + "step": 121610 + }, + { + "epoch": 9.424619318842264, + "grad_norm": 1.382620901976232, + "learning_rate": 4.712492250464973e-07, + "loss": 0.8918, + "step": 121620 + }, + { + "epoch": 9.425394242318571, + "grad_norm": 1.4267863545945658, + "learning_rate": 4.712879727216368e-07, + "loss": 0.8765, + "step": 121630 + }, + { + "epoch": 9.426169165794878, + "grad_norm": 1.5128150413267987, + "learning_rate": 4.7132672039677626e-07, + "loss": 0.9007, + "step": 121640 + }, + { + "epoch": 9.426944089271185, + "grad_norm": 1.4763108076157512, + "learning_rate": 4.7136546807191573e-07, + "loss": 0.9008, + "step": 121650 + }, + { + "epoch": 9.427719012747492, + "grad_norm": 1.4003364501205138, + "learning_rate": 4.7140421574705526e-07, + "loss": 0.8964, + "step": 121660 + }, + { + "epoch": 9.428493936223798, + "grad_norm": 1.518040641100107, + "learning_rate": 4.714429634221947e-07, + "loss": 0.9314, + "step": 121670 + }, + { + "epoch": 9.429268859700105, + "grad_norm": 1.376336912132331, + "learning_rate": 4.7148171109733425e-07, + "loss": 0.894, + "step": 121680 + }, + { + "epoch": 9.430043783176412, + "grad_norm": 1.4814872757806037, + "learning_rate": 4.715204587724737e-07, + "loss": 0.8894, + "step": 121690 + }, + { + "epoch": 9.430818706652719, + "grad_norm": 1.498945249627831, + "learning_rate": 4.7155920644761324e-07, + "loss": 0.9096, + "step": 121700 + }, + { + "epoch": 9.431593630129024, + "grad_norm": 1.4443768010751479, + "learning_rate": 4.715979541227527e-07, + "loss": 0.9123, + "step": 121710 + }, + { + "epoch": 9.43236855360533, + "grad_norm": 1.4702266454092374, + "learning_rate": 4.716367017978922e-07, + "loss": 0.9203, + "step": 121720 + }, + { + "epoch": 9.433143477081638, + "grad_norm": 1.378873027059051, + "learning_rate": 4.716754494730317e-07, + "loss": 0.9321, + "step": 121730 + }, + { + "epoch": 9.433918400557944, + "grad_norm": 1.4309518684163882, + "learning_rate": 4.7171419714817117e-07, + "loss": 0.9049, + "step": 121740 + }, + { + "epoch": 9.434693324034251, + "grad_norm": 1.501952260270227, + "learning_rate": 4.717529448233107e-07, + "loss": 0.9178, + "step": 121750 + }, + { + "epoch": 9.435468247510558, + "grad_norm": 1.5587150235233624, + "learning_rate": 4.7179169249845016e-07, + "loss": 0.8982, + "step": 121760 + }, + { + "epoch": 9.436243170986865, + "grad_norm": 1.444823236042022, + "learning_rate": 4.718304401735897e-07, + "loss": 0.8748, + "step": 121770 + }, + { + "epoch": 9.437018094463172, + "grad_norm": 1.403865879788502, + "learning_rate": 4.7186918784872915e-07, + "loss": 0.9067, + "step": 121780 + }, + { + "epoch": 9.437793017939478, + "grad_norm": 1.4376632992959346, + "learning_rate": 4.719079355238686e-07, + "loss": 0.9046, + "step": 121790 + }, + { + "epoch": 9.438567941415785, + "grad_norm": 1.4117929886724168, + "learning_rate": 4.7194668319900814e-07, + "loss": 0.8889, + "step": 121800 + }, + { + "epoch": 9.439342864892092, + "grad_norm": 1.4618433152606898, + "learning_rate": 4.719854308741476e-07, + "loss": 0.9113, + "step": 121810 + }, + { + "epoch": 9.440117788368399, + "grad_norm": 1.4291311140803888, + "learning_rate": 4.7202417854928713e-07, + "loss": 0.907, + "step": 121820 + }, + { + "epoch": 9.440892711844706, + "grad_norm": 1.5284163950242866, + "learning_rate": 4.720629262244266e-07, + "loss": 0.9082, + "step": 121830 + }, + { + "epoch": 9.441667635321012, + "grad_norm": 1.4702043941942813, + "learning_rate": 4.721016738995661e-07, + "loss": 0.9053, + "step": 121840 + }, + { + "epoch": 9.44244255879732, + "grad_norm": 1.406069415975593, + "learning_rate": 4.721404215747056e-07, + "loss": 0.9075, + "step": 121850 + }, + { + "epoch": 9.443217482273626, + "grad_norm": 1.4469017218374671, + "learning_rate": 4.7217916924984506e-07, + "loss": 0.8836, + "step": 121860 + }, + { + "epoch": 9.443992405749933, + "grad_norm": 1.370236487034267, + "learning_rate": 4.722179169249846e-07, + "loss": 0.9127, + "step": 121870 + }, + { + "epoch": 9.44476732922624, + "grad_norm": 1.4279137045203747, + "learning_rate": 4.7225666460012405e-07, + "loss": 0.8891, + "step": 121880 + }, + { + "epoch": 9.445542252702545, + "grad_norm": 1.3808688498404218, + "learning_rate": 4.7229541227526357e-07, + "loss": 0.8957, + "step": 121890 + }, + { + "epoch": 9.446317176178852, + "grad_norm": 1.450040032765983, + "learning_rate": 4.7233415995040304e-07, + "loss": 0.8946, + "step": 121900 + }, + { + "epoch": 9.447092099655158, + "grad_norm": 1.4606871297217496, + "learning_rate": 4.7237290762554256e-07, + "loss": 0.916, + "step": 121910 + }, + { + "epoch": 9.447867023131465, + "grad_norm": 1.4225170901841921, + "learning_rate": 4.7241165530068203e-07, + "loss": 0.9068, + "step": 121920 + }, + { + "epoch": 9.448641946607772, + "grad_norm": 1.4739673862725755, + "learning_rate": 4.724504029758215e-07, + "loss": 0.9077, + "step": 121930 + }, + { + "epoch": 9.449416870084079, + "grad_norm": 1.6324992176597557, + "learning_rate": 4.72489150650961e-07, + "loss": 0.9006, + "step": 121940 + }, + { + "epoch": 9.450191793560386, + "grad_norm": 1.4843811537678826, + "learning_rate": 4.725278983261005e-07, + "loss": 0.9025, + "step": 121950 + }, + { + "epoch": 9.450966717036692, + "grad_norm": 1.4318184043331712, + "learning_rate": 4.7256664600124e-07, + "loss": 0.9028, + "step": 121960 + }, + { + "epoch": 9.451741640513, + "grad_norm": 1.508238089568074, + "learning_rate": 4.726053936763795e-07, + "loss": 0.9215, + "step": 121970 + }, + { + "epoch": 9.452516563989306, + "grad_norm": 1.5019277249059328, + "learning_rate": 4.72644141351519e-07, + "loss": 0.8997, + "step": 121980 + }, + { + "epoch": 9.453291487465613, + "grad_norm": 1.4656109600279672, + "learning_rate": 4.726828890266585e-07, + "loss": 0.8999, + "step": 121990 + }, + { + "epoch": 9.45406641094192, + "grad_norm": 1.4355182059029885, + "learning_rate": 4.7272163670179794e-07, + "loss": 0.8942, + "step": 122000 + }, + { + "epoch": 9.45406641094192, + "eval_loss": 0.9115082621574402, + "eval_runtime": 328.1857, + "eval_samples_per_second": 34.953, + "eval_steps_per_second": 8.739, + "step": 122000 + }, + { + "epoch": 9.454841334418226, + "grad_norm": 1.3932833109918457, + "learning_rate": 4.7276038437693747e-07, + "loss": 0.882, + "step": 122010 + }, + { + "epoch": 9.455616257894533, + "grad_norm": 1.5252159572534347, + "learning_rate": 4.7279913205207694e-07, + "loss": 0.9137, + "step": 122020 + }, + { + "epoch": 9.45639118137084, + "grad_norm": 1.4172000351621474, + "learning_rate": 4.7283787972721646e-07, + "loss": 0.9214, + "step": 122030 + }, + { + "epoch": 9.457166104847147, + "grad_norm": 1.4203845492552194, + "learning_rate": 4.728766274023559e-07, + "loss": 0.9027, + "step": 122040 + }, + { + "epoch": 9.457941028323454, + "grad_norm": 1.4508114640217182, + "learning_rate": 4.7291537507749545e-07, + "loss": 0.9109, + "step": 122050 + }, + { + "epoch": 9.45871595179976, + "grad_norm": 1.5709362515118979, + "learning_rate": 4.729541227526349e-07, + "loss": 0.8891, + "step": 122060 + }, + { + "epoch": 9.459490875276067, + "grad_norm": 1.4616009255286102, + "learning_rate": 4.729928704277744e-07, + "loss": 0.9033, + "step": 122070 + }, + { + "epoch": 9.460265798752372, + "grad_norm": 1.4775507355630388, + "learning_rate": 4.730316181029139e-07, + "loss": 0.9081, + "step": 122080 + }, + { + "epoch": 9.46104072222868, + "grad_norm": 1.3928070256347802, + "learning_rate": 4.730703657780534e-07, + "loss": 0.8828, + "step": 122090 + }, + { + "epoch": 9.461815645704986, + "grad_norm": 1.4826620017708045, + "learning_rate": 4.731091134531929e-07, + "loss": 0.9016, + "step": 122100 + }, + { + "epoch": 9.462590569181293, + "grad_norm": 1.5010768975264888, + "learning_rate": 4.7314786112833237e-07, + "loss": 0.8988, + "step": 122110 + }, + { + "epoch": 9.4633654926576, + "grad_norm": 1.3332431847221904, + "learning_rate": 4.731866088034719e-07, + "loss": 0.9021, + "step": 122120 + }, + { + "epoch": 9.464140416133906, + "grad_norm": 1.5337685239457917, + "learning_rate": 4.7322535647861136e-07, + "loss": 0.8993, + "step": 122130 + }, + { + "epoch": 9.464915339610213, + "grad_norm": 1.4494523793284622, + "learning_rate": 4.7326410415375083e-07, + "loss": 0.9144, + "step": 122140 + }, + { + "epoch": 9.46569026308652, + "grad_norm": 1.488755046099483, + "learning_rate": 4.7330285182889035e-07, + "loss": 0.8844, + "step": 122150 + }, + { + "epoch": 9.466465186562827, + "grad_norm": 1.421366216187846, + "learning_rate": 4.733415995040298e-07, + "loss": 0.8923, + "step": 122160 + }, + { + "epoch": 9.467240110039134, + "grad_norm": 1.4562535650522834, + "learning_rate": 4.7338034717916934e-07, + "loss": 0.8926, + "step": 122170 + }, + { + "epoch": 9.46801503351544, + "grad_norm": 1.4069592902058103, + "learning_rate": 4.734190948543088e-07, + "loss": 0.8801, + "step": 122180 + }, + { + "epoch": 9.468789956991747, + "grad_norm": 1.4464518871091954, + "learning_rate": 4.734578425294483e-07, + "loss": 0.8961, + "step": 122190 + }, + { + "epoch": 9.469564880468054, + "grad_norm": 1.4878118492531003, + "learning_rate": 4.734965902045878e-07, + "loss": 0.8999, + "step": 122200 + }, + { + "epoch": 9.470339803944361, + "grad_norm": 1.416664332273958, + "learning_rate": 4.7353533787972727e-07, + "loss": 0.9, + "step": 122210 + }, + { + "epoch": 9.471114727420668, + "grad_norm": 1.466224221713055, + "learning_rate": 4.735740855548668e-07, + "loss": 0.9028, + "step": 122220 + }, + { + "epoch": 9.471889650896975, + "grad_norm": 1.379774741193443, + "learning_rate": 4.7361283323000626e-07, + "loss": 0.8902, + "step": 122230 + }, + { + "epoch": 9.472664574373281, + "grad_norm": 1.4658349980469967, + "learning_rate": 4.736515809051458e-07, + "loss": 0.9051, + "step": 122240 + }, + { + "epoch": 9.473439497849588, + "grad_norm": 1.419281493615288, + "learning_rate": 4.7369032858028525e-07, + "loss": 0.902, + "step": 122250 + }, + { + "epoch": 9.474214421325893, + "grad_norm": 1.4033389757520474, + "learning_rate": 4.737290762554247e-07, + "loss": 0.8822, + "step": 122260 + }, + { + "epoch": 9.4749893448022, + "grad_norm": 1.5297385690056446, + "learning_rate": 4.7376782393056424e-07, + "loss": 0.9035, + "step": 122270 + }, + { + "epoch": 9.475764268278507, + "grad_norm": 1.5337467164763852, + "learning_rate": 4.738065716057037e-07, + "loss": 0.9185, + "step": 122280 + }, + { + "epoch": 9.476539191754814, + "grad_norm": 1.445896247451869, + "learning_rate": 4.7384531928084323e-07, + "loss": 0.8812, + "step": 122290 + }, + { + "epoch": 9.47731411523112, + "grad_norm": 1.5507512785365198, + "learning_rate": 4.738840669559827e-07, + "loss": 0.8992, + "step": 122300 + }, + { + "epoch": 9.478089038707427, + "grad_norm": 1.497212080115241, + "learning_rate": 4.739228146311222e-07, + "loss": 0.8972, + "step": 122310 + }, + { + "epoch": 9.478863962183734, + "grad_norm": 1.4299135382991144, + "learning_rate": 4.739615623062617e-07, + "loss": 0.8943, + "step": 122320 + }, + { + "epoch": 9.47963888566004, + "grad_norm": 1.4697257645282942, + "learning_rate": 4.7400030998140116e-07, + "loss": 0.9075, + "step": 122330 + }, + { + "epoch": 9.480413809136348, + "grad_norm": 1.410979958083409, + "learning_rate": 4.740390576565407e-07, + "loss": 0.8955, + "step": 122340 + }, + { + "epoch": 9.481188732612654, + "grad_norm": 1.4853053129284335, + "learning_rate": 4.7407780533168015e-07, + "loss": 0.8983, + "step": 122350 + }, + { + "epoch": 9.481963656088961, + "grad_norm": 1.4199512783776693, + "learning_rate": 4.741165530068197e-07, + "loss": 0.9127, + "step": 122360 + }, + { + "epoch": 9.482738579565268, + "grad_norm": 1.4520459412478255, + "learning_rate": 4.7415530068195915e-07, + "loss": 0.9047, + "step": 122370 + }, + { + "epoch": 9.483513503041575, + "grad_norm": 1.377449014897846, + "learning_rate": 4.7419404835709867e-07, + "loss": 0.9249, + "step": 122380 + }, + { + "epoch": 9.484288426517882, + "grad_norm": 1.4317386333843496, + "learning_rate": 4.7423279603223814e-07, + "loss": 0.896, + "step": 122390 + }, + { + "epoch": 9.485063349994189, + "grad_norm": 1.5180057051242273, + "learning_rate": 4.742715437073776e-07, + "loss": 0.9067, + "step": 122400 + }, + { + "epoch": 9.485838273470495, + "grad_norm": 1.3696635069039889, + "learning_rate": 4.7431029138251713e-07, + "loss": 0.9054, + "step": 122410 + }, + { + "epoch": 9.486613196946802, + "grad_norm": 1.494588790811581, + "learning_rate": 4.743490390576566e-07, + "loss": 0.932, + "step": 122420 + }, + { + "epoch": 9.487388120423109, + "grad_norm": 1.4461133503363166, + "learning_rate": 4.743877867327961e-07, + "loss": 0.9111, + "step": 122430 + }, + { + "epoch": 9.488163043899416, + "grad_norm": 1.4818039902084157, + "learning_rate": 4.744265344079356e-07, + "loss": 0.9077, + "step": 122440 + }, + { + "epoch": 9.48893796737572, + "grad_norm": 1.4513516903733725, + "learning_rate": 4.744652820830751e-07, + "loss": 0.9124, + "step": 122450 + }, + { + "epoch": 9.489712890852028, + "grad_norm": 1.4474241671257024, + "learning_rate": 4.745040297582146e-07, + "loss": 0.9089, + "step": 122460 + }, + { + "epoch": 9.490487814328334, + "grad_norm": 1.4937791717455882, + "learning_rate": 4.7454277743335405e-07, + "loss": 0.9052, + "step": 122470 + }, + { + "epoch": 9.491262737804641, + "grad_norm": 1.440777672070203, + "learning_rate": 4.7458152510849357e-07, + "loss": 0.9308, + "step": 122480 + }, + { + "epoch": 9.492037661280948, + "grad_norm": 1.4639036405730599, + "learning_rate": 4.7462027278363304e-07, + "loss": 0.9058, + "step": 122490 + }, + { + "epoch": 9.492812584757255, + "grad_norm": 1.4923436458101489, + "learning_rate": 4.7465902045877256e-07, + "loss": 0.9137, + "step": 122500 + }, + { + "epoch": 9.492812584757255, + "eval_loss": 0.9112920761108398, + "eval_runtime": 330.3603, + "eval_samples_per_second": 34.723, + "eval_steps_per_second": 8.681, + "step": 122500 + }, + { + "epoch": 9.493587508233562, + "grad_norm": 1.460692203500001, + "learning_rate": 4.7469776813391203e-07, + "loss": 0.9112, + "step": 122510 + }, + { + "epoch": 9.494362431709868, + "grad_norm": 1.5035242459162352, + "learning_rate": 4.7473651580905155e-07, + "loss": 0.9117, + "step": 122520 + }, + { + "epoch": 9.495137355186175, + "grad_norm": 1.5448643177712267, + "learning_rate": 4.74775263484191e-07, + "loss": 0.9074, + "step": 122530 + }, + { + "epoch": 9.495912278662482, + "grad_norm": 1.3872031455493985, + "learning_rate": 4.748140111593305e-07, + "loss": 0.8756, + "step": 122540 + }, + { + "epoch": 9.496687202138789, + "grad_norm": 1.4529012064610622, + "learning_rate": 4.7485275883447e-07, + "loss": 0.8918, + "step": 122550 + }, + { + "epoch": 9.497462125615096, + "grad_norm": 1.541033422927463, + "learning_rate": 4.748915065096095e-07, + "loss": 0.8932, + "step": 122560 + }, + { + "epoch": 9.498237049091403, + "grad_norm": 1.4774756085181806, + "learning_rate": 4.74930254184749e-07, + "loss": 0.9159, + "step": 122570 + }, + { + "epoch": 9.49901197256771, + "grad_norm": 1.462093479472888, + "learning_rate": 4.7496900185988847e-07, + "loss": 0.8989, + "step": 122580 + }, + { + "epoch": 9.499786896044016, + "grad_norm": 1.4028431666060404, + "learning_rate": 4.75007749535028e-07, + "loss": 0.8938, + "step": 122590 + }, + { + "epoch": 9.500561819520323, + "grad_norm": 1.4147203071360201, + "learning_rate": 4.7504649721016746e-07, + "loss": 0.9027, + "step": 122600 + }, + { + "epoch": 9.50133674299663, + "grad_norm": 1.4176883135815992, + "learning_rate": 4.7508524488530693e-07, + "loss": 0.914, + "step": 122610 + }, + { + "epoch": 9.502111666472937, + "grad_norm": 1.5072173115006784, + "learning_rate": 4.7512399256044645e-07, + "loss": 0.8952, + "step": 122620 + }, + { + "epoch": 9.502886589949242, + "grad_norm": 1.4407821445500018, + "learning_rate": 4.751627402355859e-07, + "loss": 0.8958, + "step": 122630 + }, + { + "epoch": 9.503661513425548, + "grad_norm": 1.4103711587159249, + "learning_rate": 4.7520148791072545e-07, + "loss": 0.8957, + "step": 122640 + }, + { + "epoch": 9.504436436901855, + "grad_norm": 1.4706738224324307, + "learning_rate": 4.752402355858649e-07, + "loss": 0.8936, + "step": 122650 + }, + { + "epoch": 9.505211360378162, + "grad_norm": 1.4206029235761228, + "learning_rate": 4.7527898326100444e-07, + "loss": 0.8916, + "step": 122660 + }, + { + "epoch": 9.505986283854469, + "grad_norm": 1.4379009621955872, + "learning_rate": 4.753177309361439e-07, + "loss": 0.9066, + "step": 122670 + }, + { + "epoch": 9.506761207330776, + "grad_norm": 1.4148617181796113, + "learning_rate": 4.753564786112834e-07, + "loss": 0.9048, + "step": 122680 + }, + { + "epoch": 9.507536130807082, + "grad_norm": 1.517445878105058, + "learning_rate": 4.753952262864229e-07, + "loss": 0.932, + "step": 122690 + }, + { + "epoch": 9.50831105428339, + "grad_norm": 1.4516747715800367, + "learning_rate": 4.7543397396156237e-07, + "loss": 0.8866, + "step": 122700 + }, + { + "epoch": 9.509085977759696, + "grad_norm": 1.3784652187113116, + "learning_rate": 4.754727216367019e-07, + "loss": 0.8906, + "step": 122710 + }, + { + "epoch": 9.509860901236003, + "grad_norm": 1.4794177638373074, + "learning_rate": 4.7551146931184136e-07, + "loss": 0.9049, + "step": 122720 + }, + { + "epoch": 9.51063582471231, + "grad_norm": 1.499021402692922, + "learning_rate": 4.755502169869809e-07, + "loss": 0.9, + "step": 122730 + }, + { + "epoch": 9.511410748188617, + "grad_norm": 1.4281960738048018, + "learning_rate": 4.7558896466212035e-07, + "loss": 0.9069, + "step": 122740 + }, + { + "epoch": 9.512185671664923, + "grad_norm": 1.4261111591045068, + "learning_rate": 4.756277123372598e-07, + "loss": 0.9004, + "step": 122750 + }, + { + "epoch": 9.51296059514123, + "grad_norm": 1.4471388670771228, + "learning_rate": 4.7566646001239934e-07, + "loss": 0.9039, + "step": 122760 + }, + { + "epoch": 9.513735518617537, + "grad_norm": 1.4501996827168286, + "learning_rate": 4.757052076875388e-07, + "loss": 0.9059, + "step": 122770 + }, + { + "epoch": 9.514510442093844, + "grad_norm": 1.537774292056697, + "learning_rate": 4.7574395536267833e-07, + "loss": 0.9022, + "step": 122780 + }, + { + "epoch": 9.51528536557015, + "grad_norm": 1.4415358176302238, + "learning_rate": 4.757827030378178e-07, + "loss": 0.8875, + "step": 122790 + }, + { + "epoch": 9.516060289046457, + "grad_norm": 1.4878797470579725, + "learning_rate": 4.758214507129573e-07, + "loss": 0.9271, + "step": 122800 + }, + { + "epoch": 9.516835212522764, + "grad_norm": 1.4395051563811418, + "learning_rate": 4.758601983880968e-07, + "loss": 0.9006, + "step": 122810 + }, + { + "epoch": 9.517610135999071, + "grad_norm": 1.5196487704162591, + "learning_rate": 4.7589894606323626e-07, + "loss": 0.9087, + "step": 122820 + }, + { + "epoch": 9.518385059475376, + "grad_norm": 1.4870398007712744, + "learning_rate": 4.759376937383758e-07, + "loss": 0.8893, + "step": 122830 + }, + { + "epoch": 9.519159982951683, + "grad_norm": 1.5386200723936772, + "learning_rate": 4.7597644141351525e-07, + "loss": 0.9174, + "step": 122840 + }, + { + "epoch": 9.51993490642799, + "grad_norm": 1.4164048652845396, + "learning_rate": 4.7601518908865477e-07, + "loss": 0.9001, + "step": 122850 + }, + { + "epoch": 9.520709829904296, + "grad_norm": 1.5341540545293064, + "learning_rate": 4.7605393676379424e-07, + "loss": 0.9259, + "step": 122860 + }, + { + "epoch": 9.521484753380603, + "grad_norm": 1.5225110653582679, + "learning_rate": 4.760926844389337e-07, + "loss": 0.9019, + "step": 122870 + }, + { + "epoch": 9.52225967685691, + "grad_norm": 1.398795030818793, + "learning_rate": 4.7613143211407323e-07, + "loss": 0.8882, + "step": 122880 + }, + { + "epoch": 9.523034600333217, + "grad_norm": 1.4862553871092825, + "learning_rate": 4.761701797892127e-07, + "loss": 0.9039, + "step": 122890 + }, + { + "epoch": 9.523809523809524, + "grad_norm": 1.4062275725053952, + "learning_rate": 4.762089274643522e-07, + "loss": 0.8878, + "step": 122900 + }, + { + "epoch": 9.52458444728583, + "grad_norm": 1.3861442758184572, + "learning_rate": 4.762476751394917e-07, + "loss": 0.899, + "step": 122910 + }, + { + "epoch": 9.525359370762137, + "grad_norm": 1.3794174232328624, + "learning_rate": 4.762864228146312e-07, + "loss": 0.8886, + "step": 122920 + }, + { + "epoch": 9.526134294238444, + "grad_norm": 1.5627747578978595, + "learning_rate": 4.763251704897707e-07, + "loss": 0.9031, + "step": 122930 + }, + { + "epoch": 9.526909217714751, + "grad_norm": 1.4841460145401904, + "learning_rate": 4.7636391816491015e-07, + "loss": 0.8955, + "step": 122940 + }, + { + "epoch": 9.527684141191058, + "grad_norm": 1.5467368663031982, + "learning_rate": 4.764026658400497e-07, + "loss": 0.8963, + "step": 122950 + }, + { + "epoch": 9.528459064667365, + "grad_norm": 1.4077590555606447, + "learning_rate": 4.7644141351518914e-07, + "loss": 0.9202, + "step": 122960 + }, + { + "epoch": 9.529233988143671, + "grad_norm": 1.4734248173329245, + "learning_rate": 4.7648016119032866e-07, + "loss": 0.8804, + "step": 122970 + }, + { + "epoch": 9.530008911619978, + "grad_norm": 1.5579514231518037, + "learning_rate": 4.7651890886546813e-07, + "loss": 0.9118, + "step": 122980 + }, + { + "epoch": 9.530783835096285, + "grad_norm": 1.4128289768358988, + "learning_rate": 4.7655765654060766e-07, + "loss": 0.8892, + "step": 122990 + }, + { + "epoch": 9.53155875857259, + "grad_norm": 1.5032925486106818, + "learning_rate": 4.765964042157471e-07, + "loss": 0.8973, + "step": 123000 + }, + { + "epoch": 9.53155875857259, + "eval_loss": 0.9111209511756897, + "eval_runtime": 331.2107, + "eval_samples_per_second": 34.634, + "eval_steps_per_second": 8.659, + "step": 123000 + }, + { + "epoch": 9.532333682048897, + "grad_norm": 1.4971968905957025, + "learning_rate": 4.766351518908866e-07, + "loss": 0.8946, + "step": 123010 + }, + { + "epoch": 9.533108605525204, + "grad_norm": 1.4219742977376324, + "learning_rate": 4.766738995660261e-07, + "loss": 0.9039, + "step": 123020 + }, + { + "epoch": 9.53388352900151, + "grad_norm": 1.3940014647486372, + "learning_rate": 4.767126472411656e-07, + "loss": 0.8885, + "step": 123030 + }, + { + "epoch": 9.534658452477817, + "grad_norm": 1.3900775775762593, + "learning_rate": 4.767513949163051e-07, + "loss": 0.8922, + "step": 123040 + }, + { + "epoch": 9.535433375954124, + "grad_norm": 1.3692912893982445, + "learning_rate": 4.767901425914446e-07, + "loss": 0.8805, + "step": 123050 + }, + { + "epoch": 9.536208299430431, + "grad_norm": 1.393782284195802, + "learning_rate": 4.768288902665841e-07, + "loss": 0.9013, + "step": 123060 + }, + { + "epoch": 9.536983222906738, + "grad_norm": 1.387606046712784, + "learning_rate": 4.768676379417236e-07, + "loss": 0.9097, + "step": 123070 + }, + { + "epoch": 9.537758146383045, + "grad_norm": 1.4345364609350455, + "learning_rate": 4.769063856168631e-07, + "loss": 0.8841, + "step": 123080 + }, + { + "epoch": 9.538533069859351, + "grad_norm": 1.446629079898762, + "learning_rate": 4.769451332920025e-07, + "loss": 0.9096, + "step": 123090 + }, + { + "epoch": 9.539307993335658, + "grad_norm": 1.4247507885034567, + "learning_rate": 4.76983880967142e-07, + "loss": 0.9073, + "step": 123100 + }, + { + "epoch": 9.540082916811965, + "grad_norm": 1.4030749931089155, + "learning_rate": 4.770226286422815e-07, + "loss": 0.9039, + "step": 123110 + }, + { + "epoch": 9.540857840288272, + "grad_norm": 1.459582137379091, + "learning_rate": 4.770613763174211e-07, + "loss": 0.8992, + "step": 123120 + }, + { + "epoch": 9.541632763764579, + "grad_norm": 1.4665502488950135, + "learning_rate": 4.771001239925605e-07, + "loss": 0.8973, + "step": 123130 + }, + { + "epoch": 9.542407687240885, + "grad_norm": 1.3297888208316138, + "learning_rate": 4.771388716677e-07, + "loss": 0.8969, + "step": 123140 + }, + { + "epoch": 9.543182610717192, + "grad_norm": 1.4158360806258559, + "learning_rate": 4.771776193428395e-07, + "loss": 0.8766, + "step": 123150 + }, + { + "epoch": 9.543957534193499, + "grad_norm": 1.451994030471087, + "learning_rate": 4.772163670179789e-07, + "loss": 0.8885, + "step": 123160 + }, + { + "epoch": 9.544732457669806, + "grad_norm": 1.4642705611860865, + "learning_rate": 4.772551146931185e-07, + "loss": 0.9082, + "step": 123170 + }, + { + "epoch": 9.545507381146113, + "grad_norm": 1.4356938129822665, + "learning_rate": 4.77293862368258e-07, + "loss": 0.8719, + "step": 123180 + }, + { + "epoch": 9.54628230462242, + "grad_norm": 1.5184483525370593, + "learning_rate": 4.773326100433975e-07, + "loss": 0.9096, + "step": 123190 + }, + { + "epoch": 9.547057228098724, + "grad_norm": 1.4113795657392165, + "learning_rate": 4.773713577185369e-07, + "loss": 0.8967, + "step": 123200 + }, + { + "epoch": 9.547832151575031, + "grad_norm": 1.4998766361472664, + "learning_rate": 4.774101053936765e-07, + "loss": 0.8888, + "step": 123210 + }, + { + "epoch": 9.548607075051338, + "grad_norm": 1.4800101232940561, + "learning_rate": 4.77448853068816e-07, + "loss": 0.9255, + "step": 123220 + }, + { + "epoch": 9.549381998527645, + "grad_norm": 1.55009842370402, + "learning_rate": 4.774876007439554e-07, + "loss": 0.89, + "step": 123230 + }, + { + "epoch": 9.550156922003952, + "grad_norm": 1.424170779489034, + "learning_rate": 4.775263484190949e-07, + "loss": 0.9003, + "step": 123240 + }, + { + "epoch": 9.550931845480259, + "grad_norm": 1.5000506551342914, + "learning_rate": 4.775650960942344e-07, + "loss": 0.8984, + "step": 123250 + }, + { + "epoch": 9.551706768956565, + "grad_norm": 1.493810474542026, + "learning_rate": 4.77603843769374e-07, + "loss": 0.8965, + "step": 123260 + }, + { + "epoch": 9.552481692432872, + "grad_norm": 1.4014855298094426, + "learning_rate": 4.776425914445134e-07, + "loss": 0.9019, + "step": 123270 + }, + { + "epoch": 9.553256615909179, + "grad_norm": 1.5616613759650921, + "learning_rate": 4.776813391196529e-07, + "loss": 0.8979, + "step": 123280 + }, + { + "epoch": 9.554031539385486, + "grad_norm": 1.4460323102973183, + "learning_rate": 4.777200867947924e-07, + "loss": 0.8882, + "step": 123290 + }, + { + "epoch": 9.554806462861793, + "grad_norm": 1.453183821509373, + "learning_rate": 4.777588344699318e-07, + "loss": 0.9021, + "step": 123300 + }, + { + "epoch": 9.5555813863381, + "grad_norm": 1.4154293564490663, + "learning_rate": 4.777975821450714e-07, + "loss": 0.8887, + "step": 123310 + }, + { + "epoch": 9.556356309814406, + "grad_norm": 1.477340476178929, + "learning_rate": 4.778363298202109e-07, + "loss": 0.9076, + "step": 123320 + }, + { + "epoch": 9.557131233290713, + "grad_norm": 1.4556062752336916, + "learning_rate": 4.778750774953504e-07, + "loss": 0.9003, + "step": 123330 + }, + { + "epoch": 9.55790615676702, + "grad_norm": 1.5260026203484305, + "learning_rate": 4.779138251704898e-07, + "loss": 0.9043, + "step": 123340 + }, + { + "epoch": 9.558681080243327, + "grad_norm": 1.4510150026893434, + "learning_rate": 4.779525728456293e-07, + "loss": 0.91, + "step": 123350 + }, + { + "epoch": 9.559456003719633, + "grad_norm": 1.4056158013485764, + "learning_rate": 4.779913205207689e-07, + "loss": 0.8936, + "step": 123360 + }, + { + "epoch": 9.560230927195938, + "grad_norm": 1.4899860247120515, + "learning_rate": 4.780300681959083e-07, + "loss": 0.9008, + "step": 123370 + }, + { + "epoch": 9.561005850672245, + "grad_norm": 1.5110907947166599, + "learning_rate": 4.780688158710478e-07, + "loss": 0.9031, + "step": 123380 + }, + { + "epoch": 9.561780774148552, + "grad_norm": 1.4589316458420019, + "learning_rate": 4.781075635461873e-07, + "loss": 0.9104, + "step": 123390 + }, + { + "epoch": 9.562555697624859, + "grad_norm": 1.4661036715779787, + "learning_rate": 4.781463112213268e-07, + "loss": 0.8951, + "step": 123400 + }, + { + "epoch": 9.563330621101166, + "grad_norm": 1.4551906390760063, + "learning_rate": 4.781850588964663e-07, + "loss": 0.9002, + "step": 123410 + }, + { + "epoch": 9.564105544577473, + "grad_norm": 1.4145761991273214, + "learning_rate": 4.782238065716058e-07, + "loss": 0.8803, + "step": 123420 + }, + { + "epoch": 9.56488046805378, + "grad_norm": 1.4273283831788843, + "learning_rate": 4.782625542467453e-07, + "loss": 0.8868, + "step": 123430 + }, + { + "epoch": 9.565655391530086, + "grad_norm": 1.476264297447644, + "learning_rate": 4.783013019218847e-07, + "loss": 0.8934, + "step": 123440 + }, + { + "epoch": 9.566430315006393, + "grad_norm": 1.4962673575932393, + "learning_rate": 4.783400495970242e-07, + "loss": 0.8916, + "step": 123450 + }, + { + "epoch": 9.5672052384827, + "grad_norm": 1.5057969721499478, + "learning_rate": 4.783787972721638e-07, + "loss": 0.9041, + "step": 123460 + }, + { + "epoch": 9.567980161959007, + "grad_norm": 1.4117899985302598, + "learning_rate": 4.784175449473033e-07, + "loss": 0.8844, + "step": 123470 + }, + { + "epoch": 9.568755085435313, + "grad_norm": 1.3479140149103825, + "learning_rate": 4.784562926224427e-07, + "loss": 0.908, + "step": 123480 + }, + { + "epoch": 9.56953000891162, + "grad_norm": 1.6509656576126306, + "learning_rate": 4.784950402975822e-07, + "loss": 0.8764, + "step": 123490 + }, + { + "epoch": 9.570304932387927, + "grad_norm": 1.3766526823820087, + "learning_rate": 4.785337879727217e-07, + "loss": 0.911, + "step": 123500 + }, + { + "epoch": 9.570304932387927, + "eval_loss": 0.9110093116760254, + "eval_runtime": 330.5675, + "eval_samples_per_second": 34.701, + "eval_steps_per_second": 8.676, + "step": 123500 + }, + { + "epoch": 9.571079855864234, + "grad_norm": 1.41151213420638, + "learning_rate": 4.785725356478612e-07, + "loss": 0.9124, + "step": 123510 + }, + { + "epoch": 9.57185477934054, + "grad_norm": 1.4043842326375198, + "learning_rate": 4.786112833230007e-07, + "loss": 0.9002, + "step": 123520 + }, + { + "epoch": 9.572629702816847, + "grad_norm": 1.521340263412688, + "learning_rate": 4.786500309981402e-07, + "loss": 0.9245, + "step": 123530 + }, + { + "epoch": 9.573404626293154, + "grad_norm": 1.475254173446478, + "learning_rate": 4.786887786732797e-07, + "loss": 0.895, + "step": 123540 + }, + { + "epoch": 9.574179549769461, + "grad_norm": 1.5374325926873467, + "learning_rate": 4.787275263484191e-07, + "loss": 0.8917, + "step": 123550 + }, + { + "epoch": 9.574954473245768, + "grad_norm": 1.5071630529076288, + "learning_rate": 4.787662740235587e-07, + "loss": 0.9128, + "step": 123560 + }, + { + "epoch": 9.575729396722073, + "grad_norm": 1.4024780756796345, + "learning_rate": 4.788050216986982e-07, + "loss": 0.898, + "step": 123570 + }, + { + "epoch": 9.57650432019838, + "grad_norm": 1.397068354571897, + "learning_rate": 4.788437693738376e-07, + "loss": 0.8856, + "step": 123580 + }, + { + "epoch": 9.577279243674687, + "grad_norm": 1.4466221821964662, + "learning_rate": 4.788825170489771e-07, + "loss": 0.8974, + "step": 123590 + }, + { + "epoch": 9.578054167150993, + "grad_norm": 1.4814627672399991, + "learning_rate": 4.789212647241166e-07, + "loss": 0.9142, + "step": 123600 + }, + { + "epoch": 9.5788290906273, + "grad_norm": 1.4217971035301993, + "learning_rate": 4.789600123992562e-07, + "loss": 0.906, + "step": 123610 + }, + { + "epoch": 9.579604014103607, + "grad_norm": 1.3955536096235652, + "learning_rate": 4.789987600743956e-07, + "loss": 0.8882, + "step": 123620 + }, + { + "epoch": 9.580378937579914, + "grad_norm": 1.4457314489485984, + "learning_rate": 4.790375077495351e-07, + "loss": 0.9204, + "step": 123630 + }, + { + "epoch": 9.58115386105622, + "grad_norm": 1.4412724077260715, + "learning_rate": 4.790762554246746e-07, + "loss": 0.8929, + "step": 123640 + }, + { + "epoch": 9.581928784532527, + "grad_norm": 1.5245042632463348, + "learning_rate": 4.79115003099814e-07, + "loss": 0.9037, + "step": 123650 + }, + { + "epoch": 9.582703708008834, + "grad_norm": 1.3958899969925207, + "learning_rate": 4.791537507749536e-07, + "loss": 0.8998, + "step": 123660 + }, + { + "epoch": 9.583478631485141, + "grad_norm": 1.3968376887327554, + "learning_rate": 4.791924984500931e-07, + "loss": 0.9053, + "step": 123670 + }, + { + "epoch": 9.584253554961448, + "grad_norm": 1.3784243028952212, + "learning_rate": 4.792312461252326e-07, + "loss": 0.9123, + "step": 123680 + }, + { + "epoch": 9.585028478437755, + "grad_norm": 1.5028927329755222, + "learning_rate": 4.79269993800372e-07, + "loss": 0.8746, + "step": 123690 + }, + { + "epoch": 9.585803401914061, + "grad_norm": 1.487934764714473, + "learning_rate": 4.793087414755115e-07, + "loss": 0.9155, + "step": 123700 + }, + { + "epoch": 9.586578325390368, + "grad_norm": 1.4620420060992023, + "learning_rate": 4.793474891506511e-07, + "loss": 0.8934, + "step": 123710 + }, + { + "epoch": 9.587353248866675, + "grad_norm": 1.4713709786607876, + "learning_rate": 4.793862368257905e-07, + "loss": 0.8965, + "step": 123720 + }, + { + "epoch": 9.588128172342982, + "grad_norm": 1.4884678704002818, + "learning_rate": 4.7942498450093e-07, + "loss": 0.8889, + "step": 123730 + }, + { + "epoch": 9.588903095819287, + "grad_norm": 1.4126208073475832, + "learning_rate": 4.794637321760695e-07, + "loss": 0.9008, + "step": 123740 + }, + { + "epoch": 9.589678019295594, + "grad_norm": 1.558728485927937, + "learning_rate": 4.79502479851209e-07, + "loss": 0.896, + "step": 123750 + }, + { + "epoch": 9.5904529427719, + "grad_norm": 1.5454602715735035, + "learning_rate": 4.795412275263485e-07, + "loss": 0.8938, + "step": 123760 + }, + { + "epoch": 9.591227866248207, + "grad_norm": 1.409567861110384, + "learning_rate": 4.79579975201488e-07, + "loss": 0.9092, + "step": 123770 + }, + { + "epoch": 9.592002789724514, + "grad_norm": 1.5088270583166061, + "learning_rate": 4.796187228766275e-07, + "loss": 0.908, + "step": 123780 + }, + { + "epoch": 9.592777713200821, + "grad_norm": 1.4703724005898011, + "learning_rate": 4.796574705517669e-07, + "loss": 0.889, + "step": 123790 + }, + { + "epoch": 9.593552636677128, + "grad_norm": 1.4805540423936534, + "learning_rate": 4.796962182269064e-07, + "loss": 0.9218, + "step": 123800 + }, + { + "epoch": 9.594327560153435, + "grad_norm": 1.4004218177822751, + "learning_rate": 4.79734965902046e-07, + "loss": 0.9001, + "step": 123810 + }, + { + "epoch": 9.595102483629741, + "grad_norm": 1.5504158205150198, + "learning_rate": 4.797737135771855e-07, + "loss": 0.8923, + "step": 123820 + }, + { + "epoch": 9.595877407106048, + "grad_norm": 1.529712961835369, + "learning_rate": 4.798124612523249e-07, + "loss": 0.8894, + "step": 123830 + }, + { + "epoch": 9.596652330582355, + "grad_norm": 1.473865096260106, + "learning_rate": 4.798512089274644e-07, + "loss": 0.9015, + "step": 123840 + }, + { + "epoch": 9.597427254058662, + "grad_norm": 1.460418643544787, + "learning_rate": 4.79889956602604e-07, + "loss": 0.9085, + "step": 123850 + }, + { + "epoch": 9.598202177534969, + "grad_norm": 1.447649158751461, + "learning_rate": 4.799287042777434e-07, + "loss": 0.9023, + "step": 123860 + }, + { + "epoch": 9.598977101011275, + "grad_norm": 1.5587792598531918, + "learning_rate": 4.799674519528829e-07, + "loss": 0.8995, + "step": 123870 + }, + { + "epoch": 9.599752024487582, + "grad_norm": 1.420573707321592, + "learning_rate": 4.800061996280224e-07, + "loss": 0.9378, + "step": 123880 + }, + { + "epoch": 9.600526947963889, + "grad_norm": 1.4203296566654644, + "learning_rate": 4.800449473031619e-07, + "loss": 0.9102, + "step": 123890 + }, + { + "epoch": 9.601301871440196, + "grad_norm": 1.645921051182685, + "learning_rate": 4.800836949783013e-07, + "loss": 0.915, + "step": 123900 + }, + { + "epoch": 9.602076794916503, + "grad_norm": 1.4566318007135546, + "learning_rate": 4.801224426534409e-07, + "loss": 0.9139, + "step": 123910 + }, + { + "epoch": 9.60285171839281, + "grad_norm": 1.511536743924663, + "learning_rate": 4.801611903285804e-07, + "loss": 0.8972, + "step": 123920 + }, + { + "epoch": 9.603626641869116, + "grad_norm": 1.3248761656933976, + "learning_rate": 4.801999380037198e-07, + "loss": 0.8867, + "step": 123930 + }, + { + "epoch": 9.604401565345421, + "grad_norm": 1.4320838436000092, + "learning_rate": 4.802386856788593e-07, + "loss": 0.8891, + "step": 123940 + }, + { + "epoch": 9.605176488821728, + "grad_norm": 1.6364027043494498, + "learning_rate": 4.802774333539989e-07, + "loss": 0.9229, + "step": 123950 + }, + { + "epoch": 9.605951412298035, + "grad_norm": 1.5155632210543382, + "learning_rate": 4.803161810291384e-07, + "loss": 0.8987, + "step": 123960 + }, + { + "epoch": 9.606726335774342, + "grad_norm": 1.476330035408959, + "learning_rate": 4.803549287042778e-07, + "loss": 0.8924, + "step": 123970 + }, + { + "epoch": 9.607501259250649, + "grad_norm": 1.469135147467304, + "learning_rate": 4.803936763794173e-07, + "loss": 0.9045, + "step": 123980 + }, + { + "epoch": 9.608276182726955, + "grad_norm": 1.4201698870905568, + "learning_rate": 4.804324240545568e-07, + "loss": 0.9126, + "step": 123990 + }, + { + "epoch": 9.609051106203262, + "grad_norm": 1.4230031722874201, + "learning_rate": 4.804711717296963e-07, + "loss": 0.8957, + "step": 124000 + }, + { + "epoch": 9.609051106203262, + "eval_loss": 0.9106069207191467, + "eval_runtime": 327.962, + "eval_samples_per_second": 34.977, + "eval_steps_per_second": 8.745, + "step": 124000 + }, + { + "epoch": 9.609826029679569, + "grad_norm": 1.510059836621672, + "learning_rate": 4.805099194048358e-07, + "loss": 0.8956, + "step": 124010 + }, + { + "epoch": 9.610600953155876, + "grad_norm": 1.4732532521160995, + "learning_rate": 4.805486670799753e-07, + "loss": 0.8944, + "step": 124020 + }, + { + "epoch": 9.611375876632183, + "grad_norm": 1.37155117955386, + "learning_rate": 4.805874147551147e-07, + "loss": 0.8828, + "step": 124030 + }, + { + "epoch": 9.61215080010849, + "grad_norm": 1.4826366641904871, + "learning_rate": 4.806261624302542e-07, + "loss": 0.9072, + "step": 124040 + }, + { + "epoch": 9.612925723584796, + "grad_norm": 1.422802905277281, + "learning_rate": 4.806649101053938e-07, + "loss": 0.9031, + "step": 124050 + }, + { + "epoch": 9.613700647061103, + "grad_norm": 1.4108956529441457, + "learning_rate": 4.807036577805333e-07, + "loss": 0.8881, + "step": 124060 + }, + { + "epoch": 9.61447557053741, + "grad_norm": 1.6114555002372652, + "learning_rate": 4.807424054556727e-07, + "loss": 0.9074, + "step": 124070 + }, + { + "epoch": 9.615250494013717, + "grad_norm": 1.567424579270805, + "learning_rate": 4.807811531308122e-07, + "loss": 0.889, + "step": 124080 + }, + { + "epoch": 9.616025417490023, + "grad_norm": 1.4426836483704863, + "learning_rate": 4.808199008059517e-07, + "loss": 0.8927, + "step": 124090 + }, + { + "epoch": 9.61680034096633, + "grad_norm": 1.3583610420871823, + "learning_rate": 4.808586484810912e-07, + "loss": 0.8985, + "step": 124100 + }, + { + "epoch": 9.617575264442637, + "grad_norm": 1.5220861512361408, + "learning_rate": 4.808973961562307e-07, + "loss": 0.8948, + "step": 124110 + }, + { + "epoch": 9.618350187918942, + "grad_norm": 1.4931646068477908, + "learning_rate": 4.809361438313702e-07, + "loss": 0.9066, + "step": 124120 + }, + { + "epoch": 9.619125111395249, + "grad_norm": 1.457164363941826, + "learning_rate": 4.809748915065097e-07, + "loss": 0.9056, + "step": 124130 + }, + { + "epoch": 9.619900034871556, + "grad_norm": 1.5239612507142108, + "learning_rate": 4.810136391816491e-07, + "loss": 0.8928, + "step": 124140 + }, + { + "epoch": 9.620674958347863, + "grad_norm": 1.4713026004947234, + "learning_rate": 4.810523868567887e-07, + "loss": 0.8947, + "step": 124150 + }, + { + "epoch": 9.62144988182417, + "grad_norm": 1.5230599436040153, + "learning_rate": 4.810911345319282e-07, + "loss": 0.8932, + "step": 124160 + }, + { + "epoch": 9.622224805300476, + "grad_norm": 1.435938947133189, + "learning_rate": 4.811298822070676e-07, + "loss": 0.8935, + "step": 124170 + }, + { + "epoch": 9.622999728776783, + "grad_norm": 1.4326066005403935, + "learning_rate": 4.811686298822071e-07, + "loss": 0.9132, + "step": 124180 + }, + { + "epoch": 9.62377465225309, + "grad_norm": 1.5143091495837, + "learning_rate": 4.812073775573466e-07, + "loss": 0.9139, + "step": 124190 + }, + { + "epoch": 9.624549575729397, + "grad_norm": 1.481518303091407, + "learning_rate": 4.812461252324862e-07, + "loss": 0.9177, + "step": 124200 + }, + { + "epoch": 9.625324499205703, + "grad_norm": 1.5018479895605608, + "learning_rate": 4.812848729076256e-07, + "loss": 0.8921, + "step": 124210 + }, + { + "epoch": 9.62609942268201, + "grad_norm": 1.4310601482323635, + "learning_rate": 4.813236205827651e-07, + "loss": 0.9, + "step": 124220 + }, + { + "epoch": 9.626874346158317, + "grad_norm": 1.4183840378064785, + "learning_rate": 4.813623682579046e-07, + "loss": 0.8932, + "step": 124230 + }, + { + "epoch": 9.627649269634624, + "grad_norm": 1.4405820637598226, + "learning_rate": 4.81401115933044e-07, + "loss": 0.8811, + "step": 124240 + }, + { + "epoch": 9.62842419311093, + "grad_norm": 1.5073998744397874, + "learning_rate": 4.814398636081836e-07, + "loss": 0.8936, + "step": 124250 + }, + { + "epoch": 9.629199116587237, + "grad_norm": 1.5488036947573245, + "learning_rate": 4.814786112833231e-07, + "loss": 0.903, + "step": 124260 + }, + { + "epoch": 9.629974040063544, + "grad_norm": 1.437695984842987, + "learning_rate": 4.815173589584626e-07, + "loss": 0.9002, + "step": 124270 + }, + { + "epoch": 9.630748963539851, + "grad_norm": 1.4354162628051808, + "learning_rate": 4.81556106633602e-07, + "loss": 0.8933, + "step": 124280 + }, + { + "epoch": 9.631523887016158, + "grad_norm": 1.4266066695045938, + "learning_rate": 4.815948543087415e-07, + "loss": 0.8969, + "step": 124290 + }, + { + "epoch": 9.632298810492465, + "grad_norm": 1.4925818107204016, + "learning_rate": 4.816336019838811e-07, + "loss": 0.8885, + "step": 124300 + }, + { + "epoch": 9.63307373396877, + "grad_norm": 1.3766845061523214, + "learning_rate": 4.816723496590205e-07, + "loss": 0.8977, + "step": 124310 + }, + { + "epoch": 9.633848657445077, + "grad_norm": 1.4403032154959963, + "learning_rate": 4.8171109733416e-07, + "loss": 0.8862, + "step": 124320 + }, + { + "epoch": 9.634623580921383, + "grad_norm": 1.3905744654918517, + "learning_rate": 4.817498450092995e-07, + "loss": 0.889, + "step": 124330 + }, + { + "epoch": 9.63539850439769, + "grad_norm": 1.4626105670591643, + "learning_rate": 4.81788592684439e-07, + "loss": 0.9172, + "step": 124340 + }, + { + "epoch": 9.636173427873997, + "grad_norm": 1.4124346855952086, + "learning_rate": 4.818273403595785e-07, + "loss": 0.8901, + "step": 124350 + }, + { + "epoch": 9.636948351350304, + "grad_norm": 1.4575320076305471, + "learning_rate": 4.81866088034718e-07, + "loss": 0.9098, + "step": 124360 + }, + { + "epoch": 9.63772327482661, + "grad_norm": 1.4253793292272878, + "learning_rate": 4.819048357098575e-07, + "loss": 0.8909, + "step": 124370 + }, + { + "epoch": 9.638498198302917, + "grad_norm": 1.4477192494954554, + "learning_rate": 4.819435833849969e-07, + "loss": 0.9036, + "step": 124380 + }, + { + "epoch": 9.639273121779224, + "grad_norm": 1.4703489899107824, + "learning_rate": 4.819823310601364e-07, + "loss": 0.9028, + "step": 124390 + }, + { + "epoch": 9.640048045255531, + "grad_norm": 1.4110670874552786, + "learning_rate": 4.82021078735276e-07, + "loss": 0.9082, + "step": 124400 + }, + { + "epoch": 9.640822968731838, + "grad_norm": 1.4207057059137196, + "learning_rate": 4.820598264104155e-07, + "loss": 0.9084, + "step": 124410 + }, + { + "epoch": 9.641597892208145, + "grad_norm": 1.510805313116273, + "learning_rate": 4.820985740855549e-07, + "loss": 0.9104, + "step": 124420 + }, + { + "epoch": 9.642372815684451, + "grad_norm": 1.568657552005395, + "learning_rate": 4.821373217606944e-07, + "loss": 0.8967, + "step": 124430 + }, + { + "epoch": 9.643147739160758, + "grad_norm": 1.4205012369259726, + "learning_rate": 4.82176069435834e-07, + "loss": 0.8927, + "step": 124440 + }, + { + "epoch": 9.643922662637065, + "grad_norm": 1.424007311447707, + "learning_rate": 4.822148171109734e-07, + "loss": 0.9274, + "step": 124450 + }, + { + "epoch": 9.644697586113372, + "grad_norm": 1.536930614823013, + "learning_rate": 4.822535647861129e-07, + "loss": 0.9127, + "step": 124460 + }, + { + "epoch": 9.645472509589679, + "grad_norm": 1.4854303917980118, + "learning_rate": 4.822923124612524e-07, + "loss": 0.8886, + "step": 124470 + }, + { + "epoch": 9.646247433065986, + "grad_norm": 1.550331196795062, + "learning_rate": 4.823310601363919e-07, + "loss": 0.8868, + "step": 124480 + }, + { + "epoch": 9.64702235654229, + "grad_norm": 1.445745056523328, + "learning_rate": 4.823698078115313e-07, + "loss": 0.9094, + "step": 124490 + }, + { + "epoch": 9.647797280018597, + "grad_norm": 1.4220315306153657, + "learning_rate": 4.824085554866709e-07, + "loss": 0.8881, + "step": 124500 + }, + { + "epoch": 9.647797280018597, + "eval_loss": 0.9103937745094299, + "eval_runtime": 328.3942, + "eval_samples_per_second": 34.931, + "eval_steps_per_second": 8.733, + "step": 124500 + }, + { + "epoch": 9.648572203494904, + "grad_norm": 1.5022816314790202, + "learning_rate": 4.824473031618104e-07, + "loss": 0.8958, + "step": 124510 + }, + { + "epoch": 9.649347126971211, + "grad_norm": 1.4698837206378357, + "learning_rate": 4.824860508369498e-07, + "loss": 0.8959, + "step": 124520 + }, + { + "epoch": 9.650122050447518, + "grad_norm": 1.4440594126511346, + "learning_rate": 4.825247985120893e-07, + "loss": 0.9124, + "step": 124530 + }, + { + "epoch": 9.650896973923825, + "grad_norm": 1.4363437510431265, + "learning_rate": 4.825635461872289e-07, + "loss": 0.8888, + "step": 124540 + }, + { + "epoch": 9.651671897400131, + "grad_norm": 1.4394713947421753, + "learning_rate": 4.826022938623684e-07, + "loss": 0.9032, + "step": 124550 + }, + { + "epoch": 9.652446820876438, + "grad_norm": 1.4324250442593272, + "learning_rate": 4.826410415375078e-07, + "loss": 0.8923, + "step": 124560 + }, + { + "epoch": 9.653221744352745, + "grad_norm": 1.4862784717454467, + "learning_rate": 4.826797892126473e-07, + "loss": 0.8834, + "step": 124570 + }, + { + "epoch": 9.653996667829052, + "grad_norm": 1.4575801677163256, + "learning_rate": 4.827185368877868e-07, + "loss": 0.9388, + "step": 124580 + }, + { + "epoch": 9.654771591305359, + "grad_norm": 1.530987526396269, + "learning_rate": 4.827572845629262e-07, + "loss": 0.9114, + "step": 124590 + }, + { + "epoch": 9.655546514781665, + "grad_norm": 1.352578649693925, + "learning_rate": 4.827960322380658e-07, + "loss": 0.8915, + "step": 124600 + }, + { + "epoch": 9.656321438257972, + "grad_norm": 1.4285357362078512, + "learning_rate": 4.828347799132053e-07, + "loss": 0.9048, + "step": 124610 + }, + { + "epoch": 9.657096361734279, + "grad_norm": 1.501975115400451, + "learning_rate": 4.828735275883448e-07, + "loss": 0.8941, + "step": 124620 + }, + { + "epoch": 9.657871285210586, + "grad_norm": 1.4173865008079998, + "learning_rate": 4.829122752634842e-07, + "loss": 0.9049, + "step": 124630 + }, + { + "epoch": 9.658646208686893, + "grad_norm": 1.4960509982608712, + "learning_rate": 4.829510229386238e-07, + "loss": 0.8888, + "step": 124640 + }, + { + "epoch": 9.6594211321632, + "grad_norm": 1.4897363590693282, + "learning_rate": 4.829897706137633e-07, + "loss": 0.9014, + "step": 124650 + }, + { + "epoch": 9.660196055639506, + "grad_norm": 1.4069847218786882, + "learning_rate": 4.830285182889027e-07, + "loss": 0.9001, + "step": 124660 + }, + { + "epoch": 9.660970979115813, + "grad_norm": 1.3972369477803785, + "learning_rate": 4.830672659640422e-07, + "loss": 0.9273, + "step": 124670 + }, + { + "epoch": 9.66174590259212, + "grad_norm": 1.4074653902638143, + "learning_rate": 4.831060136391817e-07, + "loss": 0.8978, + "step": 124680 + }, + { + "epoch": 9.662520826068425, + "grad_norm": 1.3856177481952592, + "learning_rate": 4.831447613143213e-07, + "loss": 0.9233, + "step": 124690 + }, + { + "epoch": 9.663295749544732, + "grad_norm": 1.4352014192480802, + "learning_rate": 4.831835089894607e-07, + "loss": 0.9037, + "step": 124700 + }, + { + "epoch": 9.664070673021039, + "grad_norm": 1.5507540461079168, + "learning_rate": 4.832222566646002e-07, + "loss": 0.9026, + "step": 124710 + }, + { + "epoch": 9.664845596497345, + "grad_norm": 1.4787120158240048, + "learning_rate": 4.832610043397397e-07, + "loss": 0.8834, + "step": 124720 + }, + { + "epoch": 9.665620519973652, + "grad_norm": 1.4920377375272473, + "learning_rate": 4.832997520148791e-07, + "loss": 0.9078, + "step": 124730 + }, + { + "epoch": 9.666395443449959, + "grad_norm": 1.4997558295139963, + "learning_rate": 4.833384996900187e-07, + "loss": 0.8826, + "step": 124740 + }, + { + "epoch": 9.667170366926266, + "grad_norm": 1.4588910837343207, + "learning_rate": 4.833772473651582e-07, + "loss": 0.8883, + "step": 124750 + }, + { + "epoch": 9.667945290402573, + "grad_norm": 1.5501463090360057, + "learning_rate": 4.834159950402977e-07, + "loss": 0.8969, + "step": 124760 + }, + { + "epoch": 9.66872021387888, + "grad_norm": 1.4879775338652594, + "learning_rate": 4.834547427154371e-07, + "loss": 0.8929, + "step": 124770 + }, + { + "epoch": 9.669495137355186, + "grad_norm": 1.5342101475733751, + "learning_rate": 4.834934903905766e-07, + "loss": 0.9055, + "step": 124780 + }, + { + "epoch": 9.670270060831493, + "grad_norm": 1.4335839147429248, + "learning_rate": 4.835322380657162e-07, + "loss": 0.9075, + "step": 124790 + }, + { + "epoch": 9.6710449843078, + "grad_norm": 1.4329564682179134, + "learning_rate": 4.835709857408556e-07, + "loss": 0.8887, + "step": 124800 + }, + { + "epoch": 9.671819907784107, + "grad_norm": 1.557660501703197, + "learning_rate": 4.836097334159951e-07, + "loss": 0.9142, + "step": 124810 + }, + { + "epoch": 9.672594831260414, + "grad_norm": 1.5157172051638932, + "learning_rate": 4.836484810911346e-07, + "loss": 0.9115, + "step": 124820 + }, + { + "epoch": 9.67336975473672, + "grad_norm": 1.4585896578687605, + "learning_rate": 4.836872287662741e-07, + "loss": 0.9069, + "step": 124830 + }, + { + "epoch": 9.674144678213027, + "grad_norm": 1.4908260483088698, + "learning_rate": 4.837259764414136e-07, + "loss": 0.8989, + "step": 124840 + }, + { + "epoch": 9.674919601689334, + "grad_norm": 1.5035907356521176, + "learning_rate": 4.837647241165531e-07, + "loss": 0.9072, + "step": 124850 + }, + { + "epoch": 9.675694525165639, + "grad_norm": 1.5421822184568366, + "learning_rate": 4.838034717916926e-07, + "loss": 0.9263, + "step": 124860 + }, + { + "epoch": 9.676469448641946, + "grad_norm": 1.4675296306128787, + "learning_rate": 4.83842219466832e-07, + "loss": 0.8941, + "step": 124870 + }, + { + "epoch": 9.677244372118253, + "grad_norm": 1.4737390685778606, + "learning_rate": 4.838809671419715e-07, + "loss": 0.902, + "step": 124880 + }, + { + "epoch": 9.67801929559456, + "grad_norm": 1.410903828009903, + "learning_rate": 4.839197148171111e-07, + "loss": 0.9029, + "step": 124890 + }, + { + "epoch": 9.678794219070866, + "grad_norm": 1.461138442026633, + "learning_rate": 4.839584624922506e-07, + "loss": 0.9285, + "step": 124900 + }, + { + "epoch": 9.679569142547173, + "grad_norm": 1.4366352428938216, + "learning_rate": 4.8399721016739e-07, + "loss": 0.8963, + "step": 124910 + }, + { + "epoch": 9.68034406602348, + "grad_norm": 1.4453424496824285, + "learning_rate": 4.840359578425295e-07, + "loss": 0.9087, + "step": 124920 + }, + { + "epoch": 9.681118989499787, + "grad_norm": 1.377518438902157, + "learning_rate": 4.84074705517669e-07, + "loss": 0.9092, + "step": 124930 + }, + { + "epoch": 9.681893912976093, + "grad_norm": 1.424292185621729, + "learning_rate": 4.841134531928085e-07, + "loss": 0.9117, + "step": 124940 + }, + { + "epoch": 9.6826688364524, + "grad_norm": 1.3825966330728323, + "learning_rate": 4.84152200867948e-07, + "loss": 0.897, + "step": 124950 + }, + { + "epoch": 9.683443759928707, + "grad_norm": 1.4626898901865977, + "learning_rate": 4.841909485430875e-07, + "loss": 0.8977, + "step": 124960 + }, + { + "epoch": 9.684218683405014, + "grad_norm": 1.4350726926625987, + "learning_rate": 4.84229696218227e-07, + "loss": 0.9351, + "step": 124970 + }, + { + "epoch": 9.68499360688132, + "grad_norm": 1.3348811788188915, + "learning_rate": 4.842684438933664e-07, + "loss": 0.9096, + "step": 124980 + }, + { + "epoch": 9.685768530357628, + "grad_norm": 1.4636504655486173, + "learning_rate": 4.84307191568506e-07, + "loss": 0.9115, + "step": 124990 + }, + { + "epoch": 9.686543453833934, + "grad_norm": 1.5200265660577248, + "learning_rate": 4.843459392436455e-07, + "loss": 0.8976, + "step": 125000 + }, + { + "epoch": 9.686543453833934, + "eval_loss": 0.9101827144622803, + "eval_runtime": 327.9083, + "eval_samples_per_second": 34.982, + "eval_steps_per_second": 8.746, + "step": 125000 + }, + { + "epoch": 9.687318377310241, + "grad_norm": 1.4609796005780444, + "learning_rate": 4.843846869187849e-07, + "loss": 0.8983, + "step": 125010 + }, + { + "epoch": 9.688093300786548, + "grad_norm": 1.4108457293723804, + "learning_rate": 4.844234345939244e-07, + "loss": 0.8822, + "step": 125020 + }, + { + "epoch": 9.688868224262855, + "grad_norm": 1.5457494888205503, + "learning_rate": 4.844621822690639e-07, + "loss": 0.8937, + "step": 125030 + }, + { + "epoch": 9.689643147739162, + "grad_norm": 1.5024472410600485, + "learning_rate": 4.845009299442035e-07, + "loss": 0.8912, + "step": 125040 + }, + { + "epoch": 9.690418071215468, + "grad_norm": 1.4711441598622188, + "learning_rate": 4.845396776193429e-07, + "loss": 0.8875, + "step": 125050 + }, + { + "epoch": 9.691192994691773, + "grad_norm": 1.4980373845553348, + "learning_rate": 4.845784252944824e-07, + "loss": 0.9142, + "step": 125060 + }, + { + "epoch": 9.69196791816808, + "grad_norm": 1.537579123743022, + "learning_rate": 4.846171729696219e-07, + "loss": 0.8855, + "step": 125070 + }, + { + "epoch": 9.692742841644387, + "grad_norm": 1.4203509497871392, + "learning_rate": 4.846559206447613e-07, + "loss": 0.8979, + "step": 125080 + }, + { + "epoch": 9.693517765120694, + "grad_norm": 1.5399570500266573, + "learning_rate": 4.846946683199009e-07, + "loss": 0.8991, + "step": 125090 + }, + { + "epoch": 9.694292688597, + "grad_norm": 1.4698529460542096, + "learning_rate": 4.847334159950404e-07, + "loss": 0.9012, + "step": 125100 + }, + { + "epoch": 9.695067612073307, + "grad_norm": 1.4645286249607539, + "learning_rate": 4.847721636701799e-07, + "loss": 0.9031, + "step": 125110 + }, + { + "epoch": 9.695842535549614, + "grad_norm": 1.4660636374580729, + "learning_rate": 4.848109113453193e-07, + "loss": 0.8935, + "step": 125120 + }, + { + "epoch": 9.696617459025921, + "grad_norm": 1.4577045601798595, + "learning_rate": 4.848496590204588e-07, + "loss": 0.8929, + "step": 125130 + }, + { + "epoch": 9.697392382502228, + "grad_norm": 1.4684156006834579, + "learning_rate": 4.848884066955984e-07, + "loss": 0.8983, + "step": 125140 + }, + { + "epoch": 9.698167305978535, + "grad_norm": 1.4629556207740608, + "learning_rate": 4.849271543707378e-07, + "loss": 0.9173, + "step": 125150 + }, + { + "epoch": 9.698942229454842, + "grad_norm": 1.4943440352668897, + "learning_rate": 4.849659020458773e-07, + "loss": 0.9017, + "step": 125160 + }, + { + "epoch": 9.699717152931148, + "grad_norm": 1.4611560972931965, + "learning_rate": 4.850046497210168e-07, + "loss": 0.9116, + "step": 125170 + }, + { + "epoch": 9.700492076407455, + "grad_norm": 1.3985677163418497, + "learning_rate": 4.850433973961564e-07, + "loss": 0.9062, + "step": 125180 + }, + { + "epoch": 9.701266999883762, + "grad_norm": 1.625836941919472, + "learning_rate": 4.850821450712958e-07, + "loss": 0.8958, + "step": 125190 + }, + { + "epoch": 9.702041923360069, + "grad_norm": 1.4281313274529164, + "learning_rate": 4.851208927464353e-07, + "loss": 0.8868, + "step": 125200 + }, + { + "epoch": 9.702816846836376, + "grad_norm": 1.4571712049015595, + "learning_rate": 4.851596404215748e-07, + "loss": 0.9139, + "step": 125210 + }, + { + "epoch": 9.703591770312682, + "grad_norm": 1.3935823242423524, + "learning_rate": 4.851983880967142e-07, + "loss": 0.8958, + "step": 125220 + }, + { + "epoch": 9.704366693788987, + "grad_norm": 1.4504021913502085, + "learning_rate": 4.852371357718537e-07, + "loss": 0.8907, + "step": 125230 + }, + { + "epoch": 9.705141617265294, + "grad_norm": 1.5208210772777109, + "learning_rate": 4.852758834469933e-07, + "loss": 0.9333, + "step": 125240 + }, + { + "epoch": 9.705916540741601, + "grad_norm": 1.4580162618622368, + "learning_rate": 4.853146311221328e-07, + "loss": 0.8957, + "step": 125250 + }, + { + "epoch": 9.706691464217908, + "grad_norm": 1.3805608996086571, + "learning_rate": 4.853533787972722e-07, + "loss": 0.8966, + "step": 125260 + }, + { + "epoch": 9.707466387694215, + "grad_norm": 1.522335398653141, + "learning_rate": 4.853921264724117e-07, + "loss": 0.8997, + "step": 125270 + }, + { + "epoch": 9.708241311170521, + "grad_norm": 1.393976618341518, + "learning_rate": 4.854308741475513e-07, + "loss": 0.9049, + "step": 125280 + }, + { + "epoch": 9.709016234646828, + "grad_norm": 1.4430210852124243, + "learning_rate": 4.854696218226907e-07, + "loss": 0.8973, + "step": 125290 + }, + { + "epoch": 9.709791158123135, + "grad_norm": 1.4915713861893791, + "learning_rate": 4.855083694978302e-07, + "loss": 0.9092, + "step": 125300 + }, + { + "epoch": 9.710566081599442, + "grad_norm": 1.4754979942078634, + "learning_rate": 4.855471171729697e-07, + "loss": 0.9058, + "step": 125310 + }, + { + "epoch": 9.711341005075749, + "grad_norm": 1.4777695209865394, + "learning_rate": 4.855858648481092e-07, + "loss": 0.8962, + "step": 125320 + }, + { + "epoch": 9.712115928552056, + "grad_norm": 1.5509766418515838, + "learning_rate": 4.856246125232487e-07, + "loss": 0.9135, + "step": 125330 + }, + { + "epoch": 9.712890852028362, + "grad_norm": 1.413670319530477, + "learning_rate": 4.856633601983882e-07, + "loss": 0.9027, + "step": 125340 + }, + { + "epoch": 9.71366577550467, + "grad_norm": 1.468230748466068, + "learning_rate": 4.857021078735277e-07, + "loss": 0.9025, + "step": 125350 + }, + { + "epoch": 9.714440698980976, + "grad_norm": 1.429697138047906, + "learning_rate": 4.857408555486671e-07, + "loss": 0.8968, + "step": 125360 + }, + { + "epoch": 9.715215622457283, + "grad_norm": 1.3889309197046054, + "learning_rate": 4.857796032238066e-07, + "loss": 0.8956, + "step": 125370 + }, + { + "epoch": 9.71599054593359, + "grad_norm": 1.3750364834438367, + "learning_rate": 4.858183508989462e-07, + "loss": 0.9155, + "step": 125380 + }, + { + "epoch": 9.716765469409896, + "grad_norm": 1.441107930780597, + "learning_rate": 4.858570985740857e-07, + "loss": 0.9096, + "step": 125390 + }, + { + "epoch": 9.717540392886203, + "grad_norm": 1.486256360987292, + "learning_rate": 4.858958462492251e-07, + "loss": 0.9158, + "step": 125400 + }, + { + "epoch": 9.71831531636251, + "grad_norm": 1.4343448004245545, + "learning_rate": 4.859345939243646e-07, + "loss": 0.8942, + "step": 125410 + }, + { + "epoch": 9.719090239838817, + "grad_norm": 1.4466460382225752, + "learning_rate": 4.859733415995041e-07, + "loss": 0.9078, + "step": 125420 + }, + { + "epoch": 9.719865163315122, + "grad_norm": 1.3839436968898522, + "learning_rate": 4.860120892746436e-07, + "loss": 0.8821, + "step": 125430 + }, + { + "epoch": 9.720640086791429, + "grad_norm": 1.420072756092448, + "learning_rate": 4.860508369497831e-07, + "loss": 0.9131, + "step": 125440 + }, + { + "epoch": 9.721415010267735, + "grad_norm": 1.428692876742635, + "learning_rate": 4.860895846249226e-07, + "loss": 0.8894, + "step": 125450 + }, + { + "epoch": 9.722189933744042, + "grad_norm": 1.4487645717699902, + "learning_rate": 4.86128332300062e-07, + "loss": 0.8999, + "step": 125460 + }, + { + "epoch": 9.722964857220349, + "grad_norm": 1.4466133513372754, + "learning_rate": 4.861670799752015e-07, + "loss": 0.9029, + "step": 125470 + }, + { + "epoch": 9.723739780696656, + "grad_norm": 1.4589641841735663, + "learning_rate": 4.862058276503411e-07, + "loss": 0.8828, + "step": 125480 + }, + { + "epoch": 9.724514704172963, + "grad_norm": 1.438230484621883, + "learning_rate": 4.862445753254806e-07, + "loss": 0.9008, + "step": 125490 + }, + { + "epoch": 9.72528962764927, + "grad_norm": 1.4910548474997882, + "learning_rate": 4.8628332300062e-07, + "loss": 0.8817, + "step": 125500 + }, + { + "epoch": 9.72528962764927, + "eval_loss": 0.9100252985954285, + "eval_runtime": 332.8097, + "eval_samples_per_second": 34.467, + "eval_steps_per_second": 8.618, + "step": 125500 + }, + { + "epoch": 9.726064551125576, + "grad_norm": 1.3692735658021882, + "learning_rate": 4.863220706757595e-07, + "loss": 0.8871, + "step": 125510 + }, + { + "epoch": 9.726839474601883, + "grad_norm": 1.4674828767757429, + "learning_rate": 4.86360818350899e-07, + "loss": 0.8871, + "step": 125520 + }, + { + "epoch": 9.72761439807819, + "grad_norm": 1.485453905915929, + "learning_rate": 4.863995660260385e-07, + "loss": 0.903, + "step": 125530 + }, + { + "epoch": 9.728389321554497, + "grad_norm": 1.4657406647022113, + "learning_rate": 4.86438313701178e-07, + "loss": 0.9029, + "step": 125540 + }, + { + "epoch": 9.729164245030804, + "grad_norm": 1.4069492173248201, + "learning_rate": 4.864770613763175e-07, + "loss": 0.8957, + "step": 125550 + }, + { + "epoch": 9.72993916850711, + "grad_norm": 1.4191705856627688, + "learning_rate": 4.86515809051457e-07, + "loss": 0.8944, + "step": 125560 + }, + { + "epoch": 9.730714091983417, + "grad_norm": 1.491656736047663, + "learning_rate": 4.865545567265964e-07, + "loss": 0.9422, + "step": 125570 + }, + { + "epoch": 9.731489015459724, + "grad_norm": 1.4694915588602369, + "learning_rate": 4.86593304401736e-07, + "loss": 0.9055, + "step": 125580 + }, + { + "epoch": 9.73226393893603, + "grad_norm": 1.436139215673141, + "learning_rate": 4.866320520768755e-07, + "loss": 0.9091, + "step": 125590 + }, + { + "epoch": 9.733038862412336, + "grad_norm": 1.506092244360831, + "learning_rate": 4.866707997520149e-07, + "loss": 0.9465, + "step": 125600 + }, + { + "epoch": 9.733813785888643, + "grad_norm": 1.3427909211932119, + "learning_rate": 4.867095474271544e-07, + "loss": 0.8791, + "step": 125610 + }, + { + "epoch": 9.73458870936495, + "grad_norm": 1.5044080883525517, + "learning_rate": 4.867482951022939e-07, + "loss": 0.9145, + "step": 125620 + }, + { + "epoch": 9.735363632841256, + "grad_norm": 1.381292724924431, + "learning_rate": 4.867870427774335e-07, + "loss": 0.8693, + "step": 125630 + }, + { + "epoch": 9.736138556317563, + "grad_norm": 1.3692124516909336, + "learning_rate": 4.868257904525729e-07, + "loss": 0.9057, + "step": 125640 + }, + { + "epoch": 9.73691347979387, + "grad_norm": 1.4062988112392536, + "learning_rate": 4.868645381277124e-07, + "loss": 0.9035, + "step": 125650 + }, + { + "epoch": 9.737688403270177, + "grad_norm": 1.3418810839805353, + "learning_rate": 4.869032858028519e-07, + "loss": 0.8868, + "step": 125660 + }, + { + "epoch": 9.738463326746484, + "grad_norm": 1.3787474356129987, + "learning_rate": 4.869420334779913e-07, + "loss": 0.8955, + "step": 125670 + }, + { + "epoch": 9.73923825022279, + "grad_norm": 1.3922764370643577, + "learning_rate": 4.869807811531309e-07, + "loss": 0.9058, + "step": 125680 + }, + { + "epoch": 9.740013173699097, + "grad_norm": 1.4635421537895843, + "learning_rate": 4.870195288282704e-07, + "loss": 0.8895, + "step": 125690 + }, + { + "epoch": 9.740788097175404, + "grad_norm": 1.4726765235769097, + "learning_rate": 4.870582765034099e-07, + "loss": 0.9174, + "step": 125700 + }, + { + "epoch": 9.74156302065171, + "grad_norm": 1.3753983091731201, + "learning_rate": 4.870970241785493e-07, + "loss": 0.9098, + "step": 125710 + }, + { + "epoch": 9.742337944128018, + "grad_norm": 1.5075607004688691, + "learning_rate": 4.871357718536888e-07, + "loss": 0.9161, + "step": 125720 + }, + { + "epoch": 9.743112867604324, + "grad_norm": 1.4715878173427919, + "learning_rate": 4.871745195288284e-07, + "loss": 0.8834, + "step": 125730 + }, + { + "epoch": 9.743887791080631, + "grad_norm": 1.4512699911355216, + "learning_rate": 4.872132672039678e-07, + "loss": 0.8893, + "step": 125740 + }, + { + "epoch": 9.744662714556938, + "grad_norm": 1.527510085970089, + "learning_rate": 4.872520148791073e-07, + "loss": 0.9098, + "step": 125750 + }, + { + "epoch": 9.745437638033245, + "grad_norm": 1.428589505404023, + "learning_rate": 4.872907625542468e-07, + "loss": 0.8976, + "step": 125760 + }, + { + "epoch": 9.746212561509552, + "grad_norm": 1.3832395899866539, + "learning_rate": 4.873295102293863e-07, + "loss": 0.8981, + "step": 125770 + }, + { + "epoch": 9.746987484985858, + "grad_norm": 1.6207359563365102, + "learning_rate": 4.873682579045258e-07, + "loss": 0.895, + "step": 125780 + }, + { + "epoch": 9.747762408462165, + "grad_norm": 1.5249876479542543, + "learning_rate": 4.874070055796653e-07, + "loss": 0.9023, + "step": 125790 + }, + { + "epoch": 9.74853733193847, + "grad_norm": 1.5150832470110798, + "learning_rate": 4.874457532548048e-07, + "loss": 0.8796, + "step": 125800 + }, + { + "epoch": 9.749312255414777, + "grad_norm": 1.4606084375270905, + "learning_rate": 4.874845009299442e-07, + "loss": 0.884, + "step": 125810 + }, + { + "epoch": 9.750087178891084, + "grad_norm": 1.4574065966180283, + "learning_rate": 4.875232486050837e-07, + "loss": 0.8792, + "step": 125820 + }, + { + "epoch": 9.75086210236739, + "grad_norm": 1.4143180337651833, + "learning_rate": 4.875619962802233e-07, + "loss": 0.8957, + "step": 125830 + }, + { + "epoch": 9.751637025843698, + "grad_norm": 1.4528496872265044, + "learning_rate": 4.876007439553628e-07, + "loss": 0.9027, + "step": 125840 + }, + { + "epoch": 9.752411949320004, + "grad_norm": 1.4095597079642117, + "learning_rate": 4.876394916305022e-07, + "loss": 0.8949, + "step": 125850 + }, + { + "epoch": 9.753186872796311, + "grad_norm": 1.3914969843011356, + "learning_rate": 4.876782393056417e-07, + "loss": 0.9059, + "step": 125860 + }, + { + "epoch": 9.753961796272618, + "grad_norm": 1.3866115312725211, + "learning_rate": 4.877169869807813e-07, + "loss": 0.8978, + "step": 125870 + }, + { + "epoch": 9.754736719748925, + "grad_norm": 1.554612057531171, + "learning_rate": 4.877557346559207e-07, + "loss": 0.883, + "step": 125880 + }, + { + "epoch": 9.755511643225232, + "grad_norm": 1.4476665263021047, + "learning_rate": 4.877944823310602e-07, + "loss": 0.9104, + "step": 125890 + }, + { + "epoch": 9.756286566701538, + "grad_norm": 1.4605919794716715, + "learning_rate": 4.878332300061997e-07, + "loss": 0.9019, + "step": 125900 + }, + { + "epoch": 9.757061490177845, + "grad_norm": 1.5196495879900285, + "learning_rate": 4.878719776813392e-07, + "loss": 0.8963, + "step": 125910 + }, + { + "epoch": 9.757836413654152, + "grad_norm": 1.4159110201534504, + "learning_rate": 4.879107253564786e-07, + "loss": 0.8828, + "step": 125920 + }, + { + "epoch": 9.758611337130459, + "grad_norm": 1.4614710342475823, + "learning_rate": 4.879494730316182e-07, + "loss": 0.9056, + "step": 125930 + }, + { + "epoch": 9.759386260606766, + "grad_norm": 1.5216901568604937, + "learning_rate": 4.879882207067577e-07, + "loss": 0.8974, + "step": 125940 + }, + { + "epoch": 9.760161184083072, + "grad_norm": 1.3890849048100422, + "learning_rate": 4.880269683818971e-07, + "loss": 0.898, + "step": 125950 + }, + { + "epoch": 9.76093610755938, + "grad_norm": 1.4318294789204842, + "learning_rate": 4.880657160570366e-07, + "loss": 0.878, + "step": 125960 + }, + { + "epoch": 9.761711031035686, + "grad_norm": 1.4088446021206116, + "learning_rate": 4.881044637321762e-07, + "loss": 0.9116, + "step": 125970 + }, + { + "epoch": 9.762485954511991, + "grad_norm": 1.4407954494340094, + "learning_rate": 4.881432114073157e-07, + "loss": 0.8726, + "step": 125980 + }, + { + "epoch": 9.763260877988298, + "grad_norm": 1.3589531626693614, + "learning_rate": 4.881819590824551e-07, + "loss": 0.8954, + "step": 125990 + }, + { + "epoch": 9.764035801464605, + "grad_norm": 1.4148122062568582, + "learning_rate": 4.882207067575946e-07, + "loss": 0.8882, + "step": 126000 + }, + { + "epoch": 9.764035801464605, + "eval_loss": 0.909717321395874, + "eval_runtime": 335.538, + "eval_samples_per_second": 34.187, + "eval_steps_per_second": 8.547, + "step": 126000 + }, + { + "epoch": 9.764810724940912, + "grad_norm": 1.5022998518339101, + "learning_rate": 4.882594544327341e-07, + "loss": 0.871, + "step": 126010 + }, + { + "epoch": 9.765585648417218, + "grad_norm": 1.4079014372971597, + "learning_rate": 4.882982021078736e-07, + "loss": 0.9005, + "step": 126020 + }, + { + "epoch": 9.766360571893525, + "grad_norm": 1.463813685122193, + "learning_rate": 4.883369497830131e-07, + "loss": 0.9015, + "step": 126030 + }, + { + "epoch": 9.767135495369832, + "grad_norm": 1.4776607565811137, + "learning_rate": 4.883756974581526e-07, + "loss": 0.912, + "step": 126040 + }, + { + "epoch": 9.767910418846139, + "grad_norm": 1.4434400552760518, + "learning_rate": 4.884144451332921e-07, + "loss": 0.9032, + "step": 126050 + }, + { + "epoch": 9.768685342322446, + "grad_norm": 1.5016096432975339, + "learning_rate": 4.884531928084315e-07, + "loss": 0.9168, + "step": 126060 + }, + { + "epoch": 9.769460265798752, + "grad_norm": 1.5001563271737512, + "learning_rate": 4.884919404835711e-07, + "loss": 0.8996, + "step": 126070 + }, + { + "epoch": 9.77023518927506, + "grad_norm": 1.5993518544703087, + "learning_rate": 4.885306881587106e-07, + "loss": 0.8991, + "step": 126080 + }, + { + "epoch": 9.771010112751366, + "grad_norm": 1.3856517479962622, + "learning_rate": 4.8856943583385e-07, + "loss": 0.876, + "step": 126090 + }, + { + "epoch": 9.771785036227673, + "grad_norm": 1.376813625588449, + "learning_rate": 4.886081835089895e-07, + "loss": 0.8925, + "step": 126100 + }, + { + "epoch": 9.77255995970398, + "grad_norm": 1.3919195773704405, + "learning_rate": 4.88646931184129e-07, + "loss": 0.9037, + "step": 126110 + }, + { + "epoch": 9.773334883180286, + "grad_norm": 1.3867507523345608, + "learning_rate": 4.886856788592686e-07, + "loss": 0.9051, + "step": 126120 + }, + { + "epoch": 9.774109806656593, + "grad_norm": 1.405130609668734, + "learning_rate": 4.88724426534408e-07, + "loss": 0.8932, + "step": 126130 + }, + { + "epoch": 9.7748847301329, + "grad_norm": 1.428418660522436, + "learning_rate": 4.887631742095475e-07, + "loss": 0.9055, + "step": 126140 + }, + { + "epoch": 9.775659653609207, + "grad_norm": 1.4238156957378447, + "learning_rate": 4.88801921884687e-07, + "loss": 0.9142, + "step": 126150 + }, + { + "epoch": 9.776434577085514, + "grad_norm": 1.4704124610154938, + "learning_rate": 4.888406695598264e-07, + "loss": 0.9143, + "step": 126160 + }, + { + "epoch": 9.777209500561819, + "grad_norm": 1.4085154371044233, + "learning_rate": 4.88879417234966e-07, + "loss": 0.8997, + "step": 126170 + }, + { + "epoch": 9.777984424038126, + "grad_norm": 1.5381671933427905, + "learning_rate": 4.889181649101055e-07, + "loss": 0.9027, + "step": 126180 + }, + { + "epoch": 9.778759347514432, + "grad_norm": 1.4075413490094415, + "learning_rate": 4.88956912585245e-07, + "loss": 0.8955, + "step": 126190 + }, + { + "epoch": 9.77953427099074, + "grad_norm": 1.456059648580341, + "learning_rate": 4.889956602603844e-07, + "loss": 0.8943, + "step": 126200 + }, + { + "epoch": 9.780309194467046, + "grad_norm": 1.4415545706194424, + "learning_rate": 4.890344079355239e-07, + "loss": 0.8957, + "step": 126210 + }, + { + "epoch": 9.781084117943353, + "grad_norm": 1.4302444683700288, + "learning_rate": 4.890731556106635e-07, + "loss": 0.8991, + "step": 126220 + }, + { + "epoch": 9.78185904141966, + "grad_norm": 1.4726482585848386, + "learning_rate": 4.891119032858029e-07, + "loss": 0.8969, + "step": 126230 + }, + { + "epoch": 9.782633964895966, + "grad_norm": 1.3390629450752645, + "learning_rate": 4.891506509609424e-07, + "loss": 0.8947, + "step": 126240 + }, + { + "epoch": 9.783408888372273, + "grad_norm": 1.339550034459689, + "learning_rate": 4.891893986360819e-07, + "loss": 0.8937, + "step": 126250 + }, + { + "epoch": 9.78418381184858, + "grad_norm": 1.4232711559048368, + "learning_rate": 4.892281463112214e-07, + "loss": 0.9133, + "step": 126260 + }, + { + "epoch": 9.784958735324887, + "grad_norm": 1.413493968916563, + "learning_rate": 4.892668939863609e-07, + "loss": 0.9022, + "step": 126270 + }, + { + "epoch": 9.785733658801194, + "grad_norm": 1.4803815884020637, + "learning_rate": 4.893056416615004e-07, + "loss": 0.8853, + "step": 126280 + }, + { + "epoch": 9.7865085822775, + "grad_norm": 1.4060265747740797, + "learning_rate": 4.893443893366399e-07, + "loss": 0.8935, + "step": 126290 + }, + { + "epoch": 9.787283505753807, + "grad_norm": 1.4546256465332672, + "learning_rate": 4.893831370117793e-07, + "loss": 0.9041, + "step": 126300 + }, + { + "epoch": 9.788058429230114, + "grad_norm": 1.5347944955454411, + "learning_rate": 4.894218846869188e-07, + "loss": 0.8842, + "step": 126310 + }, + { + "epoch": 9.78883335270642, + "grad_norm": 1.4018085808332872, + "learning_rate": 4.894606323620584e-07, + "loss": 0.9008, + "step": 126320 + }, + { + "epoch": 9.789608276182728, + "grad_norm": 1.510406156795688, + "learning_rate": 4.894993800371979e-07, + "loss": 0.9061, + "step": 126330 + }, + { + "epoch": 9.790383199659034, + "grad_norm": 1.4045857448823584, + "learning_rate": 4.895381277123373e-07, + "loss": 0.9245, + "step": 126340 + }, + { + "epoch": 9.79115812313534, + "grad_norm": 1.4907700307144451, + "learning_rate": 4.895768753874768e-07, + "loss": 0.9053, + "step": 126350 + }, + { + "epoch": 9.791933046611646, + "grad_norm": 1.466700916736669, + "learning_rate": 4.896156230626163e-07, + "loss": 0.8992, + "step": 126360 + }, + { + "epoch": 9.792707970087953, + "grad_norm": 1.361634603765621, + "learning_rate": 4.896543707377558e-07, + "loss": 0.8818, + "step": 126370 + }, + { + "epoch": 9.79348289356426, + "grad_norm": 1.4602447608445623, + "learning_rate": 4.896931184128953e-07, + "loss": 0.91, + "step": 126380 + }, + { + "epoch": 9.794257817040567, + "grad_norm": 1.4178777169491614, + "learning_rate": 4.897318660880348e-07, + "loss": 0.9196, + "step": 126390 + }, + { + "epoch": 9.795032740516874, + "grad_norm": 1.3615338916093278, + "learning_rate": 4.897706137631743e-07, + "loss": 0.8963, + "step": 126400 + }, + { + "epoch": 9.79580766399318, + "grad_norm": 1.4294230344441212, + "learning_rate": 4.898093614383137e-07, + "loss": 0.9037, + "step": 126410 + }, + { + "epoch": 9.796582587469487, + "grad_norm": 1.4743900604043758, + "learning_rate": 4.898481091134533e-07, + "loss": 0.9112, + "step": 126420 + }, + { + "epoch": 9.797357510945794, + "grad_norm": 1.4469029820045631, + "learning_rate": 4.898868567885928e-07, + "loss": 0.8907, + "step": 126430 + }, + { + "epoch": 9.7981324344221, + "grad_norm": 1.4194147182620216, + "learning_rate": 4.899256044637322e-07, + "loss": 0.8935, + "step": 126440 + }, + { + "epoch": 9.798907357898408, + "grad_norm": 1.4641055795174664, + "learning_rate": 4.899643521388717e-07, + "loss": 0.91, + "step": 126450 + }, + { + "epoch": 9.799682281374714, + "grad_norm": 1.4050967600758069, + "learning_rate": 4.900030998140112e-07, + "loss": 0.9111, + "step": 126460 + }, + { + "epoch": 9.800457204851021, + "grad_norm": 1.3711380204450718, + "learning_rate": 4.900418474891508e-07, + "loss": 0.9093, + "step": 126470 + }, + { + "epoch": 9.801232128327328, + "grad_norm": 1.5477258923440418, + "learning_rate": 4.900805951642902e-07, + "loss": 0.8771, + "step": 126480 + }, + { + "epoch": 9.802007051803635, + "grad_norm": 1.509168605316422, + "learning_rate": 4.901193428394297e-07, + "loss": 0.9122, + "step": 126490 + }, + { + "epoch": 9.802781975279942, + "grad_norm": 1.4722943584156367, + "learning_rate": 4.901580905145692e-07, + "loss": 0.9094, + "step": 126500 + }, + { + "epoch": 9.802781975279942, + "eval_loss": 0.9093234539031982, + "eval_runtime": 327.3611, + "eval_samples_per_second": 35.041, + "eval_steps_per_second": 8.761, + "step": 126500 + }, + { + "epoch": 9.803556898756248, + "grad_norm": 1.4173637049135757, + "learning_rate": 4.901968381897086e-07, + "loss": 0.9017, + "step": 126510 + }, + { + "epoch": 9.804331822232555, + "grad_norm": 1.4392790349933768, + "learning_rate": 4.902355858648482e-07, + "loss": 0.9024, + "step": 126520 + }, + { + "epoch": 9.805106745708862, + "grad_norm": 1.4197977527134176, + "learning_rate": 4.902743335399877e-07, + "loss": 0.9212, + "step": 126530 + }, + { + "epoch": 9.805881669185169, + "grad_norm": 1.3680075148035973, + "learning_rate": 4.903130812151272e-07, + "loss": 0.8901, + "step": 126540 + }, + { + "epoch": 9.806656592661474, + "grad_norm": 1.4967778902436644, + "learning_rate": 4.903518288902666e-07, + "loss": 0.9074, + "step": 126550 + }, + { + "epoch": 9.80743151613778, + "grad_norm": 1.4273333196743077, + "learning_rate": 4.903905765654061e-07, + "loss": 0.8897, + "step": 126560 + }, + { + "epoch": 9.808206439614088, + "grad_norm": 1.4239308795230585, + "learning_rate": 4.904293242405457e-07, + "loss": 0.9029, + "step": 126570 + }, + { + "epoch": 9.808981363090394, + "grad_norm": 1.4245639029653254, + "learning_rate": 4.904680719156851e-07, + "loss": 0.8992, + "step": 126580 + }, + { + "epoch": 9.809756286566701, + "grad_norm": 1.4406745783726698, + "learning_rate": 4.905068195908246e-07, + "loss": 0.9072, + "step": 126590 + }, + { + "epoch": 9.810531210043008, + "grad_norm": 1.3807891603650229, + "learning_rate": 4.905455672659641e-07, + "loss": 0.8932, + "step": 126600 + }, + { + "epoch": 9.811306133519315, + "grad_norm": 1.3766868421703315, + "learning_rate": 4.905843149411037e-07, + "loss": 0.9185, + "step": 126610 + }, + { + "epoch": 9.812081056995622, + "grad_norm": 1.4913474212456788, + "learning_rate": 4.906230626162431e-07, + "loss": 0.898, + "step": 126620 + }, + { + "epoch": 9.812855980471928, + "grad_norm": 1.5449392571516765, + "learning_rate": 4.906618102913826e-07, + "loss": 0.8905, + "step": 126630 + }, + { + "epoch": 9.813630903948235, + "grad_norm": 1.4670069969132868, + "learning_rate": 4.907005579665221e-07, + "loss": 0.9204, + "step": 126640 + }, + { + "epoch": 9.814405827424542, + "grad_norm": 1.551175106113056, + "learning_rate": 4.907393056416615e-07, + "loss": 0.8913, + "step": 126650 + }, + { + "epoch": 9.815180750900849, + "grad_norm": 1.4256075617374817, + "learning_rate": 4.90778053316801e-07, + "loss": 0.9003, + "step": 126660 + }, + { + "epoch": 9.815955674377156, + "grad_norm": 1.4842838484063938, + "learning_rate": 4.908168009919406e-07, + "loss": 0.8938, + "step": 126670 + }, + { + "epoch": 9.816730597853462, + "grad_norm": 1.5362581382558311, + "learning_rate": 4.908555486670801e-07, + "loss": 0.9011, + "step": 126680 + }, + { + "epoch": 9.81750552132977, + "grad_norm": 1.5774261176064113, + "learning_rate": 4.908942963422195e-07, + "loss": 0.9072, + "step": 126690 + }, + { + "epoch": 9.818280444806076, + "grad_norm": 1.3578493775080516, + "learning_rate": 4.90933044017359e-07, + "loss": 0.9083, + "step": 126700 + }, + { + "epoch": 9.819055368282383, + "grad_norm": 1.4531419586049399, + "learning_rate": 4.909717916924986e-07, + "loss": 0.8861, + "step": 126710 + }, + { + "epoch": 9.819830291758688, + "grad_norm": 1.4630896906777033, + "learning_rate": 4.91010539367638e-07, + "loss": 0.9164, + "step": 126720 + }, + { + "epoch": 9.820605215234995, + "grad_norm": 1.4474401110906256, + "learning_rate": 4.910492870427775e-07, + "loss": 0.9054, + "step": 126730 + }, + { + "epoch": 9.821380138711302, + "grad_norm": 1.3904746971145439, + "learning_rate": 4.91088034717917e-07, + "loss": 0.8808, + "step": 126740 + }, + { + "epoch": 9.822155062187608, + "grad_norm": 1.4022995051999105, + "learning_rate": 4.911267823930565e-07, + "loss": 0.9043, + "step": 126750 + }, + { + "epoch": 9.822929985663915, + "grad_norm": 1.3353724142985757, + "learning_rate": 4.91165530068196e-07, + "loss": 0.927, + "step": 126760 + }, + { + "epoch": 9.823704909140222, + "grad_norm": 1.5409250209375027, + "learning_rate": 4.912042777433355e-07, + "loss": 0.8833, + "step": 126770 + }, + { + "epoch": 9.824479832616529, + "grad_norm": 1.476058460526329, + "learning_rate": 4.91243025418475e-07, + "loss": 0.893, + "step": 126780 + }, + { + "epoch": 9.825254756092836, + "grad_norm": 1.4651249526178032, + "learning_rate": 4.912817730936144e-07, + "loss": 0.8973, + "step": 126790 + }, + { + "epoch": 9.826029679569142, + "grad_norm": 1.4248706282266494, + "learning_rate": 4.913205207687539e-07, + "loss": 0.911, + "step": 126800 + }, + { + "epoch": 9.82680460304545, + "grad_norm": 1.5042659704737809, + "learning_rate": 4.913592684438935e-07, + "loss": 0.9089, + "step": 126810 + }, + { + "epoch": 9.827579526521756, + "grad_norm": 1.4347139044467447, + "learning_rate": 4.91398016119033e-07, + "loss": 0.9159, + "step": 126820 + }, + { + "epoch": 9.828354449998063, + "grad_norm": 1.4027127008059124, + "learning_rate": 4.914367637941724e-07, + "loss": 0.9143, + "step": 126830 + }, + { + "epoch": 9.82912937347437, + "grad_norm": 1.4883659302872958, + "learning_rate": 4.914755114693119e-07, + "loss": 0.8974, + "step": 126840 + }, + { + "epoch": 9.829904296950676, + "grad_norm": 1.3708878902354218, + "learning_rate": 4.915142591444514e-07, + "loss": 0.903, + "step": 126850 + }, + { + "epoch": 9.830679220426983, + "grad_norm": 1.529720072819278, + "learning_rate": 4.915530068195909e-07, + "loss": 0.907, + "step": 126860 + }, + { + "epoch": 9.83145414390329, + "grad_norm": 1.513584806547159, + "learning_rate": 4.915917544947304e-07, + "loss": 0.889, + "step": 126870 + }, + { + "epoch": 9.832229067379597, + "grad_norm": 1.5652867824765777, + "learning_rate": 4.916305021698699e-07, + "loss": 0.9089, + "step": 126880 + }, + { + "epoch": 9.833003990855904, + "grad_norm": 1.5382354196770363, + "learning_rate": 4.916692498450094e-07, + "loss": 0.8913, + "step": 126890 + }, + { + "epoch": 9.83377891433221, + "grad_norm": 1.4470811767394274, + "learning_rate": 4.917079975201488e-07, + "loss": 0.9038, + "step": 126900 + }, + { + "epoch": 9.834553837808517, + "grad_norm": 1.414887509446668, + "learning_rate": 4.917467451952884e-07, + "loss": 0.8991, + "step": 126910 + }, + { + "epoch": 9.835328761284822, + "grad_norm": 1.526910216407638, + "learning_rate": 4.917854928704279e-07, + "loss": 0.8925, + "step": 126920 + }, + { + "epoch": 9.83610368476113, + "grad_norm": 1.493261861887397, + "learning_rate": 4.918242405455673e-07, + "loss": 0.8786, + "step": 126930 + }, + { + "epoch": 9.836878608237436, + "grad_norm": 1.4337284509285744, + "learning_rate": 4.918629882207068e-07, + "loss": 0.913, + "step": 126940 + }, + { + "epoch": 9.837653531713743, + "grad_norm": 1.465642475107368, + "learning_rate": 4.919017358958463e-07, + "loss": 0.9036, + "step": 126950 + }, + { + "epoch": 9.83842845519005, + "grad_norm": 1.4569812867301912, + "learning_rate": 4.919404835709858e-07, + "loss": 0.8971, + "step": 126960 + }, + { + "epoch": 9.839203378666356, + "grad_norm": 1.501889891481045, + "learning_rate": 4.919792312461253e-07, + "loss": 0.8862, + "step": 126970 + }, + { + "epoch": 9.839978302142663, + "grad_norm": 1.405789593881039, + "learning_rate": 4.920179789212648e-07, + "loss": 0.8972, + "step": 126980 + }, + { + "epoch": 9.84075322561897, + "grad_norm": 1.504750265019976, + "learning_rate": 4.920567265964043e-07, + "loss": 0.9258, + "step": 126990 + }, + { + "epoch": 9.841528149095277, + "grad_norm": 1.5095776782274652, + "learning_rate": 4.920954742715437e-07, + "loss": 0.9062, + "step": 127000 + }, + { + "epoch": 9.841528149095277, + "eval_loss": 0.9090984463691711, + "eval_runtime": 332.6369, + "eval_samples_per_second": 34.485, + "eval_steps_per_second": 8.622, + "step": 127000 + }, + { + "epoch": 9.842303072571584, + "grad_norm": 1.4092312602459969, + "learning_rate": 4.921342219466833e-07, + "loss": 0.9119, + "step": 127010 + }, + { + "epoch": 9.84307799604789, + "grad_norm": 1.4684805307556708, + "learning_rate": 4.921729696218228e-07, + "loss": 0.902, + "step": 127020 + }, + { + "epoch": 9.843852919524197, + "grad_norm": 1.3679140369400553, + "learning_rate": 4.922117172969622e-07, + "loss": 0.8939, + "step": 127030 + }, + { + "epoch": 9.844627843000504, + "grad_norm": 1.4538974744411692, + "learning_rate": 4.922504649721017e-07, + "loss": 0.8924, + "step": 127040 + }, + { + "epoch": 9.845402766476811, + "grad_norm": 1.3685863262866969, + "learning_rate": 4.922892126472412e-07, + "loss": 0.9107, + "step": 127050 + }, + { + "epoch": 9.846177689953118, + "grad_norm": 1.456260308571594, + "learning_rate": 4.923279603223808e-07, + "loss": 0.9061, + "step": 127060 + }, + { + "epoch": 9.846952613429425, + "grad_norm": 1.3979380906896037, + "learning_rate": 4.923667079975202e-07, + "loss": 0.9137, + "step": 127070 + }, + { + "epoch": 9.847727536905731, + "grad_norm": 1.3795267968524632, + "learning_rate": 4.924054556726597e-07, + "loss": 0.9136, + "step": 127080 + }, + { + "epoch": 9.848502460382036, + "grad_norm": 1.4126961093352715, + "learning_rate": 4.924442033477992e-07, + "loss": 0.9159, + "step": 127090 + }, + { + "epoch": 9.849277383858343, + "grad_norm": 1.4722284243397994, + "learning_rate": 4.924829510229386e-07, + "loss": 0.9067, + "step": 127100 + }, + { + "epoch": 9.85005230733465, + "grad_norm": 1.3917584244235988, + "learning_rate": 4.925216986980782e-07, + "loss": 0.8965, + "step": 127110 + }, + { + "epoch": 9.850827230810957, + "grad_norm": 1.4564775514957213, + "learning_rate": 4.925604463732177e-07, + "loss": 0.8905, + "step": 127120 + }, + { + "epoch": 9.851602154287264, + "grad_norm": 1.3714854128987604, + "learning_rate": 4.925991940483572e-07, + "loss": 0.9045, + "step": 127130 + }, + { + "epoch": 9.85237707776357, + "grad_norm": 1.349878915982942, + "learning_rate": 4.926379417234966e-07, + "loss": 0.9039, + "step": 127140 + }, + { + "epoch": 9.853152001239877, + "grad_norm": 1.4779648196765796, + "learning_rate": 4.926766893986361e-07, + "loss": 0.8907, + "step": 127150 + }, + { + "epoch": 9.853926924716184, + "grad_norm": 1.5040944309074262, + "learning_rate": 4.927154370737757e-07, + "loss": 0.9027, + "step": 127160 + }, + { + "epoch": 9.85470184819249, + "grad_norm": 1.5355601835210164, + "learning_rate": 4.927541847489151e-07, + "loss": 0.9049, + "step": 127170 + }, + { + "epoch": 9.855476771668798, + "grad_norm": 1.5226921958486725, + "learning_rate": 4.927929324240546e-07, + "loss": 0.8932, + "step": 127180 + }, + { + "epoch": 9.856251695145104, + "grad_norm": 1.45695209825893, + "learning_rate": 4.928316800991941e-07, + "loss": 0.9078, + "step": 127190 + }, + { + "epoch": 9.857026618621411, + "grad_norm": 1.5173426708224382, + "learning_rate": 4.928704277743337e-07, + "loss": 0.8882, + "step": 127200 + }, + { + "epoch": 9.857801542097718, + "grad_norm": 1.3652165763350173, + "learning_rate": 4.929091754494731e-07, + "loss": 0.9118, + "step": 127210 + }, + { + "epoch": 9.858576465574025, + "grad_norm": 1.4565360734600332, + "learning_rate": 4.929479231246126e-07, + "loss": 0.8776, + "step": 127220 + }, + { + "epoch": 9.859351389050332, + "grad_norm": 1.463974794536311, + "learning_rate": 4.929866707997521e-07, + "loss": 0.8917, + "step": 127230 + }, + { + "epoch": 9.860126312526639, + "grad_norm": 1.4791710031760115, + "learning_rate": 4.930254184748915e-07, + "loss": 0.8914, + "step": 127240 + }, + { + "epoch": 9.860901236002945, + "grad_norm": 1.4706026687358702, + "learning_rate": 4.93064166150031e-07, + "loss": 0.91, + "step": 127250 + }, + { + "epoch": 9.861676159479252, + "grad_norm": 1.527067425528806, + "learning_rate": 4.931029138251706e-07, + "loss": 0.9046, + "step": 127260 + }, + { + "epoch": 9.862451082955559, + "grad_norm": 1.3491210694174534, + "learning_rate": 4.931416615003101e-07, + "loss": 0.8884, + "step": 127270 + }, + { + "epoch": 9.863226006431866, + "grad_norm": 1.4961084122606902, + "learning_rate": 4.931804091754495e-07, + "loss": 0.8955, + "step": 127280 + }, + { + "epoch": 9.86400092990817, + "grad_norm": 1.3512946820394085, + "learning_rate": 4.93219156850589e-07, + "loss": 0.9048, + "step": 127290 + }, + { + "epoch": 9.864775853384478, + "grad_norm": 1.426876746731671, + "learning_rate": 4.932579045257286e-07, + "loss": 0.9103, + "step": 127300 + }, + { + "epoch": 9.865550776860784, + "grad_norm": 1.4389330899318573, + "learning_rate": 4.93296652200868e-07, + "loss": 0.9028, + "step": 127310 + }, + { + "epoch": 9.866325700337091, + "grad_norm": 1.4680125469626317, + "learning_rate": 4.933353998760075e-07, + "loss": 0.8952, + "step": 127320 + }, + { + "epoch": 9.867100623813398, + "grad_norm": 1.4960607931273153, + "learning_rate": 4.93374147551147e-07, + "loss": 0.8905, + "step": 127330 + }, + { + "epoch": 9.867875547289705, + "grad_norm": 1.379707911348566, + "learning_rate": 4.934128952262865e-07, + "loss": 0.9028, + "step": 127340 + }, + { + "epoch": 9.868650470766012, + "grad_norm": 1.4006415632049596, + "learning_rate": 4.93451642901426e-07, + "loss": 0.9148, + "step": 127350 + }, + { + "epoch": 9.869425394242318, + "grad_norm": 1.4179359310262172, + "learning_rate": 4.934903905765655e-07, + "loss": 0.8917, + "step": 127360 + }, + { + "epoch": 9.870200317718625, + "grad_norm": 1.4182128546212673, + "learning_rate": 4.93529138251705e-07, + "loss": 0.8988, + "step": 127370 + }, + { + "epoch": 9.870975241194932, + "grad_norm": 1.5187813142751303, + "learning_rate": 4.935678859268444e-07, + "loss": 0.8916, + "step": 127380 + }, + { + "epoch": 9.871750164671239, + "grad_norm": 1.3989207983679757, + "learning_rate": 4.936066336019839e-07, + "loss": 0.8951, + "step": 127390 + }, + { + "epoch": 9.872525088147546, + "grad_norm": 1.4972471662660898, + "learning_rate": 4.936453812771235e-07, + "loss": 0.9107, + "step": 127400 + }, + { + "epoch": 9.873300011623853, + "grad_norm": 1.44419309146891, + "learning_rate": 4.93684128952263e-07, + "loss": 0.8899, + "step": 127410 + }, + { + "epoch": 9.87407493510016, + "grad_norm": 1.3842634479282234, + "learning_rate": 4.937228766274024e-07, + "loss": 0.9039, + "step": 127420 + }, + { + "epoch": 9.874849858576466, + "grad_norm": 1.533407149988287, + "learning_rate": 4.937616243025419e-07, + "loss": 0.907, + "step": 127430 + }, + { + "epoch": 9.875624782052773, + "grad_norm": 1.4321408478769921, + "learning_rate": 4.938003719776814e-07, + "loss": 0.9001, + "step": 127440 + }, + { + "epoch": 9.87639970552908, + "grad_norm": 1.420035885215013, + "learning_rate": 4.938391196528209e-07, + "loss": 0.9032, + "step": 127450 + }, + { + "epoch": 9.877174629005385, + "grad_norm": 1.4468045569136703, + "learning_rate": 4.938778673279604e-07, + "loss": 0.9105, + "step": 127460 + }, + { + "epoch": 9.877949552481692, + "grad_norm": 1.4539184088651151, + "learning_rate": 4.939166150030999e-07, + "loss": 0.8942, + "step": 127470 + }, + { + "epoch": 9.878724475957998, + "grad_norm": 1.452600246523692, + "learning_rate": 4.939553626782394e-07, + "loss": 0.9245, + "step": 127480 + }, + { + "epoch": 9.879499399434305, + "grad_norm": 1.4578505372208932, + "learning_rate": 4.939941103533788e-07, + "loss": 0.9016, + "step": 127490 + }, + { + "epoch": 9.880274322910612, + "grad_norm": 1.4755047490944582, + "learning_rate": 4.940328580285184e-07, + "loss": 0.9098, + "step": 127500 + }, + { + "epoch": 9.880274322910612, + "eval_loss": 0.9089439511299133, + "eval_runtime": 332.497, + "eval_samples_per_second": 34.5, + "eval_steps_per_second": 8.626, + "step": 127500 + }, + { + "epoch": 9.881049246386919, + "grad_norm": 1.4568207362975398, + "learning_rate": 4.940716057036579e-07, + "loss": 0.9044, + "step": 127510 + }, + { + "epoch": 9.881824169863226, + "grad_norm": 1.501624882074297, + "learning_rate": 4.941103533787973e-07, + "loss": 0.8956, + "step": 127520 + }, + { + "epoch": 9.882599093339532, + "grad_norm": 1.4132869671951662, + "learning_rate": 4.941491010539368e-07, + "loss": 0.9029, + "step": 127530 + }, + { + "epoch": 9.88337401681584, + "grad_norm": 1.459280540618881, + "learning_rate": 4.941878487290763e-07, + "loss": 0.8609, + "step": 127540 + }, + { + "epoch": 9.884148940292146, + "grad_norm": 1.5029573862500498, + "learning_rate": 4.942265964042159e-07, + "loss": 0.9179, + "step": 127550 + }, + { + "epoch": 9.884923863768453, + "grad_norm": 1.4695731472735465, + "learning_rate": 4.942653440793553e-07, + "loss": 0.9041, + "step": 127560 + }, + { + "epoch": 9.88569878724476, + "grad_norm": 1.5019640765456723, + "learning_rate": 4.943040917544948e-07, + "loss": 0.8852, + "step": 127570 + }, + { + "epoch": 9.886473710721067, + "grad_norm": 1.500703429810537, + "learning_rate": 4.943428394296343e-07, + "loss": 0.8936, + "step": 127580 + }, + { + "epoch": 9.887248634197373, + "grad_norm": 1.427894652547824, + "learning_rate": 4.943815871047737e-07, + "loss": 0.8841, + "step": 127590 + }, + { + "epoch": 9.88802355767368, + "grad_norm": 1.5727976401627464, + "learning_rate": 4.944203347799133e-07, + "loss": 0.8967, + "step": 127600 + }, + { + "epoch": 9.888798481149987, + "grad_norm": 1.4197341274336537, + "learning_rate": 4.944590824550528e-07, + "loss": 0.8936, + "step": 127610 + }, + { + "epoch": 9.889573404626294, + "grad_norm": 1.4790969648635945, + "learning_rate": 4.944978301301923e-07, + "loss": 0.8859, + "step": 127620 + }, + { + "epoch": 9.8903483281026, + "grad_norm": 1.421332000610074, + "learning_rate": 4.945365778053317e-07, + "loss": 0.887, + "step": 127630 + }, + { + "epoch": 9.891123251578907, + "grad_norm": 1.505711100901298, + "learning_rate": 4.945753254804712e-07, + "loss": 0.8944, + "step": 127640 + }, + { + "epoch": 9.891898175055214, + "grad_norm": 1.4175381159745917, + "learning_rate": 4.946140731556108e-07, + "loss": 0.9031, + "step": 127650 + }, + { + "epoch": 9.89267309853152, + "grad_norm": 1.4380171163020297, + "learning_rate": 4.946528208307502e-07, + "loss": 0.9144, + "step": 127660 + }, + { + "epoch": 9.893448022007826, + "grad_norm": 1.528847799690383, + "learning_rate": 4.946915685058897e-07, + "loss": 0.9078, + "step": 127670 + }, + { + "epoch": 9.894222945484133, + "grad_norm": 1.4397132991489268, + "learning_rate": 4.947303161810292e-07, + "loss": 0.8929, + "step": 127680 + }, + { + "epoch": 9.89499786896044, + "grad_norm": 1.3484178587893876, + "learning_rate": 4.947690638561687e-07, + "loss": 0.9168, + "step": 127690 + }, + { + "epoch": 9.895772792436746, + "grad_norm": 1.4544992417928757, + "learning_rate": 4.948078115313082e-07, + "loss": 0.8731, + "step": 127700 + }, + { + "epoch": 9.896547715913053, + "grad_norm": 1.4682522861602831, + "learning_rate": 4.948465592064477e-07, + "loss": 0.8874, + "step": 127710 + }, + { + "epoch": 9.89732263938936, + "grad_norm": 1.456868984127823, + "learning_rate": 4.948853068815872e-07, + "loss": 0.8927, + "step": 127720 + }, + { + "epoch": 9.898097562865667, + "grad_norm": 1.3989773192417108, + "learning_rate": 4.949240545567266e-07, + "loss": 0.9025, + "step": 127730 + }, + { + "epoch": 9.898872486341974, + "grad_norm": 1.3501622195392504, + "learning_rate": 4.949628022318661e-07, + "loss": 0.8949, + "step": 127740 + }, + { + "epoch": 9.89964740981828, + "grad_norm": 1.4096245864069463, + "learning_rate": 4.950015499070057e-07, + "loss": 0.9026, + "step": 127750 + }, + { + "epoch": 9.900422333294587, + "grad_norm": 1.4261228973851623, + "learning_rate": 4.950402975821452e-07, + "loss": 0.9061, + "step": 127760 + }, + { + "epoch": 9.901197256770894, + "grad_norm": 1.5092722982894928, + "learning_rate": 4.950790452572846e-07, + "loss": 0.9082, + "step": 127770 + }, + { + "epoch": 9.901972180247201, + "grad_norm": 1.386800278409314, + "learning_rate": 4.951177929324241e-07, + "loss": 0.8936, + "step": 127780 + }, + { + "epoch": 9.902747103723508, + "grad_norm": 1.4417587383698387, + "learning_rate": 4.951565406075636e-07, + "loss": 0.9067, + "step": 127790 + }, + { + "epoch": 9.903522027199815, + "grad_norm": 1.4777906233314657, + "learning_rate": 4.951952882827031e-07, + "loss": 0.8806, + "step": 127800 + }, + { + "epoch": 9.904296950676121, + "grad_norm": 1.4165312463198398, + "learning_rate": 4.952340359578426e-07, + "loss": 0.8955, + "step": 127810 + }, + { + "epoch": 9.905071874152428, + "grad_norm": 1.3762819823330783, + "learning_rate": 4.952727836329821e-07, + "loss": 0.8929, + "step": 127820 + }, + { + "epoch": 9.905846797628735, + "grad_norm": 1.4544493749729124, + "learning_rate": 4.953115313081216e-07, + "loss": 0.8818, + "step": 127830 + }, + { + "epoch": 9.90662172110504, + "grad_norm": 1.36196155308716, + "learning_rate": 4.95350278983261e-07, + "loss": 0.8875, + "step": 127840 + }, + { + "epoch": 9.907396644581347, + "grad_norm": 1.4883773065357169, + "learning_rate": 4.953890266584006e-07, + "loss": 0.9008, + "step": 127850 + }, + { + "epoch": 9.908171568057654, + "grad_norm": 1.4927827425703095, + "learning_rate": 4.954277743335401e-07, + "loss": 0.8942, + "step": 127860 + }, + { + "epoch": 9.90894649153396, + "grad_norm": 1.4023977727828094, + "learning_rate": 4.954665220086795e-07, + "loss": 0.9057, + "step": 127870 + }, + { + "epoch": 9.909721415010267, + "grad_norm": 1.4446029090006094, + "learning_rate": 4.95505269683819e-07, + "loss": 0.8803, + "step": 127880 + }, + { + "epoch": 9.910496338486574, + "grad_norm": 1.4292056858362117, + "learning_rate": 4.955440173589585e-07, + "loss": 0.9226, + "step": 127890 + }, + { + "epoch": 9.91127126196288, + "grad_norm": 1.4460790441480118, + "learning_rate": 4.955827650340981e-07, + "loss": 0.9032, + "step": 127900 + }, + { + "epoch": 9.912046185439188, + "grad_norm": 1.4383217916321063, + "learning_rate": 4.956215127092375e-07, + "loss": 0.9097, + "step": 127910 + }, + { + "epoch": 9.912821108915495, + "grad_norm": 1.4890638878986369, + "learning_rate": 4.95660260384377e-07, + "loss": 0.886, + "step": 127920 + }, + { + "epoch": 9.913596032391801, + "grad_norm": 1.422172961373081, + "learning_rate": 4.956990080595165e-07, + "loss": 0.8892, + "step": 127930 + }, + { + "epoch": 9.914370955868108, + "grad_norm": 1.5212057446177087, + "learning_rate": 4.95737755734656e-07, + "loss": 0.9048, + "step": 127940 + }, + { + "epoch": 9.915145879344415, + "grad_norm": 1.4221036311056277, + "learning_rate": 4.957765034097955e-07, + "loss": 0.8964, + "step": 127950 + }, + { + "epoch": 9.915920802820722, + "grad_norm": 1.346781693707012, + "learning_rate": 4.95815251084935e-07, + "loss": 0.9159, + "step": 127960 + }, + { + "epoch": 9.916695726297029, + "grad_norm": 1.407803991865187, + "learning_rate": 4.958539987600745e-07, + "loss": 0.8886, + "step": 127970 + }, + { + "epoch": 9.917470649773335, + "grad_norm": 1.4917103485447065, + "learning_rate": 4.958927464352139e-07, + "loss": 0.9106, + "step": 127980 + }, + { + "epoch": 9.918245573249642, + "grad_norm": 1.490833171098359, + "learning_rate": 4.959314941103535e-07, + "loss": 0.8955, + "step": 127990 + }, + { + "epoch": 9.919020496725949, + "grad_norm": 1.417227173424181, + "learning_rate": 4.95970241785493e-07, + "loss": 0.8922, + "step": 128000 + }, + { + "epoch": 9.919020496725949, + "eval_loss": 0.9086829423904419, + "eval_runtime": 332.5229, + "eval_samples_per_second": 34.497, + "eval_steps_per_second": 8.625, + "step": 128000 + }, + { + "epoch": 9.919795420202256, + "grad_norm": 1.4757026879107056, + "learning_rate": 4.960089894606324e-07, + "loss": 0.9058, + "step": 128010 + }, + { + "epoch": 9.920570343678563, + "grad_norm": 1.4178613041918835, + "learning_rate": 4.960477371357719e-07, + "loss": 0.9118, + "step": 128020 + }, + { + "epoch": 9.921345267154868, + "grad_norm": 1.4298827296816925, + "learning_rate": 4.960864848109114e-07, + "loss": 0.8985, + "step": 128030 + }, + { + "epoch": 9.922120190631174, + "grad_norm": 1.4475094359376048, + "learning_rate": 4.96125232486051e-07, + "loss": 0.8899, + "step": 128040 + }, + { + "epoch": 9.922895114107481, + "grad_norm": 1.514364348604334, + "learning_rate": 4.961639801611904e-07, + "loss": 0.8924, + "step": 128050 + }, + { + "epoch": 9.923670037583788, + "grad_norm": 1.4174352954363487, + "learning_rate": 4.962027278363299e-07, + "loss": 0.9081, + "step": 128060 + }, + { + "epoch": 9.924444961060095, + "grad_norm": 1.4904884036842294, + "learning_rate": 4.962414755114694e-07, + "loss": 0.9166, + "step": 128070 + }, + { + "epoch": 9.925219884536402, + "grad_norm": 1.4502653524905356, + "learning_rate": 4.962802231866088e-07, + "loss": 0.9058, + "step": 128080 + }, + { + "epoch": 9.925994808012709, + "grad_norm": 1.4531807388464801, + "learning_rate": 4.963189708617484e-07, + "loss": 0.8942, + "step": 128090 + }, + { + "epoch": 9.926769731489015, + "grad_norm": 1.4865694444329962, + "learning_rate": 4.963577185368879e-07, + "loss": 0.9066, + "step": 128100 + }, + { + "epoch": 9.927544654965322, + "grad_norm": 1.4937564191941752, + "learning_rate": 4.963964662120274e-07, + "loss": 0.894, + "step": 128110 + }, + { + "epoch": 9.928319578441629, + "grad_norm": 1.509818902729871, + "learning_rate": 4.964352138871668e-07, + "loss": 0.8992, + "step": 128120 + }, + { + "epoch": 9.929094501917936, + "grad_norm": 1.476560674689932, + "learning_rate": 4.964739615623063e-07, + "loss": 0.9219, + "step": 128130 + }, + { + "epoch": 9.929869425394243, + "grad_norm": 1.5282113119292982, + "learning_rate": 4.965127092374459e-07, + "loss": 0.9003, + "step": 128140 + }, + { + "epoch": 9.93064434887055, + "grad_norm": 1.3794089485130456, + "learning_rate": 4.965514569125853e-07, + "loss": 0.89, + "step": 128150 + }, + { + "epoch": 9.931419272346856, + "grad_norm": 1.444785136719737, + "learning_rate": 4.965902045877248e-07, + "loss": 0.9123, + "step": 128160 + }, + { + "epoch": 9.932194195823163, + "grad_norm": 1.4407868345563297, + "learning_rate": 4.966289522628643e-07, + "loss": 0.9042, + "step": 128170 + }, + { + "epoch": 9.93296911929947, + "grad_norm": 1.571745260348184, + "learning_rate": 4.966676999380038e-07, + "loss": 0.8984, + "step": 128180 + }, + { + "epoch": 9.933744042775777, + "grad_norm": 1.5106297038963012, + "learning_rate": 4.967064476131433e-07, + "loss": 0.8962, + "step": 128190 + }, + { + "epoch": 9.934518966252083, + "grad_norm": 1.4831736546222898, + "learning_rate": 4.967451952882828e-07, + "loss": 0.9186, + "step": 128200 + }, + { + "epoch": 9.935293889728388, + "grad_norm": 1.3995305319561102, + "learning_rate": 4.967839429634223e-07, + "loss": 0.9046, + "step": 128210 + }, + { + "epoch": 9.936068813204695, + "grad_norm": 1.4681931520211497, + "learning_rate": 4.968226906385617e-07, + "loss": 0.8947, + "step": 128220 + }, + { + "epoch": 9.936843736681002, + "grad_norm": 1.4524612269955584, + "learning_rate": 4.968614383137012e-07, + "loss": 0.9127, + "step": 128230 + }, + { + "epoch": 9.937618660157309, + "grad_norm": 1.4604580644073333, + "learning_rate": 4.969001859888408e-07, + "loss": 0.8829, + "step": 128240 + }, + { + "epoch": 9.938393583633616, + "grad_norm": 1.7399694858676873, + "learning_rate": 4.969389336639803e-07, + "loss": 0.9205, + "step": 128250 + }, + { + "epoch": 9.939168507109923, + "grad_norm": 1.5910962609684591, + "learning_rate": 4.969776813391197e-07, + "loss": 0.9176, + "step": 128260 + }, + { + "epoch": 9.93994343058623, + "grad_norm": 1.3818078914919172, + "learning_rate": 4.970164290142592e-07, + "loss": 0.9007, + "step": 128270 + }, + { + "epoch": 9.940718354062536, + "grad_norm": 1.515254331503308, + "learning_rate": 4.970551766893987e-07, + "loss": 0.9354, + "step": 128280 + }, + { + "epoch": 9.941493277538843, + "grad_norm": 1.4963447653857342, + "learning_rate": 4.970939243645382e-07, + "loss": 0.9031, + "step": 128290 + }, + { + "epoch": 9.94226820101515, + "grad_norm": 1.5368959208087578, + "learning_rate": 4.971326720396777e-07, + "loss": 0.8865, + "step": 128300 + }, + { + "epoch": 9.943043124491457, + "grad_norm": 1.707605710628801, + "learning_rate": 4.971714197148172e-07, + "loss": 0.9047, + "step": 128310 + }, + { + "epoch": 9.943818047967763, + "grad_norm": 1.497372533032166, + "learning_rate": 4.972101673899567e-07, + "loss": 0.9269, + "step": 128320 + }, + { + "epoch": 9.94459297144407, + "grad_norm": 1.376711692007508, + "learning_rate": 4.972489150650961e-07, + "loss": 0.8992, + "step": 128330 + }, + { + "epoch": 9.945367894920377, + "grad_norm": 1.526482801042922, + "learning_rate": 4.972876627402357e-07, + "loss": 0.9082, + "step": 128340 + }, + { + "epoch": 9.946142818396684, + "grad_norm": 1.4637069479859561, + "learning_rate": 4.973264104153752e-07, + "loss": 0.8926, + "step": 128350 + }, + { + "epoch": 9.94691774187299, + "grad_norm": 1.4367091120498605, + "learning_rate": 4.973651580905146e-07, + "loss": 0.8951, + "step": 128360 + }, + { + "epoch": 9.947692665349297, + "grad_norm": 1.398053105154426, + "learning_rate": 4.974039057656541e-07, + "loss": 0.8902, + "step": 128370 + }, + { + "epoch": 9.948467588825604, + "grad_norm": 1.4513463983565558, + "learning_rate": 4.974426534407936e-07, + "loss": 0.8943, + "step": 128380 + }, + { + "epoch": 9.949242512301911, + "grad_norm": 1.4724303239974308, + "learning_rate": 4.974814011159331e-07, + "loss": 0.8921, + "step": 128390 + }, + { + "epoch": 9.950017435778218, + "grad_norm": 1.4217572817906534, + "learning_rate": 4.975201487910726e-07, + "loss": 0.9176, + "step": 128400 + }, + { + "epoch": 9.950792359254523, + "grad_norm": 1.4372995036421534, + "learning_rate": 4.975588964662121e-07, + "loss": 0.9007, + "step": 128410 + }, + { + "epoch": 9.95156728273083, + "grad_norm": 1.3844420219474367, + "learning_rate": 4.975976441413516e-07, + "loss": 0.8979, + "step": 128420 + }, + { + "epoch": 9.952342206207136, + "grad_norm": 1.5318738084955386, + "learning_rate": 4.97636391816491e-07, + "loss": 0.9192, + "step": 128430 + }, + { + "epoch": 9.953117129683443, + "grad_norm": 1.5181459316056385, + "learning_rate": 4.976751394916306e-07, + "loss": 0.8938, + "step": 128440 + }, + { + "epoch": 9.95389205315975, + "grad_norm": 1.4256386896277702, + "learning_rate": 4.977138871667701e-07, + "loss": 0.9056, + "step": 128450 + }, + { + "epoch": 9.954666976636057, + "grad_norm": 1.4994496912080015, + "learning_rate": 4.977526348419095e-07, + "loss": 0.9001, + "step": 128460 + }, + { + "epoch": 9.955441900112364, + "grad_norm": 1.4110914589981933, + "learning_rate": 4.97791382517049e-07, + "loss": 0.9162, + "step": 128470 + }, + { + "epoch": 9.95621682358867, + "grad_norm": 1.350731007306368, + "learning_rate": 4.978301301921885e-07, + "loss": 0.908, + "step": 128480 + }, + { + "epoch": 9.956991747064977, + "grad_norm": 1.446832585065161, + "learning_rate": 4.978688778673281e-07, + "loss": 0.9531, + "step": 128490 + }, + { + "epoch": 9.957766670541284, + "grad_norm": 1.498336176433515, + "learning_rate": 4.979076255424675e-07, + "loss": 0.8865, + "step": 128500 + }, + { + "epoch": 9.957766670541284, + "eval_loss": 0.9084494113922119, + "eval_runtime": 332.9234, + "eval_samples_per_second": 34.455, + "eval_steps_per_second": 8.615, + "step": 128500 + }, + { + "epoch": 9.958541594017591, + "grad_norm": 1.4276279417346573, + "learning_rate": 4.97946373217607e-07, + "loss": 0.8952, + "step": 128510 + }, + { + "epoch": 9.959316517493898, + "grad_norm": 1.437765599169297, + "learning_rate": 4.979851208927465e-07, + "loss": 0.8905, + "step": 128520 + }, + { + "epoch": 9.960091440970205, + "grad_norm": 1.4971679949955505, + "learning_rate": 4.980238685678859e-07, + "loss": 0.8804, + "step": 128530 + }, + { + "epoch": 9.960866364446511, + "grad_norm": 1.5064493149446003, + "learning_rate": 4.980626162430255e-07, + "loss": 0.9299, + "step": 128540 + }, + { + "epoch": 9.961641287922818, + "grad_norm": 1.3553186407205648, + "learning_rate": 4.98101363918165e-07, + "loss": 0.8732, + "step": 128550 + }, + { + "epoch": 9.962416211399125, + "grad_norm": 1.5204633229567335, + "learning_rate": 4.981401115933045e-07, + "loss": 0.8855, + "step": 128560 + }, + { + "epoch": 9.963191134875432, + "grad_norm": 1.4353413290534425, + "learning_rate": 4.981788592684439e-07, + "loss": 0.8962, + "step": 128570 + }, + { + "epoch": 9.963966058351737, + "grad_norm": 1.460319880707647, + "learning_rate": 4.982176069435834e-07, + "loss": 0.9066, + "step": 128580 + }, + { + "epoch": 9.964740981828044, + "grad_norm": 1.3813317415091155, + "learning_rate": 4.98256354618723e-07, + "loss": 0.9001, + "step": 128590 + }, + { + "epoch": 9.96551590530435, + "grad_norm": 1.4679762513807446, + "learning_rate": 4.982951022938624e-07, + "loss": 0.9018, + "step": 128600 + }, + { + "epoch": 9.966290828780657, + "grad_norm": 1.456622793312799, + "learning_rate": 4.983338499690019e-07, + "loss": 0.8948, + "step": 128610 + }, + { + "epoch": 9.967065752256964, + "grad_norm": 1.442042934202095, + "learning_rate": 4.983725976441414e-07, + "loss": 0.9175, + "step": 128620 + }, + { + "epoch": 9.967840675733271, + "grad_norm": 1.4238420165082812, + "learning_rate": 4.98411345319281e-07, + "loss": 0.8816, + "step": 128630 + }, + { + "epoch": 9.968615599209578, + "grad_norm": 1.6380618808280258, + "learning_rate": 4.984500929944204e-07, + "loss": 0.8865, + "step": 128640 + }, + { + "epoch": 9.969390522685885, + "grad_norm": 1.492801518295036, + "learning_rate": 4.984888406695599e-07, + "loss": 0.902, + "step": 128650 + }, + { + "epoch": 9.970165446162191, + "grad_norm": 1.486852169131868, + "learning_rate": 4.985275883446994e-07, + "loss": 0.878, + "step": 128660 + }, + { + "epoch": 9.970940369638498, + "grad_norm": 1.468308061582902, + "learning_rate": 4.985663360198388e-07, + "loss": 0.8966, + "step": 128670 + }, + { + "epoch": 9.971715293114805, + "grad_norm": 1.458456817946087, + "learning_rate": 4.986050836949784e-07, + "loss": 0.8954, + "step": 128680 + }, + { + "epoch": 9.972490216591112, + "grad_norm": 1.4473447381070423, + "learning_rate": 4.986438313701179e-07, + "loss": 0.9085, + "step": 128690 + }, + { + "epoch": 9.973265140067419, + "grad_norm": 1.4243361705107072, + "learning_rate": 4.986825790452574e-07, + "loss": 0.9083, + "step": 128700 + }, + { + "epoch": 9.974040063543725, + "grad_norm": 1.4791766326107025, + "learning_rate": 4.987213267203968e-07, + "loss": 0.9032, + "step": 128710 + }, + { + "epoch": 9.974814987020032, + "grad_norm": 1.4007471391026078, + "learning_rate": 4.987600743955363e-07, + "loss": 0.8957, + "step": 128720 + }, + { + "epoch": 9.975589910496339, + "grad_norm": 1.4064030852712233, + "learning_rate": 4.987988220706759e-07, + "loss": 0.9001, + "step": 128730 + }, + { + "epoch": 9.976364833972646, + "grad_norm": 1.3991629012698694, + "learning_rate": 4.988375697458153e-07, + "loss": 0.8979, + "step": 128740 + }, + { + "epoch": 9.977139757448953, + "grad_norm": 1.5191187300641202, + "learning_rate": 4.988763174209548e-07, + "loss": 0.8969, + "step": 128750 + }, + { + "epoch": 9.97791468092526, + "grad_norm": 1.391583102908361, + "learning_rate": 4.989150650960943e-07, + "loss": 0.913, + "step": 128760 + }, + { + "epoch": 9.978689604401566, + "grad_norm": 1.5038959758727086, + "learning_rate": 4.989538127712338e-07, + "loss": 0.9046, + "step": 128770 + }, + { + "epoch": 9.979464527877871, + "grad_norm": 1.48249583686797, + "learning_rate": 4.989925604463733e-07, + "loss": 0.8997, + "step": 128780 + }, + { + "epoch": 9.980239451354178, + "grad_norm": 1.4616221016922593, + "learning_rate": 4.990313081215128e-07, + "loss": 0.8906, + "step": 128790 + }, + { + "epoch": 9.981014374830485, + "grad_norm": 1.4829231202213569, + "learning_rate": 4.990700557966523e-07, + "loss": 0.8915, + "step": 128800 + }, + { + "epoch": 9.981789298306792, + "grad_norm": 1.3994381987000577, + "learning_rate": 4.991088034717917e-07, + "loss": 0.9118, + "step": 128810 + }, + { + "epoch": 9.982564221783099, + "grad_norm": 1.4927426114026843, + "learning_rate": 4.991475511469312e-07, + "loss": 0.9092, + "step": 128820 + }, + { + "epoch": 9.983339145259405, + "grad_norm": 1.6673734982344817, + "learning_rate": 4.991862988220708e-07, + "loss": 0.933, + "step": 128830 + }, + { + "epoch": 9.984114068735712, + "grad_norm": 1.6384363493598602, + "learning_rate": 4.992250464972103e-07, + "loss": 0.9071, + "step": 128840 + }, + { + "epoch": 9.984888992212019, + "grad_norm": 1.4832481740883638, + "learning_rate": 4.992637941723497e-07, + "loss": 0.9273, + "step": 128850 + }, + { + "epoch": 9.985663915688326, + "grad_norm": 1.4325751402714733, + "learning_rate": 4.993025418474892e-07, + "loss": 0.8928, + "step": 128860 + }, + { + "epoch": 9.986438839164633, + "grad_norm": 1.422738919214255, + "learning_rate": 4.993412895226287e-07, + "loss": 0.8989, + "step": 128870 + }, + { + "epoch": 9.98721376264094, + "grad_norm": 1.4105846844705163, + "learning_rate": 4.993800371977682e-07, + "loss": 0.9004, + "step": 128880 + }, + { + "epoch": 9.987988686117246, + "grad_norm": 1.4768399759481272, + "learning_rate": 4.994187848729077e-07, + "loss": 0.9117, + "step": 128890 + }, + { + "epoch": 9.988763609593553, + "grad_norm": 1.51855135305933, + "learning_rate": 4.994575325480472e-07, + "loss": 0.9053, + "step": 128900 + }, + { + "epoch": 9.98953853306986, + "grad_norm": 1.6142904324613103, + "learning_rate": 4.994962802231867e-07, + "loss": 0.9165, + "step": 128910 + }, + { + "epoch": 9.990313456546167, + "grad_norm": 1.4905836221109412, + "learning_rate": 4.995350278983261e-07, + "loss": 0.8933, + "step": 128920 + }, + { + "epoch": 9.991088380022473, + "grad_norm": 1.5418070196089244, + "learning_rate": 4.995737755734657e-07, + "loss": 0.9122, + "step": 128930 + }, + { + "epoch": 9.99186330349878, + "grad_norm": 1.368602593665277, + "learning_rate": 4.996125232486052e-07, + "loss": 0.878, + "step": 128940 + }, + { + "epoch": 9.992638226975085, + "grad_norm": 1.4095438137066605, + "learning_rate": 4.996512709237446e-07, + "loss": 0.8957, + "step": 128950 + }, + { + "epoch": 9.993413150451392, + "grad_norm": 1.534095256678166, + "learning_rate": 4.996900185988841e-07, + "loss": 0.9014, + "step": 128960 + }, + { + "epoch": 9.994188073927699, + "grad_norm": 1.473721170967868, + "learning_rate": 4.997287662740236e-07, + "loss": 0.9062, + "step": 128970 + }, + { + "epoch": 9.994962997404006, + "grad_norm": 1.479140000670776, + "learning_rate": 4.997675139491632e-07, + "loss": 0.8883, + "step": 128980 + }, + { + "epoch": 9.995737920880313, + "grad_norm": 1.5238905256522792, + "learning_rate": 4.998062616243026e-07, + "loss": 0.9071, + "step": 128990 + }, + { + "epoch": 9.99651284435662, + "grad_norm": 1.4555385395797842, + "learning_rate": 4.998450092994421e-07, + "loss": 0.9372, + "step": 129000 + }, + { + "epoch": 9.99651284435662, + "eval_loss": 0.9082434773445129, + "eval_runtime": 326.9955, + "eval_samples_per_second": 35.08, + "eval_steps_per_second": 8.771, + "step": 129000 + }, + { + "epoch": 9.997287767832926, + "grad_norm": 1.4957151134122153, + "learning_rate": 4.998837569745816e-07, + "loss": 0.9158, + "step": 129010 + }, + { + "epoch": 9.998062691309233, + "grad_norm": 1.3686291958686674, + "learning_rate": 4.99922504649721e-07, + "loss": 0.8942, + "step": 129020 + }, + { + "epoch": 9.99883761478554, + "grad_norm": 1.4105165998029012, + "learning_rate": 4.999612523248606e-07, + "loss": 0.881, + "step": 129030 + }, + { + "epoch": 9.999612538261847, + "grad_norm": 1.4131872693727472, + "learning_rate": 5.000000000000001e-07, + "loss": 0.9011, + "step": 129040 + }, + { + "epoch": 10.000387461738153, + "grad_norm": 1.4590518242803316, + "learning_rate": 5.000387476751396e-07, + "loss": 0.8963, + "step": 129050 + }, + { + "epoch": 10.00116238521446, + "grad_norm": 1.3934180082790621, + "learning_rate": 5.00077495350279e-07, + "loss": 0.913, + "step": 129060 + }, + { + "epoch": 10.001937308690767, + "grad_norm": 1.437713076471302, + "learning_rate": 5.001162430254185e-07, + "loss": 0.8815, + "step": 129070 + }, + { + "epoch": 10.002712232167074, + "grad_norm": 1.5454233054669588, + "learning_rate": 5.001549907005581e-07, + "loss": 0.891, + "step": 129080 + }, + { + "epoch": 10.00348715564338, + "grad_norm": 1.454049022494876, + "learning_rate": 5.001937383756975e-07, + "loss": 0.8903, + "step": 129090 + }, + { + "epoch": 10.004262079119687, + "grad_norm": 1.53809431209946, + "learning_rate": 5.00232486050837e-07, + "loss": 0.8914, + "step": 129100 + }, + { + "epoch": 10.005037002595994, + "grad_norm": 1.5242352243053747, + "learning_rate": 5.002712337259765e-07, + "loss": 0.8886, + "step": 129110 + }, + { + "epoch": 10.005811926072301, + "grad_norm": 1.558141844590811, + "learning_rate": 5.00309981401116e-07, + "loss": 0.9001, + "step": 129120 + }, + { + "epoch": 10.006586849548608, + "grad_norm": 1.4797335790697854, + "learning_rate": 5.003487290762555e-07, + "loss": 0.8924, + "step": 129130 + }, + { + "epoch": 10.007361773024913, + "grad_norm": 1.3920807520394418, + "learning_rate": 5.00387476751395e-07, + "loss": 0.8766, + "step": 129140 + }, + { + "epoch": 10.00813669650122, + "grad_norm": 1.4685596246552775, + "learning_rate": 5.004262244265345e-07, + "loss": 0.8874, + "step": 129150 + }, + { + "epoch": 10.008911619977527, + "grad_norm": 1.4567188573743317, + "learning_rate": 5.004649721016739e-07, + "loss": 0.896, + "step": 129160 + }, + { + "epoch": 10.009686543453833, + "grad_norm": 1.4305446211050532, + "learning_rate": 5.005037197768134e-07, + "loss": 0.898, + "step": 129170 + }, + { + "epoch": 10.01046146693014, + "grad_norm": 1.485090508620578, + "learning_rate": 5.00542467451953e-07, + "loss": 0.9137, + "step": 129180 + }, + { + "epoch": 10.011236390406447, + "grad_norm": 1.417471430309089, + "learning_rate": 5.005812151270925e-07, + "loss": 0.889, + "step": 129190 + }, + { + "epoch": 10.012011313882754, + "grad_norm": 1.486958123943365, + "learning_rate": 5.006199628022319e-07, + "loss": 0.8917, + "step": 129200 + }, + { + "epoch": 10.01278623735906, + "grad_norm": 1.4871296219982189, + "learning_rate": 5.006587104773714e-07, + "loss": 0.9194, + "step": 129210 + }, + { + "epoch": 10.013561160835367, + "grad_norm": 1.4579274895718246, + "learning_rate": 5.00697458152511e-07, + "loss": 0.9034, + "step": 129220 + }, + { + "epoch": 10.014336084311674, + "grad_norm": 1.4674397095254552, + "learning_rate": 5.007362058276504e-07, + "loss": 0.8988, + "step": 129230 + }, + { + "epoch": 10.015111007787981, + "grad_norm": 1.4308722944465606, + "learning_rate": 5.007749535027899e-07, + "loss": 0.8798, + "step": 129240 + }, + { + "epoch": 10.015885931264288, + "grad_norm": 1.570705192006623, + "learning_rate": 5.008137011779294e-07, + "loss": 0.902, + "step": 129250 + }, + { + "epoch": 10.016660854740595, + "grad_norm": 1.5411557130190028, + "learning_rate": 5.008524488530689e-07, + "loss": 0.9085, + "step": 129260 + }, + { + "epoch": 10.017435778216901, + "grad_norm": 1.5190659632718146, + "learning_rate": 5.008911965282083e-07, + "loss": 0.8941, + "step": 129270 + }, + { + "epoch": 10.018210701693208, + "grad_norm": 1.4184898598614282, + "learning_rate": 5.009299442033479e-07, + "loss": 0.8841, + "step": 129280 + }, + { + "epoch": 10.018985625169515, + "grad_norm": 1.4763599185525784, + "learning_rate": 5.009686918784874e-07, + "loss": 0.8931, + "step": 129290 + }, + { + "epoch": 10.019760548645822, + "grad_norm": 1.4306221740059213, + "learning_rate": 5.010074395536268e-07, + "loss": 0.8894, + "step": 129300 + }, + { + "epoch": 10.020535472122129, + "grad_norm": 1.4866168459210787, + "learning_rate": 5.010461872287663e-07, + "loss": 0.8999, + "step": 129310 + }, + { + "epoch": 10.021310395598436, + "grad_norm": 1.6088653243529072, + "learning_rate": 5.010849349039059e-07, + "loss": 0.8843, + "step": 129320 + }, + { + "epoch": 10.02208531907474, + "grad_norm": 1.488233709353002, + "learning_rate": 5.011236825790454e-07, + "loss": 0.8863, + "step": 129330 + }, + { + "epoch": 10.022860242551047, + "grad_norm": 1.569189592295576, + "learning_rate": 5.011624302541848e-07, + "loss": 0.9058, + "step": 129340 + }, + { + "epoch": 10.023635166027354, + "grad_norm": 1.5335307308057884, + "learning_rate": 5.012011779293243e-07, + "loss": 0.9002, + "step": 129350 + }, + { + "epoch": 10.024410089503661, + "grad_norm": 1.5197802649991843, + "learning_rate": 5.012399256044638e-07, + "loss": 0.8772, + "step": 129360 + }, + { + "epoch": 10.025185012979968, + "grad_norm": 1.4620438085190481, + "learning_rate": 5.012786732796032e-07, + "loss": 0.9179, + "step": 129370 + }, + { + "epoch": 10.025959936456275, + "grad_norm": 1.3593551605300067, + "learning_rate": 5.013174209547428e-07, + "loss": 0.8699, + "step": 129380 + }, + { + "epoch": 10.026734859932581, + "grad_norm": 1.5037761805918417, + "learning_rate": 5.013561686298823e-07, + "loss": 0.9097, + "step": 129390 + }, + { + "epoch": 10.027509783408888, + "grad_norm": 1.4126611034710153, + "learning_rate": 5.013949163050218e-07, + "loss": 0.8785, + "step": 129400 + }, + { + "epoch": 10.028284706885195, + "grad_norm": 1.4452498992696114, + "learning_rate": 5.014336639801612e-07, + "loss": 0.882, + "step": 129410 + }, + { + "epoch": 10.029059630361502, + "grad_norm": 1.437917363868492, + "learning_rate": 5.014724116553008e-07, + "loss": 0.8778, + "step": 129420 + }, + { + "epoch": 10.029834553837809, + "grad_norm": 1.4331210601388373, + "learning_rate": 5.015111593304403e-07, + "loss": 0.9158, + "step": 129430 + }, + { + "epoch": 10.030609477314115, + "grad_norm": 1.5761865406993039, + "learning_rate": 5.015499070055797e-07, + "loss": 0.8909, + "step": 129440 + }, + { + "epoch": 10.031384400790422, + "grad_norm": 1.4998598229475726, + "learning_rate": 5.015886546807192e-07, + "loss": 0.8938, + "step": 129450 + }, + { + "epoch": 10.032159324266729, + "grad_norm": 1.4999950899967973, + "learning_rate": 5.016274023558587e-07, + "loss": 0.9004, + "step": 129460 + }, + { + "epoch": 10.032934247743036, + "grad_norm": 1.4218748364473326, + "learning_rate": 5.016661500309983e-07, + "loss": 0.8884, + "step": 129470 + }, + { + "epoch": 10.033709171219343, + "grad_norm": 1.4066332056829072, + "learning_rate": 5.017048977061377e-07, + "loss": 0.9055, + "step": 129480 + }, + { + "epoch": 10.03448409469565, + "grad_norm": 1.4098165260828275, + "learning_rate": 5.017436453812772e-07, + "loss": 0.8749, + "step": 129490 + }, + { + "epoch": 10.035259018171956, + "grad_norm": 1.6464505143859214, + "learning_rate": 5.017823930564167e-07, + "loss": 0.9068, + "step": 129500 + }, + { + "epoch": 10.035259018171956, + "eval_loss": 0.9082522392272949, + "eval_runtime": 328.6966, + "eval_samples_per_second": 34.898, + "eval_steps_per_second": 8.725, + "step": 129500 + }, + { + "epoch": 10.036033941648263, + "grad_norm": 1.4461399033893994, + "learning_rate": 5.018211407315561e-07, + "loss": 0.899, + "step": 129510 + }, + { + "epoch": 10.036808865124568, + "grad_norm": 1.4298819533322689, + "learning_rate": 5.018598884066957e-07, + "loss": 0.8836, + "step": 129520 + }, + { + "epoch": 10.037583788600875, + "grad_norm": 1.4128229376346078, + "learning_rate": 5.018986360818352e-07, + "loss": 0.8942, + "step": 129530 + }, + { + "epoch": 10.038358712077182, + "grad_norm": 1.4350581040114883, + "learning_rate": 5.019373837569747e-07, + "loss": 0.8935, + "step": 129540 + }, + { + "epoch": 10.039133635553489, + "grad_norm": 1.4788598376258282, + "learning_rate": 5.019761314321141e-07, + "loss": 0.8901, + "step": 129550 + }, + { + "epoch": 10.039908559029795, + "grad_norm": 1.4593469229582818, + "learning_rate": 5.020148791072536e-07, + "loss": 0.8861, + "step": 129560 + }, + { + "epoch": 10.040683482506102, + "grad_norm": 1.4710439485354545, + "learning_rate": 5.020536267823932e-07, + "loss": 0.8937, + "step": 129570 + }, + { + "epoch": 10.041458405982409, + "grad_norm": 1.390136001112114, + "learning_rate": 5.020923744575326e-07, + "loss": 0.8988, + "step": 129580 + }, + { + "epoch": 10.042233329458716, + "grad_norm": 1.4888411909992323, + "learning_rate": 5.021311221326721e-07, + "loss": 0.8887, + "step": 129590 + }, + { + "epoch": 10.043008252935023, + "grad_norm": 1.5175878697805394, + "learning_rate": 5.021698698078116e-07, + "loss": 0.9104, + "step": 129600 + }, + { + "epoch": 10.04378317641133, + "grad_norm": 1.4286343402863935, + "learning_rate": 5.022086174829511e-07, + "loss": 0.8774, + "step": 129610 + }, + { + "epoch": 10.044558099887636, + "grad_norm": 1.5074945974557878, + "learning_rate": 5.022473651580906e-07, + "loss": 0.893, + "step": 129620 + }, + { + "epoch": 10.045333023363943, + "grad_norm": 1.512186150344271, + "learning_rate": 5.022861128332301e-07, + "loss": 0.9047, + "step": 129630 + }, + { + "epoch": 10.04610794684025, + "grad_norm": 1.5436447374213247, + "learning_rate": 5.023248605083696e-07, + "loss": 0.9013, + "step": 129640 + }, + { + "epoch": 10.046882870316557, + "grad_norm": 1.5046698386938888, + "learning_rate": 5.02363608183509e-07, + "loss": 0.8805, + "step": 129650 + }, + { + "epoch": 10.047657793792864, + "grad_norm": 1.9246434761042193, + "learning_rate": 5.024023558586485e-07, + "loss": 0.8941, + "step": 129660 + }, + { + "epoch": 10.04843271726917, + "grad_norm": 1.466858987737825, + "learning_rate": 5.024411035337881e-07, + "loss": 0.8832, + "step": 129670 + }, + { + "epoch": 10.049207640745477, + "grad_norm": 1.4912925709300453, + "learning_rate": 5.024798512089276e-07, + "loss": 0.8755, + "step": 129680 + }, + { + "epoch": 10.049982564221784, + "grad_norm": 1.5383700058992476, + "learning_rate": 5.02518598884067e-07, + "loss": 0.8857, + "step": 129690 + }, + { + "epoch": 10.050757487698089, + "grad_norm": 1.3633935794885734, + "learning_rate": 5.025573465592065e-07, + "loss": 0.9153, + "step": 129700 + }, + { + "epoch": 10.051532411174396, + "grad_norm": 1.5281263886668688, + "learning_rate": 5.02596094234346e-07, + "loss": 0.8966, + "step": 129710 + }, + { + "epoch": 10.052307334650703, + "grad_norm": 1.588253075032848, + "learning_rate": 5.026348419094855e-07, + "loss": 0.9159, + "step": 129720 + }, + { + "epoch": 10.05308225812701, + "grad_norm": 1.5296187111417459, + "learning_rate": 5.02673589584625e-07, + "loss": 0.908, + "step": 129730 + }, + { + "epoch": 10.053857181603316, + "grad_norm": 1.4246424462703657, + "learning_rate": 5.027123372597645e-07, + "loss": 0.8956, + "step": 129740 + }, + { + "epoch": 10.054632105079623, + "grad_norm": 1.403261892086313, + "learning_rate": 5.02751084934904e-07, + "loss": 0.9247, + "step": 129750 + }, + { + "epoch": 10.05540702855593, + "grad_norm": 1.415138029313472, + "learning_rate": 5.027898326100434e-07, + "loss": 0.899, + "step": 129760 + }, + { + "epoch": 10.056181952032237, + "grad_norm": 1.495222528205757, + "learning_rate": 5.02828580285183e-07, + "loss": 0.9058, + "step": 129770 + }, + { + "epoch": 10.056956875508543, + "grad_norm": 1.4564234633694948, + "learning_rate": 5.028673279603225e-07, + "loss": 0.9081, + "step": 129780 + }, + { + "epoch": 10.05773179898485, + "grad_norm": 1.4406700857689778, + "learning_rate": 5.029060756354619e-07, + "loss": 0.8958, + "step": 129790 + }, + { + "epoch": 10.058506722461157, + "grad_norm": 1.4376403222059355, + "learning_rate": 5.029448233106014e-07, + "loss": 0.9021, + "step": 129800 + }, + { + "epoch": 10.059281645937464, + "grad_norm": 1.4912065474385154, + "learning_rate": 5.029835709857409e-07, + "loss": 0.8854, + "step": 129810 + }, + { + "epoch": 10.06005656941377, + "grad_norm": 1.4282625592930502, + "learning_rate": 5.030223186608805e-07, + "loss": 0.9016, + "step": 129820 + }, + { + "epoch": 10.060831492890077, + "grad_norm": 1.4108509442215098, + "learning_rate": 5.030610663360199e-07, + "loss": 0.8974, + "step": 129830 + }, + { + "epoch": 10.061606416366384, + "grad_norm": 1.4873576754123297, + "learning_rate": 5.030998140111594e-07, + "loss": 0.8936, + "step": 129840 + }, + { + "epoch": 10.062381339842691, + "grad_norm": 1.4714560635023108, + "learning_rate": 5.031385616862989e-07, + "loss": 0.8769, + "step": 129850 + }, + { + "epoch": 10.063156263318998, + "grad_norm": 1.4495163868571654, + "learning_rate": 5.031773093614383e-07, + "loss": 0.9092, + "step": 129860 + }, + { + "epoch": 10.063931186795305, + "grad_norm": 1.3714490228887646, + "learning_rate": 5.032160570365779e-07, + "loss": 0.8886, + "step": 129870 + }, + { + "epoch": 10.064706110271612, + "grad_norm": 1.484223652639443, + "learning_rate": 5.032548047117174e-07, + "loss": 0.9022, + "step": 129880 + }, + { + "epoch": 10.065481033747917, + "grad_norm": 1.494538914633646, + "learning_rate": 5.032935523868568e-07, + "loss": 0.8729, + "step": 129890 + }, + { + "epoch": 10.066255957224223, + "grad_norm": 1.4942649823663676, + "learning_rate": 5.033323000619963e-07, + "loss": 0.9105, + "step": 129900 + }, + { + "epoch": 10.06703088070053, + "grad_norm": 1.446845454778427, + "learning_rate": 5.033710477371358e-07, + "loss": 0.8992, + "step": 129910 + }, + { + "epoch": 10.067805804176837, + "grad_norm": 1.4957900598175786, + "learning_rate": 5.034097954122754e-07, + "loss": 0.8943, + "step": 129920 + }, + { + "epoch": 10.068580727653144, + "grad_norm": 1.4927826441148186, + "learning_rate": 5.034485430874148e-07, + "loss": 0.8905, + "step": 129930 + }, + { + "epoch": 10.06935565112945, + "grad_norm": 1.4097514047979414, + "learning_rate": 5.034872907625543e-07, + "loss": 0.8828, + "step": 129940 + }, + { + "epoch": 10.070130574605757, + "grad_norm": 1.4098925505248714, + "learning_rate": 5.035260384376938e-07, + "loss": 0.8951, + "step": 129950 + }, + { + "epoch": 10.070905498082064, + "grad_norm": 1.483747738456451, + "learning_rate": 5.035647861128332e-07, + "loss": 0.8914, + "step": 129960 + }, + { + "epoch": 10.071680421558371, + "grad_norm": 1.4501662754057283, + "learning_rate": 5.036035337879728e-07, + "loss": 0.8834, + "step": 129970 + }, + { + "epoch": 10.072455345034678, + "grad_norm": 1.4577987463748014, + "learning_rate": 5.036422814631123e-07, + "loss": 0.9017, + "step": 129980 + }, + { + "epoch": 10.073230268510985, + "grad_norm": 1.4711267253967875, + "learning_rate": 5.036810291382518e-07, + "loss": 0.9016, + "step": 129990 + }, + { + "epoch": 10.074005191987291, + "grad_norm": 1.454285275424651, + "learning_rate": 5.037197768133912e-07, + "loss": 0.896, + "step": 130000 + }, + { + "epoch": 10.074005191987291, + "eval_loss": 0.9080500602722168, + "eval_runtime": 327.9187, + "eval_samples_per_second": 34.981, + "eval_steps_per_second": 8.746, + "step": 130000 + }, + { + "epoch": 10.074780115463598, + "grad_norm": 1.4731377073917586, + "learning_rate": 5.037585244885308e-07, + "loss": 0.888, + "step": 130010 + }, + { + "epoch": 10.075555038939905, + "grad_norm": 1.419619028582313, + "learning_rate": 5.037972721636703e-07, + "loss": 0.8861, + "step": 130020 + }, + { + "epoch": 10.076329962416212, + "grad_norm": 1.3832918086255725, + "learning_rate": 5.038360198388097e-07, + "loss": 0.8811, + "step": 130030 + }, + { + "epoch": 10.077104885892519, + "grad_norm": 1.469976194844761, + "learning_rate": 5.038747675139492e-07, + "loss": 0.8877, + "step": 130040 + }, + { + "epoch": 10.077879809368826, + "grad_norm": 1.5689607037605386, + "learning_rate": 5.039135151890887e-07, + "loss": 0.8989, + "step": 130050 + }, + { + "epoch": 10.078654732845132, + "grad_norm": 1.519274571059741, + "learning_rate": 5.039522628642283e-07, + "loss": 0.8787, + "step": 130060 + }, + { + "epoch": 10.079429656321437, + "grad_norm": 1.434381178988359, + "learning_rate": 5.039910105393677e-07, + "loss": 0.8883, + "step": 130070 + }, + { + "epoch": 10.080204579797744, + "grad_norm": 1.4072372707598297, + "learning_rate": 5.040297582145072e-07, + "loss": 0.9198, + "step": 130080 + }, + { + "epoch": 10.080979503274051, + "grad_norm": 1.485451811592041, + "learning_rate": 5.040685058896467e-07, + "loss": 0.9168, + "step": 130090 + }, + { + "epoch": 10.081754426750358, + "grad_norm": 1.534418906930626, + "learning_rate": 5.041072535647861e-07, + "loss": 0.9004, + "step": 130100 + }, + { + "epoch": 10.082529350226665, + "grad_norm": 1.4855540235123477, + "learning_rate": 5.041460012399257e-07, + "loss": 0.8934, + "step": 130110 + }, + { + "epoch": 10.083304273702971, + "grad_norm": 1.4222437664286043, + "learning_rate": 5.041847489150652e-07, + "loss": 0.9025, + "step": 130120 + }, + { + "epoch": 10.084079197179278, + "grad_norm": 1.3851694908653558, + "learning_rate": 5.042234965902047e-07, + "loss": 0.8563, + "step": 130130 + }, + { + "epoch": 10.084854120655585, + "grad_norm": 1.5348102430528945, + "learning_rate": 5.042622442653441e-07, + "loss": 0.8947, + "step": 130140 + }, + { + "epoch": 10.085629044131892, + "grad_norm": 1.6356365959624448, + "learning_rate": 5.043009919404836e-07, + "loss": 0.9134, + "step": 130150 + }, + { + "epoch": 10.086403967608199, + "grad_norm": 1.482294879121203, + "learning_rate": 5.043397396156232e-07, + "loss": 0.9078, + "step": 130160 + }, + { + "epoch": 10.087178891084505, + "grad_norm": 1.467808109808431, + "learning_rate": 5.043784872907626e-07, + "loss": 0.8954, + "step": 130170 + }, + { + "epoch": 10.087953814560812, + "grad_norm": 1.4747311878204241, + "learning_rate": 5.044172349659021e-07, + "loss": 0.8836, + "step": 130180 + }, + { + "epoch": 10.08872873803712, + "grad_norm": 1.4528820481820257, + "learning_rate": 5.044559826410416e-07, + "loss": 0.8825, + "step": 130190 + }, + { + "epoch": 10.089503661513426, + "grad_norm": 1.4902501096305785, + "learning_rate": 5.044947303161811e-07, + "loss": 0.8971, + "step": 130200 + }, + { + "epoch": 10.090278584989733, + "grad_norm": 1.434968453284724, + "learning_rate": 5.045334779913206e-07, + "loss": 0.8996, + "step": 130210 + }, + { + "epoch": 10.09105350846604, + "grad_norm": 1.513589213149128, + "learning_rate": 5.045722256664601e-07, + "loss": 0.8825, + "step": 130220 + }, + { + "epoch": 10.091828431942346, + "grad_norm": 1.469590146939004, + "learning_rate": 5.046109733415996e-07, + "loss": 0.8728, + "step": 130230 + }, + { + "epoch": 10.092603355418653, + "grad_norm": 1.4222048684271231, + "learning_rate": 5.04649721016739e-07, + "loss": 0.8883, + "step": 130240 + }, + { + "epoch": 10.09337827889496, + "grad_norm": 1.4926346529736365, + "learning_rate": 5.046884686918785e-07, + "loss": 0.8932, + "step": 130250 + }, + { + "epoch": 10.094153202371265, + "grad_norm": 1.438770700565587, + "learning_rate": 5.047272163670181e-07, + "loss": 0.8886, + "step": 130260 + }, + { + "epoch": 10.094928125847572, + "grad_norm": 1.491902026019589, + "learning_rate": 5.047659640421576e-07, + "loss": 0.8931, + "step": 130270 + }, + { + "epoch": 10.095703049323879, + "grad_norm": 1.4950771188034422, + "learning_rate": 5.04804711717297e-07, + "loss": 0.9065, + "step": 130280 + }, + { + "epoch": 10.096477972800185, + "grad_norm": 1.3908766698000334, + "learning_rate": 5.048434593924365e-07, + "loss": 0.8861, + "step": 130290 + }, + { + "epoch": 10.097252896276492, + "grad_norm": 1.4758503809048324, + "learning_rate": 5.04882207067576e-07, + "loss": 0.8642, + "step": 130300 + }, + { + "epoch": 10.098027819752799, + "grad_norm": 1.5327077978156487, + "learning_rate": 5.049209547427155e-07, + "loss": 0.8817, + "step": 130310 + }, + { + "epoch": 10.098802743229106, + "grad_norm": 1.4091765870315216, + "learning_rate": 5.04959702417855e-07, + "loss": 0.8989, + "step": 130320 + }, + { + "epoch": 10.099577666705413, + "grad_norm": 1.4180278666842163, + "learning_rate": 5.049984500929945e-07, + "loss": 0.8913, + "step": 130330 + }, + { + "epoch": 10.10035259018172, + "grad_norm": 1.498117507107984, + "learning_rate": 5.05037197768134e-07, + "loss": 0.8931, + "step": 130340 + }, + { + "epoch": 10.101127513658026, + "grad_norm": 1.398005209336396, + "learning_rate": 5.050759454432734e-07, + "loss": 0.885, + "step": 130350 + }, + { + "epoch": 10.101902437134333, + "grad_norm": 1.3780720751202686, + "learning_rate": 5.05114693118413e-07, + "loss": 0.8729, + "step": 130360 + }, + { + "epoch": 10.10267736061064, + "grad_norm": 1.5708835475755967, + "learning_rate": 5.051534407935525e-07, + "loss": 0.8975, + "step": 130370 + }, + { + "epoch": 10.103452284086947, + "grad_norm": 1.592523315705164, + "learning_rate": 5.051921884686919e-07, + "loss": 0.8751, + "step": 130380 + }, + { + "epoch": 10.104227207563254, + "grad_norm": 1.4918783010425323, + "learning_rate": 5.052309361438314e-07, + "loss": 0.884, + "step": 130390 + }, + { + "epoch": 10.10500213103956, + "grad_norm": 1.4644354614251742, + "learning_rate": 5.052696838189709e-07, + "loss": 0.8955, + "step": 130400 + }, + { + "epoch": 10.105777054515867, + "grad_norm": 1.4948550932213234, + "learning_rate": 5.053084314941105e-07, + "loss": 0.8961, + "step": 130410 + }, + { + "epoch": 10.106551977992174, + "grad_norm": 1.5009837345600037, + "learning_rate": 5.053471791692499e-07, + "loss": 0.8792, + "step": 130420 + }, + { + "epoch": 10.10732690146848, + "grad_norm": 1.5249249043076092, + "learning_rate": 5.053859268443894e-07, + "loss": 0.8874, + "step": 130430 + }, + { + "epoch": 10.108101824944786, + "grad_norm": 1.4389942014238188, + "learning_rate": 5.054246745195289e-07, + "loss": 0.8866, + "step": 130440 + }, + { + "epoch": 10.108876748421093, + "grad_norm": 1.4193688198780745, + "learning_rate": 5.054634221946683e-07, + "loss": 0.8737, + "step": 130450 + }, + { + "epoch": 10.1096516718974, + "grad_norm": 1.4659899573742305, + "learning_rate": 5.055021698698079e-07, + "loss": 0.8874, + "step": 130460 + }, + { + "epoch": 10.110426595373706, + "grad_norm": 1.3748597794426292, + "learning_rate": 5.055409175449474e-07, + "loss": 0.8903, + "step": 130470 + }, + { + "epoch": 10.111201518850013, + "grad_norm": 1.5104111567440746, + "learning_rate": 5.055796652200869e-07, + "loss": 0.8862, + "step": 130480 + }, + { + "epoch": 10.11197644232632, + "grad_norm": 1.4449642117850399, + "learning_rate": 5.056184128952263e-07, + "loss": 0.8762, + "step": 130490 + }, + { + "epoch": 10.112751365802627, + "grad_norm": 1.5146697167490462, + "learning_rate": 5.056571605703658e-07, + "loss": 0.8998, + "step": 130500 + }, + { + "epoch": 10.112751365802627, + "eval_loss": 0.9080873727798462, + "eval_runtime": 328.1729, + "eval_samples_per_second": 34.954, + "eval_steps_per_second": 8.739, + "step": 130500 + }, + { + "epoch": 10.113526289278933, + "grad_norm": 1.5367381065759484, + "learning_rate": 5.056959082455054e-07, + "loss": 0.8961, + "step": 130510 + }, + { + "epoch": 10.11430121275524, + "grad_norm": 1.4855701803255696, + "learning_rate": 5.057346559206448e-07, + "loss": 0.8925, + "step": 130520 + }, + { + "epoch": 10.115076136231547, + "grad_norm": 1.424245898015697, + "learning_rate": 5.057734035957843e-07, + "loss": 0.8898, + "step": 130530 + }, + { + "epoch": 10.115851059707854, + "grad_norm": 1.4742999072577794, + "learning_rate": 5.058121512709238e-07, + "loss": 0.8995, + "step": 130540 + }, + { + "epoch": 10.11662598318416, + "grad_norm": 1.4438591373415526, + "learning_rate": 5.058508989460633e-07, + "loss": 0.8917, + "step": 130550 + }, + { + "epoch": 10.117400906660468, + "grad_norm": 1.4133589899473515, + "learning_rate": 5.058896466212028e-07, + "loss": 0.8948, + "step": 130560 + }, + { + "epoch": 10.118175830136774, + "grad_norm": 1.5187473947384678, + "learning_rate": 5.059283942963423e-07, + "loss": 0.9072, + "step": 130570 + }, + { + "epoch": 10.118950753613081, + "grad_norm": 1.4802626440687947, + "learning_rate": 5.059671419714818e-07, + "loss": 0.8972, + "step": 130580 + }, + { + "epoch": 10.119725677089388, + "grad_norm": 1.4138599977046908, + "learning_rate": 5.060058896466212e-07, + "loss": 0.8792, + "step": 130590 + }, + { + "epoch": 10.120500600565695, + "grad_norm": 1.5196160362753495, + "learning_rate": 5.060446373217607e-07, + "loss": 0.8869, + "step": 130600 + }, + { + "epoch": 10.121275524042002, + "grad_norm": 1.4259356740672589, + "learning_rate": 5.060833849969003e-07, + "loss": 0.8936, + "step": 130610 + }, + { + "epoch": 10.122050447518308, + "grad_norm": 1.4226418129548684, + "learning_rate": 5.061221326720398e-07, + "loss": 0.9002, + "step": 130620 + }, + { + "epoch": 10.122825370994613, + "grad_norm": 1.416440670832179, + "learning_rate": 5.061608803471792e-07, + "loss": 0.8994, + "step": 130630 + }, + { + "epoch": 10.12360029447092, + "grad_norm": 1.478206538721932, + "learning_rate": 5.061996280223187e-07, + "loss": 0.8918, + "step": 130640 + }, + { + "epoch": 10.124375217947227, + "grad_norm": 1.3988313313626524, + "learning_rate": 5.062383756974583e-07, + "loss": 0.892, + "step": 130650 + }, + { + "epoch": 10.125150141423534, + "grad_norm": 1.4054388215276177, + "learning_rate": 5.062771233725977e-07, + "loss": 0.8831, + "step": 130660 + }, + { + "epoch": 10.12592506489984, + "grad_norm": 1.4791294599115241, + "learning_rate": 5.063158710477372e-07, + "loss": 0.8726, + "step": 130670 + }, + { + "epoch": 10.126699988376147, + "grad_norm": 1.4248011356691233, + "learning_rate": 5.063546187228767e-07, + "loss": 0.8943, + "step": 130680 + }, + { + "epoch": 10.127474911852454, + "grad_norm": 1.4197484339768396, + "learning_rate": 5.063933663980162e-07, + "loss": 0.8926, + "step": 130690 + }, + { + "epoch": 10.128249835328761, + "grad_norm": 1.4333684216868896, + "learning_rate": 5.064321140731556e-07, + "loss": 0.883, + "step": 130700 + }, + { + "epoch": 10.129024758805068, + "grad_norm": 1.417275577452165, + "learning_rate": 5.064708617482952e-07, + "loss": 0.8823, + "step": 130710 + }, + { + "epoch": 10.129799682281375, + "grad_norm": 1.4212030829772324, + "learning_rate": 5.065096094234347e-07, + "loss": 0.8723, + "step": 130720 + }, + { + "epoch": 10.130574605757682, + "grad_norm": 1.4339802097195753, + "learning_rate": 5.065483570985741e-07, + "loss": 0.9079, + "step": 130730 + }, + { + "epoch": 10.131349529233988, + "grad_norm": 1.4558020998267145, + "learning_rate": 5.065871047737136e-07, + "loss": 0.9042, + "step": 130740 + }, + { + "epoch": 10.132124452710295, + "grad_norm": 1.4633173941536097, + "learning_rate": 5.066258524488532e-07, + "loss": 0.8919, + "step": 130750 + }, + { + "epoch": 10.132899376186602, + "grad_norm": 1.5046353471488734, + "learning_rate": 5.066646001239927e-07, + "loss": 0.8816, + "step": 130760 + }, + { + "epoch": 10.133674299662909, + "grad_norm": 1.5278730269804786, + "learning_rate": 5.067033477991321e-07, + "loss": 0.8937, + "step": 130770 + }, + { + "epoch": 10.134449223139216, + "grad_norm": 1.4323983378409486, + "learning_rate": 5.067420954742716e-07, + "loss": 0.8932, + "step": 130780 + }, + { + "epoch": 10.135224146615522, + "grad_norm": 1.4973620512192034, + "learning_rate": 5.067808431494111e-07, + "loss": 0.8905, + "step": 130790 + }, + { + "epoch": 10.13599907009183, + "grad_norm": 1.4787786001477146, + "learning_rate": 5.068195908245506e-07, + "loss": 0.8896, + "step": 130800 + }, + { + "epoch": 10.136773993568136, + "grad_norm": 1.5268841618872198, + "learning_rate": 5.068583384996901e-07, + "loss": 0.8924, + "step": 130810 + }, + { + "epoch": 10.137548917044441, + "grad_norm": 1.4100535237917111, + "learning_rate": 5.068970861748296e-07, + "loss": 0.9043, + "step": 130820 + }, + { + "epoch": 10.138323840520748, + "grad_norm": 1.5392158590554579, + "learning_rate": 5.069358338499691e-07, + "loss": 0.9137, + "step": 130830 + }, + { + "epoch": 10.139098763997055, + "grad_norm": 1.4469818848580656, + "learning_rate": 5.069745815251085e-07, + "loss": 0.8807, + "step": 130840 + }, + { + "epoch": 10.139873687473361, + "grad_norm": 1.5035485108907676, + "learning_rate": 5.070133292002481e-07, + "loss": 0.8796, + "step": 130850 + }, + { + "epoch": 10.140648610949668, + "grad_norm": 1.4516537072664752, + "learning_rate": 5.070520768753876e-07, + "loss": 0.8943, + "step": 130860 + }, + { + "epoch": 10.141423534425975, + "grad_norm": 1.4934642960793945, + "learning_rate": 5.07090824550527e-07, + "loss": 0.9179, + "step": 130870 + }, + { + "epoch": 10.142198457902282, + "grad_norm": 1.5054579211980252, + "learning_rate": 5.071295722256665e-07, + "loss": 0.9167, + "step": 130880 + }, + { + "epoch": 10.142973381378589, + "grad_norm": 1.604911064083177, + "learning_rate": 5.07168319900806e-07, + "loss": 0.8903, + "step": 130890 + }, + { + "epoch": 10.143748304854896, + "grad_norm": 1.5162707308080896, + "learning_rate": 5.072070675759456e-07, + "loss": 0.8869, + "step": 130900 + }, + { + "epoch": 10.144523228331202, + "grad_norm": 1.5602745407775809, + "learning_rate": 5.07245815251085e-07, + "loss": 0.8879, + "step": 130910 + }, + { + "epoch": 10.14529815180751, + "grad_norm": 1.5140604049732473, + "learning_rate": 5.072845629262245e-07, + "loss": 0.8937, + "step": 130920 + }, + { + "epoch": 10.146073075283816, + "grad_norm": 1.4081258199899642, + "learning_rate": 5.07323310601364e-07, + "loss": 0.8904, + "step": 130930 + }, + { + "epoch": 10.146847998760123, + "grad_norm": 1.5149127625350731, + "learning_rate": 5.073620582765034e-07, + "loss": 0.8972, + "step": 130940 + }, + { + "epoch": 10.14762292223643, + "grad_norm": 1.568806283957743, + "learning_rate": 5.07400805951643e-07, + "loss": 0.9114, + "step": 130950 + }, + { + "epoch": 10.148397845712736, + "grad_norm": 1.5136495591172434, + "learning_rate": 5.074395536267825e-07, + "loss": 0.8804, + "step": 130960 + }, + { + "epoch": 10.149172769189043, + "grad_norm": 1.4995716029624975, + "learning_rate": 5.07478301301922e-07, + "loss": 0.884, + "step": 130970 + }, + { + "epoch": 10.14994769266535, + "grad_norm": 1.3822459095656296, + "learning_rate": 5.075170489770614e-07, + "loss": 0.9018, + "step": 130980 + }, + { + "epoch": 10.150722616141657, + "grad_norm": 1.453944016942295, + "learning_rate": 5.075557966522009e-07, + "loss": 0.9118, + "step": 130990 + }, + { + "epoch": 10.151497539617962, + "grad_norm": 1.5006016993966103, + "learning_rate": 5.075945443273405e-07, + "loss": 0.9086, + "step": 131000 + }, + { + "epoch": 10.151497539617962, + "eval_loss": 0.9078992009162903, + "eval_runtime": 327.3931, + "eval_samples_per_second": 35.037, + "eval_steps_per_second": 8.76, + "step": 131000 + }, + { + "epoch": 10.152272463094269, + "grad_norm": 1.4213630572587297, + "learning_rate": 5.076332920024799e-07, + "loss": 0.8853, + "step": 131010 + }, + { + "epoch": 10.153047386570575, + "grad_norm": 1.437883390835838, + "learning_rate": 5.076720396776194e-07, + "loss": 0.9243, + "step": 131020 + }, + { + "epoch": 10.153822310046882, + "grad_norm": 1.5556009167779123, + "learning_rate": 5.077107873527589e-07, + "loss": 0.8921, + "step": 131030 + }, + { + "epoch": 10.154597233523189, + "grad_norm": 1.488738649631902, + "learning_rate": 5.077495350278984e-07, + "loss": 0.8866, + "step": 131040 + }, + { + "epoch": 10.155372156999496, + "grad_norm": 1.4094568284234625, + "learning_rate": 5.077882827030379e-07, + "loss": 0.896, + "step": 131050 + }, + { + "epoch": 10.156147080475803, + "grad_norm": 1.545412860449697, + "learning_rate": 5.078270303781774e-07, + "loss": 0.917, + "step": 131060 + }, + { + "epoch": 10.15692200395211, + "grad_norm": 1.5239377641903897, + "learning_rate": 5.078657780533169e-07, + "loss": 0.9096, + "step": 131070 + }, + { + "epoch": 10.157696927428416, + "grad_norm": 1.6054831833249281, + "learning_rate": 5.079045257284563e-07, + "loss": 0.8892, + "step": 131080 + }, + { + "epoch": 10.158471850904723, + "grad_norm": 1.4755766163813469, + "learning_rate": 5.079432734035958e-07, + "loss": 0.9128, + "step": 131090 + }, + { + "epoch": 10.15924677438103, + "grad_norm": 1.4491040743544978, + "learning_rate": 5.079820210787354e-07, + "loss": 0.8775, + "step": 131100 + }, + { + "epoch": 10.160021697857337, + "grad_norm": 1.4629186922708801, + "learning_rate": 5.080207687538749e-07, + "loss": 0.8937, + "step": 131110 + }, + { + "epoch": 10.160796621333644, + "grad_norm": 1.5188405174552055, + "learning_rate": 5.080595164290143e-07, + "loss": 0.8936, + "step": 131120 + }, + { + "epoch": 10.16157154480995, + "grad_norm": 1.3846499009341624, + "learning_rate": 5.080982641041538e-07, + "loss": 0.9015, + "step": 131130 + }, + { + "epoch": 10.162346468286257, + "grad_norm": 1.4754931380281322, + "learning_rate": 5.081370117792933e-07, + "loss": 0.8842, + "step": 131140 + }, + { + "epoch": 10.163121391762564, + "grad_norm": 1.4993333639273105, + "learning_rate": 5.081757594544328e-07, + "loss": 0.9116, + "step": 131150 + }, + { + "epoch": 10.16389631523887, + "grad_norm": 1.6774187825032587, + "learning_rate": 5.082145071295723e-07, + "loss": 0.8991, + "step": 131160 + }, + { + "epoch": 10.164671238715178, + "grad_norm": 1.5223789989024468, + "learning_rate": 5.082532548047118e-07, + "loss": 0.8819, + "step": 131170 + }, + { + "epoch": 10.165446162191484, + "grad_norm": 1.5394613362722007, + "learning_rate": 5.082920024798513e-07, + "loss": 0.8976, + "step": 131180 + }, + { + "epoch": 10.16622108566779, + "grad_norm": 1.5389451229868822, + "learning_rate": 5.083307501549907e-07, + "loss": 0.8907, + "step": 131190 + }, + { + "epoch": 10.166996009144096, + "grad_norm": 1.4583724129726099, + "learning_rate": 5.083694978301303e-07, + "loss": 0.8763, + "step": 131200 + }, + { + "epoch": 10.167770932620403, + "grad_norm": 1.4728712470435361, + "learning_rate": 5.084082455052698e-07, + "loss": 0.8914, + "step": 131210 + }, + { + "epoch": 10.16854585609671, + "grad_norm": 1.4182458599924774, + "learning_rate": 5.084469931804092e-07, + "loss": 0.9319, + "step": 131220 + }, + { + "epoch": 10.169320779573017, + "grad_norm": 1.4309966432041918, + "learning_rate": 5.084857408555487e-07, + "loss": 0.9137, + "step": 131230 + }, + { + "epoch": 10.170095703049324, + "grad_norm": 1.4832413302856364, + "learning_rate": 5.085244885306882e-07, + "loss": 0.8786, + "step": 131240 + }, + { + "epoch": 10.17087062652563, + "grad_norm": 1.4864870095209466, + "learning_rate": 5.085632362058278e-07, + "loss": 0.8896, + "step": 131250 + }, + { + "epoch": 10.171645550001937, + "grad_norm": 1.4803128303682076, + "learning_rate": 5.086019838809672e-07, + "loss": 0.8846, + "step": 131260 + }, + { + "epoch": 10.172420473478244, + "grad_norm": 1.5970558631988079, + "learning_rate": 5.086407315561067e-07, + "loss": 0.9183, + "step": 131270 + }, + { + "epoch": 10.17319539695455, + "grad_norm": 1.420167150522375, + "learning_rate": 5.086794792312462e-07, + "loss": 0.9071, + "step": 131280 + }, + { + "epoch": 10.173970320430858, + "grad_norm": 1.4122619137978834, + "learning_rate": 5.087182269063856e-07, + "loss": 0.8899, + "step": 131290 + }, + { + "epoch": 10.174745243907164, + "grad_norm": 1.4202249875911994, + "learning_rate": 5.087569745815252e-07, + "loss": 0.8999, + "step": 131300 + }, + { + "epoch": 10.175520167383471, + "grad_norm": 1.3715460460315987, + "learning_rate": 5.087957222566647e-07, + "loss": 0.9115, + "step": 131310 + }, + { + "epoch": 10.176295090859778, + "grad_norm": 1.5465742289426798, + "learning_rate": 5.088344699318041e-07, + "loss": 0.9154, + "step": 131320 + }, + { + "epoch": 10.177070014336085, + "grad_norm": 1.500170575621103, + "learning_rate": 5.088732176069436e-07, + "loss": 0.8901, + "step": 131330 + }, + { + "epoch": 10.177844937812392, + "grad_norm": 1.509484102007462, + "learning_rate": 5.089119652820832e-07, + "loss": 0.8798, + "step": 131340 + }, + { + "epoch": 10.178619861288698, + "grad_norm": 1.5124629273658332, + "learning_rate": 5.089507129572227e-07, + "loss": 0.9081, + "step": 131350 + }, + { + "epoch": 10.179394784765005, + "grad_norm": 1.525097308966362, + "learning_rate": 5.089894606323621e-07, + "loss": 0.872, + "step": 131360 + }, + { + "epoch": 10.180169708241312, + "grad_norm": 1.4892585752259497, + "learning_rate": 5.090282083075016e-07, + "loss": 0.8961, + "step": 131370 + }, + { + "epoch": 10.180944631717617, + "grad_norm": 1.4960724711789546, + "learning_rate": 5.090669559826411e-07, + "loss": 0.8852, + "step": 131380 + }, + { + "epoch": 10.181719555193924, + "grad_norm": 1.4609116129674482, + "learning_rate": 5.091057036577805e-07, + "loss": 0.8788, + "step": 131390 + }, + { + "epoch": 10.18249447867023, + "grad_norm": 1.5523533589914076, + "learning_rate": 5.091444513329201e-07, + "loss": 0.8973, + "step": 131400 + }, + { + "epoch": 10.183269402146538, + "grad_norm": 1.4613838621717403, + "learning_rate": 5.091831990080596e-07, + "loss": 0.9167, + "step": 131410 + }, + { + "epoch": 10.184044325622844, + "grad_norm": 1.4862877011399986, + "learning_rate": 5.092219466831991e-07, + "loss": 0.8675, + "step": 131420 + }, + { + "epoch": 10.184819249099151, + "grad_norm": 1.459935896484218, + "learning_rate": 5.092606943583385e-07, + "loss": 0.8773, + "step": 131430 + }, + { + "epoch": 10.185594172575458, + "grad_norm": 1.4528761112226618, + "learning_rate": 5.09299442033478e-07, + "loss": 0.886, + "step": 131440 + }, + { + "epoch": 10.186369096051765, + "grad_norm": 1.448262306449891, + "learning_rate": 5.093381897086176e-07, + "loss": 0.8939, + "step": 131450 + }, + { + "epoch": 10.187144019528072, + "grad_norm": 1.474235619944454, + "learning_rate": 5.09376937383757e-07, + "loss": 0.888, + "step": 131460 + }, + { + "epoch": 10.187918943004378, + "grad_norm": 1.4711956311660197, + "learning_rate": 5.094156850588965e-07, + "loss": 0.898, + "step": 131470 + }, + { + "epoch": 10.188693866480685, + "grad_norm": 1.4439638510800248, + "learning_rate": 5.09454432734036e-07, + "loss": 0.9086, + "step": 131480 + }, + { + "epoch": 10.189468789956992, + "grad_norm": 1.4984775400600392, + "learning_rate": 5.094931804091756e-07, + "loss": 0.8898, + "step": 131490 + }, + { + "epoch": 10.190243713433299, + "grad_norm": 1.394784233576682, + "learning_rate": 5.09531928084315e-07, + "loss": 0.8837, + "step": 131500 + }, + { + "epoch": 10.190243713433299, + "eval_loss": 0.9076955318450928, + "eval_runtime": 329.7886, + "eval_samples_per_second": 34.783, + "eval_steps_per_second": 8.696, + "step": 131500 + }, + { + "epoch": 10.191018636909606, + "grad_norm": 1.514713055337194, + "learning_rate": 5.095706757594545e-07, + "loss": 0.8968, + "step": 131510 + }, + { + "epoch": 10.191793560385912, + "grad_norm": 1.565692645006517, + "learning_rate": 5.09609423434594e-07, + "loss": 0.9097, + "step": 131520 + }, + { + "epoch": 10.19256848386222, + "grad_norm": 1.3907578713924826, + "learning_rate": 5.096481711097334e-07, + "loss": 0.8799, + "step": 131530 + }, + { + "epoch": 10.193343407338526, + "grad_norm": 1.4696022404263143, + "learning_rate": 5.09686918784873e-07, + "loss": 0.9055, + "step": 131540 + }, + { + "epoch": 10.194118330814833, + "grad_norm": 1.4366148734101474, + "learning_rate": 5.097256664600125e-07, + "loss": 0.8869, + "step": 131550 + }, + { + "epoch": 10.194893254291138, + "grad_norm": 1.4786615354547965, + "learning_rate": 5.09764414135152e-07, + "loss": 0.8951, + "step": 131560 + }, + { + "epoch": 10.195668177767445, + "grad_norm": 1.4742378621731067, + "learning_rate": 5.098031618102914e-07, + "loss": 0.8784, + "step": 131570 + }, + { + "epoch": 10.196443101243752, + "grad_norm": 1.4036498879527237, + "learning_rate": 5.098419094854309e-07, + "loss": 0.9089, + "step": 131580 + }, + { + "epoch": 10.197218024720058, + "grad_norm": 1.478590036895601, + "learning_rate": 5.098806571605705e-07, + "loss": 0.882, + "step": 131590 + }, + { + "epoch": 10.197992948196365, + "grad_norm": 1.5275947041292819, + "learning_rate": 5.099194048357099e-07, + "loss": 0.8999, + "step": 131600 + }, + { + "epoch": 10.198767871672672, + "grad_norm": 1.426650789112297, + "learning_rate": 5.099581525108494e-07, + "loss": 0.89, + "step": 131610 + }, + { + "epoch": 10.199542795148979, + "grad_norm": 1.473752262213086, + "learning_rate": 5.099969001859889e-07, + "loss": 0.8976, + "step": 131620 + }, + { + "epoch": 10.200317718625286, + "grad_norm": 1.4651842780162232, + "learning_rate": 5.100356478611284e-07, + "loss": 0.9034, + "step": 131630 + }, + { + "epoch": 10.201092642101592, + "grad_norm": 1.5192227514235535, + "learning_rate": 5.100743955362679e-07, + "loss": 0.9029, + "step": 131640 + }, + { + "epoch": 10.2018675655779, + "grad_norm": 1.3919223057556083, + "learning_rate": 5.101131432114074e-07, + "loss": 0.8922, + "step": 131650 + }, + { + "epoch": 10.202642489054206, + "grad_norm": 1.4841667809300096, + "learning_rate": 5.101518908865469e-07, + "loss": 0.8853, + "step": 131660 + }, + { + "epoch": 10.203417412530513, + "grad_norm": 1.475980129558764, + "learning_rate": 5.101906385616863e-07, + "loss": 0.8783, + "step": 131670 + }, + { + "epoch": 10.20419233600682, + "grad_norm": 1.4364079699184285, + "learning_rate": 5.102293862368258e-07, + "loss": 0.9014, + "step": 131680 + }, + { + "epoch": 10.204967259483126, + "grad_norm": 1.4792207672594984, + "learning_rate": 5.102681339119654e-07, + "loss": 0.8988, + "step": 131690 + }, + { + "epoch": 10.205742182959433, + "grad_norm": 1.451691329225587, + "learning_rate": 5.103068815871049e-07, + "loss": 0.8758, + "step": 131700 + }, + { + "epoch": 10.20651710643574, + "grad_norm": 1.472536774353604, + "learning_rate": 5.103456292622443e-07, + "loss": 0.8944, + "step": 131710 + }, + { + "epoch": 10.207292029912047, + "grad_norm": 1.5124515772051004, + "learning_rate": 5.103843769373838e-07, + "loss": 0.9096, + "step": 131720 + }, + { + "epoch": 10.208066953388354, + "grad_norm": 1.4204505116499095, + "learning_rate": 5.104231246125233e-07, + "loss": 0.8965, + "step": 131730 + }, + { + "epoch": 10.20884187686466, + "grad_norm": 1.4941366284598738, + "learning_rate": 5.104618722876628e-07, + "loss": 0.8861, + "step": 131740 + }, + { + "epoch": 10.209616800340966, + "grad_norm": 1.4982388448745336, + "learning_rate": 5.105006199628023e-07, + "loss": 0.9034, + "step": 131750 + }, + { + "epoch": 10.210391723817272, + "grad_norm": 1.5179510559304585, + "learning_rate": 5.105393676379418e-07, + "loss": 0.8904, + "step": 131760 + }, + { + "epoch": 10.21116664729358, + "grad_norm": 1.4580290397388485, + "learning_rate": 5.105781153130813e-07, + "loss": 0.8993, + "step": 131770 + }, + { + "epoch": 10.211941570769886, + "grad_norm": 1.46546004106173, + "learning_rate": 5.106168629882207e-07, + "loss": 0.9009, + "step": 131780 + }, + { + "epoch": 10.212716494246193, + "grad_norm": 1.461695359207616, + "learning_rate": 5.106556106633603e-07, + "loss": 0.8843, + "step": 131790 + }, + { + "epoch": 10.2134914177225, + "grad_norm": 1.444469393807656, + "learning_rate": 5.106943583384998e-07, + "loss": 0.9096, + "step": 131800 + }, + { + "epoch": 10.214266341198806, + "grad_norm": 1.581543252961387, + "learning_rate": 5.107331060136392e-07, + "loss": 0.8955, + "step": 131810 + }, + { + "epoch": 10.215041264675113, + "grad_norm": 1.5420881620074522, + "learning_rate": 5.107718536887787e-07, + "loss": 0.8959, + "step": 131820 + }, + { + "epoch": 10.21581618815142, + "grad_norm": 1.3418561773299986, + "learning_rate": 5.108106013639182e-07, + "loss": 0.8947, + "step": 131830 + }, + { + "epoch": 10.216591111627727, + "grad_norm": 1.6075772215097766, + "learning_rate": 5.108493490390578e-07, + "loss": 0.9095, + "step": 131840 + }, + { + "epoch": 10.217366035104034, + "grad_norm": 1.317072789147677, + "learning_rate": 5.108880967141972e-07, + "loss": 0.9024, + "step": 131850 + }, + { + "epoch": 10.21814095858034, + "grad_norm": 1.4679237944914483, + "learning_rate": 5.109268443893367e-07, + "loss": 0.8829, + "step": 131860 + }, + { + "epoch": 10.218915882056647, + "grad_norm": 1.470122219269635, + "learning_rate": 5.109655920644762e-07, + "loss": 0.8715, + "step": 131870 + }, + { + "epoch": 10.219690805532954, + "grad_norm": 1.4398459953905725, + "learning_rate": 5.110043397396156e-07, + "loss": 0.9214, + "step": 131880 + }, + { + "epoch": 10.22046572900926, + "grad_norm": 1.461419365161722, + "learning_rate": 5.110430874147552e-07, + "loss": 0.8889, + "step": 131890 + }, + { + "epoch": 10.221240652485568, + "grad_norm": 1.5139306307942453, + "learning_rate": 5.110818350898947e-07, + "loss": 0.9204, + "step": 131900 + }, + { + "epoch": 10.222015575961874, + "grad_norm": 1.4566410730991968, + "learning_rate": 5.111205827650342e-07, + "loss": 0.8967, + "step": 131910 + }, + { + "epoch": 10.222790499438181, + "grad_norm": 1.4870136383571397, + "learning_rate": 5.111593304401736e-07, + "loss": 0.8775, + "step": 131920 + }, + { + "epoch": 10.223565422914486, + "grad_norm": 1.4852061663883365, + "learning_rate": 5.111980781153131e-07, + "loss": 0.894, + "step": 131930 + }, + { + "epoch": 10.224340346390793, + "grad_norm": 1.554390245321734, + "learning_rate": 5.112368257904527e-07, + "loss": 0.8922, + "step": 131940 + }, + { + "epoch": 10.2251152698671, + "grad_norm": 1.4819280976889169, + "learning_rate": 5.112755734655921e-07, + "loss": 0.883, + "step": 131950 + }, + { + "epoch": 10.225890193343407, + "grad_norm": 1.4860425218751272, + "learning_rate": 5.113143211407316e-07, + "loss": 0.8897, + "step": 131960 + }, + { + "epoch": 10.226665116819714, + "grad_norm": 1.4523715365229444, + "learning_rate": 5.113530688158711e-07, + "loss": 0.8927, + "step": 131970 + }, + { + "epoch": 10.22744004029602, + "grad_norm": 1.5464591632983729, + "learning_rate": 5.113918164910107e-07, + "loss": 0.9169, + "step": 131980 + }, + { + "epoch": 10.228214963772327, + "grad_norm": 1.509729854972989, + "learning_rate": 5.114305641661501e-07, + "loss": 0.8753, + "step": 131990 + }, + { + "epoch": 10.228989887248634, + "grad_norm": 1.4159867041645666, + "learning_rate": 5.114693118412896e-07, + "loss": 0.8981, + "step": 132000 + }, + { + "epoch": 10.228989887248634, + "eval_loss": 0.9072683453559875, + "eval_runtime": 333.4263, + "eval_samples_per_second": 34.403, + "eval_steps_per_second": 8.602, + "step": 132000 + }, + { + "epoch": 10.22976481072494, + "grad_norm": 1.5127769177412194, + "learning_rate": 5.115080595164291e-07, + "loss": 0.8906, + "step": 132010 + }, + { + "epoch": 10.230539734201248, + "grad_norm": 1.4906531044425662, + "learning_rate": 5.115468071915685e-07, + "loss": 0.8812, + "step": 132020 + }, + { + "epoch": 10.231314657677554, + "grad_norm": 1.4361116312325046, + "learning_rate": 5.11585554866708e-07, + "loss": 0.9261, + "step": 132030 + }, + { + "epoch": 10.232089581153861, + "grad_norm": 1.5005944212922633, + "learning_rate": 5.116243025418476e-07, + "loss": 0.8981, + "step": 132040 + }, + { + "epoch": 10.232864504630168, + "grad_norm": 1.4702234256938045, + "learning_rate": 5.116630502169871e-07, + "loss": 0.9113, + "step": 132050 + }, + { + "epoch": 10.233639428106475, + "grad_norm": 1.5644058074231573, + "learning_rate": 5.117017978921265e-07, + "loss": 0.8941, + "step": 132060 + }, + { + "epoch": 10.234414351582782, + "grad_norm": 1.4450756677777608, + "learning_rate": 5.11740545567266e-07, + "loss": 0.8652, + "step": 132070 + }, + { + "epoch": 10.235189275059088, + "grad_norm": 1.5175837425062644, + "learning_rate": 5.117792932424056e-07, + "loss": 0.8984, + "step": 132080 + }, + { + "epoch": 10.235964198535395, + "grad_norm": 1.391063918789499, + "learning_rate": 5.11818040917545e-07, + "loss": 0.8805, + "step": 132090 + }, + { + "epoch": 10.236739122011702, + "grad_norm": 1.5007150928712372, + "learning_rate": 5.118567885926845e-07, + "loss": 0.8892, + "step": 132100 + }, + { + "epoch": 10.237514045488009, + "grad_norm": 1.392117637934074, + "learning_rate": 5.11895536267824e-07, + "loss": 0.9002, + "step": 132110 + }, + { + "epoch": 10.238288968964314, + "grad_norm": 1.4441369406638889, + "learning_rate": 5.119342839429635e-07, + "loss": 0.8942, + "step": 132120 + }, + { + "epoch": 10.23906389244062, + "grad_norm": 1.5105659655344714, + "learning_rate": 5.11973031618103e-07, + "loss": 0.9034, + "step": 132130 + }, + { + "epoch": 10.239838815916928, + "grad_norm": 1.3836435551406852, + "learning_rate": 5.120117792932425e-07, + "loss": 0.8697, + "step": 132140 + }, + { + "epoch": 10.240613739393234, + "grad_norm": 1.5736807818749683, + "learning_rate": 5.12050526968382e-07, + "loss": 0.8836, + "step": 132150 + }, + { + "epoch": 10.241388662869541, + "grad_norm": 1.6054369084729874, + "learning_rate": 5.120892746435214e-07, + "loss": 0.8911, + "step": 132160 + }, + { + "epoch": 10.242163586345848, + "grad_norm": 1.4635350458945828, + "learning_rate": 5.121280223186609e-07, + "loss": 0.9021, + "step": 132170 + }, + { + "epoch": 10.242938509822155, + "grad_norm": 1.4792071259560462, + "learning_rate": 5.121667699938005e-07, + "loss": 0.9028, + "step": 132180 + }, + { + "epoch": 10.243713433298462, + "grad_norm": 1.483942324972298, + "learning_rate": 5.1220551766894e-07, + "loss": 0.872, + "step": 132190 + }, + { + "epoch": 10.244488356774768, + "grad_norm": 1.3728358559417015, + "learning_rate": 5.122442653440794e-07, + "loss": 0.899, + "step": 132200 + }, + { + "epoch": 10.245263280251075, + "grad_norm": 1.425331572889188, + "learning_rate": 5.122830130192189e-07, + "loss": 0.8799, + "step": 132210 + }, + { + "epoch": 10.246038203727382, + "grad_norm": 1.4858004834432772, + "learning_rate": 5.123217606943584e-07, + "loss": 0.9039, + "step": 132220 + }, + { + "epoch": 10.246813127203689, + "grad_norm": 1.4784467938408776, + "learning_rate": 5.123605083694979e-07, + "loss": 0.9046, + "step": 132230 + }, + { + "epoch": 10.247588050679996, + "grad_norm": 1.484636428006167, + "learning_rate": 5.123992560446374e-07, + "loss": 0.9119, + "step": 132240 + }, + { + "epoch": 10.248362974156302, + "grad_norm": 1.4368960852383255, + "learning_rate": 5.124380037197769e-07, + "loss": 0.8848, + "step": 132250 + }, + { + "epoch": 10.24913789763261, + "grad_norm": 1.5812240874437262, + "learning_rate": 5.124767513949164e-07, + "loss": 0.8947, + "step": 132260 + }, + { + "epoch": 10.249912821108916, + "grad_norm": 1.4540595290255534, + "learning_rate": 5.125154990700558e-07, + "loss": 0.8864, + "step": 132270 + }, + { + "epoch": 10.250687744585223, + "grad_norm": 1.417691019740487, + "learning_rate": 5.125542467451954e-07, + "loss": 0.8857, + "step": 132280 + }, + { + "epoch": 10.25146266806153, + "grad_norm": 1.49256452899107, + "learning_rate": 5.125929944203349e-07, + "loss": 0.9013, + "step": 132290 + }, + { + "epoch": 10.252237591537835, + "grad_norm": 1.5298478793233377, + "learning_rate": 5.126317420954743e-07, + "loss": 0.9017, + "step": 132300 + }, + { + "epoch": 10.253012515014142, + "grad_norm": 1.4341326634860254, + "learning_rate": 5.126704897706138e-07, + "loss": 0.8995, + "step": 132310 + }, + { + "epoch": 10.253787438490448, + "grad_norm": 1.4229954237615348, + "learning_rate": 5.127092374457533e-07, + "loss": 0.8858, + "step": 132320 + }, + { + "epoch": 10.254562361966755, + "grad_norm": 1.5163304283142434, + "learning_rate": 5.127479851208929e-07, + "loss": 0.8858, + "step": 132330 + }, + { + "epoch": 10.255337285443062, + "grad_norm": 1.4291199762541063, + "learning_rate": 5.127867327960323e-07, + "loss": 0.9116, + "step": 132340 + }, + { + "epoch": 10.256112208919369, + "grad_norm": 1.4400717418833755, + "learning_rate": 5.128254804711718e-07, + "loss": 0.884, + "step": 132350 + }, + { + "epoch": 10.256887132395676, + "grad_norm": 1.523554040694319, + "learning_rate": 5.128642281463113e-07, + "loss": 0.9033, + "step": 132360 + }, + { + "epoch": 10.257662055871982, + "grad_norm": 1.4732119908822336, + "learning_rate": 5.129029758214507e-07, + "loss": 0.8717, + "step": 132370 + }, + { + "epoch": 10.25843697934829, + "grad_norm": 1.5072689119759597, + "learning_rate": 5.129417234965903e-07, + "loss": 0.8987, + "step": 132380 + }, + { + "epoch": 10.259211902824596, + "grad_norm": 1.5233194878191476, + "learning_rate": 5.129804711717298e-07, + "loss": 0.8857, + "step": 132390 + }, + { + "epoch": 10.259986826300903, + "grad_norm": 1.5163122612566882, + "learning_rate": 5.130192188468693e-07, + "loss": 0.8983, + "step": 132400 + }, + { + "epoch": 10.26076174977721, + "grad_norm": 1.4409370677908027, + "learning_rate": 5.130579665220087e-07, + "loss": 0.8973, + "step": 132410 + }, + { + "epoch": 10.261536673253516, + "grad_norm": 1.5101148706905552, + "learning_rate": 5.130967141971482e-07, + "loss": 0.8895, + "step": 132420 + }, + { + "epoch": 10.262311596729823, + "grad_norm": 1.5024586091002503, + "learning_rate": 5.131354618722878e-07, + "loss": 0.8816, + "step": 132430 + }, + { + "epoch": 10.26308652020613, + "grad_norm": 1.4132571069896451, + "learning_rate": 5.131742095474272e-07, + "loss": 0.9086, + "step": 132440 + }, + { + "epoch": 10.263861443682437, + "grad_norm": 1.6062834797135448, + "learning_rate": 5.132129572225667e-07, + "loss": 0.8781, + "step": 132450 + }, + { + "epoch": 10.264636367158744, + "grad_norm": 1.5376585708646324, + "learning_rate": 5.132517048977062e-07, + "loss": 0.8969, + "step": 132460 + }, + { + "epoch": 10.26541129063505, + "grad_norm": 1.3430649279039926, + "learning_rate": 5.132904525728457e-07, + "loss": 0.8954, + "step": 132470 + }, + { + "epoch": 10.266186214111357, + "grad_norm": 1.4646685062127538, + "learning_rate": 5.133292002479852e-07, + "loss": 0.8681, + "step": 132480 + }, + { + "epoch": 10.266961137587662, + "grad_norm": 1.4339151341956466, + "learning_rate": 5.133679479231247e-07, + "loss": 0.8927, + "step": 132490 + }, + { + "epoch": 10.26773606106397, + "grad_norm": 1.4150304961272786, + "learning_rate": 5.134066955982642e-07, + "loss": 0.9125, + "step": 132500 + }, + { + "epoch": 10.26773606106397, + "eval_loss": 0.9071922302246094, + "eval_runtime": 332.4129, + "eval_samples_per_second": 34.508, + "eval_steps_per_second": 8.628, + "step": 132500 + }, + { + "epoch": 10.268510984540276, + "grad_norm": 1.4150283209290053, + "learning_rate": 5.134454432734036e-07, + "loss": 0.8824, + "step": 132510 + }, + { + "epoch": 10.269285908016583, + "grad_norm": 1.4473872570275705, + "learning_rate": 5.134841909485431e-07, + "loss": 0.8767, + "step": 132520 + }, + { + "epoch": 10.27006083149289, + "grad_norm": 1.4210350670235794, + "learning_rate": 5.135229386236827e-07, + "loss": 0.9031, + "step": 132530 + }, + { + "epoch": 10.270835754969196, + "grad_norm": 1.5024912644224466, + "learning_rate": 5.135616862988222e-07, + "loss": 0.8996, + "step": 132540 + }, + { + "epoch": 10.271610678445503, + "grad_norm": 1.6030478802713, + "learning_rate": 5.136004339739616e-07, + "loss": 0.9137, + "step": 132550 + }, + { + "epoch": 10.27238560192181, + "grad_norm": 1.4100868788105585, + "learning_rate": 5.136391816491011e-07, + "loss": 0.8757, + "step": 132560 + }, + { + "epoch": 10.273160525398117, + "grad_norm": 1.6230097592330937, + "learning_rate": 5.136779293242406e-07, + "loss": 0.9118, + "step": 132570 + }, + { + "epoch": 10.273935448874424, + "grad_norm": 1.5484020018538238, + "learning_rate": 5.137166769993801e-07, + "loss": 0.9051, + "step": 132580 + }, + { + "epoch": 10.27471037235073, + "grad_norm": 1.4851897799346077, + "learning_rate": 5.137554246745196e-07, + "loss": 0.9102, + "step": 132590 + }, + { + "epoch": 10.275485295827037, + "grad_norm": 1.5497002614876412, + "learning_rate": 5.137941723496591e-07, + "loss": 0.9076, + "step": 132600 + }, + { + "epoch": 10.276260219303344, + "grad_norm": 1.476023667379437, + "learning_rate": 5.138329200247986e-07, + "loss": 0.886, + "step": 132610 + }, + { + "epoch": 10.277035142779651, + "grad_norm": 1.4387972009478376, + "learning_rate": 5.13871667699938e-07, + "loss": 0.9076, + "step": 132620 + }, + { + "epoch": 10.277810066255958, + "grad_norm": 1.5735542874084112, + "learning_rate": 5.139104153750776e-07, + "loss": 0.904, + "step": 132630 + }, + { + "epoch": 10.278584989732265, + "grad_norm": 1.3932424177555351, + "learning_rate": 5.139491630502171e-07, + "loss": 0.8837, + "step": 132640 + }, + { + "epoch": 10.279359913208571, + "grad_norm": 1.4975958582735822, + "learning_rate": 5.139879107253565e-07, + "loss": 0.884, + "step": 132650 + }, + { + "epoch": 10.280134836684878, + "grad_norm": 1.3306813477746304, + "learning_rate": 5.14026658400496e-07, + "loss": 0.8976, + "step": 132660 + }, + { + "epoch": 10.280909760161183, + "grad_norm": 1.446081481294508, + "learning_rate": 5.140654060756356e-07, + "loss": 0.8985, + "step": 132670 + }, + { + "epoch": 10.28168468363749, + "grad_norm": 1.5835437387139464, + "learning_rate": 5.141041537507751e-07, + "loss": 0.9064, + "step": 132680 + }, + { + "epoch": 10.282459607113797, + "grad_norm": 1.5450439708711214, + "learning_rate": 5.141429014259145e-07, + "loss": 0.8806, + "step": 132690 + }, + { + "epoch": 10.283234530590104, + "grad_norm": 1.5292361276728186, + "learning_rate": 5.14181649101054e-07, + "loss": 0.8789, + "step": 132700 + }, + { + "epoch": 10.28400945406641, + "grad_norm": 1.4468268456819218, + "learning_rate": 5.142203967761935e-07, + "loss": 0.9161, + "step": 132710 + }, + { + "epoch": 10.284784377542717, + "grad_norm": 1.4749597561997088, + "learning_rate": 5.14259144451333e-07, + "loss": 0.8893, + "step": 132720 + }, + { + "epoch": 10.285559301019024, + "grad_norm": 1.4092866733033513, + "learning_rate": 5.142978921264725e-07, + "loss": 0.9276, + "step": 132730 + }, + { + "epoch": 10.28633422449533, + "grad_norm": 1.486233661716359, + "learning_rate": 5.14336639801612e-07, + "loss": 0.9006, + "step": 132740 + }, + { + "epoch": 10.287109147971638, + "grad_norm": 1.4908145274602465, + "learning_rate": 5.143753874767515e-07, + "loss": 0.8835, + "step": 132750 + }, + { + "epoch": 10.287884071447944, + "grad_norm": 1.3978286604838632, + "learning_rate": 5.144141351518909e-07, + "loss": 0.9022, + "step": 132760 + }, + { + "epoch": 10.288658994924251, + "grad_norm": 1.4763510347913036, + "learning_rate": 5.144528828270305e-07, + "loss": 0.9023, + "step": 132770 + }, + { + "epoch": 10.289433918400558, + "grad_norm": 1.5106758997209226, + "learning_rate": 5.1449163050217e-07, + "loss": 0.8838, + "step": 132780 + }, + { + "epoch": 10.290208841876865, + "grad_norm": 1.4571360242490874, + "learning_rate": 5.145303781773094e-07, + "loss": 0.8822, + "step": 132790 + }, + { + "epoch": 10.290983765353172, + "grad_norm": 1.4827715300318454, + "learning_rate": 5.145691258524489e-07, + "loss": 0.8902, + "step": 132800 + }, + { + "epoch": 10.291758688829479, + "grad_norm": 1.492258730891454, + "learning_rate": 5.146078735275884e-07, + "loss": 0.8971, + "step": 132810 + }, + { + "epoch": 10.292533612305785, + "grad_norm": 1.4953990154919325, + "learning_rate": 5.146466212027279e-07, + "loss": 0.8885, + "step": 132820 + }, + { + "epoch": 10.293308535782092, + "grad_norm": 1.4293356605699257, + "learning_rate": 5.146853688778674e-07, + "loss": 0.9219, + "step": 132830 + }, + { + "epoch": 10.294083459258399, + "grad_norm": 1.5140988743351063, + "learning_rate": 5.147241165530069e-07, + "loss": 0.8896, + "step": 132840 + }, + { + "epoch": 10.294858382734706, + "grad_norm": 1.4849373004332236, + "learning_rate": 5.147628642281464e-07, + "loss": 0.9206, + "step": 132850 + }, + { + "epoch": 10.29563330621101, + "grad_norm": 1.474396969559561, + "learning_rate": 5.148016119032858e-07, + "loss": 0.8932, + "step": 132860 + }, + { + "epoch": 10.296408229687318, + "grad_norm": 1.4415579901999718, + "learning_rate": 5.148403595784254e-07, + "loss": 0.8878, + "step": 132870 + }, + { + "epoch": 10.297183153163624, + "grad_norm": 1.4883150858269252, + "learning_rate": 5.148791072535649e-07, + "loss": 0.8866, + "step": 132880 + }, + { + "epoch": 10.297958076639931, + "grad_norm": 1.4458459931942733, + "learning_rate": 5.149178549287043e-07, + "loss": 0.886, + "step": 132890 + }, + { + "epoch": 10.298733000116238, + "grad_norm": 1.4475728353117445, + "learning_rate": 5.149566026038438e-07, + "loss": 0.9152, + "step": 132900 + }, + { + "epoch": 10.299507923592545, + "grad_norm": 1.5284317291045584, + "learning_rate": 5.149953502789833e-07, + "loss": 0.8755, + "step": 132910 + }, + { + "epoch": 10.300282847068852, + "grad_norm": 1.4594204893396474, + "learning_rate": 5.150340979541229e-07, + "loss": 0.909, + "step": 132920 + }, + { + "epoch": 10.301057770545158, + "grad_norm": 1.4917254121760097, + "learning_rate": 5.150728456292623e-07, + "loss": 0.9018, + "step": 132930 + }, + { + "epoch": 10.301832694021465, + "grad_norm": 1.5264020848766129, + "learning_rate": 5.151115933044018e-07, + "loss": 0.9066, + "step": 132940 + }, + { + "epoch": 10.302607617497772, + "grad_norm": 1.572051343317392, + "learning_rate": 5.151503409795413e-07, + "loss": 0.8786, + "step": 132950 + }, + { + "epoch": 10.303382540974079, + "grad_norm": 1.60092008616018, + "learning_rate": 5.151890886546807e-07, + "loss": 0.9, + "step": 132960 + }, + { + "epoch": 10.304157464450386, + "grad_norm": 1.4612437958321651, + "learning_rate": 5.152278363298203e-07, + "loss": 0.8904, + "step": 132970 + }, + { + "epoch": 10.304932387926693, + "grad_norm": 1.4934943572878907, + "learning_rate": 5.152665840049598e-07, + "loss": 0.9105, + "step": 132980 + }, + { + "epoch": 10.305707311403, + "grad_norm": 1.4630392561471413, + "learning_rate": 5.153053316800993e-07, + "loss": 0.8963, + "step": 132990 + }, + { + "epoch": 10.306482234879306, + "grad_norm": 1.470936618957929, + "learning_rate": 5.153440793552387e-07, + "loss": 0.8974, + "step": 133000 + }, + { + "epoch": 10.306482234879306, + "eval_loss": 0.9069667458534241, + "eval_runtime": 333.5883, + "eval_samples_per_second": 34.387, + "eval_steps_per_second": 8.597, + "step": 133000 + }, + { + "epoch": 10.307257158355613, + "grad_norm": 1.3902050916477664, + "learning_rate": 5.153828270303782e-07, + "loss": 0.904, + "step": 133010 + }, + { + "epoch": 10.30803208183192, + "grad_norm": 1.5299301291608902, + "learning_rate": 5.154215747055178e-07, + "loss": 0.9106, + "step": 133020 + }, + { + "epoch": 10.308807005308227, + "grad_norm": 1.4679522783525993, + "learning_rate": 5.154603223806572e-07, + "loss": 0.9007, + "step": 133030 + }, + { + "epoch": 10.309581928784532, + "grad_norm": 1.4952728349517328, + "learning_rate": 5.154990700557967e-07, + "loss": 0.9054, + "step": 133040 + }, + { + "epoch": 10.310356852260838, + "grad_norm": 1.3828072879288487, + "learning_rate": 5.155378177309362e-07, + "loss": 0.8988, + "step": 133050 + }, + { + "epoch": 10.311131775737145, + "grad_norm": 1.4963826636194302, + "learning_rate": 5.155765654060757e-07, + "loss": 0.9347, + "step": 133060 + }, + { + "epoch": 10.311906699213452, + "grad_norm": 1.422928912889341, + "learning_rate": 5.156153130812152e-07, + "loss": 0.9162, + "step": 133070 + }, + { + "epoch": 10.312681622689759, + "grad_norm": 1.4833453261877811, + "learning_rate": 5.156540607563547e-07, + "loss": 0.8911, + "step": 133080 + }, + { + "epoch": 10.313456546166066, + "grad_norm": 1.456076322640726, + "learning_rate": 5.156928084314942e-07, + "loss": 0.8911, + "step": 133090 + }, + { + "epoch": 10.314231469642372, + "grad_norm": 1.5099010679504419, + "learning_rate": 5.157315561066336e-07, + "loss": 0.9055, + "step": 133100 + }, + { + "epoch": 10.31500639311868, + "grad_norm": 1.4312727838911903, + "learning_rate": 5.157703037817731e-07, + "loss": 0.9142, + "step": 133110 + }, + { + "epoch": 10.315781316594986, + "grad_norm": 1.333428883380274, + "learning_rate": 5.158090514569127e-07, + "loss": 0.8844, + "step": 133120 + }, + { + "epoch": 10.316556240071293, + "grad_norm": 1.502380790809101, + "learning_rate": 5.158477991320522e-07, + "loss": 0.8968, + "step": 133130 + }, + { + "epoch": 10.3173311635476, + "grad_norm": 1.563366274657955, + "learning_rate": 5.158865468071916e-07, + "loss": 0.8953, + "step": 133140 + }, + { + "epoch": 10.318106087023907, + "grad_norm": 1.563384330042297, + "learning_rate": 5.159252944823311e-07, + "loss": 0.8809, + "step": 133150 + }, + { + "epoch": 10.318881010500213, + "grad_norm": 1.4977369752847107, + "learning_rate": 5.159640421574706e-07, + "loss": 0.8831, + "step": 133160 + }, + { + "epoch": 10.31965593397652, + "grad_norm": 1.5327039486394225, + "learning_rate": 5.160027898326101e-07, + "loss": 0.882, + "step": 133170 + }, + { + "epoch": 10.320430857452827, + "grad_norm": 1.464858218705618, + "learning_rate": 5.160415375077496e-07, + "loss": 0.8864, + "step": 133180 + }, + { + "epoch": 10.321205780929134, + "grad_norm": 1.4033082096309093, + "learning_rate": 5.160802851828891e-07, + "loss": 0.8837, + "step": 133190 + }, + { + "epoch": 10.32198070440544, + "grad_norm": 1.4757303062359821, + "learning_rate": 5.161190328580286e-07, + "loss": 0.9095, + "step": 133200 + }, + { + "epoch": 10.322755627881747, + "grad_norm": 1.399846531736037, + "learning_rate": 5.16157780533168e-07, + "loss": 0.9032, + "step": 133210 + }, + { + "epoch": 10.323530551358054, + "grad_norm": 1.43210807923052, + "learning_rate": 5.161965282083076e-07, + "loss": 0.8747, + "step": 133220 + }, + { + "epoch": 10.324305474834361, + "grad_norm": 1.472051600398896, + "learning_rate": 5.162352758834471e-07, + "loss": 0.8787, + "step": 133230 + }, + { + "epoch": 10.325080398310666, + "grad_norm": 1.5397261002996687, + "learning_rate": 5.162740235585865e-07, + "loss": 0.9051, + "step": 133240 + }, + { + "epoch": 10.325855321786973, + "grad_norm": 1.5715839256236739, + "learning_rate": 5.16312771233726e-07, + "loss": 0.8926, + "step": 133250 + }, + { + "epoch": 10.32663024526328, + "grad_norm": 1.4757598427191547, + "learning_rate": 5.163515189088655e-07, + "loss": 0.8995, + "step": 133260 + }, + { + "epoch": 10.327405168739586, + "grad_norm": 1.4448002073921071, + "learning_rate": 5.163902665840051e-07, + "loss": 0.8921, + "step": 133270 + }, + { + "epoch": 10.328180092215893, + "grad_norm": 1.4923201092426177, + "learning_rate": 5.164290142591445e-07, + "loss": 0.9028, + "step": 133280 + }, + { + "epoch": 10.3289550156922, + "grad_norm": 1.5844165718920455, + "learning_rate": 5.16467761934284e-07, + "loss": 0.9048, + "step": 133290 + }, + { + "epoch": 10.329729939168507, + "grad_norm": 1.499241301710453, + "learning_rate": 5.165065096094235e-07, + "loss": 0.8769, + "step": 133300 + }, + { + "epoch": 10.330504862644814, + "grad_norm": 1.389556294093219, + "learning_rate": 5.165452572845629e-07, + "loss": 0.8798, + "step": 133310 + }, + { + "epoch": 10.33127978612112, + "grad_norm": 1.4525332098934016, + "learning_rate": 5.165840049597025e-07, + "loss": 0.9014, + "step": 133320 + }, + { + "epoch": 10.332054709597427, + "grad_norm": 1.4838062894019401, + "learning_rate": 5.16622752634842e-07, + "loss": 0.8749, + "step": 133330 + }, + { + "epoch": 10.332829633073734, + "grad_norm": 1.5308444525681242, + "learning_rate": 5.166615003099815e-07, + "loss": 0.8926, + "step": 133340 + }, + { + "epoch": 10.333604556550041, + "grad_norm": 1.442502984375027, + "learning_rate": 5.167002479851209e-07, + "loss": 0.8988, + "step": 133350 + }, + { + "epoch": 10.334379480026348, + "grad_norm": 1.4835516120642138, + "learning_rate": 5.167389956602604e-07, + "loss": 0.8912, + "step": 133360 + }, + { + "epoch": 10.335154403502655, + "grad_norm": 1.4353715497845652, + "learning_rate": 5.167777433354e-07, + "loss": 0.8666, + "step": 133370 + }, + { + "epoch": 10.335929326978961, + "grad_norm": 1.6140291356347138, + "learning_rate": 5.168164910105394e-07, + "loss": 0.9029, + "step": 133380 + }, + { + "epoch": 10.336704250455268, + "grad_norm": 1.4616722263093127, + "learning_rate": 5.168552386856789e-07, + "loss": 0.8754, + "step": 133390 + }, + { + "epoch": 10.337479173931575, + "grad_norm": 1.4577543655949674, + "learning_rate": 5.168939863608184e-07, + "loss": 0.8854, + "step": 133400 + }, + { + "epoch": 10.338254097407882, + "grad_norm": 1.4692992922332635, + "learning_rate": 5.16932734035958e-07, + "loss": 0.9024, + "step": 133410 + }, + { + "epoch": 10.339029020884187, + "grad_norm": 1.4392480232027345, + "learning_rate": 5.169714817110974e-07, + "loss": 0.8942, + "step": 133420 + }, + { + "epoch": 10.339803944360494, + "grad_norm": 1.4921357193073004, + "learning_rate": 5.170102293862369e-07, + "loss": 0.9107, + "step": 133430 + }, + { + "epoch": 10.3405788678368, + "grad_norm": 1.5148602271785203, + "learning_rate": 5.170489770613764e-07, + "loss": 0.8986, + "step": 133440 + }, + { + "epoch": 10.341353791313107, + "grad_norm": 1.4783076517900744, + "learning_rate": 5.170877247365158e-07, + "loss": 0.89, + "step": 133450 + }, + { + "epoch": 10.342128714789414, + "grad_norm": 1.4456930404438162, + "learning_rate": 5.171264724116554e-07, + "loss": 0.8847, + "step": 133460 + }, + { + "epoch": 10.342903638265721, + "grad_norm": 1.442395240806603, + "learning_rate": 5.171652200867949e-07, + "loss": 0.8909, + "step": 133470 + }, + { + "epoch": 10.343678561742028, + "grad_norm": 1.4425871649237645, + "learning_rate": 5.172039677619344e-07, + "loss": 0.8839, + "step": 133480 + }, + { + "epoch": 10.344453485218335, + "grad_norm": 1.4368329172378873, + "learning_rate": 5.172427154370738e-07, + "loss": 0.8971, + "step": 133490 + }, + { + "epoch": 10.345228408694641, + "grad_norm": 1.4498450119052009, + "learning_rate": 5.172814631122133e-07, + "loss": 0.904, + "step": 133500 + }, + { + "epoch": 10.345228408694641, + "eval_loss": 0.9066358804702759, + "eval_runtime": 331.5786, + "eval_samples_per_second": 34.595, + "eval_steps_per_second": 8.65, + "step": 133500 + }, + { + "epoch": 10.346003332170948, + "grad_norm": 1.4669687358578747, + "learning_rate": 5.173202107873529e-07, + "loss": 0.9103, + "step": 133510 + }, + { + "epoch": 10.346778255647255, + "grad_norm": 1.4013702532951602, + "learning_rate": 5.173589584624923e-07, + "loss": 0.922, + "step": 133520 + }, + { + "epoch": 10.347553179123562, + "grad_norm": 1.5348087345924262, + "learning_rate": 5.173977061376318e-07, + "loss": 0.8785, + "step": 133530 + }, + { + "epoch": 10.348328102599869, + "grad_norm": 1.4026005335045593, + "learning_rate": 5.174364538127713e-07, + "loss": 0.886, + "step": 133540 + }, + { + "epoch": 10.349103026076175, + "grad_norm": 1.5200660449604455, + "learning_rate": 5.174752014879108e-07, + "loss": 0.8905, + "step": 133550 + }, + { + "epoch": 10.349877949552482, + "grad_norm": 1.455365879636586, + "learning_rate": 5.175139491630503e-07, + "loss": 0.906, + "step": 133560 + }, + { + "epoch": 10.350652873028789, + "grad_norm": 1.372984144533403, + "learning_rate": 5.175526968381898e-07, + "loss": 0.8984, + "step": 133570 + }, + { + "epoch": 10.351427796505096, + "grad_norm": 1.4377011410924685, + "learning_rate": 5.175914445133293e-07, + "loss": 0.8907, + "step": 133580 + }, + { + "epoch": 10.352202719981403, + "grad_norm": 1.4505966614927146, + "learning_rate": 5.176301921884687e-07, + "loss": 0.9115, + "step": 133590 + }, + { + "epoch": 10.35297764345771, + "grad_norm": 1.6314566666545025, + "learning_rate": 5.176689398636082e-07, + "loss": 0.8744, + "step": 133600 + }, + { + "epoch": 10.353752566934014, + "grad_norm": 1.4914235865494097, + "learning_rate": 5.177076875387478e-07, + "loss": 0.8764, + "step": 133610 + }, + { + "epoch": 10.354527490410321, + "grad_norm": 1.4800616469582704, + "learning_rate": 5.177464352138873e-07, + "loss": 0.8909, + "step": 133620 + }, + { + "epoch": 10.355302413886628, + "grad_norm": 1.45032569897334, + "learning_rate": 5.177851828890267e-07, + "loss": 0.8809, + "step": 133630 + }, + { + "epoch": 10.356077337362935, + "grad_norm": 1.5008031456417184, + "learning_rate": 5.178239305641662e-07, + "loss": 0.883, + "step": 133640 + }, + { + "epoch": 10.356852260839242, + "grad_norm": 1.4330647282036328, + "learning_rate": 5.178626782393057e-07, + "loss": 0.8996, + "step": 133650 + }, + { + "epoch": 10.357627184315549, + "grad_norm": 1.50048096663945, + "learning_rate": 5.179014259144452e-07, + "loss": 0.8787, + "step": 133660 + }, + { + "epoch": 10.358402107791855, + "grad_norm": 1.502777707899337, + "learning_rate": 5.179401735895847e-07, + "loss": 0.8992, + "step": 133670 + }, + { + "epoch": 10.359177031268162, + "grad_norm": 1.3860029565605887, + "learning_rate": 5.179789212647242e-07, + "loss": 0.869, + "step": 133680 + }, + { + "epoch": 10.359951954744469, + "grad_norm": 1.5158114652068184, + "learning_rate": 5.180176689398637e-07, + "loss": 0.9046, + "step": 133690 + }, + { + "epoch": 10.360726878220776, + "grad_norm": 1.7891936097251113, + "learning_rate": 5.180564166150031e-07, + "loss": 0.9306, + "step": 133700 + }, + { + "epoch": 10.361501801697083, + "grad_norm": 1.5052248915304318, + "learning_rate": 5.180951642901427e-07, + "loss": 0.9029, + "step": 133710 + }, + { + "epoch": 10.36227672517339, + "grad_norm": 1.4975713879256243, + "learning_rate": 5.181339119652822e-07, + "loss": 0.9097, + "step": 133720 + }, + { + "epoch": 10.363051648649696, + "grad_norm": 1.443894218210395, + "learning_rate": 5.181726596404216e-07, + "loss": 0.9333, + "step": 133730 + }, + { + "epoch": 10.363826572126003, + "grad_norm": 1.5743760186062745, + "learning_rate": 5.182114073155611e-07, + "loss": 0.9009, + "step": 133740 + }, + { + "epoch": 10.36460149560231, + "grad_norm": 1.4920867914223503, + "learning_rate": 5.182501549907006e-07, + "loss": 0.8918, + "step": 133750 + }, + { + "epoch": 10.365376419078617, + "grad_norm": 1.4464799773232313, + "learning_rate": 5.182889026658402e-07, + "loss": 0.8955, + "step": 133760 + }, + { + "epoch": 10.366151342554923, + "grad_norm": 1.5482633456570536, + "learning_rate": 5.183276503409796e-07, + "loss": 0.8834, + "step": 133770 + }, + { + "epoch": 10.36692626603123, + "grad_norm": 1.528711623072885, + "learning_rate": 5.183663980161191e-07, + "loss": 0.8974, + "step": 133780 + }, + { + "epoch": 10.367701189507535, + "grad_norm": 1.4421288863447492, + "learning_rate": 5.184051456912586e-07, + "loss": 0.9037, + "step": 133790 + }, + { + "epoch": 10.368476112983842, + "grad_norm": 1.4608335540733088, + "learning_rate": 5.18443893366398e-07, + "loss": 0.8846, + "step": 133800 + }, + { + "epoch": 10.369251036460149, + "grad_norm": 1.41268769603756, + "learning_rate": 5.184826410415376e-07, + "loss": 0.8988, + "step": 133810 + }, + { + "epoch": 10.370025959936456, + "grad_norm": 1.4904128669341568, + "learning_rate": 5.185213887166771e-07, + "loss": 0.8949, + "step": 133820 + }, + { + "epoch": 10.370800883412763, + "grad_norm": 1.4167067753987435, + "learning_rate": 5.185601363918166e-07, + "loss": 0.8825, + "step": 133830 + }, + { + "epoch": 10.37157580688907, + "grad_norm": 1.5002892746382221, + "learning_rate": 5.18598884066956e-07, + "loss": 0.9122, + "step": 133840 + }, + { + "epoch": 10.372350730365376, + "grad_norm": 1.4537226596815187, + "learning_rate": 5.186376317420955e-07, + "loss": 0.8941, + "step": 133850 + }, + { + "epoch": 10.373125653841683, + "grad_norm": 1.4425985270866697, + "learning_rate": 5.186763794172351e-07, + "loss": 0.8942, + "step": 133860 + }, + { + "epoch": 10.37390057731799, + "grad_norm": 1.4849355438979044, + "learning_rate": 5.187151270923745e-07, + "loss": 0.8879, + "step": 133870 + }, + { + "epoch": 10.374675500794297, + "grad_norm": 1.4017454135959049, + "learning_rate": 5.18753874767514e-07, + "loss": 0.8729, + "step": 133880 + }, + { + "epoch": 10.375450424270603, + "grad_norm": 1.4265648529952317, + "learning_rate": 5.187926224426535e-07, + "loss": 0.9049, + "step": 133890 + }, + { + "epoch": 10.37622534774691, + "grad_norm": 1.4704307861842227, + "learning_rate": 5.18831370117793e-07, + "loss": 0.8912, + "step": 133900 + }, + { + "epoch": 10.377000271223217, + "grad_norm": 1.4630463502415458, + "learning_rate": 5.188701177929325e-07, + "loss": 0.8778, + "step": 133910 + }, + { + "epoch": 10.377775194699524, + "grad_norm": 1.468855281153841, + "learning_rate": 5.18908865468072e-07, + "loss": 0.8945, + "step": 133920 + }, + { + "epoch": 10.37855011817583, + "grad_norm": 1.5229236745984054, + "learning_rate": 5.189476131432115e-07, + "loss": 0.8917, + "step": 133930 + }, + { + "epoch": 10.379325041652137, + "grad_norm": 1.436924043583488, + "learning_rate": 5.189863608183509e-07, + "loss": 0.8675, + "step": 133940 + }, + { + "epoch": 10.380099965128444, + "grad_norm": 1.4685403355710345, + "learning_rate": 5.190251084934904e-07, + "loss": 0.8951, + "step": 133950 + }, + { + "epoch": 10.380874888604751, + "grad_norm": 1.5226003987838126, + "learning_rate": 5.1906385616863e-07, + "loss": 0.8877, + "step": 133960 + }, + { + "epoch": 10.381649812081058, + "grad_norm": 1.5446349459473243, + "learning_rate": 5.191026038437695e-07, + "loss": 0.8843, + "step": 133970 + }, + { + "epoch": 10.382424735557363, + "grad_norm": 1.439541684778517, + "learning_rate": 5.191413515189089e-07, + "loss": 0.882, + "step": 133980 + }, + { + "epoch": 10.38319965903367, + "grad_norm": 1.5288943960879509, + "learning_rate": 5.191800991940484e-07, + "loss": 0.8981, + "step": 133990 + }, + { + "epoch": 10.383974582509977, + "grad_norm": 1.5288767375262358, + "learning_rate": 5.19218846869188e-07, + "loss": 0.9137, + "step": 134000 + }, + { + "epoch": 10.383974582509977, + "eval_loss": 0.9065219759941101, + "eval_runtime": 331.3989, + "eval_samples_per_second": 34.614, + "eval_steps_per_second": 8.654, + "step": 134000 + }, + { + "epoch": 10.384749505986283, + "grad_norm": 1.488080974239302, + "learning_rate": 5.192575945443274e-07, + "loss": 0.8834, + "step": 134010 + }, + { + "epoch": 10.38552442946259, + "grad_norm": 1.558563859162339, + "learning_rate": 5.192963422194669e-07, + "loss": 0.8893, + "step": 134020 + }, + { + "epoch": 10.386299352938897, + "grad_norm": 1.4443254085606396, + "learning_rate": 5.193350898946064e-07, + "loss": 0.8925, + "step": 134030 + }, + { + "epoch": 10.387074276415204, + "grad_norm": 1.4461247445617433, + "learning_rate": 5.193738375697459e-07, + "loss": 0.8726, + "step": 134040 + }, + { + "epoch": 10.38784919989151, + "grad_norm": 1.4755301111494503, + "learning_rate": 5.194125852448853e-07, + "loss": 0.8963, + "step": 134050 + }, + { + "epoch": 10.388624123367817, + "grad_norm": 1.4870977218383497, + "learning_rate": 5.194513329200249e-07, + "loss": 0.9116, + "step": 134060 + }, + { + "epoch": 10.389399046844124, + "grad_norm": 1.4224430426453425, + "learning_rate": 5.194900805951644e-07, + "loss": 0.9213, + "step": 134070 + }, + { + "epoch": 10.390173970320431, + "grad_norm": 1.4992029408443959, + "learning_rate": 5.195288282703038e-07, + "loss": 0.8856, + "step": 134080 + }, + { + "epoch": 10.390948893796738, + "grad_norm": 1.5692872165283331, + "learning_rate": 5.195675759454433e-07, + "loss": 0.9, + "step": 134090 + }, + { + "epoch": 10.391723817273045, + "grad_norm": 1.4076646890785345, + "learning_rate": 5.196063236205829e-07, + "loss": 0.9038, + "step": 134100 + }, + { + "epoch": 10.392498740749351, + "grad_norm": 1.3933737124653887, + "learning_rate": 5.196450712957224e-07, + "loss": 0.882, + "step": 134110 + }, + { + "epoch": 10.393273664225658, + "grad_norm": 1.4101877452754967, + "learning_rate": 5.196838189708618e-07, + "loss": 0.8742, + "step": 134120 + }, + { + "epoch": 10.394048587701965, + "grad_norm": 1.5198219766460563, + "learning_rate": 5.197225666460013e-07, + "loss": 0.8956, + "step": 134130 + }, + { + "epoch": 10.394823511178272, + "grad_norm": 1.4416285036545782, + "learning_rate": 5.197613143211408e-07, + "loss": 0.9137, + "step": 134140 + }, + { + "epoch": 10.395598434654579, + "grad_norm": 1.4168823654481226, + "learning_rate": 5.198000619962803e-07, + "loss": 0.8811, + "step": 134150 + }, + { + "epoch": 10.396373358130884, + "grad_norm": 1.5617516750967195, + "learning_rate": 5.198388096714198e-07, + "loss": 0.9003, + "step": 134160 + }, + { + "epoch": 10.39714828160719, + "grad_norm": 1.3823087331806434, + "learning_rate": 5.198775573465593e-07, + "loss": 0.8977, + "step": 134170 + }, + { + "epoch": 10.397923205083497, + "grad_norm": 1.4621243704343043, + "learning_rate": 5.199163050216988e-07, + "loss": 0.8889, + "step": 134180 + }, + { + "epoch": 10.398698128559804, + "grad_norm": 1.4409320978965519, + "learning_rate": 5.199550526968382e-07, + "loss": 0.8835, + "step": 134190 + }, + { + "epoch": 10.399473052036111, + "grad_norm": 1.4985149313629316, + "learning_rate": 5.199938003719778e-07, + "loss": 0.9081, + "step": 134200 + }, + { + "epoch": 10.400247975512418, + "grad_norm": 1.4923104473075413, + "learning_rate": 5.200325480471173e-07, + "loss": 0.9141, + "step": 134210 + }, + { + "epoch": 10.401022898988725, + "grad_norm": 1.5131177009406838, + "learning_rate": 5.200712957222567e-07, + "loss": 0.8975, + "step": 134220 + }, + { + "epoch": 10.401797822465031, + "grad_norm": 1.3862856397513745, + "learning_rate": 5.201100433973962e-07, + "loss": 0.8803, + "step": 134230 + }, + { + "epoch": 10.402572745941338, + "grad_norm": 1.4277685539440192, + "learning_rate": 5.201487910725357e-07, + "loss": 0.8892, + "step": 134240 + }, + { + "epoch": 10.403347669417645, + "grad_norm": 1.4527445552352682, + "learning_rate": 5.201875387476752e-07, + "loss": 0.8937, + "step": 134250 + }, + { + "epoch": 10.404122592893952, + "grad_norm": 1.486569674504426, + "learning_rate": 5.202262864228147e-07, + "loss": 0.8901, + "step": 134260 + }, + { + "epoch": 10.404897516370259, + "grad_norm": 1.5006089674936114, + "learning_rate": 5.202650340979542e-07, + "loss": 0.8995, + "step": 134270 + }, + { + "epoch": 10.405672439846565, + "grad_norm": 1.4270505389993902, + "learning_rate": 5.203037817730937e-07, + "loss": 0.899, + "step": 134280 + }, + { + "epoch": 10.406447363322872, + "grad_norm": 1.484960509610365, + "learning_rate": 5.203425294482331e-07, + "loss": 0.8962, + "step": 134290 + }, + { + "epoch": 10.407222286799179, + "grad_norm": 1.454664616929039, + "learning_rate": 5.203812771233727e-07, + "loss": 0.8903, + "step": 134300 + }, + { + "epoch": 10.407997210275486, + "grad_norm": 1.4688888824022204, + "learning_rate": 5.204200247985122e-07, + "loss": 0.8826, + "step": 134310 + }, + { + "epoch": 10.408772133751793, + "grad_norm": 1.4958003930264963, + "learning_rate": 5.204587724736516e-07, + "loss": 0.9183, + "step": 134320 + }, + { + "epoch": 10.4095470572281, + "grad_norm": 1.4131729304040874, + "learning_rate": 5.204975201487911e-07, + "loss": 0.8793, + "step": 134330 + }, + { + "epoch": 10.410321980704406, + "grad_norm": 1.4069022667718603, + "learning_rate": 5.205362678239306e-07, + "loss": 0.8848, + "step": 134340 + }, + { + "epoch": 10.411096904180711, + "grad_norm": 1.3837325079574394, + "learning_rate": 5.205750154990702e-07, + "loss": 0.8919, + "step": 134350 + }, + { + "epoch": 10.411871827657018, + "grad_norm": 1.3691602810740398, + "learning_rate": 5.206137631742096e-07, + "loss": 0.8772, + "step": 134360 + }, + { + "epoch": 10.412646751133325, + "grad_norm": 1.5215061234363314, + "learning_rate": 5.206525108493491e-07, + "loss": 0.8965, + "step": 134370 + }, + { + "epoch": 10.413421674609632, + "grad_norm": 1.4066845339359733, + "learning_rate": 5.206912585244886e-07, + "loss": 0.9138, + "step": 134380 + }, + { + "epoch": 10.414196598085939, + "grad_norm": 1.4208900516385423, + "learning_rate": 5.20730006199628e-07, + "loss": 0.9021, + "step": 134390 + }, + { + "epoch": 10.414971521562245, + "grad_norm": 1.511099072251255, + "learning_rate": 5.207687538747676e-07, + "loss": 0.8919, + "step": 134400 + }, + { + "epoch": 10.415746445038552, + "grad_norm": 1.3733126053703886, + "learning_rate": 5.208075015499071e-07, + "loss": 0.8782, + "step": 134410 + }, + { + "epoch": 10.416521368514859, + "grad_norm": 1.4845927455677077, + "learning_rate": 5.208462492250466e-07, + "loss": 0.8859, + "step": 134420 + }, + { + "epoch": 10.417296291991166, + "grad_norm": 1.4171283543803788, + "learning_rate": 5.20884996900186e-07, + "loss": 0.8979, + "step": 134430 + }, + { + "epoch": 10.418071215467473, + "grad_norm": 1.599951094319378, + "learning_rate": 5.209237445753255e-07, + "loss": 0.8934, + "step": 134440 + }, + { + "epoch": 10.41884613894378, + "grad_norm": 1.46514074356886, + "learning_rate": 5.209624922504651e-07, + "loss": 0.8745, + "step": 134450 + }, + { + "epoch": 10.419621062420086, + "grad_norm": 1.4602586577064511, + "learning_rate": 5.210012399256045e-07, + "loss": 0.8807, + "step": 134460 + }, + { + "epoch": 10.420395985896393, + "grad_norm": 1.4653168521732072, + "learning_rate": 5.21039987600744e-07, + "loss": 0.8917, + "step": 134470 + }, + { + "epoch": 10.4211709093727, + "grad_norm": 1.4389093899366194, + "learning_rate": 5.210787352758835e-07, + "loss": 0.8887, + "step": 134480 + }, + { + "epoch": 10.421945832849007, + "grad_norm": 1.5248029988668115, + "learning_rate": 5.21117482951023e-07, + "loss": 0.9013, + "step": 134490 + }, + { + "epoch": 10.422720756325313, + "grad_norm": 1.4260270656789726, + "learning_rate": 5.211562306261625e-07, + "loss": 0.9161, + "step": 134500 + }, + { + "epoch": 10.422720756325313, + "eval_loss": 0.9064379930496216, + "eval_runtime": 332.8796, + "eval_samples_per_second": 34.46, + "eval_steps_per_second": 8.616, + "step": 134500 + }, + { + "epoch": 10.42349567980162, + "grad_norm": 1.4195684255035972, + "learning_rate": 5.21194978301302e-07, + "loss": 0.8661, + "step": 134510 + }, + { + "epoch": 10.424270603277927, + "grad_norm": 1.4679306479186653, + "learning_rate": 5.212337259764415e-07, + "loss": 0.8857, + "step": 134520 + }, + { + "epoch": 10.425045526754232, + "grad_norm": 1.381710752778209, + "learning_rate": 5.212724736515809e-07, + "loss": 0.8873, + "step": 134530 + }, + { + "epoch": 10.425820450230539, + "grad_norm": 1.456400370631152, + "learning_rate": 5.213112213267204e-07, + "loss": 0.8858, + "step": 134540 + }, + { + "epoch": 10.426595373706846, + "grad_norm": 1.4343211593015897, + "learning_rate": 5.2134996900186e-07, + "loss": 0.8927, + "step": 134550 + }, + { + "epoch": 10.427370297183153, + "grad_norm": 1.4658945375648376, + "learning_rate": 5.213887166769995e-07, + "loss": 0.8958, + "step": 134560 + }, + { + "epoch": 10.42814522065946, + "grad_norm": 1.4322194973048579, + "learning_rate": 5.214274643521389e-07, + "loss": 0.9071, + "step": 134570 + }, + { + "epoch": 10.428920144135766, + "grad_norm": 1.3847795524286675, + "learning_rate": 5.214662120272784e-07, + "loss": 0.8814, + "step": 134580 + }, + { + "epoch": 10.429695067612073, + "grad_norm": 1.4117913968697775, + "learning_rate": 5.21504959702418e-07, + "loss": 0.9068, + "step": 134590 + }, + { + "epoch": 10.43046999108838, + "grad_norm": 1.4681500668214245, + "learning_rate": 5.215437073775574e-07, + "loss": 0.8861, + "step": 134600 + }, + { + "epoch": 10.431244914564687, + "grad_norm": 1.4736388413048052, + "learning_rate": 5.215824550526969e-07, + "loss": 0.9305, + "step": 134610 + }, + { + "epoch": 10.432019838040993, + "grad_norm": 1.4163982797001344, + "learning_rate": 5.216212027278364e-07, + "loss": 0.8833, + "step": 134620 + }, + { + "epoch": 10.4327947615173, + "grad_norm": 1.6276396406375997, + "learning_rate": 5.216599504029759e-07, + "loss": 0.9064, + "step": 134630 + }, + { + "epoch": 10.433569684993607, + "grad_norm": 1.4005981337889806, + "learning_rate": 5.216986980781153e-07, + "loss": 0.8887, + "step": 134640 + }, + { + "epoch": 10.434344608469914, + "grad_norm": 1.5827630266821624, + "learning_rate": 5.217374457532549e-07, + "loss": 0.8838, + "step": 134650 + }, + { + "epoch": 10.43511953194622, + "grad_norm": 1.4647905409863784, + "learning_rate": 5.217761934283944e-07, + "loss": 0.8863, + "step": 134660 + }, + { + "epoch": 10.435894455422527, + "grad_norm": 1.4706440166290387, + "learning_rate": 5.218149411035338e-07, + "loss": 0.8758, + "step": 134670 + }, + { + "epoch": 10.436669378898834, + "grad_norm": 1.4247127103518071, + "learning_rate": 5.218536887786733e-07, + "loss": 0.891, + "step": 134680 + }, + { + "epoch": 10.437444302375141, + "grad_norm": 1.4010145809433354, + "learning_rate": 5.218924364538128e-07, + "loss": 0.9001, + "step": 134690 + }, + { + "epoch": 10.438219225851448, + "grad_norm": 1.3791392259879016, + "learning_rate": 5.219311841289524e-07, + "loss": 0.8958, + "step": 134700 + }, + { + "epoch": 10.438994149327755, + "grad_norm": 1.4799927349296405, + "learning_rate": 5.219699318040918e-07, + "loss": 0.8663, + "step": 134710 + }, + { + "epoch": 10.43976907280406, + "grad_norm": 1.4320631165822473, + "learning_rate": 5.220086794792313e-07, + "loss": 0.8994, + "step": 134720 + }, + { + "epoch": 10.440543996280367, + "grad_norm": 1.4685835948783992, + "learning_rate": 5.220474271543708e-07, + "loss": 0.9094, + "step": 134730 + }, + { + "epoch": 10.441318919756673, + "grad_norm": 1.4717640801203675, + "learning_rate": 5.220861748295102e-07, + "loss": 0.8994, + "step": 134740 + }, + { + "epoch": 10.44209384323298, + "grad_norm": 1.4507041695523635, + "learning_rate": 5.221249225046498e-07, + "loss": 0.906, + "step": 134750 + }, + { + "epoch": 10.442868766709287, + "grad_norm": 1.4083145548838691, + "learning_rate": 5.221636701797893e-07, + "loss": 0.9154, + "step": 134760 + }, + { + "epoch": 10.443643690185594, + "grad_norm": 1.4948587907853363, + "learning_rate": 5.222024178549288e-07, + "loss": 0.8941, + "step": 134770 + }, + { + "epoch": 10.4444186136619, + "grad_norm": 1.504497082996383, + "learning_rate": 5.222411655300682e-07, + "loss": 0.912, + "step": 134780 + }, + { + "epoch": 10.445193537138207, + "grad_norm": 1.4983215585247716, + "learning_rate": 5.222799132052078e-07, + "loss": 0.9014, + "step": 134790 + }, + { + "epoch": 10.445968460614514, + "grad_norm": 1.4979620506899827, + "learning_rate": 5.223186608803473e-07, + "loss": 0.9009, + "step": 134800 + }, + { + "epoch": 10.446743384090821, + "grad_norm": 1.4664595676573036, + "learning_rate": 5.223574085554867e-07, + "loss": 0.8939, + "step": 134810 + }, + { + "epoch": 10.447518307567128, + "grad_norm": 1.442874495679953, + "learning_rate": 5.223961562306262e-07, + "loss": 0.8779, + "step": 134820 + }, + { + "epoch": 10.448293231043435, + "grad_norm": 1.4916130164891488, + "learning_rate": 5.224349039057657e-07, + "loss": 0.8821, + "step": 134830 + }, + { + "epoch": 10.449068154519741, + "grad_norm": 1.4658492689454812, + "learning_rate": 5.224736515809053e-07, + "loss": 0.9126, + "step": 134840 + }, + { + "epoch": 10.449843077996048, + "grad_norm": 1.4656955028779504, + "learning_rate": 5.225123992560447e-07, + "loss": 0.8842, + "step": 134850 + }, + { + "epoch": 10.450618001472355, + "grad_norm": 1.5957697550337646, + "learning_rate": 5.225511469311842e-07, + "loss": 0.8907, + "step": 134860 + }, + { + "epoch": 10.451392924948662, + "grad_norm": 1.391053378267637, + "learning_rate": 5.225898946063237e-07, + "loss": 0.8819, + "step": 134870 + }, + { + "epoch": 10.452167848424969, + "grad_norm": 1.5515208211987592, + "learning_rate": 5.226286422814631e-07, + "loss": 0.9091, + "step": 134880 + }, + { + "epoch": 10.452942771901276, + "grad_norm": 1.6147077079656977, + "learning_rate": 5.226673899566027e-07, + "loss": 0.9155, + "step": 134890 + }, + { + "epoch": 10.45371769537758, + "grad_norm": 1.4648975493830867, + "learning_rate": 5.227061376317422e-07, + "loss": 0.8956, + "step": 134900 + }, + { + "epoch": 10.454492618853887, + "grad_norm": 1.409535788988286, + "learning_rate": 5.227448853068817e-07, + "loss": 0.8832, + "step": 134910 + }, + { + "epoch": 10.455267542330194, + "grad_norm": 1.4935474648834413, + "learning_rate": 5.227836329820211e-07, + "loss": 0.9025, + "step": 134920 + }, + { + "epoch": 10.456042465806501, + "grad_norm": 1.5795926707110197, + "learning_rate": 5.228223806571606e-07, + "loss": 0.8821, + "step": 134930 + }, + { + "epoch": 10.456817389282808, + "grad_norm": 1.5785808894345883, + "learning_rate": 5.228611283323002e-07, + "loss": 0.8957, + "step": 134940 + }, + { + "epoch": 10.457592312759115, + "grad_norm": 1.4055201101048438, + "learning_rate": 5.228998760074396e-07, + "loss": 0.886, + "step": 134950 + }, + { + "epoch": 10.458367236235421, + "grad_norm": 1.497148234171783, + "learning_rate": 5.229386236825791e-07, + "loss": 0.9034, + "step": 134960 + }, + { + "epoch": 10.459142159711728, + "grad_norm": 1.5843612624898582, + "learning_rate": 5.229773713577186e-07, + "loss": 0.9111, + "step": 134970 + }, + { + "epoch": 10.459917083188035, + "grad_norm": 1.3888600649634455, + "learning_rate": 5.230161190328581e-07, + "loss": 0.9172, + "step": 134980 + }, + { + "epoch": 10.460692006664342, + "grad_norm": 1.4827823304201615, + "learning_rate": 5.230548667079976e-07, + "loss": 0.8873, + "step": 134990 + }, + { + "epoch": 10.461466930140649, + "grad_norm": 1.5204569178494007, + "learning_rate": 5.230936143831371e-07, + "loss": 0.9119, + "step": 135000 + }, + { + "epoch": 10.461466930140649, + "eval_loss": 0.9061199426651001, + "eval_runtime": 331.982, + "eval_samples_per_second": 34.553, + "eval_steps_per_second": 8.639, + "step": 135000 + }, + { + "epoch": 10.462241853616955, + "grad_norm": 1.5095649688318988, + "learning_rate": 5.231323620582766e-07, + "loss": 0.8973, + "step": 135010 + }, + { + "epoch": 10.463016777093262, + "grad_norm": 1.3941246249375898, + "learning_rate": 5.23171109733416e-07, + "loss": 0.9174, + "step": 135020 + }, + { + "epoch": 10.463791700569569, + "grad_norm": 1.462472679908022, + "learning_rate": 5.232098574085555e-07, + "loss": 0.8912, + "step": 135030 + }, + { + "epoch": 10.464566624045876, + "grad_norm": 1.433704590357491, + "learning_rate": 5.232486050836951e-07, + "loss": 0.8945, + "step": 135040 + }, + { + "epoch": 10.465341547522183, + "grad_norm": 1.4094655696963754, + "learning_rate": 5.232873527588346e-07, + "loss": 0.8969, + "step": 135050 + }, + { + "epoch": 10.46611647099849, + "grad_norm": 1.4384155480037364, + "learning_rate": 5.23326100433974e-07, + "loss": 0.8998, + "step": 135060 + }, + { + "epoch": 10.466891394474796, + "grad_norm": 1.5198391761247039, + "learning_rate": 5.233648481091135e-07, + "loss": 0.8797, + "step": 135070 + }, + { + "epoch": 10.467666317951103, + "grad_norm": 1.4490393156606305, + "learning_rate": 5.23403595784253e-07, + "loss": 0.8823, + "step": 135080 + }, + { + "epoch": 10.46844124142741, + "grad_norm": 1.4476770196029936, + "learning_rate": 5.234423434593925e-07, + "loss": 0.9105, + "step": 135090 + }, + { + "epoch": 10.469216164903715, + "grad_norm": 1.439302662452757, + "learning_rate": 5.23481091134532e-07, + "loss": 0.8884, + "step": 135100 + }, + { + "epoch": 10.469991088380022, + "grad_norm": 1.43745675996848, + "learning_rate": 5.235198388096715e-07, + "loss": 0.8706, + "step": 135110 + }, + { + "epoch": 10.470766011856329, + "grad_norm": 1.49080669442824, + "learning_rate": 5.23558586484811e-07, + "loss": 0.904, + "step": 135120 + }, + { + "epoch": 10.471540935332635, + "grad_norm": 1.5044334913543684, + "learning_rate": 5.235973341599504e-07, + "loss": 0.8831, + "step": 135130 + }, + { + "epoch": 10.472315858808942, + "grad_norm": 1.5178895374215433, + "learning_rate": 5.2363608183509e-07, + "loss": 0.918, + "step": 135140 + }, + { + "epoch": 10.473090782285249, + "grad_norm": 1.5048721429148877, + "learning_rate": 5.236748295102295e-07, + "loss": 0.902, + "step": 135150 + }, + { + "epoch": 10.473865705761556, + "grad_norm": 1.5631529913078885, + "learning_rate": 5.237135771853689e-07, + "loss": 0.8854, + "step": 135160 + }, + { + "epoch": 10.474640629237863, + "grad_norm": 1.4807755044099284, + "learning_rate": 5.237523248605084e-07, + "loss": 0.8981, + "step": 135170 + }, + { + "epoch": 10.47541555271417, + "grad_norm": 1.5123626500793295, + "learning_rate": 5.237910725356479e-07, + "loss": 0.9058, + "step": 135180 + }, + { + "epoch": 10.476190476190476, + "grad_norm": 1.5233793748690865, + "learning_rate": 5.238298202107875e-07, + "loss": 0.8849, + "step": 135190 + }, + { + "epoch": 10.476965399666783, + "grad_norm": 1.4297499434971506, + "learning_rate": 5.238685678859269e-07, + "loss": 0.9123, + "step": 135200 + }, + { + "epoch": 10.47774032314309, + "grad_norm": 1.6426571737603568, + "learning_rate": 5.239073155610664e-07, + "loss": 0.9153, + "step": 135210 + }, + { + "epoch": 10.478515246619397, + "grad_norm": 1.4723467962923265, + "learning_rate": 5.239460632362059e-07, + "loss": 0.8851, + "step": 135220 + }, + { + "epoch": 10.479290170095704, + "grad_norm": 1.4749352216977714, + "learning_rate": 5.239848109113453e-07, + "loss": 0.8751, + "step": 135230 + }, + { + "epoch": 10.48006509357201, + "grad_norm": 1.6163461427944028, + "learning_rate": 5.240235585864849e-07, + "loss": 0.8843, + "step": 135240 + }, + { + "epoch": 10.480840017048317, + "grad_norm": 1.5961817504213855, + "learning_rate": 5.240623062616244e-07, + "loss": 0.9059, + "step": 135250 + }, + { + "epoch": 10.481614940524624, + "grad_norm": 1.49080538436192, + "learning_rate": 5.241010539367639e-07, + "loss": 0.8966, + "step": 135260 + }, + { + "epoch": 10.48238986400093, + "grad_norm": 1.568065082926604, + "learning_rate": 5.241398016119033e-07, + "loss": 0.9055, + "step": 135270 + }, + { + "epoch": 10.483164787477236, + "grad_norm": 1.5048806115241875, + "learning_rate": 5.241785492870428e-07, + "loss": 0.9044, + "step": 135280 + }, + { + "epoch": 10.483939710953543, + "grad_norm": 1.4333668720716253, + "learning_rate": 5.242172969621824e-07, + "loss": 0.8984, + "step": 135290 + }, + { + "epoch": 10.48471463442985, + "grad_norm": 1.5354415210571402, + "learning_rate": 5.242560446373218e-07, + "loss": 0.8846, + "step": 135300 + }, + { + "epoch": 10.485489557906156, + "grad_norm": 1.4616795740143127, + "learning_rate": 5.242947923124613e-07, + "loss": 0.9024, + "step": 135310 + }, + { + "epoch": 10.486264481382463, + "grad_norm": 1.4049105833695112, + "learning_rate": 5.243335399876008e-07, + "loss": 0.8975, + "step": 135320 + }, + { + "epoch": 10.48703940485877, + "grad_norm": 1.5349207381460948, + "learning_rate": 5.243722876627404e-07, + "loss": 0.9024, + "step": 135330 + }, + { + "epoch": 10.487814328335077, + "grad_norm": 1.4956531911386042, + "learning_rate": 5.244110353378798e-07, + "loss": 0.8983, + "step": 135340 + }, + { + "epoch": 10.488589251811383, + "grad_norm": 1.4751419947929616, + "learning_rate": 5.244497830130193e-07, + "loss": 0.8983, + "step": 135350 + }, + { + "epoch": 10.48936417528769, + "grad_norm": 1.4810176344873982, + "learning_rate": 5.244885306881588e-07, + "loss": 0.8852, + "step": 135360 + }, + { + "epoch": 10.490139098763997, + "grad_norm": 1.5308145009115783, + "learning_rate": 5.245272783632982e-07, + "loss": 0.9001, + "step": 135370 + }, + { + "epoch": 10.490914022240304, + "grad_norm": 1.5004652818735935, + "learning_rate": 5.245660260384377e-07, + "loss": 0.9044, + "step": 135380 + }, + { + "epoch": 10.49168894571661, + "grad_norm": 1.4938753857193516, + "learning_rate": 5.246047737135773e-07, + "loss": 0.8992, + "step": 135390 + }, + { + "epoch": 10.492463869192918, + "grad_norm": 1.4840615451882577, + "learning_rate": 5.246435213887168e-07, + "loss": 0.8863, + "step": 135400 + }, + { + "epoch": 10.493238792669224, + "grad_norm": 1.3994083496851295, + "learning_rate": 5.246822690638562e-07, + "loss": 0.8734, + "step": 135410 + }, + { + "epoch": 10.494013716145531, + "grad_norm": 1.5540996412944539, + "learning_rate": 5.247210167389957e-07, + "loss": 0.8826, + "step": 135420 + }, + { + "epoch": 10.494788639621838, + "grad_norm": 1.5325892494944595, + "learning_rate": 5.247597644141353e-07, + "loss": 0.8682, + "step": 135430 + }, + { + "epoch": 10.495563563098145, + "grad_norm": 1.4232628212716727, + "learning_rate": 5.247985120892747e-07, + "loss": 0.8836, + "step": 135440 + }, + { + "epoch": 10.496338486574452, + "grad_norm": 1.4513190574391746, + "learning_rate": 5.248372597644142e-07, + "loss": 0.9062, + "step": 135450 + }, + { + "epoch": 10.497113410050758, + "grad_norm": 1.5354262115982147, + "learning_rate": 5.248760074395537e-07, + "loss": 0.8961, + "step": 135460 + }, + { + "epoch": 10.497888333527063, + "grad_norm": 1.3422550245089582, + "learning_rate": 5.249147551146932e-07, + "loss": 0.8993, + "step": 135470 + }, + { + "epoch": 10.49866325700337, + "grad_norm": 1.4905438386957621, + "learning_rate": 5.249535027898327e-07, + "loss": 0.9062, + "step": 135480 + }, + { + "epoch": 10.499438180479677, + "grad_norm": 1.4360111251773904, + "learning_rate": 5.249922504649722e-07, + "loss": 0.8997, + "step": 135490 + }, + { + "epoch": 10.500213103955984, + "grad_norm": 1.5105912959088232, + "learning_rate": 5.250309981401117e-07, + "loss": 0.8909, + "step": 135500 + }, + { + "epoch": 10.500213103955984, + "eval_loss": 0.9058817028999329, + "eval_runtime": 330.7014, + "eval_samples_per_second": 34.687, + "eval_steps_per_second": 8.672, + "step": 135500 + }, + { + "epoch": 10.50098802743229, + "grad_norm": 1.481927994206511, + "learning_rate": 5.250697458152511e-07, + "loss": 0.9212, + "step": 135510 + }, + { + "epoch": 10.501762950908597, + "grad_norm": 1.4714710917576506, + "learning_rate": 5.251084934903906e-07, + "loss": 0.8906, + "step": 135520 + }, + { + "epoch": 10.502537874384904, + "grad_norm": 1.5076493288981005, + "learning_rate": 5.251472411655302e-07, + "loss": 0.8888, + "step": 135530 + }, + { + "epoch": 10.503312797861211, + "grad_norm": 1.605539146369708, + "learning_rate": 5.251859888406697e-07, + "loss": 0.8922, + "step": 135540 + }, + { + "epoch": 10.504087721337518, + "grad_norm": 1.3840332854740092, + "learning_rate": 5.252247365158091e-07, + "loss": 0.8766, + "step": 135550 + }, + { + "epoch": 10.504862644813825, + "grad_norm": 1.460210609953367, + "learning_rate": 5.252634841909486e-07, + "loss": 0.8931, + "step": 135560 + }, + { + "epoch": 10.505637568290132, + "grad_norm": 1.4694398156500366, + "learning_rate": 5.253022318660881e-07, + "loss": 0.8859, + "step": 135570 + }, + { + "epoch": 10.506412491766438, + "grad_norm": 1.466505518578398, + "learning_rate": 5.253409795412276e-07, + "loss": 0.8877, + "step": 135580 + }, + { + "epoch": 10.507187415242745, + "grad_norm": 1.502536723305127, + "learning_rate": 5.253797272163671e-07, + "loss": 0.9112, + "step": 135590 + }, + { + "epoch": 10.507962338719052, + "grad_norm": 1.3768411530141949, + "learning_rate": 5.254184748915066e-07, + "loss": 0.8832, + "step": 135600 + }, + { + "epoch": 10.508737262195359, + "grad_norm": 1.5385603444431921, + "learning_rate": 5.254572225666461e-07, + "loss": 0.9055, + "step": 135610 + }, + { + "epoch": 10.509512185671666, + "grad_norm": 1.5780716976860327, + "learning_rate": 5.254959702417855e-07, + "loss": 0.8804, + "step": 135620 + }, + { + "epoch": 10.510287109147972, + "grad_norm": 1.5104681489247487, + "learning_rate": 5.255347179169251e-07, + "loss": 0.8988, + "step": 135630 + }, + { + "epoch": 10.51106203262428, + "grad_norm": 1.4591588849555797, + "learning_rate": 5.255734655920646e-07, + "loss": 0.8907, + "step": 135640 + }, + { + "epoch": 10.511836956100584, + "grad_norm": 1.594493877474068, + "learning_rate": 5.25612213267204e-07, + "loss": 0.8994, + "step": 135650 + }, + { + "epoch": 10.512611879576891, + "grad_norm": 1.423217717371596, + "learning_rate": 5.256509609423435e-07, + "loss": 0.9095, + "step": 135660 + }, + { + "epoch": 10.513386803053198, + "grad_norm": 1.5309137079576458, + "learning_rate": 5.25689708617483e-07, + "loss": 0.888, + "step": 135670 + }, + { + "epoch": 10.514161726529505, + "grad_norm": 1.5954920369515622, + "learning_rate": 5.257284562926225e-07, + "loss": 0.8875, + "step": 135680 + }, + { + "epoch": 10.514936650005811, + "grad_norm": 1.5323196708375906, + "learning_rate": 5.25767203967762e-07, + "loss": 0.9013, + "step": 135690 + }, + { + "epoch": 10.515711573482118, + "grad_norm": 1.4584587962427285, + "learning_rate": 5.258059516429015e-07, + "loss": 0.9109, + "step": 135700 + }, + { + "epoch": 10.516486496958425, + "grad_norm": 1.5152302707986316, + "learning_rate": 5.25844699318041e-07, + "loss": 0.8742, + "step": 135710 + }, + { + "epoch": 10.517261420434732, + "grad_norm": 1.4916495945338104, + "learning_rate": 5.258834469931804e-07, + "loss": 0.9034, + "step": 135720 + }, + { + "epoch": 10.518036343911039, + "grad_norm": 1.5157746449778549, + "learning_rate": 5.2592219466832e-07, + "loss": 0.8976, + "step": 135730 + }, + { + "epoch": 10.518811267387346, + "grad_norm": 1.479159796575727, + "learning_rate": 5.259609423434595e-07, + "loss": 0.8995, + "step": 135740 + }, + { + "epoch": 10.519586190863652, + "grad_norm": 1.4730078158079432, + "learning_rate": 5.259996900185989e-07, + "loss": 0.8842, + "step": 135750 + }, + { + "epoch": 10.52036111433996, + "grad_norm": 1.4968898866234492, + "learning_rate": 5.260384376937384e-07, + "loss": 0.9153, + "step": 135760 + }, + { + "epoch": 10.521136037816266, + "grad_norm": 1.449488065752946, + "learning_rate": 5.260771853688779e-07, + "loss": 0.9326, + "step": 135770 + }, + { + "epoch": 10.521910961292573, + "grad_norm": 1.5318956441200693, + "learning_rate": 5.261159330440175e-07, + "loss": 0.884, + "step": 135780 + }, + { + "epoch": 10.52268588476888, + "grad_norm": 1.4341591014837114, + "learning_rate": 5.261546807191569e-07, + "loss": 0.8978, + "step": 135790 + }, + { + "epoch": 10.523460808245186, + "grad_norm": 1.4678896496694527, + "learning_rate": 5.261934283942964e-07, + "loss": 0.8772, + "step": 135800 + }, + { + "epoch": 10.524235731721493, + "grad_norm": 1.478316453025314, + "learning_rate": 5.262321760694359e-07, + "loss": 0.8944, + "step": 135810 + }, + { + "epoch": 10.5250106551978, + "grad_norm": 1.466913216396516, + "learning_rate": 5.262709237445753e-07, + "loss": 0.8816, + "step": 135820 + }, + { + "epoch": 10.525785578674107, + "grad_norm": 1.5199814925762871, + "learning_rate": 5.263096714197149e-07, + "loss": 0.8819, + "step": 135830 + }, + { + "epoch": 10.526560502150412, + "grad_norm": 1.3665400644540027, + "learning_rate": 5.263484190948544e-07, + "loss": 0.8721, + "step": 135840 + }, + { + "epoch": 10.527335425626719, + "grad_norm": 1.451170887362118, + "learning_rate": 5.263871667699939e-07, + "loss": 0.9021, + "step": 135850 + }, + { + "epoch": 10.528110349103025, + "grad_norm": 1.484002118544728, + "learning_rate": 5.264259144451333e-07, + "loss": 0.8823, + "step": 135860 + }, + { + "epoch": 10.528885272579332, + "grad_norm": 1.4469752533641278, + "learning_rate": 5.264646621202728e-07, + "loss": 0.9033, + "step": 135870 + }, + { + "epoch": 10.529660196055639, + "grad_norm": 1.5291062222329235, + "learning_rate": 5.265034097954124e-07, + "loss": 0.8902, + "step": 135880 + }, + { + "epoch": 10.530435119531946, + "grad_norm": 1.544361960786643, + "learning_rate": 5.265421574705518e-07, + "loss": 0.9013, + "step": 135890 + }, + { + "epoch": 10.531210043008253, + "grad_norm": 1.5107446949229322, + "learning_rate": 5.265809051456913e-07, + "loss": 0.8869, + "step": 135900 + }, + { + "epoch": 10.53198496648456, + "grad_norm": 1.4687104724828843, + "learning_rate": 5.266196528208308e-07, + "loss": 0.8845, + "step": 135910 + }, + { + "epoch": 10.532759889960866, + "grad_norm": 1.5755549042354378, + "learning_rate": 5.266584004959703e-07, + "loss": 0.9004, + "step": 135920 + }, + { + "epoch": 10.533534813437173, + "grad_norm": 1.5169496515977423, + "learning_rate": 5.266971481711098e-07, + "loss": 0.9109, + "step": 135930 + }, + { + "epoch": 10.53430973691348, + "grad_norm": 1.4748229665854067, + "learning_rate": 5.267358958462493e-07, + "loss": 0.8986, + "step": 135940 + }, + { + "epoch": 10.535084660389787, + "grad_norm": 1.499895658926976, + "learning_rate": 5.267746435213888e-07, + "loss": 0.8991, + "step": 135950 + }, + { + "epoch": 10.535859583866094, + "grad_norm": 1.458408945795121, + "learning_rate": 5.268133911965282e-07, + "loss": 0.9038, + "step": 135960 + }, + { + "epoch": 10.5366345073424, + "grad_norm": 1.4619196414775315, + "learning_rate": 5.268521388716677e-07, + "loss": 0.8888, + "step": 135970 + }, + { + "epoch": 10.537409430818707, + "grad_norm": 1.4032907916333388, + "learning_rate": 5.268908865468073e-07, + "loss": 0.901, + "step": 135980 + }, + { + "epoch": 10.538184354295014, + "grad_norm": 1.4891458551331613, + "learning_rate": 5.269296342219468e-07, + "loss": 0.8913, + "step": 135990 + }, + { + "epoch": 10.53895927777132, + "grad_norm": 1.471876775701414, + "learning_rate": 5.269683818970862e-07, + "loss": 0.8964, + "step": 136000 + }, + { + "epoch": 10.53895927777132, + "eval_loss": 0.9057315587997437, + "eval_runtime": 328.0329, + "eval_samples_per_second": 34.969, + "eval_steps_per_second": 8.743, + "step": 136000 + }, + { + "epoch": 10.539734201247628, + "grad_norm": 1.4661587328446164, + "learning_rate": 5.270071295722257e-07, + "loss": 0.9092, + "step": 136010 + }, + { + "epoch": 10.540509124723933, + "grad_norm": 1.41789885139148, + "learning_rate": 5.270458772473652e-07, + "loss": 0.8881, + "step": 136020 + }, + { + "epoch": 10.54128404820024, + "grad_norm": 1.4896662208712754, + "learning_rate": 5.270846249225047e-07, + "loss": 0.9054, + "step": 136030 + }, + { + "epoch": 10.542058971676546, + "grad_norm": 1.4489712897126854, + "learning_rate": 5.271233725976442e-07, + "loss": 0.9049, + "step": 136040 + }, + { + "epoch": 10.542833895152853, + "grad_norm": 1.4457370665484612, + "learning_rate": 5.271621202727837e-07, + "loss": 0.8944, + "step": 136050 + }, + { + "epoch": 10.54360881862916, + "grad_norm": 1.4228340361405214, + "learning_rate": 5.272008679479232e-07, + "loss": 0.9041, + "step": 136060 + }, + { + "epoch": 10.544383742105467, + "grad_norm": 1.4756764217949514, + "learning_rate": 5.272396156230626e-07, + "loss": 0.8919, + "step": 136070 + }, + { + "epoch": 10.545158665581774, + "grad_norm": 1.4158546736219855, + "learning_rate": 5.272783632982022e-07, + "loss": 0.8958, + "step": 136080 + }, + { + "epoch": 10.54593358905808, + "grad_norm": 1.4868220375606536, + "learning_rate": 5.273171109733417e-07, + "loss": 0.9187, + "step": 136090 + }, + { + "epoch": 10.546708512534387, + "grad_norm": 1.5924905754339789, + "learning_rate": 5.273558586484811e-07, + "loss": 0.8874, + "step": 136100 + }, + { + "epoch": 10.547483436010694, + "grad_norm": 1.4130915215889364, + "learning_rate": 5.273946063236206e-07, + "loss": 0.8861, + "step": 136110 + }, + { + "epoch": 10.548258359487, + "grad_norm": 1.5073173318229451, + "learning_rate": 5.274333539987602e-07, + "loss": 0.8891, + "step": 136120 + }, + { + "epoch": 10.549033282963308, + "grad_norm": 1.4597958903762336, + "learning_rate": 5.274721016738997e-07, + "loss": 0.8794, + "step": 136130 + }, + { + "epoch": 10.549808206439614, + "grad_norm": 1.495672283369898, + "learning_rate": 5.275108493490391e-07, + "loss": 0.9044, + "step": 136140 + }, + { + "epoch": 10.550583129915921, + "grad_norm": 1.529842708069103, + "learning_rate": 5.275495970241786e-07, + "loss": 0.8996, + "step": 136150 + }, + { + "epoch": 10.551358053392228, + "grad_norm": 1.45059480616764, + "learning_rate": 5.275883446993181e-07, + "loss": 0.8853, + "step": 136160 + }, + { + "epoch": 10.552132976868535, + "grad_norm": 1.4933678393089485, + "learning_rate": 5.276270923744575e-07, + "loss": 0.8793, + "step": 136170 + }, + { + "epoch": 10.552907900344842, + "grad_norm": 1.523964375163948, + "learning_rate": 5.276658400495971e-07, + "loss": 0.8872, + "step": 136180 + }, + { + "epoch": 10.553682823821148, + "grad_norm": 1.488108010733818, + "learning_rate": 5.277045877247366e-07, + "loss": 0.8945, + "step": 136190 + }, + { + "epoch": 10.554457747297455, + "grad_norm": 1.386895198974988, + "learning_rate": 5.277433353998761e-07, + "loss": 0.873, + "step": 136200 + }, + { + "epoch": 10.55523267077376, + "grad_norm": 1.457397151998978, + "learning_rate": 5.277820830750155e-07, + "loss": 0.8863, + "step": 136210 + }, + { + "epoch": 10.556007594250067, + "grad_norm": 1.4299971467804824, + "learning_rate": 5.278208307501551e-07, + "loss": 0.8804, + "step": 136220 + }, + { + "epoch": 10.556782517726374, + "grad_norm": 1.454998453941114, + "learning_rate": 5.278595784252946e-07, + "loss": 0.9119, + "step": 136230 + }, + { + "epoch": 10.55755744120268, + "grad_norm": 1.430076619161379, + "learning_rate": 5.27898326100434e-07, + "loss": 0.8962, + "step": 136240 + }, + { + "epoch": 10.558332364678988, + "grad_norm": 1.4465978957936094, + "learning_rate": 5.279370737755735e-07, + "loss": 0.8908, + "step": 136250 + }, + { + "epoch": 10.559107288155294, + "grad_norm": 1.4588905425312584, + "learning_rate": 5.27975821450713e-07, + "loss": 0.8957, + "step": 136260 + }, + { + "epoch": 10.559882211631601, + "grad_norm": 1.3942727939851747, + "learning_rate": 5.280145691258526e-07, + "loss": 0.9078, + "step": 136270 + }, + { + "epoch": 10.560657135107908, + "grad_norm": 1.5913538567559697, + "learning_rate": 5.28053316800992e-07, + "loss": 0.8995, + "step": 136280 + }, + { + "epoch": 10.561432058584215, + "grad_norm": 1.445751676113096, + "learning_rate": 5.280920644761315e-07, + "loss": 0.9001, + "step": 136290 + }, + { + "epoch": 10.562206982060522, + "grad_norm": 1.451973555898604, + "learning_rate": 5.28130812151271e-07, + "loss": 0.8832, + "step": 136300 + }, + { + "epoch": 10.562981905536828, + "grad_norm": 1.4184941561761215, + "learning_rate": 5.281695598264104e-07, + "loss": 0.8885, + "step": 136310 + }, + { + "epoch": 10.563756829013135, + "grad_norm": 1.495035990472225, + "learning_rate": 5.2820830750155e-07, + "loss": 0.8953, + "step": 136320 + }, + { + "epoch": 10.564531752489442, + "grad_norm": 1.3523097599405967, + "learning_rate": 5.282470551766895e-07, + "loss": 0.9061, + "step": 136330 + }, + { + "epoch": 10.565306675965749, + "grad_norm": 1.4611107672258206, + "learning_rate": 5.28285802851829e-07, + "loss": 0.8713, + "step": 136340 + }, + { + "epoch": 10.566081599442056, + "grad_norm": 1.3908618635788688, + "learning_rate": 5.283245505269684e-07, + "loss": 0.8887, + "step": 136350 + }, + { + "epoch": 10.566856522918362, + "grad_norm": 1.3700755403221945, + "learning_rate": 5.283632982021079e-07, + "loss": 0.8818, + "step": 136360 + }, + { + "epoch": 10.56763144639467, + "grad_norm": 1.4892388038073587, + "learning_rate": 5.284020458772475e-07, + "loss": 0.9009, + "step": 136370 + }, + { + "epoch": 10.568406369870976, + "grad_norm": 1.472115063391147, + "learning_rate": 5.284407935523869e-07, + "loss": 0.8921, + "step": 136380 + }, + { + "epoch": 10.569181293347281, + "grad_norm": 1.429801960283118, + "learning_rate": 5.284795412275264e-07, + "loss": 0.9152, + "step": 136390 + }, + { + "epoch": 10.569956216823588, + "grad_norm": 1.5050051811422058, + "learning_rate": 5.285182889026659e-07, + "loss": 0.8775, + "step": 136400 + }, + { + "epoch": 10.570731140299895, + "grad_norm": 1.5048445216473563, + "learning_rate": 5.285570365778054e-07, + "loss": 0.8891, + "step": 136410 + }, + { + "epoch": 10.571506063776202, + "grad_norm": 1.4176181015485179, + "learning_rate": 5.285957842529449e-07, + "loss": 0.8922, + "step": 136420 + }, + { + "epoch": 10.572280987252508, + "grad_norm": 1.4398984021475099, + "learning_rate": 5.286345319280844e-07, + "loss": 0.8956, + "step": 136430 + }, + { + "epoch": 10.573055910728815, + "grad_norm": 1.3974557789840938, + "learning_rate": 5.286732796032239e-07, + "loss": 0.87, + "step": 136440 + }, + { + "epoch": 10.573830834205122, + "grad_norm": 1.5085534962658897, + "learning_rate": 5.287120272783633e-07, + "loss": 0.8975, + "step": 136450 + }, + { + "epoch": 10.574605757681429, + "grad_norm": 1.4962026469829273, + "learning_rate": 5.287507749535028e-07, + "loss": 0.9058, + "step": 136460 + }, + { + "epoch": 10.575380681157736, + "grad_norm": 1.6850997941406607, + "learning_rate": 5.287895226286424e-07, + "loss": 0.9153, + "step": 136470 + }, + { + "epoch": 10.576155604634042, + "grad_norm": 1.5079332573472528, + "learning_rate": 5.288282703037819e-07, + "loss": 0.9052, + "step": 136480 + }, + { + "epoch": 10.57693052811035, + "grad_norm": 1.5136810743578715, + "learning_rate": 5.288670179789213e-07, + "loss": 0.8937, + "step": 136490 + }, + { + "epoch": 10.577705451586656, + "grad_norm": 1.488940512882765, + "learning_rate": 5.289057656540608e-07, + "loss": 0.8745, + "step": 136500 + }, + { + "epoch": 10.577705451586656, + "eval_loss": 0.905411422252655, + "eval_runtime": 329.578, + "eval_samples_per_second": 34.805, + "eval_steps_per_second": 8.702, + "step": 136500 + }, + { + "epoch": 10.578480375062963, + "grad_norm": 1.4783367129592728, + "learning_rate": 5.289445133292003e-07, + "loss": 0.89, + "step": 136510 + }, + { + "epoch": 10.57925529853927, + "grad_norm": 1.4747653893441597, + "learning_rate": 5.289832610043398e-07, + "loss": 0.9192, + "step": 136520 + }, + { + "epoch": 10.580030222015576, + "grad_norm": 1.5313991741435127, + "learning_rate": 5.290220086794793e-07, + "loss": 0.8963, + "step": 136530 + }, + { + "epoch": 10.580805145491883, + "grad_norm": 1.5992968849744913, + "learning_rate": 5.290607563546188e-07, + "loss": 0.8918, + "step": 136540 + }, + { + "epoch": 10.58158006896819, + "grad_norm": 1.4101987629619033, + "learning_rate": 5.290995040297583e-07, + "loss": 0.9157, + "step": 136550 + }, + { + "epoch": 10.582354992444497, + "grad_norm": 1.5579567562712113, + "learning_rate": 5.291382517048977e-07, + "loss": 0.9079, + "step": 136560 + }, + { + "epoch": 10.583129915920804, + "grad_norm": 1.4319168477004671, + "learning_rate": 5.291769993800373e-07, + "loss": 0.8763, + "step": 136570 + }, + { + "epoch": 10.58390483939711, + "grad_norm": 1.5041812558955916, + "learning_rate": 5.292157470551768e-07, + "loss": 0.867, + "step": 136580 + }, + { + "epoch": 10.584679762873415, + "grad_norm": 1.445270429892182, + "learning_rate": 5.292544947303162e-07, + "loss": 0.9098, + "step": 136590 + }, + { + "epoch": 10.585454686349722, + "grad_norm": 1.4005551306241182, + "learning_rate": 5.292932424054557e-07, + "loss": 0.877, + "step": 136600 + }, + { + "epoch": 10.58622960982603, + "grad_norm": 1.5319850249420899, + "learning_rate": 5.293319900805952e-07, + "loss": 0.8993, + "step": 136610 + }, + { + "epoch": 10.587004533302336, + "grad_norm": 1.5285939488649276, + "learning_rate": 5.293707377557348e-07, + "loss": 0.9031, + "step": 136620 + }, + { + "epoch": 10.587779456778643, + "grad_norm": 1.4924168516392242, + "learning_rate": 5.294094854308742e-07, + "loss": 0.8949, + "step": 136630 + }, + { + "epoch": 10.58855438025495, + "grad_norm": 1.3851031612459097, + "learning_rate": 5.294482331060137e-07, + "loss": 0.8819, + "step": 136640 + }, + { + "epoch": 10.589329303731256, + "grad_norm": 1.5089894471270278, + "learning_rate": 5.294869807811532e-07, + "loss": 0.8762, + "step": 136650 + }, + { + "epoch": 10.590104227207563, + "grad_norm": 1.5709546991405055, + "learning_rate": 5.295257284562926e-07, + "loss": 0.892, + "step": 136660 + }, + { + "epoch": 10.59087915068387, + "grad_norm": 1.480516133913263, + "learning_rate": 5.295644761314322e-07, + "loss": 0.8763, + "step": 136670 + }, + { + "epoch": 10.591654074160177, + "grad_norm": 1.5323045049037276, + "learning_rate": 5.296032238065717e-07, + "loss": 0.9014, + "step": 136680 + }, + { + "epoch": 10.592428997636484, + "grad_norm": 1.6032490854034285, + "learning_rate": 5.296419714817112e-07, + "loss": 0.8797, + "step": 136690 + }, + { + "epoch": 10.59320392111279, + "grad_norm": 1.4603429376150587, + "learning_rate": 5.296807191568506e-07, + "loss": 0.8995, + "step": 136700 + }, + { + "epoch": 10.593978844589097, + "grad_norm": 1.3725825887730425, + "learning_rate": 5.297194668319901e-07, + "loss": 0.8907, + "step": 136710 + }, + { + "epoch": 10.594753768065404, + "grad_norm": 1.4968789660199506, + "learning_rate": 5.297582145071297e-07, + "loss": 0.9103, + "step": 136720 + }, + { + "epoch": 10.59552869154171, + "grad_norm": 1.4985376325044504, + "learning_rate": 5.297969621822691e-07, + "loss": 0.9019, + "step": 136730 + }, + { + "epoch": 10.596303615018018, + "grad_norm": 1.441276500682025, + "learning_rate": 5.298357098574086e-07, + "loss": 0.8817, + "step": 136740 + }, + { + "epoch": 10.597078538494324, + "grad_norm": 1.4804291021315443, + "learning_rate": 5.298744575325481e-07, + "loss": 0.8744, + "step": 136750 + }, + { + "epoch": 10.59785346197063, + "grad_norm": 1.483082018273853, + "learning_rate": 5.299132052076877e-07, + "loss": 0.9091, + "step": 136760 + }, + { + "epoch": 10.598628385446936, + "grad_norm": 1.458916398914952, + "learning_rate": 5.299519528828271e-07, + "loss": 0.8981, + "step": 136770 + }, + { + "epoch": 10.599403308923243, + "grad_norm": 1.5420145636292588, + "learning_rate": 5.299907005579666e-07, + "loss": 0.9039, + "step": 136780 + }, + { + "epoch": 10.60017823239955, + "grad_norm": 1.4290746701399784, + "learning_rate": 5.300294482331061e-07, + "loss": 0.9003, + "step": 136790 + }, + { + "epoch": 10.600953155875857, + "grad_norm": 1.4619109247823083, + "learning_rate": 5.300681959082455e-07, + "loss": 0.8873, + "step": 136800 + }, + { + "epoch": 10.601728079352164, + "grad_norm": 1.3645689497791411, + "learning_rate": 5.30106943583385e-07, + "loss": 0.884, + "step": 136810 + }, + { + "epoch": 10.60250300282847, + "grad_norm": 1.5103857617711223, + "learning_rate": 5.301456912585246e-07, + "loss": 0.8845, + "step": 136820 + }, + { + "epoch": 10.603277926304777, + "grad_norm": 1.5412876913765832, + "learning_rate": 5.301844389336641e-07, + "loss": 0.8804, + "step": 136830 + }, + { + "epoch": 10.604052849781084, + "grad_norm": 1.4350016922244766, + "learning_rate": 5.302231866088035e-07, + "loss": 0.8996, + "step": 136840 + }, + { + "epoch": 10.60482777325739, + "grad_norm": 1.527474318721194, + "learning_rate": 5.30261934283943e-07, + "loss": 0.8972, + "step": 136850 + }, + { + "epoch": 10.605602696733698, + "grad_norm": 1.4116222764978736, + "learning_rate": 5.303006819590826e-07, + "loss": 0.8991, + "step": 136860 + }, + { + "epoch": 10.606377620210004, + "grad_norm": 1.4905226165133376, + "learning_rate": 5.30339429634222e-07, + "loss": 0.8973, + "step": 136870 + }, + { + "epoch": 10.607152543686311, + "grad_norm": 1.4448009730523539, + "learning_rate": 5.303781773093615e-07, + "loss": 0.8887, + "step": 136880 + }, + { + "epoch": 10.607927467162618, + "grad_norm": 1.431247468961554, + "learning_rate": 5.30416924984501e-07, + "loss": 0.8862, + "step": 136890 + }, + { + "epoch": 10.608702390638925, + "grad_norm": 1.4234804895527342, + "learning_rate": 5.304556726596405e-07, + "loss": 0.8743, + "step": 136900 + }, + { + "epoch": 10.609477314115232, + "grad_norm": 1.4908543649329447, + "learning_rate": 5.3049442033478e-07, + "loss": 0.8925, + "step": 136910 + }, + { + "epoch": 10.610252237591538, + "grad_norm": 1.526175776478935, + "learning_rate": 5.305331680099195e-07, + "loss": 0.8903, + "step": 136920 + }, + { + "epoch": 10.611027161067845, + "grad_norm": 1.5261309072941807, + "learning_rate": 5.30571915685059e-07, + "loss": 0.9043, + "step": 136930 + }, + { + "epoch": 10.611802084544152, + "grad_norm": 1.4466184887245968, + "learning_rate": 5.306106633601984e-07, + "loss": 0.9081, + "step": 136940 + }, + { + "epoch": 10.612577008020459, + "grad_norm": 1.522289548149008, + "learning_rate": 5.306494110353379e-07, + "loss": 0.895, + "step": 136950 + }, + { + "epoch": 10.613351931496764, + "grad_norm": 1.3710792358170498, + "learning_rate": 5.306881587104775e-07, + "loss": 0.8884, + "step": 136960 + }, + { + "epoch": 10.61412685497307, + "grad_norm": 1.519174756043747, + "learning_rate": 5.30726906385617e-07, + "loss": 0.9083, + "step": 136970 + }, + { + "epoch": 10.614901778449378, + "grad_norm": 1.4217542215775618, + "learning_rate": 5.307656540607564e-07, + "loss": 0.8874, + "step": 136980 + }, + { + "epoch": 10.615676701925684, + "grad_norm": 1.4573729436034122, + "learning_rate": 5.308044017358959e-07, + "loss": 0.8759, + "step": 136990 + }, + { + "epoch": 10.616451625401991, + "grad_norm": 1.4658254198807381, + "learning_rate": 5.308431494110354e-07, + "loss": 0.8913, + "step": 137000 + }, + { + "epoch": 10.616451625401991, + "eval_loss": 0.9052267074584961, + "eval_runtime": 326.4824, + "eval_samples_per_second": 35.135, + "eval_steps_per_second": 8.785, + "step": 137000 + }, + { + "epoch": 10.617226548878298, + "grad_norm": 1.4884665920922626, + "learning_rate": 5.308818970861749e-07, + "loss": 0.9072, + "step": 137010 + }, + { + "epoch": 10.618001472354605, + "grad_norm": 1.482823432674992, + "learning_rate": 5.309206447613144e-07, + "loss": 0.8799, + "step": 137020 + }, + { + "epoch": 10.618776395830912, + "grad_norm": 1.5039365255338117, + "learning_rate": 5.309593924364539e-07, + "loss": 0.8978, + "step": 137030 + }, + { + "epoch": 10.619551319307218, + "grad_norm": 1.4047946854979088, + "learning_rate": 5.309981401115934e-07, + "loss": 0.9031, + "step": 137040 + }, + { + "epoch": 10.620326242783525, + "grad_norm": 1.4327655106034782, + "learning_rate": 5.310368877867328e-07, + "loss": 0.8975, + "step": 137050 + }, + { + "epoch": 10.621101166259832, + "grad_norm": 1.4699001103991227, + "learning_rate": 5.310756354618724e-07, + "loss": 0.8906, + "step": 137060 + }, + { + "epoch": 10.621876089736139, + "grad_norm": 1.52803990741692, + "learning_rate": 5.311143831370119e-07, + "loss": 0.9051, + "step": 137070 + }, + { + "epoch": 10.622651013212446, + "grad_norm": 1.4458712065960126, + "learning_rate": 5.311531308121513e-07, + "loss": 0.8834, + "step": 137080 + }, + { + "epoch": 10.623425936688752, + "grad_norm": 1.5116527552873877, + "learning_rate": 5.311918784872908e-07, + "loss": 0.8901, + "step": 137090 + }, + { + "epoch": 10.62420086016506, + "grad_norm": 1.367266805205649, + "learning_rate": 5.312306261624303e-07, + "loss": 0.8913, + "step": 137100 + }, + { + "epoch": 10.624975783641366, + "grad_norm": 1.3486883216309582, + "learning_rate": 5.312693738375699e-07, + "loss": 0.8796, + "step": 137110 + }, + { + "epoch": 10.625750707117673, + "grad_norm": 1.450335538474993, + "learning_rate": 5.313081215127093e-07, + "loss": 0.9084, + "step": 137120 + }, + { + "epoch": 10.626525630593978, + "grad_norm": 1.3885535732005578, + "learning_rate": 5.313468691878488e-07, + "loss": 0.893, + "step": 137130 + }, + { + "epoch": 10.627300554070285, + "grad_norm": 1.422798997730865, + "learning_rate": 5.313856168629883e-07, + "loss": 0.8761, + "step": 137140 + }, + { + "epoch": 10.628075477546592, + "grad_norm": 1.4620214816062298, + "learning_rate": 5.314243645381277e-07, + "loss": 0.9129, + "step": 137150 + }, + { + "epoch": 10.628850401022898, + "grad_norm": 1.456693818012347, + "learning_rate": 5.314631122132673e-07, + "loss": 0.9121, + "step": 137160 + }, + { + "epoch": 10.629625324499205, + "grad_norm": 1.3839526630770822, + "learning_rate": 5.315018598884068e-07, + "loss": 0.8967, + "step": 137170 + }, + { + "epoch": 10.630400247975512, + "grad_norm": 1.5020429257471004, + "learning_rate": 5.315406075635462e-07, + "loss": 0.9031, + "step": 137180 + }, + { + "epoch": 10.631175171451819, + "grad_norm": 1.4669283091708107, + "learning_rate": 5.315793552386857e-07, + "loss": 0.9163, + "step": 137190 + }, + { + "epoch": 10.631950094928126, + "grad_norm": 1.4242329966696514, + "learning_rate": 5.316181029138252e-07, + "loss": 0.8823, + "step": 137200 + }, + { + "epoch": 10.632725018404432, + "grad_norm": 1.4982708515135952, + "learning_rate": 5.316568505889648e-07, + "loss": 0.9176, + "step": 137210 + }, + { + "epoch": 10.63349994188074, + "grad_norm": 1.5523869449277425, + "learning_rate": 5.316955982641042e-07, + "loss": 0.8873, + "step": 137220 + }, + { + "epoch": 10.634274865357046, + "grad_norm": 1.527545052926371, + "learning_rate": 5.317343459392437e-07, + "loss": 0.8837, + "step": 137230 + }, + { + "epoch": 10.635049788833353, + "grad_norm": 1.5065754658455057, + "learning_rate": 5.317730936143832e-07, + "loss": 0.8742, + "step": 137240 + }, + { + "epoch": 10.63582471230966, + "grad_norm": 1.4622750394606798, + "learning_rate": 5.318118412895226e-07, + "loss": 0.8835, + "step": 137250 + }, + { + "epoch": 10.636599635785966, + "grad_norm": 1.398858564809826, + "learning_rate": 5.318505889646622e-07, + "loss": 0.8831, + "step": 137260 + }, + { + "epoch": 10.637374559262273, + "grad_norm": 1.473459007209634, + "learning_rate": 5.318893366398017e-07, + "loss": 0.8826, + "step": 137270 + }, + { + "epoch": 10.63814948273858, + "grad_norm": 1.4922400349677456, + "learning_rate": 5.319280843149412e-07, + "loss": 0.8908, + "step": 137280 + }, + { + "epoch": 10.638924406214887, + "grad_norm": 1.4793025309518466, + "learning_rate": 5.319668319900806e-07, + "loss": 0.8932, + "step": 137290 + }, + { + "epoch": 10.639699329691194, + "grad_norm": 1.4395769598562032, + "learning_rate": 5.320055796652201e-07, + "loss": 0.8988, + "step": 137300 + }, + { + "epoch": 10.6404742531675, + "grad_norm": 1.4820086735741933, + "learning_rate": 5.320443273403597e-07, + "loss": 0.8847, + "step": 137310 + }, + { + "epoch": 10.641249176643807, + "grad_norm": 1.3814379146654068, + "learning_rate": 5.320830750154991e-07, + "loss": 0.8994, + "step": 137320 + }, + { + "epoch": 10.642024100120112, + "grad_norm": 1.4793438935388814, + "learning_rate": 5.321218226906386e-07, + "loss": 0.8928, + "step": 137330 + }, + { + "epoch": 10.64279902359642, + "grad_norm": 1.3552918116596209, + "learning_rate": 5.321605703657781e-07, + "loss": 0.8847, + "step": 137340 + }, + { + "epoch": 10.643573947072726, + "grad_norm": 1.4694249068920813, + "learning_rate": 5.321993180409176e-07, + "loss": 0.8839, + "step": 137350 + }, + { + "epoch": 10.644348870549033, + "grad_norm": 1.466479266065045, + "learning_rate": 5.322380657160571e-07, + "loss": 0.9037, + "step": 137360 + }, + { + "epoch": 10.64512379402534, + "grad_norm": 1.4367434026713153, + "learning_rate": 5.322768133911966e-07, + "loss": 0.8959, + "step": 137370 + }, + { + "epoch": 10.645898717501646, + "grad_norm": 1.442356409229532, + "learning_rate": 5.323155610663361e-07, + "loss": 0.9082, + "step": 137380 + }, + { + "epoch": 10.646673640977953, + "grad_norm": 1.4332446294666512, + "learning_rate": 5.323543087414755e-07, + "loss": 0.8786, + "step": 137390 + }, + { + "epoch": 10.64744856445426, + "grad_norm": 1.5404262742300814, + "learning_rate": 5.32393056416615e-07, + "loss": 0.8752, + "step": 137400 + }, + { + "epoch": 10.648223487930567, + "grad_norm": 1.4959477732669144, + "learning_rate": 5.324318040917546e-07, + "loss": 0.8649, + "step": 137410 + }, + { + "epoch": 10.648998411406874, + "grad_norm": 1.4623242610977332, + "learning_rate": 5.324705517668941e-07, + "loss": 0.8961, + "step": 137420 + }, + { + "epoch": 10.64977333488318, + "grad_norm": 1.5055735888703157, + "learning_rate": 5.325092994420335e-07, + "loss": 0.8957, + "step": 137430 + }, + { + "epoch": 10.650548258359487, + "grad_norm": 1.4579286158358808, + "learning_rate": 5.32548047117173e-07, + "loss": 0.8913, + "step": 137440 + }, + { + "epoch": 10.651323181835794, + "grad_norm": 1.430368626022421, + "learning_rate": 5.325867947923126e-07, + "loss": 0.9195, + "step": 137450 + }, + { + "epoch": 10.6520981053121, + "grad_norm": 1.5643519571279192, + "learning_rate": 5.32625542467452e-07, + "loss": 0.9152, + "step": 137460 + }, + { + "epoch": 10.652873028788408, + "grad_norm": 1.4576723651559869, + "learning_rate": 5.326642901425915e-07, + "loss": 0.8988, + "step": 137470 + }, + { + "epoch": 10.653647952264715, + "grad_norm": 1.4680199425186866, + "learning_rate": 5.32703037817731e-07, + "loss": 0.9062, + "step": 137480 + }, + { + "epoch": 10.654422875741021, + "grad_norm": 1.5417157780860837, + "learning_rate": 5.327417854928705e-07, + "loss": 0.8792, + "step": 137490 + }, + { + "epoch": 10.655197799217326, + "grad_norm": 1.4835784276242772, + "learning_rate": 5.3278053316801e-07, + "loss": 0.8925, + "step": 137500 + }, + { + "epoch": 10.655197799217326, + "eval_loss": 0.905164361000061, + "eval_runtime": 327.2791, + "eval_samples_per_second": 35.05, + "eval_steps_per_second": 8.763, + "step": 137500 + }, + { + "epoch": 10.655972722693633, + "grad_norm": 1.4782937817922113, + "learning_rate": 5.328192808431495e-07, + "loss": 0.8922, + "step": 137510 + }, + { + "epoch": 10.65674764616994, + "grad_norm": 1.5457498046778775, + "learning_rate": 5.32858028518289e-07, + "loss": 0.8857, + "step": 137520 + }, + { + "epoch": 10.657522569646247, + "grad_norm": 1.5333515842468655, + "learning_rate": 5.328967761934284e-07, + "loss": 0.8761, + "step": 137530 + }, + { + "epoch": 10.658297493122554, + "grad_norm": 1.4110806016407507, + "learning_rate": 5.329355238685679e-07, + "loss": 0.9073, + "step": 137540 + }, + { + "epoch": 10.65907241659886, + "grad_norm": 1.4938676083100437, + "learning_rate": 5.329742715437075e-07, + "loss": 0.8918, + "step": 137550 + }, + { + "epoch": 10.659847340075167, + "grad_norm": 1.5659640495918725, + "learning_rate": 5.33013019218847e-07, + "loss": 0.8911, + "step": 137560 + }, + { + "epoch": 10.660622263551474, + "grad_norm": 1.4397214342780538, + "learning_rate": 5.330517668939864e-07, + "loss": 0.9069, + "step": 137570 + }, + { + "epoch": 10.66139718702778, + "grad_norm": 1.3369444145345586, + "learning_rate": 5.330905145691259e-07, + "loss": 0.8879, + "step": 137580 + }, + { + "epoch": 10.662172110504088, + "grad_norm": 1.507499857339265, + "learning_rate": 5.331292622442654e-07, + "loss": 0.8816, + "step": 137590 + }, + { + "epoch": 10.662947033980394, + "grad_norm": 1.3942386529193682, + "learning_rate": 5.331680099194049e-07, + "loss": 0.887, + "step": 137600 + }, + { + "epoch": 10.663721957456701, + "grad_norm": 1.5203577445197356, + "learning_rate": 5.332067575945444e-07, + "loss": 0.8856, + "step": 137610 + }, + { + "epoch": 10.664496880933008, + "grad_norm": 1.4208295035050622, + "learning_rate": 5.332455052696839e-07, + "loss": 0.9123, + "step": 137620 + }, + { + "epoch": 10.665271804409315, + "grad_norm": 1.383558945835502, + "learning_rate": 5.332842529448234e-07, + "loss": 0.8819, + "step": 137630 + }, + { + "epoch": 10.666046727885622, + "grad_norm": 1.5241976447167782, + "learning_rate": 5.333230006199628e-07, + "loss": 0.8787, + "step": 137640 + }, + { + "epoch": 10.666821651361929, + "grad_norm": 1.4377108594948196, + "learning_rate": 5.333617482951024e-07, + "loss": 0.8896, + "step": 137650 + }, + { + "epoch": 10.667596574838235, + "grad_norm": 1.4148780790650812, + "learning_rate": 5.334004959702419e-07, + "loss": 0.9101, + "step": 137660 + }, + { + "epoch": 10.668371498314542, + "grad_norm": 1.4767455012666957, + "learning_rate": 5.334392436453813e-07, + "loss": 0.8952, + "step": 137670 + }, + { + "epoch": 10.669146421790849, + "grad_norm": 1.4403492730998695, + "learning_rate": 5.334779913205208e-07, + "loss": 0.905, + "step": 137680 + }, + { + "epoch": 10.669921345267156, + "grad_norm": 1.4134630185824104, + "learning_rate": 5.335167389956603e-07, + "loss": 0.8735, + "step": 137690 + }, + { + "epoch": 10.67069626874346, + "grad_norm": 1.3946948006501698, + "learning_rate": 5.335554866707999e-07, + "loss": 0.8814, + "step": 137700 + }, + { + "epoch": 10.671471192219768, + "grad_norm": 1.5079456362111372, + "learning_rate": 5.335942343459393e-07, + "loss": 0.8799, + "step": 137710 + }, + { + "epoch": 10.672246115696074, + "grad_norm": 1.486175973554535, + "learning_rate": 5.336329820210788e-07, + "loss": 0.8847, + "step": 137720 + }, + { + "epoch": 10.673021039172381, + "grad_norm": 1.3833469506210314, + "learning_rate": 5.336717296962183e-07, + "loss": 0.8797, + "step": 137730 + }, + { + "epoch": 10.673795962648688, + "grad_norm": 1.4077053558110912, + "learning_rate": 5.337104773713577e-07, + "loss": 0.9041, + "step": 137740 + }, + { + "epoch": 10.674570886124995, + "grad_norm": 1.4644462110753786, + "learning_rate": 5.337492250464973e-07, + "loss": 0.8935, + "step": 137750 + }, + { + "epoch": 10.675345809601302, + "grad_norm": 1.4086709206807089, + "learning_rate": 5.337879727216368e-07, + "loss": 0.8963, + "step": 137760 + }, + { + "epoch": 10.676120733077608, + "grad_norm": 1.5050057092996545, + "learning_rate": 5.338267203967763e-07, + "loss": 0.8985, + "step": 137770 + }, + { + "epoch": 10.676895656553915, + "grad_norm": 1.4817105940638442, + "learning_rate": 5.338654680719157e-07, + "loss": 0.8919, + "step": 137780 + }, + { + "epoch": 10.677670580030222, + "grad_norm": 1.6014361136262916, + "learning_rate": 5.339042157470552e-07, + "loss": 0.8875, + "step": 137790 + }, + { + "epoch": 10.678445503506529, + "grad_norm": 1.6055592676574457, + "learning_rate": 5.339429634221948e-07, + "loss": 0.8942, + "step": 137800 + }, + { + "epoch": 10.679220426982836, + "grad_norm": 1.4180050765266914, + "learning_rate": 5.339817110973342e-07, + "loss": 0.8959, + "step": 137810 + }, + { + "epoch": 10.679995350459143, + "grad_norm": 1.499028962426174, + "learning_rate": 5.340204587724737e-07, + "loss": 0.8797, + "step": 137820 + }, + { + "epoch": 10.68077027393545, + "grad_norm": 1.6082769120373268, + "learning_rate": 5.340592064476132e-07, + "loss": 0.8896, + "step": 137830 + }, + { + "epoch": 10.681545197411756, + "grad_norm": 1.4347843092376014, + "learning_rate": 5.340979541227527e-07, + "loss": 0.8993, + "step": 137840 + }, + { + "epoch": 10.682320120888063, + "grad_norm": 1.4970149755265871, + "learning_rate": 5.341367017978922e-07, + "loss": 0.901, + "step": 137850 + }, + { + "epoch": 10.68309504436437, + "grad_norm": 1.5346324536830704, + "learning_rate": 5.341754494730317e-07, + "loss": 0.893, + "step": 137860 + }, + { + "epoch": 10.683869967840677, + "grad_norm": 1.4070823697283643, + "learning_rate": 5.342141971481712e-07, + "loss": 0.9089, + "step": 137870 + }, + { + "epoch": 10.684644891316982, + "grad_norm": 1.4560435422324765, + "learning_rate": 5.342529448233106e-07, + "loss": 0.8811, + "step": 137880 + }, + { + "epoch": 10.685419814793288, + "grad_norm": 1.4723359506861757, + "learning_rate": 5.342916924984501e-07, + "loss": 0.8838, + "step": 137890 + }, + { + "epoch": 10.686194738269595, + "grad_norm": 1.5006806609083638, + "learning_rate": 5.343304401735897e-07, + "loss": 0.9053, + "step": 137900 + }, + { + "epoch": 10.686969661745902, + "grad_norm": 1.4549770730887255, + "learning_rate": 5.343691878487292e-07, + "loss": 0.9054, + "step": 137910 + }, + { + "epoch": 10.687744585222209, + "grad_norm": 1.456827207254797, + "learning_rate": 5.344079355238686e-07, + "loss": 0.8996, + "step": 137920 + }, + { + "epoch": 10.688519508698516, + "grad_norm": 1.4501776049330406, + "learning_rate": 5.344466831990081e-07, + "loss": 0.9036, + "step": 137930 + }, + { + "epoch": 10.689294432174822, + "grad_norm": 1.509889172440281, + "learning_rate": 5.344854308741476e-07, + "loss": 0.8925, + "step": 137940 + }, + { + "epoch": 10.69006935565113, + "grad_norm": 1.4404942047372542, + "learning_rate": 5.345241785492871e-07, + "loss": 0.8788, + "step": 137950 + }, + { + "epoch": 10.690844279127436, + "grad_norm": 1.521227526463457, + "learning_rate": 5.345629262244266e-07, + "loss": 0.8839, + "step": 137960 + }, + { + "epoch": 10.691619202603743, + "grad_norm": 1.5131455275131105, + "learning_rate": 5.346016738995661e-07, + "loss": 0.8891, + "step": 137970 + }, + { + "epoch": 10.69239412608005, + "grad_norm": 1.4756482585652748, + "learning_rate": 5.346404215747056e-07, + "loss": 0.8922, + "step": 137980 + }, + { + "epoch": 10.693169049556356, + "grad_norm": 1.4661589575810237, + "learning_rate": 5.34679169249845e-07, + "loss": 0.8864, + "step": 137990 + }, + { + "epoch": 10.693943973032663, + "grad_norm": 1.421722000311954, + "learning_rate": 5.347179169249846e-07, + "loss": 0.8849, + "step": 138000 + }, + { + "epoch": 10.693943973032663, + "eval_loss": 0.9048635959625244, + "eval_runtime": 328.672, + "eval_samples_per_second": 34.901, + "eval_steps_per_second": 8.726, + "step": 138000 + }, + { + "epoch": 10.69471889650897, + "grad_norm": 1.544419955785126, + "learning_rate": 5.347566646001241e-07, + "loss": 0.8948, + "step": 138010 + }, + { + "epoch": 10.695493819985277, + "grad_norm": 1.4942791653012197, + "learning_rate": 5.347954122752635e-07, + "loss": 0.8948, + "step": 138020 + }, + { + "epoch": 10.696268743461584, + "grad_norm": 1.4688029456654408, + "learning_rate": 5.34834159950403e-07, + "loss": 0.89, + "step": 138030 + }, + { + "epoch": 10.69704366693789, + "grad_norm": 1.516695836493352, + "learning_rate": 5.348729076255425e-07, + "loss": 0.8942, + "step": 138040 + }, + { + "epoch": 10.697818590414197, + "grad_norm": 1.4577618844956992, + "learning_rate": 5.349116553006821e-07, + "loss": 0.8875, + "step": 138050 + }, + { + "epoch": 10.698593513890504, + "grad_norm": 1.4608126593678898, + "learning_rate": 5.349504029758215e-07, + "loss": 0.8987, + "step": 138060 + }, + { + "epoch": 10.69936843736681, + "grad_norm": 1.5306331313615948, + "learning_rate": 5.34989150650961e-07, + "loss": 0.9013, + "step": 138070 + }, + { + "epoch": 10.700143360843116, + "grad_norm": 1.408263479258084, + "learning_rate": 5.350278983261005e-07, + "loss": 0.8826, + "step": 138080 + }, + { + "epoch": 10.700918284319423, + "grad_norm": 1.4073172513431618, + "learning_rate": 5.350666460012399e-07, + "loss": 0.8771, + "step": 138090 + }, + { + "epoch": 10.70169320779573, + "grad_norm": 1.4315440535402968, + "learning_rate": 5.351053936763795e-07, + "loss": 0.8918, + "step": 138100 + }, + { + "epoch": 10.702468131272036, + "grad_norm": 1.4062039849158008, + "learning_rate": 5.35144141351519e-07, + "loss": 0.8747, + "step": 138110 + }, + { + "epoch": 10.703243054748343, + "grad_norm": 1.380058775948571, + "learning_rate": 5.351828890266585e-07, + "loss": 0.8981, + "step": 138120 + }, + { + "epoch": 10.70401797822465, + "grad_norm": 1.4862603561702583, + "learning_rate": 5.352216367017979e-07, + "loss": 0.9119, + "step": 138130 + }, + { + "epoch": 10.704792901700957, + "grad_norm": 1.5466297142011642, + "learning_rate": 5.352603843769375e-07, + "loss": 0.8994, + "step": 138140 + }, + { + "epoch": 10.705567825177264, + "grad_norm": 1.5511408027825624, + "learning_rate": 5.35299132052077e-07, + "loss": 0.9003, + "step": 138150 + }, + { + "epoch": 10.70634274865357, + "grad_norm": 1.4622824755673383, + "learning_rate": 5.353378797272164e-07, + "loss": 0.898, + "step": 138160 + }, + { + "epoch": 10.707117672129877, + "grad_norm": 1.4129300112937186, + "learning_rate": 5.353766274023559e-07, + "loss": 0.8953, + "step": 138170 + }, + { + "epoch": 10.707892595606184, + "grad_norm": 1.3251286641593918, + "learning_rate": 5.354153750774954e-07, + "loss": 0.9004, + "step": 138180 + }, + { + "epoch": 10.708667519082491, + "grad_norm": 1.4687776604314677, + "learning_rate": 5.35454122752635e-07, + "loss": 0.8857, + "step": 138190 + }, + { + "epoch": 10.709442442558798, + "grad_norm": 1.381936420418856, + "learning_rate": 5.354928704277744e-07, + "loss": 0.9051, + "step": 138200 + }, + { + "epoch": 10.710217366035105, + "grad_norm": 1.4776112232596574, + "learning_rate": 5.355316181029139e-07, + "loss": 0.8975, + "step": 138210 + }, + { + "epoch": 10.710992289511411, + "grad_norm": 1.47015714932678, + "learning_rate": 5.355703657780534e-07, + "loss": 0.8649, + "step": 138220 + }, + { + "epoch": 10.711767212987718, + "grad_norm": 1.4525767117096966, + "learning_rate": 5.356091134531928e-07, + "loss": 0.8928, + "step": 138230 + }, + { + "epoch": 10.712542136464025, + "grad_norm": 1.452384021839479, + "learning_rate": 5.356478611283324e-07, + "loss": 0.9024, + "step": 138240 + }, + { + "epoch": 10.71331705994033, + "grad_norm": 1.404759672045383, + "learning_rate": 5.356866088034719e-07, + "loss": 0.8724, + "step": 138250 + }, + { + "epoch": 10.714091983416637, + "grad_norm": 1.4976742430932979, + "learning_rate": 5.357253564786114e-07, + "loss": 0.888, + "step": 138260 + }, + { + "epoch": 10.714866906892944, + "grad_norm": 1.529885268535206, + "learning_rate": 5.357641041537508e-07, + "loss": 0.8925, + "step": 138270 + }, + { + "epoch": 10.71564183036925, + "grad_norm": 1.4528980884426774, + "learning_rate": 5.358028518288903e-07, + "loss": 0.8812, + "step": 138280 + }, + { + "epoch": 10.716416753845557, + "grad_norm": 1.4626383121283244, + "learning_rate": 5.358415995040299e-07, + "loss": 0.8943, + "step": 138290 + }, + { + "epoch": 10.717191677321864, + "grad_norm": 1.4692445127966551, + "learning_rate": 5.358803471791693e-07, + "loss": 0.8913, + "step": 138300 + }, + { + "epoch": 10.71796660079817, + "grad_norm": 1.6624166424746318, + "learning_rate": 5.359190948543088e-07, + "loss": 0.8768, + "step": 138310 + }, + { + "epoch": 10.718741524274478, + "grad_norm": 1.392940545924807, + "learning_rate": 5.359578425294483e-07, + "loss": 0.8779, + "step": 138320 + }, + { + "epoch": 10.719516447750784, + "grad_norm": 1.4802907552328186, + "learning_rate": 5.359965902045878e-07, + "loss": 0.8961, + "step": 138330 + }, + { + "epoch": 10.720291371227091, + "grad_norm": 1.4612279710288165, + "learning_rate": 5.360353378797273e-07, + "loss": 0.8889, + "step": 138340 + }, + { + "epoch": 10.721066294703398, + "grad_norm": 1.6129936444017954, + "learning_rate": 5.360740855548668e-07, + "loss": 0.8907, + "step": 138350 + }, + { + "epoch": 10.721841218179705, + "grad_norm": 1.5896410133293828, + "learning_rate": 5.361128332300063e-07, + "loss": 0.9074, + "step": 138360 + }, + { + "epoch": 10.722616141656012, + "grad_norm": 1.3964315409447345, + "learning_rate": 5.361515809051457e-07, + "loss": 0.9089, + "step": 138370 + }, + { + "epoch": 10.723391065132319, + "grad_norm": 1.4869988384876553, + "learning_rate": 5.361903285802852e-07, + "loss": 0.9058, + "step": 138380 + }, + { + "epoch": 10.724165988608625, + "grad_norm": 1.4636722836057052, + "learning_rate": 5.362290762554248e-07, + "loss": 0.8899, + "step": 138390 + }, + { + "epoch": 10.724940912084932, + "grad_norm": 1.4689915684246078, + "learning_rate": 5.362678239305643e-07, + "loss": 0.8794, + "step": 138400 + }, + { + "epoch": 10.725715835561239, + "grad_norm": 1.4424157244689129, + "learning_rate": 5.363065716057037e-07, + "loss": 0.8996, + "step": 138410 + }, + { + "epoch": 10.726490759037546, + "grad_norm": 1.537473190408573, + "learning_rate": 5.363453192808432e-07, + "loss": 0.8773, + "step": 138420 + }, + { + "epoch": 10.727265682513853, + "grad_norm": 1.62933777104022, + "learning_rate": 5.363840669559827e-07, + "loss": 0.9143, + "step": 138430 + }, + { + "epoch": 10.72804060599016, + "grad_norm": 1.4513002926767113, + "learning_rate": 5.364228146311222e-07, + "loss": 0.8724, + "step": 138440 + }, + { + "epoch": 10.728815529466464, + "grad_norm": 1.4805388749853794, + "learning_rate": 5.364615623062617e-07, + "loss": 0.8999, + "step": 138450 + }, + { + "epoch": 10.729590452942771, + "grad_norm": 1.4568095668255554, + "learning_rate": 5.365003099814012e-07, + "loss": 0.8771, + "step": 138460 + }, + { + "epoch": 10.730365376419078, + "grad_norm": 1.4045072582071099, + "learning_rate": 5.365390576565407e-07, + "loss": 0.919, + "step": 138470 + }, + { + "epoch": 10.731140299895385, + "grad_norm": 1.4243067368274647, + "learning_rate": 5.365778053316801e-07, + "loss": 0.8957, + "step": 138480 + }, + { + "epoch": 10.731915223371692, + "grad_norm": 1.448330970281742, + "learning_rate": 5.366165530068197e-07, + "loss": 0.8769, + "step": 138490 + }, + { + "epoch": 10.732690146847998, + "grad_norm": 1.553105898347551, + "learning_rate": 5.366553006819592e-07, + "loss": 0.9056, + "step": 138500 + }, + { + "epoch": 10.732690146847998, + "eval_loss": 0.904663622379303, + "eval_runtime": 329.9408, + "eval_samples_per_second": 34.767, + "eval_steps_per_second": 8.692, + "step": 138500 + }, + { + "epoch": 10.733465070324305, + "grad_norm": 1.4770157072963794, + "learning_rate": 5.366940483570986e-07, + "loss": 0.8929, + "step": 138510 + }, + { + "epoch": 10.734239993800612, + "grad_norm": 1.5161484508723482, + "learning_rate": 5.367327960322381e-07, + "loss": 0.8942, + "step": 138520 + }, + { + "epoch": 10.735014917276919, + "grad_norm": 1.5242422638267772, + "learning_rate": 5.367715437073776e-07, + "loss": 0.8965, + "step": 138530 + }, + { + "epoch": 10.735789840753226, + "grad_norm": 1.4269152629149537, + "learning_rate": 5.368102913825172e-07, + "loss": 0.8787, + "step": 138540 + }, + { + "epoch": 10.736564764229533, + "grad_norm": 1.4494483325235956, + "learning_rate": 5.368490390576566e-07, + "loss": 0.912, + "step": 138550 + }, + { + "epoch": 10.73733968770584, + "grad_norm": 1.5581907625272866, + "learning_rate": 5.368877867327961e-07, + "loss": 0.8855, + "step": 138560 + }, + { + "epoch": 10.738114611182146, + "grad_norm": 1.4665578073262944, + "learning_rate": 5.369265344079356e-07, + "loss": 0.8848, + "step": 138570 + }, + { + "epoch": 10.738889534658453, + "grad_norm": 1.5061831000214583, + "learning_rate": 5.36965282083075e-07, + "loss": 0.9108, + "step": 138580 + }, + { + "epoch": 10.73966445813476, + "grad_norm": 1.481209760006998, + "learning_rate": 5.370040297582146e-07, + "loss": 0.8861, + "step": 138590 + }, + { + "epoch": 10.740439381611067, + "grad_norm": 1.439908439328352, + "learning_rate": 5.370427774333541e-07, + "loss": 0.8969, + "step": 138600 + }, + { + "epoch": 10.741214305087373, + "grad_norm": 1.4291492053807824, + "learning_rate": 5.370815251084935e-07, + "loss": 0.8853, + "step": 138610 + }, + { + "epoch": 10.741989228563678, + "grad_norm": 1.582455617005074, + "learning_rate": 5.37120272783633e-07, + "loss": 0.8842, + "step": 138620 + }, + { + "epoch": 10.742764152039985, + "grad_norm": 1.3929681753096184, + "learning_rate": 5.371590204587725e-07, + "loss": 0.906, + "step": 138630 + }, + { + "epoch": 10.743539075516292, + "grad_norm": 1.5122043369302878, + "learning_rate": 5.371977681339121e-07, + "loss": 0.8821, + "step": 138640 + }, + { + "epoch": 10.744313998992599, + "grad_norm": 1.4874357490868084, + "learning_rate": 5.372365158090515e-07, + "loss": 0.8904, + "step": 138650 + }, + { + "epoch": 10.745088922468906, + "grad_norm": 1.4577170164364643, + "learning_rate": 5.37275263484191e-07, + "loss": 0.9096, + "step": 138660 + }, + { + "epoch": 10.745863845945212, + "grad_norm": 1.36300531772006, + "learning_rate": 5.373140111593305e-07, + "loss": 0.9102, + "step": 138670 + }, + { + "epoch": 10.74663876942152, + "grad_norm": 1.441783219810115, + "learning_rate": 5.373527588344699e-07, + "loss": 0.8914, + "step": 138680 + }, + { + "epoch": 10.747413692897826, + "grad_norm": 1.4045809320305742, + "learning_rate": 5.373915065096095e-07, + "loss": 0.8887, + "step": 138690 + }, + { + "epoch": 10.748188616374133, + "grad_norm": 1.4469911886005038, + "learning_rate": 5.37430254184749e-07, + "loss": 0.8778, + "step": 138700 + }, + { + "epoch": 10.74896353985044, + "grad_norm": 1.4616353543415832, + "learning_rate": 5.374690018598885e-07, + "loss": 0.9097, + "step": 138710 + }, + { + "epoch": 10.749738463326747, + "grad_norm": 1.6282267834240784, + "learning_rate": 5.375077495350279e-07, + "loss": 0.9143, + "step": 138720 + }, + { + "epoch": 10.750513386803053, + "grad_norm": 1.4403240215931052, + "learning_rate": 5.375464972101674e-07, + "loss": 0.8944, + "step": 138730 + }, + { + "epoch": 10.75128831027936, + "grad_norm": 1.4373523261246337, + "learning_rate": 5.37585244885307e-07, + "loss": 0.8922, + "step": 138740 + }, + { + "epoch": 10.752063233755667, + "grad_norm": 1.4873475359768673, + "learning_rate": 5.376239925604464e-07, + "loss": 0.8896, + "step": 138750 + }, + { + "epoch": 10.752838157231974, + "grad_norm": 1.3840844688286964, + "learning_rate": 5.376627402355859e-07, + "loss": 0.8816, + "step": 138760 + }, + { + "epoch": 10.75361308070828, + "grad_norm": 1.5065943806293238, + "learning_rate": 5.377014879107254e-07, + "loss": 0.8905, + "step": 138770 + }, + { + "epoch": 10.754388004184587, + "grad_norm": 1.5167669583213377, + "learning_rate": 5.37740235585865e-07, + "loss": 0.8791, + "step": 138780 + }, + { + "epoch": 10.755162927660894, + "grad_norm": 1.445658428877417, + "learning_rate": 5.377789832610044e-07, + "loss": 0.902, + "step": 138790 + }, + { + "epoch": 10.755937851137201, + "grad_norm": 1.4455117352495759, + "learning_rate": 5.378177309361439e-07, + "loss": 0.8863, + "step": 138800 + }, + { + "epoch": 10.756712774613508, + "grad_norm": 1.5387072558703148, + "learning_rate": 5.378564786112834e-07, + "loss": 0.8689, + "step": 138810 + }, + { + "epoch": 10.757487698089813, + "grad_norm": 1.4179682216876521, + "learning_rate": 5.378952262864228e-07, + "loss": 0.8858, + "step": 138820 + }, + { + "epoch": 10.75826262156612, + "grad_norm": 1.5129604789543882, + "learning_rate": 5.379339739615623e-07, + "loss": 0.8879, + "step": 138830 + }, + { + "epoch": 10.759037545042426, + "grad_norm": 1.545087035772768, + "learning_rate": 5.379727216367019e-07, + "loss": 0.8835, + "step": 138840 + }, + { + "epoch": 10.759812468518733, + "grad_norm": 1.4872509695388159, + "learning_rate": 5.380114693118414e-07, + "loss": 0.9033, + "step": 138850 + }, + { + "epoch": 10.76058739199504, + "grad_norm": 1.4301778885028649, + "learning_rate": 5.380502169869808e-07, + "loss": 0.8941, + "step": 138860 + }, + { + "epoch": 10.761362315471347, + "grad_norm": 1.345703135592094, + "learning_rate": 5.380889646621203e-07, + "loss": 0.8928, + "step": 138870 + }, + { + "epoch": 10.762137238947654, + "grad_norm": 1.5335462308173406, + "learning_rate": 5.381277123372599e-07, + "loss": 0.9027, + "step": 138880 + }, + { + "epoch": 10.76291216242396, + "grad_norm": 1.4285606614822606, + "learning_rate": 5.381664600123993e-07, + "loss": 0.9105, + "step": 138890 + }, + { + "epoch": 10.763687085900267, + "grad_norm": 1.434092440197182, + "learning_rate": 5.382052076875388e-07, + "loss": 0.9034, + "step": 138900 + }, + { + "epoch": 10.764462009376574, + "grad_norm": 1.3649421433966968, + "learning_rate": 5.382439553626783e-07, + "loss": 0.8908, + "step": 138910 + }, + { + "epoch": 10.765236932852881, + "grad_norm": 1.3628702065524483, + "learning_rate": 5.382827030378178e-07, + "loss": 0.8882, + "step": 138920 + }, + { + "epoch": 10.766011856329188, + "grad_norm": 1.5295641793994732, + "learning_rate": 5.383214507129573e-07, + "loss": 0.9111, + "step": 138930 + }, + { + "epoch": 10.766786779805495, + "grad_norm": 1.5697253204993298, + "learning_rate": 5.383601983880968e-07, + "loss": 0.8911, + "step": 138940 + }, + { + "epoch": 10.767561703281801, + "grad_norm": 1.3807422946746317, + "learning_rate": 5.383989460632363e-07, + "loss": 0.8834, + "step": 138950 + }, + { + "epoch": 10.768336626758108, + "grad_norm": 1.4143868522794554, + "learning_rate": 5.384376937383757e-07, + "loss": 0.8986, + "step": 138960 + }, + { + "epoch": 10.769111550234415, + "grad_norm": 1.5360930699565658, + "learning_rate": 5.384764414135152e-07, + "loss": 0.8914, + "step": 138970 + }, + { + "epoch": 10.769886473710722, + "grad_norm": 1.442887248688554, + "learning_rate": 5.385151890886548e-07, + "loss": 0.9064, + "step": 138980 + }, + { + "epoch": 10.770661397187027, + "grad_norm": 1.3648001959380338, + "learning_rate": 5.385539367637943e-07, + "loss": 0.8773, + "step": 138990 + }, + { + "epoch": 10.771436320663334, + "grad_norm": 1.4923809553188987, + "learning_rate": 5.385926844389337e-07, + "loss": 0.897, + "step": 139000 + }, + { + "epoch": 10.771436320663334, + "eval_loss": 0.9046270847320557, + "eval_runtime": 330.5231, + "eval_samples_per_second": 34.706, + "eval_steps_per_second": 8.677, + "step": 139000 + }, + { + "epoch": 10.77221124413964, + "grad_norm": 1.4910313794699137, + "learning_rate": 5.386314321140732e-07, + "loss": 0.9113, + "step": 139010 + }, + { + "epoch": 10.772986167615947, + "grad_norm": 1.4251456758207106, + "learning_rate": 5.386701797892127e-07, + "loss": 0.8779, + "step": 139020 + }, + { + "epoch": 10.773761091092254, + "grad_norm": 1.4844540591568685, + "learning_rate": 5.387089274643522e-07, + "loss": 0.8923, + "step": 139030 + }, + { + "epoch": 10.774536014568561, + "grad_norm": 1.415525209826962, + "learning_rate": 5.387476751394917e-07, + "loss": 0.8898, + "step": 139040 + }, + { + "epoch": 10.775310938044868, + "grad_norm": 1.4139201668096473, + "learning_rate": 5.387864228146312e-07, + "loss": 0.8912, + "step": 139050 + }, + { + "epoch": 10.776085861521175, + "grad_norm": 1.5043070384749906, + "learning_rate": 5.388251704897707e-07, + "loss": 0.8931, + "step": 139060 + }, + { + "epoch": 10.776860784997481, + "grad_norm": 1.5242887276591333, + "learning_rate": 5.388639181649101e-07, + "loss": 0.8965, + "step": 139070 + }, + { + "epoch": 10.777635708473788, + "grad_norm": 1.4458507549014117, + "learning_rate": 5.389026658400497e-07, + "loss": 0.893, + "step": 139080 + }, + { + "epoch": 10.778410631950095, + "grad_norm": 1.3721785813333747, + "learning_rate": 5.389414135151892e-07, + "loss": 0.91, + "step": 139090 + }, + { + "epoch": 10.779185555426402, + "grad_norm": 1.4828616171344864, + "learning_rate": 5.389801611903286e-07, + "loss": 0.9076, + "step": 139100 + }, + { + "epoch": 10.779960478902709, + "grad_norm": 1.4691413430808877, + "learning_rate": 5.390189088654681e-07, + "loss": 0.8723, + "step": 139110 + }, + { + "epoch": 10.780735402379015, + "grad_norm": 1.5357269143849879, + "learning_rate": 5.390576565406076e-07, + "loss": 0.909, + "step": 139120 + }, + { + "epoch": 10.781510325855322, + "grad_norm": 1.5509261012069913, + "learning_rate": 5.390964042157472e-07, + "loss": 0.8838, + "step": 139130 + }, + { + "epoch": 10.782285249331629, + "grad_norm": 1.4452864188055157, + "learning_rate": 5.391351518908866e-07, + "loss": 0.8729, + "step": 139140 + }, + { + "epoch": 10.783060172807936, + "grad_norm": 1.4318988280438965, + "learning_rate": 5.391738995660261e-07, + "loss": 0.8977, + "step": 139150 + }, + { + "epoch": 10.783835096284243, + "grad_norm": 1.4280446797447592, + "learning_rate": 5.392126472411656e-07, + "loss": 0.8944, + "step": 139160 + }, + { + "epoch": 10.78461001976055, + "grad_norm": 1.526561607909632, + "learning_rate": 5.39251394916305e-07, + "loss": 0.8778, + "step": 139170 + }, + { + "epoch": 10.785384943236856, + "grad_norm": 1.385232758612107, + "learning_rate": 5.392901425914446e-07, + "loss": 0.8814, + "step": 139180 + }, + { + "epoch": 10.786159866713161, + "grad_norm": 1.4179718148115623, + "learning_rate": 5.393288902665841e-07, + "loss": 0.8968, + "step": 139190 + }, + { + "epoch": 10.786934790189468, + "grad_norm": 1.4799138298363015, + "learning_rate": 5.393676379417236e-07, + "loss": 0.8755, + "step": 139200 + }, + { + "epoch": 10.787709713665775, + "grad_norm": 1.4459104236334823, + "learning_rate": 5.39406385616863e-07, + "loss": 0.8937, + "step": 139210 + }, + { + "epoch": 10.788484637142082, + "grad_norm": 1.4145062266720336, + "learning_rate": 5.394451332920025e-07, + "loss": 0.8796, + "step": 139220 + }, + { + "epoch": 10.789259560618389, + "grad_norm": 1.482074909517675, + "learning_rate": 5.394838809671421e-07, + "loss": 0.9005, + "step": 139230 + }, + { + "epoch": 10.790034484094695, + "grad_norm": 1.3758996326998558, + "learning_rate": 5.395226286422815e-07, + "loss": 0.8853, + "step": 139240 + }, + { + "epoch": 10.790809407571002, + "grad_norm": 1.5515210628319294, + "learning_rate": 5.39561376317421e-07, + "loss": 0.9231, + "step": 139250 + }, + { + "epoch": 10.791584331047309, + "grad_norm": 1.4145732289472541, + "learning_rate": 5.396001239925605e-07, + "loss": 0.9064, + "step": 139260 + }, + { + "epoch": 10.792359254523616, + "grad_norm": 1.451283649832885, + "learning_rate": 5.396388716677e-07, + "loss": 0.893, + "step": 139270 + }, + { + "epoch": 10.793134177999923, + "grad_norm": 1.3850190045478716, + "learning_rate": 5.396776193428395e-07, + "loss": 0.8941, + "step": 139280 + }, + { + "epoch": 10.79390910147623, + "grad_norm": 1.5097508243696018, + "learning_rate": 5.39716367017979e-07, + "loss": 0.8961, + "step": 139290 + }, + { + "epoch": 10.794684024952536, + "grad_norm": 1.428636074484062, + "learning_rate": 5.397551146931185e-07, + "loss": 0.8886, + "step": 139300 + }, + { + "epoch": 10.795458948428843, + "grad_norm": 1.485648117812185, + "learning_rate": 5.397938623682579e-07, + "loss": 0.8943, + "step": 139310 + }, + { + "epoch": 10.79623387190515, + "grad_norm": 1.4342967668762348, + "learning_rate": 5.398326100433974e-07, + "loss": 0.897, + "step": 139320 + }, + { + "epoch": 10.797008795381457, + "grad_norm": 1.4580880192569523, + "learning_rate": 5.39871357718537e-07, + "loss": 0.8863, + "step": 139330 + }, + { + "epoch": 10.797783718857763, + "grad_norm": 1.4173669794032369, + "learning_rate": 5.399101053936765e-07, + "loss": 0.9004, + "step": 139340 + }, + { + "epoch": 10.79855864233407, + "grad_norm": 1.482075032189422, + "learning_rate": 5.399488530688159e-07, + "loss": 0.9015, + "step": 139350 + }, + { + "epoch": 10.799333565810375, + "grad_norm": 1.4874833040678608, + "learning_rate": 5.399876007439554e-07, + "loss": 0.9007, + "step": 139360 + }, + { + "epoch": 10.800108489286682, + "grad_norm": 1.4503692880901742, + "learning_rate": 5.40026348419095e-07, + "loss": 0.9134, + "step": 139370 + }, + { + "epoch": 10.800883412762989, + "grad_norm": 1.4262120200507107, + "learning_rate": 5.400650960942344e-07, + "loss": 0.8837, + "step": 139380 + }, + { + "epoch": 10.801658336239296, + "grad_norm": 1.3857814880939598, + "learning_rate": 5.401038437693739e-07, + "loss": 0.907, + "step": 139390 + }, + { + "epoch": 10.802433259715603, + "grad_norm": 1.445421842867491, + "learning_rate": 5.401425914445134e-07, + "loss": 0.8949, + "step": 139400 + }, + { + "epoch": 10.80320818319191, + "grad_norm": 1.4654214053652581, + "learning_rate": 5.401813391196529e-07, + "loss": 0.8919, + "step": 139410 + }, + { + "epoch": 10.803983106668216, + "grad_norm": 1.4210480643072623, + "learning_rate": 5.402200867947923e-07, + "loss": 0.8993, + "step": 139420 + }, + { + "epoch": 10.804758030144523, + "grad_norm": 1.4631641841283773, + "learning_rate": 5.402588344699319e-07, + "loss": 0.899, + "step": 139430 + }, + { + "epoch": 10.80553295362083, + "grad_norm": 1.4662464400356854, + "learning_rate": 5.402975821450714e-07, + "loss": 0.8942, + "step": 139440 + }, + { + "epoch": 10.806307877097137, + "grad_norm": 1.4354090570039204, + "learning_rate": 5.403363298202108e-07, + "loss": 0.9115, + "step": 139450 + }, + { + "epoch": 10.807082800573443, + "grad_norm": 1.4241552982305363, + "learning_rate": 5.403750774953503e-07, + "loss": 0.9001, + "step": 139460 + }, + { + "epoch": 10.80785772404975, + "grad_norm": 1.459582163965125, + "learning_rate": 5.404138251704899e-07, + "loss": 0.9197, + "step": 139470 + }, + { + "epoch": 10.808632647526057, + "grad_norm": 1.400164965508889, + "learning_rate": 5.404525728456294e-07, + "loss": 0.8809, + "step": 139480 + }, + { + "epoch": 10.809407571002364, + "grad_norm": 1.3973811770114148, + "learning_rate": 5.404913205207688e-07, + "loss": 0.8912, + "step": 139490 + }, + { + "epoch": 10.81018249447867, + "grad_norm": 1.3847350451396008, + "learning_rate": 5.405300681959083e-07, + "loss": 0.8978, + "step": 139500 + }, + { + "epoch": 10.81018249447867, + "eval_loss": 0.9043657183647156, + "eval_runtime": 329.2838, + "eval_samples_per_second": 34.836, + "eval_steps_per_second": 8.71, + "step": 139500 + }, + { + "epoch": 10.810957417954977, + "grad_norm": 1.499030437916398, + "learning_rate": 5.405688158710478e-07, + "loss": 0.8869, + "step": 139510 + }, + { + "epoch": 10.811732341431284, + "grad_norm": 1.4775374155234176, + "learning_rate": 5.406075635461872e-07, + "loss": 0.8814, + "step": 139520 + }, + { + "epoch": 10.812507264907591, + "grad_norm": 1.475970858665857, + "learning_rate": 5.406463112213268e-07, + "loss": 0.9066, + "step": 139530 + }, + { + "epoch": 10.813282188383898, + "grad_norm": 1.4543345513233974, + "learning_rate": 5.406850588964663e-07, + "loss": 0.8971, + "step": 139540 + }, + { + "epoch": 10.814057111860205, + "grad_norm": 1.5896007825643212, + "learning_rate": 5.407238065716058e-07, + "loss": 0.8975, + "step": 139550 + }, + { + "epoch": 10.81483203533651, + "grad_norm": 1.48577190136706, + "learning_rate": 5.407625542467452e-07, + "loss": 0.8988, + "step": 139560 + }, + { + "epoch": 10.815606958812817, + "grad_norm": 1.4908426251175835, + "learning_rate": 5.408013019218848e-07, + "loss": 0.8979, + "step": 139570 + }, + { + "epoch": 10.816381882289123, + "grad_norm": 1.4522840924119091, + "learning_rate": 5.408400495970243e-07, + "loss": 0.9174, + "step": 139580 + }, + { + "epoch": 10.81715680576543, + "grad_norm": 1.5324242543644024, + "learning_rate": 5.408787972721637e-07, + "loss": 0.8901, + "step": 139590 + }, + { + "epoch": 10.817931729241737, + "grad_norm": 1.535815240211822, + "learning_rate": 5.409175449473032e-07, + "loss": 0.9015, + "step": 139600 + }, + { + "epoch": 10.818706652718044, + "grad_norm": 1.4877725907689745, + "learning_rate": 5.409562926224427e-07, + "loss": 0.8885, + "step": 139610 + }, + { + "epoch": 10.81948157619435, + "grad_norm": 1.4169247861724437, + "learning_rate": 5.409950402975823e-07, + "loss": 0.9239, + "step": 139620 + }, + { + "epoch": 10.820256499670657, + "grad_norm": 1.5121862252945584, + "learning_rate": 5.410337879727217e-07, + "loss": 0.902, + "step": 139630 + }, + { + "epoch": 10.821031423146964, + "grad_norm": 1.3685121199961563, + "learning_rate": 5.410725356478612e-07, + "loss": 0.8861, + "step": 139640 + }, + { + "epoch": 10.821806346623271, + "grad_norm": 1.4830496300804508, + "learning_rate": 5.411112833230007e-07, + "loss": 0.9008, + "step": 139650 + }, + { + "epoch": 10.822581270099578, + "grad_norm": 1.417394864856451, + "learning_rate": 5.411500309981401e-07, + "loss": 0.8859, + "step": 139660 + }, + { + "epoch": 10.823356193575885, + "grad_norm": 1.5133240987641075, + "learning_rate": 5.411887786732797e-07, + "loss": 0.8939, + "step": 139670 + }, + { + "epoch": 10.824131117052191, + "grad_norm": 1.4539745268955446, + "learning_rate": 5.412275263484192e-07, + "loss": 0.881, + "step": 139680 + }, + { + "epoch": 10.824906040528498, + "grad_norm": 1.5834464776073236, + "learning_rate": 5.412662740235587e-07, + "loss": 0.8956, + "step": 139690 + }, + { + "epoch": 10.825680964004805, + "grad_norm": 1.5115539318681794, + "learning_rate": 5.413050216986981e-07, + "loss": 0.874, + "step": 139700 + }, + { + "epoch": 10.826455887481112, + "grad_norm": 1.4768794851927456, + "learning_rate": 5.413437693738376e-07, + "loss": 0.9118, + "step": 139710 + }, + { + "epoch": 10.827230810957419, + "grad_norm": 1.4508123559009767, + "learning_rate": 5.413825170489772e-07, + "loss": 0.8811, + "step": 139720 + }, + { + "epoch": 10.828005734433725, + "grad_norm": 1.554039743344941, + "learning_rate": 5.414212647241166e-07, + "loss": 0.885, + "step": 139730 + }, + { + "epoch": 10.82878065791003, + "grad_norm": 1.425304392249329, + "learning_rate": 5.414600123992561e-07, + "loss": 0.8903, + "step": 139740 + }, + { + "epoch": 10.829555581386337, + "grad_norm": 1.4588518238334378, + "learning_rate": 5.414987600743956e-07, + "loss": 0.897, + "step": 139750 + }, + { + "epoch": 10.830330504862644, + "grad_norm": 1.446399848760412, + "learning_rate": 5.415375077495351e-07, + "loss": 0.8859, + "step": 139760 + }, + { + "epoch": 10.831105428338951, + "grad_norm": 1.419353814660216, + "learning_rate": 5.415762554246746e-07, + "loss": 0.8846, + "step": 139770 + }, + { + "epoch": 10.831880351815258, + "grad_norm": 1.4457736199247748, + "learning_rate": 5.416150030998141e-07, + "loss": 0.8863, + "step": 139780 + }, + { + "epoch": 10.832655275291565, + "grad_norm": 1.619943472270438, + "learning_rate": 5.416537507749536e-07, + "loss": 0.9139, + "step": 139790 + }, + { + "epoch": 10.833430198767871, + "grad_norm": 1.4421909012230398, + "learning_rate": 5.41692498450093e-07, + "loss": 0.8952, + "step": 139800 + }, + { + "epoch": 10.834205122244178, + "grad_norm": 1.4618384532010316, + "learning_rate": 5.417312461252325e-07, + "loss": 0.8936, + "step": 139810 + }, + { + "epoch": 10.834980045720485, + "grad_norm": 1.531040755355352, + "learning_rate": 5.417699938003721e-07, + "loss": 0.8963, + "step": 139820 + }, + { + "epoch": 10.835754969196792, + "grad_norm": 1.4705343208687962, + "learning_rate": 5.418087414755116e-07, + "loss": 0.9032, + "step": 139830 + }, + { + "epoch": 10.836529892673099, + "grad_norm": 1.5547790423086998, + "learning_rate": 5.41847489150651e-07, + "loss": 0.9024, + "step": 139840 + }, + { + "epoch": 10.837304816149405, + "grad_norm": 1.5438219506424373, + "learning_rate": 5.418862368257905e-07, + "loss": 0.8928, + "step": 139850 + }, + { + "epoch": 10.838079739625712, + "grad_norm": 1.441971513028545, + "learning_rate": 5.4192498450093e-07, + "loss": 0.8943, + "step": 139860 + }, + { + "epoch": 10.838854663102019, + "grad_norm": 1.4946227150576494, + "learning_rate": 5.419637321760695e-07, + "loss": 0.9093, + "step": 139870 + }, + { + "epoch": 10.839629586578326, + "grad_norm": 1.4192748456767073, + "learning_rate": 5.42002479851209e-07, + "loss": 0.9113, + "step": 139880 + }, + { + "epoch": 10.840404510054633, + "grad_norm": 1.4463867487153668, + "learning_rate": 5.420412275263485e-07, + "loss": 0.9086, + "step": 139890 + }, + { + "epoch": 10.84117943353094, + "grad_norm": 1.4335683413877953, + "learning_rate": 5.42079975201488e-07, + "loss": 0.8831, + "step": 139900 + }, + { + "epoch": 10.841954357007246, + "grad_norm": 1.4225233820188825, + "learning_rate": 5.421187228766274e-07, + "loss": 0.898, + "step": 139910 + }, + { + "epoch": 10.842729280483553, + "grad_norm": 1.4437191504700064, + "learning_rate": 5.42157470551767e-07, + "loss": 0.8996, + "step": 139920 + }, + { + "epoch": 10.843504203959858, + "grad_norm": 1.469501281556425, + "learning_rate": 5.421962182269065e-07, + "loss": 0.8663, + "step": 139930 + }, + { + "epoch": 10.844279127436165, + "grad_norm": 1.450225822534679, + "learning_rate": 5.422349659020459e-07, + "loss": 0.9121, + "step": 139940 + }, + { + "epoch": 10.845054050912472, + "grad_norm": 1.3863303091761041, + "learning_rate": 5.422737135771854e-07, + "loss": 0.9059, + "step": 139950 + }, + { + "epoch": 10.845828974388779, + "grad_norm": 1.4348355013990186, + "learning_rate": 5.423124612523249e-07, + "loss": 0.8931, + "step": 139960 + }, + { + "epoch": 10.846603897865085, + "grad_norm": 1.5349554544495214, + "learning_rate": 5.423512089274645e-07, + "loss": 0.8828, + "step": 139970 + }, + { + "epoch": 10.847378821341392, + "grad_norm": 1.4397071653215787, + "learning_rate": 5.423899566026039e-07, + "loss": 0.875, + "step": 139980 + }, + { + "epoch": 10.848153744817699, + "grad_norm": 1.4582218703449756, + "learning_rate": 5.424287042777434e-07, + "loss": 0.9046, + "step": 139990 + }, + { + "epoch": 10.848928668294006, + "grad_norm": 1.5005840397674128, + "learning_rate": 5.424674519528829e-07, + "loss": 0.9096, + "step": 140000 + }, + { + "epoch": 10.848928668294006, + "eval_loss": 0.9040204286575317, + "eval_runtime": 329.7097, + "eval_samples_per_second": 34.791, + "eval_steps_per_second": 8.699, + "step": 140000 + }, + { + "epoch": 10.849703591770313, + "grad_norm": 1.498215607460944, + "learning_rate": 5.425061996280223e-07, + "loss": 0.8758, + "step": 140010 + }, + { + "epoch": 10.85047851524662, + "grad_norm": 1.3898296542063846, + "learning_rate": 5.425449473031619e-07, + "loss": 0.884, + "step": 140020 + }, + { + "epoch": 10.851253438722926, + "grad_norm": 1.4788761155080055, + "learning_rate": 5.425836949783014e-07, + "loss": 0.8855, + "step": 140030 + }, + { + "epoch": 10.852028362199233, + "grad_norm": 1.4281814358022353, + "learning_rate": 5.426224426534409e-07, + "loss": 0.9075, + "step": 140040 + }, + { + "epoch": 10.85280328567554, + "grad_norm": 1.4233394232278511, + "learning_rate": 5.426611903285803e-07, + "loss": 0.8931, + "step": 140050 + }, + { + "epoch": 10.853578209151847, + "grad_norm": 1.376560744728049, + "learning_rate": 5.426999380037198e-07, + "loss": 0.8859, + "step": 140060 + }, + { + "epoch": 10.854353132628153, + "grad_norm": 1.3812080462878082, + "learning_rate": 5.427386856788594e-07, + "loss": 0.8892, + "step": 140070 + }, + { + "epoch": 10.85512805610446, + "grad_norm": 1.4732046409887067, + "learning_rate": 5.427774333539988e-07, + "loss": 0.9093, + "step": 140080 + }, + { + "epoch": 10.855902979580767, + "grad_norm": 1.4045978247875242, + "learning_rate": 5.428161810291383e-07, + "loss": 0.8978, + "step": 140090 + }, + { + "epoch": 10.856677903057074, + "grad_norm": 1.4534737557947812, + "learning_rate": 5.428549287042778e-07, + "loss": 0.8856, + "step": 140100 + }, + { + "epoch": 10.857452826533379, + "grad_norm": 1.4769141432109973, + "learning_rate": 5.428936763794172e-07, + "loss": 0.8933, + "step": 140110 + }, + { + "epoch": 10.858227750009686, + "grad_norm": 1.4600843222532873, + "learning_rate": 5.429324240545568e-07, + "loss": 0.9241, + "step": 140120 + }, + { + "epoch": 10.859002673485993, + "grad_norm": 1.5955214313304256, + "learning_rate": 5.429711717296963e-07, + "loss": 0.9263, + "step": 140130 + }, + { + "epoch": 10.8597775969623, + "grad_norm": 1.5011210761458036, + "learning_rate": 5.430099194048358e-07, + "loss": 0.8962, + "step": 140140 + }, + { + "epoch": 10.860552520438606, + "grad_norm": 1.5121511127901963, + "learning_rate": 5.430486670799752e-07, + "loss": 0.9102, + "step": 140150 + }, + { + "epoch": 10.861327443914913, + "grad_norm": 1.4618249243339074, + "learning_rate": 5.430874147551147e-07, + "loss": 0.8942, + "step": 140160 + }, + { + "epoch": 10.86210236739122, + "grad_norm": 1.3773389571872865, + "learning_rate": 5.431261624302543e-07, + "loss": 0.8781, + "step": 140170 + }, + { + "epoch": 10.862877290867527, + "grad_norm": 1.4787629976482095, + "learning_rate": 5.431649101053937e-07, + "loss": 0.8756, + "step": 140180 + }, + { + "epoch": 10.863652214343833, + "grad_norm": 1.459010800020474, + "learning_rate": 5.432036577805332e-07, + "loss": 0.9163, + "step": 140190 + }, + { + "epoch": 10.86442713782014, + "grad_norm": 1.4190594894522757, + "learning_rate": 5.432424054556727e-07, + "loss": 0.8761, + "step": 140200 + }, + { + "epoch": 10.865202061296447, + "grad_norm": 1.4761562075204928, + "learning_rate": 5.432811531308123e-07, + "loss": 0.88, + "step": 140210 + }, + { + "epoch": 10.865976984772754, + "grad_norm": 1.3906371631142422, + "learning_rate": 5.433199008059517e-07, + "loss": 0.8796, + "step": 140220 + }, + { + "epoch": 10.86675190824906, + "grad_norm": 1.4507296953178492, + "learning_rate": 5.433586484810912e-07, + "loss": 0.8876, + "step": 140230 + }, + { + "epoch": 10.867526831725367, + "grad_norm": 1.450696347924159, + "learning_rate": 5.433973961562307e-07, + "loss": 0.8971, + "step": 140240 + }, + { + "epoch": 10.868301755201674, + "grad_norm": 1.4495549085438613, + "learning_rate": 5.434361438313701e-07, + "loss": 0.8918, + "step": 140250 + }, + { + "epoch": 10.869076678677981, + "grad_norm": 1.58050747002361, + "learning_rate": 5.434748915065097e-07, + "loss": 0.8822, + "step": 140260 + }, + { + "epoch": 10.869851602154288, + "grad_norm": 1.4099819499199777, + "learning_rate": 5.435136391816492e-07, + "loss": 0.8733, + "step": 140270 + }, + { + "epoch": 10.870626525630595, + "grad_norm": 1.5259959534605205, + "learning_rate": 5.435523868567887e-07, + "loss": 0.8927, + "step": 140280 + }, + { + "epoch": 10.871401449106902, + "grad_norm": 1.408443321700963, + "learning_rate": 5.435911345319281e-07, + "loss": 0.8904, + "step": 140290 + }, + { + "epoch": 10.872176372583208, + "grad_norm": 1.4294580677491247, + "learning_rate": 5.436298822070676e-07, + "loss": 0.9106, + "step": 140300 + }, + { + "epoch": 10.872951296059513, + "grad_norm": 1.409704525039811, + "learning_rate": 5.436686298822072e-07, + "loss": 0.9, + "step": 140310 + }, + { + "epoch": 10.87372621953582, + "grad_norm": 1.4577824502091707, + "learning_rate": 5.437073775573466e-07, + "loss": 0.8714, + "step": 140320 + }, + { + "epoch": 10.874501143012127, + "grad_norm": 1.4411126161001415, + "learning_rate": 5.437461252324861e-07, + "loss": 0.8856, + "step": 140330 + }, + { + "epoch": 10.875276066488434, + "grad_norm": 1.4786062004411598, + "learning_rate": 5.437848729076256e-07, + "loss": 0.8993, + "step": 140340 + }, + { + "epoch": 10.87605098996474, + "grad_norm": 1.4178492460938241, + "learning_rate": 5.438236205827651e-07, + "loss": 0.9126, + "step": 140350 + }, + { + "epoch": 10.876825913441047, + "grad_norm": 1.4262429949825721, + "learning_rate": 5.438623682579046e-07, + "loss": 0.8999, + "step": 140360 + }, + { + "epoch": 10.877600836917354, + "grad_norm": 1.4582560259344386, + "learning_rate": 5.439011159330441e-07, + "loss": 0.8884, + "step": 140370 + }, + { + "epoch": 10.878375760393661, + "grad_norm": 1.3893462098614373, + "learning_rate": 5.439398636081836e-07, + "loss": 0.8839, + "step": 140380 + }, + { + "epoch": 10.879150683869968, + "grad_norm": 1.4776451041390475, + "learning_rate": 5.43978611283323e-07, + "loss": 0.8722, + "step": 140390 + }, + { + "epoch": 10.879925607346275, + "grad_norm": 1.4280810355020304, + "learning_rate": 5.440173589584625e-07, + "loss": 0.8878, + "step": 140400 + }, + { + "epoch": 10.880700530822581, + "grad_norm": 1.4841219949987516, + "learning_rate": 5.440561066336021e-07, + "loss": 0.9097, + "step": 140410 + }, + { + "epoch": 10.881475454298888, + "grad_norm": 1.4850798884810141, + "learning_rate": 5.440948543087416e-07, + "loss": 0.886, + "step": 140420 + }, + { + "epoch": 10.882250377775195, + "grad_norm": 1.4735139170533822, + "learning_rate": 5.44133601983881e-07, + "loss": 0.8921, + "step": 140430 + }, + { + "epoch": 10.883025301251502, + "grad_norm": 1.5236207781679423, + "learning_rate": 5.441723496590205e-07, + "loss": 0.8973, + "step": 140440 + }, + { + "epoch": 10.883800224727809, + "grad_norm": 1.4860702891635844, + "learning_rate": 5.4421109733416e-07, + "loss": 0.8851, + "step": 140450 + }, + { + "epoch": 10.884575148204116, + "grad_norm": 1.5103760459204845, + "learning_rate": 5.442498450092995e-07, + "loss": 0.8844, + "step": 140460 + }, + { + "epoch": 10.885350071680422, + "grad_norm": 1.4193392501566542, + "learning_rate": 5.44288592684439e-07, + "loss": 0.8959, + "step": 140470 + }, + { + "epoch": 10.886124995156727, + "grad_norm": 1.401276687069942, + "learning_rate": 5.443273403595785e-07, + "loss": 0.8976, + "step": 140480 + }, + { + "epoch": 10.886899918633034, + "grad_norm": 1.4680475775991606, + "learning_rate": 5.44366088034718e-07, + "loss": 0.9088, + "step": 140490 + }, + { + "epoch": 10.887674842109341, + "grad_norm": 1.4542744019048313, + "learning_rate": 5.444048357098574e-07, + "loss": 0.8967, + "step": 140500 + }, + { + "epoch": 10.887674842109341, + "eval_loss": 0.9039192199707031, + "eval_runtime": 328.3719, + "eval_samples_per_second": 34.933, + "eval_steps_per_second": 8.734, + "step": 140500 + }, + { + "epoch": 10.888449765585648, + "grad_norm": 1.4606743910776694, + "learning_rate": 5.44443583384997e-07, + "loss": 0.8882, + "step": 140510 + }, + { + "epoch": 10.889224689061955, + "grad_norm": 1.517295570999563, + "learning_rate": 5.444823310601365e-07, + "loss": 0.8898, + "step": 140520 + }, + { + "epoch": 10.889999612538261, + "grad_norm": 1.4254908878913604, + "learning_rate": 5.445210787352759e-07, + "loss": 0.8933, + "step": 140530 + }, + { + "epoch": 10.890774536014568, + "grad_norm": 1.5754769268183957, + "learning_rate": 5.445598264104154e-07, + "loss": 0.913, + "step": 140540 + }, + { + "epoch": 10.891549459490875, + "grad_norm": 1.3891764759021816, + "learning_rate": 5.445985740855549e-07, + "loss": 0.8717, + "step": 140550 + }, + { + "epoch": 10.892324382967182, + "grad_norm": 1.3968246129878452, + "learning_rate": 5.446373217606945e-07, + "loss": 0.8842, + "step": 140560 + }, + { + "epoch": 10.893099306443489, + "grad_norm": 1.48189971464619, + "learning_rate": 5.446760694358339e-07, + "loss": 0.9181, + "step": 140570 + }, + { + "epoch": 10.893874229919795, + "grad_norm": 1.366344344735334, + "learning_rate": 5.447148171109734e-07, + "loss": 0.8842, + "step": 140580 + }, + { + "epoch": 10.894649153396102, + "grad_norm": 1.5213171235404996, + "learning_rate": 5.447535647861129e-07, + "loss": 0.8947, + "step": 140590 + }, + { + "epoch": 10.895424076872409, + "grad_norm": 1.3660281049466434, + "learning_rate": 5.447923124612523e-07, + "loss": 0.9115, + "step": 140600 + }, + { + "epoch": 10.896199000348716, + "grad_norm": 1.4954540158899845, + "learning_rate": 5.448310601363919e-07, + "loss": 0.8798, + "step": 140610 + }, + { + "epoch": 10.896973923825023, + "grad_norm": 1.4904605291444226, + "learning_rate": 5.448698078115314e-07, + "loss": 0.9026, + "step": 140620 + }, + { + "epoch": 10.89774884730133, + "grad_norm": 1.3910194202252106, + "learning_rate": 5.449085554866709e-07, + "loss": 0.8898, + "step": 140630 + }, + { + "epoch": 10.898523770777636, + "grad_norm": 1.528643647951811, + "learning_rate": 5.449473031618103e-07, + "loss": 0.9188, + "step": 140640 + }, + { + "epoch": 10.899298694253943, + "grad_norm": 1.4859602647383041, + "learning_rate": 5.449860508369498e-07, + "loss": 0.8799, + "step": 140650 + }, + { + "epoch": 10.90007361773025, + "grad_norm": 1.4788352538228753, + "learning_rate": 5.450247985120894e-07, + "loss": 0.8848, + "step": 140660 + }, + { + "epoch": 10.900848541206557, + "grad_norm": 1.540997246779615, + "learning_rate": 5.450635461872288e-07, + "loss": 0.8701, + "step": 140670 + }, + { + "epoch": 10.901623464682862, + "grad_norm": 1.4559693920622219, + "learning_rate": 5.451022938623683e-07, + "loss": 0.8818, + "step": 140680 + }, + { + "epoch": 10.902398388159169, + "grad_norm": 1.4915011643090168, + "learning_rate": 5.451410415375078e-07, + "loss": 0.8866, + "step": 140690 + }, + { + "epoch": 10.903173311635475, + "grad_norm": 1.4378887772056976, + "learning_rate": 5.451797892126473e-07, + "loss": 0.8813, + "step": 140700 + }, + { + "epoch": 10.903948235111782, + "grad_norm": 1.5069180890358032, + "learning_rate": 5.452185368877868e-07, + "loss": 0.878, + "step": 140710 + }, + { + "epoch": 10.904723158588089, + "grad_norm": 1.4596589396728095, + "learning_rate": 5.452572845629263e-07, + "loss": 0.8966, + "step": 140720 + }, + { + "epoch": 10.905498082064396, + "grad_norm": 1.4700318060509296, + "learning_rate": 5.452960322380658e-07, + "loss": 0.9055, + "step": 140730 + }, + { + "epoch": 10.906273005540703, + "grad_norm": 1.4453069242002587, + "learning_rate": 5.453347799132052e-07, + "loss": 0.8913, + "step": 140740 + }, + { + "epoch": 10.90704792901701, + "grad_norm": 1.434161803928886, + "learning_rate": 5.453735275883447e-07, + "loss": 0.885, + "step": 140750 + }, + { + "epoch": 10.907822852493316, + "grad_norm": 1.5261388007941028, + "learning_rate": 5.454122752634843e-07, + "loss": 0.8983, + "step": 140760 + }, + { + "epoch": 10.908597775969623, + "grad_norm": 1.4709297653698024, + "learning_rate": 5.454510229386238e-07, + "loss": 0.8975, + "step": 140770 + }, + { + "epoch": 10.90937269944593, + "grad_norm": 1.411149023358321, + "learning_rate": 5.454897706137632e-07, + "loss": 0.8887, + "step": 140780 + }, + { + "epoch": 10.910147622922237, + "grad_norm": 1.476913429380433, + "learning_rate": 5.455285182889027e-07, + "loss": 0.9201, + "step": 140790 + }, + { + "epoch": 10.910922546398544, + "grad_norm": 1.4975689086715338, + "learning_rate": 5.455672659640422e-07, + "loss": 0.8903, + "step": 140800 + }, + { + "epoch": 10.91169746987485, + "grad_norm": 1.3990246970633269, + "learning_rate": 5.456060136391817e-07, + "loss": 0.8925, + "step": 140810 + }, + { + "epoch": 10.912472393351157, + "grad_norm": 1.567943486654623, + "learning_rate": 5.456447613143212e-07, + "loss": 0.8896, + "step": 140820 + }, + { + "epoch": 10.913247316827464, + "grad_norm": 1.439455955647396, + "learning_rate": 5.456835089894607e-07, + "loss": 0.8825, + "step": 140830 + }, + { + "epoch": 10.91402224030377, + "grad_norm": 1.3842334824554428, + "learning_rate": 5.457222566646002e-07, + "loss": 0.8814, + "step": 140840 + }, + { + "epoch": 10.914797163780076, + "grad_norm": 1.4585641395957047, + "learning_rate": 5.457610043397396e-07, + "loss": 0.8836, + "step": 140850 + }, + { + "epoch": 10.915572087256383, + "grad_norm": 1.4797832856269244, + "learning_rate": 5.457997520148792e-07, + "loss": 0.8736, + "step": 140860 + }, + { + "epoch": 10.91634701073269, + "grad_norm": 1.438444468820255, + "learning_rate": 5.458384996900187e-07, + "loss": 0.8983, + "step": 140870 + }, + { + "epoch": 10.917121934208996, + "grad_norm": 1.4341846559196845, + "learning_rate": 5.458772473651581e-07, + "loss": 0.9096, + "step": 140880 + }, + { + "epoch": 10.917896857685303, + "grad_norm": 1.508255062355546, + "learning_rate": 5.459159950402976e-07, + "loss": 0.8732, + "step": 140890 + }, + { + "epoch": 10.91867178116161, + "grad_norm": 1.4367906544568043, + "learning_rate": 5.459547427154372e-07, + "loss": 0.8973, + "step": 140900 + }, + { + "epoch": 10.919446704637917, + "grad_norm": 1.425774429436999, + "learning_rate": 5.459934903905767e-07, + "loss": 0.8804, + "step": 140910 + }, + { + "epoch": 10.920221628114223, + "grad_norm": 1.4031171608432482, + "learning_rate": 5.460322380657161e-07, + "loss": 0.8986, + "step": 140920 + }, + { + "epoch": 10.92099655159053, + "grad_norm": 1.4079414106514976, + "learning_rate": 5.460709857408556e-07, + "loss": 0.8965, + "step": 140930 + }, + { + "epoch": 10.921771475066837, + "grad_norm": 1.3905855294523135, + "learning_rate": 5.461097334159951e-07, + "loss": 0.8795, + "step": 140940 + }, + { + "epoch": 10.922546398543144, + "grad_norm": 1.4695879745056175, + "learning_rate": 5.461484810911346e-07, + "loss": 0.8837, + "step": 140950 + }, + { + "epoch": 10.92332132201945, + "grad_norm": 1.460166101370618, + "learning_rate": 5.461872287662741e-07, + "loss": 0.8865, + "step": 140960 + }, + { + "epoch": 10.924096245495758, + "grad_norm": 1.4740836439779577, + "learning_rate": 5.462259764414136e-07, + "loss": 0.8777, + "step": 140970 + }, + { + "epoch": 10.924871168972064, + "grad_norm": 1.4620525176853765, + "learning_rate": 5.462647241165531e-07, + "loss": 0.8943, + "step": 140980 + }, + { + "epoch": 10.925646092448371, + "grad_norm": 1.451236250199754, + "learning_rate": 5.463034717916925e-07, + "loss": 0.8914, + "step": 140990 + }, + { + "epoch": 10.926421015924678, + "grad_norm": 1.418956212174637, + "learning_rate": 5.463422194668321e-07, + "loss": 0.8782, + "step": 141000 + }, + { + "epoch": 10.926421015924678, + "eval_loss": 0.9036675691604614, + "eval_runtime": 326.8758, + "eval_samples_per_second": 35.093, + "eval_steps_per_second": 8.774, + "step": 141000 + }, + { + "epoch": 10.927195939400985, + "grad_norm": 1.50482251978917, + "learning_rate": 5.463809671419716e-07, + "loss": 0.8869, + "step": 141010 + }, + { + "epoch": 10.927970862877292, + "grad_norm": 1.5044617942360035, + "learning_rate": 5.46419714817111e-07, + "loss": 0.8908, + "step": 141020 + }, + { + "epoch": 10.928745786353598, + "grad_norm": 1.4633923604302261, + "learning_rate": 5.464584624922505e-07, + "loss": 0.8862, + "step": 141030 + }, + { + "epoch": 10.929520709829905, + "grad_norm": 1.5686342093166346, + "learning_rate": 5.4649721016739e-07, + "loss": 0.8738, + "step": 141040 + }, + { + "epoch": 10.93029563330621, + "grad_norm": 1.4622961149322258, + "learning_rate": 5.465359578425296e-07, + "loss": 0.8758, + "step": 141050 + }, + { + "epoch": 10.931070556782517, + "grad_norm": 1.479063061263029, + "learning_rate": 5.46574705517669e-07, + "loss": 0.9064, + "step": 141060 + }, + { + "epoch": 10.931845480258824, + "grad_norm": 1.5411325610008322, + "learning_rate": 5.466134531928085e-07, + "loss": 0.8882, + "step": 141070 + }, + { + "epoch": 10.93262040373513, + "grad_norm": 1.459913620931042, + "learning_rate": 5.46652200867948e-07, + "loss": 0.8834, + "step": 141080 + }, + { + "epoch": 10.933395327211437, + "grad_norm": 1.4969834872844663, + "learning_rate": 5.466909485430874e-07, + "loss": 0.8975, + "step": 141090 + }, + { + "epoch": 10.934170250687744, + "grad_norm": 1.4285585037473278, + "learning_rate": 5.46729696218227e-07, + "loss": 0.904, + "step": 141100 + }, + { + "epoch": 10.934945174164051, + "grad_norm": 1.509660288781936, + "learning_rate": 5.467684438933665e-07, + "loss": 0.9095, + "step": 141110 + }, + { + "epoch": 10.935720097640358, + "grad_norm": 1.5607169591442451, + "learning_rate": 5.46807191568506e-07, + "loss": 0.904, + "step": 141120 + }, + { + "epoch": 10.936495021116665, + "grad_norm": 1.4088103505332594, + "learning_rate": 5.468459392436454e-07, + "loss": 0.9063, + "step": 141130 + }, + { + "epoch": 10.937269944592972, + "grad_norm": 1.4676200044853172, + "learning_rate": 5.468846869187849e-07, + "loss": 0.8956, + "step": 141140 + }, + { + "epoch": 10.938044868069278, + "grad_norm": 1.4733347573425872, + "learning_rate": 5.469234345939245e-07, + "loss": 0.8807, + "step": 141150 + }, + { + "epoch": 10.938819791545585, + "grad_norm": 1.418648521960202, + "learning_rate": 5.469621822690639e-07, + "loss": 0.8914, + "step": 141160 + }, + { + "epoch": 10.939594715021892, + "grad_norm": 1.4200224133477257, + "learning_rate": 5.470009299442034e-07, + "loss": 0.9079, + "step": 141170 + }, + { + "epoch": 10.940369638498199, + "grad_norm": 1.5224676417465923, + "learning_rate": 5.470396776193429e-07, + "loss": 0.8957, + "step": 141180 + }, + { + "epoch": 10.941144561974506, + "grad_norm": 1.3922227510855023, + "learning_rate": 5.470784252944824e-07, + "loss": 0.8919, + "step": 141190 + }, + { + "epoch": 10.941919485450812, + "grad_norm": 1.5270111709263445, + "learning_rate": 5.471171729696219e-07, + "loss": 0.908, + "step": 141200 + }, + { + "epoch": 10.94269440892712, + "grad_norm": 1.4526193327468715, + "learning_rate": 5.471559206447614e-07, + "loss": 0.8893, + "step": 141210 + }, + { + "epoch": 10.943469332403424, + "grad_norm": 1.4566904974140462, + "learning_rate": 5.471946683199009e-07, + "loss": 0.8994, + "step": 141220 + }, + { + "epoch": 10.944244255879731, + "grad_norm": 1.4407055372634567, + "learning_rate": 5.472334159950403e-07, + "loss": 0.8895, + "step": 141230 + }, + { + "epoch": 10.945019179356038, + "grad_norm": 1.4476773761110582, + "learning_rate": 5.472721636701798e-07, + "loss": 0.87, + "step": 141240 + }, + { + "epoch": 10.945794102832345, + "grad_norm": 1.531360440747895, + "learning_rate": 5.473109113453194e-07, + "loss": 0.9001, + "step": 141250 + }, + { + "epoch": 10.946569026308651, + "grad_norm": 1.428279649650142, + "learning_rate": 5.473496590204589e-07, + "loss": 0.8892, + "step": 141260 + }, + { + "epoch": 10.947343949784958, + "grad_norm": 1.4318447996890038, + "learning_rate": 5.473884066955983e-07, + "loss": 0.8897, + "step": 141270 + }, + { + "epoch": 10.948118873261265, + "grad_norm": 1.4546598734875555, + "learning_rate": 5.474271543707378e-07, + "loss": 0.9007, + "step": 141280 + }, + { + "epoch": 10.948893796737572, + "grad_norm": 1.4602100624693102, + "learning_rate": 5.474659020458773e-07, + "loss": 0.9104, + "step": 141290 + }, + { + "epoch": 10.949668720213879, + "grad_norm": 1.4515904089017113, + "learning_rate": 5.475046497210168e-07, + "loss": 0.8658, + "step": 141300 + }, + { + "epoch": 10.950443643690186, + "grad_norm": 1.4706405889182004, + "learning_rate": 5.475433973961563e-07, + "loss": 0.8926, + "step": 141310 + }, + { + "epoch": 10.951218567166492, + "grad_norm": 1.4685805545215675, + "learning_rate": 5.475821450712958e-07, + "loss": 0.8997, + "step": 141320 + }, + { + "epoch": 10.9519934906428, + "grad_norm": 1.477148843287869, + "learning_rate": 5.476208927464353e-07, + "loss": 0.8771, + "step": 141330 + }, + { + "epoch": 10.952768414119106, + "grad_norm": 1.4026504837860467, + "learning_rate": 5.476596404215747e-07, + "loss": 0.883, + "step": 141340 + }, + { + "epoch": 10.953543337595413, + "grad_norm": 1.4234829220638971, + "learning_rate": 5.476983880967143e-07, + "loss": 0.8897, + "step": 141350 + }, + { + "epoch": 10.95431826107172, + "grad_norm": 1.5125479093688368, + "learning_rate": 5.477371357718538e-07, + "loss": 0.8972, + "step": 141360 + }, + { + "epoch": 10.955093184548026, + "grad_norm": 1.4317599450685103, + "learning_rate": 5.477758834469932e-07, + "loss": 0.9021, + "step": 141370 + }, + { + "epoch": 10.955868108024333, + "grad_norm": 1.401035709496773, + "learning_rate": 5.478146311221327e-07, + "loss": 0.8909, + "step": 141380 + }, + { + "epoch": 10.95664303150064, + "grad_norm": 1.3907043164703305, + "learning_rate": 5.478533787972722e-07, + "loss": 0.905, + "step": 141390 + }, + { + "epoch": 10.957417954976947, + "grad_norm": 1.4097685934492628, + "learning_rate": 5.478921264724118e-07, + "loss": 0.8852, + "step": 141400 + }, + { + "epoch": 10.958192878453254, + "grad_norm": 1.3521595026764652, + "learning_rate": 5.479308741475512e-07, + "loss": 0.8659, + "step": 141410 + }, + { + "epoch": 10.958967801929559, + "grad_norm": 1.4997505976963663, + "learning_rate": 5.479696218226907e-07, + "loss": 0.8914, + "step": 141420 + }, + { + "epoch": 10.959742725405865, + "grad_norm": 1.4483835008979855, + "learning_rate": 5.480083694978302e-07, + "loss": 0.8807, + "step": 141430 + }, + { + "epoch": 10.960517648882172, + "grad_norm": 1.379555572102087, + "learning_rate": 5.480471171729696e-07, + "loss": 0.9019, + "step": 141440 + }, + { + "epoch": 10.961292572358479, + "grad_norm": 1.4830220592856598, + "learning_rate": 5.480858648481092e-07, + "loss": 0.8906, + "step": 141450 + }, + { + "epoch": 10.962067495834786, + "grad_norm": 1.5664137794533661, + "learning_rate": 5.481246125232487e-07, + "loss": 0.8919, + "step": 141460 + }, + { + "epoch": 10.962842419311093, + "grad_norm": 1.482373378421743, + "learning_rate": 5.481633601983882e-07, + "loss": 0.8843, + "step": 141470 + }, + { + "epoch": 10.9636173427874, + "grad_norm": 1.5110516845122866, + "learning_rate": 5.482021078735276e-07, + "loss": 0.8946, + "step": 141480 + }, + { + "epoch": 10.964392266263706, + "grad_norm": 1.4894857666850334, + "learning_rate": 5.482408555486671e-07, + "loss": 0.8709, + "step": 141490 + }, + { + "epoch": 10.965167189740013, + "grad_norm": 1.5799801960845343, + "learning_rate": 5.482796032238067e-07, + "loss": 0.8899, + "step": 141500 + }, + { + "epoch": 10.965167189740013, + "eval_loss": 0.903336226940155, + "eval_runtime": 326.9473, + "eval_samples_per_second": 35.085, + "eval_steps_per_second": 8.772, + "step": 141500 + }, + { + "epoch": 10.96594211321632, + "grad_norm": 1.4220512888428727, + "learning_rate": 5.483183508989461e-07, + "loss": 0.9107, + "step": 141510 + }, + { + "epoch": 10.966717036692627, + "grad_norm": 1.4042333002165077, + "learning_rate": 5.483570985740856e-07, + "loss": 0.8683, + "step": 141520 + }, + { + "epoch": 10.967491960168934, + "grad_norm": 1.449193250302443, + "learning_rate": 5.483958462492251e-07, + "loss": 0.882, + "step": 141530 + }, + { + "epoch": 10.96826688364524, + "grad_norm": 1.3575180107818017, + "learning_rate": 5.484345939243645e-07, + "loss": 0.8932, + "step": 141540 + }, + { + "epoch": 10.969041807121547, + "grad_norm": 1.4394097208814738, + "learning_rate": 5.484733415995041e-07, + "loss": 0.9129, + "step": 141550 + }, + { + "epoch": 10.969816730597854, + "grad_norm": 1.443807947887313, + "learning_rate": 5.485120892746436e-07, + "loss": 0.8902, + "step": 141560 + }, + { + "epoch": 10.97059165407416, + "grad_norm": 1.4853022354803729, + "learning_rate": 5.485508369497831e-07, + "loss": 0.9035, + "step": 141570 + }, + { + "epoch": 10.971366577550468, + "grad_norm": 1.48659155287148, + "learning_rate": 5.485895846249225e-07, + "loss": 0.9016, + "step": 141580 + }, + { + "epoch": 10.972141501026774, + "grad_norm": 1.468832633704607, + "learning_rate": 5.48628332300062e-07, + "loss": 0.8931, + "step": 141590 + }, + { + "epoch": 10.97291642450308, + "grad_norm": 1.441422596306763, + "learning_rate": 5.486670799752016e-07, + "loss": 0.8756, + "step": 141600 + }, + { + "epoch": 10.973691347979386, + "grad_norm": 1.399212924887847, + "learning_rate": 5.48705827650341e-07, + "loss": 0.8745, + "step": 141610 + }, + { + "epoch": 10.974466271455693, + "grad_norm": 1.4374299061575975, + "learning_rate": 5.487445753254805e-07, + "loss": 0.902, + "step": 141620 + }, + { + "epoch": 10.975241194932, + "grad_norm": 1.4829649889835412, + "learning_rate": 5.4878332300062e-07, + "loss": 0.8807, + "step": 141630 + }, + { + "epoch": 10.976016118408307, + "grad_norm": 1.5370493320879692, + "learning_rate": 5.488220706757596e-07, + "loss": 0.8925, + "step": 141640 + }, + { + "epoch": 10.976791041884614, + "grad_norm": 1.5157811998508284, + "learning_rate": 5.48860818350899e-07, + "loss": 0.892, + "step": 141650 + }, + { + "epoch": 10.97756596536092, + "grad_norm": 1.4965757754372468, + "learning_rate": 5.488995660260385e-07, + "loss": 0.9067, + "step": 141660 + }, + { + "epoch": 10.978340888837227, + "grad_norm": 1.42871864833659, + "learning_rate": 5.48938313701178e-07, + "loss": 0.8837, + "step": 141670 + }, + { + "epoch": 10.979115812313534, + "grad_norm": 1.4178200672035872, + "learning_rate": 5.489770613763174e-07, + "loss": 0.8955, + "step": 141680 + }, + { + "epoch": 10.97989073578984, + "grad_norm": 1.4408560765143854, + "learning_rate": 5.49015809051457e-07, + "loss": 0.892, + "step": 141690 + }, + { + "epoch": 10.980665659266148, + "grad_norm": 1.5331928039352982, + "learning_rate": 5.490545567265965e-07, + "loss": 0.8966, + "step": 141700 + }, + { + "epoch": 10.981440582742454, + "grad_norm": 1.4713714262007727, + "learning_rate": 5.49093304401736e-07, + "loss": 0.883, + "step": 141710 + }, + { + "epoch": 10.982215506218761, + "grad_norm": 1.4860661166098683, + "learning_rate": 5.491320520768754e-07, + "loss": 0.8886, + "step": 141720 + }, + { + "epoch": 10.982990429695068, + "grad_norm": 1.4352654492967334, + "learning_rate": 5.491707997520149e-07, + "loss": 0.8918, + "step": 141730 + }, + { + "epoch": 10.983765353171375, + "grad_norm": 1.4430242985520585, + "learning_rate": 5.492095474271545e-07, + "loss": 0.8852, + "step": 141740 + }, + { + "epoch": 10.984540276647682, + "grad_norm": 1.3894218748049674, + "learning_rate": 5.492482951022939e-07, + "loss": 0.8868, + "step": 141750 + }, + { + "epoch": 10.985315200123988, + "grad_norm": 1.4983995912794974, + "learning_rate": 5.492870427774334e-07, + "loss": 0.9014, + "step": 141760 + }, + { + "epoch": 10.986090123600295, + "grad_norm": 1.4358964669236502, + "learning_rate": 5.493257904525729e-07, + "loss": 0.8836, + "step": 141770 + }, + { + "epoch": 10.986865047076602, + "grad_norm": 1.4728422367813876, + "learning_rate": 5.493645381277124e-07, + "loss": 0.8818, + "step": 141780 + }, + { + "epoch": 10.987639970552907, + "grad_norm": 1.426054260534058, + "learning_rate": 5.494032858028519e-07, + "loss": 0.8767, + "step": 141790 + }, + { + "epoch": 10.988414894029214, + "grad_norm": 1.4337189916680988, + "learning_rate": 5.494420334779914e-07, + "loss": 0.888, + "step": 141800 + }, + { + "epoch": 10.98918981750552, + "grad_norm": 1.4825069653492846, + "learning_rate": 5.494807811531309e-07, + "loss": 0.8948, + "step": 141810 + }, + { + "epoch": 10.989964740981828, + "grad_norm": 1.373305686472056, + "learning_rate": 5.495195288282703e-07, + "loss": 0.8764, + "step": 141820 + }, + { + "epoch": 10.990739664458134, + "grad_norm": 1.5637622279847163, + "learning_rate": 5.495582765034098e-07, + "loss": 0.8875, + "step": 141830 + }, + { + "epoch": 10.991514587934441, + "grad_norm": 1.512421887495804, + "learning_rate": 5.495970241785494e-07, + "loss": 0.9241, + "step": 141840 + }, + { + "epoch": 10.992289511410748, + "grad_norm": 1.4706448695033802, + "learning_rate": 5.496357718536889e-07, + "loss": 0.8819, + "step": 141850 + }, + { + "epoch": 10.993064434887055, + "grad_norm": 1.505838735734467, + "learning_rate": 5.496745195288283e-07, + "loss": 0.8997, + "step": 141860 + }, + { + "epoch": 10.993839358363362, + "grad_norm": 1.4049566140494822, + "learning_rate": 5.497132672039678e-07, + "loss": 0.8656, + "step": 141870 + }, + { + "epoch": 10.994614281839668, + "grad_norm": 1.5022493604581173, + "learning_rate": 5.497520148791073e-07, + "loss": 0.8676, + "step": 141880 + }, + { + "epoch": 10.995389205315975, + "grad_norm": 1.4977119286765423, + "learning_rate": 5.497907625542468e-07, + "loss": 0.9047, + "step": 141890 + }, + { + "epoch": 10.996164128792282, + "grad_norm": 1.4680512547318814, + "learning_rate": 5.498295102293863e-07, + "loss": 0.9059, + "step": 141900 + }, + { + "epoch": 10.996939052268589, + "grad_norm": 1.3633815142530863, + "learning_rate": 5.498682579045258e-07, + "loss": 0.8799, + "step": 141910 + }, + { + "epoch": 10.997713975744896, + "grad_norm": 1.465474738899315, + "learning_rate": 5.499070055796653e-07, + "loss": 0.9, + "step": 141920 + }, + { + "epoch": 10.998488899221202, + "grad_norm": 1.5148027130250399, + "learning_rate": 5.499457532548047e-07, + "loss": 0.9045, + "step": 141930 + }, + { + "epoch": 10.99926382269751, + "grad_norm": 1.4614771802532418, + "learning_rate": 5.499845009299443e-07, + "loss": 0.8979, + "step": 141940 + }, + { + "epoch": 11.000038746173816, + "grad_norm": 1.4642591493303583, + "learning_rate": 5.500232486050838e-07, + "loss": 0.8836, + "step": 141950 + }, + { + "epoch": 11.000813669650123, + "grad_norm": 1.4755308524341693, + "learning_rate": 5.500619962802232e-07, + "loss": 0.8811, + "step": 141960 + }, + { + "epoch": 11.001588593126428, + "grad_norm": 1.5979810767002083, + "learning_rate": 5.501007439553627e-07, + "loss": 0.8761, + "step": 141970 + }, + { + "epoch": 11.002363516602735, + "grad_norm": 1.4179595242910137, + "learning_rate": 5.501394916305022e-07, + "loss": 0.8771, + "step": 141980 + }, + { + "epoch": 11.003138440079042, + "grad_norm": 1.561987221140231, + "learning_rate": 5.501782393056418e-07, + "loss": 0.89, + "step": 141990 + }, + { + "epoch": 11.003913363555348, + "grad_norm": 1.5867843881767438, + "learning_rate": 5.502169869807812e-07, + "loss": 0.8996, + "step": 142000 + }, + { + "epoch": 11.003913363555348, + "eval_loss": 0.9033332467079163, + "eval_runtime": 328.252, + "eval_samples_per_second": 34.946, + "eval_steps_per_second": 8.737, + "step": 142000 + }, + { + "epoch": 11.004688287031655, + "grad_norm": 1.454498317106442, + "learning_rate": 5.502557346559207e-07, + "loss": 0.8698, + "step": 142010 + }, + { + "epoch": 11.005463210507962, + "grad_norm": 1.4815705667756427, + "learning_rate": 5.502944823310602e-07, + "loss": 0.8953, + "step": 142020 + }, + { + "epoch": 11.006238133984269, + "grad_norm": 1.485571174539126, + "learning_rate": 5.503332300061996e-07, + "loss": 0.8774, + "step": 142030 + }, + { + "epoch": 11.007013057460576, + "grad_norm": 1.4727772726457342, + "learning_rate": 5.503719776813392e-07, + "loss": 0.886, + "step": 142040 + }, + { + "epoch": 11.007787980936882, + "grad_norm": 1.535082101451502, + "learning_rate": 5.504107253564787e-07, + "loss": 0.9028, + "step": 142050 + }, + { + "epoch": 11.00856290441319, + "grad_norm": 1.4231055153449408, + "learning_rate": 5.504494730316182e-07, + "loss": 0.8914, + "step": 142060 + }, + { + "epoch": 11.009337827889496, + "grad_norm": 1.4714165189280264, + "learning_rate": 5.504882207067576e-07, + "loss": 0.8776, + "step": 142070 + }, + { + "epoch": 11.010112751365803, + "grad_norm": 1.4334711626008463, + "learning_rate": 5.505269683818971e-07, + "loss": 0.8956, + "step": 142080 + }, + { + "epoch": 11.01088767484211, + "grad_norm": 1.4233921654245751, + "learning_rate": 5.505657160570367e-07, + "loss": 0.9133, + "step": 142090 + }, + { + "epoch": 11.011662598318416, + "grad_norm": 1.517728383505297, + "learning_rate": 5.506044637321761e-07, + "loss": 0.8911, + "step": 142100 + }, + { + "epoch": 11.012437521794723, + "grad_norm": 1.4043528778684995, + "learning_rate": 5.506432114073156e-07, + "loss": 0.9122, + "step": 142110 + }, + { + "epoch": 11.01321244527103, + "grad_norm": 1.5339381018447873, + "learning_rate": 5.506819590824551e-07, + "loss": 0.8992, + "step": 142120 + }, + { + "epoch": 11.013987368747337, + "grad_norm": 1.4756602168671507, + "learning_rate": 5.507207067575946e-07, + "loss": 0.9003, + "step": 142130 + }, + { + "epoch": 11.014762292223644, + "grad_norm": 1.5540776844173263, + "learning_rate": 5.507594544327341e-07, + "loss": 0.8833, + "step": 142140 + }, + { + "epoch": 11.01553721569995, + "grad_norm": 1.495720701610157, + "learning_rate": 5.507982021078736e-07, + "loss": 0.8809, + "step": 142150 + }, + { + "epoch": 11.016312139176256, + "grad_norm": 1.4736479644192726, + "learning_rate": 5.508369497830131e-07, + "loss": 0.8895, + "step": 142160 + }, + { + "epoch": 11.017087062652562, + "grad_norm": 1.5452685194809805, + "learning_rate": 5.508756974581525e-07, + "loss": 0.8848, + "step": 142170 + }, + { + "epoch": 11.01786198612887, + "grad_norm": 1.4198678146177206, + "learning_rate": 5.50914445133292e-07, + "loss": 0.8986, + "step": 142180 + }, + { + "epoch": 11.018636909605176, + "grad_norm": 1.461837355501242, + "learning_rate": 5.509531928084316e-07, + "loss": 0.9041, + "step": 142190 + }, + { + "epoch": 11.019411833081483, + "grad_norm": 1.5213704963744172, + "learning_rate": 5.509919404835711e-07, + "loss": 0.8852, + "step": 142200 + }, + { + "epoch": 11.02018675655779, + "grad_norm": 1.3318654901986684, + "learning_rate": 5.510306881587105e-07, + "loss": 0.8855, + "step": 142210 + }, + { + "epoch": 11.020961680034096, + "grad_norm": 1.4614062396681138, + "learning_rate": 5.5106943583385e-07, + "loss": 0.8779, + "step": 142220 + }, + { + "epoch": 11.021736603510403, + "grad_norm": 1.5063789252715512, + "learning_rate": 5.511081835089896e-07, + "loss": 0.8918, + "step": 142230 + }, + { + "epoch": 11.02251152698671, + "grad_norm": 1.4638233906870777, + "learning_rate": 5.51146931184129e-07, + "loss": 0.8798, + "step": 142240 + }, + { + "epoch": 11.023286450463017, + "grad_norm": 1.5527374260020363, + "learning_rate": 5.511856788592685e-07, + "loss": 0.8938, + "step": 142250 + }, + { + "epoch": 11.024061373939324, + "grad_norm": 1.511083256843584, + "learning_rate": 5.51224426534408e-07, + "loss": 0.8846, + "step": 142260 + }, + { + "epoch": 11.02483629741563, + "grad_norm": 1.4733748841849654, + "learning_rate": 5.512631742095475e-07, + "loss": 0.8679, + "step": 142270 + }, + { + "epoch": 11.025611220891937, + "grad_norm": 1.5036006310826904, + "learning_rate": 5.51301921884687e-07, + "loss": 0.9048, + "step": 142280 + }, + { + "epoch": 11.026386144368244, + "grad_norm": 1.520617273965796, + "learning_rate": 5.513406695598265e-07, + "loss": 0.8828, + "step": 142290 + }, + { + "epoch": 11.02716106784455, + "grad_norm": 1.4899586654395358, + "learning_rate": 5.51379417234966e-07, + "loss": 0.8879, + "step": 142300 + }, + { + "epoch": 11.027935991320858, + "grad_norm": 1.4946198831600295, + "learning_rate": 5.514181649101054e-07, + "loss": 0.8698, + "step": 142310 + }, + { + "epoch": 11.028710914797164, + "grad_norm": 1.4936270214276197, + "learning_rate": 5.514569125852449e-07, + "loss": 0.897, + "step": 142320 + }, + { + "epoch": 11.029485838273471, + "grad_norm": 1.4913491703394932, + "learning_rate": 5.514956602603845e-07, + "loss": 0.8965, + "step": 142330 + }, + { + "epoch": 11.030260761749776, + "grad_norm": 1.4352373464969397, + "learning_rate": 5.51534407935524e-07, + "loss": 0.8835, + "step": 142340 + }, + { + "epoch": 11.031035685226083, + "grad_norm": 1.5439641999422131, + "learning_rate": 5.515731556106634e-07, + "loss": 0.8744, + "step": 142350 + }, + { + "epoch": 11.03181060870239, + "grad_norm": 1.4510613753364308, + "learning_rate": 5.516119032858029e-07, + "loss": 0.8973, + "step": 142360 + }, + { + "epoch": 11.032585532178697, + "grad_norm": 1.4791455269003806, + "learning_rate": 5.516506509609424e-07, + "loss": 0.8806, + "step": 142370 + }, + { + "epoch": 11.033360455655004, + "grad_norm": 1.501376289747702, + "learning_rate": 5.516893986360819e-07, + "loss": 0.9038, + "step": 142380 + }, + { + "epoch": 11.03413537913131, + "grad_norm": 1.4556457397841838, + "learning_rate": 5.517281463112214e-07, + "loss": 0.8788, + "step": 142390 + }, + { + "epoch": 11.034910302607617, + "grad_norm": 1.4501336611685305, + "learning_rate": 5.517668939863609e-07, + "loss": 0.8782, + "step": 142400 + }, + { + "epoch": 11.035685226083924, + "grad_norm": 1.4210153269517176, + "learning_rate": 5.518056416615004e-07, + "loss": 0.8812, + "step": 142410 + }, + { + "epoch": 11.03646014956023, + "grad_norm": 1.4363090406375647, + "learning_rate": 5.518443893366398e-07, + "loss": 0.891, + "step": 142420 + }, + { + "epoch": 11.037235073036538, + "grad_norm": 1.4505217017642484, + "learning_rate": 5.518831370117794e-07, + "loss": 0.8797, + "step": 142430 + }, + { + "epoch": 11.038009996512844, + "grad_norm": 1.506618829220249, + "learning_rate": 5.519218846869189e-07, + "loss": 0.8817, + "step": 142440 + }, + { + "epoch": 11.038784919989151, + "grad_norm": 1.4930421013727762, + "learning_rate": 5.519606323620583e-07, + "loss": 0.9071, + "step": 142450 + }, + { + "epoch": 11.039559843465458, + "grad_norm": 1.4455594526390072, + "learning_rate": 5.519993800371978e-07, + "loss": 0.9316, + "step": 142460 + }, + { + "epoch": 11.040334766941765, + "grad_norm": 1.4772050895868671, + "learning_rate": 5.520381277123373e-07, + "loss": 0.8977, + "step": 142470 + }, + { + "epoch": 11.041109690418072, + "grad_norm": 1.4401402532602228, + "learning_rate": 5.520768753874769e-07, + "loss": 0.8757, + "step": 142480 + }, + { + "epoch": 11.041884613894378, + "grad_norm": 1.454206852563526, + "learning_rate": 5.521156230626163e-07, + "loss": 0.8907, + "step": 142490 + }, + { + "epoch": 11.042659537370685, + "grad_norm": 1.510666810389921, + "learning_rate": 5.521543707377558e-07, + "loss": 0.8619, + "step": 142500 + }, + { + "epoch": 11.042659537370685, + "eval_loss": 0.9035292863845825, + "eval_runtime": 326.5046, + "eval_samples_per_second": 35.133, + "eval_steps_per_second": 8.784, + "step": 142500 + }, + { + "epoch": 11.043434460846992, + "grad_norm": 1.4367215933500421, + "learning_rate": 5.521931184128953e-07, + "loss": 0.8701, + "step": 142510 + }, + { + "epoch": 11.044209384323299, + "grad_norm": 1.4645615509745369, + "learning_rate": 5.522318660880347e-07, + "loss": 0.8774, + "step": 142520 + }, + { + "epoch": 11.044984307799604, + "grad_norm": 1.450796133364375, + "learning_rate": 5.522706137631743e-07, + "loss": 0.8859, + "step": 142530 + }, + { + "epoch": 11.04575923127591, + "grad_norm": 1.4698628065027497, + "learning_rate": 5.523093614383138e-07, + "loss": 0.8918, + "step": 142540 + }, + { + "epoch": 11.046534154752218, + "grad_norm": 1.4801565949204103, + "learning_rate": 5.523481091134533e-07, + "loss": 0.8819, + "step": 142550 + }, + { + "epoch": 11.047309078228524, + "grad_norm": 1.4559873034489947, + "learning_rate": 5.523868567885927e-07, + "loss": 0.8911, + "step": 142560 + }, + { + "epoch": 11.048084001704831, + "grad_norm": 1.4691506560915464, + "learning_rate": 5.524256044637322e-07, + "loss": 0.8877, + "step": 142570 + }, + { + "epoch": 11.048858925181138, + "grad_norm": 1.4341858847550484, + "learning_rate": 5.524643521388718e-07, + "loss": 0.8921, + "step": 142580 + }, + { + "epoch": 11.049633848657445, + "grad_norm": 1.4705708607875596, + "learning_rate": 5.525030998140112e-07, + "loss": 0.8848, + "step": 142590 + }, + { + "epoch": 11.050408772133752, + "grad_norm": 1.4534771614128528, + "learning_rate": 5.525418474891507e-07, + "loss": 0.8724, + "step": 142600 + }, + { + "epoch": 11.051183695610058, + "grad_norm": 1.4750213387417408, + "learning_rate": 5.525805951642902e-07, + "loss": 0.8849, + "step": 142610 + }, + { + "epoch": 11.051958619086365, + "grad_norm": 1.5264660149658886, + "learning_rate": 5.526193428394297e-07, + "loss": 0.8734, + "step": 142620 + }, + { + "epoch": 11.052733542562672, + "grad_norm": 1.438875939559325, + "learning_rate": 5.526580905145692e-07, + "loss": 0.8983, + "step": 142630 + }, + { + "epoch": 11.053508466038979, + "grad_norm": 1.4058300322408386, + "learning_rate": 5.526968381897087e-07, + "loss": 0.8839, + "step": 142640 + }, + { + "epoch": 11.054283389515286, + "grad_norm": 1.4877433327422311, + "learning_rate": 5.527355858648482e-07, + "loss": 0.9076, + "step": 142650 + }, + { + "epoch": 11.055058312991592, + "grad_norm": 1.5022087456590296, + "learning_rate": 5.527743335399876e-07, + "loss": 0.8808, + "step": 142660 + }, + { + "epoch": 11.0558332364679, + "grad_norm": 1.453629164480602, + "learning_rate": 5.528130812151271e-07, + "loss": 0.8883, + "step": 142670 + }, + { + "epoch": 11.056608159944206, + "grad_norm": 1.4685424165451615, + "learning_rate": 5.528518288902667e-07, + "loss": 0.8783, + "step": 142680 + }, + { + "epoch": 11.057383083420513, + "grad_norm": 1.456272650024107, + "learning_rate": 5.528905765654062e-07, + "loss": 0.8743, + "step": 142690 + }, + { + "epoch": 11.05815800689682, + "grad_norm": 1.4352156054532719, + "learning_rate": 5.529293242405456e-07, + "loss": 0.8813, + "step": 142700 + }, + { + "epoch": 11.058932930373127, + "grad_norm": 1.4426963001984001, + "learning_rate": 5.529680719156851e-07, + "loss": 0.8875, + "step": 142710 + }, + { + "epoch": 11.059707853849432, + "grad_norm": 1.5596632934149104, + "learning_rate": 5.530068195908246e-07, + "loss": 0.8747, + "step": 142720 + }, + { + "epoch": 11.060482777325738, + "grad_norm": 1.4991455707264445, + "learning_rate": 5.530455672659641e-07, + "loss": 0.9079, + "step": 142730 + }, + { + "epoch": 11.061257700802045, + "grad_norm": 1.4318001189623097, + "learning_rate": 5.530843149411036e-07, + "loss": 0.8684, + "step": 142740 + }, + { + "epoch": 11.062032624278352, + "grad_norm": 1.403797452511334, + "learning_rate": 5.531230626162431e-07, + "loss": 0.9013, + "step": 142750 + }, + { + "epoch": 11.062807547754659, + "grad_norm": 1.498940101788064, + "learning_rate": 5.531618102913826e-07, + "loss": 0.8866, + "step": 142760 + }, + { + "epoch": 11.063582471230966, + "grad_norm": 1.4194395385054086, + "learning_rate": 5.53200557966522e-07, + "loss": 0.8801, + "step": 142770 + }, + { + "epoch": 11.064357394707272, + "grad_norm": 1.5694177110418548, + "learning_rate": 5.532393056416616e-07, + "loss": 0.9049, + "step": 142780 + }, + { + "epoch": 11.06513231818358, + "grad_norm": 1.542421482563571, + "learning_rate": 5.532780533168011e-07, + "loss": 0.8958, + "step": 142790 + }, + { + "epoch": 11.065907241659886, + "grad_norm": 1.5120723246897325, + "learning_rate": 5.533168009919405e-07, + "loss": 0.8801, + "step": 142800 + }, + { + "epoch": 11.066682165136193, + "grad_norm": 1.599893873841046, + "learning_rate": 5.5335554866708e-07, + "loss": 0.8715, + "step": 142810 + }, + { + "epoch": 11.0674570886125, + "grad_norm": 1.4351164917950952, + "learning_rate": 5.533942963422195e-07, + "loss": 0.8795, + "step": 142820 + }, + { + "epoch": 11.068232012088806, + "grad_norm": 1.624117526719382, + "learning_rate": 5.534330440173591e-07, + "loss": 0.9046, + "step": 142830 + }, + { + "epoch": 11.069006935565113, + "grad_norm": 1.4916268213938704, + "learning_rate": 5.534717916924985e-07, + "loss": 0.9031, + "step": 142840 + }, + { + "epoch": 11.06978185904142, + "grad_norm": 1.4354081912663643, + "learning_rate": 5.53510539367638e-07, + "loss": 0.8979, + "step": 142850 + }, + { + "epoch": 11.070556782517727, + "grad_norm": 1.3760765963506285, + "learning_rate": 5.535492870427775e-07, + "loss": 0.8789, + "step": 142860 + }, + { + "epoch": 11.071331705994034, + "grad_norm": 1.5281725833262756, + "learning_rate": 5.535880347179169e-07, + "loss": 0.8798, + "step": 142870 + }, + { + "epoch": 11.07210662947034, + "grad_norm": 1.4624352413845845, + "learning_rate": 5.536267823930565e-07, + "loss": 0.8675, + "step": 142880 + }, + { + "epoch": 11.072881552946647, + "grad_norm": 1.5257113865424912, + "learning_rate": 5.53665530068196e-07, + "loss": 0.897, + "step": 142890 + }, + { + "epoch": 11.073656476422952, + "grad_norm": 1.453249161887723, + "learning_rate": 5.537042777433355e-07, + "loss": 0.8989, + "step": 142900 + }, + { + "epoch": 11.07443139989926, + "grad_norm": 1.4354910189975811, + "learning_rate": 5.537430254184749e-07, + "loss": 0.8648, + "step": 142910 + }, + { + "epoch": 11.075206323375566, + "grad_norm": 1.4370625292616401, + "learning_rate": 5.537817730936145e-07, + "loss": 0.8851, + "step": 142920 + }, + { + "epoch": 11.075981246851873, + "grad_norm": 1.414449335146855, + "learning_rate": 5.53820520768754e-07, + "loss": 0.8869, + "step": 142930 + }, + { + "epoch": 11.07675617032818, + "grad_norm": 1.442228370729051, + "learning_rate": 5.538592684438934e-07, + "loss": 0.9161, + "step": 142940 + }, + { + "epoch": 11.077531093804486, + "grad_norm": 1.4845184778374938, + "learning_rate": 5.538980161190329e-07, + "loss": 0.8877, + "step": 142950 + }, + { + "epoch": 11.078306017280793, + "grad_norm": 1.450907793495238, + "learning_rate": 5.539367637941724e-07, + "loss": 0.8739, + "step": 142960 + }, + { + "epoch": 11.0790809407571, + "grad_norm": 1.3989249094531933, + "learning_rate": 5.53975511469312e-07, + "loss": 0.8888, + "step": 142970 + }, + { + "epoch": 11.079855864233407, + "grad_norm": 1.5747240139496874, + "learning_rate": 5.540142591444514e-07, + "loss": 0.8808, + "step": 142980 + }, + { + "epoch": 11.080630787709714, + "grad_norm": 1.3950295217327442, + "learning_rate": 5.540530068195909e-07, + "loss": 0.8793, + "step": 142990 + }, + { + "epoch": 11.08140571118602, + "grad_norm": 1.4302686161014437, + "learning_rate": 5.540917544947304e-07, + "loss": 0.8782, + "step": 143000 + }, + { + "epoch": 11.08140571118602, + "eval_loss": 0.9032776355743408, + "eval_runtime": 328.7149, + "eval_samples_per_second": 34.896, + "eval_steps_per_second": 8.725, + "step": 143000 + }, + { + "epoch": 11.082180634662327, + "grad_norm": 1.4940700266680937, + "learning_rate": 5.541305021698698e-07, + "loss": 0.8854, + "step": 143010 + }, + { + "epoch": 11.082955558138634, + "grad_norm": 1.5702214425613608, + "learning_rate": 5.541692498450094e-07, + "loss": 0.8851, + "step": 143020 + }, + { + "epoch": 11.083730481614941, + "grad_norm": 1.5351754178352424, + "learning_rate": 5.542079975201489e-07, + "loss": 0.8816, + "step": 143030 + }, + { + "epoch": 11.084505405091248, + "grad_norm": 1.531888537053884, + "learning_rate": 5.542467451952883e-07, + "loss": 0.8743, + "step": 143040 + }, + { + "epoch": 11.085280328567555, + "grad_norm": 1.4464433973012985, + "learning_rate": 5.542854928704278e-07, + "loss": 0.8979, + "step": 143050 + }, + { + "epoch": 11.086055252043861, + "grad_norm": 1.5241570918003922, + "learning_rate": 5.543242405455673e-07, + "loss": 0.8806, + "step": 143060 + }, + { + "epoch": 11.086830175520168, + "grad_norm": 1.4221869476425582, + "learning_rate": 5.543629882207069e-07, + "loss": 0.8881, + "step": 143070 + }, + { + "epoch": 11.087605098996475, + "grad_norm": 1.5021001731574015, + "learning_rate": 5.544017358958463e-07, + "loss": 0.9056, + "step": 143080 + }, + { + "epoch": 11.08838002247278, + "grad_norm": 1.4683205716447718, + "learning_rate": 5.544404835709858e-07, + "loss": 0.8862, + "step": 143090 + }, + { + "epoch": 11.089154945949087, + "grad_norm": 1.5734926564702754, + "learning_rate": 5.544792312461253e-07, + "loss": 0.8919, + "step": 143100 + }, + { + "epoch": 11.089929869425394, + "grad_norm": 1.4519278362959709, + "learning_rate": 5.545179789212647e-07, + "loss": 0.8777, + "step": 143110 + }, + { + "epoch": 11.0907047929017, + "grad_norm": 1.5439410685854917, + "learning_rate": 5.545567265964043e-07, + "loss": 0.8994, + "step": 143120 + }, + { + "epoch": 11.091479716378007, + "grad_norm": 1.4250285588863265, + "learning_rate": 5.545954742715438e-07, + "loss": 0.8872, + "step": 143130 + }, + { + "epoch": 11.092254639854314, + "grad_norm": 1.614505678498865, + "learning_rate": 5.546342219466833e-07, + "loss": 0.8924, + "step": 143140 + }, + { + "epoch": 11.09302956333062, + "grad_norm": 1.5246409282468532, + "learning_rate": 5.546729696218227e-07, + "loss": 0.8833, + "step": 143150 + }, + { + "epoch": 11.093804486806928, + "grad_norm": 1.4767937501709552, + "learning_rate": 5.547117172969622e-07, + "loss": 0.8947, + "step": 143160 + }, + { + "epoch": 11.094579410283234, + "grad_norm": 1.4933941158307626, + "learning_rate": 5.547504649721018e-07, + "loss": 0.8747, + "step": 143170 + }, + { + "epoch": 11.095354333759541, + "grad_norm": 1.4946148865246438, + "learning_rate": 5.547892126472412e-07, + "loss": 0.893, + "step": 143180 + }, + { + "epoch": 11.096129257235848, + "grad_norm": 1.4456130107709493, + "learning_rate": 5.548279603223807e-07, + "loss": 0.9042, + "step": 143190 + }, + { + "epoch": 11.096904180712155, + "grad_norm": 1.4152636781830699, + "learning_rate": 5.548667079975202e-07, + "loss": 0.8849, + "step": 143200 + }, + { + "epoch": 11.097679104188462, + "grad_norm": 1.4858222839153457, + "learning_rate": 5.549054556726597e-07, + "loss": 0.8806, + "step": 143210 + }, + { + "epoch": 11.098454027664769, + "grad_norm": 1.5032046318597845, + "learning_rate": 5.549442033477992e-07, + "loss": 0.8834, + "step": 143220 + }, + { + "epoch": 11.099228951141075, + "grad_norm": 1.417550385802161, + "learning_rate": 5.549829510229387e-07, + "loss": 0.8917, + "step": 143230 + }, + { + "epoch": 11.100003874617382, + "grad_norm": 1.4942907269181525, + "learning_rate": 5.550216986980782e-07, + "loss": 0.8817, + "step": 143240 + }, + { + "epoch": 11.100778798093689, + "grad_norm": 1.4498441394579993, + "learning_rate": 5.550604463732176e-07, + "loss": 0.8856, + "step": 143250 + }, + { + "epoch": 11.101553721569996, + "grad_norm": 1.5397147008116954, + "learning_rate": 5.550991940483571e-07, + "loss": 0.8988, + "step": 143260 + }, + { + "epoch": 11.102328645046303, + "grad_norm": 1.5339315432419542, + "learning_rate": 5.551379417234967e-07, + "loss": 0.8838, + "step": 143270 + }, + { + "epoch": 11.103103568522608, + "grad_norm": 1.4789472406516897, + "learning_rate": 5.551766893986362e-07, + "loss": 0.8818, + "step": 143280 + }, + { + "epoch": 11.103878491998914, + "grad_norm": 1.506825098054256, + "learning_rate": 5.552154370737756e-07, + "loss": 0.8652, + "step": 143290 + }, + { + "epoch": 11.104653415475221, + "grad_norm": 1.497217956639637, + "learning_rate": 5.552541847489151e-07, + "loss": 0.8784, + "step": 143300 + }, + { + "epoch": 11.105428338951528, + "grad_norm": 1.4647146939889022, + "learning_rate": 5.552929324240546e-07, + "loss": 0.9019, + "step": 143310 + }, + { + "epoch": 11.106203262427835, + "grad_norm": 1.4617949553928813, + "learning_rate": 5.553316800991941e-07, + "loss": 0.8918, + "step": 143320 + }, + { + "epoch": 11.106978185904142, + "grad_norm": 1.4526185769401996, + "learning_rate": 5.553704277743336e-07, + "loss": 0.8844, + "step": 143330 + }, + { + "epoch": 11.107753109380448, + "grad_norm": 1.5419807266945993, + "learning_rate": 5.554091754494731e-07, + "loss": 0.8803, + "step": 143340 + }, + { + "epoch": 11.108528032856755, + "grad_norm": 1.5098771966640698, + "learning_rate": 5.554479231246126e-07, + "loss": 0.8879, + "step": 143350 + }, + { + "epoch": 11.109302956333062, + "grad_norm": 1.4850987871762111, + "learning_rate": 5.55486670799752e-07, + "loss": 0.8861, + "step": 143360 + }, + { + "epoch": 11.110077879809369, + "grad_norm": 1.3490619211164578, + "learning_rate": 5.555254184748916e-07, + "loss": 0.8702, + "step": 143370 + }, + { + "epoch": 11.110852803285676, + "grad_norm": 1.5120351969803367, + "learning_rate": 5.555641661500311e-07, + "loss": 0.9154, + "step": 143380 + }, + { + "epoch": 11.111627726761983, + "grad_norm": 1.5369827553940065, + "learning_rate": 5.556029138251705e-07, + "loss": 0.8785, + "step": 143390 + }, + { + "epoch": 11.11240265023829, + "grad_norm": 1.4931108478913235, + "learning_rate": 5.5564166150031e-07, + "loss": 0.8928, + "step": 143400 + }, + { + "epoch": 11.113177573714596, + "grad_norm": 1.5066729480437515, + "learning_rate": 5.556804091754495e-07, + "loss": 0.8864, + "step": 143410 + }, + { + "epoch": 11.113952497190903, + "grad_norm": 1.6139232032888833, + "learning_rate": 5.557191568505891e-07, + "loss": 0.8801, + "step": 143420 + }, + { + "epoch": 11.11472742066721, + "grad_norm": 1.4310882322775091, + "learning_rate": 5.557579045257285e-07, + "loss": 0.8875, + "step": 143430 + }, + { + "epoch": 11.115502344143517, + "grad_norm": 1.4395741919061409, + "learning_rate": 5.55796652200868e-07, + "loss": 0.8868, + "step": 143440 + }, + { + "epoch": 11.116277267619823, + "grad_norm": 1.5436794905424978, + "learning_rate": 5.558353998760075e-07, + "loss": 0.8686, + "step": 143450 + }, + { + "epoch": 11.117052191096128, + "grad_norm": 1.5058980635749006, + "learning_rate": 5.558741475511469e-07, + "loss": 0.894, + "step": 143460 + }, + { + "epoch": 11.117827114572435, + "grad_norm": 1.4336746817742674, + "learning_rate": 5.559128952262865e-07, + "loss": 0.8996, + "step": 143470 + }, + { + "epoch": 11.118602038048742, + "grad_norm": 1.455223881478233, + "learning_rate": 5.55951642901426e-07, + "loss": 0.882, + "step": 143480 + }, + { + "epoch": 11.119376961525049, + "grad_norm": 1.3525955138937913, + "learning_rate": 5.559903905765655e-07, + "loss": 0.884, + "step": 143490 + }, + { + "epoch": 11.120151885001356, + "grad_norm": 1.4990012635687389, + "learning_rate": 5.560291382517049e-07, + "loss": 0.8936, + "step": 143500 + }, + { + "epoch": 11.120151885001356, + "eval_loss": 0.9031535387039185, + "eval_runtime": 330.9749, + "eval_samples_per_second": 34.658, + "eval_steps_per_second": 8.665, + "step": 143500 + }, + { + "epoch": 11.120926808477662, + "grad_norm": 1.4422601107724569, + "learning_rate": 5.560678859268444e-07, + "loss": 0.8964, + "step": 143510 + }, + { + "epoch": 11.12170173195397, + "grad_norm": 1.5034573243991316, + "learning_rate": 5.56106633601984e-07, + "loss": 0.8834, + "step": 143520 + }, + { + "epoch": 11.122476655430276, + "grad_norm": 1.4911708006894677, + "learning_rate": 5.561453812771234e-07, + "loss": 0.8853, + "step": 143530 + }, + { + "epoch": 11.123251578906583, + "grad_norm": 1.4842239148042655, + "learning_rate": 5.561841289522629e-07, + "loss": 0.8872, + "step": 143540 + }, + { + "epoch": 11.12402650238289, + "grad_norm": 1.4635400150379858, + "learning_rate": 5.562228766274024e-07, + "loss": 0.8768, + "step": 143550 + }, + { + "epoch": 11.124801425859197, + "grad_norm": 1.658733700049365, + "learning_rate": 5.56261624302542e-07, + "loss": 0.8975, + "step": 143560 + }, + { + "epoch": 11.125576349335503, + "grad_norm": 1.4184051977419667, + "learning_rate": 5.563003719776814e-07, + "loss": 0.9065, + "step": 143570 + }, + { + "epoch": 11.12635127281181, + "grad_norm": 1.398598993718083, + "learning_rate": 5.563391196528209e-07, + "loss": 0.8769, + "step": 143580 + }, + { + "epoch": 11.127126196288117, + "grad_norm": 1.5071719583769347, + "learning_rate": 5.563778673279604e-07, + "loss": 0.8928, + "step": 143590 + }, + { + "epoch": 11.127901119764424, + "grad_norm": 1.4638552716679065, + "learning_rate": 5.564166150030998e-07, + "loss": 0.8843, + "step": 143600 + }, + { + "epoch": 11.12867604324073, + "grad_norm": 1.4060681510069024, + "learning_rate": 5.564553626782393e-07, + "loss": 0.8891, + "step": 143610 + }, + { + "epoch": 11.129450966717037, + "grad_norm": 1.5104167185271873, + "learning_rate": 5.564941103533789e-07, + "loss": 0.8987, + "step": 143620 + }, + { + "epoch": 11.130225890193344, + "grad_norm": 1.5891143023822076, + "learning_rate": 5.565328580285184e-07, + "loss": 0.8843, + "step": 143630 + }, + { + "epoch": 11.131000813669651, + "grad_norm": 1.470695020136253, + "learning_rate": 5.565716057036578e-07, + "loss": 0.8684, + "step": 143640 + }, + { + "epoch": 11.131775737145956, + "grad_norm": 1.3798542691708042, + "learning_rate": 5.566103533787973e-07, + "loss": 0.8817, + "step": 143650 + }, + { + "epoch": 11.132550660622263, + "grad_norm": 1.4079838730669307, + "learning_rate": 5.566491010539369e-07, + "loss": 0.8744, + "step": 143660 + }, + { + "epoch": 11.13332558409857, + "grad_norm": 1.451045045339204, + "learning_rate": 5.566878487290763e-07, + "loss": 0.8642, + "step": 143670 + }, + { + "epoch": 11.134100507574876, + "grad_norm": 1.4749223960898208, + "learning_rate": 5.567265964042158e-07, + "loss": 0.8657, + "step": 143680 + }, + { + "epoch": 11.134875431051183, + "grad_norm": 1.4711450015165244, + "learning_rate": 5.567653440793553e-07, + "loss": 0.9039, + "step": 143690 + }, + { + "epoch": 11.13565035452749, + "grad_norm": 1.4789961891734476, + "learning_rate": 5.568040917544948e-07, + "loss": 0.8862, + "step": 143700 + }, + { + "epoch": 11.136425278003797, + "grad_norm": 1.3871189119099847, + "learning_rate": 5.568428394296343e-07, + "loss": 0.8894, + "step": 143710 + }, + { + "epoch": 11.137200201480104, + "grad_norm": 1.4774173798462162, + "learning_rate": 5.568815871047738e-07, + "loss": 0.8843, + "step": 143720 + }, + { + "epoch": 11.13797512495641, + "grad_norm": 1.5263891166888304, + "learning_rate": 5.569203347799133e-07, + "loss": 0.8753, + "step": 143730 + }, + { + "epoch": 11.138750048432717, + "grad_norm": 1.4887635660361298, + "learning_rate": 5.569590824550527e-07, + "loss": 0.8947, + "step": 143740 + }, + { + "epoch": 11.139524971909024, + "grad_norm": 1.580170680479364, + "learning_rate": 5.569978301301922e-07, + "loss": 0.8942, + "step": 143750 + }, + { + "epoch": 11.140299895385331, + "grad_norm": 1.4731956971342792, + "learning_rate": 5.570365778053318e-07, + "loss": 0.881, + "step": 143760 + }, + { + "epoch": 11.141074818861638, + "grad_norm": 1.473225929400249, + "learning_rate": 5.570753254804713e-07, + "loss": 0.8894, + "step": 143770 + }, + { + "epoch": 11.141849742337945, + "grad_norm": 1.621238369151048, + "learning_rate": 5.571140731556107e-07, + "loss": 0.896, + "step": 143780 + }, + { + "epoch": 11.142624665814251, + "grad_norm": 1.4152646124906412, + "learning_rate": 5.571528208307502e-07, + "loss": 0.8829, + "step": 143790 + }, + { + "epoch": 11.143399589290558, + "grad_norm": 1.5597290456118025, + "learning_rate": 5.571915685058897e-07, + "loss": 0.8988, + "step": 143800 + }, + { + "epoch": 11.144174512766865, + "grad_norm": 1.5055750802711971, + "learning_rate": 5.572303161810292e-07, + "loss": 0.8921, + "step": 143810 + }, + { + "epoch": 11.144949436243172, + "grad_norm": 1.5676644494425662, + "learning_rate": 5.572690638561687e-07, + "loss": 0.9035, + "step": 143820 + }, + { + "epoch": 11.145724359719477, + "grad_norm": 1.5996829353696373, + "learning_rate": 5.573078115313082e-07, + "loss": 0.8758, + "step": 143830 + }, + { + "epoch": 11.146499283195784, + "grad_norm": 1.5305322300632525, + "learning_rate": 5.573465592064477e-07, + "loss": 0.8762, + "step": 143840 + }, + { + "epoch": 11.14727420667209, + "grad_norm": 1.5376361510803014, + "learning_rate": 5.573853068815871e-07, + "loss": 0.8853, + "step": 143850 + }, + { + "epoch": 11.148049130148397, + "grad_norm": 1.4457166499041538, + "learning_rate": 5.574240545567267e-07, + "loss": 0.8738, + "step": 143860 + }, + { + "epoch": 11.148824053624704, + "grad_norm": 1.4101188648377556, + "learning_rate": 5.574628022318662e-07, + "loss": 0.8937, + "step": 143870 + }, + { + "epoch": 11.149598977101011, + "grad_norm": 1.3930429239744302, + "learning_rate": 5.575015499070056e-07, + "loss": 0.8691, + "step": 143880 + }, + { + "epoch": 11.150373900577318, + "grad_norm": 1.3828641156063417, + "learning_rate": 5.575402975821451e-07, + "loss": 0.9163, + "step": 143890 + }, + { + "epoch": 11.151148824053625, + "grad_norm": 1.697825869891676, + "learning_rate": 5.575790452572846e-07, + "loss": 0.9118, + "step": 143900 + }, + { + "epoch": 11.151923747529931, + "grad_norm": 1.451721590756606, + "learning_rate": 5.576177929324242e-07, + "loss": 0.8843, + "step": 143910 + }, + { + "epoch": 11.152698671006238, + "grad_norm": 1.4734455615336641, + "learning_rate": 5.576565406075636e-07, + "loss": 0.9031, + "step": 143920 + }, + { + "epoch": 11.153473594482545, + "grad_norm": 1.4341937726499947, + "learning_rate": 5.576952882827031e-07, + "loss": 0.8739, + "step": 143930 + }, + { + "epoch": 11.154248517958852, + "grad_norm": 1.462988121122333, + "learning_rate": 5.577340359578426e-07, + "loss": 0.883, + "step": 143940 + }, + { + "epoch": 11.155023441435159, + "grad_norm": 1.434225232938413, + "learning_rate": 5.57772783632982e-07, + "loss": 0.9199, + "step": 143950 + }, + { + "epoch": 11.155798364911465, + "grad_norm": 1.4460291664844829, + "learning_rate": 5.578115313081216e-07, + "loss": 0.8856, + "step": 143960 + }, + { + "epoch": 11.156573288387772, + "grad_norm": 1.443310335788734, + "learning_rate": 5.578502789832611e-07, + "loss": 0.8813, + "step": 143970 + }, + { + "epoch": 11.157348211864079, + "grad_norm": 1.4945932755277493, + "learning_rate": 5.578890266584006e-07, + "loss": 0.8948, + "step": 143980 + }, + { + "epoch": 11.158123135340386, + "grad_norm": 1.445034356778851, + "learning_rate": 5.5792777433354e-07, + "loss": 0.8678, + "step": 143990 + }, + { + "epoch": 11.158898058816693, + "grad_norm": 1.442602822450271, + "learning_rate": 5.579665220086795e-07, + "loss": 0.8839, + "step": 144000 + }, + { + "epoch": 11.158898058816693, + "eval_loss": 0.9029766917228699, + "eval_runtime": 331.4393, + "eval_samples_per_second": 34.61, + "eval_steps_per_second": 8.653, + "step": 144000 + }, + { + "epoch": 11.159672982293, + "grad_norm": 1.6115807089807521, + "learning_rate": 5.580052696838191e-07, + "loss": 0.8633, + "step": 144010 + }, + { + "epoch": 11.160447905769304, + "grad_norm": 1.5266734198026333, + "learning_rate": 5.580440173589585e-07, + "loss": 0.8886, + "step": 144020 + }, + { + "epoch": 11.161222829245611, + "grad_norm": 1.4325347236321755, + "learning_rate": 5.58082765034098e-07, + "loss": 0.8661, + "step": 144030 + }, + { + "epoch": 11.161997752721918, + "grad_norm": 1.433719649528043, + "learning_rate": 5.581215127092375e-07, + "loss": 0.8924, + "step": 144040 + }, + { + "epoch": 11.162772676198225, + "grad_norm": 1.4816355385046458, + "learning_rate": 5.58160260384377e-07, + "loss": 0.8702, + "step": 144050 + }, + { + "epoch": 11.163547599674532, + "grad_norm": 1.4090756619313867, + "learning_rate": 5.581990080595165e-07, + "loss": 0.8979, + "step": 144060 + }, + { + "epoch": 11.164322523150839, + "grad_norm": 1.4599800790266315, + "learning_rate": 5.58237755734656e-07, + "loss": 0.8986, + "step": 144070 + }, + { + "epoch": 11.165097446627145, + "grad_norm": 1.4510569685224606, + "learning_rate": 5.582765034097955e-07, + "loss": 0.908, + "step": 144080 + }, + { + "epoch": 11.165872370103452, + "grad_norm": 1.3500110155623484, + "learning_rate": 5.583152510849349e-07, + "loss": 0.8854, + "step": 144090 + }, + { + "epoch": 11.166647293579759, + "grad_norm": 1.4909748274512669, + "learning_rate": 5.583539987600744e-07, + "loss": 0.8823, + "step": 144100 + }, + { + "epoch": 11.167422217056066, + "grad_norm": 1.4530719108109347, + "learning_rate": 5.58392746435214e-07, + "loss": 0.8994, + "step": 144110 + }, + { + "epoch": 11.168197140532373, + "grad_norm": 1.52734607678885, + "learning_rate": 5.584314941103535e-07, + "loss": 0.8969, + "step": 144120 + }, + { + "epoch": 11.16897206400868, + "grad_norm": 1.4487797203133805, + "learning_rate": 5.584702417854929e-07, + "loss": 0.8948, + "step": 144130 + }, + { + "epoch": 11.169746987484986, + "grad_norm": 1.5701075386708514, + "learning_rate": 5.585089894606324e-07, + "loss": 0.9254, + "step": 144140 + }, + { + "epoch": 11.170521910961293, + "grad_norm": 1.3932848053847868, + "learning_rate": 5.58547737135772e-07, + "loss": 0.8821, + "step": 144150 + }, + { + "epoch": 11.1712968344376, + "grad_norm": 1.5307171716094563, + "learning_rate": 5.585864848109114e-07, + "loss": 0.885, + "step": 144160 + }, + { + "epoch": 11.172071757913907, + "grad_norm": 1.4356437531144721, + "learning_rate": 5.586252324860509e-07, + "loss": 0.8829, + "step": 144170 + }, + { + "epoch": 11.172846681390213, + "grad_norm": 1.449998333686739, + "learning_rate": 5.586639801611904e-07, + "loss": 0.8769, + "step": 144180 + }, + { + "epoch": 11.17362160486652, + "grad_norm": 1.4384285776943697, + "learning_rate": 5.587027278363299e-07, + "loss": 0.8672, + "step": 144190 + }, + { + "epoch": 11.174396528342825, + "grad_norm": 1.5039293571979357, + "learning_rate": 5.587414755114693e-07, + "loss": 0.9048, + "step": 144200 + }, + { + "epoch": 11.175171451819132, + "grad_norm": 1.4877659602310096, + "learning_rate": 5.587802231866089e-07, + "loss": 0.8807, + "step": 144210 + }, + { + "epoch": 11.175946375295439, + "grad_norm": 1.5289998134009417, + "learning_rate": 5.588189708617484e-07, + "loss": 0.8786, + "step": 144220 + }, + { + "epoch": 11.176721298771746, + "grad_norm": 1.4835730816631474, + "learning_rate": 5.588577185368878e-07, + "loss": 0.889, + "step": 144230 + }, + { + "epoch": 11.177496222248053, + "grad_norm": 1.4427278392910134, + "learning_rate": 5.588964662120273e-07, + "loss": 0.8801, + "step": 144240 + }, + { + "epoch": 11.17827114572436, + "grad_norm": 1.4260664856289045, + "learning_rate": 5.589352138871669e-07, + "loss": 0.8799, + "step": 144250 + }, + { + "epoch": 11.179046069200666, + "grad_norm": 1.4674477843905114, + "learning_rate": 5.589739615623064e-07, + "loss": 0.8738, + "step": 144260 + }, + { + "epoch": 11.179820992676973, + "grad_norm": 1.6574839622576076, + "learning_rate": 5.590127092374458e-07, + "loss": 0.9014, + "step": 144270 + }, + { + "epoch": 11.18059591615328, + "grad_norm": 1.487346245928616, + "learning_rate": 5.590514569125853e-07, + "loss": 0.8956, + "step": 144280 + }, + { + "epoch": 11.181370839629587, + "grad_norm": 1.4145073314763534, + "learning_rate": 5.590902045877248e-07, + "loss": 0.8855, + "step": 144290 + }, + { + "epoch": 11.182145763105893, + "grad_norm": 1.4790346467504387, + "learning_rate": 5.591289522628642e-07, + "loss": 0.8722, + "step": 144300 + }, + { + "epoch": 11.1829206865822, + "grad_norm": 1.6996381616897778, + "learning_rate": 5.591676999380038e-07, + "loss": 0.9077, + "step": 144310 + }, + { + "epoch": 11.183695610058507, + "grad_norm": 1.472823961087526, + "learning_rate": 5.592064476131433e-07, + "loss": 0.8879, + "step": 144320 + }, + { + "epoch": 11.184470533534814, + "grad_norm": 1.421813957540438, + "learning_rate": 5.592451952882828e-07, + "loss": 0.8833, + "step": 144330 + }, + { + "epoch": 11.18524545701112, + "grad_norm": 1.5053323134250824, + "learning_rate": 5.592839429634222e-07, + "loss": 0.8818, + "step": 144340 + }, + { + "epoch": 11.186020380487427, + "grad_norm": 1.4987308129696904, + "learning_rate": 5.593226906385618e-07, + "loss": 0.8912, + "step": 144350 + }, + { + "epoch": 11.186795303963734, + "grad_norm": 1.453970715089775, + "learning_rate": 5.593614383137013e-07, + "loss": 0.8777, + "step": 144360 + }, + { + "epoch": 11.187570227440041, + "grad_norm": 1.4885362333029049, + "learning_rate": 5.594001859888407e-07, + "loss": 0.8913, + "step": 144370 + }, + { + "epoch": 11.188345150916348, + "grad_norm": 1.440037714613591, + "learning_rate": 5.594389336639802e-07, + "loss": 0.8936, + "step": 144380 + }, + { + "epoch": 11.189120074392653, + "grad_norm": 1.509065643688239, + "learning_rate": 5.594776813391197e-07, + "loss": 0.8819, + "step": 144390 + }, + { + "epoch": 11.18989499786896, + "grad_norm": 1.5435789027517715, + "learning_rate": 5.595164290142593e-07, + "loss": 0.8772, + "step": 144400 + }, + { + "epoch": 11.190669921345267, + "grad_norm": 1.4152339042727107, + "learning_rate": 5.595551766893987e-07, + "loss": 0.8943, + "step": 144410 + }, + { + "epoch": 11.191444844821573, + "grad_norm": 1.4035837027298457, + "learning_rate": 5.595939243645382e-07, + "loss": 0.892, + "step": 144420 + }, + { + "epoch": 11.19221976829788, + "grad_norm": 1.5518726811000028, + "learning_rate": 5.596326720396777e-07, + "loss": 0.8874, + "step": 144430 + }, + { + "epoch": 11.192994691774187, + "grad_norm": 1.481133745238338, + "learning_rate": 5.596714197148171e-07, + "loss": 0.8746, + "step": 144440 + }, + { + "epoch": 11.193769615250494, + "grad_norm": 1.4552961241451392, + "learning_rate": 5.597101673899567e-07, + "loss": 0.8945, + "step": 144450 + }, + { + "epoch": 11.1945445387268, + "grad_norm": 1.6137648614317859, + "learning_rate": 5.597489150650962e-07, + "loss": 0.8699, + "step": 144460 + }, + { + "epoch": 11.195319462203107, + "grad_norm": 1.5964583940313786, + "learning_rate": 5.597876627402356e-07, + "loss": 0.8733, + "step": 144470 + }, + { + "epoch": 11.196094385679414, + "grad_norm": 1.5494060030288053, + "learning_rate": 5.598264104153751e-07, + "loss": 0.9216, + "step": 144480 + }, + { + "epoch": 11.196869309155721, + "grad_norm": 1.4205952362369714, + "learning_rate": 5.598651580905146e-07, + "loss": 0.874, + "step": 144490 + }, + { + "epoch": 11.197644232632028, + "grad_norm": 1.5262734114982828, + "learning_rate": 5.599039057656542e-07, + "loss": 0.8772, + "step": 144500 + }, + { + "epoch": 11.197644232632028, + "eval_loss": 0.9028933644294739, + "eval_runtime": 330.3373, + "eval_samples_per_second": 34.725, + "eval_steps_per_second": 8.682, + "step": 144500 + }, + { + "epoch": 11.198419156108335, + "grad_norm": 1.530973577894173, + "learning_rate": 5.599426534407936e-07, + "loss": 0.8987, + "step": 144510 + }, + { + "epoch": 11.199194079584641, + "grad_norm": 1.436351502797153, + "learning_rate": 5.599814011159331e-07, + "loss": 0.876, + "step": 144520 + }, + { + "epoch": 11.199969003060948, + "grad_norm": 1.4556073981242785, + "learning_rate": 5.600201487910726e-07, + "loss": 0.8896, + "step": 144530 + }, + { + "epoch": 11.200743926537255, + "grad_norm": 1.46318761922121, + "learning_rate": 5.60058896466212e-07, + "loss": 0.8858, + "step": 144540 + }, + { + "epoch": 11.201518850013562, + "grad_norm": 1.4969974320695005, + "learning_rate": 5.600976441413516e-07, + "loss": 0.9046, + "step": 144550 + }, + { + "epoch": 11.202293773489869, + "grad_norm": 1.4066194023973297, + "learning_rate": 5.601363918164911e-07, + "loss": 0.8817, + "step": 144560 + }, + { + "epoch": 11.203068696966174, + "grad_norm": 1.4426531574298493, + "learning_rate": 5.601751394916306e-07, + "loss": 0.8868, + "step": 144570 + }, + { + "epoch": 11.20384362044248, + "grad_norm": 1.4286928133918195, + "learning_rate": 5.6021388716677e-07, + "loss": 0.8879, + "step": 144580 + }, + { + "epoch": 11.204618543918787, + "grad_norm": 1.5043698172123696, + "learning_rate": 5.602526348419095e-07, + "loss": 0.9004, + "step": 144590 + }, + { + "epoch": 11.205393467395094, + "grad_norm": 1.6029714227898566, + "learning_rate": 5.602913825170491e-07, + "loss": 0.8947, + "step": 144600 + }, + { + "epoch": 11.206168390871401, + "grad_norm": 1.5170958538154757, + "learning_rate": 5.603301301921885e-07, + "loss": 0.8788, + "step": 144610 + }, + { + "epoch": 11.206943314347708, + "grad_norm": 1.506427696071795, + "learning_rate": 5.60368877867328e-07, + "loss": 0.8861, + "step": 144620 + }, + { + "epoch": 11.207718237824015, + "grad_norm": 1.55005873833723, + "learning_rate": 5.604076255424675e-07, + "loss": 0.89, + "step": 144630 + }, + { + "epoch": 11.208493161300321, + "grad_norm": 1.4794995779163809, + "learning_rate": 5.60446373217607e-07, + "loss": 0.8789, + "step": 144640 + }, + { + "epoch": 11.209268084776628, + "grad_norm": 1.4153918714637448, + "learning_rate": 5.604851208927465e-07, + "loss": 0.8794, + "step": 144650 + }, + { + "epoch": 11.210043008252935, + "grad_norm": 1.4269935611610656, + "learning_rate": 5.60523868567886e-07, + "loss": 0.8864, + "step": 144660 + }, + { + "epoch": 11.210817931729242, + "grad_norm": 1.430541083239123, + "learning_rate": 5.605626162430255e-07, + "loss": 0.8956, + "step": 144670 + }, + { + "epoch": 11.211592855205549, + "grad_norm": 1.4433953921527205, + "learning_rate": 5.606013639181649e-07, + "loss": 0.8643, + "step": 144680 + }, + { + "epoch": 11.212367778681855, + "grad_norm": 1.5554410900619704, + "learning_rate": 5.606401115933044e-07, + "loss": 0.9034, + "step": 144690 + }, + { + "epoch": 11.213142702158162, + "grad_norm": 1.4380300227301397, + "learning_rate": 5.60678859268444e-07, + "loss": 0.892, + "step": 144700 + }, + { + "epoch": 11.213917625634469, + "grad_norm": 1.4089745184929843, + "learning_rate": 5.607176069435835e-07, + "loss": 0.9062, + "step": 144710 + }, + { + "epoch": 11.214692549110776, + "grad_norm": 1.4489329591709335, + "learning_rate": 5.607563546187229e-07, + "loss": 0.8832, + "step": 144720 + }, + { + "epoch": 11.215467472587083, + "grad_norm": 1.6113935788435874, + "learning_rate": 5.607951022938624e-07, + "loss": 0.8926, + "step": 144730 + }, + { + "epoch": 11.21624239606339, + "grad_norm": 1.5299666008304025, + "learning_rate": 5.608338499690019e-07, + "loss": 0.8939, + "step": 144740 + }, + { + "epoch": 11.217017319539696, + "grad_norm": 1.5753598149323347, + "learning_rate": 5.608725976441414e-07, + "loss": 0.8908, + "step": 144750 + }, + { + "epoch": 11.217792243016001, + "grad_norm": 1.482962504278471, + "learning_rate": 5.609113453192809e-07, + "loss": 0.8955, + "step": 144760 + }, + { + "epoch": 11.218567166492308, + "grad_norm": 1.7211539617886977, + "learning_rate": 5.609500929944204e-07, + "loss": 0.9286, + "step": 144770 + }, + { + "epoch": 11.219342089968615, + "grad_norm": 1.5577559922579007, + "learning_rate": 5.609888406695599e-07, + "loss": 0.8819, + "step": 144780 + }, + { + "epoch": 11.220117013444922, + "grad_norm": 1.81170605680975, + "learning_rate": 5.610275883446993e-07, + "loss": 0.8809, + "step": 144790 + }, + { + "epoch": 11.220891936921229, + "grad_norm": 1.4450793539063977, + "learning_rate": 5.610663360198389e-07, + "loss": 0.8859, + "step": 144800 + }, + { + "epoch": 11.221666860397535, + "grad_norm": 1.566641052793399, + "learning_rate": 5.611050836949784e-07, + "loss": 0.8826, + "step": 144810 + }, + { + "epoch": 11.222441783873842, + "grad_norm": 1.4595179524583377, + "learning_rate": 5.611438313701178e-07, + "loss": 0.8852, + "step": 144820 + }, + { + "epoch": 11.223216707350149, + "grad_norm": 1.4686589742307934, + "learning_rate": 5.611825790452573e-07, + "loss": 0.8852, + "step": 144830 + }, + { + "epoch": 11.223991630826456, + "grad_norm": 1.490003561446926, + "learning_rate": 5.612213267203968e-07, + "loss": 0.8924, + "step": 144840 + }, + { + "epoch": 11.224766554302763, + "grad_norm": 1.450152285664098, + "learning_rate": 5.612600743955364e-07, + "loss": 0.8821, + "step": 144850 + }, + { + "epoch": 11.22554147777907, + "grad_norm": 1.426889046809052, + "learning_rate": 5.612988220706758e-07, + "loss": 0.8968, + "step": 144860 + }, + { + "epoch": 11.226316401255376, + "grad_norm": 1.4775747838775026, + "learning_rate": 5.613375697458153e-07, + "loss": 0.8728, + "step": 144870 + }, + { + "epoch": 11.227091324731683, + "grad_norm": 1.4596560856245606, + "learning_rate": 5.613763174209548e-07, + "loss": 0.8691, + "step": 144880 + }, + { + "epoch": 11.22786624820799, + "grad_norm": 1.4131645912840045, + "learning_rate": 5.614150650960942e-07, + "loss": 0.8887, + "step": 144890 + }, + { + "epoch": 11.228641171684297, + "grad_norm": 1.5630720753377918, + "learning_rate": 5.614538127712338e-07, + "loss": 0.8915, + "step": 144900 + }, + { + "epoch": 11.229416095160603, + "grad_norm": 1.5231275057423328, + "learning_rate": 5.614925604463733e-07, + "loss": 0.8763, + "step": 144910 + }, + { + "epoch": 11.23019101863691, + "grad_norm": 1.4296470343995678, + "learning_rate": 5.615313081215128e-07, + "loss": 0.8822, + "step": 144920 + }, + { + "epoch": 11.230965942113217, + "grad_norm": 1.5500016967441337, + "learning_rate": 5.615700557966522e-07, + "loss": 0.8869, + "step": 144930 + }, + { + "epoch": 11.231740865589524, + "grad_norm": 1.4268734422185962, + "learning_rate": 5.616088034717917e-07, + "loss": 0.8687, + "step": 144940 + }, + { + "epoch": 11.232515789065829, + "grad_norm": 1.5085953848680451, + "learning_rate": 5.616475511469313e-07, + "loss": 0.8676, + "step": 144950 + }, + { + "epoch": 11.233290712542136, + "grad_norm": 1.4014913794390262, + "learning_rate": 5.616862988220707e-07, + "loss": 0.882, + "step": 144960 + }, + { + "epoch": 11.234065636018443, + "grad_norm": 1.4653672617033544, + "learning_rate": 5.617250464972102e-07, + "loss": 0.895, + "step": 144970 + }, + { + "epoch": 11.23484055949475, + "grad_norm": 1.5546912397809627, + "learning_rate": 5.617637941723497e-07, + "loss": 0.8939, + "step": 144980 + }, + { + "epoch": 11.235615482971056, + "grad_norm": 1.4642178815853246, + "learning_rate": 5.618025418474893e-07, + "loss": 0.8881, + "step": 144990 + }, + { + "epoch": 11.236390406447363, + "grad_norm": 1.4145365168725013, + "learning_rate": 5.618412895226287e-07, + "loss": 0.8749, + "step": 145000 + }, + { + "epoch": 11.236390406447363, + "eval_loss": 0.9025052189826965, + "eval_runtime": 331.3526, + "eval_samples_per_second": 34.619, + "eval_steps_per_second": 8.655, + "step": 145000 + }, + { + "epoch": 11.23716532992367, + "grad_norm": 1.601030084347568, + "learning_rate": 5.618800371977682e-07, + "loss": 0.8948, + "step": 145010 + }, + { + "epoch": 11.237940253399977, + "grad_norm": 1.6358867399023949, + "learning_rate": 5.619187848729077e-07, + "loss": 0.8772, + "step": 145020 + }, + { + "epoch": 11.238715176876283, + "grad_norm": 1.4621196568677857, + "learning_rate": 5.619575325480471e-07, + "loss": 0.8793, + "step": 145030 + }, + { + "epoch": 11.23949010035259, + "grad_norm": 1.4467053035180268, + "learning_rate": 5.619962802231867e-07, + "loss": 0.8748, + "step": 145040 + }, + { + "epoch": 11.240265023828897, + "grad_norm": 1.4504451178559454, + "learning_rate": 5.620350278983262e-07, + "loss": 0.8927, + "step": 145050 + }, + { + "epoch": 11.241039947305204, + "grad_norm": 1.4334296132657207, + "learning_rate": 5.620737755734657e-07, + "loss": 0.8757, + "step": 145060 + }, + { + "epoch": 11.24181487078151, + "grad_norm": 1.4886484686669033, + "learning_rate": 5.621125232486051e-07, + "loss": 0.884, + "step": 145070 + }, + { + "epoch": 11.242589794257817, + "grad_norm": 1.4829454621607807, + "learning_rate": 5.621512709237446e-07, + "loss": 0.8876, + "step": 145080 + }, + { + "epoch": 11.243364717734124, + "grad_norm": 1.472536322744348, + "learning_rate": 5.621900185988842e-07, + "loss": 0.8964, + "step": 145090 + }, + { + "epoch": 11.244139641210431, + "grad_norm": 1.5165375439747573, + "learning_rate": 5.622287662740236e-07, + "loss": 0.9115, + "step": 145100 + }, + { + "epoch": 11.244914564686738, + "grad_norm": 1.4810486256955202, + "learning_rate": 5.622675139491631e-07, + "loss": 0.8788, + "step": 145110 + }, + { + "epoch": 11.245689488163045, + "grad_norm": 1.4643394327530088, + "learning_rate": 5.623062616243026e-07, + "loss": 0.8784, + "step": 145120 + }, + { + "epoch": 11.246464411639352, + "grad_norm": 1.569054551101899, + "learning_rate": 5.623450092994421e-07, + "loss": 0.8925, + "step": 145130 + }, + { + "epoch": 11.247239335115657, + "grad_norm": 1.5690288166805066, + "learning_rate": 5.623837569745816e-07, + "loss": 0.877, + "step": 145140 + }, + { + "epoch": 11.248014258591963, + "grad_norm": 1.486866713821596, + "learning_rate": 5.624225046497211e-07, + "loss": 0.8986, + "step": 145150 + }, + { + "epoch": 11.24878918206827, + "grad_norm": 1.49273568278538, + "learning_rate": 5.624612523248606e-07, + "loss": 0.9174, + "step": 145160 + }, + { + "epoch": 11.249564105544577, + "grad_norm": 1.4124657084884262, + "learning_rate": 5.625e-07, + "loss": 0.8817, + "step": 145170 + }, + { + "epoch": 11.250339029020884, + "grad_norm": 1.5667374511595686, + "learning_rate": 5.625387476751395e-07, + "loss": 0.8883, + "step": 145180 + }, + { + "epoch": 11.25111395249719, + "grad_norm": 1.3935639696638595, + "learning_rate": 5.625774953502791e-07, + "loss": 0.8717, + "step": 145190 + }, + { + "epoch": 11.251888875973497, + "grad_norm": 1.4332841238716751, + "learning_rate": 5.626162430254186e-07, + "loss": 0.8847, + "step": 145200 + }, + { + "epoch": 11.252663799449804, + "grad_norm": 1.4824154758919246, + "learning_rate": 5.62654990700558e-07, + "loss": 0.9226, + "step": 145210 + }, + { + "epoch": 11.253438722926111, + "grad_norm": 1.4653940151461657, + "learning_rate": 5.626937383756975e-07, + "loss": 0.8801, + "step": 145220 + }, + { + "epoch": 11.254213646402418, + "grad_norm": 1.5933312770457881, + "learning_rate": 5.62732486050837e-07, + "loss": 0.8959, + "step": 145230 + }, + { + "epoch": 11.254988569878725, + "grad_norm": 1.467368965266407, + "learning_rate": 5.627712337259765e-07, + "loss": 0.8892, + "step": 145240 + }, + { + "epoch": 11.255763493355031, + "grad_norm": 1.657274541076055, + "learning_rate": 5.62809981401116e-07, + "loss": 0.8806, + "step": 145250 + }, + { + "epoch": 11.256538416831338, + "grad_norm": 1.4686300338101497, + "learning_rate": 5.628487290762555e-07, + "loss": 0.9101, + "step": 145260 + }, + { + "epoch": 11.257313340307645, + "grad_norm": 1.561452068009912, + "learning_rate": 5.62887476751395e-07, + "loss": 0.8895, + "step": 145270 + }, + { + "epoch": 11.258088263783952, + "grad_norm": 1.470214062509879, + "learning_rate": 5.629262244265344e-07, + "loss": 0.8877, + "step": 145280 + }, + { + "epoch": 11.258863187260259, + "grad_norm": 1.4782588367073404, + "learning_rate": 5.62964972101674e-07, + "loss": 0.8872, + "step": 145290 + }, + { + "epoch": 11.259638110736566, + "grad_norm": 1.5155759648375047, + "learning_rate": 5.630037197768135e-07, + "loss": 0.8822, + "step": 145300 + }, + { + "epoch": 11.260413034212872, + "grad_norm": 1.458488779979221, + "learning_rate": 5.630424674519529e-07, + "loss": 0.8783, + "step": 145310 + }, + { + "epoch": 11.261187957689177, + "grad_norm": 1.4575803415025204, + "learning_rate": 5.630812151270924e-07, + "loss": 0.8823, + "step": 145320 + }, + { + "epoch": 11.261962881165484, + "grad_norm": 1.533361877972353, + "learning_rate": 5.631199628022319e-07, + "loss": 0.8983, + "step": 145330 + }, + { + "epoch": 11.262737804641791, + "grad_norm": 1.5130513305813236, + "learning_rate": 5.631587104773715e-07, + "loss": 0.8948, + "step": 145340 + }, + { + "epoch": 11.263512728118098, + "grad_norm": 1.525836269485646, + "learning_rate": 5.631974581525109e-07, + "loss": 0.874, + "step": 145350 + }, + { + "epoch": 11.264287651594405, + "grad_norm": 1.4613139601385823, + "learning_rate": 5.632362058276504e-07, + "loss": 0.8871, + "step": 145360 + }, + { + "epoch": 11.265062575070711, + "grad_norm": 1.419029252660432, + "learning_rate": 5.632749535027899e-07, + "loss": 0.8893, + "step": 145370 + }, + { + "epoch": 11.265837498547018, + "grad_norm": 1.4488640551330587, + "learning_rate": 5.633137011779293e-07, + "loss": 0.8969, + "step": 145380 + }, + { + "epoch": 11.266612422023325, + "grad_norm": 1.5191096617408681, + "learning_rate": 5.633524488530689e-07, + "loss": 0.9039, + "step": 145390 + }, + { + "epoch": 11.267387345499632, + "grad_norm": 1.4867687205155606, + "learning_rate": 5.633911965282084e-07, + "loss": 0.89, + "step": 145400 + }, + { + "epoch": 11.268162268975939, + "grad_norm": 1.4777013485719812, + "learning_rate": 5.634299442033479e-07, + "loss": 0.8702, + "step": 145410 + }, + { + "epoch": 11.268937192452245, + "grad_norm": 1.439355283509432, + "learning_rate": 5.634686918784873e-07, + "loss": 0.8636, + "step": 145420 + }, + { + "epoch": 11.269712115928552, + "grad_norm": 1.350136118090156, + "learning_rate": 5.635074395536268e-07, + "loss": 0.873, + "step": 145430 + }, + { + "epoch": 11.270487039404859, + "grad_norm": 1.454284482145956, + "learning_rate": 5.635461872287664e-07, + "loss": 0.901, + "step": 145440 + }, + { + "epoch": 11.271261962881166, + "grad_norm": 1.5312030711324665, + "learning_rate": 5.635849349039058e-07, + "loss": 0.8828, + "step": 145450 + }, + { + "epoch": 11.272036886357473, + "grad_norm": 1.4243053757976287, + "learning_rate": 5.636236825790453e-07, + "loss": 0.8781, + "step": 145460 + }, + { + "epoch": 11.27281180983378, + "grad_norm": 1.6103880632690812, + "learning_rate": 5.636624302541848e-07, + "loss": 0.8601, + "step": 145470 + }, + { + "epoch": 11.273586733310086, + "grad_norm": 1.4594361781558767, + "learning_rate": 5.637011779293243e-07, + "loss": 0.8923, + "step": 145480 + }, + { + "epoch": 11.274361656786393, + "grad_norm": 1.372595085756209, + "learning_rate": 5.637399256044638e-07, + "loss": 0.875, + "step": 145490 + }, + { + "epoch": 11.2751365802627, + "grad_norm": 1.4288783222525432, + "learning_rate": 5.637786732796033e-07, + "loss": 0.8893, + "step": 145500 + }, + { + "epoch": 11.2751365802627, + "eval_loss": 0.9024921655654907, + "eval_runtime": 333.3027, + "eval_samples_per_second": 34.416, + "eval_steps_per_second": 8.605, + "step": 145500 + }, + { + "epoch": 11.275911503739005, + "grad_norm": 1.4071955604557094, + "learning_rate": 5.638174209547428e-07, + "loss": 0.8871, + "step": 145510 + }, + { + "epoch": 11.276686427215312, + "grad_norm": 1.480526942089028, + "learning_rate": 5.638561686298822e-07, + "loss": 0.8672, + "step": 145520 + }, + { + "epoch": 11.277461350691619, + "grad_norm": 1.4049426659101079, + "learning_rate": 5.638949163050217e-07, + "loss": 0.8841, + "step": 145530 + }, + { + "epoch": 11.278236274167925, + "grad_norm": 1.5031700716196592, + "learning_rate": 5.639336639801613e-07, + "loss": 0.8871, + "step": 145540 + }, + { + "epoch": 11.279011197644232, + "grad_norm": 1.5107803666701238, + "learning_rate": 5.639724116553008e-07, + "loss": 0.8954, + "step": 145550 + }, + { + "epoch": 11.279786121120539, + "grad_norm": 1.5628909739792805, + "learning_rate": 5.640111593304402e-07, + "loss": 0.8648, + "step": 145560 + }, + { + "epoch": 11.280561044596846, + "grad_norm": 1.5021424436044406, + "learning_rate": 5.640499070055797e-07, + "loss": 0.8858, + "step": 145570 + }, + { + "epoch": 11.281335968073153, + "grad_norm": 1.4501253055342385, + "learning_rate": 5.640886546807193e-07, + "loss": 0.8804, + "step": 145580 + }, + { + "epoch": 11.28211089154946, + "grad_norm": 1.4904859218239381, + "learning_rate": 5.641274023558587e-07, + "loss": 0.8876, + "step": 145590 + }, + { + "epoch": 11.282885815025766, + "grad_norm": 1.496600760834263, + "learning_rate": 5.641661500309982e-07, + "loss": 0.8674, + "step": 145600 + }, + { + "epoch": 11.283660738502073, + "grad_norm": 1.4824631942271866, + "learning_rate": 5.642048977061377e-07, + "loss": 0.8792, + "step": 145610 + }, + { + "epoch": 11.28443566197838, + "grad_norm": 1.6370558244228104, + "learning_rate": 5.642436453812772e-07, + "loss": 0.8892, + "step": 145620 + }, + { + "epoch": 11.285210585454687, + "grad_norm": 1.6045777972082336, + "learning_rate": 5.642823930564166e-07, + "loss": 0.9, + "step": 145630 + }, + { + "epoch": 11.285985508930994, + "grad_norm": 1.4895072979745942, + "learning_rate": 5.643211407315562e-07, + "loss": 0.9139, + "step": 145640 + }, + { + "epoch": 11.2867604324073, + "grad_norm": 1.4684829952048994, + "learning_rate": 5.643598884066957e-07, + "loss": 0.892, + "step": 145650 + }, + { + "epoch": 11.287535355883607, + "grad_norm": 1.3927764778757419, + "learning_rate": 5.643986360818351e-07, + "loss": 0.8952, + "step": 145660 + }, + { + "epoch": 11.288310279359914, + "grad_norm": 1.4661723634496264, + "learning_rate": 5.644373837569746e-07, + "loss": 0.8998, + "step": 145670 + }, + { + "epoch": 11.28908520283622, + "grad_norm": 1.5077939518361156, + "learning_rate": 5.644761314321142e-07, + "loss": 0.884, + "step": 145680 + }, + { + "epoch": 11.289860126312526, + "grad_norm": 1.482709231759996, + "learning_rate": 5.645148791072537e-07, + "loss": 0.8809, + "step": 145690 + }, + { + "epoch": 11.290635049788833, + "grad_norm": 1.508444953166647, + "learning_rate": 5.645536267823931e-07, + "loss": 0.8825, + "step": 145700 + }, + { + "epoch": 11.29140997326514, + "grad_norm": 1.5499699718014988, + "learning_rate": 5.645923744575326e-07, + "loss": 0.8864, + "step": 145710 + }, + { + "epoch": 11.292184896741446, + "grad_norm": 1.4981789054409664, + "learning_rate": 5.646311221326721e-07, + "loss": 0.8911, + "step": 145720 + }, + { + "epoch": 11.292959820217753, + "grad_norm": 1.5381593588055773, + "learning_rate": 5.646698698078116e-07, + "loss": 0.8856, + "step": 145730 + }, + { + "epoch": 11.29373474369406, + "grad_norm": 1.4426374924236338, + "learning_rate": 5.647086174829511e-07, + "loss": 0.8642, + "step": 145740 + }, + { + "epoch": 11.294509667170367, + "grad_norm": 1.412248668371972, + "learning_rate": 5.647473651580906e-07, + "loss": 0.8817, + "step": 145750 + }, + { + "epoch": 11.295284590646673, + "grad_norm": 1.4498837995156786, + "learning_rate": 5.647861128332301e-07, + "loss": 0.885, + "step": 145760 + }, + { + "epoch": 11.29605951412298, + "grad_norm": 1.4810395763986564, + "learning_rate": 5.648248605083695e-07, + "loss": 0.8755, + "step": 145770 + }, + { + "epoch": 11.296834437599287, + "grad_norm": 1.5888441944812266, + "learning_rate": 5.648636081835091e-07, + "loss": 0.8784, + "step": 145780 + }, + { + "epoch": 11.297609361075594, + "grad_norm": 1.4528076801324148, + "learning_rate": 5.649023558586486e-07, + "loss": 0.8823, + "step": 145790 + }, + { + "epoch": 11.2983842845519, + "grad_norm": 1.4408905135120367, + "learning_rate": 5.64941103533788e-07, + "loss": 0.8846, + "step": 145800 + }, + { + "epoch": 11.299159208028208, + "grad_norm": 1.7345989463516087, + "learning_rate": 5.649798512089275e-07, + "loss": 0.9103, + "step": 145810 + }, + { + "epoch": 11.299934131504514, + "grad_norm": 1.525973394330587, + "learning_rate": 5.65018598884067e-07, + "loss": 0.8796, + "step": 145820 + }, + { + "epoch": 11.300709054980821, + "grad_norm": 1.4739054880578388, + "learning_rate": 5.650573465592066e-07, + "loss": 0.8984, + "step": 145830 + }, + { + "epoch": 11.301483978457128, + "grad_norm": 1.4424989327071776, + "learning_rate": 5.65096094234346e-07, + "loss": 0.8751, + "step": 145840 + }, + { + "epoch": 11.302258901933435, + "grad_norm": 1.5669026541658633, + "learning_rate": 5.651348419094855e-07, + "loss": 0.8854, + "step": 145850 + }, + { + "epoch": 11.303033825409742, + "grad_norm": 1.5736131454925448, + "learning_rate": 5.65173589584625e-07, + "loss": 0.8825, + "step": 145860 + }, + { + "epoch": 11.303808748886048, + "grad_norm": 1.433449416064523, + "learning_rate": 5.652123372597644e-07, + "loss": 0.8881, + "step": 145870 + }, + { + "epoch": 11.304583672362353, + "grad_norm": 1.4561710066938764, + "learning_rate": 5.65251084934904e-07, + "loss": 0.8892, + "step": 145880 + }, + { + "epoch": 11.30535859583866, + "grad_norm": 1.4329946378826677, + "learning_rate": 5.652898326100435e-07, + "loss": 0.8757, + "step": 145890 + }, + { + "epoch": 11.306133519314967, + "grad_norm": 1.4998734928830264, + "learning_rate": 5.65328580285183e-07, + "loss": 0.8793, + "step": 145900 + }, + { + "epoch": 11.306908442791274, + "grad_norm": 1.4526286440862517, + "learning_rate": 5.653673279603224e-07, + "loss": 0.9011, + "step": 145910 + }, + { + "epoch": 11.30768336626758, + "grad_norm": 1.4634322860413742, + "learning_rate": 5.654060756354619e-07, + "loss": 0.8665, + "step": 145920 + }, + { + "epoch": 11.308458289743887, + "grad_norm": 1.4749615586046099, + "learning_rate": 5.654448233106015e-07, + "loss": 0.9116, + "step": 145930 + }, + { + "epoch": 11.309233213220194, + "grad_norm": 1.4713390708807448, + "learning_rate": 5.654835709857409e-07, + "loss": 0.8698, + "step": 145940 + }, + { + "epoch": 11.310008136696501, + "grad_norm": 1.4897616851087923, + "learning_rate": 5.655223186608804e-07, + "loss": 0.8775, + "step": 145950 + }, + { + "epoch": 11.310783060172808, + "grad_norm": 1.4988785198939731, + "learning_rate": 5.655610663360199e-07, + "loss": 0.9066, + "step": 145960 + }, + { + "epoch": 11.311557983649115, + "grad_norm": 1.412166391611818, + "learning_rate": 5.655998140111593e-07, + "loss": 0.8976, + "step": 145970 + }, + { + "epoch": 11.312332907125421, + "grad_norm": 1.4957127561175443, + "learning_rate": 5.656385616862989e-07, + "loss": 0.8872, + "step": 145980 + }, + { + "epoch": 11.313107830601728, + "grad_norm": 1.5905849095411457, + "learning_rate": 5.656773093614384e-07, + "loss": 0.8858, + "step": 145990 + }, + { + "epoch": 11.313882754078035, + "grad_norm": 1.4756647360064283, + "learning_rate": 5.657160570365779e-07, + "loss": 0.8862, + "step": 146000 + }, + { + "epoch": 11.313882754078035, + "eval_loss": 0.902498722076416, + "eval_runtime": 330.6037, + "eval_samples_per_second": 34.697, + "eval_steps_per_second": 8.675, + "step": 146000 + }, + { + "epoch": 11.314657677554342, + "grad_norm": 1.5050519217068092, + "learning_rate": 5.657548047117173e-07, + "loss": 0.8883, + "step": 146010 + }, + { + "epoch": 11.315432601030649, + "grad_norm": 1.5375766885452644, + "learning_rate": 5.657935523868568e-07, + "loss": 0.8809, + "step": 146020 + }, + { + "epoch": 11.316207524506956, + "grad_norm": 1.5488766716259166, + "learning_rate": 5.658323000619964e-07, + "loss": 0.8931, + "step": 146030 + }, + { + "epoch": 11.316982447983262, + "grad_norm": 1.4960226986600347, + "learning_rate": 5.658710477371358e-07, + "loss": 0.8736, + "step": 146040 + }, + { + "epoch": 11.31775737145957, + "grad_norm": 1.493402737027043, + "learning_rate": 5.659097954122753e-07, + "loss": 0.9153, + "step": 146050 + }, + { + "epoch": 11.318532294935874, + "grad_norm": 1.4598694557951635, + "learning_rate": 5.659485430874148e-07, + "loss": 0.8787, + "step": 146060 + }, + { + "epoch": 11.319307218412181, + "grad_norm": 1.5005114587614132, + "learning_rate": 5.659872907625543e-07, + "loss": 0.8939, + "step": 146070 + }, + { + "epoch": 11.320082141888488, + "grad_norm": 1.440856219502891, + "learning_rate": 5.660260384376938e-07, + "loss": 0.8652, + "step": 146080 + }, + { + "epoch": 11.320857065364795, + "grad_norm": 1.481284617475907, + "learning_rate": 5.660647861128333e-07, + "loss": 0.89, + "step": 146090 + }, + { + "epoch": 11.321631988841101, + "grad_norm": 1.523205820372966, + "learning_rate": 5.661035337879728e-07, + "loss": 0.8885, + "step": 146100 + }, + { + "epoch": 11.322406912317408, + "grad_norm": 1.4121030255750728, + "learning_rate": 5.661422814631122e-07, + "loss": 0.8952, + "step": 146110 + }, + { + "epoch": 11.323181835793715, + "grad_norm": 1.5393184740424204, + "learning_rate": 5.661810291382517e-07, + "loss": 0.8838, + "step": 146120 + }, + { + "epoch": 11.323956759270022, + "grad_norm": 1.5870318197513917, + "learning_rate": 5.662197768133913e-07, + "loss": 0.9165, + "step": 146130 + }, + { + "epoch": 11.324731682746329, + "grad_norm": 1.5353167281019298, + "learning_rate": 5.662585244885308e-07, + "loss": 0.8909, + "step": 146140 + }, + { + "epoch": 11.325506606222635, + "grad_norm": 1.4950832519861086, + "learning_rate": 5.662972721636702e-07, + "loss": 0.8814, + "step": 146150 + }, + { + "epoch": 11.326281529698942, + "grad_norm": 1.4592877754754308, + "learning_rate": 5.663360198388097e-07, + "loss": 0.8772, + "step": 146160 + }, + { + "epoch": 11.32705645317525, + "grad_norm": 1.5456943721639353, + "learning_rate": 5.663747675139492e-07, + "loss": 0.887, + "step": 146170 + }, + { + "epoch": 11.327831376651556, + "grad_norm": 1.3959936586715498, + "learning_rate": 5.664135151890887e-07, + "loss": 0.8796, + "step": 146180 + }, + { + "epoch": 11.328606300127863, + "grad_norm": 1.4330852268106273, + "learning_rate": 5.664522628642282e-07, + "loss": 0.9033, + "step": 146190 + }, + { + "epoch": 11.32938122360417, + "grad_norm": 1.4412844624152654, + "learning_rate": 5.664910105393677e-07, + "loss": 0.8728, + "step": 146200 + }, + { + "epoch": 11.330156147080476, + "grad_norm": 1.5105112810282992, + "learning_rate": 5.665297582145072e-07, + "loss": 0.8751, + "step": 146210 + }, + { + "epoch": 11.330931070556783, + "grad_norm": 1.428315357096221, + "learning_rate": 5.665685058896466e-07, + "loss": 0.9016, + "step": 146220 + }, + { + "epoch": 11.33170599403309, + "grad_norm": 1.4527973576657152, + "learning_rate": 5.666072535647862e-07, + "loss": 0.9185, + "step": 146230 + }, + { + "epoch": 11.332480917509397, + "grad_norm": 1.5388665853127659, + "learning_rate": 5.666460012399257e-07, + "loss": 0.8953, + "step": 146240 + }, + { + "epoch": 11.333255840985702, + "grad_norm": 1.4369279228303824, + "learning_rate": 5.666847489150651e-07, + "loss": 0.8903, + "step": 146250 + }, + { + "epoch": 11.334030764462009, + "grad_norm": 1.452912244708126, + "learning_rate": 5.667234965902046e-07, + "loss": 0.8913, + "step": 146260 + }, + { + "epoch": 11.334805687938315, + "grad_norm": 1.6031979231540994, + "learning_rate": 5.667622442653441e-07, + "loss": 0.8892, + "step": 146270 + }, + { + "epoch": 11.335580611414622, + "grad_norm": 1.5212627074254574, + "learning_rate": 5.668009919404837e-07, + "loss": 0.8909, + "step": 146280 + }, + { + "epoch": 11.336355534890929, + "grad_norm": 1.4606567620722213, + "learning_rate": 5.668397396156231e-07, + "loss": 0.875, + "step": 146290 + }, + { + "epoch": 11.337130458367236, + "grad_norm": 1.385976962673939, + "learning_rate": 5.668784872907626e-07, + "loss": 0.9139, + "step": 146300 + }, + { + "epoch": 11.337905381843543, + "grad_norm": 1.692108647322515, + "learning_rate": 5.669172349659021e-07, + "loss": 0.9093, + "step": 146310 + }, + { + "epoch": 11.33868030531985, + "grad_norm": 1.4967556778941866, + "learning_rate": 5.669559826410415e-07, + "loss": 0.8882, + "step": 146320 + }, + { + "epoch": 11.339455228796156, + "grad_norm": 1.4242322675392378, + "learning_rate": 5.669947303161811e-07, + "loss": 0.8841, + "step": 146330 + }, + { + "epoch": 11.340230152272463, + "grad_norm": 1.5055583210635262, + "learning_rate": 5.670334779913206e-07, + "loss": 0.8879, + "step": 146340 + }, + { + "epoch": 11.34100507574877, + "grad_norm": 1.4681873438313007, + "learning_rate": 5.670722256664601e-07, + "loss": 0.8717, + "step": 146350 + }, + { + "epoch": 11.341779999225077, + "grad_norm": 1.4254766569764095, + "learning_rate": 5.671109733415995e-07, + "loss": 0.8839, + "step": 146360 + }, + { + "epoch": 11.342554922701384, + "grad_norm": 1.5403938413983504, + "learning_rate": 5.67149721016739e-07, + "loss": 0.8734, + "step": 146370 + }, + { + "epoch": 11.34332984617769, + "grad_norm": 1.4834990407795352, + "learning_rate": 5.671884686918786e-07, + "loss": 0.8787, + "step": 146380 + }, + { + "epoch": 11.344104769653997, + "grad_norm": 1.501587019961175, + "learning_rate": 5.67227216367018e-07, + "loss": 0.8868, + "step": 146390 + }, + { + "epoch": 11.344879693130304, + "grad_norm": 1.5283982764191988, + "learning_rate": 5.672659640421575e-07, + "loss": 0.8728, + "step": 146400 + }, + { + "epoch": 11.34565461660661, + "grad_norm": 1.416570269551568, + "learning_rate": 5.67304711717297e-07, + "loss": 0.8902, + "step": 146410 + }, + { + "epoch": 11.346429540082918, + "grad_norm": 1.497601894327209, + "learning_rate": 5.673434593924366e-07, + "loss": 0.8895, + "step": 146420 + }, + { + "epoch": 11.347204463559223, + "grad_norm": 1.5036212032489418, + "learning_rate": 5.67382207067576e-07, + "loss": 0.8979, + "step": 146430 + }, + { + "epoch": 11.34797938703553, + "grad_norm": 1.523341446392688, + "learning_rate": 5.674209547427155e-07, + "loss": 0.8774, + "step": 146440 + }, + { + "epoch": 11.348754310511836, + "grad_norm": 1.4242925145979826, + "learning_rate": 5.67459702417855e-07, + "loss": 0.8698, + "step": 146450 + }, + { + "epoch": 11.349529233988143, + "grad_norm": 1.5230394389880826, + "learning_rate": 5.674984500929944e-07, + "loss": 0.8976, + "step": 146460 + }, + { + "epoch": 11.35030415746445, + "grad_norm": 1.4469568779802213, + "learning_rate": 5.67537197768134e-07, + "loss": 0.8929, + "step": 146470 + }, + { + "epoch": 11.351079080940757, + "grad_norm": 1.4499646176888765, + "learning_rate": 5.675759454432735e-07, + "loss": 0.8826, + "step": 146480 + }, + { + "epoch": 11.351854004417063, + "grad_norm": 1.4172275599536333, + "learning_rate": 5.67614693118413e-07, + "loss": 0.8682, + "step": 146490 + }, + { + "epoch": 11.35262892789337, + "grad_norm": 1.559361829086548, + "learning_rate": 5.676534407935524e-07, + "loss": 0.8759, + "step": 146500 + }, + { + "epoch": 11.35262892789337, + "eval_loss": 0.9022455215454102, + "eval_runtime": 331.46, + "eval_samples_per_second": 34.607, + "eval_steps_per_second": 8.653, + "step": 146500 + }, + { + "epoch": 11.353403851369677, + "grad_norm": 1.5216079227825805, + "learning_rate": 5.676921884686919e-07, + "loss": 0.8919, + "step": 146510 + }, + { + "epoch": 11.354178774845984, + "grad_norm": 1.4582254568805166, + "learning_rate": 5.677309361438315e-07, + "loss": 0.8946, + "step": 146520 + }, + { + "epoch": 11.35495369832229, + "grad_norm": 1.46250095043227, + "learning_rate": 5.677696838189709e-07, + "loss": 0.8761, + "step": 146530 + }, + { + "epoch": 11.355728621798598, + "grad_norm": 1.4290348527943533, + "learning_rate": 5.678084314941104e-07, + "loss": 0.8764, + "step": 146540 + }, + { + "epoch": 11.356503545274904, + "grad_norm": 1.5112728074606965, + "learning_rate": 5.678471791692499e-07, + "loss": 0.8965, + "step": 146550 + }, + { + "epoch": 11.357278468751211, + "grad_norm": 1.4628309930980246, + "learning_rate": 5.678859268443894e-07, + "loss": 0.8799, + "step": 146560 + }, + { + "epoch": 11.358053392227518, + "grad_norm": 1.4397905926032264, + "learning_rate": 5.679246745195289e-07, + "loss": 0.8938, + "step": 146570 + }, + { + "epoch": 11.358828315703825, + "grad_norm": 1.429648194793765, + "learning_rate": 5.679634221946684e-07, + "loss": 0.8853, + "step": 146580 + }, + { + "epoch": 11.359603239180132, + "grad_norm": 1.461060637585406, + "learning_rate": 5.680021698698079e-07, + "loss": 0.8784, + "step": 146590 + }, + { + "epoch": 11.360378162656438, + "grad_norm": 1.503895207235982, + "learning_rate": 5.680409175449473e-07, + "loss": 0.8896, + "step": 146600 + }, + { + "epoch": 11.361153086132745, + "grad_norm": 1.5531299535164744, + "learning_rate": 5.680796652200868e-07, + "loss": 0.8754, + "step": 146610 + }, + { + "epoch": 11.36192800960905, + "grad_norm": 1.4439760614789567, + "learning_rate": 5.681184128952264e-07, + "loss": 0.8911, + "step": 146620 + }, + { + "epoch": 11.362702933085357, + "grad_norm": 1.4344685460342428, + "learning_rate": 5.681571605703659e-07, + "loss": 0.887, + "step": 146630 + }, + { + "epoch": 11.363477856561664, + "grad_norm": 1.47343596408932, + "learning_rate": 5.681959082455053e-07, + "loss": 0.914, + "step": 146640 + }, + { + "epoch": 11.36425278003797, + "grad_norm": 1.4053290095425726, + "learning_rate": 5.682346559206448e-07, + "loss": 0.8961, + "step": 146650 + }, + { + "epoch": 11.365027703514277, + "grad_norm": 1.5103829202956172, + "learning_rate": 5.682734035957843e-07, + "loss": 0.9023, + "step": 146660 + }, + { + "epoch": 11.365802626990584, + "grad_norm": 1.4653633225947844, + "learning_rate": 5.683121512709238e-07, + "loss": 0.8913, + "step": 146670 + }, + { + "epoch": 11.366577550466891, + "grad_norm": 1.4676098848974977, + "learning_rate": 5.683508989460633e-07, + "loss": 0.8764, + "step": 146680 + }, + { + "epoch": 11.367352473943198, + "grad_norm": 1.4257648133616492, + "learning_rate": 5.683896466212028e-07, + "loss": 0.8751, + "step": 146690 + }, + { + "epoch": 11.368127397419505, + "grad_norm": 1.4506317849624935, + "learning_rate": 5.684283942963423e-07, + "loss": 0.913, + "step": 146700 + }, + { + "epoch": 11.368902320895812, + "grad_norm": 1.432185939938456, + "learning_rate": 5.684671419714817e-07, + "loss": 0.8893, + "step": 146710 + }, + { + "epoch": 11.369677244372118, + "grad_norm": 1.4852770079632762, + "learning_rate": 5.685058896466213e-07, + "loss": 0.8827, + "step": 146720 + }, + { + "epoch": 11.370452167848425, + "grad_norm": 1.5072538403302345, + "learning_rate": 5.685446373217608e-07, + "loss": 0.9153, + "step": 146730 + }, + { + "epoch": 11.371227091324732, + "grad_norm": 1.4930853942967852, + "learning_rate": 5.685833849969002e-07, + "loss": 0.888, + "step": 146740 + }, + { + "epoch": 11.372002014801039, + "grad_norm": 1.3926453040158737, + "learning_rate": 5.686221326720397e-07, + "loss": 0.8723, + "step": 146750 + }, + { + "epoch": 11.372776938277346, + "grad_norm": 1.3870671791727045, + "learning_rate": 5.686608803471792e-07, + "loss": 0.9008, + "step": 146760 + }, + { + "epoch": 11.373551861753652, + "grad_norm": 1.5177839973610638, + "learning_rate": 5.686996280223188e-07, + "loss": 0.8915, + "step": 146770 + }, + { + "epoch": 11.37432678522996, + "grad_norm": 1.471964628906824, + "learning_rate": 5.687383756974582e-07, + "loss": 0.8882, + "step": 146780 + }, + { + "epoch": 11.375101708706266, + "grad_norm": 1.4706709194689762, + "learning_rate": 5.687771233725977e-07, + "loss": 0.8899, + "step": 146790 + }, + { + "epoch": 11.375876632182571, + "grad_norm": 1.4988991267613316, + "learning_rate": 5.688158710477372e-07, + "loss": 0.88, + "step": 146800 + }, + { + "epoch": 11.376651555658878, + "grad_norm": 1.4929910126272572, + "learning_rate": 5.688546187228766e-07, + "loss": 0.8803, + "step": 146810 + }, + { + "epoch": 11.377426479135185, + "grad_norm": 1.5400325226152582, + "learning_rate": 5.688933663980162e-07, + "loss": 0.8811, + "step": 146820 + }, + { + "epoch": 11.378201402611491, + "grad_norm": 1.444121448893791, + "learning_rate": 5.689321140731557e-07, + "loss": 0.8918, + "step": 146830 + }, + { + "epoch": 11.378976326087798, + "grad_norm": 1.4417857831862202, + "learning_rate": 5.689708617482952e-07, + "loss": 0.8832, + "step": 146840 + }, + { + "epoch": 11.379751249564105, + "grad_norm": 1.558423695361138, + "learning_rate": 5.690096094234346e-07, + "loss": 0.9036, + "step": 146850 + }, + { + "epoch": 11.380526173040412, + "grad_norm": 1.401264039646215, + "learning_rate": 5.690483570985741e-07, + "loss": 0.8768, + "step": 146860 + }, + { + "epoch": 11.381301096516719, + "grad_norm": 1.555024057224775, + "learning_rate": 5.690871047737137e-07, + "loss": 0.8925, + "step": 146870 + }, + { + "epoch": 11.382076019993026, + "grad_norm": 1.5692818042654033, + "learning_rate": 5.691258524488531e-07, + "loss": 0.8685, + "step": 146880 + }, + { + "epoch": 11.382850943469332, + "grad_norm": 1.434668081990368, + "learning_rate": 5.691646001239926e-07, + "loss": 0.8852, + "step": 146890 + }, + { + "epoch": 11.38362586694564, + "grad_norm": 1.5062112771348628, + "learning_rate": 5.692033477991321e-07, + "loss": 0.8852, + "step": 146900 + }, + { + "epoch": 11.384400790421946, + "grad_norm": 1.4847338475515528, + "learning_rate": 5.692420954742717e-07, + "loss": 0.8835, + "step": 146910 + }, + { + "epoch": 11.385175713898253, + "grad_norm": 1.5256364200436257, + "learning_rate": 5.692808431494111e-07, + "loss": 0.8731, + "step": 146920 + }, + { + "epoch": 11.38595063737456, + "grad_norm": 1.538321998543284, + "learning_rate": 5.693195908245506e-07, + "loss": 0.8991, + "step": 146930 + }, + { + "epoch": 11.386725560850866, + "grad_norm": 1.4292593750546991, + "learning_rate": 5.693583384996901e-07, + "loss": 0.8794, + "step": 146940 + }, + { + "epoch": 11.387500484327173, + "grad_norm": 1.3753701942130967, + "learning_rate": 5.693970861748295e-07, + "loss": 0.8874, + "step": 146950 + }, + { + "epoch": 11.38827540780348, + "grad_norm": 1.5228611808135817, + "learning_rate": 5.69435833849969e-07, + "loss": 0.8722, + "step": 146960 + }, + { + "epoch": 11.389050331279787, + "grad_norm": 1.5065534779517153, + "learning_rate": 5.694745815251086e-07, + "loss": 0.894, + "step": 146970 + }, + { + "epoch": 11.389825254756094, + "grad_norm": 1.6181247209675407, + "learning_rate": 5.695133292002481e-07, + "loss": 0.9156, + "step": 146980 + }, + { + "epoch": 11.3906001782324, + "grad_norm": 1.53248986857921, + "learning_rate": 5.695520768753875e-07, + "loss": 0.8836, + "step": 146990 + }, + { + "epoch": 11.391375101708705, + "grad_norm": 1.3936040970148922, + "learning_rate": 5.69590824550527e-07, + "loss": 0.8951, + "step": 147000 + }, + { + "epoch": 11.391375101708705, + "eval_loss": 0.9020092487335205, + "eval_runtime": 330.5219, + "eval_samples_per_second": 34.706, + "eval_steps_per_second": 8.677, + "step": 147000 + }, + { + "epoch": 11.392150025185012, + "grad_norm": 1.4321268684945763, + "learning_rate": 5.696295722256666e-07, + "loss": 0.9007, + "step": 147010 + }, + { + "epoch": 11.392924948661319, + "grad_norm": 1.3911336106926457, + "learning_rate": 5.69668319900806e-07, + "loss": 0.8851, + "step": 147020 + }, + { + "epoch": 11.393699872137626, + "grad_norm": 1.5083969603400442, + "learning_rate": 5.697070675759455e-07, + "loss": 0.8778, + "step": 147030 + }, + { + "epoch": 11.394474795613933, + "grad_norm": 1.3563299296260993, + "learning_rate": 5.69745815251085e-07, + "loss": 0.8759, + "step": 147040 + }, + { + "epoch": 11.39524971909024, + "grad_norm": 1.5064066252979347, + "learning_rate": 5.697845629262245e-07, + "loss": 0.8654, + "step": 147050 + }, + { + "epoch": 11.396024642566546, + "grad_norm": 1.4220794858815338, + "learning_rate": 5.69823310601364e-07, + "loss": 0.8904, + "step": 147060 + }, + { + "epoch": 11.396799566042853, + "grad_norm": 1.5234568596881988, + "learning_rate": 5.698620582765035e-07, + "loss": 0.8855, + "step": 147070 + }, + { + "epoch": 11.39757448951916, + "grad_norm": 1.498074870286036, + "learning_rate": 5.69900805951643e-07, + "loss": 0.8919, + "step": 147080 + }, + { + "epoch": 11.398349412995467, + "grad_norm": 1.4264371517983918, + "learning_rate": 5.699395536267824e-07, + "loss": 0.8647, + "step": 147090 + }, + { + "epoch": 11.399124336471774, + "grad_norm": 1.4780830015162398, + "learning_rate": 5.699783013019219e-07, + "loss": 0.9154, + "step": 147100 + }, + { + "epoch": 11.39989925994808, + "grad_norm": 1.4850234010023848, + "learning_rate": 5.700170489770615e-07, + "loss": 0.8826, + "step": 147110 + }, + { + "epoch": 11.400674183424387, + "grad_norm": 1.5118659292460086, + "learning_rate": 5.70055796652201e-07, + "loss": 0.9265, + "step": 147120 + }, + { + "epoch": 11.401449106900694, + "grad_norm": 1.4100281150420904, + "learning_rate": 5.700945443273404e-07, + "loss": 0.8883, + "step": 147130 + }, + { + "epoch": 11.402224030377, + "grad_norm": 1.5241683617040414, + "learning_rate": 5.701332920024799e-07, + "loss": 0.9044, + "step": 147140 + }, + { + "epoch": 11.402998953853308, + "grad_norm": 1.525761156038103, + "learning_rate": 5.701720396776194e-07, + "loss": 0.8702, + "step": 147150 + }, + { + "epoch": 11.403773877329614, + "grad_norm": 1.5068914473132233, + "learning_rate": 5.702107873527589e-07, + "loss": 0.8916, + "step": 147160 + }, + { + "epoch": 11.404548800805921, + "grad_norm": 1.41539250055816, + "learning_rate": 5.702495350278984e-07, + "loss": 0.915, + "step": 147170 + }, + { + "epoch": 11.405323724282226, + "grad_norm": 1.5300942347377744, + "learning_rate": 5.702882827030379e-07, + "loss": 0.9073, + "step": 147180 + }, + { + "epoch": 11.406098647758533, + "grad_norm": 1.4992691126106072, + "learning_rate": 5.703270303781774e-07, + "loss": 0.8767, + "step": 147190 + }, + { + "epoch": 11.40687357123484, + "grad_norm": 1.4354479256073371, + "learning_rate": 5.703657780533168e-07, + "loss": 0.8931, + "step": 147200 + }, + { + "epoch": 11.407648494711147, + "grad_norm": 1.592271210843576, + "learning_rate": 5.704045257284564e-07, + "loss": 0.8561, + "step": 147210 + }, + { + "epoch": 11.408423418187454, + "grad_norm": 1.4586352248754684, + "learning_rate": 5.704432734035959e-07, + "loss": 0.8957, + "step": 147220 + }, + { + "epoch": 11.40919834166376, + "grad_norm": 1.4919490904465322, + "learning_rate": 5.704820210787353e-07, + "loss": 0.8851, + "step": 147230 + }, + { + "epoch": 11.409973265140067, + "grad_norm": 1.525488777007342, + "learning_rate": 5.705207687538748e-07, + "loss": 0.8907, + "step": 147240 + }, + { + "epoch": 11.410748188616374, + "grad_norm": 1.5049061801456864, + "learning_rate": 5.705595164290143e-07, + "loss": 0.8867, + "step": 147250 + }, + { + "epoch": 11.41152311209268, + "grad_norm": 1.5095001928665674, + "learning_rate": 5.705982641041539e-07, + "loss": 0.894, + "step": 147260 + }, + { + "epoch": 11.412298035568988, + "grad_norm": 1.5104812967761563, + "learning_rate": 5.706370117792933e-07, + "loss": 0.8879, + "step": 147270 + }, + { + "epoch": 11.413072959045294, + "grad_norm": 1.6047424668417671, + "learning_rate": 5.706757594544328e-07, + "loss": 0.8886, + "step": 147280 + }, + { + "epoch": 11.413847882521601, + "grad_norm": 1.466718181900393, + "learning_rate": 5.707145071295723e-07, + "loss": 0.8852, + "step": 147290 + }, + { + "epoch": 11.414622805997908, + "grad_norm": 1.4333286953205173, + "learning_rate": 5.707532548047117e-07, + "loss": 0.881, + "step": 147300 + }, + { + "epoch": 11.415397729474215, + "grad_norm": 1.4090570018587198, + "learning_rate": 5.707920024798513e-07, + "loss": 0.8707, + "step": 147310 + }, + { + "epoch": 11.416172652950522, + "grad_norm": 1.4906017531778175, + "learning_rate": 5.708307501549908e-07, + "loss": 0.8776, + "step": 147320 + }, + { + "epoch": 11.416947576426828, + "grad_norm": 1.481143114796729, + "learning_rate": 5.708694978301303e-07, + "loss": 0.8952, + "step": 147330 + }, + { + "epoch": 11.417722499903135, + "grad_norm": 1.4983865521708757, + "learning_rate": 5.709082455052697e-07, + "loss": 0.8939, + "step": 147340 + }, + { + "epoch": 11.418497423379442, + "grad_norm": 1.4233381210082208, + "learning_rate": 5.709469931804092e-07, + "loss": 0.8862, + "step": 147350 + }, + { + "epoch": 11.419272346855749, + "grad_norm": 1.4459804371563192, + "learning_rate": 5.709857408555488e-07, + "loss": 0.8918, + "step": 147360 + }, + { + "epoch": 11.420047270332054, + "grad_norm": 1.446289568977879, + "learning_rate": 5.710244885306882e-07, + "loss": 0.8767, + "step": 147370 + }, + { + "epoch": 11.42082219380836, + "grad_norm": 1.405021712728326, + "learning_rate": 5.710632362058277e-07, + "loss": 0.8685, + "step": 147380 + }, + { + "epoch": 11.421597117284668, + "grad_norm": 1.4755859133642328, + "learning_rate": 5.711019838809672e-07, + "loss": 0.8961, + "step": 147390 + }, + { + "epoch": 11.422372040760974, + "grad_norm": 1.4602251302413578, + "learning_rate": 5.711407315561066e-07, + "loss": 0.9005, + "step": 147400 + }, + { + "epoch": 11.423146964237281, + "grad_norm": 1.4361209348006652, + "learning_rate": 5.711794792312462e-07, + "loss": 0.8938, + "step": 147410 + }, + { + "epoch": 11.423921887713588, + "grad_norm": 1.493232468572235, + "learning_rate": 5.712182269063857e-07, + "loss": 0.9066, + "step": 147420 + }, + { + "epoch": 11.424696811189895, + "grad_norm": 1.4431910513419883, + "learning_rate": 5.712569745815252e-07, + "loss": 0.8719, + "step": 147430 + }, + { + "epoch": 11.425471734666202, + "grad_norm": 1.56373802554539, + "learning_rate": 5.712957222566646e-07, + "loss": 0.9042, + "step": 147440 + }, + { + "epoch": 11.426246658142508, + "grad_norm": 1.5063651956402864, + "learning_rate": 5.713344699318041e-07, + "loss": 0.8855, + "step": 147450 + }, + { + "epoch": 11.427021581618815, + "grad_norm": 1.5829329063827347, + "learning_rate": 5.713732176069437e-07, + "loss": 0.8747, + "step": 147460 + }, + { + "epoch": 11.427796505095122, + "grad_norm": 1.4773201612944353, + "learning_rate": 5.714119652820831e-07, + "loss": 0.8887, + "step": 147470 + }, + { + "epoch": 11.428571428571429, + "grad_norm": 1.5277065559617529, + "learning_rate": 5.714507129572226e-07, + "loss": 0.9123, + "step": 147480 + }, + { + "epoch": 11.429346352047736, + "grad_norm": 1.4438630372779713, + "learning_rate": 5.714894606323621e-07, + "loss": 0.8833, + "step": 147490 + }, + { + "epoch": 11.430121275524042, + "grad_norm": 1.5210701962702473, + "learning_rate": 5.715282083075016e-07, + "loss": 0.8808, + "step": 147500 + }, + { + "epoch": 11.430121275524042, + "eval_loss": 0.90172278881073, + "eval_runtime": 333.1754, + "eval_samples_per_second": 34.429, + "eval_steps_per_second": 8.608, + "step": 147500 + }, + { + "epoch": 11.43089619900035, + "grad_norm": 1.480985295914981, + "learning_rate": 5.715669559826411e-07, + "loss": 0.8684, + "step": 147510 + }, + { + "epoch": 11.431671122476656, + "grad_norm": 1.4263677559882406, + "learning_rate": 5.716057036577806e-07, + "loss": 0.8974, + "step": 147520 + }, + { + "epoch": 11.432446045952963, + "grad_norm": 1.420857107817916, + "learning_rate": 5.716444513329201e-07, + "loss": 0.8804, + "step": 147530 + }, + { + "epoch": 11.43322096942927, + "grad_norm": 1.5267362137217502, + "learning_rate": 5.716831990080595e-07, + "loss": 0.8957, + "step": 147540 + }, + { + "epoch": 11.433995892905575, + "grad_norm": 1.5484194130571995, + "learning_rate": 5.71721946683199e-07, + "loss": 0.8832, + "step": 147550 + }, + { + "epoch": 11.434770816381882, + "grad_norm": 1.546941166269761, + "learning_rate": 5.717606943583386e-07, + "loss": 0.8908, + "step": 147560 + }, + { + "epoch": 11.435545739858188, + "grad_norm": 1.4972487854382135, + "learning_rate": 5.717994420334781e-07, + "loss": 0.8633, + "step": 147570 + }, + { + "epoch": 11.436320663334495, + "grad_norm": 1.4972104801973114, + "learning_rate": 5.718381897086175e-07, + "loss": 0.9068, + "step": 147580 + }, + { + "epoch": 11.437095586810802, + "grad_norm": 1.4682744460790016, + "learning_rate": 5.71876937383757e-07, + "loss": 0.8719, + "step": 147590 + }, + { + "epoch": 11.437870510287109, + "grad_norm": 1.527904180533729, + "learning_rate": 5.719156850588965e-07, + "loss": 0.901, + "step": 147600 + }, + { + "epoch": 11.438645433763416, + "grad_norm": 1.4935723973545614, + "learning_rate": 5.71954432734036e-07, + "loss": 0.8784, + "step": 147610 + }, + { + "epoch": 11.439420357239722, + "grad_norm": 1.474601573526804, + "learning_rate": 5.719931804091755e-07, + "loss": 0.9044, + "step": 147620 + }, + { + "epoch": 11.44019528071603, + "grad_norm": 1.484282741884123, + "learning_rate": 5.72031928084315e-07, + "loss": 0.8623, + "step": 147630 + }, + { + "epoch": 11.440970204192336, + "grad_norm": 1.5377429997963872, + "learning_rate": 5.720706757594545e-07, + "loss": 0.9389, + "step": 147640 + }, + { + "epoch": 11.441745127668643, + "grad_norm": 1.4311111677132102, + "learning_rate": 5.72109423434594e-07, + "loss": 0.9022, + "step": 147650 + }, + { + "epoch": 11.44252005114495, + "grad_norm": 1.4994052754109577, + "learning_rate": 5.721481711097335e-07, + "loss": 0.8892, + "step": 147660 + }, + { + "epoch": 11.443294974621256, + "grad_norm": 1.5476397163005928, + "learning_rate": 5.72186918784873e-07, + "loss": 0.8784, + "step": 147670 + }, + { + "epoch": 11.444069898097563, + "grad_norm": 1.468476879555866, + "learning_rate": 5.722256664600124e-07, + "loss": 0.899, + "step": 147680 + }, + { + "epoch": 11.44484482157387, + "grad_norm": 1.506781080716496, + "learning_rate": 5.722644141351519e-07, + "loss": 0.9094, + "step": 147690 + }, + { + "epoch": 11.445619745050177, + "grad_norm": 1.5033621872498069, + "learning_rate": 5.723031618102915e-07, + "loss": 0.8886, + "step": 147700 + }, + { + "epoch": 11.446394668526484, + "grad_norm": 1.4347142703077826, + "learning_rate": 5.72341909485431e-07, + "loss": 0.8682, + "step": 147710 + }, + { + "epoch": 11.44716959200279, + "grad_norm": 1.4985819698123752, + "learning_rate": 5.723806571605704e-07, + "loss": 0.8947, + "step": 147720 + }, + { + "epoch": 11.447944515479097, + "grad_norm": 1.5853611670735759, + "learning_rate": 5.724194048357099e-07, + "loss": 0.8862, + "step": 147730 + }, + { + "epoch": 11.448719438955402, + "grad_norm": 1.394566501016105, + "learning_rate": 5.724581525108494e-07, + "loss": 0.8939, + "step": 147740 + }, + { + "epoch": 11.44949436243171, + "grad_norm": 1.4399981839852156, + "learning_rate": 5.724969001859888e-07, + "loss": 0.8856, + "step": 147750 + }, + { + "epoch": 11.450269285908016, + "grad_norm": 1.4423639607708063, + "learning_rate": 5.725356478611284e-07, + "loss": 0.8742, + "step": 147760 + }, + { + "epoch": 11.451044209384323, + "grad_norm": 1.539656939692969, + "learning_rate": 5.725743955362679e-07, + "loss": 0.8736, + "step": 147770 + }, + { + "epoch": 11.45181913286063, + "grad_norm": 1.4598022884079143, + "learning_rate": 5.726131432114074e-07, + "loss": 0.8799, + "step": 147780 + }, + { + "epoch": 11.452594056336936, + "grad_norm": 1.5575650389850215, + "learning_rate": 5.726518908865468e-07, + "loss": 0.9283, + "step": 147790 + }, + { + "epoch": 11.453368979813243, + "grad_norm": 1.4761108749787142, + "learning_rate": 5.726906385616864e-07, + "loss": 0.8909, + "step": 147800 + }, + { + "epoch": 11.45414390328955, + "grad_norm": 1.5063810650094993, + "learning_rate": 5.727293862368259e-07, + "loss": 0.8855, + "step": 147810 + }, + { + "epoch": 11.454918826765857, + "grad_norm": 1.5317700028161583, + "learning_rate": 5.727681339119653e-07, + "loss": 0.8868, + "step": 147820 + }, + { + "epoch": 11.455693750242164, + "grad_norm": 1.4256039675209937, + "learning_rate": 5.728068815871048e-07, + "loss": 0.8957, + "step": 147830 + }, + { + "epoch": 11.45646867371847, + "grad_norm": 1.5534838721862771, + "learning_rate": 5.728456292622443e-07, + "loss": 0.8895, + "step": 147840 + }, + { + "epoch": 11.457243597194777, + "grad_norm": 1.4575417182959565, + "learning_rate": 5.728843769373839e-07, + "loss": 0.8924, + "step": 147850 + }, + { + "epoch": 11.458018520671084, + "grad_norm": 1.4113439549986568, + "learning_rate": 5.729231246125233e-07, + "loss": 0.8895, + "step": 147860 + }, + { + "epoch": 11.45879344414739, + "grad_norm": 1.504566390124061, + "learning_rate": 5.729618722876628e-07, + "loss": 0.9035, + "step": 147870 + }, + { + "epoch": 11.459568367623698, + "grad_norm": 1.5119534884241148, + "learning_rate": 5.730006199628023e-07, + "loss": 0.8635, + "step": 147880 + }, + { + "epoch": 11.460343291100004, + "grad_norm": 1.424696813423707, + "learning_rate": 5.730393676379417e-07, + "loss": 0.8824, + "step": 147890 + }, + { + "epoch": 11.461118214576311, + "grad_norm": 1.5066749743988577, + "learning_rate": 5.730781153130813e-07, + "loss": 0.8799, + "step": 147900 + }, + { + "epoch": 11.461893138052618, + "grad_norm": 1.4236364583896177, + "learning_rate": 5.731168629882208e-07, + "loss": 0.8946, + "step": 147910 + }, + { + "epoch": 11.462668061528923, + "grad_norm": 1.4637054904169258, + "learning_rate": 5.731556106633603e-07, + "loss": 0.8964, + "step": 147920 + }, + { + "epoch": 11.46344298500523, + "grad_norm": 1.4330197573309733, + "learning_rate": 5.731943583384997e-07, + "loss": 0.879, + "step": 147930 + }, + { + "epoch": 11.464217908481537, + "grad_norm": 1.4196981239913675, + "learning_rate": 5.732331060136392e-07, + "loss": 0.9091, + "step": 147940 + }, + { + "epoch": 11.464992831957844, + "grad_norm": 1.5100820572557405, + "learning_rate": 5.732718536887788e-07, + "loss": 0.8883, + "step": 147950 + }, + { + "epoch": 11.46576775543415, + "grad_norm": 1.453220702687978, + "learning_rate": 5.733106013639182e-07, + "loss": 0.8912, + "step": 147960 + }, + { + "epoch": 11.466542678910457, + "grad_norm": 1.5097303848835057, + "learning_rate": 5.733493490390577e-07, + "loss": 0.8872, + "step": 147970 + }, + { + "epoch": 11.467317602386764, + "grad_norm": 1.5747095050245414, + "learning_rate": 5.733880967141972e-07, + "loss": 0.8844, + "step": 147980 + }, + { + "epoch": 11.46809252586307, + "grad_norm": 1.615680646404009, + "learning_rate": 5.734268443893367e-07, + "loss": 0.8629, + "step": 147990 + }, + { + "epoch": 11.468867449339378, + "grad_norm": 1.501588689584499, + "learning_rate": 5.734655920644762e-07, + "loss": 0.9043, + "step": 148000 + }, + { + "epoch": 11.468867449339378, + "eval_loss": 0.9014586806297302, + "eval_runtime": 328.6523, + "eval_samples_per_second": 34.903, + "eval_steps_per_second": 8.727, + "step": 148000 + }, + { + "epoch": 11.469642372815684, + "grad_norm": 1.4868817940721086, + "learning_rate": 5.735043397396157e-07, + "loss": 0.8881, + "step": 148010 + }, + { + "epoch": 11.470417296291991, + "grad_norm": 1.5327690427286138, + "learning_rate": 5.735430874147552e-07, + "loss": 0.89, + "step": 148020 + }, + { + "epoch": 11.471192219768298, + "grad_norm": 1.3815441334794474, + "learning_rate": 5.735818350898946e-07, + "loss": 0.8965, + "step": 148030 + }, + { + "epoch": 11.471967143244605, + "grad_norm": 1.4502889768524416, + "learning_rate": 5.736205827650341e-07, + "loss": 0.8808, + "step": 148040 + }, + { + "epoch": 11.472742066720912, + "grad_norm": 1.4232301817848125, + "learning_rate": 5.736593304401737e-07, + "loss": 0.8754, + "step": 148050 + }, + { + "epoch": 11.473516990197218, + "grad_norm": 1.4165834071606498, + "learning_rate": 5.736980781153132e-07, + "loss": 0.9095, + "step": 148060 + }, + { + "epoch": 11.474291913673525, + "grad_norm": 1.5001524790108287, + "learning_rate": 5.737368257904526e-07, + "loss": 0.8677, + "step": 148070 + }, + { + "epoch": 11.475066837149832, + "grad_norm": 1.4959525343839792, + "learning_rate": 5.737755734655921e-07, + "loss": 0.8969, + "step": 148080 + }, + { + "epoch": 11.475841760626139, + "grad_norm": 1.4883361504940138, + "learning_rate": 5.738143211407316e-07, + "loss": 0.8706, + "step": 148090 + }, + { + "epoch": 11.476616684102446, + "grad_norm": 1.4661742070538126, + "learning_rate": 5.738530688158711e-07, + "loss": 0.868, + "step": 148100 + }, + { + "epoch": 11.47739160757875, + "grad_norm": 1.5165933090895451, + "learning_rate": 5.738918164910106e-07, + "loss": 0.8692, + "step": 148110 + }, + { + "epoch": 11.478166531055058, + "grad_norm": 1.4727970761430496, + "learning_rate": 5.739305641661501e-07, + "loss": 0.8631, + "step": 148120 + }, + { + "epoch": 11.478941454531364, + "grad_norm": 1.5219523705536024, + "learning_rate": 5.739693118412896e-07, + "loss": 0.8663, + "step": 148130 + }, + { + "epoch": 11.479716378007671, + "grad_norm": 1.6142283052514763, + "learning_rate": 5.74008059516429e-07, + "loss": 0.891, + "step": 148140 + }, + { + "epoch": 11.480491301483978, + "grad_norm": 1.4746537192928129, + "learning_rate": 5.740468071915686e-07, + "loss": 0.8776, + "step": 148150 + }, + { + "epoch": 11.481266224960285, + "grad_norm": 1.4596867367046134, + "learning_rate": 5.740855548667081e-07, + "loss": 0.8799, + "step": 148160 + }, + { + "epoch": 11.482041148436592, + "grad_norm": 1.4820916860939297, + "learning_rate": 5.741243025418475e-07, + "loss": 0.8838, + "step": 148170 + }, + { + "epoch": 11.482816071912898, + "grad_norm": 1.4767633151756434, + "learning_rate": 5.74163050216987e-07, + "loss": 0.8784, + "step": 148180 + }, + { + "epoch": 11.483590995389205, + "grad_norm": 1.5017554909418505, + "learning_rate": 5.742017978921265e-07, + "loss": 0.9178, + "step": 148190 + }, + { + "epoch": 11.484365918865512, + "grad_norm": 1.5689483603210752, + "learning_rate": 5.742405455672661e-07, + "loss": 0.9014, + "step": 148200 + }, + { + "epoch": 11.485140842341819, + "grad_norm": 1.5063732449711373, + "learning_rate": 5.742792932424055e-07, + "loss": 0.8918, + "step": 148210 + }, + { + "epoch": 11.485915765818126, + "grad_norm": 1.4776025909350616, + "learning_rate": 5.74318040917545e-07, + "loss": 0.8643, + "step": 148220 + }, + { + "epoch": 11.486690689294432, + "grad_norm": 1.4687855215457495, + "learning_rate": 5.743567885926845e-07, + "loss": 0.8839, + "step": 148230 + }, + { + "epoch": 11.48746561277074, + "grad_norm": 1.4760552043129227, + "learning_rate": 5.743955362678239e-07, + "loss": 0.8921, + "step": 148240 + }, + { + "epoch": 11.488240536247046, + "grad_norm": 1.425341527751119, + "learning_rate": 5.744342839429635e-07, + "loss": 0.8927, + "step": 148250 + }, + { + "epoch": 11.489015459723353, + "grad_norm": 1.478392292126367, + "learning_rate": 5.74473031618103e-07, + "loss": 0.8817, + "step": 148260 + }, + { + "epoch": 11.48979038319966, + "grad_norm": 1.4567611741552755, + "learning_rate": 5.745117792932425e-07, + "loss": 0.876, + "step": 148270 + }, + { + "epoch": 11.490565306675967, + "grad_norm": 1.472453614607777, + "learning_rate": 5.745505269683819e-07, + "loss": 0.8817, + "step": 148280 + }, + { + "epoch": 11.491340230152272, + "grad_norm": 1.4808696862312427, + "learning_rate": 5.745892746435214e-07, + "loss": 0.8806, + "step": 148290 + }, + { + "epoch": 11.492115153628578, + "grad_norm": 1.4476973573379328, + "learning_rate": 5.74628022318661e-07, + "loss": 0.9113, + "step": 148300 + }, + { + "epoch": 11.492890077104885, + "grad_norm": 1.4639543507504813, + "learning_rate": 5.746667699938004e-07, + "loss": 0.8889, + "step": 148310 + }, + { + "epoch": 11.493665000581192, + "grad_norm": 1.44824310010544, + "learning_rate": 5.747055176689399e-07, + "loss": 0.8817, + "step": 148320 + }, + { + "epoch": 11.494439924057499, + "grad_norm": 1.420248506126764, + "learning_rate": 5.747442653440794e-07, + "loss": 0.9001, + "step": 148330 + }, + { + "epoch": 11.495214847533806, + "grad_norm": 1.4595525492221721, + "learning_rate": 5.74783013019219e-07, + "loss": 0.8843, + "step": 148340 + }, + { + "epoch": 11.495989771010112, + "grad_norm": 1.5266024404483798, + "learning_rate": 5.748217606943584e-07, + "loss": 0.8913, + "step": 148350 + }, + { + "epoch": 11.49676469448642, + "grad_norm": 1.4776909041255002, + "learning_rate": 5.748605083694979e-07, + "loss": 0.8891, + "step": 148360 + }, + { + "epoch": 11.497539617962726, + "grad_norm": 1.5468018919375404, + "learning_rate": 5.748992560446374e-07, + "loss": 0.876, + "step": 148370 + }, + { + "epoch": 11.498314541439033, + "grad_norm": 1.4594924619466931, + "learning_rate": 5.749380037197768e-07, + "loss": 0.8819, + "step": 148380 + }, + { + "epoch": 11.49908946491534, + "grad_norm": 1.443980737463145, + "learning_rate": 5.749767513949164e-07, + "loss": 0.8864, + "step": 148390 + }, + { + "epoch": 11.499864388391646, + "grad_norm": 1.601484313227689, + "learning_rate": 5.750154990700559e-07, + "loss": 0.9011, + "step": 148400 + }, + { + "epoch": 11.500639311867953, + "grad_norm": 1.4480638535710115, + "learning_rate": 5.750542467451954e-07, + "loss": 0.9047, + "step": 148410 + }, + { + "epoch": 11.50141423534426, + "grad_norm": 1.4127484827218597, + "learning_rate": 5.750929944203348e-07, + "loss": 0.8662, + "step": 148420 + }, + { + "epoch": 11.502189158820567, + "grad_norm": 1.447276601715429, + "learning_rate": 5.751317420954743e-07, + "loss": 0.9028, + "step": 148430 + }, + { + "epoch": 11.502964082296874, + "grad_norm": 1.4245024132206552, + "learning_rate": 5.751704897706139e-07, + "loss": 0.885, + "step": 148440 + }, + { + "epoch": 11.50373900577318, + "grad_norm": 1.5142438213040443, + "learning_rate": 5.752092374457533e-07, + "loss": 0.9082, + "step": 148450 + }, + { + "epoch": 11.504513929249487, + "grad_norm": 1.456842902037926, + "learning_rate": 5.752479851208928e-07, + "loss": 0.8896, + "step": 148460 + }, + { + "epoch": 11.505288852725794, + "grad_norm": 1.4420102584914931, + "learning_rate": 5.752867327960323e-07, + "loss": 0.8918, + "step": 148470 + }, + { + "epoch": 11.506063776202101, + "grad_norm": 1.5480656974300009, + "learning_rate": 5.753254804711718e-07, + "loss": 0.8913, + "step": 148480 + }, + { + "epoch": 11.506838699678406, + "grad_norm": 1.462157333111305, + "learning_rate": 5.753642281463113e-07, + "loss": 0.8774, + "step": 148490 + }, + { + "epoch": 11.507613623154713, + "grad_norm": 1.5029464601738896, + "learning_rate": 5.754029758214508e-07, + "loss": 0.8785, + "step": 148500 + }, + { + "epoch": 11.507613623154713, + "eval_loss": 0.9013211131095886, + "eval_runtime": 330.3369, + "eval_samples_per_second": 34.725, + "eval_steps_per_second": 8.682, + "step": 148500 + }, + { + "epoch": 11.50838854663102, + "grad_norm": 1.4953793622797729, + "learning_rate": 5.754417234965903e-07, + "loss": 0.8775, + "step": 148510 + }, + { + "epoch": 11.509163470107326, + "grad_norm": 1.5201763293679722, + "learning_rate": 5.754804711717297e-07, + "loss": 0.9114, + "step": 148520 + }, + { + "epoch": 11.509938393583633, + "grad_norm": 1.508827986373488, + "learning_rate": 5.755192188468692e-07, + "loss": 0.8965, + "step": 148530 + }, + { + "epoch": 11.51071331705994, + "grad_norm": 1.341542453129083, + "learning_rate": 5.755579665220088e-07, + "loss": 0.888, + "step": 148540 + }, + { + "epoch": 11.511488240536247, + "grad_norm": 1.4597978325019765, + "learning_rate": 5.755967141971483e-07, + "loss": 0.8792, + "step": 148550 + }, + { + "epoch": 11.512263164012554, + "grad_norm": 1.4823975019764035, + "learning_rate": 5.756354618722877e-07, + "loss": 0.9037, + "step": 148560 + }, + { + "epoch": 11.51303808748886, + "grad_norm": 1.5546512587456165, + "learning_rate": 5.756742095474272e-07, + "loss": 0.8825, + "step": 148570 + }, + { + "epoch": 11.513813010965167, + "grad_norm": 1.5495519511587539, + "learning_rate": 5.757129572225667e-07, + "loss": 0.889, + "step": 148580 + }, + { + "epoch": 11.514587934441474, + "grad_norm": 1.4877395998712213, + "learning_rate": 5.757517048977062e-07, + "loss": 0.8773, + "step": 148590 + }, + { + "epoch": 11.515362857917781, + "grad_norm": 1.4202884503091115, + "learning_rate": 5.757904525728457e-07, + "loss": 0.8964, + "step": 148600 + }, + { + "epoch": 11.516137781394088, + "grad_norm": 1.5555789434450624, + "learning_rate": 5.758292002479852e-07, + "loss": 0.8692, + "step": 148610 + }, + { + "epoch": 11.516912704870395, + "grad_norm": 1.4845677830730117, + "learning_rate": 5.758679479231247e-07, + "loss": 0.8785, + "step": 148620 + }, + { + "epoch": 11.517687628346701, + "grad_norm": 1.4897167029444371, + "learning_rate": 5.759066955982641e-07, + "loss": 0.9056, + "step": 148630 + }, + { + "epoch": 11.518462551823008, + "grad_norm": 1.5096064967781329, + "learning_rate": 5.759454432734037e-07, + "loss": 0.8634, + "step": 148640 + }, + { + "epoch": 11.519237475299315, + "grad_norm": 1.5529303533456547, + "learning_rate": 5.759841909485432e-07, + "loss": 0.8854, + "step": 148650 + }, + { + "epoch": 11.52001239877562, + "grad_norm": 1.464332974633574, + "learning_rate": 5.760229386236826e-07, + "loss": 0.8612, + "step": 148660 + }, + { + "epoch": 11.520787322251927, + "grad_norm": 1.4546807259152699, + "learning_rate": 5.760616862988221e-07, + "loss": 0.8938, + "step": 148670 + }, + { + "epoch": 11.521562245728234, + "grad_norm": 1.5250113247928192, + "learning_rate": 5.761004339739616e-07, + "loss": 0.8849, + "step": 148680 + }, + { + "epoch": 11.52233716920454, + "grad_norm": 1.5032006522376116, + "learning_rate": 5.761391816491012e-07, + "loss": 0.8846, + "step": 148690 + }, + { + "epoch": 11.523112092680847, + "grad_norm": 1.5079328089640873, + "learning_rate": 5.761779293242406e-07, + "loss": 0.8871, + "step": 148700 + }, + { + "epoch": 11.523887016157154, + "grad_norm": 1.4711184190954876, + "learning_rate": 5.762166769993801e-07, + "loss": 0.8863, + "step": 148710 + }, + { + "epoch": 11.52466193963346, + "grad_norm": 1.53266518025726, + "learning_rate": 5.762554246745196e-07, + "loss": 0.8846, + "step": 148720 + }, + { + "epoch": 11.525436863109768, + "grad_norm": 1.4646393294565685, + "learning_rate": 5.76294172349659e-07, + "loss": 0.8686, + "step": 148730 + }, + { + "epoch": 11.526211786586074, + "grad_norm": 1.45993295748011, + "learning_rate": 5.763329200247986e-07, + "loss": 0.8851, + "step": 148740 + }, + { + "epoch": 11.526986710062381, + "grad_norm": 1.4251302204417682, + "learning_rate": 5.763716676999381e-07, + "loss": 0.8619, + "step": 148750 + }, + { + "epoch": 11.527761633538688, + "grad_norm": 1.5274329906320188, + "learning_rate": 5.764104153750776e-07, + "loss": 0.9081, + "step": 148760 + }, + { + "epoch": 11.528536557014995, + "grad_norm": 1.4774494704358896, + "learning_rate": 5.76449163050217e-07, + "loss": 0.8766, + "step": 148770 + }, + { + "epoch": 11.529311480491302, + "grad_norm": 1.5213382727814249, + "learning_rate": 5.764879107253565e-07, + "loss": 0.9152, + "step": 148780 + }, + { + "epoch": 11.530086403967609, + "grad_norm": 1.3945063943702678, + "learning_rate": 5.765266584004961e-07, + "loss": 0.8962, + "step": 148790 + }, + { + "epoch": 11.530861327443915, + "grad_norm": 1.5521267413328184, + "learning_rate": 5.765654060756355e-07, + "loss": 0.9038, + "step": 148800 + }, + { + "epoch": 11.531636250920222, + "grad_norm": 1.4578016445219988, + "learning_rate": 5.76604153750775e-07, + "loss": 0.8761, + "step": 148810 + }, + { + "epoch": 11.532411174396529, + "grad_norm": 1.3437979770676958, + "learning_rate": 5.766429014259145e-07, + "loss": 0.879, + "step": 148820 + }, + { + "epoch": 11.533186097872836, + "grad_norm": 1.5547399740939312, + "learning_rate": 5.76681649101054e-07, + "loss": 0.8901, + "step": 148830 + }, + { + "epoch": 11.533961021349143, + "grad_norm": 1.3978677297426165, + "learning_rate": 5.767203967761935e-07, + "loss": 0.8745, + "step": 148840 + }, + { + "epoch": 11.53473594482545, + "grad_norm": 1.5585753445164783, + "learning_rate": 5.76759144451333e-07, + "loss": 0.9024, + "step": 148850 + }, + { + "epoch": 11.535510868301754, + "grad_norm": 1.4501426129142534, + "learning_rate": 5.767978921264725e-07, + "loss": 0.888, + "step": 148860 + }, + { + "epoch": 11.536285791778061, + "grad_norm": 1.421229194506164, + "learning_rate": 5.768366398016119e-07, + "loss": 0.8791, + "step": 148870 + }, + { + "epoch": 11.537060715254368, + "grad_norm": 1.4976589811202468, + "learning_rate": 5.768753874767514e-07, + "loss": 0.8965, + "step": 148880 + }, + { + "epoch": 11.537835638730675, + "grad_norm": 1.4586993323589368, + "learning_rate": 5.76914135151891e-07, + "loss": 0.8824, + "step": 148890 + }, + { + "epoch": 11.538610562206982, + "grad_norm": 1.4392734693758626, + "learning_rate": 5.769528828270304e-07, + "loss": 0.8773, + "step": 148900 + }, + { + "epoch": 11.539385485683288, + "grad_norm": 1.4635265222849925, + "learning_rate": 5.769916305021699e-07, + "loss": 0.8866, + "step": 148910 + }, + { + "epoch": 11.540160409159595, + "grad_norm": 1.4471991179869632, + "learning_rate": 5.770303781773094e-07, + "loss": 0.9156, + "step": 148920 + }, + { + "epoch": 11.540935332635902, + "grad_norm": 1.4437797476351575, + "learning_rate": 5.77069125852449e-07, + "loss": 0.8667, + "step": 148930 + }, + { + "epoch": 11.541710256112209, + "grad_norm": 1.4326441810791393, + "learning_rate": 5.771078735275884e-07, + "loss": 0.8882, + "step": 148940 + }, + { + "epoch": 11.542485179588516, + "grad_norm": 1.4849514622571174, + "learning_rate": 5.771466212027279e-07, + "loss": 0.8858, + "step": 148950 + }, + { + "epoch": 11.543260103064823, + "grad_norm": 1.5342577457675488, + "learning_rate": 5.771853688778674e-07, + "loss": 0.8987, + "step": 148960 + }, + { + "epoch": 11.54403502654113, + "grad_norm": 1.475377181334803, + "learning_rate": 5.772241165530068e-07, + "loss": 0.8889, + "step": 148970 + }, + { + "epoch": 11.544809950017436, + "grad_norm": 1.4624065587838002, + "learning_rate": 5.772628642281463e-07, + "loss": 0.8769, + "step": 148980 + }, + { + "epoch": 11.545584873493743, + "grad_norm": 1.4659635394376298, + "learning_rate": 5.773016119032859e-07, + "loss": 0.8831, + "step": 148990 + }, + { + "epoch": 11.54635979697005, + "grad_norm": 1.4663007305528437, + "learning_rate": 5.773403595784254e-07, + "loss": 0.8999, + "step": 149000 + }, + { + "epoch": 11.54635979697005, + "eval_loss": 0.9012012481689453, + "eval_runtime": 328.9822, + "eval_samples_per_second": 34.868, + "eval_steps_per_second": 8.718, + "step": 149000 + }, + { + "epoch": 11.547134720446357, + "grad_norm": 1.4564766622859955, + "learning_rate": 5.773791072535648e-07, + "loss": 0.8938, + "step": 149010 + }, + { + "epoch": 11.547909643922663, + "grad_norm": 1.43091654707339, + "learning_rate": 5.774178549287043e-07, + "loss": 0.8838, + "step": 149020 + }, + { + "epoch": 11.548684567398968, + "grad_norm": 1.5485819022212937, + "learning_rate": 5.774566026038439e-07, + "loss": 0.8835, + "step": 149030 + }, + { + "epoch": 11.549459490875275, + "grad_norm": 1.414325516542771, + "learning_rate": 5.774953502789833e-07, + "loss": 0.8727, + "step": 149040 + }, + { + "epoch": 11.550234414351582, + "grad_norm": 1.5284855830613955, + "learning_rate": 5.775340979541228e-07, + "loss": 0.9133, + "step": 149050 + }, + { + "epoch": 11.551009337827889, + "grad_norm": 1.4822219241123733, + "learning_rate": 5.775728456292623e-07, + "loss": 0.8825, + "step": 149060 + }, + { + "epoch": 11.551784261304196, + "grad_norm": 1.4613321908642858, + "learning_rate": 5.776115933044018e-07, + "loss": 0.8901, + "step": 149070 + }, + { + "epoch": 11.552559184780502, + "grad_norm": 1.4753137540252514, + "learning_rate": 5.776503409795412e-07, + "loss": 0.8941, + "step": 149080 + }, + { + "epoch": 11.55333410825681, + "grad_norm": 1.4578271516569934, + "learning_rate": 5.776890886546808e-07, + "loss": 0.8844, + "step": 149090 + }, + { + "epoch": 11.554109031733116, + "grad_norm": 1.503758823306561, + "learning_rate": 5.777278363298203e-07, + "loss": 0.8883, + "step": 149100 + }, + { + "epoch": 11.554883955209423, + "grad_norm": 1.5107823787564927, + "learning_rate": 5.777665840049597e-07, + "loss": 0.899, + "step": 149110 + }, + { + "epoch": 11.55565887868573, + "grad_norm": 1.462224021858976, + "learning_rate": 5.778053316800992e-07, + "loss": 0.9222, + "step": 149120 + }, + { + "epoch": 11.556433802162037, + "grad_norm": 1.5977209546537745, + "learning_rate": 5.778440793552388e-07, + "loss": 0.9029, + "step": 149130 + }, + { + "epoch": 11.557208725638343, + "grad_norm": 1.4710706608202415, + "learning_rate": 5.778828270303783e-07, + "loss": 0.8769, + "step": 149140 + }, + { + "epoch": 11.55798364911465, + "grad_norm": 1.4711828999118202, + "learning_rate": 5.779215747055177e-07, + "loss": 0.8868, + "step": 149150 + }, + { + "epoch": 11.558758572590957, + "grad_norm": 1.4844724401246887, + "learning_rate": 5.779603223806572e-07, + "loss": 0.8735, + "step": 149160 + }, + { + "epoch": 11.559533496067264, + "grad_norm": 1.4682600226774243, + "learning_rate": 5.779990700557967e-07, + "loss": 0.8908, + "step": 149170 + }, + { + "epoch": 11.56030841954357, + "grad_norm": 1.48721073577927, + "learning_rate": 5.780378177309362e-07, + "loss": 0.8765, + "step": 149180 + }, + { + "epoch": 11.561083343019877, + "grad_norm": 1.3982472230540002, + "learning_rate": 5.780765654060757e-07, + "loss": 0.8762, + "step": 149190 + }, + { + "epoch": 11.561858266496184, + "grad_norm": 1.4984397967329257, + "learning_rate": 5.781153130812152e-07, + "loss": 0.8623, + "step": 149200 + }, + { + "epoch": 11.562633189972491, + "grad_norm": 1.4768254160309175, + "learning_rate": 5.781540607563547e-07, + "loss": 0.8887, + "step": 149210 + }, + { + "epoch": 11.563408113448798, + "grad_norm": 1.465156966589591, + "learning_rate": 5.781928084314941e-07, + "loss": 0.8853, + "step": 149220 + }, + { + "epoch": 11.564183036925103, + "grad_norm": 1.388064614752612, + "learning_rate": 5.782315561066337e-07, + "loss": 0.9101, + "step": 149230 + }, + { + "epoch": 11.56495796040141, + "grad_norm": 1.5857738750988648, + "learning_rate": 5.782703037817732e-07, + "loss": 0.8932, + "step": 149240 + }, + { + "epoch": 11.565732883877716, + "grad_norm": 1.4446331147812008, + "learning_rate": 5.783090514569126e-07, + "loss": 0.8838, + "step": 149250 + }, + { + "epoch": 11.566507807354023, + "grad_norm": 1.4965967051749052, + "learning_rate": 5.783477991320521e-07, + "loss": 0.8767, + "step": 149260 + }, + { + "epoch": 11.56728273083033, + "grad_norm": 1.4370074204013568, + "learning_rate": 5.783865468071916e-07, + "loss": 0.8928, + "step": 149270 + }, + { + "epoch": 11.568057654306637, + "grad_norm": 1.4844269615090484, + "learning_rate": 5.784252944823312e-07, + "loss": 0.8783, + "step": 149280 + }, + { + "epoch": 11.568832577782944, + "grad_norm": 1.5374829140708788, + "learning_rate": 5.784640421574706e-07, + "loss": 0.8855, + "step": 149290 + }, + { + "epoch": 11.56960750125925, + "grad_norm": 1.459100330187715, + "learning_rate": 5.785027898326101e-07, + "loss": 0.8705, + "step": 149300 + }, + { + "epoch": 11.570382424735557, + "grad_norm": 1.404629505745215, + "learning_rate": 5.785415375077496e-07, + "loss": 0.8842, + "step": 149310 + }, + { + "epoch": 11.571157348211864, + "grad_norm": 1.4529635531121607, + "learning_rate": 5.78580285182889e-07, + "loss": 0.8887, + "step": 149320 + }, + { + "epoch": 11.571932271688171, + "grad_norm": 1.4803273975140074, + "learning_rate": 5.786190328580286e-07, + "loss": 0.8743, + "step": 149330 + }, + { + "epoch": 11.572707195164478, + "grad_norm": 1.3875715812738572, + "learning_rate": 5.786577805331681e-07, + "loss": 0.8747, + "step": 149340 + }, + { + "epoch": 11.573482118640785, + "grad_norm": 1.5573085084771578, + "learning_rate": 5.786965282083076e-07, + "loss": 0.8761, + "step": 149350 + }, + { + "epoch": 11.574257042117091, + "grad_norm": 1.5041964570587685, + "learning_rate": 5.78735275883447e-07, + "loss": 0.8785, + "step": 149360 + }, + { + "epoch": 11.575031965593398, + "grad_norm": 1.5428422420401813, + "learning_rate": 5.787740235585865e-07, + "loss": 0.8771, + "step": 149370 + }, + { + "epoch": 11.575806889069705, + "grad_norm": 1.4476878755321143, + "learning_rate": 5.788127712337261e-07, + "loss": 0.8761, + "step": 149380 + }, + { + "epoch": 11.576581812546012, + "grad_norm": 1.4203013882214306, + "learning_rate": 5.788515189088655e-07, + "loss": 0.8632, + "step": 149390 + }, + { + "epoch": 11.577356736022319, + "grad_norm": 1.4165476964064327, + "learning_rate": 5.78890266584005e-07, + "loss": 0.8828, + "step": 149400 + }, + { + "epoch": 11.578131659498624, + "grad_norm": 1.393164244476865, + "learning_rate": 5.789290142591445e-07, + "loss": 0.9062, + "step": 149410 + }, + { + "epoch": 11.57890658297493, + "grad_norm": 1.4827380810797282, + "learning_rate": 5.78967761934284e-07, + "loss": 0.8721, + "step": 149420 + }, + { + "epoch": 11.579681506451237, + "grad_norm": 1.5173725831021065, + "learning_rate": 5.790065096094235e-07, + "loss": 0.8662, + "step": 149430 + }, + { + "epoch": 11.580456429927544, + "grad_norm": 1.3970575778479895, + "learning_rate": 5.79045257284563e-07, + "loss": 0.8557, + "step": 149440 + }, + { + "epoch": 11.581231353403851, + "grad_norm": 1.631270908199103, + "learning_rate": 5.790840049597025e-07, + "loss": 0.8926, + "step": 149450 + }, + { + "epoch": 11.582006276880158, + "grad_norm": 1.530279204499324, + "learning_rate": 5.791227526348419e-07, + "loss": 0.9064, + "step": 149460 + }, + { + "epoch": 11.582781200356465, + "grad_norm": 1.4664972484916075, + "learning_rate": 5.791615003099814e-07, + "loss": 0.8715, + "step": 149470 + }, + { + "epoch": 11.583556123832771, + "grad_norm": 1.472418501660249, + "learning_rate": 5.79200247985121e-07, + "loss": 0.8682, + "step": 149480 + }, + { + "epoch": 11.584331047309078, + "grad_norm": 1.48249984466724, + "learning_rate": 5.792389956602605e-07, + "loss": 0.8725, + "step": 149490 + }, + { + "epoch": 11.585105970785385, + "grad_norm": 1.5513830901197616, + "learning_rate": 5.792777433353999e-07, + "loss": 0.8794, + "step": 149500 + }, + { + "epoch": 11.585105970785385, + "eval_loss": 0.9010752439498901, + "eval_runtime": 328.7425, + "eval_samples_per_second": 34.894, + "eval_steps_per_second": 8.724, + "step": 149500 + }, + { + "epoch": 11.585880894261692, + "grad_norm": 1.4839315030740639, + "learning_rate": 5.793164910105394e-07, + "loss": 0.8744, + "step": 149510 + }, + { + "epoch": 11.586655817737999, + "grad_norm": 1.502458098422954, + "learning_rate": 5.793552386856789e-07, + "loss": 0.8762, + "step": 149520 + }, + { + "epoch": 11.587430741214305, + "grad_norm": 1.4997665556538835, + "learning_rate": 5.793939863608184e-07, + "loss": 0.8777, + "step": 149530 + }, + { + "epoch": 11.588205664690612, + "grad_norm": 1.4842221839201757, + "learning_rate": 5.794327340359579e-07, + "loss": 0.9043, + "step": 149540 + }, + { + "epoch": 11.588980588166919, + "grad_norm": 1.4361775724733832, + "learning_rate": 5.794714817110974e-07, + "loss": 0.9099, + "step": 149550 + }, + { + "epoch": 11.589755511643226, + "grad_norm": 1.392277994364776, + "learning_rate": 5.795102293862369e-07, + "loss": 0.9018, + "step": 149560 + }, + { + "epoch": 11.590530435119533, + "grad_norm": 1.384555407292558, + "learning_rate": 5.795489770613763e-07, + "loss": 0.9008, + "step": 149570 + }, + { + "epoch": 11.59130535859584, + "grad_norm": 1.5355956842688325, + "learning_rate": 5.795877247365159e-07, + "loss": 0.8722, + "step": 149580 + }, + { + "epoch": 11.592080282072146, + "grad_norm": 1.5318898239681864, + "learning_rate": 5.796264724116554e-07, + "loss": 0.8875, + "step": 149590 + }, + { + "epoch": 11.592855205548451, + "grad_norm": 1.4669914600129599, + "learning_rate": 5.796652200867948e-07, + "loss": 0.8683, + "step": 149600 + }, + { + "epoch": 11.593630129024758, + "grad_norm": 1.4631288818443082, + "learning_rate": 5.797039677619343e-07, + "loss": 0.8864, + "step": 149610 + }, + { + "epoch": 11.594405052501065, + "grad_norm": 1.4615017622895758, + "learning_rate": 5.797427154370738e-07, + "loss": 0.88, + "step": 149620 + }, + { + "epoch": 11.595179975977372, + "grad_norm": 1.5460560045683007, + "learning_rate": 5.797814631122134e-07, + "loss": 0.876, + "step": 149630 + }, + { + "epoch": 11.595954899453679, + "grad_norm": 1.5275339557482328, + "learning_rate": 5.798202107873528e-07, + "loss": 0.9001, + "step": 149640 + }, + { + "epoch": 11.596729822929985, + "grad_norm": 1.4906840156104395, + "learning_rate": 5.798589584624923e-07, + "loss": 0.8974, + "step": 149650 + }, + { + "epoch": 11.597504746406292, + "grad_norm": 1.4833784838262216, + "learning_rate": 5.798977061376318e-07, + "loss": 0.8807, + "step": 149660 + }, + { + "epoch": 11.598279669882599, + "grad_norm": 1.4862911083200447, + "learning_rate": 5.799364538127712e-07, + "loss": 0.883, + "step": 149670 + }, + { + "epoch": 11.599054593358906, + "grad_norm": 1.4098683826766385, + "learning_rate": 5.799752014879108e-07, + "loss": 0.8877, + "step": 149680 + }, + { + "epoch": 11.599829516835213, + "grad_norm": 1.500882589908433, + "learning_rate": 5.800139491630503e-07, + "loss": 0.8748, + "step": 149690 + }, + { + "epoch": 11.60060444031152, + "grad_norm": 1.4557280562090549, + "learning_rate": 5.800526968381898e-07, + "loss": 0.8953, + "step": 149700 + }, + { + "epoch": 11.601379363787826, + "grad_norm": 1.4916444641111548, + "learning_rate": 5.800914445133292e-07, + "loss": 0.8782, + "step": 149710 + }, + { + "epoch": 11.602154287264133, + "grad_norm": 1.4103130892945852, + "learning_rate": 5.801301921884688e-07, + "loss": 0.8775, + "step": 149720 + }, + { + "epoch": 11.60292921074044, + "grad_norm": 1.5247170845529323, + "learning_rate": 5.801689398636083e-07, + "loss": 0.9046, + "step": 149730 + }, + { + "epoch": 11.603704134216747, + "grad_norm": 1.5790914539232077, + "learning_rate": 5.802076875387477e-07, + "loss": 0.8706, + "step": 149740 + }, + { + "epoch": 11.604479057693053, + "grad_norm": 1.5013805339109012, + "learning_rate": 5.802464352138872e-07, + "loss": 0.8961, + "step": 149750 + }, + { + "epoch": 11.60525398116936, + "grad_norm": 1.458824900730456, + "learning_rate": 5.802851828890267e-07, + "loss": 0.8842, + "step": 149760 + }, + { + "epoch": 11.606028904645667, + "grad_norm": 1.4164038248211293, + "learning_rate": 5.803239305641663e-07, + "loss": 0.8939, + "step": 149770 + }, + { + "epoch": 11.606803828121972, + "grad_norm": 1.4866578436401678, + "learning_rate": 5.803626782393057e-07, + "loss": 0.895, + "step": 149780 + }, + { + "epoch": 11.607578751598279, + "grad_norm": 1.392410819345245, + "learning_rate": 5.804014259144452e-07, + "loss": 0.8649, + "step": 149790 + }, + { + "epoch": 11.608353675074586, + "grad_norm": 1.4431885912495674, + "learning_rate": 5.804401735895847e-07, + "loss": 0.9072, + "step": 149800 + }, + { + "epoch": 11.609128598550893, + "grad_norm": 1.5011161446357, + "learning_rate": 5.804789212647241e-07, + "loss": 0.912, + "step": 149810 + }, + { + "epoch": 11.6099035220272, + "grad_norm": 1.4517488264491039, + "learning_rate": 5.805176689398637e-07, + "loss": 0.8783, + "step": 149820 + }, + { + "epoch": 11.610678445503506, + "grad_norm": 1.4177626444170397, + "learning_rate": 5.805564166150032e-07, + "loss": 0.8975, + "step": 149830 + }, + { + "epoch": 11.611453368979813, + "grad_norm": 1.4464263925739935, + "learning_rate": 5.805951642901427e-07, + "loss": 0.8988, + "step": 149840 + }, + { + "epoch": 11.61222829245612, + "grad_norm": 1.3919228598312643, + "learning_rate": 5.806339119652821e-07, + "loss": 0.8776, + "step": 149850 + }, + { + "epoch": 11.613003215932427, + "grad_norm": 1.5272797141905485, + "learning_rate": 5.806726596404216e-07, + "loss": 0.899, + "step": 149860 + }, + { + "epoch": 11.613778139408733, + "grad_norm": 1.4459036962157774, + "learning_rate": 5.807114073155612e-07, + "loss": 0.8833, + "step": 149870 + }, + { + "epoch": 11.61455306288504, + "grad_norm": 1.551894827203779, + "learning_rate": 5.807501549907006e-07, + "loss": 0.8917, + "step": 149880 + }, + { + "epoch": 11.615327986361347, + "grad_norm": 1.5398032646443038, + "learning_rate": 5.807889026658401e-07, + "loss": 0.8788, + "step": 149890 + }, + { + "epoch": 11.616102909837654, + "grad_norm": 1.5026795139706428, + "learning_rate": 5.808276503409796e-07, + "loss": 0.8857, + "step": 149900 + }, + { + "epoch": 11.61687783331396, + "grad_norm": 1.513498953509901, + "learning_rate": 5.808663980161191e-07, + "loss": 0.8795, + "step": 149910 + }, + { + "epoch": 11.617652756790267, + "grad_norm": 1.4789610052860422, + "learning_rate": 5.809051456912586e-07, + "loss": 0.8973, + "step": 149920 + }, + { + "epoch": 11.618427680266574, + "grad_norm": 1.4244579606269736, + "learning_rate": 5.809438933663981e-07, + "loss": 0.8912, + "step": 149930 + }, + { + "epoch": 11.619202603742881, + "grad_norm": 1.5351467385438413, + "learning_rate": 5.809826410415376e-07, + "loss": 0.8926, + "step": 149940 + }, + { + "epoch": 11.619977527219188, + "grad_norm": 1.401747826461292, + "learning_rate": 5.81021388716677e-07, + "loss": 0.8992, + "step": 149950 + }, + { + "epoch": 11.620752450695495, + "grad_norm": 1.5372667641896531, + "learning_rate": 5.810601363918165e-07, + "loss": 0.876, + "step": 149960 + }, + { + "epoch": 11.6215273741718, + "grad_norm": 1.416521076617453, + "learning_rate": 5.810988840669561e-07, + "loss": 0.9012, + "step": 149970 + }, + { + "epoch": 11.622302297648107, + "grad_norm": 1.506604753294481, + "learning_rate": 5.811376317420956e-07, + "loss": 0.8773, + "step": 149980 + }, + { + "epoch": 11.623077221124413, + "grad_norm": 1.5349963167292953, + "learning_rate": 5.81176379417235e-07, + "loss": 0.903, + "step": 149990 + }, + { + "epoch": 11.62385214460072, + "grad_norm": 1.4861931274985498, + "learning_rate": 5.812151270923745e-07, + "loss": 0.8879, + "step": 150000 + }, + { + "epoch": 11.62385214460072, + "eval_loss": 0.9007593989372253, + "eval_runtime": 327.0037, + "eval_samples_per_second": 35.079, + "eval_steps_per_second": 8.771, + "step": 150000 + }, + { + "epoch": 11.624627068077027, + "grad_norm": 1.418745868802665, + "learning_rate": 5.81253874767514e-07, + "loss": 0.8979, + "step": 150010 + }, + { + "epoch": 11.625401991553334, + "grad_norm": 1.4239179180983028, + "learning_rate": 5.812926224426535e-07, + "loss": 0.8848, + "step": 150020 + }, + { + "epoch": 11.62617691502964, + "grad_norm": 1.4592576748952244, + "learning_rate": 5.81331370117793e-07, + "loss": 0.8895, + "step": 150030 + }, + { + "epoch": 11.626951838505947, + "grad_norm": 1.3959526417489136, + "learning_rate": 5.813701177929325e-07, + "loss": 0.8865, + "step": 150040 + }, + { + "epoch": 11.627726761982254, + "grad_norm": 1.527535166204276, + "learning_rate": 5.81408865468072e-07, + "loss": 0.8706, + "step": 150050 + }, + { + "epoch": 11.628501685458561, + "grad_norm": 1.3892697634359439, + "learning_rate": 5.814476131432114e-07, + "loss": 0.8927, + "step": 150060 + }, + { + "epoch": 11.629276608934868, + "grad_norm": 1.5159766179139986, + "learning_rate": 5.81486360818351e-07, + "loss": 0.8733, + "step": 150070 + }, + { + "epoch": 11.630051532411175, + "grad_norm": 1.5205484677863415, + "learning_rate": 5.815251084934905e-07, + "loss": 0.9071, + "step": 150080 + }, + { + "epoch": 11.630826455887481, + "grad_norm": 1.5992177931318712, + "learning_rate": 5.815638561686299e-07, + "loss": 0.9134, + "step": 150090 + }, + { + "epoch": 11.631601379363788, + "grad_norm": 1.4511152644067973, + "learning_rate": 5.816026038437694e-07, + "loss": 0.8767, + "step": 150100 + }, + { + "epoch": 11.632376302840095, + "grad_norm": 1.474726878189188, + "learning_rate": 5.816413515189089e-07, + "loss": 0.8851, + "step": 150110 + }, + { + "epoch": 11.633151226316402, + "grad_norm": 1.3976294205837545, + "learning_rate": 5.816800991940485e-07, + "loss": 0.8956, + "step": 150120 + }, + { + "epoch": 11.633926149792709, + "grad_norm": 1.4909358537864412, + "learning_rate": 5.817188468691879e-07, + "loss": 0.8909, + "step": 150130 + }, + { + "epoch": 11.634701073269015, + "grad_norm": 1.4836409717242114, + "learning_rate": 5.817575945443274e-07, + "loss": 0.8773, + "step": 150140 + }, + { + "epoch": 11.63547599674532, + "grad_norm": 1.4559912334411258, + "learning_rate": 5.817963422194669e-07, + "loss": 0.8954, + "step": 150150 + }, + { + "epoch": 11.636250920221627, + "grad_norm": 1.519108842586579, + "learning_rate": 5.818350898946063e-07, + "loss": 0.8932, + "step": 150160 + }, + { + "epoch": 11.637025843697934, + "grad_norm": 1.3890302547937006, + "learning_rate": 5.818738375697459e-07, + "loss": 0.8879, + "step": 150170 + }, + { + "epoch": 11.637800767174241, + "grad_norm": 1.4738661066614476, + "learning_rate": 5.819125852448854e-07, + "loss": 0.8826, + "step": 150180 + }, + { + "epoch": 11.638575690650548, + "grad_norm": 1.481805812079676, + "learning_rate": 5.819513329200249e-07, + "loss": 0.8866, + "step": 150190 + }, + { + "epoch": 11.639350614126855, + "grad_norm": 1.4244854248406222, + "learning_rate": 5.819900805951643e-07, + "loss": 0.8825, + "step": 150200 + }, + { + "epoch": 11.640125537603161, + "grad_norm": 1.5603893271579021, + "learning_rate": 5.820288282703038e-07, + "loss": 0.9027, + "step": 150210 + }, + { + "epoch": 11.640900461079468, + "grad_norm": 1.3642558111280936, + "learning_rate": 5.820675759454434e-07, + "loss": 0.8922, + "step": 150220 + }, + { + "epoch": 11.641675384555775, + "grad_norm": 1.4702950335357527, + "learning_rate": 5.821063236205828e-07, + "loss": 0.8678, + "step": 150230 + }, + { + "epoch": 11.642450308032082, + "grad_norm": 1.5076151374830007, + "learning_rate": 5.821450712957223e-07, + "loss": 0.8924, + "step": 150240 + }, + { + "epoch": 11.643225231508389, + "grad_norm": 1.5494878162162418, + "learning_rate": 5.821838189708618e-07, + "loss": 0.8933, + "step": 150250 + }, + { + "epoch": 11.644000154984695, + "grad_norm": 1.5190830472544548, + "learning_rate": 5.822225666460013e-07, + "loss": 0.8768, + "step": 150260 + }, + { + "epoch": 11.644775078461002, + "grad_norm": 1.4352112134931803, + "learning_rate": 5.822613143211408e-07, + "loss": 0.8878, + "step": 150270 + }, + { + "epoch": 11.645550001937309, + "grad_norm": 1.4432331050913325, + "learning_rate": 5.823000619962803e-07, + "loss": 0.8864, + "step": 150280 + }, + { + "epoch": 11.646324925413616, + "grad_norm": 1.499745310531178, + "learning_rate": 5.823388096714198e-07, + "loss": 0.8756, + "step": 150290 + }, + { + "epoch": 11.647099848889923, + "grad_norm": 1.6242419565799484, + "learning_rate": 5.823775573465592e-07, + "loss": 0.8882, + "step": 150300 + }, + { + "epoch": 11.64787477236623, + "grad_norm": 1.5013568077277162, + "learning_rate": 5.824163050216987e-07, + "loss": 0.8806, + "step": 150310 + }, + { + "epoch": 11.648649695842536, + "grad_norm": 1.5428872172002142, + "learning_rate": 5.824550526968383e-07, + "loss": 0.8772, + "step": 150320 + }, + { + "epoch": 11.649424619318843, + "grad_norm": 1.4504383448866356, + "learning_rate": 5.824938003719777e-07, + "loss": 0.8808, + "step": 150330 + }, + { + "epoch": 11.65019954279515, + "grad_norm": 1.519323238797721, + "learning_rate": 5.825325480471172e-07, + "loss": 0.8864, + "step": 150340 + }, + { + "epoch": 11.650974466271455, + "grad_norm": 1.422307056825973, + "learning_rate": 5.825712957222567e-07, + "loss": 0.8971, + "step": 150350 + }, + { + "epoch": 11.651749389747762, + "grad_norm": 1.6042947640571306, + "learning_rate": 5.826100433973963e-07, + "loss": 0.9005, + "step": 150360 + }, + { + "epoch": 11.652524313224069, + "grad_norm": 1.4661330038292109, + "learning_rate": 5.826487910725357e-07, + "loss": 0.8967, + "step": 150370 + }, + { + "epoch": 11.653299236700375, + "grad_norm": 1.5139353073346362, + "learning_rate": 5.826875387476752e-07, + "loss": 0.8924, + "step": 150380 + }, + { + "epoch": 11.654074160176682, + "grad_norm": 1.5187493250374757, + "learning_rate": 5.827262864228147e-07, + "loss": 0.876, + "step": 150390 + }, + { + "epoch": 11.654849083652989, + "grad_norm": 1.4474946135844844, + "learning_rate": 5.827650340979541e-07, + "loss": 0.8918, + "step": 150400 + }, + { + "epoch": 11.655624007129296, + "grad_norm": 1.5526492564175192, + "learning_rate": 5.828037817730936e-07, + "loss": 0.8914, + "step": 150410 + }, + { + "epoch": 11.656398930605603, + "grad_norm": 1.3670848081773257, + "learning_rate": 5.828425294482332e-07, + "loss": 0.8696, + "step": 150420 + }, + { + "epoch": 11.65717385408191, + "grad_norm": 1.4871756905914442, + "learning_rate": 5.828812771233727e-07, + "loss": 0.8846, + "step": 150430 + }, + { + "epoch": 11.657948777558216, + "grad_norm": 1.4603785913374476, + "learning_rate": 5.829200247985121e-07, + "loss": 0.8825, + "step": 150440 + }, + { + "epoch": 11.658723701034523, + "grad_norm": 1.5049983297357636, + "learning_rate": 5.829587724736516e-07, + "loss": 0.8727, + "step": 150450 + }, + { + "epoch": 11.65949862451083, + "grad_norm": 1.4546334117481925, + "learning_rate": 5.829975201487912e-07, + "loss": 0.8834, + "step": 150460 + }, + { + "epoch": 11.660273547987137, + "grad_norm": 1.4027230208133077, + "learning_rate": 5.830362678239306e-07, + "loss": 0.8892, + "step": 150470 + }, + { + "epoch": 11.661048471463443, + "grad_norm": 1.464465439831939, + "learning_rate": 5.830750154990701e-07, + "loss": 0.8841, + "step": 150480 + }, + { + "epoch": 11.66182339493975, + "grad_norm": 1.4074448772596913, + "learning_rate": 5.831137631742096e-07, + "loss": 0.9045, + "step": 150490 + }, + { + "epoch": 11.662598318416057, + "grad_norm": 1.507181438052099, + "learning_rate": 5.831525108493491e-07, + "loss": 0.8859, + "step": 150500 + }, + { + "epoch": 11.662598318416057, + "eval_loss": 0.9005164504051208, + "eval_runtime": 329.5366, + "eval_samples_per_second": 34.809, + "eval_steps_per_second": 8.703, + "step": 150500 + }, + { + "epoch": 11.663373241892364, + "grad_norm": 1.483929708838383, + "learning_rate": 5.831912585244886e-07, + "loss": 0.904, + "step": 150510 + }, + { + "epoch": 11.664148165368669, + "grad_norm": 1.5046494002580357, + "learning_rate": 5.832300061996281e-07, + "loss": 0.8918, + "step": 150520 + }, + { + "epoch": 11.664923088844976, + "grad_norm": 1.4916215871954346, + "learning_rate": 5.832687538747676e-07, + "loss": 0.8854, + "step": 150530 + }, + { + "epoch": 11.665698012321283, + "grad_norm": 1.4921831212586425, + "learning_rate": 5.83307501549907e-07, + "loss": 0.8969, + "step": 150540 + }, + { + "epoch": 11.66647293579759, + "grad_norm": 1.4501476257932326, + "learning_rate": 5.833462492250465e-07, + "loss": 0.8769, + "step": 150550 + }, + { + "epoch": 11.667247859273896, + "grad_norm": 1.4645298015477985, + "learning_rate": 5.833849969001861e-07, + "loss": 0.8809, + "step": 150560 + }, + { + "epoch": 11.668022782750203, + "grad_norm": 1.4580658464893932, + "learning_rate": 5.834237445753256e-07, + "loss": 0.8883, + "step": 150570 + }, + { + "epoch": 11.66879770622651, + "grad_norm": 1.4529767154729099, + "learning_rate": 5.83462492250465e-07, + "loss": 0.893, + "step": 150580 + }, + { + "epoch": 11.669572629702817, + "grad_norm": 1.4267469382074924, + "learning_rate": 5.835012399256045e-07, + "loss": 0.875, + "step": 150590 + }, + { + "epoch": 11.670347553179123, + "grad_norm": 1.4162101751511202, + "learning_rate": 5.83539987600744e-07, + "loss": 0.8811, + "step": 150600 + }, + { + "epoch": 11.67112247665543, + "grad_norm": 1.4538128244996245, + "learning_rate": 5.835787352758835e-07, + "loss": 0.8887, + "step": 150610 + }, + { + "epoch": 11.671897400131737, + "grad_norm": 1.5470306179623325, + "learning_rate": 5.83617482951023e-07, + "loss": 0.8902, + "step": 150620 + }, + { + "epoch": 11.672672323608044, + "grad_norm": 1.4537753361245562, + "learning_rate": 5.836562306261625e-07, + "loss": 0.8922, + "step": 150630 + }, + { + "epoch": 11.67344724708435, + "grad_norm": 1.5743263227880129, + "learning_rate": 5.83694978301302e-07, + "loss": 0.8717, + "step": 150640 + }, + { + "epoch": 11.674222170560657, + "grad_norm": 1.4873675300173328, + "learning_rate": 5.837337259764414e-07, + "loss": 0.8813, + "step": 150650 + }, + { + "epoch": 11.674997094036964, + "grad_norm": 1.6352699253758964, + "learning_rate": 5.83772473651581e-07, + "loss": 0.9021, + "step": 150660 + }, + { + "epoch": 11.675772017513271, + "grad_norm": 1.4996799939086163, + "learning_rate": 5.838112213267205e-07, + "loss": 0.901, + "step": 150670 + }, + { + "epoch": 11.676546940989578, + "grad_norm": 1.3934498217036815, + "learning_rate": 5.838499690018599e-07, + "loss": 0.8806, + "step": 150680 + }, + { + "epoch": 11.677321864465885, + "grad_norm": 1.5125509648918563, + "learning_rate": 5.838887166769994e-07, + "loss": 0.8924, + "step": 150690 + }, + { + "epoch": 11.678096787942192, + "grad_norm": 1.5085183381531986, + "learning_rate": 5.839274643521389e-07, + "loss": 0.867, + "step": 150700 + }, + { + "epoch": 11.678871711418498, + "grad_norm": 1.389370009674482, + "learning_rate": 5.839662120272785e-07, + "loss": 0.8706, + "step": 150710 + }, + { + "epoch": 11.679646634894803, + "grad_norm": 1.4918353891440983, + "learning_rate": 5.840049597024179e-07, + "loss": 0.8837, + "step": 150720 + }, + { + "epoch": 11.68042155837111, + "grad_norm": 1.488718653081193, + "learning_rate": 5.840437073775574e-07, + "loss": 0.8917, + "step": 150730 + }, + { + "epoch": 11.681196481847417, + "grad_norm": 1.4601112977114539, + "learning_rate": 5.840824550526969e-07, + "loss": 0.8974, + "step": 150740 + }, + { + "epoch": 11.681971405323724, + "grad_norm": 1.4738579778264496, + "learning_rate": 5.841212027278363e-07, + "loss": 0.8773, + "step": 150750 + }, + { + "epoch": 11.68274632880003, + "grad_norm": 1.4772271400159953, + "learning_rate": 5.841599504029759e-07, + "loss": 0.8843, + "step": 150760 + }, + { + "epoch": 11.683521252276337, + "grad_norm": 1.4182200565537872, + "learning_rate": 5.841986980781154e-07, + "loss": 0.8796, + "step": 150770 + }, + { + "epoch": 11.684296175752644, + "grad_norm": 1.4946096302921499, + "learning_rate": 5.842374457532549e-07, + "loss": 0.9013, + "step": 150780 + }, + { + "epoch": 11.685071099228951, + "grad_norm": 1.4941896007712983, + "learning_rate": 5.842761934283943e-07, + "loss": 0.9015, + "step": 150790 + }, + { + "epoch": 11.685846022705258, + "grad_norm": 1.5106262972679962, + "learning_rate": 5.843149411035338e-07, + "loss": 0.8884, + "step": 150800 + }, + { + "epoch": 11.686620946181565, + "grad_norm": 1.4045111118239728, + "learning_rate": 5.843536887786734e-07, + "loss": 0.922, + "step": 150810 + }, + { + "epoch": 11.687395869657871, + "grad_norm": 1.51704813952679, + "learning_rate": 5.843924364538128e-07, + "loss": 0.8963, + "step": 150820 + }, + { + "epoch": 11.688170793134178, + "grad_norm": 1.4707709843420211, + "learning_rate": 5.844311841289523e-07, + "loss": 0.8776, + "step": 150830 + }, + { + "epoch": 11.688945716610485, + "grad_norm": 1.4610806049566811, + "learning_rate": 5.844699318040918e-07, + "loss": 0.8973, + "step": 150840 + }, + { + "epoch": 11.689720640086792, + "grad_norm": 1.4546351199191239, + "learning_rate": 5.845086794792313e-07, + "loss": 0.8851, + "step": 150850 + }, + { + "epoch": 11.690495563563099, + "grad_norm": 1.4482616624861242, + "learning_rate": 5.845474271543708e-07, + "loss": 0.9097, + "step": 150860 + }, + { + "epoch": 11.691270487039406, + "grad_norm": 1.5872451156488938, + "learning_rate": 5.845861748295103e-07, + "loss": 0.8874, + "step": 150870 + }, + { + "epoch": 11.692045410515712, + "grad_norm": 1.545439902717498, + "learning_rate": 5.846249225046498e-07, + "loss": 0.9009, + "step": 150880 + }, + { + "epoch": 11.692820333992017, + "grad_norm": 1.4295465719833504, + "learning_rate": 5.846636701797892e-07, + "loss": 0.8834, + "step": 150890 + }, + { + "epoch": 11.693595257468324, + "grad_norm": 1.5462373376220346, + "learning_rate": 5.847024178549287e-07, + "loss": 0.8845, + "step": 150900 + }, + { + "epoch": 11.694370180944631, + "grad_norm": 1.42191577688138, + "learning_rate": 5.847411655300683e-07, + "loss": 0.8752, + "step": 150910 + }, + { + "epoch": 11.695145104420938, + "grad_norm": 1.4494326089077292, + "learning_rate": 5.847799132052078e-07, + "loss": 0.8927, + "step": 150920 + }, + { + "epoch": 11.695920027897245, + "grad_norm": 1.4963888684031825, + "learning_rate": 5.848186608803472e-07, + "loss": 0.8799, + "step": 150930 + }, + { + "epoch": 11.696694951373551, + "grad_norm": 1.4695997545173505, + "learning_rate": 5.848574085554867e-07, + "loss": 0.8926, + "step": 150940 + }, + { + "epoch": 11.697469874849858, + "grad_norm": 1.4409458931969903, + "learning_rate": 5.848961562306262e-07, + "loss": 0.901, + "step": 150950 + }, + { + "epoch": 11.698244798326165, + "grad_norm": 1.501842799375733, + "learning_rate": 5.849349039057657e-07, + "loss": 0.8991, + "step": 150960 + }, + { + "epoch": 11.699019721802472, + "grad_norm": 1.486264455226932, + "learning_rate": 5.849736515809052e-07, + "loss": 0.8963, + "step": 150970 + }, + { + "epoch": 11.699794645278779, + "grad_norm": 1.4618836537748374, + "learning_rate": 5.850123992560447e-07, + "loss": 0.8889, + "step": 150980 + }, + { + "epoch": 11.700569568755085, + "grad_norm": 1.505100036329453, + "learning_rate": 5.850511469311842e-07, + "loss": 0.8779, + "step": 150990 + }, + { + "epoch": 11.701344492231392, + "grad_norm": 1.6089503550407456, + "learning_rate": 5.850898946063236e-07, + "loss": 0.8894, + "step": 151000 + }, + { + "epoch": 11.701344492231392, + "eval_loss": 0.9003046154975891, + "eval_runtime": 331.7037, + "eval_samples_per_second": 34.582, + "eval_steps_per_second": 8.646, + "step": 151000 + }, + { + "epoch": 11.702119415707699, + "grad_norm": 1.4095285740481986, + "learning_rate": 5.851286422814632e-07, + "loss": 0.8994, + "step": 151010 + }, + { + "epoch": 11.702894339184006, + "grad_norm": 1.493930715526186, + "learning_rate": 5.851673899566027e-07, + "loss": 0.8904, + "step": 151020 + }, + { + "epoch": 11.703669262660313, + "grad_norm": 1.4439533733889858, + "learning_rate": 5.852061376317421e-07, + "loss": 0.8772, + "step": 151030 + }, + { + "epoch": 11.70444418613662, + "grad_norm": 1.507507398417448, + "learning_rate": 5.852448853068816e-07, + "loss": 0.8835, + "step": 151040 + }, + { + "epoch": 11.705219109612926, + "grad_norm": 1.4933766136694786, + "learning_rate": 5.852836329820212e-07, + "loss": 0.9009, + "step": 151050 + }, + { + "epoch": 11.705994033089233, + "grad_norm": 1.4677935088515055, + "learning_rate": 5.853223806571607e-07, + "loss": 0.8779, + "step": 151060 + }, + { + "epoch": 11.70676895656554, + "grad_norm": 1.4749826670829438, + "learning_rate": 5.853611283323001e-07, + "loss": 0.901, + "step": 151070 + }, + { + "epoch": 11.707543880041847, + "grad_norm": 1.5191059764277435, + "learning_rate": 5.853998760074396e-07, + "loss": 0.877, + "step": 151080 + }, + { + "epoch": 11.708318803518152, + "grad_norm": 1.4924814273887486, + "learning_rate": 5.854386236825791e-07, + "loss": 0.8847, + "step": 151090 + }, + { + "epoch": 11.709093726994459, + "grad_norm": 1.5061194196579524, + "learning_rate": 5.854773713577185e-07, + "loss": 0.8848, + "step": 151100 + }, + { + "epoch": 11.709868650470765, + "grad_norm": 1.3786602747590608, + "learning_rate": 5.855161190328581e-07, + "loss": 0.8706, + "step": 151110 + }, + { + "epoch": 11.710643573947072, + "grad_norm": 1.447701965606096, + "learning_rate": 5.855548667079976e-07, + "loss": 0.8875, + "step": 151120 + }, + { + "epoch": 11.711418497423379, + "grad_norm": 1.4383999850031144, + "learning_rate": 5.855936143831371e-07, + "loss": 0.872, + "step": 151130 + }, + { + "epoch": 11.712193420899686, + "grad_norm": 1.571322199818717, + "learning_rate": 5.856323620582765e-07, + "loss": 0.881, + "step": 151140 + }, + { + "epoch": 11.712968344375993, + "grad_norm": 1.4208220098593554, + "learning_rate": 5.85671109733416e-07, + "loss": 0.8805, + "step": 151150 + }, + { + "epoch": 11.7137432678523, + "grad_norm": 1.4581133592735778, + "learning_rate": 5.857098574085556e-07, + "loss": 0.8975, + "step": 151160 + }, + { + "epoch": 11.714518191328606, + "grad_norm": 1.3988400604371345, + "learning_rate": 5.85748605083695e-07, + "loss": 0.8843, + "step": 151170 + }, + { + "epoch": 11.715293114804913, + "grad_norm": 1.4850993158640524, + "learning_rate": 5.857873527588345e-07, + "loss": 0.891, + "step": 151180 + }, + { + "epoch": 11.71606803828122, + "grad_norm": 1.498593307939059, + "learning_rate": 5.85826100433974e-07, + "loss": 0.8776, + "step": 151190 + }, + { + "epoch": 11.716842961757527, + "grad_norm": 1.4748175940622925, + "learning_rate": 5.858648481091136e-07, + "loss": 0.8885, + "step": 151200 + }, + { + "epoch": 11.717617885233834, + "grad_norm": 1.563016644676713, + "learning_rate": 5.85903595784253e-07, + "loss": 0.9, + "step": 151210 + }, + { + "epoch": 11.71839280871014, + "grad_norm": 1.4470989891947086, + "learning_rate": 5.859423434593925e-07, + "loss": 0.8753, + "step": 151220 + }, + { + "epoch": 11.719167732186447, + "grad_norm": 1.452257615047897, + "learning_rate": 5.85981091134532e-07, + "loss": 0.8653, + "step": 151230 + }, + { + "epoch": 11.719942655662754, + "grad_norm": 1.5770914016030708, + "learning_rate": 5.860198388096714e-07, + "loss": 0.8834, + "step": 151240 + }, + { + "epoch": 11.72071757913906, + "grad_norm": 1.5022176248424712, + "learning_rate": 5.86058586484811e-07, + "loss": 0.8865, + "step": 151250 + }, + { + "epoch": 11.721492502615366, + "grad_norm": 1.4661343249341312, + "learning_rate": 5.860973341599505e-07, + "loss": 0.8779, + "step": 151260 + }, + { + "epoch": 11.722267426091673, + "grad_norm": 1.4947311382747321, + "learning_rate": 5.8613608183509e-07, + "loss": 0.8996, + "step": 151270 + }, + { + "epoch": 11.72304234956798, + "grad_norm": 1.4633275755424444, + "learning_rate": 5.861748295102294e-07, + "loss": 0.8851, + "step": 151280 + }, + { + "epoch": 11.723817273044286, + "grad_norm": 1.424220741171486, + "learning_rate": 5.862135771853689e-07, + "loss": 0.8726, + "step": 151290 + }, + { + "epoch": 11.724592196520593, + "grad_norm": 1.5337720699184496, + "learning_rate": 5.862523248605085e-07, + "loss": 0.8755, + "step": 151300 + }, + { + "epoch": 11.7253671199969, + "grad_norm": 1.431365072876708, + "learning_rate": 5.862910725356479e-07, + "loss": 0.8801, + "step": 151310 + }, + { + "epoch": 11.726142043473207, + "grad_norm": 1.5382774948513198, + "learning_rate": 5.863298202107874e-07, + "loss": 0.8868, + "step": 151320 + }, + { + "epoch": 11.726916966949513, + "grad_norm": 1.4622115504246387, + "learning_rate": 5.863685678859269e-07, + "loss": 0.8824, + "step": 151330 + }, + { + "epoch": 11.72769189042582, + "grad_norm": 1.5146813516126043, + "learning_rate": 5.864073155610664e-07, + "loss": 0.8842, + "step": 151340 + }, + { + "epoch": 11.728466813902127, + "grad_norm": 1.4310208185468476, + "learning_rate": 5.864460632362059e-07, + "loss": 0.8988, + "step": 151350 + }, + { + "epoch": 11.729241737378434, + "grad_norm": 1.469261976779874, + "learning_rate": 5.864848109113454e-07, + "loss": 0.8823, + "step": 151360 + }, + { + "epoch": 11.73001666085474, + "grad_norm": 1.4704210919244216, + "learning_rate": 5.865235585864849e-07, + "loss": 0.8805, + "step": 151370 + }, + { + "epoch": 11.730791584331048, + "grad_norm": 1.4535782572627216, + "learning_rate": 5.865623062616243e-07, + "loss": 0.892, + "step": 151380 + }, + { + "epoch": 11.731566507807354, + "grad_norm": 1.5473843093636714, + "learning_rate": 5.866010539367638e-07, + "loss": 0.8647, + "step": 151390 + }, + { + "epoch": 11.732341431283661, + "grad_norm": 1.5637200634462105, + "learning_rate": 5.866398016119034e-07, + "loss": 0.872, + "step": 151400 + }, + { + "epoch": 11.733116354759968, + "grad_norm": 1.5162628320215865, + "learning_rate": 5.866785492870429e-07, + "loss": 0.8788, + "step": 151410 + }, + { + "epoch": 11.733891278236275, + "grad_norm": 1.4010891715120695, + "learning_rate": 5.867172969621823e-07, + "loss": 0.8908, + "step": 151420 + }, + { + "epoch": 11.734666201712582, + "grad_norm": 1.4972329447136619, + "learning_rate": 5.867560446373218e-07, + "loss": 0.8923, + "step": 151430 + }, + { + "epoch": 11.735441125188888, + "grad_norm": 1.435264421968173, + "learning_rate": 5.867947923124613e-07, + "loss": 0.8842, + "step": 151440 + }, + { + "epoch": 11.736216048665195, + "grad_norm": 1.4179010994460965, + "learning_rate": 5.868335399876008e-07, + "loss": 0.8774, + "step": 151450 + }, + { + "epoch": 11.7369909721415, + "grad_norm": 1.5032300515286754, + "learning_rate": 5.868722876627403e-07, + "loss": 0.8796, + "step": 151460 + }, + { + "epoch": 11.737765895617807, + "grad_norm": 1.5409619942704262, + "learning_rate": 5.869110353378798e-07, + "loss": 0.8787, + "step": 151470 + }, + { + "epoch": 11.738540819094114, + "grad_norm": 1.3247624712570927, + "learning_rate": 5.869497830130193e-07, + "loss": 0.8681, + "step": 151480 + }, + { + "epoch": 11.73931574257042, + "grad_norm": 1.4584696930104257, + "learning_rate": 5.869885306881587e-07, + "loss": 0.8909, + "step": 151490 + }, + { + "epoch": 11.740090666046727, + "grad_norm": 1.5121396969248149, + "learning_rate": 5.870272783632983e-07, + "loss": 0.8786, + "step": 151500 + }, + { + "epoch": 11.740090666046727, + "eval_loss": 0.9003280401229858, + "eval_runtime": 330.6064, + "eval_samples_per_second": 34.697, + "eval_steps_per_second": 8.675, + "step": 151500 + }, + { + "epoch": 11.740865589523034, + "grad_norm": 1.4772716545209987, + "learning_rate": 5.870660260384378e-07, + "loss": 0.8951, + "step": 151510 + }, + { + "epoch": 11.741640512999341, + "grad_norm": 1.510381685437853, + "learning_rate": 5.871047737135772e-07, + "loss": 0.8788, + "step": 151520 + }, + { + "epoch": 11.742415436475648, + "grad_norm": 1.45938608096415, + "learning_rate": 5.871435213887167e-07, + "loss": 0.8799, + "step": 151530 + }, + { + "epoch": 11.743190359951955, + "grad_norm": 1.4212919320486694, + "learning_rate": 5.871822690638562e-07, + "loss": 0.8796, + "step": 151540 + }, + { + "epoch": 11.743965283428262, + "grad_norm": 1.5076359211905297, + "learning_rate": 5.872210167389958e-07, + "loss": 0.8781, + "step": 151550 + }, + { + "epoch": 11.744740206904568, + "grad_norm": 1.5248471127189789, + "learning_rate": 5.872597644141352e-07, + "loss": 0.8822, + "step": 151560 + }, + { + "epoch": 11.745515130380875, + "grad_norm": 1.4016912781733333, + "learning_rate": 5.872985120892747e-07, + "loss": 0.8962, + "step": 151570 + }, + { + "epoch": 11.746290053857182, + "grad_norm": 1.4059032999218612, + "learning_rate": 5.873372597644142e-07, + "loss": 0.8918, + "step": 151580 + }, + { + "epoch": 11.747064977333489, + "grad_norm": 1.5258082030099023, + "learning_rate": 5.873760074395536e-07, + "loss": 0.8827, + "step": 151590 + }, + { + "epoch": 11.747839900809796, + "grad_norm": 1.4514801511974815, + "learning_rate": 5.874147551146932e-07, + "loss": 0.8986, + "step": 151600 + }, + { + "epoch": 11.748614824286102, + "grad_norm": 1.5252603177814812, + "learning_rate": 5.874535027898327e-07, + "loss": 0.8769, + "step": 151610 + }, + { + "epoch": 11.74938974776241, + "grad_norm": 1.5288029245985553, + "learning_rate": 5.874922504649722e-07, + "loss": 0.8789, + "step": 151620 + }, + { + "epoch": 11.750164671238716, + "grad_norm": 1.4305181736430777, + "learning_rate": 5.875309981401116e-07, + "loss": 0.8823, + "step": 151630 + }, + { + "epoch": 11.750939594715021, + "grad_norm": 1.5293131490699903, + "learning_rate": 5.875697458152511e-07, + "loss": 0.8784, + "step": 151640 + }, + { + "epoch": 11.751714518191328, + "grad_norm": 1.4002766699121076, + "learning_rate": 5.876084934903907e-07, + "loss": 0.8908, + "step": 151650 + }, + { + "epoch": 11.752489441667635, + "grad_norm": 1.5003776921184104, + "learning_rate": 5.876472411655301e-07, + "loss": 0.8834, + "step": 151660 + }, + { + "epoch": 11.753264365143941, + "grad_norm": 1.4367221371476626, + "learning_rate": 5.876859888406696e-07, + "loss": 0.8709, + "step": 151670 + }, + { + "epoch": 11.754039288620248, + "grad_norm": 1.4490802467888397, + "learning_rate": 5.877247365158091e-07, + "loss": 0.8982, + "step": 151680 + }, + { + "epoch": 11.754814212096555, + "grad_norm": 1.4914579959909067, + "learning_rate": 5.877634841909487e-07, + "loss": 0.8848, + "step": 151690 + }, + { + "epoch": 11.755589135572862, + "grad_norm": 1.483562643808212, + "learning_rate": 5.878022318660881e-07, + "loss": 0.8823, + "step": 151700 + }, + { + "epoch": 11.756364059049169, + "grad_norm": 1.4964297823227353, + "learning_rate": 5.878409795412276e-07, + "loss": 0.8843, + "step": 151710 + }, + { + "epoch": 11.757138982525476, + "grad_norm": 1.5559439677261624, + "learning_rate": 5.878797272163671e-07, + "loss": 0.8801, + "step": 151720 + }, + { + "epoch": 11.757913906001782, + "grad_norm": 1.4059802298429838, + "learning_rate": 5.879184748915065e-07, + "loss": 0.8951, + "step": 151730 + }, + { + "epoch": 11.75868882947809, + "grad_norm": 1.5116881620730347, + "learning_rate": 5.87957222566646e-07, + "loss": 0.9, + "step": 151740 + }, + { + "epoch": 11.759463752954396, + "grad_norm": 1.522553675453697, + "learning_rate": 5.879959702417856e-07, + "loss": 0.8942, + "step": 151750 + }, + { + "epoch": 11.760238676430703, + "grad_norm": 1.4773676357873287, + "learning_rate": 5.880347179169251e-07, + "loss": 0.8831, + "step": 151760 + }, + { + "epoch": 11.76101359990701, + "grad_norm": 1.5619340782047393, + "learning_rate": 5.880734655920645e-07, + "loss": 0.9083, + "step": 151770 + }, + { + "epoch": 11.761788523383316, + "grad_norm": 1.4367102375087784, + "learning_rate": 5.88112213267204e-07, + "loss": 0.904, + "step": 151780 + }, + { + "epoch": 11.762563446859623, + "grad_norm": 1.4247282442304958, + "learning_rate": 5.881509609423436e-07, + "loss": 0.8856, + "step": 151790 + }, + { + "epoch": 11.76333837033593, + "grad_norm": 1.4265993449833125, + "learning_rate": 5.88189708617483e-07, + "loss": 0.8898, + "step": 151800 + }, + { + "epoch": 11.764113293812237, + "grad_norm": 1.435847498483911, + "learning_rate": 5.882284562926225e-07, + "loss": 0.8761, + "step": 151810 + }, + { + "epoch": 11.764888217288544, + "grad_norm": 1.531993497177844, + "learning_rate": 5.88267203967762e-07, + "loss": 0.8884, + "step": 151820 + }, + { + "epoch": 11.765663140764849, + "grad_norm": 1.5118036965349184, + "learning_rate": 5.883059516429014e-07, + "loss": 0.9041, + "step": 151830 + }, + { + "epoch": 11.766438064241155, + "grad_norm": 1.4272922565827415, + "learning_rate": 5.88344699318041e-07, + "loss": 0.8661, + "step": 151840 + }, + { + "epoch": 11.767212987717462, + "grad_norm": 1.601458836561993, + "learning_rate": 5.883834469931805e-07, + "loss": 0.892, + "step": 151850 + }, + { + "epoch": 11.767987911193769, + "grad_norm": 1.4554315413659689, + "learning_rate": 5.8842219466832e-07, + "loss": 0.8658, + "step": 151860 + }, + { + "epoch": 11.768762834670076, + "grad_norm": 1.4792813711247588, + "learning_rate": 5.884609423434594e-07, + "loss": 0.8851, + "step": 151870 + }, + { + "epoch": 11.769537758146383, + "grad_norm": 1.4505993710761558, + "learning_rate": 5.884996900185989e-07, + "loss": 0.8712, + "step": 151880 + }, + { + "epoch": 11.77031268162269, + "grad_norm": 1.5216706967907423, + "learning_rate": 5.885384376937385e-07, + "loss": 0.9053, + "step": 151890 + }, + { + "epoch": 11.771087605098996, + "grad_norm": 1.4501089365300046, + "learning_rate": 5.885771853688779e-07, + "loss": 0.9093, + "step": 151900 + }, + { + "epoch": 11.771862528575303, + "grad_norm": 1.5200183474732971, + "learning_rate": 5.886159330440174e-07, + "loss": 0.8757, + "step": 151910 + }, + { + "epoch": 11.77263745205161, + "grad_norm": 1.5234921604563574, + "learning_rate": 5.886546807191569e-07, + "loss": 0.8623, + "step": 151920 + }, + { + "epoch": 11.773412375527917, + "grad_norm": 1.5197263102306096, + "learning_rate": 5.886934283942964e-07, + "loss": 0.8687, + "step": 151930 + }, + { + "epoch": 11.774187299004224, + "grad_norm": 1.552830800363363, + "learning_rate": 5.887321760694359e-07, + "loss": 0.8779, + "step": 151940 + }, + { + "epoch": 11.77496222248053, + "grad_norm": 1.5104265281203424, + "learning_rate": 5.887709237445754e-07, + "loss": 0.8866, + "step": 151950 + }, + { + "epoch": 11.775737145956837, + "grad_norm": 1.4940177614149321, + "learning_rate": 5.888096714197149e-07, + "loss": 0.8755, + "step": 151960 + }, + { + "epoch": 11.776512069433144, + "grad_norm": 1.4188019256525155, + "learning_rate": 5.888484190948543e-07, + "loss": 0.8742, + "step": 151970 + }, + { + "epoch": 11.77728699290945, + "grad_norm": 1.423126584498867, + "learning_rate": 5.888871667699938e-07, + "loss": 0.8792, + "step": 151980 + }, + { + "epoch": 11.778061916385758, + "grad_norm": 1.431765700144714, + "learning_rate": 5.889259144451334e-07, + "loss": 0.8852, + "step": 151990 + }, + { + "epoch": 11.778836839862064, + "grad_norm": 1.4307269873193604, + "learning_rate": 5.889646621202729e-07, + "loss": 0.8848, + "step": 152000 + }, + { + "epoch": 11.778836839862064, + "eval_loss": 0.9001546502113342, + "eval_runtime": 330.6903, + "eval_samples_per_second": 34.688, + "eval_steps_per_second": 8.673, + "step": 152000 + }, + { + "epoch": 11.77961176333837, + "grad_norm": 1.4248991999059415, + "learning_rate": 5.890034097954123e-07, + "loss": 0.8892, + "step": 152010 + }, + { + "epoch": 11.780386686814676, + "grad_norm": 1.4299958964324881, + "learning_rate": 5.890421574705518e-07, + "loss": 0.9107, + "step": 152020 + }, + { + "epoch": 11.781161610290983, + "grad_norm": 1.601158287265439, + "learning_rate": 5.890809051456913e-07, + "loss": 0.8947, + "step": 152030 + }, + { + "epoch": 11.78193653376729, + "grad_norm": 1.4588596199746817, + "learning_rate": 5.891196528208308e-07, + "loss": 0.8782, + "step": 152040 + }, + { + "epoch": 11.782711457243597, + "grad_norm": 1.4267504067611125, + "learning_rate": 5.891584004959703e-07, + "loss": 0.8695, + "step": 152050 + }, + { + "epoch": 11.783486380719904, + "grad_norm": 1.5240305471290712, + "learning_rate": 5.891971481711098e-07, + "loss": 0.884, + "step": 152060 + }, + { + "epoch": 11.78426130419621, + "grad_norm": 1.5266044212348906, + "learning_rate": 5.892358958462493e-07, + "loss": 0.8986, + "step": 152070 + }, + { + "epoch": 11.785036227672517, + "grad_norm": 1.4224638916503316, + "learning_rate": 5.892746435213887e-07, + "loss": 0.8689, + "step": 152080 + }, + { + "epoch": 11.785811151148824, + "grad_norm": 1.5074138692641894, + "learning_rate": 5.893133911965283e-07, + "loss": 0.9066, + "step": 152090 + }, + { + "epoch": 11.78658607462513, + "grad_norm": 1.4402249965463063, + "learning_rate": 5.893521388716678e-07, + "loss": 0.8922, + "step": 152100 + }, + { + "epoch": 11.787360998101438, + "grad_norm": 1.4816877171423712, + "learning_rate": 5.893908865468072e-07, + "loss": 0.8958, + "step": 152110 + }, + { + "epoch": 11.788135921577744, + "grad_norm": 1.4510436549702532, + "learning_rate": 5.894296342219467e-07, + "loss": 0.8962, + "step": 152120 + }, + { + "epoch": 11.788910845054051, + "grad_norm": 1.5463375903173646, + "learning_rate": 5.894683818970862e-07, + "loss": 0.8772, + "step": 152130 + }, + { + "epoch": 11.789685768530358, + "grad_norm": 1.4404118635425274, + "learning_rate": 5.895071295722258e-07, + "loss": 0.8537, + "step": 152140 + }, + { + "epoch": 11.790460692006665, + "grad_norm": 1.3801135889166316, + "learning_rate": 5.895458772473652e-07, + "loss": 0.8986, + "step": 152150 + }, + { + "epoch": 11.791235615482972, + "grad_norm": 1.457686673610208, + "learning_rate": 5.895846249225047e-07, + "loss": 0.8731, + "step": 152160 + }, + { + "epoch": 11.792010538959278, + "grad_norm": 1.4659061742035686, + "learning_rate": 5.896233725976442e-07, + "loss": 0.9066, + "step": 152170 + }, + { + "epoch": 11.792785462435585, + "grad_norm": 1.4522198041278231, + "learning_rate": 5.896621202727836e-07, + "loss": 0.8891, + "step": 152180 + }, + { + "epoch": 11.793560385911892, + "grad_norm": 1.4174098658879821, + "learning_rate": 5.897008679479232e-07, + "loss": 0.8754, + "step": 152190 + }, + { + "epoch": 11.794335309388199, + "grad_norm": 1.5067324325683957, + "learning_rate": 5.897396156230627e-07, + "loss": 0.8615, + "step": 152200 + }, + { + "epoch": 11.795110232864504, + "grad_norm": 1.5296396473052267, + "learning_rate": 5.897783632982022e-07, + "loss": 0.8794, + "step": 152210 + }, + { + "epoch": 11.79588515634081, + "grad_norm": 1.514400798638306, + "learning_rate": 5.898171109733416e-07, + "loss": 0.8895, + "step": 152220 + }, + { + "epoch": 11.796660079817118, + "grad_norm": 1.507016701973433, + "learning_rate": 5.898558586484811e-07, + "loss": 0.8772, + "step": 152230 + }, + { + "epoch": 11.797435003293424, + "grad_norm": 1.4631092814794195, + "learning_rate": 5.898946063236207e-07, + "loss": 0.879, + "step": 152240 + }, + { + "epoch": 11.798209926769731, + "grad_norm": 1.4742659911802756, + "learning_rate": 5.899333539987601e-07, + "loss": 0.9008, + "step": 152250 + }, + { + "epoch": 11.798984850246038, + "grad_norm": 1.4532239496962216, + "learning_rate": 5.899721016738996e-07, + "loss": 0.8803, + "step": 152260 + }, + { + "epoch": 11.799759773722345, + "grad_norm": 1.4204202996310336, + "learning_rate": 5.900108493490391e-07, + "loss": 0.8827, + "step": 152270 + }, + { + "epoch": 11.800534697198652, + "grad_norm": 1.4826005997836096, + "learning_rate": 5.900495970241786e-07, + "loss": 0.9213, + "step": 152280 + }, + { + "epoch": 11.801309620674958, + "grad_norm": 1.5601162920967884, + "learning_rate": 5.900883446993181e-07, + "loss": 0.8809, + "step": 152290 + }, + { + "epoch": 11.802084544151265, + "grad_norm": 1.4550670938147316, + "learning_rate": 5.901270923744576e-07, + "loss": 0.8818, + "step": 152300 + }, + { + "epoch": 11.802859467627572, + "grad_norm": 1.438944067102383, + "learning_rate": 5.901658400495971e-07, + "loss": 0.8574, + "step": 152310 + }, + { + "epoch": 11.803634391103879, + "grad_norm": 1.4385871666866508, + "learning_rate": 5.902045877247365e-07, + "loss": 0.8934, + "step": 152320 + }, + { + "epoch": 11.804409314580186, + "grad_norm": 1.560607714381426, + "learning_rate": 5.90243335399876e-07, + "loss": 0.8958, + "step": 152330 + }, + { + "epoch": 11.805184238056492, + "grad_norm": 1.4558828318968517, + "learning_rate": 5.902820830750156e-07, + "loss": 0.8828, + "step": 152340 + }, + { + "epoch": 11.8059591615328, + "grad_norm": 1.5039485722894137, + "learning_rate": 5.903208307501551e-07, + "loss": 0.8902, + "step": 152350 + }, + { + "epoch": 11.806734085009106, + "grad_norm": 1.5706089478615846, + "learning_rate": 5.903595784252945e-07, + "loss": 0.8826, + "step": 152360 + }, + { + "epoch": 11.807509008485413, + "grad_norm": 1.3964994292378556, + "learning_rate": 5.90398326100434e-07, + "loss": 0.9011, + "step": 152370 + }, + { + "epoch": 11.808283931961718, + "grad_norm": 1.4898173832414134, + "learning_rate": 5.904370737755736e-07, + "loss": 0.8815, + "step": 152380 + }, + { + "epoch": 11.809058855438025, + "grad_norm": 1.4507562640363858, + "learning_rate": 5.90475821450713e-07, + "loss": 0.8787, + "step": 152390 + }, + { + "epoch": 11.809833778914332, + "grad_norm": 1.517178299388674, + "learning_rate": 5.905145691258525e-07, + "loss": 0.8827, + "step": 152400 + }, + { + "epoch": 11.810608702390638, + "grad_norm": 1.494628854207851, + "learning_rate": 5.90553316800992e-07, + "loss": 0.884, + "step": 152410 + }, + { + "epoch": 11.811383625866945, + "grad_norm": 1.5483858560826953, + "learning_rate": 5.905920644761315e-07, + "loss": 0.9163, + "step": 152420 + }, + { + "epoch": 11.812158549343252, + "grad_norm": 1.4725169413775563, + "learning_rate": 5.90630812151271e-07, + "loss": 0.8893, + "step": 152430 + }, + { + "epoch": 11.812933472819559, + "grad_norm": 1.5127487132455926, + "learning_rate": 5.906695598264105e-07, + "loss": 0.8835, + "step": 152440 + }, + { + "epoch": 11.813708396295866, + "grad_norm": 1.5076369282683493, + "learning_rate": 5.9070830750155e-07, + "loss": 0.8911, + "step": 152450 + }, + { + "epoch": 11.814483319772172, + "grad_norm": 1.501213847010707, + "learning_rate": 5.907470551766894e-07, + "loss": 0.8899, + "step": 152460 + }, + { + "epoch": 11.81525824324848, + "grad_norm": 1.4332084419745623, + "learning_rate": 5.907858028518289e-07, + "loss": 0.8909, + "step": 152470 + }, + { + "epoch": 11.816033166724786, + "grad_norm": 1.5568465690986784, + "learning_rate": 5.908245505269685e-07, + "loss": 0.8909, + "step": 152480 + }, + { + "epoch": 11.816808090201093, + "grad_norm": 1.4616721047356789, + "learning_rate": 5.90863298202108e-07, + "loss": 0.8908, + "step": 152490 + }, + { + "epoch": 11.8175830136774, + "grad_norm": 1.456866142522316, + "learning_rate": 5.909020458772474e-07, + "loss": 0.8839, + "step": 152500 + }, + { + "epoch": 11.8175830136774, + "eval_loss": 0.8997487425804138, + "eval_runtime": 326.2137, + "eval_samples_per_second": 35.164, + "eval_steps_per_second": 8.792, + "step": 152500 + }, + { + "epoch": 11.818357937153706, + "grad_norm": 1.3749472273922383, + "learning_rate": 5.909407935523869e-07, + "loss": 0.8757, + "step": 152510 + }, + { + "epoch": 11.819132860630013, + "grad_norm": 1.4837857748572352, + "learning_rate": 5.909795412275264e-07, + "loss": 0.875, + "step": 152520 + }, + { + "epoch": 11.81990778410632, + "grad_norm": 1.4475902146065633, + "learning_rate": 5.910182889026659e-07, + "loss": 0.8776, + "step": 152530 + }, + { + "epoch": 11.820682707582627, + "grad_norm": 1.489616617206633, + "learning_rate": 5.910570365778054e-07, + "loss": 0.8709, + "step": 152540 + }, + { + "epoch": 11.821457631058934, + "grad_norm": 1.4887189843793893, + "learning_rate": 5.910957842529449e-07, + "loss": 0.8787, + "step": 152550 + }, + { + "epoch": 11.82223255453524, + "grad_norm": 1.4527588885693914, + "learning_rate": 5.911345319280844e-07, + "loss": 0.8943, + "step": 152560 + }, + { + "epoch": 11.823007478011547, + "grad_norm": 1.429710147885339, + "learning_rate": 5.911732796032238e-07, + "loss": 0.8841, + "step": 152570 + }, + { + "epoch": 11.823782401487852, + "grad_norm": 1.4260644403720013, + "learning_rate": 5.912120272783634e-07, + "loss": 0.8877, + "step": 152580 + }, + { + "epoch": 11.82455732496416, + "grad_norm": 1.3529952128449865, + "learning_rate": 5.912507749535029e-07, + "loss": 0.8919, + "step": 152590 + }, + { + "epoch": 11.825332248440466, + "grad_norm": 1.5033338398093727, + "learning_rate": 5.912895226286423e-07, + "loss": 0.8856, + "step": 152600 + }, + { + "epoch": 11.826107171916773, + "grad_norm": 1.4245607788504875, + "learning_rate": 5.913282703037818e-07, + "loss": 0.8913, + "step": 152610 + }, + { + "epoch": 11.82688209539308, + "grad_norm": 1.4983374964395013, + "learning_rate": 5.913670179789213e-07, + "loss": 0.8959, + "step": 152620 + }, + { + "epoch": 11.827657018869386, + "grad_norm": 1.4190339560470306, + "learning_rate": 5.914057656540609e-07, + "loss": 0.8791, + "step": 152630 + }, + { + "epoch": 11.828431942345693, + "grad_norm": 1.5265163392975822, + "learning_rate": 5.914445133292003e-07, + "loss": 0.8733, + "step": 152640 + }, + { + "epoch": 11.829206865822, + "grad_norm": 1.40267958524431, + "learning_rate": 5.914832610043398e-07, + "loss": 0.8743, + "step": 152650 + }, + { + "epoch": 11.829981789298307, + "grad_norm": 1.5304632880527886, + "learning_rate": 5.915220086794793e-07, + "loss": 0.9046, + "step": 152660 + }, + { + "epoch": 11.830756712774614, + "grad_norm": 1.3435307385186417, + "learning_rate": 5.915607563546187e-07, + "loss": 0.8893, + "step": 152670 + }, + { + "epoch": 11.83153163625092, + "grad_norm": 1.556248360439386, + "learning_rate": 5.915995040297583e-07, + "loss": 0.9213, + "step": 152680 + }, + { + "epoch": 11.832306559727227, + "grad_norm": 1.5636879805362927, + "learning_rate": 5.916382517048978e-07, + "loss": 0.8749, + "step": 152690 + }, + { + "epoch": 11.833081483203534, + "grad_norm": 1.4895809892580132, + "learning_rate": 5.916769993800373e-07, + "loss": 0.882, + "step": 152700 + }, + { + "epoch": 11.83385640667984, + "grad_norm": 1.3528608883716413, + "learning_rate": 5.917157470551767e-07, + "loss": 0.867, + "step": 152710 + }, + { + "epoch": 11.834631330156148, + "grad_norm": 1.4241805101265017, + "learning_rate": 5.917544947303162e-07, + "loss": 0.9063, + "step": 152720 + }, + { + "epoch": 11.835406253632454, + "grad_norm": 1.4708946678045614, + "learning_rate": 5.917932424054558e-07, + "loss": 0.8887, + "step": 152730 + }, + { + "epoch": 11.836181177108761, + "grad_norm": 1.4990282572192786, + "learning_rate": 5.918319900805952e-07, + "loss": 0.8726, + "step": 152740 + }, + { + "epoch": 11.836956100585066, + "grad_norm": 1.6896907921311062, + "learning_rate": 5.918707377557347e-07, + "loss": 0.9186, + "step": 152750 + }, + { + "epoch": 11.837731024061373, + "grad_norm": 1.480512736925051, + "learning_rate": 5.919094854308742e-07, + "loss": 0.8693, + "step": 152760 + }, + { + "epoch": 11.83850594753768, + "grad_norm": 1.4516324814696004, + "learning_rate": 5.919482331060137e-07, + "loss": 0.8974, + "step": 152770 + }, + { + "epoch": 11.839280871013987, + "grad_norm": 1.4089236278259425, + "learning_rate": 5.919869807811532e-07, + "loss": 0.8897, + "step": 152780 + }, + { + "epoch": 11.840055794490294, + "grad_norm": 1.4051461761343351, + "learning_rate": 5.920257284562927e-07, + "loss": 0.8963, + "step": 152790 + }, + { + "epoch": 11.8408307179666, + "grad_norm": 1.453494812904661, + "learning_rate": 5.920644761314322e-07, + "loss": 0.878, + "step": 152800 + }, + { + "epoch": 11.841605641442907, + "grad_norm": 1.4622268317409035, + "learning_rate": 5.921032238065716e-07, + "loss": 0.8736, + "step": 152810 + }, + { + "epoch": 11.842380564919214, + "grad_norm": 1.472501556295623, + "learning_rate": 5.921419714817111e-07, + "loss": 0.9015, + "step": 152820 + }, + { + "epoch": 11.84315548839552, + "grad_norm": 1.5717428339697719, + "learning_rate": 5.921807191568507e-07, + "loss": 0.8967, + "step": 152830 + }, + { + "epoch": 11.843930411871828, + "grad_norm": 1.4797456895652012, + "learning_rate": 5.922194668319902e-07, + "loss": 0.8784, + "step": 152840 + }, + { + "epoch": 11.844705335348134, + "grad_norm": 1.561310336553484, + "learning_rate": 5.922582145071296e-07, + "loss": 0.8925, + "step": 152850 + }, + { + "epoch": 11.845480258824441, + "grad_norm": 1.4491125284859596, + "learning_rate": 5.922969621822691e-07, + "loss": 0.884, + "step": 152860 + }, + { + "epoch": 11.846255182300748, + "grad_norm": 1.4903083227381808, + "learning_rate": 5.923357098574086e-07, + "loss": 0.8555, + "step": 152870 + }, + { + "epoch": 11.847030105777055, + "grad_norm": 1.4808461339506829, + "learning_rate": 5.923744575325481e-07, + "loss": 0.8711, + "step": 152880 + }, + { + "epoch": 11.847805029253362, + "grad_norm": 1.465205645180691, + "learning_rate": 5.924132052076876e-07, + "loss": 0.9049, + "step": 152890 + }, + { + "epoch": 11.848579952729668, + "grad_norm": 1.5492913617100121, + "learning_rate": 5.924519528828271e-07, + "loss": 0.9012, + "step": 152900 + }, + { + "epoch": 11.849354876205975, + "grad_norm": 1.4951369712780143, + "learning_rate": 5.924907005579666e-07, + "loss": 0.9076, + "step": 152910 + }, + { + "epoch": 11.850129799682282, + "grad_norm": 1.4744545798492787, + "learning_rate": 5.92529448233106e-07, + "loss": 0.8621, + "step": 152920 + }, + { + "epoch": 11.850904723158589, + "grad_norm": 1.4679318625000712, + "learning_rate": 5.925681959082456e-07, + "loss": 0.8896, + "step": 152930 + }, + { + "epoch": 11.851679646634896, + "grad_norm": 1.4015461122060302, + "learning_rate": 5.926069435833851e-07, + "loss": 0.8726, + "step": 152940 + }, + { + "epoch": 11.8524545701112, + "grad_norm": 1.4566766288875932, + "learning_rate": 5.926456912585245e-07, + "loss": 0.8868, + "step": 152950 + }, + { + "epoch": 11.853229493587508, + "grad_norm": 1.4675874749539928, + "learning_rate": 5.92684438933664e-07, + "loss": 0.8945, + "step": 152960 + }, + { + "epoch": 11.854004417063814, + "grad_norm": 1.3759092064059115, + "learning_rate": 5.927231866088035e-07, + "loss": 0.8866, + "step": 152970 + }, + { + "epoch": 11.854779340540121, + "grad_norm": 1.4549250931265056, + "learning_rate": 5.927619342839431e-07, + "loss": 0.8939, + "step": 152980 + }, + { + "epoch": 11.855554264016428, + "grad_norm": 1.5089950681530238, + "learning_rate": 5.928006819590825e-07, + "loss": 0.9005, + "step": 152990 + }, + { + "epoch": 11.856329187492735, + "grad_norm": 1.549687120262666, + "learning_rate": 5.92839429634222e-07, + "loss": 0.8806, + "step": 153000 + }, + { + "epoch": 11.856329187492735, + "eval_loss": 0.8996762037277222, + "eval_runtime": 326.7511, + "eval_samples_per_second": 35.106, + "eval_steps_per_second": 8.777, + "step": 153000 + }, + { + "epoch": 11.857104110969042, + "grad_norm": 1.54007425997968, + "learning_rate": 5.928781773093615e-07, + "loss": 0.8963, + "step": 153010 + }, + { + "epoch": 11.857879034445348, + "grad_norm": 1.5078066172682665, + "learning_rate": 5.929169249845009e-07, + "loss": 0.882, + "step": 153020 + }, + { + "epoch": 11.858653957921655, + "grad_norm": 1.498496379980969, + "learning_rate": 5.929556726596405e-07, + "loss": 0.8742, + "step": 153030 + }, + { + "epoch": 11.859428881397962, + "grad_norm": 1.487273400927105, + "learning_rate": 5.9299442033478e-07, + "loss": 0.8852, + "step": 153040 + }, + { + "epoch": 11.860203804874269, + "grad_norm": 1.4279302191705143, + "learning_rate": 5.930331680099195e-07, + "loss": 0.8767, + "step": 153050 + }, + { + "epoch": 11.860978728350576, + "grad_norm": 1.3511466337647227, + "learning_rate": 5.930719156850589e-07, + "loss": 0.8736, + "step": 153060 + }, + { + "epoch": 11.861753651826882, + "grad_norm": 1.4547532534810401, + "learning_rate": 5.931106633601984e-07, + "loss": 0.8758, + "step": 153070 + }, + { + "epoch": 11.86252857530319, + "grad_norm": 1.3537601419784182, + "learning_rate": 5.93149411035338e-07, + "loss": 0.8828, + "step": 153080 + }, + { + "epoch": 11.863303498779496, + "grad_norm": 1.4831681462608273, + "learning_rate": 5.931881587104774e-07, + "loss": 0.8851, + "step": 153090 + }, + { + "epoch": 11.864078422255803, + "grad_norm": 1.5208476435899316, + "learning_rate": 5.932269063856169e-07, + "loss": 0.8907, + "step": 153100 + }, + { + "epoch": 11.86485334573211, + "grad_norm": 1.629697992884903, + "learning_rate": 5.932656540607564e-07, + "loss": 0.8925, + "step": 153110 + }, + { + "epoch": 11.865628269208415, + "grad_norm": 1.5168473989793172, + "learning_rate": 5.93304401735896e-07, + "loss": 0.8701, + "step": 153120 + }, + { + "epoch": 11.866403192684722, + "grad_norm": 1.5079606299702706, + "learning_rate": 5.933431494110354e-07, + "loss": 0.8943, + "step": 153130 + }, + { + "epoch": 11.867178116161028, + "grad_norm": 1.4263713850317998, + "learning_rate": 5.933818970861749e-07, + "loss": 0.8748, + "step": 153140 + }, + { + "epoch": 11.867953039637335, + "grad_norm": 1.5154037415739559, + "learning_rate": 5.934206447613144e-07, + "loss": 0.8874, + "step": 153150 + }, + { + "epoch": 11.868727963113642, + "grad_norm": 1.344599539709784, + "learning_rate": 5.934593924364538e-07, + "loss": 0.8885, + "step": 153160 + }, + { + "epoch": 11.869502886589949, + "grad_norm": 1.3636902100898352, + "learning_rate": 5.934981401115934e-07, + "loss": 0.8717, + "step": 153170 + }, + { + "epoch": 11.870277810066256, + "grad_norm": 1.4142850802124776, + "learning_rate": 5.935368877867329e-07, + "loss": 0.8857, + "step": 153180 + }, + { + "epoch": 11.871052733542562, + "grad_norm": 1.4671060893788281, + "learning_rate": 5.935756354618724e-07, + "loss": 0.8943, + "step": 153190 + }, + { + "epoch": 11.87182765701887, + "grad_norm": 1.4786883774618065, + "learning_rate": 5.936143831370118e-07, + "loss": 0.8768, + "step": 153200 + }, + { + "epoch": 11.872602580495176, + "grad_norm": 1.4051916987944832, + "learning_rate": 5.936531308121513e-07, + "loss": 0.8699, + "step": 153210 + }, + { + "epoch": 11.873377503971483, + "grad_norm": 1.5040962193323806, + "learning_rate": 5.936918784872909e-07, + "loss": 0.8878, + "step": 153220 + }, + { + "epoch": 11.87415242744779, + "grad_norm": 1.457151593170063, + "learning_rate": 5.937306261624303e-07, + "loss": 0.8825, + "step": 153230 + }, + { + "epoch": 11.874927350924096, + "grad_norm": 1.4767103660203926, + "learning_rate": 5.937693738375698e-07, + "loss": 0.8758, + "step": 153240 + }, + { + "epoch": 11.875702274400403, + "grad_norm": 1.525769508068761, + "learning_rate": 5.938081215127093e-07, + "loss": 0.8797, + "step": 153250 + }, + { + "epoch": 11.87647719787671, + "grad_norm": 1.4642395872261966, + "learning_rate": 5.938468691878487e-07, + "loss": 0.8947, + "step": 153260 + }, + { + "epoch": 11.877252121353017, + "grad_norm": 1.4907522555502848, + "learning_rate": 5.938856168629883e-07, + "loss": 0.8717, + "step": 153270 + }, + { + "epoch": 11.878027044829324, + "grad_norm": 1.4534856073109281, + "learning_rate": 5.939243645381278e-07, + "loss": 0.8836, + "step": 153280 + }, + { + "epoch": 11.87880196830563, + "grad_norm": 1.4733915887611202, + "learning_rate": 5.939631122132673e-07, + "loss": 0.8903, + "step": 153290 + }, + { + "epoch": 11.879576891781937, + "grad_norm": 1.562768175923201, + "learning_rate": 5.940018598884067e-07, + "loss": 0.8943, + "step": 153300 + }, + { + "epoch": 11.880351815258244, + "grad_norm": 1.3994936480638078, + "learning_rate": 5.940406075635462e-07, + "loss": 0.8708, + "step": 153310 + }, + { + "epoch": 11.88112673873455, + "grad_norm": 1.4858269477040897, + "learning_rate": 5.940793552386858e-07, + "loss": 0.8916, + "step": 153320 + }, + { + "epoch": 11.881901662210856, + "grad_norm": 1.4371730561009124, + "learning_rate": 5.941181029138252e-07, + "loss": 0.8878, + "step": 153330 + }, + { + "epoch": 11.882676585687163, + "grad_norm": 1.4152299915039155, + "learning_rate": 5.941568505889647e-07, + "loss": 0.8951, + "step": 153340 + }, + { + "epoch": 11.88345150916347, + "grad_norm": 1.4849085699276876, + "learning_rate": 5.941955982641042e-07, + "loss": 0.8798, + "step": 153350 + }, + { + "epoch": 11.884226432639776, + "grad_norm": 1.50465279892456, + "learning_rate": 5.942343459392437e-07, + "loss": 0.8833, + "step": 153360 + }, + { + "epoch": 11.885001356116083, + "grad_norm": 1.5041812412056625, + "learning_rate": 5.942730936143832e-07, + "loss": 0.8717, + "step": 153370 + }, + { + "epoch": 11.88577627959239, + "grad_norm": 1.4755831629656917, + "learning_rate": 5.943118412895227e-07, + "loss": 0.9123, + "step": 153380 + }, + { + "epoch": 11.886551203068697, + "grad_norm": 1.5668013318668044, + "learning_rate": 5.943505889646622e-07, + "loss": 0.8869, + "step": 153390 + }, + { + "epoch": 11.887326126545004, + "grad_norm": 1.456961912378347, + "learning_rate": 5.943893366398016e-07, + "loss": 0.8862, + "step": 153400 + }, + { + "epoch": 11.88810105002131, + "grad_norm": 1.4934448634856403, + "learning_rate": 5.944280843149411e-07, + "loss": 0.885, + "step": 153410 + }, + { + "epoch": 11.888875973497617, + "grad_norm": 1.5078268458748185, + "learning_rate": 5.944668319900807e-07, + "loss": 0.8775, + "step": 153420 + }, + { + "epoch": 11.889650896973924, + "grad_norm": 1.4884997388491097, + "learning_rate": 5.945055796652202e-07, + "loss": 0.8787, + "step": 153430 + }, + { + "epoch": 11.89042582045023, + "grad_norm": 1.409768970514064, + "learning_rate": 5.945443273403596e-07, + "loss": 0.9101, + "step": 153440 + }, + { + "epoch": 11.891200743926538, + "grad_norm": 1.4881143616798327, + "learning_rate": 5.945830750154991e-07, + "loss": 0.8798, + "step": 153450 + }, + { + "epoch": 11.891975667402845, + "grad_norm": 1.4302815069732595, + "learning_rate": 5.946218226906386e-07, + "loss": 0.8878, + "step": 153460 + }, + { + "epoch": 11.892750590879151, + "grad_norm": 1.45828849680311, + "learning_rate": 5.946605703657781e-07, + "loss": 0.8877, + "step": 153470 + }, + { + "epoch": 11.893525514355458, + "grad_norm": 1.5473308336646465, + "learning_rate": 5.946993180409176e-07, + "loss": 0.9143, + "step": 153480 + }, + { + "epoch": 11.894300437831765, + "grad_norm": 1.4333083679430385, + "learning_rate": 5.947380657160571e-07, + "loss": 0.8747, + "step": 153490 + }, + { + "epoch": 11.89507536130807, + "grad_norm": 1.5038351763329663, + "learning_rate": 5.947768133911966e-07, + "loss": 0.8936, + "step": 153500 + }, + { + "epoch": 11.89507536130807, + "eval_loss": 0.8996007442474365, + "eval_runtime": 327.9006, + "eval_samples_per_second": 34.983, + "eval_steps_per_second": 8.747, + "step": 153500 + }, + { + "epoch": 11.895850284784377, + "grad_norm": 1.5189070270578502, + "learning_rate": 5.94815561066336e-07, + "loss": 0.8879, + "step": 153510 + }, + { + "epoch": 11.896625208260684, + "grad_norm": 1.4412585249791312, + "learning_rate": 5.948543087414756e-07, + "loss": 0.8913, + "step": 153520 + }, + { + "epoch": 11.89740013173699, + "grad_norm": 1.439997724900694, + "learning_rate": 5.948930564166151e-07, + "loss": 0.8977, + "step": 153530 + }, + { + "epoch": 11.898175055213297, + "grad_norm": 1.4596552574242658, + "learning_rate": 5.949318040917545e-07, + "loss": 0.8827, + "step": 153540 + }, + { + "epoch": 11.898949978689604, + "grad_norm": 1.4764959737937573, + "learning_rate": 5.94970551766894e-07, + "loss": 0.8983, + "step": 153550 + }, + { + "epoch": 11.89972490216591, + "grad_norm": 1.4233872665656824, + "learning_rate": 5.950092994420335e-07, + "loss": 0.8788, + "step": 153560 + }, + { + "epoch": 11.900499825642218, + "grad_norm": 1.5449634700124013, + "learning_rate": 5.950480471171731e-07, + "loss": 0.9127, + "step": 153570 + }, + { + "epoch": 11.901274749118524, + "grad_norm": 1.380893652907036, + "learning_rate": 5.950867947923125e-07, + "loss": 0.892, + "step": 153580 + }, + { + "epoch": 11.902049672594831, + "grad_norm": 1.4455690140718258, + "learning_rate": 5.95125542467452e-07, + "loss": 0.856, + "step": 153590 + }, + { + "epoch": 11.902824596071138, + "grad_norm": 1.536057378461, + "learning_rate": 5.951642901425915e-07, + "loss": 0.8907, + "step": 153600 + }, + { + "epoch": 11.903599519547445, + "grad_norm": 1.405023706222149, + "learning_rate": 5.952030378177309e-07, + "loss": 0.867, + "step": 153610 + }, + { + "epoch": 11.904374443023752, + "grad_norm": 1.4668454493961773, + "learning_rate": 5.952417854928705e-07, + "loss": 0.8685, + "step": 153620 + }, + { + "epoch": 11.905149366500059, + "grad_norm": 1.3983790015790838, + "learning_rate": 5.9528053316801e-07, + "loss": 0.8832, + "step": 153630 + }, + { + "epoch": 11.905924289976365, + "grad_norm": 1.4465055938607274, + "learning_rate": 5.953192808431495e-07, + "loss": 0.8921, + "step": 153640 + }, + { + "epoch": 11.906699213452672, + "grad_norm": 1.4057267263918427, + "learning_rate": 5.953580285182889e-07, + "loss": 0.8967, + "step": 153650 + }, + { + "epoch": 11.907474136928979, + "grad_norm": 1.502925901400588, + "learning_rate": 5.953967761934284e-07, + "loss": 0.8879, + "step": 153660 + }, + { + "epoch": 11.908249060405286, + "grad_norm": 1.494680475577271, + "learning_rate": 5.95435523868568e-07, + "loss": 0.8834, + "step": 153670 + }, + { + "epoch": 11.909023983881593, + "grad_norm": 1.553446382439009, + "learning_rate": 5.954742715437074e-07, + "loss": 0.8798, + "step": 153680 + }, + { + "epoch": 11.909798907357898, + "grad_norm": 1.5015664749867752, + "learning_rate": 5.955130192188469e-07, + "loss": 0.8841, + "step": 153690 + }, + { + "epoch": 11.910573830834204, + "grad_norm": 1.4646134999256153, + "learning_rate": 5.955517668939864e-07, + "loss": 0.8948, + "step": 153700 + }, + { + "epoch": 11.911348754310511, + "grad_norm": 1.535857683897203, + "learning_rate": 5.95590514569126e-07, + "loss": 0.8836, + "step": 153710 + }, + { + "epoch": 11.912123677786818, + "grad_norm": 1.5038569851882362, + "learning_rate": 5.956292622442654e-07, + "loss": 0.8709, + "step": 153720 + }, + { + "epoch": 11.912898601263125, + "grad_norm": 1.4750660748994218, + "learning_rate": 5.956680099194049e-07, + "loss": 0.8808, + "step": 153730 + }, + { + "epoch": 11.913673524739432, + "grad_norm": 1.4429749722319745, + "learning_rate": 5.957067575945444e-07, + "loss": 0.8733, + "step": 153740 + }, + { + "epoch": 11.914448448215738, + "grad_norm": 1.4298330614514039, + "learning_rate": 5.957455052696838e-07, + "loss": 0.8853, + "step": 153750 + }, + { + "epoch": 11.915223371692045, + "grad_norm": 1.4470149703195985, + "learning_rate": 5.957842529448233e-07, + "loss": 0.8996, + "step": 153760 + }, + { + "epoch": 11.915998295168352, + "grad_norm": 1.5675103980344942, + "learning_rate": 5.958230006199629e-07, + "loss": 0.8814, + "step": 153770 + }, + { + "epoch": 11.916773218644659, + "grad_norm": 1.5320553041904768, + "learning_rate": 5.958617482951024e-07, + "loss": 0.9068, + "step": 153780 + }, + { + "epoch": 11.917548142120966, + "grad_norm": 1.5324788871745392, + "learning_rate": 5.959004959702418e-07, + "loss": 0.8724, + "step": 153790 + }, + { + "epoch": 11.918323065597273, + "grad_norm": 1.3976754342796864, + "learning_rate": 5.959392436453813e-07, + "loss": 0.8646, + "step": 153800 + }, + { + "epoch": 11.91909798907358, + "grad_norm": 1.4727271140699396, + "learning_rate": 5.959779913205209e-07, + "loss": 0.9013, + "step": 153810 + }, + { + "epoch": 11.919872912549886, + "grad_norm": 1.5550600736270408, + "learning_rate": 5.960167389956603e-07, + "loss": 0.8978, + "step": 153820 + }, + { + "epoch": 11.920647836026193, + "grad_norm": 1.5087281159692898, + "learning_rate": 5.960554866707998e-07, + "loss": 0.8953, + "step": 153830 + }, + { + "epoch": 11.9214227595025, + "grad_norm": 1.5606284673156383, + "learning_rate": 5.960942343459393e-07, + "loss": 0.892, + "step": 153840 + }, + { + "epoch": 11.922197682978807, + "grad_norm": 1.4712021258155885, + "learning_rate": 5.961329820210788e-07, + "loss": 0.8763, + "step": 153850 + }, + { + "epoch": 11.922972606455113, + "grad_norm": 1.4826963016676256, + "learning_rate": 5.961717296962183e-07, + "loss": 0.8715, + "step": 153860 + }, + { + "epoch": 11.923747529931418, + "grad_norm": 1.4978905981369441, + "learning_rate": 5.962104773713578e-07, + "loss": 0.8925, + "step": 153870 + }, + { + "epoch": 11.924522453407725, + "grad_norm": 1.4640298675573977, + "learning_rate": 5.962492250464973e-07, + "loss": 0.9021, + "step": 153880 + }, + { + "epoch": 11.925297376884032, + "grad_norm": 1.4330945891631532, + "learning_rate": 5.962879727216367e-07, + "loss": 0.8812, + "step": 153890 + }, + { + "epoch": 11.926072300360339, + "grad_norm": 1.4625312721526607, + "learning_rate": 5.963267203967762e-07, + "loss": 0.8921, + "step": 153900 + }, + { + "epoch": 11.926847223836646, + "grad_norm": 1.5394495658077536, + "learning_rate": 5.963654680719158e-07, + "loss": 0.8898, + "step": 153910 + }, + { + "epoch": 11.927622147312952, + "grad_norm": 1.4910868734009035, + "learning_rate": 5.964042157470553e-07, + "loss": 0.8881, + "step": 153920 + }, + { + "epoch": 11.92839707078926, + "grad_norm": 1.5353217454528485, + "learning_rate": 5.964429634221947e-07, + "loss": 0.8849, + "step": 153930 + }, + { + "epoch": 11.929171994265566, + "grad_norm": 1.4447926856017552, + "learning_rate": 5.964817110973342e-07, + "loss": 0.859, + "step": 153940 + }, + { + "epoch": 11.929946917741873, + "grad_norm": 1.410425057944489, + "learning_rate": 5.965204587724737e-07, + "loss": 0.8806, + "step": 153950 + }, + { + "epoch": 11.93072184121818, + "grad_norm": 1.4741681526228667, + "learning_rate": 5.965592064476132e-07, + "loss": 0.8914, + "step": 153960 + }, + { + "epoch": 11.931496764694487, + "grad_norm": 1.522474437096631, + "learning_rate": 5.965979541227527e-07, + "loss": 0.8834, + "step": 153970 + }, + { + "epoch": 11.932271688170793, + "grad_norm": 1.4609978354245716, + "learning_rate": 5.966367017978922e-07, + "loss": 0.9112, + "step": 153980 + }, + { + "epoch": 11.9330466116471, + "grad_norm": 1.4251491888359142, + "learning_rate": 5.966754494730317e-07, + "loss": 0.8831, + "step": 153990 + }, + { + "epoch": 11.933821535123407, + "grad_norm": 1.4287932341347844, + "learning_rate": 5.967141971481711e-07, + "loss": 0.881, + "step": 154000 + }, + { + "epoch": 11.933821535123407, + "eval_loss": 0.8992840647697449, + "eval_runtime": 327.2282, + "eval_samples_per_second": 35.055, + "eval_steps_per_second": 8.765, + "step": 154000 + }, + { + "epoch": 11.934596458599714, + "grad_norm": 1.4810880453495716, + "learning_rate": 5.967529448233107e-07, + "loss": 0.8637, + "step": 154010 + }, + { + "epoch": 11.93537138207602, + "grad_norm": 1.513952236778419, + "learning_rate": 5.967916924984502e-07, + "loss": 0.8942, + "step": 154020 + }, + { + "epoch": 11.936146305552327, + "grad_norm": 1.4705146056772866, + "learning_rate": 5.968304401735896e-07, + "loss": 0.885, + "step": 154030 + }, + { + "epoch": 11.936921229028634, + "grad_norm": 1.4352862134258275, + "learning_rate": 5.968691878487291e-07, + "loss": 0.8775, + "step": 154040 + }, + { + "epoch": 11.937696152504941, + "grad_norm": 1.4694823016396137, + "learning_rate": 5.969079355238686e-07, + "loss": 0.8812, + "step": 154050 + }, + { + "epoch": 11.938471075981248, + "grad_norm": 1.451794598354103, + "learning_rate": 5.969466831990082e-07, + "loss": 0.8741, + "step": 154060 + }, + { + "epoch": 11.939245999457553, + "grad_norm": 1.4091159243462983, + "learning_rate": 5.969854308741476e-07, + "loss": 0.8778, + "step": 154070 + }, + { + "epoch": 11.94002092293386, + "grad_norm": 1.490321884836939, + "learning_rate": 5.970241785492871e-07, + "loss": 0.9036, + "step": 154080 + }, + { + "epoch": 11.940795846410166, + "grad_norm": 1.4669223907054654, + "learning_rate": 5.970629262244266e-07, + "loss": 0.8651, + "step": 154090 + }, + { + "epoch": 11.941570769886473, + "grad_norm": 1.4270209198697428, + "learning_rate": 5.97101673899566e-07, + "loss": 0.8874, + "step": 154100 + }, + { + "epoch": 11.94234569336278, + "grad_norm": 1.5227789872846798, + "learning_rate": 5.971404215747056e-07, + "loss": 0.8922, + "step": 154110 + }, + { + "epoch": 11.943120616839087, + "grad_norm": 1.3737945617527705, + "learning_rate": 5.971791692498451e-07, + "loss": 0.8911, + "step": 154120 + }, + { + "epoch": 11.943895540315394, + "grad_norm": 1.4690353770039783, + "learning_rate": 5.972179169249846e-07, + "loss": 0.8644, + "step": 154130 + }, + { + "epoch": 11.9446704637917, + "grad_norm": 1.4308660611486446, + "learning_rate": 5.97256664600124e-07, + "loss": 0.8686, + "step": 154140 + }, + { + "epoch": 11.945445387268007, + "grad_norm": 1.5612590741446133, + "learning_rate": 5.972954122752635e-07, + "loss": 0.8652, + "step": 154150 + }, + { + "epoch": 11.946220310744314, + "grad_norm": 1.48341721199793, + "learning_rate": 5.973341599504031e-07, + "loss": 0.8885, + "step": 154160 + }, + { + "epoch": 11.946995234220621, + "grad_norm": 1.4102844157086831, + "learning_rate": 5.973729076255425e-07, + "loss": 0.8911, + "step": 154170 + }, + { + "epoch": 11.947770157696928, + "grad_norm": 1.5789228633913621, + "learning_rate": 5.97411655300682e-07, + "loss": 0.8922, + "step": 154180 + }, + { + "epoch": 11.948545081173235, + "grad_norm": 1.4801578221702811, + "learning_rate": 5.974504029758215e-07, + "loss": 0.8755, + "step": 154190 + }, + { + "epoch": 11.949320004649541, + "grad_norm": 1.4818621090055295, + "learning_rate": 5.97489150650961e-07, + "loss": 0.8849, + "step": 154200 + }, + { + "epoch": 11.950094928125848, + "grad_norm": 1.4202638287422216, + "learning_rate": 5.975278983261005e-07, + "loss": 0.8786, + "step": 154210 + }, + { + "epoch": 11.950869851602155, + "grad_norm": 1.5428414165929332, + "learning_rate": 5.9756664600124e-07, + "loss": 0.8915, + "step": 154220 + }, + { + "epoch": 11.951644775078462, + "grad_norm": 1.4378541660520598, + "learning_rate": 5.976053936763795e-07, + "loss": 0.8858, + "step": 154230 + }, + { + "epoch": 11.952419698554767, + "grad_norm": 1.3932626761437064, + "learning_rate": 5.976441413515189e-07, + "loss": 0.8894, + "step": 154240 + }, + { + "epoch": 11.953194622031074, + "grad_norm": 1.4578420840286466, + "learning_rate": 5.976828890266584e-07, + "loss": 0.8905, + "step": 154250 + }, + { + "epoch": 11.95396954550738, + "grad_norm": 1.494448852518979, + "learning_rate": 5.97721636701798e-07, + "loss": 0.8655, + "step": 154260 + }, + { + "epoch": 11.954744468983687, + "grad_norm": 1.4977971154075276, + "learning_rate": 5.977603843769375e-07, + "loss": 0.8848, + "step": 154270 + }, + { + "epoch": 11.955519392459994, + "grad_norm": 1.5286713716271902, + "learning_rate": 5.977991320520769e-07, + "loss": 0.8886, + "step": 154280 + }, + { + "epoch": 11.9562943159363, + "grad_norm": 1.4927883283038217, + "learning_rate": 5.978378797272164e-07, + "loss": 0.8913, + "step": 154290 + }, + { + "epoch": 11.957069239412608, + "grad_norm": 1.4187312787071167, + "learning_rate": 5.97876627402356e-07, + "loss": 0.8732, + "step": 154300 + }, + { + "epoch": 11.957844162888914, + "grad_norm": 1.4501891217827076, + "learning_rate": 5.979153750774954e-07, + "loss": 0.895, + "step": 154310 + }, + { + "epoch": 11.958619086365221, + "grad_norm": 1.5945446061386357, + "learning_rate": 5.979541227526349e-07, + "loss": 0.8774, + "step": 154320 + }, + { + "epoch": 11.959394009841528, + "grad_norm": 1.5398439148429874, + "learning_rate": 5.979928704277744e-07, + "loss": 0.8711, + "step": 154330 + }, + { + "epoch": 11.960168933317835, + "grad_norm": 1.4534093672052215, + "learning_rate": 5.980316181029139e-07, + "loss": 0.8736, + "step": 154340 + }, + { + "epoch": 11.960943856794142, + "grad_norm": 1.428876206631113, + "learning_rate": 5.980703657780533e-07, + "loss": 0.9014, + "step": 154350 + }, + { + "epoch": 11.961718780270449, + "grad_norm": 1.548903982802627, + "learning_rate": 5.981091134531929e-07, + "loss": 0.8832, + "step": 154360 + }, + { + "epoch": 11.962493703746755, + "grad_norm": 1.6095240646534905, + "learning_rate": 5.981478611283324e-07, + "loss": 0.8834, + "step": 154370 + }, + { + "epoch": 11.963268627223062, + "grad_norm": 1.4714415389450721, + "learning_rate": 5.981866088034718e-07, + "loss": 0.8755, + "step": 154380 + }, + { + "epoch": 11.964043550699369, + "grad_norm": 1.4285161515824325, + "learning_rate": 5.982253564786113e-07, + "loss": 0.8737, + "step": 154390 + }, + { + "epoch": 11.964818474175676, + "grad_norm": 1.4534634078145605, + "learning_rate": 5.982641041537508e-07, + "loss": 0.8792, + "step": 154400 + }, + { + "epoch": 11.965593397651983, + "grad_norm": 1.445820120062502, + "learning_rate": 5.983028518288904e-07, + "loss": 0.878, + "step": 154410 + }, + { + "epoch": 11.96636832112829, + "grad_norm": 1.382608565457896, + "learning_rate": 5.983415995040298e-07, + "loss": 0.8844, + "step": 154420 + }, + { + "epoch": 11.967143244604596, + "grad_norm": 1.4804979449684166, + "learning_rate": 5.983803471791693e-07, + "loss": 0.9019, + "step": 154430 + }, + { + "epoch": 11.967918168080901, + "grad_norm": 1.4884918389587902, + "learning_rate": 5.984190948543088e-07, + "loss": 0.8837, + "step": 154440 + }, + { + "epoch": 11.968693091557208, + "grad_norm": 1.4945926323475016, + "learning_rate": 5.984578425294482e-07, + "loss": 0.8684, + "step": 154450 + }, + { + "epoch": 11.969468015033515, + "grad_norm": 1.514439827638201, + "learning_rate": 5.984965902045878e-07, + "loss": 0.8784, + "step": 154460 + }, + { + "epoch": 11.970242938509822, + "grad_norm": 1.5488536528016728, + "learning_rate": 5.985353378797273e-07, + "loss": 0.8826, + "step": 154470 + }, + { + "epoch": 11.971017861986128, + "grad_norm": 1.357861259707862, + "learning_rate": 5.985740855548668e-07, + "loss": 0.8532, + "step": 154480 + }, + { + "epoch": 11.971792785462435, + "grad_norm": 1.4571353617690657, + "learning_rate": 5.986128332300062e-07, + "loss": 0.867, + "step": 154490 + }, + { + "epoch": 11.972567708938742, + "grad_norm": 1.4178776711305174, + "learning_rate": 5.986515809051458e-07, + "loss": 0.8763, + "step": 154500 + }, + { + "epoch": 11.972567708938742, + "eval_loss": 0.8992680311203003, + "eval_runtime": 326.7899, + "eval_samples_per_second": 35.102, + "eval_steps_per_second": 8.776, + "step": 154500 + }, + { + "epoch": 11.973342632415049, + "grad_norm": 1.5114328165845847, + "learning_rate": 5.986903285802853e-07, + "loss": 0.8995, + "step": 154510 + }, + { + "epoch": 11.974117555891356, + "grad_norm": 1.5004906849032689, + "learning_rate": 5.987290762554247e-07, + "loss": 0.8833, + "step": 154520 + }, + { + "epoch": 11.974892479367663, + "grad_norm": 1.4751167998269448, + "learning_rate": 5.987678239305642e-07, + "loss": 0.8716, + "step": 154530 + }, + { + "epoch": 11.97566740284397, + "grad_norm": 1.4211929217406538, + "learning_rate": 5.988065716057037e-07, + "loss": 0.882, + "step": 154540 + }, + { + "epoch": 11.976442326320276, + "grad_norm": 1.472107275936572, + "learning_rate": 5.988453192808433e-07, + "loss": 0.8804, + "step": 154550 + }, + { + "epoch": 11.977217249796583, + "grad_norm": 1.4429536771954448, + "learning_rate": 5.988840669559827e-07, + "loss": 0.8775, + "step": 154560 + }, + { + "epoch": 11.97799217327289, + "grad_norm": 1.4292493014597822, + "learning_rate": 5.989228146311222e-07, + "loss": 0.9134, + "step": 154570 + }, + { + "epoch": 11.978767096749197, + "grad_norm": 1.42822646010791, + "learning_rate": 5.989615623062617e-07, + "loss": 0.8717, + "step": 154580 + }, + { + "epoch": 11.979542020225503, + "grad_norm": 1.4764364122648241, + "learning_rate": 5.990003099814011e-07, + "loss": 0.868, + "step": 154590 + }, + { + "epoch": 11.98031694370181, + "grad_norm": 1.55456768114497, + "learning_rate": 5.990390576565407e-07, + "loss": 0.8799, + "step": 154600 + }, + { + "epoch": 11.981091867178115, + "grad_norm": 1.4686266404385107, + "learning_rate": 5.990778053316802e-07, + "loss": 0.8834, + "step": 154610 + }, + { + "epoch": 11.981866790654422, + "grad_norm": 1.4501745822984464, + "learning_rate": 5.991165530068197e-07, + "loss": 0.8671, + "step": 154620 + }, + { + "epoch": 11.982641714130729, + "grad_norm": 1.419057652628041, + "learning_rate": 5.991553006819591e-07, + "loss": 0.8755, + "step": 154630 + }, + { + "epoch": 11.983416637607036, + "grad_norm": 1.3705417995419744, + "learning_rate": 5.991940483570986e-07, + "loss": 0.8943, + "step": 154640 + }, + { + "epoch": 11.984191561083342, + "grad_norm": 1.4157555894638667, + "learning_rate": 5.992327960322382e-07, + "loss": 0.9135, + "step": 154650 + }, + { + "epoch": 11.98496648455965, + "grad_norm": 1.4579242901371736, + "learning_rate": 5.992715437073776e-07, + "loss": 0.8707, + "step": 154660 + }, + { + "epoch": 11.985741408035956, + "grad_norm": 1.4754421871601366, + "learning_rate": 5.993102913825171e-07, + "loss": 0.8785, + "step": 154670 + }, + { + "epoch": 11.986516331512263, + "grad_norm": 1.4565563806338975, + "learning_rate": 5.993490390576566e-07, + "loss": 0.8772, + "step": 154680 + }, + { + "epoch": 11.98729125498857, + "grad_norm": 1.4859499572577068, + "learning_rate": 5.993877867327961e-07, + "loss": 0.8679, + "step": 154690 + }, + { + "epoch": 11.988066178464877, + "grad_norm": 1.4323383467206492, + "learning_rate": 5.994265344079356e-07, + "loss": 0.8949, + "step": 154700 + }, + { + "epoch": 11.988841101941183, + "grad_norm": 1.5287952101474094, + "learning_rate": 5.994652820830751e-07, + "loss": 0.8893, + "step": 154710 + }, + { + "epoch": 11.98961602541749, + "grad_norm": 1.4687096405912465, + "learning_rate": 5.995040297582146e-07, + "loss": 0.8618, + "step": 154720 + }, + { + "epoch": 11.990390948893797, + "grad_norm": 1.4445580497074957, + "learning_rate": 5.99542777433354e-07, + "loss": 0.8751, + "step": 154730 + }, + { + "epoch": 11.991165872370104, + "grad_norm": 1.461505124589696, + "learning_rate": 5.995815251084935e-07, + "loss": 0.906, + "step": 154740 + }, + { + "epoch": 11.99194079584641, + "grad_norm": 1.4915438814020974, + "learning_rate": 5.996202727836331e-07, + "loss": 0.8921, + "step": 154750 + }, + { + "epoch": 11.992715719322717, + "grad_norm": 1.4913289773020115, + "learning_rate": 5.996590204587725e-07, + "loss": 0.898, + "step": 154760 + }, + { + "epoch": 11.993490642799024, + "grad_norm": 1.5399219818392187, + "learning_rate": 5.99697768133912e-07, + "loss": 0.8833, + "step": 154770 + }, + { + "epoch": 11.994265566275331, + "grad_norm": 1.531864662361833, + "learning_rate": 5.997365158090515e-07, + "loss": 0.8889, + "step": 154780 + }, + { + "epoch": 11.995040489751638, + "grad_norm": 1.449081849165164, + "learning_rate": 5.99775263484191e-07, + "loss": 0.9009, + "step": 154790 + }, + { + "epoch": 11.995815413227945, + "grad_norm": 1.5042867938855728, + "learning_rate": 5.998140111593305e-07, + "loss": 0.8811, + "step": 154800 + }, + { + "epoch": 11.99659033670425, + "grad_norm": 1.4810566368515572, + "learning_rate": 5.9985275883447e-07, + "loss": 0.8894, + "step": 154810 + }, + { + "epoch": 11.997365260180556, + "grad_norm": 1.4322276518735066, + "learning_rate": 5.998915065096095e-07, + "loss": 0.8835, + "step": 154820 + }, + { + "epoch": 11.998140183656863, + "grad_norm": 1.3737014491648727, + "learning_rate": 5.999302541847489e-07, + "loss": 0.9139, + "step": 154830 + }, + { + "epoch": 11.99891510713317, + "grad_norm": 1.55967249714125, + "learning_rate": 5.999690018598884e-07, + "loss": 0.8552, + "step": 154840 + }, + { + "epoch": 11.999690030609477, + "grad_norm": 1.542165536941804, + "learning_rate": 6.00007749535028e-07, + "loss": 0.8599, + "step": 154850 + }, + { + "epoch": 12.000464954085784, + "grad_norm": 1.4898412142905748, + "learning_rate": 6.000464972101675e-07, + "loss": 0.8773, + "step": 154860 + }, + { + "epoch": 12.00123987756209, + "grad_norm": 1.4703992321923363, + "learning_rate": 6.000852448853069e-07, + "loss": 0.867, + "step": 154870 + }, + { + "epoch": 12.002014801038397, + "grad_norm": 1.567608048605454, + "learning_rate": 6.001239925604464e-07, + "loss": 0.8899, + "step": 154880 + }, + { + "epoch": 12.002789724514704, + "grad_norm": 1.4039617856016355, + "learning_rate": 6.001627402355859e-07, + "loss": 0.8764, + "step": 154890 + }, + { + "epoch": 12.003564647991011, + "grad_norm": 1.511371961453414, + "learning_rate": 6.002014879107254e-07, + "loss": 0.885, + "step": 154900 + }, + { + "epoch": 12.004339571467318, + "grad_norm": 1.4328120779781017, + "learning_rate": 6.002402355858649e-07, + "loss": 0.8678, + "step": 154910 + }, + { + "epoch": 12.005114494943625, + "grad_norm": 1.4620398533358487, + "learning_rate": 6.002789832610044e-07, + "loss": 0.8862, + "step": 154920 + }, + { + "epoch": 12.005889418419931, + "grad_norm": 1.5724509593594937, + "learning_rate": 6.003177309361439e-07, + "loss": 0.872, + "step": 154930 + }, + { + "epoch": 12.006664341896238, + "grad_norm": 1.4108724685313279, + "learning_rate": 6.003564786112833e-07, + "loss": 0.8797, + "step": 154940 + }, + { + "epoch": 12.007439265372545, + "grad_norm": 1.440706058164247, + "learning_rate": 6.003952262864229e-07, + "loss": 0.8976, + "step": 154950 + }, + { + "epoch": 12.008214188848852, + "grad_norm": 1.5139263951735655, + "learning_rate": 6.004339739615624e-07, + "loss": 0.8779, + "step": 154960 + }, + { + "epoch": 12.008989112325159, + "grad_norm": 1.4984456430305573, + "learning_rate": 6.004727216367018e-07, + "loss": 0.8872, + "step": 154970 + }, + { + "epoch": 12.009764035801465, + "grad_norm": 1.5035907113314213, + "learning_rate": 6.005114693118413e-07, + "loss": 0.8599, + "step": 154980 + }, + { + "epoch": 12.01053895927777, + "grad_norm": 1.6250375721841581, + "learning_rate": 6.005502169869808e-07, + "loss": 0.8737, + "step": 154990 + }, + { + "epoch": 12.011313882754077, + "grad_norm": 1.4239690492424746, + "learning_rate": 6.005889646621204e-07, + "loss": 0.8953, + "step": 155000 + }, + { + "epoch": 12.011313882754077, + "eval_loss": 0.8994728922843933, + "eval_runtime": 331.9105, + "eval_samples_per_second": 34.561, + "eval_steps_per_second": 8.641, + "step": 155000 + }, + { + "epoch": 12.012088806230384, + "grad_norm": 1.457804083454543, + "learning_rate": 6.006277123372598e-07, + "loss": 0.8822, + "step": 155010 + }, + { + "epoch": 12.012863729706691, + "grad_norm": 1.505693639169695, + "learning_rate": 6.006664600123993e-07, + "loss": 0.8837, + "step": 155020 + }, + { + "epoch": 12.013638653182998, + "grad_norm": 1.4331248730243284, + "learning_rate": 6.007052076875388e-07, + "loss": 0.8874, + "step": 155030 + }, + { + "epoch": 12.014413576659305, + "grad_norm": 1.443261729532448, + "learning_rate": 6.007439553626782e-07, + "loss": 0.8894, + "step": 155040 + }, + { + "epoch": 12.015188500135611, + "grad_norm": 1.464154322894863, + "learning_rate": 6.007827030378178e-07, + "loss": 0.866, + "step": 155050 + }, + { + "epoch": 12.015963423611918, + "grad_norm": 1.451344119600828, + "learning_rate": 6.008214507129573e-07, + "loss": 0.8803, + "step": 155060 + }, + { + "epoch": 12.016738347088225, + "grad_norm": 1.4947497994403203, + "learning_rate": 6.008601983880968e-07, + "loss": 0.87, + "step": 155070 + }, + { + "epoch": 12.017513270564532, + "grad_norm": 1.4666801311612516, + "learning_rate": 6.008989460632362e-07, + "loss": 0.8621, + "step": 155080 + }, + { + "epoch": 12.018288194040839, + "grad_norm": 1.527177455814391, + "learning_rate": 6.009376937383757e-07, + "loss": 0.8972, + "step": 155090 + }, + { + "epoch": 12.019063117517145, + "grad_norm": 1.4714259104431562, + "learning_rate": 6.009764414135153e-07, + "loss": 0.9153, + "step": 155100 + }, + { + "epoch": 12.019838040993452, + "grad_norm": 1.4210378725794324, + "learning_rate": 6.010151890886547e-07, + "loss": 0.8611, + "step": 155110 + }, + { + "epoch": 12.020612964469759, + "grad_norm": 1.4083349363949715, + "learning_rate": 6.010539367637942e-07, + "loss": 0.881, + "step": 155120 + }, + { + "epoch": 12.021387887946066, + "grad_norm": 1.6228828067200307, + "learning_rate": 6.010926844389337e-07, + "loss": 0.8874, + "step": 155130 + }, + { + "epoch": 12.022162811422373, + "grad_norm": 1.4618888638264205, + "learning_rate": 6.011314321140733e-07, + "loss": 0.8987, + "step": 155140 + }, + { + "epoch": 12.02293773489868, + "grad_norm": 1.4772645964027686, + "learning_rate": 6.011701797892127e-07, + "loss": 0.8771, + "step": 155150 + }, + { + "epoch": 12.023712658374986, + "grad_norm": 1.5337947586849046, + "learning_rate": 6.012089274643522e-07, + "loss": 0.8612, + "step": 155160 + }, + { + "epoch": 12.024487581851293, + "grad_norm": 1.5302090462356437, + "learning_rate": 6.012476751394917e-07, + "loss": 0.8614, + "step": 155170 + }, + { + "epoch": 12.025262505327598, + "grad_norm": 1.4826782393683344, + "learning_rate": 6.012864228146311e-07, + "loss": 0.8749, + "step": 155180 + }, + { + "epoch": 12.026037428803905, + "grad_norm": 1.458878026217401, + "learning_rate": 6.013251704897707e-07, + "loss": 0.8967, + "step": 155190 + }, + { + "epoch": 12.026812352280212, + "grad_norm": 1.5025695556587229, + "learning_rate": 6.013639181649102e-07, + "loss": 0.8764, + "step": 155200 + }, + { + "epoch": 12.027587275756519, + "grad_norm": 1.4701233352451288, + "learning_rate": 6.014026658400497e-07, + "loss": 0.8606, + "step": 155210 + }, + { + "epoch": 12.028362199232825, + "grad_norm": 1.5180739411583697, + "learning_rate": 6.014414135151891e-07, + "loss": 0.8789, + "step": 155220 + }, + { + "epoch": 12.029137122709132, + "grad_norm": 1.576008239905358, + "learning_rate": 6.014801611903286e-07, + "loss": 0.8742, + "step": 155230 + }, + { + "epoch": 12.029912046185439, + "grad_norm": 1.4928995704673882, + "learning_rate": 6.015189088654682e-07, + "loss": 0.8523, + "step": 155240 + }, + { + "epoch": 12.030686969661746, + "grad_norm": 1.4595627454160933, + "learning_rate": 6.015576565406076e-07, + "loss": 0.8938, + "step": 155250 + }, + { + "epoch": 12.031461893138053, + "grad_norm": 1.4555315613136912, + "learning_rate": 6.015964042157471e-07, + "loss": 0.8682, + "step": 155260 + }, + { + "epoch": 12.03223681661436, + "grad_norm": 1.492425014827705, + "learning_rate": 6.016351518908866e-07, + "loss": 0.8702, + "step": 155270 + }, + { + "epoch": 12.033011740090666, + "grad_norm": 1.535822029752254, + "learning_rate": 6.016738995660261e-07, + "loss": 0.8801, + "step": 155280 + }, + { + "epoch": 12.033786663566973, + "grad_norm": 1.4918493535587227, + "learning_rate": 6.017126472411656e-07, + "loss": 0.8961, + "step": 155290 + }, + { + "epoch": 12.03456158704328, + "grad_norm": 1.4539316917905967, + "learning_rate": 6.017513949163051e-07, + "loss": 0.8734, + "step": 155300 + }, + { + "epoch": 12.035336510519587, + "grad_norm": 1.3811820673546567, + "learning_rate": 6.017901425914446e-07, + "loss": 0.8728, + "step": 155310 + }, + { + "epoch": 12.036111433995893, + "grad_norm": 1.476501711672417, + "learning_rate": 6.01828890266584e-07, + "loss": 0.8726, + "step": 155320 + }, + { + "epoch": 12.0368863574722, + "grad_norm": 1.517908485951429, + "learning_rate": 6.018676379417235e-07, + "loss": 0.882, + "step": 155330 + }, + { + "epoch": 12.037661280948507, + "grad_norm": 1.4202262160310204, + "learning_rate": 6.019063856168631e-07, + "loss": 0.8703, + "step": 155340 + }, + { + "epoch": 12.038436204424814, + "grad_norm": 1.521612145466335, + "learning_rate": 6.019451332920026e-07, + "loss": 0.8865, + "step": 155350 + }, + { + "epoch": 12.039211127901119, + "grad_norm": 1.492477155904759, + "learning_rate": 6.01983880967142e-07, + "loss": 0.8681, + "step": 155360 + }, + { + "epoch": 12.039986051377426, + "grad_norm": 1.4293231132680861, + "learning_rate": 6.020226286422815e-07, + "loss": 0.8872, + "step": 155370 + }, + { + "epoch": 12.040760974853733, + "grad_norm": 1.4464228683087976, + "learning_rate": 6.02061376317421e-07, + "loss": 0.8864, + "step": 155380 + }, + { + "epoch": 12.04153589833004, + "grad_norm": 1.4866611573985653, + "learning_rate": 6.021001239925605e-07, + "loss": 0.9073, + "step": 155390 + }, + { + "epoch": 12.042310821806346, + "grad_norm": 1.5370476320872524, + "learning_rate": 6.021388716677e-07, + "loss": 0.8895, + "step": 155400 + }, + { + "epoch": 12.043085745282653, + "grad_norm": 1.511714374408981, + "learning_rate": 6.021776193428395e-07, + "loss": 0.868, + "step": 155410 + }, + { + "epoch": 12.04386066875896, + "grad_norm": 1.426669708339641, + "learning_rate": 6.02216367017979e-07, + "loss": 0.8708, + "step": 155420 + }, + { + "epoch": 12.044635592235267, + "grad_norm": 1.60212761690784, + "learning_rate": 6.022551146931184e-07, + "loss": 0.8992, + "step": 155430 + }, + { + "epoch": 12.045410515711573, + "grad_norm": 1.4948277874230385, + "learning_rate": 6.02293862368258e-07, + "loss": 0.865, + "step": 155440 + }, + { + "epoch": 12.04618543918788, + "grad_norm": 1.47289182396088, + "learning_rate": 6.023326100433975e-07, + "loss": 0.8797, + "step": 155450 + }, + { + "epoch": 12.046960362664187, + "grad_norm": 1.5345653554998, + "learning_rate": 6.023713577185369e-07, + "loss": 0.893, + "step": 155460 + }, + { + "epoch": 12.047735286140494, + "grad_norm": 1.496381470381331, + "learning_rate": 6.024101053936764e-07, + "loss": 0.8819, + "step": 155470 + }, + { + "epoch": 12.0485102096168, + "grad_norm": 1.5314077436476041, + "learning_rate": 6.024488530688159e-07, + "loss": 0.8738, + "step": 155480 + }, + { + "epoch": 12.049285133093107, + "grad_norm": 1.5057761501786697, + "learning_rate": 6.024876007439555e-07, + "loss": 0.9079, + "step": 155490 + }, + { + "epoch": 12.050060056569414, + "grad_norm": 1.5174262324848735, + "learning_rate": 6.025263484190949e-07, + "loss": 0.8749, + "step": 155500 + }, + { + "epoch": 12.050060056569414, + "eval_loss": 0.8994477987289429, + "eval_runtime": 330.0619, + "eval_samples_per_second": 34.754, + "eval_steps_per_second": 8.689, + "step": 155500 + }, + { + "epoch": 12.050834980045721, + "grad_norm": 1.511747831981707, + "learning_rate": 6.025650960942344e-07, + "loss": 0.8786, + "step": 155510 + }, + { + "epoch": 12.051609903522028, + "grad_norm": 1.4454302186648513, + "learning_rate": 6.026038437693739e-07, + "loss": 0.8982, + "step": 155520 + }, + { + "epoch": 12.052384826998335, + "grad_norm": 1.6299572075267357, + "learning_rate": 6.026425914445133e-07, + "loss": 0.874, + "step": 155530 + }, + { + "epoch": 12.053159750474641, + "grad_norm": 1.4446624852033418, + "learning_rate": 6.026813391196529e-07, + "loss": 0.862, + "step": 155540 + }, + { + "epoch": 12.053934673950947, + "grad_norm": 1.5193747089675538, + "learning_rate": 6.027200867947924e-07, + "loss": 0.8834, + "step": 155550 + }, + { + "epoch": 12.054709597427253, + "grad_norm": 1.3932394994954391, + "learning_rate": 6.027588344699319e-07, + "loss": 0.8876, + "step": 155560 + }, + { + "epoch": 12.05548452090356, + "grad_norm": 1.4806810114687499, + "learning_rate": 6.027975821450713e-07, + "loss": 0.8651, + "step": 155570 + }, + { + "epoch": 12.056259444379867, + "grad_norm": 1.6052605418416659, + "learning_rate": 6.028363298202108e-07, + "loss": 0.862, + "step": 155580 + }, + { + "epoch": 12.057034367856174, + "grad_norm": 1.4667065789871894, + "learning_rate": 6.028750774953504e-07, + "loss": 0.8845, + "step": 155590 + }, + { + "epoch": 12.05780929133248, + "grad_norm": 1.531087811030159, + "learning_rate": 6.029138251704898e-07, + "loss": 0.8845, + "step": 155600 + }, + { + "epoch": 12.058584214808787, + "grad_norm": 1.436924593677797, + "learning_rate": 6.029525728456293e-07, + "loss": 0.877, + "step": 155610 + }, + { + "epoch": 12.059359138285094, + "grad_norm": 1.498545961698412, + "learning_rate": 6.029913205207688e-07, + "loss": 0.8674, + "step": 155620 + }, + { + "epoch": 12.060134061761401, + "grad_norm": 1.51988741936543, + "learning_rate": 6.030300681959083e-07, + "loss": 0.8978, + "step": 155630 + }, + { + "epoch": 12.060908985237708, + "grad_norm": 1.402047818420674, + "learning_rate": 6.030688158710478e-07, + "loss": 0.8739, + "step": 155640 + }, + { + "epoch": 12.061683908714015, + "grad_norm": 1.4634908797758737, + "learning_rate": 6.031075635461873e-07, + "loss": 0.8745, + "step": 155650 + }, + { + "epoch": 12.062458832190321, + "grad_norm": 1.488957846247291, + "learning_rate": 6.031463112213268e-07, + "loss": 0.8906, + "step": 155660 + }, + { + "epoch": 12.063233755666628, + "grad_norm": 1.4196852737135646, + "learning_rate": 6.031850588964662e-07, + "loss": 0.8629, + "step": 155670 + }, + { + "epoch": 12.064008679142935, + "grad_norm": 1.5719630269776033, + "learning_rate": 6.032238065716057e-07, + "loss": 0.8581, + "step": 155680 + }, + { + "epoch": 12.064783602619242, + "grad_norm": 1.4840894932970963, + "learning_rate": 6.032625542467453e-07, + "loss": 0.8974, + "step": 155690 + }, + { + "epoch": 12.065558526095549, + "grad_norm": 1.460323610396662, + "learning_rate": 6.033013019218848e-07, + "loss": 0.8818, + "step": 155700 + }, + { + "epoch": 12.066333449571855, + "grad_norm": 1.4619541468573138, + "learning_rate": 6.033400495970242e-07, + "loss": 0.8818, + "step": 155710 + }, + { + "epoch": 12.067108373048162, + "grad_norm": 1.4567893439872452, + "learning_rate": 6.033787972721637e-07, + "loss": 0.8935, + "step": 155720 + }, + { + "epoch": 12.067883296524467, + "grad_norm": 1.4477314474343557, + "learning_rate": 6.034175449473032e-07, + "loss": 0.8797, + "step": 155730 + }, + { + "epoch": 12.068658220000774, + "grad_norm": 1.4645738567225337, + "learning_rate": 6.034562926224427e-07, + "loss": 0.8837, + "step": 155740 + }, + { + "epoch": 12.069433143477081, + "grad_norm": 1.4471641577615482, + "learning_rate": 6.034950402975822e-07, + "loss": 0.8858, + "step": 155750 + }, + { + "epoch": 12.070208066953388, + "grad_norm": 1.4121927476668397, + "learning_rate": 6.035337879727217e-07, + "loss": 0.8785, + "step": 155760 + }, + { + "epoch": 12.070982990429695, + "grad_norm": 1.4240452356426514, + "learning_rate": 6.035725356478612e-07, + "loss": 0.8807, + "step": 155770 + }, + { + "epoch": 12.071757913906001, + "grad_norm": 1.4425508451729405, + "learning_rate": 6.036112833230006e-07, + "loss": 0.8922, + "step": 155780 + }, + { + "epoch": 12.072532837382308, + "grad_norm": 1.5247547385728217, + "learning_rate": 6.036500309981402e-07, + "loss": 0.8795, + "step": 155790 + }, + { + "epoch": 12.073307760858615, + "grad_norm": 1.4416887035884383, + "learning_rate": 6.036887786732797e-07, + "loss": 0.8814, + "step": 155800 + }, + { + "epoch": 12.074082684334922, + "grad_norm": 1.5134798779365202, + "learning_rate": 6.037275263484191e-07, + "loss": 0.879, + "step": 155810 + }, + { + "epoch": 12.074857607811229, + "grad_norm": 1.4994687440058851, + "learning_rate": 6.037662740235586e-07, + "loss": 0.8825, + "step": 155820 + }, + { + "epoch": 12.075632531287535, + "grad_norm": 1.556563779827516, + "learning_rate": 6.038050216986982e-07, + "loss": 0.8758, + "step": 155830 + }, + { + "epoch": 12.076407454763842, + "grad_norm": 1.5256587167824411, + "learning_rate": 6.038437693738377e-07, + "loss": 0.8769, + "step": 155840 + }, + { + "epoch": 12.077182378240149, + "grad_norm": 1.4799669014316068, + "learning_rate": 6.038825170489771e-07, + "loss": 0.8748, + "step": 155850 + }, + { + "epoch": 12.077957301716456, + "grad_norm": 1.4737490500718744, + "learning_rate": 6.039212647241166e-07, + "loss": 0.8677, + "step": 155860 + }, + { + "epoch": 12.078732225192763, + "grad_norm": 1.5555162970837815, + "learning_rate": 6.039600123992561e-07, + "loss": 0.8998, + "step": 155870 + }, + { + "epoch": 12.07950714866907, + "grad_norm": 1.4765771400101775, + "learning_rate": 6.039987600743955e-07, + "loss": 0.8941, + "step": 155880 + }, + { + "epoch": 12.080282072145376, + "grad_norm": 1.449044749489514, + "learning_rate": 6.040375077495351e-07, + "loss": 0.8894, + "step": 155890 + }, + { + "epoch": 12.081056995621683, + "grad_norm": 1.5198682223248527, + "learning_rate": 6.040762554246746e-07, + "loss": 0.8703, + "step": 155900 + }, + { + "epoch": 12.08183191909799, + "grad_norm": 1.4936724063922882, + "learning_rate": 6.041150030998141e-07, + "loss": 0.8635, + "step": 155910 + }, + { + "epoch": 12.082606842574295, + "grad_norm": 1.5306286545800225, + "learning_rate": 6.041537507749535e-07, + "loss": 0.883, + "step": 155920 + }, + { + "epoch": 12.083381766050602, + "grad_norm": 1.470806083661034, + "learning_rate": 6.041924984500931e-07, + "loss": 0.8902, + "step": 155930 + }, + { + "epoch": 12.084156689526909, + "grad_norm": 1.4925207839134147, + "learning_rate": 6.042312461252326e-07, + "loss": 0.8746, + "step": 155940 + }, + { + "epoch": 12.084931613003215, + "grad_norm": 1.491569414841436, + "learning_rate": 6.04269993800372e-07, + "loss": 0.8916, + "step": 155950 + }, + { + "epoch": 12.085706536479522, + "grad_norm": 1.5541371722807547, + "learning_rate": 6.043087414755115e-07, + "loss": 0.8764, + "step": 155960 + }, + { + "epoch": 12.086481459955829, + "grad_norm": 1.5800527935466813, + "learning_rate": 6.04347489150651e-07, + "loss": 0.8789, + "step": 155970 + }, + { + "epoch": 12.087256383432136, + "grad_norm": 1.5771735589411222, + "learning_rate": 6.043862368257906e-07, + "loss": 0.8892, + "step": 155980 + }, + { + "epoch": 12.088031306908443, + "grad_norm": 1.535358893743383, + "learning_rate": 6.0442498450093e-07, + "loss": 0.8739, + "step": 155990 + }, + { + "epoch": 12.08880623038475, + "grad_norm": 1.5197924853463387, + "learning_rate": 6.044637321760695e-07, + "loss": 0.8769, + "step": 156000 + }, + { + "epoch": 12.08880623038475, + "eval_loss": 0.8995653390884399, + "eval_runtime": 330.1299, + "eval_samples_per_second": 34.747, + "eval_steps_per_second": 8.687, + "step": 156000 + }, + { + "epoch": 12.089581153861056, + "grad_norm": 1.4604931122868372, + "learning_rate": 6.04502479851209e-07, + "loss": 0.8713, + "step": 156010 + }, + { + "epoch": 12.090356077337363, + "grad_norm": 1.5937930984439108, + "learning_rate": 6.045412275263484e-07, + "loss": 0.8834, + "step": 156020 + }, + { + "epoch": 12.09113100081367, + "grad_norm": 1.4993891896411395, + "learning_rate": 6.04579975201488e-07, + "loss": 0.9076, + "step": 156030 + }, + { + "epoch": 12.091905924289977, + "grad_norm": 1.5399747123784244, + "learning_rate": 6.046187228766275e-07, + "loss": 0.8976, + "step": 156040 + }, + { + "epoch": 12.092680847766283, + "grad_norm": 1.4478326238829702, + "learning_rate": 6.04657470551767e-07, + "loss": 0.8785, + "step": 156050 + }, + { + "epoch": 12.09345577124259, + "grad_norm": 1.4100459977788573, + "learning_rate": 6.046962182269064e-07, + "loss": 0.8827, + "step": 156060 + }, + { + "epoch": 12.094230694718897, + "grad_norm": 1.404333859616359, + "learning_rate": 6.047349659020459e-07, + "loss": 0.8693, + "step": 156070 + }, + { + "epoch": 12.095005618195204, + "grad_norm": 1.5226618156321585, + "learning_rate": 6.047737135771855e-07, + "loss": 0.8747, + "step": 156080 + }, + { + "epoch": 12.09578054167151, + "grad_norm": 1.5451904597797668, + "learning_rate": 6.048124612523249e-07, + "loss": 0.8632, + "step": 156090 + }, + { + "epoch": 12.096555465147816, + "grad_norm": 1.459492478671033, + "learning_rate": 6.048512089274644e-07, + "loss": 0.8875, + "step": 156100 + }, + { + "epoch": 12.097330388624123, + "grad_norm": 1.5452963341942703, + "learning_rate": 6.048899566026039e-07, + "loss": 0.8872, + "step": 156110 + }, + { + "epoch": 12.09810531210043, + "grad_norm": 1.545015012594595, + "learning_rate": 6.049287042777434e-07, + "loss": 0.8883, + "step": 156120 + }, + { + "epoch": 12.098880235576736, + "grad_norm": 1.4577019065808532, + "learning_rate": 6.049674519528829e-07, + "loss": 0.869, + "step": 156130 + }, + { + "epoch": 12.099655159053043, + "grad_norm": 1.5306985295315125, + "learning_rate": 6.050061996280224e-07, + "loss": 0.8746, + "step": 156140 + }, + { + "epoch": 12.10043008252935, + "grad_norm": 1.480689453784675, + "learning_rate": 6.050449473031619e-07, + "loss": 0.8554, + "step": 156150 + }, + { + "epoch": 12.101205006005657, + "grad_norm": 1.5257809879654511, + "learning_rate": 6.050836949783013e-07, + "loss": 0.8931, + "step": 156160 + }, + { + "epoch": 12.101979929481963, + "grad_norm": 1.4597555175366905, + "learning_rate": 6.051224426534408e-07, + "loss": 0.8709, + "step": 156170 + }, + { + "epoch": 12.10275485295827, + "grad_norm": 1.4867745208906225, + "learning_rate": 6.051611903285804e-07, + "loss": 0.8847, + "step": 156180 + }, + { + "epoch": 12.103529776434577, + "grad_norm": 1.5394740343166453, + "learning_rate": 6.051999380037198e-07, + "loss": 0.8748, + "step": 156190 + }, + { + "epoch": 12.104304699910884, + "grad_norm": 1.5791544925139416, + "learning_rate": 6.052386856788593e-07, + "loss": 0.8895, + "step": 156200 + }, + { + "epoch": 12.10507962338719, + "grad_norm": 2.108278773969711, + "learning_rate": 6.052774333539988e-07, + "loss": 0.8724, + "step": 156210 + }, + { + "epoch": 12.105854546863497, + "grad_norm": 1.5474699836762347, + "learning_rate": 6.053161810291383e-07, + "loss": 0.8996, + "step": 156220 + }, + { + "epoch": 12.106629470339804, + "grad_norm": 1.4720014562421915, + "learning_rate": 6.053549287042778e-07, + "loss": 0.8705, + "step": 156230 + }, + { + "epoch": 12.107404393816111, + "grad_norm": 1.5671898922767027, + "learning_rate": 6.053936763794173e-07, + "loss": 0.8678, + "step": 156240 + }, + { + "epoch": 12.108179317292418, + "grad_norm": 1.4664340661894515, + "learning_rate": 6.054324240545568e-07, + "loss": 0.8719, + "step": 156250 + }, + { + "epoch": 12.108954240768725, + "grad_norm": 1.5220067245340392, + "learning_rate": 6.054711717296962e-07, + "loss": 0.8887, + "step": 156260 + }, + { + "epoch": 12.109729164245032, + "grad_norm": 1.4750049930078872, + "learning_rate": 6.055099194048357e-07, + "loss": 0.8788, + "step": 156270 + }, + { + "epoch": 12.110504087721338, + "grad_norm": 1.527237409245921, + "learning_rate": 6.055486670799753e-07, + "loss": 0.883, + "step": 156280 + }, + { + "epoch": 12.111279011197643, + "grad_norm": 1.4559981592159947, + "learning_rate": 6.055874147551148e-07, + "loss": 0.9024, + "step": 156290 + }, + { + "epoch": 12.11205393467395, + "grad_norm": 1.4778921849630127, + "learning_rate": 6.056261624302542e-07, + "loss": 0.873, + "step": 156300 + }, + { + "epoch": 12.112828858150257, + "grad_norm": 1.5569920412157225, + "learning_rate": 6.056649101053937e-07, + "loss": 0.8636, + "step": 156310 + }, + { + "epoch": 12.113603781626564, + "grad_norm": 1.5052266917525396, + "learning_rate": 6.057036577805332e-07, + "loss": 0.8843, + "step": 156320 + }, + { + "epoch": 12.11437870510287, + "grad_norm": 1.4664382598900527, + "learning_rate": 6.057424054556727e-07, + "loss": 0.9219, + "step": 156330 + }, + { + "epoch": 12.115153628579177, + "grad_norm": 1.5429798301365487, + "learning_rate": 6.057811531308122e-07, + "loss": 0.8892, + "step": 156340 + }, + { + "epoch": 12.115928552055484, + "grad_norm": 1.4654751717350112, + "learning_rate": 6.058199008059517e-07, + "loss": 0.8921, + "step": 156350 + }, + { + "epoch": 12.116703475531791, + "grad_norm": 1.4182459281525122, + "learning_rate": 6.058586484810912e-07, + "loss": 0.8818, + "step": 156360 + }, + { + "epoch": 12.117478399008098, + "grad_norm": 1.4755087367285094, + "learning_rate": 6.058973961562306e-07, + "loss": 0.8833, + "step": 156370 + }, + { + "epoch": 12.118253322484405, + "grad_norm": 1.490333225940191, + "learning_rate": 6.059361438313702e-07, + "loss": 0.8708, + "step": 156380 + }, + { + "epoch": 12.119028245960711, + "grad_norm": 1.4680588552804403, + "learning_rate": 6.059748915065097e-07, + "loss": 0.8807, + "step": 156390 + }, + { + "epoch": 12.119803169437018, + "grad_norm": 1.5401720969502075, + "learning_rate": 6.060136391816491e-07, + "loss": 0.8679, + "step": 156400 + }, + { + "epoch": 12.120578092913325, + "grad_norm": 1.4853973487318999, + "learning_rate": 6.060523868567886e-07, + "loss": 0.8901, + "step": 156410 + }, + { + "epoch": 12.121353016389632, + "grad_norm": 1.5373143561822462, + "learning_rate": 6.060911345319281e-07, + "loss": 0.8928, + "step": 156420 + }, + { + "epoch": 12.122127939865939, + "grad_norm": 1.4576340420215497, + "learning_rate": 6.061298822070677e-07, + "loss": 0.8787, + "step": 156430 + }, + { + "epoch": 12.122902863342246, + "grad_norm": 1.4797303488133922, + "learning_rate": 6.061686298822071e-07, + "loss": 0.8825, + "step": 156440 + }, + { + "epoch": 12.123677786818552, + "grad_norm": 1.4720722669646849, + "learning_rate": 6.062073775573466e-07, + "loss": 0.892, + "step": 156450 + }, + { + "epoch": 12.12445271029486, + "grad_norm": 1.4305781390560648, + "learning_rate": 6.062461252324861e-07, + "loss": 0.8859, + "step": 156460 + }, + { + "epoch": 12.125227633771166, + "grad_norm": 1.4944439590166343, + "learning_rate": 6.062848729076255e-07, + "loss": 0.881, + "step": 156470 + }, + { + "epoch": 12.126002557247471, + "grad_norm": 1.4965861716449194, + "learning_rate": 6.063236205827651e-07, + "loss": 0.8778, + "step": 156480 + }, + { + "epoch": 12.126777480723778, + "grad_norm": 1.5561180609029546, + "learning_rate": 6.063623682579046e-07, + "loss": 0.8885, + "step": 156490 + }, + { + "epoch": 12.127552404200085, + "grad_norm": 1.580233135704789, + "learning_rate": 6.064011159330441e-07, + "loss": 0.896, + "step": 156500 + }, + { + "epoch": 12.127552404200085, + "eval_loss": 0.8991771340370178, + "eval_runtime": 331.7274, + "eval_samples_per_second": 34.58, + "eval_steps_per_second": 8.646, + "step": 156500 + }, + { + "epoch": 12.128327327676391, + "grad_norm": 1.416241368913254, + "learning_rate": 6.064398636081835e-07, + "loss": 0.8768, + "step": 156510 + }, + { + "epoch": 12.129102251152698, + "grad_norm": 1.444194557426298, + "learning_rate": 6.06478611283323e-07, + "loss": 0.8847, + "step": 156520 + }, + { + "epoch": 12.129877174629005, + "grad_norm": 1.5251223864787649, + "learning_rate": 6.065173589584626e-07, + "loss": 0.8934, + "step": 156530 + }, + { + "epoch": 12.130652098105312, + "grad_norm": 1.4690866059987844, + "learning_rate": 6.06556106633602e-07, + "loss": 0.8818, + "step": 156540 + }, + { + "epoch": 12.131427021581619, + "grad_norm": 1.5398677218143677, + "learning_rate": 6.065948543087415e-07, + "loss": 0.9076, + "step": 156550 + }, + { + "epoch": 12.132201945057925, + "grad_norm": 1.481610360951755, + "learning_rate": 6.06633601983881e-07, + "loss": 0.888, + "step": 156560 + }, + { + "epoch": 12.132976868534232, + "grad_norm": 1.4500600734476699, + "learning_rate": 6.066723496590206e-07, + "loss": 0.8678, + "step": 156570 + }, + { + "epoch": 12.133751792010539, + "grad_norm": 1.5109452417789948, + "learning_rate": 6.0671109733416e-07, + "loss": 0.8741, + "step": 156580 + }, + { + "epoch": 12.134526715486846, + "grad_norm": 1.4926721474730094, + "learning_rate": 6.067498450092995e-07, + "loss": 0.8792, + "step": 156590 + }, + { + "epoch": 12.135301638963153, + "grad_norm": 1.4827810006570163, + "learning_rate": 6.06788592684439e-07, + "loss": 0.8817, + "step": 156600 + }, + { + "epoch": 12.13607656243946, + "grad_norm": 1.4255060505946393, + "learning_rate": 6.068273403595784e-07, + "loss": 0.8859, + "step": 156610 + }, + { + "epoch": 12.136851485915766, + "grad_norm": 1.5561559073010485, + "learning_rate": 6.06866088034718e-07, + "loss": 0.8862, + "step": 156620 + }, + { + "epoch": 12.137626409392073, + "grad_norm": 1.6437516618514758, + "learning_rate": 6.069048357098575e-07, + "loss": 0.8948, + "step": 156630 + }, + { + "epoch": 12.13840133286838, + "grad_norm": 1.4792969528902673, + "learning_rate": 6.06943583384997e-07, + "loss": 0.884, + "step": 156640 + }, + { + "epoch": 12.139176256344687, + "grad_norm": 1.4359592288426268, + "learning_rate": 6.069823310601364e-07, + "loss": 0.8603, + "step": 156650 + }, + { + "epoch": 12.139951179820992, + "grad_norm": 1.5595950833894938, + "learning_rate": 6.070210787352759e-07, + "loss": 0.8862, + "step": 156660 + }, + { + "epoch": 12.140726103297299, + "grad_norm": 1.5374198995528086, + "learning_rate": 6.070598264104155e-07, + "loss": 0.8801, + "step": 156670 + }, + { + "epoch": 12.141501026773605, + "grad_norm": 1.5449460287572432, + "learning_rate": 6.070985740855549e-07, + "loss": 0.8798, + "step": 156680 + }, + { + "epoch": 12.142275950249912, + "grad_norm": 1.4395955052970473, + "learning_rate": 6.071373217606944e-07, + "loss": 0.8717, + "step": 156690 + }, + { + "epoch": 12.143050873726219, + "grad_norm": 1.3789792762894408, + "learning_rate": 6.071760694358339e-07, + "loss": 0.8803, + "step": 156700 + }, + { + "epoch": 12.143825797202526, + "grad_norm": 1.4831120067976804, + "learning_rate": 6.072148171109734e-07, + "loss": 0.8726, + "step": 156710 + }, + { + "epoch": 12.144600720678833, + "grad_norm": 1.4537432207445264, + "learning_rate": 6.072535647861129e-07, + "loss": 0.883, + "step": 156720 + }, + { + "epoch": 12.14537564415514, + "grad_norm": 1.43180393078308, + "learning_rate": 6.072923124612524e-07, + "loss": 0.8779, + "step": 156730 + }, + { + "epoch": 12.146150567631446, + "grad_norm": 1.4942643800813027, + "learning_rate": 6.073310601363919e-07, + "loss": 0.8917, + "step": 156740 + }, + { + "epoch": 12.146925491107753, + "grad_norm": 1.4403159188087842, + "learning_rate": 6.073698078115313e-07, + "loss": 0.8882, + "step": 156750 + }, + { + "epoch": 12.14770041458406, + "grad_norm": 1.541248933955264, + "learning_rate": 6.074085554866708e-07, + "loss": 0.8678, + "step": 156760 + }, + { + "epoch": 12.148475338060367, + "grad_norm": 1.5217897729386851, + "learning_rate": 6.074473031618104e-07, + "loss": 0.8696, + "step": 156770 + }, + { + "epoch": 12.149250261536674, + "grad_norm": 1.4001664171049033, + "learning_rate": 6.074860508369499e-07, + "loss": 0.8782, + "step": 156780 + }, + { + "epoch": 12.15002518501298, + "grad_norm": 1.5283766991395595, + "learning_rate": 6.075247985120893e-07, + "loss": 0.8948, + "step": 156790 + }, + { + "epoch": 12.150800108489287, + "grad_norm": 1.4679033112547935, + "learning_rate": 6.075635461872288e-07, + "loss": 0.8869, + "step": 156800 + }, + { + "epoch": 12.151575031965594, + "grad_norm": 1.4474088273609031, + "learning_rate": 6.076022938623683e-07, + "loss": 0.888, + "step": 156810 + }, + { + "epoch": 12.1523499554419, + "grad_norm": 1.5045390646842571, + "learning_rate": 6.076410415375078e-07, + "loss": 0.8822, + "step": 156820 + }, + { + "epoch": 12.153124878918208, + "grad_norm": 1.4294652880246355, + "learning_rate": 6.076797892126473e-07, + "loss": 0.8896, + "step": 156830 + }, + { + "epoch": 12.153899802394514, + "grad_norm": 1.410617520425946, + "learning_rate": 6.077185368877868e-07, + "loss": 0.874, + "step": 156840 + }, + { + "epoch": 12.15467472587082, + "grad_norm": 1.4709289806894406, + "learning_rate": 6.077572845629263e-07, + "loss": 0.8852, + "step": 156850 + }, + { + "epoch": 12.155449649347126, + "grad_norm": 1.5799398495719457, + "learning_rate": 6.077960322380657e-07, + "loss": 0.8784, + "step": 156860 + }, + { + "epoch": 12.156224572823433, + "grad_norm": 1.5127078393463214, + "learning_rate": 6.078347799132053e-07, + "loss": 0.8816, + "step": 156870 + }, + { + "epoch": 12.15699949629974, + "grad_norm": 1.4763680173138893, + "learning_rate": 6.078735275883448e-07, + "loss": 0.8646, + "step": 156880 + }, + { + "epoch": 12.157774419776047, + "grad_norm": 1.4283910052551383, + "learning_rate": 6.079122752634842e-07, + "loss": 0.8679, + "step": 156890 + }, + { + "epoch": 12.158549343252353, + "grad_norm": 1.4743300745674652, + "learning_rate": 6.079510229386237e-07, + "loss": 0.8822, + "step": 156900 + }, + { + "epoch": 12.15932426672866, + "grad_norm": 1.4352792200615423, + "learning_rate": 6.079897706137632e-07, + "loss": 0.8889, + "step": 156910 + }, + { + "epoch": 12.160099190204967, + "grad_norm": 1.4607673077485552, + "learning_rate": 6.080285182889028e-07, + "loss": 0.8795, + "step": 156920 + }, + { + "epoch": 12.160874113681274, + "grad_norm": 1.4631387947269667, + "learning_rate": 6.080672659640422e-07, + "loss": 0.8921, + "step": 156930 + }, + { + "epoch": 12.16164903715758, + "grad_norm": 1.432351641257757, + "learning_rate": 6.081060136391817e-07, + "loss": 0.8888, + "step": 156940 + }, + { + "epoch": 12.162423960633888, + "grad_norm": 1.5160711747148805, + "learning_rate": 6.081447613143212e-07, + "loss": 0.8754, + "step": 156950 + }, + { + "epoch": 12.163198884110194, + "grad_norm": 1.5432246718351992, + "learning_rate": 6.081835089894606e-07, + "loss": 0.8728, + "step": 156960 + }, + { + "epoch": 12.163973807586501, + "grad_norm": 1.609180386851863, + "learning_rate": 6.082222566646002e-07, + "loss": 0.8796, + "step": 156970 + }, + { + "epoch": 12.164748731062808, + "grad_norm": 1.4382853407102116, + "learning_rate": 6.082610043397397e-07, + "loss": 0.8945, + "step": 156980 + }, + { + "epoch": 12.165523654539115, + "grad_norm": 1.4654299244319873, + "learning_rate": 6.082997520148792e-07, + "loss": 0.9004, + "step": 156990 + }, + { + "epoch": 12.166298578015422, + "grad_norm": 1.4427521562006145, + "learning_rate": 6.083384996900186e-07, + "loss": 0.8692, + "step": 157000 + }, + { + "epoch": 12.166298578015422, + "eval_loss": 0.8991262316703796, + "eval_runtime": 330.3872, + "eval_samples_per_second": 34.72, + "eval_steps_per_second": 8.681, + "step": 157000 + }, + { + "epoch": 12.167073501491728, + "grad_norm": 1.4740380241454256, + "learning_rate": 6.083772473651581e-07, + "loss": 0.886, + "step": 157010 + }, + { + "epoch": 12.167848424968035, + "grad_norm": 1.5398728096256238, + "learning_rate": 6.084159950402977e-07, + "loss": 0.8742, + "step": 157020 + }, + { + "epoch": 12.168623348444342, + "grad_norm": 1.5852526015881676, + "learning_rate": 6.084547427154371e-07, + "loss": 0.875, + "step": 157030 + }, + { + "epoch": 12.169398271920647, + "grad_norm": 1.4051949193772355, + "learning_rate": 6.084934903905766e-07, + "loss": 0.8597, + "step": 157040 + }, + { + "epoch": 12.170173195396954, + "grad_norm": 1.604129278770291, + "learning_rate": 6.085322380657161e-07, + "loss": 0.8664, + "step": 157050 + }, + { + "epoch": 12.17094811887326, + "grad_norm": 1.5072527192793437, + "learning_rate": 6.085709857408556e-07, + "loss": 0.8622, + "step": 157060 + }, + { + "epoch": 12.171723042349567, + "grad_norm": 1.4990784924425824, + "learning_rate": 6.086097334159951e-07, + "loss": 0.859, + "step": 157070 + }, + { + "epoch": 12.172497965825874, + "grad_norm": 1.472676989627385, + "learning_rate": 6.086484810911346e-07, + "loss": 0.8682, + "step": 157080 + }, + { + "epoch": 12.173272889302181, + "grad_norm": 1.4811846019095183, + "learning_rate": 6.086872287662741e-07, + "loss": 0.8736, + "step": 157090 + }, + { + "epoch": 12.174047812778488, + "grad_norm": 1.585617813714764, + "learning_rate": 6.087259764414135e-07, + "loss": 0.8927, + "step": 157100 + }, + { + "epoch": 12.174822736254795, + "grad_norm": 1.4651699587140645, + "learning_rate": 6.08764724116553e-07, + "loss": 0.8632, + "step": 157110 + }, + { + "epoch": 12.175597659731102, + "grad_norm": 1.450973906583597, + "learning_rate": 6.088034717916926e-07, + "loss": 0.8781, + "step": 157120 + }, + { + "epoch": 12.176372583207408, + "grad_norm": 1.4091964649686315, + "learning_rate": 6.088422194668321e-07, + "loss": 0.8657, + "step": 157130 + }, + { + "epoch": 12.177147506683715, + "grad_norm": 1.507768156077027, + "learning_rate": 6.088809671419715e-07, + "loss": 0.8714, + "step": 157140 + }, + { + "epoch": 12.177922430160022, + "grad_norm": 1.4666360377370489, + "learning_rate": 6.08919714817111e-07, + "loss": 0.8737, + "step": 157150 + }, + { + "epoch": 12.178697353636329, + "grad_norm": 1.395832942095137, + "learning_rate": 6.089584624922506e-07, + "loss": 0.8881, + "step": 157160 + }, + { + "epoch": 12.179472277112636, + "grad_norm": 1.4227621270495039, + "learning_rate": 6.0899721016739e-07, + "loss": 0.8708, + "step": 157170 + }, + { + "epoch": 12.180247200588942, + "grad_norm": 1.4043421253491992, + "learning_rate": 6.090359578425295e-07, + "loss": 0.8635, + "step": 157180 + }, + { + "epoch": 12.18102212406525, + "grad_norm": 1.5288085485194445, + "learning_rate": 6.09074705517669e-07, + "loss": 0.884, + "step": 157190 + }, + { + "epoch": 12.181797047541556, + "grad_norm": 1.4709109280290704, + "learning_rate": 6.091134531928085e-07, + "loss": 0.8617, + "step": 157200 + }, + { + "epoch": 12.182571971017863, + "grad_norm": 1.4419944543984073, + "learning_rate": 6.09152200867948e-07, + "loss": 0.8649, + "step": 157210 + }, + { + "epoch": 12.183346894494168, + "grad_norm": 1.5197572153209873, + "learning_rate": 6.091909485430875e-07, + "loss": 0.8746, + "step": 157220 + }, + { + "epoch": 12.184121817970475, + "grad_norm": 1.5162363098839673, + "learning_rate": 6.09229696218227e-07, + "loss": 0.8879, + "step": 157230 + }, + { + "epoch": 12.184896741446781, + "grad_norm": 1.412566544759313, + "learning_rate": 6.092684438933664e-07, + "loss": 0.8922, + "step": 157240 + }, + { + "epoch": 12.185671664923088, + "grad_norm": 1.5294501617441398, + "learning_rate": 6.093071915685059e-07, + "loss": 0.886, + "step": 157250 + }, + { + "epoch": 12.186446588399395, + "grad_norm": 1.4351018803295241, + "learning_rate": 6.093459392436455e-07, + "loss": 0.8846, + "step": 157260 + }, + { + "epoch": 12.187221511875702, + "grad_norm": 1.5787933747525669, + "learning_rate": 6.09384686918785e-07, + "loss": 0.8945, + "step": 157270 + }, + { + "epoch": 12.187996435352009, + "grad_norm": 1.4153890455785956, + "learning_rate": 6.094234345939244e-07, + "loss": 0.8799, + "step": 157280 + }, + { + "epoch": 12.188771358828316, + "grad_norm": 1.5103202573148427, + "learning_rate": 6.094621822690639e-07, + "loss": 0.8646, + "step": 157290 + }, + { + "epoch": 12.189546282304622, + "grad_norm": 1.4550593528886515, + "learning_rate": 6.095009299442034e-07, + "loss": 0.8824, + "step": 157300 + }, + { + "epoch": 12.19032120578093, + "grad_norm": 1.5048789929017683, + "learning_rate": 6.095396776193429e-07, + "loss": 0.8826, + "step": 157310 + }, + { + "epoch": 12.191096129257236, + "grad_norm": 1.4870772269809132, + "learning_rate": 6.095784252944824e-07, + "loss": 0.8906, + "step": 157320 + }, + { + "epoch": 12.191871052733543, + "grad_norm": 1.5153321486790436, + "learning_rate": 6.096171729696219e-07, + "loss": 0.8598, + "step": 157330 + }, + { + "epoch": 12.19264597620985, + "grad_norm": 1.4903524762653646, + "learning_rate": 6.096559206447614e-07, + "loss": 0.8945, + "step": 157340 + }, + { + "epoch": 12.193420899686156, + "grad_norm": 1.4853228453837046, + "learning_rate": 6.096946683199008e-07, + "loss": 0.8822, + "step": 157350 + }, + { + "epoch": 12.194195823162463, + "grad_norm": 1.4803852311041705, + "learning_rate": 6.097334159950404e-07, + "loss": 0.8698, + "step": 157360 + }, + { + "epoch": 12.19497074663877, + "grad_norm": 1.4697303881056898, + "learning_rate": 6.097721636701799e-07, + "loss": 0.8796, + "step": 157370 + }, + { + "epoch": 12.195745670115077, + "grad_norm": 1.4720595998734827, + "learning_rate": 6.098109113453193e-07, + "loss": 0.8673, + "step": 157380 + }, + { + "epoch": 12.196520593591384, + "grad_norm": 1.6090671821685225, + "learning_rate": 6.098496590204588e-07, + "loss": 0.8647, + "step": 157390 + }, + { + "epoch": 12.19729551706769, + "grad_norm": 1.4237073862620842, + "learning_rate": 6.098884066955983e-07, + "loss": 0.8869, + "step": 157400 + }, + { + "epoch": 12.198070440543995, + "grad_norm": 1.6273065371797721, + "learning_rate": 6.099271543707379e-07, + "loss": 0.9005, + "step": 157410 + }, + { + "epoch": 12.198845364020302, + "grad_norm": 1.4583808876606914, + "learning_rate": 6.099659020458773e-07, + "loss": 0.893, + "step": 157420 + }, + { + "epoch": 12.199620287496609, + "grad_norm": 1.4522974135212177, + "learning_rate": 6.100046497210168e-07, + "loss": 0.8842, + "step": 157430 + }, + { + "epoch": 12.200395210972916, + "grad_norm": 1.5319710107464184, + "learning_rate": 6.100433973961563e-07, + "loss": 0.8636, + "step": 157440 + }, + { + "epoch": 12.201170134449223, + "grad_norm": 1.5235145132227819, + "learning_rate": 6.100821450712957e-07, + "loss": 0.882, + "step": 157450 + }, + { + "epoch": 12.20194505792553, + "grad_norm": 1.4227544032546682, + "learning_rate": 6.101208927464353e-07, + "loss": 0.895, + "step": 157460 + }, + { + "epoch": 12.202719981401836, + "grad_norm": 1.4688662681493536, + "learning_rate": 6.101596404215748e-07, + "loss": 0.883, + "step": 157470 + }, + { + "epoch": 12.203494904878143, + "grad_norm": 1.5070667043410544, + "learning_rate": 6.101983880967143e-07, + "loss": 0.8737, + "step": 157480 + }, + { + "epoch": 12.20426982835445, + "grad_norm": 1.5624124774306078, + "learning_rate": 6.102371357718537e-07, + "loss": 0.8862, + "step": 157490 + }, + { + "epoch": 12.205044751830757, + "grad_norm": 1.5487228239205473, + "learning_rate": 6.102758834469932e-07, + "loss": 0.8552, + "step": 157500 + }, + { + "epoch": 12.205044751830757, + "eval_loss": 0.8988638520240784, + "eval_runtime": 326.6905, + "eval_samples_per_second": 35.113, + "eval_steps_per_second": 8.779, + "step": 157500 + }, + { + "epoch": 12.205819675307064, + "grad_norm": 1.4337212986689083, + "learning_rate": 6.103146311221328e-07, + "loss": 0.8774, + "step": 157510 + }, + { + "epoch": 12.20659459878337, + "grad_norm": 1.548401697181017, + "learning_rate": 6.103533787972722e-07, + "loss": 0.8928, + "step": 157520 + }, + { + "epoch": 12.207369522259677, + "grad_norm": 1.4817137053064802, + "learning_rate": 6.103921264724117e-07, + "loss": 0.8727, + "step": 157530 + }, + { + "epoch": 12.208144445735984, + "grad_norm": 1.6157560123474886, + "learning_rate": 6.104308741475512e-07, + "loss": 0.8824, + "step": 157540 + }, + { + "epoch": 12.20891936921229, + "grad_norm": 1.4749773730105824, + "learning_rate": 6.104696218226907e-07, + "loss": 0.8837, + "step": 157550 + }, + { + "epoch": 12.209694292688598, + "grad_norm": 1.502334424614696, + "learning_rate": 6.105083694978302e-07, + "loss": 0.8867, + "step": 157560 + }, + { + "epoch": 12.210469216164904, + "grad_norm": 1.6238858739055417, + "learning_rate": 6.105471171729697e-07, + "loss": 0.8791, + "step": 157570 + }, + { + "epoch": 12.211244139641211, + "grad_norm": 1.3971362507160594, + "learning_rate": 6.105858648481092e-07, + "loss": 0.8905, + "step": 157580 + }, + { + "epoch": 12.212019063117516, + "grad_norm": 1.6052094159741583, + "learning_rate": 6.106246125232486e-07, + "loss": 0.8847, + "step": 157590 + }, + { + "epoch": 12.212793986593823, + "grad_norm": 1.55514387479929, + "learning_rate": 6.106633601983881e-07, + "loss": 0.8739, + "step": 157600 + }, + { + "epoch": 12.21356891007013, + "grad_norm": 1.5282906097516522, + "learning_rate": 6.107021078735277e-07, + "loss": 0.8828, + "step": 157610 + }, + { + "epoch": 12.214343833546437, + "grad_norm": 1.4735564161524413, + "learning_rate": 6.107408555486672e-07, + "loss": 0.8736, + "step": 157620 + }, + { + "epoch": 12.215118757022744, + "grad_norm": 1.5202675801740275, + "learning_rate": 6.107796032238066e-07, + "loss": 0.8728, + "step": 157630 + }, + { + "epoch": 12.21589368049905, + "grad_norm": 1.5214611917348075, + "learning_rate": 6.108183508989461e-07, + "loss": 0.8944, + "step": 157640 + }, + { + "epoch": 12.216668603975357, + "grad_norm": 1.5107890521895306, + "learning_rate": 6.108570985740856e-07, + "loss": 0.8808, + "step": 157650 + }, + { + "epoch": 12.217443527451664, + "grad_norm": 1.402889173650584, + "learning_rate": 6.108958462492251e-07, + "loss": 0.8798, + "step": 157660 + }, + { + "epoch": 12.21821845092797, + "grad_norm": 1.6104199275112097, + "learning_rate": 6.109345939243646e-07, + "loss": 0.8727, + "step": 157670 + }, + { + "epoch": 12.218993374404278, + "grad_norm": 1.4650271671171256, + "learning_rate": 6.109733415995041e-07, + "loss": 0.8732, + "step": 157680 + }, + { + "epoch": 12.219768297880584, + "grad_norm": 1.493403168911699, + "learning_rate": 6.110120892746435e-07, + "loss": 0.8963, + "step": 157690 + }, + { + "epoch": 12.220543221356891, + "grad_norm": 1.4134781892166683, + "learning_rate": 6.11050836949783e-07, + "loss": 0.88, + "step": 157700 + }, + { + "epoch": 12.221318144833198, + "grad_norm": 1.4745171449995453, + "learning_rate": 6.110895846249226e-07, + "loss": 0.8898, + "step": 157710 + }, + { + "epoch": 12.222093068309505, + "grad_norm": 1.5489122151705181, + "learning_rate": 6.111283323000621e-07, + "loss": 0.8879, + "step": 157720 + }, + { + "epoch": 12.222867991785812, + "grad_norm": 1.4830542619427831, + "learning_rate": 6.111670799752015e-07, + "loss": 0.875, + "step": 157730 + }, + { + "epoch": 12.223642915262118, + "grad_norm": 1.5621721928220935, + "learning_rate": 6.11205827650341e-07, + "loss": 0.8933, + "step": 157740 + }, + { + "epoch": 12.224417838738425, + "grad_norm": 1.492978819873221, + "learning_rate": 6.112445753254805e-07, + "loss": 0.8835, + "step": 157750 + }, + { + "epoch": 12.225192762214732, + "grad_norm": 1.420111125145664, + "learning_rate": 6.1128332300062e-07, + "loss": 0.8816, + "step": 157760 + }, + { + "epoch": 12.225967685691039, + "grad_norm": 1.517154513351472, + "learning_rate": 6.113220706757595e-07, + "loss": 0.8785, + "step": 157770 + }, + { + "epoch": 12.226742609167344, + "grad_norm": 1.4839160429266265, + "learning_rate": 6.11360818350899e-07, + "loss": 0.8842, + "step": 157780 + }, + { + "epoch": 12.22751753264365, + "grad_norm": 1.5769431868767165, + "learning_rate": 6.113995660260385e-07, + "loss": 0.8817, + "step": 157790 + }, + { + "epoch": 12.228292456119958, + "grad_norm": 1.516874776383057, + "learning_rate": 6.114383137011779e-07, + "loss": 0.8854, + "step": 157800 + }, + { + "epoch": 12.229067379596264, + "grad_norm": 1.5263871244746874, + "learning_rate": 6.114770613763175e-07, + "loss": 0.8919, + "step": 157810 + }, + { + "epoch": 12.229842303072571, + "grad_norm": 1.4820079771937125, + "learning_rate": 6.11515809051457e-07, + "loss": 0.8827, + "step": 157820 + }, + { + "epoch": 12.230617226548878, + "grad_norm": 1.6482307043113562, + "learning_rate": 6.115545567265964e-07, + "loss": 0.8803, + "step": 157830 + }, + { + "epoch": 12.231392150025185, + "grad_norm": 1.4538140273867797, + "learning_rate": 6.115933044017359e-07, + "loss": 0.8809, + "step": 157840 + }, + { + "epoch": 12.232167073501492, + "grad_norm": 1.4440712720556383, + "learning_rate": 6.116320520768755e-07, + "loss": 0.8866, + "step": 157850 + }, + { + "epoch": 12.232941996977798, + "grad_norm": 1.4114184504747918, + "learning_rate": 6.11670799752015e-07, + "loss": 0.891, + "step": 157860 + }, + { + "epoch": 12.233716920454105, + "grad_norm": 1.5602390342890444, + "learning_rate": 6.117095474271544e-07, + "loss": 0.8834, + "step": 157870 + }, + { + "epoch": 12.234491843930412, + "grad_norm": 1.5494202147352656, + "learning_rate": 6.117482951022939e-07, + "loss": 0.8956, + "step": 157880 + }, + { + "epoch": 12.235266767406719, + "grad_norm": 1.5123081126370017, + "learning_rate": 6.117870427774334e-07, + "loss": 0.8822, + "step": 157890 + }, + { + "epoch": 12.236041690883026, + "grad_norm": 1.5524163106638647, + "learning_rate": 6.118257904525728e-07, + "loss": 0.8844, + "step": 157900 + }, + { + "epoch": 12.236816614359332, + "grad_norm": 1.5384497796971253, + "learning_rate": 6.118645381277124e-07, + "loss": 0.8765, + "step": 157910 + }, + { + "epoch": 12.23759153783564, + "grad_norm": 1.565628535922822, + "learning_rate": 6.119032858028519e-07, + "loss": 0.8638, + "step": 157920 + }, + { + "epoch": 12.238366461311946, + "grad_norm": 1.4961965529427175, + "learning_rate": 6.119420334779914e-07, + "loss": 0.8777, + "step": 157930 + }, + { + "epoch": 12.239141384788253, + "grad_norm": 1.4418957191204487, + "learning_rate": 6.119807811531308e-07, + "loss": 0.8739, + "step": 157940 + }, + { + "epoch": 12.23991630826456, + "grad_norm": 1.5009261093480586, + "learning_rate": 6.120195288282704e-07, + "loss": 0.8778, + "step": 157950 + }, + { + "epoch": 12.240691231740865, + "grad_norm": 1.4451979490583269, + "learning_rate": 6.120582765034099e-07, + "loss": 0.8778, + "step": 157960 + }, + { + "epoch": 12.241466155217172, + "grad_norm": 1.6377026234956553, + "learning_rate": 6.120970241785493e-07, + "loss": 0.8726, + "step": 157970 + }, + { + "epoch": 12.242241078693478, + "grad_norm": 1.4998602166392312, + "learning_rate": 6.121357718536888e-07, + "loss": 0.8984, + "step": 157980 + }, + { + "epoch": 12.243016002169785, + "grad_norm": 1.6022727979600087, + "learning_rate": 6.121745195288283e-07, + "loss": 0.859, + "step": 157990 + }, + { + "epoch": 12.243790925646092, + "grad_norm": 1.453978096318387, + "learning_rate": 6.122132672039679e-07, + "loss": 0.8962, + "step": 158000 + }, + { + "epoch": 12.243790925646092, + "eval_loss": 0.8985908031463623, + "eval_runtime": 328.9887, + "eval_samples_per_second": 34.867, + "eval_steps_per_second": 8.718, + "step": 158000 + }, + { + "epoch": 12.244565849122399, + "grad_norm": 1.5362710148995187, + "learning_rate": 6.122520148791073e-07, + "loss": 0.875, + "step": 158010 + }, + { + "epoch": 12.245340772598706, + "grad_norm": 1.6218278276894291, + "learning_rate": 6.122907625542468e-07, + "loss": 0.882, + "step": 158020 + }, + { + "epoch": 12.246115696075012, + "grad_norm": 1.4995415097326112, + "learning_rate": 6.123295102293863e-07, + "loss": 0.8797, + "step": 158030 + }, + { + "epoch": 12.24689061955132, + "grad_norm": 1.4523247816791052, + "learning_rate": 6.123682579045257e-07, + "loss": 0.8667, + "step": 158040 + }, + { + "epoch": 12.247665543027626, + "grad_norm": 1.5315803270905022, + "learning_rate": 6.124070055796653e-07, + "loss": 0.8755, + "step": 158050 + }, + { + "epoch": 12.248440466503933, + "grad_norm": 1.4357744298925892, + "learning_rate": 6.124457532548048e-07, + "loss": 0.8876, + "step": 158060 + }, + { + "epoch": 12.24921538998024, + "grad_norm": 1.6742740527846638, + "learning_rate": 6.124845009299443e-07, + "loss": 0.8892, + "step": 158070 + }, + { + "epoch": 12.249990313456546, + "grad_norm": 1.4832526908294614, + "learning_rate": 6.125232486050837e-07, + "loss": 0.8914, + "step": 158080 + }, + { + "epoch": 12.250765236932853, + "grad_norm": 1.494882698677439, + "learning_rate": 6.125619962802232e-07, + "loss": 0.8578, + "step": 158090 + }, + { + "epoch": 12.25154016040916, + "grad_norm": 1.554498969783276, + "learning_rate": 6.126007439553628e-07, + "loss": 0.8714, + "step": 158100 + }, + { + "epoch": 12.252315083885467, + "grad_norm": 1.4360609780630518, + "learning_rate": 6.126394916305022e-07, + "loss": 0.8783, + "step": 158110 + }, + { + "epoch": 12.253090007361774, + "grad_norm": 1.5742183783720312, + "learning_rate": 6.126782393056417e-07, + "loss": 0.8975, + "step": 158120 + }, + { + "epoch": 12.25386493083808, + "grad_norm": 1.5605283081613657, + "learning_rate": 6.127169869807812e-07, + "loss": 0.8967, + "step": 158130 + }, + { + "epoch": 12.254639854314387, + "grad_norm": 1.4940018238031143, + "learning_rate": 6.127557346559207e-07, + "loss": 0.8702, + "step": 158140 + }, + { + "epoch": 12.255414777790692, + "grad_norm": 1.5263453350154443, + "learning_rate": 6.127944823310602e-07, + "loss": 0.8931, + "step": 158150 + }, + { + "epoch": 12.256189701267, + "grad_norm": 1.4843187164991347, + "learning_rate": 6.128332300061997e-07, + "loss": 0.8785, + "step": 158160 + }, + { + "epoch": 12.256964624743306, + "grad_norm": 1.475757815841219, + "learning_rate": 6.128719776813392e-07, + "loss": 0.9068, + "step": 158170 + }, + { + "epoch": 12.257739548219613, + "grad_norm": 1.5720692417802542, + "learning_rate": 6.129107253564786e-07, + "loss": 0.8956, + "step": 158180 + }, + { + "epoch": 12.25851447169592, + "grad_norm": 1.4650203126200751, + "learning_rate": 6.129494730316181e-07, + "loss": 0.8758, + "step": 158190 + }, + { + "epoch": 12.259289395172226, + "grad_norm": 1.4587951086838586, + "learning_rate": 6.129882207067577e-07, + "loss": 0.8824, + "step": 158200 + }, + { + "epoch": 12.260064318648533, + "grad_norm": 1.4990016521782896, + "learning_rate": 6.130269683818972e-07, + "loss": 0.8723, + "step": 158210 + }, + { + "epoch": 12.26083924212484, + "grad_norm": 1.493901133995111, + "learning_rate": 6.130657160570366e-07, + "loss": 0.8601, + "step": 158220 + }, + { + "epoch": 12.261614165601147, + "grad_norm": 1.5680163764249273, + "learning_rate": 6.131044637321761e-07, + "loss": 0.8771, + "step": 158230 + }, + { + "epoch": 12.262389089077454, + "grad_norm": 1.5399203905901024, + "learning_rate": 6.131432114073156e-07, + "loss": 0.8858, + "step": 158240 + }, + { + "epoch": 12.26316401255376, + "grad_norm": 1.5320385847172138, + "learning_rate": 6.131819590824551e-07, + "loss": 0.8705, + "step": 158250 + }, + { + "epoch": 12.263938936030067, + "grad_norm": 1.531519190307248, + "learning_rate": 6.132207067575946e-07, + "loss": 0.8887, + "step": 158260 + }, + { + "epoch": 12.264713859506374, + "grad_norm": 1.5897786522978214, + "learning_rate": 6.132594544327341e-07, + "loss": 0.8784, + "step": 158270 + }, + { + "epoch": 12.26548878298268, + "grad_norm": 1.4216080205852393, + "learning_rate": 6.132982021078736e-07, + "loss": 0.8645, + "step": 158280 + }, + { + "epoch": 12.266263706458988, + "grad_norm": 1.4477300417067982, + "learning_rate": 6.13336949783013e-07, + "loss": 0.8712, + "step": 158290 + }, + { + "epoch": 12.267038629935294, + "grad_norm": 1.4901624505862654, + "learning_rate": 6.133756974581526e-07, + "loss": 0.8826, + "step": 158300 + }, + { + "epoch": 12.267813553411601, + "grad_norm": 1.4487206269954955, + "learning_rate": 6.134144451332921e-07, + "loss": 0.8697, + "step": 158310 + }, + { + "epoch": 12.268588476887908, + "grad_norm": 1.488043109174641, + "learning_rate": 6.134531928084315e-07, + "loss": 0.8693, + "step": 158320 + }, + { + "epoch": 12.269363400364213, + "grad_norm": 1.518291603788926, + "learning_rate": 6.13491940483571e-07, + "loss": 0.8767, + "step": 158330 + }, + { + "epoch": 12.27013832384052, + "grad_norm": 1.5639076427747012, + "learning_rate": 6.135306881587105e-07, + "loss": 0.8856, + "step": 158340 + }, + { + "epoch": 12.270913247316827, + "grad_norm": 1.457695135026716, + "learning_rate": 6.135694358338501e-07, + "loss": 0.8923, + "step": 158350 + }, + { + "epoch": 12.271688170793134, + "grad_norm": 1.5584815993066823, + "learning_rate": 6.136081835089895e-07, + "loss": 0.8799, + "step": 158360 + }, + { + "epoch": 12.27246309426944, + "grad_norm": 1.5100889030375142, + "learning_rate": 6.13646931184129e-07, + "loss": 0.8609, + "step": 158370 + }, + { + "epoch": 12.273238017745747, + "grad_norm": 1.5424241505628085, + "learning_rate": 6.136856788592685e-07, + "loss": 0.8747, + "step": 158380 + }, + { + "epoch": 12.274012941222054, + "grad_norm": 1.5856208944816985, + "learning_rate": 6.137244265344079e-07, + "loss": 0.8981, + "step": 158390 + }, + { + "epoch": 12.27478786469836, + "grad_norm": 1.5347552028236982, + "learning_rate": 6.137631742095475e-07, + "loss": 0.8748, + "step": 158400 + }, + { + "epoch": 12.275562788174668, + "grad_norm": 1.4931730929823501, + "learning_rate": 6.13801921884687e-07, + "loss": 0.8733, + "step": 158410 + }, + { + "epoch": 12.276337711650974, + "grad_norm": 1.4975403105121528, + "learning_rate": 6.138406695598265e-07, + "loss": 0.8907, + "step": 158420 + }, + { + "epoch": 12.277112635127281, + "grad_norm": 1.5721824636661523, + "learning_rate": 6.138794172349659e-07, + "loss": 0.8953, + "step": 158430 + }, + { + "epoch": 12.277887558603588, + "grad_norm": 1.5024124656898867, + "learning_rate": 6.139181649101054e-07, + "loss": 0.8795, + "step": 158440 + }, + { + "epoch": 12.278662482079895, + "grad_norm": 1.537837644576179, + "learning_rate": 6.13956912585245e-07, + "loss": 0.881, + "step": 158450 + }, + { + "epoch": 12.279437405556202, + "grad_norm": 1.4944922073432556, + "learning_rate": 6.139956602603844e-07, + "loss": 0.8577, + "step": 158460 + }, + { + "epoch": 12.280212329032508, + "grad_norm": 1.4902578792366312, + "learning_rate": 6.140344079355239e-07, + "loss": 0.8808, + "step": 158470 + }, + { + "epoch": 12.280987252508815, + "grad_norm": 1.5410948688082757, + "learning_rate": 6.140731556106634e-07, + "loss": 0.8696, + "step": 158480 + }, + { + "epoch": 12.281762175985122, + "grad_norm": 1.470437291699779, + "learning_rate": 6.14111903285803e-07, + "loss": 0.8713, + "step": 158490 + }, + { + "epoch": 12.282537099461429, + "grad_norm": 1.4239977134038613, + "learning_rate": 6.141506509609424e-07, + "loss": 0.877, + "step": 158500 + }, + { + "epoch": 12.282537099461429, + "eval_loss": 0.8987441658973694, + "eval_runtime": 329.156, + "eval_samples_per_second": 34.85, + "eval_steps_per_second": 8.713, + "step": 158500 + }, + { + "epoch": 12.283312022937736, + "grad_norm": 1.4997741682714454, + "learning_rate": 6.141893986360819e-07, + "loss": 0.8843, + "step": 158510 + }, + { + "epoch": 12.28408694641404, + "grad_norm": 1.436646726245214, + "learning_rate": 6.142281463112214e-07, + "loss": 0.8778, + "step": 158520 + }, + { + "epoch": 12.284861869890348, + "grad_norm": 1.5126247289702892, + "learning_rate": 6.142668939863608e-07, + "loss": 0.8627, + "step": 158530 + }, + { + "epoch": 12.285636793366654, + "grad_norm": 1.533681322223184, + "learning_rate": 6.143056416615003e-07, + "loss": 0.8853, + "step": 158540 + }, + { + "epoch": 12.286411716842961, + "grad_norm": 1.4368728517664946, + "learning_rate": 6.143443893366399e-07, + "loss": 0.8806, + "step": 158550 + }, + { + "epoch": 12.287186640319268, + "grad_norm": 1.535419381657779, + "learning_rate": 6.143831370117794e-07, + "loss": 0.883, + "step": 158560 + }, + { + "epoch": 12.287961563795575, + "grad_norm": 1.4249223644814604, + "learning_rate": 6.144218846869188e-07, + "loss": 0.8628, + "step": 158570 + }, + { + "epoch": 12.288736487271882, + "grad_norm": 1.498536926747217, + "learning_rate": 6.144606323620583e-07, + "loss": 0.8627, + "step": 158580 + }, + { + "epoch": 12.289511410748188, + "grad_norm": 1.4415351975153858, + "learning_rate": 6.144993800371979e-07, + "loss": 0.9026, + "step": 158590 + }, + { + "epoch": 12.290286334224495, + "grad_norm": 1.5235355916412139, + "learning_rate": 6.145381277123373e-07, + "loss": 0.8686, + "step": 158600 + }, + { + "epoch": 12.291061257700802, + "grad_norm": 1.5140555460590501, + "learning_rate": 6.145768753874768e-07, + "loss": 0.8689, + "step": 158610 + }, + { + "epoch": 12.291836181177109, + "grad_norm": 1.5187926329626162, + "learning_rate": 6.146156230626163e-07, + "loss": 0.88, + "step": 158620 + }, + { + "epoch": 12.292611104653416, + "grad_norm": 1.4685995456522924, + "learning_rate": 6.146543707377558e-07, + "loss": 0.8826, + "step": 158630 + }, + { + "epoch": 12.293386028129722, + "grad_norm": 1.56137728161034, + "learning_rate": 6.146931184128953e-07, + "loss": 0.8843, + "step": 158640 + }, + { + "epoch": 12.29416095160603, + "grad_norm": 1.463272217305307, + "learning_rate": 6.147318660880348e-07, + "loss": 0.8707, + "step": 158650 + }, + { + "epoch": 12.294935875082336, + "grad_norm": 1.4853489033634821, + "learning_rate": 6.147706137631743e-07, + "loss": 0.891, + "step": 158660 + }, + { + "epoch": 12.295710798558643, + "grad_norm": 1.5245225735702217, + "learning_rate": 6.148093614383137e-07, + "loss": 0.8901, + "step": 158670 + }, + { + "epoch": 12.29648572203495, + "grad_norm": 1.4133418405115483, + "learning_rate": 6.148481091134532e-07, + "loss": 0.8607, + "step": 158680 + }, + { + "epoch": 12.297260645511257, + "grad_norm": 1.4288130969586887, + "learning_rate": 6.148868567885928e-07, + "loss": 0.8708, + "step": 158690 + }, + { + "epoch": 12.298035568987563, + "grad_norm": 1.5260782178300631, + "learning_rate": 6.149256044637323e-07, + "loss": 0.8803, + "step": 158700 + }, + { + "epoch": 12.298810492463868, + "grad_norm": 1.544956101581232, + "learning_rate": 6.149643521388717e-07, + "loss": 0.8716, + "step": 158710 + }, + { + "epoch": 12.299585415940175, + "grad_norm": 1.4131820790227223, + "learning_rate": 6.150030998140112e-07, + "loss": 0.8648, + "step": 158720 + }, + { + "epoch": 12.300360339416482, + "grad_norm": 1.6152277648813882, + "learning_rate": 6.150418474891507e-07, + "loss": 0.8802, + "step": 158730 + }, + { + "epoch": 12.301135262892789, + "grad_norm": 1.470227989959047, + "learning_rate": 6.150805951642902e-07, + "loss": 0.8673, + "step": 158740 + }, + { + "epoch": 12.301910186369096, + "grad_norm": 1.4689466614388416, + "learning_rate": 6.151193428394297e-07, + "loss": 0.8879, + "step": 158750 + }, + { + "epoch": 12.302685109845402, + "grad_norm": 1.5090769026312862, + "learning_rate": 6.151580905145692e-07, + "loss": 0.9108, + "step": 158760 + }, + { + "epoch": 12.30346003332171, + "grad_norm": 1.4801289831933246, + "learning_rate": 6.151968381897087e-07, + "loss": 0.8728, + "step": 158770 + }, + { + "epoch": 12.304234956798016, + "grad_norm": 1.4783413188060317, + "learning_rate": 6.152355858648481e-07, + "loss": 0.8593, + "step": 158780 + }, + { + "epoch": 12.305009880274323, + "grad_norm": 1.55852121264272, + "learning_rate": 6.152743335399877e-07, + "loss": 0.8875, + "step": 158790 + }, + { + "epoch": 12.30578480375063, + "grad_norm": 1.5194239229728286, + "learning_rate": 6.153130812151272e-07, + "loss": 0.8849, + "step": 158800 + }, + { + "epoch": 12.306559727226936, + "grad_norm": 1.5468728815710824, + "learning_rate": 6.153518288902666e-07, + "loss": 0.8798, + "step": 158810 + }, + { + "epoch": 12.307334650703243, + "grad_norm": 1.5026048711639644, + "learning_rate": 6.153905765654061e-07, + "loss": 0.8632, + "step": 158820 + }, + { + "epoch": 12.30810957417955, + "grad_norm": 1.5058504622523943, + "learning_rate": 6.154293242405456e-07, + "loss": 0.897, + "step": 158830 + }, + { + "epoch": 12.308884497655857, + "grad_norm": 1.5720080409589192, + "learning_rate": 6.154680719156852e-07, + "loss": 0.8682, + "step": 158840 + }, + { + "epoch": 12.309659421132164, + "grad_norm": 1.5166071513157284, + "learning_rate": 6.155068195908246e-07, + "loss": 0.8982, + "step": 158850 + }, + { + "epoch": 12.31043434460847, + "grad_norm": 1.4336990447742866, + "learning_rate": 6.155455672659641e-07, + "loss": 0.8672, + "step": 158860 + }, + { + "epoch": 12.311209268084777, + "grad_norm": 1.4564913725835176, + "learning_rate": 6.155843149411036e-07, + "loss": 0.8796, + "step": 158870 + }, + { + "epoch": 12.311984191561084, + "grad_norm": 1.4964798851739571, + "learning_rate": 6.15623062616243e-07, + "loss": 0.8796, + "step": 158880 + }, + { + "epoch": 12.312759115037391, + "grad_norm": 1.4961717661640155, + "learning_rate": 6.156618102913826e-07, + "loss": 0.8776, + "step": 158890 + }, + { + "epoch": 12.313534038513696, + "grad_norm": 1.4461361190887672, + "learning_rate": 6.157005579665221e-07, + "loss": 0.8784, + "step": 158900 + }, + { + "epoch": 12.314308961990003, + "grad_norm": 1.4929713164115408, + "learning_rate": 6.157393056416616e-07, + "loss": 0.8812, + "step": 158910 + }, + { + "epoch": 12.31508388546631, + "grad_norm": 1.5150032595269447, + "learning_rate": 6.15778053316801e-07, + "loss": 0.8804, + "step": 158920 + }, + { + "epoch": 12.315858808942616, + "grad_norm": 1.635088583388079, + "learning_rate": 6.158168009919405e-07, + "loss": 0.8947, + "step": 158930 + }, + { + "epoch": 12.316633732418923, + "grad_norm": 1.5981898356125208, + "learning_rate": 6.158555486670801e-07, + "loss": 0.884, + "step": 158940 + }, + { + "epoch": 12.31740865589523, + "grad_norm": 1.5999725588359306, + "learning_rate": 6.158942963422195e-07, + "loss": 0.8966, + "step": 158950 + }, + { + "epoch": 12.318183579371537, + "grad_norm": 1.6241898665085734, + "learning_rate": 6.15933044017359e-07, + "loss": 0.8725, + "step": 158960 + }, + { + "epoch": 12.318958502847844, + "grad_norm": 1.512328495248266, + "learning_rate": 6.159717916924985e-07, + "loss": 0.8972, + "step": 158970 + }, + { + "epoch": 12.31973342632415, + "grad_norm": 1.5357698679523977, + "learning_rate": 6.16010539367638e-07, + "loss": 0.8664, + "step": 158980 + }, + { + "epoch": 12.320508349800457, + "grad_norm": 1.4848670846389278, + "learning_rate": 6.160492870427775e-07, + "loss": 0.8802, + "step": 158990 + }, + { + "epoch": 12.321283273276764, + "grad_norm": 1.608245366650411, + "learning_rate": 6.16088034717917e-07, + "loss": 0.872, + "step": 159000 + }, + { + "epoch": 12.321283273276764, + "eval_loss": 0.8985007405281067, + "eval_runtime": 327.676, + "eval_samples_per_second": 35.007, + "eval_steps_per_second": 8.753, + "step": 159000 + }, + { + "epoch": 12.322058196753071, + "grad_norm": 1.4073033022792036, + "learning_rate": 6.161267823930565e-07, + "loss": 0.8608, + "step": 159010 + }, + { + "epoch": 12.322833120229378, + "grad_norm": 1.5802149803026946, + "learning_rate": 6.161655300681959e-07, + "loss": 0.8673, + "step": 159020 + }, + { + "epoch": 12.323608043705685, + "grad_norm": 1.4338890280326373, + "learning_rate": 6.162042777433354e-07, + "loss": 0.8981, + "step": 159030 + }, + { + "epoch": 12.324382967181991, + "grad_norm": 1.553196766345236, + "learning_rate": 6.16243025418475e-07, + "loss": 0.8672, + "step": 159040 + }, + { + "epoch": 12.325157890658298, + "grad_norm": 1.5000022911430224, + "learning_rate": 6.162817730936145e-07, + "loss": 0.8837, + "step": 159050 + }, + { + "epoch": 12.325932814134605, + "grad_norm": 1.4485303165323908, + "learning_rate": 6.163205207687539e-07, + "loss": 0.8787, + "step": 159060 + }, + { + "epoch": 12.326707737610912, + "grad_norm": 1.6101906391927414, + "learning_rate": 6.163592684438934e-07, + "loss": 0.8791, + "step": 159070 + }, + { + "epoch": 12.327482661087217, + "grad_norm": 1.4923073379790641, + "learning_rate": 6.16398016119033e-07, + "loss": 0.8806, + "step": 159080 + }, + { + "epoch": 12.328257584563524, + "grad_norm": 1.4949543451881624, + "learning_rate": 6.164367637941724e-07, + "loss": 0.8763, + "step": 159090 + }, + { + "epoch": 12.32903250803983, + "grad_norm": 1.4709475982937235, + "learning_rate": 6.164755114693119e-07, + "loss": 0.8556, + "step": 159100 + }, + { + "epoch": 12.329807431516137, + "grad_norm": 1.5334677967573558, + "learning_rate": 6.165142591444514e-07, + "loss": 0.8697, + "step": 159110 + }, + { + "epoch": 12.330582354992444, + "grad_norm": 1.5495288434213044, + "learning_rate": 6.165530068195908e-07, + "loss": 0.8739, + "step": 159120 + }, + { + "epoch": 12.33135727846875, + "grad_norm": 1.4616758019424747, + "learning_rate": 6.165917544947303e-07, + "loss": 0.8843, + "step": 159130 + }, + { + "epoch": 12.332132201945058, + "grad_norm": 1.428512561569282, + "learning_rate": 6.166305021698699e-07, + "loss": 0.8728, + "step": 159140 + }, + { + "epoch": 12.332907125421364, + "grad_norm": 1.4609186905609426, + "learning_rate": 6.166692498450094e-07, + "loss": 0.8947, + "step": 159150 + }, + { + "epoch": 12.333682048897671, + "grad_norm": 1.4523931114709596, + "learning_rate": 6.167079975201488e-07, + "loss": 0.8983, + "step": 159160 + }, + { + "epoch": 12.334456972373978, + "grad_norm": 1.4536557533376993, + "learning_rate": 6.167467451952883e-07, + "loss": 0.9034, + "step": 159170 + }, + { + "epoch": 12.335231895850285, + "grad_norm": 1.4746647774754587, + "learning_rate": 6.167854928704279e-07, + "loss": 0.8764, + "step": 159180 + }, + { + "epoch": 12.336006819326592, + "grad_norm": 1.5773571427997706, + "learning_rate": 6.168242405455673e-07, + "loss": 0.8676, + "step": 159190 + }, + { + "epoch": 12.336781742802899, + "grad_norm": 1.4904196781050463, + "learning_rate": 6.168629882207068e-07, + "loss": 0.9037, + "step": 159200 + }, + { + "epoch": 12.337556666279205, + "grad_norm": 1.453137215001349, + "learning_rate": 6.169017358958463e-07, + "loss": 0.8563, + "step": 159210 + }, + { + "epoch": 12.338331589755512, + "grad_norm": 1.5446579164451577, + "learning_rate": 6.169404835709858e-07, + "loss": 0.8789, + "step": 159220 + }, + { + "epoch": 12.339106513231819, + "grad_norm": 1.4476197284441867, + "learning_rate": 6.169792312461252e-07, + "loss": 0.8647, + "step": 159230 + }, + { + "epoch": 12.339881436708126, + "grad_norm": 1.5260537712203668, + "learning_rate": 6.170179789212648e-07, + "loss": 0.8842, + "step": 159240 + }, + { + "epoch": 12.340656360184433, + "grad_norm": 1.5164874255618408, + "learning_rate": 6.170567265964043e-07, + "loss": 0.881, + "step": 159250 + }, + { + "epoch": 12.34143128366074, + "grad_norm": 1.4285463897805293, + "learning_rate": 6.170954742715437e-07, + "loss": 0.8817, + "step": 159260 + }, + { + "epoch": 12.342206207137044, + "grad_norm": 1.4952235564943697, + "learning_rate": 6.171342219466832e-07, + "loss": 0.8627, + "step": 159270 + }, + { + "epoch": 12.342981130613351, + "grad_norm": 1.5995425783560198, + "learning_rate": 6.171729696218228e-07, + "loss": 0.89, + "step": 159280 + }, + { + "epoch": 12.343756054089658, + "grad_norm": 1.3707634641718656, + "learning_rate": 6.172117172969623e-07, + "loss": 0.8867, + "step": 159290 + }, + { + "epoch": 12.344530977565965, + "grad_norm": 1.5144189794579441, + "learning_rate": 6.172504649721017e-07, + "loss": 0.8807, + "step": 159300 + }, + { + "epoch": 12.345305901042272, + "grad_norm": 1.4637039672807692, + "learning_rate": 6.172892126472412e-07, + "loss": 0.8753, + "step": 159310 + }, + { + "epoch": 12.346080824518578, + "grad_norm": 1.493456150266812, + "learning_rate": 6.173279603223807e-07, + "loss": 0.8824, + "step": 159320 + }, + { + "epoch": 12.346855747994885, + "grad_norm": 1.4735232175035429, + "learning_rate": 6.173667079975202e-07, + "loss": 0.8996, + "step": 159330 + }, + { + "epoch": 12.347630671471192, + "grad_norm": 1.49135424806048, + "learning_rate": 6.174054556726597e-07, + "loss": 0.8848, + "step": 159340 + }, + { + "epoch": 12.348405594947499, + "grad_norm": 1.3707558496552665, + "learning_rate": 6.174442033477992e-07, + "loss": 0.8843, + "step": 159350 + }, + { + "epoch": 12.349180518423806, + "grad_norm": 1.546283049154527, + "learning_rate": 6.174829510229387e-07, + "loss": 0.8711, + "step": 159360 + }, + { + "epoch": 12.349955441900113, + "grad_norm": 1.6599875455542545, + "learning_rate": 6.175216986980781e-07, + "loss": 0.8814, + "step": 159370 + }, + { + "epoch": 12.35073036537642, + "grad_norm": 1.4768439303946488, + "learning_rate": 6.175604463732177e-07, + "loss": 0.8667, + "step": 159380 + }, + { + "epoch": 12.351505288852726, + "grad_norm": 1.5191543235704488, + "learning_rate": 6.175991940483572e-07, + "loss": 0.876, + "step": 159390 + }, + { + "epoch": 12.352280212329033, + "grad_norm": 1.370217841179541, + "learning_rate": 6.176379417234966e-07, + "loss": 0.8713, + "step": 159400 + }, + { + "epoch": 12.35305513580534, + "grad_norm": 1.4937160678874248, + "learning_rate": 6.176766893986361e-07, + "loss": 0.8671, + "step": 159410 + }, + { + "epoch": 12.353830059281647, + "grad_norm": 1.493945674706848, + "learning_rate": 6.177154370737756e-07, + "loss": 0.8768, + "step": 159420 + }, + { + "epoch": 12.354604982757953, + "grad_norm": 1.5356097906743038, + "learning_rate": 6.177541847489152e-07, + "loss": 0.8672, + "step": 159430 + }, + { + "epoch": 12.35537990623426, + "grad_norm": 1.470219674455546, + "learning_rate": 6.177929324240546e-07, + "loss": 0.862, + "step": 159440 + }, + { + "epoch": 12.356154829710565, + "grad_norm": 1.5164381688900643, + "learning_rate": 6.178316800991941e-07, + "loss": 0.8897, + "step": 159450 + }, + { + "epoch": 12.356929753186872, + "grad_norm": 1.4737104657188023, + "learning_rate": 6.178704277743336e-07, + "loss": 0.8876, + "step": 159460 + }, + { + "epoch": 12.357704676663179, + "grad_norm": 1.5652901235586234, + "learning_rate": 6.17909175449473e-07, + "loss": 0.8899, + "step": 159470 + }, + { + "epoch": 12.358479600139486, + "grad_norm": 1.530603946544841, + "learning_rate": 6.179479231246126e-07, + "loss": 0.8793, + "step": 159480 + }, + { + "epoch": 12.359254523615792, + "grad_norm": 1.5357676101510505, + "learning_rate": 6.179866707997521e-07, + "loss": 0.8871, + "step": 159490 + }, + { + "epoch": 12.3600294470921, + "grad_norm": 1.498452945378699, + "learning_rate": 6.180254184748916e-07, + "loss": 0.8688, + "step": 159500 + }, + { + "epoch": 12.3600294470921, + "eval_loss": 0.8983279466629028, + "eval_runtime": 329.863, + "eval_samples_per_second": 34.775, + "eval_steps_per_second": 8.695, + "step": 159500 + }, + { + "epoch": 12.360804370568406, + "grad_norm": 1.5903295633663703, + "learning_rate": 6.18064166150031e-07, + "loss": 0.8917, + "step": 159510 + }, + { + "epoch": 12.361579294044713, + "grad_norm": 1.5031310387298882, + "learning_rate": 6.181029138251705e-07, + "loss": 0.8837, + "step": 159520 + }, + { + "epoch": 12.36235421752102, + "grad_norm": 1.485064962210944, + "learning_rate": 6.181416615003101e-07, + "loss": 0.878, + "step": 159530 + }, + { + "epoch": 12.363129140997327, + "grad_norm": 1.4729036540000295, + "learning_rate": 6.181804091754495e-07, + "loss": 0.8823, + "step": 159540 + }, + { + "epoch": 12.363904064473633, + "grad_norm": 1.5015267850714784, + "learning_rate": 6.18219156850589e-07, + "loss": 0.8714, + "step": 159550 + }, + { + "epoch": 12.36467898794994, + "grad_norm": 1.412395388895869, + "learning_rate": 6.182579045257285e-07, + "loss": 0.8729, + "step": 159560 + }, + { + "epoch": 12.365453911426247, + "grad_norm": 1.4993831907655804, + "learning_rate": 6.18296652200868e-07, + "loss": 0.9061, + "step": 159570 + }, + { + "epoch": 12.366228834902554, + "grad_norm": 1.3860887304950305, + "learning_rate": 6.183353998760075e-07, + "loss": 0.8686, + "step": 159580 + }, + { + "epoch": 12.36700375837886, + "grad_norm": 1.5218057259423252, + "learning_rate": 6.18374147551147e-07, + "loss": 0.8795, + "step": 159590 + }, + { + "epoch": 12.367778681855167, + "grad_norm": 1.5135066839231397, + "learning_rate": 6.184128952262865e-07, + "loss": 0.8899, + "step": 159600 + }, + { + "epoch": 12.368553605331474, + "grad_norm": 1.5216482705999137, + "learning_rate": 6.184516429014259e-07, + "loss": 0.8822, + "step": 159610 + }, + { + "epoch": 12.369328528807781, + "grad_norm": 1.5406024365630673, + "learning_rate": 6.184903905765654e-07, + "loss": 0.8941, + "step": 159620 + }, + { + "epoch": 12.370103452284088, + "grad_norm": 1.4109583291223666, + "learning_rate": 6.18529138251705e-07, + "loss": 0.8831, + "step": 159630 + }, + { + "epoch": 12.370878375760393, + "grad_norm": 1.4462030534314776, + "learning_rate": 6.185678859268445e-07, + "loss": 0.8794, + "step": 159640 + }, + { + "epoch": 12.3716532992367, + "grad_norm": 1.3445305780861043, + "learning_rate": 6.186066336019839e-07, + "loss": 0.8639, + "step": 159650 + }, + { + "epoch": 12.372428222713006, + "grad_norm": 1.5573556182257897, + "learning_rate": 6.186453812771234e-07, + "loss": 0.9244, + "step": 159660 + }, + { + "epoch": 12.373203146189313, + "grad_norm": 1.5166885634627423, + "learning_rate": 6.186841289522629e-07, + "loss": 0.8806, + "step": 159670 + }, + { + "epoch": 12.37397806966562, + "grad_norm": 1.4925960048654563, + "learning_rate": 6.187228766274024e-07, + "loss": 0.8825, + "step": 159680 + }, + { + "epoch": 12.374752993141927, + "grad_norm": 1.4896724079000174, + "learning_rate": 6.187616243025419e-07, + "loss": 0.888, + "step": 159690 + }, + { + "epoch": 12.375527916618234, + "grad_norm": 1.625156366468945, + "learning_rate": 6.188003719776814e-07, + "loss": 0.8759, + "step": 159700 + }, + { + "epoch": 12.37630284009454, + "grad_norm": 1.4709533115047964, + "learning_rate": 6.188391196528209e-07, + "loss": 0.8801, + "step": 159710 + }, + { + "epoch": 12.377077763570847, + "grad_norm": 1.4592191781192771, + "learning_rate": 6.188778673279603e-07, + "loss": 0.8573, + "step": 159720 + }, + { + "epoch": 12.377852687047154, + "grad_norm": 1.4792716549959588, + "learning_rate": 6.189166150030999e-07, + "loss": 0.9108, + "step": 159730 + }, + { + "epoch": 12.378627610523461, + "grad_norm": 1.4820841635401205, + "learning_rate": 6.189553626782394e-07, + "loss": 0.8572, + "step": 159740 + }, + { + "epoch": 12.379402533999768, + "grad_norm": 1.5316428923548762, + "learning_rate": 6.189941103533788e-07, + "loss": 0.8739, + "step": 159750 + }, + { + "epoch": 12.380177457476075, + "grad_norm": 1.5512771053661003, + "learning_rate": 6.190328580285183e-07, + "loss": 0.8751, + "step": 159760 + }, + { + "epoch": 12.380952380952381, + "grad_norm": 1.47646666075895, + "learning_rate": 6.190716057036578e-07, + "loss": 0.8778, + "step": 159770 + }, + { + "epoch": 12.381727304428688, + "grad_norm": 1.506156651356667, + "learning_rate": 6.191103533787974e-07, + "loss": 0.8838, + "step": 159780 + }, + { + "epoch": 12.382502227904995, + "grad_norm": 1.4602467685441198, + "learning_rate": 6.191491010539368e-07, + "loss": 0.8628, + "step": 159790 + }, + { + "epoch": 12.383277151381302, + "grad_norm": 1.5449756062768083, + "learning_rate": 6.191878487290763e-07, + "loss": 0.8649, + "step": 159800 + }, + { + "epoch": 12.384052074857609, + "grad_norm": 1.503610160348841, + "learning_rate": 6.192265964042158e-07, + "loss": 0.8697, + "step": 159810 + }, + { + "epoch": 12.384826998333914, + "grad_norm": 1.5133296260440603, + "learning_rate": 6.192653440793552e-07, + "loss": 0.8711, + "step": 159820 + }, + { + "epoch": 12.38560192181022, + "grad_norm": 1.4929290172754561, + "learning_rate": 6.193040917544948e-07, + "loss": 0.9005, + "step": 159830 + }, + { + "epoch": 12.386376845286527, + "grad_norm": 1.4741242844317002, + "learning_rate": 6.193428394296343e-07, + "loss": 0.8941, + "step": 159840 + }, + { + "epoch": 12.387151768762834, + "grad_norm": 1.4945544568393019, + "learning_rate": 6.193815871047738e-07, + "loss": 0.8882, + "step": 159850 + }, + { + "epoch": 12.387926692239141, + "grad_norm": 1.446997653492123, + "learning_rate": 6.194203347799132e-07, + "loss": 0.8872, + "step": 159860 + }, + { + "epoch": 12.388701615715448, + "grad_norm": 1.5093096034288345, + "learning_rate": 6.194590824550527e-07, + "loss": 0.8686, + "step": 159870 + }, + { + "epoch": 12.389476539191755, + "grad_norm": 1.621693441913774, + "learning_rate": 6.194978301301923e-07, + "loss": 0.8797, + "step": 159880 + }, + { + "epoch": 12.390251462668061, + "grad_norm": 1.4947600623753705, + "learning_rate": 6.195365778053317e-07, + "loss": 0.8803, + "step": 159890 + }, + { + "epoch": 12.391026386144368, + "grad_norm": 1.4911971308128906, + "learning_rate": 6.195753254804712e-07, + "loss": 0.8669, + "step": 159900 + }, + { + "epoch": 12.391801309620675, + "grad_norm": 1.6509428228699023, + "learning_rate": 6.196140731556107e-07, + "loss": 0.8856, + "step": 159910 + }, + { + "epoch": 12.392576233096982, + "grad_norm": 1.524688873990521, + "learning_rate": 6.196528208307503e-07, + "loss": 0.8861, + "step": 159920 + }, + { + "epoch": 12.393351156573289, + "grad_norm": 1.4320313186005744, + "learning_rate": 6.196915685058897e-07, + "loss": 0.896, + "step": 159930 + }, + { + "epoch": 12.394126080049595, + "grad_norm": 1.5002194158873037, + "learning_rate": 6.197303161810292e-07, + "loss": 0.8643, + "step": 159940 + }, + { + "epoch": 12.394901003525902, + "grad_norm": 1.4988922960976234, + "learning_rate": 6.197690638561687e-07, + "loss": 0.8651, + "step": 159950 + }, + { + "epoch": 12.395675927002209, + "grad_norm": 1.4680306677557127, + "learning_rate": 6.198078115313081e-07, + "loss": 0.8819, + "step": 159960 + }, + { + "epoch": 12.396450850478516, + "grad_norm": 1.5507881086701663, + "learning_rate": 6.198465592064477e-07, + "loss": 0.8718, + "step": 159970 + }, + { + "epoch": 12.397225773954823, + "grad_norm": 1.4998794741351278, + "learning_rate": 6.198853068815872e-07, + "loss": 0.8797, + "step": 159980 + }, + { + "epoch": 12.39800069743113, + "grad_norm": 1.4383376602990994, + "learning_rate": 6.199240545567267e-07, + "loss": 0.8694, + "step": 159990 + }, + { + "epoch": 12.398775620907436, + "grad_norm": 1.5268280233306601, + "learning_rate": 6.199628022318661e-07, + "loss": 0.8842, + "step": 160000 + }, + { + "epoch": 12.398775620907436, + "eval_loss": 0.8979268670082092, + "eval_runtime": 332.0985, + "eval_samples_per_second": 34.541, + "eval_steps_per_second": 8.636, + "step": 160000 + }, + { + "epoch": 12.399550544383741, + "grad_norm": 1.4756715180662199, + "learning_rate": 6.200015499070056e-07, + "loss": 0.8993, + "step": 160010 + }, + { + "epoch": 12.400325467860048, + "grad_norm": 1.5017293063894657, + "learning_rate": 6.200402975821452e-07, + "loss": 0.8753, + "step": 160020 + }, + { + "epoch": 12.401100391336355, + "grad_norm": 1.5108729745497131, + "learning_rate": 6.200790452572846e-07, + "loss": 0.859, + "step": 160030 + }, + { + "epoch": 12.401875314812662, + "grad_norm": 1.5475666494309803, + "learning_rate": 6.201177929324241e-07, + "loss": 0.889, + "step": 160040 + }, + { + "epoch": 12.402650238288969, + "grad_norm": 1.482979680499465, + "learning_rate": 6.201565406075636e-07, + "loss": 0.8812, + "step": 160050 + }, + { + "epoch": 12.403425161765275, + "grad_norm": 1.498124288344655, + "learning_rate": 6.201952882827031e-07, + "loss": 0.8795, + "step": 160060 + }, + { + "epoch": 12.404200085241582, + "grad_norm": 1.4254247485615492, + "learning_rate": 6.202340359578426e-07, + "loss": 0.8694, + "step": 160070 + }, + { + "epoch": 12.404975008717889, + "grad_norm": 1.4796263090247197, + "learning_rate": 6.202727836329821e-07, + "loss": 0.8672, + "step": 160080 + }, + { + "epoch": 12.405749932194196, + "grad_norm": 1.518107275096626, + "learning_rate": 6.203115313081216e-07, + "loss": 0.891, + "step": 160090 + }, + { + "epoch": 12.406524855670503, + "grad_norm": 1.4659056622421842, + "learning_rate": 6.20350278983261e-07, + "loss": 0.8921, + "step": 160100 + }, + { + "epoch": 12.40729977914681, + "grad_norm": 1.5461612345529823, + "learning_rate": 6.203890266584005e-07, + "loss": 0.8599, + "step": 160110 + }, + { + "epoch": 12.408074702623116, + "grad_norm": 1.5393604858284657, + "learning_rate": 6.204277743335401e-07, + "loss": 0.869, + "step": 160120 + }, + { + "epoch": 12.408849626099423, + "grad_norm": 1.4213192571547622, + "learning_rate": 6.204665220086796e-07, + "loss": 0.8831, + "step": 160130 + }, + { + "epoch": 12.40962454957573, + "grad_norm": 1.4924655592284264, + "learning_rate": 6.20505269683819e-07, + "loss": 0.8725, + "step": 160140 + }, + { + "epoch": 12.410399473052037, + "grad_norm": 1.4564052865301236, + "learning_rate": 6.205440173589585e-07, + "loss": 0.8796, + "step": 160150 + }, + { + "epoch": 12.411174396528343, + "grad_norm": 1.4707729572792612, + "learning_rate": 6.20582765034098e-07, + "loss": 0.8777, + "step": 160160 + }, + { + "epoch": 12.41194932000465, + "grad_norm": 1.5004990973533157, + "learning_rate": 6.206215127092375e-07, + "loss": 0.8791, + "step": 160170 + }, + { + "epoch": 12.412724243480957, + "grad_norm": 1.5649479077618544, + "learning_rate": 6.20660260384377e-07, + "loss": 0.8853, + "step": 160180 + }, + { + "epoch": 12.413499166957262, + "grad_norm": 1.4848643361142269, + "learning_rate": 6.206990080595165e-07, + "loss": 0.8712, + "step": 160190 + }, + { + "epoch": 12.414274090433569, + "grad_norm": 1.6682193412860782, + "learning_rate": 6.20737755734656e-07, + "loss": 0.8959, + "step": 160200 + }, + { + "epoch": 12.415049013909876, + "grad_norm": 1.4458283632480704, + "learning_rate": 6.207765034097954e-07, + "loss": 0.862, + "step": 160210 + }, + { + "epoch": 12.415823937386183, + "grad_norm": 1.554004656048154, + "learning_rate": 6.20815251084935e-07, + "loss": 0.8704, + "step": 160220 + }, + { + "epoch": 12.41659886086249, + "grad_norm": 1.4663554223264552, + "learning_rate": 6.208539987600745e-07, + "loss": 0.8767, + "step": 160230 + }, + { + "epoch": 12.417373784338796, + "grad_norm": 1.4833924737651576, + "learning_rate": 6.208927464352139e-07, + "loss": 0.882, + "step": 160240 + }, + { + "epoch": 12.418148707815103, + "grad_norm": 1.4493479976314825, + "learning_rate": 6.209314941103534e-07, + "loss": 0.8755, + "step": 160250 + }, + { + "epoch": 12.41892363129141, + "grad_norm": 1.6377131216234981, + "learning_rate": 6.209702417854929e-07, + "loss": 0.8792, + "step": 160260 + }, + { + "epoch": 12.419698554767717, + "grad_norm": 1.5948082025599115, + "learning_rate": 6.210089894606325e-07, + "loss": 0.8863, + "step": 160270 + }, + { + "epoch": 12.420473478244023, + "grad_norm": 1.46922602657583, + "learning_rate": 6.210477371357719e-07, + "loss": 0.8962, + "step": 160280 + }, + { + "epoch": 12.42124840172033, + "grad_norm": 1.4064292791513655, + "learning_rate": 6.210864848109114e-07, + "loss": 0.8742, + "step": 160290 + }, + { + "epoch": 12.422023325196637, + "grad_norm": 1.580615943448784, + "learning_rate": 6.211252324860509e-07, + "loss": 0.8841, + "step": 160300 + }, + { + "epoch": 12.422798248672944, + "grad_norm": 1.4582282290529716, + "learning_rate": 6.211639801611903e-07, + "loss": 0.8695, + "step": 160310 + }, + { + "epoch": 12.42357317214925, + "grad_norm": 1.722879586016779, + "learning_rate": 6.212027278363299e-07, + "loss": 0.8939, + "step": 160320 + }, + { + "epoch": 12.424348095625557, + "grad_norm": 1.5522788382287658, + "learning_rate": 6.212414755114694e-07, + "loss": 0.8798, + "step": 160330 + }, + { + "epoch": 12.425123019101864, + "grad_norm": 1.5039014859517947, + "learning_rate": 6.212802231866089e-07, + "loss": 0.8863, + "step": 160340 + }, + { + "epoch": 12.425897942578171, + "grad_norm": 1.4968620031451172, + "learning_rate": 6.213189708617483e-07, + "loss": 0.8681, + "step": 160350 + }, + { + "epoch": 12.426672866054478, + "grad_norm": 1.5857366161707676, + "learning_rate": 6.213577185368878e-07, + "loss": 0.8723, + "step": 160360 + }, + { + "epoch": 12.427447789530785, + "grad_norm": 1.4613286098484424, + "learning_rate": 6.213964662120274e-07, + "loss": 0.8951, + "step": 160370 + }, + { + "epoch": 12.42822271300709, + "grad_norm": 1.5696721827005926, + "learning_rate": 6.214352138871668e-07, + "loss": 0.8839, + "step": 160380 + }, + { + "epoch": 12.428997636483397, + "grad_norm": 1.5002785737924267, + "learning_rate": 6.214739615623063e-07, + "loss": 0.8825, + "step": 160390 + }, + { + "epoch": 12.429772559959703, + "grad_norm": 1.5345419702290006, + "learning_rate": 6.215127092374458e-07, + "loss": 0.8861, + "step": 160400 + }, + { + "epoch": 12.43054748343601, + "grad_norm": 1.571262218469981, + "learning_rate": 6.215514569125853e-07, + "loss": 0.9055, + "step": 160410 + }, + { + "epoch": 12.431322406912317, + "grad_norm": 1.4994845748113654, + "learning_rate": 6.215902045877248e-07, + "loss": 0.8925, + "step": 160420 + }, + { + "epoch": 12.432097330388624, + "grad_norm": 1.4818461891795993, + "learning_rate": 6.216289522628643e-07, + "loss": 0.8879, + "step": 160430 + }, + { + "epoch": 12.43287225386493, + "grad_norm": 1.455276065806934, + "learning_rate": 6.216676999380038e-07, + "loss": 0.8767, + "step": 160440 + }, + { + "epoch": 12.433647177341237, + "grad_norm": 1.6655570120364782, + "learning_rate": 6.217064476131432e-07, + "loss": 0.8689, + "step": 160450 + }, + { + "epoch": 12.434422100817544, + "grad_norm": 1.478292804507173, + "learning_rate": 6.217451952882827e-07, + "loss": 0.8746, + "step": 160460 + }, + { + "epoch": 12.435197024293851, + "grad_norm": 1.4100172911644928, + "learning_rate": 6.217839429634223e-07, + "loss": 0.8773, + "step": 160470 + }, + { + "epoch": 12.435971947770158, + "grad_norm": 1.5175350372485668, + "learning_rate": 6.218226906385618e-07, + "loss": 0.881, + "step": 160480 + }, + { + "epoch": 12.436746871246465, + "grad_norm": 1.57237791846782, + "learning_rate": 6.218614383137012e-07, + "loss": 0.8741, + "step": 160490 + }, + { + "epoch": 12.437521794722771, + "grad_norm": 1.5476506175411908, + "learning_rate": 6.219001859888407e-07, + "loss": 0.8713, + "step": 160500 + }, + { + "epoch": 12.437521794722771, + "eval_loss": 0.8979949951171875, + "eval_runtime": 330.9875, + "eval_samples_per_second": 34.657, + "eval_steps_per_second": 8.665, + "step": 160500 + }, + { + "epoch": 12.438296718199078, + "grad_norm": 1.436263366141214, + "learning_rate": 6.219389336639802e-07, + "loss": 0.8772, + "step": 160510 + }, + { + "epoch": 12.439071641675385, + "grad_norm": 1.42013842650997, + "learning_rate": 6.219776813391197e-07, + "loss": 0.867, + "step": 160520 + }, + { + "epoch": 12.439846565151692, + "grad_norm": 1.552323668937335, + "learning_rate": 6.220164290142592e-07, + "loss": 0.8789, + "step": 160530 + }, + { + "epoch": 12.440621488627999, + "grad_norm": 1.5332699854406282, + "learning_rate": 6.220551766893987e-07, + "loss": 0.883, + "step": 160540 + }, + { + "epoch": 12.441396412104305, + "grad_norm": 1.509842996667892, + "learning_rate": 6.220939243645382e-07, + "loss": 0.8696, + "step": 160550 + }, + { + "epoch": 12.44217133558061, + "grad_norm": 1.511575217612724, + "learning_rate": 6.221326720396776e-07, + "loss": 0.8671, + "step": 160560 + }, + { + "epoch": 12.442946259056917, + "grad_norm": 1.502523651797714, + "learning_rate": 6.221714197148172e-07, + "loss": 0.9002, + "step": 160570 + }, + { + "epoch": 12.443721182533224, + "grad_norm": 1.5240312737047719, + "learning_rate": 6.222101673899567e-07, + "loss": 0.8783, + "step": 160580 + }, + { + "epoch": 12.444496106009531, + "grad_norm": 1.571384857284159, + "learning_rate": 6.222489150650961e-07, + "loss": 0.886, + "step": 160590 + }, + { + "epoch": 12.445271029485838, + "grad_norm": 1.4460511469710868, + "learning_rate": 6.222876627402356e-07, + "loss": 0.8846, + "step": 160600 + }, + { + "epoch": 12.446045952962145, + "grad_norm": 1.4388460092590476, + "learning_rate": 6.223264104153752e-07, + "loss": 0.8765, + "step": 160610 + }, + { + "epoch": 12.446820876438451, + "grad_norm": 1.4935593363274007, + "learning_rate": 6.223651580905146e-07, + "loss": 0.8883, + "step": 160620 + }, + { + "epoch": 12.447595799914758, + "grad_norm": 1.5087776424246504, + "learning_rate": 6.224039057656541e-07, + "loss": 0.8841, + "step": 160630 + }, + { + "epoch": 12.448370723391065, + "grad_norm": 1.519979789141843, + "learning_rate": 6.224426534407936e-07, + "loss": 0.872, + "step": 160640 + }, + { + "epoch": 12.449145646867372, + "grad_norm": 1.4314627508023139, + "learning_rate": 6.224814011159331e-07, + "loss": 0.8694, + "step": 160650 + }, + { + "epoch": 12.449920570343679, + "grad_norm": 1.6086596185509427, + "learning_rate": 6.225201487910726e-07, + "loss": 0.8949, + "step": 160660 + }, + { + "epoch": 12.450695493819985, + "grad_norm": 1.5337690685384362, + "learning_rate": 6.225588964662121e-07, + "loss": 0.8629, + "step": 160670 + }, + { + "epoch": 12.451470417296292, + "grad_norm": 1.419956832277618, + "learning_rate": 6.225976441413516e-07, + "loss": 0.8744, + "step": 160680 + }, + { + "epoch": 12.452245340772599, + "grad_norm": 1.4776456331315073, + "learning_rate": 6.22636391816491e-07, + "loss": 0.8834, + "step": 160690 + }, + { + "epoch": 12.453020264248906, + "grad_norm": 1.384954606834045, + "learning_rate": 6.226751394916305e-07, + "loss": 0.8796, + "step": 160700 + }, + { + "epoch": 12.453795187725213, + "grad_norm": 1.5156780113842014, + "learning_rate": 6.227138871667701e-07, + "loss": 0.882, + "step": 160710 + }, + { + "epoch": 12.45457011120152, + "grad_norm": 1.4396165079539607, + "learning_rate": 6.227526348419096e-07, + "loss": 0.8851, + "step": 160720 + }, + { + "epoch": 12.455345034677826, + "grad_norm": 1.4627331575439162, + "learning_rate": 6.22791382517049e-07, + "loss": 0.8846, + "step": 160730 + }, + { + "epoch": 12.456119958154133, + "grad_norm": 1.5398253712393877, + "learning_rate": 6.228301301921885e-07, + "loss": 0.8822, + "step": 160740 + }, + { + "epoch": 12.45689488163044, + "grad_norm": 1.540607387752531, + "learning_rate": 6.22868877867328e-07, + "loss": 0.8828, + "step": 160750 + }, + { + "epoch": 12.457669805106745, + "grad_norm": 1.4921946269658475, + "learning_rate": 6.229076255424675e-07, + "loss": 0.8834, + "step": 160760 + }, + { + "epoch": 12.458444728583052, + "grad_norm": 1.46271531258078, + "learning_rate": 6.22946373217607e-07, + "loss": 0.8944, + "step": 160770 + }, + { + "epoch": 12.459219652059359, + "grad_norm": 1.4232457773212803, + "learning_rate": 6.229851208927465e-07, + "loss": 0.8703, + "step": 160780 + }, + { + "epoch": 12.459994575535665, + "grad_norm": 1.5628146727592636, + "learning_rate": 6.23023868567886e-07, + "loss": 0.8939, + "step": 160790 + }, + { + "epoch": 12.460769499011972, + "grad_norm": 1.4540738034224825, + "learning_rate": 6.230626162430254e-07, + "loss": 0.8677, + "step": 160800 + }, + { + "epoch": 12.461544422488279, + "grad_norm": 1.6705795227272473, + "learning_rate": 6.23101363918165e-07, + "loss": 0.8853, + "step": 160810 + }, + { + "epoch": 12.462319345964586, + "grad_norm": 1.5174432088017435, + "learning_rate": 6.231401115933045e-07, + "loss": 0.879, + "step": 160820 + }, + { + "epoch": 12.463094269440893, + "grad_norm": 1.4759923669574948, + "learning_rate": 6.231788592684439e-07, + "loss": 0.879, + "step": 160830 + }, + { + "epoch": 12.4638691929172, + "grad_norm": 1.5128960282104185, + "learning_rate": 6.232176069435834e-07, + "loss": 0.8708, + "step": 160840 + }, + { + "epoch": 12.464644116393506, + "grad_norm": 1.4614077772827203, + "learning_rate": 6.232563546187229e-07, + "loss": 0.8675, + "step": 160850 + }, + { + "epoch": 12.465419039869813, + "grad_norm": 1.5171480073897083, + "learning_rate": 6.232951022938625e-07, + "loss": 0.8746, + "step": 160860 + }, + { + "epoch": 12.46619396334612, + "grad_norm": 1.5227295722414365, + "learning_rate": 6.233338499690019e-07, + "loss": 0.879, + "step": 160870 + }, + { + "epoch": 12.466968886822427, + "grad_norm": 1.5663597224335546, + "learning_rate": 6.233725976441414e-07, + "loss": 0.8724, + "step": 160880 + }, + { + "epoch": 12.467743810298733, + "grad_norm": 1.4111732849450307, + "learning_rate": 6.234113453192809e-07, + "loss": 0.8819, + "step": 160890 + }, + { + "epoch": 12.46851873377504, + "grad_norm": 1.5038877229195473, + "learning_rate": 6.234500929944203e-07, + "loss": 0.8624, + "step": 160900 + }, + { + "epoch": 12.469293657251347, + "grad_norm": 1.528207932494287, + "learning_rate": 6.234888406695599e-07, + "loss": 0.8794, + "step": 160910 + }, + { + "epoch": 12.470068580727654, + "grad_norm": 1.5591046628932912, + "learning_rate": 6.235275883446994e-07, + "loss": 0.8825, + "step": 160920 + }, + { + "epoch": 12.47084350420396, + "grad_norm": 1.4597543265196649, + "learning_rate": 6.235663360198389e-07, + "loss": 0.8584, + "step": 160930 + }, + { + "epoch": 12.471618427680266, + "grad_norm": 1.4399613715102326, + "learning_rate": 6.236050836949783e-07, + "loss": 0.8774, + "step": 160940 + }, + { + "epoch": 12.472393351156573, + "grad_norm": 1.4847816147551334, + "learning_rate": 6.236438313701178e-07, + "loss": 0.8621, + "step": 160950 + }, + { + "epoch": 12.47316827463288, + "grad_norm": 1.498728923592409, + "learning_rate": 6.236825790452574e-07, + "loss": 0.8791, + "step": 160960 + }, + { + "epoch": 12.473943198109186, + "grad_norm": 1.5073923841663925, + "learning_rate": 6.237213267203968e-07, + "loss": 0.8726, + "step": 160970 + }, + { + "epoch": 12.474718121585493, + "grad_norm": 1.5381074516038598, + "learning_rate": 6.237600743955363e-07, + "loss": 0.8747, + "step": 160980 + }, + { + "epoch": 12.4754930450618, + "grad_norm": 1.418146082396861, + "learning_rate": 6.237988220706758e-07, + "loss": 0.8811, + "step": 160990 + }, + { + "epoch": 12.476267968538107, + "grad_norm": 1.4920156896893328, + "learning_rate": 6.238375697458153e-07, + "loss": 0.8767, + "step": 161000 + }, + { + "epoch": 12.476267968538107, + "eval_loss": 0.8979936242103577, + "eval_runtime": 330.696, + "eval_samples_per_second": 34.687, + "eval_steps_per_second": 8.673, + "step": 161000 + }, + { + "epoch": 12.477042892014413, + "grad_norm": 1.4528003433509409, + "learning_rate": 6.238763174209548e-07, + "loss": 0.8812, + "step": 161010 + }, + { + "epoch": 12.47781781549072, + "grad_norm": 1.5376266727296992, + "learning_rate": 6.239150650960943e-07, + "loss": 0.8853, + "step": 161020 + }, + { + "epoch": 12.478592738967027, + "grad_norm": 1.556508252171039, + "learning_rate": 6.239538127712338e-07, + "loss": 0.8878, + "step": 161030 + }, + { + "epoch": 12.479367662443334, + "grad_norm": 1.4970290248462919, + "learning_rate": 6.239925604463732e-07, + "loss": 0.879, + "step": 161040 + }, + { + "epoch": 12.48014258591964, + "grad_norm": 1.4466432441574162, + "learning_rate": 6.240313081215127e-07, + "loss": 0.8855, + "step": 161050 + }, + { + "epoch": 12.480917509395947, + "grad_norm": 1.4368150955378756, + "learning_rate": 6.240700557966523e-07, + "loss": 0.8765, + "step": 161060 + }, + { + "epoch": 12.481692432872254, + "grad_norm": 1.543710361322357, + "learning_rate": 6.241088034717918e-07, + "loss": 0.8674, + "step": 161070 + }, + { + "epoch": 12.482467356348561, + "grad_norm": 1.4784377087530374, + "learning_rate": 6.241475511469312e-07, + "loss": 0.8902, + "step": 161080 + }, + { + "epoch": 12.483242279824868, + "grad_norm": 1.4646953848815616, + "learning_rate": 6.241862988220707e-07, + "loss": 0.8749, + "step": 161090 + }, + { + "epoch": 12.484017203301175, + "grad_norm": 1.4668629267867317, + "learning_rate": 6.242250464972102e-07, + "loss": 0.8898, + "step": 161100 + }, + { + "epoch": 12.484792126777482, + "grad_norm": 1.520840298145348, + "learning_rate": 6.242637941723497e-07, + "loss": 0.8787, + "step": 161110 + }, + { + "epoch": 12.485567050253788, + "grad_norm": 1.4684875830646351, + "learning_rate": 6.243025418474892e-07, + "loss": 0.8769, + "step": 161120 + }, + { + "epoch": 12.486341973730093, + "grad_norm": 1.4865811409890732, + "learning_rate": 6.243412895226287e-07, + "loss": 0.8938, + "step": 161130 + }, + { + "epoch": 12.4871168972064, + "grad_norm": 1.471381372706701, + "learning_rate": 6.243800371977682e-07, + "loss": 0.876, + "step": 161140 + }, + { + "epoch": 12.487891820682707, + "grad_norm": 1.473367828335626, + "learning_rate": 6.244187848729076e-07, + "loss": 0.8947, + "step": 161150 + }, + { + "epoch": 12.488666744159014, + "grad_norm": 1.544028291317053, + "learning_rate": 6.244575325480472e-07, + "loss": 0.8719, + "step": 161160 + }, + { + "epoch": 12.48944166763532, + "grad_norm": 1.5262438529954565, + "learning_rate": 6.244962802231867e-07, + "loss": 0.8892, + "step": 161170 + }, + { + "epoch": 12.490216591111627, + "grad_norm": 1.5655980974190986, + "learning_rate": 6.245350278983261e-07, + "loss": 0.8581, + "step": 161180 + }, + { + "epoch": 12.490991514587934, + "grad_norm": 1.5301195137545256, + "learning_rate": 6.245737755734656e-07, + "loss": 0.8753, + "step": 161190 + }, + { + "epoch": 12.491766438064241, + "grad_norm": 1.4589055554131638, + "learning_rate": 6.246125232486051e-07, + "loss": 0.8894, + "step": 161200 + }, + { + "epoch": 12.492541361540548, + "grad_norm": 1.4476435316544485, + "learning_rate": 6.246512709237447e-07, + "loss": 0.8775, + "step": 161210 + }, + { + "epoch": 12.493316285016855, + "grad_norm": 1.4890782843839987, + "learning_rate": 6.246900185988841e-07, + "loss": 0.8719, + "step": 161220 + }, + { + "epoch": 12.494091208493161, + "grad_norm": 1.48902426622687, + "learning_rate": 6.247287662740236e-07, + "loss": 0.8632, + "step": 161230 + }, + { + "epoch": 12.494866131969468, + "grad_norm": 1.545992345976641, + "learning_rate": 6.247675139491631e-07, + "loss": 0.8912, + "step": 161240 + }, + { + "epoch": 12.495641055445775, + "grad_norm": 1.4574738444115447, + "learning_rate": 6.248062616243025e-07, + "loss": 0.8887, + "step": 161250 + }, + { + "epoch": 12.496415978922082, + "grad_norm": 1.46236354816465, + "learning_rate": 6.248450092994421e-07, + "loss": 0.8905, + "step": 161260 + }, + { + "epoch": 12.497190902398389, + "grad_norm": 1.507159830532298, + "learning_rate": 6.248837569745816e-07, + "loss": 0.8795, + "step": 161270 + }, + { + "epoch": 12.497965825874696, + "grad_norm": 1.5173310649066576, + "learning_rate": 6.249225046497211e-07, + "loss": 0.8713, + "step": 161280 + }, + { + "epoch": 12.498740749351002, + "grad_norm": 1.5238342407400922, + "learning_rate": 6.249612523248605e-07, + "loss": 0.9011, + "step": 161290 + }, + { + "epoch": 12.49951567282731, + "grad_norm": 1.533304687686947, + "learning_rate": 6.25e-07, + "loss": 0.8723, + "step": 161300 + }, + { + "epoch": 12.500290596303614, + "grad_norm": 1.5103485420210725, + "learning_rate": 6.250387476751396e-07, + "loss": 0.8862, + "step": 161310 + }, + { + "epoch": 12.501065519779921, + "grad_norm": 1.56102831428568, + "learning_rate": 6.25077495350279e-07, + "loss": 0.8485, + "step": 161320 + }, + { + "epoch": 12.501840443256228, + "grad_norm": 1.467789983629221, + "learning_rate": 6.251162430254186e-07, + "loss": 0.8902, + "step": 161330 + }, + { + "epoch": 12.502615366732535, + "grad_norm": 1.5515961359676111, + "learning_rate": 6.25154990700558e-07, + "loss": 0.8671, + "step": 161340 + }, + { + "epoch": 12.503390290208841, + "grad_norm": 1.4229416695884993, + "learning_rate": 6.251937383756976e-07, + "loss": 0.8786, + "step": 161350 + }, + { + "epoch": 12.504165213685148, + "grad_norm": 1.5657662611612564, + "learning_rate": 6.25232486050837e-07, + "loss": 0.8866, + "step": 161360 + }, + { + "epoch": 12.504940137161455, + "grad_norm": 1.4272113707149685, + "learning_rate": 6.252712337259766e-07, + "loss": 0.8628, + "step": 161370 + }, + { + "epoch": 12.505715060637762, + "grad_norm": 1.641845159400079, + "learning_rate": 6.25309981401116e-07, + "loss": 0.8866, + "step": 161380 + }, + { + "epoch": 12.506489984114069, + "grad_norm": 1.751206484429111, + "learning_rate": 6.253487290762555e-07, + "loss": 0.8952, + "step": 161390 + }, + { + "epoch": 12.507264907590375, + "grad_norm": 1.4964629546689716, + "learning_rate": 6.25387476751395e-07, + "loss": 0.8931, + "step": 161400 + }, + { + "epoch": 12.508039831066682, + "grad_norm": 1.5945682669991899, + "learning_rate": 6.254262244265346e-07, + "loss": 0.8923, + "step": 161410 + }, + { + "epoch": 12.508814754542989, + "grad_norm": 1.4524755855489486, + "learning_rate": 6.25464972101674e-07, + "loss": 0.8607, + "step": 161420 + }, + { + "epoch": 12.509589678019296, + "grad_norm": 1.4995994896448461, + "learning_rate": 6.255037197768135e-07, + "loss": 0.8826, + "step": 161430 + }, + { + "epoch": 12.510364601495603, + "grad_norm": 1.4697299827898478, + "learning_rate": 6.255424674519529e-07, + "loss": 0.883, + "step": 161440 + }, + { + "epoch": 12.51113952497191, + "grad_norm": 1.5991306222102908, + "learning_rate": 6.255812151270925e-07, + "loss": 0.8727, + "step": 161450 + }, + { + "epoch": 12.511914448448216, + "grad_norm": 1.4769207337334733, + "learning_rate": 6.256199628022319e-07, + "loss": 0.9128, + "step": 161460 + }, + { + "epoch": 12.512689371924523, + "grad_norm": 1.4866414268292896, + "learning_rate": 6.256587104773715e-07, + "loss": 0.852, + "step": 161470 + }, + { + "epoch": 12.51346429540083, + "grad_norm": 1.4522819284436648, + "learning_rate": 6.256974581525109e-07, + "loss": 0.8847, + "step": 161480 + }, + { + "epoch": 12.514239218877137, + "grad_norm": 1.4383872686212256, + "learning_rate": 6.257362058276504e-07, + "loss": 0.9143, + "step": 161490 + }, + { + "epoch": 12.515014142353442, + "grad_norm": 1.628962584111294, + "learning_rate": 6.257749535027899e-07, + "loss": 0.8918, + "step": 161500 + }, + { + "epoch": 12.515014142353442, + "eval_loss": 0.8978570699691772, + "eval_runtime": 330.531, + "eval_samples_per_second": 34.705, + "eval_steps_per_second": 8.677, + "step": 161500 + }, + { + "epoch": 12.515789065829749, + "grad_norm": 1.504336630834937, + "learning_rate": 6.258137011779295e-07, + "loss": 0.8832, + "step": 161510 + }, + { + "epoch": 12.516563989306055, + "grad_norm": 1.4643654584333405, + "learning_rate": 6.258524488530689e-07, + "loss": 0.8788, + "step": 161520 + }, + { + "epoch": 12.517338912782362, + "grad_norm": 1.4942366615397604, + "learning_rate": 6.258911965282084e-07, + "loss": 0.8774, + "step": 161530 + }, + { + "epoch": 12.518113836258669, + "grad_norm": 1.5324666459767704, + "learning_rate": 6.259299442033478e-07, + "loss": 0.8789, + "step": 161540 + }, + { + "epoch": 12.518888759734976, + "grad_norm": 1.5050398305755885, + "learning_rate": 6.259686918784874e-07, + "loss": 0.8769, + "step": 161550 + }, + { + "epoch": 12.519663683211283, + "grad_norm": 1.468613639175874, + "learning_rate": 6.260074395536269e-07, + "loss": 0.8801, + "step": 161560 + }, + { + "epoch": 12.52043860668759, + "grad_norm": 1.5958571867927651, + "learning_rate": 6.260461872287664e-07, + "loss": 0.8964, + "step": 161570 + }, + { + "epoch": 12.521213530163896, + "grad_norm": 1.6269133725222287, + "learning_rate": 6.260849349039058e-07, + "loss": 0.8828, + "step": 161580 + }, + { + "epoch": 12.521988453640203, + "grad_norm": 1.494041148793637, + "learning_rate": 6.261236825790453e-07, + "loss": 0.876, + "step": 161590 + }, + { + "epoch": 12.52276337711651, + "grad_norm": 1.4712408705211022, + "learning_rate": 6.261624302541848e-07, + "loss": 0.8793, + "step": 161600 + }, + { + "epoch": 12.523538300592817, + "grad_norm": 1.3930085980648004, + "learning_rate": 6.262011779293244e-07, + "loss": 0.8908, + "step": 161610 + }, + { + "epoch": 12.524313224069124, + "grad_norm": 1.5429665971732542, + "learning_rate": 6.262399256044638e-07, + "loss": 0.8861, + "step": 161620 + }, + { + "epoch": 12.52508814754543, + "grad_norm": 1.4901018106650752, + "learning_rate": 6.262786732796033e-07, + "loss": 0.8867, + "step": 161630 + }, + { + "epoch": 12.525863071021737, + "grad_norm": 1.4902294813186976, + "learning_rate": 6.263174209547427e-07, + "loss": 0.8824, + "step": 161640 + }, + { + "epoch": 12.526637994498044, + "grad_norm": 1.5185463977929616, + "learning_rate": 6.263561686298824e-07, + "loss": 0.8814, + "step": 161650 + }, + { + "epoch": 12.52741291797435, + "grad_norm": 1.5368080822056318, + "learning_rate": 6.263949163050218e-07, + "loss": 0.861, + "step": 161660 + }, + { + "epoch": 12.528187841450658, + "grad_norm": 1.4891268921424454, + "learning_rate": 6.264336639801613e-07, + "loss": 0.8495, + "step": 161670 + }, + { + "epoch": 12.528962764926963, + "grad_norm": 1.4576264867360436, + "learning_rate": 6.264724116553007e-07, + "loss": 0.8757, + "step": 161680 + }, + { + "epoch": 12.52973768840327, + "grad_norm": 1.5386088236001587, + "learning_rate": 6.265111593304402e-07, + "loss": 0.8773, + "step": 161690 + }, + { + "epoch": 12.530512611879576, + "grad_norm": 1.5316599114702212, + "learning_rate": 6.265499070055798e-07, + "loss": 0.8645, + "step": 161700 + }, + { + "epoch": 12.531287535355883, + "grad_norm": 1.513543134511181, + "learning_rate": 6.265886546807193e-07, + "loss": 0.8935, + "step": 161710 + }, + { + "epoch": 12.53206245883219, + "grad_norm": 1.4853911487829723, + "learning_rate": 6.266274023558587e-07, + "loss": 0.8714, + "step": 161720 + }, + { + "epoch": 12.532837382308497, + "grad_norm": 1.5140824444562007, + "learning_rate": 6.266661500309982e-07, + "loss": 0.8615, + "step": 161730 + }, + { + "epoch": 12.533612305784803, + "grad_norm": 1.4320078990003862, + "learning_rate": 6.267048977061376e-07, + "loss": 0.8877, + "step": 161740 + }, + { + "epoch": 12.53438722926111, + "grad_norm": 1.4254679549237719, + "learning_rate": 6.267436453812773e-07, + "loss": 0.8642, + "step": 161750 + }, + { + "epoch": 12.535162152737417, + "grad_norm": 1.4483465098106798, + "learning_rate": 6.267823930564167e-07, + "loss": 0.8809, + "step": 161760 + }, + { + "epoch": 12.535937076213724, + "grad_norm": 1.502900449044963, + "learning_rate": 6.268211407315562e-07, + "loss": 0.8758, + "step": 161770 + }, + { + "epoch": 12.53671199969003, + "grad_norm": 1.6148664384171199, + "learning_rate": 6.268598884066956e-07, + "loss": 0.8819, + "step": 161780 + }, + { + "epoch": 12.537486923166338, + "grad_norm": 1.5889851574312963, + "learning_rate": 6.268986360818353e-07, + "loss": 0.8658, + "step": 161790 + }, + { + "epoch": 12.538261846642644, + "grad_norm": 1.5299997732284434, + "learning_rate": 6.269373837569747e-07, + "loss": 0.8956, + "step": 161800 + }, + { + "epoch": 12.539036770118951, + "grad_norm": 1.4658689671697576, + "learning_rate": 6.269761314321142e-07, + "loss": 0.8673, + "step": 161810 + }, + { + "epoch": 12.539811693595258, + "grad_norm": 1.5157242576250134, + "learning_rate": 6.270148791072536e-07, + "loss": 0.9138, + "step": 161820 + }, + { + "epoch": 12.540586617071565, + "grad_norm": 1.4983067099133383, + "learning_rate": 6.270536267823931e-07, + "loss": 0.8975, + "step": 161830 + }, + { + "epoch": 12.541361540547872, + "grad_norm": 1.4986659995040807, + "learning_rate": 6.270923744575326e-07, + "loss": 0.8856, + "step": 161840 + }, + { + "epoch": 12.542136464024178, + "grad_norm": 1.4650949761601526, + "learning_rate": 6.271311221326722e-07, + "loss": 0.89, + "step": 161850 + }, + { + "epoch": 12.542911387500485, + "grad_norm": 1.4981052132613013, + "learning_rate": 6.271698698078116e-07, + "loss": 0.8671, + "step": 161860 + }, + { + "epoch": 12.54368631097679, + "grad_norm": 1.5196267386351083, + "learning_rate": 6.272086174829511e-07, + "loss": 0.8832, + "step": 161870 + }, + { + "epoch": 12.544461234453097, + "grad_norm": 1.517150270463171, + "learning_rate": 6.272473651580905e-07, + "loss": 0.8726, + "step": 161880 + }, + { + "epoch": 12.545236157929404, + "grad_norm": 1.4904269184562067, + "learning_rate": 6.272861128332302e-07, + "loss": 0.8714, + "step": 161890 + }, + { + "epoch": 12.54601108140571, + "grad_norm": 1.5341468920976138, + "learning_rate": 6.273248605083696e-07, + "loss": 0.8687, + "step": 161900 + }, + { + "epoch": 12.546786004882017, + "grad_norm": 1.4902009803658438, + "learning_rate": 6.273636081835091e-07, + "loss": 0.8681, + "step": 161910 + }, + { + "epoch": 12.547560928358324, + "grad_norm": 1.4316179928710093, + "learning_rate": 6.274023558586485e-07, + "loss": 0.8592, + "step": 161920 + }, + { + "epoch": 12.548335851834631, + "grad_norm": 1.4710540249290731, + "learning_rate": 6.274411035337881e-07, + "loss": 0.8891, + "step": 161930 + }, + { + "epoch": 12.549110775310938, + "grad_norm": 1.4798972343736343, + "learning_rate": 6.274798512089276e-07, + "loss": 0.8755, + "step": 161940 + }, + { + "epoch": 12.549885698787245, + "grad_norm": 1.5712406342035934, + "learning_rate": 6.275185988840671e-07, + "loss": 0.88, + "step": 161950 + }, + { + "epoch": 12.550660622263552, + "grad_norm": 1.4995813152843687, + "learning_rate": 6.275573465592065e-07, + "loss": 0.8896, + "step": 161960 + }, + { + "epoch": 12.551435545739858, + "grad_norm": 1.5052232651096926, + "learning_rate": 6.27596094234346e-07, + "loss": 0.8989, + "step": 161970 + }, + { + "epoch": 12.552210469216165, + "grad_norm": 1.5245361356347762, + "learning_rate": 6.276348419094855e-07, + "loss": 0.8582, + "step": 161980 + }, + { + "epoch": 12.552985392692472, + "grad_norm": 1.4707150215576292, + "learning_rate": 6.276735895846251e-07, + "loss": 0.8875, + "step": 161990 + }, + { + "epoch": 12.553760316168779, + "grad_norm": 1.5361959643055718, + "learning_rate": 6.277123372597645e-07, + "loss": 0.8703, + "step": 162000 + }, + { + "epoch": 12.553760316168779, + "eval_loss": 0.8976327180862427, + "eval_runtime": 328.3296, + "eval_samples_per_second": 34.937, + "eval_steps_per_second": 8.735, + "step": 162000 + }, + { + "epoch": 12.554535239645086, + "grad_norm": 1.4957194196531274, + "learning_rate": 6.27751084934904e-07, + "loss": 0.8897, + "step": 162010 + }, + { + "epoch": 12.555310163121392, + "grad_norm": 1.4613380347003213, + "learning_rate": 6.277898326100434e-07, + "loss": 0.8722, + "step": 162020 + }, + { + "epoch": 12.5560850865977, + "grad_norm": 1.5350867935217678, + "learning_rate": 6.27828580285183e-07, + "loss": 0.896, + "step": 162030 + }, + { + "epoch": 12.556860010074006, + "grad_norm": 1.4490453396073677, + "learning_rate": 6.278673279603225e-07, + "loss": 0.8838, + "step": 162040 + }, + { + "epoch": 12.557634933550311, + "grad_norm": 1.439655795423125, + "learning_rate": 6.27906075635462e-07, + "loss": 0.8659, + "step": 162050 + }, + { + "epoch": 12.558409857026618, + "grad_norm": 1.526624479100853, + "learning_rate": 6.279448233106014e-07, + "loss": 0.8788, + "step": 162060 + }, + { + "epoch": 12.559184780502925, + "grad_norm": 1.6195282732663474, + "learning_rate": 6.27983570985741e-07, + "loss": 0.8565, + "step": 162070 + }, + { + "epoch": 12.559959703979231, + "grad_norm": 1.4129879634361002, + "learning_rate": 6.280223186608804e-07, + "loss": 0.8815, + "step": 162080 + }, + { + "epoch": 12.560734627455538, + "grad_norm": 1.5642810380628704, + "learning_rate": 6.2806106633602e-07, + "loss": 0.8608, + "step": 162090 + }, + { + "epoch": 12.561509550931845, + "grad_norm": 1.4294528908192219, + "learning_rate": 6.280998140111594e-07, + "loss": 0.8716, + "step": 162100 + }, + { + "epoch": 12.562284474408152, + "grad_norm": 1.6588086904357606, + "learning_rate": 6.281385616862989e-07, + "loss": 0.8867, + "step": 162110 + }, + { + "epoch": 12.563059397884459, + "grad_norm": 1.4831858469115147, + "learning_rate": 6.281773093614383e-07, + "loss": 0.8806, + "step": 162120 + }, + { + "epoch": 12.563834321360766, + "grad_norm": 1.3857454147040207, + "learning_rate": 6.282160570365779e-07, + "loss": 0.869, + "step": 162130 + }, + { + "epoch": 12.564609244837072, + "grad_norm": 1.49799205181194, + "learning_rate": 6.282548047117174e-07, + "loss": 0.886, + "step": 162140 + }, + { + "epoch": 12.56538416831338, + "grad_norm": 1.5170320196574658, + "learning_rate": 6.282935523868569e-07, + "loss": 0.8775, + "step": 162150 + }, + { + "epoch": 12.566159091789686, + "grad_norm": 1.5388425913744848, + "learning_rate": 6.283323000619963e-07, + "loss": 0.883, + "step": 162160 + }, + { + "epoch": 12.566934015265993, + "grad_norm": 1.45480577779384, + "learning_rate": 6.283710477371359e-07, + "loss": 0.8853, + "step": 162170 + }, + { + "epoch": 12.5677089387423, + "grad_norm": 1.3987985220614714, + "learning_rate": 6.284097954122753e-07, + "loss": 0.8847, + "step": 162180 + }, + { + "epoch": 12.568483862218606, + "grad_norm": 1.4833880167275548, + "learning_rate": 6.284485430874149e-07, + "loss": 0.873, + "step": 162190 + }, + { + "epoch": 12.569258785694913, + "grad_norm": 1.5550211315011846, + "learning_rate": 6.284872907625543e-07, + "loss": 0.8776, + "step": 162200 + }, + { + "epoch": 12.57003370917122, + "grad_norm": 1.561801745803239, + "learning_rate": 6.285260384376939e-07, + "loss": 0.8719, + "step": 162210 + }, + { + "epoch": 12.570808632647527, + "grad_norm": 1.4996514106452863, + "learning_rate": 6.285647861128333e-07, + "loss": 0.8991, + "step": 162220 + }, + { + "epoch": 12.571583556123834, + "grad_norm": 1.4744393934254965, + "learning_rate": 6.286035337879728e-07, + "loss": 0.88, + "step": 162230 + }, + { + "epoch": 12.57235847960014, + "grad_norm": 1.486929164693656, + "learning_rate": 6.286422814631123e-07, + "loss": 0.8686, + "step": 162240 + }, + { + "epoch": 12.573133403076445, + "grad_norm": 1.4085210004509245, + "learning_rate": 6.286810291382518e-07, + "loss": 0.8711, + "step": 162250 + }, + { + "epoch": 12.573908326552752, + "grad_norm": 1.501239860950457, + "learning_rate": 6.287197768133912e-07, + "loss": 0.8666, + "step": 162260 + }, + { + "epoch": 12.574683250029059, + "grad_norm": 1.4821708192651373, + "learning_rate": 6.287585244885308e-07, + "loss": 0.8774, + "step": 162270 + }, + { + "epoch": 12.575458173505366, + "grad_norm": 1.4380221898831291, + "learning_rate": 6.287972721636702e-07, + "loss": 0.8766, + "step": 162280 + }, + { + "epoch": 12.576233096981673, + "grad_norm": 1.5019313831256489, + "learning_rate": 6.288360198388098e-07, + "loss": 0.8644, + "step": 162290 + }, + { + "epoch": 12.57700802045798, + "grad_norm": 1.493366390272611, + "learning_rate": 6.288747675139492e-07, + "loss": 0.8646, + "step": 162300 + }, + { + "epoch": 12.577782943934286, + "grad_norm": 1.5052122146787883, + "learning_rate": 6.289135151890888e-07, + "loss": 0.878, + "step": 162310 + }, + { + "epoch": 12.578557867410593, + "grad_norm": 1.5305731115056076, + "learning_rate": 6.289522628642282e-07, + "loss": 0.8928, + "step": 162320 + }, + { + "epoch": 12.5793327908869, + "grad_norm": 1.6120482233335043, + "learning_rate": 6.289910105393677e-07, + "loss": 0.8911, + "step": 162330 + }, + { + "epoch": 12.580107714363207, + "grad_norm": 1.5157977771273168, + "learning_rate": 6.290297582145072e-07, + "loss": 0.8872, + "step": 162340 + }, + { + "epoch": 12.580882637839514, + "grad_norm": 1.541590628106459, + "learning_rate": 6.290685058896468e-07, + "loss": 0.858, + "step": 162350 + }, + { + "epoch": 12.58165756131582, + "grad_norm": 1.6234600299286377, + "learning_rate": 6.291072535647862e-07, + "loss": 0.879, + "step": 162360 + }, + { + "epoch": 12.582432484792127, + "grad_norm": 1.67272165428654, + "learning_rate": 6.291460012399257e-07, + "loss": 0.904, + "step": 162370 + }, + { + "epoch": 12.583207408268434, + "grad_norm": 1.5126014779243513, + "learning_rate": 6.291847489150651e-07, + "loss": 0.8925, + "step": 162380 + }, + { + "epoch": 12.58398233174474, + "grad_norm": 1.4988682181948267, + "learning_rate": 6.292234965902047e-07, + "loss": 0.8613, + "step": 162390 + }, + { + "epoch": 12.584757255221048, + "grad_norm": 1.5738918323916082, + "learning_rate": 6.292622442653441e-07, + "loss": 0.8901, + "step": 162400 + }, + { + "epoch": 12.585532178697354, + "grad_norm": 1.4403186271982944, + "learning_rate": 6.293009919404837e-07, + "loss": 0.8766, + "step": 162410 + }, + { + "epoch": 12.58630710217366, + "grad_norm": 1.530987757040346, + "learning_rate": 6.293397396156231e-07, + "loss": 0.9081, + "step": 162420 + }, + { + "epoch": 12.587082025649966, + "grad_norm": 1.472582592358038, + "learning_rate": 6.293784872907626e-07, + "loss": 0.8871, + "step": 162430 + }, + { + "epoch": 12.587856949126273, + "grad_norm": 1.607993451995373, + "learning_rate": 6.294172349659021e-07, + "loss": 0.8786, + "step": 162440 + }, + { + "epoch": 12.58863187260258, + "grad_norm": 1.4825034829774903, + "learning_rate": 6.294559826410417e-07, + "loss": 0.876, + "step": 162450 + }, + { + "epoch": 12.589406796078887, + "grad_norm": 1.5478064033669452, + "learning_rate": 6.294947303161811e-07, + "loss": 0.8754, + "step": 162460 + }, + { + "epoch": 12.590181719555193, + "grad_norm": 1.4905231185293408, + "learning_rate": 6.295334779913206e-07, + "loss": 0.8624, + "step": 162470 + }, + { + "epoch": 12.5909566430315, + "grad_norm": 1.4496162777435384, + "learning_rate": 6.2957222566646e-07, + "loss": 0.8838, + "step": 162480 + }, + { + "epoch": 12.591731566507807, + "grad_norm": 1.5063657778362818, + "learning_rate": 6.296109733415997e-07, + "loss": 0.8986, + "step": 162490 + }, + { + "epoch": 12.592506489984114, + "grad_norm": 1.5133862306422747, + "learning_rate": 6.296497210167391e-07, + "loss": 0.8785, + "step": 162500 + }, + { + "epoch": 12.592506489984114, + "eval_loss": 0.8974078297615051, + "eval_runtime": 329.2005, + "eval_samples_per_second": 34.845, + "eval_steps_per_second": 8.712, + "step": 162500 + }, + { + "epoch": 12.59328141346042, + "grad_norm": 1.4996415226056528, + "learning_rate": 6.296884686918786e-07, + "loss": 0.8702, + "step": 162510 + }, + { + "epoch": 12.594056336936728, + "grad_norm": 1.4732215300139389, + "learning_rate": 6.29727216367018e-07, + "loss": 0.8876, + "step": 162520 + }, + { + "epoch": 12.594831260413034, + "grad_norm": 1.4915628216414103, + "learning_rate": 6.297659640421575e-07, + "loss": 0.8955, + "step": 162530 + }, + { + "epoch": 12.595606183889341, + "grad_norm": 1.534479132254053, + "learning_rate": 6.29804711717297e-07, + "loss": 0.8782, + "step": 162540 + }, + { + "epoch": 12.596381107365648, + "grad_norm": 1.3810425323393956, + "learning_rate": 6.298434593924366e-07, + "loss": 0.8732, + "step": 162550 + }, + { + "epoch": 12.597156030841955, + "grad_norm": 1.437189428143198, + "learning_rate": 6.29882207067576e-07, + "loss": 0.8868, + "step": 162560 + }, + { + "epoch": 12.597930954318262, + "grad_norm": 1.5414232395296426, + "learning_rate": 6.299209547427155e-07, + "loss": 0.8814, + "step": 162570 + }, + { + "epoch": 12.598705877794568, + "grad_norm": 1.4306711008928967, + "learning_rate": 6.299597024178549e-07, + "loss": 0.8701, + "step": 162580 + }, + { + "epoch": 12.599480801270875, + "grad_norm": 1.5480564177622445, + "learning_rate": 6.299984500929946e-07, + "loss": 0.8841, + "step": 162590 + }, + { + "epoch": 12.600255724747182, + "grad_norm": 1.5994549885408897, + "learning_rate": 6.30037197768134e-07, + "loss": 0.8854, + "step": 162600 + }, + { + "epoch": 12.601030648223489, + "grad_norm": 1.4859973739730794, + "learning_rate": 6.300759454432735e-07, + "loss": 0.889, + "step": 162610 + }, + { + "epoch": 12.601805571699794, + "grad_norm": 1.4269705755765514, + "learning_rate": 6.301146931184129e-07, + "loss": 0.8697, + "step": 162620 + }, + { + "epoch": 12.6025804951761, + "grad_norm": 1.46267574145078, + "learning_rate": 6.301534407935526e-07, + "loss": 0.8769, + "step": 162630 + }, + { + "epoch": 12.603355418652407, + "grad_norm": 1.5276593301004093, + "learning_rate": 6.30192188468692e-07, + "loss": 0.8916, + "step": 162640 + }, + { + "epoch": 12.604130342128714, + "grad_norm": 1.4551549154655143, + "learning_rate": 6.302309361438315e-07, + "loss": 0.8894, + "step": 162650 + }, + { + "epoch": 12.604905265605021, + "grad_norm": 1.436761861247381, + "learning_rate": 6.302696838189709e-07, + "loss": 0.8784, + "step": 162660 + }, + { + "epoch": 12.605680189081328, + "grad_norm": 1.5008652647579257, + "learning_rate": 6.303084314941104e-07, + "loss": 0.8795, + "step": 162670 + }, + { + "epoch": 12.606455112557635, + "grad_norm": 1.5185511875594646, + "learning_rate": 6.303471791692498e-07, + "loss": 0.8735, + "step": 162680 + }, + { + "epoch": 12.607230036033942, + "grad_norm": 1.5857650493071538, + "learning_rate": 6.303859268443895e-07, + "loss": 0.8807, + "step": 162690 + }, + { + "epoch": 12.608004959510248, + "grad_norm": 1.5210308703466437, + "learning_rate": 6.304246745195289e-07, + "loss": 0.8642, + "step": 162700 + }, + { + "epoch": 12.608779882986555, + "grad_norm": 1.5017658104794283, + "learning_rate": 6.304634221946684e-07, + "loss": 0.8593, + "step": 162710 + }, + { + "epoch": 12.609554806462862, + "grad_norm": 1.5318423562480852, + "learning_rate": 6.305021698698078e-07, + "loss": 0.9058, + "step": 162720 + }, + { + "epoch": 12.610329729939169, + "grad_norm": 1.6100570032459691, + "learning_rate": 6.305409175449475e-07, + "loss": 0.8847, + "step": 162730 + }, + { + "epoch": 12.611104653415476, + "grad_norm": 1.5493102881562235, + "learning_rate": 6.305796652200869e-07, + "loss": 0.9085, + "step": 162740 + }, + { + "epoch": 12.611879576891782, + "grad_norm": 1.624345829871011, + "learning_rate": 6.306184128952264e-07, + "loss": 0.8761, + "step": 162750 + }, + { + "epoch": 12.61265450036809, + "grad_norm": 1.4922353607573189, + "learning_rate": 6.306571605703658e-07, + "loss": 0.8815, + "step": 162760 + }, + { + "epoch": 12.613429423844396, + "grad_norm": 1.6458378566443426, + "learning_rate": 6.306959082455054e-07, + "loss": 0.8982, + "step": 162770 + }, + { + "epoch": 12.614204347320703, + "grad_norm": 1.4964628079654592, + "learning_rate": 6.307346559206449e-07, + "loss": 0.8579, + "step": 162780 + }, + { + "epoch": 12.614979270797008, + "grad_norm": 1.4614313826497043, + "learning_rate": 6.307734035957844e-07, + "loss": 0.9001, + "step": 162790 + }, + { + "epoch": 12.615754194273315, + "grad_norm": 1.4414579245247066, + "learning_rate": 6.308121512709238e-07, + "loss": 0.8823, + "step": 162800 + }, + { + "epoch": 12.616529117749621, + "grad_norm": 1.6194102663890748, + "learning_rate": 6.308508989460633e-07, + "loss": 0.8593, + "step": 162810 + }, + { + "epoch": 12.617304041225928, + "grad_norm": 1.576456336599811, + "learning_rate": 6.308896466212027e-07, + "loss": 0.8844, + "step": 162820 + }, + { + "epoch": 12.618078964702235, + "grad_norm": 1.3910004147607304, + "learning_rate": 6.309283942963424e-07, + "loss": 0.8729, + "step": 162830 + }, + { + "epoch": 12.618853888178542, + "grad_norm": 1.439011234778118, + "learning_rate": 6.309671419714818e-07, + "loss": 0.89, + "step": 162840 + }, + { + "epoch": 12.619628811654849, + "grad_norm": 1.5289038584364143, + "learning_rate": 6.310058896466213e-07, + "loss": 0.8772, + "step": 162850 + }, + { + "epoch": 12.620403735131156, + "grad_norm": 1.5410792562103148, + "learning_rate": 6.310446373217607e-07, + "loss": 0.8677, + "step": 162860 + }, + { + "epoch": 12.621178658607462, + "grad_norm": 1.5453547358668898, + "learning_rate": 6.310833849969003e-07, + "loss": 0.86, + "step": 162870 + }, + { + "epoch": 12.62195358208377, + "grad_norm": 1.5325321101252425, + "learning_rate": 6.311221326720398e-07, + "loss": 0.8671, + "step": 162880 + }, + { + "epoch": 12.622728505560076, + "grad_norm": 1.4182226425519098, + "learning_rate": 6.311608803471793e-07, + "loss": 0.8704, + "step": 162890 + }, + { + "epoch": 12.623503429036383, + "grad_norm": 1.4628253901329322, + "learning_rate": 6.311996280223187e-07, + "loss": 0.9104, + "step": 162900 + }, + { + "epoch": 12.62427835251269, + "grad_norm": 1.4388516501994104, + "learning_rate": 6.312383756974582e-07, + "loss": 0.8757, + "step": 162910 + }, + { + "epoch": 12.625053275988996, + "grad_norm": 1.4170783752078522, + "learning_rate": 6.312771233725977e-07, + "loss": 0.861, + "step": 162920 + }, + { + "epoch": 12.625828199465303, + "grad_norm": 1.4916080207699425, + "learning_rate": 6.313158710477373e-07, + "loss": 0.872, + "step": 162930 + }, + { + "epoch": 12.62660312294161, + "grad_norm": 1.577130721275698, + "learning_rate": 6.313546187228767e-07, + "loss": 0.8753, + "step": 162940 + }, + { + "epoch": 12.627378046417917, + "grad_norm": 1.4012105356047115, + "learning_rate": 6.313933663980162e-07, + "loss": 0.8807, + "step": 162950 + }, + { + "epoch": 12.628152969894224, + "grad_norm": 1.573895250331047, + "learning_rate": 6.314321140731556e-07, + "loss": 0.8701, + "step": 162960 + }, + { + "epoch": 12.62892789337053, + "grad_norm": 1.599079112893742, + "learning_rate": 6.314708617482952e-07, + "loss": 0.8625, + "step": 162970 + }, + { + "epoch": 12.629702816846837, + "grad_norm": 1.506289208189083, + "learning_rate": 6.315096094234347e-07, + "loss": 0.8723, + "step": 162980 + }, + { + "epoch": 12.630477740323142, + "grad_norm": 1.4515614024532582, + "learning_rate": 6.315483570985742e-07, + "loss": 0.8752, + "step": 162990 + }, + { + "epoch": 12.631252663799449, + "grad_norm": 1.4382154447693432, + "learning_rate": 6.315871047737136e-07, + "loss": 0.8974, + "step": 163000 + }, + { + "epoch": 12.631252663799449, + "eval_loss": 0.8969818949699402, + "eval_runtime": 327.0891, + "eval_samples_per_second": 35.07, + "eval_steps_per_second": 8.768, + "step": 163000 + }, + { + "epoch": 12.632027587275756, + "grad_norm": 1.4997663520094886, + "learning_rate": 6.316258524488532e-07, + "loss": 0.8866, + "step": 163010 + }, + { + "epoch": 12.632802510752063, + "grad_norm": 1.4969282932416703, + "learning_rate": 6.316646001239926e-07, + "loss": 0.8873, + "step": 163020 + }, + { + "epoch": 12.63357743422837, + "grad_norm": 1.5223156296920415, + "learning_rate": 6.317033477991322e-07, + "loss": 0.8691, + "step": 163030 + }, + { + "epoch": 12.634352357704676, + "grad_norm": 1.5103026665667474, + "learning_rate": 6.317420954742716e-07, + "loss": 0.8851, + "step": 163040 + }, + { + "epoch": 12.635127281180983, + "grad_norm": 1.4242996104814092, + "learning_rate": 6.317808431494111e-07, + "loss": 0.858, + "step": 163050 + }, + { + "epoch": 12.63590220465729, + "grad_norm": 1.529948965566505, + "learning_rate": 6.318195908245506e-07, + "loss": 0.8753, + "step": 163060 + }, + { + "epoch": 12.636677128133597, + "grad_norm": 1.500024422714903, + "learning_rate": 6.318583384996901e-07, + "loss": 0.8879, + "step": 163070 + }, + { + "epoch": 12.637452051609904, + "grad_norm": 1.40636164737878, + "learning_rate": 6.318970861748296e-07, + "loss": 0.8777, + "step": 163080 + }, + { + "epoch": 12.63822697508621, + "grad_norm": 1.4816279899353086, + "learning_rate": 6.319358338499691e-07, + "loss": 0.8755, + "step": 163090 + }, + { + "epoch": 12.639001898562517, + "grad_norm": 1.6025197000193783, + "learning_rate": 6.319745815251085e-07, + "loss": 0.89, + "step": 163100 + }, + { + "epoch": 12.639776822038824, + "grad_norm": 1.509180467329428, + "learning_rate": 6.320133292002481e-07, + "loss": 0.878, + "step": 163110 + }, + { + "epoch": 12.64055174551513, + "grad_norm": 1.628106431567859, + "learning_rate": 6.320520768753875e-07, + "loss": 0.8796, + "step": 163120 + }, + { + "epoch": 12.641326668991438, + "grad_norm": 1.468918094489739, + "learning_rate": 6.320908245505271e-07, + "loss": 0.8669, + "step": 163130 + }, + { + "epoch": 12.642101592467744, + "grad_norm": 1.5433047524851937, + "learning_rate": 6.321295722256665e-07, + "loss": 0.8712, + "step": 163140 + }, + { + "epoch": 12.642876515944051, + "grad_norm": 1.4980305377441074, + "learning_rate": 6.321683199008061e-07, + "loss": 0.8751, + "step": 163150 + }, + { + "epoch": 12.643651439420356, + "grad_norm": 1.4303143062545758, + "learning_rate": 6.322070675759455e-07, + "loss": 0.8832, + "step": 163160 + }, + { + "epoch": 12.644426362896663, + "grad_norm": 1.5457061031938117, + "learning_rate": 6.32245815251085e-07, + "loss": 0.892, + "step": 163170 + }, + { + "epoch": 12.64520128637297, + "grad_norm": 1.4555647023992702, + "learning_rate": 6.322845629262245e-07, + "loss": 0.8769, + "step": 163180 + }, + { + "epoch": 12.645976209849277, + "grad_norm": 1.489704343767743, + "learning_rate": 6.32323310601364e-07, + "loss": 0.8874, + "step": 163190 + }, + { + "epoch": 12.646751133325584, + "grad_norm": 1.5217630774533693, + "learning_rate": 6.323620582765035e-07, + "loss": 0.863, + "step": 163200 + }, + { + "epoch": 12.64752605680189, + "grad_norm": 1.5737694089666587, + "learning_rate": 6.32400805951643e-07, + "loss": 0.885, + "step": 163210 + }, + { + "epoch": 12.648300980278197, + "grad_norm": 1.5110476524737777, + "learning_rate": 6.324395536267824e-07, + "loss": 0.8814, + "step": 163220 + }, + { + "epoch": 12.649075903754504, + "grad_norm": 1.403662104648505, + "learning_rate": 6.32478301301922e-07, + "loss": 0.8769, + "step": 163230 + }, + { + "epoch": 12.64985082723081, + "grad_norm": 1.540046994196187, + "learning_rate": 6.325170489770614e-07, + "loss": 0.8792, + "step": 163240 + }, + { + "epoch": 12.650625750707118, + "grad_norm": 1.4621696992862678, + "learning_rate": 6.32555796652201e-07, + "loss": 0.8753, + "step": 163250 + }, + { + "epoch": 12.651400674183424, + "grad_norm": 1.636537465513271, + "learning_rate": 6.325945443273404e-07, + "loss": 0.8765, + "step": 163260 + }, + { + "epoch": 12.652175597659731, + "grad_norm": 1.5099187066309219, + "learning_rate": 6.3263329200248e-07, + "loss": 0.8879, + "step": 163270 + }, + { + "epoch": 12.652950521136038, + "grad_norm": 1.5592762777897275, + "learning_rate": 6.326720396776194e-07, + "loss": 0.8695, + "step": 163280 + }, + { + "epoch": 12.653725444612345, + "grad_norm": 1.4506476703868967, + "learning_rate": 6.32710787352759e-07, + "loss": 0.8675, + "step": 163290 + }, + { + "epoch": 12.654500368088652, + "grad_norm": 1.4821463023698618, + "learning_rate": 6.327495350278984e-07, + "loss": 0.8797, + "step": 163300 + }, + { + "epoch": 12.655275291564958, + "grad_norm": 1.4701314456409553, + "learning_rate": 6.327882827030379e-07, + "loss": 0.8858, + "step": 163310 + }, + { + "epoch": 12.656050215041265, + "grad_norm": 1.617862644390756, + "learning_rate": 6.328270303781774e-07, + "loss": 0.879, + "step": 163320 + }, + { + "epoch": 12.656825138517572, + "grad_norm": 1.5472831402287832, + "learning_rate": 6.328657780533169e-07, + "loss": 0.869, + "step": 163330 + }, + { + "epoch": 12.657600061993879, + "grad_norm": 1.4975720174527218, + "learning_rate": 6.329045257284564e-07, + "loss": 0.8822, + "step": 163340 + }, + { + "epoch": 12.658374985470186, + "grad_norm": 1.4704915425358505, + "learning_rate": 6.329432734035959e-07, + "loss": 0.9158, + "step": 163350 + }, + { + "epoch": 12.65914990894649, + "grad_norm": 1.523671837482919, + "learning_rate": 6.329820210787353e-07, + "loss": 0.9168, + "step": 163360 + }, + { + "epoch": 12.659924832422798, + "grad_norm": 1.6053858013562377, + "learning_rate": 6.330207687538749e-07, + "loss": 0.8743, + "step": 163370 + }, + { + "epoch": 12.660699755899104, + "grad_norm": 1.5176352846259311, + "learning_rate": 6.330595164290143e-07, + "loss": 0.863, + "step": 163380 + }, + { + "epoch": 12.661474679375411, + "grad_norm": 1.4918671044815102, + "learning_rate": 6.330982641041539e-07, + "loss": 0.8809, + "step": 163390 + }, + { + "epoch": 12.662249602851718, + "grad_norm": 1.5460563636460538, + "learning_rate": 6.331370117792933e-07, + "loss": 0.8917, + "step": 163400 + }, + { + "epoch": 12.663024526328025, + "grad_norm": 1.5472884627310335, + "learning_rate": 6.331757594544328e-07, + "loss": 0.8834, + "step": 163410 + }, + { + "epoch": 12.663799449804332, + "grad_norm": 1.4491035098277874, + "learning_rate": 6.332145071295723e-07, + "loss": 0.8733, + "step": 163420 + }, + { + "epoch": 12.664574373280638, + "grad_norm": 1.4533464358503623, + "learning_rate": 6.332532548047119e-07, + "loss": 0.8764, + "step": 163430 + }, + { + "epoch": 12.665349296756945, + "grad_norm": 1.4602646232406975, + "learning_rate": 6.332920024798513e-07, + "loss": 0.8942, + "step": 163440 + }, + { + "epoch": 12.666124220233252, + "grad_norm": 1.4721151782807753, + "learning_rate": 6.333307501549908e-07, + "loss": 0.8821, + "step": 163450 + }, + { + "epoch": 12.666899143709559, + "grad_norm": 1.4232127940565884, + "learning_rate": 6.333694978301302e-07, + "loss": 0.8695, + "step": 163460 + }, + { + "epoch": 12.667674067185866, + "grad_norm": 1.4522474569371873, + "learning_rate": 6.334082455052698e-07, + "loss": 0.8887, + "step": 163470 + }, + { + "epoch": 12.668448990662172, + "grad_norm": 1.5919003595894905, + "learning_rate": 6.334469931804093e-07, + "loss": 0.8956, + "step": 163480 + }, + { + "epoch": 12.66922391413848, + "grad_norm": 1.5619934552054686, + "learning_rate": 6.334857408555488e-07, + "loss": 0.8803, + "step": 163490 + }, + { + "epoch": 12.669998837614786, + "grad_norm": 1.4533815035111906, + "learning_rate": 6.335244885306882e-07, + "loss": 0.8665, + "step": 163500 + }, + { + "epoch": 12.669998837614786, + "eval_loss": 0.8968996405601501, + "eval_runtime": 328.3401, + "eval_samples_per_second": 34.936, + "eval_steps_per_second": 8.735, + "step": 163500 + }, + { + "epoch": 12.670773761091093, + "grad_norm": 1.4847834300040115, + "learning_rate": 6.335632362058277e-07, + "loss": 0.8933, + "step": 163510 + }, + { + "epoch": 12.6715486845674, + "grad_norm": 1.4701774365286893, + "learning_rate": 6.336019838809672e-07, + "loss": 0.8851, + "step": 163520 + }, + { + "epoch": 12.672323608043706, + "grad_norm": 1.4040357573673665, + "learning_rate": 6.336407315561068e-07, + "loss": 0.8628, + "step": 163530 + }, + { + "epoch": 12.673098531520012, + "grad_norm": 1.3857783444846956, + "learning_rate": 6.336794792312462e-07, + "loss": 0.8741, + "step": 163540 + }, + { + "epoch": 12.673873454996318, + "grad_norm": 1.5004433813347133, + "learning_rate": 6.337182269063857e-07, + "loss": 0.8938, + "step": 163550 + }, + { + "epoch": 12.674648378472625, + "grad_norm": 1.4320343706782894, + "learning_rate": 6.337569745815251e-07, + "loss": 0.8714, + "step": 163560 + }, + { + "epoch": 12.675423301948932, + "grad_norm": 1.6453793146517337, + "learning_rate": 6.337957222566648e-07, + "loss": 0.8986, + "step": 163570 + }, + { + "epoch": 12.676198225425239, + "grad_norm": 1.464097111272432, + "learning_rate": 6.338344699318042e-07, + "loss": 0.9128, + "step": 163580 + }, + { + "epoch": 12.676973148901546, + "grad_norm": 1.5633626028138583, + "learning_rate": 6.338732176069437e-07, + "loss": 0.8699, + "step": 163590 + }, + { + "epoch": 12.677748072377852, + "grad_norm": 1.4866267827827153, + "learning_rate": 6.339119652820831e-07, + "loss": 0.9042, + "step": 163600 + }, + { + "epoch": 12.67852299585416, + "grad_norm": 1.4775100474044458, + "learning_rate": 6.339507129572226e-07, + "loss": 0.8885, + "step": 163610 + }, + { + "epoch": 12.679297919330466, + "grad_norm": 1.5418196764970584, + "learning_rate": 6.339894606323621e-07, + "loss": 0.8759, + "step": 163620 + }, + { + "epoch": 12.680072842806773, + "grad_norm": 1.52813016367213, + "learning_rate": 6.340282083075017e-07, + "loss": 0.8692, + "step": 163630 + }, + { + "epoch": 12.68084776628308, + "grad_norm": 1.475960082373257, + "learning_rate": 6.340669559826411e-07, + "loss": 0.8654, + "step": 163640 + }, + { + "epoch": 12.681622689759386, + "grad_norm": 1.4788943273923576, + "learning_rate": 6.341057036577806e-07, + "loss": 0.8734, + "step": 163650 + }, + { + "epoch": 12.682397613235693, + "grad_norm": 1.458267279146831, + "learning_rate": 6.3414445133292e-07, + "loss": 0.8914, + "step": 163660 + }, + { + "epoch": 12.683172536712, + "grad_norm": 1.4805056965655534, + "learning_rate": 6.341831990080597e-07, + "loss": 0.8727, + "step": 163670 + }, + { + "epoch": 12.683947460188307, + "grad_norm": 1.4266914791091647, + "learning_rate": 6.342219466831991e-07, + "loss": 0.8833, + "step": 163680 + }, + { + "epoch": 12.684722383664614, + "grad_norm": 1.4356942265053045, + "learning_rate": 6.342606943583386e-07, + "loss": 0.8621, + "step": 163690 + }, + { + "epoch": 12.68549730714092, + "grad_norm": 1.8575332412427579, + "learning_rate": 6.34299442033478e-07, + "loss": 0.8859, + "step": 163700 + }, + { + "epoch": 12.686272230617227, + "grad_norm": 1.49185505136469, + "learning_rate": 6.343381897086176e-07, + "loss": 0.8676, + "step": 163710 + }, + { + "epoch": 12.687047154093534, + "grad_norm": 1.5103063023535555, + "learning_rate": 6.343769373837571e-07, + "loss": 0.8748, + "step": 163720 + }, + { + "epoch": 12.68782207756984, + "grad_norm": 1.590668134577989, + "learning_rate": 6.344156850588966e-07, + "loss": 0.9008, + "step": 163730 + }, + { + "epoch": 12.688597001046146, + "grad_norm": 1.50314202939888, + "learning_rate": 6.34454432734036e-07, + "loss": 0.8749, + "step": 163740 + }, + { + "epoch": 12.689371924522453, + "grad_norm": 1.504007635190512, + "learning_rate": 6.344931804091755e-07, + "loss": 0.8989, + "step": 163750 + }, + { + "epoch": 12.69014684799876, + "grad_norm": 1.4567682492064802, + "learning_rate": 6.345319280843149e-07, + "loss": 0.8683, + "step": 163760 + }, + { + "epoch": 12.690921771475066, + "grad_norm": 1.3645888892902402, + "learning_rate": 6.345706757594546e-07, + "loss": 0.8623, + "step": 163770 + }, + { + "epoch": 12.691696694951373, + "grad_norm": 1.4588491630185019, + "learning_rate": 6.34609423434594e-07, + "loss": 0.8893, + "step": 163780 + }, + { + "epoch": 12.69247161842768, + "grad_norm": 1.4424315075904095, + "learning_rate": 6.346481711097335e-07, + "loss": 0.8562, + "step": 163790 + }, + { + "epoch": 12.693246541903987, + "grad_norm": 1.4321431272785952, + "learning_rate": 6.346869187848729e-07, + "loss": 0.883, + "step": 163800 + }, + { + "epoch": 12.694021465380294, + "grad_norm": 1.567650330959381, + "learning_rate": 6.347256664600126e-07, + "loss": 0.8742, + "step": 163810 + }, + { + "epoch": 12.6947963888566, + "grad_norm": 1.5310820029091907, + "learning_rate": 6.34764414135152e-07, + "loss": 0.878, + "step": 163820 + }, + { + "epoch": 12.695571312332907, + "grad_norm": 1.347723775226105, + "learning_rate": 6.348031618102915e-07, + "loss": 0.8754, + "step": 163830 + }, + { + "epoch": 12.696346235809214, + "grad_norm": 1.4785889568256274, + "learning_rate": 6.348419094854309e-07, + "loss": 0.8816, + "step": 163840 + }, + { + "epoch": 12.69712115928552, + "grad_norm": 1.3672995779362738, + "learning_rate": 6.348806571605705e-07, + "loss": 0.8625, + "step": 163850 + }, + { + "epoch": 12.697896082761828, + "grad_norm": 1.490631629054129, + "learning_rate": 6.3491940483571e-07, + "loss": 0.886, + "step": 163860 + }, + { + "epoch": 12.698671006238134, + "grad_norm": 1.439158614930699, + "learning_rate": 6.349581525108495e-07, + "loss": 0.8848, + "step": 163870 + }, + { + "epoch": 12.699445929714441, + "grad_norm": 1.3939686390351012, + "learning_rate": 6.349969001859889e-07, + "loss": 0.903, + "step": 163880 + }, + { + "epoch": 12.700220853190748, + "grad_norm": 1.4576531294381587, + "learning_rate": 6.350356478611284e-07, + "loss": 0.8846, + "step": 163890 + }, + { + "epoch": 12.700995776667055, + "grad_norm": 1.4943765393146322, + "learning_rate": 6.350743955362678e-07, + "loss": 0.881, + "step": 163900 + }, + { + "epoch": 12.70177070014336, + "grad_norm": 1.4226487579733496, + "learning_rate": 6.351131432114075e-07, + "loss": 0.8765, + "step": 163910 + }, + { + "epoch": 12.702545623619667, + "grad_norm": 1.469256344902001, + "learning_rate": 6.351518908865469e-07, + "loss": 0.8719, + "step": 163920 + }, + { + "epoch": 12.703320547095974, + "grad_norm": 1.4498334258839662, + "learning_rate": 6.351906385616864e-07, + "loss": 0.8729, + "step": 163930 + }, + { + "epoch": 12.70409547057228, + "grad_norm": 1.521390993399531, + "learning_rate": 6.352293862368258e-07, + "loss": 0.8822, + "step": 163940 + }, + { + "epoch": 12.704870394048587, + "grad_norm": 1.5804551265598445, + "learning_rate": 6.352681339119654e-07, + "loss": 0.8777, + "step": 163950 + }, + { + "epoch": 12.705645317524894, + "grad_norm": 1.4885578608629921, + "learning_rate": 6.353068815871049e-07, + "loss": 0.8959, + "step": 163960 + }, + { + "epoch": 12.7064202410012, + "grad_norm": 1.512328115729866, + "learning_rate": 6.353456292622444e-07, + "loss": 0.8765, + "step": 163970 + }, + { + "epoch": 12.707195164477508, + "grad_norm": 1.523540590091789, + "learning_rate": 6.353843769373838e-07, + "loss": 0.8754, + "step": 163980 + }, + { + "epoch": 12.707970087953814, + "grad_norm": 1.4782192748162144, + "learning_rate": 6.354231246125234e-07, + "loss": 0.8759, + "step": 163990 + }, + { + "epoch": 12.708745011430121, + "grad_norm": 1.5744385516149206, + "learning_rate": 6.354618722876628e-07, + "loss": 0.9055, + "step": 164000 + }, + { + "epoch": 12.708745011430121, + "eval_loss": 0.896748960018158, + "eval_runtime": 330.7267, + "eval_samples_per_second": 34.684, + "eval_steps_per_second": 8.672, + "step": 164000 + }, + { + "epoch": 12.709519934906428, + "grad_norm": 1.5162646891189788, + "learning_rate": 6.355006199628024e-07, + "loss": 0.8935, + "step": 164010 + }, + { + "epoch": 12.710294858382735, + "grad_norm": 1.572697938542958, + "learning_rate": 6.355393676379418e-07, + "loss": 0.8789, + "step": 164020 + }, + { + "epoch": 12.711069781859042, + "grad_norm": 1.473515599967671, + "learning_rate": 6.355781153130813e-07, + "loss": 0.8738, + "step": 164030 + }, + { + "epoch": 12.711844705335348, + "grad_norm": 1.4088181927691135, + "learning_rate": 6.356168629882207e-07, + "loss": 0.8764, + "step": 164040 + }, + { + "epoch": 12.712619628811655, + "grad_norm": 1.5529349504677648, + "learning_rate": 6.356556106633603e-07, + "loss": 0.8961, + "step": 164050 + }, + { + "epoch": 12.713394552287962, + "grad_norm": 1.5512595462642036, + "learning_rate": 6.356943583384998e-07, + "loss": 0.8809, + "step": 164060 + }, + { + "epoch": 12.714169475764269, + "grad_norm": 1.4230045483814635, + "learning_rate": 6.357331060136393e-07, + "loss": 0.8835, + "step": 164070 + }, + { + "epoch": 12.714944399240576, + "grad_norm": 1.4933257595984681, + "learning_rate": 6.357718536887787e-07, + "loss": 0.8569, + "step": 164080 + }, + { + "epoch": 12.715719322716883, + "grad_norm": 1.5081454146197497, + "learning_rate": 6.358106013639183e-07, + "loss": 0.887, + "step": 164090 + }, + { + "epoch": 12.71649424619319, + "grad_norm": 1.4458284768429455, + "learning_rate": 6.358493490390577e-07, + "loss": 0.8634, + "step": 164100 + }, + { + "epoch": 12.717269169669494, + "grad_norm": 1.4919597897083519, + "learning_rate": 6.358880967141973e-07, + "loss": 0.8703, + "step": 164110 + }, + { + "epoch": 12.718044093145801, + "grad_norm": 1.402755568179907, + "learning_rate": 6.359268443893367e-07, + "loss": 0.8745, + "step": 164120 + }, + { + "epoch": 12.718819016622108, + "grad_norm": 1.431200368023943, + "learning_rate": 6.359655920644763e-07, + "loss": 0.8836, + "step": 164130 + }, + { + "epoch": 12.719593940098415, + "grad_norm": 1.5112392046594214, + "learning_rate": 6.360043397396157e-07, + "loss": 0.8949, + "step": 164140 + }, + { + "epoch": 12.720368863574722, + "grad_norm": 1.4410362445487077, + "learning_rate": 6.360430874147552e-07, + "loss": 0.8619, + "step": 164150 + }, + { + "epoch": 12.721143787051028, + "grad_norm": 1.465560197690555, + "learning_rate": 6.360818350898947e-07, + "loss": 0.8787, + "step": 164160 + }, + { + "epoch": 12.721918710527335, + "grad_norm": 1.4752458210620578, + "learning_rate": 6.361205827650342e-07, + "loss": 0.884, + "step": 164170 + }, + { + "epoch": 12.722693634003642, + "grad_norm": 1.4378946155010577, + "learning_rate": 6.361593304401736e-07, + "loss": 0.8686, + "step": 164180 + }, + { + "epoch": 12.723468557479949, + "grad_norm": 1.5464408707307742, + "learning_rate": 6.361980781153132e-07, + "loss": 0.8874, + "step": 164190 + }, + { + "epoch": 12.724243480956256, + "grad_norm": 1.4929879528171084, + "learning_rate": 6.362368257904526e-07, + "loss": 0.8862, + "step": 164200 + }, + { + "epoch": 12.725018404432562, + "grad_norm": 1.4821951991711668, + "learning_rate": 6.362755734655922e-07, + "loss": 0.8753, + "step": 164210 + }, + { + "epoch": 12.72579332790887, + "grad_norm": 1.5946502920741217, + "learning_rate": 6.363143211407316e-07, + "loss": 0.8965, + "step": 164220 + }, + { + "epoch": 12.726568251385176, + "grad_norm": 1.4149601792010036, + "learning_rate": 6.363530688158712e-07, + "loss": 0.8659, + "step": 164230 + }, + { + "epoch": 12.727343174861483, + "grad_norm": 1.5723119328404138, + "learning_rate": 6.363918164910106e-07, + "loss": 0.8785, + "step": 164240 + }, + { + "epoch": 12.72811809833779, + "grad_norm": 1.505377249855276, + "learning_rate": 6.364305641661501e-07, + "loss": 0.8728, + "step": 164250 + }, + { + "epoch": 12.728893021814097, + "grad_norm": 1.4866208874746611, + "learning_rate": 6.364693118412896e-07, + "loss": 0.8774, + "step": 164260 + }, + { + "epoch": 12.729667945290403, + "grad_norm": 1.5253164142115205, + "learning_rate": 6.365080595164292e-07, + "loss": 0.877, + "step": 164270 + }, + { + "epoch": 12.730442868766708, + "grad_norm": 1.435362124556453, + "learning_rate": 6.365468071915686e-07, + "loss": 0.8807, + "step": 164280 + }, + { + "epoch": 12.731217792243015, + "grad_norm": 1.4731216526012079, + "learning_rate": 6.365855548667081e-07, + "loss": 0.8795, + "step": 164290 + }, + { + "epoch": 12.731992715719322, + "grad_norm": 1.5020164330653907, + "learning_rate": 6.366243025418475e-07, + "loss": 0.875, + "step": 164300 + }, + { + "epoch": 12.732767639195629, + "grad_norm": 1.535652978070278, + "learning_rate": 6.366630502169871e-07, + "loss": 0.8714, + "step": 164310 + }, + { + "epoch": 12.733542562671936, + "grad_norm": 1.4456844870586554, + "learning_rate": 6.367017978921265e-07, + "loss": 0.8756, + "step": 164320 + }, + { + "epoch": 12.734317486148242, + "grad_norm": 1.5186556419958255, + "learning_rate": 6.367405455672661e-07, + "loss": 0.8898, + "step": 164330 + }, + { + "epoch": 12.73509240962455, + "grad_norm": 1.4969476156669288, + "learning_rate": 6.367792932424055e-07, + "loss": 0.8886, + "step": 164340 + }, + { + "epoch": 12.735867333100856, + "grad_norm": 1.4694749746300333, + "learning_rate": 6.36818040917545e-07, + "loss": 0.8873, + "step": 164350 + }, + { + "epoch": 12.736642256577163, + "grad_norm": 1.5331510483899837, + "learning_rate": 6.368567885926845e-07, + "loss": 0.9054, + "step": 164360 + }, + { + "epoch": 12.73741718005347, + "grad_norm": 1.6439191228105963, + "learning_rate": 6.368955362678241e-07, + "loss": 0.8568, + "step": 164370 + }, + { + "epoch": 12.738192103529776, + "grad_norm": 1.4607487114954136, + "learning_rate": 6.369342839429635e-07, + "loss": 0.8662, + "step": 164380 + }, + { + "epoch": 12.738967027006083, + "grad_norm": 1.5174925015704037, + "learning_rate": 6.36973031618103e-07, + "loss": 0.8881, + "step": 164390 + }, + { + "epoch": 12.73974195048239, + "grad_norm": 1.4716324937316858, + "learning_rate": 6.370117792932424e-07, + "loss": 0.8895, + "step": 164400 + }, + { + "epoch": 12.740516873958697, + "grad_norm": 1.5899251013690168, + "learning_rate": 6.37050526968382e-07, + "loss": 0.8905, + "step": 164410 + }, + { + "epoch": 12.741291797435004, + "grad_norm": 1.5221207469695224, + "learning_rate": 6.370892746435215e-07, + "loss": 0.8814, + "step": 164420 + }, + { + "epoch": 12.74206672091131, + "grad_norm": 1.4782000865167562, + "learning_rate": 6.37128022318661e-07, + "loss": 0.8801, + "step": 164430 + }, + { + "epoch": 12.742841644387617, + "grad_norm": 1.458559896808081, + "learning_rate": 6.371667699938004e-07, + "loss": 0.8716, + "step": 164440 + }, + { + "epoch": 12.743616567863924, + "grad_norm": 1.5712046730938811, + "learning_rate": 6.372055176689399e-07, + "loss": 0.8866, + "step": 164450 + }, + { + "epoch": 12.744391491340231, + "grad_norm": 1.4669754475590369, + "learning_rate": 6.372442653440794e-07, + "loss": 0.8551, + "step": 164460 + }, + { + "epoch": 12.745166414816538, + "grad_norm": 1.5378760797791784, + "learning_rate": 6.37283013019219e-07, + "loss": 0.8675, + "step": 164470 + }, + { + "epoch": 12.745941338292843, + "grad_norm": 1.4734682407354092, + "learning_rate": 6.373217606943584e-07, + "loss": 0.8971, + "step": 164480 + }, + { + "epoch": 12.74671626176915, + "grad_norm": 1.5430216796963006, + "learning_rate": 6.373605083694979e-07, + "loss": 0.8898, + "step": 164490 + }, + { + "epoch": 12.747491185245456, + "grad_norm": 1.5185825320536854, + "learning_rate": 6.373992560446373e-07, + "loss": 0.8745, + "step": 164500 + }, + { + "epoch": 12.747491185245456, + "eval_loss": 0.8966236710548401, + "eval_runtime": 328.414, + "eval_samples_per_second": 34.928, + "eval_steps_per_second": 8.733, + "step": 164500 + }, + { + "epoch": 12.748266108721763, + "grad_norm": 1.5324337491503088, + "learning_rate": 6.37438003719777e-07, + "loss": 0.8662, + "step": 164510 + }, + { + "epoch": 12.74904103219807, + "grad_norm": 1.4324691103720286, + "learning_rate": 6.374767513949164e-07, + "loss": 0.8721, + "step": 164520 + }, + { + "epoch": 12.749815955674377, + "grad_norm": 1.5013707027494565, + "learning_rate": 6.375154990700559e-07, + "loss": 0.9136, + "step": 164530 + }, + { + "epoch": 12.750590879150684, + "grad_norm": 1.5457529749416439, + "learning_rate": 6.375542467451953e-07, + "loss": 0.9236, + "step": 164540 + }, + { + "epoch": 12.75136580262699, + "grad_norm": 1.5678030962137035, + "learning_rate": 6.375929944203348e-07, + "loss": 0.8732, + "step": 164550 + }, + { + "epoch": 12.752140726103297, + "grad_norm": 1.4974288238645364, + "learning_rate": 6.376317420954744e-07, + "loss": 0.9079, + "step": 164560 + }, + { + "epoch": 12.752915649579604, + "grad_norm": 1.513415959663403, + "learning_rate": 6.376704897706139e-07, + "loss": 0.892, + "step": 164570 + }, + { + "epoch": 12.753690573055911, + "grad_norm": 1.364070777181064, + "learning_rate": 6.377092374457533e-07, + "loss": 0.8795, + "step": 164580 + }, + { + "epoch": 12.754465496532218, + "grad_norm": 1.4617918213982222, + "learning_rate": 6.377479851208928e-07, + "loss": 0.8619, + "step": 164590 + }, + { + "epoch": 12.755240420008525, + "grad_norm": 1.4779067815421763, + "learning_rate": 6.377867327960322e-07, + "loss": 0.8723, + "step": 164600 + }, + { + "epoch": 12.756015343484831, + "grad_norm": 1.5037552946036894, + "learning_rate": 6.378254804711719e-07, + "loss": 0.8929, + "step": 164610 + }, + { + "epoch": 12.756790266961138, + "grad_norm": 1.5220314040475464, + "learning_rate": 6.378642281463113e-07, + "loss": 0.8748, + "step": 164620 + }, + { + "epoch": 12.757565190437445, + "grad_norm": 1.4514331513283383, + "learning_rate": 6.379029758214508e-07, + "loss": 0.8882, + "step": 164630 + }, + { + "epoch": 12.758340113913752, + "grad_norm": 1.4950638499681184, + "learning_rate": 6.379417234965902e-07, + "loss": 0.8781, + "step": 164640 + }, + { + "epoch": 12.759115037390057, + "grad_norm": 1.4906881974551691, + "learning_rate": 6.379804711717299e-07, + "loss": 0.8852, + "step": 164650 + }, + { + "epoch": 12.759889960866364, + "grad_norm": 1.3922424448144488, + "learning_rate": 6.380192188468693e-07, + "loss": 0.8777, + "step": 164660 + }, + { + "epoch": 12.76066488434267, + "grad_norm": 1.4420738873239707, + "learning_rate": 6.380579665220088e-07, + "loss": 0.8791, + "step": 164670 + }, + { + "epoch": 12.761439807818977, + "grad_norm": 1.446852241275553, + "learning_rate": 6.380967141971482e-07, + "loss": 0.8726, + "step": 164680 + }, + { + "epoch": 12.762214731295284, + "grad_norm": 1.5712840154973269, + "learning_rate": 6.381354618722877e-07, + "loss": 0.8871, + "step": 164690 + }, + { + "epoch": 12.76298965477159, + "grad_norm": 1.4242633845831068, + "learning_rate": 6.381742095474273e-07, + "loss": 0.8788, + "step": 164700 + }, + { + "epoch": 12.763764578247898, + "grad_norm": 1.4614273057103118, + "learning_rate": 6.382129572225668e-07, + "loss": 0.8719, + "step": 164710 + }, + { + "epoch": 12.764539501724204, + "grad_norm": 1.4163065377675153, + "learning_rate": 6.382517048977062e-07, + "loss": 0.8638, + "step": 164720 + }, + { + "epoch": 12.765314425200511, + "grad_norm": 1.4882880050695984, + "learning_rate": 6.382904525728457e-07, + "loss": 0.8762, + "step": 164730 + }, + { + "epoch": 12.766089348676818, + "grad_norm": 1.4750276136354163, + "learning_rate": 6.383292002479851e-07, + "loss": 0.8584, + "step": 164740 + }, + { + "epoch": 12.766864272153125, + "grad_norm": 1.4666785348031333, + "learning_rate": 6.383679479231248e-07, + "loss": 0.893, + "step": 164750 + }, + { + "epoch": 12.767639195629432, + "grad_norm": 1.5042543272047473, + "learning_rate": 6.384066955982642e-07, + "loss": 0.8973, + "step": 164760 + }, + { + "epoch": 12.768414119105739, + "grad_norm": 1.4940105676626714, + "learning_rate": 6.384454432734037e-07, + "loss": 0.8824, + "step": 164770 + }, + { + "epoch": 12.769189042582045, + "grad_norm": 1.4428085958894488, + "learning_rate": 6.384841909485431e-07, + "loss": 0.8854, + "step": 164780 + }, + { + "epoch": 12.769963966058352, + "grad_norm": 1.4545156219745412, + "learning_rate": 6.385229386236827e-07, + "loss": 0.8742, + "step": 164790 + }, + { + "epoch": 12.770738889534659, + "grad_norm": 1.465151859070438, + "learning_rate": 6.385616862988222e-07, + "loss": 0.8815, + "step": 164800 + }, + { + "epoch": 12.771513813010966, + "grad_norm": 1.4865213273457758, + "learning_rate": 6.386004339739617e-07, + "loss": 0.8843, + "step": 164810 + }, + { + "epoch": 12.772288736487273, + "grad_norm": 1.4161459892552783, + "learning_rate": 6.386391816491011e-07, + "loss": 0.8858, + "step": 164820 + }, + { + "epoch": 12.77306365996358, + "grad_norm": 1.5159396618037506, + "learning_rate": 6.386779293242406e-07, + "loss": 0.8716, + "step": 164830 + }, + { + "epoch": 12.773838583439886, + "grad_norm": 1.5189406459139922, + "learning_rate": 6.387166769993801e-07, + "loss": 0.8792, + "step": 164840 + }, + { + "epoch": 12.774613506916191, + "grad_norm": 1.5770260789333292, + "learning_rate": 6.387554246745197e-07, + "loss": 0.8688, + "step": 164850 + }, + { + "epoch": 12.775388430392498, + "grad_norm": 1.5731554774966336, + "learning_rate": 6.387941723496591e-07, + "loss": 0.8643, + "step": 164860 + }, + { + "epoch": 12.776163353868805, + "grad_norm": 1.4980666750628766, + "learning_rate": 6.388329200247986e-07, + "loss": 0.8779, + "step": 164870 + }, + { + "epoch": 12.776938277345112, + "grad_norm": 1.545487637793413, + "learning_rate": 6.38871667699938e-07, + "loss": 0.8841, + "step": 164880 + }, + { + "epoch": 12.777713200821418, + "grad_norm": 1.5051446216093136, + "learning_rate": 6.389104153750776e-07, + "loss": 0.891, + "step": 164890 + }, + { + "epoch": 12.778488124297725, + "grad_norm": 1.5296314081061932, + "learning_rate": 6.389491630502171e-07, + "loss": 0.8785, + "step": 164900 + }, + { + "epoch": 12.779263047774032, + "grad_norm": 1.5757684662539135, + "learning_rate": 6.389879107253566e-07, + "loss": 0.8838, + "step": 164910 + }, + { + "epoch": 12.780037971250339, + "grad_norm": 1.5253904426047753, + "learning_rate": 6.39026658400496e-07, + "loss": 0.8832, + "step": 164920 + }, + { + "epoch": 12.780812894726646, + "grad_norm": 1.443923992288561, + "learning_rate": 6.390654060756356e-07, + "loss": 0.8743, + "step": 164930 + }, + { + "epoch": 12.781587818202953, + "grad_norm": 1.4948772264899417, + "learning_rate": 6.39104153750775e-07, + "loss": 0.8656, + "step": 164940 + }, + { + "epoch": 12.78236274167926, + "grad_norm": 1.4905346453408743, + "learning_rate": 6.391429014259146e-07, + "loss": 0.882, + "step": 164950 + }, + { + "epoch": 12.783137665155566, + "grad_norm": 1.4711109498756119, + "learning_rate": 6.39181649101054e-07, + "loss": 0.8884, + "step": 164960 + }, + { + "epoch": 12.783912588631873, + "grad_norm": 1.4996351058640354, + "learning_rate": 6.392203967761935e-07, + "loss": 0.8746, + "step": 164970 + }, + { + "epoch": 12.78468751210818, + "grad_norm": 1.4352647079191303, + "learning_rate": 6.392591444513329e-07, + "loss": 0.8808, + "step": 164980 + }, + { + "epoch": 12.785462435584487, + "grad_norm": 1.451371408839607, + "learning_rate": 6.392978921264725e-07, + "loss": 0.8758, + "step": 164990 + }, + { + "epoch": 12.786237359060793, + "grad_norm": 1.4950738579280858, + "learning_rate": 6.39336639801612e-07, + "loss": 0.8845, + "step": 165000 + }, + { + "epoch": 12.786237359060793, + "eval_loss": 0.8965321183204651, + "eval_runtime": 327.8225, + "eval_samples_per_second": 34.992, + "eval_steps_per_second": 8.749, + "step": 165000 + }, + { + "epoch": 12.7870122825371, + "grad_norm": 1.4896307535631994, + "learning_rate": 6.393753874767515e-07, + "loss": 0.8804, + "step": 165010 + }, + { + "epoch": 12.787787206013405, + "grad_norm": 1.330258619967187, + "learning_rate": 6.394141351518909e-07, + "loss": 0.9075, + "step": 165020 + }, + { + "epoch": 12.788562129489712, + "grad_norm": 1.467427100793066, + "learning_rate": 6.394528828270305e-07, + "loss": 0.867, + "step": 165030 + }, + { + "epoch": 12.789337052966019, + "grad_norm": 1.6329900503474437, + "learning_rate": 6.394916305021699e-07, + "loss": 0.8751, + "step": 165040 + }, + { + "epoch": 12.790111976442326, + "grad_norm": 1.4574129160738227, + "learning_rate": 6.395303781773095e-07, + "loss": 0.8735, + "step": 165050 + }, + { + "epoch": 12.790886899918632, + "grad_norm": 1.4984700108001119, + "learning_rate": 6.395691258524489e-07, + "loss": 0.874, + "step": 165060 + }, + { + "epoch": 12.79166182339494, + "grad_norm": 1.503290186132937, + "learning_rate": 6.396078735275885e-07, + "loss": 0.8692, + "step": 165070 + }, + { + "epoch": 12.792436746871246, + "grad_norm": 1.550197380062839, + "learning_rate": 6.396466212027279e-07, + "loss": 0.8857, + "step": 165080 + }, + { + "epoch": 12.793211670347553, + "grad_norm": 1.4665956278199648, + "learning_rate": 6.396853688778674e-07, + "loss": 0.883, + "step": 165090 + }, + { + "epoch": 12.79398659382386, + "grad_norm": 1.4042674018844687, + "learning_rate": 6.397241165530069e-07, + "loss": 0.8732, + "step": 165100 + }, + { + "epoch": 12.794761517300167, + "grad_norm": 1.38111104666661, + "learning_rate": 6.397628642281464e-07, + "loss": 0.874, + "step": 165110 + }, + { + "epoch": 12.795536440776473, + "grad_norm": 1.404015047801046, + "learning_rate": 6.398016119032858e-07, + "loss": 0.8774, + "step": 165120 + }, + { + "epoch": 12.79631136425278, + "grad_norm": 1.5126375695286556, + "learning_rate": 6.398403595784254e-07, + "loss": 0.8744, + "step": 165130 + }, + { + "epoch": 12.797086287729087, + "grad_norm": 1.5903385833044483, + "learning_rate": 6.398791072535648e-07, + "loss": 0.8732, + "step": 165140 + }, + { + "epoch": 12.797861211205394, + "grad_norm": 1.5539404272557285, + "learning_rate": 6.399178549287044e-07, + "loss": 0.8755, + "step": 165150 + }, + { + "epoch": 12.7986361346817, + "grad_norm": 1.5426275634023008, + "learning_rate": 6.399566026038438e-07, + "loss": 0.869, + "step": 165160 + }, + { + "epoch": 12.799411058158007, + "grad_norm": 1.4989248251866556, + "learning_rate": 6.399953502789834e-07, + "loss": 0.8968, + "step": 165170 + }, + { + "epoch": 12.800185981634314, + "grad_norm": 1.50391304754129, + "learning_rate": 6.400340979541228e-07, + "loss": 0.8738, + "step": 165180 + }, + { + "epoch": 12.800960905110621, + "grad_norm": 1.4657702708102138, + "learning_rate": 6.400728456292623e-07, + "loss": 0.9028, + "step": 165190 + }, + { + "epoch": 12.801735828586928, + "grad_norm": 1.5398152480907212, + "learning_rate": 6.401115933044018e-07, + "loss": 0.8948, + "step": 165200 + }, + { + "epoch": 12.802510752063235, + "grad_norm": 1.4673751472354088, + "learning_rate": 6.401503409795414e-07, + "loss": 0.8689, + "step": 165210 + }, + { + "epoch": 12.80328567553954, + "grad_norm": 1.4811467732293515, + "learning_rate": 6.401890886546808e-07, + "loss": 0.8818, + "step": 165220 + }, + { + "epoch": 12.804060599015846, + "grad_norm": 1.516609764744015, + "learning_rate": 6.402278363298203e-07, + "loss": 0.8774, + "step": 165230 + }, + { + "epoch": 12.804835522492153, + "grad_norm": 1.563290187196367, + "learning_rate": 6.402665840049597e-07, + "loss": 0.8703, + "step": 165240 + }, + { + "epoch": 12.80561044596846, + "grad_norm": 1.455658005310157, + "learning_rate": 6.403053316800993e-07, + "loss": 0.8593, + "step": 165250 + }, + { + "epoch": 12.806385369444767, + "grad_norm": 1.4541924178860428, + "learning_rate": 6.403440793552387e-07, + "loss": 0.8887, + "step": 165260 + }, + { + "epoch": 12.807160292921074, + "grad_norm": 1.523150230333266, + "learning_rate": 6.403828270303783e-07, + "loss": 0.8651, + "step": 165270 + }, + { + "epoch": 12.80793521639738, + "grad_norm": 1.5129048401898328, + "learning_rate": 6.404215747055177e-07, + "loss": 0.868, + "step": 165280 + }, + { + "epoch": 12.808710139873687, + "grad_norm": 1.6968222330473037, + "learning_rate": 6.404603223806573e-07, + "loss": 0.8854, + "step": 165290 + }, + { + "epoch": 12.809485063349994, + "grad_norm": 1.497847038251656, + "learning_rate": 6.404990700557967e-07, + "loss": 0.8897, + "step": 165300 + }, + { + "epoch": 12.810259986826301, + "grad_norm": 1.4972085754700812, + "learning_rate": 6.405378177309363e-07, + "loss": 0.8872, + "step": 165310 + }, + { + "epoch": 12.811034910302608, + "grad_norm": 1.4772782665098456, + "learning_rate": 6.405765654060757e-07, + "loss": 0.8794, + "step": 165320 + }, + { + "epoch": 12.811809833778915, + "grad_norm": 1.5011971997122335, + "learning_rate": 6.406153130812152e-07, + "loss": 0.8832, + "step": 165330 + }, + { + "epoch": 12.812584757255221, + "grad_norm": 1.492349877482656, + "learning_rate": 6.406540607563546e-07, + "loss": 0.8718, + "step": 165340 + }, + { + "epoch": 12.813359680731528, + "grad_norm": 1.5012242227168777, + "learning_rate": 6.406928084314943e-07, + "loss": 0.8913, + "step": 165350 + }, + { + "epoch": 12.814134604207835, + "grad_norm": 1.4649601960317498, + "learning_rate": 6.407315561066337e-07, + "loss": 0.8658, + "step": 165360 + }, + { + "epoch": 12.814909527684142, + "grad_norm": 1.5913298161385199, + "learning_rate": 6.407703037817732e-07, + "loss": 0.9109, + "step": 165370 + }, + { + "epoch": 12.815684451160449, + "grad_norm": 1.4457732268346568, + "learning_rate": 6.408090514569126e-07, + "loss": 0.8761, + "step": 165380 + }, + { + "epoch": 12.816459374636755, + "grad_norm": 1.4370856307821323, + "learning_rate": 6.408477991320522e-07, + "loss": 0.9007, + "step": 165390 + }, + { + "epoch": 12.81723429811306, + "grad_norm": 1.4948464187070771, + "learning_rate": 6.408865468071916e-07, + "loss": 0.8793, + "step": 165400 + }, + { + "epoch": 12.818009221589367, + "grad_norm": 1.3597195838542249, + "learning_rate": 6.409252944823312e-07, + "loss": 0.8944, + "step": 165410 + }, + { + "epoch": 12.818784145065674, + "grad_norm": 1.4755197261138329, + "learning_rate": 6.409640421574706e-07, + "loss": 0.8707, + "step": 165420 + }, + { + "epoch": 12.819559068541981, + "grad_norm": 1.3744882245233903, + "learning_rate": 6.410027898326101e-07, + "loss": 0.8648, + "step": 165430 + }, + { + "epoch": 12.820333992018288, + "grad_norm": 1.3958738829791322, + "learning_rate": 6.410415375077496e-07, + "loss": 0.8796, + "step": 165440 + }, + { + "epoch": 12.821108915494595, + "grad_norm": 1.4031609116666544, + "learning_rate": 6.410802851828892e-07, + "loss": 0.8724, + "step": 165450 + }, + { + "epoch": 12.821883838970901, + "grad_norm": 1.524048696757876, + "learning_rate": 6.411190328580286e-07, + "loss": 0.871, + "step": 165460 + }, + { + "epoch": 12.822658762447208, + "grad_norm": 1.5532384437166218, + "learning_rate": 6.411577805331681e-07, + "loss": 0.8811, + "step": 165470 + }, + { + "epoch": 12.823433685923515, + "grad_norm": 1.5189508026433223, + "learning_rate": 6.411965282083075e-07, + "loss": 0.8772, + "step": 165480 + }, + { + "epoch": 12.824208609399822, + "grad_norm": 1.4865786669446133, + "learning_rate": 6.412352758834472e-07, + "loss": 0.858, + "step": 165490 + }, + { + "epoch": 12.824983532876129, + "grad_norm": 1.4550693635941039, + "learning_rate": 6.412740235585866e-07, + "loss": 0.876, + "step": 165500 + }, + { + "epoch": 12.824983532876129, + "eval_loss": 0.8963144421577454, + "eval_runtime": 327.6067, + "eval_samples_per_second": 35.015, + "eval_steps_per_second": 8.754, + "step": 165500 + }, + { + "epoch": 12.825758456352435, + "grad_norm": 1.5291889663502232, + "learning_rate": 6.413127712337261e-07, + "loss": 0.8912, + "step": 165510 + }, + { + "epoch": 12.826533379828742, + "grad_norm": 1.55345586049871, + "learning_rate": 6.413515189088655e-07, + "loss": 0.8819, + "step": 165520 + }, + { + "epoch": 12.827308303305049, + "grad_norm": 1.44690750784066, + "learning_rate": 6.41390266584005e-07, + "loss": 0.8741, + "step": 165530 + }, + { + "epoch": 12.828083226781356, + "grad_norm": 1.5175566788767807, + "learning_rate": 6.414290142591445e-07, + "loss": 0.8851, + "step": 165540 + }, + { + "epoch": 12.828858150257663, + "grad_norm": 1.508546869970819, + "learning_rate": 6.414677619342841e-07, + "loss": 0.8928, + "step": 165550 + }, + { + "epoch": 12.82963307373397, + "grad_norm": 1.5045208990311811, + "learning_rate": 6.415065096094235e-07, + "loss": 0.8832, + "step": 165560 + }, + { + "epoch": 12.830407997210276, + "grad_norm": 1.517518445023636, + "learning_rate": 6.41545257284563e-07, + "loss": 0.8727, + "step": 165570 + }, + { + "epoch": 12.831182920686583, + "grad_norm": 1.4814914552262237, + "learning_rate": 6.415840049597024e-07, + "loss": 0.8779, + "step": 165580 + }, + { + "epoch": 12.831957844162888, + "grad_norm": 1.4147560488278614, + "learning_rate": 6.416227526348421e-07, + "loss": 0.8824, + "step": 165590 + }, + { + "epoch": 12.832732767639195, + "grad_norm": 1.5964997021107379, + "learning_rate": 6.416615003099815e-07, + "loss": 0.8928, + "step": 165600 + }, + { + "epoch": 12.833507691115502, + "grad_norm": 1.6857717138945327, + "learning_rate": 6.41700247985121e-07, + "loss": 0.8897, + "step": 165610 + }, + { + "epoch": 12.834282614591809, + "grad_norm": 1.4342449679781641, + "learning_rate": 6.417389956602604e-07, + "loss": 0.8664, + "step": 165620 + }, + { + "epoch": 12.835057538068115, + "grad_norm": 1.5077745991122036, + "learning_rate": 6.417777433354e-07, + "loss": 0.8796, + "step": 165630 + }, + { + "epoch": 12.835832461544422, + "grad_norm": 1.4997228116902757, + "learning_rate": 6.418164910105395e-07, + "loss": 0.8891, + "step": 165640 + }, + { + "epoch": 12.836607385020729, + "grad_norm": 1.5050019977134637, + "learning_rate": 6.41855238685679e-07, + "loss": 0.8714, + "step": 165650 + }, + { + "epoch": 12.837382308497036, + "grad_norm": 1.4828001056401587, + "learning_rate": 6.418939863608184e-07, + "loss": 0.866, + "step": 165660 + }, + { + "epoch": 12.838157231973343, + "grad_norm": 1.542244914514578, + "learning_rate": 6.419327340359579e-07, + "loss": 0.8856, + "step": 165670 + }, + { + "epoch": 12.83893215544965, + "grad_norm": 1.5672041744634704, + "learning_rate": 6.419714817110973e-07, + "loss": 0.8938, + "step": 165680 + }, + { + "epoch": 12.839707078925956, + "grad_norm": 1.4733817045814797, + "learning_rate": 6.42010229386237e-07, + "loss": 0.8901, + "step": 165690 + }, + { + "epoch": 12.840482002402263, + "grad_norm": 1.434438847408778, + "learning_rate": 6.420489770613764e-07, + "loss": 0.8618, + "step": 165700 + }, + { + "epoch": 12.84125692587857, + "grad_norm": 1.5039462672976378, + "learning_rate": 6.420877247365159e-07, + "loss": 0.8832, + "step": 165710 + }, + { + "epoch": 12.842031849354877, + "grad_norm": 1.4969682297313616, + "learning_rate": 6.421264724116553e-07, + "loss": 0.8809, + "step": 165720 + }, + { + "epoch": 12.842806772831183, + "grad_norm": 1.5147572551435244, + "learning_rate": 6.42165220086795e-07, + "loss": 0.8797, + "step": 165730 + }, + { + "epoch": 12.84358169630749, + "grad_norm": 1.4814768890601953, + "learning_rate": 6.422039677619344e-07, + "loss": 0.8752, + "step": 165740 + }, + { + "epoch": 12.844356619783797, + "grad_norm": 1.5117341757939473, + "learning_rate": 6.422427154370739e-07, + "loss": 0.8763, + "step": 165750 + }, + { + "epoch": 12.845131543260104, + "grad_norm": 1.4620970937994062, + "learning_rate": 6.422814631122133e-07, + "loss": 0.8871, + "step": 165760 + }, + { + "epoch": 12.845906466736409, + "grad_norm": 1.477211943569613, + "learning_rate": 6.423202107873529e-07, + "loss": 0.8777, + "step": 165770 + }, + { + "epoch": 12.846681390212716, + "grad_norm": 1.5126536040679024, + "learning_rate": 6.423589584624923e-07, + "loss": 0.8931, + "step": 165780 + }, + { + "epoch": 12.847456313689023, + "grad_norm": 1.4792333399416004, + "learning_rate": 6.423977061376319e-07, + "loss": 0.8717, + "step": 165790 + }, + { + "epoch": 12.84823123716533, + "grad_norm": 1.5020501798369479, + "learning_rate": 6.424364538127713e-07, + "loss": 0.8863, + "step": 165800 + }, + { + "epoch": 12.849006160641636, + "grad_norm": 1.5222839615913069, + "learning_rate": 6.424752014879108e-07, + "loss": 0.9047, + "step": 165810 + }, + { + "epoch": 12.849781084117943, + "grad_norm": 1.4856665806692169, + "learning_rate": 6.425139491630502e-07, + "loss": 0.8663, + "step": 165820 + }, + { + "epoch": 12.85055600759425, + "grad_norm": 1.546726361888305, + "learning_rate": 6.425526968381898e-07, + "loss": 0.8924, + "step": 165830 + }, + { + "epoch": 12.851330931070557, + "grad_norm": 1.500647099872904, + "learning_rate": 6.425914445133293e-07, + "loss": 0.8864, + "step": 165840 + }, + { + "epoch": 12.852105854546863, + "grad_norm": 1.5504044320034345, + "learning_rate": 6.426301921884688e-07, + "loss": 0.8971, + "step": 165850 + }, + { + "epoch": 12.85288077802317, + "grad_norm": 1.4760826950605332, + "learning_rate": 6.426689398636082e-07, + "loss": 0.8809, + "step": 165860 + }, + { + "epoch": 12.853655701499477, + "grad_norm": 1.3998093912241714, + "learning_rate": 6.427076875387478e-07, + "loss": 0.8884, + "step": 165870 + }, + { + "epoch": 12.854430624975784, + "grad_norm": 1.4828955209071009, + "learning_rate": 6.427464352138872e-07, + "loss": 0.8769, + "step": 165880 + }, + { + "epoch": 12.85520554845209, + "grad_norm": 1.4175576630944533, + "learning_rate": 6.427851828890268e-07, + "loss": 0.8763, + "step": 165890 + }, + { + "epoch": 12.855980471928397, + "grad_norm": 1.4565796428947715, + "learning_rate": 6.428239305641662e-07, + "loss": 0.8864, + "step": 165900 + }, + { + "epoch": 12.856755395404704, + "grad_norm": 1.3940112821919073, + "learning_rate": 6.428626782393057e-07, + "loss": 0.8744, + "step": 165910 + }, + { + "epoch": 12.857530318881011, + "grad_norm": 1.5090866449539355, + "learning_rate": 6.429014259144452e-07, + "loss": 0.8657, + "step": 165920 + }, + { + "epoch": 12.858305242357318, + "grad_norm": 1.4656230618306725, + "learning_rate": 6.429401735895848e-07, + "loss": 0.8613, + "step": 165930 + }, + { + "epoch": 12.859080165833625, + "grad_norm": 1.5547083790492129, + "learning_rate": 6.429789212647242e-07, + "loss": 0.8895, + "step": 165940 + }, + { + "epoch": 12.859855089309931, + "grad_norm": 1.4908508464488461, + "learning_rate": 6.430176689398637e-07, + "loss": 0.8794, + "step": 165950 + }, + { + "epoch": 12.860630012786238, + "grad_norm": 1.4905162570942265, + "learning_rate": 6.430564166150031e-07, + "loss": 0.8782, + "step": 165960 + }, + { + "epoch": 12.861404936262543, + "grad_norm": 1.5029121427391559, + "learning_rate": 6.430951642901427e-07, + "loss": 0.8952, + "step": 165970 + }, + { + "epoch": 12.86217985973885, + "grad_norm": 1.4227877929737291, + "learning_rate": 6.431339119652821e-07, + "loss": 0.8881, + "step": 165980 + }, + { + "epoch": 12.862954783215157, + "grad_norm": 1.5232146765382966, + "learning_rate": 6.431726596404217e-07, + "loss": 0.8894, + "step": 165990 + }, + { + "epoch": 12.863729706691464, + "grad_norm": 1.542225725109648, + "learning_rate": 6.432114073155611e-07, + "loss": 0.8665, + "step": 166000 + }, + { + "epoch": 12.863729706691464, + "eval_loss": 0.8963072896003723, + "eval_runtime": 331.3636, + "eval_samples_per_second": 34.618, + "eval_steps_per_second": 8.655, + "step": 166000 + }, + { + "epoch": 12.86450463016777, + "grad_norm": 1.5279784364019955, + "learning_rate": 6.432501549907007e-07, + "loss": 0.8669, + "step": 166010 + }, + { + "epoch": 12.865279553644077, + "grad_norm": 1.483564216634991, + "learning_rate": 6.432889026658401e-07, + "loss": 0.8737, + "step": 166020 + }, + { + "epoch": 12.866054477120384, + "grad_norm": 1.494788820571028, + "learning_rate": 6.433276503409797e-07, + "loss": 0.8744, + "step": 166030 + }, + { + "epoch": 12.866829400596691, + "grad_norm": 1.4915625020793404, + "learning_rate": 6.433663980161191e-07, + "loss": 0.8867, + "step": 166040 + }, + { + "epoch": 12.867604324072998, + "grad_norm": 1.4191449630562758, + "learning_rate": 6.434051456912586e-07, + "loss": 0.8713, + "step": 166050 + }, + { + "epoch": 12.868379247549305, + "grad_norm": 1.5739523180975632, + "learning_rate": 6.434438933663981e-07, + "loss": 0.8769, + "step": 166060 + }, + { + "epoch": 12.869154171025611, + "grad_norm": 1.4739119977597015, + "learning_rate": 6.434826410415376e-07, + "loss": 0.8788, + "step": 166070 + }, + { + "epoch": 12.869929094501918, + "grad_norm": 1.505628459367304, + "learning_rate": 6.43521388716677e-07, + "loss": 0.8706, + "step": 166080 + }, + { + "epoch": 12.870704017978225, + "grad_norm": 1.5002012461714134, + "learning_rate": 6.435601363918166e-07, + "loss": 0.8718, + "step": 166090 + }, + { + "epoch": 12.871478941454532, + "grad_norm": 1.4172769726805954, + "learning_rate": 6.43598884066956e-07, + "loss": 0.8709, + "step": 166100 + }, + { + "epoch": 12.872253864930839, + "grad_norm": 1.598127945970098, + "learning_rate": 6.436376317420956e-07, + "loss": 0.8851, + "step": 166110 + }, + { + "epoch": 12.873028788407145, + "grad_norm": 1.4985900548340363, + "learning_rate": 6.43676379417235e-07, + "loss": 0.8824, + "step": 166120 + }, + { + "epoch": 12.873803711883452, + "grad_norm": 1.5205378667515748, + "learning_rate": 6.437151270923746e-07, + "loss": 0.9004, + "step": 166130 + }, + { + "epoch": 12.874578635359757, + "grad_norm": 1.4962385709353887, + "learning_rate": 6.43753874767514e-07, + "loss": 0.8699, + "step": 166140 + }, + { + "epoch": 12.875353558836064, + "grad_norm": 1.412699046734035, + "learning_rate": 6.437926224426536e-07, + "loss": 0.8819, + "step": 166150 + }, + { + "epoch": 12.876128482312371, + "grad_norm": 1.7222461806019953, + "learning_rate": 6.43831370117793e-07, + "loss": 0.9005, + "step": 166160 + }, + { + "epoch": 12.876903405788678, + "grad_norm": 1.5166272613321563, + "learning_rate": 6.438701177929325e-07, + "loss": 0.9039, + "step": 166170 + }, + { + "epoch": 12.877678329264985, + "grad_norm": 1.4478067793261387, + "learning_rate": 6.43908865468072e-07, + "loss": 0.8958, + "step": 166180 + }, + { + "epoch": 12.878453252741291, + "grad_norm": 1.4524554528233817, + "learning_rate": 6.439476131432115e-07, + "loss": 0.8821, + "step": 166190 + }, + { + "epoch": 12.879228176217598, + "grad_norm": 1.4293300098700008, + "learning_rate": 6.43986360818351e-07, + "loss": 0.8656, + "step": 166200 + }, + { + "epoch": 12.880003099693905, + "grad_norm": 1.5942314426937856, + "learning_rate": 6.440251084934905e-07, + "loss": 0.896, + "step": 166210 + }, + { + "epoch": 12.880778023170212, + "grad_norm": 1.5150890066735145, + "learning_rate": 6.440638561686299e-07, + "loss": 0.8768, + "step": 166220 + }, + { + "epoch": 12.881552946646519, + "grad_norm": 1.4998652548863687, + "learning_rate": 6.441026038437695e-07, + "loss": 0.877, + "step": 166230 + }, + { + "epoch": 12.882327870122825, + "grad_norm": 1.4631961169560146, + "learning_rate": 6.441413515189089e-07, + "loss": 0.88, + "step": 166240 + }, + { + "epoch": 12.883102793599132, + "grad_norm": 1.4216939916100109, + "learning_rate": 6.441800991940485e-07, + "loss": 0.8952, + "step": 166250 + }, + { + "epoch": 12.883877717075439, + "grad_norm": 1.4861691725480946, + "learning_rate": 6.442188468691879e-07, + "loss": 0.8689, + "step": 166260 + }, + { + "epoch": 12.884652640551746, + "grad_norm": 1.456621078026377, + "learning_rate": 6.442575945443274e-07, + "loss": 0.8794, + "step": 166270 + }, + { + "epoch": 12.885427564028053, + "grad_norm": 1.541406775353355, + "learning_rate": 6.442963422194669e-07, + "loss": 0.8799, + "step": 166280 + }, + { + "epoch": 12.88620248750436, + "grad_norm": 1.5047775192273323, + "learning_rate": 6.443350898946065e-07, + "loss": 0.8685, + "step": 166290 + }, + { + "epoch": 12.886977410980666, + "grad_norm": 1.4065203453086221, + "learning_rate": 6.443738375697459e-07, + "loss": 0.8773, + "step": 166300 + }, + { + "epoch": 12.887752334456973, + "grad_norm": 1.541201796928645, + "learning_rate": 6.444125852448854e-07, + "loss": 0.8586, + "step": 166310 + }, + { + "epoch": 12.88852725793328, + "grad_norm": 1.764987339729295, + "learning_rate": 6.444513329200248e-07, + "loss": 0.8926, + "step": 166320 + }, + { + "epoch": 12.889302181409587, + "grad_norm": 1.5056547161422618, + "learning_rate": 6.444900805951644e-07, + "loss": 0.87, + "step": 166330 + }, + { + "epoch": 12.890077104885892, + "grad_norm": 1.5003446076517375, + "learning_rate": 6.445288282703039e-07, + "loss": 0.8783, + "step": 166340 + }, + { + "epoch": 12.890852028362199, + "grad_norm": 1.494276240992606, + "learning_rate": 6.445675759454434e-07, + "loss": 0.8855, + "step": 166350 + }, + { + "epoch": 12.891626951838505, + "grad_norm": 1.4826350970978592, + "learning_rate": 6.446063236205828e-07, + "loss": 0.8873, + "step": 166360 + }, + { + "epoch": 12.892401875314812, + "grad_norm": 1.4600616293523228, + "learning_rate": 6.446450712957223e-07, + "loss": 0.8724, + "step": 166370 + }, + { + "epoch": 12.893176798791119, + "grad_norm": 1.64511564199321, + "learning_rate": 6.446838189708618e-07, + "loss": 0.8566, + "step": 166380 + }, + { + "epoch": 12.893951722267426, + "grad_norm": 1.5135535524162247, + "learning_rate": 6.447225666460014e-07, + "loss": 0.866, + "step": 166390 + }, + { + "epoch": 12.894726645743733, + "grad_norm": 1.5407399806801183, + "learning_rate": 6.447613143211408e-07, + "loss": 0.8887, + "step": 166400 + }, + { + "epoch": 12.89550156922004, + "grad_norm": 1.5045627455954949, + "learning_rate": 6.448000619962803e-07, + "loss": 0.882, + "step": 166410 + }, + { + "epoch": 12.896276492696346, + "grad_norm": 1.5174950284625681, + "learning_rate": 6.448388096714197e-07, + "loss": 0.8837, + "step": 166420 + }, + { + "epoch": 12.897051416172653, + "grad_norm": 1.5080414149781838, + "learning_rate": 6.448775573465594e-07, + "loss": 0.865, + "step": 166430 + }, + { + "epoch": 12.89782633964896, + "grad_norm": 1.507445410352626, + "learning_rate": 6.449163050216988e-07, + "loss": 0.9009, + "step": 166440 + }, + { + "epoch": 12.898601263125267, + "grad_norm": 1.4563419239197721, + "learning_rate": 6.449550526968383e-07, + "loss": 0.8805, + "step": 166450 + }, + { + "epoch": 12.899376186601573, + "grad_norm": 1.5885408905105036, + "learning_rate": 6.449938003719777e-07, + "loss": 0.894, + "step": 166460 + }, + { + "epoch": 12.90015111007788, + "grad_norm": 1.460269719099826, + "learning_rate": 6.450325480471172e-07, + "loss": 0.8766, + "step": 166470 + }, + { + "epoch": 12.900926033554187, + "grad_norm": 1.5003970551758157, + "learning_rate": 6.450712957222567e-07, + "loss": 0.8894, + "step": 166480 + }, + { + "epoch": 12.901700957030494, + "grad_norm": 1.3147845862915115, + "learning_rate": 6.451100433973963e-07, + "loss": 0.87, + "step": 166490 + }, + { + "epoch": 12.9024758805068, + "grad_norm": 1.4242965281227127, + "learning_rate": 6.451487910725357e-07, + "loss": 0.8819, + "step": 166500 + }, + { + "epoch": 12.9024758805068, + "eval_loss": 0.8960025310516357, + "eval_runtime": 331.7854, + "eval_samples_per_second": 34.574, + "eval_steps_per_second": 8.644, + "step": 166500 + }, + { + "epoch": 12.903250803983106, + "grad_norm": 1.5483789779331867, + "learning_rate": 6.451875387476752e-07, + "loss": 0.8661, + "step": 166510 + }, + { + "epoch": 12.904025727459413, + "grad_norm": 1.528181830453329, + "learning_rate": 6.452262864228146e-07, + "loss": 0.8886, + "step": 166520 + }, + { + "epoch": 12.90480065093572, + "grad_norm": 1.500865893573986, + "learning_rate": 6.452650340979543e-07, + "loss": 0.8691, + "step": 166530 + }, + { + "epoch": 12.905575574412026, + "grad_norm": 1.40984437616456, + "learning_rate": 6.453037817730937e-07, + "loss": 0.8661, + "step": 166540 + }, + { + "epoch": 12.906350497888333, + "grad_norm": 1.5684500893973128, + "learning_rate": 6.453425294482332e-07, + "loss": 0.8534, + "step": 166550 + }, + { + "epoch": 12.90712542136464, + "grad_norm": 1.684503443388896, + "learning_rate": 6.453812771233726e-07, + "loss": 0.8821, + "step": 166560 + }, + { + "epoch": 12.907900344840947, + "grad_norm": 1.5023795332402896, + "learning_rate": 6.454200247985123e-07, + "loss": 0.8794, + "step": 166570 + }, + { + "epoch": 12.908675268317253, + "grad_norm": 1.5447773154009434, + "learning_rate": 6.454587724736517e-07, + "loss": 0.8695, + "step": 166580 + }, + { + "epoch": 12.90945019179356, + "grad_norm": 1.5104614399252825, + "learning_rate": 6.454975201487912e-07, + "loss": 0.8828, + "step": 166590 + }, + { + "epoch": 12.910225115269867, + "grad_norm": 1.5290627033419244, + "learning_rate": 6.455362678239306e-07, + "loss": 0.8742, + "step": 166600 + }, + { + "epoch": 12.911000038746174, + "grad_norm": 1.43363159975382, + "learning_rate": 6.455750154990701e-07, + "loss": 0.8814, + "step": 166610 + }, + { + "epoch": 12.91177496222248, + "grad_norm": 1.5065377132286424, + "learning_rate": 6.456137631742095e-07, + "loss": 0.8802, + "step": 166620 + }, + { + "epoch": 12.912549885698787, + "grad_norm": 1.633070001612639, + "learning_rate": 6.456525108493492e-07, + "loss": 0.868, + "step": 166630 + }, + { + "epoch": 12.913324809175094, + "grad_norm": 1.4586008133814896, + "learning_rate": 6.456912585244886e-07, + "loss": 0.8798, + "step": 166640 + }, + { + "epoch": 12.914099732651401, + "grad_norm": 1.494187548822501, + "learning_rate": 6.457300061996281e-07, + "loss": 0.8706, + "step": 166650 + }, + { + "epoch": 12.914874656127708, + "grad_norm": 1.4181460442198501, + "learning_rate": 6.457687538747675e-07, + "loss": 0.8482, + "step": 166660 + }, + { + "epoch": 12.915649579604015, + "grad_norm": 1.497683165064285, + "learning_rate": 6.458075015499072e-07, + "loss": 0.9006, + "step": 166670 + }, + { + "epoch": 12.916424503080322, + "grad_norm": 1.5513269569404973, + "learning_rate": 6.458462492250466e-07, + "loss": 0.8981, + "step": 166680 + }, + { + "epoch": 12.917199426556628, + "grad_norm": 1.4301060870280438, + "learning_rate": 6.458849969001861e-07, + "loss": 0.8739, + "step": 166690 + }, + { + "epoch": 12.917974350032935, + "grad_norm": 1.426211646814986, + "learning_rate": 6.459237445753255e-07, + "loss": 0.8543, + "step": 166700 + }, + { + "epoch": 12.91874927350924, + "grad_norm": 1.5698786160336584, + "learning_rate": 6.459624922504651e-07, + "loss": 0.8666, + "step": 166710 + }, + { + "epoch": 12.919524196985547, + "grad_norm": 1.3993315162367617, + "learning_rate": 6.460012399256046e-07, + "loss": 0.8628, + "step": 166720 + }, + { + "epoch": 12.920299120461854, + "grad_norm": 1.4230539341292796, + "learning_rate": 6.460399876007441e-07, + "loss": 0.8616, + "step": 166730 + }, + { + "epoch": 12.92107404393816, + "grad_norm": 1.4196205686298975, + "learning_rate": 6.460787352758835e-07, + "loss": 0.8774, + "step": 166740 + }, + { + "epoch": 12.921848967414467, + "grad_norm": 1.4671849567419906, + "learning_rate": 6.46117482951023e-07, + "loss": 0.881, + "step": 166750 + }, + { + "epoch": 12.922623890890774, + "grad_norm": 1.4997101028855544, + "learning_rate": 6.461562306261624e-07, + "loss": 0.9212, + "step": 166760 + }, + { + "epoch": 12.923398814367081, + "grad_norm": 1.4636176046535503, + "learning_rate": 6.461949783013021e-07, + "loss": 0.8715, + "step": 166770 + }, + { + "epoch": 12.924173737843388, + "grad_norm": 1.4853452383117731, + "learning_rate": 6.462337259764415e-07, + "loss": 0.8848, + "step": 166780 + }, + { + "epoch": 12.924948661319695, + "grad_norm": 1.4963467689263166, + "learning_rate": 6.46272473651581e-07, + "loss": 0.8698, + "step": 166790 + }, + { + "epoch": 12.925723584796001, + "grad_norm": 1.4559995111868156, + "learning_rate": 6.463112213267204e-07, + "loss": 0.8786, + "step": 166800 + }, + { + "epoch": 12.926498508272308, + "grad_norm": 1.5139328720013387, + "learning_rate": 6.4634996900186e-07, + "loss": 0.8973, + "step": 166810 + }, + { + "epoch": 12.927273431748615, + "grad_norm": 1.6078875645424906, + "learning_rate": 6.463887166769995e-07, + "loss": 0.8849, + "step": 166820 + }, + { + "epoch": 12.928048355224922, + "grad_norm": 1.4473153502681682, + "learning_rate": 6.46427464352139e-07, + "loss": 0.8842, + "step": 166830 + }, + { + "epoch": 12.928823278701229, + "grad_norm": 1.4235025606410945, + "learning_rate": 6.464662120272784e-07, + "loss": 0.8732, + "step": 166840 + }, + { + "epoch": 12.929598202177536, + "grad_norm": 1.3654762489208467, + "learning_rate": 6.46504959702418e-07, + "loss": 0.8711, + "step": 166850 + }, + { + "epoch": 12.930373125653842, + "grad_norm": 1.5906687708790082, + "learning_rate": 6.465437073775574e-07, + "loss": 0.8925, + "step": 166860 + }, + { + "epoch": 12.93114804913015, + "grad_norm": 1.5341665671281302, + "learning_rate": 6.46582455052697e-07, + "loss": 0.883, + "step": 166870 + }, + { + "epoch": 12.931922972606454, + "grad_norm": 1.6843151502667448, + "learning_rate": 6.466212027278364e-07, + "loss": 0.8875, + "step": 166880 + }, + { + "epoch": 12.932697896082761, + "grad_norm": 1.453689927197826, + "learning_rate": 6.466599504029759e-07, + "loss": 0.8789, + "step": 166890 + }, + { + "epoch": 12.933472819559068, + "grad_norm": 1.5093029030088825, + "learning_rate": 6.466986980781153e-07, + "loss": 0.8787, + "step": 166900 + }, + { + "epoch": 12.934247743035375, + "grad_norm": 1.3883886668235674, + "learning_rate": 6.467374457532549e-07, + "loss": 0.8679, + "step": 166910 + }, + { + "epoch": 12.935022666511681, + "grad_norm": 1.4625099888424036, + "learning_rate": 6.467761934283944e-07, + "loss": 0.8841, + "step": 166920 + }, + { + "epoch": 12.935797589987988, + "grad_norm": 1.4280506489677192, + "learning_rate": 6.468149411035339e-07, + "loss": 0.8654, + "step": 166930 + }, + { + "epoch": 12.936572513464295, + "grad_norm": 1.5011684081278989, + "learning_rate": 6.468536887786733e-07, + "loss": 0.8571, + "step": 166940 + }, + { + "epoch": 12.937347436940602, + "grad_norm": 1.4575698793221434, + "learning_rate": 6.468924364538129e-07, + "loss": 0.8796, + "step": 166950 + }, + { + "epoch": 12.938122360416909, + "grad_norm": 1.4519253643853376, + "learning_rate": 6.469311841289523e-07, + "loss": 0.8864, + "step": 166960 + }, + { + "epoch": 12.938897283893215, + "grad_norm": 1.4921981332873688, + "learning_rate": 6.469699318040919e-07, + "loss": 0.8811, + "step": 166970 + }, + { + "epoch": 12.939672207369522, + "grad_norm": 1.5176810364891073, + "learning_rate": 6.470086794792313e-07, + "loss": 0.8898, + "step": 166980 + }, + { + "epoch": 12.940447130845829, + "grad_norm": 1.434214822146269, + "learning_rate": 6.470474271543709e-07, + "loss": 0.8763, + "step": 166990 + }, + { + "epoch": 12.941222054322136, + "grad_norm": 1.4616727314412847, + "learning_rate": 6.470861748295103e-07, + "loss": 0.8656, + "step": 167000 + }, + { + "epoch": 12.941222054322136, + "eval_loss": 0.8960871696472168, + "eval_runtime": 331.2586, + "eval_samples_per_second": 34.629, + "eval_steps_per_second": 8.658, + "step": 167000 + }, + { + "epoch": 12.941996977798443, + "grad_norm": 1.5341243565309481, + "learning_rate": 6.471249225046498e-07, + "loss": 0.9142, + "step": 167010 + }, + { + "epoch": 12.94277190127475, + "grad_norm": 1.6104867557137699, + "learning_rate": 6.471636701797893e-07, + "loss": 0.8742, + "step": 167020 + }, + { + "epoch": 12.943546824751056, + "grad_norm": 1.522659604535787, + "learning_rate": 6.472024178549288e-07, + "loss": 0.9, + "step": 167030 + }, + { + "epoch": 12.944321748227363, + "grad_norm": 1.5615927536314806, + "learning_rate": 6.472411655300682e-07, + "loss": 0.8986, + "step": 167040 + }, + { + "epoch": 12.94509667170367, + "grad_norm": 1.4410130994711106, + "learning_rate": 6.472799132052078e-07, + "loss": 0.8753, + "step": 167050 + }, + { + "epoch": 12.945871595179977, + "grad_norm": 1.4601814113654443, + "learning_rate": 6.473186608803472e-07, + "loss": 0.8841, + "step": 167060 + }, + { + "epoch": 12.946646518656284, + "grad_norm": 1.4563916022093937, + "learning_rate": 6.473574085554868e-07, + "loss": 0.8936, + "step": 167070 + }, + { + "epoch": 12.947421442132589, + "grad_norm": 1.4518174099452814, + "learning_rate": 6.473961562306262e-07, + "loss": 0.8825, + "step": 167080 + }, + { + "epoch": 12.948196365608895, + "grad_norm": 1.4645861526176729, + "learning_rate": 6.474349039057658e-07, + "loss": 0.8794, + "step": 167090 + }, + { + "epoch": 12.948971289085202, + "grad_norm": 1.39052180737509, + "learning_rate": 6.474736515809052e-07, + "loss": 0.8679, + "step": 167100 + }, + { + "epoch": 12.949746212561509, + "grad_norm": 1.4937904688747146, + "learning_rate": 6.475123992560447e-07, + "loss": 0.8687, + "step": 167110 + }, + { + "epoch": 12.950521136037816, + "grad_norm": 1.549117400657045, + "learning_rate": 6.475511469311842e-07, + "loss": 0.8846, + "step": 167120 + }, + { + "epoch": 12.951296059514123, + "grad_norm": 1.5699771984994981, + "learning_rate": 6.475898946063238e-07, + "loss": 0.8687, + "step": 167130 + }, + { + "epoch": 12.95207098299043, + "grad_norm": 1.4823785880290126, + "learning_rate": 6.476286422814632e-07, + "loss": 0.867, + "step": 167140 + }, + { + "epoch": 12.952845906466736, + "grad_norm": 1.5327579535261007, + "learning_rate": 6.476673899566027e-07, + "loss": 0.8688, + "step": 167150 + }, + { + "epoch": 12.953620829943043, + "grad_norm": 1.4468343250595221, + "learning_rate": 6.477061376317421e-07, + "loss": 0.8775, + "step": 167160 + }, + { + "epoch": 12.95439575341935, + "grad_norm": 1.40576222777993, + "learning_rate": 6.477448853068817e-07, + "loss": 0.8959, + "step": 167170 + }, + { + "epoch": 12.955170676895657, + "grad_norm": 1.5250796755034697, + "learning_rate": 6.477836329820211e-07, + "loss": 0.8763, + "step": 167180 + }, + { + "epoch": 12.955945600371964, + "grad_norm": 1.599360889520744, + "learning_rate": 6.478223806571607e-07, + "loss": 0.8873, + "step": 167190 + }, + { + "epoch": 12.95672052384827, + "grad_norm": 1.4751320828337797, + "learning_rate": 6.478611283323001e-07, + "loss": 0.8627, + "step": 167200 + }, + { + "epoch": 12.957495447324577, + "grad_norm": 1.4861961991253594, + "learning_rate": 6.478998760074396e-07, + "loss": 0.8527, + "step": 167210 + }, + { + "epoch": 12.958270370800884, + "grad_norm": 1.4857218892489386, + "learning_rate": 6.479386236825791e-07, + "loss": 0.898, + "step": 167220 + }, + { + "epoch": 12.95904529427719, + "grad_norm": 1.467054828164402, + "learning_rate": 6.479773713577187e-07, + "loss": 0.8958, + "step": 167230 + }, + { + "epoch": 12.959820217753498, + "grad_norm": 1.6650002614193138, + "learning_rate": 6.480161190328581e-07, + "loss": 0.8734, + "step": 167240 + }, + { + "epoch": 12.960595141229804, + "grad_norm": 1.5478955577610365, + "learning_rate": 6.480548667079976e-07, + "loss": 0.88, + "step": 167250 + }, + { + "epoch": 12.96137006470611, + "grad_norm": 1.4904850079758447, + "learning_rate": 6.48093614383137e-07, + "loss": 0.8813, + "step": 167260 + }, + { + "epoch": 12.962144988182416, + "grad_norm": 1.4155786523067084, + "learning_rate": 6.481323620582767e-07, + "loss": 0.8955, + "step": 167270 + }, + { + "epoch": 12.962919911658723, + "grad_norm": 1.4671669565904455, + "learning_rate": 6.481711097334161e-07, + "loss": 0.8958, + "step": 167280 + }, + { + "epoch": 12.96369483513503, + "grad_norm": 1.5561216204133783, + "learning_rate": 6.482098574085556e-07, + "loss": 0.874, + "step": 167290 + }, + { + "epoch": 12.964469758611337, + "grad_norm": 1.541963545271423, + "learning_rate": 6.48248605083695e-07, + "loss": 0.8625, + "step": 167300 + }, + { + "epoch": 12.965244682087643, + "grad_norm": 1.4183205196450028, + "learning_rate": 6.482873527588345e-07, + "loss": 0.8608, + "step": 167310 + }, + { + "epoch": 12.96601960556395, + "grad_norm": 1.547170776082861, + "learning_rate": 6.48326100433974e-07, + "loss": 0.8734, + "step": 167320 + }, + { + "epoch": 12.966794529040257, + "grad_norm": 1.5323678491495307, + "learning_rate": 6.483648481091136e-07, + "loss": 0.8555, + "step": 167330 + }, + { + "epoch": 12.967569452516564, + "grad_norm": 1.4913796019327983, + "learning_rate": 6.48403595784253e-07, + "loss": 0.8627, + "step": 167340 + }, + { + "epoch": 12.96834437599287, + "grad_norm": 1.4980460887384948, + "learning_rate": 6.484423434593925e-07, + "loss": 0.8856, + "step": 167350 + }, + { + "epoch": 12.969119299469178, + "grad_norm": 1.432441459728273, + "learning_rate": 6.48481091134532e-07, + "loss": 0.881, + "step": 167360 + }, + { + "epoch": 12.969894222945484, + "grad_norm": 1.4655138663829745, + "learning_rate": 6.485198388096716e-07, + "loss": 0.8841, + "step": 167370 + }, + { + "epoch": 12.970669146421791, + "grad_norm": 1.5207035569428462, + "learning_rate": 6.48558586484811e-07, + "loss": 0.8667, + "step": 167380 + }, + { + "epoch": 12.971444069898098, + "grad_norm": 1.446574583065071, + "learning_rate": 6.485973341599505e-07, + "loss": 0.8917, + "step": 167390 + }, + { + "epoch": 12.972218993374405, + "grad_norm": 1.4644505742609597, + "learning_rate": 6.486360818350899e-07, + "loss": 0.8662, + "step": 167400 + }, + { + "epoch": 12.972993916850712, + "grad_norm": 1.4965697511086429, + "learning_rate": 6.486748295102295e-07, + "loss": 0.893, + "step": 167410 + }, + { + "epoch": 12.973768840327018, + "grad_norm": 1.662242781007566, + "learning_rate": 6.48713577185369e-07, + "loss": 0.8917, + "step": 167420 + }, + { + "epoch": 12.974543763803325, + "grad_norm": 1.6760843006195787, + "learning_rate": 6.487523248605085e-07, + "loss": 0.8815, + "step": 167430 + }, + { + "epoch": 12.975318687279632, + "grad_norm": 1.6025844075338205, + "learning_rate": 6.487910725356479e-07, + "loss": 0.8671, + "step": 167440 + }, + { + "epoch": 12.976093610755937, + "grad_norm": 1.5508853712579331, + "learning_rate": 6.488298202107874e-07, + "loss": 0.8845, + "step": 167450 + }, + { + "epoch": 12.976868534232244, + "grad_norm": 1.4803569555168896, + "learning_rate": 6.488685678859268e-07, + "loss": 0.8603, + "step": 167460 + }, + { + "epoch": 12.97764345770855, + "grad_norm": 1.4554965596793394, + "learning_rate": 6.489073155610665e-07, + "loss": 0.8816, + "step": 167470 + }, + { + "epoch": 12.978418381184857, + "grad_norm": 1.3423023422515803, + "learning_rate": 6.489460632362059e-07, + "loss": 0.8617, + "step": 167480 + }, + { + "epoch": 12.979193304661164, + "grad_norm": 1.5197628543163226, + "learning_rate": 6.489848109113454e-07, + "loss": 0.8981, + "step": 167490 + }, + { + "epoch": 12.979968228137471, + "grad_norm": 1.456561220334198, + "learning_rate": 6.490235585864848e-07, + "loss": 0.8782, + "step": 167500 + }, + { + "epoch": 12.979968228137471, + "eval_loss": 0.8955289125442505, + "eval_runtime": 330.9564, + "eval_samples_per_second": 34.66, + "eval_steps_per_second": 8.666, + "step": 167500 + }, + { + "epoch": 12.980743151613778, + "grad_norm": 1.557313570487373, + "learning_rate": 6.490623062616245e-07, + "loss": 0.9052, + "step": 167510 + }, + { + "epoch": 12.981518075090085, + "grad_norm": 1.515888614936176, + "learning_rate": 6.491010539367639e-07, + "loss": 0.8723, + "step": 167520 + }, + { + "epoch": 12.982292998566392, + "grad_norm": 1.4611530840799418, + "learning_rate": 6.491398016119034e-07, + "loss": 0.8772, + "step": 167530 + }, + { + "epoch": 12.983067922042698, + "grad_norm": 1.4017740660431643, + "learning_rate": 6.491785492870428e-07, + "loss": 0.8731, + "step": 167540 + }, + { + "epoch": 12.983842845519005, + "grad_norm": 1.4253035377179715, + "learning_rate": 6.492172969621823e-07, + "loss": 0.8818, + "step": 167550 + }, + { + "epoch": 12.984617768995312, + "grad_norm": 1.496430348915266, + "learning_rate": 6.492560446373219e-07, + "loss": 0.883, + "step": 167560 + }, + { + "epoch": 12.985392692471619, + "grad_norm": 1.5200682478624297, + "learning_rate": 6.492947923124614e-07, + "loss": 0.871, + "step": 167570 + }, + { + "epoch": 12.986167615947926, + "grad_norm": 1.5269411309673695, + "learning_rate": 6.493335399876008e-07, + "loss": 0.8861, + "step": 167580 + }, + { + "epoch": 12.986942539424232, + "grad_norm": 1.4237951447730104, + "learning_rate": 6.493722876627403e-07, + "loss": 0.8867, + "step": 167590 + }, + { + "epoch": 12.98771746290054, + "grad_norm": 1.4962989765272945, + "learning_rate": 6.494110353378797e-07, + "loss": 0.8668, + "step": 167600 + }, + { + "epoch": 12.988492386376846, + "grad_norm": 1.446904826859195, + "learning_rate": 6.494497830130194e-07, + "loss": 0.8725, + "step": 167610 + }, + { + "epoch": 12.989267309853153, + "grad_norm": 1.60120504467703, + "learning_rate": 6.494885306881588e-07, + "loss": 0.8649, + "step": 167620 + }, + { + "epoch": 12.990042233329458, + "grad_norm": 1.5254690144840084, + "learning_rate": 6.495272783632983e-07, + "loss": 0.878, + "step": 167630 + }, + { + "epoch": 12.990817156805765, + "grad_norm": 1.531773003023461, + "learning_rate": 6.495660260384377e-07, + "loss": 0.8737, + "step": 167640 + }, + { + "epoch": 12.991592080282071, + "grad_norm": 1.4928975889302438, + "learning_rate": 6.496047737135773e-07, + "loss": 0.8901, + "step": 167650 + }, + { + "epoch": 12.992367003758378, + "grad_norm": 1.4871778276573988, + "learning_rate": 6.496435213887168e-07, + "loss": 0.8665, + "step": 167660 + }, + { + "epoch": 12.993141927234685, + "grad_norm": 1.501604307667281, + "learning_rate": 6.496822690638563e-07, + "loss": 0.8779, + "step": 167670 + }, + { + "epoch": 12.993916850710992, + "grad_norm": 1.503214063225275, + "learning_rate": 6.497210167389957e-07, + "loss": 0.8707, + "step": 167680 + }, + { + "epoch": 12.994691774187299, + "grad_norm": 1.4445324047561872, + "learning_rate": 6.497597644141352e-07, + "loss": 0.8746, + "step": 167690 + }, + { + "epoch": 12.995466697663606, + "grad_norm": 1.4662670160826041, + "learning_rate": 6.497985120892747e-07, + "loss": 0.8813, + "step": 167700 + }, + { + "epoch": 12.996241621139912, + "grad_norm": 1.475822803332231, + "learning_rate": 6.498372597644143e-07, + "loss": 0.8958, + "step": 167710 + }, + { + "epoch": 12.99701654461622, + "grad_norm": 1.4717161822664884, + "learning_rate": 6.498760074395537e-07, + "loss": 0.8744, + "step": 167720 + }, + { + "epoch": 12.997791468092526, + "grad_norm": 1.448981596674379, + "learning_rate": 6.499147551146932e-07, + "loss": 0.8661, + "step": 167730 + }, + { + "epoch": 12.998566391568833, + "grad_norm": 1.4527725682989727, + "learning_rate": 6.499535027898326e-07, + "loss": 0.874, + "step": 167740 + }, + { + "epoch": 12.99934131504514, + "grad_norm": 1.4304498490231352, + "learning_rate": 6.499922504649722e-07, + "loss": 0.8679, + "step": 167750 + }, + { + "epoch": 13.000116238521446, + "grad_norm": 1.3976417215394168, + "learning_rate": 6.500309981401117e-07, + "loss": 0.8993, + "step": 167760 + }, + { + "epoch": 13.000891161997753, + "grad_norm": 1.502450619385773, + "learning_rate": 6.500697458152512e-07, + "loss": 0.8614, + "step": 167770 + }, + { + "epoch": 13.00166608547406, + "grad_norm": 1.5368354530926103, + "learning_rate": 6.501084934903906e-07, + "loss": 0.8686, + "step": 167780 + }, + { + "epoch": 13.002441008950367, + "grad_norm": 1.5166031138500071, + "learning_rate": 6.501472411655302e-07, + "loss": 0.874, + "step": 167790 + }, + { + "epoch": 13.003215932426674, + "grad_norm": 1.5378405574440235, + "learning_rate": 6.501859888406696e-07, + "loss": 0.8827, + "step": 167800 + }, + { + "epoch": 13.00399085590298, + "grad_norm": 1.3910145499103201, + "learning_rate": 6.502247365158092e-07, + "loss": 0.8692, + "step": 167810 + }, + { + "epoch": 13.004765779379285, + "grad_norm": 1.4859345310549534, + "learning_rate": 6.502634841909486e-07, + "loss": 0.8698, + "step": 167820 + }, + { + "epoch": 13.005540702855592, + "grad_norm": 1.565829924103622, + "learning_rate": 6.503022318660881e-07, + "loss": 0.8589, + "step": 167830 + }, + { + "epoch": 13.006315626331899, + "grad_norm": 1.4703368692377385, + "learning_rate": 6.503409795412276e-07, + "loss": 0.8797, + "step": 167840 + }, + { + "epoch": 13.007090549808206, + "grad_norm": 1.5947268905300727, + "learning_rate": 6.503797272163671e-07, + "loss": 0.8682, + "step": 167850 + }, + { + "epoch": 13.007865473284513, + "grad_norm": 1.439732156731079, + "learning_rate": 6.504184748915066e-07, + "loss": 0.8665, + "step": 167860 + }, + { + "epoch": 13.00864039676082, + "grad_norm": 1.4953187785070174, + "learning_rate": 6.504572225666461e-07, + "loss": 0.8722, + "step": 167870 + }, + { + "epoch": 13.009415320237126, + "grad_norm": 1.4633850496635343, + "learning_rate": 6.504959702417855e-07, + "loss": 0.8939, + "step": 167880 + }, + { + "epoch": 13.010190243713433, + "grad_norm": 1.4486870295668886, + "learning_rate": 6.505347179169251e-07, + "loss": 0.8702, + "step": 167890 + }, + { + "epoch": 13.01096516718974, + "grad_norm": 1.611112513272926, + "learning_rate": 6.505734655920645e-07, + "loss": 0.8884, + "step": 167900 + }, + { + "epoch": 13.011740090666047, + "grad_norm": 1.3941698987825335, + "learning_rate": 6.506122132672041e-07, + "loss": 0.8746, + "step": 167910 + }, + { + "epoch": 13.012515014142354, + "grad_norm": 1.4697386441833626, + "learning_rate": 6.506509609423435e-07, + "loss": 0.8566, + "step": 167920 + }, + { + "epoch": 13.01328993761866, + "grad_norm": 1.5315401928477843, + "learning_rate": 6.506897086174831e-07, + "loss": 0.866, + "step": 167930 + }, + { + "epoch": 13.014064861094967, + "grad_norm": 1.5000093080707049, + "learning_rate": 6.507284562926225e-07, + "loss": 0.8638, + "step": 167940 + }, + { + "epoch": 13.014839784571274, + "grad_norm": 1.496248263693984, + "learning_rate": 6.50767203967762e-07, + "loss": 0.8803, + "step": 167950 + }, + { + "epoch": 13.01561470804758, + "grad_norm": 1.514371721816609, + "learning_rate": 6.508059516429015e-07, + "loss": 0.8724, + "step": 167960 + }, + { + "epoch": 13.016389631523888, + "grad_norm": 1.5299351993229233, + "learning_rate": 6.50844699318041e-07, + "loss": 0.8809, + "step": 167970 + }, + { + "epoch": 13.017164555000194, + "grad_norm": 1.430942867991314, + "learning_rate": 6.508834469931804e-07, + "loss": 0.8724, + "step": 167980 + }, + { + "epoch": 13.017939478476501, + "grad_norm": 1.5200361380973526, + "learning_rate": 6.5092219466832e-07, + "loss": 0.8709, + "step": 167990 + }, + { + "epoch": 13.018714401952808, + "grad_norm": 1.5159182472483783, + "learning_rate": 6.509609423434594e-07, + "loss": 0.8783, + "step": 168000 + }, + { + "epoch": 13.018714401952808, + "eval_loss": 0.8961470723152161, + "eval_runtime": 331.8654, + "eval_samples_per_second": 34.565, + "eval_steps_per_second": 8.642, + "step": 168000 + }, + { + "epoch": 13.019489325429113, + "grad_norm": 1.5186547627162719, + "learning_rate": 6.50999690018599e-07, + "loss": 0.8891, + "step": 168010 + }, + { + "epoch": 13.02026424890542, + "grad_norm": 1.501126093874676, + "learning_rate": 6.510384376937384e-07, + "loss": 0.8491, + "step": 168020 + }, + { + "epoch": 13.021039172381727, + "grad_norm": 1.4883587968438106, + "learning_rate": 6.51077185368878e-07, + "loss": 0.8765, + "step": 168030 + }, + { + "epoch": 13.021814095858034, + "grad_norm": 1.4797750318042509, + "learning_rate": 6.511159330440174e-07, + "loss": 0.8745, + "step": 168040 + }, + { + "epoch": 13.02258901933434, + "grad_norm": 1.441834452401341, + "learning_rate": 6.51154680719157e-07, + "loss": 0.8791, + "step": 168050 + }, + { + "epoch": 13.023363942810647, + "grad_norm": 1.5062751637312686, + "learning_rate": 6.511934283942964e-07, + "loss": 0.8643, + "step": 168060 + }, + { + "epoch": 13.024138866286954, + "grad_norm": 1.5608973464447033, + "learning_rate": 6.51232176069436e-07, + "loss": 0.8864, + "step": 168070 + }, + { + "epoch": 13.02491378976326, + "grad_norm": 1.5667114427599047, + "learning_rate": 6.512709237445754e-07, + "loss": 0.878, + "step": 168080 + }, + { + "epoch": 13.025688713239568, + "grad_norm": 1.4676230493672637, + "learning_rate": 6.513096714197149e-07, + "loss": 0.8579, + "step": 168090 + }, + { + "epoch": 13.026463636715874, + "grad_norm": 1.6965840650161106, + "learning_rate": 6.513484190948544e-07, + "loss": 0.8757, + "step": 168100 + }, + { + "epoch": 13.027238560192181, + "grad_norm": 1.6753370671364078, + "learning_rate": 6.513871667699939e-07, + "loss": 0.8767, + "step": 168110 + }, + { + "epoch": 13.028013483668488, + "grad_norm": 1.6279724481911404, + "learning_rate": 6.514259144451333e-07, + "loss": 0.8694, + "step": 168120 + }, + { + "epoch": 13.028788407144795, + "grad_norm": 1.484282287410595, + "learning_rate": 6.514646621202729e-07, + "loss": 0.8879, + "step": 168130 + }, + { + "epoch": 13.029563330621102, + "grad_norm": 1.47696910391389, + "learning_rate": 6.515034097954123e-07, + "loss": 0.8645, + "step": 168140 + }, + { + "epoch": 13.030338254097408, + "grad_norm": 1.5251809308246114, + "learning_rate": 6.515421574705519e-07, + "loss": 0.8594, + "step": 168150 + }, + { + "epoch": 13.031113177573715, + "grad_norm": 1.6076027573077178, + "learning_rate": 6.515809051456913e-07, + "loss": 0.8692, + "step": 168160 + }, + { + "epoch": 13.031888101050022, + "grad_norm": 1.5876756835704846, + "learning_rate": 6.516196528208309e-07, + "loss": 0.8813, + "step": 168170 + }, + { + "epoch": 13.032663024526329, + "grad_norm": 1.492848398855659, + "learning_rate": 6.516584004959703e-07, + "loss": 0.878, + "step": 168180 + }, + { + "epoch": 13.033437948002634, + "grad_norm": 1.4277284619217303, + "learning_rate": 6.516971481711098e-07, + "loss": 0.8806, + "step": 168190 + }, + { + "epoch": 13.03421287147894, + "grad_norm": 1.4249958148011743, + "learning_rate": 6.517358958462493e-07, + "loss": 0.8816, + "step": 168200 + }, + { + "epoch": 13.034987794955248, + "grad_norm": 1.5103909295167575, + "learning_rate": 6.517746435213889e-07, + "loss": 0.8796, + "step": 168210 + }, + { + "epoch": 13.035762718431554, + "grad_norm": 1.5073015515842456, + "learning_rate": 6.518133911965283e-07, + "loss": 0.8573, + "step": 168220 + }, + { + "epoch": 13.036537641907861, + "grad_norm": 1.5264519043231883, + "learning_rate": 6.518521388716678e-07, + "loss": 0.8653, + "step": 168230 + }, + { + "epoch": 13.037312565384168, + "grad_norm": 1.5441538194881832, + "learning_rate": 6.518908865468072e-07, + "loss": 0.8669, + "step": 168240 + }, + { + "epoch": 13.038087488860475, + "grad_norm": 1.5043518891258252, + "learning_rate": 6.519296342219468e-07, + "loss": 0.8812, + "step": 168250 + }, + { + "epoch": 13.038862412336782, + "grad_norm": 1.5146656694690765, + "learning_rate": 6.519683818970862e-07, + "loss": 0.8759, + "step": 168260 + }, + { + "epoch": 13.039637335813088, + "grad_norm": 1.508181410212892, + "learning_rate": 6.520071295722258e-07, + "loss": 0.8698, + "step": 168270 + }, + { + "epoch": 13.040412259289395, + "grad_norm": 1.4945820806220969, + "learning_rate": 6.520458772473652e-07, + "loss": 0.8938, + "step": 168280 + }, + { + "epoch": 13.041187182765702, + "grad_norm": 1.4895546746074269, + "learning_rate": 6.520846249225047e-07, + "loss": 0.8799, + "step": 168290 + }, + { + "epoch": 13.041962106242009, + "grad_norm": 1.4732490745272642, + "learning_rate": 6.521233725976442e-07, + "loss": 0.8842, + "step": 168300 + }, + { + "epoch": 13.042737029718316, + "grad_norm": 1.4884207213521234, + "learning_rate": 6.521621202727838e-07, + "loss": 0.8681, + "step": 168310 + }, + { + "epoch": 13.043511953194622, + "grad_norm": 1.4379930836404482, + "learning_rate": 6.522008679479232e-07, + "loss": 0.8758, + "step": 168320 + }, + { + "epoch": 13.04428687667093, + "grad_norm": 1.4127258242456895, + "learning_rate": 6.522396156230627e-07, + "loss": 0.8602, + "step": 168330 + }, + { + "epoch": 13.045061800147236, + "grad_norm": 1.43674178427141, + "learning_rate": 6.522783632982021e-07, + "loss": 0.8666, + "step": 168340 + }, + { + "epoch": 13.045836723623543, + "grad_norm": 1.7046660747463214, + "learning_rate": 6.523171109733418e-07, + "loss": 0.8727, + "step": 168350 + }, + { + "epoch": 13.04661164709985, + "grad_norm": 1.5583970933519573, + "learning_rate": 6.523558586484812e-07, + "loss": 0.8709, + "step": 168360 + }, + { + "epoch": 13.047386570576156, + "grad_norm": 1.5324330334409932, + "learning_rate": 6.523946063236207e-07, + "loss": 0.8827, + "step": 168370 + }, + { + "epoch": 13.048161494052462, + "grad_norm": 1.5524091385750236, + "learning_rate": 6.524333539987601e-07, + "loss": 0.8612, + "step": 168380 + }, + { + "epoch": 13.048936417528768, + "grad_norm": 1.496915810054134, + "learning_rate": 6.524721016738996e-07, + "loss": 0.8685, + "step": 168390 + }, + { + "epoch": 13.049711341005075, + "grad_norm": 1.5974998894930186, + "learning_rate": 6.525108493490391e-07, + "loss": 0.8515, + "step": 168400 + }, + { + "epoch": 13.050486264481382, + "grad_norm": 1.5049864893330194, + "learning_rate": 6.525495970241787e-07, + "loss": 0.8637, + "step": 168410 + }, + { + "epoch": 13.051261187957689, + "grad_norm": 1.4295644728452448, + "learning_rate": 6.525883446993181e-07, + "loss": 0.8602, + "step": 168420 + }, + { + "epoch": 13.052036111433996, + "grad_norm": 1.595445658951155, + "learning_rate": 6.526270923744576e-07, + "loss": 0.8739, + "step": 168430 + }, + { + "epoch": 13.052811034910302, + "grad_norm": 1.5309350366260064, + "learning_rate": 6.52665840049597e-07, + "loss": 0.8731, + "step": 168440 + }, + { + "epoch": 13.05358595838661, + "grad_norm": 1.5702586179009486, + "learning_rate": 6.527045877247367e-07, + "loss": 0.8591, + "step": 168450 + }, + { + "epoch": 13.054360881862916, + "grad_norm": 1.5661197321064393, + "learning_rate": 6.527433353998761e-07, + "loss": 0.8612, + "step": 168460 + }, + { + "epoch": 13.055135805339223, + "grad_norm": 1.5130802047603849, + "learning_rate": 6.527820830750156e-07, + "loss": 0.8699, + "step": 168470 + }, + { + "epoch": 13.05591072881553, + "grad_norm": 1.429010860126628, + "learning_rate": 6.52820830750155e-07, + "loss": 0.8571, + "step": 168480 + }, + { + "epoch": 13.056685652291836, + "grad_norm": 1.5492155997555335, + "learning_rate": 6.528595784252946e-07, + "loss": 0.8618, + "step": 168490 + }, + { + "epoch": 13.057460575768143, + "grad_norm": 1.4381810086693545, + "learning_rate": 6.528983261004341e-07, + "loss": 0.8612, + "step": 168500 + }, + { + "epoch": 13.057460575768143, + "eval_loss": 0.8961697220802307, + "eval_runtime": 331.6341, + "eval_samples_per_second": 34.589, + "eval_steps_per_second": 8.648, + "step": 168500 + }, + { + "epoch": 13.05823549924445, + "grad_norm": 1.5867577762659566, + "learning_rate": 6.529370737755736e-07, + "loss": 0.8844, + "step": 168510 + }, + { + "epoch": 13.059010422720757, + "grad_norm": 1.5434375382597054, + "learning_rate": 6.52975821450713e-07, + "loss": 0.8784, + "step": 168520 + }, + { + "epoch": 13.059785346197064, + "grad_norm": 1.492439047799136, + "learning_rate": 6.530145691258525e-07, + "loss": 0.8951, + "step": 168530 + }, + { + "epoch": 13.06056026967337, + "grad_norm": 1.4981330440334109, + "learning_rate": 6.530533168009919e-07, + "loss": 0.8504, + "step": 168540 + }, + { + "epoch": 13.061335193149677, + "grad_norm": 1.572768234366022, + "learning_rate": 6.530920644761316e-07, + "loss": 0.8648, + "step": 168550 + }, + { + "epoch": 13.062110116625982, + "grad_norm": 1.449587284677687, + "learning_rate": 6.53130812151271e-07, + "loss": 0.8757, + "step": 168560 + }, + { + "epoch": 13.06288504010229, + "grad_norm": 1.5549868944308467, + "learning_rate": 6.531695598264105e-07, + "loss": 0.8914, + "step": 168570 + }, + { + "epoch": 13.063659963578596, + "grad_norm": 1.5501061037749702, + "learning_rate": 6.532083075015499e-07, + "loss": 0.879, + "step": 168580 + }, + { + "epoch": 13.064434887054903, + "grad_norm": 1.4686905090574194, + "learning_rate": 6.532470551766896e-07, + "loss": 0.8786, + "step": 168590 + }, + { + "epoch": 13.06520981053121, + "grad_norm": 1.4291909502744373, + "learning_rate": 6.53285802851829e-07, + "loss": 0.8642, + "step": 168600 + }, + { + "epoch": 13.065984734007516, + "grad_norm": 1.5422582882384694, + "learning_rate": 6.533245505269685e-07, + "loss": 0.8782, + "step": 168610 + }, + { + "epoch": 13.066759657483823, + "grad_norm": 1.4956299495316046, + "learning_rate": 6.533632982021079e-07, + "loss": 0.8529, + "step": 168620 + }, + { + "epoch": 13.06753458096013, + "grad_norm": 1.5372433279038085, + "learning_rate": 6.534020458772475e-07, + "loss": 0.8763, + "step": 168630 + }, + { + "epoch": 13.068309504436437, + "grad_norm": 1.558199822915854, + "learning_rate": 6.53440793552387e-07, + "loss": 0.8854, + "step": 168640 + }, + { + "epoch": 13.069084427912744, + "grad_norm": 1.5422231460552969, + "learning_rate": 6.534795412275265e-07, + "loss": 0.8789, + "step": 168650 + }, + { + "epoch": 13.06985935138905, + "grad_norm": 1.5669930618917722, + "learning_rate": 6.535182889026659e-07, + "loss": 0.8663, + "step": 168660 + }, + { + "epoch": 13.070634274865357, + "grad_norm": 1.4993950979987383, + "learning_rate": 6.535570365778054e-07, + "loss": 0.8657, + "step": 168670 + }, + { + "epoch": 13.071409198341664, + "grad_norm": 1.491577264948897, + "learning_rate": 6.535957842529448e-07, + "loss": 0.884, + "step": 168680 + }, + { + "epoch": 13.07218412181797, + "grad_norm": 1.5486434357056202, + "learning_rate": 6.536345319280845e-07, + "loss": 0.8816, + "step": 168690 + }, + { + "epoch": 13.072959045294278, + "grad_norm": 1.4498124795277587, + "learning_rate": 6.536732796032239e-07, + "loss": 0.8748, + "step": 168700 + }, + { + "epoch": 13.073733968770584, + "grad_norm": 1.536368587533529, + "learning_rate": 6.537120272783634e-07, + "loss": 0.8688, + "step": 168710 + }, + { + "epoch": 13.074508892246891, + "grad_norm": 1.546563355409699, + "learning_rate": 6.537507749535028e-07, + "loss": 0.8791, + "step": 168720 + }, + { + "epoch": 13.075283815723198, + "grad_norm": 1.5140253351520316, + "learning_rate": 6.537895226286424e-07, + "loss": 0.8782, + "step": 168730 + }, + { + "epoch": 13.076058739199505, + "grad_norm": 1.5478932541992354, + "learning_rate": 6.538282703037819e-07, + "loss": 0.8797, + "step": 168740 + }, + { + "epoch": 13.07683366267581, + "grad_norm": 1.5527308609116535, + "learning_rate": 6.538670179789214e-07, + "loss": 0.8722, + "step": 168750 + }, + { + "epoch": 13.077608586152117, + "grad_norm": 1.682058393851397, + "learning_rate": 6.539057656540608e-07, + "loss": 0.8841, + "step": 168760 + }, + { + "epoch": 13.078383509628424, + "grad_norm": 1.4712983461356093, + "learning_rate": 6.539445133292003e-07, + "loss": 0.869, + "step": 168770 + }, + { + "epoch": 13.07915843310473, + "grad_norm": 1.5514476559409272, + "learning_rate": 6.539832610043398e-07, + "loss": 0.8673, + "step": 168780 + }, + { + "epoch": 13.079933356581037, + "grad_norm": 1.504744428178039, + "learning_rate": 6.540220086794794e-07, + "loss": 0.851, + "step": 168790 + }, + { + "epoch": 13.080708280057344, + "grad_norm": 1.5939839300557903, + "learning_rate": 6.540607563546188e-07, + "loss": 0.8797, + "step": 168800 + }, + { + "epoch": 13.08148320353365, + "grad_norm": 1.4432791509662681, + "learning_rate": 6.540995040297583e-07, + "loss": 0.8568, + "step": 168810 + }, + { + "epoch": 13.082258127009958, + "grad_norm": 1.447834328631517, + "learning_rate": 6.541382517048977e-07, + "loss": 0.8645, + "step": 168820 + }, + { + "epoch": 13.083033050486264, + "grad_norm": 1.4479290191607066, + "learning_rate": 6.541769993800373e-07, + "loss": 0.8618, + "step": 168830 + }, + { + "epoch": 13.083807973962571, + "grad_norm": 1.6797479965505189, + "learning_rate": 6.542157470551768e-07, + "loss": 0.8872, + "step": 168840 + }, + { + "epoch": 13.084582897438878, + "grad_norm": 1.4574459388854903, + "learning_rate": 6.542544947303163e-07, + "loss": 0.8667, + "step": 168850 + }, + { + "epoch": 13.085357820915185, + "grad_norm": 1.4438636982230633, + "learning_rate": 6.542932424054557e-07, + "loss": 0.8669, + "step": 168860 + }, + { + "epoch": 13.086132744391492, + "grad_norm": 1.4836189063297704, + "learning_rate": 6.543319900805953e-07, + "loss": 0.8732, + "step": 168870 + }, + { + "epoch": 13.086907667867798, + "grad_norm": 1.5449056317643373, + "learning_rate": 6.543707377557347e-07, + "loss": 0.8569, + "step": 168880 + }, + { + "epoch": 13.087682591344105, + "grad_norm": 1.5251056413843183, + "learning_rate": 6.544094854308743e-07, + "loss": 0.8707, + "step": 168890 + }, + { + "epoch": 13.088457514820412, + "grad_norm": 1.496261510306802, + "learning_rate": 6.544482331060137e-07, + "loss": 0.8703, + "step": 168900 + }, + { + "epoch": 13.089232438296719, + "grad_norm": 1.5751895027417446, + "learning_rate": 6.544869807811532e-07, + "loss": 0.8701, + "step": 168910 + }, + { + "epoch": 13.090007361773026, + "grad_norm": 1.4700381330173657, + "learning_rate": 6.545257284562927e-07, + "loss": 0.9064, + "step": 168920 + }, + { + "epoch": 13.090782285249333, + "grad_norm": 1.5953123156669355, + "learning_rate": 6.545644761314322e-07, + "loss": 0.8644, + "step": 168930 + }, + { + "epoch": 13.091557208725638, + "grad_norm": 1.4984928771643915, + "learning_rate": 6.546032238065717e-07, + "loss": 0.8855, + "step": 168940 + }, + { + "epoch": 13.092332132201944, + "grad_norm": 1.5034833210797662, + "learning_rate": 6.546419714817112e-07, + "loss": 0.8645, + "step": 168950 + }, + { + "epoch": 13.093107055678251, + "grad_norm": 1.4997673673653336, + "learning_rate": 6.546807191568506e-07, + "loss": 0.8677, + "step": 168960 + }, + { + "epoch": 13.093881979154558, + "grad_norm": 1.5066845175204397, + "learning_rate": 6.547194668319902e-07, + "loss": 0.8976, + "step": 168970 + }, + { + "epoch": 13.094656902630865, + "grad_norm": 1.562402696041578, + "learning_rate": 6.547582145071296e-07, + "loss": 0.8648, + "step": 168980 + }, + { + "epoch": 13.095431826107172, + "grad_norm": 1.5718971917635942, + "learning_rate": 6.547969621822692e-07, + "loss": 0.8939, + "step": 168990 + }, + { + "epoch": 13.096206749583478, + "grad_norm": 1.5444210963608065, + "learning_rate": 6.548357098574086e-07, + "loss": 0.8812, + "step": 169000 + }, + { + "epoch": 13.096206749583478, + "eval_loss": 0.8962686657905579, + "eval_runtime": 330.5162, + "eval_samples_per_second": 34.706, + "eval_steps_per_second": 8.677, + "step": 169000 + }, + { + "epoch": 13.096981673059785, + "grad_norm": 1.4917972959495829, + "learning_rate": 6.548744575325482e-07, + "loss": 0.8581, + "step": 169010 + }, + { + "epoch": 13.097756596536092, + "grad_norm": 1.6034571577699164, + "learning_rate": 6.549132052076876e-07, + "loss": 0.8727, + "step": 169020 + }, + { + "epoch": 13.098531520012399, + "grad_norm": 1.7322497830415016, + "learning_rate": 6.549519528828271e-07, + "loss": 0.8797, + "step": 169030 + }, + { + "epoch": 13.099306443488706, + "grad_norm": 1.636611924163112, + "learning_rate": 6.549907005579666e-07, + "loss": 0.869, + "step": 169040 + }, + { + "epoch": 13.100081366965012, + "grad_norm": 1.5075021658540595, + "learning_rate": 6.550294482331061e-07, + "loss": 0.875, + "step": 169050 + }, + { + "epoch": 13.10085629044132, + "grad_norm": 1.59720673218374, + "learning_rate": 6.550681959082456e-07, + "loss": 0.8766, + "step": 169060 + }, + { + "epoch": 13.101631213917626, + "grad_norm": 1.5695352438122647, + "learning_rate": 6.551069435833851e-07, + "loss": 0.8665, + "step": 169070 + }, + { + "epoch": 13.102406137393933, + "grad_norm": 1.5404961907101078, + "learning_rate": 6.551456912585245e-07, + "loss": 0.8766, + "step": 169080 + }, + { + "epoch": 13.10318106087024, + "grad_norm": 1.531228922127585, + "learning_rate": 6.551844389336641e-07, + "loss": 0.8821, + "step": 169090 + }, + { + "epoch": 13.103955984346547, + "grad_norm": 1.3748903149331662, + "learning_rate": 6.552231866088035e-07, + "loss": 0.8582, + "step": 169100 + }, + { + "epoch": 13.104730907822853, + "grad_norm": 1.48866629091618, + "learning_rate": 6.552619342839431e-07, + "loss": 0.8696, + "step": 169110 + }, + { + "epoch": 13.105505831299158, + "grad_norm": 1.4502060924341147, + "learning_rate": 6.553006819590825e-07, + "loss": 0.8677, + "step": 169120 + }, + { + "epoch": 13.106280754775465, + "grad_norm": 1.4686410502206055, + "learning_rate": 6.55339429634222e-07, + "loss": 0.871, + "step": 169130 + }, + { + "epoch": 13.107055678251772, + "grad_norm": 1.4971952962888109, + "learning_rate": 6.553781773093615e-07, + "loss": 0.8872, + "step": 169140 + }, + { + "epoch": 13.107830601728079, + "grad_norm": 1.500059654544282, + "learning_rate": 6.554169249845011e-07, + "loss": 0.8624, + "step": 169150 + }, + { + "epoch": 13.108605525204386, + "grad_norm": 1.5451789626746026, + "learning_rate": 6.554556726596405e-07, + "loss": 0.8866, + "step": 169160 + }, + { + "epoch": 13.109380448680692, + "grad_norm": 1.4797497019473593, + "learning_rate": 6.5549442033478e-07, + "loss": 0.859, + "step": 169170 + }, + { + "epoch": 13.110155372157, + "grad_norm": 1.4268799937115957, + "learning_rate": 6.555331680099194e-07, + "loss": 0.8866, + "step": 169180 + }, + { + "epoch": 13.110930295633306, + "grad_norm": 1.511710315651371, + "learning_rate": 6.55571915685059e-07, + "loss": 0.8739, + "step": 169190 + }, + { + "epoch": 13.111705219109613, + "grad_norm": 1.5834706620068808, + "learning_rate": 6.556106633601985e-07, + "loss": 0.8632, + "step": 169200 + }, + { + "epoch": 13.11248014258592, + "grad_norm": 1.5785769343902343, + "learning_rate": 6.55649411035338e-07, + "loss": 0.8596, + "step": 169210 + }, + { + "epoch": 13.113255066062226, + "grad_norm": 1.5164952339762852, + "learning_rate": 6.556881587104774e-07, + "loss": 0.8768, + "step": 169220 + }, + { + "epoch": 13.114029989538533, + "grad_norm": 1.5713078360779926, + "learning_rate": 6.557269063856169e-07, + "loss": 0.8681, + "step": 169230 + }, + { + "epoch": 13.11480491301484, + "grad_norm": 1.569565241156554, + "learning_rate": 6.557656540607564e-07, + "loss": 0.8719, + "step": 169240 + }, + { + "epoch": 13.115579836491147, + "grad_norm": 1.462463503219405, + "learning_rate": 6.55804401735896e-07, + "loss": 0.8777, + "step": 169250 + }, + { + "epoch": 13.116354759967454, + "grad_norm": 1.4967799247386042, + "learning_rate": 6.558431494110354e-07, + "loss": 0.8805, + "step": 169260 + }, + { + "epoch": 13.11712968344376, + "grad_norm": 1.4625451986178768, + "learning_rate": 6.558818970861749e-07, + "loss": 0.8721, + "step": 169270 + }, + { + "epoch": 13.117904606920067, + "grad_norm": 1.5309039650694496, + "learning_rate": 6.559206447613143e-07, + "loss": 0.8591, + "step": 169280 + }, + { + "epoch": 13.118679530396374, + "grad_norm": 1.5228459077374727, + "learning_rate": 6.55959392436454e-07, + "loss": 0.8786, + "step": 169290 + }, + { + "epoch": 13.119454453872681, + "grad_norm": 1.4645798988437662, + "learning_rate": 6.559981401115934e-07, + "loss": 0.8627, + "step": 169300 + }, + { + "epoch": 13.120229377348986, + "grad_norm": 1.5479217983112024, + "learning_rate": 6.560368877867329e-07, + "loss": 0.8702, + "step": 169310 + }, + { + "epoch": 13.121004300825293, + "grad_norm": 1.5444320707054529, + "learning_rate": 6.560756354618723e-07, + "loss": 0.8588, + "step": 169320 + }, + { + "epoch": 13.1217792243016, + "grad_norm": 1.6185470455182176, + "learning_rate": 6.561143831370118e-07, + "loss": 0.8845, + "step": 169330 + }, + { + "epoch": 13.122554147777906, + "grad_norm": 1.547796898842378, + "learning_rate": 6.561531308121514e-07, + "loss": 0.8797, + "step": 169340 + }, + { + "epoch": 13.123329071254213, + "grad_norm": 1.5702990937409596, + "learning_rate": 6.561918784872909e-07, + "loss": 0.8633, + "step": 169350 + }, + { + "epoch": 13.12410399473052, + "grad_norm": 1.669778284201472, + "learning_rate": 6.562306261624303e-07, + "loss": 0.8711, + "step": 169360 + }, + { + "epoch": 13.124878918206827, + "grad_norm": 1.4279338510614312, + "learning_rate": 6.562693738375698e-07, + "loss": 0.8758, + "step": 169370 + }, + { + "epoch": 13.125653841683134, + "grad_norm": 1.527162179537539, + "learning_rate": 6.563081215127092e-07, + "loss": 0.8632, + "step": 169380 + }, + { + "epoch": 13.12642876515944, + "grad_norm": 1.5421547127834911, + "learning_rate": 6.563468691878489e-07, + "loss": 0.8806, + "step": 169390 + }, + { + "epoch": 13.127203688635747, + "grad_norm": 1.5572839882517706, + "learning_rate": 6.563856168629883e-07, + "loss": 0.8718, + "step": 169400 + }, + { + "epoch": 13.127978612112054, + "grad_norm": 1.5176590109755994, + "learning_rate": 6.564243645381278e-07, + "loss": 0.8726, + "step": 169410 + }, + { + "epoch": 13.128753535588361, + "grad_norm": 1.5150196770037978, + "learning_rate": 6.564631122132672e-07, + "loss": 0.8679, + "step": 169420 + }, + { + "epoch": 13.129528459064668, + "grad_norm": 1.5182084602640387, + "learning_rate": 6.565018598884069e-07, + "loss": 0.8699, + "step": 169430 + }, + { + "epoch": 13.130303382540975, + "grad_norm": 1.562455982533792, + "learning_rate": 6.565406075635463e-07, + "loss": 0.8764, + "step": 169440 + }, + { + "epoch": 13.131078306017281, + "grad_norm": 1.4902396127152733, + "learning_rate": 6.565793552386858e-07, + "loss": 0.8787, + "step": 169450 + }, + { + "epoch": 13.131853229493588, + "grad_norm": 1.4715230067905278, + "learning_rate": 6.566181029138252e-07, + "loss": 0.8638, + "step": 169460 + }, + { + "epoch": 13.132628152969895, + "grad_norm": 1.4985933406949696, + "learning_rate": 6.566568505889647e-07, + "loss": 0.8714, + "step": 169470 + }, + { + "epoch": 13.133403076446202, + "grad_norm": 1.563964557616376, + "learning_rate": 6.566955982641041e-07, + "loss": 0.8835, + "step": 169480 + }, + { + "epoch": 13.134177999922507, + "grad_norm": 1.570986597085, + "learning_rate": 6.567343459392438e-07, + "loss": 0.8732, + "step": 169490 + }, + { + "epoch": 13.134952923398814, + "grad_norm": 1.563563426359972, + "learning_rate": 6.567730936143832e-07, + "loss": 0.8791, + "step": 169500 + }, + { + "epoch": 13.134952923398814, + "eval_loss": 0.8960757255554199, + "eval_runtime": 332.5336, + "eval_samples_per_second": 34.496, + "eval_steps_per_second": 8.625, + "step": 169500 + }, + { + "epoch": 13.13572784687512, + "grad_norm": 1.5273798611133422, + "learning_rate": 6.568118412895227e-07, + "loss": 0.8583, + "step": 169510 + }, + { + "epoch": 13.136502770351427, + "grad_norm": 1.5135762695947304, + "learning_rate": 6.568505889646621e-07, + "loss": 0.8651, + "step": 169520 + }, + { + "epoch": 13.137277693827734, + "grad_norm": 1.466223465380973, + "learning_rate": 6.568893366398018e-07, + "loss": 0.8662, + "step": 169530 + }, + { + "epoch": 13.13805261730404, + "grad_norm": 1.4521962148612402, + "learning_rate": 6.569280843149412e-07, + "loss": 0.8571, + "step": 169540 + }, + { + "epoch": 13.138827540780348, + "grad_norm": 1.5073356413429562, + "learning_rate": 6.569668319900807e-07, + "loss": 0.8957, + "step": 169550 + }, + { + "epoch": 13.139602464256654, + "grad_norm": 1.4745020698999618, + "learning_rate": 6.570055796652201e-07, + "loss": 0.8764, + "step": 169560 + }, + { + "epoch": 13.140377387732961, + "grad_norm": 1.4711305386279812, + "learning_rate": 6.570443273403597e-07, + "loss": 0.8668, + "step": 169570 + }, + { + "epoch": 13.141152311209268, + "grad_norm": 1.5097479485009782, + "learning_rate": 6.570830750154992e-07, + "loss": 0.877, + "step": 169580 + }, + { + "epoch": 13.141927234685575, + "grad_norm": 1.5307234340145675, + "learning_rate": 6.571218226906387e-07, + "loss": 0.8825, + "step": 169590 + }, + { + "epoch": 13.142702158161882, + "grad_norm": 1.5721763804156026, + "learning_rate": 6.571605703657781e-07, + "loss": 0.882, + "step": 169600 + }, + { + "epoch": 13.143477081638189, + "grad_norm": 1.5848207905586926, + "learning_rate": 6.571993180409176e-07, + "loss": 0.879, + "step": 169610 + }, + { + "epoch": 13.144252005114495, + "grad_norm": 1.49668162180509, + "learning_rate": 6.57238065716057e-07, + "loss": 0.8868, + "step": 169620 + }, + { + "epoch": 13.145026928590802, + "grad_norm": 1.453816040621795, + "learning_rate": 6.572768133911967e-07, + "loss": 0.8643, + "step": 169630 + }, + { + "epoch": 13.145801852067109, + "grad_norm": 1.464486542755134, + "learning_rate": 6.573155610663361e-07, + "loss": 0.8651, + "step": 169640 + }, + { + "epoch": 13.146576775543416, + "grad_norm": 1.4989471320932175, + "learning_rate": 6.573543087414756e-07, + "loss": 0.8687, + "step": 169650 + }, + { + "epoch": 13.147351699019723, + "grad_norm": 1.5851061763219187, + "learning_rate": 6.57393056416615e-07, + "loss": 0.86, + "step": 169660 + }, + { + "epoch": 13.14812662249603, + "grad_norm": 1.5800091551895448, + "learning_rate": 6.574318040917546e-07, + "loss": 0.8756, + "step": 169670 + }, + { + "epoch": 13.148901545972334, + "grad_norm": 1.4748474944520777, + "learning_rate": 6.574705517668941e-07, + "loss": 0.882, + "step": 169680 + }, + { + "epoch": 13.149676469448641, + "grad_norm": 1.5476130203492984, + "learning_rate": 6.575092994420336e-07, + "loss": 0.8523, + "step": 169690 + }, + { + "epoch": 13.150451392924948, + "grad_norm": 1.4869156623091677, + "learning_rate": 6.57548047117173e-07, + "loss": 0.8747, + "step": 169700 + }, + { + "epoch": 13.151226316401255, + "grad_norm": 1.5618511993339381, + "learning_rate": 6.575867947923126e-07, + "loss": 0.8785, + "step": 169710 + }, + { + "epoch": 13.152001239877562, + "grad_norm": 1.5533581757290609, + "learning_rate": 6.57625542467452e-07, + "loss": 0.8622, + "step": 169720 + }, + { + "epoch": 13.152776163353868, + "grad_norm": 1.5761324450635747, + "learning_rate": 6.576642901425916e-07, + "loss": 0.8741, + "step": 169730 + }, + { + "epoch": 13.153551086830175, + "grad_norm": 1.5059187765477433, + "learning_rate": 6.57703037817731e-07, + "loss": 0.8854, + "step": 169740 + }, + { + "epoch": 13.154326010306482, + "grad_norm": 1.5480174851667765, + "learning_rate": 6.577417854928705e-07, + "loss": 0.8551, + "step": 169750 + }, + { + "epoch": 13.155100933782789, + "grad_norm": 1.5525591466428244, + "learning_rate": 6.577805331680099e-07, + "loss": 0.8713, + "step": 169760 + }, + { + "epoch": 13.155875857259096, + "grad_norm": 1.5476227636210698, + "learning_rate": 6.578192808431495e-07, + "loss": 0.862, + "step": 169770 + }, + { + "epoch": 13.156650780735403, + "grad_norm": 1.5101731012903559, + "learning_rate": 6.57858028518289e-07, + "loss": 0.864, + "step": 169780 + }, + { + "epoch": 13.15742570421171, + "grad_norm": 1.5002947671705553, + "learning_rate": 6.578967761934285e-07, + "loss": 0.8756, + "step": 169790 + }, + { + "epoch": 13.158200627688016, + "grad_norm": 1.5661634871148746, + "learning_rate": 6.579355238685679e-07, + "loss": 0.8535, + "step": 169800 + }, + { + "epoch": 13.158975551164323, + "grad_norm": 1.5024215365815174, + "learning_rate": 6.579742715437075e-07, + "loss": 0.8681, + "step": 169810 + }, + { + "epoch": 13.15975047464063, + "grad_norm": 1.5266696162856728, + "learning_rate": 6.580130192188469e-07, + "loss": 0.8657, + "step": 169820 + }, + { + "epoch": 13.160525398116937, + "grad_norm": 1.5304688945084008, + "learning_rate": 6.580517668939865e-07, + "loss": 0.8713, + "step": 169830 + }, + { + "epoch": 13.161300321593243, + "grad_norm": 1.6341370338708205, + "learning_rate": 6.580905145691259e-07, + "loss": 0.8666, + "step": 169840 + }, + { + "epoch": 13.16207524506955, + "grad_norm": 1.6027679656363367, + "learning_rate": 6.581292622442655e-07, + "loss": 0.8725, + "step": 169850 + }, + { + "epoch": 13.162850168545855, + "grad_norm": 1.5061171642661015, + "learning_rate": 6.581680099194049e-07, + "loss": 0.8662, + "step": 169860 + }, + { + "epoch": 13.163625092022162, + "grad_norm": 1.4364431412979646, + "learning_rate": 6.582067575945444e-07, + "loss": 0.8816, + "step": 169870 + }, + { + "epoch": 13.164400015498469, + "grad_norm": 1.459750139868607, + "learning_rate": 6.582455052696839e-07, + "loss": 0.8753, + "step": 169880 + }, + { + "epoch": 13.165174938974776, + "grad_norm": 1.5045984840961015, + "learning_rate": 6.582842529448234e-07, + "loss": 0.8903, + "step": 169890 + }, + { + "epoch": 13.165949862451082, + "grad_norm": 1.4781277474530616, + "learning_rate": 6.583230006199628e-07, + "loss": 0.8851, + "step": 169900 + }, + { + "epoch": 13.16672478592739, + "grad_norm": 1.5178206328772825, + "learning_rate": 6.583617482951024e-07, + "loss": 0.8794, + "step": 169910 + }, + { + "epoch": 13.167499709403696, + "grad_norm": 1.5512439642062548, + "learning_rate": 6.584004959702418e-07, + "loss": 0.8855, + "step": 169920 + }, + { + "epoch": 13.168274632880003, + "grad_norm": 1.5716536573918694, + "learning_rate": 6.584392436453814e-07, + "loss": 0.8642, + "step": 169930 + }, + { + "epoch": 13.16904955635631, + "grad_norm": 1.4990788269260757, + "learning_rate": 6.584779913205208e-07, + "loss": 0.863, + "step": 169940 + }, + { + "epoch": 13.169824479832617, + "grad_norm": 1.5869734541384082, + "learning_rate": 6.585167389956604e-07, + "loss": 0.8626, + "step": 169950 + }, + { + "epoch": 13.170599403308923, + "grad_norm": 1.7274905885281404, + "learning_rate": 6.585554866707998e-07, + "loss": 0.9021, + "step": 169960 + }, + { + "epoch": 13.17137432678523, + "grad_norm": 1.4366819325459472, + "learning_rate": 6.585942343459393e-07, + "loss": 0.8664, + "step": 169970 + }, + { + "epoch": 13.172149250261537, + "grad_norm": 1.47252252142813, + "learning_rate": 6.586329820210788e-07, + "loss": 0.8569, + "step": 169980 + }, + { + "epoch": 13.172924173737844, + "grad_norm": 1.5126440242878474, + "learning_rate": 6.586717296962184e-07, + "loss": 0.884, + "step": 169990 + }, + { + "epoch": 13.17369909721415, + "grad_norm": 1.4953330244787837, + "learning_rate": 6.587104773713578e-07, + "loss": 0.8819, + "step": 170000 + }, + { + "epoch": 13.17369909721415, + "eval_loss": 0.8959144949913025, + "eval_runtime": 333.9868, + "eval_samples_per_second": 34.346, + "eval_steps_per_second": 8.587, + "step": 170000 + }, + { + "epoch": 13.174474020690457, + "grad_norm": 1.5191292851454348, + "learning_rate": 6.587492250464973e-07, + "loss": 0.8766, + "step": 170010 + }, + { + "epoch": 13.175248944166764, + "grad_norm": 1.5742735703049016, + "learning_rate": 6.587879727216367e-07, + "loss": 0.9071, + "step": 170020 + }, + { + "epoch": 13.176023867643071, + "grad_norm": 1.5181483335066397, + "learning_rate": 6.588267203967763e-07, + "loss": 0.8763, + "step": 170030 + }, + { + "epoch": 13.176798791119378, + "grad_norm": 1.5024773698191904, + "learning_rate": 6.588654680719157e-07, + "loss": 0.8932, + "step": 170040 + }, + { + "epoch": 13.177573714595683, + "grad_norm": 1.4612886147153743, + "learning_rate": 6.589042157470553e-07, + "loss": 0.8743, + "step": 170050 + }, + { + "epoch": 13.17834863807199, + "grad_norm": 1.565795796497968, + "learning_rate": 6.589429634221947e-07, + "loss": 0.8603, + "step": 170060 + }, + { + "epoch": 13.179123561548296, + "grad_norm": 1.516828768434961, + "learning_rate": 6.589817110973343e-07, + "loss": 0.8728, + "step": 170070 + }, + { + "epoch": 13.179898485024603, + "grad_norm": 1.4622860808282023, + "learning_rate": 6.590204587724737e-07, + "loss": 0.883, + "step": 170080 + }, + { + "epoch": 13.18067340850091, + "grad_norm": 1.4853896838742298, + "learning_rate": 6.590592064476133e-07, + "loss": 0.8631, + "step": 170090 + }, + { + "epoch": 13.181448331977217, + "grad_norm": 1.4611420612753672, + "learning_rate": 6.590979541227527e-07, + "loss": 0.868, + "step": 170100 + }, + { + "epoch": 13.182223255453524, + "grad_norm": 1.6015080343987027, + "learning_rate": 6.591367017978922e-07, + "loss": 0.8775, + "step": 170110 + }, + { + "epoch": 13.18299817892983, + "grad_norm": 1.4602986836316363, + "learning_rate": 6.591754494730316e-07, + "loss": 0.8727, + "step": 170120 + }, + { + "epoch": 13.183773102406137, + "grad_norm": 1.505090167438094, + "learning_rate": 6.592141971481713e-07, + "loss": 0.8691, + "step": 170130 + }, + { + "epoch": 13.184548025882444, + "grad_norm": 1.485093438893509, + "learning_rate": 6.592529448233107e-07, + "loss": 0.8762, + "step": 170140 + }, + { + "epoch": 13.185322949358751, + "grad_norm": 1.5142452028834597, + "learning_rate": 6.592916924984502e-07, + "loss": 0.8727, + "step": 170150 + }, + { + "epoch": 13.186097872835058, + "grad_norm": 1.5013389701594129, + "learning_rate": 6.593304401735896e-07, + "loss": 0.8693, + "step": 170160 + }, + { + "epoch": 13.186872796311365, + "grad_norm": 1.575233061533299, + "learning_rate": 6.593691878487292e-07, + "loss": 0.8864, + "step": 170170 + }, + { + "epoch": 13.187647719787671, + "grad_norm": 1.537710152729643, + "learning_rate": 6.594079355238686e-07, + "loss": 0.8809, + "step": 170180 + }, + { + "epoch": 13.188422643263978, + "grad_norm": 1.471866337310827, + "learning_rate": 6.594466831990082e-07, + "loss": 0.8807, + "step": 170190 + }, + { + "epoch": 13.189197566740285, + "grad_norm": 1.494977900046862, + "learning_rate": 6.594854308741476e-07, + "loss": 0.8678, + "step": 170200 + }, + { + "epoch": 13.189972490216592, + "grad_norm": 1.4730546811266092, + "learning_rate": 6.595241785492871e-07, + "loss": 0.8681, + "step": 170210 + }, + { + "epoch": 13.190747413692899, + "grad_norm": 1.4525398519235764, + "learning_rate": 6.595629262244266e-07, + "loss": 0.857, + "step": 170220 + }, + { + "epoch": 13.191522337169205, + "grad_norm": 1.468565548636303, + "learning_rate": 6.596016738995662e-07, + "loss": 0.8847, + "step": 170230 + }, + { + "epoch": 13.19229726064551, + "grad_norm": 1.4453111295046512, + "learning_rate": 6.596404215747056e-07, + "loss": 0.8972, + "step": 170240 + }, + { + "epoch": 13.193072184121817, + "grad_norm": 1.5054235154663183, + "learning_rate": 6.596791692498451e-07, + "loss": 0.8759, + "step": 170250 + }, + { + "epoch": 13.193847107598124, + "grad_norm": 1.6241665901079945, + "learning_rate": 6.597179169249845e-07, + "loss": 0.8753, + "step": 170260 + }, + { + "epoch": 13.19462203107443, + "grad_norm": 1.6305609635863534, + "learning_rate": 6.597566646001241e-07, + "loss": 0.8739, + "step": 170270 + }, + { + "epoch": 13.195396954550738, + "grad_norm": 1.487105606898277, + "learning_rate": 6.597954122752636e-07, + "loss": 0.8828, + "step": 170280 + }, + { + "epoch": 13.196171878027044, + "grad_norm": 1.572723444510806, + "learning_rate": 6.598341599504031e-07, + "loss": 0.8751, + "step": 170290 + }, + { + "epoch": 13.196946801503351, + "grad_norm": 1.6033173608169038, + "learning_rate": 6.598729076255425e-07, + "loss": 0.8619, + "step": 170300 + }, + { + "epoch": 13.197721724979658, + "grad_norm": 1.4870811464261782, + "learning_rate": 6.59911655300682e-07, + "loss": 0.8789, + "step": 170310 + }, + { + "epoch": 13.198496648455965, + "grad_norm": 1.465926119314365, + "learning_rate": 6.599504029758215e-07, + "loss": 0.8777, + "step": 170320 + }, + { + "epoch": 13.199271571932272, + "grad_norm": 1.515173073080441, + "learning_rate": 6.599891506509611e-07, + "loss": 0.8786, + "step": 170330 + }, + { + "epoch": 13.200046495408579, + "grad_norm": 1.582801691608556, + "learning_rate": 6.600278983261005e-07, + "loss": 0.8745, + "step": 170340 + }, + { + "epoch": 13.200821418884885, + "grad_norm": 1.654743555884616, + "learning_rate": 6.6006664600124e-07, + "loss": 0.8456, + "step": 170350 + }, + { + "epoch": 13.201596342361192, + "grad_norm": 1.4685217803422723, + "learning_rate": 6.601053936763794e-07, + "loss": 0.8746, + "step": 170360 + }, + { + "epoch": 13.202371265837499, + "grad_norm": 1.544661640929716, + "learning_rate": 6.601441413515191e-07, + "loss": 0.8721, + "step": 170370 + }, + { + "epoch": 13.203146189313806, + "grad_norm": 1.5373973366464833, + "learning_rate": 6.601828890266585e-07, + "loss": 0.8745, + "step": 170380 + }, + { + "epoch": 13.203921112790113, + "grad_norm": 1.5490437822331007, + "learning_rate": 6.60221636701798e-07, + "loss": 0.863, + "step": 170390 + }, + { + "epoch": 13.20469603626642, + "grad_norm": 1.4836837341258613, + "learning_rate": 6.602603843769374e-07, + "loss": 0.8532, + "step": 170400 + }, + { + "epoch": 13.205470959742726, + "grad_norm": 1.5469791133168294, + "learning_rate": 6.602991320520769e-07, + "loss": 0.897, + "step": 170410 + }, + { + "epoch": 13.206245883219031, + "grad_norm": 1.5810792161225575, + "learning_rate": 6.603378797272165e-07, + "loss": 0.8732, + "step": 170420 + }, + { + "epoch": 13.207020806695338, + "grad_norm": 1.5269632496305547, + "learning_rate": 6.60376627402356e-07, + "loss": 0.874, + "step": 170430 + }, + { + "epoch": 13.207795730171645, + "grad_norm": 1.481277621733459, + "learning_rate": 6.604153750774954e-07, + "loss": 0.8758, + "step": 170440 + }, + { + "epoch": 13.208570653647952, + "grad_norm": 1.5352273564951737, + "learning_rate": 6.604541227526349e-07, + "loss": 0.8859, + "step": 170450 + }, + { + "epoch": 13.209345577124258, + "grad_norm": 1.4700717379616006, + "learning_rate": 6.604928704277743e-07, + "loss": 0.8707, + "step": 170460 + }, + { + "epoch": 13.210120500600565, + "grad_norm": 1.5585296130605415, + "learning_rate": 6.60531618102914e-07, + "loss": 0.8605, + "step": 170470 + }, + { + "epoch": 13.210895424076872, + "grad_norm": 1.4557280742203786, + "learning_rate": 6.605703657780534e-07, + "loss": 0.8774, + "step": 170480 + }, + { + "epoch": 13.211670347553179, + "grad_norm": 1.513051158820271, + "learning_rate": 6.606091134531929e-07, + "loss": 0.8863, + "step": 170490 + }, + { + "epoch": 13.212445271029486, + "grad_norm": 1.5030836977706838, + "learning_rate": 6.606478611283323e-07, + "loss": 0.8657, + "step": 170500 + }, + { + "epoch": 13.212445271029486, + "eval_loss": 0.895550012588501, + "eval_runtime": 330.845, + "eval_samples_per_second": 34.672, + "eval_steps_per_second": 8.669, + "step": 170500 + }, + { + "epoch": 13.213220194505793, + "grad_norm": 1.6470437141268879, + "learning_rate": 6.60686608803472e-07, + "loss": 0.8779, + "step": 170510 + }, + { + "epoch": 13.2139951179821, + "grad_norm": 1.5058327004407286, + "learning_rate": 6.607253564786114e-07, + "loss": 0.8687, + "step": 170520 + }, + { + "epoch": 13.214770041458406, + "grad_norm": 1.4757366100471345, + "learning_rate": 6.607641041537509e-07, + "loss": 0.8841, + "step": 170530 + }, + { + "epoch": 13.215544964934713, + "grad_norm": 1.5034299592313374, + "learning_rate": 6.608028518288903e-07, + "loss": 0.8697, + "step": 170540 + }, + { + "epoch": 13.21631988841102, + "grad_norm": 1.5275509656493234, + "learning_rate": 6.608415995040298e-07, + "loss": 0.875, + "step": 170550 + }, + { + "epoch": 13.217094811887327, + "grad_norm": 1.5791342587749098, + "learning_rate": 6.608803471791693e-07, + "loss": 0.8855, + "step": 170560 + }, + { + "epoch": 13.217869735363633, + "grad_norm": 1.510355871028939, + "learning_rate": 6.609190948543089e-07, + "loss": 0.8721, + "step": 170570 + }, + { + "epoch": 13.21864465883994, + "grad_norm": 1.61994153925278, + "learning_rate": 6.609578425294483e-07, + "loss": 0.8759, + "step": 170580 + }, + { + "epoch": 13.219419582316247, + "grad_norm": 1.4559406215727495, + "learning_rate": 6.609965902045878e-07, + "loss": 0.8822, + "step": 170590 + }, + { + "epoch": 13.220194505792554, + "grad_norm": 1.4158196718865177, + "learning_rate": 6.610353378797272e-07, + "loss": 0.8645, + "step": 170600 + }, + { + "epoch": 13.220969429268859, + "grad_norm": 1.496314606074964, + "learning_rate": 6.610740855548669e-07, + "loss": 0.8605, + "step": 170610 + }, + { + "epoch": 13.221744352745166, + "grad_norm": 1.4988019530772605, + "learning_rate": 6.611128332300063e-07, + "loss": 0.8848, + "step": 170620 + }, + { + "epoch": 13.222519276221472, + "grad_norm": 1.4956421954883077, + "learning_rate": 6.611515809051458e-07, + "loss": 0.8629, + "step": 170630 + }, + { + "epoch": 13.22329419969778, + "grad_norm": 1.5347670942889469, + "learning_rate": 6.611903285802852e-07, + "loss": 0.8777, + "step": 170640 + }, + { + "epoch": 13.224069123174086, + "grad_norm": 1.5061861067304556, + "learning_rate": 6.612290762554248e-07, + "loss": 0.8797, + "step": 170650 + }, + { + "epoch": 13.224844046650393, + "grad_norm": 1.4975110831847311, + "learning_rate": 6.612678239305642e-07, + "loss": 0.8702, + "step": 170660 + }, + { + "epoch": 13.2256189701267, + "grad_norm": 1.5327117921925661, + "learning_rate": 6.613065716057038e-07, + "loss": 0.876, + "step": 170670 + }, + { + "epoch": 13.226393893603007, + "grad_norm": 1.4575232677694538, + "learning_rate": 6.613453192808432e-07, + "loss": 0.8835, + "step": 170680 + }, + { + "epoch": 13.227168817079313, + "grad_norm": 1.4845087643664845, + "learning_rate": 6.613840669559827e-07, + "loss": 0.869, + "step": 170690 + }, + { + "epoch": 13.22794374055562, + "grad_norm": 1.5468157249949415, + "learning_rate": 6.614228146311222e-07, + "loss": 0.8686, + "step": 170700 + }, + { + "epoch": 13.228718664031927, + "grad_norm": 1.383875186615768, + "learning_rate": 6.614615623062618e-07, + "loss": 0.874, + "step": 170710 + }, + { + "epoch": 13.229493587508234, + "grad_norm": 1.4519966887462006, + "learning_rate": 6.615003099814012e-07, + "loss": 0.8819, + "step": 170720 + }, + { + "epoch": 13.23026851098454, + "grad_norm": 1.529003350847106, + "learning_rate": 6.615390576565407e-07, + "loss": 0.8706, + "step": 170730 + }, + { + "epoch": 13.231043434460847, + "grad_norm": 1.6096152066533829, + "learning_rate": 6.615778053316801e-07, + "loss": 0.8923, + "step": 170740 + }, + { + "epoch": 13.231818357937154, + "grad_norm": 1.5013884536815658, + "learning_rate": 6.616165530068197e-07, + "loss": 0.8678, + "step": 170750 + }, + { + "epoch": 13.232593281413461, + "grad_norm": 1.5253262459848396, + "learning_rate": 6.616553006819592e-07, + "loss": 0.8767, + "step": 170760 + }, + { + "epoch": 13.233368204889768, + "grad_norm": 1.4870606920739162, + "learning_rate": 6.616940483570987e-07, + "loss": 0.8607, + "step": 170770 + }, + { + "epoch": 13.234143128366075, + "grad_norm": 1.556445541950552, + "learning_rate": 6.617327960322381e-07, + "loss": 0.8803, + "step": 170780 + }, + { + "epoch": 13.234918051842381, + "grad_norm": 1.5255511928485064, + "learning_rate": 6.617715437073777e-07, + "loss": 0.8767, + "step": 170790 + }, + { + "epoch": 13.235692975318686, + "grad_norm": 1.5255844119773634, + "learning_rate": 6.618102913825171e-07, + "loss": 0.8638, + "step": 170800 + }, + { + "epoch": 13.236467898794993, + "grad_norm": 1.6215380618475752, + "learning_rate": 6.618490390576567e-07, + "loss": 0.9187, + "step": 170810 + }, + { + "epoch": 13.2372428222713, + "grad_norm": 1.5320604725381133, + "learning_rate": 6.618877867327961e-07, + "loss": 0.8644, + "step": 170820 + }, + { + "epoch": 13.238017745747607, + "grad_norm": 1.5779685574936013, + "learning_rate": 6.619265344079356e-07, + "loss": 0.8634, + "step": 170830 + }, + { + "epoch": 13.238792669223914, + "grad_norm": 1.4472918747623722, + "learning_rate": 6.61965282083075e-07, + "loss": 0.8622, + "step": 170840 + }, + { + "epoch": 13.23956759270022, + "grad_norm": 1.4542555712069536, + "learning_rate": 6.620040297582146e-07, + "loss": 0.8556, + "step": 170850 + }, + { + "epoch": 13.240342516176527, + "grad_norm": 1.5047726887897863, + "learning_rate": 6.62042777433354e-07, + "loss": 0.8831, + "step": 170860 + }, + { + "epoch": 13.241117439652834, + "grad_norm": 1.5516982682962752, + "learning_rate": 6.620815251084936e-07, + "loss": 0.8645, + "step": 170870 + }, + { + "epoch": 13.241892363129141, + "grad_norm": 1.5772433906658065, + "learning_rate": 6.62120272783633e-07, + "loss": 0.8511, + "step": 170880 + }, + { + "epoch": 13.242667286605448, + "grad_norm": 1.4344768422680523, + "learning_rate": 6.621590204587726e-07, + "loss": 0.867, + "step": 170890 + }, + { + "epoch": 13.243442210081755, + "grad_norm": 1.6051871995332103, + "learning_rate": 6.62197768133912e-07, + "loss": 0.8599, + "step": 170900 + }, + { + "epoch": 13.244217133558061, + "grad_norm": 1.5415490610238491, + "learning_rate": 6.622365158090516e-07, + "loss": 0.8945, + "step": 170910 + }, + { + "epoch": 13.244992057034368, + "grad_norm": 1.5077191035443553, + "learning_rate": 6.62275263484191e-07, + "loss": 0.8845, + "step": 170920 + }, + { + "epoch": 13.245766980510675, + "grad_norm": 1.539178468919903, + "learning_rate": 6.623140111593306e-07, + "loss": 0.8798, + "step": 170930 + }, + { + "epoch": 13.246541903986982, + "grad_norm": 1.5144808082101522, + "learning_rate": 6.6235275883447e-07, + "loss": 0.8928, + "step": 170940 + }, + { + "epoch": 13.247316827463289, + "grad_norm": 1.617175937826386, + "learning_rate": 6.623915065096095e-07, + "loss": 0.8728, + "step": 170950 + }, + { + "epoch": 13.248091750939595, + "grad_norm": 1.555590757999876, + "learning_rate": 6.62430254184749e-07, + "loss": 0.869, + "step": 170960 + }, + { + "epoch": 13.248866674415902, + "grad_norm": 1.550584304420275, + "learning_rate": 6.624690018598885e-07, + "loss": 0.873, + "step": 170970 + }, + { + "epoch": 13.249641597892207, + "grad_norm": 1.5476016611536603, + "learning_rate": 6.625077495350279e-07, + "loss": 0.8759, + "step": 170980 + }, + { + "epoch": 13.250416521368514, + "grad_norm": 1.4562312851324692, + "learning_rate": 6.625464972101675e-07, + "loss": 0.8645, + "step": 170990 + }, + { + "epoch": 13.251191444844821, + "grad_norm": 1.511881807287012, + "learning_rate": 6.625852448853069e-07, + "loss": 0.8706, + "step": 171000 + }, + { + "epoch": 13.251191444844821, + "eval_loss": 0.8954482078552246, + "eval_runtime": 331.2962, + "eval_samples_per_second": 34.625, + "eval_steps_per_second": 8.657, + "step": 171000 + }, + { + "epoch": 13.251966368321128, + "grad_norm": 1.5606136236404844, + "learning_rate": 6.626239925604465e-07, + "loss": 0.877, + "step": 171010 + }, + { + "epoch": 13.252741291797435, + "grad_norm": 1.5251378010096939, + "learning_rate": 6.626627402355859e-07, + "loss": 0.8573, + "step": 171020 + }, + { + "epoch": 13.253516215273741, + "grad_norm": 1.4948240054654225, + "learning_rate": 6.627014879107255e-07, + "loss": 0.8611, + "step": 171030 + }, + { + "epoch": 13.254291138750048, + "grad_norm": 1.501079630679958, + "learning_rate": 6.627402355858649e-07, + "loss": 0.8577, + "step": 171040 + }, + { + "epoch": 13.255066062226355, + "grad_norm": 1.484697676052319, + "learning_rate": 6.627789832610044e-07, + "loss": 0.8663, + "step": 171050 + }, + { + "epoch": 13.255840985702662, + "grad_norm": 1.5443894371795206, + "learning_rate": 6.628177309361439e-07, + "loss": 0.873, + "step": 171060 + }, + { + "epoch": 13.256615909178969, + "grad_norm": 1.5289234530623588, + "learning_rate": 6.628564786112835e-07, + "loss": 0.8705, + "step": 171070 + }, + { + "epoch": 13.257390832655275, + "grad_norm": 1.4967332374990654, + "learning_rate": 6.628952262864229e-07, + "loss": 0.8541, + "step": 171080 + }, + { + "epoch": 13.258165756131582, + "grad_norm": 1.4646300888624304, + "learning_rate": 6.629339739615624e-07, + "loss": 0.88, + "step": 171090 + }, + { + "epoch": 13.258940679607889, + "grad_norm": 1.4593380363680242, + "learning_rate": 6.629727216367018e-07, + "loss": 0.8776, + "step": 171100 + }, + { + "epoch": 13.259715603084196, + "grad_norm": 1.5014223388111116, + "learning_rate": 6.630114693118414e-07, + "loss": 0.8754, + "step": 171110 + }, + { + "epoch": 13.260490526560503, + "grad_norm": 1.506511719433509, + "learning_rate": 6.630502169869808e-07, + "loss": 0.8701, + "step": 171120 + }, + { + "epoch": 13.26126545003681, + "grad_norm": 1.5100665494481142, + "learning_rate": 6.630889646621204e-07, + "loss": 0.8612, + "step": 171130 + }, + { + "epoch": 13.262040373513116, + "grad_norm": 1.6652248646194963, + "learning_rate": 6.631277123372598e-07, + "loss": 0.8682, + "step": 171140 + }, + { + "epoch": 13.262815296989423, + "grad_norm": 1.5268374228828865, + "learning_rate": 6.631664600123993e-07, + "loss": 0.8808, + "step": 171150 + }, + { + "epoch": 13.26359022046573, + "grad_norm": 1.4309195304000395, + "learning_rate": 6.632052076875388e-07, + "loss": 0.8653, + "step": 171160 + }, + { + "epoch": 13.264365143942035, + "grad_norm": 1.5467388627608023, + "learning_rate": 6.632439553626784e-07, + "loss": 0.8658, + "step": 171170 + }, + { + "epoch": 13.265140067418342, + "grad_norm": 1.5806750784349064, + "learning_rate": 6.632827030378178e-07, + "loss": 0.8515, + "step": 171180 + }, + { + "epoch": 13.265914990894649, + "grad_norm": 1.5651477028013534, + "learning_rate": 6.633214507129573e-07, + "loss": 0.8673, + "step": 171190 + }, + { + "epoch": 13.266689914370955, + "grad_norm": 1.5312044970776735, + "learning_rate": 6.633601983880967e-07, + "loss": 0.8698, + "step": 171200 + }, + { + "epoch": 13.267464837847262, + "grad_norm": 1.5701479016724909, + "learning_rate": 6.633989460632364e-07, + "loss": 0.8741, + "step": 171210 + }, + { + "epoch": 13.268239761323569, + "grad_norm": 1.5810639256780519, + "learning_rate": 6.634376937383758e-07, + "loss": 0.8507, + "step": 171220 + }, + { + "epoch": 13.269014684799876, + "grad_norm": 1.5838279029359414, + "learning_rate": 6.634764414135153e-07, + "loss": 0.8723, + "step": 171230 + }, + { + "epoch": 13.269789608276183, + "grad_norm": 1.5018053507677245, + "learning_rate": 6.635151890886547e-07, + "loss": 0.8744, + "step": 171240 + }, + { + "epoch": 13.27056453175249, + "grad_norm": 1.535889456827299, + "learning_rate": 6.635539367637942e-07, + "loss": 0.8664, + "step": 171250 + }, + { + "epoch": 13.271339455228796, + "grad_norm": 1.488876572538204, + "learning_rate": 6.635926844389337e-07, + "loss": 0.8819, + "step": 171260 + }, + { + "epoch": 13.272114378705103, + "grad_norm": 1.4608030410664203, + "learning_rate": 6.636314321140733e-07, + "loss": 0.8772, + "step": 171270 + }, + { + "epoch": 13.27288930218141, + "grad_norm": 1.4351965988962798, + "learning_rate": 6.636701797892127e-07, + "loss": 0.8724, + "step": 171280 + }, + { + "epoch": 13.273664225657717, + "grad_norm": 1.477354146022325, + "learning_rate": 6.637089274643522e-07, + "loss": 0.8771, + "step": 171290 + }, + { + "epoch": 13.274439149134023, + "grad_norm": 1.4867319021745957, + "learning_rate": 6.637476751394916e-07, + "loss": 0.8881, + "step": 171300 + }, + { + "epoch": 13.27521407261033, + "grad_norm": 1.508048722410531, + "learning_rate": 6.637864228146313e-07, + "loss": 0.8958, + "step": 171310 + }, + { + "epoch": 13.275988996086637, + "grad_norm": 1.6217294380221605, + "learning_rate": 6.638251704897707e-07, + "loss": 0.8704, + "step": 171320 + }, + { + "epoch": 13.276763919562944, + "grad_norm": 1.6820459814663997, + "learning_rate": 6.638639181649102e-07, + "loss": 0.8656, + "step": 171330 + }, + { + "epoch": 13.27753884303925, + "grad_norm": 1.5040004218131289, + "learning_rate": 6.639026658400496e-07, + "loss": 0.8823, + "step": 171340 + }, + { + "epoch": 13.278313766515556, + "grad_norm": 1.514124921592134, + "learning_rate": 6.639414135151893e-07, + "loss": 0.8593, + "step": 171350 + }, + { + "epoch": 13.279088689991863, + "grad_norm": 1.5787183789948616, + "learning_rate": 6.639801611903287e-07, + "loss": 0.879, + "step": 171360 + }, + { + "epoch": 13.27986361346817, + "grad_norm": 1.537426855661328, + "learning_rate": 6.640189088654682e-07, + "loss": 0.888, + "step": 171370 + }, + { + "epoch": 13.280638536944476, + "grad_norm": 1.4561199504041398, + "learning_rate": 6.640576565406076e-07, + "loss": 0.8652, + "step": 171380 + }, + { + "epoch": 13.281413460420783, + "grad_norm": 1.5367242749555714, + "learning_rate": 6.640964042157471e-07, + "loss": 0.8777, + "step": 171390 + }, + { + "epoch": 13.28218838389709, + "grad_norm": 1.612785014554225, + "learning_rate": 6.641351518908865e-07, + "loss": 0.8699, + "step": 171400 + }, + { + "epoch": 13.282963307373397, + "grad_norm": 1.5042051886490693, + "learning_rate": 6.641738995660262e-07, + "loss": 0.8629, + "step": 171410 + }, + { + "epoch": 13.283738230849703, + "grad_norm": 1.607402766949793, + "learning_rate": 6.642126472411656e-07, + "loss": 0.868, + "step": 171420 + }, + { + "epoch": 13.28451315432601, + "grad_norm": 1.403836692476543, + "learning_rate": 6.642513949163051e-07, + "loss": 0.8724, + "step": 171430 + }, + { + "epoch": 13.285288077802317, + "grad_norm": 1.5479409309200471, + "learning_rate": 6.642901425914445e-07, + "loss": 0.8874, + "step": 171440 + }, + { + "epoch": 13.286063001278624, + "grad_norm": 1.5820185404898959, + "learning_rate": 6.643288902665842e-07, + "loss": 0.8652, + "step": 171450 + }, + { + "epoch": 13.28683792475493, + "grad_norm": 1.4576791239579407, + "learning_rate": 6.643676379417236e-07, + "loss": 0.8886, + "step": 171460 + }, + { + "epoch": 13.287612848231237, + "grad_norm": 1.5651358850885169, + "learning_rate": 6.644063856168631e-07, + "loss": 0.8552, + "step": 171470 + }, + { + "epoch": 13.288387771707544, + "grad_norm": 1.5334325283409824, + "learning_rate": 6.644451332920025e-07, + "loss": 0.8612, + "step": 171480 + }, + { + "epoch": 13.289162695183851, + "grad_norm": 1.5153450090365428, + "learning_rate": 6.644838809671421e-07, + "loss": 0.8472, + "step": 171490 + }, + { + "epoch": 13.289937618660158, + "grad_norm": 1.5778544393075848, + "learning_rate": 6.645226286422816e-07, + "loss": 0.8783, + "step": 171500 + }, + { + "epoch": 13.289937618660158, + "eval_loss": 0.8955594897270203, + "eval_runtime": 330.5217, + "eval_samples_per_second": 34.706, + "eval_steps_per_second": 8.677, + "step": 171500 + }, + { + "epoch": 13.290712542136465, + "grad_norm": 1.4589879465380473, + "learning_rate": 6.645613763174211e-07, + "loss": 0.8649, + "step": 171510 + }, + { + "epoch": 13.291487465612772, + "grad_norm": 1.5313789817228756, + "learning_rate": 6.646001239925605e-07, + "loss": 0.8914, + "step": 171520 + }, + { + "epoch": 13.292262389089078, + "grad_norm": 1.4449491499850424, + "learning_rate": 6.646388716677e-07, + "loss": 0.869, + "step": 171530 + }, + { + "epoch": 13.293037312565383, + "grad_norm": 1.467548983921065, + "learning_rate": 6.646776193428394e-07, + "loss": 0.8747, + "step": 171540 + }, + { + "epoch": 13.29381223604169, + "grad_norm": 1.5930489593504706, + "learning_rate": 6.647163670179791e-07, + "loss": 0.9071, + "step": 171550 + }, + { + "epoch": 13.294587159517997, + "grad_norm": 1.5273330336777045, + "learning_rate": 6.647551146931185e-07, + "loss": 0.8957, + "step": 171560 + }, + { + "epoch": 13.295362082994304, + "grad_norm": 1.5370846439744774, + "learning_rate": 6.64793862368258e-07, + "loss": 0.876, + "step": 171570 + }, + { + "epoch": 13.29613700647061, + "grad_norm": 1.580707955611042, + "learning_rate": 6.648326100433974e-07, + "loss": 0.8711, + "step": 171580 + }, + { + "epoch": 13.296911929946917, + "grad_norm": 1.4760469367138593, + "learning_rate": 6.64871357718537e-07, + "loss": 0.8692, + "step": 171590 + }, + { + "epoch": 13.297686853423224, + "grad_norm": 1.5105597029849134, + "learning_rate": 6.649101053936765e-07, + "loss": 0.8641, + "step": 171600 + }, + { + "epoch": 13.298461776899531, + "grad_norm": 1.5939435746561605, + "learning_rate": 6.64948853068816e-07, + "loss": 0.8752, + "step": 171610 + }, + { + "epoch": 13.299236700375838, + "grad_norm": 1.5360481727046325, + "learning_rate": 6.649876007439554e-07, + "loss": 0.8762, + "step": 171620 + }, + { + "epoch": 13.300011623852145, + "grad_norm": 1.4992035981103724, + "learning_rate": 6.65026348419095e-07, + "loss": 0.8748, + "step": 171630 + }, + { + "epoch": 13.300786547328451, + "grad_norm": 1.548563294095182, + "learning_rate": 6.650650960942344e-07, + "loss": 0.8815, + "step": 171640 + }, + { + "epoch": 13.301561470804758, + "grad_norm": 1.487292917615847, + "learning_rate": 6.65103843769374e-07, + "loss": 0.8842, + "step": 171650 + }, + { + "epoch": 13.302336394281065, + "grad_norm": 1.4811209752428527, + "learning_rate": 6.651425914445134e-07, + "loss": 0.8653, + "step": 171660 + }, + { + "epoch": 13.303111317757372, + "grad_norm": 1.3772517465257752, + "learning_rate": 6.651813391196529e-07, + "loss": 0.8629, + "step": 171670 + }, + { + "epoch": 13.303886241233679, + "grad_norm": 1.4934577495973032, + "learning_rate": 6.652200867947923e-07, + "loss": 0.8858, + "step": 171680 + }, + { + "epoch": 13.304661164709985, + "grad_norm": 1.5766122235956985, + "learning_rate": 6.652588344699319e-07, + "loss": 0.866, + "step": 171690 + }, + { + "epoch": 13.305436088186292, + "grad_norm": 1.4811720748492618, + "learning_rate": 6.652975821450714e-07, + "loss": 0.8761, + "step": 171700 + }, + { + "epoch": 13.3062110116626, + "grad_norm": 1.4793827496011078, + "learning_rate": 6.653363298202109e-07, + "loss": 0.8744, + "step": 171710 + }, + { + "epoch": 13.306985935138904, + "grad_norm": 1.472928473646538, + "learning_rate": 6.653750774953503e-07, + "loss": 0.872, + "step": 171720 + }, + { + "epoch": 13.307760858615211, + "grad_norm": 1.569576421009273, + "learning_rate": 6.654138251704899e-07, + "loss": 0.8593, + "step": 171730 + }, + { + "epoch": 13.308535782091518, + "grad_norm": 1.5202172264257412, + "learning_rate": 6.654525728456293e-07, + "loss": 0.8653, + "step": 171740 + }, + { + "epoch": 13.309310705567825, + "grad_norm": 1.4379601398115422, + "learning_rate": 6.654913205207689e-07, + "loss": 0.8616, + "step": 171750 + }, + { + "epoch": 13.310085629044131, + "grad_norm": 1.4965766953910633, + "learning_rate": 6.655300681959083e-07, + "loss": 0.8827, + "step": 171760 + }, + { + "epoch": 13.310860552520438, + "grad_norm": 1.4751839156009792, + "learning_rate": 6.655688158710478e-07, + "loss": 0.8612, + "step": 171770 + }, + { + "epoch": 13.311635475996745, + "grad_norm": 1.4544582040504057, + "learning_rate": 6.656075635461873e-07, + "loss": 0.868, + "step": 171780 + }, + { + "epoch": 13.312410399473052, + "grad_norm": 1.4715094789414924, + "learning_rate": 6.656463112213268e-07, + "loss": 0.8848, + "step": 171790 + }, + { + "epoch": 13.313185322949359, + "grad_norm": 1.5428307459222803, + "learning_rate": 6.656850588964663e-07, + "loss": 0.881, + "step": 171800 + }, + { + "epoch": 13.313960246425665, + "grad_norm": 1.4995401905754022, + "learning_rate": 6.657238065716058e-07, + "loss": 0.8653, + "step": 171810 + }, + { + "epoch": 13.314735169901972, + "grad_norm": 1.4614926591110646, + "learning_rate": 6.657625542467452e-07, + "loss": 0.8903, + "step": 171820 + }, + { + "epoch": 13.315510093378279, + "grad_norm": 1.5284958830672168, + "learning_rate": 6.658013019218848e-07, + "loss": 0.8728, + "step": 171830 + }, + { + "epoch": 13.316285016854586, + "grad_norm": 1.5888600530951618, + "learning_rate": 6.658400495970242e-07, + "loss": 0.8519, + "step": 171840 + }, + { + "epoch": 13.317059940330893, + "grad_norm": 1.4540047389688158, + "learning_rate": 6.658787972721638e-07, + "loss": 0.8582, + "step": 171850 + }, + { + "epoch": 13.3178348638072, + "grad_norm": 1.4888833576618838, + "learning_rate": 6.659175449473032e-07, + "loss": 0.8857, + "step": 171860 + }, + { + "epoch": 13.318609787283506, + "grad_norm": 1.4499723137224296, + "learning_rate": 6.659562926224428e-07, + "loss": 0.8743, + "step": 171870 + }, + { + "epoch": 13.319384710759813, + "grad_norm": 1.5195692406880712, + "learning_rate": 6.659950402975822e-07, + "loss": 0.88, + "step": 171880 + }, + { + "epoch": 13.32015963423612, + "grad_norm": 1.5063322281406577, + "learning_rate": 6.660337879727217e-07, + "loss": 0.8713, + "step": 171890 + }, + { + "epoch": 13.320934557712427, + "grad_norm": 1.534047977357662, + "learning_rate": 6.660725356478612e-07, + "loss": 0.8717, + "step": 171900 + }, + { + "epoch": 13.321709481188732, + "grad_norm": 1.4987763981528899, + "learning_rate": 6.661112833230007e-07, + "loss": 0.8721, + "step": 171910 + }, + { + "epoch": 13.322484404665039, + "grad_norm": 1.5550207523444828, + "learning_rate": 6.661500309981402e-07, + "loss": 0.855, + "step": 171920 + }, + { + "epoch": 13.323259328141345, + "grad_norm": 1.512138360549271, + "learning_rate": 6.661887786732797e-07, + "loss": 0.8775, + "step": 171930 + }, + { + "epoch": 13.324034251617652, + "grad_norm": 1.541101007483704, + "learning_rate": 6.662275263484191e-07, + "loss": 0.8872, + "step": 171940 + }, + { + "epoch": 13.324809175093959, + "grad_norm": 1.538976663395008, + "learning_rate": 6.662662740235587e-07, + "loss": 0.8761, + "step": 171950 + }, + { + "epoch": 13.325584098570266, + "grad_norm": 1.5289690219399246, + "learning_rate": 6.663050216986981e-07, + "loss": 0.872, + "step": 171960 + }, + { + "epoch": 13.326359022046573, + "grad_norm": 1.6585253951689027, + "learning_rate": 6.663437693738377e-07, + "loss": 0.8605, + "step": 171970 + }, + { + "epoch": 13.32713394552288, + "grad_norm": 1.530745596075152, + "learning_rate": 6.663825170489771e-07, + "loss": 0.8654, + "step": 171980 + }, + { + "epoch": 13.327908868999186, + "grad_norm": 1.4952999180509672, + "learning_rate": 6.664212647241166e-07, + "loss": 0.852, + "step": 171990 + }, + { + "epoch": 13.328683792475493, + "grad_norm": 1.5797629998972207, + "learning_rate": 6.664600123992561e-07, + "loss": 0.8894, + "step": 172000 + }, + { + "epoch": 13.328683792475493, + "eval_loss": 0.8954764604568481, + "eval_runtime": 331.0461, + "eval_samples_per_second": 34.651, + "eval_steps_per_second": 8.663, + "step": 172000 + }, + { + "epoch": 13.3294587159518, + "grad_norm": 1.4852839178830064, + "learning_rate": 6.664987600743957e-07, + "loss": 0.8663, + "step": 172010 + }, + { + "epoch": 13.330233639428107, + "grad_norm": 1.5219962675876624, + "learning_rate": 6.665375077495351e-07, + "loss": 0.8822, + "step": 172020 + }, + { + "epoch": 13.331008562904413, + "grad_norm": 1.520151180138857, + "learning_rate": 6.665762554246746e-07, + "loss": 0.8686, + "step": 172030 + }, + { + "epoch": 13.33178348638072, + "grad_norm": 1.48903279117242, + "learning_rate": 6.66615003099814e-07, + "loss": 0.8902, + "step": 172040 + }, + { + "epoch": 13.332558409857027, + "grad_norm": 1.4936754474747567, + "learning_rate": 6.666537507749536e-07, + "loss": 0.869, + "step": 172050 + }, + { + "epoch": 13.333333333333334, + "grad_norm": 1.5426035816418509, + "learning_rate": 6.666924984500931e-07, + "loss": 0.8628, + "step": 172060 + }, + { + "epoch": 13.33410825680964, + "grad_norm": 1.5282374972417005, + "learning_rate": 6.667312461252326e-07, + "loss": 0.8864, + "step": 172070 + }, + { + "epoch": 13.334883180285948, + "grad_norm": 1.4890829777112908, + "learning_rate": 6.66769993800372e-07, + "loss": 0.8958, + "step": 172080 + }, + { + "epoch": 13.335658103762253, + "grad_norm": 1.6357783271975197, + "learning_rate": 6.668087414755116e-07, + "loss": 0.8858, + "step": 172090 + }, + { + "epoch": 13.33643302723856, + "grad_norm": 1.6041823331855847, + "learning_rate": 6.66847489150651e-07, + "loss": 0.8865, + "step": 172100 + }, + { + "epoch": 13.337207950714866, + "grad_norm": 1.535935692631142, + "learning_rate": 6.668862368257906e-07, + "loss": 0.8819, + "step": 172110 + }, + { + "epoch": 13.337982874191173, + "grad_norm": 1.5069967375672315, + "learning_rate": 6.6692498450093e-07, + "loss": 0.9004, + "step": 172120 + }, + { + "epoch": 13.33875779766748, + "grad_norm": 1.6067679262700174, + "learning_rate": 6.669637321760695e-07, + "loss": 0.8689, + "step": 172130 + }, + { + "epoch": 13.339532721143787, + "grad_norm": 1.4349241817927167, + "learning_rate": 6.67002479851209e-07, + "loss": 0.8706, + "step": 172140 + }, + { + "epoch": 13.340307644620093, + "grad_norm": 1.5382490863577056, + "learning_rate": 6.670412275263486e-07, + "loss": 0.862, + "step": 172150 + }, + { + "epoch": 13.3410825680964, + "grad_norm": 1.4740069539998841, + "learning_rate": 6.67079975201488e-07, + "loss": 0.8675, + "step": 172160 + }, + { + "epoch": 13.341857491572707, + "grad_norm": 1.5504011577877506, + "learning_rate": 6.671187228766275e-07, + "loss": 0.8699, + "step": 172170 + }, + { + "epoch": 13.342632415049014, + "grad_norm": 1.4184364769103848, + "learning_rate": 6.671574705517669e-07, + "loss": 0.8575, + "step": 172180 + }, + { + "epoch": 13.34340733852532, + "grad_norm": 1.4604749221119495, + "learning_rate": 6.671962182269065e-07, + "loss": 0.8803, + "step": 172190 + }, + { + "epoch": 13.344182262001627, + "grad_norm": 1.5174103657504898, + "learning_rate": 6.67234965902046e-07, + "loss": 0.863, + "step": 172200 + }, + { + "epoch": 13.344957185477934, + "grad_norm": 1.5423883357377877, + "learning_rate": 6.672737135771855e-07, + "loss": 0.8613, + "step": 172210 + }, + { + "epoch": 13.345732108954241, + "grad_norm": 1.5273902749488526, + "learning_rate": 6.673124612523249e-07, + "loss": 0.8719, + "step": 172220 + }, + { + "epoch": 13.346507032430548, + "grad_norm": 1.4571482346074853, + "learning_rate": 6.673512089274644e-07, + "loss": 0.8699, + "step": 172230 + }, + { + "epoch": 13.347281955906855, + "grad_norm": 1.379983903073707, + "learning_rate": 6.673899566026039e-07, + "loss": 0.8771, + "step": 172240 + }, + { + "epoch": 13.348056879383162, + "grad_norm": 1.5925317628931763, + "learning_rate": 6.674287042777435e-07, + "loss": 0.8716, + "step": 172250 + }, + { + "epoch": 13.348831802859468, + "grad_norm": 1.4659063900575682, + "learning_rate": 6.674674519528829e-07, + "loss": 0.8744, + "step": 172260 + }, + { + "epoch": 13.349606726335775, + "grad_norm": 1.5911408278997592, + "learning_rate": 6.675061996280224e-07, + "loss": 0.8826, + "step": 172270 + }, + { + "epoch": 13.35038164981208, + "grad_norm": 1.5240873885541264, + "learning_rate": 6.675449473031618e-07, + "loss": 0.8678, + "step": 172280 + }, + { + "epoch": 13.351156573288387, + "grad_norm": 1.5453442665842967, + "learning_rate": 6.675836949783015e-07, + "loss": 0.8726, + "step": 172290 + }, + { + "epoch": 13.351931496764694, + "grad_norm": 1.510079669729456, + "learning_rate": 6.676224426534409e-07, + "loss": 0.889, + "step": 172300 + }, + { + "epoch": 13.352706420241, + "grad_norm": 1.5845866742164625, + "learning_rate": 6.676611903285804e-07, + "loss": 0.8887, + "step": 172310 + }, + { + "epoch": 13.353481343717307, + "grad_norm": 1.5809716520399366, + "learning_rate": 6.676999380037198e-07, + "loss": 0.8711, + "step": 172320 + }, + { + "epoch": 13.354256267193614, + "grad_norm": 1.5272635646944932, + "learning_rate": 6.677386856788593e-07, + "loss": 0.8758, + "step": 172330 + }, + { + "epoch": 13.355031190669921, + "grad_norm": 1.4741397146375284, + "learning_rate": 6.677774333539988e-07, + "loss": 0.8711, + "step": 172340 + }, + { + "epoch": 13.355806114146228, + "grad_norm": 1.581014549292977, + "learning_rate": 6.678161810291384e-07, + "loss": 0.8683, + "step": 172350 + }, + { + "epoch": 13.356581037622535, + "grad_norm": 1.54311606769741, + "learning_rate": 6.678549287042778e-07, + "loss": 0.8644, + "step": 172360 + }, + { + "epoch": 13.357355961098841, + "grad_norm": 1.4648068899344653, + "learning_rate": 6.678936763794173e-07, + "loss": 0.8753, + "step": 172370 + }, + { + "epoch": 13.358130884575148, + "grad_norm": 1.526244512624004, + "learning_rate": 6.679324240545567e-07, + "loss": 0.862, + "step": 172380 + }, + { + "epoch": 13.358905808051455, + "grad_norm": 1.531281140587342, + "learning_rate": 6.679711717296964e-07, + "loss": 0.8762, + "step": 172390 + }, + { + "epoch": 13.359680731527762, + "grad_norm": 1.5150897568910595, + "learning_rate": 6.680099194048358e-07, + "loss": 0.8705, + "step": 172400 + }, + { + "epoch": 13.360455655004069, + "grad_norm": 1.4653452456631424, + "learning_rate": 6.680486670799753e-07, + "loss": 0.8843, + "step": 172410 + }, + { + "epoch": 13.361230578480376, + "grad_norm": 1.5821322926036088, + "learning_rate": 6.680874147551147e-07, + "loss": 0.8748, + "step": 172420 + }, + { + "epoch": 13.362005501956682, + "grad_norm": 1.5054697318403993, + "learning_rate": 6.681261624302543e-07, + "loss": 0.8881, + "step": 172430 + }, + { + "epoch": 13.36278042543299, + "grad_norm": 1.429615935844756, + "learning_rate": 6.681649101053938e-07, + "loss": 0.8666, + "step": 172440 + }, + { + "epoch": 13.363555348909296, + "grad_norm": 1.554551791366923, + "learning_rate": 6.682036577805333e-07, + "loss": 0.8621, + "step": 172450 + }, + { + "epoch": 13.364330272385601, + "grad_norm": 1.507491721756449, + "learning_rate": 6.682424054556727e-07, + "loss": 0.8834, + "step": 172460 + }, + { + "epoch": 13.365105195861908, + "grad_norm": 1.5587213069696766, + "learning_rate": 6.682811531308122e-07, + "loss": 0.8709, + "step": 172470 + }, + { + "epoch": 13.365880119338215, + "grad_norm": 1.7084161146265395, + "learning_rate": 6.683199008059516e-07, + "loss": 0.9044, + "step": 172480 + }, + { + "epoch": 13.366655042814521, + "grad_norm": 1.586376901174847, + "learning_rate": 6.683586484810913e-07, + "loss": 0.8913, + "step": 172490 + }, + { + "epoch": 13.367429966290828, + "grad_norm": 1.5401540775578348, + "learning_rate": 6.683973961562307e-07, + "loss": 0.874, + "step": 172500 + }, + { + "epoch": 13.367429966290828, + "eval_loss": 0.8951730132102966, + "eval_runtime": 331.3498, + "eval_samples_per_second": 34.619, + "eval_steps_per_second": 8.656, + "step": 172500 + }, + { + "epoch": 13.368204889767135, + "grad_norm": 1.5027430687458307, + "learning_rate": 6.684361438313702e-07, + "loss": 0.8692, + "step": 172510 + }, + { + "epoch": 13.368979813243442, + "grad_norm": 1.5907169330557667, + "learning_rate": 6.684748915065096e-07, + "loss": 0.887, + "step": 172520 + }, + { + "epoch": 13.369754736719749, + "grad_norm": 1.4822687359715179, + "learning_rate": 6.685136391816492e-07, + "loss": 0.8716, + "step": 172530 + }, + { + "epoch": 13.370529660196055, + "grad_norm": 1.4417993685666086, + "learning_rate": 6.685523868567887e-07, + "loss": 0.8661, + "step": 172540 + }, + { + "epoch": 13.371304583672362, + "grad_norm": 1.5216362470365508, + "learning_rate": 6.685911345319282e-07, + "loss": 0.8682, + "step": 172550 + }, + { + "epoch": 13.372079507148669, + "grad_norm": 1.6296586957202277, + "learning_rate": 6.686298822070676e-07, + "loss": 0.8889, + "step": 172560 + }, + { + "epoch": 13.372854430624976, + "grad_norm": 1.5019014349657742, + "learning_rate": 6.686686298822072e-07, + "loss": 0.8679, + "step": 172570 + }, + { + "epoch": 13.373629354101283, + "grad_norm": 1.4432275622789994, + "learning_rate": 6.687073775573466e-07, + "loss": 0.8644, + "step": 172580 + }, + { + "epoch": 13.37440427757759, + "grad_norm": 1.4782825112707396, + "learning_rate": 6.687461252324862e-07, + "loss": 0.8584, + "step": 172590 + }, + { + "epoch": 13.375179201053896, + "grad_norm": 1.5320663956831668, + "learning_rate": 6.687848729076256e-07, + "loss": 0.9012, + "step": 172600 + }, + { + "epoch": 13.375954124530203, + "grad_norm": 1.5288918903984006, + "learning_rate": 6.688236205827651e-07, + "loss": 0.8757, + "step": 172610 + }, + { + "epoch": 13.37672904800651, + "grad_norm": 1.5152779667677896, + "learning_rate": 6.688623682579045e-07, + "loss": 0.8551, + "step": 172620 + }, + { + "epoch": 13.377503971482817, + "grad_norm": 1.5349867042923908, + "learning_rate": 6.689011159330441e-07, + "loss": 0.8752, + "step": 172630 + }, + { + "epoch": 13.378278894959124, + "grad_norm": 1.5168944731113252, + "learning_rate": 6.689398636081836e-07, + "loss": 0.8817, + "step": 172640 + }, + { + "epoch": 13.37905381843543, + "grad_norm": 1.4945118780127133, + "learning_rate": 6.689786112833231e-07, + "loss": 0.8613, + "step": 172650 + }, + { + "epoch": 13.379828741911735, + "grad_norm": 1.4795057851555642, + "learning_rate": 6.690173589584625e-07, + "loss": 0.8878, + "step": 172660 + }, + { + "epoch": 13.380603665388042, + "grad_norm": 1.5429001622353145, + "learning_rate": 6.690561066336021e-07, + "loss": 0.8806, + "step": 172670 + }, + { + "epoch": 13.381378588864349, + "grad_norm": 1.5779229666146586, + "learning_rate": 6.690948543087415e-07, + "loss": 0.8895, + "step": 172680 + }, + { + "epoch": 13.382153512340656, + "grad_norm": 1.6129693428350618, + "learning_rate": 6.691336019838811e-07, + "loss": 0.8701, + "step": 172690 + }, + { + "epoch": 13.382928435816963, + "grad_norm": 1.5780570941576904, + "learning_rate": 6.691723496590205e-07, + "loss": 0.8945, + "step": 172700 + }, + { + "epoch": 13.38370335929327, + "grad_norm": 1.6819210121196677, + "learning_rate": 6.692110973341601e-07, + "loss": 0.892, + "step": 172710 + }, + { + "epoch": 13.384478282769576, + "grad_norm": 1.4585819462567635, + "learning_rate": 6.692498450092995e-07, + "loss": 0.8434, + "step": 172720 + }, + { + "epoch": 13.385253206245883, + "grad_norm": 1.7569233000337678, + "learning_rate": 6.69288592684439e-07, + "loss": 0.8872, + "step": 172730 + }, + { + "epoch": 13.38602812972219, + "grad_norm": 1.5150578950158533, + "learning_rate": 6.693273403595785e-07, + "loss": 0.8833, + "step": 172740 + }, + { + "epoch": 13.386803053198497, + "grad_norm": 1.5102445371069213, + "learning_rate": 6.69366088034718e-07, + "loss": 0.8684, + "step": 172750 + }, + { + "epoch": 13.387577976674804, + "grad_norm": 1.452120029926098, + "learning_rate": 6.694048357098574e-07, + "loss": 0.8577, + "step": 172760 + }, + { + "epoch": 13.38835290015111, + "grad_norm": 1.3994106995721067, + "learning_rate": 6.69443583384997e-07, + "loss": 0.8759, + "step": 172770 + }, + { + "epoch": 13.389127823627417, + "grad_norm": 1.5262150325968629, + "learning_rate": 6.694823310601364e-07, + "loss": 0.8622, + "step": 172780 + }, + { + "epoch": 13.389902747103724, + "grad_norm": 1.526789600476637, + "learning_rate": 6.69521078735276e-07, + "loss": 0.8742, + "step": 172790 + }, + { + "epoch": 13.39067767058003, + "grad_norm": 1.5512270138132207, + "learning_rate": 6.695598264104154e-07, + "loss": 0.8787, + "step": 172800 + }, + { + "epoch": 13.391452594056338, + "grad_norm": 1.518104712824473, + "learning_rate": 6.69598574085555e-07, + "loss": 0.8719, + "step": 172810 + }, + { + "epoch": 13.392227517532644, + "grad_norm": 1.4759702533251131, + "learning_rate": 6.696373217606944e-07, + "loss": 0.8645, + "step": 172820 + }, + { + "epoch": 13.393002441008951, + "grad_norm": 1.590954072417904, + "learning_rate": 6.69676069435834e-07, + "loss": 0.8633, + "step": 172830 + }, + { + "epoch": 13.393777364485256, + "grad_norm": 1.5109867701718553, + "learning_rate": 6.697148171109734e-07, + "loss": 0.8611, + "step": 172840 + }, + { + "epoch": 13.394552287961563, + "grad_norm": 1.425325555311002, + "learning_rate": 6.69753564786113e-07, + "loss": 0.8815, + "step": 172850 + }, + { + "epoch": 13.39532721143787, + "grad_norm": 1.5233447179142154, + "learning_rate": 6.697923124612524e-07, + "loss": 0.8737, + "step": 172860 + }, + { + "epoch": 13.396102134914177, + "grad_norm": 1.5909240817655783, + "learning_rate": 6.698310601363919e-07, + "loss": 0.8718, + "step": 172870 + }, + { + "epoch": 13.396877058390483, + "grad_norm": 1.71057266177302, + "learning_rate": 6.698698078115314e-07, + "loss": 0.8666, + "step": 172880 + }, + { + "epoch": 13.39765198186679, + "grad_norm": 1.6186383171873016, + "learning_rate": 6.699085554866709e-07, + "loss": 0.8546, + "step": 172890 + }, + { + "epoch": 13.398426905343097, + "grad_norm": 1.52760106772113, + "learning_rate": 6.699473031618103e-07, + "loss": 0.8662, + "step": 172900 + }, + { + "epoch": 13.399201828819404, + "grad_norm": 1.478119171345675, + "learning_rate": 6.699860508369499e-07, + "loss": 0.8641, + "step": 172910 + }, + { + "epoch": 13.39997675229571, + "grad_norm": 1.5457395774359093, + "learning_rate": 6.700247985120893e-07, + "loss": 0.8913, + "step": 172920 + }, + { + "epoch": 13.400751675772018, + "grad_norm": 1.6500169598159489, + "learning_rate": 6.700635461872289e-07, + "loss": 0.8862, + "step": 172930 + }, + { + "epoch": 13.401526599248324, + "grad_norm": 1.5025516772737526, + "learning_rate": 6.701022938623683e-07, + "loss": 0.8684, + "step": 172940 + }, + { + "epoch": 13.402301522724631, + "grad_norm": 1.5725447653575153, + "learning_rate": 6.701410415375079e-07, + "loss": 0.8777, + "step": 172950 + }, + { + "epoch": 13.403076446200938, + "grad_norm": 1.4188847747114524, + "learning_rate": 6.701797892126473e-07, + "loss": 0.8677, + "step": 172960 + }, + { + "epoch": 13.403851369677245, + "grad_norm": 1.5339329115440763, + "learning_rate": 6.702185368877868e-07, + "loss": 0.8745, + "step": 172970 + }, + { + "epoch": 13.404626293153552, + "grad_norm": 1.494725268016696, + "learning_rate": 6.702572845629263e-07, + "loss": 0.875, + "step": 172980 + }, + { + "epoch": 13.405401216629858, + "grad_norm": 1.5323023932561484, + "learning_rate": 6.702960322380659e-07, + "loss": 0.8676, + "step": 172990 + }, + { + "epoch": 13.406176140106165, + "grad_norm": 1.525496791112072, + "learning_rate": 6.703347799132053e-07, + "loss": 0.8869, + "step": 173000 + }, + { + "epoch": 13.406176140106165, + "eval_loss": 0.89521723985672, + "eval_runtime": 330.8014, + "eval_samples_per_second": 34.676, + "eval_steps_per_second": 8.67, + "step": 173000 + }, + { + "epoch": 13.406951063582472, + "grad_norm": 1.5843650524503838, + "learning_rate": 6.703735275883448e-07, + "loss": 0.8799, + "step": 173010 + }, + { + "epoch": 13.407725987058779, + "grad_norm": 1.5973775144993498, + "learning_rate": 6.704122752634842e-07, + "loss": 0.8854, + "step": 173020 + }, + { + "epoch": 13.408500910535084, + "grad_norm": 1.494806464701543, + "learning_rate": 6.704510229386238e-07, + "loss": 0.8665, + "step": 173030 + }, + { + "epoch": 13.40927583401139, + "grad_norm": 1.5207953222565835, + "learning_rate": 6.704897706137632e-07, + "loss": 0.8766, + "step": 173040 + }, + { + "epoch": 13.410050757487697, + "grad_norm": 1.5253979085000562, + "learning_rate": 6.705285182889028e-07, + "loss": 0.8705, + "step": 173050 + }, + { + "epoch": 13.410825680964004, + "grad_norm": 1.4405869515490402, + "learning_rate": 6.705672659640422e-07, + "loss": 0.8732, + "step": 173060 + }, + { + "epoch": 13.411600604440311, + "grad_norm": 1.436655113342009, + "learning_rate": 6.706060136391817e-07, + "loss": 0.8955, + "step": 173070 + }, + { + "epoch": 13.412375527916618, + "grad_norm": 1.482973136960414, + "learning_rate": 6.706447613143212e-07, + "loss": 0.8674, + "step": 173080 + }, + { + "epoch": 13.413150451392925, + "grad_norm": 1.3988056459812137, + "learning_rate": 6.706835089894608e-07, + "loss": 0.8563, + "step": 173090 + }, + { + "epoch": 13.413925374869232, + "grad_norm": 1.4739767130433623, + "learning_rate": 6.707222566646002e-07, + "loss": 0.8511, + "step": 173100 + }, + { + "epoch": 13.414700298345538, + "grad_norm": 1.4929079379000918, + "learning_rate": 6.707610043397397e-07, + "loss": 0.8622, + "step": 173110 + }, + { + "epoch": 13.415475221821845, + "grad_norm": 1.7119781149632156, + "learning_rate": 6.707997520148791e-07, + "loss": 0.8617, + "step": 173120 + }, + { + "epoch": 13.416250145298152, + "grad_norm": 1.5164588324984485, + "learning_rate": 6.708384996900188e-07, + "loss": 0.8769, + "step": 173130 + }, + { + "epoch": 13.417025068774459, + "grad_norm": 1.5417508641351907, + "learning_rate": 6.708772473651582e-07, + "loss": 0.8637, + "step": 173140 + }, + { + "epoch": 13.417799992250766, + "grad_norm": 1.5038059034306033, + "learning_rate": 6.709159950402977e-07, + "loss": 0.8621, + "step": 173150 + }, + { + "epoch": 13.418574915727072, + "grad_norm": 1.628017394862929, + "learning_rate": 6.709547427154371e-07, + "loss": 0.8831, + "step": 173160 + }, + { + "epoch": 13.41934983920338, + "grad_norm": 1.5896459814726112, + "learning_rate": 6.709934903905766e-07, + "loss": 0.8574, + "step": 173170 + }, + { + "epoch": 13.420124762679686, + "grad_norm": 1.5371626116091985, + "learning_rate": 6.710322380657161e-07, + "loss": 0.8597, + "step": 173180 + }, + { + "epoch": 13.420899686155993, + "grad_norm": 1.4275215443880533, + "learning_rate": 6.710709857408557e-07, + "loss": 0.8702, + "step": 173190 + }, + { + "epoch": 13.4216746096323, + "grad_norm": 1.4811721881000466, + "learning_rate": 6.711097334159951e-07, + "loss": 0.8914, + "step": 173200 + }, + { + "epoch": 13.422449533108605, + "grad_norm": 1.4427537006286282, + "learning_rate": 6.711484810911346e-07, + "loss": 0.8766, + "step": 173210 + }, + { + "epoch": 13.423224456584911, + "grad_norm": 1.6270929168147095, + "learning_rate": 6.71187228766274e-07, + "loss": 0.8871, + "step": 173220 + }, + { + "epoch": 13.423999380061218, + "grad_norm": 1.546491043291234, + "learning_rate": 6.712259764414137e-07, + "loss": 0.8741, + "step": 173230 + }, + { + "epoch": 13.424774303537525, + "grad_norm": 1.7394343497566025, + "learning_rate": 6.712647241165531e-07, + "loss": 0.8954, + "step": 173240 + }, + { + "epoch": 13.425549227013832, + "grad_norm": 1.5389954282815588, + "learning_rate": 6.713034717916926e-07, + "loss": 0.8685, + "step": 173250 + }, + { + "epoch": 13.426324150490139, + "grad_norm": 1.4336713569745696, + "learning_rate": 6.71342219466832e-07, + "loss": 0.8665, + "step": 173260 + }, + { + "epoch": 13.427099073966446, + "grad_norm": 1.5222223631151135, + "learning_rate": 6.713809671419715e-07, + "loss": 0.8823, + "step": 173270 + }, + { + "epoch": 13.427873997442752, + "grad_norm": 1.494462632318576, + "learning_rate": 6.714197148171111e-07, + "loss": 0.8649, + "step": 173280 + }, + { + "epoch": 13.42864892091906, + "grad_norm": 1.520703461767177, + "learning_rate": 6.714584624922506e-07, + "loss": 0.8757, + "step": 173290 + }, + { + "epoch": 13.429423844395366, + "grad_norm": 1.584611028419962, + "learning_rate": 6.7149721016739e-07, + "loss": 0.872, + "step": 173300 + }, + { + "epoch": 13.430198767871673, + "grad_norm": 1.5795576707485233, + "learning_rate": 6.715359578425295e-07, + "loss": 0.8705, + "step": 173310 + }, + { + "epoch": 13.43097369134798, + "grad_norm": 1.4846136746847156, + "learning_rate": 6.715747055176689e-07, + "loss": 0.8774, + "step": 173320 + }, + { + "epoch": 13.431748614824286, + "grad_norm": 1.4492455666136799, + "learning_rate": 6.716134531928086e-07, + "loss": 0.8602, + "step": 173330 + }, + { + "epoch": 13.432523538300593, + "grad_norm": 1.5555470151974167, + "learning_rate": 6.71652200867948e-07, + "loss": 0.8785, + "step": 173340 + }, + { + "epoch": 13.4332984617769, + "grad_norm": 1.5163598182758191, + "learning_rate": 6.716909485430875e-07, + "loss": 0.8873, + "step": 173350 + }, + { + "epoch": 13.434073385253207, + "grad_norm": 1.4745486430267005, + "learning_rate": 6.717296962182269e-07, + "loss": 0.8794, + "step": 173360 + }, + { + "epoch": 13.434848308729514, + "grad_norm": 1.5636267261930223, + "learning_rate": 6.717684438933666e-07, + "loss": 0.8814, + "step": 173370 + }, + { + "epoch": 13.43562323220582, + "grad_norm": 1.6175143557000138, + "learning_rate": 6.71807191568506e-07, + "loss": 0.8865, + "step": 173380 + }, + { + "epoch": 13.436398155682127, + "grad_norm": 1.557895146402484, + "learning_rate": 6.718459392436455e-07, + "loss": 0.8867, + "step": 173390 + }, + { + "epoch": 13.437173079158432, + "grad_norm": 1.427383453259499, + "learning_rate": 6.718846869187849e-07, + "loss": 0.8613, + "step": 173400 + }, + { + "epoch": 13.437948002634739, + "grad_norm": 1.5040587553270524, + "learning_rate": 6.719234345939244e-07, + "loss": 0.8659, + "step": 173410 + }, + { + "epoch": 13.438722926111046, + "grad_norm": 1.5176233359072044, + "learning_rate": 6.71962182269064e-07, + "loss": 0.8612, + "step": 173420 + }, + { + "epoch": 13.439497849587353, + "grad_norm": 1.509875120864835, + "learning_rate": 6.720009299442035e-07, + "loss": 0.879, + "step": 173430 + }, + { + "epoch": 13.44027277306366, + "grad_norm": 1.4602926167953119, + "learning_rate": 6.720396776193429e-07, + "loss": 0.8814, + "step": 173440 + }, + { + "epoch": 13.441047696539966, + "grad_norm": 1.468575090902318, + "learning_rate": 6.720784252944824e-07, + "loss": 0.8833, + "step": 173450 + }, + { + "epoch": 13.441822620016273, + "grad_norm": 1.5515219449696902, + "learning_rate": 6.721171729696218e-07, + "loss": 0.8701, + "step": 173460 + }, + { + "epoch": 13.44259754349258, + "grad_norm": 1.4833779916926366, + "learning_rate": 6.721559206447615e-07, + "loss": 0.8749, + "step": 173470 + }, + { + "epoch": 13.443372466968887, + "grad_norm": 1.5831555067945506, + "learning_rate": 6.721946683199009e-07, + "loss": 0.8797, + "step": 173480 + }, + { + "epoch": 13.444147390445194, + "grad_norm": 1.4303148005327324, + "learning_rate": 6.722334159950404e-07, + "loss": 0.8822, + "step": 173490 + }, + { + "epoch": 13.4449223139215, + "grad_norm": 1.4827302605327675, + "learning_rate": 6.722721636701798e-07, + "loss": 0.8662, + "step": 173500 + }, + { + "epoch": 13.4449223139215, + "eval_loss": 0.8950643539428711, + "eval_runtime": 330.5078, + "eval_samples_per_second": 34.707, + "eval_steps_per_second": 8.678, + "step": 173500 + }, + { + "epoch": 13.445697237397807, + "grad_norm": 1.602659619086851, + "learning_rate": 6.723109113453194e-07, + "loss": 0.8701, + "step": 173510 + }, + { + "epoch": 13.446472160874114, + "grad_norm": 1.599806961552858, + "learning_rate": 6.723496590204589e-07, + "loss": 0.8823, + "step": 173520 + }, + { + "epoch": 13.44724708435042, + "grad_norm": 1.530332276608997, + "learning_rate": 6.723884066955984e-07, + "loss": 0.8897, + "step": 173530 + }, + { + "epoch": 13.448022007826728, + "grad_norm": 1.4344973517526667, + "learning_rate": 6.724271543707378e-07, + "loss": 0.8774, + "step": 173540 + }, + { + "epoch": 13.448796931303034, + "grad_norm": 1.6200902129253671, + "learning_rate": 6.724659020458773e-07, + "loss": 0.8738, + "step": 173550 + }, + { + "epoch": 13.449571854779341, + "grad_norm": 1.5089164426712232, + "learning_rate": 6.725046497210168e-07, + "loss": 0.8893, + "step": 173560 + }, + { + "epoch": 13.450346778255648, + "grad_norm": 1.462286481677562, + "learning_rate": 6.725433973961564e-07, + "loss": 0.862, + "step": 173570 + }, + { + "epoch": 13.451121701731953, + "grad_norm": 1.6092001265423115, + "learning_rate": 6.725821450712958e-07, + "loss": 0.89, + "step": 173580 + }, + { + "epoch": 13.45189662520826, + "grad_norm": 1.5131466137157323, + "learning_rate": 6.726208927464353e-07, + "loss": 0.8833, + "step": 173590 + }, + { + "epoch": 13.452671548684567, + "grad_norm": 1.5882558815682255, + "learning_rate": 6.726596404215747e-07, + "loss": 0.8812, + "step": 173600 + }, + { + "epoch": 13.453446472160874, + "grad_norm": 1.4411275746009744, + "learning_rate": 6.726983880967143e-07, + "loss": 0.8631, + "step": 173610 + }, + { + "epoch": 13.45422139563718, + "grad_norm": 1.5441545080976185, + "learning_rate": 6.727371357718538e-07, + "loss": 0.875, + "step": 173620 + }, + { + "epoch": 13.454996319113487, + "grad_norm": 1.5110856988975379, + "learning_rate": 6.727758834469933e-07, + "loss": 0.8855, + "step": 173630 + }, + { + "epoch": 13.455771242589794, + "grad_norm": 1.5184346425488748, + "learning_rate": 6.728146311221327e-07, + "loss": 0.8844, + "step": 173640 + }, + { + "epoch": 13.4565461660661, + "grad_norm": 1.5796680080173788, + "learning_rate": 6.728533787972723e-07, + "loss": 0.8846, + "step": 173650 + }, + { + "epoch": 13.457321089542408, + "grad_norm": 1.4740314897398314, + "learning_rate": 6.728921264724117e-07, + "loss": 0.8711, + "step": 173660 + }, + { + "epoch": 13.458096013018714, + "grad_norm": 1.4984404025565063, + "learning_rate": 6.729308741475513e-07, + "loss": 0.8636, + "step": 173670 + }, + { + "epoch": 13.458870936495021, + "grad_norm": 1.5884187833980767, + "learning_rate": 6.729696218226907e-07, + "loss": 0.8833, + "step": 173680 + }, + { + "epoch": 13.459645859971328, + "grad_norm": 1.6112224909667328, + "learning_rate": 6.730083694978302e-07, + "loss": 0.8601, + "step": 173690 + }, + { + "epoch": 13.460420783447635, + "grad_norm": 1.514330130327053, + "learning_rate": 6.730471171729697e-07, + "loss": 0.8836, + "step": 173700 + }, + { + "epoch": 13.461195706923942, + "grad_norm": 1.4714809549088599, + "learning_rate": 6.730858648481092e-07, + "loss": 0.8631, + "step": 173710 + }, + { + "epoch": 13.461970630400248, + "grad_norm": 1.484523037757334, + "learning_rate": 6.731246125232487e-07, + "loss": 0.8743, + "step": 173720 + }, + { + "epoch": 13.462745553876555, + "grad_norm": 1.5207800489082643, + "learning_rate": 6.731633601983882e-07, + "loss": 0.8722, + "step": 173730 + }, + { + "epoch": 13.463520477352862, + "grad_norm": 1.5392209163943564, + "learning_rate": 6.732021078735276e-07, + "loss": 0.8668, + "step": 173740 + }, + { + "epoch": 13.464295400829169, + "grad_norm": 1.4214369590505704, + "learning_rate": 6.732408555486672e-07, + "loss": 0.8627, + "step": 173750 + }, + { + "epoch": 13.465070324305476, + "grad_norm": 1.4853648191090605, + "learning_rate": 6.732796032238066e-07, + "loss": 0.8724, + "step": 173760 + }, + { + "epoch": 13.46584524778178, + "grad_norm": 1.5315023836065615, + "learning_rate": 6.733183508989462e-07, + "loss": 0.8698, + "step": 173770 + }, + { + "epoch": 13.466620171258088, + "grad_norm": 1.5367994653607957, + "learning_rate": 6.733570985740856e-07, + "loss": 0.8824, + "step": 173780 + }, + { + "epoch": 13.467395094734394, + "grad_norm": 1.535339548152937, + "learning_rate": 6.733958462492252e-07, + "loss": 0.868, + "step": 173790 + }, + { + "epoch": 13.468170018210701, + "grad_norm": 1.3997547221435906, + "learning_rate": 6.734345939243646e-07, + "loss": 0.8448, + "step": 173800 + }, + { + "epoch": 13.468944941687008, + "grad_norm": 1.4462745316329877, + "learning_rate": 6.734733415995041e-07, + "loss": 0.8757, + "step": 173810 + }, + { + "epoch": 13.469719865163315, + "grad_norm": 1.5740822105142178, + "learning_rate": 6.735120892746436e-07, + "loss": 0.8873, + "step": 173820 + }, + { + "epoch": 13.470494788639622, + "grad_norm": 1.4981424776153005, + "learning_rate": 6.735508369497831e-07, + "loss": 0.8768, + "step": 173830 + }, + { + "epoch": 13.471269712115928, + "grad_norm": 1.4444802398667993, + "learning_rate": 6.735895846249225e-07, + "loss": 0.8792, + "step": 173840 + }, + { + "epoch": 13.472044635592235, + "grad_norm": 1.545194755763305, + "learning_rate": 6.736283323000621e-07, + "loss": 0.8631, + "step": 173850 + }, + { + "epoch": 13.472819559068542, + "grad_norm": 1.5728549556334759, + "learning_rate": 6.736670799752015e-07, + "loss": 0.8689, + "step": 173860 + }, + { + "epoch": 13.473594482544849, + "grad_norm": 1.5663088690570444, + "learning_rate": 6.737058276503411e-07, + "loss": 0.8497, + "step": 173870 + }, + { + "epoch": 13.474369406021156, + "grad_norm": 1.5682203612176064, + "learning_rate": 6.737445753254805e-07, + "loss": 0.867, + "step": 173880 + }, + { + "epoch": 13.475144329497462, + "grad_norm": 1.4708122653056503, + "learning_rate": 6.737833230006201e-07, + "loss": 0.8662, + "step": 173890 + }, + { + "epoch": 13.47591925297377, + "grad_norm": 1.5869464774027517, + "learning_rate": 6.738220706757595e-07, + "loss": 0.8655, + "step": 173900 + }, + { + "epoch": 13.476694176450076, + "grad_norm": 1.6149891653907928, + "learning_rate": 6.73860818350899e-07, + "loss": 0.8614, + "step": 173910 + }, + { + "epoch": 13.477469099926383, + "grad_norm": 1.5208495311673815, + "learning_rate": 6.738995660260385e-07, + "loss": 0.8798, + "step": 173920 + }, + { + "epoch": 13.47824402340269, + "grad_norm": 1.5206580709625448, + "learning_rate": 6.739383137011781e-07, + "loss": 0.8886, + "step": 173930 + }, + { + "epoch": 13.479018946878996, + "grad_norm": 1.514423589820137, + "learning_rate": 6.739770613763175e-07, + "loss": 0.9034, + "step": 173940 + }, + { + "epoch": 13.479793870355302, + "grad_norm": 1.5715472683288414, + "learning_rate": 6.74015809051457e-07, + "loss": 0.8841, + "step": 173950 + }, + { + "epoch": 13.480568793831608, + "grad_norm": 1.5308918396389213, + "learning_rate": 6.740545567265964e-07, + "loss": 0.877, + "step": 173960 + }, + { + "epoch": 13.481343717307915, + "grad_norm": 1.4266807557177543, + "learning_rate": 6.74093304401736e-07, + "loss": 0.8855, + "step": 173970 + }, + { + "epoch": 13.482118640784222, + "grad_norm": 1.5195459972616276, + "learning_rate": 6.741320520768754e-07, + "loss": 0.8831, + "step": 173980 + }, + { + "epoch": 13.482893564260529, + "grad_norm": 1.525747602446132, + "learning_rate": 6.74170799752015e-07, + "loss": 0.8741, + "step": 173990 + }, + { + "epoch": 13.483668487736836, + "grad_norm": 1.4951966751155659, + "learning_rate": 6.742095474271544e-07, + "loss": 0.879, + "step": 174000 + }, + { + "epoch": 13.483668487736836, + "eval_loss": 0.894912838935852, + "eval_runtime": 332.4272, + "eval_samples_per_second": 34.507, + "eval_steps_per_second": 8.627, + "step": 174000 + }, + { + "epoch": 13.484443411213142, + "grad_norm": 1.4690339024058776, + "learning_rate": 6.74248295102294e-07, + "loss": 0.8695, + "step": 174010 + }, + { + "epoch": 13.48521833468945, + "grad_norm": 1.584944346244852, + "learning_rate": 6.742870427774334e-07, + "loss": 0.8675, + "step": 174020 + }, + { + "epoch": 13.485993258165756, + "grad_norm": 1.505901393472118, + "learning_rate": 6.74325790452573e-07, + "loss": 0.8712, + "step": 174030 + }, + { + "epoch": 13.486768181642063, + "grad_norm": 1.4583869572992316, + "learning_rate": 6.743645381277124e-07, + "loss": 0.8584, + "step": 174040 + }, + { + "epoch": 13.48754310511837, + "grad_norm": 1.477050596915568, + "learning_rate": 6.744032858028519e-07, + "loss": 0.863, + "step": 174050 + }, + { + "epoch": 13.488318028594676, + "grad_norm": 1.7039840148272962, + "learning_rate": 6.744420334779913e-07, + "loss": 0.8985, + "step": 174060 + }, + { + "epoch": 13.489092952070983, + "grad_norm": 1.5416054407037258, + "learning_rate": 6.74480781153131e-07, + "loss": 0.87, + "step": 174070 + }, + { + "epoch": 13.48986787554729, + "grad_norm": 1.4458875815514005, + "learning_rate": 6.745195288282704e-07, + "loss": 0.869, + "step": 174080 + }, + { + "epoch": 13.490642799023597, + "grad_norm": 1.5531122759271532, + "learning_rate": 6.745582765034099e-07, + "loss": 0.8729, + "step": 174090 + }, + { + "epoch": 13.491417722499904, + "grad_norm": 1.4407310365216683, + "learning_rate": 6.745970241785493e-07, + "loss": 0.8684, + "step": 174100 + }, + { + "epoch": 13.49219264597621, + "grad_norm": 1.447397062842885, + "learning_rate": 6.746357718536888e-07, + "loss": 0.8703, + "step": 174110 + }, + { + "epoch": 13.492967569452517, + "grad_norm": 1.440976417463623, + "learning_rate": 6.746745195288283e-07, + "loss": 0.8792, + "step": 174120 + }, + { + "epoch": 13.493742492928824, + "grad_norm": 1.5174374563794524, + "learning_rate": 6.747132672039679e-07, + "loss": 0.8724, + "step": 174130 + }, + { + "epoch": 13.49451741640513, + "grad_norm": 1.453648795709049, + "learning_rate": 6.747520148791073e-07, + "loss": 0.8635, + "step": 174140 + }, + { + "epoch": 13.495292339881436, + "grad_norm": 1.5088995148425746, + "learning_rate": 6.747907625542468e-07, + "loss": 0.9106, + "step": 174150 + }, + { + "epoch": 13.496067263357743, + "grad_norm": 1.5418146690908632, + "learning_rate": 6.748295102293862e-07, + "loss": 0.8601, + "step": 174160 + }, + { + "epoch": 13.49684218683405, + "grad_norm": 1.497181615088126, + "learning_rate": 6.748682579045259e-07, + "loss": 0.8741, + "step": 174170 + }, + { + "epoch": 13.497617110310356, + "grad_norm": 1.6630561855675317, + "learning_rate": 6.749070055796653e-07, + "loss": 0.8763, + "step": 174180 + }, + { + "epoch": 13.498392033786663, + "grad_norm": 1.5252360860372935, + "learning_rate": 6.749457532548048e-07, + "loss": 0.8705, + "step": 174190 + }, + { + "epoch": 13.49916695726297, + "grad_norm": 1.517686539854332, + "learning_rate": 6.749845009299442e-07, + "loss": 0.8764, + "step": 174200 + }, + { + "epoch": 13.499941880739277, + "grad_norm": 1.4375413127677008, + "learning_rate": 6.750232486050839e-07, + "loss": 0.8642, + "step": 174210 + }, + { + "epoch": 13.500716804215584, + "grad_norm": 1.4448894540815895, + "learning_rate": 6.750619962802233e-07, + "loss": 0.8699, + "step": 174220 + }, + { + "epoch": 13.50149172769189, + "grad_norm": 1.6015566036043223, + "learning_rate": 6.751007439553628e-07, + "loss": 0.8675, + "step": 174230 + }, + { + "epoch": 13.502266651168197, + "grad_norm": 1.5542603686433099, + "learning_rate": 6.751394916305022e-07, + "loss": 0.8837, + "step": 174240 + }, + { + "epoch": 13.503041574644504, + "grad_norm": 1.5465530211502756, + "learning_rate": 6.751782393056417e-07, + "loss": 0.8734, + "step": 174250 + }, + { + "epoch": 13.50381649812081, + "grad_norm": 1.4818603602012859, + "learning_rate": 6.752169869807811e-07, + "loss": 0.8743, + "step": 174260 + }, + { + "epoch": 13.504591421597118, + "grad_norm": 1.4053657758179265, + "learning_rate": 6.752557346559208e-07, + "loss": 0.8709, + "step": 174270 + }, + { + "epoch": 13.505366345073424, + "grad_norm": 1.473514592348981, + "learning_rate": 6.752944823310602e-07, + "loss": 0.8667, + "step": 174280 + }, + { + "epoch": 13.506141268549731, + "grad_norm": 1.4843808071683215, + "learning_rate": 6.753332300061997e-07, + "loss": 0.8642, + "step": 174290 + }, + { + "epoch": 13.506916192026038, + "grad_norm": 1.4532434443409104, + "learning_rate": 6.753719776813391e-07, + "loss": 0.8936, + "step": 174300 + }, + { + "epoch": 13.507691115502345, + "grad_norm": 1.46362395695869, + "learning_rate": 6.754107253564788e-07, + "loss": 0.8939, + "step": 174310 + }, + { + "epoch": 13.50846603897865, + "grad_norm": 1.4515994107626882, + "learning_rate": 6.754494730316182e-07, + "loss": 0.8912, + "step": 174320 + }, + { + "epoch": 13.509240962454957, + "grad_norm": 1.453185778037444, + "learning_rate": 6.754882207067577e-07, + "loss": 0.862, + "step": 174330 + }, + { + "epoch": 13.510015885931264, + "grad_norm": 1.5349114416550433, + "learning_rate": 6.755269683818971e-07, + "loss": 0.8693, + "step": 174340 + }, + { + "epoch": 13.51079080940757, + "grad_norm": 1.5270677666521366, + "learning_rate": 6.755657160570367e-07, + "loss": 0.8792, + "step": 174350 + }, + { + "epoch": 13.511565732883877, + "grad_norm": 1.6210845882991902, + "learning_rate": 6.756044637321762e-07, + "loss": 0.8735, + "step": 174360 + }, + { + "epoch": 13.512340656360184, + "grad_norm": 1.474711841483051, + "learning_rate": 6.756432114073157e-07, + "loss": 0.86, + "step": 174370 + }, + { + "epoch": 13.51311557983649, + "grad_norm": 1.602260144670732, + "learning_rate": 6.756819590824551e-07, + "loss": 0.8983, + "step": 174380 + }, + { + "epoch": 13.513890503312798, + "grad_norm": 1.4192301409336594, + "learning_rate": 6.757207067575946e-07, + "loss": 0.874, + "step": 174390 + }, + { + "epoch": 13.514665426789104, + "grad_norm": 1.4476895476294909, + "learning_rate": 6.75759454432734e-07, + "loss": 0.881, + "step": 174400 + }, + { + "epoch": 13.515440350265411, + "grad_norm": 1.4771615698960787, + "learning_rate": 6.757982021078737e-07, + "loss": 0.8873, + "step": 174410 + }, + { + "epoch": 13.516215273741718, + "grad_norm": 1.4615903743245775, + "learning_rate": 6.758369497830131e-07, + "loss": 0.8638, + "step": 174420 + }, + { + "epoch": 13.516990197218025, + "grad_norm": 1.5451270559430172, + "learning_rate": 6.758756974581526e-07, + "loss": 0.8847, + "step": 174430 + }, + { + "epoch": 13.517765120694332, + "grad_norm": 1.4954290343131384, + "learning_rate": 6.75914445133292e-07, + "loss": 0.862, + "step": 174440 + }, + { + "epoch": 13.518540044170638, + "grad_norm": 1.6164409827828008, + "learning_rate": 6.759531928084316e-07, + "loss": 0.8595, + "step": 174450 + }, + { + "epoch": 13.519314967646945, + "grad_norm": 1.5305822383256007, + "learning_rate": 6.759919404835711e-07, + "loss": 0.8608, + "step": 174460 + }, + { + "epoch": 13.520089891123252, + "grad_norm": 1.5576547346849101, + "learning_rate": 6.760306881587106e-07, + "loss": 0.8802, + "step": 174470 + }, + { + "epoch": 13.520864814599559, + "grad_norm": 1.5312206136562119, + "learning_rate": 6.7606943583385e-07, + "loss": 0.8797, + "step": 174480 + }, + { + "epoch": 13.521639738075866, + "grad_norm": 1.4868241343941178, + "learning_rate": 6.761081835089896e-07, + "loss": 0.8686, + "step": 174490 + }, + { + "epoch": 13.522414661552173, + "grad_norm": 1.4998926759161437, + "learning_rate": 6.76146931184129e-07, + "loss": 0.8667, + "step": 174500 + }, + { + "epoch": 13.522414661552173, + "eval_loss": 0.8946263194084167, + "eval_runtime": 329.2211, + "eval_samples_per_second": 34.843, + "eval_steps_per_second": 8.711, + "step": 174500 + }, + { + "epoch": 13.52318958502848, + "grad_norm": 1.532815034534677, + "learning_rate": 6.761856788592686e-07, + "loss": 0.8519, + "step": 174510 + }, + { + "epoch": 13.523964508504784, + "grad_norm": 1.5433340508237432, + "learning_rate": 6.76224426534408e-07, + "loss": 0.8802, + "step": 174520 + }, + { + "epoch": 13.524739431981091, + "grad_norm": 1.5253746057905926, + "learning_rate": 6.762631742095475e-07, + "loss": 0.8631, + "step": 174530 + }, + { + "epoch": 13.525514355457398, + "grad_norm": 1.4870348594551235, + "learning_rate": 6.763019218846869e-07, + "loss": 0.8657, + "step": 174540 + }, + { + "epoch": 13.526289278933705, + "grad_norm": 1.5513227586139662, + "learning_rate": 6.763406695598265e-07, + "loss": 0.8653, + "step": 174550 + }, + { + "epoch": 13.527064202410012, + "grad_norm": 1.5641165268828547, + "learning_rate": 6.76379417234966e-07, + "loss": 0.8615, + "step": 174560 + }, + { + "epoch": 13.527839125886318, + "grad_norm": 1.4725179336432992, + "learning_rate": 6.764181649101055e-07, + "loss": 0.8905, + "step": 174570 + }, + { + "epoch": 13.528614049362625, + "grad_norm": 1.5232002482397358, + "learning_rate": 6.764569125852449e-07, + "loss": 0.8958, + "step": 174580 + }, + { + "epoch": 13.529388972838932, + "grad_norm": 1.5541704582732163, + "learning_rate": 6.764956602603845e-07, + "loss": 0.8475, + "step": 174590 + }, + { + "epoch": 13.530163896315239, + "grad_norm": 1.5036839684645624, + "learning_rate": 6.765344079355239e-07, + "loss": 0.848, + "step": 174600 + }, + { + "epoch": 13.530938819791546, + "grad_norm": 1.4973110148996662, + "learning_rate": 6.765731556106635e-07, + "loss": 0.8546, + "step": 174610 + }, + { + "epoch": 13.531713743267852, + "grad_norm": 1.6124384730441421, + "learning_rate": 6.766119032858029e-07, + "loss": 0.8733, + "step": 174620 + }, + { + "epoch": 13.53248866674416, + "grad_norm": 1.5259675856662516, + "learning_rate": 6.766506509609424e-07, + "loss": 0.8777, + "step": 174630 + }, + { + "epoch": 13.533263590220466, + "grad_norm": 1.450101645803376, + "learning_rate": 6.766893986360819e-07, + "loss": 0.8726, + "step": 174640 + }, + { + "epoch": 13.534038513696773, + "grad_norm": 1.5468491932908042, + "learning_rate": 6.767281463112214e-07, + "loss": 0.8703, + "step": 174650 + }, + { + "epoch": 13.53481343717308, + "grad_norm": 1.5262019743585231, + "learning_rate": 6.767668939863609e-07, + "loss": 0.8732, + "step": 174660 + }, + { + "epoch": 13.535588360649387, + "grad_norm": 1.4484626775084664, + "learning_rate": 6.768056416615004e-07, + "loss": 0.8544, + "step": 174670 + }, + { + "epoch": 13.536363284125693, + "grad_norm": 1.4703024308639803, + "learning_rate": 6.768443893366398e-07, + "loss": 0.8418, + "step": 174680 + }, + { + "epoch": 13.537138207601998, + "grad_norm": 1.481089186667158, + "learning_rate": 6.768831370117794e-07, + "loss": 0.877, + "step": 174690 + }, + { + "epoch": 13.537913131078305, + "grad_norm": 1.638981508179088, + "learning_rate": 6.769218846869188e-07, + "loss": 0.8764, + "step": 174700 + }, + { + "epoch": 13.538688054554612, + "grad_norm": 1.4877327629520292, + "learning_rate": 6.769606323620584e-07, + "loss": 0.8669, + "step": 174710 + }, + { + "epoch": 13.539462978030919, + "grad_norm": 1.4467311720281861, + "learning_rate": 6.769993800371978e-07, + "loss": 0.8824, + "step": 174720 + }, + { + "epoch": 13.540237901507226, + "grad_norm": 1.5436029508879983, + "learning_rate": 6.770381277123374e-07, + "loss": 0.8717, + "step": 174730 + }, + { + "epoch": 13.541012824983532, + "grad_norm": 1.5952883666404278, + "learning_rate": 6.770768753874768e-07, + "loss": 0.8709, + "step": 174740 + }, + { + "epoch": 13.54178774845984, + "grad_norm": 1.6498828801194163, + "learning_rate": 6.771156230626164e-07, + "loss": 0.8748, + "step": 174750 + }, + { + "epoch": 13.542562671936146, + "grad_norm": 1.5125068056999875, + "learning_rate": 6.771543707377558e-07, + "loss": 0.8486, + "step": 174760 + }, + { + "epoch": 13.543337595412453, + "grad_norm": 1.4353106253262526, + "learning_rate": 6.771931184128953e-07, + "loss": 0.8625, + "step": 174770 + }, + { + "epoch": 13.54411251888876, + "grad_norm": 1.5191932869718254, + "learning_rate": 6.772318660880348e-07, + "loss": 0.8691, + "step": 174780 + }, + { + "epoch": 13.544887442365066, + "grad_norm": 1.5271655039779197, + "learning_rate": 6.772706137631743e-07, + "loss": 0.8939, + "step": 174790 + }, + { + "epoch": 13.545662365841373, + "grad_norm": 1.5459477379624933, + "learning_rate": 6.773093614383137e-07, + "loss": 0.88, + "step": 174800 + }, + { + "epoch": 13.54643728931768, + "grad_norm": 1.4621018786948625, + "learning_rate": 6.773481091134533e-07, + "loss": 0.881, + "step": 174810 + }, + { + "epoch": 13.547212212793987, + "grad_norm": 1.5522713607156595, + "learning_rate": 6.773868567885927e-07, + "loss": 0.8768, + "step": 174820 + }, + { + "epoch": 13.547987136270294, + "grad_norm": 1.4846706559755176, + "learning_rate": 6.774256044637323e-07, + "loss": 0.8619, + "step": 174830 + }, + { + "epoch": 13.5487620597466, + "grad_norm": 1.453436834372622, + "learning_rate": 6.774643521388717e-07, + "loss": 0.871, + "step": 174840 + }, + { + "epoch": 13.549536983222907, + "grad_norm": 1.500836834209436, + "learning_rate": 6.775030998140113e-07, + "loss": 0.8604, + "step": 174850 + }, + { + "epoch": 13.550311906699214, + "grad_norm": 1.4663213668418673, + "learning_rate": 6.775418474891507e-07, + "loss": 0.8723, + "step": 174860 + }, + { + "epoch": 13.551086830175521, + "grad_norm": 1.5314551392212976, + "learning_rate": 6.775805951642903e-07, + "loss": 0.8641, + "step": 174870 + }, + { + "epoch": 13.551861753651828, + "grad_norm": 1.5776884016404484, + "learning_rate": 6.776193428394297e-07, + "loss": 0.8662, + "step": 174880 + }, + { + "epoch": 13.552636677128133, + "grad_norm": 1.5160590041282453, + "learning_rate": 6.776580905145692e-07, + "loss": 0.8788, + "step": 174890 + }, + { + "epoch": 13.55341160060444, + "grad_norm": 1.4369916336258695, + "learning_rate": 6.776968381897087e-07, + "loss": 0.8722, + "step": 174900 + }, + { + "epoch": 13.554186524080746, + "grad_norm": 1.5658994421033405, + "learning_rate": 6.777355858648482e-07, + "loss": 0.8835, + "step": 174910 + }, + { + "epoch": 13.554961447557053, + "grad_norm": 1.494500100837142, + "learning_rate": 6.777743335399877e-07, + "loss": 0.9026, + "step": 174920 + }, + { + "epoch": 13.55573637103336, + "grad_norm": 1.4288650411873616, + "learning_rate": 6.778130812151272e-07, + "loss": 0.8723, + "step": 174930 + }, + { + "epoch": 13.556511294509667, + "grad_norm": 1.4964806140692999, + "learning_rate": 6.778518288902666e-07, + "loss": 0.8892, + "step": 174940 + }, + { + "epoch": 13.557286217985974, + "grad_norm": 1.4390724767155814, + "learning_rate": 6.778905765654062e-07, + "loss": 0.8614, + "step": 174950 + }, + { + "epoch": 13.55806114146228, + "grad_norm": 1.463205557498205, + "learning_rate": 6.779293242405456e-07, + "loss": 0.8745, + "step": 174960 + }, + { + "epoch": 13.558836064938587, + "grad_norm": 1.5958348638719164, + "learning_rate": 6.779680719156852e-07, + "loss": 0.8781, + "step": 174970 + }, + { + "epoch": 13.559610988414894, + "grad_norm": 1.6038551329536685, + "learning_rate": 6.780068195908246e-07, + "loss": 0.8771, + "step": 174980 + }, + { + "epoch": 13.560385911891201, + "grad_norm": 1.499607992013941, + "learning_rate": 6.780455672659641e-07, + "loss": 0.887, + "step": 174990 + }, + { + "epoch": 13.561160835367508, + "grad_norm": 1.65869381934333, + "learning_rate": 6.780843149411036e-07, + "loss": 0.8911, + "step": 175000 + }, + { + "epoch": 13.561160835367508, + "eval_loss": 0.8945085406303406, + "eval_runtime": 330.2753, + "eval_samples_per_second": 34.732, + "eval_steps_per_second": 8.684, + "step": 175000 + }, + { + "epoch": 13.561935758843815, + "grad_norm": 1.452134205749976, + "learning_rate": 6.781230626162432e-07, + "loss": 0.8683, + "step": 175010 + }, + { + "epoch": 13.562710682320121, + "grad_norm": 1.5105853661526332, + "learning_rate": 6.781618102913826e-07, + "loss": 0.8757, + "step": 175020 + }, + { + "epoch": 13.563485605796428, + "grad_norm": 1.4847056377308159, + "learning_rate": 6.782005579665221e-07, + "loss": 0.8851, + "step": 175030 + }, + { + "epoch": 13.564260529272735, + "grad_norm": 1.4968380506587602, + "learning_rate": 6.782393056416615e-07, + "loss": 0.8733, + "step": 175040 + }, + { + "epoch": 13.565035452749042, + "grad_norm": 1.5012766750934452, + "learning_rate": 6.782780533168011e-07, + "loss": 0.8792, + "step": 175050 + }, + { + "epoch": 13.565810376225349, + "grad_norm": 1.5268561327570018, + "learning_rate": 6.783168009919406e-07, + "loss": 0.8592, + "step": 175060 + }, + { + "epoch": 13.566585299701654, + "grad_norm": 1.50099368802292, + "learning_rate": 6.783555486670801e-07, + "loss": 0.88, + "step": 175070 + }, + { + "epoch": 13.56736022317796, + "grad_norm": 1.5146844476550665, + "learning_rate": 6.783942963422195e-07, + "loss": 0.886, + "step": 175080 + }, + { + "epoch": 13.568135146654267, + "grad_norm": 1.7668276806831784, + "learning_rate": 6.78433044017359e-07, + "loss": 0.8928, + "step": 175090 + }, + { + "epoch": 13.568910070130574, + "grad_norm": 1.5984596221996161, + "learning_rate": 6.784717916924985e-07, + "loss": 0.8781, + "step": 175100 + }, + { + "epoch": 13.56968499360688, + "grad_norm": 1.5681328478779557, + "learning_rate": 6.785105393676381e-07, + "loss": 0.8874, + "step": 175110 + }, + { + "epoch": 13.570459917083188, + "grad_norm": 1.546037698773571, + "learning_rate": 6.785492870427775e-07, + "loss": 0.8702, + "step": 175120 + }, + { + "epoch": 13.571234840559494, + "grad_norm": 1.507716054315368, + "learning_rate": 6.78588034717917e-07, + "loss": 0.8866, + "step": 175130 + }, + { + "epoch": 13.572009764035801, + "grad_norm": 1.4833950651599863, + "learning_rate": 6.786267823930564e-07, + "loss": 0.8833, + "step": 175140 + }, + { + "epoch": 13.572784687512108, + "grad_norm": 1.4943671647310028, + "learning_rate": 6.786655300681961e-07, + "loss": 0.8778, + "step": 175150 + }, + { + "epoch": 13.573559610988415, + "grad_norm": 1.4839658490110688, + "learning_rate": 6.787042777433355e-07, + "loss": 0.8701, + "step": 175160 + }, + { + "epoch": 13.574334534464722, + "grad_norm": 1.4657504674293103, + "learning_rate": 6.78743025418475e-07, + "loss": 0.8673, + "step": 175170 + }, + { + "epoch": 13.575109457941029, + "grad_norm": 1.4261436381054464, + "learning_rate": 6.787817730936144e-07, + "loss": 0.8804, + "step": 175180 + }, + { + "epoch": 13.575884381417335, + "grad_norm": 1.5023303134728812, + "learning_rate": 6.788205207687539e-07, + "loss": 0.856, + "step": 175190 + }, + { + "epoch": 13.576659304893642, + "grad_norm": 1.623288824772111, + "learning_rate": 6.788592684438935e-07, + "loss": 0.8589, + "step": 175200 + }, + { + "epoch": 13.577434228369949, + "grad_norm": 1.4815938173007424, + "learning_rate": 6.78898016119033e-07, + "loss": 0.8683, + "step": 175210 + }, + { + "epoch": 13.578209151846256, + "grad_norm": 1.5145459027452677, + "learning_rate": 6.789367637941724e-07, + "loss": 0.8742, + "step": 175220 + }, + { + "epoch": 13.578984075322563, + "grad_norm": 1.4840917393629738, + "learning_rate": 6.789755114693119e-07, + "loss": 0.8747, + "step": 175230 + }, + { + "epoch": 13.57975899879887, + "grad_norm": 1.5866908718413513, + "learning_rate": 6.790142591444513e-07, + "loss": 0.8725, + "step": 175240 + }, + { + "epoch": 13.580533922275176, + "grad_norm": 1.4133036100885459, + "learning_rate": 6.79053006819591e-07, + "loss": 0.863, + "step": 175250 + }, + { + "epoch": 13.581308845751481, + "grad_norm": 1.4870243059966057, + "learning_rate": 6.790917544947304e-07, + "loss": 0.8735, + "step": 175260 + }, + { + "epoch": 13.582083769227788, + "grad_norm": 1.5116822970567627, + "learning_rate": 6.791305021698699e-07, + "loss": 0.8635, + "step": 175270 + }, + { + "epoch": 13.582858692704095, + "grad_norm": 1.5171965739155846, + "learning_rate": 6.791692498450093e-07, + "loss": 0.8697, + "step": 175280 + }, + { + "epoch": 13.583633616180402, + "grad_norm": 1.4551225959094456, + "learning_rate": 6.79207997520149e-07, + "loss": 0.8612, + "step": 175290 + }, + { + "epoch": 13.584408539656708, + "grad_norm": 1.5215244883964867, + "learning_rate": 6.792467451952884e-07, + "loss": 0.8723, + "step": 175300 + }, + { + "epoch": 13.585183463133015, + "grad_norm": 1.5338340862948507, + "learning_rate": 6.792854928704279e-07, + "loss": 0.8633, + "step": 175310 + }, + { + "epoch": 13.585958386609322, + "grad_norm": 1.502649085218767, + "learning_rate": 6.793242405455673e-07, + "loss": 0.8619, + "step": 175320 + }, + { + "epoch": 13.586733310085629, + "grad_norm": 1.5457818452734098, + "learning_rate": 6.793629882207068e-07, + "loss": 0.8618, + "step": 175330 + }, + { + "epoch": 13.587508233561936, + "grad_norm": 1.4808866288828275, + "learning_rate": 6.794017358958462e-07, + "loss": 0.8643, + "step": 175340 + }, + { + "epoch": 13.588283157038243, + "grad_norm": 1.5427049087525793, + "learning_rate": 6.794404835709859e-07, + "loss": 0.8657, + "step": 175350 + }, + { + "epoch": 13.58905808051455, + "grad_norm": 1.5231606522948136, + "learning_rate": 6.794792312461253e-07, + "loss": 0.8668, + "step": 175360 + }, + { + "epoch": 13.589833003990856, + "grad_norm": 1.5426111516416934, + "learning_rate": 6.795179789212648e-07, + "loss": 0.875, + "step": 175370 + }, + { + "epoch": 13.590607927467163, + "grad_norm": 1.5541576270424486, + "learning_rate": 6.795567265964042e-07, + "loss": 0.8555, + "step": 175380 + }, + { + "epoch": 13.59138285094347, + "grad_norm": 1.5123636713981152, + "learning_rate": 6.795954742715439e-07, + "loss": 0.8817, + "step": 175390 + }, + { + "epoch": 13.592157774419777, + "grad_norm": 1.5315687937500904, + "learning_rate": 6.796342219466833e-07, + "loss": 0.8687, + "step": 175400 + }, + { + "epoch": 13.592932697896083, + "grad_norm": 1.5244228933167066, + "learning_rate": 6.796729696218228e-07, + "loss": 0.8785, + "step": 175410 + }, + { + "epoch": 13.59370762137239, + "grad_norm": 1.6153309395660589, + "learning_rate": 6.797117172969622e-07, + "loss": 0.869, + "step": 175420 + }, + { + "epoch": 13.594482544848697, + "grad_norm": 1.650009447266915, + "learning_rate": 6.797504649721018e-07, + "loss": 0.8835, + "step": 175430 + }, + { + "epoch": 13.595257468325002, + "grad_norm": 1.5508608743553014, + "learning_rate": 6.797892126472412e-07, + "loss": 0.8664, + "step": 175440 + }, + { + "epoch": 13.596032391801309, + "grad_norm": 1.5547872788426398, + "learning_rate": 6.798279603223808e-07, + "loss": 0.8699, + "step": 175450 + }, + { + "epoch": 13.596807315277616, + "grad_norm": 1.5383876335970688, + "learning_rate": 6.798667079975202e-07, + "loss": 0.8825, + "step": 175460 + }, + { + "epoch": 13.597582238753922, + "grad_norm": 1.5575083792300353, + "learning_rate": 6.799054556726597e-07, + "loss": 0.8612, + "step": 175470 + }, + { + "epoch": 13.59835716223023, + "grad_norm": 1.5249435734732746, + "learning_rate": 6.799442033477991e-07, + "loss": 0.893, + "step": 175480 + }, + { + "epoch": 13.599132085706536, + "grad_norm": 1.5719898456731731, + "learning_rate": 6.799829510229388e-07, + "loss": 0.9137, + "step": 175490 + }, + { + "epoch": 13.599907009182843, + "grad_norm": 1.5580626770955242, + "learning_rate": 6.800216986980782e-07, + "loss": 0.8693, + "step": 175500 + }, + { + "epoch": 13.599907009182843, + "eval_loss": 0.8942967057228088, + "eval_runtime": 330.405, + "eval_samples_per_second": 34.718, + "eval_steps_per_second": 8.68, + "step": 175500 + }, + { + "epoch": 13.60068193265915, + "grad_norm": 1.5284240311442554, + "learning_rate": 6.800604463732177e-07, + "loss": 0.8903, + "step": 175510 + }, + { + "epoch": 13.601456856135457, + "grad_norm": 1.522586948186225, + "learning_rate": 6.800991940483571e-07, + "loss": 0.8579, + "step": 175520 + }, + { + "epoch": 13.602231779611763, + "grad_norm": 1.5634421544963795, + "learning_rate": 6.801379417234967e-07, + "loss": 0.8652, + "step": 175530 + }, + { + "epoch": 13.60300670308807, + "grad_norm": 1.467691131959694, + "learning_rate": 6.801766893986362e-07, + "loss": 0.8698, + "step": 175540 + }, + { + "epoch": 13.603781626564377, + "grad_norm": 1.536519176579363, + "learning_rate": 6.802154370737757e-07, + "loss": 0.8548, + "step": 175550 + }, + { + "epoch": 13.604556550040684, + "grad_norm": 1.5318210892054032, + "learning_rate": 6.802541847489151e-07, + "loss": 0.8593, + "step": 175560 + }, + { + "epoch": 13.60533147351699, + "grad_norm": 1.5358113848895787, + "learning_rate": 6.802929324240547e-07, + "loss": 0.877, + "step": 175570 + }, + { + "epoch": 13.606106396993297, + "grad_norm": 1.4375423922271238, + "learning_rate": 6.803316800991941e-07, + "loss": 0.8704, + "step": 175580 + }, + { + "epoch": 13.606881320469604, + "grad_norm": 1.5681139991419129, + "learning_rate": 6.803704277743337e-07, + "loss": 0.8506, + "step": 175590 + }, + { + "epoch": 13.607656243945911, + "grad_norm": 1.495984886430035, + "learning_rate": 6.804091754494731e-07, + "loss": 0.8764, + "step": 175600 + }, + { + "epoch": 13.608431167422218, + "grad_norm": 1.4936682826546002, + "learning_rate": 6.804479231246126e-07, + "loss": 0.8783, + "step": 175610 + }, + { + "epoch": 13.609206090898525, + "grad_norm": 1.5667520225379776, + "learning_rate": 6.80486670799752e-07, + "loss": 0.8693, + "step": 175620 + }, + { + "epoch": 13.60998101437483, + "grad_norm": 1.5447116233900424, + "learning_rate": 6.805254184748916e-07, + "loss": 0.8947, + "step": 175630 + }, + { + "epoch": 13.610755937851136, + "grad_norm": 1.5023856097065218, + "learning_rate": 6.805641661500311e-07, + "loss": 0.8574, + "step": 175640 + }, + { + "epoch": 13.611530861327443, + "grad_norm": 1.489149603445948, + "learning_rate": 6.806029138251706e-07, + "loss": 0.8847, + "step": 175650 + }, + { + "epoch": 13.61230578480375, + "grad_norm": 1.543102675303558, + "learning_rate": 6.8064166150031e-07, + "loss": 0.8834, + "step": 175660 + }, + { + "epoch": 13.613080708280057, + "grad_norm": 1.5613535164915653, + "learning_rate": 6.806804091754496e-07, + "loss": 0.8876, + "step": 175670 + }, + { + "epoch": 13.613855631756364, + "grad_norm": 1.5230298146883419, + "learning_rate": 6.80719156850589e-07, + "loss": 0.8805, + "step": 175680 + }, + { + "epoch": 13.61463055523267, + "grad_norm": 1.6161334262099811, + "learning_rate": 6.807579045257286e-07, + "loss": 0.8696, + "step": 175690 + }, + { + "epoch": 13.615405478708977, + "grad_norm": 1.581611502666268, + "learning_rate": 6.80796652200868e-07, + "loss": 0.8608, + "step": 175700 + }, + { + "epoch": 13.616180402185284, + "grad_norm": 1.5432607027808884, + "learning_rate": 6.808353998760076e-07, + "loss": 0.8673, + "step": 175710 + }, + { + "epoch": 13.616955325661591, + "grad_norm": 1.612174397307069, + "learning_rate": 6.80874147551147e-07, + "loss": 0.8761, + "step": 175720 + }, + { + "epoch": 13.617730249137898, + "grad_norm": 1.506682710462081, + "learning_rate": 6.809128952262865e-07, + "loss": 0.8758, + "step": 175730 + }, + { + "epoch": 13.618505172614205, + "grad_norm": 1.4795103865566395, + "learning_rate": 6.80951642901426e-07, + "loss": 0.8641, + "step": 175740 + }, + { + "epoch": 13.619280096090511, + "grad_norm": 1.4745917704856457, + "learning_rate": 6.809903905765655e-07, + "loss": 0.8759, + "step": 175750 + }, + { + "epoch": 13.620055019566818, + "grad_norm": 1.500326166146721, + "learning_rate": 6.810291382517049e-07, + "loss": 0.8592, + "step": 175760 + }, + { + "epoch": 13.620829943043125, + "grad_norm": 1.5488915361036566, + "learning_rate": 6.810678859268445e-07, + "loss": 0.8791, + "step": 175770 + }, + { + "epoch": 13.621604866519432, + "grad_norm": 1.515356970201616, + "learning_rate": 6.811066336019839e-07, + "loss": 0.89, + "step": 175780 + }, + { + "epoch": 13.622379789995739, + "grad_norm": 1.5256942025231643, + "learning_rate": 6.811453812771235e-07, + "loss": 0.8829, + "step": 175790 + }, + { + "epoch": 13.623154713472045, + "grad_norm": 1.569302805263313, + "learning_rate": 6.811841289522629e-07, + "loss": 0.8555, + "step": 175800 + }, + { + "epoch": 13.62392963694835, + "grad_norm": 1.4078132220669266, + "learning_rate": 6.812228766274025e-07, + "loss": 0.8529, + "step": 175810 + }, + { + "epoch": 13.624704560424657, + "grad_norm": 1.382285237152993, + "learning_rate": 6.812616243025419e-07, + "loss": 0.8702, + "step": 175820 + }, + { + "epoch": 13.625479483900964, + "grad_norm": 1.5039555251573389, + "learning_rate": 6.813003719776814e-07, + "loss": 0.8518, + "step": 175830 + }, + { + "epoch": 13.626254407377271, + "grad_norm": 1.445721760895116, + "learning_rate": 6.813391196528209e-07, + "loss": 0.8764, + "step": 175840 + }, + { + "epoch": 13.627029330853578, + "grad_norm": 1.6109314666348324, + "learning_rate": 6.813778673279605e-07, + "loss": 0.8866, + "step": 175850 + }, + { + "epoch": 13.627804254329885, + "grad_norm": 1.6500580916818408, + "learning_rate": 6.814166150030999e-07, + "loss": 0.893, + "step": 175860 + }, + { + "epoch": 13.628579177806191, + "grad_norm": 1.5785284181239947, + "learning_rate": 6.814553626782394e-07, + "loss": 0.8696, + "step": 175870 + }, + { + "epoch": 13.629354101282498, + "grad_norm": 1.4195119173925106, + "learning_rate": 6.814941103533788e-07, + "loss": 0.848, + "step": 175880 + }, + { + "epoch": 13.630129024758805, + "grad_norm": 1.5043997645614673, + "learning_rate": 6.815328580285184e-07, + "loss": 0.8862, + "step": 175890 + }, + { + "epoch": 13.630903948235112, + "grad_norm": 1.5609657602973377, + "learning_rate": 6.815716057036578e-07, + "loss": 0.8484, + "step": 175900 + }, + { + "epoch": 13.631678871711419, + "grad_norm": 1.470281061633342, + "learning_rate": 6.816103533787974e-07, + "loss": 0.8677, + "step": 175910 + }, + { + "epoch": 13.632453795187725, + "grad_norm": 1.5614773121079986, + "learning_rate": 6.816491010539368e-07, + "loss": 0.9075, + "step": 175920 + }, + { + "epoch": 13.633228718664032, + "grad_norm": 1.520277247991535, + "learning_rate": 6.816878487290763e-07, + "loss": 0.8611, + "step": 175930 + }, + { + "epoch": 13.634003642140339, + "grad_norm": 1.563165957121387, + "learning_rate": 6.817265964042158e-07, + "loss": 0.8661, + "step": 175940 + }, + { + "epoch": 13.634778565616646, + "grad_norm": 1.5101908202884002, + "learning_rate": 6.817653440793554e-07, + "loss": 0.8554, + "step": 175950 + }, + { + "epoch": 13.635553489092953, + "grad_norm": 1.5255016061977504, + "learning_rate": 6.818040917544948e-07, + "loss": 0.8788, + "step": 175960 + }, + { + "epoch": 13.63632841256926, + "grad_norm": 1.6822774831187406, + "learning_rate": 6.818428394296343e-07, + "loss": 0.8764, + "step": 175970 + }, + { + "epoch": 13.637103336045566, + "grad_norm": 1.4877840586887638, + "learning_rate": 6.818815871047737e-07, + "loss": 0.863, + "step": 175980 + }, + { + "epoch": 13.637878259521873, + "grad_norm": 1.5738333930534725, + "learning_rate": 6.819203347799134e-07, + "loss": 0.8813, + "step": 175990 + }, + { + "epoch": 13.63865318299818, + "grad_norm": 1.5579908776087859, + "learning_rate": 6.819590824550528e-07, + "loss": 0.8743, + "step": 176000 + }, + { + "epoch": 13.63865318299818, + "eval_loss": 0.8943333029747009, + "eval_runtime": 327.8202, + "eval_samples_per_second": 34.992, + "eval_steps_per_second": 8.749, + "step": 176000 + }, + { + "epoch": 13.639428106474485, + "grad_norm": 1.4036660806507526, + "learning_rate": 6.819978301301923e-07, + "loss": 0.872, + "step": 176010 + }, + { + "epoch": 13.640203029950792, + "grad_norm": 1.6335010183298861, + "learning_rate": 6.820365778053317e-07, + "loss": 0.8589, + "step": 176020 + }, + { + "epoch": 13.640977953427099, + "grad_norm": 1.6561329723596354, + "learning_rate": 6.820753254804712e-07, + "loss": 0.8901, + "step": 176030 + }, + { + "epoch": 13.641752876903405, + "grad_norm": 1.5114215420757675, + "learning_rate": 6.821140731556107e-07, + "loss": 0.8646, + "step": 176040 + }, + { + "epoch": 13.642527800379712, + "grad_norm": 1.4524090292147822, + "learning_rate": 6.821528208307503e-07, + "loss": 0.8699, + "step": 176050 + }, + { + "epoch": 13.643302723856019, + "grad_norm": 1.5233375503226736, + "learning_rate": 6.821915685058897e-07, + "loss": 0.8722, + "step": 176060 + }, + { + "epoch": 13.644077647332326, + "grad_norm": 1.576243315799052, + "learning_rate": 6.822303161810292e-07, + "loss": 0.864, + "step": 176070 + }, + { + "epoch": 13.644852570808633, + "grad_norm": 1.539514111964497, + "learning_rate": 6.822690638561686e-07, + "loss": 0.8612, + "step": 176080 + }, + { + "epoch": 13.64562749428494, + "grad_norm": 1.5282561972638067, + "learning_rate": 6.823078115313083e-07, + "loss": 0.8581, + "step": 176090 + }, + { + "epoch": 13.646402417761246, + "grad_norm": 1.5461966834180654, + "learning_rate": 6.823465592064477e-07, + "loss": 0.8717, + "step": 176100 + }, + { + "epoch": 13.647177341237553, + "grad_norm": 1.5065188586207188, + "learning_rate": 6.823853068815872e-07, + "loss": 0.8785, + "step": 176110 + }, + { + "epoch": 13.64795226471386, + "grad_norm": 1.5449747358715165, + "learning_rate": 6.824240545567266e-07, + "loss": 0.8904, + "step": 176120 + }, + { + "epoch": 13.648727188190167, + "grad_norm": 1.4593552515456498, + "learning_rate": 6.824628022318661e-07, + "loss": 0.8785, + "step": 176130 + }, + { + "epoch": 13.649502111666473, + "grad_norm": 1.471761728943114, + "learning_rate": 6.825015499070057e-07, + "loss": 0.8558, + "step": 176140 + }, + { + "epoch": 13.65027703514278, + "grad_norm": 1.6021403900125344, + "learning_rate": 6.825402975821452e-07, + "loss": 0.8763, + "step": 176150 + }, + { + "epoch": 13.651051958619087, + "grad_norm": 1.5834790205431875, + "learning_rate": 6.825790452572846e-07, + "loss": 0.8682, + "step": 176160 + }, + { + "epoch": 13.651826882095394, + "grad_norm": 1.4761610541463872, + "learning_rate": 6.826177929324241e-07, + "loss": 0.8858, + "step": 176170 + }, + { + "epoch": 13.652601805571699, + "grad_norm": 1.5050921431979802, + "learning_rate": 6.826565406075635e-07, + "loss": 0.8625, + "step": 176180 + }, + { + "epoch": 13.653376729048006, + "grad_norm": 1.521590791565554, + "learning_rate": 6.826952882827032e-07, + "loss": 0.9064, + "step": 176190 + }, + { + "epoch": 13.654151652524313, + "grad_norm": 1.4681347869770358, + "learning_rate": 6.827340359578426e-07, + "loss": 0.8721, + "step": 176200 + }, + { + "epoch": 13.65492657600062, + "grad_norm": 1.5408766929780076, + "learning_rate": 6.827727836329821e-07, + "loss": 0.8708, + "step": 176210 + }, + { + "epoch": 13.655701499476926, + "grad_norm": 1.5058494457411837, + "learning_rate": 6.828115313081215e-07, + "loss": 0.9016, + "step": 176220 + }, + { + "epoch": 13.656476422953233, + "grad_norm": 1.5129962865610989, + "learning_rate": 6.828502789832612e-07, + "loss": 0.8809, + "step": 176230 + }, + { + "epoch": 13.65725134642954, + "grad_norm": 1.589181189634625, + "learning_rate": 6.828890266584006e-07, + "loss": 0.8686, + "step": 176240 + }, + { + "epoch": 13.658026269905847, + "grad_norm": 1.5750427398862756, + "learning_rate": 6.829277743335401e-07, + "loss": 0.904, + "step": 176250 + }, + { + "epoch": 13.658801193382153, + "grad_norm": 1.5711393002927543, + "learning_rate": 6.829665220086795e-07, + "loss": 0.874, + "step": 176260 + }, + { + "epoch": 13.65957611685846, + "grad_norm": 1.6581843500350757, + "learning_rate": 6.83005269683819e-07, + "loss": 0.8909, + "step": 176270 + }, + { + "epoch": 13.660351040334767, + "grad_norm": 1.5780072977432231, + "learning_rate": 6.830440173589586e-07, + "loss": 0.8673, + "step": 176280 + }, + { + "epoch": 13.661125963811074, + "grad_norm": 1.5518021807523374, + "learning_rate": 6.830827650340981e-07, + "loss": 0.8796, + "step": 176290 + }, + { + "epoch": 13.66190088728738, + "grad_norm": 1.5373715916232538, + "learning_rate": 6.831215127092375e-07, + "loss": 0.8744, + "step": 176300 + }, + { + "epoch": 13.662675810763687, + "grad_norm": 1.525925723271007, + "learning_rate": 6.83160260384377e-07, + "loss": 0.8704, + "step": 176310 + }, + { + "epoch": 13.663450734239994, + "grad_norm": 1.4484267515619, + "learning_rate": 6.831990080595164e-07, + "loss": 0.877, + "step": 176320 + }, + { + "epoch": 13.664225657716301, + "grad_norm": 1.6106240830553415, + "learning_rate": 6.832377557346561e-07, + "loss": 0.8748, + "step": 176330 + }, + { + "epoch": 13.665000581192608, + "grad_norm": 1.6086036810163, + "learning_rate": 6.832765034097955e-07, + "loss": 0.8683, + "step": 176340 + }, + { + "epoch": 13.665775504668915, + "grad_norm": 1.4811135276000758, + "learning_rate": 6.83315251084935e-07, + "loss": 0.8742, + "step": 176350 + }, + { + "epoch": 13.666550428145221, + "grad_norm": 1.48937422468669, + "learning_rate": 6.833539987600744e-07, + "loss": 0.891, + "step": 176360 + }, + { + "epoch": 13.667325351621528, + "grad_norm": 1.5854655721806474, + "learning_rate": 6.83392746435214e-07, + "loss": 0.8814, + "step": 176370 + }, + { + "epoch": 13.668100275097833, + "grad_norm": 1.6058578330284792, + "learning_rate": 6.834314941103535e-07, + "loss": 0.871, + "step": 176380 + }, + { + "epoch": 13.66887519857414, + "grad_norm": 1.472547793756648, + "learning_rate": 6.83470241785493e-07, + "loss": 0.8595, + "step": 176390 + }, + { + "epoch": 13.669650122050447, + "grad_norm": 1.4613805036402852, + "learning_rate": 6.835089894606324e-07, + "loss": 0.8721, + "step": 176400 + }, + { + "epoch": 13.670425045526754, + "grad_norm": 1.4814165755749487, + "learning_rate": 6.835477371357719e-07, + "loss": 0.8695, + "step": 176410 + }, + { + "epoch": 13.67119996900306, + "grad_norm": 1.610858401616551, + "learning_rate": 6.835864848109114e-07, + "loss": 0.87, + "step": 176420 + }, + { + "epoch": 13.671974892479367, + "grad_norm": 1.53052778704749, + "learning_rate": 6.83625232486051e-07, + "loss": 0.8676, + "step": 176430 + }, + { + "epoch": 13.672749815955674, + "grad_norm": 1.4858968091882914, + "learning_rate": 6.836639801611904e-07, + "loss": 0.8746, + "step": 176440 + }, + { + "epoch": 13.673524739431981, + "grad_norm": 1.386499395060308, + "learning_rate": 6.837027278363299e-07, + "loss": 0.8793, + "step": 176450 + }, + { + "epoch": 13.674299662908288, + "grad_norm": 1.442066673000774, + "learning_rate": 6.837414755114693e-07, + "loss": 0.8783, + "step": 176460 + }, + { + "epoch": 13.675074586384595, + "grad_norm": 1.5791955612161375, + "learning_rate": 6.837802231866089e-07, + "loss": 0.8705, + "step": 176470 + }, + { + "epoch": 13.675849509860901, + "grad_norm": 1.6097658216378574, + "learning_rate": 6.838189708617484e-07, + "loss": 0.8907, + "step": 176480 + }, + { + "epoch": 13.676624433337208, + "grad_norm": 1.4802176762607022, + "learning_rate": 6.838577185368879e-07, + "loss": 0.886, + "step": 176490 + }, + { + "epoch": 13.677399356813515, + "grad_norm": 1.4175785394640104, + "learning_rate": 6.838964662120273e-07, + "loss": 0.8683, + "step": 176500 + }, + { + "epoch": 13.677399356813515, + "eval_loss": 0.8938866257667542, + "eval_runtime": 328.8334, + "eval_samples_per_second": 34.884, + "eval_steps_per_second": 8.722, + "step": 176500 + }, + { + "epoch": 13.678174280289822, + "grad_norm": 1.554692187441874, + "learning_rate": 6.839352138871669e-07, + "loss": 0.8879, + "step": 176510 + }, + { + "epoch": 13.678949203766129, + "grad_norm": 1.4106446306908442, + "learning_rate": 6.839739615623063e-07, + "loss": 0.875, + "step": 176520 + }, + { + "epoch": 13.679724127242435, + "grad_norm": 1.4410368315826234, + "learning_rate": 6.840127092374459e-07, + "loss": 0.8819, + "step": 176530 + }, + { + "epoch": 13.680499050718742, + "grad_norm": 1.4455587082006447, + "learning_rate": 6.840514569125853e-07, + "loss": 0.8657, + "step": 176540 + }, + { + "epoch": 13.681273974195047, + "grad_norm": 1.4692066303714564, + "learning_rate": 6.840902045877248e-07, + "loss": 0.8731, + "step": 176550 + }, + { + "epoch": 13.682048897671354, + "grad_norm": 1.4647568769545602, + "learning_rate": 6.841289522628643e-07, + "loss": 0.8597, + "step": 176560 + }, + { + "epoch": 13.682823821147661, + "grad_norm": 1.4815249266901278, + "learning_rate": 6.841676999380038e-07, + "loss": 0.8795, + "step": 176570 + }, + { + "epoch": 13.683598744623968, + "grad_norm": 1.5179800597191535, + "learning_rate": 6.842064476131433e-07, + "loss": 0.86, + "step": 176580 + }, + { + "epoch": 13.684373668100275, + "grad_norm": 1.631831824034731, + "learning_rate": 6.842451952882828e-07, + "loss": 0.9042, + "step": 176590 + }, + { + "epoch": 13.685148591576581, + "grad_norm": 1.569762839397159, + "learning_rate": 6.842839429634222e-07, + "loss": 0.8791, + "step": 176600 + }, + { + "epoch": 13.685923515052888, + "grad_norm": 1.5217652627717264, + "learning_rate": 6.843226906385618e-07, + "loss": 0.8757, + "step": 176610 + }, + { + "epoch": 13.686698438529195, + "grad_norm": 1.5307055863141172, + "learning_rate": 6.843614383137012e-07, + "loss": 0.8685, + "step": 176620 + }, + { + "epoch": 13.687473362005502, + "grad_norm": 1.658940109171605, + "learning_rate": 6.844001859888408e-07, + "loss": 0.8653, + "step": 176630 + }, + { + "epoch": 13.688248285481809, + "grad_norm": 1.5802269975164, + "learning_rate": 6.844389336639802e-07, + "loss": 0.8895, + "step": 176640 + }, + { + "epoch": 13.689023208958115, + "grad_norm": 1.4112812500278074, + "learning_rate": 6.844776813391198e-07, + "loss": 0.8733, + "step": 176650 + }, + { + "epoch": 13.689798132434422, + "grad_norm": 1.5515331503622745, + "learning_rate": 6.845164290142592e-07, + "loss": 0.8473, + "step": 176660 + }, + { + "epoch": 13.690573055910729, + "grad_norm": 1.4384256425133175, + "learning_rate": 6.845551766893987e-07, + "loss": 0.8826, + "step": 176670 + }, + { + "epoch": 13.691347979387036, + "grad_norm": 1.4127915496466903, + "learning_rate": 6.845939243645382e-07, + "loss": 0.8798, + "step": 176680 + }, + { + "epoch": 13.692122902863343, + "grad_norm": 1.5141958151201904, + "learning_rate": 6.846326720396777e-07, + "loss": 0.8683, + "step": 176690 + }, + { + "epoch": 13.69289782633965, + "grad_norm": 1.4262876833077816, + "learning_rate": 6.846714197148171e-07, + "loss": 0.8506, + "step": 176700 + }, + { + "epoch": 13.693672749815956, + "grad_norm": 1.5178369733263342, + "learning_rate": 6.847101673899567e-07, + "loss": 0.8707, + "step": 176710 + }, + { + "epoch": 13.694447673292263, + "grad_norm": 1.5798889710753865, + "learning_rate": 6.847489150650961e-07, + "loss": 0.884, + "step": 176720 + }, + { + "epoch": 13.69522259676857, + "grad_norm": 1.4783446203462816, + "learning_rate": 6.847876627402357e-07, + "loss": 0.8886, + "step": 176730 + }, + { + "epoch": 13.695997520244877, + "grad_norm": 1.4222823291133884, + "learning_rate": 6.848264104153751e-07, + "loss": 0.8713, + "step": 176740 + }, + { + "epoch": 13.696772443721182, + "grad_norm": 1.4967432993055423, + "learning_rate": 6.848651580905147e-07, + "loss": 0.9009, + "step": 176750 + }, + { + "epoch": 13.697547367197489, + "grad_norm": 1.5514264052050553, + "learning_rate": 6.849039057656541e-07, + "loss": 0.8692, + "step": 176760 + }, + { + "epoch": 13.698322290673795, + "grad_norm": 1.5966505335149879, + "learning_rate": 6.849426534407936e-07, + "loss": 0.8739, + "step": 176770 + }, + { + "epoch": 13.699097214150102, + "grad_norm": 1.4548895025479387, + "learning_rate": 6.849814011159331e-07, + "loss": 0.8574, + "step": 176780 + }, + { + "epoch": 13.699872137626409, + "grad_norm": 1.5258894381245112, + "learning_rate": 6.850201487910727e-07, + "loss": 0.8891, + "step": 176790 + }, + { + "epoch": 13.700647061102716, + "grad_norm": 1.528101250638688, + "learning_rate": 6.850588964662121e-07, + "loss": 0.8711, + "step": 176800 + }, + { + "epoch": 13.701421984579023, + "grad_norm": 1.6070845027614649, + "learning_rate": 6.850976441413516e-07, + "loss": 0.8784, + "step": 176810 + }, + { + "epoch": 13.70219690805533, + "grad_norm": 1.468656247706089, + "learning_rate": 6.85136391816491e-07, + "loss": 0.8783, + "step": 176820 + }, + { + "epoch": 13.702971831531636, + "grad_norm": 1.549165198305285, + "learning_rate": 6.851751394916306e-07, + "loss": 0.8666, + "step": 176830 + }, + { + "epoch": 13.703746755007943, + "grad_norm": 1.4187930319075528, + "learning_rate": 6.8521388716677e-07, + "loss": 0.8628, + "step": 176840 + }, + { + "epoch": 13.70452167848425, + "grad_norm": 1.4851307365485387, + "learning_rate": 6.852526348419096e-07, + "loss": 0.8916, + "step": 176850 + }, + { + "epoch": 13.705296601960557, + "grad_norm": 1.4826025055186507, + "learning_rate": 6.85291382517049e-07, + "loss": 0.8617, + "step": 176860 + }, + { + "epoch": 13.706071525436863, + "grad_norm": 1.5325921675296865, + "learning_rate": 6.853301301921886e-07, + "loss": 0.8765, + "step": 176870 + }, + { + "epoch": 13.70684644891317, + "grad_norm": 1.5231122618818118, + "learning_rate": 6.85368877867328e-07, + "loss": 0.8708, + "step": 176880 + }, + { + "epoch": 13.707621372389477, + "grad_norm": 1.4825294117001084, + "learning_rate": 6.854076255424676e-07, + "loss": 0.8691, + "step": 176890 + }, + { + "epoch": 13.708396295865784, + "grad_norm": 1.571385039863586, + "learning_rate": 6.85446373217607e-07, + "loss": 0.8648, + "step": 176900 + }, + { + "epoch": 13.70917121934209, + "grad_norm": 1.4705531866259505, + "learning_rate": 6.854851208927465e-07, + "loss": 0.8681, + "step": 176910 + }, + { + "epoch": 13.709946142818396, + "grad_norm": 1.4799318595945326, + "learning_rate": 6.85523868567886e-07, + "loss": 0.8605, + "step": 176920 + }, + { + "epoch": 13.710721066294703, + "grad_norm": 1.4542295576034856, + "learning_rate": 6.855626162430256e-07, + "loss": 0.8742, + "step": 176930 + }, + { + "epoch": 13.71149598977101, + "grad_norm": 1.5267086313973162, + "learning_rate": 6.85601363918165e-07, + "loss": 0.8718, + "step": 176940 + }, + { + "epoch": 13.712270913247316, + "grad_norm": 1.550499201968249, + "learning_rate": 6.856401115933045e-07, + "loss": 0.8878, + "step": 176950 + }, + { + "epoch": 13.713045836723623, + "grad_norm": 1.515979457906718, + "learning_rate": 6.856788592684439e-07, + "loss": 0.8763, + "step": 176960 + }, + { + "epoch": 13.71382076019993, + "grad_norm": 1.5192568836482658, + "learning_rate": 6.857176069435835e-07, + "loss": 0.8685, + "step": 176970 + }, + { + "epoch": 13.714595683676237, + "grad_norm": 1.5423427042353113, + "learning_rate": 6.857563546187229e-07, + "loss": 0.8762, + "step": 176980 + }, + { + "epoch": 13.715370607152543, + "grad_norm": 1.4886548767608596, + "learning_rate": 6.857951022938625e-07, + "loss": 0.8778, + "step": 176990 + }, + { + "epoch": 13.71614553062885, + "grad_norm": 1.6095948637050415, + "learning_rate": 6.858338499690019e-07, + "loss": 0.879, + "step": 177000 + }, + { + "epoch": 13.71614553062885, + "eval_loss": 0.8941438794136047, + "eval_runtime": 329.1639, + "eval_samples_per_second": 34.849, + "eval_steps_per_second": 8.713, + "step": 177000 + }, + { + "epoch": 13.716920454105157, + "grad_norm": 1.3950357996582854, + "learning_rate": 6.858725976441414e-07, + "loss": 0.8551, + "step": 177010 + }, + { + "epoch": 13.717695377581464, + "grad_norm": 1.5003577775558763, + "learning_rate": 6.859113453192809e-07, + "loss": 0.8812, + "step": 177020 + }, + { + "epoch": 13.71847030105777, + "grad_norm": 1.525596301236671, + "learning_rate": 6.859500929944205e-07, + "loss": 0.88, + "step": 177030 + }, + { + "epoch": 13.719245224534077, + "grad_norm": 1.4758302064782762, + "learning_rate": 6.859888406695599e-07, + "loss": 0.879, + "step": 177040 + }, + { + "epoch": 13.720020148010384, + "grad_norm": 1.4446673472830085, + "learning_rate": 6.860275883446994e-07, + "loss": 0.8753, + "step": 177050 + }, + { + "epoch": 13.720795071486691, + "grad_norm": 1.5177579423336538, + "learning_rate": 6.860663360198388e-07, + "loss": 0.867, + "step": 177060 + }, + { + "epoch": 13.721569994962998, + "grad_norm": 1.4452611976083045, + "learning_rate": 6.861050836949785e-07, + "loss": 0.8805, + "step": 177070 + }, + { + "epoch": 13.722344918439305, + "grad_norm": 1.592604151601751, + "learning_rate": 6.861438313701179e-07, + "loss": 0.876, + "step": 177080 + }, + { + "epoch": 13.723119841915612, + "grad_norm": 1.6148804733574393, + "learning_rate": 6.861825790452574e-07, + "loss": 0.8773, + "step": 177090 + }, + { + "epoch": 13.723894765391918, + "grad_norm": 1.5025149212742857, + "learning_rate": 6.862213267203968e-07, + "loss": 0.8794, + "step": 177100 + }, + { + "epoch": 13.724669688868225, + "grad_norm": 1.6286085707883544, + "learning_rate": 6.862600743955363e-07, + "loss": 0.8714, + "step": 177110 + }, + { + "epoch": 13.72544461234453, + "grad_norm": 1.4783006410249984, + "learning_rate": 6.862988220706758e-07, + "loss": 0.8779, + "step": 177120 + }, + { + "epoch": 13.726219535820837, + "grad_norm": 1.6149508717814365, + "learning_rate": 6.863375697458154e-07, + "loss": 0.8848, + "step": 177130 + }, + { + "epoch": 13.726994459297144, + "grad_norm": 1.537799254849703, + "learning_rate": 6.863763174209548e-07, + "loss": 0.8698, + "step": 177140 + }, + { + "epoch": 13.72776938277345, + "grad_norm": 1.6834827200661304, + "learning_rate": 6.864150650960943e-07, + "loss": 0.8767, + "step": 177150 + }, + { + "epoch": 13.728544306249757, + "grad_norm": 1.4289350183715734, + "learning_rate": 6.864538127712337e-07, + "loss": 0.8779, + "step": 177160 + }, + { + "epoch": 13.729319229726064, + "grad_norm": 1.592077219415932, + "learning_rate": 6.864925604463734e-07, + "loss": 0.8852, + "step": 177170 + }, + { + "epoch": 13.730094153202371, + "grad_norm": 1.5104798539507327, + "learning_rate": 6.865313081215128e-07, + "loss": 0.8687, + "step": 177180 + }, + { + "epoch": 13.730869076678678, + "grad_norm": 1.57842410119853, + "learning_rate": 6.865700557966523e-07, + "loss": 0.8713, + "step": 177190 + }, + { + "epoch": 13.731644000154985, + "grad_norm": 1.4402688915422317, + "learning_rate": 6.866088034717917e-07, + "loss": 0.8482, + "step": 177200 + }, + { + "epoch": 13.732418923631291, + "grad_norm": 1.6296981059483306, + "learning_rate": 6.866475511469313e-07, + "loss": 0.8654, + "step": 177210 + }, + { + "epoch": 13.733193847107598, + "grad_norm": 1.4973340035693543, + "learning_rate": 6.866862988220708e-07, + "loss": 0.8737, + "step": 177220 + }, + { + "epoch": 13.733968770583905, + "grad_norm": 1.5733543195333826, + "learning_rate": 6.867250464972103e-07, + "loss": 0.8676, + "step": 177230 + }, + { + "epoch": 13.734743694060212, + "grad_norm": 1.5984216162447924, + "learning_rate": 6.867637941723497e-07, + "loss": 0.8682, + "step": 177240 + }, + { + "epoch": 13.735518617536519, + "grad_norm": 1.4853841333335835, + "learning_rate": 6.868025418474892e-07, + "loss": 0.9062, + "step": 177250 + }, + { + "epoch": 13.736293541012826, + "grad_norm": 1.5055078608099024, + "learning_rate": 6.868412895226286e-07, + "loss": 0.8715, + "step": 177260 + }, + { + "epoch": 13.737068464489132, + "grad_norm": 1.4665735212642876, + "learning_rate": 6.868800371977683e-07, + "loss": 0.8601, + "step": 177270 + }, + { + "epoch": 13.73784338796544, + "grad_norm": 1.560519274383301, + "learning_rate": 6.869187848729077e-07, + "loss": 0.8785, + "step": 177280 + }, + { + "epoch": 13.738618311441746, + "grad_norm": 1.5563316317588722, + "learning_rate": 6.869575325480472e-07, + "loss": 0.8838, + "step": 177290 + }, + { + "epoch": 13.739393234918051, + "grad_norm": 1.4141413043354394, + "learning_rate": 6.869962802231866e-07, + "loss": 0.864, + "step": 177300 + }, + { + "epoch": 13.740168158394358, + "grad_norm": 1.5527752198690334, + "learning_rate": 6.870350278983262e-07, + "loss": 0.8738, + "step": 177310 + }, + { + "epoch": 13.740943081870665, + "grad_norm": 1.4552789138754652, + "learning_rate": 6.870737755734657e-07, + "loss": 0.8817, + "step": 177320 + }, + { + "epoch": 13.741718005346971, + "grad_norm": 1.587860626500384, + "learning_rate": 6.871125232486052e-07, + "loss": 0.869, + "step": 177330 + }, + { + "epoch": 13.742492928823278, + "grad_norm": 1.5297381462639648, + "learning_rate": 6.871512709237446e-07, + "loss": 0.8662, + "step": 177340 + }, + { + "epoch": 13.743267852299585, + "grad_norm": 1.5542769194715418, + "learning_rate": 6.871900185988842e-07, + "loss": 0.8917, + "step": 177350 + }, + { + "epoch": 13.744042775775892, + "grad_norm": 1.4406318747210156, + "learning_rate": 6.872287662740236e-07, + "loss": 0.8725, + "step": 177360 + }, + { + "epoch": 13.744817699252199, + "grad_norm": 1.5261706719210928, + "learning_rate": 6.872675139491632e-07, + "loss": 0.8666, + "step": 177370 + }, + { + "epoch": 13.745592622728505, + "grad_norm": 1.467002163244196, + "learning_rate": 6.873062616243026e-07, + "loss": 0.8775, + "step": 177380 + }, + { + "epoch": 13.746367546204812, + "grad_norm": 1.4911406491282528, + "learning_rate": 6.873450092994421e-07, + "loss": 0.8895, + "step": 177390 + }, + { + "epoch": 13.747142469681119, + "grad_norm": 1.525564758079329, + "learning_rate": 6.873837569745815e-07, + "loss": 0.8687, + "step": 177400 + }, + { + "epoch": 13.747917393157426, + "grad_norm": 1.4737603220752868, + "learning_rate": 6.874225046497211e-07, + "loss": 0.8772, + "step": 177410 + }, + { + "epoch": 13.748692316633733, + "grad_norm": 1.477486308668796, + "learning_rate": 6.874612523248606e-07, + "loss": 0.8636, + "step": 177420 + }, + { + "epoch": 13.74946724011004, + "grad_norm": 1.519571088856026, + "learning_rate": 6.875000000000001e-07, + "loss": 0.8763, + "step": 177430 + }, + { + "epoch": 13.750242163586346, + "grad_norm": 1.5829484498355093, + "learning_rate": 6.875387476751395e-07, + "loss": 0.8401, + "step": 177440 + }, + { + "epoch": 13.751017087062653, + "grad_norm": 1.4469911679963487, + "learning_rate": 6.875774953502791e-07, + "loss": 0.8804, + "step": 177450 + }, + { + "epoch": 13.75179201053896, + "grad_norm": 1.5590855981498672, + "learning_rate": 6.876162430254185e-07, + "loss": 0.8908, + "step": 177460 + }, + { + "epoch": 13.752566934015267, + "grad_norm": 1.5308303167793575, + "learning_rate": 6.876549907005581e-07, + "loss": 0.8767, + "step": 177470 + }, + { + "epoch": 13.753341857491574, + "grad_norm": 1.594717138830508, + "learning_rate": 6.876937383756975e-07, + "loss": 0.8702, + "step": 177480 + }, + { + "epoch": 13.754116780967879, + "grad_norm": 1.4618458771220746, + "learning_rate": 6.877324860508371e-07, + "loss": 0.8729, + "step": 177490 + }, + { + "epoch": 13.754891704444185, + "grad_norm": 3.001889416571627, + "learning_rate": 6.877712337259765e-07, + "loss": 0.8771, + "step": 177500 + }, + { + "epoch": 13.754891704444185, + "eval_loss": 0.8935867547988892, + "eval_runtime": 330.2753, + "eval_samples_per_second": 34.732, + "eval_steps_per_second": 8.684, + "step": 177500 + }, + { + "epoch": 13.755666627920492, + "grad_norm": 1.627955907813773, + "learning_rate": 6.87809981401116e-07, + "loss": 0.8738, + "step": 177510 + }, + { + "epoch": 13.756441551396799, + "grad_norm": 1.4419051891754266, + "learning_rate": 6.878487290762555e-07, + "loss": 0.8972, + "step": 177520 + }, + { + "epoch": 13.757216474873106, + "grad_norm": 1.5174133051985443, + "learning_rate": 6.87887476751395e-07, + "loss": 0.8729, + "step": 177530 + }, + { + "epoch": 13.757991398349413, + "grad_norm": 1.4883959782332634, + "learning_rate": 6.879262244265344e-07, + "loss": 0.8689, + "step": 177540 + }, + { + "epoch": 13.75876632182572, + "grad_norm": 1.5861556087260502, + "learning_rate": 6.87964972101674e-07, + "loss": 0.8641, + "step": 177550 + }, + { + "epoch": 13.759541245302026, + "grad_norm": 1.5597855679862762, + "learning_rate": 6.880037197768135e-07, + "loss": 0.887, + "step": 177560 + }, + { + "epoch": 13.760316168778333, + "grad_norm": 1.4774966118057373, + "learning_rate": 6.88042467451953e-07, + "loss": 0.882, + "step": 177570 + }, + { + "epoch": 13.76109109225464, + "grad_norm": 1.499901499008916, + "learning_rate": 6.880812151270924e-07, + "loss": 0.8823, + "step": 177580 + }, + { + "epoch": 13.761866015730947, + "grad_norm": 1.4542902115520826, + "learning_rate": 6.88119962802232e-07, + "loss": 0.8604, + "step": 177590 + }, + { + "epoch": 13.762640939207254, + "grad_norm": 1.4912131550072905, + "learning_rate": 6.881587104773714e-07, + "loss": 0.8466, + "step": 177600 + }, + { + "epoch": 13.76341586268356, + "grad_norm": 1.5852469910069704, + "learning_rate": 6.88197458152511e-07, + "loss": 0.8504, + "step": 177610 + }, + { + "epoch": 13.764190786159867, + "grad_norm": 1.5794325538944505, + "learning_rate": 6.882362058276504e-07, + "loss": 0.8812, + "step": 177620 + }, + { + "epoch": 13.764965709636174, + "grad_norm": 1.4318960238428171, + "learning_rate": 6.882749535027899e-07, + "loss": 0.8736, + "step": 177630 + }, + { + "epoch": 13.76574063311248, + "grad_norm": 1.540558819145081, + "learning_rate": 6.883137011779294e-07, + "loss": 0.8775, + "step": 177640 + }, + { + "epoch": 13.766515556588788, + "grad_norm": 1.4805694440549793, + "learning_rate": 6.883524488530689e-07, + "loss": 0.8583, + "step": 177650 + }, + { + "epoch": 13.767290480065094, + "grad_norm": 1.4971531496061523, + "learning_rate": 6.883911965282084e-07, + "loss": 0.8651, + "step": 177660 + }, + { + "epoch": 13.7680654035414, + "grad_norm": 1.5620279579989838, + "learning_rate": 6.884299442033479e-07, + "loss": 0.8726, + "step": 177670 + }, + { + "epoch": 13.768840327017706, + "grad_norm": 1.528565793668568, + "learning_rate": 6.884686918784873e-07, + "loss": 0.8896, + "step": 177680 + }, + { + "epoch": 13.769615250494013, + "grad_norm": 1.5290197781467618, + "learning_rate": 6.885074395536269e-07, + "loss": 0.8686, + "step": 177690 + }, + { + "epoch": 13.77039017397032, + "grad_norm": 1.4780782053376458, + "learning_rate": 6.885461872287663e-07, + "loss": 0.8869, + "step": 177700 + }, + { + "epoch": 13.771165097446627, + "grad_norm": 1.5123553828499112, + "learning_rate": 6.885849349039059e-07, + "loss": 0.8805, + "step": 177710 + }, + { + "epoch": 13.771940020922933, + "grad_norm": 1.5434305771736438, + "learning_rate": 6.886236825790453e-07, + "loss": 0.8577, + "step": 177720 + }, + { + "epoch": 13.77271494439924, + "grad_norm": 1.4847352609110178, + "learning_rate": 6.886624302541849e-07, + "loss": 0.8487, + "step": 177730 + }, + { + "epoch": 13.773489867875547, + "grad_norm": 1.4696285855380538, + "learning_rate": 6.887011779293243e-07, + "loss": 0.8763, + "step": 177740 + }, + { + "epoch": 13.774264791351854, + "grad_norm": 1.6577094345994015, + "learning_rate": 6.887399256044638e-07, + "loss": 0.8956, + "step": 177750 + }, + { + "epoch": 13.77503971482816, + "grad_norm": 1.4673937997055266, + "learning_rate": 6.887786732796033e-07, + "loss": 0.8888, + "step": 177760 + }, + { + "epoch": 13.775814638304468, + "grad_norm": 1.4966466565307222, + "learning_rate": 6.888174209547428e-07, + "loss": 0.875, + "step": 177770 + }, + { + "epoch": 13.776589561780774, + "grad_norm": 1.5122855008490694, + "learning_rate": 6.888561686298823e-07, + "loss": 0.8683, + "step": 177780 + }, + { + "epoch": 13.777364485257081, + "grad_norm": 1.6335633349444119, + "learning_rate": 6.888949163050218e-07, + "loss": 0.8628, + "step": 177790 + }, + { + "epoch": 13.778139408733388, + "grad_norm": 1.514499294093738, + "learning_rate": 6.889336639801612e-07, + "loss": 0.8732, + "step": 177800 + }, + { + "epoch": 13.778914332209695, + "grad_norm": 1.4985455184784462, + "learning_rate": 6.889724116553008e-07, + "loss": 0.8757, + "step": 177810 + }, + { + "epoch": 13.779689255686002, + "grad_norm": 1.459795550929849, + "learning_rate": 6.890111593304402e-07, + "loss": 0.8656, + "step": 177820 + }, + { + "epoch": 13.780464179162308, + "grad_norm": 1.4787592560531897, + "learning_rate": 6.890499070055798e-07, + "loss": 0.8666, + "step": 177830 + }, + { + "epoch": 13.781239102638615, + "grad_norm": 1.516801835267389, + "learning_rate": 6.890886546807192e-07, + "loss": 0.8541, + "step": 177840 + }, + { + "epoch": 13.782014026114922, + "grad_norm": 1.4439768255729415, + "learning_rate": 6.891274023558587e-07, + "loss": 0.8779, + "step": 177850 + }, + { + "epoch": 13.782788949591229, + "grad_norm": 1.5304012916136491, + "learning_rate": 6.891661500309982e-07, + "loss": 0.8757, + "step": 177860 + }, + { + "epoch": 13.783563873067534, + "grad_norm": 1.6154118204337387, + "learning_rate": 6.892048977061378e-07, + "loss": 0.8569, + "step": 177870 + }, + { + "epoch": 13.78433879654384, + "grad_norm": 1.4432961443752708, + "learning_rate": 6.892436453812772e-07, + "loss": 0.8723, + "step": 177880 + }, + { + "epoch": 13.785113720020147, + "grad_norm": 1.4581536376515725, + "learning_rate": 6.892823930564167e-07, + "loss": 0.8704, + "step": 177890 + }, + { + "epoch": 13.785888643496454, + "grad_norm": 1.5612056365596807, + "learning_rate": 6.893211407315561e-07, + "loss": 0.8717, + "step": 177900 + }, + { + "epoch": 13.786663566972761, + "grad_norm": 1.4928342500270386, + "learning_rate": 6.893598884066957e-07, + "loss": 0.8668, + "step": 177910 + }, + { + "epoch": 13.787438490449068, + "grad_norm": 1.3792304755872824, + "learning_rate": 6.893986360818352e-07, + "loss": 0.86, + "step": 177920 + }, + { + "epoch": 13.788213413925375, + "grad_norm": 1.4171757280286608, + "learning_rate": 6.894373837569747e-07, + "loss": 0.8477, + "step": 177930 + }, + { + "epoch": 13.788988337401682, + "grad_norm": 1.5276421362480366, + "learning_rate": 6.894761314321141e-07, + "loss": 0.8771, + "step": 177940 + }, + { + "epoch": 13.789763260877988, + "grad_norm": 1.4647261526936453, + "learning_rate": 6.895148791072536e-07, + "loss": 0.8521, + "step": 177950 + }, + { + "epoch": 13.790538184354295, + "grad_norm": 1.5952394178166567, + "learning_rate": 6.895536267823931e-07, + "loss": 0.8785, + "step": 177960 + }, + { + "epoch": 13.791313107830602, + "grad_norm": 1.519629422155771, + "learning_rate": 6.895923744575327e-07, + "loss": 0.8641, + "step": 177970 + }, + { + "epoch": 13.792088031306909, + "grad_norm": 1.5335917213612351, + "learning_rate": 6.896311221326721e-07, + "loss": 0.8824, + "step": 177980 + }, + { + "epoch": 13.792862954783216, + "grad_norm": 1.4817076359531192, + "learning_rate": 6.896698698078116e-07, + "loss": 0.8734, + "step": 177990 + }, + { + "epoch": 13.793637878259522, + "grad_norm": 1.5061172071356885, + "learning_rate": 6.89708617482951e-07, + "loss": 0.8742, + "step": 178000 + }, + { + "epoch": 13.793637878259522, + "eval_loss": 0.8934964537620544, + "eval_runtime": 329.0793, + "eval_samples_per_second": 34.858, + "eval_steps_per_second": 8.715, + "step": 178000 + }, + { + "epoch": 13.79441280173583, + "grad_norm": 1.506094374237411, + "learning_rate": 6.897473651580907e-07, + "loss": 0.8822, + "step": 178010 + }, + { + "epoch": 13.795187725212136, + "grad_norm": 1.509565078897164, + "learning_rate": 6.897861128332301e-07, + "loss": 0.8587, + "step": 178020 + }, + { + "epoch": 13.795962648688443, + "grad_norm": 1.5212545347879203, + "learning_rate": 6.898248605083696e-07, + "loss": 0.8539, + "step": 178030 + }, + { + "epoch": 13.796737572164748, + "grad_norm": 1.4867508375914833, + "learning_rate": 6.89863608183509e-07, + "loss": 0.8598, + "step": 178040 + }, + { + "epoch": 13.797512495641055, + "grad_norm": 1.5264573981409286, + "learning_rate": 6.899023558586485e-07, + "loss": 0.8861, + "step": 178050 + }, + { + "epoch": 13.798287419117361, + "grad_norm": 1.3968411431719083, + "learning_rate": 6.899411035337881e-07, + "loss": 0.8917, + "step": 178060 + }, + { + "epoch": 13.799062342593668, + "grad_norm": 1.457060577589465, + "learning_rate": 6.899798512089276e-07, + "loss": 0.8931, + "step": 178070 + }, + { + "epoch": 13.799837266069975, + "grad_norm": 1.503448715843078, + "learning_rate": 6.90018598884067e-07, + "loss": 0.8832, + "step": 178080 + }, + { + "epoch": 13.800612189546282, + "grad_norm": 1.4674846043968937, + "learning_rate": 6.900573465592065e-07, + "loss": 0.8689, + "step": 178090 + }, + { + "epoch": 13.801387113022589, + "grad_norm": 1.567162615074442, + "learning_rate": 6.900960942343459e-07, + "loss": 0.8732, + "step": 178100 + }, + { + "epoch": 13.802162036498896, + "grad_norm": 1.5600536104019929, + "learning_rate": 6.901348419094856e-07, + "loss": 0.8679, + "step": 178110 + }, + { + "epoch": 13.802936959975202, + "grad_norm": 1.4830174663440086, + "learning_rate": 6.90173589584625e-07, + "loss": 0.8755, + "step": 178120 + }, + { + "epoch": 13.80371188345151, + "grad_norm": 1.7022089507146387, + "learning_rate": 6.902123372597645e-07, + "loss": 0.886, + "step": 178130 + }, + { + "epoch": 13.804486806927816, + "grad_norm": 1.5193405245472715, + "learning_rate": 6.902510849349039e-07, + "loss": 0.8694, + "step": 178140 + }, + { + "epoch": 13.805261730404123, + "grad_norm": 1.5294073907940258, + "learning_rate": 6.902898326100436e-07, + "loss": 0.882, + "step": 178150 + }, + { + "epoch": 13.80603665388043, + "grad_norm": 1.592033823971622, + "learning_rate": 6.90328580285183e-07, + "loss": 0.8702, + "step": 178160 + }, + { + "epoch": 13.806811577356736, + "grad_norm": 1.5764313718159344, + "learning_rate": 6.903673279603225e-07, + "loss": 0.8791, + "step": 178170 + }, + { + "epoch": 13.807586500833043, + "grad_norm": 1.3983632108538624, + "learning_rate": 6.904060756354619e-07, + "loss": 0.8672, + "step": 178180 + }, + { + "epoch": 13.80836142430935, + "grad_norm": 1.388682592681265, + "learning_rate": 6.904448233106014e-07, + "loss": 0.8702, + "step": 178190 + }, + { + "epoch": 13.809136347785657, + "grad_norm": 1.514924814573715, + "learning_rate": 6.904835709857408e-07, + "loss": 0.8658, + "step": 178200 + }, + { + "epoch": 13.809911271261964, + "grad_norm": 1.4621332664600328, + "learning_rate": 6.905223186608805e-07, + "loss": 0.864, + "step": 178210 + }, + { + "epoch": 13.81068619473827, + "grad_norm": 1.5823737181228226, + "learning_rate": 6.905610663360199e-07, + "loss": 0.8655, + "step": 178220 + }, + { + "epoch": 13.811461118214577, + "grad_norm": 1.454977573715787, + "learning_rate": 6.905998140111594e-07, + "loss": 0.8508, + "step": 178230 + }, + { + "epoch": 13.812236041690882, + "grad_norm": 1.586730536087839, + "learning_rate": 6.906385616862988e-07, + "loss": 0.8794, + "step": 178240 + }, + { + "epoch": 13.813010965167189, + "grad_norm": 1.463305079483694, + "learning_rate": 6.906773093614385e-07, + "loss": 0.8662, + "step": 178250 + }, + { + "epoch": 13.813785888643496, + "grad_norm": 1.4382064940409378, + "learning_rate": 6.907160570365779e-07, + "loss": 0.8957, + "step": 178260 + }, + { + "epoch": 13.814560812119803, + "grad_norm": 1.498461337062661, + "learning_rate": 6.907548047117174e-07, + "loss": 0.8846, + "step": 178270 + }, + { + "epoch": 13.81533573559611, + "grad_norm": 1.4624015939240274, + "learning_rate": 6.907935523868568e-07, + "loss": 0.8714, + "step": 178280 + }, + { + "epoch": 13.816110659072416, + "grad_norm": 1.6210159272481544, + "learning_rate": 6.908323000619964e-07, + "loss": 0.8866, + "step": 178290 + }, + { + "epoch": 13.816885582548723, + "grad_norm": 1.428938786483163, + "learning_rate": 6.908710477371359e-07, + "loss": 0.8606, + "step": 178300 + }, + { + "epoch": 13.81766050602503, + "grad_norm": 1.5720096520277578, + "learning_rate": 6.909097954122754e-07, + "loss": 0.8699, + "step": 178310 + }, + { + "epoch": 13.818435429501337, + "grad_norm": 1.4791651061046125, + "learning_rate": 6.909485430874148e-07, + "loss": 0.8648, + "step": 178320 + }, + { + "epoch": 13.819210352977644, + "grad_norm": 1.3960566485269255, + "learning_rate": 6.909872907625543e-07, + "loss": 0.8646, + "step": 178330 + }, + { + "epoch": 13.81998527645395, + "grad_norm": 1.6036263088614795, + "learning_rate": 6.910260384376937e-07, + "loss": 0.864, + "step": 178340 + }, + { + "epoch": 13.820760199930257, + "grad_norm": 1.505974017256976, + "learning_rate": 6.910647861128334e-07, + "loss": 0.8639, + "step": 178350 + }, + { + "epoch": 13.821535123406564, + "grad_norm": 1.5321436660957686, + "learning_rate": 6.911035337879728e-07, + "loss": 0.8888, + "step": 178360 + }, + { + "epoch": 13.82231004688287, + "grad_norm": 1.4718049249218907, + "learning_rate": 6.911422814631123e-07, + "loss": 0.8742, + "step": 178370 + }, + { + "epoch": 13.823084970359178, + "grad_norm": 1.4115613038227106, + "learning_rate": 6.911810291382517e-07, + "loss": 0.8785, + "step": 178380 + }, + { + "epoch": 13.823859893835484, + "grad_norm": 1.4609751663556794, + "learning_rate": 6.912197768133913e-07, + "loss": 0.8515, + "step": 178390 + }, + { + "epoch": 13.824634817311791, + "grad_norm": 1.592774427260741, + "learning_rate": 6.912585244885308e-07, + "loss": 0.8937, + "step": 178400 + }, + { + "epoch": 13.825409740788096, + "grad_norm": 1.3897558272167871, + "learning_rate": 6.912972721636703e-07, + "loss": 0.8832, + "step": 178410 + }, + { + "epoch": 13.826184664264403, + "grad_norm": 1.5440249233750745, + "learning_rate": 6.913360198388097e-07, + "loss": 0.8748, + "step": 178420 + }, + { + "epoch": 13.82695958774071, + "grad_norm": 1.547795171465217, + "learning_rate": 6.913747675139493e-07, + "loss": 0.8799, + "step": 178430 + }, + { + "epoch": 13.827734511217017, + "grad_norm": 1.4630573063811707, + "learning_rate": 6.914135151890887e-07, + "loss": 0.8611, + "step": 178440 + }, + { + "epoch": 13.828509434693323, + "grad_norm": 1.6978554704417856, + "learning_rate": 6.914522628642283e-07, + "loss": 0.8623, + "step": 178450 + }, + { + "epoch": 13.82928435816963, + "grad_norm": 1.4602813026326276, + "learning_rate": 6.914910105393677e-07, + "loss": 0.8702, + "step": 178460 + }, + { + "epoch": 13.830059281645937, + "grad_norm": 1.4252174936011017, + "learning_rate": 6.915297582145072e-07, + "loss": 0.8839, + "step": 178470 + }, + { + "epoch": 13.830834205122244, + "grad_norm": 1.479299219043404, + "learning_rate": 6.915685058896466e-07, + "loss": 0.8616, + "step": 178480 + }, + { + "epoch": 13.83160912859855, + "grad_norm": 1.5358970717566192, + "learning_rate": 6.916072535647862e-07, + "loss": 0.8792, + "step": 178490 + }, + { + "epoch": 13.832384052074858, + "grad_norm": 1.4795714633491939, + "learning_rate": 6.916460012399257e-07, + "loss": 0.8814, + "step": 178500 + }, + { + "epoch": 13.832384052074858, + "eval_loss": 0.8934553265571594, + "eval_runtime": 329.1824, + "eval_samples_per_second": 34.847, + "eval_steps_per_second": 8.712, + "step": 178500 + }, + { + "epoch": 13.833158975551164, + "grad_norm": 1.5415008550492006, + "learning_rate": 6.916847489150652e-07, + "loss": 0.8557, + "step": 178510 + }, + { + "epoch": 13.833933899027471, + "grad_norm": 1.4337784154937308, + "learning_rate": 6.917234965902046e-07, + "loss": 0.8852, + "step": 178520 + }, + { + "epoch": 13.834708822503778, + "grad_norm": 1.4879114043915715, + "learning_rate": 6.917622442653442e-07, + "loss": 0.8689, + "step": 178530 + }, + { + "epoch": 13.835483745980085, + "grad_norm": 1.4684253840291772, + "learning_rate": 6.918009919404836e-07, + "loss": 0.8819, + "step": 178540 + }, + { + "epoch": 13.836258669456392, + "grad_norm": 1.507484720702198, + "learning_rate": 6.918397396156232e-07, + "loss": 0.8566, + "step": 178550 + }, + { + "epoch": 13.837033592932698, + "grad_norm": 1.5231278814972598, + "learning_rate": 6.918784872907626e-07, + "loss": 0.8682, + "step": 178560 + }, + { + "epoch": 13.837808516409005, + "grad_norm": 1.4753674254016989, + "learning_rate": 6.919172349659022e-07, + "loss": 0.8445, + "step": 178570 + }, + { + "epoch": 13.838583439885312, + "grad_norm": 1.5679543835632395, + "learning_rate": 6.919559826410416e-07, + "loss": 0.8776, + "step": 178580 + }, + { + "epoch": 13.839358363361619, + "grad_norm": 1.46226143545851, + "learning_rate": 6.919947303161811e-07, + "loss": 0.871, + "step": 178590 + }, + { + "epoch": 13.840133286837926, + "grad_norm": 1.540896190457173, + "learning_rate": 6.920334779913206e-07, + "loss": 0.8721, + "step": 178600 + }, + { + "epoch": 13.84090821031423, + "grad_norm": 1.5565702645159805, + "learning_rate": 6.920722256664601e-07, + "loss": 0.8809, + "step": 178610 + }, + { + "epoch": 13.841683133790537, + "grad_norm": 1.440937320378155, + "learning_rate": 6.921109733415995e-07, + "loss": 0.8775, + "step": 178620 + }, + { + "epoch": 13.842458057266844, + "grad_norm": 1.5116571033469046, + "learning_rate": 6.921497210167391e-07, + "loss": 0.8733, + "step": 178630 + }, + { + "epoch": 13.843232980743151, + "grad_norm": 1.5556456897644555, + "learning_rate": 6.921884686918785e-07, + "loss": 0.8756, + "step": 178640 + }, + { + "epoch": 13.844007904219458, + "grad_norm": 1.527235346020626, + "learning_rate": 6.922272163670181e-07, + "loss": 0.8645, + "step": 178650 + }, + { + "epoch": 13.844782827695765, + "grad_norm": 1.4526994392527068, + "learning_rate": 6.922659640421575e-07, + "loss": 0.867, + "step": 178660 + }, + { + "epoch": 13.845557751172072, + "grad_norm": 1.4959206850397273, + "learning_rate": 6.923047117172971e-07, + "loss": 0.8731, + "step": 178670 + }, + { + "epoch": 13.846332674648378, + "grad_norm": 1.6139069718978274, + "learning_rate": 6.923434593924365e-07, + "loss": 0.8638, + "step": 178680 + }, + { + "epoch": 13.847107598124685, + "grad_norm": 1.565554590988024, + "learning_rate": 6.92382207067576e-07, + "loss": 0.8811, + "step": 178690 + }, + { + "epoch": 13.847882521600992, + "grad_norm": 1.562067216029288, + "learning_rate": 6.924209547427155e-07, + "loss": 0.8732, + "step": 178700 + }, + { + "epoch": 13.848657445077299, + "grad_norm": 1.4800996767970305, + "learning_rate": 6.924597024178551e-07, + "loss": 0.8673, + "step": 178710 + }, + { + "epoch": 13.849432368553606, + "grad_norm": 1.452781977676611, + "learning_rate": 6.924984500929945e-07, + "loss": 0.8843, + "step": 178720 + }, + { + "epoch": 13.850207292029912, + "grad_norm": 1.4199606004319654, + "learning_rate": 6.92537197768134e-07, + "loss": 0.866, + "step": 178730 + }, + { + "epoch": 13.85098221550622, + "grad_norm": 1.5171965387057713, + "learning_rate": 6.925759454432734e-07, + "loss": 0.8514, + "step": 178740 + }, + { + "epoch": 13.851757138982526, + "grad_norm": 1.5897285666835212, + "learning_rate": 6.92614693118413e-07, + "loss": 0.8747, + "step": 178750 + }, + { + "epoch": 13.852532062458833, + "grad_norm": 1.474257681550108, + "learning_rate": 6.926534407935524e-07, + "loss": 0.8697, + "step": 178760 + }, + { + "epoch": 13.85330698593514, + "grad_norm": 1.4792586190720622, + "learning_rate": 6.92692188468692e-07, + "loss": 0.8537, + "step": 178770 + }, + { + "epoch": 13.854081909411445, + "grad_norm": 1.5295256572609788, + "learning_rate": 6.927309361438314e-07, + "loss": 0.8619, + "step": 178780 + }, + { + "epoch": 13.854856832887751, + "grad_norm": 1.5246046710522538, + "learning_rate": 6.92769683818971e-07, + "loss": 0.8856, + "step": 178790 + }, + { + "epoch": 13.855631756364058, + "grad_norm": 1.508185250963361, + "learning_rate": 6.928084314941104e-07, + "loss": 0.882, + "step": 178800 + }, + { + "epoch": 13.856406679840365, + "grad_norm": 1.5805512228278285, + "learning_rate": 6.9284717916925e-07, + "loss": 0.8777, + "step": 178810 + }, + { + "epoch": 13.857181603316672, + "grad_norm": 1.5346591017663398, + "learning_rate": 6.928859268443894e-07, + "loss": 0.8814, + "step": 178820 + }, + { + "epoch": 13.857956526792979, + "grad_norm": 1.4767412443469952, + "learning_rate": 6.929246745195289e-07, + "loss": 0.8763, + "step": 178830 + }, + { + "epoch": 13.858731450269286, + "grad_norm": 1.4649346250133348, + "learning_rate": 6.929634221946683e-07, + "loss": 0.8926, + "step": 178840 + }, + { + "epoch": 13.859506373745592, + "grad_norm": 1.6125851649904521, + "learning_rate": 6.93002169869808e-07, + "loss": 0.8687, + "step": 178850 + }, + { + "epoch": 13.8602812972219, + "grad_norm": 1.4118091247742712, + "learning_rate": 6.930409175449474e-07, + "loss": 0.8835, + "step": 178860 + }, + { + "epoch": 13.861056220698206, + "grad_norm": 1.4561240298078808, + "learning_rate": 6.930796652200869e-07, + "loss": 0.8612, + "step": 178870 + }, + { + "epoch": 13.861831144174513, + "grad_norm": 1.4077273457260366, + "learning_rate": 6.931184128952263e-07, + "loss": 0.8547, + "step": 178880 + }, + { + "epoch": 13.86260606765082, + "grad_norm": 1.5384545183043123, + "learning_rate": 6.931571605703659e-07, + "loss": 0.8802, + "step": 178890 + }, + { + "epoch": 13.863380991127126, + "grad_norm": 1.4755883921881292, + "learning_rate": 6.931959082455053e-07, + "loss": 0.8755, + "step": 178900 + }, + { + "epoch": 13.864155914603433, + "grad_norm": 1.4927725224750146, + "learning_rate": 6.932346559206449e-07, + "loss": 0.901, + "step": 178910 + }, + { + "epoch": 13.86493083807974, + "grad_norm": 1.464700791738123, + "learning_rate": 6.932734035957843e-07, + "loss": 0.8684, + "step": 178920 + }, + { + "epoch": 13.865705761556047, + "grad_norm": 1.4924718770177414, + "learning_rate": 6.933121512709238e-07, + "loss": 0.8724, + "step": 178930 + }, + { + "epoch": 13.866480685032354, + "grad_norm": 1.5214229916795738, + "learning_rate": 6.933508989460632e-07, + "loss": 0.876, + "step": 178940 + }, + { + "epoch": 13.86725560850866, + "grad_norm": 1.5734175066677125, + "learning_rate": 6.933896466212029e-07, + "loss": 0.8629, + "step": 178950 + }, + { + "epoch": 13.868030531984967, + "grad_norm": 1.5429120833073087, + "learning_rate": 6.934283942963423e-07, + "loss": 0.8802, + "step": 178960 + }, + { + "epoch": 13.868805455461274, + "grad_norm": 1.4304233266497348, + "learning_rate": 6.934671419714818e-07, + "loss": 0.8663, + "step": 178970 + }, + { + "epoch": 13.86958037893758, + "grad_norm": 1.5175288061514716, + "learning_rate": 6.935058896466212e-07, + "loss": 0.8551, + "step": 178980 + }, + { + "epoch": 13.870355302413886, + "grad_norm": 1.5162459513999789, + "learning_rate": 6.935446373217609e-07, + "loss": 0.8603, + "step": 178990 + }, + { + "epoch": 13.871130225890193, + "grad_norm": 1.5824862562546944, + "learning_rate": 6.935833849969003e-07, + "loss": 0.8882, + "step": 179000 + }, + { + "epoch": 13.871130225890193, + "eval_loss": 0.8933534622192383, + "eval_runtime": 329.5799, + "eval_samples_per_second": 34.805, + "eval_steps_per_second": 8.702, + "step": 179000 + }, + { + "epoch": 13.8719051493665, + "grad_norm": 1.5151577493475716, + "learning_rate": 6.936221326720398e-07, + "loss": 0.8613, + "step": 179010 + }, + { + "epoch": 13.872680072842806, + "grad_norm": 1.4246540099149767, + "learning_rate": 6.936608803471792e-07, + "loss": 0.8446, + "step": 179020 + }, + { + "epoch": 13.873454996319113, + "grad_norm": 1.5020031157268847, + "learning_rate": 6.936996280223187e-07, + "loss": 0.8589, + "step": 179030 + }, + { + "epoch": 13.87422991979542, + "grad_norm": 1.5271924668883674, + "learning_rate": 6.937383756974582e-07, + "loss": 0.8848, + "step": 179040 + }, + { + "epoch": 13.875004843271727, + "grad_norm": 1.508761683336882, + "learning_rate": 6.937771233725978e-07, + "loss": 0.8986, + "step": 179050 + }, + { + "epoch": 13.875779766748034, + "grad_norm": 1.448755199609989, + "learning_rate": 6.938158710477372e-07, + "loss": 0.8702, + "step": 179060 + }, + { + "epoch": 13.87655469022434, + "grad_norm": 1.5591442663938884, + "learning_rate": 6.938546187228767e-07, + "loss": 0.87, + "step": 179070 + }, + { + "epoch": 13.877329613700647, + "grad_norm": 1.49432791457155, + "learning_rate": 6.938933663980161e-07, + "loss": 0.8671, + "step": 179080 + }, + { + "epoch": 13.878104537176954, + "grad_norm": 1.5423994052854346, + "learning_rate": 6.939321140731558e-07, + "loss": 0.8866, + "step": 179090 + }, + { + "epoch": 13.87887946065326, + "grad_norm": 1.4814505455414742, + "learning_rate": 6.939708617482952e-07, + "loss": 0.8833, + "step": 179100 + }, + { + "epoch": 13.879654384129568, + "grad_norm": 1.48943791233051, + "learning_rate": 6.940096094234347e-07, + "loss": 0.8908, + "step": 179110 + }, + { + "epoch": 13.880429307605874, + "grad_norm": 1.516340878749964, + "learning_rate": 6.940483570985741e-07, + "loss": 0.8755, + "step": 179120 + }, + { + "epoch": 13.881204231082181, + "grad_norm": 1.472522782064446, + "learning_rate": 6.940871047737136e-07, + "loss": 0.9076, + "step": 179130 + }, + { + "epoch": 13.881979154558488, + "grad_norm": 1.3996733859150057, + "learning_rate": 6.941258524488532e-07, + "loss": 0.8866, + "step": 179140 + }, + { + "epoch": 13.882754078034795, + "grad_norm": 1.4290401752634223, + "learning_rate": 6.941646001239927e-07, + "loss": 0.8656, + "step": 179150 + }, + { + "epoch": 13.8835290015111, + "grad_norm": 1.4179458712393769, + "learning_rate": 6.942033477991321e-07, + "loss": 0.8736, + "step": 179160 + }, + { + "epoch": 13.884303924987407, + "grad_norm": 1.6785884602669146, + "learning_rate": 6.942420954742716e-07, + "loss": 0.8618, + "step": 179170 + }, + { + "epoch": 13.885078848463714, + "grad_norm": 1.5589785513472942, + "learning_rate": 6.94280843149411e-07, + "loss": 0.8767, + "step": 179180 + }, + { + "epoch": 13.88585377194002, + "grad_norm": 1.5050044438852208, + "learning_rate": 6.943195908245507e-07, + "loss": 0.8614, + "step": 179190 + }, + { + "epoch": 13.886628695416327, + "grad_norm": 1.5753771153470175, + "learning_rate": 6.943583384996901e-07, + "loss": 0.8885, + "step": 179200 + }, + { + "epoch": 13.887403618892634, + "grad_norm": 1.459457733913989, + "learning_rate": 6.943970861748296e-07, + "loss": 0.8727, + "step": 179210 + }, + { + "epoch": 13.88817854236894, + "grad_norm": 1.5257743790019962, + "learning_rate": 6.94435833849969e-07, + "loss": 0.8712, + "step": 179220 + }, + { + "epoch": 13.888953465845248, + "grad_norm": 1.577936002116018, + "learning_rate": 6.944745815251086e-07, + "loss": 0.8596, + "step": 179230 + }, + { + "epoch": 13.889728389321554, + "grad_norm": 1.593402255316136, + "learning_rate": 6.945133292002481e-07, + "loss": 0.8701, + "step": 179240 + }, + { + "epoch": 13.890503312797861, + "grad_norm": 1.4833447002809443, + "learning_rate": 6.945520768753876e-07, + "loss": 0.8914, + "step": 179250 + }, + { + "epoch": 13.891278236274168, + "grad_norm": 1.5011235590048377, + "learning_rate": 6.94590824550527e-07, + "loss": 0.849, + "step": 179260 + }, + { + "epoch": 13.892053159750475, + "grad_norm": 1.54446409269725, + "learning_rate": 6.946295722256665e-07, + "loss": 0.8709, + "step": 179270 + }, + { + "epoch": 13.892828083226782, + "grad_norm": 1.498283896575009, + "learning_rate": 6.94668319900806e-07, + "loss": 0.8565, + "step": 179280 + }, + { + "epoch": 13.893603006703088, + "grad_norm": 1.5639992756194394, + "learning_rate": 6.947070675759456e-07, + "loss": 0.8667, + "step": 179290 + }, + { + "epoch": 13.894377930179395, + "grad_norm": 1.4217152158127182, + "learning_rate": 6.94745815251085e-07, + "loss": 0.8545, + "step": 179300 + }, + { + "epoch": 13.895152853655702, + "grad_norm": 1.596722469522158, + "learning_rate": 6.947845629262245e-07, + "loss": 0.8837, + "step": 179310 + }, + { + "epoch": 13.895927777132009, + "grad_norm": 1.5022654686721943, + "learning_rate": 6.948233106013639e-07, + "loss": 0.8794, + "step": 179320 + }, + { + "epoch": 13.896702700608316, + "grad_norm": 1.4440284194827258, + "learning_rate": 6.948620582765035e-07, + "loss": 0.8798, + "step": 179330 + }, + { + "epoch": 13.897477624084623, + "grad_norm": 1.53193800573353, + "learning_rate": 6.94900805951643e-07, + "loss": 0.8821, + "step": 179340 + }, + { + "epoch": 13.898252547560928, + "grad_norm": 1.538684638517938, + "learning_rate": 6.949395536267825e-07, + "loss": 0.879, + "step": 179350 + }, + { + "epoch": 13.899027471037234, + "grad_norm": 1.5261642498539125, + "learning_rate": 6.949783013019219e-07, + "loss": 0.864, + "step": 179360 + }, + { + "epoch": 13.899802394513541, + "grad_norm": 1.5660140930159432, + "learning_rate": 6.950170489770615e-07, + "loss": 0.878, + "step": 179370 + }, + { + "epoch": 13.900577317989848, + "grad_norm": 1.4777723665085387, + "learning_rate": 6.950557966522009e-07, + "loss": 0.8755, + "step": 179380 + }, + { + "epoch": 13.901352241466155, + "grad_norm": 1.535575277858188, + "learning_rate": 6.950945443273405e-07, + "loss": 0.8839, + "step": 179390 + }, + { + "epoch": 13.902127164942462, + "grad_norm": 1.4278075531812553, + "learning_rate": 6.951332920024799e-07, + "loss": 0.8804, + "step": 179400 + }, + { + "epoch": 13.902902088418768, + "grad_norm": 1.4954939467207753, + "learning_rate": 6.951720396776194e-07, + "loss": 0.8924, + "step": 179410 + }, + { + "epoch": 13.903677011895075, + "grad_norm": 1.5562534631422096, + "learning_rate": 6.952107873527589e-07, + "loss": 0.8756, + "step": 179420 + }, + { + "epoch": 13.904451935371382, + "grad_norm": 1.5006113550584115, + "learning_rate": 6.952495350278984e-07, + "loss": 0.8583, + "step": 179430 + }, + { + "epoch": 13.905226858847689, + "grad_norm": 1.444215831774514, + "learning_rate": 6.952882827030379e-07, + "loss": 0.8754, + "step": 179440 + }, + { + "epoch": 13.906001782323996, + "grad_norm": 1.5088497550113162, + "learning_rate": 6.953270303781774e-07, + "loss": 0.8735, + "step": 179450 + }, + { + "epoch": 13.906776705800302, + "grad_norm": 1.491268237668642, + "learning_rate": 6.953657780533168e-07, + "loss": 0.8641, + "step": 179460 + }, + { + "epoch": 13.90755162927661, + "grad_norm": 1.4866076961745653, + "learning_rate": 6.954045257284564e-07, + "loss": 0.8688, + "step": 179470 + }, + { + "epoch": 13.908326552752916, + "grad_norm": 1.4861547971622884, + "learning_rate": 6.954432734035958e-07, + "loss": 0.8722, + "step": 179480 + }, + { + "epoch": 13.909101476229223, + "grad_norm": 1.4850975369905337, + "learning_rate": 6.954820210787354e-07, + "loss": 0.8835, + "step": 179490 + }, + { + "epoch": 13.90987639970553, + "grad_norm": 1.4868045378143173, + "learning_rate": 6.955207687538748e-07, + "loss": 0.865, + "step": 179500 + }, + { + "epoch": 13.90987639970553, + "eval_loss": 0.893189013004303, + "eval_runtime": 332.082, + "eval_samples_per_second": 34.543, + "eval_steps_per_second": 8.636, + "step": 179500 + }, + { + "epoch": 13.910651323181837, + "grad_norm": 1.499882170726844, + "learning_rate": 6.955595164290144e-07, + "loss": 0.8828, + "step": 179510 + }, + { + "epoch": 13.911426246658143, + "grad_norm": 1.503479884990932, + "learning_rate": 6.955982641041538e-07, + "loss": 0.8815, + "step": 179520 + }, + { + "epoch": 13.912201170134448, + "grad_norm": 1.5581532135533411, + "learning_rate": 6.956370117792934e-07, + "loss": 0.8849, + "step": 179530 + }, + { + "epoch": 13.912976093610755, + "grad_norm": 1.5139021854184467, + "learning_rate": 6.956757594544328e-07, + "loss": 0.8674, + "step": 179540 + }, + { + "epoch": 13.913751017087062, + "grad_norm": 1.4911173472763741, + "learning_rate": 6.957145071295723e-07, + "loss": 0.8702, + "step": 179550 + }, + { + "epoch": 13.914525940563369, + "grad_norm": 1.4574410786159453, + "learning_rate": 6.957532548047118e-07, + "loss": 0.8821, + "step": 179560 + }, + { + "epoch": 13.915300864039676, + "grad_norm": 1.398052489662445, + "learning_rate": 6.957920024798513e-07, + "loss": 0.8371, + "step": 179570 + }, + { + "epoch": 13.916075787515982, + "grad_norm": 1.4533173949848794, + "learning_rate": 6.958307501549907e-07, + "loss": 0.8822, + "step": 179580 + }, + { + "epoch": 13.91685071099229, + "grad_norm": 1.4586379604089088, + "learning_rate": 6.958694978301303e-07, + "loss": 0.8791, + "step": 179590 + }, + { + "epoch": 13.917625634468596, + "grad_norm": 1.438926686128797, + "learning_rate": 6.959082455052697e-07, + "loss": 0.875, + "step": 179600 + }, + { + "epoch": 13.918400557944903, + "grad_norm": 1.456646838355951, + "learning_rate": 6.959469931804093e-07, + "loss": 0.8742, + "step": 179610 + }, + { + "epoch": 13.91917548142121, + "grad_norm": 1.4888274354517277, + "learning_rate": 6.959857408555487e-07, + "loss": 0.8765, + "step": 179620 + }, + { + "epoch": 13.919950404897516, + "grad_norm": 1.4464184430236766, + "learning_rate": 6.960244885306883e-07, + "loss": 0.8848, + "step": 179630 + }, + { + "epoch": 13.920725328373823, + "grad_norm": 1.4721207596951218, + "learning_rate": 6.960632362058277e-07, + "loss": 0.8816, + "step": 179640 + }, + { + "epoch": 13.92150025185013, + "grad_norm": 1.4939805622450641, + "learning_rate": 6.961019838809673e-07, + "loss": 0.8851, + "step": 179650 + }, + { + "epoch": 13.922275175326437, + "grad_norm": 1.501574932673903, + "learning_rate": 6.961407315561067e-07, + "loss": 0.871, + "step": 179660 + }, + { + "epoch": 13.923050098802744, + "grad_norm": 1.5577193993100644, + "learning_rate": 6.961794792312462e-07, + "loss": 0.9033, + "step": 179670 + }, + { + "epoch": 13.92382502227905, + "grad_norm": 1.4796143826278154, + "learning_rate": 6.962182269063857e-07, + "loss": 0.8694, + "step": 179680 + }, + { + "epoch": 13.924599945755357, + "grad_norm": 1.520943047850929, + "learning_rate": 6.962569745815252e-07, + "loss": 0.9014, + "step": 179690 + }, + { + "epoch": 13.925374869231664, + "grad_norm": 1.4251603629068066, + "learning_rate": 6.962957222566646e-07, + "loss": 0.8627, + "step": 179700 + }, + { + "epoch": 13.926149792707971, + "grad_norm": 1.4659998727678012, + "learning_rate": 6.963344699318042e-07, + "loss": 0.8542, + "step": 179710 + }, + { + "epoch": 13.926924716184278, + "grad_norm": 1.4453616826114715, + "learning_rate": 6.963732176069436e-07, + "loss": 0.8852, + "step": 179720 + }, + { + "epoch": 13.927699639660583, + "grad_norm": 1.6271294054907466, + "learning_rate": 6.964119652820832e-07, + "loss": 0.8855, + "step": 179730 + }, + { + "epoch": 13.92847456313689, + "grad_norm": 1.5203722437011997, + "learning_rate": 6.964507129572226e-07, + "loss": 0.8738, + "step": 179740 + }, + { + "epoch": 13.929249486613196, + "grad_norm": 1.5696540727434052, + "learning_rate": 6.964894606323622e-07, + "loss": 0.8783, + "step": 179750 + }, + { + "epoch": 13.930024410089503, + "grad_norm": 1.5188967655815646, + "learning_rate": 6.965282083075016e-07, + "loss": 0.8557, + "step": 179760 + }, + { + "epoch": 13.93079933356581, + "grad_norm": 1.5165934218271193, + "learning_rate": 6.965669559826411e-07, + "loss": 0.8644, + "step": 179770 + }, + { + "epoch": 13.931574257042117, + "grad_norm": 1.5302546256319296, + "learning_rate": 6.966057036577806e-07, + "loss": 0.8628, + "step": 179780 + }, + { + "epoch": 13.932349180518424, + "grad_norm": 1.5203097144588675, + "learning_rate": 6.966444513329202e-07, + "loss": 0.8821, + "step": 179790 + }, + { + "epoch": 13.93312410399473, + "grad_norm": 1.4805896699429968, + "learning_rate": 6.966831990080596e-07, + "loss": 0.8872, + "step": 179800 + }, + { + "epoch": 13.933899027471037, + "grad_norm": 1.5011875464916953, + "learning_rate": 6.967219466831991e-07, + "loss": 0.8666, + "step": 179810 + }, + { + "epoch": 13.934673950947344, + "grad_norm": 1.4602271573813805, + "learning_rate": 6.967606943583385e-07, + "loss": 0.8701, + "step": 179820 + }, + { + "epoch": 13.93544887442365, + "grad_norm": 1.451487200184132, + "learning_rate": 6.967994420334781e-07, + "loss": 0.8682, + "step": 179830 + }, + { + "epoch": 13.936223797899958, + "grad_norm": 1.5012411473339837, + "learning_rate": 6.968381897086175e-07, + "loss": 0.8751, + "step": 179840 + }, + { + "epoch": 13.936998721376264, + "grad_norm": 1.4282071820115816, + "learning_rate": 6.968769373837571e-07, + "loss": 0.8581, + "step": 179850 + }, + { + "epoch": 13.937773644852571, + "grad_norm": 1.4994408994695803, + "learning_rate": 6.969156850588965e-07, + "loss": 0.8706, + "step": 179860 + }, + { + "epoch": 13.938548568328878, + "grad_norm": 1.5058768806431055, + "learning_rate": 6.96954432734036e-07, + "loss": 0.8798, + "step": 179870 + }, + { + "epoch": 13.939323491805185, + "grad_norm": 1.5003075442764198, + "learning_rate": 6.969931804091755e-07, + "loss": 0.8789, + "step": 179880 + }, + { + "epoch": 13.940098415281492, + "grad_norm": 1.3604288427020734, + "learning_rate": 6.970319280843151e-07, + "loss": 0.8731, + "step": 179890 + }, + { + "epoch": 13.940873338757797, + "grad_norm": 1.4675378997097694, + "learning_rate": 6.970706757594545e-07, + "loss": 0.8709, + "step": 179900 + }, + { + "epoch": 13.941648262234104, + "grad_norm": 1.5295280399060442, + "learning_rate": 6.97109423434594e-07, + "loss": 0.8898, + "step": 179910 + }, + { + "epoch": 13.94242318571041, + "grad_norm": 1.6001727793861367, + "learning_rate": 6.971481711097334e-07, + "loss": 0.852, + "step": 179920 + }, + { + "epoch": 13.943198109186717, + "grad_norm": 1.5447538877087277, + "learning_rate": 6.971869187848731e-07, + "loss": 0.8721, + "step": 179930 + }, + { + "epoch": 13.943973032663024, + "grad_norm": 1.5027609949754128, + "learning_rate": 6.972256664600125e-07, + "loss": 0.8938, + "step": 179940 + }, + { + "epoch": 13.94474795613933, + "grad_norm": 2.942436759299281, + "learning_rate": 6.97264414135152e-07, + "loss": 0.8632, + "step": 179950 + }, + { + "epoch": 13.945522879615638, + "grad_norm": 1.4691291801895228, + "learning_rate": 6.973031618102914e-07, + "loss": 0.8632, + "step": 179960 + }, + { + "epoch": 13.946297803091944, + "grad_norm": 1.523787241390096, + "learning_rate": 6.973419094854309e-07, + "loss": 0.8724, + "step": 179970 + }, + { + "epoch": 13.947072726568251, + "grad_norm": 1.5485483582410304, + "learning_rate": 6.973806571605704e-07, + "loss": 0.8801, + "step": 179980 + }, + { + "epoch": 13.947847650044558, + "grad_norm": 1.4977583101692078, + "learning_rate": 6.9741940483571e-07, + "loss": 0.8711, + "step": 179990 + }, + { + "epoch": 13.948622573520865, + "grad_norm": 1.4695092946669654, + "learning_rate": 6.974581525108494e-07, + "loss": 0.8932, + "step": 180000 + }, + { + "epoch": 13.948622573520865, + "eval_loss": 0.8928706645965576, + "eval_runtime": 332.5718, + "eval_samples_per_second": 34.492, + "eval_steps_per_second": 8.624, + "step": 180000 + }, + { + "epoch": 13.949397496997172, + "grad_norm": 1.599053249115666, + "learning_rate": 6.974969001859889e-07, + "loss": 0.8649, + "step": 180010 + }, + { + "epoch": 13.950172420473478, + "grad_norm": 1.4043342158733012, + "learning_rate": 6.975356478611283e-07, + "loss": 0.8808, + "step": 180020 + }, + { + "epoch": 13.950947343949785, + "grad_norm": 1.4864657912396544, + "learning_rate": 6.97574395536268e-07, + "loss": 0.903, + "step": 180030 + }, + { + "epoch": 13.951722267426092, + "grad_norm": 1.5609357325114728, + "learning_rate": 6.976131432114074e-07, + "loss": 0.8908, + "step": 180040 + }, + { + "epoch": 13.952497190902399, + "grad_norm": 1.5241240160090543, + "learning_rate": 6.976518908865469e-07, + "loss": 0.8892, + "step": 180050 + }, + { + "epoch": 13.953272114378706, + "grad_norm": 1.4969562911977023, + "learning_rate": 6.976906385616863e-07, + "loss": 0.8758, + "step": 180060 + }, + { + "epoch": 13.954047037855013, + "grad_norm": 1.6209296841291079, + "learning_rate": 6.97729386236826e-07, + "loss": 0.8822, + "step": 180070 + }, + { + "epoch": 13.95482196133132, + "grad_norm": 1.4941104682996387, + "learning_rate": 6.977681339119654e-07, + "loss": 0.8579, + "step": 180080 + }, + { + "epoch": 13.955596884807626, + "grad_norm": 1.466962456725586, + "learning_rate": 6.978068815871049e-07, + "loss": 0.8561, + "step": 180090 + }, + { + "epoch": 13.956371808283931, + "grad_norm": 1.6020732390711043, + "learning_rate": 6.978456292622443e-07, + "loss": 0.8913, + "step": 180100 + }, + { + "epoch": 13.957146731760238, + "grad_norm": 1.4961993542955607, + "learning_rate": 6.978843769373838e-07, + "loss": 0.8758, + "step": 180110 + }, + { + "epoch": 13.957921655236545, + "grad_norm": 1.4904421774198924, + "learning_rate": 6.979231246125232e-07, + "loss": 0.8458, + "step": 180120 + }, + { + "epoch": 13.958696578712852, + "grad_norm": 1.4985618921531385, + "learning_rate": 6.979618722876629e-07, + "loss": 0.8753, + "step": 180130 + }, + { + "epoch": 13.959471502189158, + "grad_norm": 1.5443028464045347, + "learning_rate": 6.980006199628023e-07, + "loss": 0.86, + "step": 180140 + }, + { + "epoch": 13.960246425665465, + "grad_norm": 1.5232098514854315, + "learning_rate": 6.980393676379418e-07, + "loss": 0.8567, + "step": 180150 + }, + { + "epoch": 13.961021349141772, + "grad_norm": 1.4884994443149298, + "learning_rate": 6.980781153130812e-07, + "loss": 0.8562, + "step": 180160 + }, + { + "epoch": 13.961796272618079, + "grad_norm": 1.489234650465074, + "learning_rate": 6.981168629882209e-07, + "loss": 0.8697, + "step": 180170 + }, + { + "epoch": 13.962571196094386, + "grad_norm": 1.5346425006215103, + "learning_rate": 6.981556106633603e-07, + "loss": 0.8789, + "step": 180180 + }, + { + "epoch": 13.963346119570692, + "grad_norm": 1.503821728636184, + "learning_rate": 6.981943583384998e-07, + "loss": 0.8881, + "step": 180190 + }, + { + "epoch": 13.964121043047, + "grad_norm": 1.4486662089843627, + "learning_rate": 6.982331060136392e-07, + "loss": 0.8788, + "step": 180200 + }, + { + "epoch": 13.964895966523306, + "grad_norm": 1.5595746372188934, + "learning_rate": 6.982718536887788e-07, + "loss": 0.8956, + "step": 180210 + }, + { + "epoch": 13.965670889999613, + "grad_norm": 1.4768319520402917, + "learning_rate": 6.983106013639182e-07, + "loss": 0.8682, + "step": 180220 + }, + { + "epoch": 13.96644581347592, + "grad_norm": 1.4322909411865314, + "learning_rate": 6.983493490390578e-07, + "loss": 0.8768, + "step": 180230 + }, + { + "epoch": 13.967220736952227, + "grad_norm": 1.4764953268051424, + "learning_rate": 6.983880967141972e-07, + "loss": 0.8812, + "step": 180240 + }, + { + "epoch": 13.967995660428533, + "grad_norm": 1.6453899985297598, + "learning_rate": 6.984268443893367e-07, + "loss": 0.8704, + "step": 180250 + }, + { + "epoch": 13.96877058390484, + "grad_norm": 1.4060840405019062, + "learning_rate": 6.984655920644761e-07, + "loss": 0.8676, + "step": 180260 + }, + { + "epoch": 13.969545507381145, + "grad_norm": 1.5176463503632942, + "learning_rate": 6.985043397396158e-07, + "loss": 0.8747, + "step": 180270 + }, + { + "epoch": 13.970320430857452, + "grad_norm": 1.5591367093666706, + "learning_rate": 6.985430874147552e-07, + "loss": 0.8709, + "step": 180280 + }, + { + "epoch": 13.971095354333759, + "grad_norm": 1.5004315287371859, + "learning_rate": 6.985818350898947e-07, + "loss": 0.8856, + "step": 180290 + }, + { + "epoch": 13.971870277810066, + "grad_norm": 1.59642307213383, + "learning_rate": 6.986205827650341e-07, + "loss": 0.8698, + "step": 180300 + }, + { + "epoch": 13.972645201286372, + "grad_norm": 1.6098389094817445, + "learning_rate": 6.986593304401737e-07, + "loss": 0.8702, + "step": 180310 + }, + { + "epoch": 13.97342012476268, + "grad_norm": 1.5700050858078642, + "learning_rate": 6.986980781153132e-07, + "loss": 0.9082, + "step": 180320 + }, + { + "epoch": 13.974195048238986, + "grad_norm": 1.518913602304419, + "learning_rate": 6.987368257904527e-07, + "loss": 0.8788, + "step": 180330 + }, + { + "epoch": 13.974969971715293, + "grad_norm": 1.4886467735300708, + "learning_rate": 6.987755734655921e-07, + "loss": 0.8932, + "step": 180340 + }, + { + "epoch": 13.9757448951916, + "grad_norm": 1.5114785018915649, + "learning_rate": 6.988143211407317e-07, + "loss": 0.8619, + "step": 180350 + }, + { + "epoch": 13.976519818667906, + "grad_norm": 1.5794453025962985, + "learning_rate": 6.988530688158711e-07, + "loss": 0.8719, + "step": 180360 + }, + { + "epoch": 13.977294742144213, + "grad_norm": 1.4284114877869878, + "learning_rate": 6.988918164910107e-07, + "loss": 0.8484, + "step": 180370 + }, + { + "epoch": 13.97806966562052, + "grad_norm": 1.4054416627327695, + "learning_rate": 6.989305641661501e-07, + "loss": 0.8667, + "step": 180380 + }, + { + "epoch": 13.978844589096827, + "grad_norm": 1.5437189794182409, + "learning_rate": 6.989693118412896e-07, + "loss": 0.8773, + "step": 180390 + }, + { + "epoch": 13.979619512573134, + "grad_norm": 1.441379284293818, + "learning_rate": 6.99008059516429e-07, + "loss": 0.8617, + "step": 180400 + }, + { + "epoch": 13.98039443604944, + "grad_norm": 1.4969403904427976, + "learning_rate": 6.990468071915686e-07, + "loss": 0.853, + "step": 180410 + }, + { + "epoch": 13.981169359525747, + "grad_norm": 1.446560615306109, + "learning_rate": 6.990855548667081e-07, + "loss": 0.8826, + "step": 180420 + }, + { + "epoch": 13.981944283002054, + "grad_norm": 1.4319572260712243, + "learning_rate": 6.991243025418476e-07, + "loss": 0.851, + "step": 180430 + }, + { + "epoch": 13.982719206478361, + "grad_norm": 1.6397592146832702, + "learning_rate": 6.99163050216987e-07, + "loss": 0.8681, + "step": 180440 + }, + { + "epoch": 13.983494129954668, + "grad_norm": 1.5750796621279253, + "learning_rate": 6.992017978921266e-07, + "loss": 0.8725, + "step": 180450 + }, + { + "epoch": 13.984269053430975, + "grad_norm": 1.6989084051940928, + "learning_rate": 6.99240545567266e-07, + "loss": 0.8977, + "step": 180460 + }, + { + "epoch": 13.98504397690728, + "grad_norm": 1.48091678288449, + "learning_rate": 6.992792932424056e-07, + "loss": 0.8616, + "step": 180470 + }, + { + "epoch": 13.985818900383586, + "grad_norm": 1.445406774098241, + "learning_rate": 6.99318040917545e-07, + "loss": 0.869, + "step": 180480 + }, + { + "epoch": 13.986593823859893, + "grad_norm": 1.5319962577570145, + "learning_rate": 6.993567885926845e-07, + "loss": 0.8671, + "step": 180490 + }, + { + "epoch": 13.9873687473362, + "grad_norm": 1.4855508542540965, + "learning_rate": 6.99395536267824e-07, + "loss": 0.8823, + "step": 180500 + }, + { + "epoch": 13.9873687473362, + "eval_loss": 0.8928434252738953, + "eval_runtime": 329.6618, + "eval_samples_per_second": 34.796, + "eval_steps_per_second": 8.7, + "step": 180500 + }, + { + "epoch": 13.988143670812507, + "grad_norm": 1.5611313720256705, + "learning_rate": 6.994342839429635e-07, + "loss": 0.8721, + "step": 180510 + }, + { + "epoch": 13.988918594288814, + "grad_norm": 1.4452699690459394, + "learning_rate": 6.99473031618103e-07, + "loss": 0.8552, + "step": 180520 + }, + { + "epoch": 13.98969351776512, + "grad_norm": 1.4860704211139908, + "learning_rate": 6.995117792932425e-07, + "loss": 0.8671, + "step": 180530 + }, + { + "epoch": 13.990468441241427, + "grad_norm": 1.4686392678427898, + "learning_rate": 6.995505269683819e-07, + "loss": 0.8848, + "step": 180540 + }, + { + "epoch": 13.991243364717734, + "grad_norm": 1.484088681854559, + "learning_rate": 6.995892746435215e-07, + "loss": 0.8718, + "step": 180550 + }, + { + "epoch": 13.992018288194041, + "grad_norm": 1.4791421324841505, + "learning_rate": 6.996280223186609e-07, + "loss": 0.882, + "step": 180560 + }, + { + "epoch": 13.992793211670348, + "grad_norm": 1.5197387888630915, + "learning_rate": 6.996667699938005e-07, + "loss": 0.8614, + "step": 180570 + }, + { + "epoch": 13.993568135146655, + "grad_norm": 1.468054606071931, + "learning_rate": 6.997055176689399e-07, + "loss": 0.8776, + "step": 180580 + }, + { + "epoch": 13.994343058622961, + "grad_norm": 1.493886320223971, + "learning_rate": 6.997442653440795e-07, + "loss": 0.8643, + "step": 180590 + }, + { + "epoch": 13.995117982099268, + "grad_norm": 1.5459285611818614, + "learning_rate": 6.997830130192189e-07, + "loss": 0.8754, + "step": 180600 + }, + { + "epoch": 13.995892905575575, + "grad_norm": 1.4851111670288433, + "learning_rate": 6.998217606943584e-07, + "loss": 0.8835, + "step": 180610 + }, + { + "epoch": 13.996667829051882, + "grad_norm": 1.4360969635845735, + "learning_rate": 6.998605083694979e-07, + "loss": 0.8668, + "step": 180620 + }, + { + "epoch": 13.997442752528189, + "grad_norm": 1.52058581672722, + "learning_rate": 6.998992560446374e-07, + "loss": 0.8808, + "step": 180630 + }, + { + "epoch": 13.998217676004494, + "grad_norm": 1.4217307981624094, + "learning_rate": 6.999380037197769e-07, + "loss": 0.8738, + "step": 180640 + }, + { + "epoch": 13.9989925994808, + "grad_norm": 1.504869884592953, + "learning_rate": 6.999767513949164e-07, + "loss": 0.894, + "step": 180650 + }, + { + "epoch": 13.999767522957107, + "grad_norm": 1.505705172268361, + "learning_rate": 7.000154990700558e-07, + "loss": 0.8672, + "step": 180660 + }, + { + "epoch": 14.000542446433414, + "grad_norm": 1.4655103142386443, + "learning_rate": 7.000542467451954e-07, + "loss": 0.8759, + "step": 180670 + }, + { + "epoch": 14.00131736990972, + "grad_norm": 1.570586598192463, + "learning_rate": 7.000929944203348e-07, + "loss": 0.8814, + "step": 180680 + }, + { + "epoch": 14.002092293386028, + "grad_norm": 1.5463928843009749, + "learning_rate": 7.001317420954744e-07, + "loss": 0.8625, + "step": 180690 + }, + { + "epoch": 14.002867216862334, + "grad_norm": 1.5108871670718163, + "learning_rate": 7.001704897706138e-07, + "loss": 0.8646, + "step": 180700 + }, + { + "epoch": 14.003642140338641, + "grad_norm": 1.4912718475799691, + "learning_rate": 7.002092374457533e-07, + "loss": 0.8827, + "step": 180710 + }, + { + "epoch": 14.004417063814948, + "grad_norm": 1.4950512777654859, + "learning_rate": 7.002479851208928e-07, + "loss": 0.857, + "step": 180720 + }, + { + "epoch": 14.005191987291255, + "grad_norm": 1.6239782749418716, + "learning_rate": 7.002867327960324e-07, + "loss": 0.8463, + "step": 180730 + }, + { + "epoch": 14.005966910767562, + "grad_norm": 1.557779792024487, + "learning_rate": 7.003254804711718e-07, + "loss": 0.8504, + "step": 180740 + }, + { + "epoch": 14.006741834243869, + "grad_norm": 1.5288608579514782, + "learning_rate": 7.003642281463113e-07, + "loss": 0.8574, + "step": 180750 + }, + { + "epoch": 14.007516757720175, + "grad_norm": 1.4930691941073748, + "learning_rate": 7.004029758214507e-07, + "loss": 0.8636, + "step": 180760 + }, + { + "epoch": 14.008291681196482, + "grad_norm": 1.5071820506988762, + "learning_rate": 7.004417234965903e-07, + "loss": 0.8952, + "step": 180770 + }, + { + "epoch": 14.009066604672789, + "grad_norm": 1.5369238340118045, + "learning_rate": 7.004804711717298e-07, + "loss": 0.8687, + "step": 180780 + }, + { + "epoch": 14.009841528149096, + "grad_norm": 1.4830339399364216, + "learning_rate": 7.005192188468693e-07, + "loss": 0.8574, + "step": 180790 + }, + { + "epoch": 14.010616451625403, + "grad_norm": 1.538093432965291, + "learning_rate": 7.005579665220087e-07, + "loss": 0.8747, + "step": 180800 + }, + { + "epoch": 14.01139137510171, + "grad_norm": 1.5356428796277175, + "learning_rate": 7.005967141971482e-07, + "loss": 0.8455, + "step": 180810 + }, + { + "epoch": 14.012166298578016, + "grad_norm": 1.5741641072397499, + "learning_rate": 7.006354618722877e-07, + "loss": 0.8645, + "step": 180820 + }, + { + "epoch": 14.012941222054323, + "grad_norm": 1.504318164120338, + "learning_rate": 7.006742095474273e-07, + "loss": 0.8623, + "step": 180830 + }, + { + "epoch": 14.013716145530628, + "grad_norm": 1.5925937603433218, + "learning_rate": 7.007129572225667e-07, + "loss": 0.8582, + "step": 180840 + }, + { + "epoch": 14.014491069006935, + "grad_norm": 1.4863296617384723, + "learning_rate": 7.007517048977062e-07, + "loss": 0.8595, + "step": 180850 + }, + { + "epoch": 14.015265992483242, + "grad_norm": 1.559096570629823, + "learning_rate": 7.007904525728456e-07, + "loss": 0.8649, + "step": 180860 + }, + { + "epoch": 14.016040915959548, + "grad_norm": 1.6637739139810959, + "learning_rate": 7.008292002479853e-07, + "loss": 0.8753, + "step": 180870 + }, + { + "epoch": 14.016815839435855, + "grad_norm": 1.486596148073862, + "learning_rate": 7.008679479231247e-07, + "loss": 0.866, + "step": 180880 + }, + { + "epoch": 14.017590762912162, + "grad_norm": 1.5206263353449276, + "learning_rate": 7.009066955982642e-07, + "loss": 0.8605, + "step": 180890 + }, + { + "epoch": 14.018365686388469, + "grad_norm": 1.5041479929756834, + "learning_rate": 7.009454432734036e-07, + "loss": 0.872, + "step": 180900 + }, + { + "epoch": 14.019140609864776, + "grad_norm": 1.5939005731018427, + "learning_rate": 7.009841909485431e-07, + "loss": 0.8667, + "step": 180910 + }, + { + "epoch": 14.019915533341083, + "grad_norm": 1.4690764125816718, + "learning_rate": 7.010229386236827e-07, + "loss": 0.8622, + "step": 180920 + }, + { + "epoch": 14.02069045681739, + "grad_norm": 1.6734528193693718, + "learning_rate": 7.010616862988222e-07, + "loss": 0.8746, + "step": 180930 + }, + { + "epoch": 14.021465380293696, + "grad_norm": 1.4797691966580637, + "learning_rate": 7.011004339739616e-07, + "loss": 0.8537, + "step": 180940 + }, + { + "epoch": 14.022240303770003, + "grad_norm": 1.434167879883711, + "learning_rate": 7.011391816491011e-07, + "loss": 0.8743, + "step": 180950 + }, + { + "epoch": 14.02301522724631, + "grad_norm": 1.5008547938113213, + "learning_rate": 7.011779293242405e-07, + "loss": 0.8519, + "step": 180960 + }, + { + "epoch": 14.023790150722617, + "grad_norm": 1.5029290953536543, + "learning_rate": 7.012166769993802e-07, + "loss": 0.8679, + "step": 180970 + }, + { + "epoch": 14.024565074198923, + "grad_norm": 1.5321823347164454, + "learning_rate": 7.012554246745196e-07, + "loss": 0.8518, + "step": 180980 + }, + { + "epoch": 14.02533999767523, + "grad_norm": 1.5854242368374225, + "learning_rate": 7.012941723496591e-07, + "loss": 0.8702, + "step": 180990 + }, + { + "epoch": 14.026114921151537, + "grad_norm": 1.5942753126086384, + "learning_rate": 7.013329200247985e-07, + "loss": 0.8612, + "step": 181000 + }, + { + "epoch": 14.026114921151537, + "eval_loss": 0.8938263654708862, + "eval_runtime": 328.8717, + "eval_samples_per_second": 34.88, + "eval_steps_per_second": 8.721, + "step": 181000 + }, + { + "epoch": 14.026889844627844, + "grad_norm": 1.5642521511610803, + "learning_rate": 7.013716676999382e-07, + "loss": 0.8695, + "step": 181010 + }, + { + "epoch": 14.027664768104149, + "grad_norm": 1.5410238533710627, + "learning_rate": 7.014104153750776e-07, + "loss": 0.8608, + "step": 181020 + }, + { + "epoch": 14.028439691580456, + "grad_norm": 1.6506706752159381, + "learning_rate": 7.014491630502171e-07, + "loss": 0.8617, + "step": 181030 + }, + { + "epoch": 14.029214615056762, + "grad_norm": 1.585418813663947, + "learning_rate": 7.014879107253565e-07, + "loss": 0.8733, + "step": 181040 + }, + { + "epoch": 14.02998953853307, + "grad_norm": 1.6009882742304637, + "learning_rate": 7.01526658400496e-07, + "loss": 0.8754, + "step": 181050 + }, + { + "epoch": 14.030764462009376, + "grad_norm": 1.4942965722685013, + "learning_rate": 7.015654060756356e-07, + "loss": 0.8624, + "step": 181060 + }, + { + "epoch": 14.031539385485683, + "grad_norm": 1.4498191696681695, + "learning_rate": 7.016041537507751e-07, + "loss": 0.8486, + "step": 181070 + }, + { + "epoch": 14.03231430896199, + "grad_norm": 1.4576114160785107, + "learning_rate": 7.016429014259145e-07, + "loss": 0.8497, + "step": 181080 + }, + { + "epoch": 14.033089232438297, + "grad_norm": 1.4739278322555296, + "learning_rate": 7.01681649101054e-07, + "loss": 0.8573, + "step": 181090 + }, + { + "epoch": 14.033864155914603, + "grad_norm": 1.5002501179697585, + "learning_rate": 7.017203967761934e-07, + "loss": 0.8701, + "step": 181100 + }, + { + "epoch": 14.03463907939091, + "grad_norm": 1.5501389658936522, + "learning_rate": 7.017591444513331e-07, + "loss": 0.8746, + "step": 181110 + }, + { + "epoch": 14.035414002867217, + "grad_norm": 1.5813898619751305, + "learning_rate": 7.017978921264725e-07, + "loss": 0.8723, + "step": 181120 + }, + { + "epoch": 14.036188926343524, + "grad_norm": 1.682472645865546, + "learning_rate": 7.01836639801612e-07, + "loss": 0.8715, + "step": 181130 + }, + { + "epoch": 14.03696384981983, + "grad_norm": 1.5267934514174566, + "learning_rate": 7.018753874767514e-07, + "loss": 0.8839, + "step": 181140 + }, + { + "epoch": 14.037738773296137, + "grad_norm": 1.5830167975111908, + "learning_rate": 7.01914135151891e-07, + "loss": 0.8597, + "step": 181150 + }, + { + "epoch": 14.038513696772444, + "grad_norm": 1.4651278469540756, + "learning_rate": 7.019528828270305e-07, + "loss": 0.8613, + "step": 181160 + }, + { + "epoch": 14.039288620248751, + "grad_norm": 1.5221062660028846, + "learning_rate": 7.0199163050217e-07, + "loss": 0.8569, + "step": 181170 + }, + { + "epoch": 14.040063543725058, + "grad_norm": 1.5647592676094826, + "learning_rate": 7.020303781773094e-07, + "loss": 0.8706, + "step": 181180 + }, + { + "epoch": 14.040838467201365, + "grad_norm": 1.5080722109818216, + "learning_rate": 7.020691258524489e-07, + "loss": 0.8621, + "step": 181190 + }, + { + "epoch": 14.041613390677671, + "grad_norm": 1.4773046308919728, + "learning_rate": 7.021078735275883e-07, + "loss": 0.8718, + "step": 181200 + }, + { + "epoch": 14.042388314153976, + "grad_norm": 1.4743487545438205, + "learning_rate": 7.02146621202728e-07, + "loss": 0.8564, + "step": 181210 + }, + { + "epoch": 14.043163237630283, + "grad_norm": 1.4581203966509058, + "learning_rate": 7.021853688778674e-07, + "loss": 0.8592, + "step": 181220 + }, + { + "epoch": 14.04393816110659, + "grad_norm": 1.5314735577238368, + "learning_rate": 7.022241165530069e-07, + "loss": 0.8662, + "step": 181230 + }, + { + "epoch": 14.044713084582897, + "grad_norm": 1.6269129239045883, + "learning_rate": 7.022628642281463e-07, + "loss": 0.8718, + "step": 181240 + }, + { + "epoch": 14.045488008059204, + "grad_norm": 1.5863523639716792, + "learning_rate": 7.023016119032859e-07, + "loss": 0.8634, + "step": 181250 + }, + { + "epoch": 14.04626293153551, + "grad_norm": 1.5084873616553556, + "learning_rate": 7.023403595784254e-07, + "loss": 0.8733, + "step": 181260 + }, + { + "epoch": 14.047037855011817, + "grad_norm": 1.4962269708476885, + "learning_rate": 7.023791072535649e-07, + "loss": 0.8578, + "step": 181270 + }, + { + "epoch": 14.047812778488124, + "grad_norm": 1.4463379565802017, + "learning_rate": 7.024178549287043e-07, + "loss": 0.8639, + "step": 181280 + }, + { + "epoch": 14.048587701964431, + "grad_norm": 1.481012168736296, + "learning_rate": 7.024566026038439e-07, + "loss": 0.8664, + "step": 181290 + }, + { + "epoch": 14.049362625440738, + "grad_norm": 1.5877354373117865, + "learning_rate": 7.024953502789833e-07, + "loss": 0.8439, + "step": 181300 + }, + { + "epoch": 14.050137548917045, + "grad_norm": 1.5930250351835207, + "learning_rate": 7.025340979541229e-07, + "loss": 0.8882, + "step": 181310 + }, + { + "epoch": 14.050912472393351, + "grad_norm": 1.6213708443470238, + "learning_rate": 7.025728456292623e-07, + "loss": 0.8589, + "step": 181320 + }, + { + "epoch": 14.051687395869658, + "grad_norm": 1.5088935249817303, + "learning_rate": 7.026115933044018e-07, + "loss": 0.8689, + "step": 181330 + }, + { + "epoch": 14.052462319345965, + "grad_norm": 1.5602172709401618, + "learning_rate": 7.026503409795412e-07, + "loss": 0.8603, + "step": 181340 + }, + { + "epoch": 14.053237242822272, + "grad_norm": 1.5536907534640514, + "learning_rate": 7.026890886546808e-07, + "loss": 0.8739, + "step": 181350 + }, + { + "epoch": 14.054012166298579, + "grad_norm": 1.5311015151579086, + "learning_rate": 7.027278363298203e-07, + "loss": 0.8738, + "step": 181360 + }, + { + "epoch": 14.054787089774885, + "grad_norm": 1.5244395779841442, + "learning_rate": 7.027665840049598e-07, + "loss": 0.8703, + "step": 181370 + }, + { + "epoch": 14.055562013251192, + "grad_norm": 1.6144155657175345, + "learning_rate": 7.028053316800992e-07, + "loss": 0.8846, + "step": 181380 + }, + { + "epoch": 14.056336936727497, + "grad_norm": 1.6493685808806033, + "learning_rate": 7.028440793552388e-07, + "loss": 0.8557, + "step": 181390 + }, + { + "epoch": 14.057111860203804, + "grad_norm": 1.6021710563750344, + "learning_rate": 7.028828270303782e-07, + "loss": 0.8967, + "step": 181400 + }, + { + "epoch": 14.057886783680111, + "grad_norm": 1.5633009899845183, + "learning_rate": 7.029215747055178e-07, + "loss": 0.8707, + "step": 181410 + }, + { + "epoch": 14.058661707156418, + "grad_norm": 1.5279893180470385, + "learning_rate": 7.029603223806572e-07, + "loss": 0.8996, + "step": 181420 + }, + { + "epoch": 14.059436630632725, + "grad_norm": 1.528279436094896, + "learning_rate": 7.029990700557968e-07, + "loss": 0.894, + "step": 181430 + }, + { + "epoch": 14.060211554109031, + "grad_norm": 1.5599400300769963, + "learning_rate": 7.030378177309362e-07, + "loss": 0.871, + "step": 181440 + }, + { + "epoch": 14.060986477585338, + "grad_norm": 1.5618188981040348, + "learning_rate": 7.030765654060757e-07, + "loss": 0.87, + "step": 181450 + }, + { + "epoch": 14.061761401061645, + "grad_norm": 1.5647783186042112, + "learning_rate": 7.031153130812152e-07, + "loss": 0.8593, + "step": 181460 + }, + { + "epoch": 14.062536324537952, + "grad_norm": 1.53354789175182, + "learning_rate": 7.031540607563547e-07, + "loss": 0.8757, + "step": 181470 + }, + { + "epoch": 14.063311248014259, + "grad_norm": 1.569324379601365, + "learning_rate": 7.031928084314941e-07, + "loss": 0.8759, + "step": 181480 + }, + { + "epoch": 14.064086171490565, + "grad_norm": 1.4943448730867885, + "learning_rate": 7.032315561066337e-07, + "loss": 0.8824, + "step": 181490 + }, + { + "epoch": 14.064861094966872, + "grad_norm": 1.5447497942929718, + "learning_rate": 7.032703037817731e-07, + "loss": 0.8481, + "step": 181500 + }, + { + "epoch": 14.064861094966872, + "eval_loss": 0.8938381671905518, + "eval_runtime": 330.3547, + "eval_samples_per_second": 34.723, + "eval_steps_per_second": 8.682, + "step": 181500 + }, + { + "epoch": 14.065636018443179, + "grad_norm": 1.5545550985705987, + "learning_rate": 7.033090514569127e-07, + "loss": 0.863, + "step": 181510 + }, + { + "epoch": 14.066410941919486, + "grad_norm": 2.0587300025527453, + "learning_rate": 7.033477991320521e-07, + "loss": 0.8521, + "step": 181520 + }, + { + "epoch": 14.067185865395793, + "grad_norm": 1.4958522335451523, + "learning_rate": 7.033865468071917e-07, + "loss": 0.8838, + "step": 181530 + }, + { + "epoch": 14.0679607888721, + "grad_norm": 1.3986850706275193, + "learning_rate": 7.034252944823311e-07, + "loss": 0.8673, + "step": 181540 + }, + { + "epoch": 14.068735712348406, + "grad_norm": 1.4576942233507004, + "learning_rate": 7.034640421574706e-07, + "loss": 0.8715, + "step": 181550 + }, + { + "epoch": 14.069510635824713, + "grad_norm": 1.472109980841672, + "learning_rate": 7.035027898326101e-07, + "loss": 0.8558, + "step": 181560 + }, + { + "epoch": 14.07028555930102, + "grad_norm": 1.5463358751395566, + "learning_rate": 7.035415375077497e-07, + "loss": 0.8679, + "step": 181570 + }, + { + "epoch": 14.071060482777325, + "grad_norm": 1.5450325708708486, + "learning_rate": 7.035802851828891e-07, + "loss": 0.8548, + "step": 181580 + }, + { + "epoch": 14.071835406253632, + "grad_norm": 1.5478465509914834, + "learning_rate": 7.036190328580286e-07, + "loss": 0.857, + "step": 181590 + }, + { + "epoch": 14.072610329729939, + "grad_norm": 1.4036590780035718, + "learning_rate": 7.03657780533168e-07, + "loss": 0.8615, + "step": 181600 + }, + { + "epoch": 14.073385253206245, + "grad_norm": 1.580860927037601, + "learning_rate": 7.036965282083076e-07, + "loss": 0.8595, + "step": 181610 + }, + { + "epoch": 14.074160176682552, + "grad_norm": 1.5669771488903321, + "learning_rate": 7.03735275883447e-07, + "loss": 0.8883, + "step": 181620 + }, + { + "epoch": 14.074935100158859, + "grad_norm": 1.4657355084889625, + "learning_rate": 7.037740235585866e-07, + "loss": 0.8678, + "step": 181630 + }, + { + "epoch": 14.075710023635166, + "grad_norm": 1.508324761135973, + "learning_rate": 7.03812771233726e-07, + "loss": 0.876, + "step": 181640 + }, + { + "epoch": 14.076484947111473, + "grad_norm": 1.5078526870950377, + "learning_rate": 7.038515189088656e-07, + "loss": 0.8702, + "step": 181650 + }, + { + "epoch": 14.07725987058778, + "grad_norm": 1.519400574589999, + "learning_rate": 7.03890266584005e-07, + "loss": 0.8683, + "step": 181660 + }, + { + "epoch": 14.078034794064086, + "grad_norm": 1.5275921457887902, + "learning_rate": 7.039290142591446e-07, + "loss": 0.861, + "step": 181670 + }, + { + "epoch": 14.078809717540393, + "grad_norm": 1.5200935375401063, + "learning_rate": 7.03967761934284e-07, + "loss": 0.8746, + "step": 181680 + }, + { + "epoch": 14.0795846410167, + "grad_norm": 1.4883309229759527, + "learning_rate": 7.040065096094235e-07, + "loss": 0.8712, + "step": 181690 + }, + { + "epoch": 14.080359564493007, + "grad_norm": 1.531724381128958, + "learning_rate": 7.04045257284563e-07, + "loss": 0.8861, + "step": 181700 + }, + { + "epoch": 14.081134487969313, + "grad_norm": 1.507447894428702, + "learning_rate": 7.040840049597026e-07, + "loss": 0.8627, + "step": 181710 + }, + { + "epoch": 14.08190941144562, + "grad_norm": 1.5416484452512855, + "learning_rate": 7.04122752634842e-07, + "loss": 0.8873, + "step": 181720 + }, + { + "epoch": 14.082684334921927, + "grad_norm": 1.505315251070662, + "learning_rate": 7.041615003099815e-07, + "loss": 0.8619, + "step": 181730 + }, + { + "epoch": 14.083459258398234, + "grad_norm": 1.5335995337620516, + "learning_rate": 7.042002479851209e-07, + "loss": 0.8682, + "step": 181740 + }, + { + "epoch": 14.08423418187454, + "grad_norm": 1.5775609863028726, + "learning_rate": 7.042389956602605e-07, + "loss": 0.8596, + "step": 181750 + }, + { + "epoch": 14.085009105350846, + "grad_norm": 1.5747037567445585, + "learning_rate": 7.042777433353999e-07, + "loss": 0.8708, + "step": 181760 + }, + { + "epoch": 14.085784028827153, + "grad_norm": 1.5429815728638672, + "learning_rate": 7.043164910105395e-07, + "loss": 0.8655, + "step": 181770 + }, + { + "epoch": 14.08655895230346, + "grad_norm": 1.5878456977414688, + "learning_rate": 7.043552386856789e-07, + "loss": 0.8736, + "step": 181780 + }, + { + "epoch": 14.087333875779766, + "grad_norm": 1.5554970806697461, + "learning_rate": 7.043939863608184e-07, + "loss": 0.863, + "step": 181790 + }, + { + "epoch": 14.088108799256073, + "grad_norm": 1.6059271882961432, + "learning_rate": 7.044327340359579e-07, + "loss": 0.8768, + "step": 181800 + }, + { + "epoch": 14.08888372273238, + "grad_norm": 1.5537887189282042, + "learning_rate": 7.044714817110975e-07, + "loss": 0.8639, + "step": 181810 + }, + { + "epoch": 14.089658646208687, + "grad_norm": 1.4846806856532928, + "learning_rate": 7.045102293862369e-07, + "loss": 0.8708, + "step": 181820 + }, + { + "epoch": 14.090433569684993, + "grad_norm": 1.5240825504513549, + "learning_rate": 7.045489770613764e-07, + "loss": 0.8563, + "step": 181830 + }, + { + "epoch": 14.0912084931613, + "grad_norm": 1.4447283199577132, + "learning_rate": 7.045877247365158e-07, + "loss": 0.8751, + "step": 181840 + }, + { + "epoch": 14.091983416637607, + "grad_norm": 1.4896139067693104, + "learning_rate": 7.046264724116555e-07, + "loss": 0.8552, + "step": 181850 + }, + { + "epoch": 14.092758340113914, + "grad_norm": 1.4985717263935805, + "learning_rate": 7.046652200867949e-07, + "loss": 0.8509, + "step": 181860 + }, + { + "epoch": 14.09353326359022, + "grad_norm": 1.542921557076881, + "learning_rate": 7.047039677619344e-07, + "loss": 0.8602, + "step": 181870 + }, + { + "epoch": 14.094308187066527, + "grad_norm": 1.4394125491426981, + "learning_rate": 7.047427154370738e-07, + "loss": 0.8834, + "step": 181880 + }, + { + "epoch": 14.095083110542834, + "grad_norm": 1.5302048353280056, + "learning_rate": 7.047814631122133e-07, + "loss": 0.8913, + "step": 181890 + }, + { + "epoch": 14.095858034019141, + "grad_norm": 1.447493275822713, + "learning_rate": 7.048202107873528e-07, + "loss": 0.8895, + "step": 181900 + }, + { + "epoch": 14.096632957495448, + "grad_norm": 1.4985424354509773, + "learning_rate": 7.048589584624924e-07, + "loss": 0.856, + "step": 181910 + }, + { + "epoch": 14.097407880971755, + "grad_norm": 1.5983706688176897, + "learning_rate": 7.048977061376318e-07, + "loss": 0.8679, + "step": 181920 + }, + { + "epoch": 14.098182804448061, + "grad_norm": 1.444531083646488, + "learning_rate": 7.049364538127713e-07, + "loss": 0.8638, + "step": 181930 + }, + { + "epoch": 14.098957727924368, + "grad_norm": 1.58265052321587, + "learning_rate": 7.049752014879107e-07, + "loss": 0.8698, + "step": 181940 + }, + { + "epoch": 14.099732651400673, + "grad_norm": 1.5143190789741239, + "learning_rate": 7.050139491630504e-07, + "loss": 0.8853, + "step": 181950 + }, + { + "epoch": 14.10050757487698, + "grad_norm": 1.4269135980069163, + "learning_rate": 7.050526968381898e-07, + "loss": 0.8561, + "step": 181960 + }, + { + "epoch": 14.101282498353287, + "grad_norm": 1.5201072237193605, + "learning_rate": 7.050914445133293e-07, + "loss": 0.8576, + "step": 181970 + }, + { + "epoch": 14.102057421829594, + "grad_norm": 1.588456804067971, + "learning_rate": 7.051301921884687e-07, + "loss": 0.8538, + "step": 181980 + }, + { + "epoch": 14.1028323453059, + "grad_norm": 1.6325608067026458, + "learning_rate": 7.051689398636082e-07, + "loss": 0.8567, + "step": 181990 + }, + { + "epoch": 14.103607268782207, + "grad_norm": 1.6237958103140502, + "learning_rate": 7.052076875387478e-07, + "loss": 0.874, + "step": 182000 + }, + { + "epoch": 14.103607268782207, + "eval_loss": 0.893494725227356, + "eval_runtime": 328.0224, + "eval_samples_per_second": 34.97, + "eval_steps_per_second": 8.743, + "step": 182000 + }, + { + "epoch": 14.104382192258514, + "grad_norm": 1.5266220979702003, + "learning_rate": 7.052464352138873e-07, + "loss": 0.8624, + "step": 182010 + }, + { + "epoch": 14.105157115734821, + "grad_norm": 1.4513427552472968, + "learning_rate": 7.052851828890267e-07, + "loss": 0.8526, + "step": 182020 + }, + { + "epoch": 14.105932039211128, + "grad_norm": 1.5555299424777254, + "learning_rate": 7.053239305641662e-07, + "loss": 0.8815, + "step": 182030 + }, + { + "epoch": 14.106706962687435, + "grad_norm": 1.5291537893685847, + "learning_rate": 7.053626782393056e-07, + "loss": 0.8828, + "step": 182040 + }, + { + "epoch": 14.107481886163741, + "grad_norm": 1.5655321060954148, + "learning_rate": 7.054014259144453e-07, + "loss": 0.8757, + "step": 182050 + }, + { + "epoch": 14.108256809640048, + "grad_norm": 1.5407071856902919, + "learning_rate": 7.054401735895847e-07, + "loss": 0.8625, + "step": 182060 + }, + { + "epoch": 14.109031733116355, + "grad_norm": 1.6198263282527205, + "learning_rate": 7.054789212647242e-07, + "loss": 0.892, + "step": 182070 + }, + { + "epoch": 14.109806656592662, + "grad_norm": 1.5315439904651516, + "learning_rate": 7.055176689398636e-07, + "loss": 0.8607, + "step": 182080 + }, + { + "epoch": 14.110581580068969, + "grad_norm": 1.4924172022404179, + "learning_rate": 7.055564166150032e-07, + "loss": 0.871, + "step": 182090 + }, + { + "epoch": 14.111356503545275, + "grad_norm": 1.6548194137221228, + "learning_rate": 7.055951642901427e-07, + "loss": 0.874, + "step": 182100 + }, + { + "epoch": 14.112131427021582, + "grad_norm": 1.601856961786269, + "learning_rate": 7.056339119652822e-07, + "loss": 0.8726, + "step": 182110 + }, + { + "epoch": 14.112906350497889, + "grad_norm": 1.4629249282393344, + "learning_rate": 7.056726596404216e-07, + "loss": 0.8778, + "step": 182120 + }, + { + "epoch": 14.113681273974196, + "grad_norm": 1.5280820952691043, + "learning_rate": 7.057114073155611e-07, + "loss": 0.8516, + "step": 182130 + }, + { + "epoch": 14.114456197450501, + "grad_norm": 1.4822419245776477, + "learning_rate": 7.057501549907006e-07, + "loss": 0.859, + "step": 182140 + }, + { + "epoch": 14.115231120926808, + "grad_norm": 1.4628603706914083, + "learning_rate": 7.057889026658402e-07, + "loss": 0.8356, + "step": 182150 + }, + { + "epoch": 14.116006044403115, + "grad_norm": 1.832963277013659, + "learning_rate": 7.058276503409796e-07, + "loss": 0.8639, + "step": 182160 + }, + { + "epoch": 14.116780967879421, + "grad_norm": 1.556178165540651, + "learning_rate": 7.058663980161191e-07, + "loss": 0.8648, + "step": 182170 + }, + { + "epoch": 14.117555891355728, + "grad_norm": 1.5131224281033617, + "learning_rate": 7.059051456912585e-07, + "loss": 0.8656, + "step": 182180 + }, + { + "epoch": 14.118330814832035, + "grad_norm": 1.728040243333251, + "learning_rate": 7.059438933663982e-07, + "loss": 0.8676, + "step": 182190 + }, + { + "epoch": 14.119105738308342, + "grad_norm": 1.4715469764683422, + "learning_rate": 7.059826410415376e-07, + "loss": 0.871, + "step": 182200 + }, + { + "epoch": 14.119880661784649, + "grad_norm": 1.4803755567347154, + "learning_rate": 7.060213887166771e-07, + "loss": 0.86, + "step": 182210 + }, + { + "epoch": 14.120655585260955, + "grad_norm": 1.6079019788122075, + "learning_rate": 7.060601363918165e-07, + "loss": 0.8626, + "step": 182220 + }, + { + "epoch": 14.121430508737262, + "grad_norm": 1.4880447808936088, + "learning_rate": 7.060988840669561e-07, + "loss": 0.8671, + "step": 182230 + }, + { + "epoch": 14.122205432213569, + "grad_norm": 1.5667326093040708, + "learning_rate": 7.061376317420955e-07, + "loss": 0.8589, + "step": 182240 + }, + { + "epoch": 14.122980355689876, + "grad_norm": 1.5604849815401138, + "learning_rate": 7.061763794172351e-07, + "loss": 0.8614, + "step": 182250 + }, + { + "epoch": 14.123755279166183, + "grad_norm": 1.5304724877385898, + "learning_rate": 7.062151270923745e-07, + "loss": 0.8594, + "step": 182260 + }, + { + "epoch": 14.12453020264249, + "grad_norm": 1.4513943606052382, + "learning_rate": 7.06253874767514e-07, + "loss": 0.8716, + "step": 182270 + }, + { + "epoch": 14.125305126118796, + "grad_norm": 1.5898116685716854, + "learning_rate": 7.062926224426535e-07, + "loss": 0.8599, + "step": 182280 + }, + { + "epoch": 14.126080049595103, + "grad_norm": 1.5250066823526751, + "learning_rate": 7.06331370117793e-07, + "loss": 0.8627, + "step": 182290 + }, + { + "epoch": 14.12685497307141, + "grad_norm": 1.512340873963595, + "learning_rate": 7.063701177929325e-07, + "loss": 0.8682, + "step": 182300 + }, + { + "epoch": 14.127629896547717, + "grad_norm": 1.4348254210128266, + "learning_rate": 7.06408865468072e-07, + "loss": 0.8493, + "step": 182310 + }, + { + "epoch": 14.128404820024022, + "grad_norm": 1.4294674223305093, + "learning_rate": 7.064476131432114e-07, + "loss": 0.856, + "step": 182320 + }, + { + "epoch": 14.129179743500329, + "grad_norm": 1.4588572410260545, + "learning_rate": 7.06486360818351e-07, + "loss": 0.8724, + "step": 182330 + }, + { + "epoch": 14.129954666976635, + "grad_norm": 1.4994874809595906, + "learning_rate": 7.065251084934905e-07, + "loss": 0.8535, + "step": 182340 + }, + { + "epoch": 14.130729590452942, + "grad_norm": 1.547628437208532, + "learning_rate": 7.0656385616863e-07, + "loss": 0.8616, + "step": 182350 + }, + { + "epoch": 14.131504513929249, + "grad_norm": 1.5199560248334152, + "learning_rate": 7.066026038437694e-07, + "loss": 0.8555, + "step": 182360 + }, + { + "epoch": 14.132279437405556, + "grad_norm": 1.5090043707001366, + "learning_rate": 7.06641351518909e-07, + "loss": 0.863, + "step": 182370 + }, + { + "epoch": 14.133054360881863, + "grad_norm": 1.4880692535346445, + "learning_rate": 7.066800991940484e-07, + "loss": 0.8615, + "step": 182380 + }, + { + "epoch": 14.13382928435817, + "grad_norm": 1.4955913537529448, + "learning_rate": 7.06718846869188e-07, + "loss": 0.8774, + "step": 182390 + }, + { + "epoch": 14.134604207834476, + "grad_norm": 1.5244289421305328, + "learning_rate": 7.067575945443274e-07, + "loss": 0.8638, + "step": 182400 + }, + { + "epoch": 14.135379131310783, + "grad_norm": 1.4758847566441666, + "learning_rate": 7.067963422194669e-07, + "loss": 0.8842, + "step": 182410 + }, + { + "epoch": 14.13615405478709, + "grad_norm": 1.5395498420015632, + "learning_rate": 7.068350898946064e-07, + "loss": 0.8971, + "step": 182420 + }, + { + "epoch": 14.136928978263397, + "grad_norm": 1.4995106286719277, + "learning_rate": 7.068738375697459e-07, + "loss": 0.8527, + "step": 182430 + }, + { + "epoch": 14.137703901739703, + "grad_norm": 1.5379627526875543, + "learning_rate": 7.069125852448854e-07, + "loss": 0.8738, + "step": 182440 + }, + { + "epoch": 14.13847882521601, + "grad_norm": 1.5991754813308225, + "learning_rate": 7.069513329200249e-07, + "loss": 0.9023, + "step": 182450 + }, + { + "epoch": 14.139253748692317, + "grad_norm": 1.5563506543603236, + "learning_rate": 7.069900805951643e-07, + "loss": 0.8898, + "step": 182460 + }, + { + "epoch": 14.140028672168624, + "grad_norm": 1.5138519775328247, + "learning_rate": 7.070288282703039e-07, + "loss": 0.8799, + "step": 182470 + }, + { + "epoch": 14.14080359564493, + "grad_norm": 1.5087597991239268, + "learning_rate": 7.070675759454433e-07, + "loss": 0.865, + "step": 182480 + }, + { + "epoch": 14.141578519121238, + "grad_norm": 1.5601641026160553, + "learning_rate": 7.071063236205829e-07, + "loss": 0.8704, + "step": 182490 + }, + { + "epoch": 14.142353442597544, + "grad_norm": 1.4887642010781001, + "learning_rate": 7.071450712957223e-07, + "loss": 0.8529, + "step": 182500 + }, + { + "epoch": 14.142353442597544, + "eval_loss": 0.8935125470161438, + "eval_runtime": 328.3121, + "eval_samples_per_second": 34.939, + "eval_steps_per_second": 8.736, + "step": 182500 + }, + { + "epoch": 14.14312836607385, + "grad_norm": 1.6324684431332426, + "learning_rate": 7.071838189708619e-07, + "loss": 0.8496, + "step": 182510 + }, + { + "epoch": 14.143903289550156, + "grad_norm": 1.5058800073134648, + "learning_rate": 7.072225666460013e-07, + "loss": 0.8645, + "step": 182520 + }, + { + "epoch": 14.144678213026463, + "grad_norm": 1.5526142285693552, + "learning_rate": 7.072613143211408e-07, + "loss": 0.8656, + "step": 182530 + }, + { + "epoch": 14.14545313650277, + "grad_norm": 1.7071044171014904, + "learning_rate": 7.073000619962803e-07, + "loss": 0.888, + "step": 182540 + }, + { + "epoch": 14.146228059979077, + "grad_norm": 1.6398836982657345, + "learning_rate": 7.073388096714198e-07, + "loss": 0.8773, + "step": 182550 + }, + { + "epoch": 14.147002983455383, + "grad_norm": 1.5340555276271, + "learning_rate": 7.073775573465592e-07, + "loss": 0.8686, + "step": 182560 + }, + { + "epoch": 14.14777790693169, + "grad_norm": 1.541584211463686, + "learning_rate": 7.074163050216988e-07, + "loss": 0.8572, + "step": 182570 + }, + { + "epoch": 14.148552830407997, + "grad_norm": 1.595925710102966, + "learning_rate": 7.074550526968382e-07, + "loss": 0.8644, + "step": 182580 + }, + { + "epoch": 14.149327753884304, + "grad_norm": 1.5453435165202916, + "learning_rate": 7.074938003719778e-07, + "loss": 0.8714, + "step": 182590 + }, + { + "epoch": 14.15010267736061, + "grad_norm": 1.6031465133664364, + "learning_rate": 7.075325480471172e-07, + "loss": 0.8788, + "step": 182600 + }, + { + "epoch": 14.150877600836917, + "grad_norm": 1.5852752910776458, + "learning_rate": 7.075712957222568e-07, + "loss": 0.8954, + "step": 182610 + }, + { + "epoch": 14.151652524313224, + "grad_norm": 1.6721911407298167, + "learning_rate": 7.076100433973962e-07, + "loss": 0.8743, + "step": 182620 + }, + { + "epoch": 14.152427447789531, + "grad_norm": 1.533201294226598, + "learning_rate": 7.076487910725357e-07, + "loss": 0.8654, + "step": 182630 + }, + { + "epoch": 14.153202371265838, + "grad_norm": 1.4859442631271316, + "learning_rate": 7.076875387476752e-07, + "loss": 0.8569, + "step": 182640 + }, + { + "epoch": 14.153977294742145, + "grad_norm": 1.5921332182589265, + "learning_rate": 7.077262864228148e-07, + "loss": 0.8699, + "step": 182650 + }, + { + "epoch": 14.154752218218452, + "grad_norm": 1.7071611259526882, + "learning_rate": 7.077650340979542e-07, + "loss": 0.8836, + "step": 182660 + }, + { + "epoch": 14.155527141694758, + "grad_norm": 1.5407891592468772, + "learning_rate": 7.078037817730937e-07, + "loss": 0.8728, + "step": 182670 + }, + { + "epoch": 14.156302065171065, + "grad_norm": 1.4553698654341862, + "learning_rate": 7.078425294482331e-07, + "loss": 0.8607, + "step": 182680 + }, + { + "epoch": 14.157076988647372, + "grad_norm": 1.4564651203517698, + "learning_rate": 7.078812771233727e-07, + "loss": 0.8657, + "step": 182690 + }, + { + "epoch": 14.157851912123677, + "grad_norm": 1.5047729700996113, + "learning_rate": 7.079200247985121e-07, + "loss": 0.8741, + "step": 182700 + }, + { + "epoch": 14.158626835599984, + "grad_norm": 1.4805783115119733, + "learning_rate": 7.079587724736517e-07, + "loss": 0.87, + "step": 182710 + }, + { + "epoch": 14.15940175907629, + "grad_norm": 1.482323062914231, + "learning_rate": 7.079975201487911e-07, + "loss": 0.8609, + "step": 182720 + }, + { + "epoch": 14.160176682552597, + "grad_norm": 1.5413713360120893, + "learning_rate": 7.080362678239306e-07, + "loss": 0.8685, + "step": 182730 + }, + { + "epoch": 14.160951606028904, + "grad_norm": 1.6034202659432335, + "learning_rate": 7.080750154990701e-07, + "loss": 0.8531, + "step": 182740 + }, + { + "epoch": 14.161726529505211, + "grad_norm": 1.529386715521334, + "learning_rate": 7.081137631742097e-07, + "loss": 0.8582, + "step": 182750 + }, + { + "epoch": 14.162501452981518, + "grad_norm": 1.593559077249852, + "learning_rate": 7.081525108493491e-07, + "loss": 0.8794, + "step": 182760 + }, + { + "epoch": 14.163276376457825, + "grad_norm": 1.6619737411913984, + "learning_rate": 7.081912585244886e-07, + "loss": 0.8757, + "step": 182770 + }, + { + "epoch": 14.164051299934131, + "grad_norm": 1.4854746666514904, + "learning_rate": 7.08230006199628e-07, + "loss": 0.8561, + "step": 182780 + }, + { + "epoch": 14.164826223410438, + "grad_norm": 1.65529893800656, + "learning_rate": 7.082687538747677e-07, + "loss": 0.8676, + "step": 182790 + }, + { + "epoch": 14.165601146886745, + "grad_norm": 1.5607161441798263, + "learning_rate": 7.083075015499071e-07, + "loss": 0.8659, + "step": 182800 + }, + { + "epoch": 14.166376070363052, + "grad_norm": 1.59184336515939, + "learning_rate": 7.083462492250466e-07, + "loss": 0.8588, + "step": 182810 + }, + { + "epoch": 14.167150993839359, + "grad_norm": 1.5002454054713346, + "learning_rate": 7.08384996900186e-07, + "loss": 0.8914, + "step": 182820 + }, + { + "epoch": 14.167925917315666, + "grad_norm": 1.5615463040092206, + "learning_rate": 7.084237445753255e-07, + "loss": 0.8564, + "step": 182830 + }, + { + "epoch": 14.168700840791972, + "grad_norm": 1.4555585111345144, + "learning_rate": 7.08462492250465e-07, + "loss": 0.8746, + "step": 182840 + }, + { + "epoch": 14.16947576426828, + "grad_norm": 1.5366468725925388, + "learning_rate": 7.085012399256046e-07, + "loss": 0.8686, + "step": 182850 + }, + { + "epoch": 14.170250687744586, + "grad_norm": 1.6582346573210391, + "learning_rate": 7.08539987600744e-07, + "loss": 0.871, + "step": 182860 + }, + { + "epoch": 14.171025611220893, + "grad_norm": 1.5040987533897745, + "learning_rate": 7.085787352758835e-07, + "loss": 0.856, + "step": 182870 + }, + { + "epoch": 14.171800534697198, + "grad_norm": 1.475285124223956, + "learning_rate": 7.086174829510229e-07, + "loss": 0.8721, + "step": 182880 + }, + { + "epoch": 14.172575458173505, + "grad_norm": 1.562526892216964, + "learning_rate": 7.086562306261626e-07, + "loss": 0.8529, + "step": 182890 + }, + { + "epoch": 14.173350381649811, + "grad_norm": 1.514897303866745, + "learning_rate": 7.08694978301302e-07, + "loss": 0.8683, + "step": 182900 + }, + { + "epoch": 14.174125305126118, + "grad_norm": 1.5306700689378163, + "learning_rate": 7.087337259764415e-07, + "loss": 0.856, + "step": 182910 + }, + { + "epoch": 14.174900228602425, + "grad_norm": 1.6374720326205408, + "learning_rate": 7.087724736515809e-07, + "loss": 0.8654, + "step": 182920 + }, + { + "epoch": 14.175675152078732, + "grad_norm": 1.4365440262093394, + "learning_rate": 7.088112213267206e-07, + "loss": 0.8567, + "step": 182930 + }, + { + "epoch": 14.176450075555039, + "grad_norm": 1.4573829055494478, + "learning_rate": 7.0884996900186e-07, + "loss": 0.8421, + "step": 182940 + }, + { + "epoch": 14.177224999031345, + "grad_norm": 1.5257260598356757, + "learning_rate": 7.088887166769995e-07, + "loss": 0.8771, + "step": 182950 + }, + { + "epoch": 14.177999922507652, + "grad_norm": 1.4770743451087331, + "learning_rate": 7.089274643521389e-07, + "loss": 0.8554, + "step": 182960 + }, + { + "epoch": 14.178774845983959, + "grad_norm": 1.5467303632872154, + "learning_rate": 7.089662120272784e-07, + "loss": 0.8611, + "step": 182970 + }, + { + "epoch": 14.179549769460266, + "grad_norm": 1.739343236550731, + "learning_rate": 7.090049597024178e-07, + "loss": 0.8646, + "step": 182980 + }, + { + "epoch": 14.180324692936573, + "grad_norm": 1.6152921032336776, + "learning_rate": 7.090437073775575e-07, + "loss": 0.862, + "step": 182990 + }, + { + "epoch": 14.18109961641288, + "grad_norm": 1.5320494429655476, + "learning_rate": 7.090824550526969e-07, + "loss": 0.8699, + "step": 183000 + }, + { + "epoch": 14.18109961641288, + "eval_loss": 0.8933964371681213, + "eval_runtime": 328.4171, + "eval_samples_per_second": 34.928, + "eval_steps_per_second": 8.733, + "step": 183000 + }, + { + "epoch": 14.181874539889186, + "grad_norm": 1.5276005575901808, + "learning_rate": 7.091212027278364e-07, + "loss": 0.8521, + "step": 183010 + }, + { + "epoch": 14.182649463365493, + "grad_norm": 1.5384395242016826, + "learning_rate": 7.091599504029758e-07, + "loss": 0.8711, + "step": 183020 + }, + { + "epoch": 14.1834243868418, + "grad_norm": 1.47481757490842, + "learning_rate": 7.091986980781155e-07, + "loss": 0.8507, + "step": 183030 + }, + { + "epoch": 14.184199310318107, + "grad_norm": 1.5380123021386498, + "learning_rate": 7.092374457532549e-07, + "loss": 0.8776, + "step": 183040 + }, + { + "epoch": 14.184974233794414, + "grad_norm": 1.6528310424699892, + "learning_rate": 7.092761934283944e-07, + "loss": 0.8673, + "step": 183050 + }, + { + "epoch": 14.18574915727072, + "grad_norm": 1.4620389243440757, + "learning_rate": 7.093149411035338e-07, + "loss": 0.8675, + "step": 183060 + }, + { + "epoch": 14.186524080747025, + "grad_norm": 1.4757104874830491, + "learning_rate": 7.093536887786734e-07, + "loss": 0.8656, + "step": 183070 + }, + { + "epoch": 14.187299004223332, + "grad_norm": 1.5824600635060697, + "learning_rate": 7.093924364538129e-07, + "loss": 0.8687, + "step": 183080 + }, + { + "epoch": 14.188073927699639, + "grad_norm": 1.5043278037662393, + "learning_rate": 7.094311841289524e-07, + "loss": 0.8738, + "step": 183090 + }, + { + "epoch": 14.188848851175946, + "grad_norm": 1.5081404091084178, + "learning_rate": 7.094699318040918e-07, + "loss": 0.8804, + "step": 183100 + }, + { + "epoch": 14.189623774652253, + "grad_norm": 1.4541627297984765, + "learning_rate": 7.095086794792313e-07, + "loss": 0.862, + "step": 183110 + }, + { + "epoch": 14.19039869812856, + "grad_norm": 1.5245465697092544, + "learning_rate": 7.095474271543707e-07, + "loss": 0.8781, + "step": 183120 + }, + { + "epoch": 14.191173621604866, + "grad_norm": 1.5074185225892347, + "learning_rate": 7.095861748295104e-07, + "loss": 0.8555, + "step": 183130 + }, + { + "epoch": 14.191948545081173, + "grad_norm": 1.5876669837612727, + "learning_rate": 7.096249225046498e-07, + "loss": 0.861, + "step": 183140 + }, + { + "epoch": 14.19272346855748, + "grad_norm": 1.5993985811496323, + "learning_rate": 7.096636701797893e-07, + "loss": 0.8742, + "step": 183150 + }, + { + "epoch": 14.193498392033787, + "grad_norm": 1.5505930122112686, + "learning_rate": 7.097024178549287e-07, + "loss": 0.8557, + "step": 183160 + }, + { + "epoch": 14.194273315510094, + "grad_norm": 1.5428434659467196, + "learning_rate": 7.097411655300683e-07, + "loss": 0.8614, + "step": 183170 + }, + { + "epoch": 14.1950482389864, + "grad_norm": 1.590944807703482, + "learning_rate": 7.097799132052078e-07, + "loss": 0.8467, + "step": 183180 + }, + { + "epoch": 14.195823162462707, + "grad_norm": 1.4632749877313833, + "learning_rate": 7.098186608803473e-07, + "loss": 0.8693, + "step": 183190 + }, + { + "epoch": 14.196598085939014, + "grad_norm": 1.5824199724723453, + "learning_rate": 7.098574085554867e-07, + "loss": 0.8677, + "step": 183200 + }, + { + "epoch": 14.19737300941532, + "grad_norm": 1.5400246580568508, + "learning_rate": 7.098961562306263e-07, + "loss": 0.8697, + "step": 183210 + }, + { + "epoch": 14.198147932891628, + "grad_norm": 1.4435945709204372, + "learning_rate": 7.099349039057657e-07, + "loss": 0.8604, + "step": 183220 + }, + { + "epoch": 14.198922856367934, + "grad_norm": 1.4855649364306298, + "learning_rate": 7.099736515809053e-07, + "loss": 0.8747, + "step": 183230 + }, + { + "epoch": 14.199697779844241, + "grad_norm": 1.5349201771508618, + "learning_rate": 7.100123992560447e-07, + "loss": 0.8711, + "step": 183240 + }, + { + "epoch": 14.200472703320546, + "grad_norm": 1.4700797783664503, + "learning_rate": 7.100511469311842e-07, + "loss": 0.8839, + "step": 183250 + }, + { + "epoch": 14.201247626796853, + "grad_norm": 1.5999975470091599, + "learning_rate": 7.100898946063236e-07, + "loss": 0.857, + "step": 183260 + }, + { + "epoch": 14.20202255027316, + "grad_norm": 1.4993852051885939, + "learning_rate": 7.101286422814632e-07, + "loss": 0.8587, + "step": 183270 + }, + { + "epoch": 14.202797473749467, + "grad_norm": 1.6379603098437168, + "learning_rate": 7.101673899566027e-07, + "loss": 0.8613, + "step": 183280 + }, + { + "epoch": 14.203572397225773, + "grad_norm": 1.5716834019661032, + "learning_rate": 7.102061376317422e-07, + "loss": 0.8528, + "step": 183290 + }, + { + "epoch": 14.20434732070208, + "grad_norm": 1.4887026100618383, + "learning_rate": 7.102448853068816e-07, + "loss": 0.8843, + "step": 183300 + }, + { + "epoch": 14.205122244178387, + "grad_norm": 1.664072972402432, + "learning_rate": 7.102836329820212e-07, + "loss": 0.8793, + "step": 183310 + }, + { + "epoch": 14.205897167654694, + "grad_norm": 1.5545097628947475, + "learning_rate": 7.103223806571606e-07, + "loss": 0.855, + "step": 183320 + }, + { + "epoch": 14.206672091131, + "grad_norm": 1.560428367690619, + "learning_rate": 7.103611283323002e-07, + "loss": 0.8828, + "step": 183330 + }, + { + "epoch": 14.207447014607308, + "grad_norm": 1.5198824919107208, + "learning_rate": 7.103998760074396e-07, + "loss": 0.8702, + "step": 183340 + }, + { + "epoch": 14.208221938083614, + "grad_norm": 1.5323475480122863, + "learning_rate": 7.104386236825792e-07, + "loss": 0.872, + "step": 183350 + }, + { + "epoch": 14.208996861559921, + "grad_norm": 1.555017596464048, + "learning_rate": 7.104773713577186e-07, + "loss": 0.8677, + "step": 183360 + }, + { + "epoch": 14.209771785036228, + "grad_norm": 1.4252955229663036, + "learning_rate": 7.105161190328581e-07, + "loss": 0.8645, + "step": 183370 + }, + { + "epoch": 14.210546708512535, + "grad_norm": 1.5313033518168924, + "learning_rate": 7.105548667079976e-07, + "loss": 0.872, + "step": 183380 + }, + { + "epoch": 14.211321631988842, + "grad_norm": 1.4874750248436015, + "learning_rate": 7.105936143831371e-07, + "loss": 0.8905, + "step": 183390 + }, + { + "epoch": 14.212096555465148, + "grad_norm": 1.5331655945716318, + "learning_rate": 7.106323620582765e-07, + "loss": 0.8528, + "step": 183400 + }, + { + "epoch": 14.212871478941455, + "grad_norm": 1.5776417592196972, + "learning_rate": 7.106711097334161e-07, + "loss": 0.8789, + "step": 183410 + }, + { + "epoch": 14.213646402417762, + "grad_norm": 1.592068294694207, + "learning_rate": 7.107098574085555e-07, + "loss": 0.8994, + "step": 183420 + }, + { + "epoch": 14.214421325894069, + "grad_norm": 1.6343769559893204, + "learning_rate": 7.107486050836951e-07, + "loss": 0.8688, + "step": 183430 + }, + { + "epoch": 14.215196249370374, + "grad_norm": 1.6054151224052233, + "learning_rate": 7.107873527588345e-07, + "loss": 0.8547, + "step": 183440 + }, + { + "epoch": 14.21597117284668, + "grad_norm": 1.5245178102850332, + "learning_rate": 7.108261004339741e-07, + "loss": 0.8684, + "step": 183450 + }, + { + "epoch": 14.216746096322987, + "grad_norm": 1.6243861774183994, + "learning_rate": 7.108648481091135e-07, + "loss": 0.8568, + "step": 183460 + }, + { + "epoch": 14.217521019799294, + "grad_norm": 1.699876189722067, + "learning_rate": 7.10903595784253e-07, + "loss": 0.8798, + "step": 183470 + }, + { + "epoch": 14.218295943275601, + "grad_norm": 1.5709585656215719, + "learning_rate": 7.109423434593925e-07, + "loss": 0.8675, + "step": 183480 + }, + { + "epoch": 14.219070866751908, + "grad_norm": 1.5345952568460457, + "learning_rate": 7.10981091134532e-07, + "loss": 0.8734, + "step": 183490 + }, + { + "epoch": 14.219845790228215, + "grad_norm": 1.536693367180424, + "learning_rate": 7.110198388096715e-07, + "loss": 0.8616, + "step": 183500 + }, + { + "epoch": 14.219845790228215, + "eval_loss": 0.8930777907371521, + "eval_runtime": 330.5718, + "eval_samples_per_second": 34.7, + "eval_steps_per_second": 8.676, + "step": 183500 + }, + { + "epoch": 14.220620713704522, + "grad_norm": 1.5397729102644337, + "learning_rate": 7.11058586484811e-07, + "loss": 0.8697, + "step": 183510 + }, + { + "epoch": 14.221395637180828, + "grad_norm": 1.4263752796178262, + "learning_rate": 7.110973341599504e-07, + "loss": 0.8646, + "step": 183520 + }, + { + "epoch": 14.222170560657135, + "grad_norm": 1.613530240972072, + "learning_rate": 7.1113608183509e-07, + "loss": 0.8769, + "step": 183530 + }, + { + "epoch": 14.222945484133442, + "grad_norm": 1.5094172370473113, + "learning_rate": 7.111748295102294e-07, + "loss": 0.8786, + "step": 183540 + }, + { + "epoch": 14.223720407609749, + "grad_norm": 1.5925776280210322, + "learning_rate": 7.11213577185369e-07, + "loss": 0.8653, + "step": 183550 + }, + { + "epoch": 14.224495331086056, + "grad_norm": 1.654014327645659, + "learning_rate": 7.112523248605084e-07, + "loss": 0.8798, + "step": 183560 + }, + { + "epoch": 14.225270254562362, + "grad_norm": 1.4884896642660517, + "learning_rate": 7.11291072535648e-07, + "loss": 0.8776, + "step": 183570 + }, + { + "epoch": 14.22604517803867, + "grad_norm": 1.6217985158807835, + "learning_rate": 7.113298202107874e-07, + "loss": 0.8533, + "step": 183580 + }, + { + "epoch": 14.226820101514976, + "grad_norm": 1.5285224628293441, + "learning_rate": 7.11368567885927e-07, + "loss": 0.8633, + "step": 183590 + }, + { + "epoch": 14.227595024991283, + "grad_norm": 1.5556820119057113, + "learning_rate": 7.114073155610664e-07, + "loss": 0.8638, + "step": 183600 + }, + { + "epoch": 14.22836994846759, + "grad_norm": 1.577860566553318, + "learning_rate": 7.114460632362059e-07, + "loss": 0.8689, + "step": 183610 + }, + { + "epoch": 14.229144871943895, + "grad_norm": 1.4808556209730497, + "learning_rate": 7.114848109113453e-07, + "loss": 0.8648, + "step": 183620 + }, + { + "epoch": 14.229919795420201, + "grad_norm": 1.6016723074781543, + "learning_rate": 7.115235585864849e-07, + "loss": 0.869, + "step": 183630 + }, + { + "epoch": 14.230694718896508, + "grad_norm": 1.5348722653852482, + "learning_rate": 7.115623062616244e-07, + "loss": 0.8597, + "step": 183640 + }, + { + "epoch": 14.231469642372815, + "grad_norm": 1.6231792429107283, + "learning_rate": 7.116010539367639e-07, + "loss": 0.8579, + "step": 183650 + }, + { + "epoch": 14.232244565849122, + "grad_norm": 1.5532387852098133, + "learning_rate": 7.116398016119033e-07, + "loss": 0.8788, + "step": 183660 + }, + { + "epoch": 14.233019489325429, + "grad_norm": 1.4156129678259306, + "learning_rate": 7.116785492870429e-07, + "loss": 0.8468, + "step": 183670 + }, + { + "epoch": 14.233794412801736, + "grad_norm": 1.533268814683731, + "learning_rate": 7.117172969621823e-07, + "loss": 0.8607, + "step": 183680 + }, + { + "epoch": 14.234569336278042, + "grad_norm": 1.4822183227578494, + "learning_rate": 7.117560446373219e-07, + "loss": 0.8705, + "step": 183690 + }, + { + "epoch": 14.23534425975435, + "grad_norm": 1.68906662605339, + "learning_rate": 7.117947923124613e-07, + "loss": 0.8669, + "step": 183700 + }, + { + "epoch": 14.236119183230656, + "grad_norm": 1.619864719446248, + "learning_rate": 7.118335399876008e-07, + "loss": 0.8552, + "step": 183710 + }, + { + "epoch": 14.236894106706963, + "grad_norm": 1.6539327078741022, + "learning_rate": 7.118722876627402e-07, + "loss": 0.8719, + "step": 183720 + }, + { + "epoch": 14.23766903018327, + "grad_norm": 1.4765980744740985, + "learning_rate": 7.119110353378799e-07, + "loss": 0.8663, + "step": 183730 + }, + { + "epoch": 14.238443953659576, + "grad_norm": 1.4999839999923648, + "learning_rate": 7.119497830130193e-07, + "loss": 0.8663, + "step": 183740 + }, + { + "epoch": 14.239218877135883, + "grad_norm": 1.5742736957749162, + "learning_rate": 7.119885306881588e-07, + "loss": 0.8645, + "step": 183750 + }, + { + "epoch": 14.23999380061219, + "grad_norm": 1.5814553506605533, + "learning_rate": 7.120272783632982e-07, + "loss": 0.8729, + "step": 183760 + }, + { + "epoch": 14.240768724088497, + "grad_norm": 1.5324032029648014, + "learning_rate": 7.120660260384378e-07, + "loss": 0.8639, + "step": 183770 + }, + { + "epoch": 14.241543647564804, + "grad_norm": 1.3996402338727627, + "learning_rate": 7.121047737135773e-07, + "loss": 0.8716, + "step": 183780 + }, + { + "epoch": 14.24231857104111, + "grad_norm": 1.5940495259178673, + "learning_rate": 7.121435213887168e-07, + "loss": 0.8756, + "step": 183790 + }, + { + "epoch": 14.243093494517417, + "grad_norm": 1.580462053029806, + "learning_rate": 7.121822690638562e-07, + "loss": 0.8779, + "step": 183800 + }, + { + "epoch": 14.243868417993722, + "grad_norm": 1.5073578208247744, + "learning_rate": 7.122210167389957e-07, + "loss": 0.8736, + "step": 183810 + }, + { + "epoch": 14.244643341470029, + "grad_norm": 1.5189638127320368, + "learning_rate": 7.122597644141352e-07, + "loss": 0.8614, + "step": 183820 + }, + { + "epoch": 14.245418264946336, + "grad_norm": 1.482440643300307, + "learning_rate": 7.122985120892748e-07, + "loss": 0.844, + "step": 183830 + }, + { + "epoch": 14.246193188422643, + "grad_norm": 1.5478102803087181, + "learning_rate": 7.123372597644142e-07, + "loss": 0.8613, + "step": 183840 + }, + { + "epoch": 14.24696811189895, + "grad_norm": 1.4391876788493827, + "learning_rate": 7.123760074395537e-07, + "loss": 0.8853, + "step": 183850 + }, + { + "epoch": 14.247743035375256, + "grad_norm": 1.6320890887893107, + "learning_rate": 7.124147551146931e-07, + "loss": 0.8655, + "step": 183860 + }, + { + "epoch": 14.248517958851563, + "grad_norm": 1.3959122653918266, + "learning_rate": 7.124535027898328e-07, + "loss": 0.8601, + "step": 183870 + }, + { + "epoch": 14.24929288232787, + "grad_norm": 1.5463250396998427, + "learning_rate": 7.124922504649722e-07, + "loss": 0.8741, + "step": 183880 + }, + { + "epoch": 14.250067805804177, + "grad_norm": 1.5281930846113236, + "learning_rate": 7.125309981401117e-07, + "loss": 0.8629, + "step": 183890 + }, + { + "epoch": 14.250842729280484, + "grad_norm": 1.5262834334688153, + "learning_rate": 7.125697458152511e-07, + "loss": 0.8653, + "step": 183900 + }, + { + "epoch": 14.25161765275679, + "grad_norm": 1.5801404352986153, + "learning_rate": 7.126084934903906e-07, + "loss": 0.8466, + "step": 183910 + }, + { + "epoch": 14.252392576233097, + "grad_norm": 1.4660059087817197, + "learning_rate": 7.126472411655302e-07, + "loss": 0.8443, + "step": 183920 + }, + { + "epoch": 14.253167499709404, + "grad_norm": 1.489935677811283, + "learning_rate": 7.126859888406697e-07, + "loss": 0.8549, + "step": 183930 + }, + { + "epoch": 14.25394242318571, + "grad_norm": 1.6248593191931058, + "learning_rate": 7.127247365158091e-07, + "loss": 0.9005, + "step": 183940 + }, + { + "epoch": 14.254717346662018, + "grad_norm": 1.544906324769308, + "learning_rate": 7.127634841909486e-07, + "loss": 0.8489, + "step": 183950 + }, + { + "epoch": 14.255492270138324, + "grad_norm": 1.5789262175334793, + "learning_rate": 7.12802231866088e-07, + "loss": 0.8695, + "step": 183960 + }, + { + "epoch": 14.256267193614631, + "grad_norm": 1.634669544019888, + "learning_rate": 7.128409795412277e-07, + "loss": 0.8566, + "step": 183970 + }, + { + "epoch": 14.257042117090938, + "grad_norm": 1.5439831509701987, + "learning_rate": 7.128797272163671e-07, + "loss": 0.8599, + "step": 183980 + }, + { + "epoch": 14.257817040567243, + "grad_norm": 1.5164926336195192, + "learning_rate": 7.129184748915066e-07, + "loss": 0.8623, + "step": 183990 + }, + { + "epoch": 14.25859196404355, + "grad_norm": 1.5271162538140945, + "learning_rate": 7.12957222566646e-07, + "loss": 0.8606, + "step": 184000 + }, + { + "epoch": 14.25859196404355, + "eval_loss": 0.8933966159820557, + "eval_runtime": 327.4282, + "eval_samples_per_second": 35.034, + "eval_steps_per_second": 8.759, + "step": 184000 + }, + { + "epoch": 14.259366887519857, + "grad_norm": 1.6479436573331216, + "learning_rate": 7.129959702417856e-07, + "loss": 0.8686, + "step": 184010 + }, + { + "epoch": 14.260141810996164, + "grad_norm": 1.7213250642161866, + "learning_rate": 7.130347179169251e-07, + "loss": 0.8632, + "step": 184020 + }, + { + "epoch": 14.26091673447247, + "grad_norm": 1.5535111332269085, + "learning_rate": 7.130734655920646e-07, + "loss": 0.8701, + "step": 184030 + }, + { + "epoch": 14.261691657948777, + "grad_norm": 1.472692096043485, + "learning_rate": 7.13112213267204e-07, + "loss": 0.8827, + "step": 184040 + }, + { + "epoch": 14.262466581425084, + "grad_norm": 1.5312573026304945, + "learning_rate": 7.131509609423435e-07, + "loss": 0.8615, + "step": 184050 + }, + { + "epoch": 14.26324150490139, + "grad_norm": 1.436600189439299, + "learning_rate": 7.131897086174829e-07, + "loss": 0.8621, + "step": 184060 + }, + { + "epoch": 14.264016428377698, + "grad_norm": 1.5432155543865107, + "learning_rate": 7.132284562926226e-07, + "loss": 0.8499, + "step": 184070 + }, + { + "epoch": 14.264791351854004, + "grad_norm": 1.518797125229993, + "learning_rate": 7.13267203967762e-07, + "loss": 0.8738, + "step": 184080 + }, + { + "epoch": 14.265566275330311, + "grad_norm": 1.4057620871741603, + "learning_rate": 7.133059516429015e-07, + "loss": 0.8527, + "step": 184090 + }, + { + "epoch": 14.266341198806618, + "grad_norm": 1.6457661367465581, + "learning_rate": 7.133446993180409e-07, + "loss": 0.8955, + "step": 184100 + }, + { + "epoch": 14.267116122282925, + "grad_norm": 1.4719918035842945, + "learning_rate": 7.133834469931805e-07, + "loss": 0.8777, + "step": 184110 + }, + { + "epoch": 14.267891045759232, + "grad_norm": 1.5655680174755653, + "learning_rate": 7.1342219466832e-07, + "loss": 0.8802, + "step": 184120 + }, + { + "epoch": 14.268665969235538, + "grad_norm": 1.5748987113396091, + "learning_rate": 7.134609423434595e-07, + "loss": 0.8603, + "step": 184130 + }, + { + "epoch": 14.269440892711845, + "grad_norm": 1.535378131606922, + "learning_rate": 7.134996900185989e-07, + "loss": 0.8606, + "step": 184140 + }, + { + "epoch": 14.270215816188152, + "grad_norm": 1.4110799342764948, + "learning_rate": 7.135384376937385e-07, + "loss": 0.8549, + "step": 184150 + }, + { + "epoch": 14.270990739664459, + "grad_norm": 1.5511994400959175, + "learning_rate": 7.135771853688779e-07, + "loss": 0.8666, + "step": 184160 + }, + { + "epoch": 14.271765663140766, + "grad_norm": 1.5719339205089244, + "learning_rate": 7.136159330440175e-07, + "loss": 0.8721, + "step": 184170 + }, + { + "epoch": 14.272540586617072, + "grad_norm": 1.679579118970865, + "learning_rate": 7.136546807191569e-07, + "loss": 0.8787, + "step": 184180 + }, + { + "epoch": 14.273315510093378, + "grad_norm": 1.4151518786080826, + "learning_rate": 7.136934283942964e-07, + "loss": 0.8627, + "step": 184190 + }, + { + "epoch": 14.274090433569684, + "grad_norm": 1.4881731039329271, + "learning_rate": 7.137321760694358e-07, + "loss": 0.8627, + "step": 184200 + }, + { + "epoch": 14.274865357045991, + "grad_norm": 1.5690241562544796, + "learning_rate": 7.137709237445754e-07, + "loss": 0.87, + "step": 184210 + }, + { + "epoch": 14.275640280522298, + "grad_norm": 1.4245203317652526, + "learning_rate": 7.138096714197149e-07, + "loss": 0.8734, + "step": 184220 + }, + { + "epoch": 14.276415203998605, + "grad_norm": 1.5208939229510976, + "learning_rate": 7.138484190948544e-07, + "loss": 0.8463, + "step": 184230 + }, + { + "epoch": 14.277190127474912, + "grad_norm": 1.535283760832821, + "learning_rate": 7.138871667699938e-07, + "loss": 0.8526, + "step": 184240 + }, + { + "epoch": 14.277965050951218, + "grad_norm": 1.4875506902947837, + "learning_rate": 7.139259144451334e-07, + "loss": 0.8504, + "step": 184250 + }, + { + "epoch": 14.278739974427525, + "grad_norm": 1.5715589603821205, + "learning_rate": 7.139646621202728e-07, + "loss": 0.8694, + "step": 184260 + }, + { + "epoch": 14.279514897903832, + "grad_norm": 1.4271922272030306, + "learning_rate": 7.140034097954124e-07, + "loss": 0.8751, + "step": 184270 + }, + { + "epoch": 14.280289821380139, + "grad_norm": 1.5221050360129227, + "learning_rate": 7.140421574705518e-07, + "loss": 0.8823, + "step": 184280 + }, + { + "epoch": 14.281064744856446, + "grad_norm": 1.4550184575027771, + "learning_rate": 7.140809051456914e-07, + "loss": 0.8671, + "step": 184290 + }, + { + "epoch": 14.281839668332752, + "grad_norm": 1.5645133448096478, + "learning_rate": 7.141196528208308e-07, + "loss": 0.8962, + "step": 184300 + }, + { + "epoch": 14.28261459180906, + "grad_norm": 1.6779072805274806, + "learning_rate": 7.141584004959704e-07, + "loss": 0.8584, + "step": 184310 + }, + { + "epoch": 14.283389515285366, + "grad_norm": 1.5957698779778178, + "learning_rate": 7.141971481711098e-07, + "loss": 0.865, + "step": 184320 + }, + { + "epoch": 14.284164438761673, + "grad_norm": 1.473446879538907, + "learning_rate": 7.142358958462493e-07, + "loss": 0.8602, + "step": 184330 + }, + { + "epoch": 14.28493936223798, + "grad_norm": 1.5516640995597137, + "learning_rate": 7.142746435213887e-07, + "loss": 0.8393, + "step": 184340 + }, + { + "epoch": 14.285714285714286, + "grad_norm": 1.5255669950228654, + "learning_rate": 7.143133911965283e-07, + "loss": 0.8672, + "step": 184350 + }, + { + "epoch": 14.286489209190593, + "grad_norm": 1.4452986454403807, + "learning_rate": 7.143521388716677e-07, + "loss": 0.8524, + "step": 184360 + }, + { + "epoch": 14.287264132666898, + "grad_norm": 1.4089087782434495, + "learning_rate": 7.143908865468073e-07, + "loss": 0.8572, + "step": 184370 + }, + { + "epoch": 14.288039056143205, + "grad_norm": 1.541316276621472, + "learning_rate": 7.144296342219467e-07, + "loss": 0.8507, + "step": 184380 + }, + { + "epoch": 14.288813979619512, + "grad_norm": 1.5414743330452385, + "learning_rate": 7.144683818970863e-07, + "loss": 0.8636, + "step": 184390 + }, + { + "epoch": 14.289588903095819, + "grad_norm": 1.4983458599338397, + "learning_rate": 7.145071295722257e-07, + "loss": 0.8801, + "step": 184400 + }, + { + "epoch": 14.290363826572126, + "grad_norm": 1.5523367670272648, + "learning_rate": 7.145458772473653e-07, + "loss": 0.8604, + "step": 184410 + }, + { + "epoch": 14.291138750048432, + "grad_norm": 1.5337363156791524, + "learning_rate": 7.145846249225047e-07, + "loss": 0.8589, + "step": 184420 + }, + { + "epoch": 14.29191367352474, + "grad_norm": 1.5537311913504737, + "learning_rate": 7.146233725976443e-07, + "loss": 0.8572, + "step": 184430 + }, + { + "epoch": 14.292688597001046, + "grad_norm": 1.499919522999176, + "learning_rate": 7.146621202727837e-07, + "loss": 0.8671, + "step": 184440 + }, + { + "epoch": 14.293463520477353, + "grad_norm": 1.569352831867998, + "learning_rate": 7.147008679479232e-07, + "loss": 0.8683, + "step": 184450 + }, + { + "epoch": 14.29423844395366, + "grad_norm": 1.5556389893941636, + "learning_rate": 7.147396156230627e-07, + "loss": 0.8713, + "step": 184460 + }, + { + "epoch": 14.295013367429966, + "grad_norm": 1.5998511183344473, + "learning_rate": 7.147783632982022e-07, + "loss": 0.8573, + "step": 184470 + }, + { + "epoch": 14.295788290906273, + "grad_norm": 1.5652591559366824, + "learning_rate": 7.148171109733416e-07, + "loss": 0.8717, + "step": 184480 + }, + { + "epoch": 14.29656321438258, + "grad_norm": 1.554221011486393, + "learning_rate": 7.148558586484812e-07, + "loss": 0.8715, + "step": 184490 + }, + { + "epoch": 14.297338137858887, + "grad_norm": 1.5643183359546964, + "learning_rate": 7.148946063236206e-07, + "loss": 0.8785, + "step": 184500 + }, + { + "epoch": 14.297338137858887, + "eval_loss": 0.8931780457496643, + "eval_runtime": 329.744, + "eval_samples_per_second": 34.788, + "eval_steps_per_second": 8.698, + "step": 184500 + }, + { + "epoch": 14.298113061335194, + "grad_norm": 1.4428474548215242, + "learning_rate": 7.149333539987602e-07, + "loss": 0.8866, + "step": 184510 + }, + { + "epoch": 14.2988879848115, + "grad_norm": 1.6004465075059677, + "learning_rate": 7.149721016738996e-07, + "loss": 0.8743, + "step": 184520 + }, + { + "epoch": 14.299662908287807, + "grad_norm": 1.6246053180314317, + "learning_rate": 7.150108493490392e-07, + "loss": 0.8607, + "step": 184530 + }, + { + "epoch": 14.300437831764114, + "grad_norm": 1.564435737294796, + "learning_rate": 7.150495970241786e-07, + "loss": 0.8733, + "step": 184540 + }, + { + "epoch": 14.301212755240421, + "grad_norm": 1.5910601249852934, + "learning_rate": 7.150883446993181e-07, + "loss": 0.8948, + "step": 184550 + }, + { + "epoch": 14.301987678716726, + "grad_norm": 1.5397523734444984, + "learning_rate": 7.151270923744576e-07, + "loss": 0.8424, + "step": 184560 + }, + { + "epoch": 14.302762602193033, + "grad_norm": 1.5837323293215817, + "learning_rate": 7.151658400495972e-07, + "loss": 0.8854, + "step": 184570 + }, + { + "epoch": 14.30353752566934, + "grad_norm": 1.596216973184072, + "learning_rate": 7.152045877247366e-07, + "loss": 0.8716, + "step": 184580 + }, + { + "epoch": 14.304312449145646, + "grad_norm": 1.4945482877916088, + "learning_rate": 7.152433353998761e-07, + "loss": 0.8749, + "step": 184590 + }, + { + "epoch": 14.305087372621953, + "grad_norm": 1.5066641697729226, + "learning_rate": 7.152820830750155e-07, + "loss": 0.8488, + "step": 184600 + }, + { + "epoch": 14.30586229609826, + "grad_norm": 1.5744309824587888, + "learning_rate": 7.153208307501551e-07, + "loss": 0.8558, + "step": 184610 + }, + { + "epoch": 14.306637219574567, + "grad_norm": 1.5103230430164485, + "learning_rate": 7.153595784252945e-07, + "loss": 0.8561, + "step": 184620 + }, + { + "epoch": 14.307412143050874, + "grad_norm": 1.486298729985601, + "learning_rate": 7.153983261004341e-07, + "loss": 0.8651, + "step": 184630 + }, + { + "epoch": 14.30818706652718, + "grad_norm": 1.5887326793548449, + "learning_rate": 7.154370737755735e-07, + "loss": 0.8857, + "step": 184640 + }, + { + "epoch": 14.308961990003487, + "grad_norm": 1.5240474752437745, + "learning_rate": 7.15475821450713e-07, + "loss": 0.8583, + "step": 184650 + }, + { + "epoch": 14.309736913479794, + "grad_norm": 1.4773279490554159, + "learning_rate": 7.155145691258525e-07, + "loss": 0.8634, + "step": 184660 + }, + { + "epoch": 14.3105118369561, + "grad_norm": 1.5439695693374353, + "learning_rate": 7.155533168009921e-07, + "loss": 0.8579, + "step": 184670 + }, + { + "epoch": 14.311286760432408, + "grad_norm": 1.5360001762567101, + "learning_rate": 7.155920644761315e-07, + "loss": 0.8601, + "step": 184680 + }, + { + "epoch": 14.312061683908714, + "grad_norm": 1.4481469460789194, + "learning_rate": 7.15630812151271e-07, + "loss": 0.8803, + "step": 184690 + }, + { + "epoch": 14.312836607385021, + "grad_norm": 1.4567102709738493, + "learning_rate": 7.156695598264104e-07, + "loss": 0.8596, + "step": 184700 + }, + { + "epoch": 14.313611530861328, + "grad_norm": 1.6102215994549476, + "learning_rate": 7.157083075015501e-07, + "loss": 0.8649, + "step": 184710 + }, + { + "epoch": 14.314386454337635, + "grad_norm": 1.5743198621078158, + "learning_rate": 7.157470551766895e-07, + "loss": 0.8512, + "step": 184720 + }, + { + "epoch": 14.315161377813942, + "grad_norm": 1.5972957217013268, + "learning_rate": 7.15785802851829e-07, + "loss": 0.8663, + "step": 184730 + }, + { + "epoch": 14.315936301290247, + "grad_norm": 1.4435165814874453, + "learning_rate": 7.158245505269684e-07, + "loss": 0.8549, + "step": 184740 + }, + { + "epoch": 14.316711224766554, + "grad_norm": 1.4968122363929164, + "learning_rate": 7.158632982021079e-07, + "loss": 0.8575, + "step": 184750 + }, + { + "epoch": 14.31748614824286, + "grad_norm": 1.5241591688302574, + "learning_rate": 7.159020458772474e-07, + "loss": 0.8544, + "step": 184760 + }, + { + "epoch": 14.318261071719167, + "grad_norm": 1.584189200239884, + "learning_rate": 7.15940793552387e-07, + "loss": 0.8652, + "step": 184770 + }, + { + "epoch": 14.319035995195474, + "grad_norm": 1.5406637037477438, + "learning_rate": 7.159795412275264e-07, + "loss": 0.8703, + "step": 184780 + }, + { + "epoch": 14.31981091867178, + "grad_norm": 1.667637946472761, + "learning_rate": 7.160182889026659e-07, + "loss": 0.8549, + "step": 184790 + }, + { + "epoch": 14.320585842148088, + "grad_norm": 1.536911413309468, + "learning_rate": 7.160570365778053e-07, + "loss": 0.8628, + "step": 184800 + }, + { + "epoch": 14.321360765624394, + "grad_norm": 1.513853372318486, + "learning_rate": 7.16095784252945e-07, + "loss": 0.8859, + "step": 184810 + }, + { + "epoch": 14.322135689100701, + "grad_norm": 1.6973823678938513, + "learning_rate": 7.161345319280844e-07, + "loss": 0.864, + "step": 184820 + }, + { + "epoch": 14.322910612577008, + "grad_norm": 1.4973150762121046, + "learning_rate": 7.161732796032239e-07, + "loss": 0.8502, + "step": 184830 + }, + { + "epoch": 14.323685536053315, + "grad_norm": 1.5475284930679962, + "learning_rate": 7.162120272783633e-07, + "loss": 0.8621, + "step": 184840 + }, + { + "epoch": 14.324460459529622, + "grad_norm": 1.629520208600283, + "learning_rate": 7.16250774953503e-07, + "loss": 0.881, + "step": 184850 + }, + { + "epoch": 14.325235383005928, + "grad_norm": 1.56799068049649, + "learning_rate": 7.162895226286424e-07, + "loss": 0.8771, + "step": 184860 + }, + { + "epoch": 14.326010306482235, + "grad_norm": 1.5181055771939453, + "learning_rate": 7.163282703037819e-07, + "loss": 0.8715, + "step": 184870 + }, + { + "epoch": 14.326785229958542, + "grad_norm": 1.4549125635892797, + "learning_rate": 7.163670179789213e-07, + "loss": 0.8736, + "step": 184880 + }, + { + "epoch": 14.327560153434849, + "grad_norm": 1.4715682496967, + "learning_rate": 7.164057656540608e-07, + "loss": 0.8765, + "step": 184890 + }, + { + "epoch": 14.328335076911156, + "grad_norm": 1.5903234511340478, + "learning_rate": 7.164445133292002e-07, + "loss": 0.8633, + "step": 184900 + }, + { + "epoch": 14.329110000387463, + "grad_norm": 1.5713621179982915, + "learning_rate": 7.164832610043399e-07, + "loss": 0.8643, + "step": 184910 + }, + { + "epoch": 14.32988492386377, + "grad_norm": 1.6162666076995005, + "learning_rate": 7.165220086794793e-07, + "loss": 0.8698, + "step": 184920 + }, + { + "epoch": 14.330659847340074, + "grad_norm": 1.4823374982984319, + "learning_rate": 7.165607563546188e-07, + "loss": 0.876, + "step": 184930 + }, + { + "epoch": 14.331434770816381, + "grad_norm": 1.5441517181077598, + "learning_rate": 7.165995040297582e-07, + "loss": 0.8694, + "step": 184940 + }, + { + "epoch": 14.332209694292688, + "grad_norm": 1.5856356369730211, + "learning_rate": 7.166382517048979e-07, + "loss": 0.8503, + "step": 184950 + }, + { + "epoch": 14.332984617768995, + "grad_norm": 1.5377016143057975, + "learning_rate": 7.166769993800373e-07, + "loss": 0.8798, + "step": 184960 + }, + { + "epoch": 14.333759541245302, + "grad_norm": 1.5824970227131232, + "learning_rate": 7.167157470551768e-07, + "loss": 0.8595, + "step": 184970 + }, + { + "epoch": 14.334534464721608, + "grad_norm": 1.4512937415654974, + "learning_rate": 7.167544947303162e-07, + "loss": 0.876, + "step": 184980 + }, + { + "epoch": 14.335309388197915, + "grad_norm": 1.4579946805232273, + "learning_rate": 7.167932424054557e-07, + "loss": 0.8718, + "step": 184990 + }, + { + "epoch": 14.336084311674222, + "grad_norm": 1.570010508334759, + "learning_rate": 7.168319900805953e-07, + "loss": 0.8695, + "step": 185000 + }, + { + "epoch": 14.336084311674222, + "eval_loss": 0.893106997013092, + "eval_runtime": 330.595, + "eval_samples_per_second": 34.698, + "eval_steps_per_second": 8.675, + "step": 185000 + }, + { + "epoch": 14.336859235150529, + "grad_norm": 1.5649915180794247, + "learning_rate": 7.168707377557348e-07, + "loss": 0.852, + "step": 185010 + }, + { + "epoch": 14.337634158626836, + "grad_norm": 1.5207560048466158, + "learning_rate": 7.169094854308742e-07, + "loss": 0.8922, + "step": 185020 + }, + { + "epoch": 14.338409082103142, + "grad_norm": 1.5292635201474956, + "learning_rate": 7.169482331060137e-07, + "loss": 0.8686, + "step": 185030 + }, + { + "epoch": 14.33918400557945, + "grad_norm": 1.526992570237347, + "learning_rate": 7.169869807811531e-07, + "loss": 0.9046, + "step": 185040 + }, + { + "epoch": 14.339958929055756, + "grad_norm": 1.5870778387052764, + "learning_rate": 7.170257284562928e-07, + "loss": 0.8883, + "step": 185050 + }, + { + "epoch": 14.340733852532063, + "grad_norm": 1.4762120954977873, + "learning_rate": 7.170644761314322e-07, + "loss": 0.8847, + "step": 185060 + }, + { + "epoch": 14.34150877600837, + "grad_norm": 1.4522956303540633, + "learning_rate": 7.171032238065717e-07, + "loss": 0.8583, + "step": 185070 + }, + { + "epoch": 14.342283699484677, + "grad_norm": 1.5565918410070245, + "learning_rate": 7.171419714817111e-07, + "loss": 0.838, + "step": 185080 + }, + { + "epoch": 14.343058622960983, + "grad_norm": 1.5122140875186592, + "learning_rate": 7.171807191568507e-07, + "loss": 0.8598, + "step": 185090 + }, + { + "epoch": 14.34383354643729, + "grad_norm": 1.5141105387964813, + "learning_rate": 7.172194668319902e-07, + "loss": 0.8631, + "step": 185100 + }, + { + "epoch": 14.344608469913595, + "grad_norm": 1.7025734038284455, + "learning_rate": 7.172582145071297e-07, + "loss": 0.8681, + "step": 185110 + }, + { + "epoch": 14.345383393389902, + "grad_norm": 1.5993870941639918, + "learning_rate": 7.172969621822691e-07, + "loss": 0.8743, + "step": 185120 + }, + { + "epoch": 14.346158316866209, + "grad_norm": 1.6360136958121279, + "learning_rate": 7.173357098574086e-07, + "loss": 0.8719, + "step": 185130 + }, + { + "epoch": 14.346933240342516, + "grad_norm": 1.5871841483408826, + "learning_rate": 7.173744575325481e-07, + "loss": 0.8516, + "step": 185140 + }, + { + "epoch": 14.347708163818822, + "grad_norm": 1.4193333083838044, + "learning_rate": 7.174132052076877e-07, + "loss": 0.8652, + "step": 185150 + }, + { + "epoch": 14.34848308729513, + "grad_norm": 1.4859089825267522, + "learning_rate": 7.174519528828271e-07, + "loss": 0.8591, + "step": 185160 + }, + { + "epoch": 14.349258010771436, + "grad_norm": 1.4761188973766437, + "learning_rate": 7.174907005579666e-07, + "loss": 0.865, + "step": 185170 + }, + { + "epoch": 14.350032934247743, + "grad_norm": 1.4646842348037206, + "learning_rate": 7.17529448233106e-07, + "loss": 0.8548, + "step": 185180 + }, + { + "epoch": 14.35080785772405, + "grad_norm": 1.5795702960890479, + "learning_rate": 7.175681959082456e-07, + "loss": 0.8608, + "step": 185190 + }, + { + "epoch": 14.351582781200356, + "grad_norm": 1.4697136645669586, + "learning_rate": 7.176069435833851e-07, + "loss": 0.867, + "step": 185200 + }, + { + "epoch": 14.352357704676663, + "grad_norm": 1.550111432494634, + "learning_rate": 7.176456912585246e-07, + "loss": 0.8872, + "step": 185210 + }, + { + "epoch": 14.35313262815297, + "grad_norm": 1.5901464796587161, + "learning_rate": 7.17684438933664e-07, + "loss": 0.8634, + "step": 185220 + }, + { + "epoch": 14.353907551629277, + "grad_norm": 1.6652101209803982, + "learning_rate": 7.177231866088036e-07, + "loss": 0.8777, + "step": 185230 + }, + { + "epoch": 14.354682475105584, + "grad_norm": 1.5465606760752126, + "learning_rate": 7.17761934283943e-07, + "loss": 0.8684, + "step": 185240 + }, + { + "epoch": 14.35545739858189, + "grad_norm": 1.5766320821018078, + "learning_rate": 7.178006819590826e-07, + "loss": 0.8647, + "step": 185250 + }, + { + "epoch": 14.356232322058197, + "grad_norm": 1.4915756178150381, + "learning_rate": 7.17839429634222e-07, + "loss": 0.8582, + "step": 185260 + }, + { + "epoch": 14.357007245534504, + "grad_norm": 1.6822542667681561, + "learning_rate": 7.178781773093615e-07, + "loss": 0.8695, + "step": 185270 + }, + { + "epoch": 14.357782169010811, + "grad_norm": 1.617762706329565, + "learning_rate": 7.17916924984501e-07, + "loss": 0.8781, + "step": 185280 + }, + { + "epoch": 14.358557092487118, + "grad_norm": 1.4512271836421062, + "learning_rate": 7.179556726596405e-07, + "loss": 0.8652, + "step": 185290 + }, + { + "epoch": 14.359332015963423, + "grad_norm": 1.5932342057846014, + "learning_rate": 7.1799442033478e-07, + "loss": 0.8648, + "step": 185300 + }, + { + "epoch": 14.36010693943973, + "grad_norm": 1.491034069382152, + "learning_rate": 7.180331680099195e-07, + "loss": 0.8661, + "step": 185310 + }, + { + "epoch": 14.360881862916036, + "grad_norm": 1.5182841046364919, + "learning_rate": 7.180719156850589e-07, + "loss": 0.8432, + "step": 185320 + }, + { + "epoch": 14.361656786392343, + "grad_norm": 1.4940013520405055, + "learning_rate": 7.181106633601985e-07, + "loss": 0.8488, + "step": 185330 + }, + { + "epoch": 14.36243170986865, + "grad_norm": 1.5930848099998922, + "learning_rate": 7.181494110353379e-07, + "loss": 0.8878, + "step": 185340 + }, + { + "epoch": 14.363206633344957, + "grad_norm": 1.536937015873921, + "learning_rate": 7.181881587104775e-07, + "loss": 0.8506, + "step": 185350 + }, + { + "epoch": 14.363981556821264, + "grad_norm": 1.5736966770400218, + "learning_rate": 7.182269063856169e-07, + "loss": 0.872, + "step": 185360 + }, + { + "epoch": 14.36475648029757, + "grad_norm": 1.561311427374137, + "learning_rate": 7.182656540607565e-07, + "loss": 0.8522, + "step": 185370 + }, + { + "epoch": 14.365531403773877, + "grad_norm": 1.5563821107646096, + "learning_rate": 7.183044017358959e-07, + "loss": 0.8524, + "step": 185380 + }, + { + "epoch": 14.366306327250184, + "grad_norm": 1.4997743360902969, + "learning_rate": 7.183431494110354e-07, + "loss": 0.866, + "step": 185390 + }, + { + "epoch": 14.367081250726491, + "grad_norm": 1.535882324542105, + "learning_rate": 7.183818970861749e-07, + "loss": 0.856, + "step": 185400 + }, + { + "epoch": 14.367856174202798, + "grad_norm": 1.5152378646579163, + "learning_rate": 7.184206447613144e-07, + "loss": 0.8588, + "step": 185410 + }, + { + "epoch": 14.368631097679105, + "grad_norm": 1.5772198599081697, + "learning_rate": 7.184593924364539e-07, + "loss": 0.8568, + "step": 185420 + }, + { + "epoch": 14.369406021155411, + "grad_norm": 1.623561622536576, + "learning_rate": 7.184981401115934e-07, + "loss": 0.869, + "step": 185430 + }, + { + "epoch": 14.370180944631718, + "grad_norm": 1.5571510870267498, + "learning_rate": 7.185368877867328e-07, + "loss": 0.8831, + "step": 185440 + }, + { + "epoch": 14.370955868108025, + "grad_norm": 1.5030936647895872, + "learning_rate": 7.185756354618724e-07, + "loss": 0.8734, + "step": 185450 + }, + { + "epoch": 14.371730791584332, + "grad_norm": 1.4783305268365043, + "learning_rate": 7.186143831370118e-07, + "loss": 0.8541, + "step": 185460 + }, + { + "epoch": 14.372505715060639, + "grad_norm": 1.505734255864562, + "learning_rate": 7.186531308121514e-07, + "loss": 0.8798, + "step": 185470 + }, + { + "epoch": 14.373280638536944, + "grad_norm": 1.4408658623894253, + "learning_rate": 7.186918784872908e-07, + "loss": 0.8699, + "step": 185480 + }, + { + "epoch": 14.37405556201325, + "grad_norm": 1.5891289175761218, + "learning_rate": 7.187306261624303e-07, + "loss": 0.8653, + "step": 185490 + }, + { + "epoch": 14.374830485489557, + "grad_norm": 1.563897047924361, + "learning_rate": 7.187693738375698e-07, + "loss": 0.8752, + "step": 185500 + }, + { + "epoch": 14.374830485489557, + "eval_loss": 0.8931103944778442, + "eval_runtime": 332.0283, + "eval_samples_per_second": 34.548, + "eval_steps_per_second": 8.638, + "step": 185500 + }, + { + "epoch": 14.375605408965864, + "grad_norm": 1.4442409213331104, + "learning_rate": 7.188081215127094e-07, + "loss": 0.8502, + "step": 185510 + }, + { + "epoch": 14.37638033244217, + "grad_norm": 1.564027578852739, + "learning_rate": 7.188468691878488e-07, + "loss": 0.8811, + "step": 185520 + }, + { + "epoch": 14.377155255918478, + "grad_norm": 1.6300117235326455, + "learning_rate": 7.188856168629883e-07, + "loss": 0.8803, + "step": 185530 + }, + { + "epoch": 14.377930179394784, + "grad_norm": 1.4952515779145183, + "learning_rate": 7.189243645381277e-07, + "loss": 0.8548, + "step": 185540 + }, + { + "epoch": 14.378705102871091, + "grad_norm": 1.5086812800518392, + "learning_rate": 7.189631122132673e-07, + "loss": 0.8817, + "step": 185550 + }, + { + "epoch": 14.379480026347398, + "grad_norm": 1.5123136182933012, + "learning_rate": 7.190018598884067e-07, + "loss": 0.865, + "step": 185560 + }, + { + "epoch": 14.380254949823705, + "grad_norm": 1.4871138473583985, + "learning_rate": 7.190406075635463e-07, + "loss": 0.8643, + "step": 185570 + }, + { + "epoch": 14.381029873300012, + "grad_norm": 1.7201050520993244, + "learning_rate": 7.190793552386857e-07, + "loss": 0.8746, + "step": 185580 + }, + { + "epoch": 14.381804796776319, + "grad_norm": 1.6193993819167654, + "learning_rate": 7.191181029138252e-07, + "loss": 0.8718, + "step": 185590 + }, + { + "epoch": 14.382579720252625, + "grad_norm": 1.558413279990059, + "learning_rate": 7.191568505889647e-07, + "loss": 0.8597, + "step": 185600 + }, + { + "epoch": 14.383354643728932, + "grad_norm": 1.4475430599539223, + "learning_rate": 7.191955982641043e-07, + "loss": 0.8707, + "step": 185610 + }, + { + "epoch": 14.384129567205239, + "grad_norm": 1.5633511175172332, + "learning_rate": 7.192343459392437e-07, + "loss": 0.8711, + "step": 185620 + }, + { + "epoch": 14.384904490681546, + "grad_norm": 1.7132368017074633, + "learning_rate": 7.192730936143832e-07, + "loss": 0.8817, + "step": 185630 + }, + { + "epoch": 14.385679414157853, + "grad_norm": 1.6410950897158652, + "learning_rate": 7.193118412895226e-07, + "loss": 0.8766, + "step": 185640 + }, + { + "epoch": 14.38645433763416, + "grad_norm": 1.6203475008456896, + "learning_rate": 7.193505889646623e-07, + "loss": 0.8749, + "step": 185650 + }, + { + "epoch": 14.387229261110466, + "grad_norm": 1.3914536338620542, + "learning_rate": 7.193893366398017e-07, + "loss": 0.8655, + "step": 185660 + }, + { + "epoch": 14.388004184586771, + "grad_norm": 1.5506689306905976, + "learning_rate": 7.194280843149412e-07, + "loss": 0.8695, + "step": 185670 + }, + { + "epoch": 14.388779108063078, + "grad_norm": 1.6361992652106396, + "learning_rate": 7.194668319900806e-07, + "loss": 0.8746, + "step": 185680 + }, + { + "epoch": 14.389554031539385, + "grad_norm": 1.54882620157463, + "learning_rate": 7.195055796652201e-07, + "loss": 0.8745, + "step": 185690 + }, + { + "epoch": 14.390328955015692, + "grad_norm": 1.597799518694798, + "learning_rate": 7.195443273403596e-07, + "loss": 0.8845, + "step": 185700 + }, + { + "epoch": 14.391103878491998, + "grad_norm": 1.5361030567717144, + "learning_rate": 7.195830750154992e-07, + "loss": 0.8525, + "step": 185710 + }, + { + "epoch": 14.391878801968305, + "grad_norm": 1.5415813793962263, + "learning_rate": 7.196218226906386e-07, + "loss": 0.8867, + "step": 185720 + }, + { + "epoch": 14.392653725444612, + "grad_norm": 1.4299567429716353, + "learning_rate": 7.196605703657781e-07, + "loss": 0.8532, + "step": 185730 + }, + { + "epoch": 14.393428648920919, + "grad_norm": 1.5069246696998397, + "learning_rate": 7.196993180409175e-07, + "loss": 0.8674, + "step": 185740 + }, + { + "epoch": 14.394203572397226, + "grad_norm": 1.5626441920444532, + "learning_rate": 7.197380657160572e-07, + "loss": 0.8587, + "step": 185750 + }, + { + "epoch": 14.394978495873533, + "grad_norm": 1.4495369658382387, + "learning_rate": 7.197768133911966e-07, + "loss": 0.8643, + "step": 185760 + }, + { + "epoch": 14.39575341934984, + "grad_norm": 1.5168648358781747, + "learning_rate": 7.198155610663361e-07, + "loss": 0.835, + "step": 185770 + }, + { + "epoch": 14.396528342826146, + "grad_norm": 1.517479861329931, + "learning_rate": 7.198543087414755e-07, + "loss": 0.8882, + "step": 185780 + }, + { + "epoch": 14.397303266302453, + "grad_norm": 1.5620465563236647, + "learning_rate": 7.198930564166152e-07, + "loss": 0.8733, + "step": 185790 + }, + { + "epoch": 14.39807818977876, + "grad_norm": 1.547565667554109, + "learning_rate": 7.199318040917546e-07, + "loss": 0.8755, + "step": 185800 + }, + { + "epoch": 14.398853113255067, + "grad_norm": 1.5539443208744468, + "learning_rate": 7.199705517668941e-07, + "loss": 0.8832, + "step": 185810 + }, + { + "epoch": 14.399628036731373, + "grad_norm": 1.504559016875564, + "learning_rate": 7.200092994420335e-07, + "loss": 0.8805, + "step": 185820 + }, + { + "epoch": 14.40040296020768, + "grad_norm": 1.4800832241650874, + "learning_rate": 7.20048047117173e-07, + "loss": 0.8726, + "step": 185830 + }, + { + "epoch": 14.401177883683987, + "grad_norm": 1.4958011568870424, + "learning_rate": 7.200867947923125e-07, + "loss": 0.8466, + "step": 185840 + }, + { + "epoch": 14.401952807160292, + "grad_norm": 1.5534547250375916, + "learning_rate": 7.201255424674521e-07, + "loss": 0.8538, + "step": 185850 + }, + { + "epoch": 14.402727730636599, + "grad_norm": 1.5697679061504273, + "learning_rate": 7.201642901425915e-07, + "loss": 0.8655, + "step": 185860 + }, + { + "epoch": 14.403502654112906, + "grad_norm": 1.5517962113706452, + "learning_rate": 7.20203037817731e-07, + "loss": 0.8906, + "step": 185870 + }, + { + "epoch": 14.404277577589212, + "grad_norm": 1.4943465572437447, + "learning_rate": 7.202417854928704e-07, + "loss": 0.8394, + "step": 185880 + }, + { + "epoch": 14.40505250106552, + "grad_norm": 1.5532464276136744, + "learning_rate": 7.202805331680101e-07, + "loss": 0.8675, + "step": 185890 + }, + { + "epoch": 14.405827424541826, + "grad_norm": 1.5365336028595558, + "learning_rate": 7.203192808431495e-07, + "loss": 0.8536, + "step": 185900 + }, + { + "epoch": 14.406602348018133, + "grad_norm": 1.5235145825219392, + "learning_rate": 7.20358028518289e-07, + "loss": 0.8756, + "step": 185910 + }, + { + "epoch": 14.40737727149444, + "grad_norm": 1.572495577263898, + "learning_rate": 7.203967761934284e-07, + "loss": 0.8783, + "step": 185920 + }, + { + "epoch": 14.408152194970747, + "grad_norm": 1.5771182752480672, + "learning_rate": 7.20435523868568e-07, + "loss": 0.8836, + "step": 185930 + }, + { + "epoch": 14.408927118447053, + "grad_norm": 1.720127293955178, + "learning_rate": 7.204742715437075e-07, + "loss": 0.8538, + "step": 185940 + }, + { + "epoch": 14.40970204192336, + "grad_norm": 1.6346436768757782, + "learning_rate": 7.20513019218847e-07, + "loss": 0.8692, + "step": 185950 + }, + { + "epoch": 14.410476965399667, + "grad_norm": 1.506924965301869, + "learning_rate": 7.205517668939864e-07, + "loss": 0.8748, + "step": 185960 + }, + { + "epoch": 14.411251888875974, + "grad_norm": 1.5757432560185642, + "learning_rate": 7.205905145691259e-07, + "loss": 0.874, + "step": 185970 + }, + { + "epoch": 14.41202681235228, + "grad_norm": 1.5234605526694536, + "learning_rate": 7.206292622442653e-07, + "loss": 0.8647, + "step": 185980 + }, + { + "epoch": 14.412801735828587, + "grad_norm": 1.5203192478343106, + "learning_rate": 7.20668009919405e-07, + "loss": 0.876, + "step": 185990 + }, + { + "epoch": 14.413576659304894, + "grad_norm": 1.5340818725089573, + "learning_rate": 7.207067575945444e-07, + "loss": 0.8731, + "step": 186000 + }, + { + "epoch": 14.413576659304894, + "eval_loss": 0.8926821351051331, + "eval_runtime": 331.4793, + "eval_samples_per_second": 34.605, + "eval_steps_per_second": 8.652, + "step": 186000 + }, + { + "epoch": 14.414351582781201, + "grad_norm": 1.4925716108367735, + "learning_rate": 7.207455052696839e-07, + "loss": 0.8701, + "step": 186010 + }, + { + "epoch": 14.415126506257508, + "grad_norm": 1.5380732610447008, + "learning_rate": 7.207842529448233e-07, + "loss": 0.8534, + "step": 186020 + }, + { + "epoch": 14.415901429733815, + "grad_norm": 1.6638944899812265, + "learning_rate": 7.208230006199629e-07, + "loss": 0.8627, + "step": 186030 + }, + { + "epoch": 14.41667635321012, + "grad_norm": 1.584299559285424, + "learning_rate": 7.208617482951024e-07, + "loss": 0.8709, + "step": 186040 + }, + { + "epoch": 14.417451276686426, + "grad_norm": 1.5213238401229146, + "learning_rate": 7.209004959702419e-07, + "loss": 0.8781, + "step": 186050 + }, + { + "epoch": 14.418226200162733, + "grad_norm": 1.5645493595527213, + "learning_rate": 7.209392436453813e-07, + "loss": 0.8821, + "step": 186060 + }, + { + "epoch": 14.41900112363904, + "grad_norm": 1.5650054033488492, + "learning_rate": 7.209779913205209e-07, + "loss": 0.8653, + "step": 186070 + }, + { + "epoch": 14.419776047115347, + "grad_norm": 1.540810621440229, + "learning_rate": 7.210167389956603e-07, + "loss": 0.877, + "step": 186080 + }, + { + "epoch": 14.420550970591654, + "grad_norm": 1.5647653392465812, + "learning_rate": 7.210554866707999e-07, + "loss": 0.8782, + "step": 186090 + }, + { + "epoch": 14.42132589406796, + "grad_norm": 1.5213985715512504, + "learning_rate": 7.210942343459393e-07, + "loss": 0.8542, + "step": 186100 + }, + { + "epoch": 14.422100817544267, + "grad_norm": 1.4890018394385616, + "learning_rate": 7.211329820210788e-07, + "loss": 0.8632, + "step": 186110 + }, + { + "epoch": 14.422875741020574, + "grad_norm": 1.429672287317297, + "learning_rate": 7.211717296962182e-07, + "loss": 0.8691, + "step": 186120 + }, + { + "epoch": 14.423650664496881, + "grad_norm": 1.469152747067032, + "learning_rate": 7.212104773713578e-07, + "loss": 0.8758, + "step": 186130 + }, + { + "epoch": 14.424425587973188, + "grad_norm": 1.562902045182587, + "learning_rate": 7.212492250464973e-07, + "loss": 0.8678, + "step": 186140 + }, + { + "epoch": 14.425200511449495, + "grad_norm": 1.4971926577872194, + "learning_rate": 7.212879727216368e-07, + "loss": 0.8671, + "step": 186150 + }, + { + "epoch": 14.425975434925801, + "grad_norm": 1.5437053201037239, + "learning_rate": 7.213267203967762e-07, + "loss": 0.8553, + "step": 186160 + }, + { + "epoch": 14.426750358402108, + "grad_norm": 1.6090773891193484, + "learning_rate": 7.213654680719158e-07, + "loss": 0.8939, + "step": 186170 + }, + { + "epoch": 14.427525281878415, + "grad_norm": 1.506409099594679, + "learning_rate": 7.214042157470552e-07, + "loss": 0.8647, + "step": 186180 + }, + { + "epoch": 14.428300205354722, + "grad_norm": 1.547461533617321, + "learning_rate": 7.214429634221948e-07, + "loss": 0.8602, + "step": 186190 + }, + { + "epoch": 14.429075128831029, + "grad_norm": 1.5248897342406957, + "learning_rate": 7.214817110973342e-07, + "loss": 0.8808, + "step": 186200 + }, + { + "epoch": 14.429850052307335, + "grad_norm": 1.4801975435101669, + "learning_rate": 7.215204587724738e-07, + "loss": 0.8578, + "step": 186210 + }, + { + "epoch": 14.43062497578364, + "grad_norm": 1.4822290215435983, + "learning_rate": 7.215592064476132e-07, + "loss": 0.88, + "step": 186220 + }, + { + "epoch": 14.431399899259947, + "grad_norm": 1.5516226281990049, + "learning_rate": 7.215979541227527e-07, + "loss": 0.8677, + "step": 186230 + }, + { + "epoch": 14.432174822736254, + "grad_norm": 1.5030843729476826, + "learning_rate": 7.216367017978922e-07, + "loss": 0.8648, + "step": 186240 + }, + { + "epoch": 14.43294974621256, + "grad_norm": 1.5708990725730434, + "learning_rate": 7.216754494730317e-07, + "loss": 0.856, + "step": 186250 + }, + { + "epoch": 14.433724669688868, + "grad_norm": 1.5600452068583697, + "learning_rate": 7.217141971481711e-07, + "loss": 0.8595, + "step": 186260 + }, + { + "epoch": 14.434499593165175, + "grad_norm": 1.5519549966632133, + "learning_rate": 7.217529448233107e-07, + "loss": 0.8719, + "step": 186270 + }, + { + "epoch": 14.435274516641481, + "grad_norm": 1.5058043391541724, + "learning_rate": 7.217916924984501e-07, + "loss": 0.861, + "step": 186280 + }, + { + "epoch": 14.436049440117788, + "grad_norm": 1.607001782017282, + "learning_rate": 7.218304401735897e-07, + "loss": 0.8889, + "step": 186290 + }, + { + "epoch": 14.436824363594095, + "grad_norm": 1.6136330995076171, + "learning_rate": 7.218691878487291e-07, + "loss": 0.8766, + "step": 186300 + }, + { + "epoch": 14.437599287070402, + "grad_norm": 1.6112397445124997, + "learning_rate": 7.219079355238687e-07, + "loss": 0.8923, + "step": 186310 + }, + { + "epoch": 14.438374210546709, + "grad_norm": 1.5295015804484204, + "learning_rate": 7.219466831990081e-07, + "loss": 0.8494, + "step": 186320 + }, + { + "epoch": 14.439149134023015, + "grad_norm": 1.5556951359447084, + "learning_rate": 7.219854308741477e-07, + "loss": 0.9048, + "step": 186330 + }, + { + "epoch": 14.439924057499322, + "grad_norm": 1.6401632321632238, + "learning_rate": 7.220241785492871e-07, + "loss": 0.8496, + "step": 186340 + }, + { + "epoch": 14.440698980975629, + "grad_norm": 1.5173293595809776, + "learning_rate": 7.220629262244266e-07, + "loss": 0.8645, + "step": 186350 + }, + { + "epoch": 14.441473904451936, + "grad_norm": 1.5561727160615018, + "learning_rate": 7.221016738995661e-07, + "loss": 0.8657, + "step": 186360 + }, + { + "epoch": 14.442248827928243, + "grad_norm": 1.5042304301249105, + "learning_rate": 7.221404215747056e-07, + "loss": 0.8525, + "step": 186370 + }, + { + "epoch": 14.44302375140455, + "grad_norm": 1.558036067349698, + "learning_rate": 7.22179169249845e-07, + "loss": 0.8664, + "step": 186380 + }, + { + "epoch": 14.443798674880856, + "grad_norm": 1.578481637655296, + "learning_rate": 7.222179169249846e-07, + "loss": 0.8736, + "step": 186390 + }, + { + "epoch": 14.444573598357163, + "grad_norm": 1.5496176064717437, + "learning_rate": 7.22256664600124e-07, + "loss": 0.8573, + "step": 186400 + }, + { + "epoch": 14.44534852183347, + "grad_norm": 1.6383862150510156, + "learning_rate": 7.222954122752636e-07, + "loss": 0.868, + "step": 186410 + }, + { + "epoch": 14.446123445309775, + "grad_norm": 1.5704843748545667, + "learning_rate": 7.22334159950403e-07, + "loss": 0.8705, + "step": 186420 + }, + { + "epoch": 14.446898368786082, + "grad_norm": 1.5057915177350183, + "learning_rate": 7.223729076255426e-07, + "loss": 0.882, + "step": 186430 + }, + { + "epoch": 14.447673292262389, + "grad_norm": 1.543787885487974, + "learning_rate": 7.22411655300682e-07, + "loss": 0.8694, + "step": 186440 + }, + { + "epoch": 14.448448215738695, + "grad_norm": 1.5295469237562107, + "learning_rate": 7.224504029758216e-07, + "loss": 0.8689, + "step": 186450 + }, + { + "epoch": 14.449223139215002, + "grad_norm": 1.5465856453163938, + "learning_rate": 7.22489150650961e-07, + "loss": 0.853, + "step": 186460 + }, + { + "epoch": 14.449998062691309, + "grad_norm": 1.5263839937430796, + "learning_rate": 7.225278983261005e-07, + "loss": 0.8556, + "step": 186470 + }, + { + "epoch": 14.450772986167616, + "grad_norm": 1.7109802493419362, + "learning_rate": 7.2256664600124e-07, + "loss": 0.8803, + "step": 186480 + }, + { + "epoch": 14.451547909643923, + "grad_norm": 1.5675278544350366, + "learning_rate": 7.226053936763795e-07, + "loss": 0.8795, + "step": 186490 + }, + { + "epoch": 14.45232283312023, + "grad_norm": 1.560191305271846, + "learning_rate": 7.22644141351519e-07, + "loss": 0.8744, + "step": 186500 + }, + { + "epoch": 14.45232283312023, + "eval_loss": 0.8928088545799255, + "eval_runtime": 331.0731, + "eval_samples_per_second": 34.648, + "eval_steps_per_second": 8.663, + "step": 186500 + }, + { + "epoch": 14.453097756596536, + "grad_norm": 1.5398125931181137, + "learning_rate": 7.226828890266585e-07, + "loss": 0.8492, + "step": 186510 + }, + { + "epoch": 14.453872680072843, + "grad_norm": 1.4888256159142594, + "learning_rate": 7.227216367017979e-07, + "loss": 0.8579, + "step": 186520 + }, + { + "epoch": 14.45464760354915, + "grad_norm": 1.446426693901105, + "learning_rate": 7.227603843769375e-07, + "loss": 0.8702, + "step": 186530 + }, + { + "epoch": 14.455422527025457, + "grad_norm": 1.630447221459676, + "learning_rate": 7.227991320520769e-07, + "loss": 0.8631, + "step": 186540 + }, + { + "epoch": 14.456197450501763, + "grad_norm": 1.4976762864311657, + "learning_rate": 7.228378797272165e-07, + "loss": 0.8772, + "step": 186550 + }, + { + "epoch": 14.45697237397807, + "grad_norm": 1.4504358983179573, + "learning_rate": 7.228766274023559e-07, + "loss": 0.8595, + "step": 186560 + }, + { + "epoch": 14.457747297454377, + "grad_norm": 1.4845807700420517, + "learning_rate": 7.229153750774954e-07, + "loss": 0.8689, + "step": 186570 + }, + { + "epoch": 14.458522220930684, + "grad_norm": 1.5218298834537203, + "learning_rate": 7.229541227526349e-07, + "loss": 0.8676, + "step": 186580 + }, + { + "epoch": 14.45929714440699, + "grad_norm": 1.4980074083081512, + "learning_rate": 7.229928704277745e-07, + "loss": 0.8672, + "step": 186590 + }, + { + "epoch": 14.460072067883296, + "grad_norm": 1.4756654913368354, + "learning_rate": 7.230316181029139e-07, + "loss": 0.8763, + "step": 186600 + }, + { + "epoch": 14.460846991359602, + "grad_norm": 1.5380662443109632, + "learning_rate": 7.230703657780534e-07, + "loss": 0.8702, + "step": 186610 + }, + { + "epoch": 14.46162191483591, + "grad_norm": 1.5411486151945017, + "learning_rate": 7.231091134531928e-07, + "loss": 0.8735, + "step": 186620 + }, + { + "epoch": 14.462396838312216, + "grad_norm": 1.511383450871319, + "learning_rate": 7.231478611283324e-07, + "loss": 0.8643, + "step": 186630 + }, + { + "epoch": 14.463171761788523, + "grad_norm": 1.5278658704713635, + "learning_rate": 7.231866088034719e-07, + "loss": 0.8773, + "step": 186640 + }, + { + "epoch": 14.46394668526483, + "grad_norm": 1.463702908268152, + "learning_rate": 7.232253564786114e-07, + "loss": 0.8729, + "step": 186650 + }, + { + "epoch": 14.464721608741137, + "grad_norm": 1.5360419260434643, + "learning_rate": 7.232641041537508e-07, + "loss": 0.8811, + "step": 186660 + }, + { + "epoch": 14.465496532217443, + "grad_norm": 1.4991552496541003, + "learning_rate": 7.233028518288903e-07, + "loss": 0.862, + "step": 186670 + }, + { + "epoch": 14.46627145569375, + "grad_norm": 1.5390439982705955, + "learning_rate": 7.233415995040298e-07, + "loss": 0.8697, + "step": 186680 + }, + { + "epoch": 14.467046379170057, + "grad_norm": 1.4850859526916445, + "learning_rate": 7.233803471791694e-07, + "loss": 0.8651, + "step": 186690 + }, + { + "epoch": 14.467821302646364, + "grad_norm": 1.6083395563059182, + "learning_rate": 7.234190948543088e-07, + "loss": 0.8561, + "step": 186700 + }, + { + "epoch": 14.46859622612267, + "grad_norm": 1.4907626768611266, + "learning_rate": 7.234578425294483e-07, + "loss": 0.8819, + "step": 186710 + }, + { + "epoch": 14.469371149598977, + "grad_norm": 1.498522952214542, + "learning_rate": 7.234965902045877e-07, + "loss": 0.8747, + "step": 186720 + }, + { + "epoch": 14.470146073075284, + "grad_norm": 1.5399417079152122, + "learning_rate": 7.235353378797274e-07, + "loss": 0.8689, + "step": 186730 + }, + { + "epoch": 14.470920996551591, + "grad_norm": 1.6394276108434347, + "learning_rate": 7.235740855548668e-07, + "loss": 0.8625, + "step": 186740 + }, + { + "epoch": 14.471695920027898, + "grad_norm": 1.4703786211043104, + "learning_rate": 7.236128332300063e-07, + "loss": 0.8633, + "step": 186750 + }, + { + "epoch": 14.472470843504205, + "grad_norm": 1.6132064782507733, + "learning_rate": 7.236515809051457e-07, + "loss": 0.856, + "step": 186760 + }, + { + "epoch": 14.473245766980511, + "grad_norm": 1.5970889667517425, + "learning_rate": 7.236903285802852e-07, + "loss": 0.8498, + "step": 186770 + }, + { + "epoch": 14.474020690456818, + "grad_norm": 1.525259938739075, + "learning_rate": 7.237290762554248e-07, + "loss": 0.8623, + "step": 186780 + }, + { + "epoch": 14.474795613933123, + "grad_norm": 1.5543207284084521, + "learning_rate": 7.237678239305643e-07, + "loss": 0.8703, + "step": 186790 + }, + { + "epoch": 14.47557053740943, + "grad_norm": 1.5636456582265752, + "learning_rate": 7.238065716057037e-07, + "loss": 0.8572, + "step": 186800 + }, + { + "epoch": 14.476345460885737, + "grad_norm": 1.471627516826065, + "learning_rate": 7.238453192808432e-07, + "loss": 0.8804, + "step": 186810 + }, + { + "epoch": 14.477120384362044, + "grad_norm": 1.77920626337549, + "learning_rate": 7.238840669559826e-07, + "loss": 0.873, + "step": 186820 + }, + { + "epoch": 14.47789530783835, + "grad_norm": 1.5565933450063334, + "learning_rate": 7.239228146311223e-07, + "loss": 0.8841, + "step": 186830 + }, + { + "epoch": 14.478670231314657, + "grad_norm": 1.5011564309492789, + "learning_rate": 7.239615623062617e-07, + "loss": 0.9046, + "step": 186840 + }, + { + "epoch": 14.479445154790964, + "grad_norm": 1.6042803038921607, + "learning_rate": 7.240003099814012e-07, + "loss": 0.87, + "step": 186850 + }, + { + "epoch": 14.480220078267271, + "grad_norm": 1.6086999895759801, + "learning_rate": 7.240390576565406e-07, + "loss": 0.8604, + "step": 186860 + }, + { + "epoch": 14.480995001743578, + "grad_norm": 1.5374437639123024, + "learning_rate": 7.240778053316802e-07, + "loss": 0.8645, + "step": 186870 + }, + { + "epoch": 14.481769925219885, + "grad_norm": 1.5968588039867377, + "learning_rate": 7.241165530068197e-07, + "loss": 0.8522, + "step": 186880 + }, + { + "epoch": 14.482544848696191, + "grad_norm": 1.6136422846668335, + "learning_rate": 7.241553006819592e-07, + "loss": 0.8601, + "step": 186890 + }, + { + "epoch": 14.483319772172498, + "grad_norm": 1.536982193951947, + "learning_rate": 7.241940483570986e-07, + "loss": 0.8506, + "step": 186900 + }, + { + "epoch": 14.484094695648805, + "grad_norm": 1.6412864217445107, + "learning_rate": 7.242327960322381e-07, + "loss": 0.8784, + "step": 186910 + }, + { + "epoch": 14.484869619125112, + "grad_norm": 1.500363220501097, + "learning_rate": 7.242715437073776e-07, + "loss": 0.858, + "step": 186920 + }, + { + "epoch": 14.485644542601419, + "grad_norm": 1.6174796148737947, + "learning_rate": 7.243102913825172e-07, + "loss": 0.8619, + "step": 186930 + }, + { + "epoch": 14.486419466077725, + "grad_norm": 1.4638662663669697, + "learning_rate": 7.243490390576566e-07, + "loss": 0.8803, + "step": 186940 + }, + { + "epoch": 14.487194389554032, + "grad_norm": 1.5620841910884342, + "learning_rate": 7.243877867327961e-07, + "loss": 0.8746, + "step": 186950 + }, + { + "epoch": 14.487969313030339, + "grad_norm": 1.6804688291998502, + "learning_rate": 7.244265344079355e-07, + "loss": 0.8698, + "step": 186960 + }, + { + "epoch": 14.488744236506644, + "grad_norm": 1.5413163542331807, + "learning_rate": 7.244652820830752e-07, + "loss": 0.8437, + "step": 186970 + }, + { + "epoch": 14.489519159982951, + "grad_norm": 1.6667151207046846, + "learning_rate": 7.245040297582146e-07, + "loss": 0.8654, + "step": 186980 + }, + { + "epoch": 14.490294083459258, + "grad_norm": 1.5536235272334504, + "learning_rate": 7.245427774333541e-07, + "loss": 0.8813, + "step": 186990 + }, + { + "epoch": 14.491069006935565, + "grad_norm": 1.557529496976189, + "learning_rate": 7.245815251084935e-07, + "loss": 0.8665, + "step": 187000 + }, + { + "epoch": 14.491069006935565, + "eval_loss": 0.8924190402030945, + "eval_runtime": 333.5042, + "eval_samples_per_second": 34.395, + "eval_steps_per_second": 8.6, + "step": 187000 + }, + { + "epoch": 14.491843930411871, + "grad_norm": 1.6743867378564061, + "learning_rate": 7.246202727836331e-07, + "loss": 0.8459, + "step": 187010 + }, + { + "epoch": 14.492618853888178, + "grad_norm": 1.5974979113397914, + "learning_rate": 7.246590204587725e-07, + "loss": 0.8759, + "step": 187020 + }, + { + "epoch": 14.493393777364485, + "grad_norm": 1.4397932042349235, + "learning_rate": 7.246977681339121e-07, + "loss": 0.8621, + "step": 187030 + }, + { + "epoch": 14.494168700840792, + "grad_norm": 1.5323180210810365, + "learning_rate": 7.247365158090515e-07, + "loss": 0.8623, + "step": 187040 + }, + { + "epoch": 14.494943624317099, + "grad_norm": 1.5725480260309326, + "learning_rate": 7.24775263484191e-07, + "loss": 0.8675, + "step": 187050 + }, + { + "epoch": 14.495718547793405, + "grad_norm": 1.4579337451609633, + "learning_rate": 7.248140111593304e-07, + "loss": 0.8777, + "step": 187060 + }, + { + "epoch": 14.496493471269712, + "grad_norm": 1.553290753544102, + "learning_rate": 7.248527588344701e-07, + "loss": 0.8628, + "step": 187070 + }, + { + "epoch": 14.497268394746019, + "grad_norm": 1.6739808160056928, + "learning_rate": 7.248915065096095e-07, + "loss": 0.8759, + "step": 187080 + }, + { + "epoch": 14.498043318222326, + "grad_norm": 1.5168228549594545, + "learning_rate": 7.24930254184749e-07, + "loss": 0.864, + "step": 187090 + }, + { + "epoch": 14.498818241698633, + "grad_norm": 1.5747391152558472, + "learning_rate": 7.249690018598884e-07, + "loss": 0.8918, + "step": 187100 + }, + { + "epoch": 14.49959316517494, + "grad_norm": 1.421318913141668, + "learning_rate": 7.25007749535028e-07, + "loss": 0.8547, + "step": 187110 + }, + { + "epoch": 14.500368088651246, + "grad_norm": 1.6651297356227555, + "learning_rate": 7.250464972101675e-07, + "loss": 0.8728, + "step": 187120 + }, + { + "epoch": 14.501143012127553, + "grad_norm": 1.541785243446213, + "learning_rate": 7.25085244885307e-07, + "loss": 0.8655, + "step": 187130 + }, + { + "epoch": 14.50191793560386, + "grad_norm": 1.5956849926409042, + "learning_rate": 7.251239925604464e-07, + "loss": 0.8627, + "step": 187140 + }, + { + "epoch": 14.502692859080167, + "grad_norm": 1.5999349411173758, + "learning_rate": 7.25162740235586e-07, + "loss": 0.8709, + "step": 187150 + }, + { + "epoch": 14.503467782556472, + "grad_norm": 1.593233375900099, + "learning_rate": 7.252014879107254e-07, + "loss": 0.886, + "step": 187160 + }, + { + "epoch": 14.504242706032779, + "grad_norm": 1.5387331605317318, + "learning_rate": 7.25240235585865e-07, + "loss": 0.8602, + "step": 187170 + }, + { + "epoch": 14.505017629509085, + "grad_norm": 1.577752347203315, + "learning_rate": 7.252789832610044e-07, + "loss": 0.879, + "step": 187180 + }, + { + "epoch": 14.505792552985392, + "grad_norm": 1.5222315729645517, + "learning_rate": 7.253177309361439e-07, + "loss": 0.8709, + "step": 187190 + }, + { + "epoch": 14.506567476461699, + "grad_norm": 1.5478790903083297, + "learning_rate": 7.253564786112833e-07, + "loss": 0.8656, + "step": 187200 + }, + { + "epoch": 14.507342399938006, + "grad_norm": 1.4947947160100403, + "learning_rate": 7.253952262864229e-07, + "loss": 0.8774, + "step": 187210 + }, + { + "epoch": 14.508117323414313, + "grad_norm": 1.6001157212269341, + "learning_rate": 7.254339739615624e-07, + "loss": 0.8549, + "step": 187220 + }, + { + "epoch": 14.50889224689062, + "grad_norm": 1.5390645716077405, + "learning_rate": 7.254727216367019e-07, + "loss": 0.876, + "step": 187230 + }, + { + "epoch": 14.509667170366926, + "grad_norm": 1.4697411330298769, + "learning_rate": 7.255114693118413e-07, + "loss": 0.8723, + "step": 187240 + }, + { + "epoch": 14.510442093843233, + "grad_norm": 1.5436876931152554, + "learning_rate": 7.255502169869809e-07, + "loss": 0.8486, + "step": 187250 + }, + { + "epoch": 14.51121701731954, + "grad_norm": 1.4664136059543003, + "learning_rate": 7.255889646621203e-07, + "loss": 0.9001, + "step": 187260 + }, + { + "epoch": 14.511991940795847, + "grad_norm": 1.5163851945393836, + "learning_rate": 7.256277123372599e-07, + "loss": 0.8675, + "step": 187270 + }, + { + "epoch": 14.512766864272153, + "grad_norm": 1.4763463920943622, + "learning_rate": 7.256664600123993e-07, + "loss": 0.8569, + "step": 187280 + }, + { + "epoch": 14.51354178774846, + "grad_norm": 1.5376848426570846, + "learning_rate": 7.257052076875389e-07, + "loss": 0.8614, + "step": 187290 + }, + { + "epoch": 14.514316711224767, + "grad_norm": 1.518476826947967, + "learning_rate": 7.257439553626783e-07, + "loss": 0.8492, + "step": 187300 + }, + { + "epoch": 14.515091634701074, + "grad_norm": 1.5864759925256229, + "learning_rate": 7.257827030378178e-07, + "loss": 0.8733, + "step": 187310 + }, + { + "epoch": 14.51586655817738, + "grad_norm": 1.5674205339127059, + "learning_rate": 7.258214507129573e-07, + "loss": 0.8511, + "step": 187320 + }, + { + "epoch": 14.516641481653688, + "grad_norm": 1.5434510099235745, + "learning_rate": 7.258601983880968e-07, + "loss": 0.8555, + "step": 187330 + }, + { + "epoch": 14.517416405129993, + "grad_norm": 1.5309868648566585, + "learning_rate": 7.258989460632362e-07, + "loss": 0.8585, + "step": 187340 + }, + { + "epoch": 14.5181913286063, + "grad_norm": 1.5166742154695994, + "learning_rate": 7.259376937383758e-07, + "loss": 0.8697, + "step": 187350 + }, + { + "epoch": 14.518966252082606, + "grad_norm": 1.5186893684542517, + "learning_rate": 7.259764414135152e-07, + "loss": 0.8559, + "step": 187360 + }, + { + "epoch": 14.519741175558913, + "grad_norm": 1.5556493195527956, + "learning_rate": 7.260151890886548e-07, + "loss": 0.8768, + "step": 187370 + }, + { + "epoch": 14.52051609903522, + "grad_norm": 1.509315211299999, + "learning_rate": 7.260539367637942e-07, + "loss": 0.8887, + "step": 187380 + }, + { + "epoch": 14.521291022511527, + "grad_norm": 1.6226647732965471, + "learning_rate": 7.260926844389338e-07, + "loss": 0.8607, + "step": 187390 + }, + { + "epoch": 14.522065945987833, + "grad_norm": 1.7248740227845594, + "learning_rate": 7.261314321140732e-07, + "loss": 0.8664, + "step": 187400 + }, + { + "epoch": 14.52284086946414, + "grad_norm": 1.4593458268289936, + "learning_rate": 7.261701797892127e-07, + "loss": 0.8574, + "step": 187410 + }, + { + "epoch": 14.523615792940447, + "grad_norm": 1.5339377019367264, + "learning_rate": 7.262089274643522e-07, + "loss": 0.8592, + "step": 187420 + }, + { + "epoch": 14.524390716416754, + "grad_norm": 1.7451569380352956, + "learning_rate": 7.262476751394918e-07, + "loss": 0.8544, + "step": 187430 + }, + { + "epoch": 14.52516563989306, + "grad_norm": 1.6155076824166161, + "learning_rate": 7.262864228146312e-07, + "loss": 0.8568, + "step": 187440 + }, + { + "epoch": 14.525940563369367, + "grad_norm": 1.5255725306765269, + "learning_rate": 7.263251704897707e-07, + "loss": 0.8675, + "step": 187450 + }, + { + "epoch": 14.526715486845674, + "grad_norm": 1.5894192028699856, + "learning_rate": 7.263639181649101e-07, + "loss": 0.8709, + "step": 187460 + }, + { + "epoch": 14.527490410321981, + "grad_norm": 1.5178277111371419, + "learning_rate": 7.264026658400497e-07, + "loss": 0.8685, + "step": 187470 + }, + { + "epoch": 14.528265333798288, + "grad_norm": 1.5833073441746965, + "learning_rate": 7.264414135151891e-07, + "loss": 0.8648, + "step": 187480 + }, + { + "epoch": 14.529040257274595, + "grad_norm": 1.511014867819236, + "learning_rate": 7.264801611903287e-07, + "loss": 0.8559, + "step": 187490 + }, + { + "epoch": 14.529815180750902, + "grad_norm": 1.534355482574282, + "learning_rate": 7.265189088654681e-07, + "loss": 0.8605, + "step": 187500 + }, + { + "epoch": 14.529815180750902, + "eval_loss": 0.8921893835067749, + "eval_runtime": 334.2662, + "eval_samples_per_second": 34.317, + "eval_steps_per_second": 8.58, + "step": 187500 + }, + { + "epoch": 14.530590104227208, + "grad_norm": 1.405243238050969, + "learning_rate": 7.265576565406076e-07, + "loss": 0.8757, + "step": 187510 + }, + { + "epoch": 14.531365027703515, + "grad_norm": 1.5441188546963034, + "learning_rate": 7.265964042157471e-07, + "loss": 0.8672, + "step": 187520 + }, + { + "epoch": 14.532139951179822, + "grad_norm": 1.5586477936395444, + "learning_rate": 7.266351518908867e-07, + "loss": 0.8625, + "step": 187530 + }, + { + "epoch": 14.532914874656127, + "grad_norm": 1.4285571046420729, + "learning_rate": 7.266738995660261e-07, + "loss": 0.8558, + "step": 187540 + }, + { + "epoch": 14.533689798132434, + "grad_norm": 1.5507561504350091, + "learning_rate": 7.267126472411656e-07, + "loss": 0.8631, + "step": 187550 + }, + { + "epoch": 14.53446472160874, + "grad_norm": 1.4664904842931588, + "learning_rate": 7.26751394916305e-07, + "loss": 0.8767, + "step": 187560 + }, + { + "epoch": 14.535239645085047, + "grad_norm": 1.5730901059539608, + "learning_rate": 7.267901425914447e-07, + "loss": 0.8669, + "step": 187570 + }, + { + "epoch": 14.536014568561354, + "grad_norm": 1.4974720315324068, + "learning_rate": 7.268288902665841e-07, + "loss": 0.8542, + "step": 187580 + }, + { + "epoch": 14.536789492037661, + "grad_norm": 1.5229454201657209, + "learning_rate": 7.268676379417236e-07, + "loss": 0.852, + "step": 187590 + }, + { + "epoch": 14.537564415513968, + "grad_norm": 1.5236836249898238, + "learning_rate": 7.26906385616863e-07, + "loss": 0.8599, + "step": 187600 + }, + { + "epoch": 14.538339338990275, + "grad_norm": 1.4656135207024235, + "learning_rate": 7.269451332920025e-07, + "loss": 0.8793, + "step": 187610 + }, + { + "epoch": 14.539114262466581, + "grad_norm": 1.5746080760983647, + "learning_rate": 7.26983880967142e-07, + "loss": 0.86, + "step": 187620 + }, + { + "epoch": 14.539889185942888, + "grad_norm": 1.5286232496363055, + "learning_rate": 7.270226286422816e-07, + "loss": 0.8738, + "step": 187630 + }, + { + "epoch": 14.540664109419195, + "grad_norm": 1.5907202250385486, + "learning_rate": 7.27061376317421e-07, + "loss": 0.8579, + "step": 187640 + }, + { + "epoch": 14.541439032895502, + "grad_norm": 1.438136563777754, + "learning_rate": 7.271001239925605e-07, + "loss": 0.8677, + "step": 187650 + }, + { + "epoch": 14.542213956371809, + "grad_norm": 1.5324784940651266, + "learning_rate": 7.271388716676999e-07, + "loss": 0.8562, + "step": 187660 + }, + { + "epoch": 14.542988879848116, + "grad_norm": 1.6542186009314463, + "learning_rate": 7.271776193428396e-07, + "loss": 0.8797, + "step": 187670 + }, + { + "epoch": 14.543763803324422, + "grad_norm": 1.4776434990937706, + "learning_rate": 7.27216367017979e-07, + "loss": 0.8741, + "step": 187680 + }, + { + "epoch": 14.54453872680073, + "grad_norm": 1.5555364925427084, + "learning_rate": 7.272551146931185e-07, + "loss": 0.8623, + "step": 187690 + }, + { + "epoch": 14.545313650277036, + "grad_norm": 1.527120488116832, + "learning_rate": 7.272938623682579e-07, + "loss": 0.868, + "step": 187700 + }, + { + "epoch": 14.546088573753341, + "grad_norm": 1.4876155752972464, + "learning_rate": 7.273326100433976e-07, + "loss": 0.8551, + "step": 187710 + }, + { + "epoch": 14.546863497229648, + "grad_norm": 1.4749569457168352, + "learning_rate": 7.27371357718537e-07, + "loss": 0.8721, + "step": 187720 + }, + { + "epoch": 14.547638420705955, + "grad_norm": 1.5417391042206887, + "learning_rate": 7.274101053936765e-07, + "loss": 0.8574, + "step": 187730 + }, + { + "epoch": 14.548413344182261, + "grad_norm": 1.515040578252517, + "learning_rate": 7.274488530688159e-07, + "loss": 0.8744, + "step": 187740 + }, + { + "epoch": 14.549188267658568, + "grad_norm": 1.4554916175122328, + "learning_rate": 7.274876007439554e-07, + "loss": 0.8644, + "step": 187750 + }, + { + "epoch": 14.549963191134875, + "grad_norm": 1.456130792028118, + "learning_rate": 7.275263484190948e-07, + "loss": 0.8639, + "step": 187760 + }, + { + "epoch": 14.550738114611182, + "grad_norm": 1.5032398076330167, + "learning_rate": 7.275650960942345e-07, + "loss": 0.8535, + "step": 187770 + }, + { + "epoch": 14.551513038087489, + "grad_norm": 1.6040995171911545, + "learning_rate": 7.276038437693739e-07, + "loss": 0.8503, + "step": 187780 + }, + { + "epoch": 14.552287961563795, + "grad_norm": 1.539272292477108, + "learning_rate": 7.276425914445134e-07, + "loss": 0.8588, + "step": 187790 + }, + { + "epoch": 14.553062885040102, + "grad_norm": 1.5388772475236459, + "learning_rate": 7.276813391196528e-07, + "loss": 0.8685, + "step": 187800 + }, + { + "epoch": 14.553837808516409, + "grad_norm": 1.511828193474924, + "learning_rate": 7.277200867947925e-07, + "loss": 0.8648, + "step": 187810 + }, + { + "epoch": 14.554612731992716, + "grad_norm": 1.4085273595493129, + "learning_rate": 7.277588344699319e-07, + "loss": 0.8824, + "step": 187820 + }, + { + "epoch": 14.555387655469023, + "grad_norm": 1.4488147485076812, + "learning_rate": 7.277975821450714e-07, + "loss": 0.8614, + "step": 187830 + }, + { + "epoch": 14.55616257894533, + "grad_norm": 1.6381505193271213, + "learning_rate": 7.278363298202108e-07, + "loss": 0.8642, + "step": 187840 + }, + { + "epoch": 14.556937502421636, + "grad_norm": 1.6629139661401742, + "learning_rate": 7.278750774953503e-07, + "loss": 0.8978, + "step": 187850 + }, + { + "epoch": 14.557712425897943, + "grad_norm": 1.408241757551883, + "learning_rate": 7.279138251704899e-07, + "loss": 0.8644, + "step": 187860 + }, + { + "epoch": 14.55848734937425, + "grad_norm": 1.4881436349658717, + "learning_rate": 7.279525728456294e-07, + "loss": 0.8557, + "step": 187870 + }, + { + "epoch": 14.559262272850557, + "grad_norm": 1.5280723787054638, + "learning_rate": 7.279913205207688e-07, + "loss": 0.8577, + "step": 187880 + }, + { + "epoch": 14.560037196326864, + "grad_norm": 1.5506556353393477, + "learning_rate": 7.280300681959083e-07, + "loss": 0.8601, + "step": 187890 + }, + { + "epoch": 14.56081211980317, + "grad_norm": 1.531507411963207, + "learning_rate": 7.280688158710477e-07, + "loss": 0.8533, + "step": 187900 + }, + { + "epoch": 14.561587043279475, + "grad_norm": 1.6309432879033314, + "learning_rate": 7.281075635461874e-07, + "loss": 0.8532, + "step": 187910 + }, + { + "epoch": 14.562361966755782, + "grad_norm": 1.5635446571694473, + "learning_rate": 7.281463112213268e-07, + "loss": 0.8896, + "step": 187920 + }, + { + "epoch": 14.563136890232089, + "grad_norm": 1.5224092268640195, + "learning_rate": 7.281850588964663e-07, + "loss": 0.8735, + "step": 187930 + }, + { + "epoch": 14.563911813708396, + "grad_norm": 1.532868022783584, + "learning_rate": 7.282238065716057e-07, + "loss": 0.8778, + "step": 187940 + }, + { + "epoch": 14.564686737184703, + "grad_norm": 1.560074037160918, + "learning_rate": 7.282625542467453e-07, + "loss": 0.89, + "step": 187950 + }, + { + "epoch": 14.56546166066101, + "grad_norm": 1.631393302440251, + "learning_rate": 7.283013019218848e-07, + "loss": 0.878, + "step": 187960 + }, + { + "epoch": 14.566236584137316, + "grad_norm": 1.613389764566305, + "learning_rate": 7.283400495970243e-07, + "loss": 0.8624, + "step": 187970 + }, + { + "epoch": 14.567011507613623, + "grad_norm": 1.4967619666655485, + "learning_rate": 7.283787972721637e-07, + "loss": 0.863, + "step": 187980 + }, + { + "epoch": 14.56778643108993, + "grad_norm": 1.5334152667146088, + "learning_rate": 7.284175449473032e-07, + "loss": 0.8793, + "step": 187990 + }, + { + "epoch": 14.568561354566237, + "grad_norm": 1.5279497950265681, + "learning_rate": 7.284562926224427e-07, + "loss": 0.8746, + "step": 188000 + }, + { + "epoch": 14.568561354566237, + "eval_loss": 0.8921414613723755, + "eval_runtime": 332.4386, + "eval_samples_per_second": 34.506, + "eval_steps_per_second": 8.627, + "step": 188000 + }, + { + "epoch": 14.569336278042543, + "grad_norm": 1.4658189359610287, + "learning_rate": 7.284950402975823e-07, + "loss": 0.8685, + "step": 188010 + }, + { + "epoch": 14.57011120151885, + "grad_norm": 1.654424587923059, + "learning_rate": 7.285337879727217e-07, + "loss": 0.8858, + "step": 188020 + }, + { + "epoch": 14.570886124995157, + "grad_norm": 1.5000477408763513, + "learning_rate": 7.285725356478612e-07, + "loss": 0.8632, + "step": 188030 + }, + { + "epoch": 14.571661048471464, + "grad_norm": 1.5809793641661152, + "learning_rate": 7.286112833230006e-07, + "loss": 0.8854, + "step": 188040 + }, + { + "epoch": 14.57243597194777, + "grad_norm": 1.47724905804358, + "learning_rate": 7.286500309981402e-07, + "loss": 0.8741, + "step": 188050 + }, + { + "epoch": 14.573210895424078, + "grad_norm": 1.4408959941395667, + "learning_rate": 7.286887786732797e-07, + "loss": 0.8639, + "step": 188060 + }, + { + "epoch": 14.573985818900384, + "grad_norm": 1.5068767084962875, + "learning_rate": 7.287275263484192e-07, + "loss": 0.8812, + "step": 188070 + }, + { + "epoch": 14.57476074237669, + "grad_norm": 1.4710449807154273, + "learning_rate": 7.287662740235586e-07, + "loss": 0.8709, + "step": 188080 + }, + { + "epoch": 14.575535665852996, + "grad_norm": 1.4611262397032756, + "learning_rate": 7.288050216986982e-07, + "loss": 0.8572, + "step": 188090 + }, + { + "epoch": 14.576310589329303, + "grad_norm": 1.5142289170170682, + "learning_rate": 7.288437693738376e-07, + "loss": 0.8764, + "step": 188100 + }, + { + "epoch": 14.57708551280561, + "grad_norm": 1.4743224004636917, + "learning_rate": 7.288825170489772e-07, + "loss": 0.8728, + "step": 188110 + }, + { + "epoch": 14.577860436281917, + "grad_norm": 1.5286148867360128, + "learning_rate": 7.289212647241166e-07, + "loss": 0.8555, + "step": 188120 + }, + { + "epoch": 14.578635359758223, + "grad_norm": 1.4961074143747712, + "learning_rate": 7.289600123992561e-07, + "loss": 0.8612, + "step": 188130 + }, + { + "epoch": 14.57941028323453, + "grad_norm": 1.486091804216793, + "learning_rate": 7.289987600743956e-07, + "loss": 0.8631, + "step": 188140 + }, + { + "epoch": 14.580185206710837, + "grad_norm": 1.499039876382418, + "learning_rate": 7.290375077495351e-07, + "loss": 0.8714, + "step": 188150 + }, + { + "epoch": 14.580960130187144, + "grad_norm": 1.4500021325717143, + "learning_rate": 7.290762554246746e-07, + "loss": 0.8459, + "step": 188160 + }, + { + "epoch": 14.58173505366345, + "grad_norm": 1.4822738798355763, + "learning_rate": 7.291150030998141e-07, + "loss": 0.8626, + "step": 188170 + }, + { + "epoch": 14.582509977139757, + "grad_norm": 1.5859203928954473, + "learning_rate": 7.291537507749535e-07, + "loss": 0.8641, + "step": 188180 + }, + { + "epoch": 14.583284900616064, + "grad_norm": 1.5614562531287102, + "learning_rate": 7.291924984500931e-07, + "loss": 0.8644, + "step": 188190 + }, + { + "epoch": 14.584059824092371, + "grad_norm": 1.5227282264577395, + "learning_rate": 7.292312461252325e-07, + "loss": 0.8946, + "step": 188200 + }, + { + "epoch": 14.584834747568678, + "grad_norm": 1.4157922904833526, + "learning_rate": 7.292699938003721e-07, + "loss": 0.8551, + "step": 188210 + }, + { + "epoch": 14.585609671044985, + "grad_norm": 1.5825009052150363, + "learning_rate": 7.293087414755115e-07, + "loss": 0.8719, + "step": 188220 + }, + { + "epoch": 14.586384594521292, + "grad_norm": 1.569963447650081, + "learning_rate": 7.293474891506511e-07, + "loss": 0.8487, + "step": 188230 + }, + { + "epoch": 14.587159517997598, + "grad_norm": 1.4579941827548886, + "learning_rate": 7.293862368257905e-07, + "loss": 0.8582, + "step": 188240 + }, + { + "epoch": 14.587934441473905, + "grad_norm": 1.5009637253643129, + "learning_rate": 7.2942498450093e-07, + "loss": 0.8711, + "step": 188250 + }, + { + "epoch": 14.588709364950212, + "grad_norm": 1.4605505394821425, + "learning_rate": 7.294637321760695e-07, + "loss": 0.8844, + "step": 188260 + }, + { + "epoch": 14.589484288426519, + "grad_norm": 1.5186838925191462, + "learning_rate": 7.29502479851209e-07, + "loss": 0.8731, + "step": 188270 + }, + { + "epoch": 14.590259211902824, + "grad_norm": 1.5713836108678882, + "learning_rate": 7.295412275263485e-07, + "loss": 0.8734, + "step": 188280 + }, + { + "epoch": 14.59103413537913, + "grad_norm": 1.485690120538287, + "learning_rate": 7.29579975201488e-07, + "loss": 0.8764, + "step": 188290 + }, + { + "epoch": 14.591809058855437, + "grad_norm": 1.6244734138703814, + "learning_rate": 7.296187228766274e-07, + "loss": 0.8738, + "step": 188300 + }, + { + "epoch": 14.592583982331744, + "grad_norm": 1.4552402269003728, + "learning_rate": 7.29657470551767e-07, + "loss": 0.881, + "step": 188310 + }, + { + "epoch": 14.593358905808051, + "grad_norm": 1.4990401593982399, + "learning_rate": 7.296962182269064e-07, + "loss": 0.8654, + "step": 188320 + }, + { + "epoch": 14.594133829284358, + "grad_norm": 1.7412086567000944, + "learning_rate": 7.29734965902046e-07, + "loss": 0.856, + "step": 188330 + }, + { + "epoch": 14.594908752760665, + "grad_norm": 1.4503751435750964, + "learning_rate": 7.297737135771854e-07, + "loss": 0.8579, + "step": 188340 + }, + { + "epoch": 14.595683676236971, + "grad_norm": 1.5728053066248595, + "learning_rate": 7.29812461252325e-07, + "loss": 0.8669, + "step": 188350 + }, + { + "epoch": 14.596458599713278, + "grad_norm": 1.5140946424768942, + "learning_rate": 7.298512089274644e-07, + "loss": 0.8796, + "step": 188360 + }, + { + "epoch": 14.597233523189585, + "grad_norm": 1.4974014234104418, + "learning_rate": 7.29889956602604e-07, + "loss": 0.8752, + "step": 188370 + }, + { + "epoch": 14.598008446665892, + "grad_norm": 1.6776373875929877, + "learning_rate": 7.299287042777434e-07, + "loss": 0.8752, + "step": 188380 + }, + { + "epoch": 14.598783370142199, + "grad_norm": 1.556277943394787, + "learning_rate": 7.299674519528829e-07, + "loss": 0.8679, + "step": 188390 + }, + { + "epoch": 14.599558293618506, + "grad_norm": 1.509725875327059, + "learning_rate": 7.300061996280223e-07, + "loss": 0.8692, + "step": 188400 + }, + { + "epoch": 14.600333217094812, + "grad_norm": 1.5117462059014102, + "learning_rate": 7.300449473031619e-07, + "loss": 0.8789, + "step": 188410 + }, + { + "epoch": 14.60110814057112, + "grad_norm": 1.5069935552577327, + "learning_rate": 7.300836949783013e-07, + "loss": 0.8615, + "step": 188420 + }, + { + "epoch": 14.601883064047426, + "grad_norm": 1.599947457609972, + "learning_rate": 7.301224426534409e-07, + "loss": 0.8752, + "step": 188430 + }, + { + "epoch": 14.602657987523733, + "grad_norm": 1.5629864631410941, + "learning_rate": 7.301611903285803e-07, + "loss": 0.8537, + "step": 188440 + }, + { + "epoch": 14.603432911000038, + "grad_norm": 1.502842872874994, + "learning_rate": 7.301999380037199e-07, + "loss": 0.8535, + "step": 188450 + }, + { + "epoch": 14.604207834476345, + "grad_norm": 1.4861036601557993, + "learning_rate": 7.302386856788593e-07, + "loss": 0.8535, + "step": 188460 + }, + { + "epoch": 14.604982757952651, + "grad_norm": 1.5780993527855212, + "learning_rate": 7.302774333539989e-07, + "loss": 0.883, + "step": 188470 + }, + { + "epoch": 14.605757681428958, + "grad_norm": 1.5036173294646595, + "learning_rate": 7.303161810291383e-07, + "loss": 0.8671, + "step": 188480 + }, + { + "epoch": 14.606532604905265, + "grad_norm": 1.6500752735573747, + "learning_rate": 7.303549287042778e-07, + "loss": 0.8844, + "step": 188490 + }, + { + "epoch": 14.607307528381572, + "grad_norm": 1.452340191086136, + "learning_rate": 7.303936763794172e-07, + "loss": 0.8712, + "step": 188500 + }, + { + "epoch": 14.607307528381572, + "eval_loss": 0.8919835686683655, + "eval_runtime": 329.3303, + "eval_samples_per_second": 34.831, + "eval_steps_per_second": 8.709, + "step": 188500 + }, + { + "epoch": 14.608082451857879, + "grad_norm": 1.4913331011924207, + "learning_rate": 7.304324240545569e-07, + "loss": 0.8672, + "step": 188510 + }, + { + "epoch": 14.608857375334185, + "grad_norm": 1.6461252509607032, + "learning_rate": 7.304711717296963e-07, + "loss": 0.8592, + "step": 188520 + }, + { + "epoch": 14.609632298810492, + "grad_norm": 1.54590815057834, + "learning_rate": 7.305099194048358e-07, + "loss": 0.8713, + "step": 188530 + }, + { + "epoch": 14.610407222286799, + "grad_norm": 1.5556195658639016, + "learning_rate": 7.305486670799752e-07, + "loss": 0.8714, + "step": 188540 + }, + { + "epoch": 14.611182145763106, + "grad_norm": 1.5362700432983476, + "learning_rate": 7.305874147551148e-07, + "loss": 0.8659, + "step": 188550 + }, + { + "epoch": 14.611957069239413, + "grad_norm": 1.5901291626154712, + "learning_rate": 7.306261624302542e-07, + "loss": 0.8672, + "step": 188560 + }, + { + "epoch": 14.61273199271572, + "grad_norm": 1.503189830921577, + "learning_rate": 7.306649101053938e-07, + "loss": 0.8555, + "step": 188570 + }, + { + "epoch": 14.613506916192026, + "grad_norm": 1.3999009404806146, + "learning_rate": 7.307036577805332e-07, + "loss": 0.8776, + "step": 188580 + }, + { + "epoch": 14.614281839668333, + "grad_norm": 1.6068494403465947, + "learning_rate": 7.307424054556727e-07, + "loss": 0.8859, + "step": 188590 + }, + { + "epoch": 14.61505676314464, + "grad_norm": 1.495511696939278, + "learning_rate": 7.307811531308122e-07, + "loss": 0.8584, + "step": 188600 + }, + { + "epoch": 14.615831686620947, + "grad_norm": 1.5886142251035869, + "learning_rate": 7.308199008059518e-07, + "loss": 0.8719, + "step": 188610 + }, + { + "epoch": 14.616606610097254, + "grad_norm": 1.5620294862869846, + "learning_rate": 7.308586484810912e-07, + "loss": 0.882, + "step": 188620 + }, + { + "epoch": 14.61738153357356, + "grad_norm": 1.658981847048212, + "learning_rate": 7.308973961562307e-07, + "loss": 0.8666, + "step": 188630 + }, + { + "epoch": 14.618156457049867, + "grad_norm": 1.5768927798842423, + "learning_rate": 7.309361438313701e-07, + "loss": 0.8627, + "step": 188640 + }, + { + "epoch": 14.618931380526172, + "grad_norm": 1.5856786763207527, + "learning_rate": 7.309748915065098e-07, + "loss": 0.8658, + "step": 188650 + }, + { + "epoch": 14.619706304002479, + "grad_norm": 1.5733112818967119, + "learning_rate": 7.310136391816492e-07, + "loss": 0.8777, + "step": 188660 + }, + { + "epoch": 14.620481227478786, + "grad_norm": 1.4577916199328615, + "learning_rate": 7.310523868567887e-07, + "loss": 0.8643, + "step": 188670 + }, + { + "epoch": 14.621256150955093, + "grad_norm": 1.5339553878207746, + "learning_rate": 7.310911345319281e-07, + "loss": 0.8648, + "step": 188680 + }, + { + "epoch": 14.6220310744314, + "grad_norm": 1.4623159613605432, + "learning_rate": 7.311298822070676e-07, + "loss": 0.8638, + "step": 188690 + }, + { + "epoch": 14.622805997907706, + "grad_norm": 1.5353217090538183, + "learning_rate": 7.311686298822071e-07, + "loss": 0.8662, + "step": 188700 + }, + { + "epoch": 14.623580921384013, + "grad_norm": 1.5145394960507192, + "learning_rate": 7.312073775573467e-07, + "loss": 0.87, + "step": 188710 + }, + { + "epoch": 14.62435584486032, + "grad_norm": 1.634490951067835, + "learning_rate": 7.312461252324861e-07, + "loss": 0.8371, + "step": 188720 + }, + { + "epoch": 14.625130768336627, + "grad_norm": 1.4567767396798672, + "learning_rate": 7.312848729076256e-07, + "loss": 0.8674, + "step": 188730 + }, + { + "epoch": 14.625905691812934, + "grad_norm": 1.5707698310893248, + "learning_rate": 7.31323620582765e-07, + "loss": 0.8594, + "step": 188740 + }, + { + "epoch": 14.62668061528924, + "grad_norm": 1.5484951355082184, + "learning_rate": 7.313623682579047e-07, + "loss": 0.8834, + "step": 188750 + }, + { + "epoch": 14.627455538765547, + "grad_norm": 1.5846555846802237, + "learning_rate": 7.314011159330441e-07, + "loss": 0.8791, + "step": 188760 + }, + { + "epoch": 14.628230462241854, + "grad_norm": 1.620629097661996, + "learning_rate": 7.314398636081836e-07, + "loss": 0.8608, + "step": 188770 + }, + { + "epoch": 14.62900538571816, + "grad_norm": 1.6423917077550583, + "learning_rate": 7.31478611283323e-07, + "loss": 0.8574, + "step": 188780 + }, + { + "epoch": 14.629780309194468, + "grad_norm": 1.5740896441470544, + "learning_rate": 7.315173589584626e-07, + "loss": 0.8817, + "step": 188790 + }, + { + "epoch": 14.630555232670774, + "grad_norm": 1.4644461330468361, + "learning_rate": 7.315561066336021e-07, + "loss": 0.8602, + "step": 188800 + }, + { + "epoch": 14.631330156147081, + "grad_norm": 1.4966051225700554, + "learning_rate": 7.315948543087416e-07, + "loss": 0.8557, + "step": 188810 + }, + { + "epoch": 14.632105079623388, + "grad_norm": 1.6337301384499097, + "learning_rate": 7.31633601983881e-07, + "loss": 0.8621, + "step": 188820 + }, + { + "epoch": 14.632880003099693, + "grad_norm": 1.8118418334116986, + "learning_rate": 7.316723496590205e-07, + "loss": 0.8788, + "step": 188830 + }, + { + "epoch": 14.633654926576, + "grad_norm": 1.588721259435108, + "learning_rate": 7.317110973341599e-07, + "loss": 0.8717, + "step": 188840 + }, + { + "epoch": 14.634429850052307, + "grad_norm": 1.6130066513128865, + "learning_rate": 7.317498450092996e-07, + "loss": 0.8565, + "step": 188850 + }, + { + "epoch": 14.635204773528613, + "grad_norm": 1.5902536633150595, + "learning_rate": 7.31788592684439e-07, + "loss": 0.8775, + "step": 188860 + }, + { + "epoch": 14.63597969700492, + "grad_norm": 1.5942035407793427, + "learning_rate": 7.318273403595785e-07, + "loss": 0.8795, + "step": 188870 + }, + { + "epoch": 14.636754620481227, + "grad_norm": 1.5110127074975552, + "learning_rate": 7.318660880347179e-07, + "loss": 0.8605, + "step": 188880 + }, + { + "epoch": 14.637529543957534, + "grad_norm": 1.504636765387044, + "learning_rate": 7.319048357098575e-07, + "loss": 0.8741, + "step": 188890 + }, + { + "epoch": 14.63830446743384, + "grad_norm": 1.548860068197771, + "learning_rate": 7.31943583384997e-07, + "loss": 0.8611, + "step": 188900 + }, + { + "epoch": 14.639079390910148, + "grad_norm": 1.5476914801697972, + "learning_rate": 7.319823310601365e-07, + "loss": 0.8346, + "step": 188910 + }, + { + "epoch": 14.639854314386454, + "grad_norm": 1.4624074262677402, + "learning_rate": 7.320210787352759e-07, + "loss": 0.876, + "step": 188920 + }, + { + "epoch": 14.640629237862761, + "grad_norm": 1.5417107309844675, + "learning_rate": 7.320598264104155e-07, + "loss": 0.8473, + "step": 188930 + }, + { + "epoch": 14.641404161339068, + "grad_norm": 1.5767035388086668, + "learning_rate": 7.320985740855549e-07, + "loss": 0.8605, + "step": 188940 + }, + { + "epoch": 14.642179084815375, + "grad_norm": 1.5495101212813085, + "learning_rate": 7.321373217606945e-07, + "loss": 0.8553, + "step": 188950 + }, + { + "epoch": 14.642954008291682, + "grad_norm": 1.5627064382596636, + "learning_rate": 7.321760694358339e-07, + "loss": 0.8631, + "step": 188960 + }, + { + "epoch": 14.643728931767988, + "grad_norm": 1.56595853075407, + "learning_rate": 7.322148171109734e-07, + "loss": 0.878, + "step": 188970 + }, + { + "epoch": 14.644503855244295, + "grad_norm": 1.4180143245223593, + "learning_rate": 7.322535647861128e-07, + "loss": 0.8661, + "step": 188980 + }, + { + "epoch": 14.645278778720602, + "grad_norm": 1.5959743782188267, + "learning_rate": 7.322923124612525e-07, + "loss": 0.8718, + "step": 188990 + }, + { + "epoch": 14.646053702196909, + "grad_norm": 1.528414229571751, + "learning_rate": 7.323310601363919e-07, + "loss": 0.8691, + "step": 189000 + }, + { + "epoch": 14.646053702196909, + "eval_loss": 0.8919057250022888, + "eval_runtime": 329.0021, + "eval_samples_per_second": 34.866, + "eval_steps_per_second": 8.717, + "step": 189000 + }, + { + "epoch": 14.646828625673216, + "grad_norm": 1.5967529490572259, + "learning_rate": 7.323698078115314e-07, + "loss": 0.8887, + "step": 189010 + }, + { + "epoch": 14.64760354914952, + "grad_norm": 1.4386554291180929, + "learning_rate": 7.324085554866708e-07, + "loss": 0.8566, + "step": 189020 + }, + { + "epoch": 14.648378472625827, + "grad_norm": 1.4830587926867473, + "learning_rate": 7.324473031618104e-07, + "loss": 0.8515, + "step": 189030 + }, + { + "epoch": 14.649153396102134, + "grad_norm": 1.6081575059507254, + "learning_rate": 7.324860508369498e-07, + "loss": 0.865, + "step": 189040 + }, + { + "epoch": 14.649928319578441, + "grad_norm": 1.647364180900999, + "learning_rate": 7.325247985120894e-07, + "loss": 0.8696, + "step": 189050 + }, + { + "epoch": 14.650703243054748, + "grad_norm": 1.7060662089282432, + "learning_rate": 7.325635461872288e-07, + "loss": 0.8604, + "step": 189060 + }, + { + "epoch": 14.651478166531055, + "grad_norm": 1.5936423938896662, + "learning_rate": 7.326022938623684e-07, + "loss": 0.8697, + "step": 189070 + }, + { + "epoch": 14.652253090007362, + "grad_norm": 1.5378180472603165, + "learning_rate": 7.326410415375078e-07, + "loss": 0.8513, + "step": 189080 + }, + { + "epoch": 14.653028013483668, + "grad_norm": 1.52873359196684, + "learning_rate": 7.326797892126474e-07, + "loss": 0.8659, + "step": 189090 + }, + { + "epoch": 14.653802936959975, + "grad_norm": 1.5406870640239985, + "learning_rate": 7.327185368877868e-07, + "loss": 0.8632, + "step": 189100 + }, + { + "epoch": 14.654577860436282, + "grad_norm": 1.5503308778397058, + "learning_rate": 7.327572845629263e-07, + "loss": 0.8543, + "step": 189110 + }, + { + "epoch": 14.655352783912589, + "grad_norm": 1.586948289730755, + "learning_rate": 7.327960322380657e-07, + "loss": 0.8667, + "step": 189120 + }, + { + "epoch": 14.656127707388896, + "grad_norm": 1.5503816876206555, + "learning_rate": 7.328347799132053e-07, + "loss": 0.8777, + "step": 189130 + }, + { + "epoch": 14.656902630865202, + "grad_norm": 1.684809560226236, + "learning_rate": 7.328735275883448e-07, + "loss": 0.8691, + "step": 189140 + }, + { + "epoch": 14.65767755434151, + "grad_norm": 1.5426075316325418, + "learning_rate": 7.329122752634843e-07, + "loss": 0.8959, + "step": 189150 + }, + { + "epoch": 14.658452477817816, + "grad_norm": 1.494077728137148, + "learning_rate": 7.329510229386237e-07, + "loss": 0.8691, + "step": 189160 + }, + { + "epoch": 14.659227401294123, + "grad_norm": 1.6435003602761105, + "learning_rate": 7.329897706137633e-07, + "loss": 0.8621, + "step": 189170 + }, + { + "epoch": 14.66000232477043, + "grad_norm": 1.4328475153274582, + "learning_rate": 7.330285182889027e-07, + "loss": 0.8693, + "step": 189180 + }, + { + "epoch": 14.660777248246736, + "grad_norm": 1.5659344160680857, + "learning_rate": 7.330672659640423e-07, + "loss": 0.8474, + "step": 189190 + }, + { + "epoch": 14.661552171723041, + "grad_norm": 1.4517893995768445, + "learning_rate": 7.331060136391817e-07, + "loss": 0.8685, + "step": 189200 + }, + { + "epoch": 14.662327095199348, + "grad_norm": 1.5823876949938365, + "learning_rate": 7.331447613143213e-07, + "loss": 0.8814, + "step": 189210 + }, + { + "epoch": 14.663102018675655, + "grad_norm": 1.5068004668265256, + "learning_rate": 7.331835089894607e-07, + "loss": 0.8504, + "step": 189220 + }, + { + "epoch": 14.663876942151962, + "grad_norm": 1.6264953395794994, + "learning_rate": 7.332222566646002e-07, + "loss": 0.8736, + "step": 189230 + }, + { + "epoch": 14.664651865628269, + "grad_norm": 1.6473451782561381, + "learning_rate": 7.332610043397397e-07, + "loss": 0.895, + "step": 189240 + }, + { + "epoch": 14.665426789104576, + "grad_norm": 1.5852311588254335, + "learning_rate": 7.332997520148792e-07, + "loss": 0.8667, + "step": 189250 + }, + { + "epoch": 14.666201712580882, + "grad_norm": 1.578887558619317, + "learning_rate": 7.333384996900186e-07, + "loss": 0.8634, + "step": 189260 + }, + { + "epoch": 14.66697663605719, + "grad_norm": 1.469114454584315, + "learning_rate": 7.333772473651582e-07, + "loss": 0.8644, + "step": 189270 + }, + { + "epoch": 14.667751559533496, + "grad_norm": 1.597293472525918, + "learning_rate": 7.334159950402976e-07, + "loss": 0.8541, + "step": 189280 + }, + { + "epoch": 14.668526483009803, + "grad_norm": 1.5384895522771287, + "learning_rate": 7.334547427154372e-07, + "loss": 0.8732, + "step": 189290 + }, + { + "epoch": 14.66930140648611, + "grad_norm": 1.5908400353335956, + "learning_rate": 7.334934903905766e-07, + "loss": 0.8679, + "step": 189300 + }, + { + "epoch": 14.670076329962416, + "grad_norm": 1.5340813676135172, + "learning_rate": 7.335322380657162e-07, + "loss": 0.8613, + "step": 189310 + }, + { + "epoch": 14.670851253438723, + "grad_norm": 1.5754631539522614, + "learning_rate": 7.335709857408556e-07, + "loss": 0.8568, + "step": 189320 + }, + { + "epoch": 14.67162617691503, + "grad_norm": 1.5839369488865107, + "learning_rate": 7.336097334159951e-07, + "loss": 0.8639, + "step": 189330 + }, + { + "epoch": 14.672401100391337, + "grad_norm": 1.541115949191544, + "learning_rate": 7.336484810911346e-07, + "loss": 0.8893, + "step": 189340 + }, + { + "epoch": 14.673176023867644, + "grad_norm": 1.5451029656096897, + "learning_rate": 7.336872287662741e-07, + "loss": 0.8595, + "step": 189350 + }, + { + "epoch": 14.67395094734395, + "grad_norm": 1.5644544117712307, + "learning_rate": 7.337259764414136e-07, + "loss": 0.8775, + "step": 189360 + }, + { + "epoch": 14.674725870820257, + "grad_norm": 1.4787510514120137, + "learning_rate": 7.337647241165531e-07, + "loss": 0.8508, + "step": 189370 + }, + { + "epoch": 14.675500794296564, + "grad_norm": 1.560369686176045, + "learning_rate": 7.338034717916925e-07, + "loss": 0.854, + "step": 189380 + }, + { + "epoch": 14.676275717772869, + "grad_norm": 1.431233380675777, + "learning_rate": 7.338422194668321e-07, + "loss": 0.8592, + "step": 189390 + }, + { + "epoch": 14.677050641249176, + "grad_norm": 1.4599392102767002, + "learning_rate": 7.338809671419715e-07, + "loss": 0.8726, + "step": 189400 + }, + { + "epoch": 14.677825564725483, + "grad_norm": 1.570368537462186, + "learning_rate": 7.339197148171111e-07, + "loss": 0.8823, + "step": 189410 + }, + { + "epoch": 14.67860048820179, + "grad_norm": 1.5302185693331978, + "learning_rate": 7.339584624922505e-07, + "loss": 0.8707, + "step": 189420 + }, + { + "epoch": 14.679375411678096, + "grad_norm": 1.5777984844444557, + "learning_rate": 7.3399721016739e-07, + "loss": 0.8932, + "step": 189430 + }, + { + "epoch": 14.680150335154403, + "grad_norm": 1.5012621669970152, + "learning_rate": 7.340359578425295e-07, + "loss": 0.8562, + "step": 189440 + }, + { + "epoch": 14.68092525863071, + "grad_norm": 1.5411627119762785, + "learning_rate": 7.340747055176691e-07, + "loss": 0.8687, + "step": 189450 + }, + { + "epoch": 14.681700182107017, + "grad_norm": 1.5580325656678808, + "learning_rate": 7.341134531928085e-07, + "loss": 0.8606, + "step": 189460 + }, + { + "epoch": 14.682475105583324, + "grad_norm": 1.4800615925627556, + "learning_rate": 7.34152200867948e-07, + "loss": 0.8607, + "step": 189470 + }, + { + "epoch": 14.68325002905963, + "grad_norm": 1.5175383547026196, + "learning_rate": 7.341909485430874e-07, + "loss": 0.8517, + "step": 189480 + }, + { + "epoch": 14.684024952535937, + "grad_norm": 1.4890082903267672, + "learning_rate": 7.34229696218227e-07, + "loss": 0.8523, + "step": 189490 + }, + { + "epoch": 14.684799876012244, + "grad_norm": 1.5844206548152735, + "learning_rate": 7.342684438933665e-07, + "loss": 0.8613, + "step": 189500 + }, + { + "epoch": 14.684799876012244, + "eval_loss": 0.8920210003852844, + "eval_runtime": 328.3114, + "eval_samples_per_second": 34.939, + "eval_steps_per_second": 8.736, + "step": 189500 + }, + { + "epoch": 14.68557479948855, + "grad_norm": 1.5413581201536575, + "learning_rate": 7.34307191568506e-07, + "loss": 0.8536, + "step": 189510 + }, + { + "epoch": 14.686349722964858, + "grad_norm": 1.5580179720760388, + "learning_rate": 7.343459392436454e-07, + "loss": 0.8603, + "step": 189520 + }, + { + "epoch": 14.687124646441164, + "grad_norm": 1.5163511047469465, + "learning_rate": 7.343846869187849e-07, + "loss": 0.8487, + "step": 189530 + }, + { + "epoch": 14.687899569917471, + "grad_norm": 1.5140717761194764, + "learning_rate": 7.344234345939244e-07, + "loss": 0.8719, + "step": 189540 + }, + { + "epoch": 14.688674493393778, + "grad_norm": 1.496106547279653, + "learning_rate": 7.34462182269064e-07, + "loss": 0.8504, + "step": 189550 + }, + { + "epoch": 14.689449416870085, + "grad_norm": 1.551672986182988, + "learning_rate": 7.345009299442034e-07, + "loss": 0.8615, + "step": 189560 + }, + { + "epoch": 14.69022434034639, + "grad_norm": 1.4641491532614312, + "learning_rate": 7.345396776193429e-07, + "loss": 0.8723, + "step": 189570 + }, + { + "epoch": 14.690999263822697, + "grad_norm": 1.4829743395356196, + "learning_rate": 7.345784252944823e-07, + "loss": 0.8531, + "step": 189580 + }, + { + "epoch": 14.691774187299004, + "grad_norm": 1.495346416285071, + "learning_rate": 7.34617172969622e-07, + "loss": 0.865, + "step": 189590 + }, + { + "epoch": 14.69254911077531, + "grad_norm": 1.6052358050138553, + "learning_rate": 7.346559206447614e-07, + "loss": 0.8746, + "step": 189600 + }, + { + "epoch": 14.693324034251617, + "grad_norm": 1.5295502654266793, + "learning_rate": 7.346946683199009e-07, + "loss": 0.8786, + "step": 189610 + }, + { + "epoch": 14.694098957727924, + "grad_norm": 1.4448401582699764, + "learning_rate": 7.347334159950403e-07, + "loss": 0.8602, + "step": 189620 + }, + { + "epoch": 14.69487388120423, + "grad_norm": 1.5420105442270724, + "learning_rate": 7.347721636701798e-07, + "loss": 0.8876, + "step": 189630 + }, + { + "epoch": 14.695648804680538, + "grad_norm": 1.5327753317357695, + "learning_rate": 7.348109113453194e-07, + "loss": 0.8739, + "step": 189640 + }, + { + "epoch": 14.696423728156844, + "grad_norm": 1.6380542759045875, + "learning_rate": 7.348496590204589e-07, + "loss": 0.8708, + "step": 189650 + }, + { + "epoch": 14.697198651633151, + "grad_norm": 1.495519834523854, + "learning_rate": 7.348884066955983e-07, + "loss": 0.8838, + "step": 189660 + }, + { + "epoch": 14.697973575109458, + "grad_norm": 1.4630285728835084, + "learning_rate": 7.349271543707378e-07, + "loss": 0.8595, + "step": 189670 + }, + { + "epoch": 14.698748498585765, + "grad_norm": 1.5239064075380728, + "learning_rate": 7.349659020458772e-07, + "loss": 0.8806, + "step": 189680 + }, + { + "epoch": 14.699523422062072, + "grad_norm": 1.5531741647819362, + "learning_rate": 7.350046497210169e-07, + "loss": 0.881, + "step": 189690 + }, + { + "epoch": 14.700298345538378, + "grad_norm": 1.5782732037754956, + "learning_rate": 7.350433973961563e-07, + "loss": 0.8665, + "step": 189700 + }, + { + "epoch": 14.701073269014685, + "grad_norm": 1.6414454591440435, + "learning_rate": 7.350821450712958e-07, + "loss": 0.8722, + "step": 189710 + }, + { + "epoch": 14.701848192490992, + "grad_norm": 1.6656909660885242, + "learning_rate": 7.351208927464352e-07, + "loss": 0.9001, + "step": 189720 + }, + { + "epoch": 14.702623115967299, + "grad_norm": 1.5257863948040034, + "learning_rate": 7.351596404215749e-07, + "loss": 0.8521, + "step": 189730 + }, + { + "epoch": 14.703398039443606, + "grad_norm": 1.6495586661136048, + "learning_rate": 7.351983880967143e-07, + "loss": 0.8813, + "step": 189740 + }, + { + "epoch": 14.704172962919912, + "grad_norm": 1.5689184815556323, + "learning_rate": 7.352371357718538e-07, + "loss": 0.8623, + "step": 189750 + }, + { + "epoch": 14.70494788639622, + "grad_norm": 1.4669399853221692, + "learning_rate": 7.352758834469932e-07, + "loss": 0.8704, + "step": 189760 + }, + { + "epoch": 14.705722809872524, + "grad_norm": 1.5986421481561475, + "learning_rate": 7.353146311221327e-07, + "loss": 0.8713, + "step": 189770 + }, + { + "epoch": 14.706497733348831, + "grad_norm": 1.5798459437277887, + "learning_rate": 7.353533787972723e-07, + "loss": 0.8773, + "step": 189780 + }, + { + "epoch": 14.707272656825138, + "grad_norm": 1.5294888538410083, + "learning_rate": 7.353921264724118e-07, + "loss": 0.8532, + "step": 189790 + }, + { + "epoch": 14.708047580301445, + "grad_norm": 1.5163755230026392, + "learning_rate": 7.354308741475512e-07, + "loss": 0.8683, + "step": 189800 + }, + { + "epoch": 14.708822503777752, + "grad_norm": 1.5831277540252389, + "learning_rate": 7.354696218226907e-07, + "loss": 0.8553, + "step": 189810 + }, + { + "epoch": 14.709597427254058, + "grad_norm": 1.4918290279302961, + "learning_rate": 7.355083694978301e-07, + "loss": 0.8667, + "step": 189820 + }, + { + "epoch": 14.710372350730365, + "grad_norm": 1.5550601002683098, + "learning_rate": 7.355471171729698e-07, + "loss": 0.86, + "step": 189830 + }, + { + "epoch": 14.711147274206672, + "grad_norm": 1.503870457414302, + "learning_rate": 7.355858648481092e-07, + "loss": 0.8618, + "step": 189840 + }, + { + "epoch": 14.711922197682979, + "grad_norm": 1.5089085618413403, + "learning_rate": 7.356246125232487e-07, + "loss": 0.867, + "step": 189850 + }, + { + "epoch": 14.712697121159286, + "grad_norm": 1.442942438375115, + "learning_rate": 7.356633601983881e-07, + "loss": 0.8639, + "step": 189860 + }, + { + "epoch": 14.713472044635592, + "grad_norm": 1.557461648971965, + "learning_rate": 7.357021078735277e-07, + "loss": 0.8699, + "step": 189870 + }, + { + "epoch": 14.7142469681119, + "grad_norm": 1.4858642885040265, + "learning_rate": 7.357408555486672e-07, + "loss": 0.8629, + "step": 189880 + }, + { + "epoch": 14.715021891588206, + "grad_norm": 1.508212849861617, + "learning_rate": 7.357796032238067e-07, + "loss": 0.8654, + "step": 189890 + }, + { + "epoch": 14.715796815064513, + "grad_norm": 1.4557139106862913, + "learning_rate": 7.358183508989461e-07, + "loss": 0.8491, + "step": 189900 + }, + { + "epoch": 14.71657173854082, + "grad_norm": 1.5547585229005534, + "learning_rate": 7.358570985740856e-07, + "loss": 0.8612, + "step": 189910 + }, + { + "epoch": 14.717346662017126, + "grad_norm": 1.5933593725552069, + "learning_rate": 7.35895846249225e-07, + "loss": 0.8695, + "step": 189920 + }, + { + "epoch": 14.718121585493433, + "grad_norm": 1.475684168044526, + "learning_rate": 7.359345939243647e-07, + "loss": 0.865, + "step": 189930 + }, + { + "epoch": 14.718896508969738, + "grad_norm": 1.5052038224323816, + "learning_rate": 7.359733415995041e-07, + "loss": 0.8684, + "step": 189940 + }, + { + "epoch": 14.719671432446045, + "grad_norm": 1.5534607320940044, + "learning_rate": 7.360120892746436e-07, + "loss": 0.8594, + "step": 189950 + }, + { + "epoch": 14.720446355922352, + "grad_norm": 1.5597177433584413, + "learning_rate": 7.36050836949783e-07, + "loss": 0.8707, + "step": 189960 + }, + { + "epoch": 14.721221279398659, + "grad_norm": 1.530408530883588, + "learning_rate": 7.360895846249226e-07, + "loss": 0.865, + "step": 189970 + }, + { + "epoch": 14.721996202874966, + "grad_norm": 1.4341823799811442, + "learning_rate": 7.361283323000621e-07, + "loss": 0.8852, + "step": 189980 + }, + { + "epoch": 14.722771126351272, + "grad_norm": 1.56586443919487, + "learning_rate": 7.361670799752016e-07, + "loss": 0.8742, + "step": 189990 + }, + { + "epoch": 14.72354604982758, + "grad_norm": 1.486371720141818, + "learning_rate": 7.36205827650341e-07, + "loss": 0.86, + "step": 190000 + }, + { + "epoch": 14.72354604982758, + "eval_loss": 0.8915137052536011, + "eval_runtime": 328.7371, + "eval_samples_per_second": 34.894, + "eval_steps_per_second": 8.724, + "step": 190000 + }, + { + "epoch": 14.724320973303886, + "grad_norm": 1.5860539966892386, + "learning_rate": 7.362445753254806e-07, + "loss": 0.8817, + "step": 190010 + }, + { + "epoch": 14.725095896780193, + "grad_norm": 1.520258872228051, + "learning_rate": 7.3628332300062e-07, + "loss": 0.8499, + "step": 190020 + }, + { + "epoch": 14.7258708202565, + "grad_norm": 1.627718699755981, + "learning_rate": 7.363220706757596e-07, + "loss": 0.8608, + "step": 190030 + }, + { + "epoch": 14.726645743732806, + "grad_norm": 1.5032201860507293, + "learning_rate": 7.36360818350899e-07, + "loss": 0.8651, + "step": 190040 + }, + { + "epoch": 14.727420667209113, + "grad_norm": 1.560408814474355, + "learning_rate": 7.363995660260385e-07, + "loss": 0.8581, + "step": 190050 + }, + { + "epoch": 14.72819559068542, + "grad_norm": 1.4935390938397275, + "learning_rate": 7.364383137011779e-07, + "loss": 0.853, + "step": 190060 + }, + { + "epoch": 14.728970514161727, + "grad_norm": 1.4813040658185415, + "learning_rate": 7.364770613763175e-07, + "loss": 0.8878, + "step": 190070 + }, + { + "epoch": 14.729745437638034, + "grad_norm": 1.53042417538236, + "learning_rate": 7.36515809051457e-07, + "loss": 0.8634, + "step": 190080 + }, + { + "epoch": 14.73052036111434, + "grad_norm": 1.5313203460971365, + "learning_rate": 7.365545567265965e-07, + "loss": 0.8641, + "step": 190090 + }, + { + "epoch": 14.731295284590647, + "grad_norm": 1.5522766438465725, + "learning_rate": 7.365933044017359e-07, + "loss": 0.8571, + "step": 190100 + }, + { + "epoch": 14.732070208066954, + "grad_norm": 1.6482269383168462, + "learning_rate": 7.366320520768755e-07, + "loss": 0.8662, + "step": 190110 + }, + { + "epoch": 14.732845131543261, + "grad_norm": 1.5411625250194256, + "learning_rate": 7.366707997520149e-07, + "loss": 0.881, + "step": 190120 + }, + { + "epoch": 14.733620055019568, + "grad_norm": 1.5674130174935832, + "learning_rate": 7.367095474271545e-07, + "loss": 0.8554, + "step": 190130 + }, + { + "epoch": 14.734394978495873, + "grad_norm": 1.6230751783381685, + "learning_rate": 7.367482951022939e-07, + "loss": 0.8762, + "step": 190140 + }, + { + "epoch": 14.73516990197218, + "grad_norm": 1.4642172070879644, + "learning_rate": 7.367870427774335e-07, + "loss": 0.8652, + "step": 190150 + }, + { + "epoch": 14.735944825448486, + "grad_norm": 1.520780738876786, + "learning_rate": 7.368257904525729e-07, + "loss": 0.8883, + "step": 190160 + }, + { + "epoch": 14.736719748924793, + "grad_norm": 1.462631721345161, + "learning_rate": 7.368645381277124e-07, + "loss": 0.8806, + "step": 190170 + }, + { + "epoch": 14.7374946724011, + "grad_norm": 1.618688094653018, + "learning_rate": 7.369032858028519e-07, + "loss": 0.8598, + "step": 190180 + }, + { + "epoch": 14.738269595877407, + "grad_norm": 1.6381855815726216, + "learning_rate": 7.369420334779914e-07, + "loss": 0.884, + "step": 190190 + }, + { + "epoch": 14.739044519353714, + "grad_norm": 1.5579765587427588, + "learning_rate": 7.369807811531308e-07, + "loss": 0.8501, + "step": 190200 + }, + { + "epoch": 14.73981944283002, + "grad_norm": 1.4859479923728287, + "learning_rate": 7.370195288282704e-07, + "loss": 0.8554, + "step": 190210 + }, + { + "epoch": 14.740594366306327, + "grad_norm": 1.4991601161683459, + "learning_rate": 7.370582765034098e-07, + "loss": 0.8648, + "step": 190220 + }, + { + "epoch": 14.741369289782634, + "grad_norm": 1.4689613374597599, + "learning_rate": 7.370970241785494e-07, + "loss": 0.8595, + "step": 190230 + }, + { + "epoch": 14.74214421325894, + "grad_norm": 1.4762021586546183, + "learning_rate": 7.371357718536888e-07, + "loss": 0.8931, + "step": 190240 + }, + { + "epoch": 14.742919136735248, + "grad_norm": 1.517090812477274, + "learning_rate": 7.371745195288284e-07, + "loss": 0.8658, + "step": 190250 + }, + { + "epoch": 14.743694060211554, + "grad_norm": 1.569830116657777, + "learning_rate": 7.372132672039678e-07, + "loss": 0.873, + "step": 190260 + }, + { + "epoch": 14.744468983687861, + "grad_norm": 1.640025554296372, + "learning_rate": 7.372520148791073e-07, + "loss": 0.8522, + "step": 190270 + }, + { + "epoch": 14.745243907164168, + "grad_norm": 1.4730337223586758, + "learning_rate": 7.372907625542468e-07, + "loss": 0.8775, + "step": 190280 + }, + { + "epoch": 14.746018830640475, + "grad_norm": 1.5373241580460515, + "learning_rate": 7.373295102293864e-07, + "loss": 0.8588, + "step": 190290 + }, + { + "epoch": 14.746793754116782, + "grad_norm": 1.5304906187601515, + "learning_rate": 7.373682579045258e-07, + "loss": 0.8534, + "step": 190300 + }, + { + "epoch": 14.747568677593087, + "grad_norm": 1.605385613570655, + "learning_rate": 7.374070055796653e-07, + "loss": 0.8624, + "step": 190310 + }, + { + "epoch": 14.748343601069394, + "grad_norm": 1.633091992830315, + "learning_rate": 7.374457532548047e-07, + "loss": 0.8487, + "step": 190320 + }, + { + "epoch": 14.7491185245457, + "grad_norm": 1.5190145990520507, + "learning_rate": 7.374845009299443e-07, + "loss": 0.8778, + "step": 190330 + }, + { + "epoch": 14.749893448022007, + "grad_norm": 1.4899716951622815, + "learning_rate": 7.375232486050837e-07, + "loss": 0.8581, + "step": 190340 + }, + { + "epoch": 14.750668371498314, + "grad_norm": 1.458459763290813, + "learning_rate": 7.375619962802233e-07, + "loss": 0.8587, + "step": 190350 + }, + { + "epoch": 14.75144329497462, + "grad_norm": 1.603027110471485, + "learning_rate": 7.376007439553627e-07, + "loss": 0.8707, + "step": 190360 + }, + { + "epoch": 14.752218218450928, + "grad_norm": 1.6458813374251533, + "learning_rate": 7.376394916305022e-07, + "loss": 0.8552, + "step": 190370 + }, + { + "epoch": 14.752993141927234, + "grad_norm": 1.4778368585529271, + "learning_rate": 7.376782393056417e-07, + "loss": 0.859, + "step": 190380 + }, + { + "epoch": 14.753768065403541, + "grad_norm": 1.4476118047075845, + "learning_rate": 7.377169869807813e-07, + "loss": 0.8807, + "step": 190390 + }, + { + "epoch": 14.754542988879848, + "grad_norm": 1.6960036708193122, + "learning_rate": 7.377557346559207e-07, + "loss": 0.8691, + "step": 190400 + }, + { + "epoch": 14.755317912356155, + "grad_norm": 1.523774594152699, + "learning_rate": 7.377944823310602e-07, + "loss": 0.8743, + "step": 190410 + }, + { + "epoch": 14.756092835832462, + "grad_norm": 1.5395343328351792, + "learning_rate": 7.378332300061996e-07, + "loss": 0.8646, + "step": 190420 + }, + { + "epoch": 14.756867759308768, + "grad_norm": 1.4780012596761478, + "learning_rate": 7.378719776813393e-07, + "loss": 0.8653, + "step": 190430 + }, + { + "epoch": 14.757642682785075, + "grad_norm": 1.5611469312507122, + "learning_rate": 7.379107253564787e-07, + "loss": 0.8871, + "step": 190440 + }, + { + "epoch": 14.758417606261382, + "grad_norm": 1.5945458772304462, + "learning_rate": 7.379494730316182e-07, + "loss": 0.8761, + "step": 190450 + }, + { + "epoch": 14.759192529737689, + "grad_norm": 1.5442307479419153, + "learning_rate": 7.379882207067576e-07, + "loss": 0.8836, + "step": 190460 + }, + { + "epoch": 14.759967453213996, + "grad_norm": 1.5411587134123563, + "learning_rate": 7.380269683818972e-07, + "loss": 0.8719, + "step": 190470 + }, + { + "epoch": 14.760742376690303, + "grad_norm": 1.6603169172147223, + "learning_rate": 7.380657160570366e-07, + "loss": 0.8617, + "step": 190480 + }, + { + "epoch": 14.76151730016661, + "grad_norm": 1.538760204628353, + "learning_rate": 7.381044637321762e-07, + "loss": 0.8807, + "step": 190490 + }, + { + "epoch": 14.762292223642916, + "grad_norm": 1.5722040379020779, + "learning_rate": 7.381432114073156e-07, + "loss": 0.8503, + "step": 190500 + }, + { + "epoch": 14.762292223642916, + "eval_loss": 0.8916757106781006, + "eval_runtime": 333.557, + "eval_samples_per_second": 34.39, + "eval_steps_per_second": 8.598, + "step": 190500 + }, + { + "epoch": 14.763067147119221, + "grad_norm": 1.5362851548062546, + "learning_rate": 7.381819590824551e-07, + "loss": 0.8632, + "step": 190510 + }, + { + "epoch": 14.763842070595528, + "grad_norm": 1.4776565435601932, + "learning_rate": 7.382207067575945e-07, + "loss": 0.8524, + "step": 190520 + }, + { + "epoch": 14.764616994071835, + "grad_norm": 1.627831927278165, + "learning_rate": 7.382594544327342e-07, + "loss": 0.8575, + "step": 190530 + }, + { + "epoch": 14.765391917548142, + "grad_norm": 1.4996710669914874, + "learning_rate": 7.382982021078736e-07, + "loss": 0.8528, + "step": 190540 + }, + { + "epoch": 14.766166841024448, + "grad_norm": 1.5470928227862375, + "learning_rate": 7.383369497830131e-07, + "loss": 0.8692, + "step": 190550 + }, + { + "epoch": 14.766941764500755, + "grad_norm": 1.579634775707904, + "learning_rate": 7.383756974581525e-07, + "loss": 0.8765, + "step": 190560 + }, + { + "epoch": 14.767716687977062, + "grad_norm": 1.5615699499483366, + "learning_rate": 7.384144451332922e-07, + "loss": 0.8756, + "step": 190570 + }, + { + "epoch": 14.768491611453369, + "grad_norm": 1.4815901021088544, + "learning_rate": 7.384531928084316e-07, + "loss": 0.8685, + "step": 190580 + }, + { + "epoch": 14.769266534929676, + "grad_norm": 1.5843050847295619, + "learning_rate": 7.384919404835711e-07, + "loss": 0.875, + "step": 190590 + }, + { + "epoch": 14.770041458405982, + "grad_norm": 1.7188616499122316, + "learning_rate": 7.385306881587105e-07, + "loss": 0.9025, + "step": 190600 + }, + { + "epoch": 14.77081638188229, + "grad_norm": 1.5161879220809058, + "learning_rate": 7.3856943583385e-07, + "loss": 0.8748, + "step": 190610 + }, + { + "epoch": 14.771591305358596, + "grad_norm": 1.5581320728976957, + "learning_rate": 7.386081835089895e-07, + "loss": 0.8668, + "step": 190620 + }, + { + "epoch": 14.772366228834903, + "grad_norm": 1.4606787840692173, + "learning_rate": 7.386469311841291e-07, + "loss": 0.853, + "step": 190630 + }, + { + "epoch": 14.77314115231121, + "grad_norm": 1.6008615103237136, + "learning_rate": 7.386856788592685e-07, + "loss": 0.856, + "step": 190640 + }, + { + "epoch": 14.773916075787517, + "grad_norm": 1.4936426413669797, + "learning_rate": 7.38724426534408e-07, + "loss": 0.8625, + "step": 190650 + }, + { + "epoch": 14.774690999263823, + "grad_norm": 1.4524779713425768, + "learning_rate": 7.387631742095474e-07, + "loss": 0.8458, + "step": 190660 + }, + { + "epoch": 14.77546592274013, + "grad_norm": 1.4669930763580192, + "learning_rate": 7.388019218846871e-07, + "loss": 0.8566, + "step": 190670 + }, + { + "epoch": 14.776240846216435, + "grad_norm": 1.7228547831131706, + "learning_rate": 7.388406695598265e-07, + "loss": 0.8536, + "step": 190680 + }, + { + "epoch": 14.777015769692742, + "grad_norm": 1.5908314777680295, + "learning_rate": 7.38879417234966e-07, + "loss": 0.8634, + "step": 190690 + }, + { + "epoch": 14.777790693169049, + "grad_norm": 1.5040761000712994, + "learning_rate": 7.389181649101054e-07, + "loss": 0.8774, + "step": 190700 + }, + { + "epoch": 14.778565616645356, + "grad_norm": 1.534795939303018, + "learning_rate": 7.389569125852449e-07, + "loss": 0.8479, + "step": 190710 + }, + { + "epoch": 14.779340540121662, + "grad_norm": 1.5222733176399526, + "learning_rate": 7.389956602603845e-07, + "loss": 0.8624, + "step": 190720 + }, + { + "epoch": 14.78011546359797, + "grad_norm": 1.528664175799337, + "learning_rate": 7.39034407935524e-07, + "loss": 0.8753, + "step": 190730 + }, + { + "epoch": 14.780890387074276, + "grad_norm": 1.603112032645023, + "learning_rate": 7.390731556106634e-07, + "loss": 0.8538, + "step": 190740 + }, + { + "epoch": 14.781665310550583, + "grad_norm": 1.6589822002757877, + "learning_rate": 7.391119032858029e-07, + "loss": 0.8588, + "step": 190750 + }, + { + "epoch": 14.78244023402689, + "grad_norm": 1.5754784754856799, + "learning_rate": 7.391506509609423e-07, + "loss": 0.8827, + "step": 190760 + }, + { + "epoch": 14.783215157503196, + "grad_norm": 1.5311717086796401, + "learning_rate": 7.39189398636082e-07, + "loss": 0.8674, + "step": 190770 + }, + { + "epoch": 14.783990080979503, + "grad_norm": 1.5754329738998014, + "learning_rate": 7.392281463112214e-07, + "loss": 0.8664, + "step": 190780 + }, + { + "epoch": 14.78476500445581, + "grad_norm": 1.4693590182961902, + "learning_rate": 7.392668939863609e-07, + "loss": 0.8606, + "step": 190790 + }, + { + "epoch": 14.785539927932117, + "grad_norm": 1.4917212940850424, + "learning_rate": 7.393056416615003e-07, + "loss": 0.8539, + "step": 190800 + }, + { + "epoch": 14.786314851408424, + "grad_norm": 1.563445802800563, + "learning_rate": 7.393443893366399e-07, + "loss": 0.865, + "step": 190810 + }, + { + "epoch": 14.78708977488473, + "grad_norm": 1.5912494376811437, + "learning_rate": 7.393831370117794e-07, + "loss": 0.8661, + "step": 190820 + }, + { + "epoch": 14.787864698361037, + "grad_norm": 1.5263928034979912, + "learning_rate": 7.394218846869189e-07, + "loss": 0.8681, + "step": 190830 + }, + { + "epoch": 14.788639621837344, + "grad_norm": 1.5971716849097632, + "learning_rate": 7.394606323620583e-07, + "loss": 0.8742, + "step": 190840 + }, + { + "epoch": 14.789414545313651, + "grad_norm": 1.5591516143093012, + "learning_rate": 7.394993800371978e-07, + "loss": 0.8541, + "step": 190850 + }, + { + "epoch": 14.790189468789958, + "grad_norm": 1.5321035040224211, + "learning_rate": 7.395381277123373e-07, + "loss": 0.8586, + "step": 190860 + }, + { + "epoch": 14.790964392266265, + "grad_norm": 1.6100488758120157, + "learning_rate": 7.395768753874769e-07, + "loss": 0.859, + "step": 190870 + }, + { + "epoch": 14.79173931574257, + "grad_norm": 1.5943053466444255, + "learning_rate": 7.396156230626163e-07, + "loss": 0.8395, + "step": 190880 + }, + { + "epoch": 14.792514239218876, + "grad_norm": 1.503960233905795, + "learning_rate": 7.396543707377558e-07, + "loss": 0.88, + "step": 190890 + }, + { + "epoch": 14.793289162695183, + "grad_norm": 1.542269487792935, + "learning_rate": 7.396931184128952e-07, + "loss": 0.8601, + "step": 190900 + }, + { + "epoch": 14.79406408617149, + "grad_norm": 1.5484125094695849, + "learning_rate": 7.397318660880348e-07, + "loss": 0.8568, + "step": 190910 + }, + { + "epoch": 14.794839009647797, + "grad_norm": 1.475991852883927, + "learning_rate": 7.397706137631743e-07, + "loss": 0.8557, + "step": 190920 + }, + { + "epoch": 14.795613933124104, + "grad_norm": 1.5543917011759945, + "learning_rate": 7.398093614383138e-07, + "loss": 0.8589, + "step": 190930 + }, + { + "epoch": 14.79638885660041, + "grad_norm": 1.5177246434704665, + "learning_rate": 7.398481091134532e-07, + "loss": 0.8712, + "step": 190940 + }, + { + "epoch": 14.797163780076717, + "grad_norm": 1.5709717758314026, + "learning_rate": 7.398868567885928e-07, + "loss": 0.8764, + "step": 190950 + }, + { + "epoch": 14.797938703553024, + "grad_norm": 1.60588996169625, + "learning_rate": 7.399256044637322e-07, + "loss": 0.8746, + "step": 190960 + }, + { + "epoch": 14.798713627029331, + "grad_norm": 1.5047681353306503, + "learning_rate": 7.399643521388718e-07, + "loss": 0.8532, + "step": 190970 + }, + { + "epoch": 14.799488550505638, + "grad_norm": 1.506620346838556, + "learning_rate": 7.400030998140112e-07, + "loss": 0.866, + "step": 190980 + }, + { + "epoch": 14.800263473981945, + "grad_norm": 1.4954896584836646, + "learning_rate": 7.400418474891507e-07, + "loss": 0.8545, + "step": 190990 + }, + { + "epoch": 14.801038397458251, + "grad_norm": 1.5417445187953012, + "learning_rate": 7.400805951642902e-07, + "loss": 0.8565, + "step": 191000 + }, + { + "epoch": 14.801038397458251, + "eval_loss": 0.8914669752120972, + "eval_runtime": 333.697, + "eval_samples_per_second": 34.376, + "eval_steps_per_second": 8.595, + "step": 191000 + }, + { + "epoch": 14.801813320934558, + "grad_norm": 1.4751794717000069, + "learning_rate": 7.401193428394297e-07, + "loss": 0.8613, + "step": 191010 + }, + { + "epoch": 14.802588244410865, + "grad_norm": 1.4753869247546607, + "learning_rate": 7.401580905145692e-07, + "loss": 0.8701, + "step": 191020 + }, + { + "epoch": 14.803363167887172, + "grad_norm": 1.5951633123677265, + "learning_rate": 7.401968381897087e-07, + "loss": 0.8404, + "step": 191030 + }, + { + "epoch": 14.804138091363479, + "grad_norm": 1.530120847622962, + "learning_rate": 7.402355858648481e-07, + "loss": 0.8675, + "step": 191040 + }, + { + "epoch": 14.804913014839785, + "grad_norm": 1.5213667174575274, + "learning_rate": 7.402743335399877e-07, + "loss": 0.8591, + "step": 191050 + }, + { + "epoch": 14.80568793831609, + "grad_norm": 1.5112090198750177, + "learning_rate": 7.403130812151271e-07, + "loss": 0.8681, + "step": 191060 + }, + { + "epoch": 14.806462861792397, + "grad_norm": 1.482229891177765, + "learning_rate": 7.403518288902667e-07, + "loss": 0.8739, + "step": 191070 + }, + { + "epoch": 14.807237785268704, + "grad_norm": 1.6043259200440683, + "learning_rate": 7.403905765654061e-07, + "loss": 0.8587, + "step": 191080 + }, + { + "epoch": 14.80801270874501, + "grad_norm": 1.5872753434184679, + "learning_rate": 7.404293242405457e-07, + "loss": 0.887, + "step": 191090 + }, + { + "epoch": 14.808787632221318, + "grad_norm": 1.5377984253567065, + "learning_rate": 7.404680719156851e-07, + "loss": 0.8465, + "step": 191100 + }, + { + "epoch": 14.809562555697624, + "grad_norm": 1.5267680141075675, + "learning_rate": 7.405068195908247e-07, + "loss": 0.8527, + "step": 191110 + }, + { + "epoch": 14.810337479173931, + "grad_norm": 1.5485149882872615, + "learning_rate": 7.405455672659641e-07, + "loss": 0.871, + "step": 191120 + }, + { + "epoch": 14.811112402650238, + "grad_norm": 1.4878918627105997, + "learning_rate": 7.405843149411036e-07, + "loss": 0.8624, + "step": 191130 + }, + { + "epoch": 14.811887326126545, + "grad_norm": 1.518296826118913, + "learning_rate": 7.406230626162431e-07, + "loss": 0.8561, + "step": 191140 + }, + { + "epoch": 14.812662249602852, + "grad_norm": 1.4956948813772948, + "learning_rate": 7.406618102913826e-07, + "loss": 0.8647, + "step": 191150 + }, + { + "epoch": 14.813437173079159, + "grad_norm": 1.4873426196644561, + "learning_rate": 7.40700557966522e-07, + "loss": 0.8614, + "step": 191160 + }, + { + "epoch": 14.814212096555465, + "grad_norm": 1.5581055546374976, + "learning_rate": 7.407393056416616e-07, + "loss": 0.8489, + "step": 191170 + }, + { + "epoch": 14.814987020031772, + "grad_norm": 1.5054048035073784, + "learning_rate": 7.40778053316801e-07, + "loss": 0.8456, + "step": 191180 + }, + { + "epoch": 14.815761943508079, + "grad_norm": 1.5462474118169631, + "learning_rate": 7.408168009919406e-07, + "loss": 0.8633, + "step": 191190 + }, + { + "epoch": 14.816536866984386, + "grad_norm": 1.6047110744649, + "learning_rate": 7.4085554866708e-07, + "loss": 0.854, + "step": 191200 + }, + { + "epoch": 14.817311790460693, + "grad_norm": 1.4762078660668738, + "learning_rate": 7.408942963422196e-07, + "loss": 0.8619, + "step": 191210 + }, + { + "epoch": 14.818086713937, + "grad_norm": 1.4950520037316517, + "learning_rate": 7.40933044017359e-07, + "loss": 0.8695, + "step": 191220 + }, + { + "epoch": 14.818861637413306, + "grad_norm": 1.5381656857688364, + "learning_rate": 7.409717916924986e-07, + "loss": 0.8907, + "step": 191230 + }, + { + "epoch": 14.819636560889613, + "grad_norm": 1.486956886402339, + "learning_rate": 7.41010539367638e-07, + "loss": 0.8768, + "step": 191240 + }, + { + "epoch": 14.820411484365918, + "grad_norm": 1.5985437053688702, + "learning_rate": 7.410492870427775e-07, + "loss": 0.8776, + "step": 191250 + }, + { + "epoch": 14.821186407842225, + "grad_norm": 1.5190936097893413, + "learning_rate": 7.41088034717917e-07, + "loss": 0.8866, + "step": 191260 + }, + { + "epoch": 14.821961331318532, + "grad_norm": 1.562776753417474, + "learning_rate": 7.411267823930565e-07, + "loss": 0.8535, + "step": 191270 + }, + { + "epoch": 14.822736254794838, + "grad_norm": 1.4461812491967216, + "learning_rate": 7.41165530068196e-07, + "loss": 0.8664, + "step": 191280 + }, + { + "epoch": 14.823511178271145, + "grad_norm": 1.6826452221308652, + "learning_rate": 7.412042777433355e-07, + "loss": 0.8591, + "step": 191290 + }, + { + "epoch": 14.824286101747452, + "grad_norm": 1.5010204489248598, + "learning_rate": 7.412430254184749e-07, + "loss": 0.8817, + "step": 191300 + }, + { + "epoch": 14.825061025223759, + "grad_norm": 1.5397930769748982, + "learning_rate": 7.412817730936145e-07, + "loss": 0.87, + "step": 191310 + }, + { + "epoch": 14.825835948700066, + "grad_norm": 1.5154255531758154, + "learning_rate": 7.413205207687539e-07, + "loss": 0.8456, + "step": 191320 + }, + { + "epoch": 14.826610872176373, + "grad_norm": 1.622599232420161, + "learning_rate": 7.413592684438935e-07, + "loss": 0.8836, + "step": 191330 + }, + { + "epoch": 14.82738579565268, + "grad_norm": 1.5925430923524486, + "learning_rate": 7.413980161190329e-07, + "loss": 0.8739, + "step": 191340 + }, + { + "epoch": 14.828160719128986, + "grad_norm": 1.6676379137584718, + "learning_rate": 7.414367637941724e-07, + "loss": 0.8667, + "step": 191350 + }, + { + "epoch": 14.828935642605293, + "grad_norm": 1.4369764316861238, + "learning_rate": 7.414755114693119e-07, + "loss": 0.8496, + "step": 191360 + }, + { + "epoch": 14.8297105660816, + "grad_norm": 1.5615097251984655, + "learning_rate": 7.415142591444515e-07, + "loss": 0.8698, + "step": 191370 + }, + { + "epoch": 14.830485489557907, + "grad_norm": 1.6920795621807765, + "learning_rate": 7.415530068195909e-07, + "loss": 0.8721, + "step": 191380 + }, + { + "epoch": 14.831260413034213, + "grad_norm": 1.543053189416324, + "learning_rate": 7.415917544947304e-07, + "loss": 0.8657, + "step": 191390 + }, + { + "epoch": 14.83203533651052, + "grad_norm": 1.6497561095185627, + "learning_rate": 7.416305021698698e-07, + "loss": 0.8785, + "step": 191400 + }, + { + "epoch": 14.832810259986827, + "grad_norm": 1.5537296782957473, + "learning_rate": 7.416692498450094e-07, + "loss": 0.8654, + "step": 191410 + }, + { + "epoch": 14.833585183463134, + "grad_norm": 1.5429799380213154, + "learning_rate": 7.417079975201488e-07, + "loss": 0.871, + "step": 191420 + }, + { + "epoch": 14.834360106939439, + "grad_norm": 1.4956802536545966, + "learning_rate": 7.417467451952884e-07, + "loss": 0.86, + "step": 191430 + }, + { + "epoch": 14.835135030415746, + "grad_norm": 1.5129815988300515, + "learning_rate": 7.417854928704278e-07, + "loss": 0.8563, + "step": 191440 + }, + { + "epoch": 14.835909953892052, + "grad_norm": 1.5242123467917974, + "learning_rate": 7.418242405455673e-07, + "loss": 0.8545, + "step": 191450 + }, + { + "epoch": 14.83668487736836, + "grad_norm": 1.4883751753530894, + "learning_rate": 7.418629882207068e-07, + "loss": 0.8712, + "step": 191460 + }, + { + "epoch": 14.837459800844666, + "grad_norm": 1.4913170094983903, + "learning_rate": 7.419017358958464e-07, + "loss": 0.8751, + "step": 191470 + }, + { + "epoch": 14.838234724320973, + "grad_norm": 1.4583946715477214, + "learning_rate": 7.419404835709858e-07, + "loss": 0.8665, + "step": 191480 + }, + { + "epoch": 14.83900964779728, + "grad_norm": 1.547116073778088, + "learning_rate": 7.419792312461253e-07, + "loss": 0.8615, + "step": 191490 + }, + { + "epoch": 14.839784571273587, + "grad_norm": 1.6214667790132424, + "learning_rate": 7.420179789212647e-07, + "loss": 0.8719, + "step": 191500 + }, + { + "epoch": 14.839784571273587, + "eval_loss": 0.8910685181617737, + "eval_runtime": 334.532, + "eval_samples_per_second": 34.29, + "eval_steps_per_second": 8.573, + "step": 191500 + }, + { + "epoch": 14.840559494749893, + "grad_norm": 1.6095440295732484, + "learning_rate": 7.420567265964044e-07, + "loss": 0.8569, + "step": 191510 + }, + { + "epoch": 14.8413344182262, + "grad_norm": 1.6186416070028002, + "learning_rate": 7.420954742715438e-07, + "loss": 0.8774, + "step": 191520 + }, + { + "epoch": 14.842109341702507, + "grad_norm": 1.4804012503319222, + "learning_rate": 7.421342219466833e-07, + "loss": 0.8544, + "step": 191530 + }, + { + "epoch": 14.842884265178814, + "grad_norm": 1.5871590507731317, + "learning_rate": 7.421729696218227e-07, + "loss": 0.859, + "step": 191540 + }, + { + "epoch": 14.84365918865512, + "grad_norm": 1.5687863239839583, + "learning_rate": 7.422117172969622e-07, + "loss": 0.8474, + "step": 191550 + }, + { + "epoch": 14.844434112131427, + "grad_norm": 1.58421967120788, + "learning_rate": 7.422504649721017e-07, + "loss": 0.8637, + "step": 191560 + }, + { + "epoch": 14.845209035607734, + "grad_norm": 1.496452976793545, + "learning_rate": 7.422892126472413e-07, + "loss": 0.8643, + "step": 191570 + }, + { + "epoch": 14.845983959084041, + "grad_norm": 1.5692803562501556, + "learning_rate": 7.423279603223807e-07, + "loss": 0.8729, + "step": 191580 + }, + { + "epoch": 14.846758882560348, + "grad_norm": 1.5264774976655948, + "learning_rate": 7.423667079975202e-07, + "loss": 0.8642, + "step": 191590 + }, + { + "epoch": 14.847533806036655, + "grad_norm": 1.5454208686747772, + "learning_rate": 7.424054556726596e-07, + "loss": 0.8687, + "step": 191600 + }, + { + "epoch": 14.848308729512961, + "grad_norm": 1.584432309605185, + "learning_rate": 7.424442033477993e-07, + "loss": 0.8679, + "step": 191610 + }, + { + "epoch": 14.849083652989268, + "grad_norm": 1.4233687578811645, + "learning_rate": 7.424829510229387e-07, + "loss": 0.8492, + "step": 191620 + }, + { + "epoch": 14.849858576465573, + "grad_norm": 1.5303612816805512, + "learning_rate": 7.425216986980782e-07, + "loss": 0.8796, + "step": 191630 + }, + { + "epoch": 14.85063349994188, + "grad_norm": 1.6132111307466421, + "learning_rate": 7.425604463732176e-07, + "loss": 0.8583, + "step": 191640 + }, + { + "epoch": 14.851408423418187, + "grad_norm": 1.5061403260335722, + "learning_rate": 7.425991940483573e-07, + "loss": 0.8647, + "step": 191650 + }, + { + "epoch": 14.852183346894494, + "grad_norm": 1.5511938116981405, + "learning_rate": 7.426379417234967e-07, + "loss": 0.8706, + "step": 191660 + }, + { + "epoch": 14.8529582703708, + "grad_norm": 1.5600449792313027, + "learning_rate": 7.426766893986362e-07, + "loss": 0.8781, + "step": 191670 + }, + { + "epoch": 14.853733193847107, + "grad_norm": 1.7113618294766104, + "learning_rate": 7.427154370737756e-07, + "loss": 0.8845, + "step": 191680 + }, + { + "epoch": 14.854508117323414, + "grad_norm": 1.5037485991224817, + "learning_rate": 7.427541847489151e-07, + "loss": 0.858, + "step": 191690 + }, + { + "epoch": 14.855283040799721, + "grad_norm": 1.6180744458527765, + "learning_rate": 7.427929324240545e-07, + "loss": 0.8693, + "step": 191700 + }, + { + "epoch": 14.856057964276028, + "grad_norm": 1.533283762232357, + "learning_rate": 7.428316800991942e-07, + "loss": 0.8629, + "step": 191710 + }, + { + "epoch": 14.856832887752335, + "grad_norm": 1.5809506766695776, + "learning_rate": 7.428704277743336e-07, + "loss": 0.8773, + "step": 191720 + }, + { + "epoch": 14.857607811228641, + "grad_norm": 1.5224008055384768, + "learning_rate": 7.429091754494731e-07, + "loss": 0.8796, + "step": 191730 + }, + { + "epoch": 14.858382734704948, + "grad_norm": 1.5204123684294186, + "learning_rate": 7.429479231246125e-07, + "loss": 0.8903, + "step": 191740 + }, + { + "epoch": 14.859157658181255, + "grad_norm": 1.5319678872715514, + "learning_rate": 7.429866707997522e-07, + "loss": 0.8799, + "step": 191750 + }, + { + "epoch": 14.859932581657562, + "grad_norm": 1.5374232926175784, + "learning_rate": 7.430254184748916e-07, + "loss": 0.8923, + "step": 191760 + }, + { + "epoch": 14.860707505133869, + "grad_norm": 1.5432678195253213, + "learning_rate": 7.430641661500311e-07, + "loss": 0.8648, + "step": 191770 + }, + { + "epoch": 14.861482428610175, + "grad_norm": 1.624736630053486, + "learning_rate": 7.431029138251705e-07, + "loss": 0.8645, + "step": 191780 + }, + { + "epoch": 14.862257352086482, + "grad_norm": 1.5292605344402868, + "learning_rate": 7.431416615003101e-07, + "loss": 0.8661, + "step": 191790 + }, + { + "epoch": 14.863032275562787, + "grad_norm": 1.4381339977457064, + "learning_rate": 7.431804091754496e-07, + "loss": 0.869, + "step": 191800 + }, + { + "epoch": 14.863807199039094, + "grad_norm": 1.5190438231998449, + "learning_rate": 7.432191568505891e-07, + "loss": 0.8642, + "step": 191810 + }, + { + "epoch": 14.864582122515401, + "grad_norm": 1.5250771245490748, + "learning_rate": 7.432579045257285e-07, + "loss": 0.8545, + "step": 191820 + }, + { + "epoch": 14.865357045991708, + "grad_norm": 1.4971856446379643, + "learning_rate": 7.43296652200868e-07, + "loss": 0.8654, + "step": 191830 + }, + { + "epoch": 14.866131969468015, + "grad_norm": 1.5172128060601584, + "learning_rate": 7.433353998760074e-07, + "loss": 0.8724, + "step": 191840 + }, + { + "epoch": 14.866906892944321, + "grad_norm": 1.6366322065864538, + "learning_rate": 7.433741475511471e-07, + "loss": 0.8652, + "step": 191850 + }, + { + "epoch": 14.867681816420628, + "grad_norm": 1.58736165614396, + "learning_rate": 7.434128952262865e-07, + "loss": 0.862, + "step": 191860 + }, + { + "epoch": 14.868456739896935, + "grad_norm": 1.5375118625908495, + "learning_rate": 7.43451642901426e-07, + "loss": 0.8604, + "step": 191870 + }, + { + "epoch": 14.869231663373242, + "grad_norm": 1.4728743894335852, + "learning_rate": 7.434903905765654e-07, + "loss": 0.8909, + "step": 191880 + }, + { + "epoch": 14.870006586849549, + "grad_norm": 1.5715771210388672, + "learning_rate": 7.43529138251705e-07, + "loss": 0.8636, + "step": 191890 + }, + { + "epoch": 14.870781510325855, + "grad_norm": 1.5835058221006357, + "learning_rate": 7.435678859268445e-07, + "loss": 0.8749, + "step": 191900 + }, + { + "epoch": 14.871556433802162, + "grad_norm": 1.5646269766447651, + "learning_rate": 7.43606633601984e-07, + "loss": 0.8779, + "step": 191910 + }, + { + "epoch": 14.872331357278469, + "grad_norm": 1.419042105277589, + "learning_rate": 7.436453812771234e-07, + "loss": 0.8544, + "step": 191920 + }, + { + "epoch": 14.873106280754776, + "grad_norm": 1.5394270694006769, + "learning_rate": 7.43684128952263e-07, + "loss": 0.8542, + "step": 191930 + }, + { + "epoch": 14.873881204231083, + "grad_norm": 1.5380883967415249, + "learning_rate": 7.437228766274024e-07, + "loss": 0.8499, + "step": 191940 + }, + { + "epoch": 14.87465612770739, + "grad_norm": 1.4873362484426627, + "learning_rate": 7.43761624302542e-07, + "loss": 0.8632, + "step": 191950 + }, + { + "epoch": 14.875431051183696, + "grad_norm": 1.6155320162354458, + "learning_rate": 7.438003719776814e-07, + "loss": 0.8864, + "step": 191960 + }, + { + "epoch": 14.876205974660003, + "grad_norm": 1.5332417015232924, + "learning_rate": 7.438391196528209e-07, + "loss": 0.8744, + "step": 191970 + }, + { + "epoch": 14.87698089813631, + "grad_norm": 1.6631730926898678, + "learning_rate": 7.438778673279603e-07, + "loss": 0.8555, + "step": 191980 + }, + { + "epoch": 14.877755821612617, + "grad_norm": 1.563506680794737, + "learning_rate": 7.439166150030999e-07, + "loss": 0.8733, + "step": 191990 + }, + { + "epoch": 14.878530745088922, + "grad_norm": 1.5568033578703167, + "learning_rate": 7.439553626782394e-07, + "loss": 0.8741, + "step": 192000 + }, + { + "epoch": 14.878530745088922, + "eval_loss": 0.8913910388946533, + "eval_runtime": 330.6447, + "eval_samples_per_second": 34.693, + "eval_steps_per_second": 8.674, + "step": 192000 + }, + { + "epoch": 14.879305668565229, + "grad_norm": 1.5845841224765276, + "learning_rate": 7.439941103533789e-07, + "loss": 0.8513, + "step": 192010 + }, + { + "epoch": 14.880080592041535, + "grad_norm": 1.4992600998275287, + "learning_rate": 7.440328580285183e-07, + "loss": 0.8662, + "step": 192020 + }, + { + "epoch": 14.880855515517842, + "grad_norm": 1.4424062730564748, + "learning_rate": 7.440716057036579e-07, + "loss": 0.8901, + "step": 192030 + }, + { + "epoch": 14.881630438994149, + "grad_norm": 1.531124204169404, + "learning_rate": 7.441103533787973e-07, + "loss": 0.8746, + "step": 192040 + }, + { + "epoch": 14.882405362470456, + "grad_norm": 1.5395948438080411, + "learning_rate": 7.441491010539369e-07, + "loss": 0.8556, + "step": 192050 + }, + { + "epoch": 14.883180285946763, + "grad_norm": 1.5364666170629946, + "learning_rate": 7.441878487290763e-07, + "loss": 0.8678, + "step": 192060 + }, + { + "epoch": 14.88395520942307, + "grad_norm": 1.6137539319659486, + "learning_rate": 7.442265964042159e-07, + "loss": 0.891, + "step": 192070 + }, + { + "epoch": 14.884730132899376, + "grad_norm": 1.4601372307198677, + "learning_rate": 7.442653440793553e-07, + "loss": 0.8681, + "step": 192080 + }, + { + "epoch": 14.885505056375683, + "grad_norm": 1.510772264837995, + "learning_rate": 7.443040917544948e-07, + "loss": 0.8868, + "step": 192090 + }, + { + "epoch": 14.88627997985199, + "grad_norm": 1.5607021208980083, + "learning_rate": 7.443428394296343e-07, + "loss": 0.8612, + "step": 192100 + }, + { + "epoch": 14.887054903328297, + "grad_norm": 1.5534486939726415, + "learning_rate": 7.443815871047738e-07, + "loss": 0.8641, + "step": 192110 + }, + { + "epoch": 14.887829826804603, + "grad_norm": 1.5518146992983655, + "learning_rate": 7.444203347799132e-07, + "loss": 0.8742, + "step": 192120 + }, + { + "epoch": 14.88860475028091, + "grad_norm": 1.6195133109408215, + "learning_rate": 7.444590824550528e-07, + "loss": 0.8451, + "step": 192130 + }, + { + "epoch": 14.889379673757217, + "grad_norm": 1.5386515801286305, + "learning_rate": 7.444978301301922e-07, + "loss": 0.8784, + "step": 192140 + }, + { + "epoch": 14.890154597233524, + "grad_norm": 1.4314631968746094, + "learning_rate": 7.445365778053318e-07, + "loss": 0.8558, + "step": 192150 + }, + { + "epoch": 14.89092952070983, + "grad_norm": 1.4674438684046613, + "learning_rate": 7.445753254804712e-07, + "loss": 0.8804, + "step": 192160 + }, + { + "epoch": 14.891704444186136, + "grad_norm": 1.528944642766799, + "learning_rate": 7.446140731556108e-07, + "loss": 0.8572, + "step": 192170 + }, + { + "epoch": 14.892479367662443, + "grad_norm": 1.490822010015024, + "learning_rate": 7.446528208307502e-07, + "loss": 0.8535, + "step": 192180 + }, + { + "epoch": 14.89325429113875, + "grad_norm": 1.5574415903717522, + "learning_rate": 7.446915685058897e-07, + "loss": 0.8616, + "step": 192190 + }, + { + "epoch": 14.894029214615056, + "grad_norm": 1.5362161126969505, + "learning_rate": 7.447303161810292e-07, + "loss": 0.8668, + "step": 192200 + }, + { + "epoch": 14.894804138091363, + "grad_norm": 1.4411014376781055, + "learning_rate": 7.447690638561687e-07, + "loss": 0.8632, + "step": 192210 + }, + { + "epoch": 14.89557906156767, + "grad_norm": 1.5987058269049026, + "learning_rate": 7.448078115313082e-07, + "loss": 0.8587, + "step": 192220 + }, + { + "epoch": 14.896353985043977, + "grad_norm": 1.4979510590389529, + "learning_rate": 7.448465592064477e-07, + "loss": 0.8758, + "step": 192230 + }, + { + "epoch": 14.897128908520283, + "grad_norm": 1.591432155200343, + "learning_rate": 7.448853068815871e-07, + "loss": 0.8427, + "step": 192240 + }, + { + "epoch": 14.89790383199659, + "grad_norm": 1.6089173513582573, + "learning_rate": 7.449240545567267e-07, + "loss": 0.8599, + "step": 192250 + }, + { + "epoch": 14.898678755472897, + "grad_norm": 1.4752172538126047, + "learning_rate": 7.449628022318661e-07, + "loss": 0.8673, + "step": 192260 + }, + { + "epoch": 14.899453678949204, + "grad_norm": 1.7048171074893081, + "learning_rate": 7.450015499070057e-07, + "loss": 0.8758, + "step": 192270 + }, + { + "epoch": 14.90022860242551, + "grad_norm": 1.5946490298804583, + "learning_rate": 7.450402975821451e-07, + "loss": 0.8774, + "step": 192280 + }, + { + "epoch": 14.901003525901817, + "grad_norm": 1.5677920278939443, + "learning_rate": 7.450790452572846e-07, + "loss": 0.8773, + "step": 192290 + }, + { + "epoch": 14.901778449378124, + "grad_norm": 1.473763082437975, + "learning_rate": 7.451177929324241e-07, + "loss": 0.8832, + "step": 192300 + }, + { + "epoch": 14.902553372854431, + "grad_norm": 1.581824694609039, + "learning_rate": 7.451565406075637e-07, + "loss": 0.8686, + "step": 192310 + }, + { + "epoch": 14.903328296330738, + "grad_norm": 1.5256983069236143, + "learning_rate": 7.451952882827031e-07, + "loss": 0.8611, + "step": 192320 + }, + { + "epoch": 14.904103219807045, + "grad_norm": 1.5237840948854233, + "learning_rate": 7.452340359578426e-07, + "loss": 0.852, + "step": 192330 + }, + { + "epoch": 14.904878143283351, + "grad_norm": 1.588900734479841, + "learning_rate": 7.45272783632982e-07, + "loss": 0.8536, + "step": 192340 + }, + { + "epoch": 14.905653066759658, + "grad_norm": 1.4644743777592504, + "learning_rate": 7.453115313081216e-07, + "loss": 0.8635, + "step": 192350 + }, + { + "epoch": 14.906427990235965, + "grad_norm": 1.6138441371461336, + "learning_rate": 7.453502789832611e-07, + "loss": 0.8666, + "step": 192360 + }, + { + "epoch": 14.90720291371227, + "grad_norm": 1.6034625826856144, + "learning_rate": 7.453890266584006e-07, + "loss": 0.875, + "step": 192370 + }, + { + "epoch": 14.907977837188577, + "grad_norm": 1.6994243755261227, + "learning_rate": 7.4542777433354e-07, + "loss": 0.8698, + "step": 192380 + }, + { + "epoch": 14.908752760664884, + "grad_norm": 1.496068611310191, + "learning_rate": 7.454665220086795e-07, + "loss": 0.8539, + "step": 192390 + }, + { + "epoch": 14.90952768414119, + "grad_norm": 1.672422149081776, + "learning_rate": 7.45505269683819e-07, + "loss": 0.8781, + "step": 192400 + }, + { + "epoch": 14.910302607617497, + "grad_norm": 1.642448564410914, + "learning_rate": 7.455440173589586e-07, + "loss": 0.8772, + "step": 192410 + }, + { + "epoch": 14.911077531093804, + "grad_norm": 1.5891569262673162, + "learning_rate": 7.45582765034098e-07, + "loss": 0.888, + "step": 192420 + }, + { + "epoch": 14.911852454570111, + "grad_norm": 1.4640543216402262, + "learning_rate": 7.456215127092375e-07, + "loss": 0.8712, + "step": 192430 + }, + { + "epoch": 14.912627378046418, + "grad_norm": 1.5032058404504371, + "learning_rate": 7.456602603843769e-07, + "loss": 0.8774, + "step": 192440 + }, + { + "epoch": 14.913402301522725, + "grad_norm": 1.4562794240196901, + "learning_rate": 7.456990080595166e-07, + "loss": 0.872, + "step": 192450 + }, + { + "epoch": 14.914177224999031, + "grad_norm": 1.6245195157258605, + "learning_rate": 7.45737755734656e-07, + "loss": 0.8662, + "step": 192460 + }, + { + "epoch": 14.914952148475338, + "grad_norm": 1.440081267201478, + "learning_rate": 7.457765034097955e-07, + "loss": 0.8589, + "step": 192470 + }, + { + "epoch": 14.915727071951645, + "grad_norm": 1.4433475103118938, + "learning_rate": 7.458152510849349e-07, + "loss": 0.8486, + "step": 192480 + }, + { + "epoch": 14.916501995427952, + "grad_norm": 1.5101618462177762, + "learning_rate": 7.458539987600744e-07, + "loss": 0.8935, + "step": 192490 + }, + { + "epoch": 14.917276918904259, + "grad_norm": 1.516698633190296, + "learning_rate": 7.45892746435214e-07, + "loss": 0.8667, + "step": 192500 + }, + { + "epoch": 14.917276918904259, + "eval_loss": 0.8910549879074097, + "eval_runtime": 328.6639, + "eval_samples_per_second": 34.902, + "eval_steps_per_second": 8.726, + "step": 192500 + }, + { + "epoch": 14.918051842380565, + "grad_norm": 1.5703204862254927, + "learning_rate": 7.459314941103535e-07, + "loss": 0.8658, + "step": 192510 + }, + { + "epoch": 14.918826765856872, + "grad_norm": 1.4435821898109207, + "learning_rate": 7.459702417854929e-07, + "loss": 0.866, + "step": 192520 + }, + { + "epoch": 14.919601689333179, + "grad_norm": 1.419103939806216, + "learning_rate": 7.460089894606324e-07, + "loss": 0.8431, + "step": 192530 + }, + { + "epoch": 14.920376612809484, + "grad_norm": 1.5813603842823039, + "learning_rate": 7.460477371357718e-07, + "loss": 0.8566, + "step": 192540 + }, + { + "epoch": 14.921151536285791, + "grad_norm": 1.5712798032537425, + "learning_rate": 7.460864848109115e-07, + "loss": 0.8585, + "step": 192550 + }, + { + "epoch": 14.921926459762098, + "grad_norm": 1.529473656666096, + "learning_rate": 7.461252324860509e-07, + "loss": 0.8649, + "step": 192560 + }, + { + "epoch": 14.922701383238405, + "grad_norm": 1.5436540973225708, + "learning_rate": 7.461639801611904e-07, + "loss": 0.8513, + "step": 192570 + }, + { + "epoch": 14.923476306714711, + "grad_norm": 1.4598129334109078, + "learning_rate": 7.462027278363298e-07, + "loss": 0.843, + "step": 192580 + }, + { + "epoch": 14.924251230191018, + "grad_norm": 1.5315008586563639, + "learning_rate": 7.462414755114695e-07, + "loss": 0.8549, + "step": 192590 + }, + { + "epoch": 14.925026153667325, + "grad_norm": 1.4344716481054196, + "learning_rate": 7.462802231866089e-07, + "loss": 0.8585, + "step": 192600 + }, + { + "epoch": 14.925801077143632, + "grad_norm": 1.5382184441690758, + "learning_rate": 7.463189708617484e-07, + "loss": 0.8711, + "step": 192610 + }, + { + "epoch": 14.926576000619939, + "grad_norm": 1.5650296246262825, + "learning_rate": 7.463577185368878e-07, + "loss": 0.8699, + "step": 192620 + }, + { + "epoch": 14.927350924096245, + "grad_norm": 1.5131994925713612, + "learning_rate": 7.463964662120273e-07, + "loss": 0.8475, + "step": 192630 + }, + { + "epoch": 14.928125847572552, + "grad_norm": 1.5372203979559236, + "learning_rate": 7.464352138871669e-07, + "loss": 0.8863, + "step": 192640 + }, + { + "epoch": 14.928900771048859, + "grad_norm": 1.4415201973928147, + "learning_rate": 7.464739615623064e-07, + "loss": 0.9005, + "step": 192650 + }, + { + "epoch": 14.929675694525166, + "grad_norm": 1.5148002544275487, + "learning_rate": 7.465127092374458e-07, + "loss": 0.8795, + "step": 192660 + }, + { + "epoch": 14.930450618001473, + "grad_norm": 1.542429629353994, + "learning_rate": 7.465514569125853e-07, + "loss": 0.8686, + "step": 192670 + }, + { + "epoch": 14.93122554147778, + "grad_norm": 1.5760307657668315, + "learning_rate": 7.465902045877247e-07, + "loss": 0.8741, + "step": 192680 + }, + { + "epoch": 14.932000464954086, + "grad_norm": 1.5147844004313331, + "learning_rate": 7.466289522628644e-07, + "loss": 0.8627, + "step": 192690 + }, + { + "epoch": 14.932775388430393, + "grad_norm": 1.5622275486012558, + "learning_rate": 7.466676999380038e-07, + "loss": 0.8919, + "step": 192700 + }, + { + "epoch": 14.9335503119067, + "grad_norm": 1.4304534373362918, + "learning_rate": 7.467064476131433e-07, + "loss": 0.8598, + "step": 192710 + }, + { + "epoch": 14.934325235383007, + "grad_norm": 1.5656414942947094, + "learning_rate": 7.467451952882827e-07, + "loss": 0.8645, + "step": 192720 + }, + { + "epoch": 14.935100158859314, + "grad_norm": 1.5695612880991912, + "learning_rate": 7.467839429634223e-07, + "loss": 0.8772, + "step": 192730 + }, + { + "epoch": 14.935875082335619, + "grad_norm": 1.5971400907154636, + "learning_rate": 7.468226906385618e-07, + "loss": 0.8743, + "step": 192740 + }, + { + "epoch": 14.936650005811925, + "grad_norm": 1.6017198540848345, + "learning_rate": 7.468614383137013e-07, + "loss": 0.8634, + "step": 192750 + }, + { + "epoch": 14.937424929288232, + "grad_norm": 1.4576550634545178, + "learning_rate": 7.469001859888407e-07, + "loss": 0.8538, + "step": 192760 + }, + { + "epoch": 14.938199852764539, + "grad_norm": 1.537651412707396, + "learning_rate": 7.469389336639802e-07, + "loss": 0.862, + "step": 192770 + }, + { + "epoch": 14.938974776240846, + "grad_norm": 1.5478787053511214, + "learning_rate": 7.469776813391196e-07, + "loss": 0.8767, + "step": 192780 + }, + { + "epoch": 14.939749699717153, + "grad_norm": 1.5504396953203399, + "learning_rate": 7.470164290142593e-07, + "loss": 0.8476, + "step": 192790 + }, + { + "epoch": 14.94052462319346, + "grad_norm": 1.579181634310783, + "learning_rate": 7.470551766893987e-07, + "loss": 0.8693, + "step": 192800 + }, + { + "epoch": 14.941299546669766, + "grad_norm": 1.5525164823883228, + "learning_rate": 7.470939243645382e-07, + "loss": 0.8692, + "step": 192810 + }, + { + "epoch": 14.942074470146073, + "grad_norm": 1.5455119166003932, + "learning_rate": 7.471326720396776e-07, + "loss": 0.8595, + "step": 192820 + }, + { + "epoch": 14.94284939362238, + "grad_norm": 1.6221197551811273, + "learning_rate": 7.471714197148172e-07, + "loss": 0.875, + "step": 192830 + }, + { + "epoch": 14.943624317098687, + "grad_norm": 1.5755798548485542, + "learning_rate": 7.472101673899567e-07, + "loss": 0.8427, + "step": 192840 + }, + { + "epoch": 14.944399240574993, + "grad_norm": 1.5029863078286056, + "learning_rate": 7.472489150650962e-07, + "loss": 0.8531, + "step": 192850 + }, + { + "epoch": 14.9451741640513, + "grad_norm": 1.5635546074003754, + "learning_rate": 7.472876627402356e-07, + "loss": 0.8746, + "step": 192860 + }, + { + "epoch": 14.945949087527607, + "grad_norm": 1.5111494546715165, + "learning_rate": 7.473264104153752e-07, + "loss": 0.8711, + "step": 192870 + }, + { + "epoch": 14.946724011003914, + "grad_norm": 1.435794362962837, + "learning_rate": 7.473651580905146e-07, + "loss": 0.8671, + "step": 192880 + }, + { + "epoch": 14.94749893448022, + "grad_norm": 1.7117654701605833, + "learning_rate": 7.474039057656542e-07, + "loss": 0.8657, + "step": 192890 + }, + { + "epoch": 14.948273857956528, + "grad_norm": 1.521413762791206, + "learning_rate": 7.474426534407936e-07, + "loss": 0.8666, + "step": 192900 + }, + { + "epoch": 14.949048781432834, + "grad_norm": 1.489221671754202, + "learning_rate": 7.474814011159331e-07, + "loss": 0.8707, + "step": 192910 + }, + { + "epoch": 14.94982370490914, + "grad_norm": 1.5248488659188957, + "learning_rate": 7.475201487910725e-07, + "loss": 0.8728, + "step": 192920 + }, + { + "epoch": 14.950598628385446, + "grad_norm": 1.4864986908334146, + "learning_rate": 7.475588964662121e-07, + "loss": 0.8605, + "step": 192930 + }, + { + "epoch": 14.951373551861753, + "grad_norm": 1.4092802819672434, + "learning_rate": 7.475976441413516e-07, + "loss": 0.8696, + "step": 192940 + }, + { + "epoch": 14.95214847533806, + "grad_norm": 1.5758791197031896, + "learning_rate": 7.476363918164911e-07, + "loss": 0.8666, + "step": 192950 + }, + { + "epoch": 14.952923398814367, + "grad_norm": 1.464262143868181, + "learning_rate": 7.476751394916305e-07, + "loss": 0.8772, + "step": 192960 + }, + { + "epoch": 14.953698322290673, + "grad_norm": 1.4683442026101772, + "learning_rate": 7.477138871667701e-07, + "loss": 0.8545, + "step": 192970 + }, + { + "epoch": 14.95447324576698, + "grad_norm": 1.5569745343414265, + "learning_rate": 7.477526348419095e-07, + "loss": 0.8819, + "step": 192980 + }, + { + "epoch": 14.955248169243287, + "grad_norm": 1.5063523364132014, + "learning_rate": 7.477913825170491e-07, + "loss": 0.8627, + "step": 192990 + }, + { + "epoch": 14.956023092719594, + "grad_norm": 1.5855586028151634, + "learning_rate": 7.478301301921885e-07, + "loss": 0.856, + "step": 193000 + }, + { + "epoch": 14.956023092719594, + "eval_loss": 0.8907816410064697, + "eval_runtime": 329.2753, + "eval_samples_per_second": 34.837, + "eval_steps_per_second": 8.71, + "step": 193000 + }, + { + "epoch": 14.9567980161959, + "grad_norm": 1.6589147913608082, + "learning_rate": 7.478688778673281e-07, + "loss": 0.8588, + "step": 193010 + }, + { + "epoch": 14.957572939672207, + "grad_norm": 1.4966744804735872, + "learning_rate": 7.479076255424675e-07, + "loss": 0.863, + "step": 193020 + }, + { + "epoch": 14.958347863148514, + "grad_norm": 1.5112644805272297, + "learning_rate": 7.47946373217607e-07, + "loss": 0.8851, + "step": 193030 + }, + { + "epoch": 14.959122786624821, + "grad_norm": 1.5659761636127536, + "learning_rate": 7.479851208927465e-07, + "loss": 0.892, + "step": 193040 + }, + { + "epoch": 14.959897710101128, + "grad_norm": 1.499214860795502, + "learning_rate": 7.48023868567886e-07, + "loss": 0.8682, + "step": 193050 + }, + { + "epoch": 14.960672633577435, + "grad_norm": 1.6122232209164595, + "learning_rate": 7.480626162430254e-07, + "loss": 0.8663, + "step": 193060 + }, + { + "epoch": 14.961447557053742, + "grad_norm": 1.5669962989183142, + "learning_rate": 7.48101363918165e-07, + "loss": 0.8654, + "step": 193070 + }, + { + "epoch": 14.962222480530048, + "grad_norm": 1.4762472499146368, + "learning_rate": 7.481401115933044e-07, + "loss": 0.8664, + "step": 193080 + }, + { + "epoch": 14.962997404006355, + "grad_norm": 1.5819497627375096, + "learning_rate": 7.48178859268444e-07, + "loss": 0.8809, + "step": 193090 + }, + { + "epoch": 14.963772327482662, + "grad_norm": 1.5778414006293717, + "learning_rate": 7.482176069435834e-07, + "loss": 0.8738, + "step": 193100 + }, + { + "epoch": 14.964547250958967, + "grad_norm": 1.4914489015366403, + "learning_rate": 7.48256354618723e-07, + "loss": 0.8729, + "step": 193110 + }, + { + "epoch": 14.965322174435274, + "grad_norm": 1.4600510109181537, + "learning_rate": 7.482951022938624e-07, + "loss": 0.8547, + "step": 193120 + }, + { + "epoch": 14.96609709791158, + "grad_norm": 1.6215458850639548, + "learning_rate": 7.48333849969002e-07, + "loss": 0.8537, + "step": 193130 + }, + { + "epoch": 14.966872021387887, + "grad_norm": 1.4739136591277433, + "learning_rate": 7.483725976441414e-07, + "loss": 0.8693, + "step": 193140 + }, + { + "epoch": 14.967646944864194, + "grad_norm": 1.503761956574187, + "learning_rate": 7.48411345319281e-07, + "loss": 0.8535, + "step": 193150 + }, + { + "epoch": 14.968421868340501, + "grad_norm": 1.559935291731795, + "learning_rate": 7.484500929944204e-07, + "loss": 0.851, + "step": 193160 + }, + { + "epoch": 14.969196791816808, + "grad_norm": 1.5612969566279749, + "learning_rate": 7.484888406695599e-07, + "loss": 0.8996, + "step": 193170 + }, + { + "epoch": 14.969971715293115, + "grad_norm": 1.564397214386145, + "learning_rate": 7.485275883446993e-07, + "loss": 0.8686, + "step": 193180 + }, + { + "epoch": 14.970746638769421, + "grad_norm": 1.583468638356918, + "learning_rate": 7.485663360198389e-07, + "loss": 0.8712, + "step": 193190 + }, + { + "epoch": 14.971521562245728, + "grad_norm": 1.5636815612218016, + "learning_rate": 7.486050836949783e-07, + "loss": 0.8517, + "step": 193200 + }, + { + "epoch": 14.972296485722035, + "grad_norm": 1.5233609765264886, + "learning_rate": 7.486438313701179e-07, + "loss": 0.87, + "step": 193210 + }, + { + "epoch": 14.973071409198342, + "grad_norm": 1.5303336949350277, + "learning_rate": 7.486825790452573e-07, + "loss": 0.8627, + "step": 193220 + }, + { + "epoch": 14.973846332674649, + "grad_norm": 1.596871873721459, + "learning_rate": 7.487213267203969e-07, + "loss": 0.8672, + "step": 193230 + }, + { + "epoch": 14.974621256150956, + "grad_norm": 1.5477001227971687, + "learning_rate": 7.487600743955363e-07, + "loss": 0.8724, + "step": 193240 + }, + { + "epoch": 14.975396179627262, + "grad_norm": 1.474232107240251, + "learning_rate": 7.487988220706759e-07, + "loss": 0.8692, + "step": 193250 + }, + { + "epoch": 14.97617110310357, + "grad_norm": 1.621136084007643, + "learning_rate": 7.488375697458153e-07, + "loss": 0.8521, + "step": 193260 + }, + { + "epoch": 14.976946026579876, + "grad_norm": 1.41449166569332, + "learning_rate": 7.488763174209548e-07, + "loss": 0.8738, + "step": 193270 + }, + { + "epoch": 14.977720950056183, + "grad_norm": 1.4591501210181608, + "learning_rate": 7.489150650960943e-07, + "loss": 0.8694, + "step": 193280 + }, + { + "epoch": 14.978495873532488, + "grad_norm": 1.5223378303062771, + "learning_rate": 7.489538127712339e-07, + "loss": 0.869, + "step": 193290 + }, + { + "epoch": 14.979270797008795, + "grad_norm": 1.5237704932893144, + "learning_rate": 7.489925604463733e-07, + "loss": 0.8776, + "step": 193300 + }, + { + "epoch": 14.980045720485101, + "grad_norm": 1.531322605634487, + "learning_rate": 7.490313081215128e-07, + "loss": 0.8529, + "step": 193310 + }, + { + "epoch": 14.980820643961408, + "grad_norm": 1.3657301562675912, + "learning_rate": 7.490700557966522e-07, + "loss": 0.8606, + "step": 193320 + }, + { + "epoch": 14.981595567437715, + "grad_norm": 1.5161839506131556, + "learning_rate": 7.491088034717918e-07, + "loss": 0.8919, + "step": 193330 + }, + { + "epoch": 14.982370490914022, + "grad_norm": 1.6871609270042782, + "learning_rate": 7.491475511469312e-07, + "loss": 0.8659, + "step": 193340 + }, + { + "epoch": 14.983145414390329, + "grad_norm": 1.5240345173858685, + "learning_rate": 7.491862988220708e-07, + "loss": 0.8747, + "step": 193350 + }, + { + "epoch": 14.983920337866635, + "grad_norm": 1.5576337508936136, + "learning_rate": 7.492250464972102e-07, + "loss": 0.8553, + "step": 193360 + }, + { + "epoch": 14.984695261342942, + "grad_norm": 1.5878434565674664, + "learning_rate": 7.492637941723497e-07, + "loss": 0.8668, + "step": 193370 + }, + { + "epoch": 14.985470184819249, + "grad_norm": 1.4954679498641457, + "learning_rate": 7.493025418474892e-07, + "loss": 0.8683, + "step": 193380 + }, + { + "epoch": 14.986245108295556, + "grad_norm": 1.5667753506374678, + "learning_rate": 7.493412895226288e-07, + "loss": 0.873, + "step": 193390 + }, + { + "epoch": 14.987020031771863, + "grad_norm": 1.5417908122782784, + "learning_rate": 7.493800371977682e-07, + "loss": 0.865, + "step": 193400 + }, + { + "epoch": 14.98779495524817, + "grad_norm": 1.5246771092352418, + "learning_rate": 7.494187848729077e-07, + "loss": 0.8888, + "step": 193410 + }, + { + "epoch": 14.988569878724476, + "grad_norm": 1.4800242724532362, + "learning_rate": 7.494575325480471e-07, + "loss": 0.8566, + "step": 193420 + }, + { + "epoch": 14.989344802200783, + "grad_norm": 1.4913228657845408, + "learning_rate": 7.494962802231868e-07, + "loss": 0.8609, + "step": 193430 + }, + { + "epoch": 14.99011972567709, + "grad_norm": 1.489658918656494, + "learning_rate": 7.495350278983262e-07, + "loss": 0.8678, + "step": 193440 + }, + { + "epoch": 14.990894649153397, + "grad_norm": 1.4683648278070573, + "learning_rate": 7.495737755734657e-07, + "loss": 0.8596, + "step": 193450 + }, + { + "epoch": 14.991669572629704, + "grad_norm": 1.5184088075593236, + "learning_rate": 7.496125232486051e-07, + "loss": 0.8592, + "step": 193460 + }, + { + "epoch": 14.99244449610601, + "grad_norm": 1.4845436145656237, + "learning_rate": 7.496512709237446e-07, + "loss": 0.8598, + "step": 193470 + }, + { + "epoch": 14.993219419582317, + "grad_norm": 1.6701114975397515, + "learning_rate": 7.496900185988841e-07, + "loss": 0.8618, + "step": 193480 + }, + { + "epoch": 14.993994343058622, + "grad_norm": 1.5971922518415504, + "learning_rate": 7.497287662740237e-07, + "loss": 0.8696, + "step": 193490 + }, + { + "epoch": 14.994769266534929, + "grad_norm": 1.5692228517683382, + "learning_rate": 7.497675139491631e-07, + "loss": 0.8726, + "step": 193500 + }, + { + "epoch": 14.994769266534929, + "eval_loss": 0.8907031416893005, + "eval_runtime": 328.6608, + "eval_samples_per_second": 34.902, + "eval_steps_per_second": 8.726, + "step": 193500 + }, + { + "epoch": 14.995544190011236, + "grad_norm": 1.5183475197472043, + "learning_rate": 7.498062616243026e-07, + "loss": 0.8601, + "step": 193510 + }, + { + "epoch": 14.996319113487543, + "grad_norm": 1.5632060019906489, + "learning_rate": 7.49845009299442e-07, + "loss": 0.8568, + "step": 193520 + }, + { + "epoch": 14.99709403696385, + "grad_norm": 1.4663098205472536, + "learning_rate": 7.498837569745817e-07, + "loss": 0.8816, + "step": 193530 + }, + { + "epoch": 14.997868960440156, + "grad_norm": 1.6166281301914383, + "learning_rate": 7.499225046497211e-07, + "loss": 0.8694, + "step": 193540 + }, + { + "epoch": 14.998643883916463, + "grad_norm": 1.6552411623453473, + "learning_rate": 7.499612523248606e-07, + "loss": 0.8548, + "step": 193550 + }, + { + "epoch": 14.99941880739277, + "grad_norm": 1.4840153242994487, + "learning_rate": 7.5e-07, + "loss": 0.8498, + "step": 193560 + }, + { + "epoch": 15.000193730869077, + "grad_norm": 1.5585302406622392, + "learning_rate": 7.500387476751396e-07, + "loss": 0.8612, + "step": 193570 + }, + { + "epoch": 15.000968654345384, + "grad_norm": 1.5588876009891914, + "learning_rate": 7.500774953502791e-07, + "loss": 0.8769, + "step": 193580 + }, + { + "epoch": 15.00174357782169, + "grad_norm": 1.4851981507858933, + "learning_rate": 7.501162430254186e-07, + "loss": 0.8798, + "step": 193590 + }, + { + "epoch": 15.002518501297997, + "grad_norm": 1.5495236577640894, + "learning_rate": 7.50154990700558e-07, + "loss": 0.8558, + "step": 193600 + }, + { + "epoch": 15.003293424774304, + "grad_norm": 1.4831788149400182, + "learning_rate": 7.501937383756975e-07, + "loss": 0.8469, + "step": 193610 + }, + { + "epoch": 15.00406834825061, + "grad_norm": 1.6728036680066432, + "learning_rate": 7.502324860508369e-07, + "loss": 0.8798, + "step": 193620 + }, + { + "epoch": 15.004843271726918, + "grad_norm": 1.5051779716074907, + "learning_rate": 7.502712337259766e-07, + "loss": 0.8441, + "step": 193630 + }, + { + "epoch": 15.005618195203224, + "grad_norm": 1.547717146453008, + "learning_rate": 7.50309981401116e-07, + "loss": 0.8541, + "step": 193640 + }, + { + "epoch": 15.006393118679531, + "grad_norm": 1.5483436907208128, + "learning_rate": 7.503487290762555e-07, + "loss": 0.8537, + "step": 193650 + }, + { + "epoch": 15.007168042155838, + "grad_norm": 1.5291862930791118, + "learning_rate": 7.503874767513949e-07, + "loss": 0.8587, + "step": 193660 + }, + { + "epoch": 15.007942965632143, + "grad_norm": 1.4539912692685362, + "learning_rate": 7.504262244265345e-07, + "loss": 0.8638, + "step": 193670 + }, + { + "epoch": 15.00871788910845, + "grad_norm": 1.5347348404740166, + "learning_rate": 7.50464972101674e-07, + "loss": 0.8597, + "step": 193680 + }, + { + "epoch": 15.009492812584757, + "grad_norm": 1.5999894245186919, + "learning_rate": 7.505037197768135e-07, + "loss": 0.8636, + "step": 193690 + }, + { + "epoch": 15.010267736061063, + "grad_norm": 1.5215596381583758, + "learning_rate": 7.505424674519529e-07, + "loss": 0.847, + "step": 193700 + }, + { + "epoch": 15.01104265953737, + "grad_norm": 1.6238647588598751, + "learning_rate": 7.505812151270924e-07, + "loss": 0.8491, + "step": 193710 + }, + { + "epoch": 15.011817583013677, + "grad_norm": 1.682202777002283, + "learning_rate": 7.50619962802232e-07, + "loss": 0.8595, + "step": 193720 + }, + { + "epoch": 15.012592506489984, + "grad_norm": 1.6462507576051313, + "learning_rate": 7.506587104773715e-07, + "loss": 0.8631, + "step": 193730 + }, + { + "epoch": 15.01336742996629, + "grad_norm": 1.5230580389721775, + "learning_rate": 7.506974581525109e-07, + "loss": 0.8548, + "step": 193740 + }, + { + "epoch": 15.014142353442598, + "grad_norm": 1.53054821522533, + "learning_rate": 7.507362058276504e-07, + "loss": 0.8612, + "step": 193750 + }, + { + "epoch": 15.014917276918904, + "grad_norm": 1.5810172998078686, + "learning_rate": 7.507749535027898e-07, + "loss": 0.8634, + "step": 193760 + }, + { + "epoch": 15.015692200395211, + "grad_norm": 1.4523207062256906, + "learning_rate": 7.508137011779295e-07, + "loss": 0.8609, + "step": 193770 + }, + { + "epoch": 15.016467123871518, + "grad_norm": 1.602120090797386, + "learning_rate": 7.508524488530689e-07, + "loss": 0.87, + "step": 193780 + }, + { + "epoch": 15.017242047347825, + "grad_norm": 1.4809404960635957, + "learning_rate": 7.508911965282084e-07, + "loss": 0.8596, + "step": 193790 + }, + { + "epoch": 15.018016970824132, + "grad_norm": 1.5845791428263376, + "learning_rate": 7.509299442033478e-07, + "loss": 0.8551, + "step": 193800 + }, + { + "epoch": 15.018791894300438, + "grad_norm": 1.525779076993296, + "learning_rate": 7.509686918784874e-07, + "loss": 0.842, + "step": 193810 + }, + { + "epoch": 15.019566817776745, + "grad_norm": 1.5657121784476897, + "learning_rate": 7.510074395536268e-07, + "loss": 0.8592, + "step": 193820 + }, + { + "epoch": 15.020341741253052, + "grad_norm": 1.5288270827569799, + "learning_rate": 7.510461872287664e-07, + "loss": 0.8624, + "step": 193830 + }, + { + "epoch": 15.021116664729359, + "grad_norm": 1.5052299681413224, + "learning_rate": 7.510849349039058e-07, + "loss": 0.8601, + "step": 193840 + }, + { + "epoch": 15.021891588205664, + "grad_norm": 1.550134345675001, + "learning_rate": 7.511236825790453e-07, + "loss": 0.8755, + "step": 193850 + }, + { + "epoch": 15.02266651168197, + "grad_norm": 1.577420721222716, + "learning_rate": 7.511624302541848e-07, + "loss": 0.8479, + "step": 193860 + }, + { + "epoch": 15.023441435158277, + "grad_norm": 1.5234887588404713, + "learning_rate": 7.512011779293244e-07, + "loss": 0.8658, + "step": 193870 + }, + { + "epoch": 15.024216358634584, + "grad_norm": 1.627097688866278, + "learning_rate": 7.512399256044638e-07, + "loss": 0.8542, + "step": 193880 + }, + { + "epoch": 15.024991282110891, + "grad_norm": 1.4779046537683567, + "learning_rate": 7.512786732796033e-07, + "loss": 0.8714, + "step": 193890 + }, + { + "epoch": 15.025766205587198, + "grad_norm": 1.4967916807056036, + "learning_rate": 7.513174209547427e-07, + "loss": 0.8686, + "step": 193900 + }, + { + "epoch": 15.026541129063505, + "grad_norm": 1.505640503624235, + "learning_rate": 7.513561686298823e-07, + "loss": 0.8528, + "step": 193910 + }, + { + "epoch": 15.027316052539812, + "grad_norm": 1.533670300972005, + "learning_rate": 7.513949163050218e-07, + "loss": 0.8562, + "step": 193920 + }, + { + "epoch": 15.028090976016118, + "grad_norm": 1.6390671622583806, + "learning_rate": 7.514336639801613e-07, + "loss": 0.8719, + "step": 193930 + }, + { + "epoch": 15.028865899492425, + "grad_norm": 1.536320565836978, + "learning_rate": 7.514724116553007e-07, + "loss": 0.8541, + "step": 193940 + }, + { + "epoch": 15.029640822968732, + "grad_norm": 1.6112484495206554, + "learning_rate": 7.515111593304403e-07, + "loss": 0.8512, + "step": 193950 + }, + { + "epoch": 15.030415746445039, + "grad_norm": 1.6349828424783328, + "learning_rate": 7.515499070055797e-07, + "loss": 0.8638, + "step": 193960 + }, + { + "epoch": 15.031190669921346, + "grad_norm": 1.5552396774165762, + "learning_rate": 7.515886546807193e-07, + "loss": 0.8399, + "step": 193970 + }, + { + "epoch": 15.031965593397652, + "grad_norm": 1.6756567645358695, + "learning_rate": 7.516274023558587e-07, + "loss": 0.8723, + "step": 193980 + }, + { + "epoch": 15.03274051687396, + "grad_norm": 1.5763327023969025, + "learning_rate": 7.516661500309982e-07, + "loss": 0.8668, + "step": 193990 + }, + { + "epoch": 15.033515440350266, + "grad_norm": 1.5346205160473216, + "learning_rate": 7.517048977061377e-07, + "loss": 0.8659, + "step": 194000 + }, + { + "epoch": 15.033515440350266, + "eval_loss": 0.8917452692985535, + "eval_runtime": 331.0486, + "eval_samples_per_second": 34.651, + "eval_steps_per_second": 8.663, + "step": 194000 + }, + { + "epoch": 15.034290363826573, + "grad_norm": 1.6474439247525714, + "learning_rate": 7.517436453812772e-07, + "loss": 0.8724, + "step": 194010 + }, + { + "epoch": 15.03506528730288, + "grad_norm": 1.552355007354451, + "learning_rate": 7.517823930564167e-07, + "loss": 0.8471, + "step": 194020 + }, + { + "epoch": 15.035840210779186, + "grad_norm": 1.5429176110823395, + "learning_rate": 7.518211407315562e-07, + "loss": 0.857, + "step": 194030 + }, + { + "epoch": 15.036615134255491, + "grad_norm": 1.5405714367275436, + "learning_rate": 7.518598884066956e-07, + "loss": 0.8734, + "step": 194040 + }, + { + "epoch": 15.037390057731798, + "grad_norm": 1.5571112064066321, + "learning_rate": 7.518986360818352e-07, + "loss": 0.8613, + "step": 194050 + }, + { + "epoch": 15.038164981208105, + "grad_norm": 1.7469602924262724, + "learning_rate": 7.519373837569746e-07, + "loss": 0.8554, + "step": 194060 + }, + { + "epoch": 15.038939904684412, + "grad_norm": 1.5578953383604786, + "learning_rate": 7.519761314321142e-07, + "loss": 0.8606, + "step": 194070 + }, + { + "epoch": 15.039714828160719, + "grad_norm": 1.5885336619553556, + "learning_rate": 7.520148791072536e-07, + "loss": 0.8831, + "step": 194080 + }, + { + "epoch": 15.040489751637026, + "grad_norm": 1.5324657017865846, + "learning_rate": 7.520536267823932e-07, + "loss": 0.8759, + "step": 194090 + }, + { + "epoch": 15.041264675113332, + "grad_norm": 1.5764887195374944, + "learning_rate": 7.520923744575326e-07, + "loss": 0.8702, + "step": 194100 + }, + { + "epoch": 15.04203959858964, + "grad_norm": 1.5412187554942034, + "learning_rate": 7.521311221326721e-07, + "loss": 0.8679, + "step": 194110 + }, + { + "epoch": 15.042814522065946, + "grad_norm": 1.5707549488551056, + "learning_rate": 7.521698698078116e-07, + "loss": 0.8726, + "step": 194120 + }, + { + "epoch": 15.043589445542253, + "grad_norm": 1.5654317754660432, + "learning_rate": 7.522086174829511e-07, + "loss": 0.8663, + "step": 194130 + }, + { + "epoch": 15.04436436901856, + "grad_norm": 1.5664684387759125, + "learning_rate": 7.522473651580906e-07, + "loss": 0.8542, + "step": 194140 + }, + { + "epoch": 15.045139292494866, + "grad_norm": 1.5692011200406057, + "learning_rate": 7.522861128332301e-07, + "loss": 0.8698, + "step": 194150 + }, + { + "epoch": 15.045914215971173, + "grad_norm": 1.5531067933519254, + "learning_rate": 7.523248605083695e-07, + "loss": 0.8737, + "step": 194160 + }, + { + "epoch": 15.04668913944748, + "grad_norm": 1.6157392288769152, + "learning_rate": 7.523636081835091e-07, + "loss": 0.8586, + "step": 194170 + }, + { + "epoch": 15.047464062923787, + "grad_norm": 1.5227530009203063, + "learning_rate": 7.524023558586485e-07, + "loss": 0.8411, + "step": 194180 + }, + { + "epoch": 15.048238986400094, + "grad_norm": 1.5159054847542648, + "learning_rate": 7.524411035337881e-07, + "loss": 0.8477, + "step": 194190 + }, + { + "epoch": 15.0490139098764, + "grad_norm": 1.5843900102717454, + "learning_rate": 7.524798512089275e-07, + "loss": 0.8699, + "step": 194200 + }, + { + "epoch": 15.049788833352707, + "grad_norm": 1.5404203335696176, + "learning_rate": 7.52518598884067e-07, + "loss": 0.8627, + "step": 194210 + }, + { + "epoch": 15.050563756829012, + "grad_norm": 1.6034848800769115, + "learning_rate": 7.525573465592065e-07, + "loss": 0.8674, + "step": 194220 + }, + { + "epoch": 15.051338680305319, + "grad_norm": 1.5972065045902304, + "learning_rate": 7.525960942343461e-07, + "loss": 0.8547, + "step": 194230 + }, + { + "epoch": 15.052113603781626, + "grad_norm": 1.6223927827374722, + "learning_rate": 7.526348419094855e-07, + "loss": 0.8682, + "step": 194240 + }, + { + "epoch": 15.052888527257933, + "grad_norm": 1.5233064277389372, + "learning_rate": 7.52673589584625e-07, + "loss": 0.8637, + "step": 194250 + }, + { + "epoch": 15.05366345073424, + "grad_norm": 1.5521039732166848, + "learning_rate": 7.527123372597644e-07, + "loss": 0.8633, + "step": 194260 + }, + { + "epoch": 15.054438374210546, + "grad_norm": 1.5960040561176985, + "learning_rate": 7.52751084934904e-07, + "loss": 0.8746, + "step": 194270 + }, + { + "epoch": 15.055213297686853, + "grad_norm": 1.5174337907137223, + "learning_rate": 7.527898326100434e-07, + "loss": 0.8438, + "step": 194280 + }, + { + "epoch": 15.05598822116316, + "grad_norm": 1.4364074958874888, + "learning_rate": 7.52828580285183e-07, + "loss": 0.8522, + "step": 194290 + }, + { + "epoch": 15.056763144639467, + "grad_norm": 1.5716087374138272, + "learning_rate": 7.528673279603224e-07, + "loss": 0.8606, + "step": 194300 + }, + { + "epoch": 15.057538068115774, + "grad_norm": 1.4570000044838785, + "learning_rate": 7.529060756354619e-07, + "loss": 0.8555, + "step": 194310 + }, + { + "epoch": 15.05831299159208, + "grad_norm": 1.5218923780112115, + "learning_rate": 7.529448233106014e-07, + "loss": 0.8583, + "step": 194320 + }, + { + "epoch": 15.059087915068387, + "grad_norm": 1.5104301359320118, + "learning_rate": 7.52983570985741e-07, + "loss": 0.8548, + "step": 194330 + }, + { + "epoch": 15.059862838544694, + "grad_norm": 1.5983207735151754, + "learning_rate": 7.530223186608804e-07, + "loss": 0.8585, + "step": 194340 + }, + { + "epoch": 15.060637762021, + "grad_norm": 1.532587438588934, + "learning_rate": 7.530610663360199e-07, + "loss": 0.8579, + "step": 194350 + }, + { + "epoch": 15.061412685497308, + "grad_norm": 1.6378604231616958, + "learning_rate": 7.530998140111593e-07, + "loss": 0.8775, + "step": 194360 + }, + { + "epoch": 15.062187608973614, + "grad_norm": 1.5401689878071523, + "learning_rate": 7.53138561686299e-07, + "loss": 0.8604, + "step": 194370 + }, + { + "epoch": 15.062962532449921, + "grad_norm": 1.5655955661032885, + "learning_rate": 7.531773093614384e-07, + "loss": 0.8602, + "step": 194380 + }, + { + "epoch": 15.063737455926228, + "grad_norm": 1.5771487996782285, + "learning_rate": 7.532160570365779e-07, + "loss": 0.8581, + "step": 194390 + }, + { + "epoch": 15.064512379402535, + "grad_norm": 1.5208226852296434, + "learning_rate": 7.532548047117173e-07, + "loss": 0.8712, + "step": 194400 + }, + { + "epoch": 15.06528730287884, + "grad_norm": 1.5526596571960052, + "learning_rate": 7.532935523868568e-07, + "loss": 0.861, + "step": 194410 + }, + { + "epoch": 15.066062226355147, + "grad_norm": 1.482079203984422, + "learning_rate": 7.533323000619963e-07, + "loss": 0.8601, + "step": 194420 + }, + { + "epoch": 15.066837149831454, + "grad_norm": 1.5423879561912428, + "learning_rate": 7.533710477371359e-07, + "loss": 0.8479, + "step": 194430 + }, + { + "epoch": 15.06761207330776, + "grad_norm": 1.5704034310357717, + "learning_rate": 7.534097954122753e-07, + "loss": 0.8546, + "step": 194440 + }, + { + "epoch": 15.068386996784067, + "grad_norm": 1.5710577615048373, + "learning_rate": 7.534485430874148e-07, + "loss": 0.8786, + "step": 194450 + }, + { + "epoch": 15.069161920260374, + "grad_norm": 1.5606848719323372, + "learning_rate": 7.534872907625542e-07, + "loss": 0.8408, + "step": 194460 + }, + { + "epoch": 15.06993684373668, + "grad_norm": 1.4788941455693339, + "learning_rate": 7.535260384376939e-07, + "loss": 0.8314, + "step": 194470 + }, + { + "epoch": 15.070711767212988, + "grad_norm": 1.5522595813405669, + "learning_rate": 7.535647861128333e-07, + "loss": 0.87, + "step": 194480 + }, + { + "epoch": 15.071486690689294, + "grad_norm": 1.5025588860709165, + "learning_rate": 7.536035337879728e-07, + "loss": 0.8656, + "step": 194490 + }, + { + "epoch": 15.072261614165601, + "grad_norm": 1.5822551550375694, + "learning_rate": 7.536422814631122e-07, + "loss": 0.8586, + "step": 194500 + }, + { + "epoch": 15.072261614165601, + "eval_loss": 0.8919945359230042, + "eval_runtime": 332.8003, + "eval_samples_per_second": 34.468, + "eval_steps_per_second": 8.618, + "step": 194500 + }, + { + "epoch": 15.073036537641908, + "grad_norm": 1.6160380115319122, + "learning_rate": 7.536810291382519e-07, + "loss": 0.8576, + "step": 194510 + }, + { + "epoch": 15.073811461118215, + "grad_norm": 1.6367192972159095, + "learning_rate": 7.537197768133913e-07, + "loss": 0.8607, + "step": 194520 + }, + { + "epoch": 15.074586384594522, + "grad_norm": 1.5237151796457593, + "learning_rate": 7.537585244885308e-07, + "loss": 0.8688, + "step": 194530 + }, + { + "epoch": 15.075361308070828, + "grad_norm": 1.496944146475464, + "learning_rate": 7.537972721636702e-07, + "loss": 0.8591, + "step": 194540 + }, + { + "epoch": 15.076136231547135, + "grad_norm": 1.5685475002909044, + "learning_rate": 7.538360198388097e-07, + "loss": 0.8571, + "step": 194550 + }, + { + "epoch": 15.076911155023442, + "grad_norm": 1.6378677527802257, + "learning_rate": 7.538747675139491e-07, + "loss": 0.8578, + "step": 194560 + }, + { + "epoch": 15.077686078499749, + "grad_norm": 1.485023056937457, + "learning_rate": 7.539135151890888e-07, + "loss": 0.8676, + "step": 194570 + }, + { + "epoch": 15.078461001976056, + "grad_norm": 1.6187821118293013, + "learning_rate": 7.539522628642282e-07, + "loss": 0.8644, + "step": 194580 + }, + { + "epoch": 15.079235925452362, + "grad_norm": 1.5637044657227197, + "learning_rate": 7.539910105393677e-07, + "loss": 0.8753, + "step": 194590 + }, + { + "epoch": 15.080010848928667, + "grad_norm": 1.5420953899081276, + "learning_rate": 7.540297582145071e-07, + "loss": 0.862, + "step": 194600 + }, + { + "epoch": 15.080785772404974, + "grad_norm": 1.5953888921326072, + "learning_rate": 7.540685058896468e-07, + "loss": 0.8566, + "step": 194610 + }, + { + "epoch": 15.081560695881281, + "grad_norm": 1.5075886056043957, + "learning_rate": 7.541072535647862e-07, + "loss": 0.8474, + "step": 194620 + }, + { + "epoch": 15.082335619357588, + "grad_norm": 1.538855462084736, + "learning_rate": 7.541460012399257e-07, + "loss": 0.8776, + "step": 194630 + }, + { + "epoch": 15.083110542833895, + "grad_norm": 1.5879209317702934, + "learning_rate": 7.541847489150651e-07, + "loss": 0.8774, + "step": 194640 + }, + { + "epoch": 15.083885466310202, + "grad_norm": 1.5341527747159749, + "learning_rate": 7.542234965902047e-07, + "loss": 0.8734, + "step": 194650 + }, + { + "epoch": 15.084660389786508, + "grad_norm": 1.5086574793334326, + "learning_rate": 7.542622442653442e-07, + "loss": 0.8916, + "step": 194660 + }, + { + "epoch": 15.085435313262815, + "grad_norm": 1.467106133638799, + "learning_rate": 7.543009919404837e-07, + "loss": 0.8404, + "step": 194670 + }, + { + "epoch": 15.086210236739122, + "grad_norm": 1.6488482466590524, + "learning_rate": 7.543397396156231e-07, + "loss": 0.8466, + "step": 194680 + }, + { + "epoch": 15.086985160215429, + "grad_norm": 1.5727352146575553, + "learning_rate": 7.543784872907626e-07, + "loss": 0.8513, + "step": 194690 + }, + { + "epoch": 15.087760083691736, + "grad_norm": 1.6238570356068476, + "learning_rate": 7.54417234965902e-07, + "loss": 0.8503, + "step": 194700 + }, + { + "epoch": 15.088535007168042, + "grad_norm": 1.6296607508501257, + "learning_rate": 7.544559826410417e-07, + "loss": 0.838, + "step": 194710 + }, + { + "epoch": 15.08930993064435, + "grad_norm": 1.5887465829427891, + "learning_rate": 7.544947303161811e-07, + "loss": 0.8579, + "step": 194720 + }, + { + "epoch": 15.090084854120656, + "grad_norm": 1.5824679688407943, + "learning_rate": 7.545334779913206e-07, + "loss": 0.858, + "step": 194730 + }, + { + "epoch": 15.090859777596963, + "grad_norm": 1.556809366536295, + "learning_rate": 7.5457222566646e-07, + "loss": 0.8503, + "step": 194740 + }, + { + "epoch": 15.09163470107327, + "grad_norm": 1.5745620249689896, + "learning_rate": 7.546109733415996e-07, + "loss": 0.8503, + "step": 194750 + }, + { + "epoch": 15.092409624549576, + "grad_norm": 1.6164545283422393, + "learning_rate": 7.546497210167391e-07, + "loss": 0.883, + "step": 194760 + }, + { + "epoch": 15.093184548025883, + "grad_norm": 1.8478400971276165, + "learning_rate": 7.546884686918786e-07, + "loss": 0.8558, + "step": 194770 + }, + { + "epoch": 15.093959471502188, + "grad_norm": 1.730970810366751, + "learning_rate": 7.54727216367018e-07, + "loss": 0.8722, + "step": 194780 + }, + { + "epoch": 15.094734394978495, + "grad_norm": 1.5779946711516135, + "learning_rate": 7.547659640421576e-07, + "loss": 0.8376, + "step": 194790 + }, + { + "epoch": 15.095509318454802, + "grad_norm": 1.5939446382671554, + "learning_rate": 7.54804711717297e-07, + "loss": 0.8699, + "step": 194800 + }, + { + "epoch": 15.096284241931109, + "grad_norm": 1.4456833681202825, + "learning_rate": 7.548434593924366e-07, + "loss": 0.8639, + "step": 194810 + }, + { + "epoch": 15.097059165407416, + "grad_norm": 1.5716763508486955, + "learning_rate": 7.54882207067576e-07, + "loss": 0.8529, + "step": 194820 + }, + { + "epoch": 15.097834088883722, + "grad_norm": 1.5807234622833768, + "learning_rate": 7.549209547427155e-07, + "loss": 0.8557, + "step": 194830 + }, + { + "epoch": 15.09860901236003, + "grad_norm": 1.6216161989221307, + "learning_rate": 7.549597024178549e-07, + "loss": 0.8455, + "step": 194840 + }, + { + "epoch": 15.099383935836336, + "grad_norm": 1.572957194474429, + "learning_rate": 7.549984500929945e-07, + "loss": 0.8664, + "step": 194850 + }, + { + "epoch": 15.100158859312643, + "grad_norm": 1.494887682478409, + "learning_rate": 7.55037197768134e-07, + "loss": 0.8657, + "step": 194860 + }, + { + "epoch": 15.10093378278895, + "grad_norm": 1.5848093275701962, + "learning_rate": 7.550759454432735e-07, + "loss": 0.8513, + "step": 194870 + }, + { + "epoch": 15.101708706265256, + "grad_norm": 1.7094647673499903, + "learning_rate": 7.551146931184129e-07, + "loss": 0.8727, + "step": 194880 + }, + { + "epoch": 15.102483629741563, + "grad_norm": 1.5829401133084893, + "learning_rate": 7.551534407935525e-07, + "loss": 0.8546, + "step": 194890 + }, + { + "epoch": 15.10325855321787, + "grad_norm": 1.5995796486350011, + "learning_rate": 7.551921884686919e-07, + "loss": 0.8466, + "step": 194900 + }, + { + "epoch": 15.104033476694177, + "grad_norm": 1.5988365111870413, + "learning_rate": 7.552309361438315e-07, + "loss": 0.8526, + "step": 194910 + }, + { + "epoch": 15.104808400170484, + "grad_norm": 1.5777723176748781, + "learning_rate": 7.552696838189709e-07, + "loss": 0.8796, + "step": 194920 + }, + { + "epoch": 15.10558332364679, + "grad_norm": 1.616899366802949, + "learning_rate": 7.553084314941105e-07, + "loss": 0.8592, + "step": 194930 + }, + { + "epoch": 15.106358247123097, + "grad_norm": 1.5056300820023998, + "learning_rate": 7.553471791692499e-07, + "loss": 0.8907, + "step": 194940 + }, + { + "epoch": 15.107133170599404, + "grad_norm": 1.5848728314634075, + "learning_rate": 7.553859268443894e-07, + "loss": 0.872, + "step": 194950 + }, + { + "epoch": 15.107908094075711, + "grad_norm": 1.5719730665122418, + "learning_rate": 7.554246745195289e-07, + "loss": 0.8665, + "step": 194960 + }, + { + "epoch": 15.108683017552016, + "grad_norm": 1.5977092729030715, + "learning_rate": 7.554634221946684e-07, + "loss": 0.8626, + "step": 194970 + }, + { + "epoch": 15.109457941028323, + "grad_norm": 1.5527892979322948, + "learning_rate": 7.555021698698078e-07, + "loss": 0.8488, + "step": 194980 + }, + { + "epoch": 15.11023286450463, + "grad_norm": 1.6364204292910343, + "learning_rate": 7.555409175449474e-07, + "loss": 0.8461, + "step": 194990 + }, + { + "epoch": 15.111007787980936, + "grad_norm": 1.5837996273821418, + "learning_rate": 7.555796652200868e-07, + "loss": 0.859, + "step": 195000 + }, + { + "epoch": 15.111007787980936, + "eval_loss": 0.8919285535812378, + "eval_runtime": 333.1701, + "eval_samples_per_second": 34.43, + "eval_steps_per_second": 8.608, + "step": 195000 + }, + { + "epoch": 15.111782711457243, + "grad_norm": 1.5772999163154169, + "learning_rate": 7.556184128952264e-07, + "loss": 0.8566, + "step": 195010 + }, + { + "epoch": 15.11255763493355, + "grad_norm": 1.574654917577278, + "learning_rate": 7.556571605703658e-07, + "loss": 0.8516, + "step": 195020 + }, + { + "epoch": 15.113332558409857, + "grad_norm": 1.630895449045101, + "learning_rate": 7.556959082455054e-07, + "loss": 0.854, + "step": 195030 + }, + { + "epoch": 15.114107481886164, + "grad_norm": 1.6177931576523186, + "learning_rate": 7.557346559206448e-07, + "loss": 0.834, + "step": 195040 + }, + { + "epoch": 15.11488240536247, + "grad_norm": 1.5391784102072301, + "learning_rate": 7.557734035957843e-07, + "loss": 0.8665, + "step": 195050 + }, + { + "epoch": 15.115657328838777, + "grad_norm": 1.5893743781184615, + "learning_rate": 7.558121512709238e-07, + "loss": 0.8562, + "step": 195060 + }, + { + "epoch": 15.116432252315084, + "grad_norm": 1.539071265326434, + "learning_rate": 7.558508989460634e-07, + "loss": 0.8508, + "step": 195070 + }, + { + "epoch": 15.11720717579139, + "grad_norm": 1.4864102281008862, + "learning_rate": 7.558896466212028e-07, + "loss": 0.8647, + "step": 195080 + }, + { + "epoch": 15.117982099267698, + "grad_norm": 1.5650484330265029, + "learning_rate": 7.559283942963423e-07, + "loss": 0.8564, + "step": 195090 + }, + { + "epoch": 15.118757022744004, + "grad_norm": 1.5415529907415537, + "learning_rate": 7.559671419714817e-07, + "loss": 0.8551, + "step": 195100 + }, + { + "epoch": 15.119531946220311, + "grad_norm": 1.6208359768800567, + "learning_rate": 7.560058896466213e-07, + "loss": 0.8672, + "step": 195110 + }, + { + "epoch": 15.120306869696618, + "grad_norm": 1.6480472276935545, + "learning_rate": 7.560446373217607e-07, + "loss": 0.853, + "step": 195120 + }, + { + "epoch": 15.121081793172925, + "grad_norm": 1.5957353933341998, + "learning_rate": 7.560833849969003e-07, + "loss": 0.864, + "step": 195130 + }, + { + "epoch": 15.121856716649232, + "grad_norm": 1.6375195831678413, + "learning_rate": 7.561221326720397e-07, + "loss": 0.8781, + "step": 195140 + }, + { + "epoch": 15.122631640125537, + "grad_norm": 1.5273557505437474, + "learning_rate": 7.561608803471792e-07, + "loss": 0.8459, + "step": 195150 + }, + { + "epoch": 15.123406563601844, + "grad_norm": 1.4922778735684865, + "learning_rate": 7.561996280223187e-07, + "loss": 0.8612, + "step": 195160 + }, + { + "epoch": 15.12418148707815, + "grad_norm": 1.4888726286330547, + "learning_rate": 7.562383756974583e-07, + "loss": 0.8478, + "step": 195170 + }, + { + "epoch": 15.124956410554457, + "grad_norm": 1.6181354387073792, + "learning_rate": 7.562771233725977e-07, + "loss": 0.8519, + "step": 195180 + }, + { + "epoch": 15.125731334030764, + "grad_norm": 1.4238737039184943, + "learning_rate": 7.563158710477372e-07, + "loss": 0.8545, + "step": 195190 + }, + { + "epoch": 15.12650625750707, + "grad_norm": 1.730822737457182, + "learning_rate": 7.563546187228766e-07, + "loss": 0.8459, + "step": 195200 + }, + { + "epoch": 15.127281180983378, + "grad_norm": 1.562891289094331, + "learning_rate": 7.563933663980162e-07, + "loss": 0.8427, + "step": 195210 + }, + { + "epoch": 15.128056104459684, + "grad_norm": 1.5533954227534976, + "learning_rate": 7.564321140731557e-07, + "loss": 0.8621, + "step": 195220 + }, + { + "epoch": 15.128831027935991, + "grad_norm": 1.5818795263035919, + "learning_rate": 7.564708617482952e-07, + "loss": 0.8389, + "step": 195230 + }, + { + "epoch": 15.129605951412298, + "grad_norm": 1.5176996627489898, + "learning_rate": 7.565096094234346e-07, + "loss": 0.8707, + "step": 195240 + }, + { + "epoch": 15.130380874888605, + "grad_norm": 1.5591893161200903, + "learning_rate": 7.565483570985742e-07, + "loss": 0.8484, + "step": 195250 + }, + { + "epoch": 15.131155798364912, + "grad_norm": 1.5432520528808267, + "learning_rate": 7.565871047737136e-07, + "loss": 0.8676, + "step": 195260 + }, + { + "epoch": 15.131930721841218, + "grad_norm": 1.658197970308016, + "learning_rate": 7.566258524488532e-07, + "loss": 0.8488, + "step": 195270 + }, + { + "epoch": 15.132705645317525, + "grad_norm": 1.6510179270917436, + "learning_rate": 7.566646001239926e-07, + "loss": 0.8458, + "step": 195280 + }, + { + "epoch": 15.133480568793832, + "grad_norm": 1.5986562514267373, + "learning_rate": 7.567033477991321e-07, + "loss": 0.8625, + "step": 195290 + }, + { + "epoch": 15.134255492270139, + "grad_norm": 1.5822979872137186, + "learning_rate": 7.567420954742715e-07, + "loss": 0.8563, + "step": 195300 + }, + { + "epoch": 15.135030415746446, + "grad_norm": 1.5898524862217818, + "learning_rate": 7.567808431494112e-07, + "loss": 0.8688, + "step": 195310 + }, + { + "epoch": 15.135805339222753, + "grad_norm": 1.4537779703769356, + "learning_rate": 7.568195908245506e-07, + "loss": 0.8629, + "step": 195320 + }, + { + "epoch": 15.13658026269906, + "grad_norm": 1.6387984470308938, + "learning_rate": 7.568583384996901e-07, + "loss": 0.8756, + "step": 195330 + }, + { + "epoch": 15.137355186175364, + "grad_norm": 1.538009269683856, + "learning_rate": 7.568970861748295e-07, + "loss": 0.8653, + "step": 195340 + }, + { + "epoch": 15.138130109651671, + "grad_norm": 1.4748936945430005, + "learning_rate": 7.569358338499691e-07, + "loss": 0.8521, + "step": 195350 + }, + { + "epoch": 15.138905033127978, + "grad_norm": 1.53027669887593, + "learning_rate": 7.569745815251086e-07, + "loss": 0.8537, + "step": 195360 + }, + { + "epoch": 15.139679956604285, + "grad_norm": 1.607283755237642, + "learning_rate": 7.570133292002481e-07, + "loss": 0.8567, + "step": 195370 + }, + { + "epoch": 15.140454880080592, + "grad_norm": 1.5835713167894208, + "learning_rate": 7.570520768753875e-07, + "loss": 0.8485, + "step": 195380 + }, + { + "epoch": 15.141229803556898, + "grad_norm": 1.5152673877475147, + "learning_rate": 7.57090824550527e-07, + "loss": 0.8671, + "step": 195390 + }, + { + "epoch": 15.142004727033205, + "grad_norm": 1.541514805605987, + "learning_rate": 7.571295722256665e-07, + "loss": 0.8758, + "step": 195400 + }, + { + "epoch": 15.142779650509512, + "grad_norm": 1.4580668021358076, + "learning_rate": 7.571683199008061e-07, + "loss": 0.8567, + "step": 195410 + }, + { + "epoch": 15.143554573985819, + "grad_norm": 1.5171427194795992, + "learning_rate": 7.572070675759455e-07, + "loss": 0.8711, + "step": 195420 + }, + { + "epoch": 15.144329497462126, + "grad_norm": 1.5102156409886052, + "learning_rate": 7.57245815251085e-07, + "loss": 0.8584, + "step": 195430 + }, + { + "epoch": 15.145104420938432, + "grad_norm": 1.5588122461568286, + "learning_rate": 7.572845629262244e-07, + "loss": 0.8559, + "step": 195440 + }, + { + "epoch": 15.14587934441474, + "grad_norm": 1.490497456782019, + "learning_rate": 7.573233106013641e-07, + "loss": 0.8427, + "step": 195450 + }, + { + "epoch": 15.146654267891046, + "grad_norm": 1.4831710133136584, + "learning_rate": 7.573620582765035e-07, + "loss": 0.8542, + "step": 195460 + }, + { + "epoch": 15.147429191367353, + "grad_norm": 1.5732569569703991, + "learning_rate": 7.57400805951643e-07, + "loss": 0.8627, + "step": 195470 + }, + { + "epoch": 15.14820411484366, + "grad_norm": 1.6118675856211777, + "learning_rate": 7.574395536267824e-07, + "loss": 0.8578, + "step": 195480 + }, + { + "epoch": 15.148979038319967, + "grad_norm": 1.5416062569017805, + "learning_rate": 7.574783013019219e-07, + "loss": 0.8553, + "step": 195490 + }, + { + "epoch": 15.149753961796273, + "grad_norm": 1.443385262548556, + "learning_rate": 7.575170489770615e-07, + "loss": 0.8403, + "step": 195500 + }, + { + "epoch": 15.149753961796273, + "eval_loss": 0.89179927110672, + "eval_runtime": 330.4188, + "eval_samples_per_second": 34.717, + "eval_steps_per_second": 8.68, + "step": 195500 + }, + { + "epoch": 15.15052888527258, + "grad_norm": 1.5439962091428578, + "learning_rate": 7.57555796652201e-07, + "loss": 0.8472, + "step": 195510 + }, + { + "epoch": 15.151303808748885, + "grad_norm": 1.5611401641688463, + "learning_rate": 7.575945443273404e-07, + "loss": 0.8732, + "step": 195520 + }, + { + "epoch": 15.152078732225192, + "grad_norm": 1.5359403451978204, + "learning_rate": 7.576332920024799e-07, + "loss": 0.8522, + "step": 195530 + }, + { + "epoch": 15.152853655701499, + "grad_norm": 1.5427747653587027, + "learning_rate": 7.576720396776193e-07, + "loss": 0.8776, + "step": 195540 + }, + { + "epoch": 15.153628579177806, + "grad_norm": 1.6003493925395487, + "learning_rate": 7.57710787352759e-07, + "loss": 0.8503, + "step": 195550 + }, + { + "epoch": 15.154403502654112, + "grad_norm": 1.488068357314952, + "learning_rate": 7.577495350278984e-07, + "loss": 0.8521, + "step": 195560 + }, + { + "epoch": 15.15517842613042, + "grad_norm": 1.5791938165725448, + "learning_rate": 7.577882827030379e-07, + "loss": 0.8361, + "step": 195570 + }, + { + "epoch": 15.155953349606726, + "grad_norm": 1.5896694386297932, + "learning_rate": 7.578270303781773e-07, + "loss": 0.8458, + "step": 195580 + }, + { + "epoch": 15.156728273083033, + "grad_norm": 1.6252561450381002, + "learning_rate": 7.578657780533169e-07, + "loss": 0.8753, + "step": 195590 + }, + { + "epoch": 15.15750319655934, + "grad_norm": 1.5673212146986462, + "learning_rate": 7.579045257284564e-07, + "loss": 0.8568, + "step": 195600 + }, + { + "epoch": 15.158278120035646, + "grad_norm": 1.615979900696485, + "learning_rate": 7.579432734035959e-07, + "loss": 0.8718, + "step": 195610 + }, + { + "epoch": 15.159053043511953, + "grad_norm": 1.5398437857146214, + "learning_rate": 7.579820210787353e-07, + "loss": 0.8438, + "step": 195620 + }, + { + "epoch": 15.15982796698826, + "grad_norm": 1.5719729440680479, + "learning_rate": 7.580207687538748e-07, + "loss": 0.8672, + "step": 195630 + }, + { + "epoch": 15.160602890464567, + "grad_norm": 1.5139395886778726, + "learning_rate": 7.580595164290143e-07, + "loss": 0.8466, + "step": 195640 + }, + { + "epoch": 15.161377813940874, + "grad_norm": 1.592252043164667, + "learning_rate": 7.580982641041539e-07, + "loss": 0.8608, + "step": 195650 + }, + { + "epoch": 15.16215273741718, + "grad_norm": 1.6081925917469766, + "learning_rate": 7.581370117792933e-07, + "loss": 0.8503, + "step": 195660 + }, + { + "epoch": 15.162927660893487, + "grad_norm": 1.5525881921159355, + "learning_rate": 7.581757594544328e-07, + "loss": 0.8476, + "step": 195670 + }, + { + "epoch": 15.163702584369794, + "grad_norm": 1.5014459754134042, + "learning_rate": 7.582145071295722e-07, + "loss": 0.8585, + "step": 195680 + }, + { + "epoch": 15.164477507846101, + "grad_norm": 1.5247189361816569, + "learning_rate": 7.582532548047118e-07, + "loss": 0.8489, + "step": 195690 + }, + { + "epoch": 15.165252431322408, + "grad_norm": 1.5310340557600168, + "learning_rate": 7.582920024798513e-07, + "loss": 0.8633, + "step": 195700 + }, + { + "epoch": 15.166027354798713, + "grad_norm": 1.5525566930516157, + "learning_rate": 7.583307501549908e-07, + "loss": 0.8993, + "step": 195710 + }, + { + "epoch": 15.16680227827502, + "grad_norm": 1.6120492840352931, + "learning_rate": 7.583694978301302e-07, + "loss": 0.8814, + "step": 195720 + }, + { + "epoch": 15.167577201751326, + "grad_norm": 1.488598469682819, + "learning_rate": 7.584082455052698e-07, + "loss": 0.8568, + "step": 195730 + }, + { + "epoch": 15.168352125227633, + "grad_norm": 1.539789134371239, + "learning_rate": 7.584469931804092e-07, + "loss": 0.8527, + "step": 195740 + }, + { + "epoch": 15.16912704870394, + "grad_norm": 1.5874465453205202, + "learning_rate": 7.584857408555488e-07, + "loss": 0.8848, + "step": 195750 + }, + { + "epoch": 15.169901972180247, + "grad_norm": 1.6124943983828688, + "learning_rate": 7.585244885306882e-07, + "loss": 0.8599, + "step": 195760 + }, + { + "epoch": 15.170676895656554, + "grad_norm": 1.5515066811352012, + "learning_rate": 7.585632362058277e-07, + "loss": 0.8811, + "step": 195770 + }, + { + "epoch": 15.17145181913286, + "grad_norm": 1.5640543418270352, + "learning_rate": 7.586019838809671e-07, + "loss": 0.8483, + "step": 195780 + }, + { + "epoch": 15.172226742609167, + "grad_norm": 1.594799029173386, + "learning_rate": 7.586407315561068e-07, + "loss": 0.8627, + "step": 195790 + }, + { + "epoch": 15.173001666085474, + "grad_norm": 1.5567119374930285, + "learning_rate": 7.586794792312462e-07, + "loss": 0.8554, + "step": 195800 + }, + { + "epoch": 15.17377658956178, + "grad_norm": 1.6027522395688056, + "learning_rate": 7.587182269063857e-07, + "loss": 0.8624, + "step": 195810 + }, + { + "epoch": 15.174551513038088, + "grad_norm": 1.619077008378134, + "learning_rate": 7.587569745815251e-07, + "loss": 0.8579, + "step": 195820 + }, + { + "epoch": 15.175326436514395, + "grad_norm": 1.5606080393589798, + "learning_rate": 7.587957222566647e-07, + "loss": 0.8624, + "step": 195830 + }, + { + "epoch": 15.176101359990701, + "grad_norm": 1.4514059476182049, + "learning_rate": 7.588344699318041e-07, + "loss": 0.8421, + "step": 195840 + }, + { + "epoch": 15.176876283467008, + "grad_norm": 1.5759537759182387, + "learning_rate": 7.588732176069437e-07, + "loss": 0.8841, + "step": 195850 + }, + { + "epoch": 15.177651206943315, + "grad_norm": 1.5640706126250778, + "learning_rate": 7.589119652820831e-07, + "loss": 0.8585, + "step": 195860 + }, + { + "epoch": 15.178426130419622, + "grad_norm": 1.565800862288853, + "learning_rate": 7.589507129572227e-07, + "loss": 0.8574, + "step": 195870 + }, + { + "epoch": 15.179201053895929, + "grad_norm": 1.4749280350968326, + "learning_rate": 7.589894606323621e-07, + "loss": 0.8776, + "step": 195880 + }, + { + "epoch": 15.179975977372235, + "grad_norm": 1.556372988883418, + "learning_rate": 7.590282083075017e-07, + "loss": 0.8628, + "step": 195890 + }, + { + "epoch": 15.18075090084854, + "grad_norm": 1.424954768013705, + "learning_rate": 7.590669559826411e-07, + "loss": 0.8676, + "step": 195900 + }, + { + "epoch": 15.181525824324847, + "grad_norm": 1.6122899671091206, + "learning_rate": 7.591057036577806e-07, + "loss": 0.8576, + "step": 195910 + }, + { + "epoch": 15.182300747801154, + "grad_norm": 1.4800808930519935, + "learning_rate": 7.5914445133292e-07, + "loss": 0.8654, + "step": 195920 + }, + { + "epoch": 15.18307567127746, + "grad_norm": 1.5376528625881707, + "learning_rate": 7.591831990080596e-07, + "loss": 0.8412, + "step": 195930 + }, + { + "epoch": 15.183850594753768, + "grad_norm": 1.5856818831025952, + "learning_rate": 7.59221946683199e-07, + "loss": 0.8604, + "step": 195940 + }, + { + "epoch": 15.184625518230074, + "grad_norm": 1.6006153488419432, + "learning_rate": 7.592606943583386e-07, + "loss": 0.8588, + "step": 195950 + }, + { + "epoch": 15.185400441706381, + "grad_norm": 1.5900233958125727, + "learning_rate": 7.59299442033478e-07, + "loss": 0.8596, + "step": 195960 + }, + { + "epoch": 15.186175365182688, + "grad_norm": 1.5946022700987958, + "learning_rate": 7.593381897086176e-07, + "loss": 0.852, + "step": 195970 + }, + { + "epoch": 15.186950288658995, + "grad_norm": 1.5761972067864123, + "learning_rate": 7.59376937383757e-07, + "loss": 0.8475, + "step": 195980 + }, + { + "epoch": 15.187725212135302, + "grad_norm": 1.5566940516791796, + "learning_rate": 7.594156850588966e-07, + "loss": 0.8449, + "step": 195990 + }, + { + "epoch": 15.188500135611608, + "grad_norm": 1.5770258453586246, + "learning_rate": 7.59454432734036e-07, + "loss": 0.8506, + "step": 196000 + }, + { + "epoch": 15.188500135611608, + "eval_loss": 0.891482949256897, + "eval_runtime": 330.6777, + "eval_samples_per_second": 34.689, + "eval_steps_per_second": 8.673, + "step": 196000 + }, + { + "epoch": 15.189275059087915, + "grad_norm": 1.5794365885131942, + "learning_rate": 7.594931804091756e-07, + "loss": 0.8647, + "step": 196010 + }, + { + "epoch": 15.190049982564222, + "grad_norm": 1.622446751667484, + "learning_rate": 7.59531928084315e-07, + "loss": 0.8424, + "step": 196020 + }, + { + "epoch": 15.190824906040529, + "grad_norm": 1.5349916189773634, + "learning_rate": 7.595706757594545e-07, + "loss": 0.8524, + "step": 196030 + }, + { + "epoch": 15.191599829516836, + "grad_norm": 1.4882634361410427, + "learning_rate": 7.59609423434594e-07, + "loss": 0.8668, + "step": 196040 + }, + { + "epoch": 15.192374752993143, + "grad_norm": 1.5731975720356273, + "learning_rate": 7.596481711097335e-07, + "loss": 0.8474, + "step": 196050 + }, + { + "epoch": 15.19314967646945, + "grad_norm": 1.6108761606116766, + "learning_rate": 7.596869187848729e-07, + "loss": 0.8376, + "step": 196060 + }, + { + "epoch": 15.193924599945756, + "grad_norm": 1.7220342460242535, + "learning_rate": 7.597256664600125e-07, + "loss": 0.8637, + "step": 196070 + }, + { + "epoch": 15.194699523422061, + "grad_norm": 1.4779662419703483, + "learning_rate": 7.597644141351519e-07, + "loss": 0.857, + "step": 196080 + }, + { + "epoch": 15.195474446898368, + "grad_norm": 1.537581059362779, + "learning_rate": 7.598031618102915e-07, + "loss": 0.8782, + "step": 196090 + }, + { + "epoch": 15.196249370374675, + "grad_norm": 1.485436300034211, + "learning_rate": 7.598419094854309e-07, + "loss": 0.8507, + "step": 196100 + }, + { + "epoch": 15.197024293850982, + "grad_norm": 1.499887265552928, + "learning_rate": 7.598806571605705e-07, + "loss": 0.8552, + "step": 196110 + }, + { + "epoch": 15.197799217327288, + "grad_norm": 1.5131979743651751, + "learning_rate": 7.599194048357099e-07, + "loss": 0.862, + "step": 196120 + }, + { + "epoch": 15.198574140803595, + "grad_norm": 1.5537644104097268, + "learning_rate": 7.599581525108494e-07, + "loss": 0.8575, + "step": 196130 + }, + { + "epoch": 15.199349064279902, + "grad_norm": 1.580779122536785, + "learning_rate": 7.599969001859889e-07, + "loss": 0.8537, + "step": 196140 + }, + { + "epoch": 15.200123987756209, + "grad_norm": 1.6028085502507108, + "learning_rate": 7.600356478611285e-07, + "loss": 0.854, + "step": 196150 + }, + { + "epoch": 15.200898911232516, + "grad_norm": 1.6140640488121158, + "learning_rate": 7.600743955362679e-07, + "loss": 0.873, + "step": 196160 + }, + { + "epoch": 15.201673834708822, + "grad_norm": 1.4810836003129915, + "learning_rate": 7.601131432114074e-07, + "loss": 0.8561, + "step": 196170 + }, + { + "epoch": 15.20244875818513, + "grad_norm": 1.5015363846558214, + "learning_rate": 7.601518908865468e-07, + "loss": 0.8479, + "step": 196180 + }, + { + "epoch": 15.203223681661436, + "grad_norm": 1.5801861955079195, + "learning_rate": 7.601906385616864e-07, + "loss": 0.8535, + "step": 196190 + }, + { + "epoch": 15.203998605137743, + "grad_norm": 1.5725387192575044, + "learning_rate": 7.602293862368258e-07, + "loss": 0.847, + "step": 196200 + }, + { + "epoch": 15.20477352861405, + "grad_norm": 1.5181010842629237, + "learning_rate": 7.602681339119654e-07, + "loss": 0.8587, + "step": 196210 + }, + { + "epoch": 15.205548452090357, + "grad_norm": 1.7307215405492604, + "learning_rate": 7.603068815871048e-07, + "loss": 0.8623, + "step": 196220 + }, + { + "epoch": 15.206323375566663, + "grad_norm": 1.6136884403897063, + "learning_rate": 7.603456292622443e-07, + "loss": 0.8616, + "step": 196230 + }, + { + "epoch": 15.20709829904297, + "grad_norm": 1.5232640359747711, + "learning_rate": 7.603843769373838e-07, + "loss": 0.8677, + "step": 196240 + }, + { + "epoch": 15.207873222519277, + "grad_norm": 1.5911809431546087, + "learning_rate": 7.604231246125234e-07, + "loss": 0.8734, + "step": 196250 + }, + { + "epoch": 15.208648145995584, + "grad_norm": 1.5525943397767663, + "learning_rate": 7.604618722876628e-07, + "loss": 0.8539, + "step": 196260 + }, + { + "epoch": 15.209423069471889, + "grad_norm": 1.5588702116745927, + "learning_rate": 7.605006199628023e-07, + "loss": 0.8601, + "step": 196270 + }, + { + "epoch": 15.210197992948196, + "grad_norm": 1.5955718713259384, + "learning_rate": 7.605393676379417e-07, + "loss": 0.8671, + "step": 196280 + }, + { + "epoch": 15.210972916424502, + "grad_norm": 1.4950440917724843, + "learning_rate": 7.605781153130814e-07, + "loss": 0.8289, + "step": 196290 + }, + { + "epoch": 15.21174783990081, + "grad_norm": 1.5313293116739983, + "learning_rate": 7.606168629882208e-07, + "loss": 0.8771, + "step": 196300 + }, + { + "epoch": 15.212522763377116, + "grad_norm": 1.5291999795052906, + "learning_rate": 7.606556106633603e-07, + "loss": 0.8619, + "step": 196310 + }, + { + "epoch": 15.213297686853423, + "grad_norm": 1.559902028570645, + "learning_rate": 7.606943583384997e-07, + "loss": 0.8841, + "step": 196320 + }, + { + "epoch": 15.21407261032973, + "grad_norm": 1.6505136791708972, + "learning_rate": 7.607331060136392e-07, + "loss": 0.8667, + "step": 196330 + }, + { + "epoch": 15.214847533806036, + "grad_norm": 1.6383773041888838, + "learning_rate": 7.607718536887787e-07, + "loss": 0.8609, + "step": 196340 + }, + { + "epoch": 15.215622457282343, + "grad_norm": 1.5038022086653955, + "learning_rate": 7.608106013639183e-07, + "loss": 0.8714, + "step": 196350 + }, + { + "epoch": 15.21639738075865, + "grad_norm": 1.6370430559985574, + "learning_rate": 7.608493490390577e-07, + "loss": 0.8372, + "step": 196360 + }, + { + "epoch": 15.217172304234957, + "grad_norm": 1.521270081138983, + "learning_rate": 7.608880967141972e-07, + "loss": 0.8758, + "step": 196370 + }, + { + "epoch": 15.217947227711264, + "grad_norm": 1.547421844128992, + "learning_rate": 7.609268443893366e-07, + "loss": 0.8552, + "step": 196380 + }, + { + "epoch": 15.21872215118757, + "grad_norm": 1.4811373867580888, + "learning_rate": 7.609655920644763e-07, + "loss": 0.8671, + "step": 196390 + }, + { + "epoch": 15.219497074663877, + "grad_norm": 1.524169836394166, + "learning_rate": 7.610043397396157e-07, + "loss": 0.8677, + "step": 196400 + }, + { + "epoch": 15.220271998140184, + "grad_norm": 1.6309494844493486, + "learning_rate": 7.610430874147552e-07, + "loss": 0.8618, + "step": 196410 + }, + { + "epoch": 15.221046921616491, + "grad_norm": 1.485214177228174, + "learning_rate": 7.610818350898946e-07, + "loss": 0.8669, + "step": 196420 + }, + { + "epoch": 15.221821845092798, + "grad_norm": 1.4558830820596302, + "learning_rate": 7.611205827650343e-07, + "loss": 0.8557, + "step": 196430 + }, + { + "epoch": 15.222596768569105, + "grad_norm": 1.6117484362444896, + "learning_rate": 7.611593304401737e-07, + "loss": 0.8628, + "step": 196440 + }, + { + "epoch": 15.223371692045411, + "grad_norm": 1.7035142141710395, + "learning_rate": 7.611980781153132e-07, + "loss": 0.8559, + "step": 196450 + }, + { + "epoch": 15.224146615521716, + "grad_norm": 1.510663103340592, + "learning_rate": 7.612368257904526e-07, + "loss": 0.8598, + "step": 196460 + }, + { + "epoch": 15.224921538998023, + "grad_norm": 1.5794023309656797, + "learning_rate": 7.612755734655921e-07, + "loss": 0.8514, + "step": 196470 + }, + { + "epoch": 15.22569646247433, + "grad_norm": 1.5200102960050137, + "learning_rate": 7.613143211407315e-07, + "loss": 0.8641, + "step": 196480 + }, + { + "epoch": 15.226471385950637, + "grad_norm": 1.568203953149502, + "learning_rate": 7.613530688158712e-07, + "loss": 0.8621, + "step": 196490 + }, + { + "epoch": 15.227246309426944, + "grad_norm": 1.5507939940958102, + "learning_rate": 7.613918164910106e-07, + "loss": 0.878, + "step": 196500 + }, + { + "epoch": 15.227246309426944, + "eval_loss": 0.8916081190109253, + "eval_runtime": 329.9923, + "eval_samples_per_second": 34.761, + "eval_steps_per_second": 8.691, + "step": 196500 + }, + { + "epoch": 15.22802123290325, + "grad_norm": 1.6797979599381472, + "learning_rate": 7.614305641661501e-07, + "loss": 0.8767, + "step": 196510 + }, + { + "epoch": 15.228796156379557, + "grad_norm": 1.5481720040598832, + "learning_rate": 7.614693118412895e-07, + "loss": 0.8507, + "step": 196520 + }, + { + "epoch": 15.229571079855864, + "grad_norm": 1.4754424110562991, + "learning_rate": 7.615080595164292e-07, + "loss": 0.8459, + "step": 196530 + }, + { + "epoch": 15.230346003332171, + "grad_norm": 1.5731071517986615, + "learning_rate": 7.615468071915686e-07, + "loss": 0.8586, + "step": 196540 + }, + { + "epoch": 15.231120926808478, + "grad_norm": 1.5877295341254234, + "learning_rate": 7.615855548667081e-07, + "loss": 0.8439, + "step": 196550 + }, + { + "epoch": 15.231895850284785, + "grad_norm": 1.5590525946894114, + "learning_rate": 7.616243025418475e-07, + "loss": 0.8483, + "step": 196560 + }, + { + "epoch": 15.232670773761091, + "grad_norm": 1.5474881606397317, + "learning_rate": 7.61663050216987e-07, + "loss": 0.8763, + "step": 196570 + }, + { + "epoch": 15.233445697237398, + "grad_norm": 1.6078357126752871, + "learning_rate": 7.617017978921266e-07, + "loss": 0.8802, + "step": 196580 + }, + { + "epoch": 15.234220620713705, + "grad_norm": 1.6377329844765818, + "learning_rate": 7.617405455672661e-07, + "loss": 0.8743, + "step": 196590 + }, + { + "epoch": 15.234995544190012, + "grad_norm": 1.5681964886026285, + "learning_rate": 7.617792932424055e-07, + "loss": 0.8626, + "step": 196600 + }, + { + "epoch": 15.235770467666319, + "grad_norm": 1.6511486092047412, + "learning_rate": 7.61818040917545e-07, + "loss": 0.8609, + "step": 196610 + }, + { + "epoch": 15.236545391142625, + "grad_norm": 1.56352870198453, + "learning_rate": 7.618567885926844e-07, + "loss": 0.8513, + "step": 196620 + }, + { + "epoch": 15.237320314618932, + "grad_norm": 1.6447872819748102, + "learning_rate": 7.618955362678241e-07, + "loss": 0.8698, + "step": 196630 + }, + { + "epoch": 15.238095238095237, + "grad_norm": 1.5116872626828513, + "learning_rate": 7.619342839429635e-07, + "loss": 0.8553, + "step": 196640 + }, + { + "epoch": 15.238870161571544, + "grad_norm": 1.6410788388061004, + "learning_rate": 7.61973031618103e-07, + "loss": 0.8724, + "step": 196650 + }, + { + "epoch": 15.23964508504785, + "grad_norm": 1.6108113255041612, + "learning_rate": 7.620117792932424e-07, + "loss": 0.8595, + "step": 196660 + }, + { + "epoch": 15.240420008524158, + "grad_norm": 1.5323268018158307, + "learning_rate": 7.62050526968382e-07, + "loss": 0.8621, + "step": 196670 + }, + { + "epoch": 15.241194932000464, + "grad_norm": 1.584803761675128, + "learning_rate": 7.620892746435215e-07, + "loss": 0.8525, + "step": 196680 + }, + { + "epoch": 15.241969855476771, + "grad_norm": 1.5638579498598097, + "learning_rate": 7.62128022318661e-07, + "loss": 0.8503, + "step": 196690 + }, + { + "epoch": 15.242744778953078, + "grad_norm": 1.5480579254524642, + "learning_rate": 7.621667699938004e-07, + "loss": 0.8542, + "step": 196700 + }, + { + "epoch": 15.243519702429385, + "grad_norm": 1.5239499851600316, + "learning_rate": 7.622055176689399e-07, + "loss": 0.8477, + "step": 196710 + }, + { + "epoch": 15.244294625905692, + "grad_norm": 1.561382057830554, + "learning_rate": 7.622442653440794e-07, + "loss": 0.8565, + "step": 196720 + }, + { + "epoch": 15.245069549381999, + "grad_norm": 1.5776202575248266, + "learning_rate": 7.62283013019219e-07, + "loss": 0.8529, + "step": 196730 + }, + { + "epoch": 15.245844472858305, + "grad_norm": 1.6262734623380775, + "learning_rate": 7.623217606943584e-07, + "loss": 0.8638, + "step": 196740 + }, + { + "epoch": 15.246619396334612, + "grad_norm": 1.5037360188270654, + "learning_rate": 7.623605083694979e-07, + "loss": 0.8653, + "step": 196750 + }, + { + "epoch": 15.247394319810919, + "grad_norm": 1.5226725944653978, + "learning_rate": 7.623992560446373e-07, + "loss": 0.8719, + "step": 196760 + }, + { + "epoch": 15.248169243287226, + "grad_norm": 1.5876270956244565, + "learning_rate": 7.624380037197769e-07, + "loss": 0.8697, + "step": 196770 + }, + { + "epoch": 15.248944166763533, + "grad_norm": 1.7563571327395482, + "learning_rate": 7.624767513949164e-07, + "loss": 0.8785, + "step": 196780 + }, + { + "epoch": 15.24971909023984, + "grad_norm": 1.5507281725832955, + "learning_rate": 7.625154990700559e-07, + "loss": 0.852, + "step": 196790 + }, + { + "epoch": 15.250494013716146, + "grad_norm": 1.621290350329808, + "learning_rate": 7.625542467451953e-07, + "loss": 0.8663, + "step": 196800 + }, + { + "epoch": 15.251268937192453, + "grad_norm": 1.6028296902880461, + "learning_rate": 7.625929944203349e-07, + "loss": 0.8716, + "step": 196810 + }, + { + "epoch": 15.25204386066876, + "grad_norm": 1.5924001170029927, + "learning_rate": 7.626317420954743e-07, + "loss": 0.8618, + "step": 196820 + }, + { + "epoch": 15.252818784145065, + "grad_norm": 1.4954175307151338, + "learning_rate": 7.626704897706139e-07, + "loss": 0.8411, + "step": 196830 + }, + { + "epoch": 15.253593707621372, + "grad_norm": 1.5245332394225555, + "learning_rate": 7.627092374457533e-07, + "loss": 0.8698, + "step": 196840 + }, + { + "epoch": 15.254368631097678, + "grad_norm": 1.5931857597352561, + "learning_rate": 7.627479851208928e-07, + "loss": 0.8331, + "step": 196850 + }, + { + "epoch": 15.255143554573985, + "grad_norm": 1.4944406645488486, + "learning_rate": 7.627867327960323e-07, + "loss": 0.873, + "step": 196860 + }, + { + "epoch": 15.255918478050292, + "grad_norm": 1.5589118331057548, + "learning_rate": 7.628254804711718e-07, + "loss": 0.8708, + "step": 196870 + }, + { + "epoch": 15.256693401526599, + "grad_norm": 1.5697417270404563, + "learning_rate": 7.628642281463113e-07, + "loss": 0.8577, + "step": 196880 + }, + { + "epoch": 15.257468325002906, + "grad_norm": 1.6744232981471625, + "learning_rate": 7.629029758214508e-07, + "loss": 0.8482, + "step": 196890 + }, + { + "epoch": 15.258243248479213, + "grad_norm": 1.6613550887074027, + "learning_rate": 7.629417234965902e-07, + "loss": 0.8437, + "step": 196900 + }, + { + "epoch": 15.25901817195552, + "grad_norm": 1.715345144426575, + "learning_rate": 7.629804711717298e-07, + "loss": 0.8665, + "step": 196910 + }, + { + "epoch": 15.259793095431826, + "grad_norm": 1.5843451895981588, + "learning_rate": 7.630192188468692e-07, + "loss": 0.8509, + "step": 196920 + }, + { + "epoch": 15.260568018908133, + "grad_norm": 1.5961194627286532, + "learning_rate": 7.630579665220088e-07, + "loss": 0.861, + "step": 196930 + }, + { + "epoch": 15.26134294238444, + "grad_norm": 1.6027065931382323, + "learning_rate": 7.630967141971482e-07, + "loss": 0.8732, + "step": 196940 + }, + { + "epoch": 15.262117865860747, + "grad_norm": 1.5380879522537598, + "learning_rate": 7.631354618722878e-07, + "loss": 0.8566, + "step": 196950 + }, + { + "epoch": 15.262892789337053, + "grad_norm": 1.545772189741049, + "learning_rate": 7.631742095474272e-07, + "loss": 0.8531, + "step": 196960 + }, + { + "epoch": 15.26366771281336, + "grad_norm": 1.5845354319623273, + "learning_rate": 7.632129572225667e-07, + "loss": 0.8676, + "step": 196970 + }, + { + "epoch": 15.264442636289667, + "grad_norm": 1.622514768972115, + "learning_rate": 7.632517048977062e-07, + "loss": 0.8612, + "step": 196980 + }, + { + "epoch": 15.265217559765974, + "grad_norm": 1.6010560968293803, + "learning_rate": 7.632904525728457e-07, + "loss": 0.8599, + "step": 196990 + }, + { + "epoch": 15.26599248324228, + "grad_norm": 1.5111662277623181, + "learning_rate": 7.633292002479852e-07, + "loss": 0.8701, + "step": 197000 + }, + { + "epoch": 15.26599248324228, + "eval_loss": 0.8912608623504639, + "eval_runtime": 330.6905, + "eval_samples_per_second": 34.688, + "eval_steps_per_second": 8.673, + "step": 197000 + }, + { + "epoch": 15.266767406718586, + "grad_norm": 1.5654960607383146, + "learning_rate": 7.633679479231247e-07, + "loss": 0.8624, + "step": 197010 + }, + { + "epoch": 15.267542330194892, + "grad_norm": 1.5745837194667587, + "learning_rate": 7.634066955982641e-07, + "loss": 0.8744, + "step": 197020 + }, + { + "epoch": 15.2683172536712, + "grad_norm": 1.605715047661005, + "learning_rate": 7.634454432734037e-07, + "loss": 0.8718, + "step": 197030 + }, + { + "epoch": 15.269092177147506, + "grad_norm": 1.562643906050445, + "learning_rate": 7.634841909485431e-07, + "loss": 0.8576, + "step": 197040 + }, + { + "epoch": 15.269867100623813, + "grad_norm": 1.5852715098879715, + "learning_rate": 7.635229386236827e-07, + "loss": 0.8516, + "step": 197050 + }, + { + "epoch": 15.27064202410012, + "grad_norm": 1.5806800447393348, + "learning_rate": 7.635616862988221e-07, + "loss": 0.8559, + "step": 197060 + }, + { + "epoch": 15.271416947576427, + "grad_norm": 1.5276355966948658, + "learning_rate": 7.636004339739616e-07, + "loss": 0.8616, + "step": 197070 + }, + { + "epoch": 15.272191871052733, + "grad_norm": 1.5810200334762021, + "learning_rate": 7.636391816491011e-07, + "loss": 0.8674, + "step": 197080 + }, + { + "epoch": 15.27296679452904, + "grad_norm": 1.5148934535732608, + "learning_rate": 7.636779293242407e-07, + "loss": 0.8687, + "step": 197090 + }, + { + "epoch": 15.273741718005347, + "grad_norm": 1.6135400793106536, + "learning_rate": 7.637166769993801e-07, + "loss": 0.853, + "step": 197100 + }, + { + "epoch": 15.274516641481654, + "grad_norm": 1.6118932490261126, + "learning_rate": 7.637554246745196e-07, + "loss": 0.8521, + "step": 197110 + }, + { + "epoch": 15.27529156495796, + "grad_norm": 1.5149881439332618, + "learning_rate": 7.63794172349659e-07, + "loss": 0.8571, + "step": 197120 + }, + { + "epoch": 15.276066488434267, + "grad_norm": 1.5986348043663532, + "learning_rate": 7.638329200247986e-07, + "loss": 0.8557, + "step": 197130 + }, + { + "epoch": 15.276841411910574, + "grad_norm": 1.5151426620915809, + "learning_rate": 7.638716676999381e-07, + "loss": 0.8649, + "step": 197140 + }, + { + "epoch": 15.277616335386881, + "grad_norm": 1.6848361366172333, + "learning_rate": 7.639104153750776e-07, + "loss": 0.877, + "step": 197150 + }, + { + "epoch": 15.278391258863188, + "grad_norm": 1.5548034238886284, + "learning_rate": 7.63949163050217e-07, + "loss": 0.8429, + "step": 197160 + }, + { + "epoch": 15.279166182339495, + "grad_norm": 1.5422686419674894, + "learning_rate": 7.639879107253565e-07, + "loss": 0.8729, + "step": 197170 + }, + { + "epoch": 15.279941105815801, + "grad_norm": 1.5499203575730491, + "learning_rate": 7.64026658400496e-07, + "loss": 0.865, + "step": 197180 + }, + { + "epoch": 15.280716029292108, + "grad_norm": 1.5422900951695295, + "learning_rate": 7.640654060756356e-07, + "loss": 0.8638, + "step": 197190 + }, + { + "epoch": 15.281490952768413, + "grad_norm": 1.498456888263048, + "learning_rate": 7.64104153750775e-07, + "loss": 0.8401, + "step": 197200 + }, + { + "epoch": 15.28226587624472, + "grad_norm": 1.6010402276468672, + "learning_rate": 7.641429014259145e-07, + "loss": 0.8734, + "step": 197210 + }, + { + "epoch": 15.283040799721027, + "grad_norm": 1.6272387585274486, + "learning_rate": 7.641816491010539e-07, + "loss": 0.8597, + "step": 197220 + }, + { + "epoch": 15.283815723197334, + "grad_norm": 1.788065895768899, + "learning_rate": 7.642203967761936e-07, + "loss": 0.8806, + "step": 197230 + }, + { + "epoch": 15.28459064667364, + "grad_norm": 1.5625465579233988, + "learning_rate": 7.64259144451333e-07, + "loss": 0.8383, + "step": 197240 + }, + { + "epoch": 15.285365570149947, + "grad_norm": 1.5028469733823058, + "learning_rate": 7.642978921264725e-07, + "loss": 0.8708, + "step": 197250 + }, + { + "epoch": 15.286140493626254, + "grad_norm": 1.478439541108295, + "learning_rate": 7.643366398016119e-07, + "loss": 0.8556, + "step": 197260 + }, + { + "epoch": 15.286915417102561, + "grad_norm": 1.5433831585081093, + "learning_rate": 7.643753874767515e-07, + "loss": 0.8634, + "step": 197270 + }, + { + "epoch": 15.287690340578868, + "grad_norm": 1.60008244558488, + "learning_rate": 7.644141351518909e-07, + "loss": 0.8636, + "step": 197280 + }, + { + "epoch": 15.288465264055175, + "grad_norm": 1.4529674308624125, + "learning_rate": 7.644528828270305e-07, + "loss": 0.858, + "step": 197290 + }, + { + "epoch": 15.289240187531481, + "grad_norm": 1.511972058182751, + "learning_rate": 7.644916305021699e-07, + "loss": 0.8549, + "step": 197300 + }, + { + "epoch": 15.290015111007788, + "grad_norm": 1.537403304171868, + "learning_rate": 7.645303781773094e-07, + "loss": 0.8703, + "step": 197310 + }, + { + "epoch": 15.290790034484095, + "grad_norm": 1.4859598686020161, + "learning_rate": 7.645691258524488e-07, + "loss": 0.8572, + "step": 197320 + }, + { + "epoch": 15.291564957960402, + "grad_norm": 1.629073071852819, + "learning_rate": 7.646078735275885e-07, + "loss": 0.8428, + "step": 197330 + }, + { + "epoch": 15.292339881436709, + "grad_norm": 1.5169217571670592, + "learning_rate": 7.646466212027279e-07, + "loss": 0.8515, + "step": 197340 + }, + { + "epoch": 15.293114804913015, + "grad_norm": 1.606743234235619, + "learning_rate": 7.646853688778674e-07, + "loss": 0.8573, + "step": 197350 + }, + { + "epoch": 15.293889728389322, + "grad_norm": 1.630344598078535, + "learning_rate": 7.647241165530068e-07, + "loss": 0.8692, + "step": 197360 + }, + { + "epoch": 15.294664651865629, + "grad_norm": 1.52265933850163, + "learning_rate": 7.647628642281465e-07, + "loss": 0.8713, + "step": 197370 + }, + { + "epoch": 15.295439575341934, + "grad_norm": 1.516638090652187, + "learning_rate": 7.648016119032859e-07, + "loss": 0.8625, + "step": 197380 + }, + { + "epoch": 15.296214498818241, + "grad_norm": 1.6440486526373148, + "learning_rate": 7.648403595784254e-07, + "loss": 0.8674, + "step": 197390 + }, + { + "epoch": 15.296989422294548, + "grad_norm": 1.5676607531350784, + "learning_rate": 7.648791072535648e-07, + "loss": 0.8625, + "step": 197400 + }, + { + "epoch": 15.297764345770855, + "grad_norm": 1.69655260925198, + "learning_rate": 7.649178549287043e-07, + "loss": 0.8571, + "step": 197410 + }, + { + "epoch": 15.298539269247161, + "grad_norm": 1.5345517346932551, + "learning_rate": 7.649566026038438e-07, + "loss": 0.8346, + "step": 197420 + }, + { + "epoch": 15.299314192723468, + "grad_norm": 1.5755611948038775, + "learning_rate": 7.649953502789834e-07, + "loss": 0.8604, + "step": 197430 + }, + { + "epoch": 15.300089116199775, + "grad_norm": 1.5491842144746333, + "learning_rate": 7.650340979541228e-07, + "loss": 0.8632, + "step": 197440 + }, + { + "epoch": 15.300864039676082, + "grad_norm": 1.5774812383261572, + "learning_rate": 7.650728456292623e-07, + "loss": 0.855, + "step": 197450 + }, + { + "epoch": 15.301638963152389, + "grad_norm": 1.5703547560799256, + "learning_rate": 7.651115933044017e-07, + "loss": 0.8393, + "step": 197460 + }, + { + "epoch": 15.302413886628695, + "grad_norm": 1.5496734776560628, + "learning_rate": 7.651503409795414e-07, + "loss": 0.8678, + "step": 197470 + }, + { + "epoch": 15.303188810105002, + "grad_norm": 1.4333823541711135, + "learning_rate": 7.651890886546808e-07, + "loss": 0.837, + "step": 197480 + }, + { + "epoch": 15.303963733581309, + "grad_norm": 1.6083142513205697, + "learning_rate": 7.652278363298203e-07, + "loss": 0.8575, + "step": 197490 + }, + { + "epoch": 15.304738657057616, + "grad_norm": 1.6371875348622096, + "learning_rate": 7.652665840049597e-07, + "loss": 0.8541, + "step": 197500 + }, + { + "epoch": 15.304738657057616, + "eval_loss": 0.8913307785987854, + "eval_runtime": 330.7483, + "eval_samples_per_second": 34.682, + "eval_steps_per_second": 8.671, + "step": 197500 + }, + { + "epoch": 15.305513580533923, + "grad_norm": 1.5480457848853568, + "learning_rate": 7.653053316800993e-07, + "loss": 0.8642, + "step": 197510 + }, + { + "epoch": 15.30628850401023, + "grad_norm": 1.6033806095206764, + "learning_rate": 7.653440793552388e-07, + "loss": 0.856, + "step": 197520 + }, + { + "epoch": 15.307063427486536, + "grad_norm": 1.6169227410893454, + "learning_rate": 7.653828270303783e-07, + "loss": 0.8673, + "step": 197530 + }, + { + "epoch": 15.307838350962843, + "grad_norm": 1.730216986629946, + "learning_rate": 7.654215747055177e-07, + "loss": 0.8614, + "step": 197540 + }, + { + "epoch": 15.30861327443915, + "grad_norm": 1.5870793079973249, + "learning_rate": 7.654603223806572e-07, + "loss": 0.8899, + "step": 197550 + }, + { + "epoch": 15.309388197915457, + "grad_norm": 1.5190420426216844, + "learning_rate": 7.654990700557966e-07, + "loss": 0.8613, + "step": 197560 + }, + { + "epoch": 15.310163121391762, + "grad_norm": 1.4937409097793608, + "learning_rate": 7.655378177309363e-07, + "loss": 0.8476, + "step": 197570 + }, + { + "epoch": 15.310938044868069, + "grad_norm": 1.5598407113050425, + "learning_rate": 7.655765654060757e-07, + "loss": 0.8648, + "step": 197580 + }, + { + "epoch": 15.311712968344375, + "grad_norm": 1.6700925033000238, + "learning_rate": 7.656153130812152e-07, + "loss": 0.8822, + "step": 197590 + }, + { + "epoch": 15.312487891820682, + "grad_norm": 1.4665728181668463, + "learning_rate": 7.656540607563546e-07, + "loss": 0.8493, + "step": 197600 + }, + { + "epoch": 15.313262815296989, + "grad_norm": 1.540540897167378, + "learning_rate": 7.656928084314942e-07, + "loss": 0.8298, + "step": 197610 + }, + { + "epoch": 15.314037738773296, + "grad_norm": 1.5913204474590736, + "learning_rate": 7.657315561066337e-07, + "loss": 0.863, + "step": 197620 + }, + { + "epoch": 15.314812662249603, + "grad_norm": 1.6805380334991034, + "learning_rate": 7.657703037817732e-07, + "loss": 0.8632, + "step": 197630 + }, + { + "epoch": 15.31558758572591, + "grad_norm": 1.4986317818376365, + "learning_rate": 7.658090514569126e-07, + "loss": 0.8601, + "step": 197640 + }, + { + "epoch": 15.316362509202216, + "grad_norm": 1.6165539367004065, + "learning_rate": 7.658477991320522e-07, + "loss": 0.8468, + "step": 197650 + }, + { + "epoch": 15.317137432678523, + "grad_norm": 1.5822676187156395, + "learning_rate": 7.658865468071916e-07, + "loss": 0.8622, + "step": 197660 + }, + { + "epoch": 15.31791235615483, + "grad_norm": 1.6111063253371734, + "learning_rate": 7.659252944823312e-07, + "loss": 0.8767, + "step": 197670 + }, + { + "epoch": 15.318687279631137, + "grad_norm": 1.5559704554245193, + "learning_rate": 7.659640421574706e-07, + "loss": 0.8775, + "step": 197680 + }, + { + "epoch": 15.319462203107443, + "grad_norm": 1.7307337976854096, + "learning_rate": 7.660027898326101e-07, + "loss": 0.8758, + "step": 197690 + }, + { + "epoch": 15.32023712658375, + "grad_norm": 1.664620371934854, + "learning_rate": 7.660415375077495e-07, + "loss": 0.8639, + "step": 197700 + }, + { + "epoch": 15.321012050060057, + "grad_norm": 1.5914020316754944, + "learning_rate": 7.660802851828891e-07, + "loss": 0.8705, + "step": 197710 + }, + { + "epoch": 15.321786973536364, + "grad_norm": 1.4977969055170324, + "learning_rate": 7.661190328580286e-07, + "loss": 0.8559, + "step": 197720 + }, + { + "epoch": 15.32256189701267, + "grad_norm": 1.542256563299059, + "learning_rate": 7.661577805331681e-07, + "loss": 0.8404, + "step": 197730 + }, + { + "epoch": 15.323336820488977, + "grad_norm": 1.5634495094458538, + "learning_rate": 7.661965282083075e-07, + "loss": 0.8608, + "step": 197740 + }, + { + "epoch": 15.324111743965283, + "grad_norm": 1.585007352999631, + "learning_rate": 7.662352758834471e-07, + "loss": 0.8684, + "step": 197750 + }, + { + "epoch": 15.32488666744159, + "grad_norm": 1.5381780294993666, + "learning_rate": 7.662740235585865e-07, + "loss": 0.8623, + "step": 197760 + }, + { + "epoch": 15.325661590917896, + "grad_norm": 1.5451140855824828, + "learning_rate": 7.663127712337261e-07, + "loss": 0.8539, + "step": 197770 + }, + { + "epoch": 15.326436514394203, + "grad_norm": 1.4670834388156018, + "learning_rate": 7.663515189088655e-07, + "loss": 0.8618, + "step": 197780 + }, + { + "epoch": 15.32721143787051, + "grad_norm": 1.4992141026981156, + "learning_rate": 7.663902665840051e-07, + "loss": 0.8451, + "step": 197790 + }, + { + "epoch": 15.327986361346817, + "grad_norm": 1.5185699803181365, + "learning_rate": 7.664290142591445e-07, + "loss": 0.8525, + "step": 197800 + }, + { + "epoch": 15.328761284823123, + "grad_norm": 1.6183031768150649, + "learning_rate": 7.66467761934284e-07, + "loss": 0.8573, + "step": 197810 + }, + { + "epoch": 15.32953620829943, + "grad_norm": 1.6112566188692732, + "learning_rate": 7.665065096094235e-07, + "loss": 0.8788, + "step": 197820 + }, + { + "epoch": 15.330311131775737, + "grad_norm": 1.6007206867214738, + "learning_rate": 7.66545257284563e-07, + "loss": 0.8469, + "step": 197830 + }, + { + "epoch": 15.331086055252044, + "grad_norm": 1.646962066933567, + "learning_rate": 7.665840049597024e-07, + "loss": 0.8622, + "step": 197840 + }, + { + "epoch": 15.33186097872835, + "grad_norm": 1.5755401395690294, + "learning_rate": 7.66622752634842e-07, + "loss": 0.8724, + "step": 197850 + }, + { + "epoch": 15.332635902204657, + "grad_norm": 1.54242738546826, + "learning_rate": 7.666615003099814e-07, + "loss": 0.8633, + "step": 197860 + }, + { + "epoch": 15.333410825680964, + "grad_norm": 1.5208550474543336, + "learning_rate": 7.66700247985121e-07, + "loss": 0.8648, + "step": 197870 + }, + { + "epoch": 15.334185749157271, + "grad_norm": 1.5261735399397922, + "learning_rate": 7.667389956602604e-07, + "loss": 0.8788, + "step": 197880 + }, + { + "epoch": 15.334960672633578, + "grad_norm": 1.703056698565137, + "learning_rate": 7.667777433354e-07, + "loss": 0.8463, + "step": 197890 + }, + { + "epoch": 15.335735596109885, + "grad_norm": 1.55086917667936, + "learning_rate": 7.668164910105394e-07, + "loss": 0.8579, + "step": 197900 + }, + { + "epoch": 15.336510519586191, + "grad_norm": 1.5685073627404673, + "learning_rate": 7.66855238685679e-07, + "loss": 0.8637, + "step": 197910 + }, + { + "epoch": 15.337285443062498, + "grad_norm": 1.6669415617587493, + "learning_rate": 7.668939863608184e-07, + "loss": 0.8612, + "step": 197920 + }, + { + "epoch": 15.338060366538805, + "grad_norm": 1.5883261417092343, + "learning_rate": 7.66932734035958e-07, + "loss": 0.8577, + "step": 197930 + }, + { + "epoch": 15.338835290015112, + "grad_norm": 1.6778918779607175, + "learning_rate": 7.669714817110974e-07, + "loss": 0.8576, + "step": 197940 + }, + { + "epoch": 15.339610213491417, + "grad_norm": 1.6314357887570445, + "learning_rate": 7.670102293862369e-07, + "loss": 0.8539, + "step": 197950 + }, + { + "epoch": 15.340385136967724, + "grad_norm": 1.590903622716477, + "learning_rate": 7.670489770613763e-07, + "loss": 0.8745, + "step": 197960 + }, + { + "epoch": 15.34116006044403, + "grad_norm": 1.563351278017226, + "learning_rate": 7.670877247365159e-07, + "loss": 0.8735, + "step": 197970 + }, + { + "epoch": 15.341934983920337, + "grad_norm": 1.6038213024773853, + "learning_rate": 7.671264724116553e-07, + "loss": 0.862, + "step": 197980 + }, + { + "epoch": 15.342709907396644, + "grad_norm": 1.5534004266643184, + "learning_rate": 7.671652200867949e-07, + "loss": 0.8666, + "step": 197990 + }, + { + "epoch": 15.343484830872951, + "grad_norm": 1.5441790098783472, + "learning_rate": 7.672039677619343e-07, + "loss": 0.862, + "step": 198000 + }, + { + "epoch": 15.343484830872951, + "eval_loss": 0.8909204006195068, + "eval_runtime": 329.74, + "eval_samples_per_second": 34.788, + "eval_steps_per_second": 8.698, + "step": 198000 + }, + { + "epoch": 15.344259754349258, + "grad_norm": 1.6193212216236426, + "learning_rate": 7.672427154370739e-07, + "loss": 0.8622, + "step": 198010 + }, + { + "epoch": 15.345034677825565, + "grad_norm": 1.6741324840161769, + "learning_rate": 7.672814631122133e-07, + "loss": 0.8869, + "step": 198020 + }, + { + "epoch": 15.345809601301871, + "grad_norm": 1.5735017776899567, + "learning_rate": 7.673202107873529e-07, + "loss": 0.8813, + "step": 198030 + }, + { + "epoch": 15.346584524778178, + "grad_norm": 1.6471295062292153, + "learning_rate": 7.673589584624923e-07, + "loss": 0.8568, + "step": 198040 + }, + { + "epoch": 15.347359448254485, + "grad_norm": 1.5598248415880538, + "learning_rate": 7.673977061376318e-07, + "loss": 0.8373, + "step": 198050 + }, + { + "epoch": 15.348134371730792, + "grad_norm": 1.5539130179073029, + "learning_rate": 7.674364538127713e-07, + "loss": 0.8568, + "step": 198060 + }, + { + "epoch": 15.348909295207099, + "grad_norm": 1.548436168405942, + "learning_rate": 7.674752014879108e-07, + "loss": 0.8773, + "step": 198070 + }, + { + "epoch": 15.349684218683405, + "grad_norm": 1.5582542219513178, + "learning_rate": 7.675139491630503e-07, + "loss": 0.85, + "step": 198080 + }, + { + "epoch": 15.350459142159712, + "grad_norm": 1.6112399331080551, + "learning_rate": 7.675526968381898e-07, + "loss": 0.8682, + "step": 198090 + }, + { + "epoch": 15.351234065636019, + "grad_norm": 1.6149717510433206, + "learning_rate": 7.675914445133292e-07, + "loss": 0.852, + "step": 198100 + }, + { + "epoch": 15.352008989112326, + "grad_norm": 1.418069067994803, + "learning_rate": 7.676301921884688e-07, + "loss": 0.8577, + "step": 198110 + }, + { + "epoch": 15.352783912588633, + "grad_norm": 1.5723617141887738, + "learning_rate": 7.676689398636082e-07, + "loss": 0.8565, + "step": 198120 + }, + { + "epoch": 15.353558836064938, + "grad_norm": 1.58647096876226, + "learning_rate": 7.677076875387478e-07, + "loss": 0.8831, + "step": 198130 + }, + { + "epoch": 15.354333759541245, + "grad_norm": 2.2865159387050893, + "learning_rate": 7.677464352138872e-07, + "loss": 0.8891, + "step": 198140 + }, + { + "epoch": 15.355108683017551, + "grad_norm": 1.4508765717644052, + "learning_rate": 7.677851828890267e-07, + "loss": 0.8722, + "step": 198150 + }, + { + "epoch": 15.355883606493858, + "grad_norm": 1.4835276108888389, + "learning_rate": 7.678239305641662e-07, + "loss": 0.8767, + "step": 198160 + }, + { + "epoch": 15.356658529970165, + "grad_norm": 1.5731400540685274, + "learning_rate": 7.678626782393058e-07, + "loss": 0.8704, + "step": 198170 + }, + { + "epoch": 15.357433453446472, + "grad_norm": 1.5983808644529345, + "learning_rate": 7.679014259144452e-07, + "loss": 0.8561, + "step": 198180 + }, + { + "epoch": 15.358208376922779, + "grad_norm": 1.6310926884765382, + "learning_rate": 7.679401735895847e-07, + "loss": 0.8523, + "step": 198190 + }, + { + "epoch": 15.358983300399085, + "grad_norm": 1.499964683082152, + "learning_rate": 7.679789212647241e-07, + "loss": 0.8591, + "step": 198200 + }, + { + "epoch": 15.359758223875392, + "grad_norm": 1.4978289229677957, + "learning_rate": 7.680176689398637e-07, + "loss": 0.849, + "step": 198210 + }, + { + "epoch": 15.360533147351699, + "grad_norm": 1.6558405164913825, + "learning_rate": 7.680564166150032e-07, + "loss": 0.8557, + "step": 198220 + }, + { + "epoch": 15.361308070828006, + "grad_norm": 1.6619049699451012, + "learning_rate": 7.680951642901427e-07, + "loss": 0.871, + "step": 198230 + }, + { + "epoch": 15.362082994304313, + "grad_norm": 1.5161720176843112, + "learning_rate": 7.681339119652821e-07, + "loss": 0.8611, + "step": 198240 + }, + { + "epoch": 15.36285791778062, + "grad_norm": 1.6492600625060976, + "learning_rate": 7.681726596404216e-07, + "loss": 0.856, + "step": 198250 + }, + { + "epoch": 15.363632841256926, + "grad_norm": 1.5278762981532397, + "learning_rate": 7.682114073155611e-07, + "loss": 0.8653, + "step": 198260 + }, + { + "epoch": 15.364407764733233, + "grad_norm": 1.4827680690917917, + "learning_rate": 7.682501549907007e-07, + "loss": 0.8517, + "step": 198270 + }, + { + "epoch": 15.36518268820954, + "grad_norm": 1.4652601907823763, + "learning_rate": 7.682889026658401e-07, + "loss": 0.8458, + "step": 198280 + }, + { + "epoch": 15.365957611685847, + "grad_norm": 1.38343269323522, + "learning_rate": 7.683276503409796e-07, + "loss": 0.8655, + "step": 198290 + }, + { + "epoch": 15.366732535162154, + "grad_norm": 1.5175451003549638, + "learning_rate": 7.68366398016119e-07, + "loss": 0.8536, + "step": 198300 + }, + { + "epoch": 15.36750745863846, + "grad_norm": 1.530011048413216, + "learning_rate": 7.684051456912587e-07, + "loss": 0.848, + "step": 198310 + }, + { + "epoch": 15.368282382114765, + "grad_norm": 1.5459510722831273, + "learning_rate": 7.684438933663981e-07, + "loss": 0.8388, + "step": 198320 + }, + { + "epoch": 15.369057305591072, + "grad_norm": 1.5871865913153242, + "learning_rate": 7.684826410415376e-07, + "loss": 0.8611, + "step": 198330 + }, + { + "epoch": 15.369832229067379, + "grad_norm": 1.5252310502461692, + "learning_rate": 7.68521388716677e-07, + "loss": 0.8759, + "step": 198340 + }, + { + "epoch": 15.370607152543686, + "grad_norm": 1.6099749022779344, + "learning_rate": 7.685601363918165e-07, + "loss": 0.8745, + "step": 198350 + }, + { + "epoch": 15.371382076019993, + "grad_norm": 1.6135297860657143, + "learning_rate": 7.685988840669561e-07, + "loss": 0.8691, + "step": 198360 + }, + { + "epoch": 15.3721569994963, + "grad_norm": 1.603310340065867, + "learning_rate": 7.686376317420956e-07, + "loss": 0.8497, + "step": 198370 + }, + { + "epoch": 15.372931922972606, + "grad_norm": 1.5830196361537068, + "learning_rate": 7.68676379417235e-07, + "loss": 0.8734, + "step": 198380 + }, + { + "epoch": 15.373706846448913, + "grad_norm": 1.5727407817583672, + "learning_rate": 7.687151270923745e-07, + "loss": 0.8506, + "step": 198390 + }, + { + "epoch": 15.37448176992522, + "grad_norm": 1.6089545239713696, + "learning_rate": 7.687538747675139e-07, + "loss": 0.8637, + "step": 198400 + }, + { + "epoch": 15.375256693401527, + "grad_norm": 1.558657208985056, + "learning_rate": 7.687926224426536e-07, + "loss": 0.8391, + "step": 198410 + }, + { + "epoch": 15.376031616877833, + "grad_norm": 1.6116609156005977, + "learning_rate": 7.68831370117793e-07, + "loss": 0.903, + "step": 198420 + }, + { + "epoch": 15.37680654035414, + "grad_norm": 1.5756972729095813, + "learning_rate": 7.688701177929325e-07, + "loss": 0.8805, + "step": 198430 + }, + { + "epoch": 15.377581463830447, + "grad_norm": 1.5102208426383255, + "learning_rate": 7.689088654680719e-07, + "loss": 0.8571, + "step": 198440 + }, + { + "epoch": 15.378356387306754, + "grad_norm": 1.5484168670474727, + "learning_rate": 7.689476131432115e-07, + "loss": 0.8582, + "step": 198450 + }, + { + "epoch": 15.37913131078306, + "grad_norm": 1.4988814190725996, + "learning_rate": 7.68986360818351e-07, + "loss": 0.8704, + "step": 198460 + }, + { + "epoch": 15.379906234259368, + "grad_norm": 1.6222145394259588, + "learning_rate": 7.690251084934905e-07, + "loss": 0.8841, + "step": 198470 + }, + { + "epoch": 15.380681157735674, + "grad_norm": 1.6187239435816356, + "learning_rate": 7.690638561686299e-07, + "loss": 0.853, + "step": 198480 + }, + { + "epoch": 15.381456081211981, + "grad_norm": 1.5550668155982104, + "learning_rate": 7.691026038437694e-07, + "loss": 0.8725, + "step": 198490 + }, + { + "epoch": 15.382231004688286, + "grad_norm": 1.5939066464816718, + "learning_rate": 7.69141351518909e-07, + "loss": 0.8671, + "step": 198500 + }, + { + "epoch": 15.382231004688286, + "eval_loss": 0.8912938833236694, + "eval_runtime": 329.2471, + "eval_samples_per_second": 34.84, + "eval_steps_per_second": 8.711, + "step": 198500 + }, + { + "epoch": 15.383005928164593, + "grad_norm": 1.5610250159348222, + "learning_rate": 7.691800991940485e-07, + "loss": 0.8509, + "step": 198510 + }, + { + "epoch": 15.3837808516409, + "grad_norm": 1.4995391402243037, + "learning_rate": 7.692188468691879e-07, + "loss": 0.8657, + "step": 198520 + }, + { + "epoch": 15.384555775117207, + "grad_norm": 1.5072545104863568, + "learning_rate": 7.692575945443274e-07, + "loss": 0.8448, + "step": 198530 + }, + { + "epoch": 15.385330698593513, + "grad_norm": 1.519228883951812, + "learning_rate": 7.692963422194668e-07, + "loss": 0.8557, + "step": 198540 + }, + { + "epoch": 15.38610562206982, + "grad_norm": 1.4760678916793573, + "learning_rate": 7.693350898946065e-07, + "loss": 0.8601, + "step": 198550 + }, + { + "epoch": 15.386880545546127, + "grad_norm": 1.5434433893847925, + "learning_rate": 7.693738375697459e-07, + "loss": 0.8722, + "step": 198560 + }, + { + "epoch": 15.387655469022434, + "grad_norm": 1.558721438302802, + "learning_rate": 7.694125852448854e-07, + "loss": 0.8634, + "step": 198570 + }, + { + "epoch": 15.38843039249874, + "grad_norm": 1.5477518144434372, + "learning_rate": 7.694513329200248e-07, + "loss": 0.8731, + "step": 198580 + }, + { + "epoch": 15.389205315975047, + "grad_norm": 1.588435693074282, + "learning_rate": 7.694900805951644e-07, + "loss": 0.8789, + "step": 198590 + }, + { + "epoch": 15.389980239451354, + "grad_norm": 1.5455431319578459, + "learning_rate": 7.695288282703039e-07, + "loss": 0.8496, + "step": 198600 + }, + { + "epoch": 15.390755162927661, + "grad_norm": 1.6232589538429212, + "learning_rate": 7.695675759454434e-07, + "loss": 0.8484, + "step": 198610 + }, + { + "epoch": 15.391530086403968, + "grad_norm": 1.5001467483543214, + "learning_rate": 7.696063236205828e-07, + "loss": 0.8301, + "step": 198620 + }, + { + "epoch": 15.392305009880275, + "grad_norm": 1.5547556107115417, + "learning_rate": 7.696450712957223e-07, + "loss": 0.8487, + "step": 198630 + }, + { + "epoch": 15.393079933356582, + "grad_norm": 1.522277591894033, + "learning_rate": 7.696838189708617e-07, + "loss": 0.8484, + "step": 198640 + }, + { + "epoch": 15.393854856832888, + "grad_norm": 1.657596166604795, + "learning_rate": 7.697225666460014e-07, + "loss": 0.8654, + "step": 198650 + }, + { + "epoch": 15.394629780309195, + "grad_norm": 1.5838981710695859, + "learning_rate": 7.697613143211408e-07, + "loss": 0.8692, + "step": 198660 + }, + { + "epoch": 15.395404703785502, + "grad_norm": 1.6204742183090504, + "learning_rate": 7.698000619962803e-07, + "loss": 0.8614, + "step": 198670 + }, + { + "epoch": 15.396179627261809, + "grad_norm": 1.4699981359582641, + "learning_rate": 7.698388096714197e-07, + "loss": 0.854, + "step": 198680 + }, + { + "epoch": 15.396954550738114, + "grad_norm": 1.653333439297183, + "learning_rate": 7.698775573465593e-07, + "loss": 0.8575, + "step": 198690 + }, + { + "epoch": 15.39772947421442, + "grad_norm": 1.6077829429792045, + "learning_rate": 7.699163050216988e-07, + "loss": 0.869, + "step": 198700 + }, + { + "epoch": 15.398504397690727, + "grad_norm": 1.5229538361939337, + "learning_rate": 7.699550526968383e-07, + "loss": 0.8482, + "step": 198710 + }, + { + "epoch": 15.399279321167034, + "grad_norm": 1.5100235748357842, + "learning_rate": 7.699938003719777e-07, + "loss": 0.8456, + "step": 198720 + }, + { + "epoch": 15.400054244643341, + "grad_norm": 1.586030314693552, + "learning_rate": 7.700325480471173e-07, + "loss": 0.8538, + "step": 198730 + }, + { + "epoch": 15.400829168119648, + "grad_norm": 1.5881366199653084, + "learning_rate": 7.700712957222567e-07, + "loss": 0.86, + "step": 198740 + }, + { + "epoch": 15.401604091595955, + "grad_norm": 1.516515505301047, + "learning_rate": 7.701100433973963e-07, + "loss": 0.8469, + "step": 198750 + }, + { + "epoch": 15.402379015072261, + "grad_norm": 1.660015752789606, + "learning_rate": 7.701487910725357e-07, + "loss": 0.8666, + "step": 198760 + }, + { + "epoch": 15.403153938548568, + "grad_norm": 1.5448010410536952, + "learning_rate": 7.701875387476752e-07, + "loss": 0.8831, + "step": 198770 + }, + { + "epoch": 15.403928862024875, + "grad_norm": 1.5984276340221455, + "learning_rate": 7.702262864228146e-07, + "loss": 0.8564, + "step": 198780 + }, + { + "epoch": 15.404703785501182, + "grad_norm": 1.535453558851504, + "learning_rate": 7.702650340979542e-07, + "loss": 0.8698, + "step": 198790 + }, + { + "epoch": 15.405478708977489, + "grad_norm": 1.5846626149492937, + "learning_rate": 7.703037817730937e-07, + "loss": 0.8503, + "step": 198800 + }, + { + "epoch": 15.406253632453796, + "grad_norm": 1.5390544634194567, + "learning_rate": 7.703425294482332e-07, + "loss": 0.8395, + "step": 198810 + }, + { + "epoch": 15.407028555930102, + "grad_norm": 1.4941630398169137, + "learning_rate": 7.703812771233726e-07, + "loss": 0.879, + "step": 198820 + }, + { + "epoch": 15.40780347940641, + "grad_norm": 1.5925948974748243, + "learning_rate": 7.704200247985122e-07, + "loss": 0.8644, + "step": 198830 + }, + { + "epoch": 15.408578402882716, + "grad_norm": 1.5082105533283878, + "learning_rate": 7.704587724736516e-07, + "loss": 0.8783, + "step": 198840 + }, + { + "epoch": 15.409353326359023, + "grad_norm": 1.5811253290915088, + "learning_rate": 7.704975201487912e-07, + "loss": 0.87, + "step": 198850 + }, + { + "epoch": 15.41012824983533, + "grad_norm": 1.5907604457817346, + "learning_rate": 7.705362678239306e-07, + "loss": 0.8645, + "step": 198860 + }, + { + "epoch": 15.410903173311635, + "grad_norm": 1.6425135473132166, + "learning_rate": 7.705750154990702e-07, + "loss": 0.8788, + "step": 198870 + }, + { + "epoch": 15.411678096787941, + "grad_norm": 1.7370822240605843, + "learning_rate": 7.706137631742096e-07, + "loss": 0.8679, + "step": 198880 + }, + { + "epoch": 15.412453020264248, + "grad_norm": 1.5320564705048587, + "learning_rate": 7.706525108493491e-07, + "loss": 0.8619, + "step": 198890 + }, + { + "epoch": 15.413227943740555, + "grad_norm": 1.5570514116992917, + "learning_rate": 7.706912585244886e-07, + "loss": 0.8698, + "step": 198900 + }, + { + "epoch": 15.414002867216862, + "grad_norm": 1.576754967885824, + "learning_rate": 7.707300061996281e-07, + "loss": 0.8622, + "step": 198910 + }, + { + "epoch": 15.414777790693169, + "grad_norm": 1.4680133878649229, + "learning_rate": 7.707687538747675e-07, + "loss": 0.8605, + "step": 198920 + }, + { + "epoch": 15.415552714169475, + "grad_norm": 1.568896866440834, + "learning_rate": 7.708075015499071e-07, + "loss": 0.8692, + "step": 198930 + }, + { + "epoch": 15.416327637645782, + "grad_norm": 1.5953350285664376, + "learning_rate": 7.708462492250465e-07, + "loss": 0.8677, + "step": 198940 + }, + { + "epoch": 15.417102561122089, + "grad_norm": 1.5168041674931292, + "learning_rate": 7.708849969001861e-07, + "loss": 0.8593, + "step": 198950 + }, + { + "epoch": 15.417877484598396, + "grad_norm": 1.5487335797401796, + "learning_rate": 7.709237445753255e-07, + "loss": 0.8679, + "step": 198960 + }, + { + "epoch": 15.418652408074703, + "grad_norm": 1.640992565634334, + "learning_rate": 7.709624922504651e-07, + "loss": 0.8556, + "step": 198970 + }, + { + "epoch": 15.41942733155101, + "grad_norm": 1.5577255566067674, + "learning_rate": 7.710012399256045e-07, + "loss": 0.8517, + "step": 198980 + }, + { + "epoch": 15.420202255027316, + "grad_norm": 1.543764379112028, + "learning_rate": 7.71039987600744e-07, + "loss": 0.8445, + "step": 198990 + }, + { + "epoch": 15.420977178503623, + "grad_norm": 1.5439224896055843, + "learning_rate": 7.710787352758835e-07, + "loss": 0.8705, + "step": 199000 + }, + { + "epoch": 15.420977178503623, + "eval_loss": 0.8909161686897278, + "eval_runtime": 329.9791, + "eval_samples_per_second": 34.763, + "eval_steps_per_second": 8.691, + "step": 199000 + }, + { + "epoch": 15.42175210197993, + "grad_norm": 1.5804221802447271, + "learning_rate": 7.711174829510231e-07, + "loss": 0.8616, + "step": 199010 + }, + { + "epoch": 15.422527025456237, + "grad_norm": 1.7680562926556422, + "learning_rate": 7.711562306261625e-07, + "loss": 0.8777, + "step": 199020 + }, + { + "epoch": 15.423301948932544, + "grad_norm": 1.6490027238246807, + "learning_rate": 7.71194978301302e-07, + "loss": 0.8803, + "step": 199030 + }, + { + "epoch": 15.42407687240885, + "grad_norm": 1.5451073525163896, + "learning_rate": 7.712337259764414e-07, + "loss": 0.8595, + "step": 199040 + }, + { + "epoch": 15.424851795885157, + "grad_norm": 1.537657843302763, + "learning_rate": 7.71272473651581e-07, + "loss": 0.8538, + "step": 199050 + }, + { + "epoch": 15.425626719361462, + "grad_norm": 1.6127596288462336, + "learning_rate": 7.713112213267204e-07, + "loss": 0.8737, + "step": 199060 + }, + { + "epoch": 15.426401642837769, + "grad_norm": 1.4764051012850137, + "learning_rate": 7.7134996900186e-07, + "loss": 0.8633, + "step": 199070 + }, + { + "epoch": 15.427176566314076, + "grad_norm": 1.6410861414806384, + "learning_rate": 7.713887166769994e-07, + "loss": 0.8624, + "step": 199080 + }, + { + "epoch": 15.427951489790383, + "grad_norm": 1.560608913266429, + "learning_rate": 7.714274643521389e-07, + "loss": 0.8605, + "step": 199090 + }, + { + "epoch": 15.42872641326669, + "grad_norm": 1.5470460095535818, + "learning_rate": 7.714662120272784e-07, + "loss": 0.8693, + "step": 199100 + }, + { + "epoch": 15.429501336742996, + "grad_norm": 1.5391011646902015, + "learning_rate": 7.71504959702418e-07, + "loss": 0.8466, + "step": 199110 + }, + { + "epoch": 15.430276260219303, + "grad_norm": 1.572284414472585, + "learning_rate": 7.715437073775574e-07, + "loss": 0.8638, + "step": 199120 + }, + { + "epoch": 15.43105118369561, + "grad_norm": 1.5708867107890667, + "learning_rate": 7.715824550526969e-07, + "loss": 0.8605, + "step": 199130 + }, + { + "epoch": 15.431826107171917, + "grad_norm": 1.6376537212452766, + "learning_rate": 7.716212027278363e-07, + "loss": 0.8605, + "step": 199140 + }, + { + "epoch": 15.432601030648224, + "grad_norm": 1.636345805643707, + "learning_rate": 7.71659950402976e-07, + "loss": 0.8867, + "step": 199150 + }, + { + "epoch": 15.43337595412453, + "grad_norm": 1.5216039912802848, + "learning_rate": 7.716986980781154e-07, + "loss": 0.8706, + "step": 199160 + }, + { + "epoch": 15.434150877600837, + "grad_norm": 1.498535195813158, + "learning_rate": 7.717374457532549e-07, + "loss": 0.8523, + "step": 199170 + }, + { + "epoch": 15.434925801077144, + "grad_norm": 1.4959326926252134, + "learning_rate": 7.717761934283943e-07, + "loss": 0.8633, + "step": 199180 + }, + { + "epoch": 15.43570072455345, + "grad_norm": 1.5152533089872504, + "learning_rate": 7.718149411035338e-07, + "loss": 0.8585, + "step": 199190 + }, + { + "epoch": 15.436475648029758, + "grad_norm": 1.542710834953793, + "learning_rate": 7.718536887786733e-07, + "loss": 0.8445, + "step": 199200 + }, + { + "epoch": 15.437250571506064, + "grad_norm": 1.5361444588652968, + "learning_rate": 7.718924364538129e-07, + "loss": 0.8755, + "step": 199210 + }, + { + "epoch": 15.438025494982371, + "grad_norm": 1.628713761566305, + "learning_rate": 7.719311841289523e-07, + "loss": 0.8602, + "step": 199220 + }, + { + "epoch": 15.438800418458678, + "grad_norm": 1.525491439231722, + "learning_rate": 7.719699318040918e-07, + "loss": 0.862, + "step": 199230 + }, + { + "epoch": 15.439575341934983, + "grad_norm": 1.519118377630652, + "learning_rate": 7.720086794792312e-07, + "loss": 0.8776, + "step": 199240 + }, + { + "epoch": 15.44035026541129, + "grad_norm": 1.6219373829329915, + "learning_rate": 7.720474271543709e-07, + "loss": 0.8556, + "step": 199250 + }, + { + "epoch": 15.441125188887597, + "grad_norm": 1.4937156472990667, + "learning_rate": 7.720861748295103e-07, + "loss": 0.8806, + "step": 199260 + }, + { + "epoch": 15.441900112363903, + "grad_norm": 1.757602255966724, + "learning_rate": 7.721249225046498e-07, + "loss": 0.8669, + "step": 199270 + }, + { + "epoch": 15.44267503584021, + "grad_norm": 1.6102008291511356, + "learning_rate": 7.721636701797892e-07, + "loss": 0.8681, + "step": 199280 + }, + { + "epoch": 15.443449959316517, + "grad_norm": 1.535551985856146, + "learning_rate": 7.722024178549289e-07, + "loss": 0.8715, + "step": 199290 + }, + { + "epoch": 15.444224882792824, + "grad_norm": 1.6224661237783697, + "learning_rate": 7.722411655300683e-07, + "loss": 0.8645, + "step": 199300 + }, + { + "epoch": 15.44499980626913, + "grad_norm": 1.5031591773569304, + "learning_rate": 7.722799132052078e-07, + "loss": 0.8672, + "step": 199310 + }, + { + "epoch": 15.445774729745438, + "grad_norm": 1.5547985047779915, + "learning_rate": 7.723186608803472e-07, + "loss": 0.8599, + "step": 199320 + }, + { + "epoch": 15.446549653221744, + "grad_norm": 1.5747178270209419, + "learning_rate": 7.723574085554867e-07, + "loss": 0.8633, + "step": 199330 + }, + { + "epoch": 15.447324576698051, + "grad_norm": 1.5240598234181988, + "learning_rate": 7.723961562306261e-07, + "loss": 0.8628, + "step": 199340 + }, + { + "epoch": 15.448099500174358, + "grad_norm": 1.566242553799917, + "learning_rate": 7.724349039057658e-07, + "loss": 0.8609, + "step": 199350 + }, + { + "epoch": 15.448874423650665, + "grad_norm": 1.6237566709118523, + "learning_rate": 7.724736515809052e-07, + "loss": 0.8519, + "step": 199360 + }, + { + "epoch": 15.449649347126972, + "grad_norm": 1.558968855378618, + "learning_rate": 7.725123992560447e-07, + "loss": 0.8793, + "step": 199370 + }, + { + "epoch": 15.450424270603278, + "grad_norm": 1.51216916061557, + "learning_rate": 7.725511469311841e-07, + "loss": 0.8591, + "step": 199380 + }, + { + "epoch": 15.451199194079585, + "grad_norm": 1.5082098596353821, + "learning_rate": 7.725898946063238e-07, + "loss": 0.8605, + "step": 199390 + }, + { + "epoch": 15.451974117555892, + "grad_norm": 1.5637433245945227, + "learning_rate": 7.726286422814632e-07, + "loss": 0.8491, + "step": 199400 + }, + { + "epoch": 15.452749041032199, + "grad_norm": 1.5627136522804888, + "learning_rate": 7.726673899566027e-07, + "loss": 0.8721, + "step": 199410 + }, + { + "epoch": 15.453523964508506, + "grad_norm": 1.4154110849098618, + "learning_rate": 7.727061376317421e-07, + "loss": 0.8342, + "step": 199420 + }, + { + "epoch": 15.45429888798481, + "grad_norm": 1.599118424625706, + "learning_rate": 7.727448853068817e-07, + "loss": 0.8535, + "step": 199430 + }, + { + "epoch": 15.455073811461117, + "grad_norm": 1.610308898728198, + "learning_rate": 7.727836329820212e-07, + "loss": 0.8632, + "step": 199440 + }, + { + "epoch": 15.455848734937424, + "grad_norm": 1.5540874663280744, + "learning_rate": 7.728223806571607e-07, + "loss": 0.8681, + "step": 199450 + }, + { + "epoch": 15.456623658413731, + "grad_norm": 1.5303929879992082, + "learning_rate": 7.728611283323001e-07, + "loss": 0.8572, + "step": 199460 + }, + { + "epoch": 15.457398581890038, + "grad_norm": 1.6945085186385973, + "learning_rate": 7.728998760074396e-07, + "loss": 0.8535, + "step": 199470 + }, + { + "epoch": 15.458173505366345, + "grad_norm": 1.5476431124384502, + "learning_rate": 7.72938623682579e-07, + "loss": 0.8604, + "step": 199480 + }, + { + "epoch": 15.458948428842652, + "grad_norm": 1.5885963944554342, + "learning_rate": 7.729773713577187e-07, + "loss": 0.8527, + "step": 199490 + }, + { + "epoch": 15.459723352318958, + "grad_norm": 1.567573453915359, + "learning_rate": 7.730161190328581e-07, + "loss": 0.8674, + "step": 199500 + }, + { + "epoch": 15.459723352318958, + "eval_loss": 0.8907887935638428, + "eval_runtime": 331.7037, + "eval_samples_per_second": 34.582, + "eval_steps_per_second": 8.646, + "step": 199500 + }, + { + "epoch": 15.460498275795265, + "grad_norm": 1.5070770781738723, + "learning_rate": 7.730548667079976e-07, + "loss": 0.8636, + "step": 199510 + }, + { + "epoch": 15.461273199271572, + "grad_norm": 1.5017715160478708, + "learning_rate": 7.73093614383137e-07, + "loss": 0.8657, + "step": 199520 + }, + { + "epoch": 15.462048122747879, + "grad_norm": 1.6097352924607038, + "learning_rate": 7.731323620582766e-07, + "loss": 0.8577, + "step": 199530 + }, + { + "epoch": 15.462823046224186, + "grad_norm": 1.565116408170806, + "learning_rate": 7.731711097334161e-07, + "loss": 0.8689, + "step": 199540 + }, + { + "epoch": 15.463597969700492, + "grad_norm": 1.4831303676854357, + "learning_rate": 7.732098574085556e-07, + "loss": 0.8608, + "step": 199550 + }, + { + "epoch": 15.4643728931768, + "grad_norm": 1.4785008736002514, + "learning_rate": 7.73248605083695e-07, + "loss": 0.8537, + "step": 199560 + }, + { + "epoch": 15.465147816653106, + "grad_norm": 1.6029349916745794, + "learning_rate": 7.732873527588345e-07, + "loss": 0.8619, + "step": 199570 + }, + { + "epoch": 15.465922740129413, + "grad_norm": 1.5386290012223771, + "learning_rate": 7.73326100433974e-07, + "loss": 0.8414, + "step": 199580 + }, + { + "epoch": 15.46669766360572, + "grad_norm": 1.5511429826957863, + "learning_rate": 7.733648481091136e-07, + "loss": 0.849, + "step": 199590 + }, + { + "epoch": 15.467472587082026, + "grad_norm": 1.642218427837155, + "learning_rate": 7.73403595784253e-07, + "loss": 0.8614, + "step": 199600 + }, + { + "epoch": 15.468247510558331, + "grad_norm": 1.596425640999771, + "learning_rate": 7.734423434593925e-07, + "loss": 0.8414, + "step": 199610 + }, + { + "epoch": 15.469022434034638, + "grad_norm": 1.5497054992574075, + "learning_rate": 7.734810911345319e-07, + "loss": 0.8505, + "step": 199620 + }, + { + "epoch": 15.469797357510945, + "grad_norm": 1.5777791782964306, + "learning_rate": 7.735198388096715e-07, + "loss": 0.851, + "step": 199630 + }, + { + "epoch": 15.470572280987252, + "grad_norm": 1.6039125865226798, + "learning_rate": 7.73558586484811e-07, + "loss": 0.8642, + "step": 199640 + }, + { + "epoch": 15.471347204463559, + "grad_norm": 1.6022259847849294, + "learning_rate": 7.735973341599505e-07, + "loss": 0.8425, + "step": 199650 + }, + { + "epoch": 15.472122127939866, + "grad_norm": 1.5364004489425749, + "learning_rate": 7.736360818350899e-07, + "loss": 0.8593, + "step": 199660 + }, + { + "epoch": 15.472897051416172, + "grad_norm": 1.5090917836284226, + "learning_rate": 7.736748295102295e-07, + "loss": 0.8558, + "step": 199670 + }, + { + "epoch": 15.47367197489248, + "grad_norm": 1.6193449113268925, + "learning_rate": 7.737135771853689e-07, + "loss": 0.8765, + "step": 199680 + }, + { + "epoch": 15.474446898368786, + "grad_norm": 2.2479810694523983, + "learning_rate": 7.737523248605085e-07, + "loss": 0.8554, + "step": 199690 + }, + { + "epoch": 15.475221821845093, + "grad_norm": 1.5120330483270417, + "learning_rate": 7.737910725356479e-07, + "loss": 0.8502, + "step": 199700 + }, + { + "epoch": 15.4759967453214, + "grad_norm": 1.4785296353045103, + "learning_rate": 7.738298202107874e-07, + "loss": 0.8658, + "step": 199710 + }, + { + "epoch": 15.476771668797706, + "grad_norm": 1.5588997840431287, + "learning_rate": 7.738685678859269e-07, + "loss": 0.8515, + "step": 199720 + }, + { + "epoch": 15.477546592274013, + "grad_norm": 1.6244880668330661, + "learning_rate": 7.739073155610664e-07, + "loss": 0.8544, + "step": 199730 + }, + { + "epoch": 15.47832151575032, + "grad_norm": 1.4702913042504058, + "learning_rate": 7.739460632362059e-07, + "loss": 0.8442, + "step": 199740 + }, + { + "epoch": 15.479096439226627, + "grad_norm": 1.5753513126504397, + "learning_rate": 7.739848109113454e-07, + "loss": 0.9038, + "step": 199750 + }, + { + "epoch": 15.479871362702934, + "grad_norm": 1.5038709216590096, + "learning_rate": 7.740235585864848e-07, + "loss": 0.8568, + "step": 199760 + }, + { + "epoch": 15.48064628617924, + "grad_norm": 1.6309895423771112, + "learning_rate": 7.740623062616244e-07, + "loss": 0.8388, + "step": 199770 + }, + { + "epoch": 15.481421209655547, + "grad_norm": 1.5934578328538391, + "learning_rate": 7.741010539367638e-07, + "loss": 0.8614, + "step": 199780 + }, + { + "epoch": 15.482196133131854, + "grad_norm": 1.602369304279668, + "learning_rate": 7.741398016119034e-07, + "loss": 0.8595, + "step": 199790 + }, + { + "epoch": 15.482971056608159, + "grad_norm": 1.5074444491792367, + "learning_rate": 7.741785492870428e-07, + "loss": 0.8544, + "step": 199800 + }, + { + "epoch": 15.483745980084466, + "grad_norm": 1.6739628557594899, + "learning_rate": 7.742172969621824e-07, + "loss": 0.8565, + "step": 199810 + }, + { + "epoch": 15.484520903560773, + "grad_norm": 1.594939073758892, + "learning_rate": 7.742560446373218e-07, + "loss": 0.8547, + "step": 199820 + }, + { + "epoch": 15.48529582703708, + "grad_norm": 1.6075128762376434, + "learning_rate": 7.742947923124613e-07, + "loss": 0.8833, + "step": 199830 + }, + { + "epoch": 15.486070750513386, + "grad_norm": 1.6552635063979624, + "learning_rate": 7.743335399876008e-07, + "loss": 0.8655, + "step": 199840 + }, + { + "epoch": 15.486845673989693, + "grad_norm": 1.6708814190306385, + "learning_rate": 7.743722876627403e-07, + "loss": 0.8554, + "step": 199850 + }, + { + "epoch": 15.487620597466, + "grad_norm": 1.4640371681438358, + "learning_rate": 7.744110353378798e-07, + "loss": 0.8653, + "step": 199860 + }, + { + "epoch": 15.488395520942307, + "grad_norm": 1.5804594699603185, + "learning_rate": 7.744497830130193e-07, + "loss": 0.8655, + "step": 199870 + }, + { + "epoch": 15.489170444418614, + "grad_norm": 1.4795786044462629, + "learning_rate": 7.744885306881587e-07, + "loss": 0.8593, + "step": 199880 + }, + { + "epoch": 15.48994536789492, + "grad_norm": 1.5069891332813425, + "learning_rate": 7.745272783632983e-07, + "loss": 0.8718, + "step": 199890 + }, + { + "epoch": 15.490720291371227, + "grad_norm": 1.47080751661567, + "learning_rate": 7.745660260384377e-07, + "loss": 0.8474, + "step": 199900 + }, + { + "epoch": 15.491495214847534, + "grad_norm": 1.6099553332007597, + "learning_rate": 7.746047737135773e-07, + "loss": 0.8746, + "step": 199910 + }, + { + "epoch": 15.49227013832384, + "grad_norm": 1.520509867025069, + "learning_rate": 7.746435213887167e-07, + "loss": 0.8542, + "step": 199920 + }, + { + "epoch": 15.493045061800148, + "grad_norm": 1.4825846856208436, + "learning_rate": 7.746822690638562e-07, + "loss": 0.8715, + "step": 199930 + }, + { + "epoch": 15.493819985276454, + "grad_norm": 1.5980174787062345, + "learning_rate": 7.747210167389957e-07, + "loss": 0.8687, + "step": 199940 + }, + { + "epoch": 15.494594908752761, + "grad_norm": 1.4623392424193182, + "learning_rate": 7.747597644141353e-07, + "loss": 0.8772, + "step": 199950 + }, + { + "epoch": 15.495369832229068, + "grad_norm": 1.521629403800944, + "learning_rate": 7.747985120892747e-07, + "loss": 0.8507, + "step": 199960 + }, + { + "epoch": 15.496144755705375, + "grad_norm": 1.6357293880820472, + "learning_rate": 7.748372597644142e-07, + "loss": 0.8627, + "step": 199970 + }, + { + "epoch": 15.49691967918168, + "grad_norm": 1.5583599920848121, + "learning_rate": 7.748760074395536e-07, + "loss": 0.8591, + "step": 199980 + }, + { + "epoch": 15.497694602657987, + "grad_norm": 1.6702167298449557, + "learning_rate": 7.749147551146932e-07, + "loss": 0.8698, + "step": 199990 + }, + { + "epoch": 15.498469526134294, + "grad_norm": 1.5151952014868955, + "learning_rate": 7.749535027898327e-07, + "loss": 0.8571, + "step": 200000 + }, + { + "epoch": 15.498469526134294, + "eval_loss": 0.8909350633621216, + "eval_runtime": 331.8509, + "eval_samples_per_second": 34.567, + "eval_steps_per_second": 8.642, + "step": 200000 + }, + { + "epoch": 15.4992444496106, + "grad_norm": 1.5980697606662375, + "learning_rate": 7.749922504649722e-07, + "loss": 0.8422, + "step": 200010 + }, + { + "epoch": 15.500019373086907, + "grad_norm": 1.4823753341287855, + "learning_rate": 7.750309981401116e-07, + "loss": 0.8503, + "step": 200020 + }, + { + "epoch": 15.500794296563214, + "grad_norm": 1.5262107445427713, + "learning_rate": 7.750697458152512e-07, + "loss": 0.8687, + "step": 200030 + }, + { + "epoch": 15.50156922003952, + "grad_norm": 1.5861121693777724, + "learning_rate": 7.751084934903906e-07, + "loss": 0.8619, + "step": 200040 + }, + { + "epoch": 15.502344143515828, + "grad_norm": 1.488239893814523, + "learning_rate": 7.751472411655302e-07, + "loss": 0.8577, + "step": 200050 + }, + { + "epoch": 15.503119066992134, + "grad_norm": 1.5655187720315942, + "learning_rate": 7.751859888406696e-07, + "loss": 0.8563, + "step": 200060 + }, + { + "epoch": 15.503893990468441, + "grad_norm": 1.5543539791093122, + "learning_rate": 7.752247365158091e-07, + "loss": 0.8468, + "step": 200070 + }, + { + "epoch": 15.504668913944748, + "grad_norm": 1.5211887800071369, + "learning_rate": 7.752634841909486e-07, + "loss": 0.8621, + "step": 200080 + }, + { + "epoch": 15.505443837421055, + "grad_norm": 5.6111721924980005, + "learning_rate": 7.753022318660882e-07, + "loss": 0.8731, + "step": 200090 + }, + { + "epoch": 15.506218760897362, + "grad_norm": 1.5945611156761959, + "learning_rate": 7.753409795412276e-07, + "loss": 0.8606, + "step": 200100 + }, + { + "epoch": 15.506993684373668, + "grad_norm": 1.5206844880325259, + "learning_rate": 7.753797272163671e-07, + "loss": 0.8666, + "step": 200110 + }, + { + "epoch": 15.507768607849975, + "grad_norm": 1.5935944024786206, + "learning_rate": 7.754184748915065e-07, + "loss": 0.8534, + "step": 200120 + }, + { + "epoch": 15.508543531326282, + "grad_norm": 1.607476868520192, + "learning_rate": 7.754572225666461e-07, + "loss": 0.8556, + "step": 200130 + }, + { + "epoch": 15.509318454802589, + "grad_norm": 1.594860477335256, + "learning_rate": 7.754959702417855e-07, + "loss": 0.8584, + "step": 200140 + }, + { + "epoch": 15.510093378278896, + "grad_norm": 1.6508962633403712, + "learning_rate": 7.755347179169251e-07, + "loss": 0.8615, + "step": 200150 + }, + { + "epoch": 15.510868301755202, + "grad_norm": 1.544053322043852, + "learning_rate": 7.755734655920645e-07, + "loss": 0.8785, + "step": 200160 + }, + { + "epoch": 15.51164322523151, + "grad_norm": 1.5982068121375947, + "learning_rate": 7.75612213267204e-07, + "loss": 0.8544, + "step": 200170 + }, + { + "epoch": 15.512418148707814, + "grad_norm": 1.5340084501404132, + "learning_rate": 7.756509609423435e-07, + "loss": 0.8659, + "step": 200180 + }, + { + "epoch": 15.513193072184121, + "grad_norm": 1.5092071835626493, + "learning_rate": 7.756897086174831e-07, + "loss": 0.8472, + "step": 200190 + }, + { + "epoch": 15.513967995660428, + "grad_norm": 1.6286563664521905, + "learning_rate": 7.757284562926225e-07, + "loss": 0.8628, + "step": 200200 + }, + { + "epoch": 15.514742919136735, + "grad_norm": 1.6261461450437635, + "learning_rate": 7.75767203967762e-07, + "loss": 0.8599, + "step": 200210 + }, + { + "epoch": 15.515517842613042, + "grad_norm": 1.706364933510379, + "learning_rate": 7.758059516429014e-07, + "loss": 0.8652, + "step": 200220 + }, + { + "epoch": 15.516292766089348, + "grad_norm": 1.565642505839004, + "learning_rate": 7.758446993180411e-07, + "loss": 0.869, + "step": 200230 + }, + { + "epoch": 15.517067689565655, + "grad_norm": 1.6251038007882315, + "learning_rate": 7.758834469931805e-07, + "loss": 0.8644, + "step": 200240 + }, + { + "epoch": 15.517842613041962, + "grad_norm": 1.5314328741447845, + "learning_rate": 7.7592219466832e-07, + "loss": 0.8385, + "step": 200250 + }, + { + "epoch": 15.518617536518269, + "grad_norm": 1.6344336425376635, + "learning_rate": 7.759609423434594e-07, + "loss": 0.8826, + "step": 200260 + }, + { + "epoch": 15.519392459994576, + "grad_norm": 1.707358598241876, + "learning_rate": 7.759996900185989e-07, + "loss": 0.8425, + "step": 200270 + }, + { + "epoch": 15.520167383470882, + "grad_norm": 1.461381268252806, + "learning_rate": 7.760384376937384e-07, + "loss": 0.8733, + "step": 200280 + }, + { + "epoch": 15.52094230694719, + "grad_norm": 1.55916194994929, + "learning_rate": 7.76077185368878e-07, + "loss": 0.8716, + "step": 200290 + }, + { + "epoch": 15.521717230423496, + "grad_norm": 1.6449045906186086, + "learning_rate": 7.761159330440174e-07, + "loss": 0.8458, + "step": 200300 + }, + { + "epoch": 15.522492153899803, + "grad_norm": 1.576255469565185, + "learning_rate": 7.761546807191569e-07, + "loss": 0.8558, + "step": 200310 + }, + { + "epoch": 15.52326707737611, + "grad_norm": 1.5802971537214354, + "learning_rate": 7.761934283942963e-07, + "loss": 0.8796, + "step": 200320 + }, + { + "epoch": 15.524042000852416, + "grad_norm": 1.5386360652726336, + "learning_rate": 7.76232176069436e-07, + "loss": 0.8663, + "step": 200330 + }, + { + "epoch": 15.524816924328723, + "grad_norm": 1.5408818788821104, + "learning_rate": 7.762709237445754e-07, + "loss": 0.873, + "step": 200340 + }, + { + "epoch": 15.525591847805028, + "grad_norm": 1.4789945756671183, + "learning_rate": 7.763096714197149e-07, + "loss": 0.8627, + "step": 200350 + }, + { + "epoch": 15.526366771281335, + "grad_norm": 1.5114933612040977, + "learning_rate": 7.763484190948543e-07, + "loss": 0.8619, + "step": 200360 + }, + { + "epoch": 15.527141694757642, + "grad_norm": 1.5650467003944453, + "learning_rate": 7.763871667699939e-07, + "loss": 0.8463, + "step": 200370 + }, + { + "epoch": 15.527916618233949, + "grad_norm": 1.5271735106277702, + "learning_rate": 7.764259144451334e-07, + "loss": 0.862, + "step": 200380 + }, + { + "epoch": 15.528691541710256, + "grad_norm": 1.4835014526855603, + "learning_rate": 7.764646621202729e-07, + "loss": 0.8401, + "step": 200390 + }, + { + "epoch": 15.529466465186562, + "grad_norm": 1.6144289276350636, + "learning_rate": 7.765034097954123e-07, + "loss": 0.8578, + "step": 200400 + }, + { + "epoch": 15.53024138866287, + "grad_norm": 1.5060645563125592, + "learning_rate": 7.765421574705518e-07, + "loss": 0.8503, + "step": 200410 + }, + { + "epoch": 15.531016312139176, + "grad_norm": 1.5711557076882194, + "learning_rate": 7.765809051456912e-07, + "loss": 0.8626, + "step": 200420 + }, + { + "epoch": 15.531791235615483, + "grad_norm": 1.5181629447871514, + "learning_rate": 7.766196528208309e-07, + "loss": 0.8492, + "step": 200430 + }, + { + "epoch": 15.53256615909179, + "grad_norm": 1.5712818939954412, + "learning_rate": 7.766584004959703e-07, + "loss": 0.8601, + "step": 200440 + }, + { + "epoch": 15.533341082568096, + "grad_norm": 1.5312869281525112, + "learning_rate": 7.766971481711098e-07, + "loss": 0.8482, + "step": 200450 + }, + { + "epoch": 15.534116006044403, + "grad_norm": 1.5023034026465396, + "learning_rate": 7.767358958462492e-07, + "loss": 0.8759, + "step": 200460 + }, + { + "epoch": 15.53489092952071, + "grad_norm": 1.5859868974660398, + "learning_rate": 7.767746435213888e-07, + "loss": 0.8635, + "step": 200470 + }, + { + "epoch": 15.535665852997017, + "grad_norm": 1.5872084876185533, + "learning_rate": 7.768133911965283e-07, + "loss": 0.8563, + "step": 200480 + }, + { + "epoch": 15.536440776473324, + "grad_norm": 1.6014904110623547, + "learning_rate": 7.768521388716678e-07, + "loss": 0.8621, + "step": 200490 + }, + { + "epoch": 15.53721569994963, + "grad_norm": 1.4943889758961573, + "learning_rate": 7.768908865468072e-07, + "loss": 0.8431, + "step": 200500 + }, + { + "epoch": 15.53721569994963, + "eval_loss": 0.8906089663505554, + "eval_runtime": 332.8012, + "eval_samples_per_second": 34.468, + "eval_steps_per_second": 8.618, + "step": 200500 + }, + { + "epoch": 15.537990623425937, + "grad_norm": 1.5547492703451844, + "learning_rate": 7.769296342219468e-07, + "loss": 0.8498, + "step": 200510 + }, + { + "epoch": 15.538765546902244, + "grad_norm": 1.5613184086588323, + "learning_rate": 7.769683818970862e-07, + "loss": 0.8392, + "step": 200520 + }, + { + "epoch": 15.539540470378551, + "grad_norm": 1.5870383964519652, + "learning_rate": 7.770071295722258e-07, + "loss": 0.8519, + "step": 200530 + }, + { + "epoch": 15.540315393854858, + "grad_norm": 1.6030461446839734, + "learning_rate": 7.770458772473652e-07, + "loss": 0.845, + "step": 200540 + }, + { + "epoch": 15.541090317331163, + "grad_norm": 1.7436966283305722, + "learning_rate": 7.770846249225047e-07, + "loss": 0.879, + "step": 200550 + }, + { + "epoch": 15.54186524080747, + "grad_norm": 1.493127246178196, + "learning_rate": 7.771233725976441e-07, + "loss": 0.8683, + "step": 200560 + }, + { + "epoch": 15.542640164283776, + "grad_norm": 1.6166269345371775, + "learning_rate": 7.771621202727838e-07, + "loss": 0.8787, + "step": 200570 + }, + { + "epoch": 15.543415087760083, + "grad_norm": 1.4889664024179847, + "learning_rate": 7.772008679479232e-07, + "loss": 0.8434, + "step": 200580 + }, + { + "epoch": 15.54419001123639, + "grad_norm": 1.5137350226344113, + "learning_rate": 7.772396156230627e-07, + "loss": 0.8456, + "step": 200590 + }, + { + "epoch": 15.544964934712697, + "grad_norm": 1.5939646960919591, + "learning_rate": 7.772783632982021e-07, + "loss": 0.8662, + "step": 200600 + }, + { + "epoch": 15.545739858189004, + "grad_norm": 1.5389144155113623, + "learning_rate": 7.773171109733417e-07, + "loss": 0.8524, + "step": 200610 + }, + { + "epoch": 15.54651478166531, + "grad_norm": 1.6520333694982479, + "learning_rate": 7.773558586484811e-07, + "loss": 0.8591, + "step": 200620 + }, + { + "epoch": 15.547289705141617, + "grad_norm": 1.4737987891915882, + "learning_rate": 7.773946063236207e-07, + "loss": 0.8732, + "step": 200630 + }, + { + "epoch": 15.548064628617924, + "grad_norm": 1.6016767504631118, + "learning_rate": 7.774333539987601e-07, + "loss": 0.843, + "step": 200640 + }, + { + "epoch": 15.54883955209423, + "grad_norm": 1.6421069209451533, + "learning_rate": 7.774721016738997e-07, + "loss": 0.8855, + "step": 200650 + }, + { + "epoch": 15.549614475570538, + "grad_norm": 1.6326285385581334, + "learning_rate": 7.775108493490391e-07, + "loss": 0.8507, + "step": 200660 + }, + { + "epoch": 15.550389399046844, + "grad_norm": 1.6177292328038084, + "learning_rate": 7.775495970241787e-07, + "loss": 0.8611, + "step": 200670 + }, + { + "epoch": 15.551164322523151, + "grad_norm": 1.4604067300210748, + "learning_rate": 7.775883446993181e-07, + "loss": 0.8688, + "step": 200680 + }, + { + "epoch": 15.551939245999458, + "grad_norm": 1.6188854397013548, + "learning_rate": 7.776270923744576e-07, + "loss": 0.887, + "step": 200690 + }, + { + "epoch": 15.552714169475765, + "grad_norm": 1.5704100345216179, + "learning_rate": 7.77665840049597e-07, + "loss": 0.8592, + "step": 200700 + }, + { + "epoch": 15.553489092952072, + "grad_norm": 1.4969528309855773, + "learning_rate": 7.777045877247366e-07, + "loss": 0.8532, + "step": 200710 + }, + { + "epoch": 15.554264016428379, + "grad_norm": 1.6278718911819645, + "learning_rate": 7.77743335399876e-07, + "loss": 0.8601, + "step": 200720 + }, + { + "epoch": 15.555038939904684, + "grad_norm": 1.5874937120237396, + "learning_rate": 7.777820830750156e-07, + "loss": 0.871, + "step": 200730 + }, + { + "epoch": 15.55581386338099, + "grad_norm": 1.5054393448676753, + "learning_rate": 7.77820830750155e-07, + "loss": 0.8522, + "step": 200740 + }, + { + "epoch": 15.556588786857297, + "grad_norm": 1.5877043243681424, + "learning_rate": 7.778595784252946e-07, + "loss": 0.8531, + "step": 200750 + }, + { + "epoch": 15.557363710333604, + "grad_norm": 1.5872008005213836, + "learning_rate": 7.77898326100434e-07, + "loss": 0.8664, + "step": 200760 + }, + { + "epoch": 15.55813863380991, + "grad_norm": 1.6530771840366147, + "learning_rate": 7.779370737755736e-07, + "loss": 0.8675, + "step": 200770 + }, + { + "epoch": 15.558913557286218, + "grad_norm": 1.6593074069167386, + "learning_rate": 7.77975821450713e-07, + "loss": 0.8631, + "step": 200780 + }, + { + "epoch": 15.559688480762524, + "grad_norm": 1.5850636417564878, + "learning_rate": 7.780145691258526e-07, + "loss": 0.8634, + "step": 200790 + }, + { + "epoch": 15.560463404238831, + "grad_norm": 1.5347380363358916, + "learning_rate": 7.78053316800992e-07, + "loss": 0.8478, + "step": 200800 + }, + { + "epoch": 15.561238327715138, + "grad_norm": 1.6011597399761184, + "learning_rate": 7.780920644761315e-07, + "loss": 0.8648, + "step": 200810 + }, + { + "epoch": 15.562013251191445, + "grad_norm": 1.6543853368551669, + "learning_rate": 7.78130812151271e-07, + "loss": 0.8681, + "step": 200820 + }, + { + "epoch": 15.562788174667752, + "grad_norm": 1.6446913470034092, + "learning_rate": 7.781695598264105e-07, + "loss": 0.8657, + "step": 200830 + }, + { + "epoch": 15.563563098144058, + "grad_norm": 1.5755879153597192, + "learning_rate": 7.782083075015499e-07, + "loss": 0.8832, + "step": 200840 + }, + { + "epoch": 15.564338021620365, + "grad_norm": 1.5184146157661171, + "learning_rate": 7.782470551766895e-07, + "loss": 0.8684, + "step": 200850 + }, + { + "epoch": 15.565112945096672, + "grad_norm": 1.5699928260451146, + "learning_rate": 7.782858028518289e-07, + "loss": 0.8394, + "step": 200860 + }, + { + "epoch": 15.565887868572979, + "grad_norm": 1.6305573858014246, + "learning_rate": 7.783245505269685e-07, + "loss": 0.8612, + "step": 200870 + }, + { + "epoch": 15.566662792049286, + "grad_norm": 1.5026198742282126, + "learning_rate": 7.783632982021079e-07, + "loss": 0.835, + "step": 200880 + }, + { + "epoch": 15.567437715525593, + "grad_norm": 1.498757338910422, + "learning_rate": 7.784020458772475e-07, + "loss": 0.863, + "step": 200890 + }, + { + "epoch": 15.5682126390019, + "grad_norm": 1.4127995679176253, + "learning_rate": 7.784407935523869e-07, + "loss": 0.8672, + "step": 200900 + }, + { + "epoch": 15.568987562478206, + "grad_norm": 1.601714196099094, + "learning_rate": 7.784795412275264e-07, + "loss": 0.875, + "step": 200910 + }, + { + "epoch": 15.569762485954511, + "grad_norm": 1.5912085393502957, + "learning_rate": 7.785182889026659e-07, + "loss": 0.8634, + "step": 200920 + }, + { + "epoch": 15.570537409430818, + "grad_norm": 1.5499059119075425, + "learning_rate": 7.785570365778055e-07, + "loss": 0.868, + "step": 200930 + }, + { + "epoch": 15.571312332907125, + "grad_norm": 1.621947499696596, + "learning_rate": 7.785957842529449e-07, + "loss": 0.8669, + "step": 200940 + }, + { + "epoch": 15.572087256383432, + "grad_norm": 1.4906532655333817, + "learning_rate": 7.786345319280844e-07, + "loss": 0.8509, + "step": 200950 + }, + { + "epoch": 15.572862179859738, + "grad_norm": 1.6260059088041272, + "learning_rate": 7.786732796032238e-07, + "loss": 0.8618, + "step": 200960 + }, + { + "epoch": 15.573637103336045, + "grad_norm": 1.5595178453474385, + "learning_rate": 7.787120272783634e-07, + "loss": 0.8742, + "step": 200970 + }, + { + "epoch": 15.574412026812352, + "grad_norm": 1.5843806666740587, + "learning_rate": 7.787507749535028e-07, + "loss": 0.8469, + "step": 200980 + }, + { + "epoch": 15.575186950288659, + "grad_norm": 1.4938266269477138, + "learning_rate": 7.787895226286424e-07, + "loss": 0.8489, + "step": 200990 + }, + { + "epoch": 15.575961873764966, + "grad_norm": 1.5869103766693464, + "learning_rate": 7.788282703037818e-07, + "loss": 0.8673, + "step": 201000 + }, + { + "epoch": 15.575961873764966, + "eval_loss": 0.8904051780700684, + "eval_runtime": 333.6027, + "eval_samples_per_second": 34.385, + "eval_steps_per_second": 8.597, + "step": 201000 + }, + { + "epoch": 15.576736797241272, + "grad_norm": 1.7017746126683773, + "learning_rate": 7.788670179789213e-07, + "loss": 0.8619, + "step": 201010 + }, + { + "epoch": 15.57751172071758, + "grad_norm": 1.5354721605925756, + "learning_rate": 7.789057656540608e-07, + "loss": 0.8603, + "step": 201020 + }, + { + "epoch": 15.578286644193886, + "grad_norm": 1.6042844846517463, + "learning_rate": 7.789445133292004e-07, + "loss": 0.8551, + "step": 201030 + }, + { + "epoch": 15.579061567670193, + "grad_norm": 1.70076600989615, + "learning_rate": 7.789832610043398e-07, + "loss": 0.8975, + "step": 201040 + }, + { + "epoch": 15.5798364911465, + "grad_norm": 1.5488009569448933, + "learning_rate": 7.790220086794793e-07, + "loss": 0.8517, + "step": 201050 + }, + { + "epoch": 15.580611414622807, + "grad_norm": 1.5689174729327637, + "learning_rate": 7.790607563546187e-07, + "loss": 0.849, + "step": 201060 + }, + { + "epoch": 15.581386338099113, + "grad_norm": 1.5770211484727386, + "learning_rate": 7.790995040297583e-07, + "loss": 0.8659, + "step": 201070 + }, + { + "epoch": 15.58216126157542, + "grad_norm": 1.5283180732107111, + "learning_rate": 7.791382517048978e-07, + "loss": 0.8383, + "step": 201080 + }, + { + "epoch": 15.582936185051727, + "grad_norm": 1.5729597791950327, + "learning_rate": 7.791769993800373e-07, + "loss": 0.8662, + "step": 201090 + }, + { + "epoch": 15.583711108528032, + "grad_norm": 1.6181514404142845, + "learning_rate": 7.792157470551767e-07, + "loss": 0.8777, + "step": 201100 + }, + { + "epoch": 15.584486032004339, + "grad_norm": 1.5103929804582996, + "learning_rate": 7.792544947303162e-07, + "loss": 0.8564, + "step": 201110 + }, + { + "epoch": 15.585260955480646, + "grad_norm": 1.5372418998712492, + "learning_rate": 7.792932424054557e-07, + "loss": 0.8694, + "step": 201120 + }, + { + "epoch": 15.586035878956952, + "grad_norm": 1.5685236987986448, + "learning_rate": 7.793319900805953e-07, + "loss": 0.8482, + "step": 201130 + }, + { + "epoch": 15.58681080243326, + "grad_norm": 1.5230837912471806, + "learning_rate": 7.793707377557347e-07, + "loss": 0.8669, + "step": 201140 + }, + { + "epoch": 15.587585725909566, + "grad_norm": 1.5913740325396517, + "learning_rate": 7.794094854308742e-07, + "loss": 0.8563, + "step": 201150 + }, + { + "epoch": 15.588360649385873, + "grad_norm": 1.6791114639008091, + "learning_rate": 7.794482331060136e-07, + "loss": 0.8486, + "step": 201160 + }, + { + "epoch": 15.58913557286218, + "grad_norm": 1.5078887265802086, + "learning_rate": 7.794869807811533e-07, + "loss": 0.8621, + "step": 201170 + }, + { + "epoch": 15.589910496338486, + "grad_norm": 1.5459860674655106, + "learning_rate": 7.795257284562927e-07, + "loss": 0.8527, + "step": 201180 + }, + { + "epoch": 15.590685419814793, + "grad_norm": 1.6307196721217585, + "learning_rate": 7.795644761314322e-07, + "loss": 0.8645, + "step": 201190 + }, + { + "epoch": 15.5914603432911, + "grad_norm": 1.4857372121198864, + "learning_rate": 7.796032238065716e-07, + "loss": 0.8418, + "step": 201200 + }, + { + "epoch": 15.592235266767407, + "grad_norm": 1.5663470390823448, + "learning_rate": 7.796419714817111e-07, + "loss": 0.8591, + "step": 201210 + }, + { + "epoch": 15.593010190243714, + "grad_norm": 1.6526399837123997, + "learning_rate": 7.796807191568507e-07, + "loss": 0.858, + "step": 201220 + }, + { + "epoch": 15.59378511372002, + "grad_norm": 1.496088782514604, + "learning_rate": 7.797194668319902e-07, + "loss": 0.8508, + "step": 201230 + }, + { + "epoch": 15.594560037196327, + "grad_norm": 1.5019651237365117, + "learning_rate": 7.797582145071296e-07, + "loss": 0.8478, + "step": 201240 + }, + { + "epoch": 15.595334960672634, + "grad_norm": 1.614819061783244, + "learning_rate": 7.797969621822691e-07, + "loss": 0.8485, + "step": 201250 + }, + { + "epoch": 15.596109884148941, + "grad_norm": 1.4233210830125143, + "learning_rate": 7.798357098574085e-07, + "loss": 0.8464, + "step": 201260 + }, + { + "epoch": 15.596884807625248, + "grad_norm": 1.5510567080115487, + "learning_rate": 7.798744575325482e-07, + "loss": 0.8532, + "step": 201270 + }, + { + "epoch": 15.597659731101555, + "grad_norm": 1.5289895592759237, + "learning_rate": 7.799132052076876e-07, + "loss": 0.8685, + "step": 201280 + }, + { + "epoch": 15.59843465457786, + "grad_norm": 1.4996569134140916, + "learning_rate": 7.799519528828271e-07, + "loss": 0.8598, + "step": 201290 + }, + { + "epoch": 15.599209578054166, + "grad_norm": 1.7770234268312992, + "learning_rate": 7.799907005579665e-07, + "loss": 0.8598, + "step": 201300 + }, + { + "epoch": 15.599984501530473, + "grad_norm": 1.565429707826461, + "learning_rate": 7.800294482331062e-07, + "loss": 0.858, + "step": 201310 + }, + { + "epoch": 15.60075942500678, + "grad_norm": 1.6581293498038747, + "learning_rate": 7.800681959082456e-07, + "loss": 0.8634, + "step": 201320 + }, + { + "epoch": 15.601534348483087, + "grad_norm": 1.5724977695399385, + "learning_rate": 7.801069435833851e-07, + "loss": 0.8574, + "step": 201330 + }, + { + "epoch": 15.602309271959394, + "grad_norm": 1.6631624105378946, + "learning_rate": 7.801456912585245e-07, + "loss": 0.847, + "step": 201340 + }, + { + "epoch": 15.6030841954357, + "grad_norm": 1.588508787869779, + "learning_rate": 7.80184438933664e-07, + "loss": 0.8372, + "step": 201350 + }, + { + "epoch": 15.603859118912007, + "grad_norm": 1.5857009531320043, + "learning_rate": 7.802231866088036e-07, + "loss": 0.8663, + "step": 201360 + }, + { + "epoch": 15.604634042388314, + "grad_norm": 1.5952228149242085, + "learning_rate": 7.802619342839431e-07, + "loss": 0.8659, + "step": 201370 + }, + { + "epoch": 15.605408965864621, + "grad_norm": 1.4692354683323348, + "learning_rate": 7.803006819590825e-07, + "loss": 0.8529, + "step": 201380 + }, + { + "epoch": 15.606183889340928, + "grad_norm": 1.5905773771662528, + "learning_rate": 7.80339429634222e-07, + "loss": 0.8684, + "step": 201390 + }, + { + "epoch": 15.606958812817235, + "grad_norm": 1.5871083991761776, + "learning_rate": 7.803781773093614e-07, + "loss": 0.8751, + "step": 201400 + }, + { + "epoch": 15.607733736293541, + "grad_norm": 1.5692411860225248, + "learning_rate": 7.804169249845011e-07, + "loss": 0.8507, + "step": 201410 + }, + { + "epoch": 15.608508659769848, + "grad_norm": 1.6951278658124092, + "learning_rate": 7.804556726596405e-07, + "loss": 0.8658, + "step": 201420 + }, + { + "epoch": 15.609283583246155, + "grad_norm": 1.559597284695359, + "learning_rate": 7.8049442033478e-07, + "loss": 0.8706, + "step": 201430 + }, + { + "epoch": 15.610058506722462, + "grad_norm": 1.4760674363836095, + "learning_rate": 7.805331680099194e-07, + "loss": 0.8681, + "step": 201440 + }, + { + "epoch": 15.610833430198769, + "grad_norm": 1.537484869547969, + "learning_rate": 7.80571915685059e-07, + "loss": 0.8562, + "step": 201450 + }, + { + "epoch": 15.611608353675075, + "grad_norm": 1.4403522355516163, + "learning_rate": 7.806106633601985e-07, + "loss": 0.8575, + "step": 201460 + }, + { + "epoch": 15.61238327715138, + "grad_norm": 1.5833681632874435, + "learning_rate": 7.80649411035338e-07, + "loss": 0.8581, + "step": 201470 + }, + { + "epoch": 15.613158200627687, + "grad_norm": 1.6494785470757545, + "learning_rate": 7.806881587104774e-07, + "loss": 0.8664, + "step": 201480 + }, + { + "epoch": 15.613933124103994, + "grad_norm": 1.5072132092085295, + "learning_rate": 7.807269063856169e-07, + "loss": 0.8738, + "step": 201490 + }, + { + "epoch": 15.6147080475803, + "grad_norm": 1.599891475915327, + "learning_rate": 7.807656540607564e-07, + "loss": 0.8756, + "step": 201500 + }, + { + "epoch": 15.6147080475803, + "eval_loss": 0.8903772830963135, + "eval_runtime": 330.9251, + "eval_samples_per_second": 34.663, + "eval_steps_per_second": 8.667, + "step": 201500 + }, + { + "epoch": 15.615482971056608, + "grad_norm": 1.6721967821952155, + "learning_rate": 7.80804401735896e-07, + "loss": 0.8613, + "step": 201510 + }, + { + "epoch": 15.616257894532914, + "grad_norm": 1.5685880483817125, + "learning_rate": 7.808431494110354e-07, + "loss": 0.8553, + "step": 201520 + }, + { + "epoch": 15.617032818009221, + "grad_norm": 1.598129715577663, + "learning_rate": 7.808818970861749e-07, + "loss": 0.8613, + "step": 201530 + }, + { + "epoch": 15.617807741485528, + "grad_norm": 1.4859195772024665, + "learning_rate": 7.809206447613143e-07, + "loss": 0.8423, + "step": 201540 + }, + { + "epoch": 15.618582664961835, + "grad_norm": 1.6421683167037353, + "learning_rate": 7.809593924364539e-07, + "loss": 0.8574, + "step": 201550 + }, + { + "epoch": 15.619357588438142, + "grad_norm": 1.505622764795307, + "learning_rate": 7.809981401115934e-07, + "loss": 0.8642, + "step": 201560 + }, + { + "epoch": 15.620132511914449, + "grad_norm": 1.580405069641713, + "learning_rate": 7.810368877867329e-07, + "loss": 0.8579, + "step": 201570 + }, + { + "epoch": 15.620907435390755, + "grad_norm": 1.6556273497048883, + "learning_rate": 7.810756354618723e-07, + "loss": 0.8795, + "step": 201580 + }, + { + "epoch": 15.621682358867062, + "grad_norm": 1.6938465644027028, + "learning_rate": 7.811143831370119e-07, + "loss": 0.8628, + "step": 201590 + }, + { + "epoch": 15.622457282343369, + "grad_norm": 1.6174737963582095, + "learning_rate": 7.811531308121513e-07, + "loss": 0.8549, + "step": 201600 + }, + { + "epoch": 15.623232205819676, + "grad_norm": 1.5340944042245623, + "learning_rate": 7.811918784872909e-07, + "loss": 0.8923, + "step": 201610 + }, + { + "epoch": 15.624007129295983, + "grad_norm": 1.6246332586077412, + "learning_rate": 7.812306261624303e-07, + "loss": 0.8374, + "step": 201620 + }, + { + "epoch": 15.62478205277229, + "grad_norm": 1.496586711937548, + "learning_rate": 7.812693738375698e-07, + "loss": 0.8638, + "step": 201630 + }, + { + "epoch": 15.625556976248596, + "grad_norm": 1.5262821863586555, + "learning_rate": 7.813081215127092e-07, + "loss": 0.8467, + "step": 201640 + }, + { + "epoch": 15.626331899724903, + "grad_norm": 1.5702809958525414, + "learning_rate": 7.813468691878488e-07, + "loss": 0.8855, + "step": 201650 + }, + { + "epoch": 15.62710682320121, + "grad_norm": 1.548262896818546, + "learning_rate": 7.813856168629883e-07, + "loss": 0.8511, + "step": 201660 + }, + { + "epoch": 15.627881746677515, + "grad_norm": 1.5190601320299064, + "learning_rate": 7.814243645381278e-07, + "loss": 0.8581, + "step": 201670 + }, + { + "epoch": 15.628656670153822, + "grad_norm": 1.5330357880074108, + "learning_rate": 7.814631122132672e-07, + "loss": 0.8622, + "step": 201680 + }, + { + "epoch": 15.629431593630128, + "grad_norm": 1.6160353756806405, + "learning_rate": 7.815018598884068e-07, + "loss": 0.8683, + "step": 201690 + }, + { + "epoch": 15.630206517106435, + "grad_norm": 1.6054599216073633, + "learning_rate": 7.815406075635462e-07, + "loss": 0.8772, + "step": 201700 + }, + { + "epoch": 15.630981440582742, + "grad_norm": 1.6688072307988426, + "learning_rate": 7.815793552386858e-07, + "loss": 0.8726, + "step": 201710 + }, + { + "epoch": 15.631756364059049, + "grad_norm": 1.4759168310408504, + "learning_rate": 7.816181029138252e-07, + "loss": 0.8547, + "step": 201720 + }, + { + "epoch": 15.632531287535356, + "grad_norm": 1.5551405103869504, + "learning_rate": 7.816568505889648e-07, + "loss": 0.8507, + "step": 201730 + }, + { + "epoch": 15.633306211011663, + "grad_norm": 1.5246156672153988, + "learning_rate": 7.816955982641042e-07, + "loss": 0.8656, + "step": 201740 + }, + { + "epoch": 15.63408113448797, + "grad_norm": 1.5365547153150643, + "learning_rate": 7.817343459392437e-07, + "loss": 0.8514, + "step": 201750 + }, + { + "epoch": 15.634856057964276, + "grad_norm": 1.5024074107082253, + "learning_rate": 7.817730936143832e-07, + "loss": 0.849, + "step": 201760 + }, + { + "epoch": 15.635630981440583, + "grad_norm": 1.7160247069597323, + "learning_rate": 7.818118412895227e-07, + "loss": 0.8656, + "step": 201770 + }, + { + "epoch": 15.63640590491689, + "grad_norm": 1.6144349656006844, + "learning_rate": 7.818505889646621e-07, + "loss": 0.8534, + "step": 201780 + }, + { + "epoch": 15.637180828393197, + "grad_norm": 1.552886325842611, + "learning_rate": 7.818893366398017e-07, + "loss": 0.8554, + "step": 201790 + }, + { + "epoch": 15.637955751869503, + "grad_norm": 1.5398690643883064, + "learning_rate": 7.819280843149411e-07, + "loss": 0.8539, + "step": 201800 + }, + { + "epoch": 15.63873067534581, + "grad_norm": 1.6187591313084033, + "learning_rate": 7.819668319900807e-07, + "loss": 0.852, + "step": 201810 + }, + { + "epoch": 15.639505598822117, + "grad_norm": 1.4303964833047345, + "learning_rate": 7.820055796652201e-07, + "loss": 0.8488, + "step": 201820 + }, + { + "epoch": 15.640280522298424, + "grad_norm": 1.547386575947912, + "learning_rate": 7.820443273403597e-07, + "loss": 0.8529, + "step": 201830 + }, + { + "epoch": 15.641055445774729, + "grad_norm": 1.5335668458789808, + "learning_rate": 7.820830750154991e-07, + "loss": 0.8709, + "step": 201840 + }, + { + "epoch": 15.641830369251036, + "grad_norm": 1.624803910269738, + "learning_rate": 7.821218226906386e-07, + "loss": 0.852, + "step": 201850 + }, + { + "epoch": 15.642605292727342, + "grad_norm": 1.5699497099825785, + "learning_rate": 7.821605703657781e-07, + "loss": 0.866, + "step": 201860 + }, + { + "epoch": 15.64338021620365, + "grad_norm": 1.554012705717906, + "learning_rate": 7.821993180409177e-07, + "loss": 0.8807, + "step": 201870 + }, + { + "epoch": 15.644155139679956, + "grad_norm": 1.5904048524809122, + "learning_rate": 7.822380657160571e-07, + "loss": 0.8617, + "step": 201880 + }, + { + "epoch": 15.644930063156263, + "grad_norm": 1.584221225215546, + "learning_rate": 7.822768133911966e-07, + "loss": 0.869, + "step": 201890 + }, + { + "epoch": 15.64570498663257, + "grad_norm": 1.6311300085372042, + "learning_rate": 7.82315561066336e-07, + "loss": 0.85, + "step": 201900 + }, + { + "epoch": 15.646479910108877, + "grad_norm": 1.595388391352514, + "learning_rate": 7.823543087414756e-07, + "loss": 0.8775, + "step": 201910 + }, + { + "epoch": 15.647254833585183, + "grad_norm": 1.5013875048756142, + "learning_rate": 7.82393056416615e-07, + "loss": 0.863, + "step": 201920 + }, + { + "epoch": 15.64802975706149, + "grad_norm": 1.6193474959596277, + "learning_rate": 7.824318040917546e-07, + "loss": 0.8534, + "step": 201930 + }, + { + "epoch": 15.648804680537797, + "grad_norm": 1.498435134326116, + "learning_rate": 7.82470551766894e-07, + "loss": 0.8744, + "step": 201940 + }, + { + "epoch": 15.649579604014104, + "grad_norm": 1.587886052383863, + "learning_rate": 7.825092994420335e-07, + "loss": 0.8433, + "step": 201950 + }, + { + "epoch": 15.65035452749041, + "grad_norm": 1.5646014249645355, + "learning_rate": 7.82548047117173e-07, + "loss": 0.8657, + "step": 201960 + }, + { + "epoch": 15.651129450966717, + "grad_norm": 1.531299905187077, + "learning_rate": 7.825867947923126e-07, + "loss": 0.8869, + "step": 201970 + }, + { + "epoch": 15.651904374443024, + "grad_norm": 1.5241110333980081, + "learning_rate": 7.82625542467452e-07, + "loss": 0.8368, + "step": 201980 + }, + { + "epoch": 15.652679297919331, + "grad_norm": 1.5421137330978716, + "learning_rate": 7.826642901425915e-07, + "loss": 0.8745, + "step": 201990 + }, + { + "epoch": 15.653454221395638, + "grad_norm": 1.5379600183909936, + "learning_rate": 7.827030378177309e-07, + "loss": 0.8663, + "step": 202000 + }, + { + "epoch": 15.653454221395638, + "eval_loss": 0.8899983167648315, + "eval_runtime": 333.8974, + "eval_samples_per_second": 34.355, + "eval_steps_per_second": 8.589, + "step": 202000 + }, + { + "epoch": 15.654229144871945, + "grad_norm": 1.5160103152405493, + "learning_rate": 7.827417854928706e-07, + "loss": 0.8657, + "step": 202010 + }, + { + "epoch": 15.655004068348251, + "grad_norm": 1.5570572652259145, + "learning_rate": 7.8278053316801e-07, + "loss": 0.8656, + "step": 202020 + }, + { + "epoch": 15.655778991824558, + "grad_norm": 1.5597570906481266, + "learning_rate": 7.828192808431495e-07, + "loss": 0.8582, + "step": 202030 + }, + { + "epoch": 15.656553915300863, + "grad_norm": 1.576762486463653, + "learning_rate": 7.828580285182889e-07, + "loss": 0.8735, + "step": 202040 + }, + { + "epoch": 15.65732883877717, + "grad_norm": 1.5352386255348849, + "learning_rate": 7.828967761934285e-07, + "loss": 0.8425, + "step": 202050 + }, + { + "epoch": 15.658103762253477, + "grad_norm": 1.6541903371314766, + "learning_rate": 7.829355238685679e-07, + "loss": 0.8391, + "step": 202060 + }, + { + "epoch": 15.658878685729784, + "grad_norm": 1.4454764190040255, + "learning_rate": 7.829742715437075e-07, + "loss": 0.8692, + "step": 202070 + }, + { + "epoch": 15.65965360920609, + "grad_norm": 1.5745098842070084, + "learning_rate": 7.830130192188469e-07, + "loss": 0.8582, + "step": 202080 + }, + { + "epoch": 15.660428532682397, + "grad_norm": 1.4914633841790363, + "learning_rate": 7.830517668939864e-07, + "loss": 0.8547, + "step": 202090 + }, + { + "epoch": 15.661203456158704, + "grad_norm": 1.6508802709648727, + "learning_rate": 7.830905145691258e-07, + "loss": 0.8481, + "step": 202100 + }, + { + "epoch": 15.661978379635011, + "grad_norm": 1.4949181026764662, + "learning_rate": 7.831292622442655e-07, + "loss": 0.8508, + "step": 202110 + }, + { + "epoch": 15.662753303111318, + "grad_norm": 1.4319573811330206, + "learning_rate": 7.831680099194049e-07, + "loss": 0.8601, + "step": 202120 + }, + { + "epoch": 15.663528226587625, + "grad_norm": 1.5228971655777466, + "learning_rate": 7.832067575945444e-07, + "loss": 0.8807, + "step": 202130 + }, + { + "epoch": 15.664303150063931, + "grad_norm": 1.4476861851518783, + "learning_rate": 7.832455052696838e-07, + "loss": 0.8558, + "step": 202140 + }, + { + "epoch": 15.665078073540238, + "grad_norm": 1.6054526472761974, + "learning_rate": 7.832842529448235e-07, + "loss": 0.8718, + "step": 202150 + }, + { + "epoch": 15.665852997016545, + "grad_norm": 1.5637896774223021, + "learning_rate": 7.833230006199629e-07, + "loss": 0.8553, + "step": 202160 + }, + { + "epoch": 15.666627920492852, + "grad_norm": 1.558586665599162, + "learning_rate": 7.833617482951024e-07, + "loss": 0.8666, + "step": 202170 + }, + { + "epoch": 15.667402843969159, + "grad_norm": 1.512422393433632, + "learning_rate": 7.834004959702418e-07, + "loss": 0.8655, + "step": 202180 + }, + { + "epoch": 15.668177767445465, + "grad_norm": 1.584748949456291, + "learning_rate": 7.834392436453813e-07, + "loss": 0.854, + "step": 202190 + }, + { + "epoch": 15.668952690921772, + "grad_norm": 1.5624904771880488, + "learning_rate": 7.834779913205208e-07, + "loss": 0.8669, + "step": 202200 + }, + { + "epoch": 15.669727614398077, + "grad_norm": 1.5402031280360577, + "learning_rate": 7.835167389956604e-07, + "loss": 0.8735, + "step": 202210 + }, + { + "epoch": 15.670502537874384, + "grad_norm": 1.5762114730908163, + "learning_rate": 7.835554866707998e-07, + "loss": 0.8629, + "step": 202220 + }, + { + "epoch": 15.67127746135069, + "grad_norm": 1.553997637966504, + "learning_rate": 7.835942343459393e-07, + "loss": 0.8609, + "step": 202230 + }, + { + "epoch": 15.672052384826998, + "grad_norm": 1.4465107380970077, + "learning_rate": 7.836329820210787e-07, + "loss": 0.8611, + "step": 202240 + }, + { + "epoch": 15.672827308303305, + "grad_norm": 1.6379006254709945, + "learning_rate": 7.836717296962184e-07, + "loss": 0.8497, + "step": 202250 + }, + { + "epoch": 15.673602231779611, + "grad_norm": 1.5398586023122305, + "learning_rate": 7.837104773713578e-07, + "loss": 0.8438, + "step": 202260 + }, + { + "epoch": 15.674377155255918, + "grad_norm": 1.5465809648004916, + "learning_rate": 7.837492250464973e-07, + "loss": 0.8805, + "step": 202270 + }, + { + "epoch": 15.675152078732225, + "grad_norm": 1.5705784758531698, + "learning_rate": 7.837879727216367e-07, + "loss": 0.8517, + "step": 202280 + }, + { + "epoch": 15.675927002208532, + "grad_norm": 1.6561174414595572, + "learning_rate": 7.838267203967763e-07, + "loss": 0.8655, + "step": 202290 + }, + { + "epoch": 15.676701925684839, + "grad_norm": 1.6275572022818665, + "learning_rate": 7.838654680719158e-07, + "loss": 0.8639, + "step": 202300 + }, + { + "epoch": 15.677476849161145, + "grad_norm": 1.543864514685476, + "learning_rate": 7.839042157470553e-07, + "loss": 0.8561, + "step": 202310 + }, + { + "epoch": 15.678251772637452, + "grad_norm": 1.6369063270955366, + "learning_rate": 7.839429634221947e-07, + "loss": 0.862, + "step": 202320 + }, + { + "epoch": 15.679026696113759, + "grad_norm": 1.7118220769999615, + "learning_rate": 7.839817110973342e-07, + "loss": 0.8613, + "step": 202330 + }, + { + "epoch": 15.679801619590066, + "grad_norm": 1.8354616100430035, + "learning_rate": 7.840204587724736e-07, + "loss": 0.8829, + "step": 202340 + }, + { + "epoch": 15.680576543066373, + "grad_norm": 1.4586321023147435, + "learning_rate": 7.840592064476133e-07, + "loss": 0.8727, + "step": 202350 + }, + { + "epoch": 15.68135146654268, + "grad_norm": 1.5499713401651896, + "learning_rate": 7.840979541227527e-07, + "loss": 0.885, + "step": 202360 + }, + { + "epoch": 15.682126390018986, + "grad_norm": 1.6661241921558334, + "learning_rate": 7.841367017978922e-07, + "loss": 0.872, + "step": 202370 + }, + { + "epoch": 15.682901313495293, + "grad_norm": 1.568751742558913, + "learning_rate": 7.841754494730316e-07, + "loss": 0.8496, + "step": 202380 + }, + { + "epoch": 15.6836762369716, + "grad_norm": 1.584248685211062, + "learning_rate": 7.842141971481712e-07, + "loss": 0.8666, + "step": 202390 + }, + { + "epoch": 15.684451160447907, + "grad_norm": 1.4623076817056464, + "learning_rate": 7.842529448233107e-07, + "loss": 0.8436, + "step": 202400 + }, + { + "epoch": 15.685226083924212, + "grad_norm": 1.5867309509989078, + "learning_rate": 7.842916924984502e-07, + "loss": 0.8655, + "step": 202410 + }, + { + "epoch": 15.686001007400519, + "grad_norm": 1.4935280793656964, + "learning_rate": 7.843304401735896e-07, + "loss": 0.8832, + "step": 202420 + }, + { + "epoch": 15.686775930876825, + "grad_norm": 1.5232280938684535, + "learning_rate": 7.843691878487291e-07, + "loss": 0.8623, + "step": 202430 + }, + { + "epoch": 15.687550854353132, + "grad_norm": 1.6302366625716747, + "learning_rate": 7.844079355238686e-07, + "loss": 0.8724, + "step": 202440 + }, + { + "epoch": 15.688325777829439, + "grad_norm": 1.5001799335528632, + "learning_rate": 7.844466831990082e-07, + "loss": 0.8715, + "step": 202450 + }, + { + "epoch": 15.689100701305746, + "grad_norm": 1.500465062921685, + "learning_rate": 7.844854308741476e-07, + "loss": 0.8505, + "step": 202460 + }, + { + "epoch": 15.689875624782053, + "grad_norm": 1.6964633601047343, + "learning_rate": 7.845241785492871e-07, + "loss": 0.8739, + "step": 202470 + }, + { + "epoch": 15.69065054825836, + "grad_norm": 1.5251274375065995, + "learning_rate": 7.845629262244265e-07, + "loss": 0.846, + "step": 202480 + }, + { + "epoch": 15.691425471734666, + "grad_norm": 1.719582508722896, + "learning_rate": 7.846016738995661e-07, + "loss": 0.8783, + "step": 202490 + }, + { + "epoch": 15.692200395210973, + "grad_norm": 1.5635879580112102, + "learning_rate": 7.846404215747056e-07, + "loss": 0.8648, + "step": 202500 + }, + { + "epoch": 15.692200395210973, + "eval_loss": 0.8901020288467407, + "eval_runtime": 333.94, + "eval_samples_per_second": 34.35, + "eval_steps_per_second": 8.588, + "step": 202500 + }, + { + "epoch": 15.69297531868728, + "grad_norm": 1.65198961961727, + "learning_rate": 7.846791692498451e-07, + "loss": 0.8558, + "step": 202510 + }, + { + "epoch": 15.693750242163587, + "grad_norm": 1.5558307018801105, + "learning_rate": 7.847179169249845e-07, + "loss": 0.863, + "step": 202520 + }, + { + "epoch": 15.694525165639893, + "grad_norm": 1.5696133589769725, + "learning_rate": 7.847566646001241e-07, + "loss": 0.8614, + "step": 202530 + }, + { + "epoch": 15.6953000891162, + "grad_norm": 1.53460698217635, + "learning_rate": 7.847954122752635e-07, + "loss": 0.894, + "step": 202540 + }, + { + "epoch": 15.696075012592507, + "grad_norm": 1.5620485776946915, + "learning_rate": 7.848341599504031e-07, + "loss": 0.8693, + "step": 202550 + }, + { + "epoch": 15.696849936068814, + "grad_norm": 1.4896150781654365, + "learning_rate": 7.848729076255425e-07, + "loss": 0.8603, + "step": 202560 + }, + { + "epoch": 15.69762485954512, + "grad_norm": 1.520086573007979, + "learning_rate": 7.84911655300682e-07, + "loss": 0.849, + "step": 202570 + }, + { + "epoch": 15.698399783021426, + "grad_norm": 1.5741762576172282, + "learning_rate": 7.849504029758215e-07, + "loss": 0.8378, + "step": 202580 + }, + { + "epoch": 15.699174706497733, + "grad_norm": 1.5811094639703975, + "learning_rate": 7.84989150650961e-07, + "loss": 0.865, + "step": 202590 + }, + { + "epoch": 15.69994962997404, + "grad_norm": 1.44329282577975, + "learning_rate": 7.850278983261005e-07, + "loss": 0.8516, + "step": 202600 + }, + { + "epoch": 15.700724553450346, + "grad_norm": 1.5915777908601303, + "learning_rate": 7.8506664600124e-07, + "loss": 0.8839, + "step": 202610 + }, + { + "epoch": 15.701499476926653, + "grad_norm": 1.4554038601655088, + "learning_rate": 7.851053936763794e-07, + "loss": 0.855, + "step": 202620 + }, + { + "epoch": 15.70227440040296, + "grad_norm": 1.4961458173820887, + "learning_rate": 7.85144141351519e-07, + "loss": 0.8591, + "step": 202630 + }, + { + "epoch": 15.703049323879267, + "grad_norm": 1.5335485033077607, + "learning_rate": 7.851828890266584e-07, + "loss": 0.8576, + "step": 202640 + }, + { + "epoch": 15.703824247355573, + "grad_norm": 1.4901387068661511, + "learning_rate": 7.85221636701798e-07, + "loss": 0.879, + "step": 202650 + }, + { + "epoch": 15.70459917083188, + "grad_norm": 1.6960661539724793, + "learning_rate": 7.852603843769374e-07, + "loss": 0.8681, + "step": 202660 + }, + { + "epoch": 15.705374094308187, + "grad_norm": 1.5404136268215538, + "learning_rate": 7.85299132052077e-07, + "loss": 0.8557, + "step": 202670 + }, + { + "epoch": 15.706149017784494, + "grad_norm": 1.5491024536997413, + "learning_rate": 7.853378797272164e-07, + "loss": 0.847, + "step": 202680 + }, + { + "epoch": 15.7069239412608, + "grad_norm": 1.4994697503357084, + "learning_rate": 7.85376627402356e-07, + "loss": 0.8421, + "step": 202690 + }, + { + "epoch": 15.707698864737107, + "grad_norm": 1.5998072479077823, + "learning_rate": 7.854153750774954e-07, + "loss": 0.8775, + "step": 202700 + }, + { + "epoch": 15.708473788213414, + "grad_norm": 1.4550379730807732, + "learning_rate": 7.854541227526349e-07, + "loss": 0.8599, + "step": 202710 + }, + { + "epoch": 15.709248711689721, + "grad_norm": 1.6416449222210974, + "learning_rate": 7.854928704277744e-07, + "loss": 0.8615, + "step": 202720 + }, + { + "epoch": 15.710023635166028, + "grad_norm": 1.5174682888365694, + "learning_rate": 7.855316181029139e-07, + "loss": 0.8583, + "step": 202730 + }, + { + "epoch": 15.710798558642335, + "grad_norm": 1.6500757558793266, + "learning_rate": 7.855703657780534e-07, + "loss": 0.8512, + "step": 202740 + }, + { + "epoch": 15.711573482118641, + "grad_norm": 1.5321953414889298, + "learning_rate": 7.856091134531929e-07, + "loss": 0.8842, + "step": 202750 + }, + { + "epoch": 15.712348405594948, + "grad_norm": 1.6228312609368434, + "learning_rate": 7.856478611283323e-07, + "loss": 0.8403, + "step": 202760 + }, + { + "epoch": 15.713123329071255, + "grad_norm": 1.643401539402815, + "learning_rate": 7.856866088034719e-07, + "loss": 0.8716, + "step": 202770 + }, + { + "epoch": 15.71389825254756, + "grad_norm": 1.5763992716690856, + "learning_rate": 7.857253564786113e-07, + "loss": 0.8594, + "step": 202780 + }, + { + "epoch": 15.714673176023867, + "grad_norm": 1.5282451688981284, + "learning_rate": 7.857641041537509e-07, + "loss": 0.87, + "step": 202790 + }, + { + "epoch": 15.715448099500174, + "grad_norm": 1.5045278812869396, + "learning_rate": 7.858028518288903e-07, + "loss": 0.8385, + "step": 202800 + }, + { + "epoch": 15.71622302297648, + "grad_norm": 1.6331690053740717, + "learning_rate": 7.858415995040299e-07, + "loss": 0.8796, + "step": 202810 + }, + { + "epoch": 15.716997946452787, + "grad_norm": 1.5255575131842312, + "learning_rate": 7.858803471791693e-07, + "loss": 0.8691, + "step": 202820 + }, + { + "epoch": 15.717772869929094, + "grad_norm": 1.5848589926486367, + "learning_rate": 7.859190948543088e-07, + "loss": 0.8488, + "step": 202830 + }, + { + "epoch": 15.718547793405401, + "grad_norm": 1.6661929743659492, + "learning_rate": 7.859578425294483e-07, + "loss": 0.8679, + "step": 202840 + }, + { + "epoch": 15.719322716881708, + "grad_norm": 1.5102973132204078, + "learning_rate": 7.859965902045878e-07, + "loss": 0.8553, + "step": 202850 + }, + { + "epoch": 15.720097640358015, + "grad_norm": 1.5982013553949483, + "learning_rate": 7.860353378797273e-07, + "loss": 0.8583, + "step": 202860 + }, + { + "epoch": 15.720872563834321, + "grad_norm": 1.6120499001704205, + "learning_rate": 7.860740855548668e-07, + "loss": 0.859, + "step": 202870 + }, + { + "epoch": 15.721647487310628, + "grad_norm": 1.5717838034734504, + "learning_rate": 7.861128332300062e-07, + "loss": 0.8725, + "step": 202880 + }, + { + "epoch": 15.722422410786935, + "grad_norm": 1.5401278969616252, + "learning_rate": 7.861515809051458e-07, + "loss": 0.846, + "step": 202890 + }, + { + "epoch": 15.723197334263242, + "grad_norm": 1.4097884449716767, + "learning_rate": 7.861903285802852e-07, + "loss": 0.8471, + "step": 202900 + }, + { + "epoch": 15.723972257739549, + "grad_norm": 1.5685088906536098, + "learning_rate": 7.862290762554248e-07, + "loss": 0.8501, + "step": 202910 + }, + { + "epoch": 15.724747181215855, + "grad_norm": 1.6350349716576011, + "learning_rate": 7.862678239305642e-07, + "loss": 0.8745, + "step": 202920 + }, + { + "epoch": 15.725522104692162, + "grad_norm": 1.7127139175678265, + "learning_rate": 7.863065716057037e-07, + "loss": 0.8637, + "step": 202930 + }, + { + "epoch": 15.726297028168469, + "grad_norm": 1.5614242755453185, + "learning_rate": 7.863453192808432e-07, + "loss": 0.8536, + "step": 202940 + }, + { + "epoch": 15.727071951644776, + "grad_norm": 1.4607235588192562, + "learning_rate": 7.863840669559828e-07, + "loss": 0.869, + "step": 202950 + }, + { + "epoch": 15.727846875121081, + "grad_norm": 1.6855889769783634, + "learning_rate": 7.864228146311222e-07, + "loss": 0.8697, + "step": 202960 + }, + { + "epoch": 15.728621798597388, + "grad_norm": 1.5204750321084615, + "learning_rate": 7.864615623062617e-07, + "loss": 0.8492, + "step": 202970 + }, + { + "epoch": 15.729396722073695, + "grad_norm": 1.6161964089997958, + "learning_rate": 7.865003099814011e-07, + "loss": 0.8805, + "step": 202980 + }, + { + "epoch": 15.730171645550001, + "grad_norm": 1.5252691166140775, + "learning_rate": 7.865390576565407e-07, + "loss": 0.8658, + "step": 202990 + }, + { + "epoch": 15.730946569026308, + "grad_norm": 1.5516295319374631, + "learning_rate": 7.865778053316802e-07, + "loss": 0.8437, + "step": 203000 + }, + { + "epoch": 15.730946569026308, + "eval_loss": 0.8899557590484619, + "eval_runtime": 328.6937, + "eval_samples_per_second": 34.899, + "eval_steps_per_second": 8.725, + "step": 203000 + }, + { + "epoch": 15.731721492502615, + "grad_norm": 1.59518442229596, + "learning_rate": 7.866165530068197e-07, + "loss": 0.8787, + "step": 203010 + }, + { + "epoch": 15.732496415978922, + "grad_norm": 1.3955432176208407, + "learning_rate": 7.866553006819591e-07, + "loss": 0.872, + "step": 203020 + }, + { + "epoch": 15.733271339455229, + "grad_norm": 1.4464877493036186, + "learning_rate": 7.866940483570986e-07, + "loss": 0.8478, + "step": 203030 + }, + { + "epoch": 15.734046262931535, + "grad_norm": 1.5900948745279997, + "learning_rate": 7.867327960322381e-07, + "loss": 0.8619, + "step": 203040 + }, + { + "epoch": 15.734821186407842, + "grad_norm": 1.6617560331911263, + "learning_rate": 7.867715437073777e-07, + "loss": 0.8686, + "step": 203050 + }, + { + "epoch": 15.735596109884149, + "grad_norm": 1.5897807845038365, + "learning_rate": 7.868102913825171e-07, + "loss": 0.8709, + "step": 203060 + }, + { + "epoch": 15.736371033360456, + "grad_norm": 1.5604496855764116, + "learning_rate": 7.868490390576566e-07, + "loss": 0.8723, + "step": 203070 + }, + { + "epoch": 15.737145956836763, + "grad_norm": 1.6056465560233901, + "learning_rate": 7.86887786732796e-07, + "loss": 0.8554, + "step": 203080 + }, + { + "epoch": 15.73792088031307, + "grad_norm": 1.8698973559025736, + "learning_rate": 7.869265344079357e-07, + "loss": 0.8816, + "step": 203090 + }, + { + "epoch": 15.738695803789376, + "grad_norm": 1.6472150162742363, + "learning_rate": 7.869652820830751e-07, + "loss": 0.8676, + "step": 203100 + }, + { + "epoch": 15.739470727265683, + "grad_norm": 1.5859149632024732, + "learning_rate": 7.870040297582146e-07, + "loss": 0.8853, + "step": 203110 + }, + { + "epoch": 15.74024565074199, + "grad_norm": 1.5600133973629324, + "learning_rate": 7.87042777433354e-07, + "loss": 0.8477, + "step": 203120 + }, + { + "epoch": 15.741020574218297, + "grad_norm": 1.4947740789690998, + "learning_rate": 7.870815251084935e-07, + "loss": 0.8642, + "step": 203130 + }, + { + "epoch": 15.741795497694604, + "grad_norm": 1.7338393644558308, + "learning_rate": 7.87120272783633e-07, + "loss": 0.856, + "step": 203140 + }, + { + "epoch": 15.742570421170909, + "grad_norm": 1.7102950359378315, + "learning_rate": 7.871590204587726e-07, + "loss": 0.867, + "step": 203150 + }, + { + "epoch": 15.743345344647215, + "grad_norm": 1.4880818732337884, + "learning_rate": 7.87197768133912e-07, + "loss": 0.853, + "step": 203160 + }, + { + "epoch": 15.744120268123522, + "grad_norm": 1.6191154874868787, + "learning_rate": 7.872365158090515e-07, + "loss": 0.8485, + "step": 203170 + }, + { + "epoch": 15.744895191599829, + "grad_norm": 1.6140987510216236, + "learning_rate": 7.872752634841909e-07, + "loss": 0.8779, + "step": 203180 + }, + { + "epoch": 15.745670115076136, + "grad_norm": 1.4708271150928367, + "learning_rate": 7.873140111593306e-07, + "loss": 0.8549, + "step": 203190 + }, + { + "epoch": 15.746445038552443, + "grad_norm": 1.5732903748004363, + "learning_rate": 7.8735275883447e-07, + "loss": 0.8512, + "step": 203200 + }, + { + "epoch": 15.74721996202875, + "grad_norm": 1.4802850411309665, + "learning_rate": 7.873915065096095e-07, + "loss": 0.8479, + "step": 203210 + }, + { + "epoch": 15.747994885505056, + "grad_norm": 1.4127959156346859, + "learning_rate": 7.874302541847489e-07, + "loss": 0.8687, + "step": 203220 + }, + { + "epoch": 15.748769808981363, + "grad_norm": 1.5578510670153214, + "learning_rate": 7.874690018598886e-07, + "loss": 0.8524, + "step": 203230 + }, + { + "epoch": 15.74954473245767, + "grad_norm": 1.5925631263809787, + "learning_rate": 7.87507749535028e-07, + "loss": 0.8504, + "step": 203240 + }, + { + "epoch": 15.750319655933977, + "grad_norm": 1.5145578762343717, + "learning_rate": 7.875464972101675e-07, + "loss": 0.8615, + "step": 203250 + }, + { + "epoch": 15.751094579410283, + "grad_norm": 1.4992285735827158, + "learning_rate": 7.875852448853069e-07, + "loss": 0.8855, + "step": 203260 + }, + { + "epoch": 15.75186950288659, + "grad_norm": 1.5468230327030768, + "learning_rate": 7.876239925604464e-07, + "loss": 0.8575, + "step": 203270 + }, + { + "epoch": 15.752644426362897, + "grad_norm": 1.4929003835575283, + "learning_rate": 7.876627402355858e-07, + "loss": 0.8613, + "step": 203280 + }, + { + "epoch": 15.753419349839204, + "grad_norm": 1.7071669620525771, + "learning_rate": 7.877014879107255e-07, + "loss": 0.8572, + "step": 203290 + }, + { + "epoch": 15.75419427331551, + "grad_norm": 1.535280644081538, + "learning_rate": 7.877402355858649e-07, + "loss": 0.842, + "step": 203300 + }, + { + "epoch": 15.754969196791818, + "grad_norm": 1.5798674844288838, + "learning_rate": 7.877789832610044e-07, + "loss": 0.8608, + "step": 203310 + }, + { + "epoch": 15.755744120268124, + "grad_norm": 1.4632757347666148, + "learning_rate": 7.878177309361438e-07, + "loss": 0.8653, + "step": 203320 + }, + { + "epoch": 15.75651904374443, + "grad_norm": 1.4682224817575247, + "learning_rate": 7.878564786112835e-07, + "loss": 0.8595, + "step": 203330 + }, + { + "epoch": 15.757293967220736, + "grad_norm": 1.5211428746325162, + "learning_rate": 7.878952262864229e-07, + "loss": 0.8695, + "step": 203340 + }, + { + "epoch": 15.758068890697043, + "grad_norm": 1.5128395686965195, + "learning_rate": 7.879339739615624e-07, + "loss": 0.8672, + "step": 203350 + }, + { + "epoch": 15.75884381417335, + "grad_norm": 1.49740740971731, + "learning_rate": 7.879727216367018e-07, + "loss": 0.8595, + "step": 203360 + }, + { + "epoch": 15.759618737649657, + "grad_norm": 1.5502404663569933, + "learning_rate": 7.880114693118414e-07, + "loss": 0.8489, + "step": 203370 + }, + { + "epoch": 15.760393661125963, + "grad_norm": 1.5380502324831757, + "learning_rate": 7.880502169869809e-07, + "loss": 0.8625, + "step": 203380 + }, + { + "epoch": 15.76116858460227, + "grad_norm": 1.488800998374155, + "learning_rate": 7.880889646621204e-07, + "loss": 0.8703, + "step": 203390 + }, + { + "epoch": 15.761943508078577, + "grad_norm": 1.5146695380941086, + "learning_rate": 7.881277123372598e-07, + "loss": 0.861, + "step": 203400 + }, + { + "epoch": 15.762718431554884, + "grad_norm": 1.5432489394508055, + "learning_rate": 7.881664600123993e-07, + "loss": 0.8588, + "step": 203410 + }, + { + "epoch": 15.76349335503119, + "grad_norm": 1.4495847818648595, + "learning_rate": 7.882052076875387e-07, + "loss": 0.8625, + "step": 203420 + }, + { + "epoch": 15.764268278507497, + "grad_norm": 1.6434639125903103, + "learning_rate": 7.882439553626784e-07, + "loss": 0.8589, + "step": 203430 + }, + { + "epoch": 15.765043201983804, + "grad_norm": 1.6282333687297041, + "learning_rate": 7.882827030378178e-07, + "loss": 0.8527, + "step": 203440 + }, + { + "epoch": 15.765818125460111, + "grad_norm": 1.6276265457171892, + "learning_rate": 7.883214507129573e-07, + "loss": 0.8476, + "step": 203450 + }, + { + "epoch": 15.766593048936418, + "grad_norm": 1.5801490686513953, + "learning_rate": 7.883601983880967e-07, + "loss": 0.8721, + "step": 203460 + }, + { + "epoch": 15.767367972412725, + "grad_norm": 1.6120336276235654, + "learning_rate": 7.883989460632363e-07, + "loss": 0.8452, + "step": 203470 + }, + { + "epoch": 15.768142895889032, + "grad_norm": 1.5629846686874862, + "learning_rate": 7.884376937383758e-07, + "loss": 0.871, + "step": 203480 + }, + { + "epoch": 15.768917819365338, + "grad_norm": 1.5894751113777439, + "learning_rate": 7.884764414135153e-07, + "loss": 0.8558, + "step": 203490 + }, + { + "epoch": 15.769692742841645, + "grad_norm": 1.5204898285492947, + "learning_rate": 7.885151890886547e-07, + "loss": 0.8874, + "step": 203500 + }, + { + "epoch": 15.769692742841645, + "eval_loss": 0.8897650837898254, + "eval_runtime": 328.3387, + "eval_samples_per_second": 34.936, + "eval_steps_per_second": 8.735, + "step": 203500 + }, + { + "epoch": 15.770467666317952, + "grad_norm": 1.5443615291020243, + "learning_rate": 7.885539367637943e-07, + "loss": 0.8549, + "step": 203510 + }, + { + "epoch": 15.771242589794259, + "grad_norm": 1.5169756551624984, + "learning_rate": 7.885926844389337e-07, + "loss": 0.8762, + "step": 203520 + }, + { + "epoch": 15.772017513270564, + "grad_norm": 1.6076887889364513, + "learning_rate": 7.886314321140733e-07, + "loss": 0.8714, + "step": 203530 + }, + { + "epoch": 15.77279243674687, + "grad_norm": 1.4896873965303634, + "learning_rate": 7.886701797892127e-07, + "loss": 0.8445, + "step": 203540 + }, + { + "epoch": 15.773567360223177, + "grad_norm": 1.5302420588266468, + "learning_rate": 7.887089274643522e-07, + "loss": 0.87, + "step": 203550 + }, + { + "epoch": 15.774342283699484, + "grad_norm": 1.6054515257925146, + "learning_rate": 7.887476751394916e-07, + "loss": 0.8651, + "step": 203560 + }, + { + "epoch": 15.775117207175791, + "grad_norm": 1.5389403256217677, + "learning_rate": 7.887864228146312e-07, + "loss": 0.8751, + "step": 203570 + }, + { + "epoch": 15.775892130652098, + "grad_norm": 1.5616066148288688, + "learning_rate": 7.888251704897707e-07, + "loss": 0.8484, + "step": 203580 + }, + { + "epoch": 15.776667054128405, + "grad_norm": 1.572437951598128, + "learning_rate": 7.888639181649102e-07, + "loss": 0.8318, + "step": 203590 + }, + { + "epoch": 15.777441977604711, + "grad_norm": 1.4926226853318605, + "learning_rate": 7.889026658400496e-07, + "loss": 0.8892, + "step": 203600 + }, + { + "epoch": 15.778216901081018, + "grad_norm": 1.5408497188874588, + "learning_rate": 7.889414135151892e-07, + "loss": 0.8601, + "step": 203610 + }, + { + "epoch": 15.778991824557325, + "grad_norm": 1.53617859884413, + "learning_rate": 7.889801611903286e-07, + "loss": 0.8832, + "step": 203620 + }, + { + "epoch": 15.779766748033632, + "grad_norm": 1.5539629510886008, + "learning_rate": 7.890189088654682e-07, + "loss": 0.8846, + "step": 203630 + }, + { + "epoch": 15.780541671509939, + "grad_norm": 1.5281663850933125, + "learning_rate": 7.890576565406076e-07, + "loss": 0.8769, + "step": 203640 + }, + { + "epoch": 15.781316594986246, + "grad_norm": 1.4798187840791805, + "learning_rate": 7.890964042157472e-07, + "loss": 0.8436, + "step": 203650 + }, + { + "epoch": 15.782091518462552, + "grad_norm": 1.596185532823444, + "learning_rate": 7.891351518908866e-07, + "loss": 0.8424, + "step": 203660 + }, + { + "epoch": 15.78286644193886, + "grad_norm": 1.588411217551808, + "learning_rate": 7.891738995660261e-07, + "loss": 0.8664, + "step": 203670 + }, + { + "epoch": 15.783641365415166, + "grad_norm": 1.5250104082343394, + "learning_rate": 7.892126472411656e-07, + "loss": 0.8751, + "step": 203680 + }, + { + "epoch": 15.784416288891473, + "grad_norm": 1.4814312178826656, + "learning_rate": 7.892513949163051e-07, + "loss": 0.8598, + "step": 203690 + }, + { + "epoch": 15.785191212367778, + "grad_norm": 1.574019632275046, + "learning_rate": 7.892901425914445e-07, + "loss": 0.8757, + "step": 203700 + }, + { + "epoch": 15.785966135844085, + "grad_norm": 1.588353181308838, + "learning_rate": 7.893288902665841e-07, + "loss": 0.8634, + "step": 203710 + }, + { + "epoch": 15.786741059320391, + "grad_norm": 1.66288317350922, + "learning_rate": 7.893676379417235e-07, + "loss": 0.8638, + "step": 203720 + }, + { + "epoch": 15.787515982796698, + "grad_norm": 1.5777288009881327, + "learning_rate": 7.894063856168631e-07, + "loss": 0.8558, + "step": 203730 + }, + { + "epoch": 15.788290906273005, + "grad_norm": 1.5182555722060667, + "learning_rate": 7.894451332920025e-07, + "loss": 0.8729, + "step": 203740 + }, + { + "epoch": 15.789065829749312, + "grad_norm": 1.441945015067755, + "learning_rate": 7.894838809671421e-07, + "loss": 0.8631, + "step": 203750 + }, + { + "epoch": 15.789840753225619, + "grad_norm": 1.5366879189165006, + "learning_rate": 7.895226286422815e-07, + "loss": 0.8386, + "step": 203760 + }, + { + "epoch": 15.790615676701925, + "grad_norm": 1.5270749108927844, + "learning_rate": 7.89561376317421e-07, + "loss": 0.8419, + "step": 203770 + }, + { + "epoch": 15.791390600178232, + "grad_norm": 1.5317675819233745, + "learning_rate": 7.896001239925605e-07, + "loss": 0.8577, + "step": 203780 + }, + { + "epoch": 15.792165523654539, + "grad_norm": 1.5983120853651793, + "learning_rate": 7.896388716677001e-07, + "loss": 0.8461, + "step": 203790 + }, + { + "epoch": 15.792940447130846, + "grad_norm": 1.4304156997258846, + "learning_rate": 7.896776193428395e-07, + "loss": 0.8646, + "step": 203800 + }, + { + "epoch": 15.793715370607153, + "grad_norm": 1.606836288635036, + "learning_rate": 7.89716367017979e-07, + "loss": 0.8771, + "step": 203810 + }, + { + "epoch": 15.79449029408346, + "grad_norm": 1.504677660892026, + "learning_rate": 7.897551146931184e-07, + "loss": 0.8567, + "step": 203820 + }, + { + "epoch": 15.795265217559766, + "grad_norm": 1.6337003996653776, + "learning_rate": 7.89793862368258e-07, + "loss": 0.8517, + "step": 203830 + }, + { + "epoch": 15.796040141036073, + "grad_norm": 1.6022331508938146, + "learning_rate": 7.898326100433974e-07, + "loss": 0.8514, + "step": 203840 + }, + { + "epoch": 15.79681506451238, + "grad_norm": 1.5441504516563171, + "learning_rate": 7.89871357718537e-07, + "loss": 0.8583, + "step": 203850 + }, + { + "epoch": 15.797589987988687, + "grad_norm": 1.5627209198074727, + "learning_rate": 7.899101053936764e-07, + "loss": 0.8524, + "step": 203860 + }, + { + "epoch": 15.798364911464994, + "grad_norm": 1.5852231459412511, + "learning_rate": 7.899488530688159e-07, + "loss": 0.8639, + "step": 203870 + }, + { + "epoch": 15.7991398349413, + "grad_norm": 1.5309333612281502, + "learning_rate": 7.899876007439554e-07, + "loss": 0.8601, + "step": 203880 + }, + { + "epoch": 15.799914758417607, + "grad_norm": 1.5985220551542285, + "learning_rate": 7.90026348419095e-07, + "loss": 0.8605, + "step": 203890 + }, + { + "epoch": 15.800689681893912, + "grad_norm": 1.5632270705746572, + "learning_rate": 7.900650960942344e-07, + "loss": 0.8649, + "step": 203900 + }, + { + "epoch": 15.801464605370219, + "grad_norm": 1.4457210950016803, + "learning_rate": 7.901038437693739e-07, + "loss": 0.8558, + "step": 203910 + }, + { + "epoch": 15.802239528846526, + "grad_norm": 1.5033492482940973, + "learning_rate": 7.901425914445133e-07, + "loss": 0.8743, + "step": 203920 + }, + { + "epoch": 15.803014452322833, + "grad_norm": 1.6160339901289682, + "learning_rate": 7.901813391196529e-07, + "loss": 0.8661, + "step": 203930 + }, + { + "epoch": 15.80378937579914, + "grad_norm": 1.5179920536168836, + "learning_rate": 7.902200867947924e-07, + "loss": 0.8728, + "step": 203940 + }, + { + "epoch": 15.804564299275446, + "grad_norm": 1.790461116500637, + "learning_rate": 7.902588344699319e-07, + "loss": 0.8595, + "step": 203950 + }, + { + "epoch": 15.805339222751753, + "grad_norm": 1.7088908931545355, + "learning_rate": 7.902975821450713e-07, + "loss": 0.859, + "step": 203960 + }, + { + "epoch": 15.80611414622806, + "grad_norm": 1.6897972726491934, + "learning_rate": 7.903363298202108e-07, + "loss": 0.8796, + "step": 203970 + }, + { + "epoch": 15.806889069704367, + "grad_norm": 1.5171633649972802, + "learning_rate": 7.903750774953503e-07, + "loss": 0.8786, + "step": 203980 + }, + { + "epoch": 15.807663993180674, + "grad_norm": 1.5280784474682325, + "learning_rate": 7.904138251704899e-07, + "loss": 0.8542, + "step": 203990 + }, + { + "epoch": 15.80843891665698, + "grad_norm": 1.4815494516193883, + "learning_rate": 7.904525728456293e-07, + "loss": 0.853, + "step": 204000 + }, + { + "epoch": 15.80843891665698, + "eval_loss": 0.8894774317741394, + "eval_runtime": 327.4104, + "eval_samples_per_second": 35.036, + "eval_steps_per_second": 8.76, + "step": 204000 + }, + { + "epoch": 15.809213840133287, + "grad_norm": 1.5631881148165316, + "learning_rate": 7.904913205207688e-07, + "loss": 0.8381, + "step": 204010 + }, + { + "epoch": 15.809988763609594, + "grad_norm": 1.4924941697606984, + "learning_rate": 7.905300681959082e-07, + "loss": 0.8698, + "step": 204020 + }, + { + "epoch": 15.8107636870859, + "grad_norm": 1.5480829946369592, + "learning_rate": 7.905688158710479e-07, + "loss": 0.8594, + "step": 204030 + }, + { + "epoch": 15.811538610562208, + "grad_norm": 1.5527734394854438, + "learning_rate": 7.906075635461873e-07, + "loss": 0.8427, + "step": 204040 + }, + { + "epoch": 15.812313534038514, + "grad_norm": 1.5637612079723093, + "learning_rate": 7.906463112213268e-07, + "loss": 0.8511, + "step": 204050 + }, + { + "epoch": 15.813088457514821, + "grad_norm": 1.5393076836528437, + "learning_rate": 7.906850588964662e-07, + "loss": 0.8726, + "step": 204060 + }, + { + "epoch": 15.813863380991126, + "grad_norm": 1.4421757628956628, + "learning_rate": 7.907238065716057e-07, + "loss": 0.8576, + "step": 204070 + }, + { + "epoch": 15.814638304467433, + "grad_norm": 1.6562706116033457, + "learning_rate": 7.907625542467453e-07, + "loss": 0.8594, + "step": 204080 + }, + { + "epoch": 15.81541322794374, + "grad_norm": 1.5963189728580758, + "learning_rate": 7.908013019218848e-07, + "loss": 0.8565, + "step": 204090 + }, + { + "epoch": 15.816188151420047, + "grad_norm": 1.5411279907941544, + "learning_rate": 7.908400495970242e-07, + "loss": 0.8591, + "step": 204100 + }, + { + "epoch": 15.816963074896353, + "grad_norm": 1.640637539033136, + "learning_rate": 7.908787972721637e-07, + "loss": 0.8524, + "step": 204110 + }, + { + "epoch": 15.81773799837266, + "grad_norm": 1.5599113581716972, + "learning_rate": 7.909175449473031e-07, + "loss": 0.8655, + "step": 204120 + }, + { + "epoch": 15.818512921848967, + "grad_norm": 1.5833399440041602, + "learning_rate": 7.909562926224428e-07, + "loss": 0.857, + "step": 204130 + }, + { + "epoch": 15.819287845325274, + "grad_norm": 1.508303312030088, + "learning_rate": 7.909950402975822e-07, + "loss": 0.8357, + "step": 204140 + }, + { + "epoch": 15.82006276880158, + "grad_norm": 1.5943988299668872, + "learning_rate": 7.910337879727217e-07, + "loss": 0.8623, + "step": 204150 + }, + { + "epoch": 15.820837692277887, + "grad_norm": 1.6186235252446877, + "learning_rate": 7.910725356478611e-07, + "loss": 0.863, + "step": 204160 + }, + { + "epoch": 15.821612615754194, + "grad_norm": 1.563405029401226, + "learning_rate": 7.911112833230008e-07, + "loss": 0.868, + "step": 204170 + }, + { + "epoch": 15.822387539230501, + "grad_norm": 1.5919419916645468, + "learning_rate": 7.911500309981402e-07, + "loss": 0.8623, + "step": 204180 + }, + { + "epoch": 15.823162462706808, + "grad_norm": 1.5021292321432156, + "learning_rate": 7.911887786732797e-07, + "loss": 0.8579, + "step": 204190 + }, + { + "epoch": 15.823937386183115, + "grad_norm": 1.5365411644209137, + "learning_rate": 7.912275263484191e-07, + "loss": 0.8714, + "step": 204200 + }, + { + "epoch": 15.824712309659422, + "grad_norm": 1.59854625615412, + "learning_rate": 7.912662740235586e-07, + "loss": 0.8454, + "step": 204210 + }, + { + "epoch": 15.825487233135728, + "grad_norm": 1.591373968200136, + "learning_rate": 7.913050216986982e-07, + "loss": 0.8509, + "step": 204220 + }, + { + "epoch": 15.826262156612035, + "grad_norm": 1.4831336751290798, + "learning_rate": 7.913437693738377e-07, + "loss": 0.8792, + "step": 204230 + }, + { + "epoch": 15.827037080088342, + "grad_norm": 1.5842722094284933, + "learning_rate": 7.913825170489771e-07, + "loss": 0.8555, + "step": 204240 + }, + { + "epoch": 15.827812003564649, + "grad_norm": 1.5587427366528446, + "learning_rate": 7.914212647241166e-07, + "loss": 0.8539, + "step": 204250 + }, + { + "epoch": 15.828586927040956, + "grad_norm": 1.5465539602166984, + "learning_rate": 7.91460012399256e-07, + "loss": 0.8759, + "step": 204260 + }, + { + "epoch": 15.82936185051726, + "grad_norm": 1.4628896002080267, + "learning_rate": 7.914987600743957e-07, + "loss": 0.8486, + "step": 204270 + }, + { + "epoch": 15.830136773993567, + "grad_norm": 1.5721318413125551, + "learning_rate": 7.915375077495351e-07, + "loss": 0.8464, + "step": 204280 + }, + { + "epoch": 15.830911697469874, + "grad_norm": 1.514822811009068, + "learning_rate": 7.915762554246746e-07, + "loss": 0.8809, + "step": 204290 + }, + { + "epoch": 15.831686620946181, + "grad_norm": 1.485435980480856, + "learning_rate": 7.91615003099814e-07, + "loss": 0.873, + "step": 204300 + }, + { + "epoch": 15.832461544422488, + "grad_norm": 1.6247679897968315, + "learning_rate": 7.916537507749536e-07, + "loss": 0.8663, + "step": 204310 + }, + { + "epoch": 15.833236467898795, + "grad_norm": 1.649732337746416, + "learning_rate": 7.916924984500931e-07, + "loss": 0.8507, + "step": 204320 + }, + { + "epoch": 15.834011391375101, + "grad_norm": 1.573799864859392, + "learning_rate": 7.917312461252326e-07, + "loss": 0.8589, + "step": 204330 + }, + { + "epoch": 15.834786314851408, + "grad_norm": 1.4596307224829035, + "learning_rate": 7.91769993800372e-07, + "loss": 0.8327, + "step": 204340 + }, + { + "epoch": 15.835561238327715, + "grad_norm": 1.5989719771398532, + "learning_rate": 7.918087414755115e-07, + "loss": 0.8763, + "step": 204350 + }, + { + "epoch": 15.836336161804022, + "grad_norm": 1.6307160596273684, + "learning_rate": 7.91847489150651e-07, + "loss": 0.8514, + "step": 204360 + }, + { + "epoch": 15.837111085280329, + "grad_norm": 1.4485982149507082, + "learning_rate": 7.918862368257906e-07, + "loss": 0.8464, + "step": 204370 + }, + { + "epoch": 15.837886008756636, + "grad_norm": 1.5762336982225449, + "learning_rate": 7.9192498450093e-07, + "loss": 0.8625, + "step": 204380 + }, + { + "epoch": 15.838660932232942, + "grad_norm": 1.5847078226959284, + "learning_rate": 7.919637321760695e-07, + "loss": 0.8651, + "step": 204390 + }, + { + "epoch": 15.83943585570925, + "grad_norm": 1.597452572497491, + "learning_rate": 7.920024798512089e-07, + "loss": 0.8642, + "step": 204400 + }, + { + "epoch": 15.840210779185556, + "grad_norm": 1.5447595958231615, + "learning_rate": 7.920412275263485e-07, + "loss": 0.8798, + "step": 204410 + }, + { + "epoch": 15.840985702661863, + "grad_norm": 1.6911456764855526, + "learning_rate": 7.92079975201488e-07, + "loss": 0.846, + "step": 204420 + }, + { + "epoch": 15.84176062613817, + "grad_norm": 1.5239524465857328, + "learning_rate": 7.921187228766275e-07, + "loss": 0.8583, + "step": 204430 + }, + { + "epoch": 15.842535549614475, + "grad_norm": 1.6019531314258735, + "learning_rate": 7.921574705517669e-07, + "loss": 0.8671, + "step": 204440 + }, + { + "epoch": 15.843310473090781, + "grad_norm": 1.5472971896211223, + "learning_rate": 7.921962182269065e-07, + "loss": 0.8551, + "step": 204450 + }, + { + "epoch": 15.844085396567088, + "grad_norm": 1.5866906799344507, + "learning_rate": 7.922349659020459e-07, + "loss": 0.868, + "step": 204460 + }, + { + "epoch": 15.844860320043395, + "grad_norm": 1.547858829890692, + "learning_rate": 7.922737135771855e-07, + "loss": 0.8713, + "step": 204470 + }, + { + "epoch": 15.845635243519702, + "grad_norm": 1.6645361044356008, + "learning_rate": 7.923124612523249e-07, + "loss": 0.8681, + "step": 204480 + }, + { + "epoch": 15.846410166996009, + "grad_norm": 1.5164905806190552, + "learning_rate": 7.923512089274644e-07, + "loss": 0.87, + "step": 204490 + }, + { + "epoch": 15.847185090472315, + "grad_norm": 1.528910183068831, + "learning_rate": 7.923899566026038e-07, + "loss": 0.8451, + "step": 204500 + }, + { + "epoch": 15.847185090472315, + "eval_loss": 0.8895244598388672, + "eval_runtime": 329.8393, + "eval_samples_per_second": 34.778, + "eval_steps_per_second": 8.695, + "step": 204500 + }, + { + "epoch": 15.847960013948622, + "grad_norm": 1.5230191170484264, + "learning_rate": 7.924287042777434e-07, + "loss": 0.8713, + "step": 204510 + }, + { + "epoch": 15.84873493742493, + "grad_norm": 1.6068927214465514, + "learning_rate": 7.924674519528829e-07, + "loss": 0.8694, + "step": 204520 + }, + { + "epoch": 15.849509860901236, + "grad_norm": 1.4892864948220519, + "learning_rate": 7.925061996280224e-07, + "loss": 0.8471, + "step": 204530 + }, + { + "epoch": 15.850284784377543, + "grad_norm": 1.680269049326047, + "learning_rate": 7.925449473031618e-07, + "loss": 0.8772, + "step": 204540 + }, + { + "epoch": 15.85105970785385, + "grad_norm": 1.4647738180888796, + "learning_rate": 7.925836949783014e-07, + "loss": 0.8557, + "step": 204550 + }, + { + "epoch": 15.851834631330156, + "grad_norm": 1.4331275178796181, + "learning_rate": 7.926224426534408e-07, + "loss": 0.8651, + "step": 204560 + }, + { + "epoch": 15.852609554806463, + "grad_norm": 1.5979929167914824, + "learning_rate": 7.926611903285804e-07, + "loss": 0.8733, + "step": 204570 + }, + { + "epoch": 15.85338447828277, + "grad_norm": 1.5839517480695626, + "learning_rate": 7.926999380037198e-07, + "loss": 0.8539, + "step": 204580 + }, + { + "epoch": 15.854159401759077, + "grad_norm": 1.5490146458481076, + "learning_rate": 7.927386856788594e-07, + "loss": 0.8633, + "step": 204590 + }, + { + "epoch": 15.854934325235384, + "grad_norm": 1.5789246179592065, + "learning_rate": 7.927774333539988e-07, + "loss": 0.8583, + "step": 204600 + }, + { + "epoch": 15.85570924871169, + "grad_norm": 1.5970106148308323, + "learning_rate": 7.928161810291383e-07, + "loss": 0.8611, + "step": 204610 + }, + { + "epoch": 15.856484172187997, + "grad_norm": 1.5098827250423503, + "learning_rate": 7.928549287042778e-07, + "loss": 0.8464, + "step": 204620 + }, + { + "epoch": 15.857259095664304, + "grad_norm": 1.5918276379662244, + "learning_rate": 7.928936763794173e-07, + "loss": 0.8696, + "step": 204630 + }, + { + "epoch": 15.858034019140609, + "grad_norm": 1.6224428833653846, + "learning_rate": 7.929324240545567e-07, + "loss": 0.8458, + "step": 204640 + }, + { + "epoch": 15.858808942616916, + "grad_norm": 1.560395832792127, + "learning_rate": 7.929711717296963e-07, + "loss": 0.8447, + "step": 204650 + }, + { + "epoch": 15.859583866093223, + "grad_norm": 1.5462721281129177, + "learning_rate": 7.930099194048357e-07, + "loss": 0.8523, + "step": 204660 + }, + { + "epoch": 15.86035878956953, + "grad_norm": 1.4749309844805156, + "learning_rate": 7.930486670799753e-07, + "loss": 0.8669, + "step": 204670 + }, + { + "epoch": 15.861133713045836, + "grad_norm": 1.5575374241854867, + "learning_rate": 7.930874147551147e-07, + "loss": 0.8463, + "step": 204680 + }, + { + "epoch": 15.861908636522143, + "grad_norm": 1.6483138656044143, + "learning_rate": 7.931261624302543e-07, + "loss": 0.8481, + "step": 204690 + }, + { + "epoch": 15.86268355999845, + "grad_norm": 1.549599014496052, + "learning_rate": 7.931649101053937e-07, + "loss": 0.8501, + "step": 204700 + }, + { + "epoch": 15.863458483474757, + "grad_norm": 1.609967411008049, + "learning_rate": 7.932036577805333e-07, + "loss": 0.8756, + "step": 204710 + }, + { + "epoch": 15.864233406951064, + "grad_norm": 1.638668601873028, + "learning_rate": 7.932424054556727e-07, + "loss": 0.8568, + "step": 204720 + }, + { + "epoch": 15.86500833042737, + "grad_norm": 1.5825699638981199, + "learning_rate": 7.932811531308123e-07, + "loss": 0.8499, + "step": 204730 + }, + { + "epoch": 15.865783253903677, + "grad_norm": 1.4819589235513209, + "learning_rate": 7.933199008059517e-07, + "loss": 0.8542, + "step": 204740 + }, + { + "epoch": 15.866558177379984, + "grad_norm": 1.576861560611301, + "learning_rate": 7.933586484810912e-07, + "loss": 0.8513, + "step": 204750 + }, + { + "epoch": 15.86733310085629, + "grad_norm": 1.6954269688008914, + "learning_rate": 7.933973961562306e-07, + "loss": 0.8613, + "step": 204760 + }, + { + "epoch": 15.868108024332598, + "grad_norm": 1.533697723975759, + "learning_rate": 7.934361438313702e-07, + "loss": 0.8758, + "step": 204770 + }, + { + "epoch": 15.868882947808904, + "grad_norm": 1.6065075863287854, + "learning_rate": 7.934748915065096e-07, + "loss": 0.8691, + "step": 204780 + }, + { + "epoch": 15.869657871285211, + "grad_norm": 1.650627842564279, + "learning_rate": 7.935136391816492e-07, + "loss": 0.8666, + "step": 204790 + }, + { + "epoch": 15.870432794761518, + "grad_norm": 1.458595976214762, + "learning_rate": 7.935523868567886e-07, + "loss": 0.8457, + "step": 204800 + }, + { + "epoch": 15.871207718237825, + "grad_norm": 1.5950716374283973, + "learning_rate": 7.935911345319282e-07, + "loss": 0.8667, + "step": 204810 + }, + { + "epoch": 15.87198264171413, + "grad_norm": 1.675259664229227, + "learning_rate": 7.936298822070676e-07, + "loss": 0.8517, + "step": 204820 + }, + { + "epoch": 15.872757565190437, + "grad_norm": 1.5656915857936282, + "learning_rate": 7.936686298822072e-07, + "loss": 0.8705, + "step": 204830 + }, + { + "epoch": 15.873532488666743, + "grad_norm": 1.51540423360771, + "learning_rate": 7.937073775573466e-07, + "loss": 0.8599, + "step": 204840 + }, + { + "epoch": 15.87430741214305, + "grad_norm": 1.5611621528956106, + "learning_rate": 7.937461252324861e-07, + "loss": 0.8649, + "step": 204850 + }, + { + "epoch": 15.875082335619357, + "grad_norm": 1.5562610173780522, + "learning_rate": 7.937848729076256e-07, + "loss": 0.8459, + "step": 204860 + }, + { + "epoch": 15.875857259095664, + "grad_norm": 1.5545031594616687, + "learning_rate": 7.938236205827652e-07, + "loss": 0.8811, + "step": 204870 + }, + { + "epoch": 15.87663218257197, + "grad_norm": 1.5244802606776224, + "learning_rate": 7.938623682579046e-07, + "loss": 0.8677, + "step": 204880 + }, + { + "epoch": 15.877407106048278, + "grad_norm": 1.5263037023264772, + "learning_rate": 7.939011159330441e-07, + "loss": 0.8676, + "step": 204890 + }, + { + "epoch": 15.878182029524584, + "grad_norm": 1.4888339264469979, + "learning_rate": 7.939398636081835e-07, + "loss": 0.8725, + "step": 204900 + }, + { + "epoch": 15.878956953000891, + "grad_norm": 1.5198725506509105, + "learning_rate": 7.939786112833231e-07, + "loss": 0.8483, + "step": 204910 + }, + { + "epoch": 15.879731876477198, + "grad_norm": 1.4825490193024016, + "learning_rate": 7.940173589584625e-07, + "loss": 0.8652, + "step": 204920 + }, + { + "epoch": 15.880506799953505, + "grad_norm": 1.4890577677251862, + "learning_rate": 7.940561066336021e-07, + "loss": 0.8728, + "step": 204930 + }, + { + "epoch": 15.881281723429812, + "grad_norm": 1.608880439995518, + "learning_rate": 7.940948543087415e-07, + "loss": 0.8599, + "step": 204940 + }, + { + "epoch": 15.882056646906118, + "grad_norm": 1.5945707474766575, + "learning_rate": 7.94133601983881e-07, + "loss": 0.8432, + "step": 204950 + }, + { + "epoch": 15.882831570382425, + "grad_norm": 1.5699103754172934, + "learning_rate": 7.941723496590205e-07, + "loss": 0.8711, + "step": 204960 + }, + { + "epoch": 15.883606493858732, + "grad_norm": 1.5917834591957685, + "learning_rate": 7.942110973341601e-07, + "loss": 0.8527, + "step": 204970 + }, + { + "epoch": 15.884381417335039, + "grad_norm": 1.6606499620415758, + "learning_rate": 7.942498450092995e-07, + "loss": 0.8533, + "step": 204980 + }, + { + "epoch": 15.885156340811346, + "grad_norm": 1.5853296477849856, + "learning_rate": 7.94288592684439e-07, + "loss": 0.8574, + "step": 204990 + }, + { + "epoch": 15.885931264287652, + "grad_norm": 1.630199461749753, + "learning_rate": 7.943273403595784e-07, + "loss": 0.8634, + "step": 205000 + }, + { + "epoch": 15.885931264287652, + "eval_loss": 0.8894057273864746, + "eval_runtime": 330.2649, + "eval_samples_per_second": 34.733, + "eval_steps_per_second": 8.684, + "step": 205000 + }, + { + "epoch": 15.886706187763957, + "grad_norm": 1.6379080503496601, + "learning_rate": 7.943660880347181e-07, + "loss": 0.8724, + "step": 205010 + }, + { + "epoch": 15.887481111240264, + "grad_norm": 1.5569926962610114, + "learning_rate": 7.944048357098575e-07, + "loss": 0.8541, + "step": 205020 + }, + { + "epoch": 15.888256034716571, + "grad_norm": 1.470341522419748, + "learning_rate": 7.94443583384997e-07, + "loss": 0.8579, + "step": 205030 + }, + { + "epoch": 15.889030958192878, + "grad_norm": 1.5422826351360546, + "learning_rate": 7.944823310601364e-07, + "loss": 0.8406, + "step": 205040 + }, + { + "epoch": 15.889805881669185, + "grad_norm": 1.6045067791509084, + "learning_rate": 7.945210787352759e-07, + "loss": 0.8767, + "step": 205050 + }, + { + "epoch": 15.890580805145492, + "grad_norm": 1.6034348203528042, + "learning_rate": 7.945598264104154e-07, + "loss": 0.8607, + "step": 205060 + }, + { + "epoch": 15.891355728621798, + "grad_norm": 1.5499271543319875, + "learning_rate": 7.94598574085555e-07, + "loss": 0.8727, + "step": 205070 + }, + { + "epoch": 15.892130652098105, + "grad_norm": 1.5527485069214637, + "learning_rate": 7.946373217606944e-07, + "loss": 0.8699, + "step": 205080 + }, + { + "epoch": 15.892905575574412, + "grad_norm": 1.482622802091365, + "learning_rate": 7.946760694358339e-07, + "loss": 0.8607, + "step": 205090 + }, + { + "epoch": 15.893680499050719, + "grad_norm": 1.4496086011600353, + "learning_rate": 7.947148171109733e-07, + "loss": 0.8764, + "step": 205100 + }, + { + "epoch": 15.894455422527026, + "grad_norm": 1.5156454385962774, + "learning_rate": 7.94753564786113e-07, + "loss": 0.8541, + "step": 205110 + }, + { + "epoch": 15.895230346003332, + "grad_norm": 1.5445100947553807, + "learning_rate": 7.947923124612524e-07, + "loss": 0.8711, + "step": 205120 + }, + { + "epoch": 15.89600526947964, + "grad_norm": 1.5676198487060267, + "learning_rate": 7.948310601363919e-07, + "loss": 0.8722, + "step": 205130 + }, + { + "epoch": 15.896780192955946, + "grad_norm": 1.6049663665098846, + "learning_rate": 7.948698078115313e-07, + "loss": 0.8565, + "step": 205140 + }, + { + "epoch": 15.897555116432253, + "grad_norm": 1.6015350036624767, + "learning_rate": 7.94908555486671e-07, + "loss": 0.8565, + "step": 205150 + }, + { + "epoch": 15.89833003990856, + "grad_norm": 1.5636349015979103, + "learning_rate": 7.949473031618104e-07, + "loss": 0.8403, + "step": 205160 + }, + { + "epoch": 15.899104963384866, + "grad_norm": 1.5745323267510472, + "learning_rate": 7.949860508369499e-07, + "loss": 0.8614, + "step": 205170 + }, + { + "epoch": 15.899879886861173, + "grad_norm": 1.5631369494215839, + "learning_rate": 7.950247985120893e-07, + "loss": 0.8439, + "step": 205180 + }, + { + "epoch": 15.900654810337478, + "grad_norm": 1.5443126713012185, + "learning_rate": 7.950635461872288e-07, + "loss": 0.8867, + "step": 205190 + }, + { + "epoch": 15.901429733813785, + "grad_norm": 1.7434052439197891, + "learning_rate": 7.951022938623682e-07, + "loss": 0.8535, + "step": 205200 + }, + { + "epoch": 15.902204657290092, + "grad_norm": 1.6512011806398539, + "learning_rate": 7.951410415375079e-07, + "loss": 0.8311, + "step": 205210 + }, + { + "epoch": 15.902979580766399, + "grad_norm": 1.6175878101339225, + "learning_rate": 7.951797892126473e-07, + "loss": 0.8672, + "step": 205220 + }, + { + "epoch": 15.903754504242706, + "grad_norm": 1.559241472258878, + "learning_rate": 7.952185368877868e-07, + "loss": 0.8818, + "step": 205230 + }, + { + "epoch": 15.904529427719012, + "grad_norm": 1.6726878180253306, + "learning_rate": 7.952572845629262e-07, + "loss": 0.8598, + "step": 205240 + }, + { + "epoch": 15.90530435119532, + "grad_norm": 1.6176489096077526, + "learning_rate": 7.952960322380658e-07, + "loss": 0.8736, + "step": 205250 + }, + { + "epoch": 15.906079274671626, + "grad_norm": 1.6102797081319329, + "learning_rate": 7.953347799132053e-07, + "loss": 0.8546, + "step": 205260 + }, + { + "epoch": 15.906854198147933, + "grad_norm": 1.5168605043028296, + "learning_rate": 7.953735275883448e-07, + "loss": 0.8501, + "step": 205270 + }, + { + "epoch": 15.90762912162424, + "grad_norm": 1.6718005682403663, + "learning_rate": 7.954122752634842e-07, + "loss": 0.8726, + "step": 205280 + }, + { + "epoch": 15.908404045100546, + "grad_norm": 1.574324320383556, + "learning_rate": 7.954510229386238e-07, + "loss": 0.8568, + "step": 205290 + }, + { + "epoch": 15.909178968576853, + "grad_norm": 1.6025800991421812, + "learning_rate": 7.954897706137632e-07, + "loss": 0.8627, + "step": 205300 + }, + { + "epoch": 15.90995389205316, + "grad_norm": 1.5115850539362128, + "learning_rate": 7.955285182889028e-07, + "loss": 0.8545, + "step": 205310 + }, + { + "epoch": 15.910728815529467, + "grad_norm": 1.5756432547249002, + "learning_rate": 7.955672659640422e-07, + "loss": 0.88, + "step": 205320 + }, + { + "epoch": 15.911503739005774, + "grad_norm": 1.6159080826695504, + "learning_rate": 7.956060136391817e-07, + "loss": 0.8734, + "step": 205330 + }, + { + "epoch": 15.91227866248208, + "grad_norm": 1.5518294276535545, + "learning_rate": 7.956447613143211e-07, + "loss": 0.8694, + "step": 205340 + }, + { + "epoch": 15.913053585958387, + "grad_norm": 1.6527306065241611, + "learning_rate": 7.956835089894608e-07, + "loss": 0.852, + "step": 205350 + }, + { + "epoch": 15.913828509434694, + "grad_norm": 1.6229172724080305, + "learning_rate": 7.957222566646002e-07, + "loss": 0.8713, + "step": 205360 + }, + { + "epoch": 15.914603432911, + "grad_norm": 1.6616728562450804, + "learning_rate": 7.957610043397397e-07, + "loss": 0.8632, + "step": 205370 + }, + { + "epoch": 15.915378356387308, + "grad_norm": 1.5495906932887065, + "learning_rate": 7.957997520148791e-07, + "loss": 0.8564, + "step": 205380 + }, + { + "epoch": 15.916153279863613, + "grad_norm": 1.5220683894964993, + "learning_rate": 7.958384996900187e-07, + "loss": 0.8632, + "step": 205390 + }, + { + "epoch": 15.91692820333992, + "grad_norm": 1.6833964446016858, + "learning_rate": 7.958772473651581e-07, + "loss": 0.8788, + "step": 205400 + }, + { + "epoch": 15.917703126816226, + "grad_norm": 1.525666872979974, + "learning_rate": 7.959159950402977e-07, + "loss": 0.8619, + "step": 205410 + }, + { + "epoch": 15.918478050292533, + "grad_norm": 1.5457721419116446, + "learning_rate": 7.959547427154371e-07, + "loss": 0.8497, + "step": 205420 + }, + { + "epoch": 15.91925297376884, + "grad_norm": 1.573571679027779, + "learning_rate": 7.959934903905766e-07, + "loss": 0.8448, + "step": 205430 + }, + { + "epoch": 15.920027897245147, + "grad_norm": 1.4879944155395344, + "learning_rate": 7.960322380657161e-07, + "loss": 0.8496, + "step": 205440 + }, + { + "epoch": 15.920802820721454, + "grad_norm": 1.5422140627000909, + "learning_rate": 7.960709857408557e-07, + "loss": 0.8698, + "step": 205450 + }, + { + "epoch": 15.92157774419776, + "grad_norm": 1.640838392188894, + "learning_rate": 7.961097334159951e-07, + "loss": 0.8608, + "step": 205460 + }, + { + "epoch": 15.922352667674067, + "grad_norm": 1.6036833196081661, + "learning_rate": 7.961484810911346e-07, + "loss": 0.8665, + "step": 205470 + }, + { + "epoch": 15.923127591150374, + "grad_norm": 1.539145101900545, + "learning_rate": 7.96187228766274e-07, + "loss": 0.855, + "step": 205480 + }, + { + "epoch": 15.92390251462668, + "grad_norm": 1.6345383362076082, + "learning_rate": 7.962259764414136e-07, + "loss": 0.862, + "step": 205490 + }, + { + "epoch": 15.924677438102988, + "grad_norm": 1.5809308527297676, + "learning_rate": 7.96264724116553e-07, + "loss": 0.8634, + "step": 205500 + }, + { + "epoch": 15.924677438102988, + "eval_loss": 0.8890607357025146, + "eval_runtime": 327.3529, + "eval_samples_per_second": 35.042, + "eval_steps_per_second": 8.761, + "step": 205500 + }, + { + "epoch": 15.925452361579294, + "grad_norm": 1.4657653489697167, + "learning_rate": 7.963034717916926e-07, + "loss": 0.8724, + "step": 205510 + }, + { + "epoch": 15.926227285055601, + "grad_norm": 1.5416562756048993, + "learning_rate": 7.96342219466832e-07, + "loss": 0.8641, + "step": 205520 + }, + { + "epoch": 15.927002208531908, + "grad_norm": 1.584488932508413, + "learning_rate": 7.963809671419716e-07, + "loss": 0.8581, + "step": 205530 + }, + { + "epoch": 15.927777132008215, + "grad_norm": 1.5272924043568923, + "learning_rate": 7.96419714817111e-07, + "loss": 0.8589, + "step": 205540 + }, + { + "epoch": 15.928552055484522, + "grad_norm": 1.6135194592530078, + "learning_rate": 7.964584624922506e-07, + "loss": 0.8709, + "step": 205550 + }, + { + "epoch": 15.929326978960827, + "grad_norm": 1.5687617364058128, + "learning_rate": 7.9649721016739e-07, + "loss": 0.8745, + "step": 205560 + }, + { + "epoch": 15.930101902437134, + "grad_norm": 1.5455054587672243, + "learning_rate": 7.965359578425295e-07, + "loss": 0.8572, + "step": 205570 + }, + { + "epoch": 15.93087682591344, + "grad_norm": 1.5461858678993197, + "learning_rate": 7.96574705517669e-07, + "loss": 0.8662, + "step": 205580 + }, + { + "epoch": 15.931651749389747, + "grad_norm": 1.523149770476267, + "learning_rate": 7.966134531928085e-07, + "loss": 0.8811, + "step": 205590 + }, + { + "epoch": 15.932426672866054, + "grad_norm": 1.464501825069899, + "learning_rate": 7.96652200867948e-07, + "loss": 0.875, + "step": 205600 + }, + { + "epoch": 15.93320159634236, + "grad_norm": 1.6439863666509644, + "learning_rate": 7.966909485430875e-07, + "loss": 0.8654, + "step": 205610 + }, + { + "epoch": 15.933976519818668, + "grad_norm": 1.4561641172015933, + "learning_rate": 7.967296962182269e-07, + "loss": 0.8558, + "step": 205620 + }, + { + "epoch": 15.934751443294974, + "grad_norm": 1.6597597827450106, + "learning_rate": 7.967684438933665e-07, + "loss": 0.8771, + "step": 205630 + }, + { + "epoch": 15.935526366771281, + "grad_norm": 1.522523213629012, + "learning_rate": 7.968071915685059e-07, + "loss": 0.8431, + "step": 205640 + }, + { + "epoch": 15.936301290247588, + "grad_norm": 1.598490821488874, + "learning_rate": 7.968459392436455e-07, + "loss": 0.8567, + "step": 205650 + }, + { + "epoch": 15.937076213723895, + "grad_norm": 1.8151389066007704, + "learning_rate": 7.968846869187849e-07, + "loss": 0.867, + "step": 205660 + }, + { + "epoch": 15.937851137200202, + "grad_norm": 1.516681090429295, + "learning_rate": 7.969234345939245e-07, + "loss": 0.8657, + "step": 205670 + }, + { + "epoch": 15.938626060676508, + "grad_norm": 1.5043923438647355, + "learning_rate": 7.969621822690639e-07, + "loss": 0.8586, + "step": 205680 + }, + { + "epoch": 15.939400984152815, + "grad_norm": 1.584058371563046, + "learning_rate": 7.970009299442034e-07, + "loss": 0.8527, + "step": 205690 + }, + { + "epoch": 15.940175907629122, + "grad_norm": 1.5625326352968407, + "learning_rate": 7.970396776193429e-07, + "loss": 0.8689, + "step": 205700 + }, + { + "epoch": 15.940950831105429, + "grad_norm": 1.5827305852929066, + "learning_rate": 7.970784252944824e-07, + "loss": 0.8806, + "step": 205710 + }, + { + "epoch": 15.941725754581736, + "grad_norm": 1.5754703630967724, + "learning_rate": 7.971171729696219e-07, + "loss": 0.8588, + "step": 205720 + }, + { + "epoch": 15.942500678058042, + "grad_norm": 1.523264791828677, + "learning_rate": 7.971559206447614e-07, + "loss": 0.8476, + "step": 205730 + }, + { + "epoch": 15.94327560153435, + "grad_norm": 1.5741823141547995, + "learning_rate": 7.971946683199008e-07, + "loss": 0.8722, + "step": 205740 + }, + { + "epoch": 15.944050525010656, + "grad_norm": 1.5648396611544473, + "learning_rate": 7.972334159950404e-07, + "loss": 0.8511, + "step": 205750 + }, + { + "epoch": 15.944825448486961, + "grad_norm": 1.5932182745203844, + "learning_rate": 7.972721636701798e-07, + "loss": 0.8628, + "step": 205760 + }, + { + "epoch": 15.945600371963268, + "grad_norm": 1.7326193765559559, + "learning_rate": 7.973109113453194e-07, + "loss": 0.8698, + "step": 205770 + }, + { + "epoch": 15.946375295439575, + "grad_norm": 1.5864375539826538, + "learning_rate": 7.973496590204588e-07, + "loss": 0.8637, + "step": 205780 + }, + { + "epoch": 15.947150218915882, + "grad_norm": 1.4577750198218309, + "learning_rate": 7.973884066955983e-07, + "loss": 0.8567, + "step": 205790 + }, + { + "epoch": 15.947925142392188, + "grad_norm": 1.6601130019984842, + "learning_rate": 7.974271543707378e-07, + "loss": 0.8703, + "step": 205800 + }, + { + "epoch": 15.948700065868495, + "grad_norm": 1.5383888474774332, + "learning_rate": 7.974659020458774e-07, + "loss": 0.8565, + "step": 205810 + }, + { + "epoch": 15.949474989344802, + "grad_norm": 1.5610928665590533, + "learning_rate": 7.975046497210168e-07, + "loss": 0.8436, + "step": 205820 + }, + { + "epoch": 15.950249912821109, + "grad_norm": 1.63993384876226, + "learning_rate": 7.975433973961563e-07, + "loss": 0.8491, + "step": 205830 + }, + { + "epoch": 15.951024836297416, + "grad_norm": 1.5372642965917502, + "learning_rate": 7.975821450712957e-07, + "loss": 0.8567, + "step": 205840 + }, + { + "epoch": 15.951799759773722, + "grad_norm": 1.4935750402521109, + "learning_rate": 7.976208927464353e-07, + "loss": 0.8419, + "step": 205850 + }, + { + "epoch": 15.95257468325003, + "grad_norm": 1.5121071976095055, + "learning_rate": 7.976596404215748e-07, + "loss": 0.8688, + "step": 205860 + }, + { + "epoch": 15.953349606726336, + "grad_norm": 1.4835003488051917, + "learning_rate": 7.976983880967143e-07, + "loss": 0.8714, + "step": 205870 + }, + { + "epoch": 15.954124530202643, + "grad_norm": 1.5198233370248344, + "learning_rate": 7.977371357718537e-07, + "loss": 0.8689, + "step": 205880 + }, + { + "epoch": 15.95489945367895, + "grad_norm": 1.6050967660345332, + "learning_rate": 7.977758834469932e-07, + "loss": 0.8763, + "step": 205890 + }, + { + "epoch": 15.955674377155256, + "grad_norm": 1.5181317210389276, + "learning_rate": 7.978146311221327e-07, + "loss": 0.8627, + "step": 205900 + }, + { + "epoch": 15.956449300631563, + "grad_norm": 1.649100741652708, + "learning_rate": 7.978533787972723e-07, + "loss": 0.8437, + "step": 205910 + }, + { + "epoch": 15.95722422410787, + "grad_norm": 1.5966211186530712, + "learning_rate": 7.978921264724117e-07, + "loss": 0.8549, + "step": 205920 + }, + { + "epoch": 15.957999147584175, + "grad_norm": 1.4352287774584016, + "learning_rate": 7.979308741475512e-07, + "loss": 0.8498, + "step": 205930 + }, + { + "epoch": 15.958774071060482, + "grad_norm": 1.5004887235962983, + "learning_rate": 7.979696218226906e-07, + "loss": 0.8802, + "step": 205940 + }, + { + "epoch": 15.959548994536789, + "grad_norm": 1.5199970885750969, + "learning_rate": 7.980083694978303e-07, + "loss": 0.8666, + "step": 205950 + }, + { + "epoch": 15.960323918013096, + "grad_norm": 1.5962641046703323, + "learning_rate": 7.980471171729697e-07, + "loss": 0.8533, + "step": 205960 + }, + { + "epoch": 15.961098841489402, + "grad_norm": 1.6042314113734433, + "learning_rate": 7.980858648481092e-07, + "loss": 0.8533, + "step": 205970 + }, + { + "epoch": 15.96187376496571, + "grad_norm": 1.6340655640057025, + "learning_rate": 7.981246125232486e-07, + "loss": 0.8705, + "step": 205980 + }, + { + "epoch": 15.962648688442016, + "grad_norm": 1.548349558276009, + "learning_rate": 7.981633601983881e-07, + "loss": 0.8607, + "step": 205990 + }, + { + "epoch": 15.963423611918323, + "grad_norm": 1.5579042934974012, + "learning_rate": 7.982021078735276e-07, + "loss": 0.8555, + "step": 206000 + }, + { + "epoch": 15.963423611918323, + "eval_loss": 0.8893323540687561, + "eval_runtime": 327.9683, + "eval_samples_per_second": 34.976, + "eval_steps_per_second": 8.745, + "step": 206000 + }, + { + "epoch": 15.96419853539463, + "grad_norm": 1.4821012667750322, + "learning_rate": 7.982408555486672e-07, + "loss": 0.8625, + "step": 206010 + }, + { + "epoch": 15.964973458870936, + "grad_norm": 1.5966152146758685, + "learning_rate": 7.982796032238066e-07, + "loss": 0.8581, + "step": 206020 + }, + { + "epoch": 15.965748382347243, + "grad_norm": 1.5132366354818092, + "learning_rate": 7.983183508989461e-07, + "loss": 0.8811, + "step": 206030 + }, + { + "epoch": 15.96652330582355, + "grad_norm": 1.7259874441331398, + "learning_rate": 7.983570985740855e-07, + "loss": 0.8365, + "step": 206040 + }, + { + "epoch": 15.967298229299857, + "grad_norm": 1.4783814124406027, + "learning_rate": 7.983958462492252e-07, + "loss": 0.8782, + "step": 206050 + }, + { + "epoch": 15.968073152776164, + "grad_norm": 1.5389524684511222, + "learning_rate": 7.984345939243646e-07, + "loss": 0.853, + "step": 206060 + }, + { + "epoch": 15.96884807625247, + "grad_norm": 1.4567290803589532, + "learning_rate": 7.984733415995041e-07, + "loss": 0.8591, + "step": 206070 + }, + { + "epoch": 15.969622999728777, + "grad_norm": 1.5313575799231138, + "learning_rate": 7.985120892746435e-07, + "loss": 0.8394, + "step": 206080 + }, + { + "epoch": 15.970397923205084, + "grad_norm": 1.5690763617215653, + "learning_rate": 7.985508369497832e-07, + "loss": 0.8478, + "step": 206090 + }, + { + "epoch": 15.971172846681391, + "grad_norm": 1.5049357806164283, + "learning_rate": 7.985895846249226e-07, + "loss": 0.8663, + "step": 206100 + }, + { + "epoch": 15.971947770157698, + "grad_norm": 1.4608491982953533, + "learning_rate": 7.986283323000621e-07, + "loss": 0.8673, + "step": 206110 + }, + { + "epoch": 15.972722693634005, + "grad_norm": 1.5019693400641496, + "learning_rate": 7.986670799752015e-07, + "loss": 0.8635, + "step": 206120 + }, + { + "epoch": 15.97349761711031, + "grad_norm": 1.4921566374374418, + "learning_rate": 7.98705827650341e-07, + "loss": 0.8431, + "step": 206130 + }, + { + "epoch": 15.974272540586616, + "grad_norm": 1.5861924905530886, + "learning_rate": 7.987445753254804e-07, + "loss": 0.8601, + "step": 206140 + }, + { + "epoch": 15.975047464062923, + "grad_norm": 1.5293909592629955, + "learning_rate": 7.987833230006201e-07, + "loss": 0.8716, + "step": 206150 + }, + { + "epoch": 15.97582238753923, + "grad_norm": 1.5800902444272378, + "learning_rate": 7.988220706757595e-07, + "loss": 0.8465, + "step": 206160 + }, + { + "epoch": 15.976597311015537, + "grad_norm": 1.580638619600741, + "learning_rate": 7.98860818350899e-07, + "loss": 0.8287, + "step": 206170 + }, + { + "epoch": 15.977372234491844, + "grad_norm": 1.520561739494198, + "learning_rate": 7.988995660260384e-07, + "loss": 0.8518, + "step": 206180 + }, + { + "epoch": 15.97814715796815, + "grad_norm": 1.5486474468512108, + "learning_rate": 7.989383137011781e-07, + "loss": 0.8468, + "step": 206190 + }, + { + "epoch": 15.978922081444457, + "grad_norm": 1.5388562575386673, + "learning_rate": 7.989770613763175e-07, + "loss": 0.8655, + "step": 206200 + }, + { + "epoch": 15.979697004920764, + "grad_norm": 1.5564465116509774, + "learning_rate": 7.99015809051457e-07, + "loss": 0.8782, + "step": 206210 + }, + { + "epoch": 15.98047192839707, + "grad_norm": 1.4792476649221264, + "learning_rate": 7.990545567265964e-07, + "loss": 0.8572, + "step": 206220 + }, + { + "epoch": 15.981246851873378, + "grad_norm": 1.5485348972766761, + "learning_rate": 7.99093304401736e-07, + "loss": 0.841, + "step": 206230 + }, + { + "epoch": 15.982021775349684, + "grad_norm": 1.5292270876533058, + "learning_rate": 7.991320520768755e-07, + "loss": 0.8602, + "step": 206240 + }, + { + "epoch": 15.982796698825991, + "grad_norm": 1.410372428018099, + "learning_rate": 7.99170799752015e-07, + "loss": 0.8529, + "step": 206250 + }, + { + "epoch": 15.983571622302298, + "grad_norm": 1.5494281353475197, + "learning_rate": 7.992095474271544e-07, + "loss": 0.859, + "step": 206260 + }, + { + "epoch": 15.984346545778605, + "grad_norm": 1.6360183217816662, + "learning_rate": 7.992482951022939e-07, + "loss": 0.8698, + "step": 206270 + }, + { + "epoch": 15.985121469254912, + "grad_norm": 1.5261315084938636, + "learning_rate": 7.992870427774333e-07, + "loss": 0.8661, + "step": 206280 + }, + { + "epoch": 15.985896392731219, + "grad_norm": 1.639305946018993, + "learning_rate": 7.99325790452573e-07, + "loss": 0.8522, + "step": 206290 + }, + { + "epoch": 15.986671316207524, + "grad_norm": 1.4776385082870174, + "learning_rate": 7.993645381277124e-07, + "loss": 0.8563, + "step": 206300 + }, + { + "epoch": 15.98744623968383, + "grad_norm": 1.584714739346496, + "learning_rate": 7.994032858028519e-07, + "loss": 0.8527, + "step": 206310 + }, + { + "epoch": 15.988221163160137, + "grad_norm": 1.555504632094309, + "learning_rate": 7.994420334779913e-07, + "loss": 0.8736, + "step": 206320 + }, + { + "epoch": 15.988996086636444, + "grad_norm": 1.5301002253363454, + "learning_rate": 7.994807811531309e-07, + "loss": 0.8634, + "step": 206330 + }, + { + "epoch": 15.98977101011275, + "grad_norm": 1.594933595447038, + "learning_rate": 7.995195288282704e-07, + "loss": 0.8515, + "step": 206340 + }, + { + "epoch": 15.990545933589058, + "grad_norm": 1.5476435083595728, + "learning_rate": 7.995582765034099e-07, + "loss": 0.8453, + "step": 206350 + }, + { + "epoch": 15.991320857065364, + "grad_norm": 1.4918445124927078, + "learning_rate": 7.995970241785493e-07, + "loss": 0.8582, + "step": 206360 + }, + { + "epoch": 15.992095780541671, + "grad_norm": 1.6039365948457511, + "learning_rate": 7.996357718536889e-07, + "loss": 0.8581, + "step": 206370 + }, + { + "epoch": 15.992870704017978, + "grad_norm": 1.6083423243638146, + "learning_rate": 7.996745195288283e-07, + "loss": 0.8593, + "step": 206380 + }, + { + "epoch": 15.993645627494285, + "grad_norm": 1.5661455921177019, + "learning_rate": 7.997132672039679e-07, + "loss": 0.8491, + "step": 206390 + }, + { + "epoch": 15.994420550970592, + "grad_norm": 1.5354903770417532, + "learning_rate": 7.997520148791073e-07, + "loss": 0.8515, + "step": 206400 + }, + { + "epoch": 15.995195474446898, + "grad_norm": 1.4685438863164635, + "learning_rate": 7.997907625542468e-07, + "loss": 0.8555, + "step": 206410 + }, + { + "epoch": 15.995970397923205, + "grad_norm": 1.6782972541949772, + "learning_rate": 7.998295102293862e-07, + "loss": 0.8716, + "step": 206420 + }, + { + "epoch": 15.996745321399512, + "grad_norm": 1.548841331221358, + "learning_rate": 7.998682579045258e-07, + "loss": 0.8365, + "step": 206430 + }, + { + "epoch": 15.997520244875819, + "grad_norm": 1.5672505289247125, + "learning_rate": 7.999070055796653e-07, + "loss": 0.858, + "step": 206440 + }, + { + "epoch": 15.998295168352126, + "grad_norm": 1.5754309594571978, + "learning_rate": 7.999457532548048e-07, + "loss": 0.8806, + "step": 206450 + }, + { + "epoch": 15.999070091828433, + "grad_norm": 1.5675441230628326, + "learning_rate": 7.999845009299442e-07, + "loss": 0.8518, + "step": 206460 + }, + { + "epoch": 15.99984501530474, + "grad_norm": 1.7237623443117276, + "learning_rate": 8.000232486050838e-07, + "loss": 0.8589, + "step": 206470 + }, + { + "epoch": 16.000619938781046, + "grad_norm": 1.4700604820717762, + "learning_rate": 8.000619962802232e-07, + "loss": 0.8467, + "step": 206480 + }, + { + "epoch": 16.001394862257353, + "grad_norm": 1.5807587313841525, + "learning_rate": 8.001007439553628e-07, + "loss": 0.8304, + "step": 206490 + }, + { + "epoch": 16.00216978573366, + "grad_norm": 1.6080482571582295, + "learning_rate": 8.001394916305022e-07, + "loss": 0.8508, + "step": 206500 + }, + { + "epoch": 16.00216978573366, + "eval_loss": 0.8904383182525635, + "eval_runtime": 329.3275, + "eval_samples_per_second": 34.832, + "eval_steps_per_second": 8.709, + "step": 206500 + }, + { + "epoch": 16.002944709209967, + "grad_norm": 1.5109827279806405, + "learning_rate": 8.001782393056418e-07, + "loss": 0.8747, + "step": 206510 + }, + { + "epoch": 16.003719632686273, + "grad_norm": 1.5810460476679755, + "learning_rate": 8.002169869807812e-07, + "loss": 0.8751, + "step": 206520 + }, + { + "epoch": 16.00449455616258, + "grad_norm": 1.592911091132815, + "learning_rate": 8.002557346559207e-07, + "loss": 0.8527, + "step": 206530 + }, + { + "epoch": 16.005269479638887, + "grad_norm": 1.5413529490497266, + "learning_rate": 8.002944823310602e-07, + "loss": 0.8806, + "step": 206540 + }, + { + "epoch": 16.006044403115194, + "grad_norm": 1.5498483238289884, + "learning_rate": 8.003332300061997e-07, + "loss": 0.8515, + "step": 206550 + }, + { + "epoch": 16.0068193265915, + "grad_norm": 1.614910959965292, + "learning_rate": 8.003719776813391e-07, + "loss": 0.8473, + "step": 206560 + }, + { + "epoch": 16.007594250067807, + "grad_norm": 1.5906782922835825, + "learning_rate": 8.004107253564787e-07, + "loss": 0.8508, + "step": 206570 + }, + { + "epoch": 16.008369173544114, + "grad_norm": 1.6821156668500103, + "learning_rate": 8.004494730316181e-07, + "loss": 0.8581, + "step": 206580 + }, + { + "epoch": 16.009144097020418, + "grad_norm": 1.5560428494704768, + "learning_rate": 8.004882207067577e-07, + "loss": 0.8423, + "step": 206590 + }, + { + "epoch": 16.009919020496724, + "grad_norm": 1.6413708487831131, + "learning_rate": 8.005269683818971e-07, + "loss": 0.8702, + "step": 206600 + }, + { + "epoch": 16.01069394397303, + "grad_norm": 1.5538285767413398, + "learning_rate": 8.005657160570367e-07, + "loss": 0.8522, + "step": 206610 + }, + { + "epoch": 16.011468867449338, + "grad_norm": 1.5205117236677714, + "learning_rate": 8.006044637321761e-07, + "loss": 0.8639, + "step": 206620 + }, + { + "epoch": 16.012243790925645, + "grad_norm": 1.5403459154689763, + "learning_rate": 8.006432114073156e-07, + "loss": 0.8628, + "step": 206630 + }, + { + "epoch": 16.01301871440195, + "grad_norm": 1.510122621723228, + "learning_rate": 8.006819590824551e-07, + "loss": 0.8681, + "step": 206640 + }, + { + "epoch": 16.01379363787826, + "grad_norm": 1.6051954993748463, + "learning_rate": 8.007207067575947e-07, + "loss": 0.8403, + "step": 206650 + }, + { + "epoch": 16.014568561354565, + "grad_norm": 1.5577247979167876, + "learning_rate": 8.007594544327341e-07, + "loss": 0.8524, + "step": 206660 + }, + { + "epoch": 16.015343484830872, + "grad_norm": 1.5853453135822404, + "learning_rate": 8.007982021078736e-07, + "loss": 0.8414, + "step": 206670 + }, + { + "epoch": 16.01611840830718, + "grad_norm": 1.5405397776302983, + "learning_rate": 8.00836949783013e-07, + "loss": 0.8523, + "step": 206680 + }, + { + "epoch": 16.016893331783486, + "grad_norm": 1.593811078484619, + "learning_rate": 8.008756974581526e-07, + "loss": 0.8529, + "step": 206690 + }, + { + "epoch": 16.017668255259792, + "grad_norm": 1.4837955386265311, + "learning_rate": 8.00914445133292e-07, + "loss": 0.8452, + "step": 206700 + }, + { + "epoch": 16.0184431787361, + "grad_norm": 1.663071106666621, + "learning_rate": 8.009531928084316e-07, + "loss": 0.8552, + "step": 206710 + }, + { + "epoch": 16.019218102212406, + "grad_norm": 1.5609616883784512, + "learning_rate": 8.00991940483571e-07, + "loss": 0.8358, + "step": 206720 + }, + { + "epoch": 16.019993025688713, + "grad_norm": 1.8209124567478838, + "learning_rate": 8.010306881587105e-07, + "loss": 0.8719, + "step": 206730 + }, + { + "epoch": 16.02076794916502, + "grad_norm": 1.6483409240496452, + "learning_rate": 8.0106943583385e-07, + "loss": 0.8538, + "step": 206740 + }, + { + "epoch": 16.021542872641326, + "grad_norm": 1.4949826030229518, + "learning_rate": 8.011081835089896e-07, + "loss": 0.8432, + "step": 206750 + }, + { + "epoch": 16.022317796117633, + "grad_norm": 1.6581837702184234, + "learning_rate": 8.01146931184129e-07, + "loss": 0.857, + "step": 206760 + }, + { + "epoch": 16.02309271959394, + "grad_norm": 1.4659495309920003, + "learning_rate": 8.011856788592685e-07, + "loss": 0.853, + "step": 206770 + }, + { + "epoch": 16.023867643070247, + "grad_norm": 1.6600044257565856, + "learning_rate": 8.01224426534408e-07, + "loss": 0.8467, + "step": 206780 + }, + { + "epoch": 16.024642566546554, + "grad_norm": 1.5108486122549432, + "learning_rate": 8.012631742095476e-07, + "loss": 0.8613, + "step": 206790 + }, + { + "epoch": 16.02541749002286, + "grad_norm": 1.5989425664573398, + "learning_rate": 8.01301921884687e-07, + "loss": 0.8574, + "step": 206800 + }, + { + "epoch": 16.026192413499167, + "grad_norm": 1.7137724559868024, + "learning_rate": 8.013406695598265e-07, + "loss": 0.8331, + "step": 206810 + }, + { + "epoch": 16.026967336975474, + "grad_norm": 1.5824454643003192, + "learning_rate": 8.013794172349659e-07, + "loss": 0.8589, + "step": 206820 + }, + { + "epoch": 16.02774226045178, + "grad_norm": 1.6489328059170527, + "learning_rate": 8.014181649101055e-07, + "loss": 0.8554, + "step": 206830 + }, + { + "epoch": 16.028517183928088, + "grad_norm": 1.632099952158966, + "learning_rate": 8.014569125852449e-07, + "loss": 0.8504, + "step": 206840 + }, + { + "epoch": 16.029292107404395, + "grad_norm": 1.5802640479485994, + "learning_rate": 8.014956602603845e-07, + "loss": 0.8723, + "step": 206850 + }, + { + "epoch": 16.0300670308807, + "grad_norm": 1.5512221182812167, + "learning_rate": 8.015344079355239e-07, + "loss": 0.8478, + "step": 206860 + }, + { + "epoch": 16.030841954357008, + "grad_norm": 1.6990678556463275, + "learning_rate": 8.015731556106634e-07, + "loss": 0.8679, + "step": 206870 + }, + { + "epoch": 16.031616877833315, + "grad_norm": 1.602419725703477, + "learning_rate": 8.016119032858028e-07, + "loss": 0.8538, + "step": 206880 + }, + { + "epoch": 16.032391801309622, + "grad_norm": 1.5376600979185455, + "learning_rate": 8.016506509609425e-07, + "loss": 0.8542, + "step": 206890 + }, + { + "epoch": 16.03316672478593, + "grad_norm": 1.5591824948419937, + "learning_rate": 8.016893986360819e-07, + "loss": 0.8571, + "step": 206900 + }, + { + "epoch": 16.033941648262235, + "grad_norm": 1.529752569120272, + "learning_rate": 8.017281463112214e-07, + "loss": 0.8488, + "step": 206910 + }, + { + "epoch": 16.034716571738542, + "grad_norm": 1.569480991211493, + "learning_rate": 8.017668939863608e-07, + "loss": 0.8555, + "step": 206920 + }, + { + "epoch": 16.03549149521485, + "grad_norm": 1.7198765456737215, + "learning_rate": 8.018056416615004e-07, + "loss": 0.8549, + "step": 206930 + }, + { + "epoch": 16.036266418691156, + "grad_norm": 1.585096221559163, + "learning_rate": 8.018443893366399e-07, + "loss": 0.8508, + "step": 206940 + }, + { + "epoch": 16.037041342167463, + "grad_norm": 1.5671089512301195, + "learning_rate": 8.018831370117794e-07, + "loss": 0.8474, + "step": 206950 + }, + { + "epoch": 16.037816265643766, + "grad_norm": 1.6263955083363797, + "learning_rate": 8.019218846869188e-07, + "loss": 0.8605, + "step": 206960 + }, + { + "epoch": 16.038591189120073, + "grad_norm": 1.5390844051593557, + "learning_rate": 8.019606323620583e-07, + "loss": 0.8491, + "step": 206970 + }, + { + "epoch": 16.03936611259638, + "grad_norm": 1.6205800554678618, + "learning_rate": 8.019993800371978e-07, + "loss": 0.8608, + "step": 206980 + }, + { + "epoch": 16.040141036072686, + "grad_norm": 1.6293317781817167, + "learning_rate": 8.020381277123374e-07, + "loss": 0.8499, + "step": 206990 + }, + { + "epoch": 16.040915959548993, + "grad_norm": 1.6098079283025701, + "learning_rate": 8.020768753874768e-07, + "loss": 0.8658, + "step": 207000 + }, + { + "epoch": 16.040915959548993, + "eval_loss": 0.8907023072242737, + "eval_runtime": 328.5435, + "eval_samples_per_second": 34.915, + "eval_steps_per_second": 8.729, + "step": 207000 + }, + { + "epoch": 16.0416908830253, + "grad_norm": 1.7260948818437853, + "learning_rate": 8.021156230626163e-07, + "loss": 0.8565, + "step": 207010 + }, + { + "epoch": 16.042465806501607, + "grad_norm": 1.511839167817736, + "learning_rate": 8.021543707377557e-07, + "loss": 0.8502, + "step": 207020 + }, + { + "epoch": 16.043240729977914, + "grad_norm": 1.612214200847088, + "learning_rate": 8.021931184128954e-07, + "loss": 0.8505, + "step": 207030 + }, + { + "epoch": 16.04401565345422, + "grad_norm": 1.5170086859140512, + "learning_rate": 8.022318660880348e-07, + "loss": 0.8454, + "step": 207040 + }, + { + "epoch": 16.044790576930527, + "grad_norm": 1.7153631799934252, + "learning_rate": 8.022706137631743e-07, + "loss": 0.8611, + "step": 207050 + }, + { + "epoch": 16.045565500406834, + "grad_norm": 1.670235826489259, + "learning_rate": 8.023093614383137e-07, + "loss": 0.8487, + "step": 207060 + }, + { + "epoch": 16.04634042388314, + "grad_norm": 1.4917357008224637, + "learning_rate": 8.023481091134532e-07, + "loss": 0.8476, + "step": 207070 + }, + { + "epoch": 16.047115347359448, + "grad_norm": 1.6702711089845015, + "learning_rate": 8.023868567885928e-07, + "loss": 0.8574, + "step": 207080 + }, + { + "epoch": 16.047890270835754, + "grad_norm": 1.6768012045074516, + "learning_rate": 8.024256044637323e-07, + "loss": 0.8666, + "step": 207090 + }, + { + "epoch": 16.04866519431206, + "grad_norm": 1.638920434850963, + "learning_rate": 8.024643521388717e-07, + "loss": 0.8709, + "step": 207100 + }, + { + "epoch": 16.049440117788368, + "grad_norm": 1.505801848429678, + "learning_rate": 8.025030998140112e-07, + "loss": 0.8521, + "step": 207110 + }, + { + "epoch": 16.050215041264675, + "grad_norm": 1.6057950836413328, + "learning_rate": 8.025418474891506e-07, + "loss": 0.8738, + "step": 207120 + }, + { + "epoch": 16.05098996474098, + "grad_norm": 1.5641179613971783, + "learning_rate": 8.025805951642903e-07, + "loss": 0.8517, + "step": 207130 + }, + { + "epoch": 16.05176488821729, + "grad_norm": 1.702456242446204, + "learning_rate": 8.026193428394297e-07, + "loss": 0.8512, + "step": 207140 + }, + { + "epoch": 16.052539811693595, + "grad_norm": 1.6680354979777934, + "learning_rate": 8.026580905145692e-07, + "loss": 0.8676, + "step": 207150 + }, + { + "epoch": 16.053314735169902, + "grad_norm": 1.4675876506571546, + "learning_rate": 8.026968381897086e-07, + "loss": 0.8518, + "step": 207160 + }, + { + "epoch": 16.05408965864621, + "grad_norm": 1.5981078420906685, + "learning_rate": 8.027355858648482e-07, + "loss": 0.861, + "step": 207170 + }, + { + "epoch": 16.054864582122516, + "grad_norm": 1.600844375642704, + "learning_rate": 8.027743335399877e-07, + "loss": 0.8589, + "step": 207180 + }, + { + "epoch": 16.055639505598823, + "grad_norm": 1.498587934795176, + "learning_rate": 8.028130812151272e-07, + "loss": 0.8352, + "step": 207190 + }, + { + "epoch": 16.05641442907513, + "grad_norm": 1.6187141688891222, + "learning_rate": 8.028518288902666e-07, + "loss": 0.8342, + "step": 207200 + }, + { + "epoch": 16.057189352551436, + "grad_norm": 1.6067205209776574, + "learning_rate": 8.028905765654061e-07, + "loss": 0.8511, + "step": 207210 + }, + { + "epoch": 16.057964276027743, + "grad_norm": 1.6308608944152538, + "learning_rate": 8.029293242405456e-07, + "loss": 0.8433, + "step": 207220 + }, + { + "epoch": 16.05873919950405, + "grad_norm": 1.6653522793279245, + "learning_rate": 8.029680719156852e-07, + "loss": 0.8334, + "step": 207230 + }, + { + "epoch": 16.059514122980357, + "grad_norm": 1.6578120173314217, + "learning_rate": 8.030068195908246e-07, + "loss": 0.8441, + "step": 207240 + }, + { + "epoch": 16.060289046456663, + "grad_norm": 1.74014487874449, + "learning_rate": 8.030455672659641e-07, + "loss": 0.8566, + "step": 207250 + }, + { + "epoch": 16.06106396993297, + "grad_norm": 1.609685120908604, + "learning_rate": 8.030843149411035e-07, + "loss": 0.8384, + "step": 207260 + }, + { + "epoch": 16.061838893409277, + "grad_norm": 1.6194474288646994, + "learning_rate": 8.031230626162431e-07, + "loss": 0.8436, + "step": 207270 + }, + { + "epoch": 16.062613816885584, + "grad_norm": 1.6059672884430756, + "learning_rate": 8.031618102913826e-07, + "loss": 0.8554, + "step": 207280 + }, + { + "epoch": 16.06338874036189, + "grad_norm": 1.5630059881427998, + "learning_rate": 8.032005579665221e-07, + "loss": 0.8455, + "step": 207290 + }, + { + "epoch": 16.064163663838197, + "grad_norm": 1.6723421500502256, + "learning_rate": 8.032393056416615e-07, + "loss": 0.8372, + "step": 207300 + }, + { + "epoch": 16.064938587314504, + "grad_norm": 1.569786923419821, + "learning_rate": 8.032780533168011e-07, + "loss": 0.8617, + "step": 207310 + }, + { + "epoch": 16.06571351079081, + "grad_norm": 1.7456108843881155, + "learning_rate": 8.033168009919405e-07, + "loss": 0.8499, + "step": 207320 + }, + { + "epoch": 16.066488434267114, + "grad_norm": 1.5519053653301316, + "learning_rate": 8.033555486670801e-07, + "loss": 0.8524, + "step": 207330 + }, + { + "epoch": 16.06726335774342, + "grad_norm": 1.6055871442047294, + "learning_rate": 8.033942963422195e-07, + "loss": 0.8366, + "step": 207340 + }, + { + "epoch": 16.068038281219728, + "grad_norm": 1.6670138154761827, + "learning_rate": 8.03433044017359e-07, + "loss": 0.8556, + "step": 207350 + }, + { + "epoch": 16.068813204696035, + "grad_norm": 1.650314833433548, + "learning_rate": 8.034717916924985e-07, + "loss": 0.8358, + "step": 207360 + }, + { + "epoch": 16.06958812817234, + "grad_norm": 1.525456499220059, + "learning_rate": 8.03510539367638e-07, + "loss": 0.8395, + "step": 207370 + }, + { + "epoch": 16.07036305164865, + "grad_norm": 1.7014582859435958, + "learning_rate": 8.035492870427775e-07, + "loss": 0.8559, + "step": 207380 + }, + { + "epoch": 16.071137975124955, + "grad_norm": 1.5798250817941688, + "learning_rate": 8.03588034717917e-07, + "loss": 0.8529, + "step": 207390 + }, + { + "epoch": 16.071912898601262, + "grad_norm": 1.6552870990183217, + "learning_rate": 8.036267823930564e-07, + "loss": 0.8523, + "step": 207400 + }, + { + "epoch": 16.07268782207757, + "grad_norm": 1.5831553556546503, + "learning_rate": 8.03665530068196e-07, + "loss": 0.8594, + "step": 207410 + }, + { + "epoch": 16.073462745553876, + "grad_norm": 1.6278600327814599, + "learning_rate": 8.037042777433354e-07, + "loss": 0.8548, + "step": 207420 + }, + { + "epoch": 16.074237669030182, + "grad_norm": 1.5501523568418518, + "learning_rate": 8.03743025418475e-07, + "loss": 0.8629, + "step": 207430 + }, + { + "epoch": 16.07501259250649, + "grad_norm": 1.5578184682179788, + "learning_rate": 8.037817730936144e-07, + "loss": 0.8606, + "step": 207440 + }, + { + "epoch": 16.075787515982796, + "grad_norm": 1.6498367654465325, + "learning_rate": 8.03820520768754e-07, + "loss": 0.8183, + "step": 207450 + }, + { + "epoch": 16.076562439459103, + "grad_norm": 1.5791685702019118, + "learning_rate": 8.038592684438934e-07, + "loss": 0.8437, + "step": 207460 + }, + { + "epoch": 16.07733736293541, + "grad_norm": 1.6432343645335619, + "learning_rate": 8.03898016119033e-07, + "loss": 0.8578, + "step": 207470 + }, + { + "epoch": 16.078112286411717, + "grad_norm": 1.5565821769325119, + "learning_rate": 8.039367637941724e-07, + "loss": 0.8548, + "step": 207480 + }, + { + "epoch": 16.078887209888023, + "grad_norm": 1.5318495389062876, + "learning_rate": 8.039755114693119e-07, + "loss": 0.8393, + "step": 207490 + }, + { + "epoch": 16.07966213336433, + "grad_norm": 1.609253213077234, + "learning_rate": 8.040142591444513e-07, + "loss": 0.843, + "step": 207500 + }, + { + "epoch": 16.07966213336433, + "eval_loss": 0.890404224395752, + "eval_runtime": 328.9768, + "eval_samples_per_second": 34.869, + "eval_steps_per_second": 8.718, + "step": 207500 + }, + { + "epoch": 16.080437056840637, + "grad_norm": 1.5762432505892472, + "learning_rate": 8.040530068195909e-07, + "loss": 0.8524, + "step": 207510 + }, + { + "epoch": 16.081211980316944, + "grad_norm": 1.5227785583355828, + "learning_rate": 8.040917544947304e-07, + "loss": 0.8497, + "step": 207520 + }, + { + "epoch": 16.08198690379325, + "grad_norm": 1.5439699375063984, + "learning_rate": 8.041305021698699e-07, + "loss": 0.844, + "step": 207530 + }, + { + "epoch": 16.082761827269557, + "grad_norm": 1.516572588168775, + "learning_rate": 8.041692498450093e-07, + "loss": 0.8611, + "step": 207540 + }, + { + "epoch": 16.083536750745864, + "grad_norm": 1.6528446245904211, + "learning_rate": 8.042079975201489e-07, + "loss": 0.8474, + "step": 207550 + }, + { + "epoch": 16.08431167422217, + "grad_norm": 1.6866914735215823, + "learning_rate": 8.042467451952883e-07, + "loss": 0.8664, + "step": 207560 + }, + { + "epoch": 16.085086597698478, + "grad_norm": 1.5230788870245804, + "learning_rate": 8.042854928704279e-07, + "loss": 0.8369, + "step": 207570 + }, + { + "epoch": 16.085861521174785, + "grad_norm": 1.6084734155983744, + "learning_rate": 8.043242405455673e-07, + "loss": 0.8609, + "step": 207580 + }, + { + "epoch": 16.08663644465109, + "grad_norm": 1.7057199629634197, + "learning_rate": 8.043629882207069e-07, + "loss": 0.8862, + "step": 207590 + }, + { + "epoch": 16.0874113681274, + "grad_norm": 1.5444439646996437, + "learning_rate": 8.044017358958463e-07, + "loss": 0.8329, + "step": 207600 + }, + { + "epoch": 16.088186291603705, + "grad_norm": 1.5572683315276346, + "learning_rate": 8.044404835709858e-07, + "loss": 0.8599, + "step": 207610 + }, + { + "epoch": 16.088961215080012, + "grad_norm": 1.5680663293494517, + "learning_rate": 8.044792312461253e-07, + "loss": 0.8488, + "step": 207620 + }, + { + "epoch": 16.08973613855632, + "grad_norm": 1.657168388102786, + "learning_rate": 8.045179789212648e-07, + "loss": 0.8545, + "step": 207630 + }, + { + "epoch": 16.090511062032625, + "grad_norm": 1.5745480118290482, + "learning_rate": 8.045567265964042e-07, + "loss": 0.8463, + "step": 207640 + }, + { + "epoch": 16.091285985508932, + "grad_norm": 1.6772272635772485, + "learning_rate": 8.045954742715438e-07, + "loss": 0.8425, + "step": 207650 + }, + { + "epoch": 16.09206090898524, + "grad_norm": 1.5875424112973475, + "learning_rate": 8.046342219466832e-07, + "loss": 0.86, + "step": 207660 + }, + { + "epoch": 16.092835832461546, + "grad_norm": 1.5959506872964153, + "learning_rate": 8.046729696218228e-07, + "loss": 0.8527, + "step": 207670 + }, + { + "epoch": 16.093610755937853, + "grad_norm": 1.630535037153674, + "learning_rate": 8.047117172969622e-07, + "loss": 0.8333, + "step": 207680 + }, + { + "epoch": 16.09438567941416, + "grad_norm": 1.5295460185152114, + "learning_rate": 8.047504649721018e-07, + "loss": 0.8423, + "step": 207690 + }, + { + "epoch": 16.095160602890463, + "grad_norm": 1.5492042744035934, + "learning_rate": 8.047892126472412e-07, + "loss": 0.8453, + "step": 207700 + }, + { + "epoch": 16.09593552636677, + "grad_norm": 1.4748686617995146, + "learning_rate": 8.048279603223807e-07, + "loss": 0.8557, + "step": 207710 + }, + { + "epoch": 16.096710449843076, + "grad_norm": 1.5490837942787319, + "learning_rate": 8.048667079975202e-07, + "loss": 0.8505, + "step": 207720 + }, + { + "epoch": 16.097485373319383, + "grad_norm": 1.605158994130901, + "learning_rate": 8.049054556726598e-07, + "loss": 0.8575, + "step": 207730 + }, + { + "epoch": 16.09826029679569, + "grad_norm": 1.674893387873105, + "learning_rate": 8.049442033477992e-07, + "loss": 0.8363, + "step": 207740 + }, + { + "epoch": 16.099035220271997, + "grad_norm": 1.5618693780657962, + "learning_rate": 8.049829510229387e-07, + "loss": 0.844, + "step": 207750 + }, + { + "epoch": 16.099810143748304, + "grad_norm": 1.5621569636335049, + "learning_rate": 8.050216986980781e-07, + "loss": 0.8539, + "step": 207760 + }, + { + "epoch": 16.10058506722461, + "grad_norm": 1.5765362505440983, + "learning_rate": 8.050604463732177e-07, + "loss": 0.8623, + "step": 207770 + }, + { + "epoch": 16.101359990700917, + "grad_norm": 1.5772808915500307, + "learning_rate": 8.050991940483571e-07, + "loss": 0.855, + "step": 207780 + }, + { + "epoch": 16.102134914177224, + "grad_norm": 1.5393420318236852, + "learning_rate": 8.051379417234967e-07, + "loss": 0.8512, + "step": 207790 + }, + { + "epoch": 16.10290983765353, + "grad_norm": 1.624210372963065, + "learning_rate": 8.051766893986361e-07, + "loss": 0.8482, + "step": 207800 + }, + { + "epoch": 16.103684761129838, + "grad_norm": 1.5713524981867146, + "learning_rate": 8.052154370737756e-07, + "loss": 0.8627, + "step": 207810 + }, + { + "epoch": 16.104459684606145, + "grad_norm": 1.576715225836395, + "learning_rate": 8.052541847489151e-07, + "loss": 0.8357, + "step": 207820 + }, + { + "epoch": 16.10523460808245, + "grad_norm": 1.5533168671686977, + "learning_rate": 8.052929324240547e-07, + "loss": 0.863, + "step": 207830 + }, + { + "epoch": 16.106009531558758, + "grad_norm": 1.6381862275019266, + "learning_rate": 8.053316800991941e-07, + "loss": 0.8478, + "step": 207840 + }, + { + "epoch": 16.106784455035065, + "grad_norm": 1.5676389465360951, + "learning_rate": 8.053704277743336e-07, + "loss": 0.836, + "step": 207850 + }, + { + "epoch": 16.10755937851137, + "grad_norm": 1.6031293245499771, + "learning_rate": 8.05409175449473e-07, + "loss": 0.8542, + "step": 207860 + }, + { + "epoch": 16.10833430198768, + "grad_norm": 1.5226358781993752, + "learning_rate": 8.054479231246127e-07, + "loss": 0.8603, + "step": 207870 + }, + { + "epoch": 16.109109225463985, + "grad_norm": 1.575374936314425, + "learning_rate": 8.054866707997521e-07, + "loss": 0.8462, + "step": 207880 + }, + { + "epoch": 16.109884148940292, + "grad_norm": 1.662749649575115, + "learning_rate": 8.055254184748916e-07, + "loss": 0.8635, + "step": 207890 + }, + { + "epoch": 16.1106590724166, + "grad_norm": 1.711788077186871, + "learning_rate": 8.05564166150031e-07, + "loss": 0.8615, + "step": 207900 + }, + { + "epoch": 16.111433995892906, + "grad_norm": 1.4863562591478527, + "learning_rate": 8.056029138251705e-07, + "loss": 0.8728, + "step": 207910 + }, + { + "epoch": 16.112208919369213, + "grad_norm": 1.5550753078210147, + "learning_rate": 8.0564166150031e-07, + "loss": 0.8577, + "step": 207920 + }, + { + "epoch": 16.11298384284552, + "grad_norm": 1.6140564243365385, + "learning_rate": 8.056804091754496e-07, + "loss": 0.8582, + "step": 207930 + }, + { + "epoch": 16.113758766321826, + "grad_norm": 1.592068312901116, + "learning_rate": 8.05719156850589e-07, + "loss": 0.8461, + "step": 207940 + }, + { + "epoch": 16.114533689798133, + "grad_norm": 1.636361848030011, + "learning_rate": 8.057579045257285e-07, + "loss": 0.8537, + "step": 207950 + }, + { + "epoch": 16.11530861327444, + "grad_norm": 1.5418863567394265, + "learning_rate": 8.057966522008679e-07, + "loss": 0.8607, + "step": 207960 + }, + { + "epoch": 16.116083536750747, + "grad_norm": 1.6021299750426499, + "learning_rate": 8.058353998760076e-07, + "loss": 0.85, + "step": 207970 + }, + { + "epoch": 16.116858460227053, + "grad_norm": 1.6253378086115418, + "learning_rate": 8.05874147551147e-07, + "loss": 0.8705, + "step": 207980 + }, + { + "epoch": 16.11763338370336, + "grad_norm": 1.6151041060254627, + "learning_rate": 8.059128952262865e-07, + "loss": 0.8492, + "step": 207990 + }, + { + "epoch": 16.118408307179667, + "grad_norm": 1.5168848711747187, + "learning_rate": 8.059516429014259e-07, + "loss": 0.8489, + "step": 208000 + }, + { + "epoch": 16.118408307179667, + "eval_loss": 0.8906925320625305, + "eval_runtime": 330.07, + "eval_samples_per_second": 34.753, + "eval_steps_per_second": 8.689, + "step": 208000 + }, + { + "epoch": 16.119183230655974, + "grad_norm": 1.6652821347863769, + "learning_rate": 8.059903905765656e-07, + "loss": 0.8528, + "step": 208010 + }, + { + "epoch": 16.11995815413228, + "grad_norm": 1.6882040323061924, + "learning_rate": 8.06029138251705e-07, + "loss": 0.8451, + "step": 208020 + }, + { + "epoch": 16.120733077608588, + "grad_norm": 1.5427524603496934, + "learning_rate": 8.060678859268445e-07, + "loss": 0.8579, + "step": 208030 + }, + { + "epoch": 16.121508001084894, + "grad_norm": 1.6925780390280265, + "learning_rate": 8.061066336019839e-07, + "loss": 0.8443, + "step": 208040 + }, + { + "epoch": 16.1222829245612, + "grad_norm": 1.6405753176904525, + "learning_rate": 8.061453812771234e-07, + "loss": 0.8471, + "step": 208050 + }, + { + "epoch": 16.123057848037508, + "grad_norm": 1.7048617031451265, + "learning_rate": 8.061841289522628e-07, + "loss": 0.843, + "step": 208060 + }, + { + "epoch": 16.123832771513815, + "grad_norm": 1.4604233511791787, + "learning_rate": 8.062228766274025e-07, + "loss": 0.846, + "step": 208070 + }, + { + "epoch": 16.124607694990118, + "grad_norm": 1.5738349125073605, + "learning_rate": 8.062616243025419e-07, + "loss": 0.8336, + "step": 208080 + }, + { + "epoch": 16.125382618466425, + "grad_norm": 1.5319579313680134, + "learning_rate": 8.063003719776814e-07, + "loss": 0.8565, + "step": 208090 + }, + { + "epoch": 16.12615754194273, + "grad_norm": 1.672986296628112, + "learning_rate": 8.063391196528208e-07, + "loss": 0.8719, + "step": 208100 + }, + { + "epoch": 16.12693246541904, + "grad_norm": 1.7282917166956175, + "learning_rate": 8.063778673279605e-07, + "loss": 0.851, + "step": 208110 + }, + { + "epoch": 16.127707388895345, + "grad_norm": 1.5803875672487837, + "learning_rate": 8.064166150030999e-07, + "loss": 0.8739, + "step": 208120 + }, + { + "epoch": 16.128482312371652, + "grad_norm": 1.6319482554788582, + "learning_rate": 8.064553626782394e-07, + "loss": 0.8683, + "step": 208130 + }, + { + "epoch": 16.12925723584796, + "grad_norm": 1.568261410236879, + "learning_rate": 8.064941103533788e-07, + "loss": 0.859, + "step": 208140 + }, + { + "epoch": 16.130032159324266, + "grad_norm": 1.6634357036198058, + "learning_rate": 8.065328580285184e-07, + "loss": 0.862, + "step": 208150 + }, + { + "epoch": 16.130807082800573, + "grad_norm": 1.6612291409391287, + "learning_rate": 8.065716057036579e-07, + "loss": 0.8521, + "step": 208160 + }, + { + "epoch": 16.13158200627688, + "grad_norm": 1.6797993860858502, + "learning_rate": 8.066103533787974e-07, + "loss": 0.8518, + "step": 208170 + }, + { + "epoch": 16.132356929753186, + "grad_norm": 1.5393121529610168, + "learning_rate": 8.066491010539368e-07, + "loss": 0.8491, + "step": 208180 + }, + { + "epoch": 16.133131853229493, + "grad_norm": 1.5599124205251464, + "learning_rate": 8.066878487290763e-07, + "loss": 0.8547, + "step": 208190 + }, + { + "epoch": 16.1339067767058, + "grad_norm": 1.5150141171501792, + "learning_rate": 8.067265964042157e-07, + "loss": 0.8629, + "step": 208200 + }, + { + "epoch": 16.134681700182107, + "grad_norm": 1.5792140612151193, + "learning_rate": 8.067653440793554e-07, + "loss": 0.881, + "step": 208210 + }, + { + "epoch": 16.135456623658413, + "grad_norm": 1.559106926762241, + "learning_rate": 8.068040917544948e-07, + "loss": 0.8698, + "step": 208220 + }, + { + "epoch": 16.13623154713472, + "grad_norm": 1.553249073431084, + "learning_rate": 8.068428394296343e-07, + "loss": 0.8542, + "step": 208230 + }, + { + "epoch": 16.137006470611027, + "grad_norm": 1.6011759233202634, + "learning_rate": 8.068815871047737e-07, + "loss": 0.857, + "step": 208240 + }, + { + "epoch": 16.137781394087334, + "grad_norm": 1.5859073156127796, + "learning_rate": 8.069203347799133e-07, + "loss": 0.8533, + "step": 208250 + }, + { + "epoch": 16.13855631756364, + "grad_norm": 1.6542882565263524, + "learning_rate": 8.069590824550528e-07, + "loss": 0.8542, + "step": 208260 + }, + { + "epoch": 16.139331241039947, + "grad_norm": 1.6383434927232012, + "learning_rate": 8.069978301301923e-07, + "loss": 0.8524, + "step": 208270 + }, + { + "epoch": 16.140106164516254, + "grad_norm": 1.6199399567306187, + "learning_rate": 8.070365778053317e-07, + "loss": 0.8575, + "step": 208280 + }, + { + "epoch": 16.14088108799256, + "grad_norm": 1.5405345102702952, + "learning_rate": 8.070753254804712e-07, + "loss": 0.8297, + "step": 208290 + }, + { + "epoch": 16.141656011468868, + "grad_norm": 1.6579383449904943, + "learning_rate": 8.071140731556107e-07, + "loss": 0.846, + "step": 208300 + }, + { + "epoch": 16.142430934945175, + "grad_norm": 1.6753054136687588, + "learning_rate": 8.071528208307503e-07, + "loss": 0.8534, + "step": 208310 + }, + { + "epoch": 16.14320585842148, + "grad_norm": 1.7244197369636869, + "learning_rate": 8.071915685058897e-07, + "loss": 0.8491, + "step": 208320 + }, + { + "epoch": 16.14398078189779, + "grad_norm": 1.5298273048906914, + "learning_rate": 8.072303161810292e-07, + "loss": 0.8556, + "step": 208330 + }, + { + "epoch": 16.144755705374095, + "grad_norm": 1.7042981420491432, + "learning_rate": 8.072690638561686e-07, + "loss": 0.8963, + "step": 208340 + }, + { + "epoch": 16.145530628850402, + "grad_norm": 1.7123609624648275, + "learning_rate": 8.073078115313082e-07, + "loss": 0.8561, + "step": 208350 + }, + { + "epoch": 16.14630555232671, + "grad_norm": 1.6448920051742935, + "learning_rate": 8.073465592064477e-07, + "loss": 0.858, + "step": 208360 + }, + { + "epoch": 16.147080475803016, + "grad_norm": 1.6733034740315844, + "learning_rate": 8.073853068815872e-07, + "loss": 0.8595, + "step": 208370 + }, + { + "epoch": 16.147855399279322, + "grad_norm": 1.5477771738119734, + "learning_rate": 8.074240545567266e-07, + "loss": 0.8479, + "step": 208380 + }, + { + "epoch": 16.14863032275563, + "grad_norm": 1.7615047163702624, + "learning_rate": 8.074628022318662e-07, + "loss": 0.8462, + "step": 208390 + }, + { + "epoch": 16.149405246231936, + "grad_norm": 1.5223649869663054, + "learning_rate": 8.075015499070056e-07, + "loss": 0.8482, + "step": 208400 + }, + { + "epoch": 16.150180169708243, + "grad_norm": 1.4958211469901712, + "learning_rate": 8.075402975821452e-07, + "loss": 0.8595, + "step": 208410 + }, + { + "epoch": 16.15095509318455, + "grad_norm": 1.5525892845240092, + "learning_rate": 8.075790452572846e-07, + "loss": 0.855, + "step": 208420 + }, + { + "epoch": 16.151730016660856, + "grad_norm": 1.5444678421209797, + "learning_rate": 8.076177929324241e-07, + "loss": 0.8431, + "step": 208430 + }, + { + "epoch": 16.152504940137163, + "grad_norm": 1.6406518187244616, + "learning_rate": 8.076565406075636e-07, + "loss": 0.8497, + "step": 208440 + }, + { + "epoch": 16.153279863613466, + "grad_norm": 1.5877257791064934, + "learning_rate": 8.076952882827031e-07, + "loss": 0.8618, + "step": 208450 + }, + { + "epoch": 16.154054787089773, + "grad_norm": 1.6901003725117707, + "learning_rate": 8.077340359578426e-07, + "loss": 0.8557, + "step": 208460 + }, + { + "epoch": 16.15482971056608, + "grad_norm": 1.679850883403875, + "learning_rate": 8.077727836329821e-07, + "loss": 0.8696, + "step": 208470 + }, + { + "epoch": 16.155604634042387, + "grad_norm": 1.6630910913336088, + "learning_rate": 8.078115313081215e-07, + "loss": 0.8388, + "step": 208480 + }, + { + "epoch": 16.156379557518694, + "grad_norm": 1.685823678014817, + "learning_rate": 8.078502789832611e-07, + "loss": 0.8578, + "step": 208490 + }, + { + "epoch": 16.157154480995, + "grad_norm": 1.6058867056395403, + "learning_rate": 8.078890266584005e-07, + "loss": 0.8434, + "step": 208500 + }, + { + "epoch": 16.157154480995, + "eval_loss": 0.8906611800193787, + "eval_runtime": 327.997, + "eval_samples_per_second": 34.973, + "eval_steps_per_second": 8.744, + "step": 208500 + }, + { + "epoch": 16.157929404471307, + "grad_norm": 1.5355888970005367, + "learning_rate": 8.079277743335401e-07, + "loss": 0.8402, + "step": 208510 + }, + { + "epoch": 16.158704327947614, + "grad_norm": 1.4723607057300934, + "learning_rate": 8.079665220086795e-07, + "loss": 0.8583, + "step": 208520 + }, + { + "epoch": 16.15947925142392, + "grad_norm": 1.5792166047659955, + "learning_rate": 8.080052696838191e-07, + "loss": 0.8699, + "step": 208530 + }, + { + "epoch": 16.160254174900228, + "grad_norm": 1.5274629574357639, + "learning_rate": 8.080440173589585e-07, + "loss": 0.8624, + "step": 208540 + }, + { + "epoch": 16.161029098376535, + "grad_norm": 1.5923567275271575, + "learning_rate": 8.08082765034098e-07, + "loss": 0.8632, + "step": 208550 + }, + { + "epoch": 16.16180402185284, + "grad_norm": 1.6813825794681938, + "learning_rate": 8.081215127092375e-07, + "loss": 0.8649, + "step": 208560 + }, + { + "epoch": 16.162578945329148, + "grad_norm": 1.5999390978333112, + "learning_rate": 8.08160260384377e-07, + "loss": 0.8464, + "step": 208570 + }, + { + "epoch": 16.163353868805455, + "grad_norm": 1.570071400992599, + "learning_rate": 8.081990080595165e-07, + "loss": 0.8542, + "step": 208580 + }, + { + "epoch": 16.164128792281762, + "grad_norm": 1.6559033413835165, + "learning_rate": 8.08237755734656e-07, + "loss": 0.8511, + "step": 208590 + }, + { + "epoch": 16.16490371575807, + "grad_norm": 1.6286475065813264, + "learning_rate": 8.082765034097954e-07, + "loss": 0.8563, + "step": 208600 + }, + { + "epoch": 16.165678639234375, + "grad_norm": 1.583790930673367, + "learning_rate": 8.08315251084935e-07, + "loss": 0.8498, + "step": 208610 + }, + { + "epoch": 16.166453562710682, + "grad_norm": 1.6649477086204836, + "learning_rate": 8.083539987600744e-07, + "loss": 0.8667, + "step": 208620 + }, + { + "epoch": 16.16722848618699, + "grad_norm": 1.6250608278017673, + "learning_rate": 8.08392746435214e-07, + "loss": 0.8564, + "step": 208630 + }, + { + "epoch": 16.168003409663296, + "grad_norm": 1.6279391310993239, + "learning_rate": 8.084314941103534e-07, + "loss": 0.858, + "step": 208640 + }, + { + "epoch": 16.168778333139603, + "grad_norm": 1.563598707811345, + "learning_rate": 8.084702417854929e-07, + "loss": 0.8573, + "step": 208650 + }, + { + "epoch": 16.16955325661591, + "grad_norm": 1.7118396989683669, + "learning_rate": 8.085089894606324e-07, + "loss": 0.857, + "step": 208660 + }, + { + "epoch": 16.170328180092216, + "grad_norm": 1.61686498768033, + "learning_rate": 8.08547737135772e-07, + "loss": 0.8548, + "step": 208670 + }, + { + "epoch": 16.171103103568523, + "grad_norm": 1.5093923163707978, + "learning_rate": 8.085864848109114e-07, + "loss": 0.8383, + "step": 208680 + }, + { + "epoch": 16.17187802704483, + "grad_norm": 1.4900053681154268, + "learning_rate": 8.086252324860509e-07, + "loss": 0.8841, + "step": 208690 + }, + { + "epoch": 16.172652950521137, + "grad_norm": 1.6573800559132292, + "learning_rate": 8.086639801611903e-07, + "loss": 0.8471, + "step": 208700 + }, + { + "epoch": 16.173427873997444, + "grad_norm": 1.5903339144200848, + "learning_rate": 8.087027278363299e-07, + "loss": 0.8428, + "step": 208710 + }, + { + "epoch": 16.17420279747375, + "grad_norm": 1.6484167455532983, + "learning_rate": 8.087414755114694e-07, + "loss": 0.8591, + "step": 208720 + }, + { + "epoch": 16.174977720950057, + "grad_norm": 1.630867435525039, + "learning_rate": 8.087802231866089e-07, + "loss": 0.8565, + "step": 208730 + }, + { + "epoch": 16.175752644426364, + "grad_norm": 1.6248621095482643, + "learning_rate": 8.088189708617483e-07, + "loss": 0.8423, + "step": 208740 + }, + { + "epoch": 16.17652756790267, + "grad_norm": 1.6439422401717618, + "learning_rate": 8.088577185368878e-07, + "loss": 0.8788, + "step": 208750 + }, + { + "epoch": 16.177302491378978, + "grad_norm": 1.5797684028568006, + "learning_rate": 8.088964662120273e-07, + "loss": 0.858, + "step": 208760 + }, + { + "epoch": 16.178077414855284, + "grad_norm": 1.5737281068076072, + "learning_rate": 8.089352138871669e-07, + "loss": 0.8607, + "step": 208770 + }, + { + "epoch": 16.17885233833159, + "grad_norm": 1.6266503490417417, + "learning_rate": 8.089739615623063e-07, + "loss": 0.8552, + "step": 208780 + }, + { + "epoch": 16.179627261807898, + "grad_norm": 1.6442962562807886, + "learning_rate": 8.090127092374458e-07, + "loss": 0.8606, + "step": 208790 + }, + { + "epoch": 16.180402185284205, + "grad_norm": 1.9012355698247172, + "learning_rate": 8.090514569125852e-07, + "loss": 0.8371, + "step": 208800 + }, + { + "epoch": 16.18117710876051, + "grad_norm": 1.5588356487858397, + "learning_rate": 8.090902045877249e-07, + "loss": 0.841, + "step": 208810 + }, + { + "epoch": 16.181952032236815, + "grad_norm": 1.5610631932572094, + "learning_rate": 8.091289522628643e-07, + "loss": 0.849, + "step": 208820 + }, + { + "epoch": 16.18272695571312, + "grad_norm": 1.58148553347137, + "learning_rate": 8.091676999380038e-07, + "loss": 0.8387, + "step": 208830 + }, + { + "epoch": 16.18350187918943, + "grad_norm": 1.6708403698428356, + "learning_rate": 8.092064476131432e-07, + "loss": 0.8564, + "step": 208840 + }, + { + "epoch": 16.184276802665735, + "grad_norm": 1.6319540056151658, + "learning_rate": 8.092451952882828e-07, + "loss": 0.8439, + "step": 208850 + }, + { + "epoch": 16.185051726142042, + "grad_norm": 1.6111152654383534, + "learning_rate": 8.092839429634223e-07, + "loss": 0.8598, + "step": 208860 + }, + { + "epoch": 16.18582664961835, + "grad_norm": 1.6316077607226676, + "learning_rate": 8.093226906385618e-07, + "loss": 0.8411, + "step": 208870 + }, + { + "epoch": 16.186601573094656, + "grad_norm": 1.5498815429655315, + "learning_rate": 8.093614383137012e-07, + "loss": 0.8763, + "step": 208880 + }, + { + "epoch": 16.187376496570963, + "grad_norm": 1.6970662669791952, + "learning_rate": 8.094001859888407e-07, + "loss": 0.8459, + "step": 208890 + }, + { + "epoch": 16.18815142004727, + "grad_norm": 1.5612518370475261, + "learning_rate": 8.094389336639801e-07, + "loss": 0.8614, + "step": 208900 + }, + { + "epoch": 16.188926343523576, + "grad_norm": 1.5278556684627713, + "learning_rate": 8.094776813391198e-07, + "loss": 0.8499, + "step": 208910 + }, + { + "epoch": 16.189701266999883, + "grad_norm": 1.5829278165740914, + "learning_rate": 8.095164290142592e-07, + "loss": 0.8601, + "step": 208920 + }, + { + "epoch": 16.19047619047619, + "grad_norm": 1.6334737586367638, + "learning_rate": 8.095551766893987e-07, + "loss": 0.8384, + "step": 208930 + }, + { + "epoch": 16.191251113952497, + "grad_norm": 1.6184307837425305, + "learning_rate": 8.095939243645381e-07, + "loss": 0.8618, + "step": 208940 + }, + { + "epoch": 16.192026037428803, + "grad_norm": 1.6096349951286653, + "learning_rate": 8.096326720396778e-07, + "loss": 0.85, + "step": 208950 + }, + { + "epoch": 16.19280096090511, + "grad_norm": 1.6506391247891956, + "learning_rate": 8.096714197148172e-07, + "loss": 0.8627, + "step": 208960 + }, + { + "epoch": 16.193575884381417, + "grad_norm": 1.675457933755695, + "learning_rate": 8.097101673899567e-07, + "loss": 0.8526, + "step": 208970 + }, + { + "epoch": 16.194350807857724, + "grad_norm": 1.50411861724833, + "learning_rate": 8.097489150650961e-07, + "loss": 0.8532, + "step": 208980 + }, + { + "epoch": 16.19512573133403, + "grad_norm": 1.5868497395789107, + "learning_rate": 8.097876627402356e-07, + "loss": 0.8411, + "step": 208990 + }, + { + "epoch": 16.195900654810337, + "grad_norm": 1.6568394924311358, + "learning_rate": 8.09826410415375e-07, + "loss": 0.8542, + "step": 209000 + }, + { + "epoch": 16.195900654810337, + "eval_loss": 0.8905114531517029, + "eval_runtime": 330.1783, + "eval_samples_per_second": 34.742, + "eval_steps_per_second": 8.686, + "step": 209000 + }, + { + "epoch": 16.196675578286644, + "grad_norm": 1.6525967851953378, + "learning_rate": 8.098651580905147e-07, + "loss": 0.8562, + "step": 209010 + }, + { + "epoch": 16.19745050176295, + "grad_norm": 1.546588886333184, + "learning_rate": 8.099039057656541e-07, + "loss": 0.8703, + "step": 209020 + }, + { + "epoch": 16.198225425239258, + "grad_norm": 1.6837029244609538, + "learning_rate": 8.099426534407936e-07, + "loss": 0.844, + "step": 209030 + }, + { + "epoch": 16.199000348715565, + "grad_norm": 1.6277352589162752, + "learning_rate": 8.09981401115933e-07, + "loss": 0.8709, + "step": 209040 + }, + { + "epoch": 16.19977527219187, + "grad_norm": 1.5400602259653937, + "learning_rate": 8.100201487910727e-07, + "loss": 0.8523, + "step": 209050 + }, + { + "epoch": 16.20055019566818, + "grad_norm": 1.5212623861671666, + "learning_rate": 8.100588964662121e-07, + "loss": 0.8568, + "step": 209060 + }, + { + "epoch": 16.201325119144485, + "grad_norm": 1.7289327436702249, + "learning_rate": 8.100976441413516e-07, + "loss": 0.8496, + "step": 209070 + }, + { + "epoch": 16.202100042620792, + "grad_norm": 1.6478077267234292, + "learning_rate": 8.10136391816491e-07, + "loss": 0.8571, + "step": 209080 + }, + { + "epoch": 16.2028749660971, + "grad_norm": 1.6205531538064528, + "learning_rate": 8.101751394916306e-07, + "loss": 0.8355, + "step": 209090 + }, + { + "epoch": 16.203649889573406, + "grad_norm": 1.5517989456949868, + "learning_rate": 8.102138871667701e-07, + "loss": 0.8556, + "step": 209100 + }, + { + "epoch": 16.204424813049712, + "grad_norm": 1.589645766508986, + "learning_rate": 8.102526348419096e-07, + "loss": 0.8289, + "step": 209110 + }, + { + "epoch": 16.20519973652602, + "grad_norm": 1.5540706454210476, + "learning_rate": 8.10291382517049e-07, + "loss": 0.8569, + "step": 209120 + }, + { + "epoch": 16.205974660002326, + "grad_norm": 1.702827130337304, + "learning_rate": 8.103301301921885e-07, + "loss": 0.8478, + "step": 209130 + }, + { + "epoch": 16.206749583478633, + "grad_norm": 1.5916331954973326, + "learning_rate": 8.103688778673279e-07, + "loss": 0.8475, + "step": 209140 + }, + { + "epoch": 16.20752450695494, + "grad_norm": 1.6408099359452037, + "learning_rate": 8.104076255424676e-07, + "loss": 0.8397, + "step": 209150 + }, + { + "epoch": 16.208299430431246, + "grad_norm": 1.6438767631775508, + "learning_rate": 8.10446373217607e-07, + "loss": 0.8395, + "step": 209160 + }, + { + "epoch": 16.209074353907553, + "grad_norm": 1.4942961892181374, + "learning_rate": 8.104851208927465e-07, + "loss": 0.8471, + "step": 209170 + }, + { + "epoch": 16.20984927738386, + "grad_norm": 1.7142551205490069, + "learning_rate": 8.105238685678859e-07, + "loss": 0.8443, + "step": 209180 + }, + { + "epoch": 16.210624200860163, + "grad_norm": 1.6053081445703066, + "learning_rate": 8.105626162430255e-07, + "loss": 0.8465, + "step": 209190 + }, + { + "epoch": 16.21139912433647, + "grad_norm": 1.663213821098823, + "learning_rate": 8.10601363918165e-07, + "loss": 0.8288, + "step": 209200 + }, + { + "epoch": 16.212174047812777, + "grad_norm": 1.7206736045734035, + "learning_rate": 8.106401115933045e-07, + "loss": 0.8543, + "step": 209210 + }, + { + "epoch": 16.212948971289084, + "grad_norm": 1.5656257145535468, + "learning_rate": 8.106788592684439e-07, + "loss": 0.8575, + "step": 209220 + }, + { + "epoch": 16.21372389476539, + "grad_norm": 1.6468864737200994, + "learning_rate": 8.107176069435835e-07, + "loss": 0.868, + "step": 209230 + }, + { + "epoch": 16.214498818241697, + "grad_norm": 1.5525215051374222, + "learning_rate": 8.107563546187229e-07, + "loss": 0.8582, + "step": 209240 + }, + { + "epoch": 16.215273741718004, + "grad_norm": 1.6330507860471086, + "learning_rate": 8.107951022938625e-07, + "loss": 0.8443, + "step": 209250 + }, + { + "epoch": 16.21604866519431, + "grad_norm": 1.6308002129049874, + "learning_rate": 8.108338499690019e-07, + "loss": 0.8605, + "step": 209260 + }, + { + "epoch": 16.216823588670618, + "grad_norm": 1.5480728556483407, + "learning_rate": 8.108725976441414e-07, + "loss": 0.8443, + "step": 209270 + }, + { + "epoch": 16.217598512146925, + "grad_norm": 1.5810688545522897, + "learning_rate": 8.109113453192808e-07, + "loss": 0.8515, + "step": 209280 + }, + { + "epoch": 16.21837343562323, + "grad_norm": 1.5173351609538852, + "learning_rate": 8.109500929944204e-07, + "loss": 0.8312, + "step": 209290 + }, + { + "epoch": 16.21914835909954, + "grad_norm": 1.6137682571276306, + "learning_rate": 8.109888406695599e-07, + "loss": 0.8571, + "step": 209300 + }, + { + "epoch": 16.219923282575845, + "grad_norm": 1.4738368264758683, + "learning_rate": 8.110275883446994e-07, + "loss": 0.8564, + "step": 209310 + }, + { + "epoch": 16.220698206052152, + "grad_norm": 1.6098228092718703, + "learning_rate": 8.110663360198388e-07, + "loss": 0.854, + "step": 209320 + }, + { + "epoch": 16.22147312952846, + "grad_norm": 1.5934128560415375, + "learning_rate": 8.111050836949784e-07, + "loss": 0.85, + "step": 209330 + }, + { + "epoch": 16.222248053004765, + "grad_norm": 1.607259581090853, + "learning_rate": 8.111438313701178e-07, + "loss": 0.8423, + "step": 209340 + }, + { + "epoch": 16.223022976481072, + "grad_norm": 1.6265677533954028, + "learning_rate": 8.111825790452574e-07, + "loss": 0.8558, + "step": 209350 + }, + { + "epoch": 16.22379789995738, + "grad_norm": 1.6629112338259302, + "learning_rate": 8.112213267203968e-07, + "loss": 0.845, + "step": 209360 + }, + { + "epoch": 16.224572823433686, + "grad_norm": 1.6176625610531405, + "learning_rate": 8.112600743955364e-07, + "loss": 0.8426, + "step": 209370 + }, + { + "epoch": 16.225347746909993, + "grad_norm": 1.6318298607646111, + "learning_rate": 8.112988220706758e-07, + "loss": 0.8496, + "step": 209380 + }, + { + "epoch": 16.2261226703863, + "grad_norm": 1.4955149060545456, + "learning_rate": 8.113375697458153e-07, + "loss": 0.8614, + "step": 209390 + }, + { + "epoch": 16.226897593862606, + "grad_norm": 1.6049458728002373, + "learning_rate": 8.113763174209548e-07, + "loss": 0.8518, + "step": 209400 + }, + { + "epoch": 16.227672517338913, + "grad_norm": 1.5223778583694125, + "learning_rate": 8.114150650960943e-07, + "loss": 0.8429, + "step": 209410 + }, + { + "epoch": 16.22844744081522, + "grad_norm": 1.6731772010525636, + "learning_rate": 8.114538127712337e-07, + "loss": 0.8585, + "step": 209420 + }, + { + "epoch": 16.229222364291527, + "grad_norm": 1.6595645375159096, + "learning_rate": 8.114925604463733e-07, + "loss": 0.8448, + "step": 209430 + }, + { + "epoch": 16.229997287767834, + "grad_norm": 1.6001770355674199, + "learning_rate": 8.115313081215127e-07, + "loss": 0.8554, + "step": 209440 + }, + { + "epoch": 16.23077221124414, + "grad_norm": 1.4542167343934267, + "learning_rate": 8.115700557966523e-07, + "loss": 0.8544, + "step": 209450 + }, + { + "epoch": 16.231547134720447, + "grad_norm": 1.5892784580345847, + "learning_rate": 8.116088034717917e-07, + "loss": 0.8497, + "step": 209460 + }, + { + "epoch": 16.232322058196754, + "grad_norm": 1.6852024257595617, + "learning_rate": 8.116475511469313e-07, + "loss": 0.8414, + "step": 209470 + }, + { + "epoch": 16.23309698167306, + "grad_norm": 1.7005784352003637, + "learning_rate": 8.116862988220707e-07, + "loss": 0.8538, + "step": 209480 + }, + { + "epoch": 16.233871905149368, + "grad_norm": 1.5452668962414138, + "learning_rate": 8.117250464972103e-07, + "loss": 0.8356, + "step": 209490 + }, + { + "epoch": 16.234646828625674, + "grad_norm": 1.5496977509628735, + "learning_rate": 8.117637941723497e-07, + "loss": 0.8544, + "step": 209500 + }, + { + "epoch": 16.234646828625674, + "eval_loss": 0.8903323411941528, + "eval_runtime": 330.243, + "eval_samples_per_second": 34.735, + "eval_steps_per_second": 8.685, + "step": 209500 + }, + { + "epoch": 16.23542175210198, + "grad_norm": 1.6337880938995524, + "learning_rate": 8.118025418474893e-07, + "loss": 0.8571, + "step": 209510 + }, + { + "epoch": 16.236196675578288, + "grad_norm": 1.6258191162338094, + "learning_rate": 8.118412895226287e-07, + "loss": 0.8732, + "step": 209520 + }, + { + "epoch": 16.236971599054595, + "grad_norm": 1.622197282518488, + "learning_rate": 8.118800371977682e-07, + "loss": 0.8352, + "step": 209530 + }, + { + "epoch": 16.2377465225309, + "grad_norm": 1.6295706922610733, + "learning_rate": 8.119187848729076e-07, + "loss": 0.8623, + "step": 209540 + }, + { + "epoch": 16.23852144600721, + "grad_norm": 1.6706725970451592, + "learning_rate": 8.119575325480472e-07, + "loss": 0.8705, + "step": 209550 + }, + { + "epoch": 16.23929636948351, + "grad_norm": 1.6797901363753924, + "learning_rate": 8.119962802231866e-07, + "loss": 0.8465, + "step": 209560 + }, + { + "epoch": 16.24007129295982, + "grad_norm": 1.6559709181048825, + "learning_rate": 8.120350278983262e-07, + "loss": 0.8388, + "step": 209570 + }, + { + "epoch": 16.240846216436125, + "grad_norm": 1.6176063763833697, + "learning_rate": 8.120737755734656e-07, + "loss": 0.8451, + "step": 209580 + }, + { + "epoch": 16.241621139912432, + "grad_norm": 1.5558709405228217, + "learning_rate": 8.121125232486052e-07, + "loss": 0.8378, + "step": 209590 + }, + { + "epoch": 16.24239606338874, + "grad_norm": 1.7166204757844454, + "learning_rate": 8.121512709237446e-07, + "loss": 0.8578, + "step": 209600 + }, + { + "epoch": 16.243170986865046, + "grad_norm": 1.6114524048730292, + "learning_rate": 8.121900185988842e-07, + "loss": 0.8614, + "step": 209610 + }, + { + "epoch": 16.243945910341353, + "grad_norm": 1.5902520354589718, + "learning_rate": 8.122287662740236e-07, + "loss": 0.8525, + "step": 209620 + }, + { + "epoch": 16.24472083381766, + "grad_norm": 1.5467394918822501, + "learning_rate": 8.122675139491631e-07, + "loss": 0.8467, + "step": 209630 + }, + { + "epoch": 16.245495757293966, + "grad_norm": 1.7233980729670362, + "learning_rate": 8.123062616243026e-07, + "loss": 0.8359, + "step": 209640 + }, + { + "epoch": 16.246270680770273, + "grad_norm": 1.675738912439101, + "learning_rate": 8.123450092994422e-07, + "loss": 0.862, + "step": 209650 + }, + { + "epoch": 16.24704560424658, + "grad_norm": 1.712176155171347, + "learning_rate": 8.123837569745816e-07, + "loss": 0.8426, + "step": 209660 + }, + { + "epoch": 16.247820527722887, + "grad_norm": 1.6895617151420668, + "learning_rate": 8.124225046497211e-07, + "loss": 0.8567, + "step": 209670 + }, + { + "epoch": 16.248595451199193, + "grad_norm": 1.51431900183267, + "learning_rate": 8.124612523248605e-07, + "loss": 0.835, + "step": 209680 + }, + { + "epoch": 16.2493703746755, + "grad_norm": 1.5824042649109389, + "learning_rate": 8.125000000000001e-07, + "loss": 0.8652, + "step": 209690 + }, + { + "epoch": 16.250145298151807, + "grad_norm": 1.765242733310348, + "learning_rate": 8.125387476751395e-07, + "loss": 0.8344, + "step": 209700 + }, + { + "epoch": 16.250920221628114, + "grad_norm": 1.5241083557099702, + "learning_rate": 8.125774953502791e-07, + "loss": 0.8468, + "step": 209710 + }, + { + "epoch": 16.25169514510442, + "grad_norm": 1.6754323010428405, + "learning_rate": 8.126162430254185e-07, + "loss": 0.8851, + "step": 209720 + }, + { + "epoch": 16.252470068580728, + "grad_norm": 1.622285711784602, + "learning_rate": 8.12654990700558e-07, + "loss": 0.851, + "step": 209730 + }, + { + "epoch": 16.253244992057034, + "grad_norm": 1.581609839822699, + "learning_rate": 8.126937383756975e-07, + "loss": 0.8566, + "step": 209740 + }, + { + "epoch": 16.25401991553334, + "grad_norm": 1.7141642750293906, + "learning_rate": 8.127324860508371e-07, + "loss": 0.8589, + "step": 209750 + }, + { + "epoch": 16.254794839009648, + "grad_norm": 1.5445206529990403, + "learning_rate": 8.127712337259765e-07, + "loss": 0.8701, + "step": 209760 + }, + { + "epoch": 16.255569762485955, + "grad_norm": 1.6811116197286835, + "learning_rate": 8.12809981401116e-07, + "loss": 0.8459, + "step": 209770 + }, + { + "epoch": 16.25634468596226, + "grad_norm": 1.5032918762145355, + "learning_rate": 8.128487290762554e-07, + "loss": 0.8645, + "step": 209780 + }, + { + "epoch": 16.25711960943857, + "grad_norm": 1.5697419441096339, + "learning_rate": 8.12887476751395e-07, + "loss": 0.8537, + "step": 209790 + }, + { + "epoch": 16.257894532914875, + "grad_norm": 1.5898605771961536, + "learning_rate": 8.129262244265345e-07, + "loss": 0.8451, + "step": 209800 + }, + { + "epoch": 16.258669456391182, + "grad_norm": 1.968824726314759, + "learning_rate": 8.12964972101674e-07, + "loss": 0.8621, + "step": 209810 + }, + { + "epoch": 16.25944437986749, + "grad_norm": 1.5791723177686887, + "learning_rate": 8.130037197768134e-07, + "loss": 0.8726, + "step": 209820 + }, + { + "epoch": 16.260219303343796, + "grad_norm": 1.7047956123586034, + "learning_rate": 8.130424674519529e-07, + "loss": 0.8386, + "step": 209830 + }, + { + "epoch": 16.260994226820102, + "grad_norm": 1.580416737557189, + "learning_rate": 8.130812151270924e-07, + "loss": 0.8744, + "step": 209840 + }, + { + "epoch": 16.26176915029641, + "grad_norm": 1.5414544844404543, + "learning_rate": 8.13119962802232e-07, + "loss": 0.8468, + "step": 209850 + }, + { + "epoch": 16.262544073772716, + "grad_norm": 1.673446126177349, + "learning_rate": 8.131587104773714e-07, + "loss": 0.8532, + "step": 209860 + }, + { + "epoch": 16.263318997249023, + "grad_norm": 1.6579142235942008, + "learning_rate": 8.131974581525109e-07, + "loss": 0.8546, + "step": 209870 + }, + { + "epoch": 16.26409392072533, + "grad_norm": 1.5424199379424548, + "learning_rate": 8.132362058276503e-07, + "loss": 0.8468, + "step": 209880 + }, + { + "epoch": 16.264868844201636, + "grad_norm": 1.648824670290035, + "learning_rate": 8.1327495350279e-07, + "loss": 0.8523, + "step": 209890 + }, + { + "epoch": 16.265643767677943, + "grad_norm": 1.646272296210319, + "learning_rate": 8.133137011779294e-07, + "loss": 0.8507, + "step": 209900 + }, + { + "epoch": 16.26641869115425, + "grad_norm": 1.5850121680092732, + "learning_rate": 8.133524488530689e-07, + "loss": 0.8549, + "step": 209910 + }, + { + "epoch": 16.267193614630557, + "grad_norm": 1.5628199823608102, + "learning_rate": 8.133911965282083e-07, + "loss": 0.856, + "step": 209920 + }, + { + "epoch": 16.267968538106864, + "grad_norm": 1.669737633149794, + "learning_rate": 8.134299442033478e-07, + "loss": 0.8491, + "step": 209930 + }, + { + "epoch": 16.268743461583167, + "grad_norm": 1.5365468785139986, + "learning_rate": 8.134686918784874e-07, + "loss": 0.8327, + "step": 209940 + }, + { + "epoch": 16.269518385059474, + "grad_norm": 1.6158605300021613, + "learning_rate": 8.135074395536269e-07, + "loss": 0.8665, + "step": 209950 + }, + { + "epoch": 16.27029330853578, + "grad_norm": 1.5248521759959417, + "learning_rate": 8.135461872287663e-07, + "loss": 0.8575, + "step": 209960 + }, + { + "epoch": 16.271068232012087, + "grad_norm": 1.6567828344847007, + "learning_rate": 8.135849349039058e-07, + "loss": 0.838, + "step": 209970 + }, + { + "epoch": 16.271843155488394, + "grad_norm": 1.5874751901758697, + "learning_rate": 8.136236825790452e-07, + "loss": 0.845, + "step": 209980 + }, + { + "epoch": 16.2726180789647, + "grad_norm": 1.681311061392104, + "learning_rate": 8.136624302541849e-07, + "loss": 0.8461, + "step": 209990 + }, + { + "epoch": 16.273393002441008, + "grad_norm": 1.6825431453542463, + "learning_rate": 8.137011779293243e-07, + "loss": 0.8504, + "step": 210000 + }, + { + "epoch": 16.273393002441008, + "eval_loss": 0.890357255935669, + "eval_runtime": 327.9499, + "eval_samples_per_second": 34.978, + "eval_steps_per_second": 8.745, + "step": 210000 + }, + { + "epoch": 16.274167925917315, + "grad_norm": 1.5588894632864523, + "learning_rate": 8.137399256044638e-07, + "loss": 0.8474, + "step": 210010 + }, + { + "epoch": 16.27494284939362, + "grad_norm": 1.5530437987672345, + "learning_rate": 8.137786732796032e-07, + "loss": 0.8618, + "step": 210020 + }, + { + "epoch": 16.27571777286993, + "grad_norm": 1.6691648019887315, + "learning_rate": 8.138174209547429e-07, + "loss": 0.8646, + "step": 210030 + }, + { + "epoch": 16.276492696346235, + "grad_norm": 1.5634439055443892, + "learning_rate": 8.138561686298823e-07, + "loss": 0.8648, + "step": 210040 + }, + { + "epoch": 16.277267619822542, + "grad_norm": 1.735823973185273, + "learning_rate": 8.138949163050218e-07, + "loss": 0.855, + "step": 210050 + }, + { + "epoch": 16.27804254329885, + "grad_norm": 1.7079974724760876, + "learning_rate": 8.139336639801612e-07, + "loss": 0.855, + "step": 210060 + }, + { + "epoch": 16.278817466775156, + "grad_norm": 1.6229265593491968, + "learning_rate": 8.139724116553007e-07, + "loss": 0.8654, + "step": 210070 + }, + { + "epoch": 16.279592390251462, + "grad_norm": 1.5641707054874643, + "learning_rate": 8.140111593304402e-07, + "loss": 0.8398, + "step": 210080 + }, + { + "epoch": 16.28036731372777, + "grad_norm": 1.6973798275250636, + "learning_rate": 8.140499070055798e-07, + "loss": 0.8551, + "step": 210090 + }, + { + "epoch": 16.281142237204076, + "grad_norm": 1.6224114804933625, + "learning_rate": 8.140886546807192e-07, + "loss": 0.8446, + "step": 210100 + }, + { + "epoch": 16.281917160680383, + "grad_norm": 1.5845150426524748, + "learning_rate": 8.141274023558587e-07, + "loss": 0.8487, + "step": 210110 + }, + { + "epoch": 16.28269208415669, + "grad_norm": 1.5992641321141183, + "learning_rate": 8.141661500309981e-07, + "loss": 0.8521, + "step": 210120 + }, + { + "epoch": 16.283467007632996, + "grad_norm": 1.6536875353205545, + "learning_rate": 8.142048977061378e-07, + "loss": 0.8582, + "step": 210130 + }, + { + "epoch": 16.284241931109303, + "grad_norm": 1.6502577229378097, + "learning_rate": 8.142436453812772e-07, + "loss": 0.8584, + "step": 210140 + }, + { + "epoch": 16.28501685458561, + "grad_norm": 1.6680482924994178, + "learning_rate": 8.142823930564167e-07, + "loss": 0.8593, + "step": 210150 + }, + { + "epoch": 16.285791778061917, + "grad_norm": 1.605209160929274, + "learning_rate": 8.143211407315561e-07, + "loss": 0.8389, + "step": 210160 + }, + { + "epoch": 16.286566701538224, + "grad_norm": 1.5178801146562875, + "learning_rate": 8.143598884066957e-07, + "loss": 0.8489, + "step": 210170 + }, + { + "epoch": 16.28734162501453, + "grad_norm": 1.6557354791480738, + "learning_rate": 8.143986360818352e-07, + "loss": 0.837, + "step": 210180 + }, + { + "epoch": 16.288116548490837, + "grad_norm": 1.5408563747431885, + "learning_rate": 8.144373837569747e-07, + "loss": 0.8365, + "step": 210190 + }, + { + "epoch": 16.288891471967144, + "grad_norm": 1.6914078305192897, + "learning_rate": 8.144761314321141e-07, + "loss": 0.8669, + "step": 210200 + }, + { + "epoch": 16.28966639544345, + "grad_norm": 1.7009776298461725, + "learning_rate": 8.145148791072536e-07, + "loss": 0.8562, + "step": 210210 + }, + { + "epoch": 16.290441318919758, + "grad_norm": 1.6314825512313005, + "learning_rate": 8.145536267823931e-07, + "loss": 0.8666, + "step": 210220 + }, + { + "epoch": 16.291216242396064, + "grad_norm": 1.5450325277022638, + "learning_rate": 8.145923744575327e-07, + "loss": 0.8712, + "step": 210230 + }, + { + "epoch": 16.29199116587237, + "grad_norm": 1.5467900078112313, + "learning_rate": 8.146311221326721e-07, + "loss": 0.8843, + "step": 210240 + }, + { + "epoch": 16.292766089348678, + "grad_norm": 1.666480512879758, + "learning_rate": 8.146698698078116e-07, + "loss": 0.8582, + "step": 210250 + }, + { + "epoch": 16.293541012824985, + "grad_norm": 1.5798189184860816, + "learning_rate": 8.14708617482951e-07, + "loss": 0.8483, + "step": 210260 + }, + { + "epoch": 16.29431593630129, + "grad_norm": 1.6334006970378085, + "learning_rate": 8.147473651580906e-07, + "loss": 0.8731, + "step": 210270 + }, + { + "epoch": 16.2950908597776, + "grad_norm": 1.599082275165122, + "learning_rate": 8.1478611283323e-07, + "loss": 0.8412, + "step": 210280 + }, + { + "epoch": 16.295865783253905, + "grad_norm": 1.574989504114007, + "learning_rate": 8.148248605083696e-07, + "loss": 0.8469, + "step": 210290 + }, + { + "epoch": 16.29664070673021, + "grad_norm": 1.6882562499294322, + "learning_rate": 8.14863608183509e-07, + "loss": 0.8496, + "step": 210300 + }, + { + "epoch": 16.297415630206515, + "grad_norm": 1.5981055925979868, + "learning_rate": 8.149023558586486e-07, + "loss": 0.8764, + "step": 210310 + }, + { + "epoch": 16.298190553682822, + "grad_norm": 1.5626464235496818, + "learning_rate": 8.14941103533788e-07, + "loss": 0.8607, + "step": 210320 + }, + { + "epoch": 16.29896547715913, + "grad_norm": 1.679851028029831, + "learning_rate": 8.149798512089276e-07, + "loss": 0.8616, + "step": 210330 + }, + { + "epoch": 16.299740400635436, + "grad_norm": 1.544279958712685, + "learning_rate": 8.15018598884067e-07, + "loss": 0.8897, + "step": 210340 + }, + { + "epoch": 16.300515324111743, + "grad_norm": 1.4997173969485829, + "learning_rate": 8.150573465592065e-07, + "loss": 0.8457, + "step": 210350 + }, + { + "epoch": 16.30129024758805, + "grad_norm": 1.7420034043028398, + "learning_rate": 8.150960942343459e-07, + "loss": 0.8487, + "step": 210360 + }, + { + "epoch": 16.302065171064356, + "grad_norm": 1.6410409271090192, + "learning_rate": 8.151348419094855e-07, + "loss": 0.8634, + "step": 210370 + }, + { + "epoch": 16.302840094540663, + "grad_norm": 1.5815053143261215, + "learning_rate": 8.15173589584625e-07, + "loss": 0.8465, + "step": 210380 + }, + { + "epoch": 16.30361501801697, + "grad_norm": 1.593915343516627, + "learning_rate": 8.152123372597645e-07, + "loss": 0.8745, + "step": 210390 + }, + { + "epoch": 16.304389941493277, + "grad_norm": 1.6078131255605652, + "learning_rate": 8.152510849349039e-07, + "loss": 0.853, + "step": 210400 + }, + { + "epoch": 16.305164864969584, + "grad_norm": 1.5995822100373345, + "learning_rate": 8.152898326100435e-07, + "loss": 0.8521, + "step": 210410 + }, + { + "epoch": 16.30593978844589, + "grad_norm": 1.610261518507977, + "learning_rate": 8.153285802851829e-07, + "loss": 0.8565, + "step": 210420 + }, + { + "epoch": 16.306714711922197, + "grad_norm": 1.6573440077417938, + "learning_rate": 8.153673279603225e-07, + "loss": 0.8518, + "step": 210430 + }, + { + "epoch": 16.307489635398504, + "grad_norm": 1.6948978626952191, + "learning_rate": 8.154060756354619e-07, + "loss": 0.8623, + "step": 210440 + }, + { + "epoch": 16.30826455887481, + "grad_norm": 1.6404371182569093, + "learning_rate": 8.154448233106015e-07, + "loss": 0.8538, + "step": 210450 + }, + { + "epoch": 16.309039482351118, + "grad_norm": 1.4485621930446173, + "learning_rate": 8.154835709857409e-07, + "loss": 0.8294, + "step": 210460 + }, + { + "epoch": 16.309814405827424, + "grad_norm": 1.6046391631653594, + "learning_rate": 8.155223186608804e-07, + "loss": 0.8749, + "step": 210470 + }, + { + "epoch": 16.31058932930373, + "grad_norm": 1.6565648722654223, + "learning_rate": 8.155610663360199e-07, + "loss": 0.8436, + "step": 210480 + }, + { + "epoch": 16.311364252780038, + "grad_norm": 1.5196393373619181, + "learning_rate": 8.155998140111594e-07, + "loss": 0.8556, + "step": 210490 + }, + { + "epoch": 16.312139176256345, + "grad_norm": 1.5621849014779925, + "learning_rate": 8.156385616862988e-07, + "loss": 0.8454, + "step": 210500 + }, + { + "epoch": 16.312139176256345, + "eval_loss": 0.8902855515480042, + "eval_runtime": 330.0572, + "eval_samples_per_second": 34.755, + "eval_steps_per_second": 8.689, + "step": 210500 + }, + { + "epoch": 16.31291409973265, + "grad_norm": 1.6519054389868517, + "learning_rate": 8.156773093614384e-07, + "loss": 0.8427, + "step": 210510 + }, + { + "epoch": 16.31368902320896, + "grad_norm": 1.6590442858115442, + "learning_rate": 8.157160570365778e-07, + "loss": 0.8567, + "step": 210520 + }, + { + "epoch": 16.314463946685265, + "grad_norm": 1.579027043839219, + "learning_rate": 8.157548047117174e-07, + "loss": 0.8607, + "step": 210530 + }, + { + "epoch": 16.315238870161572, + "grad_norm": 1.5136551227823885, + "learning_rate": 8.157935523868568e-07, + "loss": 0.8556, + "step": 210540 + }, + { + "epoch": 16.31601379363788, + "grad_norm": 1.6038821931094143, + "learning_rate": 8.158323000619964e-07, + "loss": 0.8511, + "step": 210550 + }, + { + "epoch": 16.316788717114186, + "grad_norm": 1.7675006643652595, + "learning_rate": 8.158710477371358e-07, + "loss": 0.8579, + "step": 210560 + }, + { + "epoch": 16.317563640590492, + "grad_norm": 1.6026538127020469, + "learning_rate": 8.159097954122753e-07, + "loss": 0.8595, + "step": 210570 + }, + { + "epoch": 16.3183385640668, + "grad_norm": 1.5825993699178367, + "learning_rate": 8.159485430874148e-07, + "loss": 0.845, + "step": 210580 + }, + { + "epoch": 16.319113487543106, + "grad_norm": 1.6230557361749498, + "learning_rate": 8.159872907625544e-07, + "loss": 0.8666, + "step": 210590 + }, + { + "epoch": 16.319888411019413, + "grad_norm": 1.5971716543458936, + "learning_rate": 8.160260384376938e-07, + "loss": 0.8568, + "step": 210600 + }, + { + "epoch": 16.32066333449572, + "grad_norm": 1.7525802067556937, + "learning_rate": 8.160647861128333e-07, + "loss": 0.8634, + "step": 210610 + }, + { + "epoch": 16.321438257972027, + "grad_norm": 1.555544987332355, + "learning_rate": 8.161035337879727e-07, + "loss": 0.86, + "step": 210620 + }, + { + "epoch": 16.322213181448333, + "grad_norm": 1.5857208702129009, + "learning_rate": 8.161422814631123e-07, + "loss": 0.8486, + "step": 210630 + }, + { + "epoch": 16.32298810492464, + "grad_norm": 1.6178406145671393, + "learning_rate": 8.161810291382517e-07, + "loss": 0.8465, + "step": 210640 + }, + { + "epoch": 16.323763028400947, + "grad_norm": 1.6407380913028147, + "learning_rate": 8.162197768133913e-07, + "loss": 0.8401, + "step": 210650 + }, + { + "epoch": 16.324537951877254, + "grad_norm": 1.6262132769401585, + "learning_rate": 8.162585244885307e-07, + "loss": 0.8547, + "step": 210660 + }, + { + "epoch": 16.32531287535356, + "grad_norm": 1.5712342252302194, + "learning_rate": 8.162972721636702e-07, + "loss": 0.8607, + "step": 210670 + }, + { + "epoch": 16.326087798829864, + "grad_norm": 1.5979815335987313, + "learning_rate": 8.163360198388097e-07, + "loss": 0.8557, + "step": 210680 + }, + { + "epoch": 16.32686272230617, + "grad_norm": 1.584242884431485, + "learning_rate": 8.163747675139493e-07, + "loss": 0.8468, + "step": 210690 + }, + { + "epoch": 16.327637645782477, + "grad_norm": 1.5379531438585539, + "learning_rate": 8.164135151890887e-07, + "loss": 0.8472, + "step": 210700 + }, + { + "epoch": 16.328412569258784, + "grad_norm": 1.6015212049140712, + "learning_rate": 8.164522628642282e-07, + "loss": 0.8745, + "step": 210710 + }, + { + "epoch": 16.32918749273509, + "grad_norm": 1.557983553493898, + "learning_rate": 8.164910105393676e-07, + "loss": 0.8655, + "step": 210720 + }, + { + "epoch": 16.329962416211398, + "grad_norm": 1.5886706589471615, + "learning_rate": 8.165297582145073e-07, + "loss": 0.8506, + "step": 210730 + }, + { + "epoch": 16.330737339687705, + "grad_norm": 1.6912310268276245, + "learning_rate": 8.165685058896467e-07, + "loss": 0.8429, + "step": 210740 + }, + { + "epoch": 16.33151226316401, + "grad_norm": 1.5244906906118914, + "learning_rate": 8.166072535647862e-07, + "loss": 0.8495, + "step": 210750 + }, + { + "epoch": 16.33228718664032, + "grad_norm": 1.6104939792673645, + "learning_rate": 8.166460012399256e-07, + "loss": 0.8579, + "step": 210760 + }, + { + "epoch": 16.333062110116625, + "grad_norm": 1.5599139664309123, + "learning_rate": 8.166847489150651e-07, + "loss": 0.8503, + "step": 210770 + }, + { + "epoch": 16.333837033592932, + "grad_norm": 1.626679779187578, + "learning_rate": 8.167234965902046e-07, + "loss": 0.847, + "step": 210780 + }, + { + "epoch": 16.33461195706924, + "grad_norm": 1.5588412411915262, + "learning_rate": 8.167622442653442e-07, + "loss": 0.8446, + "step": 210790 + }, + { + "epoch": 16.335386880545546, + "grad_norm": 1.6220955312849645, + "learning_rate": 8.168009919404836e-07, + "loss": 0.8497, + "step": 210800 + }, + { + "epoch": 16.336161804021852, + "grad_norm": 1.5019407253004582, + "learning_rate": 8.168397396156231e-07, + "loss": 0.8379, + "step": 210810 + }, + { + "epoch": 16.33693672749816, + "grad_norm": 1.5882096001540482, + "learning_rate": 8.168784872907625e-07, + "loss": 0.8554, + "step": 210820 + }, + { + "epoch": 16.337711650974466, + "grad_norm": 1.5939402078178535, + "learning_rate": 8.169172349659022e-07, + "loss": 0.872, + "step": 210830 + }, + { + "epoch": 16.338486574450773, + "grad_norm": 1.5861712715719394, + "learning_rate": 8.169559826410416e-07, + "loss": 0.8608, + "step": 210840 + }, + { + "epoch": 16.33926149792708, + "grad_norm": 1.5080101758944278, + "learning_rate": 8.169947303161811e-07, + "loss": 0.8547, + "step": 210850 + }, + { + "epoch": 16.340036421403386, + "grad_norm": 1.5389426533764516, + "learning_rate": 8.170334779913205e-07, + "loss": 0.8597, + "step": 210860 + }, + { + "epoch": 16.340811344879693, + "grad_norm": 1.6446343724232986, + "learning_rate": 8.170722256664602e-07, + "loss": 0.8593, + "step": 210870 + }, + { + "epoch": 16.341586268356, + "grad_norm": 1.5965158221247007, + "learning_rate": 8.171109733415996e-07, + "loss": 0.8599, + "step": 210880 + }, + { + "epoch": 16.342361191832307, + "grad_norm": 1.7129644473549024, + "learning_rate": 8.171497210167391e-07, + "loss": 0.8692, + "step": 210890 + }, + { + "epoch": 16.343136115308614, + "grad_norm": 1.6334426191253564, + "learning_rate": 8.171884686918785e-07, + "loss": 0.8551, + "step": 210900 + }, + { + "epoch": 16.34391103878492, + "grad_norm": 1.6148160410685672, + "learning_rate": 8.17227216367018e-07, + "loss": 0.8445, + "step": 210910 + }, + { + "epoch": 16.344685962261227, + "grad_norm": 1.5969022034921443, + "learning_rate": 8.172659640421574e-07, + "loss": 0.8415, + "step": 210920 + }, + { + "epoch": 16.345460885737534, + "grad_norm": 1.6792068069331665, + "learning_rate": 8.173047117172971e-07, + "loss": 0.8732, + "step": 210930 + }, + { + "epoch": 16.34623580921384, + "grad_norm": 1.6198338673384811, + "learning_rate": 8.173434593924365e-07, + "loss": 0.8531, + "step": 210940 + }, + { + "epoch": 16.347010732690148, + "grad_norm": 1.6268793287223584, + "learning_rate": 8.17382207067576e-07, + "loss": 0.8551, + "step": 210950 + }, + { + "epoch": 16.347785656166455, + "grad_norm": 1.5440035432618455, + "learning_rate": 8.174209547427154e-07, + "loss": 0.8581, + "step": 210960 + }, + { + "epoch": 16.34856057964276, + "grad_norm": 1.5976731008778768, + "learning_rate": 8.174597024178551e-07, + "loss": 0.837, + "step": 210970 + }, + { + "epoch": 16.349335503119068, + "grad_norm": 1.6054998466954218, + "learning_rate": 8.174984500929945e-07, + "loss": 0.8717, + "step": 210980 + }, + { + "epoch": 16.350110426595375, + "grad_norm": 1.5523040237504853, + "learning_rate": 8.17537197768134e-07, + "loss": 0.8577, + "step": 210990 + }, + { + "epoch": 16.35088535007168, + "grad_norm": 1.5901071148391888, + "learning_rate": 8.175759454432734e-07, + "loss": 0.8528, + "step": 211000 + }, + { + "epoch": 16.35088535007168, + "eval_loss": 0.8904452919960022, + "eval_runtime": 331.6057, + "eval_samples_per_second": 34.592, + "eval_steps_per_second": 8.649, + "step": 211000 + }, + { + "epoch": 16.35166027354799, + "grad_norm": 1.6234232916828188, + "learning_rate": 8.17614693118413e-07, + "loss": 0.8705, + "step": 211010 + }, + { + "epoch": 16.352435197024295, + "grad_norm": 1.666353082815923, + "learning_rate": 8.176534407935525e-07, + "loss": 0.8631, + "step": 211020 + }, + { + "epoch": 16.353210120500602, + "grad_norm": 1.5646256927842501, + "learning_rate": 8.17692188468692e-07, + "loss": 0.8483, + "step": 211030 + }, + { + "epoch": 16.35398504397691, + "grad_norm": 1.5362263496014612, + "learning_rate": 8.177309361438314e-07, + "loss": 0.8588, + "step": 211040 + }, + { + "epoch": 16.354759967453212, + "grad_norm": 1.5825159184670319, + "learning_rate": 8.177696838189709e-07, + "loss": 0.8625, + "step": 211050 + }, + { + "epoch": 16.35553489092952, + "grad_norm": 1.6544891915855533, + "learning_rate": 8.178084314941103e-07, + "loss": 0.8412, + "step": 211060 + }, + { + "epoch": 16.356309814405826, + "grad_norm": 1.5453138530698387, + "learning_rate": 8.1784717916925e-07, + "loss": 0.8453, + "step": 211070 + }, + { + "epoch": 16.357084737882133, + "grad_norm": 1.648272776657855, + "learning_rate": 8.178859268443894e-07, + "loss": 0.8582, + "step": 211080 + }, + { + "epoch": 16.35785966135844, + "grad_norm": 1.6496338786423728, + "learning_rate": 8.179246745195289e-07, + "loss": 0.8576, + "step": 211090 + }, + { + "epoch": 16.358634584834746, + "grad_norm": 1.7534059646000042, + "learning_rate": 8.179634221946683e-07, + "loss": 0.8503, + "step": 211100 + }, + { + "epoch": 16.359409508311053, + "grad_norm": 1.5204102646242652, + "learning_rate": 8.180021698698079e-07, + "loss": 0.8584, + "step": 211110 + }, + { + "epoch": 16.36018443178736, + "grad_norm": 1.6992341092034449, + "learning_rate": 8.180409175449474e-07, + "loss": 0.8687, + "step": 211120 + }, + { + "epoch": 16.360959355263667, + "grad_norm": 1.7287406471552167, + "learning_rate": 8.180796652200869e-07, + "loss": 0.8556, + "step": 211130 + }, + { + "epoch": 16.361734278739974, + "grad_norm": 1.690876077565057, + "learning_rate": 8.181184128952263e-07, + "loss": 0.845, + "step": 211140 + }, + { + "epoch": 16.36250920221628, + "grad_norm": 1.5679625130788029, + "learning_rate": 8.181571605703659e-07, + "loss": 0.8483, + "step": 211150 + }, + { + "epoch": 16.363284125692587, + "grad_norm": 1.5150339092481226, + "learning_rate": 8.181959082455053e-07, + "loss": 0.8473, + "step": 211160 + }, + { + "epoch": 16.364059049168894, + "grad_norm": 1.5411585467824687, + "learning_rate": 8.182346559206449e-07, + "loss": 0.8488, + "step": 211170 + }, + { + "epoch": 16.3648339726452, + "grad_norm": 1.5330980167251385, + "learning_rate": 8.182734035957843e-07, + "loss": 0.8386, + "step": 211180 + }, + { + "epoch": 16.365608896121508, + "grad_norm": 1.6776217648233116, + "learning_rate": 8.183121512709238e-07, + "loss": 0.8571, + "step": 211190 + }, + { + "epoch": 16.366383819597814, + "grad_norm": 1.6786747595655893, + "learning_rate": 8.183508989460632e-07, + "loss": 0.8503, + "step": 211200 + }, + { + "epoch": 16.36715874307412, + "grad_norm": 1.555317573439075, + "learning_rate": 8.183896466212028e-07, + "loss": 0.8534, + "step": 211210 + }, + { + "epoch": 16.367933666550428, + "grad_norm": 1.6064525374234275, + "learning_rate": 8.184283942963423e-07, + "loss": 0.8528, + "step": 211220 + }, + { + "epoch": 16.368708590026735, + "grad_norm": 1.618059067018588, + "learning_rate": 8.184671419714818e-07, + "loss": 0.8571, + "step": 211230 + }, + { + "epoch": 16.36948351350304, + "grad_norm": 1.6593082525740197, + "learning_rate": 8.185058896466212e-07, + "loss": 0.8411, + "step": 211240 + }, + { + "epoch": 16.37025843697935, + "grad_norm": 1.6562388750287274, + "learning_rate": 8.185446373217608e-07, + "loss": 0.8348, + "step": 211250 + }, + { + "epoch": 16.371033360455655, + "grad_norm": 1.621463443047196, + "learning_rate": 8.185833849969002e-07, + "loss": 0.8534, + "step": 211260 + }, + { + "epoch": 16.371808283931962, + "grad_norm": 1.5671053734001805, + "learning_rate": 8.186221326720398e-07, + "loss": 0.8631, + "step": 211270 + }, + { + "epoch": 16.37258320740827, + "grad_norm": 1.6922740144639752, + "learning_rate": 8.186608803471792e-07, + "loss": 0.8487, + "step": 211280 + }, + { + "epoch": 16.373358130884576, + "grad_norm": 1.630401092018667, + "learning_rate": 8.186996280223187e-07, + "loss": 0.8612, + "step": 211290 + }, + { + "epoch": 16.374133054360883, + "grad_norm": 1.6888542805379751, + "learning_rate": 8.187383756974582e-07, + "loss": 0.8472, + "step": 211300 + }, + { + "epoch": 16.37490797783719, + "grad_norm": 1.5520049817118287, + "learning_rate": 8.187771233725977e-07, + "loss": 0.8751, + "step": 211310 + }, + { + "epoch": 16.375682901313496, + "grad_norm": 1.6399008966256927, + "learning_rate": 8.188158710477372e-07, + "loss": 0.8445, + "step": 211320 + }, + { + "epoch": 16.376457824789803, + "grad_norm": 1.5913722253476676, + "learning_rate": 8.188546187228767e-07, + "loss": 0.8432, + "step": 211330 + }, + { + "epoch": 16.37723274826611, + "grad_norm": 1.5408475188724908, + "learning_rate": 8.188933663980161e-07, + "loss": 0.8497, + "step": 211340 + }, + { + "epoch": 16.378007671742417, + "grad_norm": 1.8373493537206314, + "learning_rate": 8.189321140731557e-07, + "loss": 0.853, + "step": 211350 + }, + { + "epoch": 16.378782595218723, + "grad_norm": 1.6177755884521394, + "learning_rate": 8.189708617482951e-07, + "loss": 0.8596, + "step": 211360 + }, + { + "epoch": 16.37955751869503, + "grad_norm": 1.6590178129119728, + "learning_rate": 8.190096094234347e-07, + "loss": 0.8656, + "step": 211370 + }, + { + "epoch": 16.380332442171337, + "grad_norm": 1.5514310298644647, + "learning_rate": 8.190483570985741e-07, + "loss": 0.8213, + "step": 211380 + }, + { + "epoch": 16.381107365647644, + "grad_norm": 1.7781974847115523, + "learning_rate": 8.190871047737137e-07, + "loss": 0.8487, + "step": 211390 + }, + { + "epoch": 16.38188228912395, + "grad_norm": 1.6495888278609236, + "learning_rate": 8.191258524488531e-07, + "loss": 0.8599, + "step": 211400 + }, + { + "epoch": 16.382657212600257, + "grad_norm": 1.591711055011741, + "learning_rate": 8.191646001239926e-07, + "loss": 0.8554, + "step": 211410 + }, + { + "epoch": 16.383432136076564, + "grad_norm": 1.7801050273719934, + "learning_rate": 8.192033477991321e-07, + "loss": 0.866, + "step": 211420 + }, + { + "epoch": 16.384207059552867, + "grad_norm": 1.6092478281849767, + "learning_rate": 8.192420954742716e-07, + "loss": 0.8482, + "step": 211430 + }, + { + "epoch": 16.384981983029174, + "grad_norm": 1.595134232728265, + "learning_rate": 8.192808431494111e-07, + "loss": 0.872, + "step": 211440 + }, + { + "epoch": 16.38575690650548, + "grad_norm": 1.5939891115644889, + "learning_rate": 8.193195908245506e-07, + "loss": 0.8628, + "step": 211450 + }, + { + "epoch": 16.386531829981788, + "grad_norm": 1.651894237983756, + "learning_rate": 8.1935833849969e-07, + "loss": 0.8586, + "step": 211460 + }, + { + "epoch": 16.387306753458095, + "grad_norm": 1.7057984683888574, + "learning_rate": 8.193970861748296e-07, + "loss": 0.8531, + "step": 211470 + }, + { + "epoch": 16.3880816769344, + "grad_norm": 1.5048293818307175, + "learning_rate": 8.19435833849969e-07, + "loss": 0.849, + "step": 211480 + }, + { + "epoch": 16.38885660041071, + "grad_norm": 1.5944035477141203, + "learning_rate": 8.194745815251086e-07, + "loss": 0.8651, + "step": 211490 + }, + { + "epoch": 16.389631523887015, + "grad_norm": 1.6051433083582956, + "learning_rate": 8.19513329200248e-07, + "loss": 0.8562, + "step": 211500 + }, + { + "epoch": 16.389631523887015, + "eval_loss": 0.8899914622306824, + "eval_runtime": 328.8973, + "eval_samples_per_second": 34.877, + "eval_steps_per_second": 8.72, + "step": 211500 + }, + { + "epoch": 16.390406447363322, + "grad_norm": 1.7411713046349133, + "learning_rate": 8.195520768753876e-07, + "loss": 0.8632, + "step": 211510 + }, + { + "epoch": 16.39118137083963, + "grad_norm": 1.598805531665068, + "learning_rate": 8.19590824550527e-07, + "loss": 0.8559, + "step": 211520 + }, + { + "epoch": 16.391956294315936, + "grad_norm": 1.656661576639178, + "learning_rate": 8.196295722256666e-07, + "loss": 0.8649, + "step": 211530 + }, + { + "epoch": 16.392731217792242, + "grad_norm": 1.715416564305104, + "learning_rate": 8.19668319900806e-07, + "loss": 0.8573, + "step": 211540 + }, + { + "epoch": 16.39350614126855, + "grad_norm": 1.5667634643792179, + "learning_rate": 8.197070675759455e-07, + "loss": 0.856, + "step": 211550 + }, + { + "epoch": 16.394281064744856, + "grad_norm": 1.5675787452976038, + "learning_rate": 8.19745815251085e-07, + "loss": 0.8589, + "step": 211560 + }, + { + "epoch": 16.395055988221163, + "grad_norm": 1.6106556524180586, + "learning_rate": 8.197845629262245e-07, + "loss": 0.857, + "step": 211570 + }, + { + "epoch": 16.39583091169747, + "grad_norm": 1.7317191731004546, + "learning_rate": 8.19823310601364e-07, + "loss": 0.8625, + "step": 211580 + }, + { + "epoch": 16.396605835173776, + "grad_norm": 1.4980777023236787, + "learning_rate": 8.198620582765035e-07, + "loss": 0.8424, + "step": 211590 + }, + { + "epoch": 16.397380758650083, + "grad_norm": 1.5232437579082805, + "learning_rate": 8.199008059516429e-07, + "loss": 0.8388, + "step": 211600 + }, + { + "epoch": 16.39815568212639, + "grad_norm": 1.563047150813596, + "learning_rate": 8.199395536267825e-07, + "loss": 0.8737, + "step": 211610 + }, + { + "epoch": 16.398930605602697, + "grad_norm": 1.6126633075170735, + "learning_rate": 8.199783013019219e-07, + "loss": 0.8587, + "step": 211620 + }, + { + "epoch": 16.399705529079004, + "grad_norm": 1.5425342088726828, + "learning_rate": 8.200170489770615e-07, + "loss": 0.844, + "step": 211630 + }, + { + "epoch": 16.40048045255531, + "grad_norm": 1.5595590282919116, + "learning_rate": 8.200557966522009e-07, + "loss": 0.8625, + "step": 211640 + }, + { + "epoch": 16.401255376031617, + "grad_norm": 1.535284931026745, + "learning_rate": 8.200945443273404e-07, + "loss": 0.8598, + "step": 211650 + }, + { + "epoch": 16.402030299507924, + "grad_norm": 1.5974098191739525, + "learning_rate": 8.201332920024799e-07, + "loss": 0.8339, + "step": 211660 + }, + { + "epoch": 16.40280522298423, + "grad_norm": 1.5739883463768476, + "learning_rate": 8.201720396776195e-07, + "loss": 0.8561, + "step": 211670 + }, + { + "epoch": 16.403580146460538, + "grad_norm": 1.5274988834137118, + "learning_rate": 8.202107873527589e-07, + "loss": 0.8502, + "step": 211680 + }, + { + "epoch": 16.404355069936845, + "grad_norm": 1.6084048205624752, + "learning_rate": 8.202495350278984e-07, + "loss": 0.8397, + "step": 211690 + }, + { + "epoch": 16.40512999341315, + "grad_norm": 1.5648430986975461, + "learning_rate": 8.202882827030378e-07, + "loss": 0.8511, + "step": 211700 + }, + { + "epoch": 16.405904916889458, + "grad_norm": 1.5881163907879365, + "learning_rate": 8.203270303781774e-07, + "loss": 0.8524, + "step": 211710 + }, + { + "epoch": 16.406679840365765, + "grad_norm": 1.5240355348438992, + "learning_rate": 8.203657780533169e-07, + "loss": 0.8619, + "step": 211720 + }, + { + "epoch": 16.407454763842072, + "grad_norm": 1.601258371432715, + "learning_rate": 8.204045257284564e-07, + "loss": 0.8427, + "step": 211730 + }, + { + "epoch": 16.40822968731838, + "grad_norm": 1.627047286975239, + "learning_rate": 8.204432734035958e-07, + "loss": 0.8652, + "step": 211740 + }, + { + "epoch": 16.409004610794685, + "grad_norm": 1.5517361394532314, + "learning_rate": 8.204820210787353e-07, + "loss": 0.8634, + "step": 211750 + }, + { + "epoch": 16.409779534270992, + "grad_norm": 1.6670547491221994, + "learning_rate": 8.205207687538748e-07, + "loss": 0.8553, + "step": 211760 + }, + { + "epoch": 16.4105544577473, + "grad_norm": 1.5843368904850106, + "learning_rate": 8.205595164290144e-07, + "loss": 0.8633, + "step": 211770 + }, + { + "epoch": 16.411329381223606, + "grad_norm": 1.7243825284540633, + "learning_rate": 8.205982641041538e-07, + "loss": 0.8564, + "step": 211780 + }, + { + "epoch": 16.41210430469991, + "grad_norm": 1.5782261957974921, + "learning_rate": 8.206370117792933e-07, + "loss": 0.8492, + "step": 211790 + }, + { + "epoch": 16.412879228176216, + "grad_norm": 1.7364486607722243, + "learning_rate": 8.206757594544327e-07, + "loss": 0.8495, + "step": 211800 + }, + { + "epoch": 16.413654151652523, + "grad_norm": 1.6093071188916368, + "learning_rate": 8.207145071295724e-07, + "loss": 0.8516, + "step": 211810 + }, + { + "epoch": 16.41442907512883, + "grad_norm": 1.640532050996729, + "learning_rate": 8.207532548047118e-07, + "loss": 0.8663, + "step": 211820 + }, + { + "epoch": 16.415203998605136, + "grad_norm": 1.5238749625104735, + "learning_rate": 8.207920024798513e-07, + "loss": 0.8682, + "step": 211830 + }, + { + "epoch": 16.415978922081443, + "grad_norm": 1.5025255597110119, + "learning_rate": 8.208307501549907e-07, + "loss": 0.8537, + "step": 211840 + }, + { + "epoch": 16.41675384555775, + "grad_norm": 1.684692893233586, + "learning_rate": 8.208694978301302e-07, + "loss": 0.8534, + "step": 211850 + }, + { + "epoch": 16.417528769034057, + "grad_norm": 1.6662507480892357, + "learning_rate": 8.209082455052697e-07, + "loss": 0.8665, + "step": 211860 + }, + { + "epoch": 16.418303692510364, + "grad_norm": 1.4366071958155582, + "learning_rate": 8.209469931804093e-07, + "loss": 0.8524, + "step": 211870 + }, + { + "epoch": 16.41907861598667, + "grad_norm": 1.7127578639796297, + "learning_rate": 8.209857408555487e-07, + "loss": 0.8539, + "step": 211880 + }, + { + "epoch": 16.419853539462977, + "grad_norm": 1.588092217147379, + "learning_rate": 8.210244885306882e-07, + "loss": 0.8624, + "step": 211890 + }, + { + "epoch": 16.420628462939284, + "grad_norm": 1.631034970701614, + "learning_rate": 8.210632362058276e-07, + "loss": 0.853, + "step": 211900 + }, + { + "epoch": 16.42140338641559, + "grad_norm": 1.6400450668734399, + "learning_rate": 8.211019838809673e-07, + "loss": 0.8736, + "step": 211910 + }, + { + "epoch": 16.422178309891898, + "grad_norm": 1.7186307401226701, + "learning_rate": 8.211407315561067e-07, + "loss": 0.8656, + "step": 211920 + }, + { + "epoch": 16.422953233368204, + "grad_norm": 1.6358483748676138, + "learning_rate": 8.211794792312462e-07, + "loss": 0.8421, + "step": 211930 + }, + { + "epoch": 16.42372815684451, + "grad_norm": 1.6154386535288647, + "learning_rate": 8.212182269063856e-07, + "loss": 0.8462, + "step": 211940 + }, + { + "epoch": 16.424503080320818, + "grad_norm": 1.6185924347139427, + "learning_rate": 8.212569745815252e-07, + "loss": 0.8506, + "step": 211950 + }, + { + "epoch": 16.425278003797125, + "grad_norm": 1.5220302869553426, + "learning_rate": 8.212957222566647e-07, + "loss": 0.8581, + "step": 211960 + }, + { + "epoch": 16.42605292727343, + "grad_norm": 1.5856520600567194, + "learning_rate": 8.213344699318042e-07, + "loss": 0.8467, + "step": 211970 + }, + { + "epoch": 16.42682785074974, + "grad_norm": 1.6256395875880738, + "learning_rate": 8.213732176069436e-07, + "loss": 0.8631, + "step": 211980 + }, + { + "epoch": 16.427602774226045, + "grad_norm": 1.6074748575608009, + "learning_rate": 8.214119652820831e-07, + "loss": 0.8592, + "step": 211990 + }, + { + "epoch": 16.428377697702352, + "grad_norm": 1.5780080370529028, + "learning_rate": 8.214507129572225e-07, + "loss": 0.8438, + "step": 212000 + }, + { + "epoch": 16.428377697702352, + "eval_loss": 0.8900773525238037, + "eval_runtime": 330.2153, + "eval_samples_per_second": 34.738, + "eval_steps_per_second": 8.685, + "step": 212000 + }, + { + "epoch": 16.42915262117866, + "grad_norm": 1.6343408395546075, + "learning_rate": 8.214894606323622e-07, + "loss": 0.8546, + "step": 212010 + }, + { + "epoch": 16.429927544654966, + "grad_norm": 1.5537791719892204, + "learning_rate": 8.215282083075016e-07, + "loss": 0.8674, + "step": 212020 + }, + { + "epoch": 16.430702468131273, + "grad_norm": 1.5633120281832862, + "learning_rate": 8.215669559826411e-07, + "loss": 0.8643, + "step": 212030 + }, + { + "epoch": 16.43147739160758, + "grad_norm": 1.6682297525537777, + "learning_rate": 8.216057036577805e-07, + "loss": 0.8584, + "step": 212040 + }, + { + "epoch": 16.432252315083886, + "grad_norm": 1.451212803446218, + "learning_rate": 8.216444513329201e-07, + "loss": 0.8667, + "step": 212050 + }, + { + "epoch": 16.433027238560193, + "grad_norm": 1.664224318120752, + "learning_rate": 8.216831990080596e-07, + "loss": 0.8624, + "step": 212060 + }, + { + "epoch": 16.4338021620365, + "grad_norm": 1.6316168961980253, + "learning_rate": 8.217219466831991e-07, + "loss": 0.8608, + "step": 212070 + }, + { + "epoch": 16.434577085512807, + "grad_norm": 1.7618405843649902, + "learning_rate": 8.217606943583385e-07, + "loss": 0.8483, + "step": 212080 + }, + { + "epoch": 16.435352008989113, + "grad_norm": 1.5431187126161827, + "learning_rate": 8.217994420334781e-07, + "loss": 0.837, + "step": 212090 + }, + { + "epoch": 16.43612693246542, + "grad_norm": 1.6158837638962023, + "learning_rate": 8.218381897086175e-07, + "loss": 0.8608, + "step": 212100 + }, + { + "epoch": 16.436901855941727, + "grad_norm": 1.493709967989661, + "learning_rate": 8.218769373837571e-07, + "loss": 0.84, + "step": 212110 + }, + { + "epoch": 16.437676779418034, + "grad_norm": 1.614538045058741, + "learning_rate": 8.219156850588965e-07, + "loss": 0.866, + "step": 212120 + }, + { + "epoch": 16.43845170289434, + "grad_norm": 1.617553023660267, + "learning_rate": 8.21954432734036e-07, + "loss": 0.8705, + "step": 212130 + }, + { + "epoch": 16.439226626370647, + "grad_norm": 1.49168720821986, + "learning_rate": 8.219931804091754e-07, + "loss": 0.8478, + "step": 212140 + }, + { + "epoch": 16.440001549846954, + "grad_norm": 1.615593815308653, + "learning_rate": 8.22031928084315e-07, + "loss": 0.8514, + "step": 212150 + }, + { + "epoch": 16.44077647332326, + "grad_norm": 1.572191020773806, + "learning_rate": 8.220706757594545e-07, + "loss": 0.8624, + "step": 212160 + }, + { + "epoch": 16.441551396799564, + "grad_norm": 1.634517791136214, + "learning_rate": 8.22109423434594e-07, + "loss": 0.8555, + "step": 212170 + }, + { + "epoch": 16.44232632027587, + "grad_norm": 1.600021645466131, + "learning_rate": 8.221481711097334e-07, + "loss": 0.8367, + "step": 212180 + }, + { + "epoch": 16.443101243752178, + "grad_norm": 1.6141989843270645, + "learning_rate": 8.22186918784873e-07, + "loss": 0.8535, + "step": 212190 + }, + { + "epoch": 16.443876167228485, + "grad_norm": 1.5150685938523474, + "learning_rate": 8.222256664600124e-07, + "loss": 0.8477, + "step": 212200 + }, + { + "epoch": 16.44465109070479, + "grad_norm": 1.6120068884778758, + "learning_rate": 8.22264414135152e-07, + "loss": 0.8556, + "step": 212210 + }, + { + "epoch": 16.4454260141811, + "grad_norm": 1.5510388995650044, + "learning_rate": 8.223031618102914e-07, + "loss": 0.8441, + "step": 212220 + }, + { + "epoch": 16.446200937657405, + "grad_norm": 1.7708359189006322, + "learning_rate": 8.22341909485431e-07, + "loss": 0.8461, + "step": 212230 + }, + { + "epoch": 16.446975861133712, + "grad_norm": 1.5254066775144002, + "learning_rate": 8.223806571605704e-07, + "loss": 0.8589, + "step": 212240 + }, + { + "epoch": 16.44775078461002, + "grad_norm": 1.6446587188077404, + "learning_rate": 8.2241940483571e-07, + "loss": 0.8393, + "step": 212250 + }, + { + "epoch": 16.448525708086326, + "grad_norm": 1.5837971246141813, + "learning_rate": 8.224581525108494e-07, + "loss": 0.8708, + "step": 212260 + }, + { + "epoch": 16.449300631562632, + "grad_norm": 1.5667784246906886, + "learning_rate": 8.224969001859889e-07, + "loss": 0.8497, + "step": 212270 + }, + { + "epoch": 16.45007555503894, + "grad_norm": 1.6523954007759647, + "learning_rate": 8.225356478611283e-07, + "loss": 0.8543, + "step": 212280 + }, + { + "epoch": 16.450850478515246, + "grad_norm": 1.6599101689924767, + "learning_rate": 8.225743955362679e-07, + "loss": 0.8742, + "step": 212290 + }, + { + "epoch": 16.451625401991553, + "grad_norm": 1.560631060100635, + "learning_rate": 8.226131432114074e-07, + "loss": 0.856, + "step": 212300 + }, + { + "epoch": 16.45240032546786, + "grad_norm": 1.5509339015016599, + "learning_rate": 8.226518908865469e-07, + "loss": 0.8509, + "step": 212310 + }, + { + "epoch": 16.453175248944166, + "grad_norm": 1.6322848746585639, + "learning_rate": 8.226906385616863e-07, + "loss": 0.8373, + "step": 212320 + }, + { + "epoch": 16.453950172420473, + "grad_norm": 1.6713039070845672, + "learning_rate": 8.227293862368259e-07, + "loss": 0.8511, + "step": 212330 + }, + { + "epoch": 16.45472509589678, + "grad_norm": 1.6887923115064079, + "learning_rate": 8.227681339119653e-07, + "loss": 0.8621, + "step": 212340 + }, + { + "epoch": 16.455500019373087, + "grad_norm": 1.518250367273062, + "learning_rate": 8.228068815871049e-07, + "loss": 0.8542, + "step": 212350 + }, + { + "epoch": 16.456274942849394, + "grad_norm": 1.5583710305643346, + "learning_rate": 8.228456292622443e-07, + "loss": 0.8384, + "step": 212360 + }, + { + "epoch": 16.4570498663257, + "grad_norm": 1.5658689234500562, + "learning_rate": 8.228843769373839e-07, + "loss": 0.85, + "step": 212370 + }, + { + "epoch": 16.457824789802007, + "grad_norm": 1.6683435972099991, + "learning_rate": 8.229231246125233e-07, + "loss": 0.8689, + "step": 212380 + }, + { + "epoch": 16.458599713278314, + "grad_norm": 1.5519327059856194, + "learning_rate": 8.229618722876628e-07, + "loss": 0.8337, + "step": 212390 + }, + { + "epoch": 16.45937463675462, + "grad_norm": 1.6019643473094625, + "learning_rate": 8.230006199628023e-07, + "loss": 0.8676, + "step": 212400 + }, + { + "epoch": 16.460149560230928, + "grad_norm": 1.569116139142149, + "learning_rate": 8.230393676379418e-07, + "loss": 0.8376, + "step": 212410 + }, + { + "epoch": 16.460924483707235, + "grad_norm": 1.6391956739693383, + "learning_rate": 8.230781153130812e-07, + "loss": 0.8919, + "step": 212420 + }, + { + "epoch": 16.46169940718354, + "grad_norm": 1.693340910874654, + "learning_rate": 8.231168629882208e-07, + "loss": 0.8583, + "step": 212430 + }, + { + "epoch": 16.462474330659848, + "grad_norm": 1.595751874325111, + "learning_rate": 8.231556106633602e-07, + "loss": 0.8437, + "step": 212440 + }, + { + "epoch": 16.463249254136155, + "grad_norm": 1.6515383075744936, + "learning_rate": 8.231943583384998e-07, + "loss": 0.8571, + "step": 212450 + }, + { + "epoch": 16.464024177612462, + "grad_norm": 1.6097201092272897, + "learning_rate": 8.232331060136392e-07, + "loss": 0.847, + "step": 212460 + }, + { + "epoch": 16.46479910108877, + "grad_norm": 1.5546765700992833, + "learning_rate": 8.232718536887788e-07, + "loss": 0.8356, + "step": 212470 + }, + { + "epoch": 16.465574024565075, + "grad_norm": 1.5812404593557587, + "learning_rate": 8.233106013639182e-07, + "loss": 0.8624, + "step": 212480 + }, + { + "epoch": 16.466348948041382, + "grad_norm": 1.5831840741514924, + "learning_rate": 8.233493490390577e-07, + "loss": 0.8364, + "step": 212490 + }, + { + "epoch": 16.46712387151769, + "grad_norm": 1.6535846541872194, + "learning_rate": 8.233880967141972e-07, + "loss": 0.8582, + "step": 212500 + }, + { + "epoch": 16.46712387151769, + "eval_loss": 0.8899034261703491, + "eval_runtime": 331.1976, + "eval_samples_per_second": 34.635, + "eval_steps_per_second": 8.659, + "step": 212500 + }, + { + "epoch": 16.467898794993996, + "grad_norm": 1.5402034999077159, + "learning_rate": 8.234268443893368e-07, + "loss": 0.8513, + "step": 212510 + }, + { + "epoch": 16.468673718470303, + "grad_norm": 1.58559599035687, + "learning_rate": 8.234655920644762e-07, + "loss": 0.8497, + "step": 212520 + }, + { + "epoch": 16.46944864194661, + "grad_norm": 2.1988824473871458, + "learning_rate": 8.235043397396157e-07, + "loss": 0.877, + "step": 212530 + }, + { + "epoch": 16.470223565422913, + "grad_norm": 1.601153297008097, + "learning_rate": 8.235430874147551e-07, + "loss": 0.8463, + "step": 212540 + }, + { + "epoch": 16.47099848889922, + "grad_norm": 1.5329607328080108, + "learning_rate": 8.235818350898947e-07, + "loss": 0.8423, + "step": 212550 + }, + { + "epoch": 16.471773412375526, + "grad_norm": 1.508303110383949, + "learning_rate": 8.236205827650341e-07, + "loss": 0.8581, + "step": 212560 + }, + { + "epoch": 16.472548335851833, + "grad_norm": 1.5781985750775853, + "learning_rate": 8.236593304401737e-07, + "loss": 0.8573, + "step": 212570 + }, + { + "epoch": 16.47332325932814, + "grad_norm": 1.5960482104014424, + "learning_rate": 8.236980781153131e-07, + "loss": 0.8571, + "step": 212580 + }, + { + "epoch": 16.474098182804447, + "grad_norm": 1.6885700117087632, + "learning_rate": 8.237368257904526e-07, + "loss": 0.8451, + "step": 212590 + }, + { + "epoch": 16.474873106280754, + "grad_norm": 1.5822420221030644, + "learning_rate": 8.237755734655921e-07, + "loss": 0.8651, + "step": 212600 + }, + { + "epoch": 16.47564802975706, + "grad_norm": 1.708115242920358, + "learning_rate": 8.238143211407317e-07, + "loss": 0.8654, + "step": 212610 + }, + { + "epoch": 16.476422953233367, + "grad_norm": 1.553085964893353, + "learning_rate": 8.238530688158711e-07, + "loss": 0.8636, + "step": 212620 + }, + { + "epoch": 16.477197876709674, + "grad_norm": 1.6712735400451324, + "learning_rate": 8.238918164910106e-07, + "loss": 0.8425, + "step": 212630 + }, + { + "epoch": 16.47797280018598, + "grad_norm": 1.7404290673638878, + "learning_rate": 8.2393056416615e-07, + "loss": 0.8534, + "step": 212640 + }, + { + "epoch": 16.478747723662288, + "grad_norm": 1.5966068194632073, + "learning_rate": 8.239693118412897e-07, + "loss": 0.8482, + "step": 212650 + }, + { + "epoch": 16.479522647138594, + "grad_norm": 1.5416639943313335, + "learning_rate": 8.240080595164291e-07, + "loss": 0.8573, + "step": 212660 + }, + { + "epoch": 16.4802975706149, + "grad_norm": 1.5669587030267405, + "learning_rate": 8.240468071915686e-07, + "loss": 0.871, + "step": 212670 + }, + { + "epoch": 16.481072494091208, + "grad_norm": 1.5799141571357092, + "learning_rate": 8.24085554866708e-07, + "loss": 0.8544, + "step": 212680 + }, + { + "epoch": 16.481847417567515, + "grad_norm": 1.7192308384651078, + "learning_rate": 8.241243025418475e-07, + "loss": 0.8797, + "step": 212690 + }, + { + "epoch": 16.48262234104382, + "grad_norm": 1.8124482620883444, + "learning_rate": 8.24163050216987e-07, + "loss": 0.8516, + "step": 212700 + }, + { + "epoch": 16.48339726452013, + "grad_norm": 1.6270413088894715, + "learning_rate": 8.242017978921266e-07, + "loss": 0.8634, + "step": 212710 + }, + { + "epoch": 16.484172187996435, + "grad_norm": 1.5404011337372254, + "learning_rate": 8.24240545567266e-07, + "loss": 0.8627, + "step": 212720 + }, + { + "epoch": 16.484947111472742, + "grad_norm": 1.6052270549974679, + "learning_rate": 8.242792932424055e-07, + "loss": 0.8569, + "step": 212730 + }, + { + "epoch": 16.48572203494905, + "grad_norm": 1.5757681444052403, + "learning_rate": 8.243180409175449e-07, + "loss": 0.8524, + "step": 212740 + }, + { + "epoch": 16.486496958425356, + "grad_norm": 1.521828817062114, + "learning_rate": 8.243567885926846e-07, + "loss": 0.8564, + "step": 212750 + }, + { + "epoch": 16.487271881901663, + "grad_norm": 1.708029562963261, + "learning_rate": 8.24395536267824e-07, + "loss": 0.8452, + "step": 212760 + }, + { + "epoch": 16.48804680537797, + "grad_norm": 1.4640499540844794, + "learning_rate": 8.244342839429635e-07, + "loss": 0.8665, + "step": 212770 + }, + { + "epoch": 16.488821728854276, + "grad_norm": 1.4902622004371546, + "learning_rate": 8.244730316181029e-07, + "loss": 0.8656, + "step": 212780 + }, + { + "epoch": 16.489596652330583, + "grad_norm": 1.6073544015948165, + "learning_rate": 8.245117792932424e-07, + "loss": 0.8456, + "step": 212790 + }, + { + "epoch": 16.49037157580689, + "grad_norm": 1.72772957466148, + "learning_rate": 8.24550526968382e-07, + "loss": 0.8416, + "step": 212800 + }, + { + "epoch": 16.491146499283197, + "grad_norm": 1.595446427701053, + "learning_rate": 8.245892746435215e-07, + "loss": 0.8587, + "step": 212810 + }, + { + "epoch": 16.491921422759503, + "grad_norm": 1.6046426837465189, + "learning_rate": 8.246280223186609e-07, + "loss": 0.8478, + "step": 212820 + }, + { + "epoch": 16.49269634623581, + "grad_norm": 1.4349347530606642, + "learning_rate": 8.246667699938004e-07, + "loss": 0.8533, + "step": 212830 + }, + { + "epoch": 16.493471269712117, + "grad_norm": 1.5674353486445587, + "learning_rate": 8.247055176689398e-07, + "loss": 0.8602, + "step": 212840 + }, + { + "epoch": 16.494246193188424, + "grad_norm": 1.577861530168568, + "learning_rate": 8.247442653440795e-07, + "loss": 0.8466, + "step": 212850 + }, + { + "epoch": 16.49502111666473, + "grad_norm": 1.5351566324991743, + "learning_rate": 8.247830130192189e-07, + "loss": 0.8485, + "step": 212860 + }, + { + "epoch": 16.495796040141038, + "grad_norm": 1.562736375389919, + "learning_rate": 8.248217606943584e-07, + "loss": 0.851, + "step": 212870 + }, + { + "epoch": 16.496570963617344, + "grad_norm": 1.5872404108740628, + "learning_rate": 8.248605083694978e-07, + "loss": 0.8611, + "step": 212880 + }, + { + "epoch": 16.49734588709365, + "grad_norm": 1.571043081737468, + "learning_rate": 8.248992560446375e-07, + "loss": 0.8434, + "step": 212890 + }, + { + "epoch": 16.498120810569958, + "grad_norm": 1.5514463295295744, + "learning_rate": 8.249380037197769e-07, + "loss": 0.8494, + "step": 212900 + }, + { + "epoch": 16.49889573404626, + "grad_norm": 1.5784762412486877, + "learning_rate": 8.249767513949164e-07, + "loss": 0.8681, + "step": 212910 + }, + { + "epoch": 16.499670657522568, + "grad_norm": 1.577073789134299, + "learning_rate": 8.250154990700558e-07, + "loss": 0.8743, + "step": 212920 + }, + { + "epoch": 16.500445580998875, + "grad_norm": 1.6207062765088902, + "learning_rate": 8.250542467451953e-07, + "loss": 0.8546, + "step": 212930 + }, + { + "epoch": 16.50122050447518, + "grad_norm": 1.603076658049729, + "learning_rate": 8.250929944203349e-07, + "loss": 0.8583, + "step": 212940 + }, + { + "epoch": 16.50199542795149, + "grad_norm": 1.562874113376688, + "learning_rate": 8.251317420954744e-07, + "loss": 0.8652, + "step": 212950 + }, + { + "epoch": 16.502770351427795, + "grad_norm": 1.6875342576086654, + "learning_rate": 8.251704897706138e-07, + "loss": 0.8481, + "step": 212960 + }, + { + "epoch": 16.503545274904102, + "grad_norm": 1.6219878779239547, + "learning_rate": 8.252092374457533e-07, + "loss": 0.8569, + "step": 212970 + }, + { + "epoch": 16.50432019838041, + "grad_norm": 1.646750866797221, + "learning_rate": 8.252479851208927e-07, + "loss": 0.8463, + "step": 212980 + }, + { + "epoch": 16.505095121856716, + "grad_norm": 1.600408328376305, + "learning_rate": 8.252867327960324e-07, + "loss": 0.8814, + "step": 212990 + }, + { + "epoch": 16.505870045333022, + "grad_norm": 1.5028196791912989, + "learning_rate": 8.253254804711718e-07, + "loss": 0.8588, + "step": 213000 + }, + { + "epoch": 16.505870045333022, + "eval_loss": 0.889689028263092, + "eval_runtime": 331.5163, + "eval_samples_per_second": 34.602, + "eval_steps_per_second": 8.651, + "step": 213000 + }, + { + "epoch": 16.50664496880933, + "grad_norm": 1.7454701552478011, + "learning_rate": 8.253642281463113e-07, + "loss": 0.8439, + "step": 213010 + }, + { + "epoch": 16.507419892285636, + "grad_norm": 1.4794225567600476, + "learning_rate": 8.254029758214507e-07, + "loss": 0.859, + "step": 213020 + }, + { + "epoch": 16.508194815761943, + "grad_norm": 1.513985492803242, + "learning_rate": 8.254417234965903e-07, + "loss": 0.8521, + "step": 213030 + }, + { + "epoch": 16.50896973923825, + "grad_norm": 1.6702711733348397, + "learning_rate": 8.254804711717298e-07, + "loss": 0.8522, + "step": 213040 + }, + { + "epoch": 16.509744662714557, + "grad_norm": 1.7303010342112433, + "learning_rate": 8.255192188468693e-07, + "loss": 0.8509, + "step": 213050 + }, + { + "epoch": 16.510519586190863, + "grad_norm": 1.7049578665147536, + "learning_rate": 8.255579665220087e-07, + "loss": 0.8673, + "step": 213060 + }, + { + "epoch": 16.51129450966717, + "grad_norm": 1.5495766200412635, + "learning_rate": 8.255967141971482e-07, + "loss": 0.8471, + "step": 213070 + }, + { + "epoch": 16.512069433143477, + "grad_norm": 1.6154588306823403, + "learning_rate": 8.256354618722877e-07, + "loss": 0.8616, + "step": 213080 + }, + { + "epoch": 16.512844356619784, + "grad_norm": 1.5487208691512298, + "learning_rate": 8.256742095474273e-07, + "loss": 0.8611, + "step": 213090 + }, + { + "epoch": 16.51361928009609, + "grad_norm": 1.5519367709456389, + "learning_rate": 8.257129572225667e-07, + "loss": 0.8529, + "step": 213100 + }, + { + "epoch": 16.514394203572397, + "grad_norm": 1.6648964988371953, + "learning_rate": 8.257517048977062e-07, + "loss": 0.8231, + "step": 213110 + }, + { + "epoch": 16.515169127048704, + "grad_norm": 1.6299800746723372, + "learning_rate": 8.257904525728456e-07, + "loss": 0.8533, + "step": 213120 + }, + { + "epoch": 16.51594405052501, + "grad_norm": 1.6957745856060948, + "learning_rate": 8.258292002479852e-07, + "loss": 0.8445, + "step": 213130 + }, + { + "epoch": 16.516718974001318, + "grad_norm": 1.5381278118209616, + "learning_rate": 8.258679479231247e-07, + "loss": 0.8611, + "step": 213140 + }, + { + "epoch": 16.517493897477625, + "grad_norm": 1.541695949969045, + "learning_rate": 8.259066955982642e-07, + "loss": 0.8604, + "step": 213150 + }, + { + "epoch": 16.51826882095393, + "grad_norm": 1.5803976207838784, + "learning_rate": 8.259454432734036e-07, + "loss": 0.876, + "step": 213160 + }, + { + "epoch": 16.51904374443024, + "grad_norm": 1.6702600351485657, + "learning_rate": 8.259841909485432e-07, + "loss": 0.867, + "step": 213170 + }, + { + "epoch": 16.519818667906545, + "grad_norm": 1.757989739896917, + "learning_rate": 8.260229386236826e-07, + "loss": 0.86, + "step": 213180 + }, + { + "epoch": 16.520593591382852, + "grad_norm": 1.6579352402811403, + "learning_rate": 8.260616862988222e-07, + "loss": 0.8558, + "step": 213190 + }, + { + "epoch": 16.52136851485916, + "grad_norm": 1.7526879700326357, + "learning_rate": 8.261004339739616e-07, + "loss": 0.8614, + "step": 213200 + }, + { + "epoch": 16.522143438335466, + "grad_norm": 1.637561739963567, + "learning_rate": 8.261391816491011e-07, + "loss": 0.8737, + "step": 213210 + }, + { + "epoch": 16.522918361811772, + "grad_norm": 1.715689950382935, + "learning_rate": 8.261779293242406e-07, + "loss": 0.8396, + "step": 213220 + }, + { + "epoch": 16.52369328528808, + "grad_norm": 1.6087300630858308, + "learning_rate": 8.262166769993801e-07, + "loss": 0.8689, + "step": 213230 + }, + { + "epoch": 16.524468208764386, + "grad_norm": 1.59253909653174, + "learning_rate": 8.262554246745196e-07, + "loss": 0.8549, + "step": 213240 + }, + { + "epoch": 16.525243132240693, + "grad_norm": 1.6049719451462519, + "learning_rate": 8.262941723496591e-07, + "loss": 0.8463, + "step": 213250 + }, + { + "epoch": 16.526018055717, + "grad_norm": 1.7276875467794397, + "learning_rate": 8.263329200247985e-07, + "loss": 0.852, + "step": 213260 + }, + { + "epoch": 16.526792979193306, + "grad_norm": 1.5943300759752892, + "learning_rate": 8.263716676999381e-07, + "loss": 0.8535, + "step": 213270 + }, + { + "epoch": 16.52756790266961, + "grad_norm": 1.4925058347870273, + "learning_rate": 8.264104153750775e-07, + "loss": 0.8647, + "step": 213280 + }, + { + "epoch": 16.528342826145916, + "grad_norm": 1.5150532760757838, + "learning_rate": 8.264491630502171e-07, + "loss": 0.8542, + "step": 213290 + }, + { + "epoch": 16.529117749622223, + "grad_norm": 1.6841218574616648, + "learning_rate": 8.264879107253565e-07, + "loss": 0.8507, + "step": 213300 + }, + { + "epoch": 16.52989267309853, + "grad_norm": 1.6307043153190155, + "learning_rate": 8.265266584004961e-07, + "loss": 0.8394, + "step": 213310 + }, + { + "epoch": 16.530667596574837, + "grad_norm": 1.6416642792211198, + "learning_rate": 8.265654060756355e-07, + "loss": 0.8578, + "step": 213320 + }, + { + "epoch": 16.531442520051144, + "grad_norm": 1.6838728942654295, + "learning_rate": 8.26604153750775e-07, + "loss": 0.8668, + "step": 213330 + }, + { + "epoch": 16.53221744352745, + "grad_norm": 1.6497802761397375, + "learning_rate": 8.266429014259145e-07, + "loss": 0.8584, + "step": 213340 + }, + { + "epoch": 16.532992367003757, + "grad_norm": 1.6611969402943412, + "learning_rate": 8.26681649101054e-07, + "loss": 0.8402, + "step": 213350 + }, + { + "epoch": 16.533767290480064, + "grad_norm": 1.6045173480682593, + "learning_rate": 8.267203967761934e-07, + "loss": 0.8483, + "step": 213360 + }, + { + "epoch": 16.53454221395637, + "grad_norm": 1.6306530892015711, + "learning_rate": 8.26759144451333e-07, + "loss": 0.8675, + "step": 213370 + }, + { + "epoch": 16.535317137432678, + "grad_norm": 1.666405061458607, + "learning_rate": 8.267978921264724e-07, + "loss": 0.8498, + "step": 213380 + }, + { + "epoch": 16.536092060908985, + "grad_norm": 1.45961703868013, + "learning_rate": 8.26836639801612e-07, + "loss": 0.854, + "step": 213390 + }, + { + "epoch": 16.53686698438529, + "grad_norm": 1.5921294556879533, + "learning_rate": 8.268753874767514e-07, + "loss": 0.8624, + "step": 213400 + }, + { + "epoch": 16.537641907861598, + "grad_norm": 1.5833968906685676, + "learning_rate": 8.26914135151891e-07, + "loss": 0.8653, + "step": 213410 + }, + { + "epoch": 16.538416831337905, + "grad_norm": 1.5662548878411868, + "learning_rate": 8.269528828270304e-07, + "loss": 0.8666, + "step": 213420 + }, + { + "epoch": 16.539191754814212, + "grad_norm": 1.639039159740873, + "learning_rate": 8.2699163050217e-07, + "loss": 0.8522, + "step": 213430 + }, + { + "epoch": 16.53996667829052, + "grad_norm": 1.6470608431274838, + "learning_rate": 8.270303781773094e-07, + "loss": 0.8468, + "step": 213440 + }, + { + "epoch": 16.540741601766825, + "grad_norm": 1.5063375257923863, + "learning_rate": 8.27069125852449e-07, + "loss": 0.8499, + "step": 213450 + }, + { + "epoch": 16.541516525243132, + "grad_norm": 1.6410583722943861, + "learning_rate": 8.271078735275884e-07, + "loss": 0.8699, + "step": 213460 + }, + { + "epoch": 16.54229144871944, + "grad_norm": 1.769230782859308, + "learning_rate": 8.271466212027279e-07, + "loss": 0.8539, + "step": 213470 + }, + { + "epoch": 16.543066372195746, + "grad_norm": 1.6595881031793491, + "learning_rate": 8.271853688778673e-07, + "loss": 0.8548, + "step": 213480 + }, + { + "epoch": 16.543841295672053, + "grad_norm": 1.5148519332514847, + "learning_rate": 8.272241165530069e-07, + "loss": 0.8362, + "step": 213490 + }, + { + "epoch": 16.54461621914836, + "grad_norm": 1.597467710083928, + "learning_rate": 8.272628642281463e-07, + "loss": 0.8408, + "step": 213500 + }, + { + "epoch": 16.54461621914836, + "eval_loss": 0.8897623419761658, + "eval_runtime": 331.874, + "eval_samples_per_second": 34.564, + "eval_steps_per_second": 8.642, + "step": 213500 + }, + { + "epoch": 16.545391142624666, + "grad_norm": 1.687242852297848, + "learning_rate": 8.273016119032859e-07, + "loss": 0.8702, + "step": 213510 + }, + { + "epoch": 16.546166066100973, + "grad_norm": 1.60558934642048, + "learning_rate": 8.273403595784253e-07, + "loss": 0.8566, + "step": 213520 + }, + { + "epoch": 16.54694098957728, + "grad_norm": 1.5095092268232333, + "learning_rate": 8.273791072535648e-07, + "loss": 0.8564, + "step": 213530 + }, + { + "epoch": 16.547715913053587, + "grad_norm": 1.5781047054885198, + "learning_rate": 8.274178549287043e-07, + "loss": 0.8395, + "step": 213540 + }, + { + "epoch": 16.548490836529893, + "grad_norm": 1.5114323133126621, + "learning_rate": 8.274566026038439e-07, + "loss": 0.8553, + "step": 213550 + }, + { + "epoch": 16.5492657600062, + "grad_norm": 1.6487026408253438, + "learning_rate": 8.274953502789833e-07, + "loss": 0.8381, + "step": 213560 + }, + { + "epoch": 16.550040683482507, + "grad_norm": 1.5238530209328467, + "learning_rate": 8.275340979541228e-07, + "loss": 0.8475, + "step": 213570 + }, + { + "epoch": 16.550815606958814, + "grad_norm": 1.6660395188946424, + "learning_rate": 8.275728456292622e-07, + "loss": 0.8451, + "step": 213580 + }, + { + "epoch": 16.55159053043512, + "grad_norm": 1.7312101059354108, + "learning_rate": 8.276115933044019e-07, + "loss": 0.8512, + "step": 213590 + }, + { + "epoch": 16.552365453911428, + "grad_norm": 1.5199507752048567, + "learning_rate": 8.276503409795413e-07, + "loss": 0.8506, + "step": 213600 + }, + { + "epoch": 16.553140377387734, + "grad_norm": 1.5137574720632085, + "learning_rate": 8.276890886546808e-07, + "loss": 0.8628, + "step": 213610 + }, + { + "epoch": 16.55391530086404, + "grad_norm": 1.5688929620101024, + "learning_rate": 8.277278363298202e-07, + "loss": 0.8489, + "step": 213620 + }, + { + "epoch": 16.554690224340348, + "grad_norm": 1.5050329974710632, + "learning_rate": 8.277665840049598e-07, + "loss": 0.875, + "step": 213630 + }, + { + "epoch": 16.555465147816655, + "grad_norm": 1.62193063196046, + "learning_rate": 8.278053316800992e-07, + "loss": 0.8505, + "step": 213640 + }, + { + "epoch": 16.55624007129296, + "grad_norm": 1.5664225787084223, + "learning_rate": 8.278440793552388e-07, + "loss": 0.8479, + "step": 213650 + }, + { + "epoch": 16.557014994769265, + "grad_norm": 1.6386888722576423, + "learning_rate": 8.278828270303782e-07, + "loss": 0.8472, + "step": 213660 + }, + { + "epoch": 16.55778991824557, + "grad_norm": 1.601545344238378, + "learning_rate": 8.279215747055177e-07, + "loss": 0.8475, + "step": 213670 + }, + { + "epoch": 16.55856484172188, + "grad_norm": 1.5424014354565343, + "learning_rate": 8.279603223806571e-07, + "loss": 0.8398, + "step": 213680 + }, + { + "epoch": 16.559339765198185, + "grad_norm": 1.5871210960132855, + "learning_rate": 8.279990700557968e-07, + "loss": 0.8481, + "step": 213690 + }, + { + "epoch": 16.560114688674492, + "grad_norm": 1.6104725301604765, + "learning_rate": 8.280378177309362e-07, + "loss": 0.8509, + "step": 213700 + }, + { + "epoch": 16.5608896121508, + "grad_norm": 1.5683798814729175, + "learning_rate": 8.280765654060757e-07, + "loss": 0.8502, + "step": 213710 + }, + { + "epoch": 16.561664535627106, + "grad_norm": 1.6246517324570242, + "learning_rate": 8.281153130812151e-07, + "loss": 0.8556, + "step": 213720 + }, + { + "epoch": 16.562439459103413, + "grad_norm": 1.5458606053012067, + "learning_rate": 8.281540607563548e-07, + "loss": 0.8695, + "step": 213730 + }, + { + "epoch": 16.56321438257972, + "grad_norm": 1.5329868066326335, + "learning_rate": 8.281928084314942e-07, + "loss": 0.876, + "step": 213740 + }, + { + "epoch": 16.563989306056026, + "grad_norm": 1.5959826110854194, + "learning_rate": 8.282315561066337e-07, + "loss": 0.8509, + "step": 213750 + }, + { + "epoch": 16.564764229532333, + "grad_norm": 1.5776797283693844, + "learning_rate": 8.282703037817731e-07, + "loss": 0.8571, + "step": 213760 + }, + { + "epoch": 16.56553915300864, + "grad_norm": 1.5358610960072228, + "learning_rate": 8.283090514569126e-07, + "loss": 0.8686, + "step": 213770 + }, + { + "epoch": 16.566314076484947, + "grad_norm": 1.6363194149440015, + "learning_rate": 8.28347799132052e-07, + "loss": 0.8548, + "step": 213780 + }, + { + "epoch": 16.567088999961253, + "grad_norm": 1.4783664371989467, + "learning_rate": 8.283865468071917e-07, + "loss": 0.8512, + "step": 213790 + }, + { + "epoch": 16.56786392343756, + "grad_norm": 1.6953526583771172, + "learning_rate": 8.284252944823311e-07, + "loss": 0.8532, + "step": 213800 + }, + { + "epoch": 16.568638846913867, + "grad_norm": 1.5909623143813096, + "learning_rate": 8.284640421574706e-07, + "loss": 0.8585, + "step": 213810 + }, + { + "epoch": 16.569413770390174, + "grad_norm": 1.6266384216387162, + "learning_rate": 8.2850278983261e-07, + "loss": 0.8687, + "step": 213820 + }, + { + "epoch": 16.57018869386648, + "grad_norm": 1.5588409858373318, + "learning_rate": 8.285415375077497e-07, + "loss": 0.8621, + "step": 213830 + }, + { + "epoch": 16.570963617342787, + "grad_norm": 1.6313992933512678, + "learning_rate": 8.285802851828891e-07, + "loss": 0.8533, + "step": 213840 + }, + { + "epoch": 16.571738540819094, + "grad_norm": 1.6862406139580945, + "learning_rate": 8.286190328580286e-07, + "loss": 0.8485, + "step": 213850 + }, + { + "epoch": 16.5725134642954, + "grad_norm": 1.7743052380927065, + "learning_rate": 8.28657780533168e-07, + "loss": 0.8497, + "step": 213860 + }, + { + "epoch": 16.573288387771708, + "grad_norm": 1.5378912581343935, + "learning_rate": 8.286965282083076e-07, + "loss": 0.8448, + "step": 213870 + }, + { + "epoch": 16.574063311248015, + "grad_norm": 1.5496681664954381, + "learning_rate": 8.287352758834471e-07, + "loss": 0.8806, + "step": 213880 + }, + { + "epoch": 16.57483823472432, + "grad_norm": 1.5001690080144872, + "learning_rate": 8.287740235585866e-07, + "loss": 0.8382, + "step": 213890 + }, + { + "epoch": 16.57561315820063, + "grad_norm": 1.541682910839811, + "learning_rate": 8.28812771233726e-07, + "loss": 0.8501, + "step": 213900 + }, + { + "epoch": 16.576388081676935, + "grad_norm": 1.7537461126745537, + "learning_rate": 8.288515189088655e-07, + "loss": 0.8748, + "step": 213910 + }, + { + "epoch": 16.577163005153242, + "grad_norm": 1.5210087847896843, + "learning_rate": 8.288902665840049e-07, + "loss": 0.8486, + "step": 213920 + }, + { + "epoch": 16.57793792862955, + "grad_norm": 1.5432893890805526, + "learning_rate": 8.289290142591446e-07, + "loss": 0.8224, + "step": 213930 + }, + { + "epoch": 16.578712852105856, + "grad_norm": 1.643428076083185, + "learning_rate": 8.28967761934284e-07, + "loss": 0.8522, + "step": 213940 + }, + { + "epoch": 16.579487775582162, + "grad_norm": 1.7020507381829886, + "learning_rate": 8.290065096094235e-07, + "loss": 0.8437, + "step": 213950 + }, + { + "epoch": 16.58026269905847, + "grad_norm": 1.5733911739214401, + "learning_rate": 8.290452572845629e-07, + "loss": 0.8569, + "step": 213960 + }, + { + "epoch": 16.581037622534776, + "grad_norm": 1.5698532843368151, + "learning_rate": 8.290840049597025e-07, + "loss": 0.8719, + "step": 213970 + }, + { + "epoch": 16.581812546011083, + "grad_norm": 1.5422497248109113, + "learning_rate": 8.29122752634842e-07, + "loss": 0.8554, + "step": 213980 + }, + { + "epoch": 16.58258746948739, + "grad_norm": 1.6930751227913754, + "learning_rate": 8.291615003099815e-07, + "loss": 0.8509, + "step": 213990 + }, + { + "epoch": 16.583362392963696, + "grad_norm": 1.6500070800092015, + "learning_rate": 8.292002479851209e-07, + "loss": 0.8579, + "step": 214000 + }, + { + "epoch": 16.583362392963696, + "eval_loss": 0.8894564509391785, + "eval_runtime": 330.8296, + "eval_samples_per_second": 34.673, + "eval_steps_per_second": 8.669, + "step": 214000 + }, + { + "epoch": 16.584137316440003, + "grad_norm": 1.5150552805352502, + "learning_rate": 8.292389956602605e-07, + "loss": 0.8351, + "step": 214010 + }, + { + "epoch": 16.584912239916306, + "grad_norm": 1.5497071770602187, + "learning_rate": 8.292777433353999e-07, + "loss": 0.8478, + "step": 214020 + }, + { + "epoch": 16.585687163392613, + "grad_norm": 1.64354597998867, + "learning_rate": 8.293164910105395e-07, + "loss": 0.8584, + "step": 214030 + }, + { + "epoch": 16.58646208686892, + "grad_norm": 1.6397724138156633, + "learning_rate": 8.293552386856789e-07, + "loss": 0.83, + "step": 214040 + }, + { + "epoch": 16.587237010345227, + "grad_norm": 1.527717999473457, + "learning_rate": 8.293939863608184e-07, + "loss": 0.8473, + "step": 214050 + }, + { + "epoch": 16.588011933821534, + "grad_norm": 1.570559208525436, + "learning_rate": 8.294327340359578e-07, + "loss": 0.8612, + "step": 214060 + }, + { + "epoch": 16.58878685729784, + "grad_norm": 1.7223994864226198, + "learning_rate": 8.294714817110974e-07, + "loss": 0.8454, + "step": 214070 + }, + { + "epoch": 16.589561780774147, + "grad_norm": 1.7381258411500613, + "learning_rate": 8.295102293862369e-07, + "loss": 0.8417, + "step": 214080 + }, + { + "epoch": 16.590336704250454, + "grad_norm": 1.5719380775180334, + "learning_rate": 8.295489770613764e-07, + "loss": 0.8598, + "step": 214090 + }, + { + "epoch": 16.59111162772676, + "grad_norm": 1.5926922827303258, + "learning_rate": 8.295877247365158e-07, + "loss": 0.8512, + "step": 214100 + }, + { + "epoch": 16.591886551203068, + "grad_norm": 1.5343211164271975, + "learning_rate": 8.296264724116554e-07, + "loss": 0.8672, + "step": 214110 + }, + { + "epoch": 16.592661474679375, + "grad_norm": 1.6101408720276125, + "learning_rate": 8.296652200867948e-07, + "loss": 0.8537, + "step": 214120 + }, + { + "epoch": 16.59343639815568, + "grad_norm": 1.4693874368338737, + "learning_rate": 8.297039677619344e-07, + "loss": 0.8563, + "step": 214130 + }, + { + "epoch": 16.594211321631988, + "grad_norm": 1.5565616326582563, + "learning_rate": 8.297427154370738e-07, + "loss": 0.8412, + "step": 214140 + }, + { + "epoch": 16.594986245108295, + "grad_norm": 1.6011534154945508, + "learning_rate": 8.297814631122133e-07, + "loss": 0.8489, + "step": 214150 + }, + { + "epoch": 16.595761168584602, + "grad_norm": 1.6380577811721293, + "learning_rate": 8.298202107873528e-07, + "loss": 0.8529, + "step": 214160 + }, + { + "epoch": 16.59653609206091, + "grad_norm": 1.5888104891939896, + "learning_rate": 8.298589584624924e-07, + "loss": 0.8463, + "step": 214170 + }, + { + "epoch": 16.597311015537215, + "grad_norm": 1.5742350643935403, + "learning_rate": 8.298977061376318e-07, + "loss": 0.8512, + "step": 214180 + }, + { + "epoch": 16.598085939013522, + "grad_norm": 1.4980480648654793, + "learning_rate": 8.299364538127713e-07, + "loss": 0.8423, + "step": 214190 + }, + { + "epoch": 16.59886086248983, + "grad_norm": 1.5776405974197758, + "learning_rate": 8.299752014879107e-07, + "loss": 0.8438, + "step": 214200 + }, + { + "epoch": 16.599635785966136, + "grad_norm": 1.6213957711892617, + "learning_rate": 8.300139491630503e-07, + "loss": 0.8629, + "step": 214210 + }, + { + "epoch": 16.600410709442443, + "grad_norm": 1.644858853133176, + "learning_rate": 8.300526968381897e-07, + "loss": 0.8623, + "step": 214220 + }, + { + "epoch": 16.60118563291875, + "grad_norm": 1.6925622450527336, + "learning_rate": 8.300914445133293e-07, + "loss": 0.839, + "step": 214230 + }, + { + "epoch": 16.601960556395056, + "grad_norm": 1.5781289733273514, + "learning_rate": 8.301301921884687e-07, + "loss": 0.8541, + "step": 214240 + }, + { + "epoch": 16.602735479871363, + "grad_norm": 1.6796501263075525, + "learning_rate": 8.301689398636083e-07, + "loss": 0.8476, + "step": 214250 + }, + { + "epoch": 16.60351040334767, + "grad_norm": 1.7925247304308294, + "learning_rate": 8.302076875387477e-07, + "loss": 0.864, + "step": 214260 + }, + { + "epoch": 16.604285326823977, + "grad_norm": 1.5370419126336383, + "learning_rate": 8.302464352138873e-07, + "loss": 0.8446, + "step": 214270 + }, + { + "epoch": 16.605060250300284, + "grad_norm": 1.5525072186297169, + "learning_rate": 8.302851828890267e-07, + "loss": 0.8467, + "step": 214280 + }, + { + "epoch": 16.60583517377659, + "grad_norm": 1.4828549195267386, + "learning_rate": 8.303239305641662e-07, + "loss": 0.8456, + "step": 214290 + }, + { + "epoch": 16.606610097252897, + "grad_norm": 1.6244307611247144, + "learning_rate": 8.303626782393057e-07, + "loss": 0.864, + "step": 214300 + }, + { + "epoch": 16.607385020729204, + "grad_norm": 1.5690079213276042, + "learning_rate": 8.304014259144452e-07, + "loss": 0.8513, + "step": 214310 + }, + { + "epoch": 16.60815994420551, + "grad_norm": 1.5521320832819896, + "learning_rate": 8.304401735895847e-07, + "loss": 0.8646, + "step": 214320 + }, + { + "epoch": 16.608934867681818, + "grad_norm": 1.6039786497003004, + "learning_rate": 8.304789212647242e-07, + "loss": 0.8467, + "step": 214330 + }, + { + "epoch": 16.609709791158124, + "grad_norm": 1.5841245816625436, + "learning_rate": 8.305176689398636e-07, + "loss": 0.8461, + "step": 214340 + }, + { + "epoch": 16.61048471463443, + "grad_norm": 1.5983989975175106, + "learning_rate": 8.305564166150032e-07, + "loss": 0.8767, + "step": 214350 + }, + { + "epoch": 16.611259638110738, + "grad_norm": 1.6005815506032726, + "learning_rate": 8.305951642901426e-07, + "loss": 0.8419, + "step": 214360 + }, + { + "epoch": 16.612034561587045, + "grad_norm": 1.5742680453962177, + "learning_rate": 8.306339119652822e-07, + "loss": 0.8519, + "step": 214370 + }, + { + "epoch": 16.61280948506335, + "grad_norm": 1.4977968566792792, + "learning_rate": 8.306726596404216e-07, + "loss": 0.8511, + "step": 214380 + }, + { + "epoch": 16.61358440853966, + "grad_norm": 1.546412124048209, + "learning_rate": 8.307114073155612e-07, + "loss": 0.8586, + "step": 214390 + }, + { + "epoch": 16.61435933201596, + "grad_norm": 1.6358509080424115, + "learning_rate": 8.307501549907006e-07, + "loss": 0.8619, + "step": 214400 + }, + { + "epoch": 16.61513425549227, + "grad_norm": 1.5365419930434192, + "learning_rate": 8.307889026658401e-07, + "loss": 0.8624, + "step": 214410 + }, + { + "epoch": 16.615909178968575, + "grad_norm": 1.5490547043353544, + "learning_rate": 8.308276503409796e-07, + "loss": 0.841, + "step": 214420 + }, + { + "epoch": 16.616684102444882, + "grad_norm": 1.6360267758840974, + "learning_rate": 8.308663980161191e-07, + "loss": 0.8586, + "step": 214430 + }, + { + "epoch": 16.61745902592119, + "grad_norm": 1.5192057172835214, + "learning_rate": 8.309051456912586e-07, + "loss": 0.8595, + "step": 214440 + }, + { + "epoch": 16.618233949397496, + "grad_norm": 1.5339631231599096, + "learning_rate": 8.309438933663981e-07, + "loss": 0.8516, + "step": 214450 + }, + { + "epoch": 16.619008872873803, + "grad_norm": 1.5926906345831344, + "learning_rate": 8.309826410415375e-07, + "loss": 0.8491, + "step": 214460 + }, + { + "epoch": 16.61978379635011, + "grad_norm": 1.697687854404147, + "learning_rate": 8.310213887166771e-07, + "loss": 0.8448, + "step": 214470 + }, + { + "epoch": 16.620558719826416, + "grad_norm": 1.667679553390163, + "learning_rate": 8.310601363918165e-07, + "loss": 0.8476, + "step": 214480 + }, + { + "epoch": 16.621333643302723, + "grad_norm": 1.5355207142789795, + "learning_rate": 8.310988840669561e-07, + "loss": 0.8561, + "step": 214490 + }, + { + "epoch": 16.62210856677903, + "grad_norm": 1.664703156213317, + "learning_rate": 8.311376317420955e-07, + "loss": 0.8437, + "step": 214500 + }, + { + "epoch": 16.62210856677903, + "eval_loss": 0.8893710970878601, + "eval_runtime": 329.9018, + "eval_samples_per_second": 34.771, + "eval_steps_per_second": 8.693, + "step": 214500 + }, + { + "epoch": 16.622883490255337, + "grad_norm": 1.5997427664155555, + "learning_rate": 8.31176379417235e-07, + "loss": 0.8508, + "step": 214510 + }, + { + "epoch": 16.623658413731643, + "grad_norm": 1.5428991683459123, + "learning_rate": 8.312151270923745e-07, + "loss": 0.8638, + "step": 214520 + }, + { + "epoch": 16.62443333720795, + "grad_norm": 1.562223783204709, + "learning_rate": 8.312538747675141e-07, + "loss": 0.8325, + "step": 214530 + }, + { + "epoch": 16.625208260684257, + "grad_norm": 1.6520151347659633, + "learning_rate": 8.312926224426535e-07, + "loss": 0.849, + "step": 214540 + }, + { + "epoch": 16.625983184160564, + "grad_norm": 1.583364452265871, + "learning_rate": 8.31331370117793e-07, + "loss": 0.8438, + "step": 214550 + }, + { + "epoch": 16.62675810763687, + "grad_norm": 1.5376673618664716, + "learning_rate": 8.313701177929324e-07, + "loss": 0.8662, + "step": 214560 + }, + { + "epoch": 16.627533031113177, + "grad_norm": 1.590684873630741, + "learning_rate": 8.31408865468072e-07, + "loss": 0.8571, + "step": 214570 + }, + { + "epoch": 16.628307954589484, + "grad_norm": 1.544281174618821, + "learning_rate": 8.314476131432115e-07, + "loss": 0.8679, + "step": 214580 + }, + { + "epoch": 16.62908287806579, + "grad_norm": 1.5536405027196016, + "learning_rate": 8.31486360818351e-07, + "loss": 0.8495, + "step": 214590 + }, + { + "epoch": 16.629857801542098, + "grad_norm": 1.5066726417480314, + "learning_rate": 8.315251084934904e-07, + "loss": 0.8715, + "step": 214600 + }, + { + "epoch": 16.630632725018405, + "grad_norm": 1.6218958004184716, + "learning_rate": 8.315638561686299e-07, + "loss": 0.8586, + "step": 214610 + }, + { + "epoch": 16.63140764849471, + "grad_norm": 1.5858716251469813, + "learning_rate": 8.316026038437694e-07, + "loss": 0.84, + "step": 214620 + }, + { + "epoch": 16.63218257197102, + "grad_norm": 1.5965662084425827, + "learning_rate": 8.31641351518909e-07, + "loss": 0.859, + "step": 214630 + }, + { + "epoch": 16.632957495447325, + "grad_norm": 1.5272417096687583, + "learning_rate": 8.316800991940484e-07, + "loss": 0.8579, + "step": 214640 + }, + { + "epoch": 16.633732418923632, + "grad_norm": 1.4614812138077746, + "learning_rate": 8.317188468691879e-07, + "loss": 0.8385, + "step": 214650 + }, + { + "epoch": 16.63450734239994, + "grad_norm": 1.5169884911708686, + "learning_rate": 8.317575945443273e-07, + "loss": 0.86, + "step": 214660 + }, + { + "epoch": 16.635282265876246, + "grad_norm": 1.5681057127945006, + "learning_rate": 8.31796342219467e-07, + "loss": 0.8383, + "step": 214670 + }, + { + "epoch": 16.636057189352552, + "grad_norm": 1.5613098542338901, + "learning_rate": 8.318350898946064e-07, + "loss": 0.8539, + "step": 214680 + }, + { + "epoch": 16.63683211282886, + "grad_norm": 1.6582805685758912, + "learning_rate": 8.318738375697459e-07, + "loss": 0.8413, + "step": 214690 + }, + { + "epoch": 16.637607036305166, + "grad_norm": 1.611690965658145, + "learning_rate": 8.319125852448853e-07, + "loss": 0.8458, + "step": 214700 + }, + { + "epoch": 16.638381959781473, + "grad_norm": 1.4752857445813643, + "learning_rate": 8.319513329200248e-07, + "loss": 0.8524, + "step": 214710 + }, + { + "epoch": 16.63915688325778, + "grad_norm": 1.6413291746628649, + "learning_rate": 8.319900805951644e-07, + "loss": 0.8559, + "step": 214720 + }, + { + "epoch": 16.639931806734086, + "grad_norm": 1.5644047358757647, + "learning_rate": 8.320288282703039e-07, + "loss": 0.8571, + "step": 214730 + }, + { + "epoch": 16.640706730210393, + "grad_norm": 1.605682496743367, + "learning_rate": 8.320675759454433e-07, + "loss": 0.861, + "step": 214740 + }, + { + "epoch": 16.6414816536867, + "grad_norm": 1.6795589679436966, + "learning_rate": 8.321063236205828e-07, + "loss": 0.8413, + "step": 214750 + }, + { + "epoch": 16.642256577163003, + "grad_norm": 1.5016239185660032, + "learning_rate": 8.321450712957222e-07, + "loss": 0.8423, + "step": 214760 + }, + { + "epoch": 16.64303150063931, + "grad_norm": 1.591398462999533, + "learning_rate": 8.321838189708619e-07, + "loss": 0.8768, + "step": 214770 + }, + { + "epoch": 16.643806424115617, + "grad_norm": 1.6453407353882001, + "learning_rate": 8.322225666460013e-07, + "loss": 0.834, + "step": 214780 + }, + { + "epoch": 16.644581347591924, + "grad_norm": 1.5777306983708534, + "learning_rate": 8.322613143211408e-07, + "loss": 0.8508, + "step": 214790 + }, + { + "epoch": 16.64535627106823, + "grad_norm": 1.5509399781866497, + "learning_rate": 8.323000619962802e-07, + "loss": 0.8723, + "step": 214800 + }, + { + "epoch": 16.646131194544537, + "grad_norm": 1.5512576467903076, + "learning_rate": 8.323388096714199e-07, + "loss": 0.8649, + "step": 214810 + }, + { + "epoch": 16.646906118020844, + "grad_norm": 1.6889741641377038, + "learning_rate": 8.323775573465593e-07, + "loss": 0.8585, + "step": 214820 + }, + { + "epoch": 16.64768104149715, + "grad_norm": 1.5580592496892198, + "learning_rate": 8.324163050216988e-07, + "loss": 0.8534, + "step": 214830 + }, + { + "epoch": 16.648455964973458, + "grad_norm": 1.4862723319509412, + "learning_rate": 8.324550526968382e-07, + "loss": 0.8523, + "step": 214840 + }, + { + "epoch": 16.649230888449765, + "grad_norm": 1.6444485396126036, + "learning_rate": 8.324938003719777e-07, + "loss": 0.8574, + "step": 214850 + }, + { + "epoch": 16.65000581192607, + "grad_norm": 1.607606279743942, + "learning_rate": 8.325325480471171e-07, + "loss": 0.8674, + "step": 214860 + }, + { + "epoch": 16.65078073540238, + "grad_norm": 1.5956210100983304, + "learning_rate": 8.325712957222568e-07, + "loss": 0.8647, + "step": 214870 + }, + { + "epoch": 16.651555658878685, + "grad_norm": 1.6121732057996563, + "learning_rate": 8.326100433973962e-07, + "loss": 0.8584, + "step": 214880 + }, + { + "epoch": 16.652330582354992, + "grad_norm": 1.6253158832230796, + "learning_rate": 8.326487910725357e-07, + "loss": 0.8575, + "step": 214890 + }, + { + "epoch": 16.6531055058313, + "grad_norm": 1.5935722209277883, + "learning_rate": 8.326875387476751e-07, + "loss": 0.8584, + "step": 214900 + }, + { + "epoch": 16.653880429307605, + "grad_norm": 1.6669203657194498, + "learning_rate": 8.327262864228148e-07, + "loss": 0.8466, + "step": 214910 + }, + { + "epoch": 16.654655352783912, + "grad_norm": 1.5955927927625746, + "learning_rate": 8.327650340979542e-07, + "loss": 0.8476, + "step": 214920 + }, + { + "epoch": 16.65543027626022, + "grad_norm": 1.724993750796063, + "learning_rate": 8.328037817730937e-07, + "loss": 0.8489, + "step": 214930 + }, + { + "epoch": 16.656205199736526, + "grad_norm": 1.5614434643452288, + "learning_rate": 8.328425294482331e-07, + "loss": 0.8301, + "step": 214940 + }, + { + "epoch": 16.656980123212833, + "grad_norm": 1.6239427694222484, + "learning_rate": 8.328812771233727e-07, + "loss": 0.8644, + "step": 214950 + }, + { + "epoch": 16.65775504668914, + "grad_norm": 1.545223713265733, + "learning_rate": 8.329200247985122e-07, + "loss": 0.8503, + "step": 214960 + }, + { + "epoch": 16.658529970165446, + "grad_norm": 1.528818686661723, + "learning_rate": 8.329587724736517e-07, + "loss": 0.8431, + "step": 214970 + }, + { + "epoch": 16.659304893641753, + "grad_norm": 2.1232949547053046, + "learning_rate": 8.329975201487911e-07, + "loss": 0.873, + "step": 214980 + }, + { + "epoch": 16.66007981711806, + "grad_norm": 1.5919906205781063, + "learning_rate": 8.330362678239306e-07, + "loss": 0.8635, + "step": 214990 + }, + { + "epoch": 16.660854740594367, + "grad_norm": 1.7222290039051331, + "learning_rate": 8.3307501549907e-07, + "loss": 0.8693, + "step": 215000 + }, + { + "epoch": 16.660854740594367, + "eval_loss": 0.8894798755645752, + "eval_runtime": 332.1597, + "eval_samples_per_second": 34.535, + "eval_steps_per_second": 8.634, + "step": 215000 + }, + { + "epoch": 16.661629664070674, + "grad_norm": 1.6383287943561795, + "learning_rate": 8.331137631742097e-07, + "loss": 0.8544, + "step": 215010 + }, + { + "epoch": 16.66240458754698, + "grad_norm": 1.76892674823717, + "learning_rate": 8.331525108493491e-07, + "loss": 0.8759, + "step": 215020 + }, + { + "epoch": 16.663179511023287, + "grad_norm": 1.7185524040730882, + "learning_rate": 8.331912585244886e-07, + "loss": 0.853, + "step": 215030 + }, + { + "epoch": 16.663954434499594, + "grad_norm": 1.5990683963682166, + "learning_rate": 8.33230006199628e-07, + "loss": 0.858, + "step": 215040 + }, + { + "epoch": 16.6647293579759, + "grad_norm": 1.540066117095582, + "learning_rate": 8.332687538747676e-07, + "loss": 0.8462, + "step": 215050 + }, + { + "epoch": 16.665504281452208, + "grad_norm": 1.547125517305137, + "learning_rate": 8.333075015499071e-07, + "loss": 0.8414, + "step": 215060 + }, + { + "epoch": 16.666279204928514, + "grad_norm": 1.720188732788385, + "learning_rate": 8.333462492250466e-07, + "loss": 0.8606, + "step": 215070 + }, + { + "epoch": 16.66705412840482, + "grad_norm": 1.620655646020511, + "learning_rate": 8.33384996900186e-07, + "loss": 0.8564, + "step": 215080 + }, + { + "epoch": 16.667829051881128, + "grad_norm": 1.66190329005613, + "learning_rate": 8.334237445753256e-07, + "loss": 0.852, + "step": 215090 + }, + { + "epoch": 16.668603975357435, + "grad_norm": 1.7271249484322935, + "learning_rate": 8.33462492250465e-07, + "loss": 0.863, + "step": 215100 + }, + { + "epoch": 16.66937889883374, + "grad_norm": 1.6578780869351635, + "learning_rate": 8.335012399256046e-07, + "loss": 0.8454, + "step": 215110 + }, + { + "epoch": 16.67015382231005, + "grad_norm": 1.5703761896554087, + "learning_rate": 8.33539987600744e-07, + "loss": 0.8578, + "step": 215120 + }, + { + "epoch": 16.670928745786355, + "grad_norm": 1.7300057411701306, + "learning_rate": 8.335787352758835e-07, + "loss": 0.85, + "step": 215130 + }, + { + "epoch": 16.671703669262662, + "grad_norm": 1.829241436674151, + "learning_rate": 8.336174829510229e-07, + "loss": 0.8718, + "step": 215140 + }, + { + "epoch": 16.672478592738965, + "grad_norm": 1.5193153688668002, + "learning_rate": 8.336562306261625e-07, + "loss": 0.8558, + "step": 215150 + }, + { + "epoch": 16.673253516215272, + "grad_norm": 1.5245245660634654, + "learning_rate": 8.33694978301302e-07, + "loss": 0.8609, + "step": 215160 + }, + { + "epoch": 16.67402843969158, + "grad_norm": 1.5498836952097603, + "learning_rate": 8.337337259764415e-07, + "loss": 0.8425, + "step": 215170 + }, + { + "epoch": 16.674803363167886, + "grad_norm": 1.6100591114492033, + "learning_rate": 8.337724736515809e-07, + "loss": 0.8687, + "step": 215180 + }, + { + "epoch": 16.675578286644193, + "grad_norm": 1.5941281794703612, + "learning_rate": 8.338112213267205e-07, + "loss": 0.8781, + "step": 215190 + }, + { + "epoch": 16.6763532101205, + "grad_norm": 1.55129497749613, + "learning_rate": 8.338499690018599e-07, + "loss": 0.8673, + "step": 215200 + }, + { + "epoch": 16.677128133596806, + "grad_norm": 1.5422464262700188, + "learning_rate": 8.338887166769995e-07, + "loss": 0.8461, + "step": 215210 + }, + { + "epoch": 16.677903057073113, + "grad_norm": 1.6314113168163062, + "learning_rate": 8.339274643521389e-07, + "loss": 0.8624, + "step": 215220 + }, + { + "epoch": 16.67867798054942, + "grad_norm": 1.640375524627635, + "learning_rate": 8.339662120272785e-07, + "loss": 0.8752, + "step": 215230 + }, + { + "epoch": 16.679452904025727, + "grad_norm": 1.678004357583086, + "learning_rate": 8.340049597024179e-07, + "loss": 0.8562, + "step": 215240 + }, + { + "epoch": 16.680227827502033, + "grad_norm": 1.5913011012000953, + "learning_rate": 8.340437073775574e-07, + "loss": 0.8629, + "step": 215250 + }, + { + "epoch": 16.68100275097834, + "grad_norm": 1.6374659141105863, + "learning_rate": 8.340824550526969e-07, + "loss": 0.8557, + "step": 215260 + }, + { + "epoch": 16.681777674454647, + "grad_norm": 1.584690242234495, + "learning_rate": 8.341212027278364e-07, + "loss": 0.8594, + "step": 215270 + }, + { + "epoch": 16.682552597930954, + "grad_norm": 1.4993801855745943, + "learning_rate": 8.341599504029758e-07, + "loss": 0.8516, + "step": 215280 + }, + { + "epoch": 16.68332752140726, + "grad_norm": 1.6101264842005356, + "learning_rate": 8.341986980781154e-07, + "loss": 0.8701, + "step": 215290 + }, + { + "epoch": 16.684102444883568, + "grad_norm": 1.6329396723294207, + "learning_rate": 8.342374457532548e-07, + "loss": 0.858, + "step": 215300 + }, + { + "epoch": 16.684877368359874, + "grad_norm": 1.7050475672723222, + "learning_rate": 8.342761934283944e-07, + "loss": 0.8486, + "step": 215310 + }, + { + "epoch": 16.68565229183618, + "grad_norm": 1.5322352413842721, + "learning_rate": 8.343149411035338e-07, + "loss": 0.8339, + "step": 215320 + }, + { + "epoch": 16.686427215312488, + "grad_norm": 1.5564404572272184, + "learning_rate": 8.343536887786734e-07, + "loss": 0.865, + "step": 215330 + }, + { + "epoch": 16.687202138788795, + "grad_norm": 1.5599816786312684, + "learning_rate": 8.343924364538128e-07, + "loss": 0.8521, + "step": 215340 + }, + { + "epoch": 16.6879770622651, + "grad_norm": 1.6375677841396254, + "learning_rate": 8.344311841289523e-07, + "loss": 0.8416, + "step": 215350 + }, + { + "epoch": 16.68875198574141, + "grad_norm": 1.6465909142436874, + "learning_rate": 8.344699318040918e-07, + "loss": 0.8598, + "step": 215360 + }, + { + "epoch": 16.689526909217715, + "grad_norm": 1.5828089457132826, + "learning_rate": 8.345086794792314e-07, + "loss": 0.8419, + "step": 215370 + }, + { + "epoch": 16.690301832694022, + "grad_norm": 1.5548014541552437, + "learning_rate": 8.345474271543708e-07, + "loss": 0.8689, + "step": 215380 + }, + { + "epoch": 16.69107675617033, + "grad_norm": 1.863139863669479, + "learning_rate": 8.345861748295103e-07, + "loss": 0.851, + "step": 215390 + }, + { + "epoch": 16.691851679646636, + "grad_norm": 1.6981941653392318, + "learning_rate": 8.346249225046497e-07, + "loss": 0.8624, + "step": 215400 + }, + { + "epoch": 16.692626603122942, + "grad_norm": 1.65593710253385, + "learning_rate": 8.346636701797893e-07, + "loss": 0.8534, + "step": 215410 + }, + { + "epoch": 16.69340152659925, + "grad_norm": 1.4963004999276814, + "learning_rate": 8.347024178549287e-07, + "loss": 0.8679, + "step": 215420 + }, + { + "epoch": 16.694176450075556, + "grad_norm": 1.7589792355402383, + "learning_rate": 8.347411655300683e-07, + "loss": 0.848, + "step": 215430 + }, + { + "epoch": 16.694951373551863, + "grad_norm": 1.6934317830648606, + "learning_rate": 8.347799132052077e-07, + "loss": 0.8585, + "step": 215440 + }, + { + "epoch": 16.69572629702817, + "grad_norm": 1.4909552375212298, + "learning_rate": 8.348186608803472e-07, + "loss": 0.8549, + "step": 215450 + }, + { + "epoch": 16.696501220504476, + "grad_norm": 1.590735141502718, + "learning_rate": 8.348574085554867e-07, + "loss": 0.842, + "step": 215460 + }, + { + "epoch": 16.697276143980783, + "grad_norm": 1.545624428467034, + "learning_rate": 8.348961562306263e-07, + "loss": 0.8494, + "step": 215470 + }, + { + "epoch": 16.69805106745709, + "grad_norm": 1.5449708776517306, + "learning_rate": 8.349349039057657e-07, + "loss": 0.854, + "step": 215480 + }, + { + "epoch": 16.698825990933397, + "grad_norm": 1.7023695102629233, + "learning_rate": 8.349736515809052e-07, + "loss": 0.8553, + "step": 215490 + }, + { + "epoch": 16.699600914409704, + "grad_norm": 1.7208192268938427, + "learning_rate": 8.350123992560446e-07, + "loss": 0.8763, + "step": 215500 + }, + { + "epoch": 16.699600914409704, + "eval_loss": 0.8889040350914001, + "eval_runtime": 331.6192, + "eval_samples_per_second": 34.591, + "eval_steps_per_second": 8.648, + "step": 215500 + }, + { + "epoch": 16.700375837886007, + "grad_norm": 1.656717275524332, + "learning_rate": 8.350511469311843e-07, + "loss": 0.8602, + "step": 215510 + }, + { + "epoch": 16.701150761362314, + "grad_norm": 1.7026921484245274, + "learning_rate": 8.350898946063237e-07, + "loss": 0.8777, + "step": 215520 + }, + { + "epoch": 16.70192568483862, + "grad_norm": 1.552658999705311, + "learning_rate": 8.351286422814632e-07, + "loss": 0.8451, + "step": 215530 + }, + { + "epoch": 16.702700608314927, + "grad_norm": 1.5736798514584731, + "learning_rate": 8.351673899566026e-07, + "loss": 0.8468, + "step": 215540 + }, + { + "epoch": 16.703475531791234, + "grad_norm": 1.5766816264451249, + "learning_rate": 8.352061376317421e-07, + "loss": 0.8673, + "step": 215550 + }, + { + "epoch": 16.70425045526754, + "grad_norm": 1.5732982355999745, + "learning_rate": 8.352448853068816e-07, + "loss": 0.8609, + "step": 215560 + }, + { + "epoch": 16.705025378743848, + "grad_norm": 1.4697010315404215, + "learning_rate": 8.352836329820212e-07, + "loss": 0.8593, + "step": 215570 + }, + { + "epoch": 16.705800302220155, + "grad_norm": 1.5781574194417092, + "learning_rate": 8.353223806571606e-07, + "loss": 0.8598, + "step": 215580 + }, + { + "epoch": 16.70657522569646, + "grad_norm": 1.6302658052727548, + "learning_rate": 8.353611283323001e-07, + "loss": 0.8651, + "step": 215590 + }, + { + "epoch": 16.70735014917277, + "grad_norm": 1.5691173489853445, + "learning_rate": 8.353998760074395e-07, + "loss": 0.853, + "step": 215600 + }, + { + "epoch": 16.708125072649075, + "grad_norm": 1.6697032756109624, + "learning_rate": 8.354386236825792e-07, + "loss": 0.8641, + "step": 215610 + }, + { + "epoch": 16.708899996125382, + "grad_norm": 1.6478166805196117, + "learning_rate": 8.354773713577186e-07, + "loss": 0.8712, + "step": 215620 + }, + { + "epoch": 16.70967491960169, + "grad_norm": 1.7040628713462984, + "learning_rate": 8.355161190328581e-07, + "loss": 0.8542, + "step": 215630 + }, + { + "epoch": 16.710449843077996, + "grad_norm": 1.6015396712903662, + "learning_rate": 8.355548667079975e-07, + "loss": 0.8664, + "step": 215640 + }, + { + "epoch": 16.711224766554302, + "grad_norm": 1.5629716467689272, + "learning_rate": 8.35593614383137e-07, + "loss": 0.8493, + "step": 215650 + }, + { + "epoch": 16.71199969003061, + "grad_norm": 1.6188407721084892, + "learning_rate": 8.356323620582766e-07, + "loss": 0.8533, + "step": 215660 + }, + { + "epoch": 16.712774613506916, + "grad_norm": 1.6955849473726412, + "learning_rate": 8.356711097334161e-07, + "loss": 0.8563, + "step": 215670 + }, + { + "epoch": 16.713549536983223, + "grad_norm": 1.526605593593813, + "learning_rate": 8.357098574085555e-07, + "loss": 0.8499, + "step": 215680 + }, + { + "epoch": 16.71432446045953, + "grad_norm": 1.4870681370119507, + "learning_rate": 8.35748605083695e-07, + "loss": 0.8666, + "step": 215690 + }, + { + "epoch": 16.715099383935836, + "grad_norm": 1.5276649726094695, + "learning_rate": 8.357873527588344e-07, + "loss": 0.8585, + "step": 215700 + }, + { + "epoch": 16.715874307412143, + "grad_norm": 1.5580552315009015, + "learning_rate": 8.358261004339741e-07, + "loss": 0.8489, + "step": 215710 + }, + { + "epoch": 16.71664923088845, + "grad_norm": 1.7526935040643667, + "learning_rate": 8.358648481091135e-07, + "loss": 0.8572, + "step": 215720 + }, + { + "epoch": 16.717424154364757, + "grad_norm": 1.6188091627052863, + "learning_rate": 8.35903595784253e-07, + "loss": 0.8581, + "step": 215730 + }, + { + "epoch": 16.718199077841064, + "grad_norm": 1.523118964698444, + "learning_rate": 8.359423434593924e-07, + "loss": 0.873, + "step": 215740 + }, + { + "epoch": 16.71897400131737, + "grad_norm": 1.5866736552869862, + "learning_rate": 8.359810911345321e-07, + "loss": 0.8449, + "step": 215750 + }, + { + "epoch": 16.719748924793677, + "grad_norm": 1.575722162450454, + "learning_rate": 8.360198388096715e-07, + "loss": 0.8794, + "step": 215760 + }, + { + "epoch": 16.720523848269984, + "grad_norm": 1.584142134818217, + "learning_rate": 8.36058586484811e-07, + "loss": 0.882, + "step": 215770 + }, + { + "epoch": 16.72129877174629, + "grad_norm": 1.5813926608714264, + "learning_rate": 8.360973341599504e-07, + "loss": 0.8654, + "step": 215780 + }, + { + "epoch": 16.722073695222598, + "grad_norm": 1.545156242179772, + "learning_rate": 8.361360818350899e-07, + "loss": 0.8502, + "step": 215790 + }, + { + "epoch": 16.722848618698904, + "grad_norm": 1.484322899187015, + "learning_rate": 8.361748295102295e-07, + "loss": 0.8533, + "step": 215800 + }, + { + "epoch": 16.72362354217521, + "grad_norm": 1.4873316031781272, + "learning_rate": 8.36213577185369e-07, + "loss": 0.852, + "step": 215810 + }, + { + "epoch": 16.724398465651518, + "grad_norm": 1.7403105921175817, + "learning_rate": 8.362523248605084e-07, + "loss": 0.842, + "step": 215820 + }, + { + "epoch": 16.725173389127825, + "grad_norm": 1.6536330245691553, + "learning_rate": 8.362910725356479e-07, + "loss": 0.8681, + "step": 215830 + }, + { + "epoch": 16.72594831260413, + "grad_norm": 1.5827586991558598, + "learning_rate": 8.363298202107873e-07, + "loss": 0.8666, + "step": 215840 + }, + { + "epoch": 16.72672323608044, + "grad_norm": 1.5148341254589277, + "learning_rate": 8.36368567885927e-07, + "loss": 0.8592, + "step": 215850 + }, + { + "epoch": 16.727498159556745, + "grad_norm": 1.5140509556795587, + "learning_rate": 8.364073155610664e-07, + "loss": 0.8438, + "step": 215860 + }, + { + "epoch": 16.728273083033052, + "grad_norm": 1.6802368242567545, + "learning_rate": 8.364460632362059e-07, + "loss": 0.8574, + "step": 215870 + }, + { + "epoch": 16.72904800650936, + "grad_norm": 1.6327142164751984, + "learning_rate": 8.364848109113453e-07, + "loss": 0.8498, + "step": 215880 + }, + { + "epoch": 16.729822929985662, + "grad_norm": 1.513967322903238, + "learning_rate": 8.365235585864849e-07, + "loss": 0.8608, + "step": 215890 + }, + { + "epoch": 16.73059785346197, + "grad_norm": 1.679547578270937, + "learning_rate": 8.365623062616244e-07, + "loss": 0.8484, + "step": 215900 + }, + { + "epoch": 16.731372776938276, + "grad_norm": 1.6533499749395941, + "learning_rate": 8.366010539367639e-07, + "loss": 0.8517, + "step": 215910 + }, + { + "epoch": 16.732147700414583, + "grad_norm": 1.5633905848232799, + "learning_rate": 8.366398016119033e-07, + "loss": 0.8594, + "step": 215920 + }, + { + "epoch": 16.73292262389089, + "grad_norm": 1.5528760029133266, + "learning_rate": 8.366785492870428e-07, + "loss": 0.8753, + "step": 215930 + }, + { + "epoch": 16.733697547367196, + "grad_norm": 1.579844452174418, + "learning_rate": 8.367172969621823e-07, + "loss": 0.8578, + "step": 215940 + }, + { + "epoch": 16.734472470843503, + "grad_norm": 1.6693398491805065, + "learning_rate": 8.367560446373219e-07, + "loss": 0.8604, + "step": 215950 + }, + { + "epoch": 16.73524739431981, + "grad_norm": 1.6267231093392058, + "learning_rate": 8.367947923124613e-07, + "loss": 0.8282, + "step": 215960 + }, + { + "epoch": 16.736022317796117, + "grad_norm": 1.5285116156126968, + "learning_rate": 8.368335399876008e-07, + "loss": 0.8308, + "step": 215970 + }, + { + "epoch": 16.736797241272424, + "grad_norm": 1.5679017908350938, + "learning_rate": 8.368722876627402e-07, + "loss": 0.8441, + "step": 215980 + }, + { + "epoch": 16.73757216474873, + "grad_norm": 1.641654801465252, + "learning_rate": 8.369110353378798e-07, + "loss": 0.8398, + "step": 215990 + }, + { + "epoch": 16.738347088225037, + "grad_norm": 1.6428261746702884, + "learning_rate": 8.369497830130193e-07, + "loss": 0.8489, + "step": 216000 + }, + { + "epoch": 16.738347088225037, + "eval_loss": 0.8892397284507751, + "eval_runtime": 331.2725, + "eval_samples_per_second": 34.627, + "eval_steps_per_second": 8.658, + "step": 216000 + }, + { + "epoch": 16.739122011701344, + "grad_norm": 1.5647219827707821, + "learning_rate": 8.369885306881588e-07, + "loss": 0.8536, + "step": 216010 + }, + { + "epoch": 16.73989693517765, + "grad_norm": 1.4792625732166078, + "learning_rate": 8.370272783632982e-07, + "loss": 0.8549, + "step": 216020 + }, + { + "epoch": 16.740671858653958, + "grad_norm": 1.647346120600335, + "learning_rate": 8.370660260384378e-07, + "loss": 0.8528, + "step": 216030 + }, + { + "epoch": 16.741446782130264, + "grad_norm": 1.5424219742267145, + "learning_rate": 8.371047737135772e-07, + "loss": 0.84, + "step": 216040 + }, + { + "epoch": 16.74222170560657, + "grad_norm": 1.5631082074539389, + "learning_rate": 8.371435213887168e-07, + "loss": 0.8536, + "step": 216050 + }, + { + "epoch": 16.742996629082878, + "grad_norm": 1.5658726851449172, + "learning_rate": 8.371822690638562e-07, + "loss": 0.8376, + "step": 216060 + }, + { + "epoch": 16.743771552559185, + "grad_norm": 1.6541812834608782, + "learning_rate": 8.372210167389957e-07, + "loss": 0.8604, + "step": 216070 + }, + { + "epoch": 16.74454647603549, + "grad_norm": 1.6060401701110174, + "learning_rate": 8.372597644141352e-07, + "loss": 0.8417, + "step": 216080 + }, + { + "epoch": 16.7453213995118, + "grad_norm": 1.6412726504383826, + "learning_rate": 8.372985120892747e-07, + "loss": 0.8358, + "step": 216090 + }, + { + "epoch": 16.746096322988105, + "grad_norm": 1.6272726299569888, + "learning_rate": 8.373372597644142e-07, + "loss": 0.8541, + "step": 216100 + }, + { + "epoch": 16.746871246464412, + "grad_norm": 1.6230113297908526, + "learning_rate": 8.373760074395537e-07, + "loss": 0.8666, + "step": 216110 + }, + { + "epoch": 16.74764616994072, + "grad_norm": 1.7372084035039153, + "learning_rate": 8.374147551146931e-07, + "loss": 0.8511, + "step": 216120 + }, + { + "epoch": 16.748421093417026, + "grad_norm": 1.59078294713859, + "learning_rate": 8.374535027898327e-07, + "loss": 0.8572, + "step": 216130 + }, + { + "epoch": 16.749196016893332, + "grad_norm": 1.378605546792449, + "learning_rate": 8.374922504649721e-07, + "loss": 0.8438, + "step": 216140 + }, + { + "epoch": 16.74997094036964, + "grad_norm": 1.6306920253560955, + "learning_rate": 8.375309981401117e-07, + "loss": 0.8597, + "step": 216150 + }, + { + "epoch": 16.750745863845946, + "grad_norm": 1.6057572166658796, + "learning_rate": 8.375697458152511e-07, + "loss": 0.8414, + "step": 216160 + }, + { + "epoch": 16.751520787322253, + "grad_norm": 1.5253675683251167, + "learning_rate": 8.376084934903907e-07, + "loss": 0.8488, + "step": 216170 + }, + { + "epoch": 16.75229571079856, + "grad_norm": 1.612330079168095, + "learning_rate": 8.376472411655301e-07, + "loss": 0.8615, + "step": 216180 + }, + { + "epoch": 16.753070634274867, + "grad_norm": 1.6209989777298686, + "learning_rate": 8.376859888406696e-07, + "loss": 0.8377, + "step": 216190 + }, + { + "epoch": 16.753845557751173, + "grad_norm": 1.5938948077322397, + "learning_rate": 8.377247365158091e-07, + "loss": 0.8427, + "step": 216200 + }, + { + "epoch": 16.75462048122748, + "grad_norm": 1.5713445291331352, + "learning_rate": 8.377634841909486e-07, + "loss": 0.8567, + "step": 216210 + }, + { + "epoch": 16.755395404703787, + "grad_norm": 1.6748943121689785, + "learning_rate": 8.37802231866088e-07, + "loss": 0.8527, + "step": 216220 + }, + { + "epoch": 16.756170328180094, + "grad_norm": 1.5921331754193624, + "learning_rate": 8.378409795412276e-07, + "loss": 0.849, + "step": 216230 + }, + { + "epoch": 16.7569452516564, + "grad_norm": 1.6023037295945362, + "learning_rate": 8.37879727216367e-07, + "loss": 0.8564, + "step": 216240 + }, + { + "epoch": 16.757720175132704, + "grad_norm": 1.8042986502148748, + "learning_rate": 8.379184748915066e-07, + "loss": 0.8697, + "step": 216250 + }, + { + "epoch": 16.75849509860901, + "grad_norm": 1.6190430565149594, + "learning_rate": 8.37957222566646e-07, + "loss": 0.8704, + "step": 216260 + }, + { + "epoch": 16.759270022085317, + "grad_norm": 1.5982716645448125, + "learning_rate": 8.379959702417856e-07, + "loss": 0.8482, + "step": 216270 + }, + { + "epoch": 16.760044945561624, + "grad_norm": 1.5344212379380668, + "learning_rate": 8.38034717916925e-07, + "loss": 0.835, + "step": 216280 + }, + { + "epoch": 16.76081986903793, + "grad_norm": 1.5963204375263789, + "learning_rate": 8.380734655920646e-07, + "loss": 0.8592, + "step": 216290 + }, + { + "epoch": 16.761594792514238, + "grad_norm": 1.5882451453743691, + "learning_rate": 8.38112213267204e-07, + "loss": 0.8473, + "step": 216300 + }, + { + "epoch": 16.762369715990545, + "grad_norm": 1.6353222404385466, + "learning_rate": 8.381509609423436e-07, + "loss": 0.8578, + "step": 216310 + }, + { + "epoch": 16.76314463946685, + "grad_norm": 1.7324045494389135, + "learning_rate": 8.38189708617483e-07, + "loss": 0.8563, + "step": 216320 + }, + { + "epoch": 16.76391956294316, + "grad_norm": 1.579283065249744, + "learning_rate": 8.382284562926225e-07, + "loss": 0.8471, + "step": 216330 + }, + { + "epoch": 16.764694486419465, + "grad_norm": 1.5893175072542474, + "learning_rate": 8.38267203967762e-07, + "loss": 0.8575, + "step": 216340 + }, + { + "epoch": 16.765469409895772, + "grad_norm": 1.5123399409419407, + "learning_rate": 8.383059516429015e-07, + "loss": 0.8539, + "step": 216350 + }, + { + "epoch": 16.76624433337208, + "grad_norm": 1.6009299464472695, + "learning_rate": 8.383446993180409e-07, + "loss": 0.8452, + "step": 216360 + }, + { + "epoch": 16.767019256848386, + "grad_norm": 1.6395447469314326, + "learning_rate": 8.383834469931805e-07, + "loss": 0.846, + "step": 216370 + }, + { + "epoch": 16.767794180324692, + "grad_norm": 1.5901041288800044, + "learning_rate": 8.384221946683199e-07, + "loss": 0.8528, + "step": 216380 + }, + { + "epoch": 16.768569103801, + "grad_norm": 1.555285540419973, + "learning_rate": 8.384609423434595e-07, + "loss": 0.8473, + "step": 216390 + }, + { + "epoch": 16.769344027277306, + "grad_norm": 1.5986903167072908, + "learning_rate": 8.384996900185989e-07, + "loss": 0.8458, + "step": 216400 + }, + { + "epoch": 16.770118950753613, + "grad_norm": 1.5316881772333213, + "learning_rate": 8.385384376937385e-07, + "loss": 0.8614, + "step": 216410 + }, + { + "epoch": 16.77089387422992, + "grad_norm": 1.6140237767549839, + "learning_rate": 8.385771853688779e-07, + "loss": 0.8783, + "step": 216420 + }, + { + "epoch": 16.771668797706226, + "grad_norm": 1.5876476320844501, + "learning_rate": 8.386159330440174e-07, + "loss": 0.8571, + "step": 216430 + }, + { + "epoch": 16.772443721182533, + "grad_norm": 1.54442524062211, + "learning_rate": 8.386546807191569e-07, + "loss": 0.8699, + "step": 216440 + }, + { + "epoch": 16.77321864465884, + "grad_norm": 1.664595775321911, + "learning_rate": 8.386934283942965e-07, + "loss": 0.8599, + "step": 216450 + }, + { + "epoch": 16.773993568135147, + "grad_norm": 1.5828438942852903, + "learning_rate": 8.387321760694359e-07, + "loss": 0.864, + "step": 216460 + }, + { + "epoch": 16.774768491611454, + "grad_norm": 1.6807179801231618, + "learning_rate": 8.387709237445754e-07, + "loss": 0.8576, + "step": 216470 + }, + { + "epoch": 16.77554341508776, + "grad_norm": 1.6089584105517891, + "learning_rate": 8.388096714197148e-07, + "loss": 0.8592, + "step": 216480 + }, + { + "epoch": 16.776318338564067, + "grad_norm": 1.6309072204289508, + "learning_rate": 8.388484190948544e-07, + "loss": 0.8645, + "step": 216490 + }, + { + "epoch": 16.777093262040374, + "grad_norm": 1.5220407459960479, + "learning_rate": 8.388871667699938e-07, + "loss": 0.8549, + "step": 216500 + }, + { + "epoch": 16.777093262040374, + "eval_loss": 0.8888506293296814, + "eval_runtime": 331.4984, + "eval_samples_per_second": 34.603, + "eval_steps_per_second": 8.652, + "step": 216500 + }, + { + "epoch": 16.77786818551668, + "grad_norm": 1.6120600839547965, + "learning_rate": 8.389259144451334e-07, + "loss": 0.8712, + "step": 216510 + }, + { + "epoch": 16.778643108992988, + "grad_norm": 1.5671924702305853, + "learning_rate": 8.389646621202728e-07, + "loss": 0.8497, + "step": 216520 + }, + { + "epoch": 16.779418032469295, + "grad_norm": 1.5626875485964302, + "learning_rate": 8.390034097954123e-07, + "loss": 0.8623, + "step": 216530 + }, + { + "epoch": 16.7801929559456, + "grad_norm": 1.6665756984326447, + "learning_rate": 8.390421574705518e-07, + "loss": 0.8572, + "step": 216540 + }, + { + "epoch": 16.780967879421908, + "grad_norm": 1.6574006330772533, + "learning_rate": 8.390809051456914e-07, + "loss": 0.8288, + "step": 216550 + }, + { + "epoch": 16.781742802898215, + "grad_norm": 1.6991160301962966, + "learning_rate": 8.391196528208308e-07, + "loss": 0.8779, + "step": 216560 + }, + { + "epoch": 16.78251772637452, + "grad_norm": 1.5733199775803945, + "learning_rate": 8.391584004959703e-07, + "loss": 0.8461, + "step": 216570 + }, + { + "epoch": 16.78329264985083, + "grad_norm": 1.5395439737648042, + "learning_rate": 8.391971481711097e-07, + "loss": 0.827, + "step": 216580 + }, + { + "epoch": 16.784067573327135, + "grad_norm": 1.567942182682598, + "learning_rate": 8.392358958462494e-07, + "loss": 0.8567, + "step": 216590 + }, + { + "epoch": 16.784842496803442, + "grad_norm": 1.5636434183198364, + "learning_rate": 8.392746435213888e-07, + "loss": 0.8518, + "step": 216600 + }, + { + "epoch": 16.78561742027975, + "grad_norm": 1.588549661461649, + "learning_rate": 8.393133911965283e-07, + "loss": 0.8399, + "step": 216610 + }, + { + "epoch": 16.786392343756056, + "grad_norm": 1.60212000630303, + "learning_rate": 8.393521388716677e-07, + "loss": 0.841, + "step": 216620 + }, + { + "epoch": 16.787167267232363, + "grad_norm": 1.5324056658679126, + "learning_rate": 8.393908865468072e-07, + "loss": 0.8529, + "step": 216630 + }, + { + "epoch": 16.787942190708666, + "grad_norm": 1.5890721374002736, + "learning_rate": 8.394296342219467e-07, + "loss": 0.8582, + "step": 216640 + }, + { + "epoch": 16.788717114184973, + "grad_norm": 1.627777228022653, + "learning_rate": 8.394683818970863e-07, + "loss": 0.8529, + "step": 216650 + }, + { + "epoch": 16.78949203766128, + "grad_norm": 1.6284934093472305, + "learning_rate": 8.395071295722257e-07, + "loss": 0.8616, + "step": 216660 + }, + { + "epoch": 16.790266961137586, + "grad_norm": 1.4798833280275616, + "learning_rate": 8.395458772473652e-07, + "loss": 0.8464, + "step": 216670 + }, + { + "epoch": 16.791041884613893, + "grad_norm": 1.6125311968742946, + "learning_rate": 8.395846249225046e-07, + "loss": 0.8547, + "step": 216680 + }, + { + "epoch": 16.7918168080902, + "grad_norm": 1.5672172614906832, + "learning_rate": 8.396233725976443e-07, + "loss": 0.8476, + "step": 216690 + }, + { + "epoch": 16.792591731566507, + "grad_norm": 1.6787563497524416, + "learning_rate": 8.396621202727837e-07, + "loss": 0.8568, + "step": 216700 + }, + { + "epoch": 16.793366655042814, + "grad_norm": 1.5640155493878012, + "learning_rate": 8.397008679479232e-07, + "loss": 0.8591, + "step": 216710 + }, + { + "epoch": 16.79414157851912, + "grad_norm": 1.5653036026338198, + "learning_rate": 8.397396156230626e-07, + "loss": 0.8676, + "step": 216720 + }, + { + "epoch": 16.794916501995427, + "grad_norm": 1.5264929745046534, + "learning_rate": 8.397783632982022e-07, + "loss": 0.8522, + "step": 216730 + }, + { + "epoch": 16.795691425471734, + "grad_norm": 1.6204870105702198, + "learning_rate": 8.398171109733417e-07, + "loss": 0.8444, + "step": 216740 + }, + { + "epoch": 16.79646634894804, + "grad_norm": 1.5265025968886046, + "learning_rate": 8.398558586484812e-07, + "loss": 0.8504, + "step": 216750 + }, + { + "epoch": 16.797241272424348, + "grad_norm": 1.5567634780568413, + "learning_rate": 8.398946063236206e-07, + "loss": 0.8728, + "step": 216760 + }, + { + "epoch": 16.798016195900654, + "grad_norm": 1.5531383521288318, + "learning_rate": 8.399333539987601e-07, + "loss": 0.8694, + "step": 216770 + }, + { + "epoch": 16.79879111937696, + "grad_norm": 1.579122920870954, + "learning_rate": 8.399721016738995e-07, + "loss": 0.8469, + "step": 216780 + }, + { + "epoch": 16.799566042853268, + "grad_norm": 1.509357414249215, + "learning_rate": 8.400108493490392e-07, + "loss": 0.8632, + "step": 216790 + }, + { + "epoch": 16.800340966329575, + "grad_norm": 1.5907575952466095, + "learning_rate": 8.400495970241786e-07, + "loss": 0.8558, + "step": 216800 + }, + { + "epoch": 16.80111588980588, + "grad_norm": 1.5131334732317863, + "learning_rate": 8.400883446993181e-07, + "loss": 0.8558, + "step": 216810 + }, + { + "epoch": 16.80189081328219, + "grad_norm": 1.613031094015427, + "learning_rate": 8.401270923744575e-07, + "loss": 0.8495, + "step": 216820 + }, + { + "epoch": 16.802665736758495, + "grad_norm": 1.5997913069045049, + "learning_rate": 8.401658400495971e-07, + "loss": 0.8576, + "step": 216830 + }, + { + "epoch": 16.803440660234802, + "grad_norm": 1.5174795701830177, + "learning_rate": 8.402045877247366e-07, + "loss": 0.8623, + "step": 216840 + }, + { + "epoch": 16.80421558371111, + "grad_norm": 1.6400807259427745, + "learning_rate": 8.402433353998761e-07, + "loss": 0.8512, + "step": 216850 + }, + { + "epoch": 16.804990507187416, + "grad_norm": 1.5403583804579015, + "learning_rate": 8.402820830750155e-07, + "loss": 0.8676, + "step": 216860 + }, + { + "epoch": 16.805765430663723, + "grad_norm": 1.6626380874228626, + "learning_rate": 8.403208307501551e-07, + "loss": 0.8591, + "step": 216870 + }, + { + "epoch": 16.80654035414003, + "grad_norm": 1.5667671756649029, + "learning_rate": 8.403595784252945e-07, + "loss": 0.8607, + "step": 216880 + }, + { + "epoch": 16.807315277616336, + "grad_norm": 1.5815256879964235, + "learning_rate": 8.403983261004341e-07, + "loss": 0.8569, + "step": 216890 + }, + { + "epoch": 16.808090201092643, + "grad_norm": 1.6807180105907398, + "learning_rate": 8.404370737755735e-07, + "loss": 0.8605, + "step": 216900 + }, + { + "epoch": 16.80886512456895, + "grad_norm": 1.622210242405503, + "learning_rate": 8.40475821450713e-07, + "loss": 0.8567, + "step": 216910 + }, + { + "epoch": 16.809640048045257, + "grad_norm": 1.5402043311130078, + "learning_rate": 8.405145691258524e-07, + "loss": 0.8459, + "step": 216920 + }, + { + "epoch": 16.810414971521563, + "grad_norm": 1.5292716292527506, + "learning_rate": 8.40553316800992e-07, + "loss": 0.8442, + "step": 216930 + }, + { + "epoch": 16.81118989499787, + "grad_norm": 1.734042656347605, + "learning_rate": 8.405920644761315e-07, + "loss": 0.8493, + "step": 216940 + }, + { + "epoch": 16.811964818474177, + "grad_norm": 1.4921337327282997, + "learning_rate": 8.40630812151271e-07, + "loss": 0.8444, + "step": 216950 + }, + { + "epoch": 16.812739741950484, + "grad_norm": 1.7265996740143723, + "learning_rate": 8.406695598264104e-07, + "loss": 0.8508, + "step": 216960 + }, + { + "epoch": 16.81351466542679, + "grad_norm": 1.5114274113116275, + "learning_rate": 8.4070830750155e-07, + "loss": 0.8568, + "step": 216970 + }, + { + "epoch": 16.814289588903097, + "grad_norm": 1.6498888459687224, + "learning_rate": 8.407470551766895e-07, + "loss": 0.8585, + "step": 216980 + }, + { + "epoch": 16.815064512379404, + "grad_norm": 1.704557133846023, + "learning_rate": 8.40785802851829e-07, + "loss": 0.8646, + "step": 216990 + }, + { + "epoch": 16.815839435855708, + "grad_norm": 1.5930339194392638, + "learning_rate": 8.408245505269684e-07, + "loss": 0.8517, + "step": 217000 + }, + { + "epoch": 16.815839435855708, + "eval_loss": 0.8883286118507385, + "eval_runtime": 329.0724, + "eval_samples_per_second": 34.859, + "eval_steps_per_second": 8.715, + "step": 217000 + }, + { + "epoch": 16.816614359332014, + "grad_norm": 1.5832578086759486, + "learning_rate": 8.40863298202108e-07, + "loss": 0.8654, + "step": 217010 + }, + { + "epoch": 16.81738928280832, + "grad_norm": 1.5289700091941, + "learning_rate": 8.409020458772474e-07, + "loss": 0.8549, + "step": 217020 + }, + { + "epoch": 16.818164206284628, + "grad_norm": 1.640337750994925, + "learning_rate": 8.40940793552387e-07, + "loss": 0.8576, + "step": 217030 + }, + { + "epoch": 16.818939129760935, + "grad_norm": 1.5575520728437042, + "learning_rate": 8.409795412275264e-07, + "loss": 0.8441, + "step": 217040 + }, + { + "epoch": 16.81971405323724, + "grad_norm": 1.752897206158001, + "learning_rate": 8.410182889026659e-07, + "loss": 0.8576, + "step": 217050 + }, + { + "epoch": 16.82048897671355, + "grad_norm": 1.6693570832040572, + "learning_rate": 8.410570365778053e-07, + "loss": 0.8444, + "step": 217060 + }, + { + "epoch": 16.821263900189855, + "grad_norm": 1.6455034701268743, + "learning_rate": 8.410957842529449e-07, + "loss": 0.8674, + "step": 217070 + }, + { + "epoch": 16.822038823666162, + "grad_norm": 1.5914500361539505, + "learning_rate": 8.411345319280844e-07, + "loss": 0.8675, + "step": 217080 + }, + { + "epoch": 16.82281374714247, + "grad_norm": 1.8070593355120395, + "learning_rate": 8.411732796032239e-07, + "loss": 0.8561, + "step": 217090 + }, + { + "epoch": 16.823588670618776, + "grad_norm": 1.6627216352138456, + "learning_rate": 8.412120272783633e-07, + "loss": 0.8449, + "step": 217100 + }, + { + "epoch": 16.824363594095082, + "grad_norm": 1.6735799766110313, + "learning_rate": 8.412507749535029e-07, + "loss": 0.8734, + "step": 217110 + }, + { + "epoch": 16.82513851757139, + "grad_norm": 1.5821658438748496, + "learning_rate": 8.412895226286423e-07, + "loss": 0.841, + "step": 217120 + }, + { + "epoch": 16.825913441047696, + "grad_norm": 1.575485519349194, + "learning_rate": 8.413282703037819e-07, + "loss": 0.8563, + "step": 217130 + }, + { + "epoch": 16.826688364524003, + "grad_norm": 1.5726289739939112, + "learning_rate": 8.413670179789213e-07, + "loss": 0.8676, + "step": 217140 + }, + { + "epoch": 16.82746328800031, + "grad_norm": 1.5149224420810823, + "learning_rate": 8.414057656540608e-07, + "loss": 0.8478, + "step": 217150 + }, + { + "epoch": 16.828238211476616, + "grad_norm": 1.545287432273084, + "learning_rate": 8.414445133292003e-07, + "loss": 0.8445, + "step": 217160 + }, + { + "epoch": 16.829013134952923, + "grad_norm": 1.5578570184736107, + "learning_rate": 8.414832610043398e-07, + "loss": 0.8499, + "step": 217170 + }, + { + "epoch": 16.82978805842923, + "grad_norm": 1.6411514436214907, + "learning_rate": 8.415220086794793e-07, + "loss": 0.8842, + "step": 217180 + }, + { + "epoch": 16.830562981905537, + "grad_norm": 1.4925393695159936, + "learning_rate": 8.415607563546188e-07, + "loss": 0.8446, + "step": 217190 + }, + { + "epoch": 16.831337905381844, + "grad_norm": 1.5403388654739871, + "learning_rate": 8.415995040297582e-07, + "loss": 0.8446, + "step": 217200 + }, + { + "epoch": 16.83211282885815, + "grad_norm": 1.6041127124044001, + "learning_rate": 8.416382517048978e-07, + "loss": 0.8659, + "step": 217210 + }, + { + "epoch": 16.832887752334457, + "grad_norm": 1.6819723991411626, + "learning_rate": 8.416769993800372e-07, + "loss": 0.8485, + "step": 217220 + }, + { + "epoch": 16.833662675810764, + "grad_norm": 1.6628545915286714, + "learning_rate": 8.417157470551768e-07, + "loss": 0.8789, + "step": 217230 + }, + { + "epoch": 16.83443759928707, + "grad_norm": 1.6968011548473036, + "learning_rate": 8.417544947303162e-07, + "loss": 0.854, + "step": 217240 + }, + { + "epoch": 16.835212522763378, + "grad_norm": 1.692720182284368, + "learning_rate": 8.417932424054558e-07, + "loss": 0.8561, + "step": 217250 + }, + { + "epoch": 16.835987446239685, + "grad_norm": 1.700873763130327, + "learning_rate": 8.418319900805952e-07, + "loss": 0.8397, + "step": 217260 + }, + { + "epoch": 16.83676236971599, + "grad_norm": 1.5697926540741418, + "learning_rate": 8.418707377557347e-07, + "loss": 0.8562, + "step": 217270 + }, + { + "epoch": 16.837537293192298, + "grad_norm": 1.6141285720874543, + "learning_rate": 8.419094854308742e-07, + "loss": 0.8628, + "step": 217280 + }, + { + "epoch": 16.838312216668605, + "grad_norm": 1.6759279666098919, + "learning_rate": 8.419482331060137e-07, + "loss": 0.8516, + "step": 217290 + }, + { + "epoch": 16.839087140144912, + "grad_norm": 1.62028627812922, + "learning_rate": 8.419869807811532e-07, + "loss": 0.8552, + "step": 217300 + }, + { + "epoch": 16.83986206362122, + "grad_norm": 1.7097148709410213, + "learning_rate": 8.420257284562927e-07, + "loss": 0.8599, + "step": 217310 + }, + { + "epoch": 16.840636987097525, + "grad_norm": 1.5568887006509555, + "learning_rate": 8.420644761314321e-07, + "loss": 0.8441, + "step": 217320 + }, + { + "epoch": 16.841411910573832, + "grad_norm": 1.573311219423137, + "learning_rate": 8.421032238065717e-07, + "loss": 0.8469, + "step": 217330 + }, + { + "epoch": 16.84218683405014, + "grad_norm": 1.6278857053103895, + "learning_rate": 8.421419714817111e-07, + "loss": 0.8428, + "step": 217340 + }, + { + "epoch": 16.842961757526446, + "grad_norm": 1.5277932558283178, + "learning_rate": 8.421807191568507e-07, + "loss": 0.8539, + "step": 217350 + }, + { + "epoch": 16.843736681002753, + "grad_norm": 1.650994634781606, + "learning_rate": 8.422194668319901e-07, + "loss": 0.8644, + "step": 217360 + }, + { + "epoch": 16.84451160447906, + "grad_norm": 1.741523619875369, + "learning_rate": 8.422582145071296e-07, + "loss": 0.8469, + "step": 217370 + }, + { + "epoch": 16.845286527955363, + "grad_norm": 1.7867260243215066, + "learning_rate": 8.422969621822691e-07, + "loss": 0.8449, + "step": 217380 + }, + { + "epoch": 16.84606145143167, + "grad_norm": 1.6901806556122654, + "learning_rate": 8.423357098574087e-07, + "loss": 0.8627, + "step": 217390 + }, + { + "epoch": 16.846836374907976, + "grad_norm": 1.4809316753813506, + "learning_rate": 8.423744575325481e-07, + "loss": 0.8445, + "step": 217400 + }, + { + "epoch": 16.847611298384283, + "grad_norm": 1.5994655341193618, + "learning_rate": 8.424132052076876e-07, + "loss": 0.8795, + "step": 217410 + }, + { + "epoch": 16.84838622186059, + "grad_norm": 1.5495413950256614, + "learning_rate": 8.42451952882827e-07, + "loss": 0.8459, + "step": 217420 + }, + { + "epoch": 16.849161145336897, + "grad_norm": 1.5780587900006962, + "learning_rate": 8.424907005579666e-07, + "loss": 0.8665, + "step": 217430 + }, + { + "epoch": 16.849936068813204, + "grad_norm": 1.5271340606387027, + "learning_rate": 8.425294482331061e-07, + "loss": 0.866, + "step": 217440 + }, + { + "epoch": 16.85071099228951, + "grad_norm": 1.5951853951185575, + "learning_rate": 8.425681959082456e-07, + "loss": 0.8512, + "step": 217450 + }, + { + "epoch": 16.851485915765817, + "grad_norm": 1.5926813520341094, + "learning_rate": 8.42606943583385e-07, + "loss": 0.8652, + "step": 217460 + }, + { + "epoch": 16.852260839242124, + "grad_norm": 1.5788358933221702, + "learning_rate": 8.426456912585245e-07, + "loss": 0.8659, + "step": 217470 + }, + { + "epoch": 16.85303576271843, + "grad_norm": 1.7571249736881132, + "learning_rate": 8.42684438933664e-07, + "loss": 0.8507, + "step": 217480 + }, + { + "epoch": 16.853810686194738, + "grad_norm": 1.5454973579648772, + "learning_rate": 8.427231866088036e-07, + "loss": 0.8471, + "step": 217490 + }, + { + "epoch": 16.854585609671044, + "grad_norm": 1.7365505991828651, + "learning_rate": 8.42761934283943e-07, + "loss": 0.8637, + "step": 217500 + }, + { + "epoch": 16.854585609671044, + "eval_loss": 0.8887977004051208, + "eval_runtime": 327.3709, + "eval_samples_per_second": 35.04, + "eval_steps_per_second": 8.761, + "step": 217500 + }, + { + "epoch": 16.85536053314735, + "grad_norm": 1.625183025787728, + "learning_rate": 8.428006819590825e-07, + "loss": 0.8399, + "step": 217510 + }, + { + "epoch": 16.856135456623658, + "grad_norm": 1.5707215759521276, + "learning_rate": 8.428394296342219e-07, + "loss": 0.8546, + "step": 217520 + }, + { + "epoch": 16.856910380099965, + "grad_norm": 1.604734268624766, + "learning_rate": 8.428781773093616e-07, + "loss": 0.8757, + "step": 217530 + }, + { + "epoch": 16.85768530357627, + "grad_norm": 1.5615023797398797, + "learning_rate": 8.42916924984501e-07, + "loss": 0.8374, + "step": 217540 + }, + { + "epoch": 16.85846022705258, + "grad_norm": 1.6364080836126222, + "learning_rate": 8.429556726596405e-07, + "loss": 0.8333, + "step": 217550 + }, + { + "epoch": 16.859235150528885, + "grad_norm": 1.598262794538644, + "learning_rate": 8.429944203347799e-07, + "loss": 0.8463, + "step": 217560 + }, + { + "epoch": 16.860010074005192, + "grad_norm": 1.6040970532230696, + "learning_rate": 8.430331680099194e-07, + "loss": 0.8651, + "step": 217570 + }, + { + "epoch": 16.8607849974815, + "grad_norm": 1.612236324147994, + "learning_rate": 8.43071915685059e-07, + "loss": 0.8535, + "step": 217580 + }, + { + "epoch": 16.861559920957806, + "grad_norm": 1.5726082485190973, + "learning_rate": 8.431106633601985e-07, + "loss": 0.8628, + "step": 217590 + }, + { + "epoch": 16.862334844434113, + "grad_norm": 1.6616818152155979, + "learning_rate": 8.431494110353379e-07, + "loss": 0.8434, + "step": 217600 + }, + { + "epoch": 16.86310976791042, + "grad_norm": 1.5153059376627138, + "learning_rate": 8.431881587104774e-07, + "loss": 0.8567, + "step": 217610 + }, + { + "epoch": 16.863884691386726, + "grad_norm": 1.5682633315801389, + "learning_rate": 8.432269063856168e-07, + "loss": 0.8584, + "step": 217620 + }, + { + "epoch": 16.864659614863033, + "grad_norm": 1.5603225005788357, + "learning_rate": 8.432656540607565e-07, + "loss": 0.8651, + "step": 217630 + }, + { + "epoch": 16.86543453833934, + "grad_norm": 1.5502235839186818, + "learning_rate": 8.433044017358959e-07, + "loss": 0.8462, + "step": 217640 + }, + { + "epoch": 16.866209461815647, + "grad_norm": 1.5132681658029854, + "learning_rate": 8.433431494110354e-07, + "loss": 0.8511, + "step": 217650 + }, + { + "epoch": 16.866984385291953, + "grad_norm": 1.7078590319208047, + "learning_rate": 8.433818970861748e-07, + "loss": 0.8782, + "step": 217660 + }, + { + "epoch": 16.86775930876826, + "grad_norm": 1.622888085970179, + "learning_rate": 8.434206447613145e-07, + "loss": 0.8513, + "step": 217670 + }, + { + "epoch": 16.868534232244567, + "grad_norm": 1.5375761641686942, + "learning_rate": 8.434593924364539e-07, + "loss": 0.8522, + "step": 217680 + }, + { + "epoch": 16.869309155720874, + "grad_norm": 1.7824408812755526, + "learning_rate": 8.434981401115934e-07, + "loss": 0.8308, + "step": 217690 + }, + { + "epoch": 16.87008407919718, + "grad_norm": 1.5870542078655858, + "learning_rate": 8.435368877867328e-07, + "loss": 0.8428, + "step": 217700 + }, + { + "epoch": 16.870859002673487, + "grad_norm": 1.6210300806644173, + "learning_rate": 8.435756354618723e-07, + "loss": 0.8538, + "step": 217710 + }, + { + "epoch": 16.871633926149794, + "grad_norm": 1.6156517557730912, + "learning_rate": 8.436143831370117e-07, + "loss": 0.8591, + "step": 217720 + }, + { + "epoch": 16.8724088496261, + "grad_norm": 1.5684558589256634, + "learning_rate": 8.436531308121514e-07, + "loss": 0.875, + "step": 217730 + }, + { + "epoch": 16.873183773102404, + "grad_norm": 1.5380080418348352, + "learning_rate": 8.436918784872908e-07, + "loss": 0.8618, + "step": 217740 + }, + { + "epoch": 16.87395869657871, + "grad_norm": 1.5043676619523851, + "learning_rate": 8.437306261624303e-07, + "loss": 0.8753, + "step": 217750 + }, + { + "epoch": 16.874733620055018, + "grad_norm": 1.609565685097529, + "learning_rate": 8.437693738375697e-07, + "loss": 0.8471, + "step": 217760 + }, + { + "epoch": 16.875508543531325, + "grad_norm": 1.5540310448240704, + "learning_rate": 8.438081215127094e-07, + "loss": 0.8519, + "step": 217770 + }, + { + "epoch": 16.87628346700763, + "grad_norm": 1.5843948645889732, + "learning_rate": 8.438468691878488e-07, + "loss": 0.8439, + "step": 217780 + }, + { + "epoch": 16.87705839048394, + "grad_norm": 1.720946071060008, + "learning_rate": 8.438856168629883e-07, + "loss": 0.8582, + "step": 217790 + }, + { + "epoch": 16.877833313960245, + "grad_norm": 1.5265304364639205, + "learning_rate": 8.439243645381277e-07, + "loss": 0.8673, + "step": 217800 + }, + { + "epoch": 16.878608237436552, + "grad_norm": 1.6370595400092522, + "learning_rate": 8.439631122132673e-07, + "loss": 0.8784, + "step": 217810 + }, + { + "epoch": 16.87938316091286, + "grad_norm": 1.6054255951987535, + "learning_rate": 8.440018598884068e-07, + "loss": 0.857, + "step": 217820 + }, + { + "epoch": 16.880158084389166, + "grad_norm": 1.5524089244539019, + "learning_rate": 8.440406075635463e-07, + "loss": 0.8679, + "step": 217830 + }, + { + "epoch": 16.880933007865472, + "grad_norm": 1.6382519124432513, + "learning_rate": 8.440793552386857e-07, + "loss": 0.8418, + "step": 217840 + }, + { + "epoch": 16.88170793134178, + "grad_norm": 1.664764520192733, + "learning_rate": 8.441181029138252e-07, + "loss": 0.8398, + "step": 217850 + }, + { + "epoch": 16.882482854818086, + "grad_norm": 1.5423962819927608, + "learning_rate": 8.441568505889646e-07, + "loss": 0.8391, + "step": 217860 + }, + { + "epoch": 16.883257778294393, + "grad_norm": 1.6629255566306835, + "learning_rate": 8.441955982641043e-07, + "loss": 0.8413, + "step": 217870 + }, + { + "epoch": 16.8840327017707, + "grad_norm": 1.5423231369365926, + "learning_rate": 8.442343459392437e-07, + "loss": 0.8398, + "step": 217880 + }, + { + "epoch": 16.884807625247007, + "grad_norm": 1.557139599752453, + "learning_rate": 8.442730936143832e-07, + "loss": 0.8439, + "step": 217890 + }, + { + "epoch": 16.885582548723313, + "grad_norm": 1.6288814757101215, + "learning_rate": 8.443118412895226e-07, + "loss": 0.8589, + "step": 217900 + }, + { + "epoch": 16.88635747219962, + "grad_norm": 1.6079460366388019, + "learning_rate": 8.443505889646622e-07, + "loss": 0.8704, + "step": 217910 + }, + { + "epoch": 16.887132395675927, + "grad_norm": 1.6264157683722298, + "learning_rate": 8.443893366398017e-07, + "loss": 0.8442, + "step": 217920 + }, + { + "epoch": 16.887907319152234, + "grad_norm": 1.6502290857155462, + "learning_rate": 8.444280843149412e-07, + "loss": 0.858, + "step": 217930 + }, + { + "epoch": 16.88868224262854, + "grad_norm": 1.5966252489040482, + "learning_rate": 8.444668319900806e-07, + "loss": 0.8489, + "step": 217940 + }, + { + "epoch": 16.889457166104847, + "grad_norm": 1.6001466087700649, + "learning_rate": 8.445055796652202e-07, + "loss": 0.8453, + "step": 217950 + }, + { + "epoch": 16.890232089581154, + "grad_norm": 1.641144963797374, + "learning_rate": 8.445443273403596e-07, + "loss": 0.8397, + "step": 217960 + }, + { + "epoch": 16.89100701305746, + "grad_norm": 1.593385059601785, + "learning_rate": 8.445830750154992e-07, + "loss": 0.8631, + "step": 217970 + }, + { + "epoch": 16.891781936533768, + "grad_norm": 1.5833191306240466, + "learning_rate": 8.446218226906386e-07, + "loss": 0.8595, + "step": 217980 + }, + { + "epoch": 16.892556860010075, + "grad_norm": 1.5437172275381912, + "learning_rate": 8.446605703657781e-07, + "loss": 0.8531, + "step": 217990 + }, + { + "epoch": 16.89333178348638, + "grad_norm": 1.6222308650785837, + "learning_rate": 8.446993180409175e-07, + "loss": 0.8551, + "step": 218000 + }, + { + "epoch": 16.89333178348638, + "eval_loss": 0.8883914351463318, + "eval_runtime": 328.1006, + "eval_samples_per_second": 34.962, + "eval_steps_per_second": 8.741, + "step": 218000 + }, + { + "epoch": 16.89410670696269, + "grad_norm": 1.6164015436357, + "learning_rate": 8.447380657160571e-07, + "loss": 0.8605, + "step": 218010 + }, + { + "epoch": 16.894881630438995, + "grad_norm": 1.6466338254998731, + "learning_rate": 8.447768133911966e-07, + "loss": 0.843, + "step": 218020 + }, + { + "epoch": 16.895656553915302, + "grad_norm": 1.6703566078661554, + "learning_rate": 8.448155610663361e-07, + "loss": 0.8533, + "step": 218030 + }, + { + "epoch": 16.89643147739161, + "grad_norm": 1.5100319807166822, + "learning_rate": 8.448543087414755e-07, + "loss": 0.8382, + "step": 218040 + }, + { + "epoch": 16.897206400867915, + "grad_norm": 1.6156505202618106, + "learning_rate": 8.448930564166151e-07, + "loss": 0.8496, + "step": 218050 + }, + { + "epoch": 16.897981324344222, + "grad_norm": 1.5407183682396923, + "learning_rate": 8.449318040917545e-07, + "loss": 0.8481, + "step": 218060 + }, + { + "epoch": 16.89875624782053, + "grad_norm": 1.5775575517308815, + "learning_rate": 8.449705517668941e-07, + "loss": 0.8612, + "step": 218070 + }, + { + "epoch": 16.899531171296836, + "grad_norm": 1.7738243592286014, + "learning_rate": 8.450092994420335e-07, + "loss": 0.8576, + "step": 218080 + }, + { + "epoch": 16.900306094773143, + "grad_norm": 1.576950286853752, + "learning_rate": 8.450480471171731e-07, + "loss": 0.8462, + "step": 218090 + }, + { + "epoch": 16.90108101824945, + "grad_norm": 1.6478165411452803, + "learning_rate": 8.450867947923125e-07, + "loss": 0.859, + "step": 218100 + }, + { + "epoch": 16.901855941725756, + "grad_norm": 1.624184853234039, + "learning_rate": 8.45125542467452e-07, + "loss": 0.8536, + "step": 218110 + }, + { + "epoch": 16.90263086520206, + "grad_norm": 1.775482828303328, + "learning_rate": 8.451642901425915e-07, + "loss": 0.8485, + "step": 218120 + }, + { + "epoch": 16.903405788678366, + "grad_norm": 1.4596184537628987, + "learning_rate": 8.45203037817731e-07, + "loss": 0.8672, + "step": 218130 + }, + { + "epoch": 16.904180712154673, + "grad_norm": 1.5249453133916255, + "learning_rate": 8.452417854928704e-07, + "loss": 0.852, + "step": 218140 + }, + { + "epoch": 16.90495563563098, + "grad_norm": 1.593600421949521, + "learning_rate": 8.4528053316801e-07, + "loss": 0.8671, + "step": 218150 + }, + { + "epoch": 16.905730559107287, + "grad_norm": 1.5919826223681148, + "learning_rate": 8.453192808431494e-07, + "loss": 0.8358, + "step": 218160 + }, + { + "epoch": 16.906505482583594, + "grad_norm": 1.555963967841974, + "learning_rate": 8.45358028518289e-07, + "loss": 0.8416, + "step": 218170 + }, + { + "epoch": 16.9072804060599, + "grad_norm": 1.6876713928808982, + "learning_rate": 8.453967761934284e-07, + "loss": 0.8679, + "step": 218180 + }, + { + "epoch": 16.908055329536207, + "grad_norm": 1.6164917007951882, + "learning_rate": 8.45435523868568e-07, + "loss": 0.8725, + "step": 218190 + }, + { + "epoch": 16.908830253012514, + "grad_norm": 1.559079905043652, + "learning_rate": 8.454742715437074e-07, + "loss": 0.8437, + "step": 218200 + }, + { + "epoch": 16.90960517648882, + "grad_norm": 1.5506320775407587, + "learning_rate": 8.45513019218847e-07, + "loss": 0.8518, + "step": 218210 + }, + { + "epoch": 16.910380099965128, + "grad_norm": 1.5428237402681446, + "learning_rate": 8.455517668939864e-07, + "loss": 0.8536, + "step": 218220 + }, + { + "epoch": 16.911155023441435, + "grad_norm": 1.5703463218680276, + "learning_rate": 8.45590514569126e-07, + "loss": 0.8383, + "step": 218230 + }, + { + "epoch": 16.91192994691774, + "grad_norm": 1.6630386401651303, + "learning_rate": 8.456292622442654e-07, + "loss": 0.8547, + "step": 218240 + }, + { + "epoch": 16.912704870394048, + "grad_norm": 1.673885993803983, + "learning_rate": 8.456680099194049e-07, + "loss": 0.8753, + "step": 218250 + }, + { + "epoch": 16.913479793870355, + "grad_norm": 1.7025275221833185, + "learning_rate": 8.457067575945443e-07, + "loss": 0.8574, + "step": 218260 + }, + { + "epoch": 16.91425471734666, + "grad_norm": 1.6497733512612152, + "learning_rate": 8.457455052696839e-07, + "loss": 0.863, + "step": 218270 + }, + { + "epoch": 16.91502964082297, + "grad_norm": 1.6077063872190427, + "learning_rate": 8.457842529448233e-07, + "loss": 0.8846, + "step": 218280 + }, + { + "epoch": 16.915804564299275, + "grad_norm": 1.6296086983629676, + "learning_rate": 8.458230006199629e-07, + "loss": 0.8658, + "step": 218290 + }, + { + "epoch": 16.916579487775582, + "grad_norm": 1.5070676955675957, + "learning_rate": 8.458617482951023e-07, + "loss": 0.8426, + "step": 218300 + }, + { + "epoch": 16.91735441125189, + "grad_norm": 1.6759542693904412, + "learning_rate": 8.459004959702419e-07, + "loss": 0.8354, + "step": 218310 + }, + { + "epoch": 16.918129334728196, + "grad_norm": 1.4948977994028119, + "learning_rate": 8.459392436453813e-07, + "loss": 0.8757, + "step": 218320 + }, + { + "epoch": 16.918904258204503, + "grad_norm": 1.5892267850896649, + "learning_rate": 8.459779913205209e-07, + "loss": 0.853, + "step": 218330 + }, + { + "epoch": 16.91967918168081, + "grad_norm": 1.6962783880774168, + "learning_rate": 8.460167389956603e-07, + "loss": 0.8669, + "step": 218340 + }, + { + "epoch": 16.920454105157116, + "grad_norm": 1.55691918716841, + "learning_rate": 8.460554866707998e-07, + "loss": 0.8533, + "step": 218350 + }, + { + "epoch": 16.921229028633423, + "grad_norm": 1.570391560430322, + "learning_rate": 8.460942343459392e-07, + "loss": 0.8565, + "step": 218360 + }, + { + "epoch": 16.92200395210973, + "grad_norm": 1.537221111498431, + "learning_rate": 8.461329820210789e-07, + "loss": 0.8452, + "step": 218370 + }, + { + "epoch": 16.922778875586037, + "grad_norm": 1.5892819146959138, + "learning_rate": 8.461717296962183e-07, + "loss": 0.8588, + "step": 218380 + }, + { + "epoch": 16.923553799062343, + "grad_norm": 1.6473888294908765, + "learning_rate": 8.462104773713578e-07, + "loss": 0.8615, + "step": 218390 + }, + { + "epoch": 16.92432872253865, + "grad_norm": 1.4890865732923824, + "learning_rate": 8.462492250464972e-07, + "loss": 0.8582, + "step": 218400 + }, + { + "epoch": 16.925103646014957, + "grad_norm": 1.5796637777060203, + "learning_rate": 8.462879727216368e-07, + "loss": 0.8475, + "step": 218410 + }, + { + "epoch": 16.925878569491264, + "grad_norm": 1.5383757803261306, + "learning_rate": 8.463267203967762e-07, + "loss": 0.8399, + "step": 218420 + }, + { + "epoch": 16.92665349296757, + "grad_norm": 1.6966336117847356, + "learning_rate": 8.463654680719158e-07, + "loss": 0.8783, + "step": 218430 + }, + { + "epoch": 16.927428416443878, + "grad_norm": 1.5043527345125858, + "learning_rate": 8.464042157470552e-07, + "loss": 0.8467, + "step": 218440 + }, + { + "epoch": 16.928203339920184, + "grad_norm": 1.5972529216327178, + "learning_rate": 8.464429634221947e-07, + "loss": 0.8478, + "step": 218450 + }, + { + "epoch": 16.92897826339649, + "grad_norm": 1.4708881825669275, + "learning_rate": 8.464817110973342e-07, + "loss": 0.8446, + "step": 218460 + }, + { + "epoch": 16.929753186872798, + "grad_norm": 1.6772899046695608, + "learning_rate": 8.465204587724738e-07, + "loss": 0.846, + "step": 218470 + }, + { + "epoch": 16.9305281103491, + "grad_norm": 1.6033453110760307, + "learning_rate": 8.465592064476132e-07, + "loss": 0.853, + "step": 218480 + }, + { + "epoch": 16.931303033825408, + "grad_norm": 1.6850060497275872, + "learning_rate": 8.465979541227527e-07, + "loss": 0.8485, + "step": 218490 + }, + { + "epoch": 16.932077957301715, + "grad_norm": 1.6204882054647274, + "learning_rate": 8.466367017978921e-07, + "loss": 0.8453, + "step": 218500 + }, + { + "epoch": 16.932077957301715, + "eval_loss": 0.8882694244384766, + "eval_runtime": 327.3473, + "eval_samples_per_second": 35.042, + "eval_steps_per_second": 8.761, + "step": 218500 + }, + { + "epoch": 16.93285288077802, + "grad_norm": 1.6119531229654744, + "learning_rate": 8.466754494730318e-07, + "loss": 0.8612, + "step": 218510 + }, + { + "epoch": 16.93362780425433, + "grad_norm": 1.6223991949896748, + "learning_rate": 8.467141971481712e-07, + "loss": 0.8719, + "step": 218520 + }, + { + "epoch": 16.934402727730635, + "grad_norm": 1.6316381891442335, + "learning_rate": 8.467529448233107e-07, + "loss": 0.8546, + "step": 218530 + }, + { + "epoch": 16.935177651206942, + "grad_norm": 1.9310232345069394, + "learning_rate": 8.467916924984501e-07, + "loss": 0.8658, + "step": 218540 + }, + { + "epoch": 16.93595257468325, + "grad_norm": 1.5423907277769373, + "learning_rate": 8.468304401735896e-07, + "loss": 0.8601, + "step": 218550 + }, + { + "epoch": 16.936727498159556, + "grad_norm": 1.509842423825284, + "learning_rate": 8.46869187848729e-07, + "loss": 0.864, + "step": 218560 + }, + { + "epoch": 16.937502421635863, + "grad_norm": 1.7440573891982327, + "learning_rate": 8.469079355238687e-07, + "loss": 0.8514, + "step": 218570 + }, + { + "epoch": 16.93827734511217, + "grad_norm": 1.5041362751967375, + "learning_rate": 8.469466831990081e-07, + "loss": 0.8709, + "step": 218580 + }, + { + "epoch": 16.939052268588476, + "grad_norm": 1.474208870838467, + "learning_rate": 8.469854308741476e-07, + "loss": 0.8605, + "step": 218590 + }, + { + "epoch": 16.939827192064783, + "grad_norm": 1.5666865687026177, + "learning_rate": 8.47024178549287e-07, + "loss": 0.8353, + "step": 218600 + }, + { + "epoch": 16.94060211554109, + "grad_norm": 1.5941446286227852, + "learning_rate": 8.470629262244267e-07, + "loss": 0.853, + "step": 218610 + }, + { + "epoch": 16.941377039017397, + "grad_norm": 1.5460292511712572, + "learning_rate": 8.471016738995661e-07, + "loss": 0.8614, + "step": 218620 + }, + { + "epoch": 16.942151962493703, + "grad_norm": 1.5073323259902347, + "learning_rate": 8.471404215747056e-07, + "loss": 0.8406, + "step": 218630 + }, + { + "epoch": 16.94292688597001, + "grad_norm": 1.5440697025693362, + "learning_rate": 8.47179169249845e-07, + "loss": 0.8531, + "step": 218640 + }, + { + "epoch": 16.943701809446317, + "grad_norm": 1.4667429757794865, + "learning_rate": 8.472179169249845e-07, + "loss": 0.8728, + "step": 218650 + }, + { + "epoch": 16.944476732922624, + "grad_norm": 1.6696058826417703, + "learning_rate": 8.472566646001241e-07, + "loss": 0.8589, + "step": 218660 + }, + { + "epoch": 16.94525165639893, + "grad_norm": 1.6958092183320925, + "learning_rate": 8.472954122752636e-07, + "loss": 0.8527, + "step": 218670 + }, + { + "epoch": 16.946026579875237, + "grad_norm": 1.5256801095435062, + "learning_rate": 8.47334159950403e-07, + "loss": 0.851, + "step": 218680 + }, + { + "epoch": 16.946801503351544, + "grad_norm": 1.7934010520372428, + "learning_rate": 8.473729076255425e-07, + "loss": 0.8371, + "step": 218690 + }, + { + "epoch": 16.94757642682785, + "grad_norm": 1.7509297658014427, + "learning_rate": 8.474116553006819e-07, + "loss": 0.8623, + "step": 218700 + }, + { + "epoch": 16.948351350304158, + "grad_norm": 1.5652722598360922, + "learning_rate": 8.474504029758216e-07, + "loss": 0.8414, + "step": 218710 + }, + { + "epoch": 16.949126273780465, + "grad_norm": 1.597209921719016, + "learning_rate": 8.47489150650961e-07, + "loss": 0.8604, + "step": 218720 + }, + { + "epoch": 16.94990119725677, + "grad_norm": 1.6385710113332663, + "learning_rate": 8.475278983261005e-07, + "loss": 0.8473, + "step": 218730 + }, + { + "epoch": 16.95067612073308, + "grad_norm": 1.5285063098155016, + "learning_rate": 8.475666460012399e-07, + "loss": 0.8485, + "step": 218740 + }, + { + "epoch": 16.951451044209385, + "grad_norm": 1.5579133509577878, + "learning_rate": 8.476053936763795e-07, + "loss": 0.8541, + "step": 218750 + }, + { + "epoch": 16.952225967685692, + "grad_norm": 1.6224738810347425, + "learning_rate": 8.47644141351519e-07, + "loss": 0.8588, + "step": 218760 + }, + { + "epoch": 16.953000891162, + "grad_norm": 1.7318958549725332, + "learning_rate": 8.476828890266585e-07, + "loss": 0.8539, + "step": 218770 + }, + { + "epoch": 16.953775814638306, + "grad_norm": 1.4110340942245765, + "learning_rate": 8.477216367017979e-07, + "loss": 0.8476, + "step": 218780 + }, + { + "epoch": 16.954550738114612, + "grad_norm": 1.6865371612987037, + "learning_rate": 8.477603843769374e-07, + "loss": 0.8429, + "step": 218790 + }, + { + "epoch": 16.95532566159092, + "grad_norm": 1.7173716544978108, + "learning_rate": 8.477991320520769e-07, + "loss": 0.8545, + "step": 218800 + }, + { + "epoch": 16.956100585067226, + "grad_norm": 1.6318475794162117, + "learning_rate": 8.478378797272165e-07, + "loss": 0.8589, + "step": 218810 + }, + { + "epoch": 16.956875508543533, + "grad_norm": 1.5350853018747805, + "learning_rate": 8.478766274023559e-07, + "loss": 0.8542, + "step": 218820 + }, + { + "epoch": 16.95765043201984, + "grad_norm": 1.5563545942604606, + "learning_rate": 8.479153750774954e-07, + "loss": 0.8379, + "step": 218830 + }, + { + "epoch": 16.958425355496146, + "grad_norm": 1.6370460397069337, + "learning_rate": 8.479541227526348e-07, + "loss": 0.8586, + "step": 218840 + }, + { + "epoch": 16.959200278972453, + "grad_norm": 1.5744798234495765, + "learning_rate": 8.479928704277744e-07, + "loss": 0.8352, + "step": 218850 + }, + { + "epoch": 16.95997520244876, + "grad_norm": 1.5493593479270495, + "learning_rate": 8.480316181029139e-07, + "loss": 0.8521, + "step": 218860 + }, + { + "epoch": 16.960750125925063, + "grad_norm": 1.754214926917328, + "learning_rate": 8.480703657780534e-07, + "loss": 0.8568, + "step": 218870 + }, + { + "epoch": 16.96152504940137, + "grad_norm": 1.5921642054959873, + "learning_rate": 8.481091134531928e-07, + "loss": 0.8413, + "step": 218880 + }, + { + "epoch": 16.962299972877677, + "grad_norm": 1.4950848766099891, + "learning_rate": 8.481478611283324e-07, + "loss": 0.8427, + "step": 218890 + }, + { + "epoch": 16.963074896353984, + "grad_norm": 1.5601656857262947, + "learning_rate": 8.481866088034718e-07, + "loss": 0.8512, + "step": 218900 + }, + { + "epoch": 16.96384981983029, + "grad_norm": 1.6892339024755096, + "learning_rate": 8.482253564786114e-07, + "loss": 0.8386, + "step": 218910 + }, + { + "epoch": 16.964624743306597, + "grad_norm": 1.7001662181497257, + "learning_rate": 8.482641041537508e-07, + "loss": 0.8572, + "step": 218920 + }, + { + "epoch": 16.965399666782904, + "grad_norm": 1.5340738099654219, + "learning_rate": 8.483028518288903e-07, + "loss": 0.8619, + "step": 218930 + }, + { + "epoch": 16.96617459025921, + "grad_norm": 1.468692644199097, + "learning_rate": 8.483415995040298e-07, + "loss": 0.8534, + "step": 218940 + }, + { + "epoch": 16.966949513735518, + "grad_norm": 1.6460140914594592, + "learning_rate": 8.483803471791694e-07, + "loss": 0.8586, + "step": 218950 + }, + { + "epoch": 16.967724437211825, + "grad_norm": 1.6255584886411134, + "learning_rate": 8.484190948543088e-07, + "loss": 0.8546, + "step": 218960 + }, + { + "epoch": 16.96849936068813, + "grad_norm": 1.5934619162117543, + "learning_rate": 8.484578425294483e-07, + "loss": 0.8513, + "step": 218970 + }, + { + "epoch": 16.969274284164438, + "grad_norm": 1.7508065451023467, + "learning_rate": 8.484965902045877e-07, + "loss": 0.856, + "step": 218980 + }, + { + "epoch": 16.970049207640745, + "grad_norm": 1.560748406550182, + "learning_rate": 8.485353378797273e-07, + "loss": 0.8583, + "step": 218990 + }, + { + "epoch": 16.970824131117052, + "grad_norm": 1.6077955046058148, + "learning_rate": 8.485740855548667e-07, + "loss": 0.8697, + "step": 219000 + }, + { + "epoch": 16.970824131117052, + "eval_loss": 0.8882214426994324, + "eval_runtime": 327.9654, + "eval_samples_per_second": 34.976, + "eval_steps_per_second": 8.745, + "step": 219000 + }, + { + "epoch": 16.97159905459336, + "grad_norm": 1.6531123363323474, + "learning_rate": 8.486128332300063e-07, + "loss": 0.8638, + "step": 219010 + }, + { + "epoch": 16.972373978069665, + "grad_norm": 1.6495841354288883, + "learning_rate": 8.486515809051457e-07, + "loss": 0.8676, + "step": 219020 + }, + { + "epoch": 16.973148901545972, + "grad_norm": 1.587099549397335, + "learning_rate": 8.486903285802853e-07, + "loss": 0.8449, + "step": 219030 + }, + { + "epoch": 16.97392382502228, + "grad_norm": 1.5217180159493071, + "learning_rate": 8.487290762554247e-07, + "loss": 0.8486, + "step": 219040 + }, + { + "epoch": 16.974698748498586, + "grad_norm": 1.5832242173495474, + "learning_rate": 8.487678239305643e-07, + "loss": 0.8626, + "step": 219050 + }, + { + "epoch": 16.975473671974893, + "grad_norm": 1.5788890583668236, + "learning_rate": 8.488065716057037e-07, + "loss": 0.8547, + "step": 219060 + }, + { + "epoch": 16.9762485954512, + "grad_norm": 1.6275826343599034, + "learning_rate": 8.488453192808432e-07, + "loss": 0.8505, + "step": 219070 + }, + { + "epoch": 16.977023518927506, + "grad_norm": 1.523751236223332, + "learning_rate": 8.488840669559827e-07, + "loss": 0.8543, + "step": 219080 + }, + { + "epoch": 16.977798442403813, + "grad_norm": 1.575785593760453, + "learning_rate": 8.489228146311222e-07, + "loss": 0.8652, + "step": 219090 + }, + { + "epoch": 16.97857336588012, + "grad_norm": 1.586689289005555, + "learning_rate": 8.489615623062617e-07, + "loss": 0.8445, + "step": 219100 + }, + { + "epoch": 16.979348289356427, + "grad_norm": 1.626401521293348, + "learning_rate": 8.490003099814012e-07, + "loss": 0.8791, + "step": 219110 + }, + { + "epoch": 16.980123212832734, + "grad_norm": 1.619362404032359, + "learning_rate": 8.490390576565406e-07, + "loss": 0.8511, + "step": 219120 + }, + { + "epoch": 16.98089813630904, + "grad_norm": 1.5541758209404124, + "learning_rate": 8.490778053316802e-07, + "loss": 0.8526, + "step": 219130 + }, + { + "epoch": 16.981673059785347, + "grad_norm": 1.596309185840184, + "learning_rate": 8.491165530068196e-07, + "loss": 0.8834, + "step": 219140 + }, + { + "epoch": 16.982447983261654, + "grad_norm": 1.789058576636557, + "learning_rate": 8.491553006819592e-07, + "loss": 0.8541, + "step": 219150 + }, + { + "epoch": 16.98322290673796, + "grad_norm": 1.6030689115354895, + "learning_rate": 8.491940483570986e-07, + "loss": 0.8607, + "step": 219160 + }, + { + "epoch": 16.983997830214268, + "grad_norm": 1.5351815247524256, + "learning_rate": 8.492327960322382e-07, + "loss": 0.8432, + "step": 219170 + }, + { + "epoch": 16.984772753690574, + "grad_norm": 1.662573019465635, + "learning_rate": 8.492715437073776e-07, + "loss": 0.8436, + "step": 219180 + }, + { + "epoch": 16.98554767716688, + "grad_norm": 1.6038874372094096, + "learning_rate": 8.493102913825171e-07, + "loss": 0.8564, + "step": 219190 + }, + { + "epoch": 16.986322600643188, + "grad_norm": 1.58732501391456, + "learning_rate": 8.493490390576566e-07, + "loss": 0.8376, + "step": 219200 + }, + { + "epoch": 16.987097524119495, + "grad_norm": 1.6447895512658723, + "learning_rate": 8.493877867327961e-07, + "loss": 0.8707, + "step": 219210 + }, + { + "epoch": 16.9878724475958, + "grad_norm": 1.599739524016379, + "learning_rate": 8.494265344079355e-07, + "loss": 0.8573, + "step": 219220 + }, + { + "epoch": 16.988647371072105, + "grad_norm": 1.586260394419912, + "learning_rate": 8.494652820830751e-07, + "loss": 0.8884, + "step": 219230 + }, + { + "epoch": 16.98942229454841, + "grad_norm": 1.6992999000021909, + "learning_rate": 8.495040297582145e-07, + "loss": 0.8676, + "step": 219240 + }, + { + "epoch": 16.99019721802472, + "grad_norm": 1.5108884804075267, + "learning_rate": 8.495427774333541e-07, + "loss": 0.8628, + "step": 219250 + }, + { + "epoch": 16.990972141501025, + "grad_norm": 1.6694239228791024, + "learning_rate": 8.495815251084935e-07, + "loss": 0.8573, + "step": 219260 + }, + { + "epoch": 16.991747064977332, + "grad_norm": 1.548996198810137, + "learning_rate": 8.496202727836331e-07, + "loss": 0.8526, + "step": 219270 + }, + { + "epoch": 16.99252198845364, + "grad_norm": 1.5212297308924316, + "learning_rate": 8.496590204587725e-07, + "loss": 0.862, + "step": 219280 + }, + { + "epoch": 16.993296911929946, + "grad_norm": 1.4549956917401177, + "learning_rate": 8.49697768133912e-07, + "loss": 0.8314, + "step": 219290 + }, + { + "epoch": 16.994071835406253, + "grad_norm": 1.5862237880597208, + "learning_rate": 8.497365158090515e-07, + "loss": 0.8556, + "step": 219300 + }, + { + "epoch": 16.99484675888256, + "grad_norm": 1.6427310334634697, + "learning_rate": 8.497752634841911e-07, + "loss": 0.8688, + "step": 219310 + }, + { + "epoch": 16.995621682358866, + "grad_norm": 1.6200461037690443, + "learning_rate": 8.498140111593305e-07, + "loss": 0.8622, + "step": 219320 + }, + { + "epoch": 16.996396605835173, + "grad_norm": 1.5659431563097181, + "learning_rate": 8.4985275883447e-07, + "loss": 0.8352, + "step": 219330 + }, + { + "epoch": 16.99717152931148, + "grad_norm": 1.8483456958197313, + "learning_rate": 8.498915065096094e-07, + "loss": 0.8534, + "step": 219340 + }, + { + "epoch": 16.997946452787787, + "grad_norm": 1.5465599645378292, + "learning_rate": 8.49930254184749e-07, + "loss": 0.8588, + "step": 219350 + }, + { + "epoch": 16.998721376264093, + "grad_norm": 1.6031870522122418, + "learning_rate": 8.499690018598884e-07, + "loss": 0.8585, + "step": 219360 + }, + { + "epoch": 16.9994962997404, + "grad_norm": 1.6290295347849506, + "learning_rate": 8.50007749535028e-07, + "loss": 0.8437, + "step": 219370 + }, + { + "epoch": 17.000271223216707, + "grad_norm": 1.5656432768727986, + "learning_rate": 8.500464972101674e-07, + "loss": 0.8471, + "step": 219380 + }, + { + "epoch": 17.001046146693014, + "grad_norm": 1.5392837832984676, + "learning_rate": 8.500852448853069e-07, + "loss": 0.8389, + "step": 219390 + }, + { + "epoch": 17.00182107016932, + "grad_norm": 1.667124268284474, + "learning_rate": 8.501239925604464e-07, + "loss": 0.8473, + "step": 219400 + }, + { + "epoch": 17.002595993645627, + "grad_norm": 1.6119607135957204, + "learning_rate": 8.50162740235586e-07, + "loss": 0.8448, + "step": 219410 + }, + { + "epoch": 17.003370917121934, + "grad_norm": 1.5861187270520714, + "learning_rate": 8.502014879107254e-07, + "loss": 0.8473, + "step": 219420 + }, + { + "epoch": 17.00414584059824, + "grad_norm": 1.5325794172713154, + "learning_rate": 8.502402355858649e-07, + "loss": 0.8422, + "step": 219430 + }, + { + "epoch": 17.004920764074548, + "grad_norm": 1.642728175977851, + "learning_rate": 8.502789832610043e-07, + "loss": 0.8399, + "step": 219440 + }, + { + "epoch": 17.005695687550855, + "grad_norm": 1.4445797751643037, + "learning_rate": 8.50317730936144e-07, + "loss": 0.8424, + "step": 219450 + }, + { + "epoch": 17.00647061102716, + "grad_norm": 1.6131984754335191, + "learning_rate": 8.503564786112834e-07, + "loss": 0.8451, + "step": 219460 + }, + { + "epoch": 17.00724553450347, + "grad_norm": 1.5924372135468188, + "learning_rate": 8.503952262864229e-07, + "loss": 0.8277, + "step": 219470 + }, + { + "epoch": 17.008020457979775, + "grad_norm": 1.6693015105876496, + "learning_rate": 8.504339739615623e-07, + "loss": 0.847, + "step": 219480 + }, + { + "epoch": 17.008795381456082, + "grad_norm": 1.6468838611889618, + "learning_rate": 8.504727216367018e-07, + "loss": 0.8291, + "step": 219490 + }, + { + "epoch": 17.00957030493239, + "grad_norm": 1.6764619365455047, + "learning_rate": 8.505114693118413e-07, + "loss": 0.8545, + "step": 219500 + }, + { + "epoch": 17.00957030493239, + "eval_loss": 0.8897426128387451, + "eval_runtime": 328.7872, + "eval_samples_per_second": 34.889, + "eval_steps_per_second": 8.723, + "step": 219500 + }, + { + "epoch": 17.010345228408696, + "grad_norm": 1.7028003747884555, + "learning_rate": 8.505502169869809e-07, + "loss": 0.8642, + "step": 219510 + }, + { + "epoch": 17.011120151885002, + "grad_norm": 1.5314958219938524, + "learning_rate": 8.505889646621203e-07, + "loss": 0.8437, + "step": 219520 + }, + { + "epoch": 17.01189507536131, + "grad_norm": 1.6133558993104036, + "learning_rate": 8.506277123372598e-07, + "loss": 0.8418, + "step": 219530 + }, + { + "epoch": 17.012669998837616, + "grad_norm": 1.5544859530683368, + "learning_rate": 8.506664600123992e-07, + "loss": 0.8582, + "step": 219540 + }, + { + "epoch": 17.013444922313923, + "grad_norm": 1.524671047173263, + "learning_rate": 8.507052076875389e-07, + "loss": 0.8388, + "step": 219550 + }, + { + "epoch": 17.01421984579023, + "grad_norm": 1.6436523613234744, + "learning_rate": 8.507439553626783e-07, + "loss": 0.8397, + "step": 219560 + }, + { + "epoch": 17.014994769266536, + "grad_norm": 1.6033640620163572, + "learning_rate": 8.507827030378178e-07, + "loss": 0.8292, + "step": 219570 + }, + { + "epoch": 17.015769692742843, + "grad_norm": 1.6266129256157462, + "learning_rate": 8.508214507129572e-07, + "loss": 0.8559, + "step": 219580 + }, + { + "epoch": 17.01654461621915, + "grad_norm": 1.6846860857709791, + "learning_rate": 8.508601983880969e-07, + "loss": 0.843, + "step": 219590 + }, + { + "epoch": 17.017319539695453, + "grad_norm": 1.5860196242039792, + "learning_rate": 8.508989460632363e-07, + "loss": 0.8324, + "step": 219600 + }, + { + "epoch": 17.01809446317176, + "grad_norm": 1.5522149817164617, + "learning_rate": 8.509376937383758e-07, + "loss": 0.8597, + "step": 219610 + }, + { + "epoch": 17.018869386648067, + "grad_norm": 1.708789982701428, + "learning_rate": 8.509764414135152e-07, + "loss": 0.8599, + "step": 219620 + }, + { + "epoch": 17.019644310124374, + "grad_norm": 1.922159266071149, + "learning_rate": 8.510151890886547e-07, + "loss": 0.8487, + "step": 219630 + }, + { + "epoch": 17.02041923360068, + "grad_norm": 1.5559521904298035, + "learning_rate": 8.510539367637941e-07, + "loss": 0.8437, + "step": 219640 + }, + { + "epoch": 17.021194157076987, + "grad_norm": 1.6031548078291806, + "learning_rate": 8.510926844389338e-07, + "loss": 0.8309, + "step": 219650 + }, + { + "epoch": 17.021969080553294, + "grad_norm": 1.6498155789375346, + "learning_rate": 8.511314321140732e-07, + "loss": 0.8528, + "step": 219660 + }, + { + "epoch": 17.0227440040296, + "grad_norm": 1.6377111821284671, + "learning_rate": 8.511701797892127e-07, + "loss": 0.8562, + "step": 219670 + }, + { + "epoch": 17.023518927505908, + "grad_norm": 1.579795953829026, + "learning_rate": 8.512089274643521e-07, + "loss": 0.8463, + "step": 219680 + }, + { + "epoch": 17.024293850982215, + "grad_norm": 1.536515292042486, + "learning_rate": 8.512476751394918e-07, + "loss": 0.8369, + "step": 219690 + }, + { + "epoch": 17.02506877445852, + "grad_norm": 1.6581569923306, + "learning_rate": 8.512864228146312e-07, + "loss": 0.8596, + "step": 219700 + }, + { + "epoch": 17.025843697934828, + "grad_norm": 1.527476332348909, + "learning_rate": 8.513251704897707e-07, + "loss": 0.8435, + "step": 219710 + }, + { + "epoch": 17.026618621411135, + "grad_norm": 1.6986020456198074, + "learning_rate": 8.513639181649101e-07, + "loss": 0.8454, + "step": 219720 + }, + { + "epoch": 17.027393544887442, + "grad_norm": 1.6898337765948772, + "learning_rate": 8.514026658400497e-07, + "loss": 0.8346, + "step": 219730 + }, + { + "epoch": 17.02816846836375, + "grad_norm": 1.6484672038663626, + "learning_rate": 8.514414135151892e-07, + "loss": 0.8657, + "step": 219740 + }, + { + "epoch": 17.028943391840055, + "grad_norm": 1.6403862064313912, + "learning_rate": 8.514801611903287e-07, + "loss": 0.8448, + "step": 219750 + }, + { + "epoch": 17.029718315316362, + "grad_norm": 1.659831608214712, + "learning_rate": 8.515189088654681e-07, + "loss": 0.8611, + "step": 219760 + }, + { + "epoch": 17.03049323879267, + "grad_norm": 1.6400284457470857, + "learning_rate": 8.515576565406076e-07, + "loss": 0.8457, + "step": 219770 + }, + { + "epoch": 17.031268162268976, + "grad_norm": 1.7643089521772568, + "learning_rate": 8.51596404215747e-07, + "loss": 0.8421, + "step": 219780 + }, + { + "epoch": 17.032043085745283, + "grad_norm": 1.6580066085762493, + "learning_rate": 8.516351518908867e-07, + "loss": 0.8439, + "step": 219790 + }, + { + "epoch": 17.03281800922159, + "grad_norm": 1.6234066753598386, + "learning_rate": 8.516738995660261e-07, + "loss": 0.8529, + "step": 219800 + }, + { + "epoch": 17.033592932697896, + "grad_norm": 1.7226963374051252, + "learning_rate": 8.517126472411656e-07, + "loss": 0.8558, + "step": 219810 + }, + { + "epoch": 17.034367856174203, + "grad_norm": 1.7452783000379901, + "learning_rate": 8.51751394916305e-07, + "loss": 0.8519, + "step": 219820 + }, + { + "epoch": 17.03514277965051, + "grad_norm": 1.851302729214319, + "learning_rate": 8.517901425914446e-07, + "loss": 0.8324, + "step": 219830 + }, + { + "epoch": 17.035917703126817, + "grad_norm": 1.5128797511869598, + "learning_rate": 8.518288902665841e-07, + "loss": 0.8472, + "step": 219840 + }, + { + "epoch": 17.036692626603124, + "grad_norm": 1.5507352229738498, + "learning_rate": 8.518676379417236e-07, + "loss": 0.8485, + "step": 219850 + }, + { + "epoch": 17.03746755007943, + "grad_norm": 1.6276947967564332, + "learning_rate": 8.51906385616863e-07, + "loss": 0.8418, + "step": 219860 + }, + { + "epoch": 17.038242473555737, + "grad_norm": 1.644136706469596, + "learning_rate": 8.519451332920026e-07, + "loss": 0.8353, + "step": 219870 + }, + { + "epoch": 17.039017397032044, + "grad_norm": 1.557267268939145, + "learning_rate": 8.51983880967142e-07, + "loss": 0.8267, + "step": 219880 + }, + { + "epoch": 17.03979232050835, + "grad_norm": 1.8300901260522664, + "learning_rate": 8.520226286422816e-07, + "loss": 0.8385, + "step": 219890 + }, + { + "epoch": 17.040567243984658, + "grad_norm": 1.691866494114483, + "learning_rate": 8.52061376317421e-07, + "loss": 0.8239, + "step": 219900 + }, + { + "epoch": 17.041342167460964, + "grad_norm": 1.5955370021045325, + "learning_rate": 8.521001239925605e-07, + "loss": 0.8384, + "step": 219910 + }, + { + "epoch": 17.04211709093727, + "grad_norm": 1.6152842222394777, + "learning_rate": 8.521388716676999e-07, + "loss": 0.8342, + "step": 219920 + }, + { + "epoch": 17.042892014413578, + "grad_norm": 1.7083964815124668, + "learning_rate": 8.521776193428395e-07, + "loss": 0.8414, + "step": 219930 + }, + { + "epoch": 17.043666937889885, + "grad_norm": 1.7497972255152854, + "learning_rate": 8.52216367017979e-07, + "loss": 0.8274, + "step": 219940 + }, + { + "epoch": 17.04444186136619, + "grad_norm": 1.6740049721641992, + "learning_rate": 8.522551146931185e-07, + "loss": 0.8351, + "step": 219950 + }, + { + "epoch": 17.0452167848425, + "grad_norm": 1.7662601542518395, + "learning_rate": 8.522938623682579e-07, + "loss": 0.8462, + "step": 219960 + }, + { + "epoch": 17.045991708318805, + "grad_norm": 1.6381209898905074, + "learning_rate": 8.523326100433975e-07, + "loss": 0.8425, + "step": 219970 + }, + { + "epoch": 17.04676663179511, + "grad_norm": 1.661370706572133, + "learning_rate": 8.523713577185369e-07, + "loss": 0.8558, + "step": 219980 + }, + { + "epoch": 17.047541555271415, + "grad_norm": 1.69882850582938, + "learning_rate": 8.524101053936765e-07, + "loss": 0.8423, + "step": 219990 + }, + { + "epoch": 17.048316478747722, + "grad_norm": 1.5697176947655873, + "learning_rate": 8.524488530688159e-07, + "loss": 0.8533, + "step": 220000 + }, + { + "epoch": 17.048316478747722, + "eval_loss": 0.8901394009590149, + "eval_runtime": 328.6319, + "eval_samples_per_second": 34.905, + "eval_steps_per_second": 8.727, + "step": 220000 + }, + { + "epoch": 17.04909140222403, + "grad_norm": 1.6798541306119323, + "learning_rate": 8.524876007439554e-07, + "loss": 0.8393, + "step": 220010 + }, + { + "epoch": 17.049866325700336, + "grad_norm": 1.816554455121353, + "learning_rate": 8.525263484190949e-07, + "loss": 0.8507, + "step": 220020 + }, + { + "epoch": 17.050641249176643, + "grad_norm": 1.5431176124136738, + "learning_rate": 8.525650960942344e-07, + "loss": 0.8322, + "step": 220030 + }, + { + "epoch": 17.05141617265295, + "grad_norm": 1.6082720714835765, + "learning_rate": 8.526038437693739e-07, + "loss": 0.8383, + "step": 220040 + }, + { + "epoch": 17.052191096129256, + "grad_norm": 1.7232333781212739, + "learning_rate": 8.526425914445134e-07, + "loss": 0.8338, + "step": 220050 + }, + { + "epoch": 17.052966019605563, + "grad_norm": 1.649139718286142, + "learning_rate": 8.526813391196528e-07, + "loss": 0.8659, + "step": 220060 + }, + { + "epoch": 17.05374094308187, + "grad_norm": 1.5509286620509042, + "learning_rate": 8.527200867947924e-07, + "loss": 0.837, + "step": 220070 + }, + { + "epoch": 17.054515866558177, + "grad_norm": 1.6215134674979965, + "learning_rate": 8.527588344699318e-07, + "loss": 0.8391, + "step": 220080 + }, + { + "epoch": 17.055290790034483, + "grad_norm": 1.6638344016463626, + "learning_rate": 8.527975821450714e-07, + "loss": 0.8552, + "step": 220090 + }, + { + "epoch": 17.05606571351079, + "grad_norm": 1.6286965088656422, + "learning_rate": 8.528363298202108e-07, + "loss": 0.8562, + "step": 220100 + }, + { + "epoch": 17.056840636987097, + "grad_norm": 1.6904943116135813, + "learning_rate": 8.528750774953504e-07, + "loss": 0.8432, + "step": 220110 + }, + { + "epoch": 17.057615560463404, + "grad_norm": 1.5496855380970995, + "learning_rate": 8.529138251704898e-07, + "loss": 0.8423, + "step": 220120 + }, + { + "epoch": 17.05839048393971, + "grad_norm": 1.622935213480955, + "learning_rate": 8.529525728456293e-07, + "loss": 0.8306, + "step": 220130 + }, + { + "epoch": 17.059165407416018, + "grad_norm": 1.7372969182442375, + "learning_rate": 8.529913205207688e-07, + "loss": 0.8393, + "step": 220140 + }, + { + "epoch": 17.059940330892324, + "grad_norm": 1.621833094410565, + "learning_rate": 8.530300681959083e-07, + "loss": 0.836, + "step": 220150 + }, + { + "epoch": 17.06071525436863, + "grad_norm": 1.6414761511659768, + "learning_rate": 8.530688158710478e-07, + "loss": 0.8625, + "step": 220160 + }, + { + "epoch": 17.061490177844938, + "grad_norm": 1.664040091592825, + "learning_rate": 8.531075635461873e-07, + "loss": 0.8352, + "step": 220170 + }, + { + "epoch": 17.062265101321245, + "grad_norm": 1.711544401659983, + "learning_rate": 8.531463112213267e-07, + "loss": 0.8413, + "step": 220180 + }, + { + "epoch": 17.06304002479755, + "grad_norm": 1.7198252869788242, + "learning_rate": 8.531850588964663e-07, + "loss": 0.8516, + "step": 220190 + }, + { + "epoch": 17.06381494827386, + "grad_norm": 1.497529290327028, + "learning_rate": 8.532238065716057e-07, + "loss": 0.8562, + "step": 220200 + }, + { + "epoch": 17.064589871750165, + "grad_norm": 1.7395886710660033, + "learning_rate": 8.532625542467453e-07, + "loss": 0.8389, + "step": 220210 + }, + { + "epoch": 17.065364795226472, + "grad_norm": 1.6646521765875724, + "learning_rate": 8.533013019218847e-07, + "loss": 0.844, + "step": 220220 + }, + { + "epoch": 17.06613971870278, + "grad_norm": 1.6870355901218823, + "learning_rate": 8.533400495970242e-07, + "loss": 0.8523, + "step": 220230 + }, + { + "epoch": 17.066914642179086, + "grad_norm": 1.6111412565249428, + "learning_rate": 8.533787972721637e-07, + "loss": 0.844, + "step": 220240 + }, + { + "epoch": 17.067689565655392, + "grad_norm": 1.5297782902432486, + "learning_rate": 8.534175449473033e-07, + "loss": 0.8515, + "step": 220250 + }, + { + "epoch": 17.0684644891317, + "grad_norm": 1.5801382494928353, + "learning_rate": 8.534562926224427e-07, + "loss": 0.857, + "step": 220260 + }, + { + "epoch": 17.069239412608006, + "grad_norm": 1.6455931368752152, + "learning_rate": 8.534950402975822e-07, + "loss": 0.8389, + "step": 220270 + }, + { + "epoch": 17.070014336084313, + "grad_norm": 1.5711776470415675, + "learning_rate": 8.535337879727216e-07, + "loss": 0.8487, + "step": 220280 + }, + { + "epoch": 17.07078925956062, + "grad_norm": 1.6425189533473425, + "learning_rate": 8.535725356478612e-07, + "loss": 0.8544, + "step": 220290 + }, + { + "epoch": 17.071564183036926, + "grad_norm": 1.645016028279874, + "learning_rate": 8.536112833230007e-07, + "loss": 0.8557, + "step": 220300 + }, + { + "epoch": 17.072339106513233, + "grad_norm": 1.6165924473318949, + "learning_rate": 8.536500309981402e-07, + "loss": 0.8373, + "step": 220310 + }, + { + "epoch": 17.07311402998954, + "grad_norm": 1.7816425163358072, + "learning_rate": 8.536887786732796e-07, + "loss": 0.8396, + "step": 220320 + }, + { + "epoch": 17.073888953465847, + "grad_norm": 1.9506442223918434, + "learning_rate": 8.537275263484191e-07, + "loss": 0.8438, + "step": 220330 + }, + { + "epoch": 17.074663876942154, + "grad_norm": 1.6379443963633733, + "learning_rate": 8.537662740235586e-07, + "loss": 0.852, + "step": 220340 + }, + { + "epoch": 17.075438800418457, + "grad_norm": 1.7880481862393574, + "learning_rate": 8.538050216986982e-07, + "loss": 0.8518, + "step": 220350 + }, + { + "epoch": 17.076213723894764, + "grad_norm": 1.6285749856633824, + "learning_rate": 8.538437693738376e-07, + "loss": 0.8648, + "step": 220360 + }, + { + "epoch": 17.07698864737107, + "grad_norm": 1.647299734606403, + "learning_rate": 8.538825170489771e-07, + "loss": 0.8467, + "step": 220370 + }, + { + "epoch": 17.077763570847377, + "grad_norm": 1.623435018073017, + "learning_rate": 8.539212647241165e-07, + "loss": 0.8405, + "step": 220380 + }, + { + "epoch": 17.078538494323684, + "grad_norm": 1.73216491148961, + "learning_rate": 8.539600123992562e-07, + "loss": 0.8504, + "step": 220390 + }, + { + "epoch": 17.07931341779999, + "grad_norm": 1.6422608750425194, + "learning_rate": 8.539987600743956e-07, + "loss": 0.8461, + "step": 220400 + }, + { + "epoch": 17.080088341276298, + "grad_norm": 1.7678029964890067, + "learning_rate": 8.540375077495351e-07, + "loss": 0.8412, + "step": 220410 + }, + { + "epoch": 17.080863264752605, + "grad_norm": 1.5783091021119369, + "learning_rate": 8.540762554246745e-07, + "loss": 0.8465, + "step": 220420 + }, + { + "epoch": 17.08163818822891, + "grad_norm": 1.6329718417300596, + "learning_rate": 8.54115003099814e-07, + "loss": 0.8676, + "step": 220430 + }, + { + "epoch": 17.08241311170522, + "grad_norm": 1.6288485698576773, + "learning_rate": 8.541537507749536e-07, + "loss": 0.8575, + "step": 220440 + }, + { + "epoch": 17.083188035181525, + "grad_norm": 1.6631236572064958, + "learning_rate": 8.541924984500931e-07, + "loss": 0.8723, + "step": 220450 + }, + { + "epoch": 17.083962958657832, + "grad_norm": 1.5575536187251084, + "learning_rate": 8.542312461252325e-07, + "loss": 0.8372, + "step": 220460 + }, + { + "epoch": 17.08473788213414, + "grad_norm": 1.615691691053049, + "learning_rate": 8.54269993800372e-07, + "loss": 0.8324, + "step": 220470 + }, + { + "epoch": 17.085512805610445, + "grad_norm": 1.655571305936082, + "learning_rate": 8.543087414755114e-07, + "loss": 0.8649, + "step": 220480 + }, + { + "epoch": 17.086287729086752, + "grad_norm": 1.6075202236038928, + "learning_rate": 8.543474891506511e-07, + "loss": 0.8487, + "step": 220490 + }, + { + "epoch": 17.08706265256306, + "grad_norm": 1.5644592259175303, + "learning_rate": 8.543862368257905e-07, + "loss": 0.8406, + "step": 220500 + }, + { + "epoch": 17.08706265256306, + "eval_loss": 0.890101432800293, + "eval_runtime": 328.6919, + "eval_samples_per_second": 34.899, + "eval_steps_per_second": 8.725, + "step": 220500 + }, + { + "epoch": 17.087837576039366, + "grad_norm": 1.7296816215118, + "learning_rate": 8.5442498450093e-07, + "loss": 0.8491, + "step": 220510 + }, + { + "epoch": 17.088612499515673, + "grad_norm": 1.715405958145129, + "learning_rate": 8.544637321760694e-07, + "loss": 0.8348, + "step": 220520 + }, + { + "epoch": 17.08938742299198, + "grad_norm": 1.5935312873558636, + "learning_rate": 8.545024798512091e-07, + "loss": 0.8634, + "step": 220530 + }, + { + "epoch": 17.090162346468286, + "grad_norm": 1.6008626231927936, + "learning_rate": 8.545412275263485e-07, + "loss": 0.8538, + "step": 220540 + }, + { + "epoch": 17.090937269944593, + "grad_norm": 1.6367185040142411, + "learning_rate": 8.54579975201488e-07, + "loss": 0.8514, + "step": 220550 + }, + { + "epoch": 17.0917121934209, + "grad_norm": 1.6179900297032335, + "learning_rate": 8.546187228766274e-07, + "loss": 0.8477, + "step": 220560 + }, + { + "epoch": 17.092487116897207, + "grad_norm": 1.7132140253081016, + "learning_rate": 8.546574705517669e-07, + "loss": 0.8473, + "step": 220570 + }, + { + "epoch": 17.093262040373514, + "grad_norm": 1.4653290432087804, + "learning_rate": 8.546962182269065e-07, + "loss": 0.8328, + "step": 220580 + }, + { + "epoch": 17.09403696384982, + "grad_norm": 1.7419338155078292, + "learning_rate": 8.54734965902046e-07, + "loss": 0.8462, + "step": 220590 + }, + { + "epoch": 17.094811887326127, + "grad_norm": 1.5516869249802072, + "learning_rate": 8.547737135771854e-07, + "loss": 0.8736, + "step": 220600 + }, + { + "epoch": 17.095586810802434, + "grad_norm": 1.7044177230571356, + "learning_rate": 8.548124612523249e-07, + "loss": 0.8554, + "step": 220610 + }, + { + "epoch": 17.09636173427874, + "grad_norm": 1.5578833325917485, + "learning_rate": 8.548512089274643e-07, + "loss": 0.8443, + "step": 220620 + }, + { + "epoch": 17.097136657755048, + "grad_norm": 1.6998237297349765, + "learning_rate": 8.54889956602604e-07, + "loss": 0.8493, + "step": 220630 + }, + { + "epoch": 17.097911581231354, + "grad_norm": 1.529869958743755, + "learning_rate": 8.549287042777434e-07, + "loss": 0.8421, + "step": 220640 + }, + { + "epoch": 17.09868650470766, + "grad_norm": 1.6063517903325513, + "learning_rate": 8.549674519528829e-07, + "loss": 0.8632, + "step": 220650 + }, + { + "epoch": 17.099461428183968, + "grad_norm": 1.7062325666491822, + "learning_rate": 8.550061996280223e-07, + "loss": 0.8593, + "step": 220660 + }, + { + "epoch": 17.100236351660275, + "grad_norm": 1.6511736948779296, + "learning_rate": 8.550449473031619e-07, + "loss": 0.8359, + "step": 220670 + }, + { + "epoch": 17.10101127513658, + "grad_norm": 1.6910204713939156, + "learning_rate": 8.550836949783014e-07, + "loss": 0.8546, + "step": 220680 + }, + { + "epoch": 17.10178619861289, + "grad_norm": 1.724376357446189, + "learning_rate": 8.551224426534409e-07, + "loss": 0.8374, + "step": 220690 + }, + { + "epoch": 17.102561122089195, + "grad_norm": 1.6377345225541202, + "learning_rate": 8.551611903285803e-07, + "loss": 0.8284, + "step": 220700 + }, + { + "epoch": 17.103336045565502, + "grad_norm": 1.7422589577122203, + "learning_rate": 8.551999380037198e-07, + "loss": 0.8502, + "step": 220710 + }, + { + "epoch": 17.104110969041805, + "grad_norm": 1.6120202317199968, + "learning_rate": 8.552386856788592e-07, + "loss": 0.8497, + "step": 220720 + }, + { + "epoch": 17.104885892518112, + "grad_norm": 1.7320702579586447, + "learning_rate": 8.552774333539989e-07, + "loss": 0.8398, + "step": 220730 + }, + { + "epoch": 17.10566081599442, + "grad_norm": 1.678288063568811, + "learning_rate": 8.553161810291383e-07, + "loss": 0.8448, + "step": 220740 + }, + { + "epoch": 17.106435739470726, + "grad_norm": 1.6751976946152, + "learning_rate": 8.553549287042778e-07, + "loss": 0.8691, + "step": 220750 + }, + { + "epoch": 17.107210662947033, + "grad_norm": 1.7305344417707957, + "learning_rate": 8.553936763794172e-07, + "loss": 0.8537, + "step": 220760 + }, + { + "epoch": 17.10798558642334, + "grad_norm": 1.565882193575213, + "learning_rate": 8.554324240545568e-07, + "loss": 0.8534, + "step": 220770 + }, + { + "epoch": 17.108760509899646, + "grad_norm": 1.663223958148145, + "learning_rate": 8.554711717296963e-07, + "loss": 0.8346, + "step": 220780 + }, + { + "epoch": 17.109535433375953, + "grad_norm": 1.5422892782445545, + "learning_rate": 8.555099194048358e-07, + "loss": 0.8516, + "step": 220790 + }, + { + "epoch": 17.11031035685226, + "grad_norm": 1.628377710135688, + "learning_rate": 8.555486670799752e-07, + "loss": 0.8501, + "step": 220800 + }, + { + "epoch": 17.111085280328567, + "grad_norm": 1.5555061734226474, + "learning_rate": 8.555874147551148e-07, + "loss": 0.8616, + "step": 220810 + }, + { + "epoch": 17.111860203804873, + "grad_norm": 1.569663358071406, + "learning_rate": 8.556261624302542e-07, + "loss": 0.8411, + "step": 220820 + }, + { + "epoch": 17.11263512728118, + "grad_norm": 1.6046627762555437, + "learning_rate": 8.556649101053938e-07, + "loss": 0.8469, + "step": 220830 + }, + { + "epoch": 17.113410050757487, + "grad_norm": 1.6878919190801003, + "learning_rate": 8.557036577805332e-07, + "loss": 0.8449, + "step": 220840 + }, + { + "epoch": 17.114184974233794, + "grad_norm": 1.6839543468744882, + "learning_rate": 8.557424054556727e-07, + "loss": 0.8456, + "step": 220850 + }, + { + "epoch": 17.1149598977101, + "grad_norm": 1.5968593411019616, + "learning_rate": 8.557811531308121e-07, + "loss": 0.8333, + "step": 220860 + }, + { + "epoch": 17.115734821186408, + "grad_norm": 1.6937589259182428, + "learning_rate": 8.558199008059517e-07, + "loss": 0.8573, + "step": 220870 + }, + { + "epoch": 17.116509744662714, + "grad_norm": 1.5532444332441828, + "learning_rate": 8.558586484810912e-07, + "loss": 0.8467, + "step": 220880 + }, + { + "epoch": 17.11728466813902, + "grad_norm": 1.6081526797231982, + "learning_rate": 8.558973961562307e-07, + "loss": 0.8434, + "step": 220890 + }, + { + "epoch": 17.118059591615328, + "grad_norm": 1.6372711341688175, + "learning_rate": 8.559361438313701e-07, + "loss": 0.8553, + "step": 220900 + }, + { + "epoch": 17.118834515091635, + "grad_norm": 1.6202037791904655, + "learning_rate": 8.559748915065097e-07, + "loss": 0.8467, + "step": 220910 + }, + { + "epoch": 17.11960943856794, + "grad_norm": 1.6162934928528525, + "learning_rate": 8.560136391816491e-07, + "loss": 0.8676, + "step": 220920 + }, + { + "epoch": 17.12038436204425, + "grad_norm": 1.596781682472567, + "learning_rate": 8.560523868567887e-07, + "loss": 0.856, + "step": 220930 + }, + { + "epoch": 17.121159285520555, + "grad_norm": 1.5438322932794366, + "learning_rate": 8.560911345319281e-07, + "loss": 0.8403, + "step": 220940 + }, + { + "epoch": 17.121934208996862, + "grad_norm": 1.5822967774491654, + "learning_rate": 8.561298822070677e-07, + "loss": 0.8359, + "step": 220950 + }, + { + "epoch": 17.12270913247317, + "grad_norm": 1.6284488047133137, + "learning_rate": 8.561686298822071e-07, + "loss": 0.8328, + "step": 220960 + }, + { + "epoch": 17.123484055949476, + "grad_norm": 1.595741119806462, + "learning_rate": 8.562073775573466e-07, + "loss": 0.8332, + "step": 220970 + }, + { + "epoch": 17.124258979425782, + "grad_norm": 1.5389986686008967, + "learning_rate": 8.562461252324861e-07, + "loss": 0.8442, + "step": 220980 + }, + { + "epoch": 17.12503390290209, + "grad_norm": 1.6689255673264276, + "learning_rate": 8.562848729076256e-07, + "loss": 0.856, + "step": 220990 + }, + { + "epoch": 17.125808826378396, + "grad_norm": 1.662917752076841, + "learning_rate": 8.56323620582765e-07, + "loss": 0.8376, + "step": 221000 + }, + { + "epoch": 17.125808826378396, + "eval_loss": 0.8904843330383301, + "eval_runtime": 328.4004, + "eval_samples_per_second": 34.93, + "eval_steps_per_second": 8.733, + "step": 221000 + }, + { + "epoch": 17.126583749854703, + "grad_norm": 1.610701858355254, + "learning_rate": 8.563623682579046e-07, + "loss": 0.8348, + "step": 221010 + }, + { + "epoch": 17.12735867333101, + "grad_norm": 1.61869979813801, + "learning_rate": 8.56401115933044e-07, + "loss": 0.8431, + "step": 221020 + }, + { + "epoch": 17.128133596807317, + "grad_norm": 1.7341385612625737, + "learning_rate": 8.564398636081836e-07, + "loss": 0.8588, + "step": 221030 + }, + { + "epoch": 17.128908520283623, + "grad_norm": 1.6994224036446974, + "learning_rate": 8.56478611283323e-07, + "loss": 0.8398, + "step": 221040 + }, + { + "epoch": 17.12968344375993, + "grad_norm": 1.6155250152792233, + "learning_rate": 8.565173589584626e-07, + "loss": 0.8519, + "step": 221050 + }, + { + "epoch": 17.130458367236237, + "grad_norm": 1.5597653286463131, + "learning_rate": 8.56556106633602e-07, + "loss": 0.847, + "step": 221060 + }, + { + "epoch": 17.131233290712544, + "grad_norm": 1.7523701338521307, + "learning_rate": 8.565948543087416e-07, + "loss": 0.8327, + "step": 221070 + }, + { + "epoch": 17.13200821418885, + "grad_norm": 1.576365030140208, + "learning_rate": 8.56633601983881e-07, + "loss": 0.8433, + "step": 221080 + }, + { + "epoch": 17.132783137665154, + "grad_norm": 1.6537142503538875, + "learning_rate": 8.566723496590206e-07, + "loss": 0.8558, + "step": 221090 + }, + { + "epoch": 17.13355806114146, + "grad_norm": 1.657660815115012, + "learning_rate": 8.5671109733416e-07, + "loss": 0.8556, + "step": 221100 + }, + { + "epoch": 17.134332984617767, + "grad_norm": 1.6835282137577372, + "learning_rate": 8.567498450092995e-07, + "loss": 0.841, + "step": 221110 + }, + { + "epoch": 17.135107908094074, + "grad_norm": 1.6470693584278524, + "learning_rate": 8.56788592684439e-07, + "loss": 0.8461, + "step": 221120 + }, + { + "epoch": 17.13588283157038, + "grad_norm": 1.653459859840959, + "learning_rate": 8.568273403595785e-07, + "loss": 0.853, + "step": 221130 + }, + { + "epoch": 17.136657755046688, + "grad_norm": 1.6775032900222606, + "learning_rate": 8.568660880347179e-07, + "loss": 0.8458, + "step": 221140 + }, + { + "epoch": 17.137432678522995, + "grad_norm": 1.832817587665629, + "learning_rate": 8.569048357098575e-07, + "loss": 0.8497, + "step": 221150 + }, + { + "epoch": 17.1382076019993, + "grad_norm": 1.5764633705639228, + "learning_rate": 8.569435833849969e-07, + "loss": 0.8479, + "step": 221160 + }, + { + "epoch": 17.13898252547561, + "grad_norm": 1.6282184733392597, + "learning_rate": 8.569823310601365e-07, + "loss": 0.8218, + "step": 221170 + }, + { + "epoch": 17.139757448951915, + "grad_norm": 1.5946089278026532, + "learning_rate": 8.570210787352759e-07, + "loss": 0.8279, + "step": 221180 + }, + { + "epoch": 17.140532372428222, + "grad_norm": 1.6398537799345385, + "learning_rate": 8.570598264104155e-07, + "loss": 0.8589, + "step": 221190 + }, + { + "epoch": 17.14130729590453, + "grad_norm": 1.6429258611291642, + "learning_rate": 8.570985740855549e-07, + "loss": 0.8531, + "step": 221200 + }, + { + "epoch": 17.142082219380836, + "grad_norm": 1.7326707049130003, + "learning_rate": 8.571373217606944e-07, + "loss": 0.8506, + "step": 221210 + }, + { + "epoch": 17.142857142857142, + "grad_norm": 1.5812209491562315, + "learning_rate": 8.571760694358339e-07, + "loss": 0.8418, + "step": 221220 + }, + { + "epoch": 17.14363206633345, + "grad_norm": 1.7928711250543465, + "learning_rate": 8.572148171109735e-07, + "loss": 0.8409, + "step": 221230 + }, + { + "epoch": 17.144406989809756, + "grad_norm": 1.6023448610805533, + "learning_rate": 8.572535647861129e-07, + "loss": 0.8527, + "step": 221240 + }, + { + "epoch": 17.145181913286063, + "grad_norm": 1.6288881520571992, + "learning_rate": 8.572923124612524e-07, + "loss": 0.8431, + "step": 221250 + }, + { + "epoch": 17.14595683676237, + "grad_norm": 1.6226420327861546, + "learning_rate": 8.573310601363918e-07, + "loss": 0.8314, + "step": 221260 + }, + { + "epoch": 17.146731760238676, + "grad_norm": 1.6018450312088168, + "learning_rate": 8.573698078115314e-07, + "loss": 0.8415, + "step": 221270 + }, + { + "epoch": 17.147506683714983, + "grad_norm": 1.510149380309408, + "learning_rate": 8.574085554866708e-07, + "loss": 0.8516, + "step": 221280 + }, + { + "epoch": 17.14828160719129, + "grad_norm": 1.6857515373041532, + "learning_rate": 8.574473031618104e-07, + "loss": 0.8365, + "step": 221290 + }, + { + "epoch": 17.149056530667597, + "grad_norm": 1.6789017200905068, + "learning_rate": 8.574860508369498e-07, + "loss": 0.8516, + "step": 221300 + }, + { + "epoch": 17.149831454143904, + "grad_norm": 1.6914355335802902, + "learning_rate": 8.575247985120893e-07, + "loss": 0.8415, + "step": 221310 + }, + { + "epoch": 17.15060637762021, + "grad_norm": 1.57387602979731, + "learning_rate": 8.575635461872288e-07, + "loss": 0.8497, + "step": 221320 + }, + { + "epoch": 17.151381301096517, + "grad_norm": 1.674465948501484, + "learning_rate": 8.576022938623684e-07, + "loss": 0.8359, + "step": 221330 + }, + { + "epoch": 17.152156224572824, + "grad_norm": 1.6744790216747547, + "learning_rate": 8.576410415375078e-07, + "loss": 0.8531, + "step": 221340 + }, + { + "epoch": 17.15293114804913, + "grad_norm": 1.5270659131050797, + "learning_rate": 8.576797892126473e-07, + "loss": 0.8633, + "step": 221350 + }, + { + "epoch": 17.153706071525438, + "grad_norm": 1.4725154267747997, + "learning_rate": 8.577185368877867e-07, + "loss": 0.8422, + "step": 221360 + }, + { + "epoch": 17.154480995001745, + "grad_norm": 1.6243532630370252, + "learning_rate": 8.577572845629264e-07, + "loss": 0.833, + "step": 221370 + }, + { + "epoch": 17.15525591847805, + "grad_norm": 1.5908609369443931, + "learning_rate": 8.577960322380658e-07, + "loss": 0.835, + "step": 221380 + }, + { + "epoch": 17.156030841954358, + "grad_norm": 1.6115306532893627, + "learning_rate": 8.578347799132053e-07, + "loss": 0.8552, + "step": 221390 + }, + { + "epoch": 17.156805765430665, + "grad_norm": 1.9236848626500938, + "learning_rate": 8.578735275883447e-07, + "loss": 0.8506, + "step": 221400 + }, + { + "epoch": 17.15758068890697, + "grad_norm": 1.6863534995146257, + "learning_rate": 8.579122752634842e-07, + "loss": 0.8592, + "step": 221410 + }, + { + "epoch": 17.15835561238328, + "grad_norm": 1.627881868375184, + "learning_rate": 8.579510229386237e-07, + "loss": 0.8545, + "step": 221420 + }, + { + "epoch": 17.159130535859585, + "grad_norm": 1.6823870175327287, + "learning_rate": 8.579897706137633e-07, + "loss": 0.8298, + "step": 221430 + }, + { + "epoch": 17.159905459335892, + "grad_norm": 1.6386853476329188, + "learning_rate": 8.580285182889027e-07, + "loss": 0.8441, + "step": 221440 + }, + { + "epoch": 17.1606803828122, + "grad_norm": 1.5639724789838758, + "learning_rate": 8.580672659640422e-07, + "loss": 0.8536, + "step": 221450 + }, + { + "epoch": 17.161455306288502, + "grad_norm": 1.6912016562142433, + "learning_rate": 8.581060136391816e-07, + "loss": 0.837, + "step": 221460 + }, + { + "epoch": 17.16223022976481, + "grad_norm": 1.6075381391808934, + "learning_rate": 8.581447613143213e-07, + "loss": 0.8567, + "step": 221470 + }, + { + "epoch": 17.163005153241116, + "grad_norm": 1.652701877601793, + "learning_rate": 8.581835089894607e-07, + "loss": 0.8439, + "step": 221480 + }, + { + "epoch": 17.163780076717423, + "grad_norm": 1.6136764213666233, + "learning_rate": 8.582222566646002e-07, + "loss": 0.849, + "step": 221490 + }, + { + "epoch": 17.16455500019373, + "grad_norm": 1.6673899688463012, + "learning_rate": 8.582610043397396e-07, + "loss": 0.8539, + "step": 221500 + }, + { + "epoch": 17.16455500019373, + "eval_loss": 0.8903098106384277, + "eval_runtime": 329.0604, + "eval_samples_per_second": 34.86, + "eval_steps_per_second": 8.716, + "step": 221500 + }, + { + "epoch": 17.165329923670036, + "grad_norm": 1.6762366199912175, + "learning_rate": 8.582997520148791e-07, + "loss": 0.8548, + "step": 221510 + }, + { + "epoch": 17.166104847146343, + "grad_norm": 1.6423134144106681, + "learning_rate": 8.583384996900187e-07, + "loss": 0.8657, + "step": 221520 + }, + { + "epoch": 17.16687977062265, + "grad_norm": 1.566423434744799, + "learning_rate": 8.583772473651582e-07, + "loss": 0.846, + "step": 221530 + }, + { + "epoch": 17.167654694098957, + "grad_norm": 1.7310015908465346, + "learning_rate": 8.584159950402976e-07, + "loss": 0.8464, + "step": 221540 + }, + { + "epoch": 17.168429617575264, + "grad_norm": 1.590000399344665, + "learning_rate": 8.584547427154371e-07, + "loss": 0.8454, + "step": 221550 + }, + { + "epoch": 17.16920454105157, + "grad_norm": 1.5214379058226921, + "learning_rate": 8.584934903905765e-07, + "loss": 0.8444, + "step": 221560 + }, + { + "epoch": 17.169979464527877, + "grad_norm": 1.734283951080796, + "learning_rate": 8.585322380657162e-07, + "loss": 0.8375, + "step": 221570 + }, + { + "epoch": 17.170754388004184, + "grad_norm": 1.7365444501685, + "learning_rate": 8.585709857408556e-07, + "loss": 0.8641, + "step": 221580 + }, + { + "epoch": 17.17152931148049, + "grad_norm": 1.599443635265817, + "learning_rate": 8.586097334159951e-07, + "loss": 0.8565, + "step": 221590 + }, + { + "epoch": 17.172304234956798, + "grad_norm": 1.5700725464771632, + "learning_rate": 8.586484810911345e-07, + "loss": 0.846, + "step": 221600 + }, + { + "epoch": 17.173079158433104, + "grad_norm": 1.5329481042173392, + "learning_rate": 8.586872287662742e-07, + "loss": 0.8523, + "step": 221610 + }, + { + "epoch": 17.17385408190941, + "grad_norm": 1.6497184831880414, + "learning_rate": 8.587259764414136e-07, + "loss": 0.8343, + "step": 221620 + }, + { + "epoch": 17.174629005385718, + "grad_norm": 1.713208392929116, + "learning_rate": 8.587647241165531e-07, + "loss": 0.8485, + "step": 221630 + }, + { + "epoch": 17.175403928862025, + "grad_norm": 1.57362247636087, + "learning_rate": 8.588034717916925e-07, + "loss": 0.854, + "step": 221640 + }, + { + "epoch": 17.17617885233833, + "grad_norm": 1.6797152735543706, + "learning_rate": 8.58842219466832e-07, + "loss": 0.8521, + "step": 221650 + }, + { + "epoch": 17.17695377581464, + "grad_norm": 1.586267862027165, + "learning_rate": 8.588809671419715e-07, + "loss": 0.8431, + "step": 221660 + }, + { + "epoch": 17.177728699290945, + "grad_norm": 2.0374152575780893, + "learning_rate": 8.589197148171111e-07, + "loss": 0.8776, + "step": 221670 + }, + { + "epoch": 17.178503622767252, + "grad_norm": 1.6304819938420485, + "learning_rate": 8.589584624922505e-07, + "loss": 0.846, + "step": 221680 + }, + { + "epoch": 17.17927854624356, + "grad_norm": 1.6900079352552015, + "learning_rate": 8.5899721016739e-07, + "loss": 0.8418, + "step": 221690 + }, + { + "epoch": 17.180053469719866, + "grad_norm": 1.6084246285739316, + "learning_rate": 8.590359578425294e-07, + "loss": 0.8553, + "step": 221700 + }, + { + "epoch": 17.180828393196172, + "grad_norm": 1.5813282017399763, + "learning_rate": 8.59074705517669e-07, + "loss": 0.84, + "step": 221710 + }, + { + "epoch": 17.18160331667248, + "grad_norm": 1.7425416583320477, + "learning_rate": 8.591134531928085e-07, + "loss": 0.831, + "step": 221720 + }, + { + "epoch": 17.182378240148786, + "grad_norm": 1.6811334798288702, + "learning_rate": 8.59152200867948e-07, + "loss": 0.8384, + "step": 221730 + }, + { + "epoch": 17.183153163625093, + "grad_norm": 1.6087878827137996, + "learning_rate": 8.591909485430874e-07, + "loss": 0.8508, + "step": 221740 + }, + { + "epoch": 17.1839280871014, + "grad_norm": 1.6462487954949785, + "learning_rate": 8.59229696218227e-07, + "loss": 0.8438, + "step": 221750 + }, + { + "epoch": 17.184703010577707, + "grad_norm": 1.6892379617186164, + "learning_rate": 8.592684438933665e-07, + "loss": 0.8434, + "step": 221760 + }, + { + "epoch": 17.185477934054013, + "grad_norm": 1.531470692979799, + "learning_rate": 8.59307191568506e-07, + "loss": 0.8379, + "step": 221770 + }, + { + "epoch": 17.18625285753032, + "grad_norm": 1.7897053382172754, + "learning_rate": 8.593459392436454e-07, + "loss": 0.8399, + "step": 221780 + }, + { + "epoch": 17.187027781006627, + "grad_norm": 1.5527635035056504, + "learning_rate": 8.593846869187849e-07, + "loss": 0.8614, + "step": 221790 + }, + { + "epoch": 17.187802704482934, + "grad_norm": 1.6686118418712297, + "learning_rate": 8.594234345939244e-07, + "loss": 0.8375, + "step": 221800 + }, + { + "epoch": 17.18857762795924, + "grad_norm": 1.707659649564342, + "learning_rate": 8.59462182269064e-07, + "loss": 0.8581, + "step": 221810 + }, + { + "epoch": 17.189352551435547, + "grad_norm": 1.6428285740998148, + "learning_rate": 8.595009299442034e-07, + "loss": 0.8492, + "step": 221820 + }, + { + "epoch": 17.190127474911854, + "grad_norm": 1.5905535294581272, + "learning_rate": 8.595396776193429e-07, + "loss": 0.8767, + "step": 221830 + }, + { + "epoch": 17.190902398388157, + "grad_norm": 1.577666443988374, + "learning_rate": 8.595784252944823e-07, + "loss": 0.8468, + "step": 221840 + }, + { + "epoch": 17.191677321864464, + "grad_norm": 1.5758014431075398, + "learning_rate": 8.596171729696219e-07, + "loss": 0.8505, + "step": 221850 + }, + { + "epoch": 17.19245224534077, + "grad_norm": 1.7171685508464951, + "learning_rate": 8.596559206447614e-07, + "loss": 0.8398, + "step": 221860 + }, + { + "epoch": 17.193227168817078, + "grad_norm": 1.624089326798371, + "learning_rate": 8.596946683199009e-07, + "loss": 0.8431, + "step": 221870 + }, + { + "epoch": 17.194002092293385, + "grad_norm": 1.5963093880579597, + "learning_rate": 8.597334159950403e-07, + "loss": 0.8428, + "step": 221880 + }, + { + "epoch": 17.19477701576969, + "grad_norm": 1.647092411482164, + "learning_rate": 8.597721636701799e-07, + "loss": 0.8409, + "step": 221890 + }, + { + "epoch": 17.195551939246, + "grad_norm": 1.6729725127678368, + "learning_rate": 8.598109113453193e-07, + "loss": 0.8455, + "step": 221900 + }, + { + "epoch": 17.196326862722305, + "grad_norm": 1.5290352774781792, + "learning_rate": 8.598496590204589e-07, + "loss": 0.8271, + "step": 221910 + }, + { + "epoch": 17.197101786198612, + "grad_norm": 1.6493441396840265, + "learning_rate": 8.598884066955983e-07, + "loss": 0.849, + "step": 221920 + }, + { + "epoch": 17.19787670967492, + "grad_norm": 1.7735598140030508, + "learning_rate": 8.599271543707378e-07, + "loss": 0.8413, + "step": 221930 + }, + { + "epoch": 17.198651633151226, + "grad_norm": 1.5965321717928596, + "learning_rate": 8.599659020458773e-07, + "loss": 0.8524, + "step": 221940 + }, + { + "epoch": 17.199426556627532, + "grad_norm": 1.7223669953556793, + "learning_rate": 8.600046497210168e-07, + "loss": 0.8376, + "step": 221950 + }, + { + "epoch": 17.20020148010384, + "grad_norm": 1.687701591143879, + "learning_rate": 8.600433973961563e-07, + "loss": 0.8473, + "step": 221960 + }, + { + "epoch": 17.200976403580146, + "grad_norm": 1.718986898498741, + "learning_rate": 8.600821450712958e-07, + "loss": 0.8452, + "step": 221970 + }, + { + "epoch": 17.201751327056453, + "grad_norm": 1.5868000499786257, + "learning_rate": 8.601208927464352e-07, + "loss": 0.8486, + "step": 221980 + }, + { + "epoch": 17.20252625053276, + "grad_norm": 1.9771980518496732, + "learning_rate": 8.601596404215748e-07, + "loss": 0.8691, + "step": 221990 + }, + { + "epoch": 17.203301174009066, + "grad_norm": 1.6328299836271418, + "learning_rate": 8.601983880967142e-07, + "loss": 0.8426, + "step": 222000 + }, + { + "epoch": 17.203301174009066, + "eval_loss": 0.8898579478263855, + "eval_runtime": 330.8402, + "eval_samples_per_second": 34.672, + "eval_steps_per_second": 8.669, + "step": 222000 + }, + { + "epoch": 17.204076097485373, + "grad_norm": 1.5768863736046896, + "learning_rate": 8.602371357718538e-07, + "loss": 0.8381, + "step": 222010 + }, + { + "epoch": 17.20485102096168, + "grad_norm": 1.6202266009673363, + "learning_rate": 8.602758834469932e-07, + "loss": 0.8514, + "step": 222020 + }, + { + "epoch": 17.205625944437987, + "grad_norm": 1.76339933848177, + "learning_rate": 8.603146311221328e-07, + "loss": 0.8517, + "step": 222030 + }, + { + "epoch": 17.206400867914294, + "grad_norm": 1.6523718205702957, + "learning_rate": 8.603533787972722e-07, + "loss": 0.8557, + "step": 222040 + }, + { + "epoch": 17.2071757913906, + "grad_norm": 1.606069747050505, + "learning_rate": 8.603921264724117e-07, + "loss": 0.8345, + "step": 222050 + }, + { + "epoch": 17.207950714866907, + "grad_norm": 1.6167800413493154, + "learning_rate": 8.604308741475512e-07, + "loss": 0.8557, + "step": 222060 + }, + { + "epoch": 17.208725638343214, + "grad_norm": 1.6824154648163947, + "learning_rate": 8.604696218226907e-07, + "loss": 0.8627, + "step": 222070 + }, + { + "epoch": 17.20950056181952, + "grad_norm": 1.6372738868368095, + "learning_rate": 8.605083694978301e-07, + "loss": 0.8505, + "step": 222080 + }, + { + "epoch": 17.210275485295828, + "grad_norm": 1.6227904963511977, + "learning_rate": 8.605471171729697e-07, + "loss": 0.8445, + "step": 222090 + }, + { + "epoch": 17.211050408772135, + "grad_norm": 1.690609513298617, + "learning_rate": 8.605858648481091e-07, + "loss": 0.8595, + "step": 222100 + }, + { + "epoch": 17.21182533224844, + "grad_norm": 1.546180125009989, + "learning_rate": 8.606246125232487e-07, + "loss": 0.8392, + "step": 222110 + }, + { + "epoch": 17.212600255724748, + "grad_norm": 1.5525726538734645, + "learning_rate": 8.606633601983881e-07, + "loss": 0.8727, + "step": 222120 + }, + { + "epoch": 17.213375179201055, + "grad_norm": 1.5423043204774662, + "learning_rate": 8.607021078735277e-07, + "loss": 0.8605, + "step": 222130 + }, + { + "epoch": 17.214150102677362, + "grad_norm": 1.6143445093438882, + "learning_rate": 8.607408555486671e-07, + "loss": 0.8535, + "step": 222140 + }, + { + "epoch": 17.21492502615367, + "grad_norm": 1.6142277906190488, + "learning_rate": 8.607796032238066e-07, + "loss": 0.8597, + "step": 222150 + }, + { + "epoch": 17.215699949629975, + "grad_norm": 1.6976422229146073, + "learning_rate": 8.608183508989461e-07, + "loss": 0.845, + "step": 222160 + }, + { + "epoch": 17.216474873106282, + "grad_norm": 1.5878810371504337, + "learning_rate": 8.608570985740857e-07, + "loss": 0.8475, + "step": 222170 + }, + { + "epoch": 17.21724979658259, + "grad_norm": 1.596029276306302, + "learning_rate": 8.608958462492251e-07, + "loss": 0.8571, + "step": 222180 + }, + { + "epoch": 17.218024720058896, + "grad_norm": 1.6139056425685094, + "learning_rate": 8.609345939243646e-07, + "loss": 0.8464, + "step": 222190 + }, + { + "epoch": 17.218799643535203, + "grad_norm": 1.6785873091772738, + "learning_rate": 8.60973341599504e-07, + "loss": 0.8499, + "step": 222200 + }, + { + "epoch": 17.219574567011506, + "grad_norm": 1.5727879354081902, + "learning_rate": 8.610120892746436e-07, + "loss": 0.8466, + "step": 222210 + }, + { + "epoch": 17.220349490487813, + "grad_norm": 1.62932123850597, + "learning_rate": 8.61050836949783e-07, + "loss": 0.8454, + "step": 222220 + }, + { + "epoch": 17.22112441396412, + "grad_norm": 1.6123296637955213, + "learning_rate": 8.610895846249226e-07, + "loss": 0.8557, + "step": 222230 + }, + { + "epoch": 17.221899337440426, + "grad_norm": 1.6320395573011302, + "learning_rate": 8.61128332300062e-07, + "loss": 0.8423, + "step": 222240 + }, + { + "epoch": 17.222674260916733, + "grad_norm": 1.683660688295373, + "learning_rate": 8.611670799752015e-07, + "loss": 0.8306, + "step": 222250 + }, + { + "epoch": 17.22344918439304, + "grad_norm": 1.5851505336065395, + "learning_rate": 8.61205827650341e-07, + "loss": 0.8349, + "step": 222260 + }, + { + "epoch": 17.224224107869347, + "grad_norm": 1.698134862837524, + "learning_rate": 8.612445753254806e-07, + "loss": 0.8298, + "step": 222270 + }, + { + "epoch": 17.224999031345654, + "grad_norm": 1.6300136617607708, + "learning_rate": 8.6128332300062e-07, + "loss": 0.8234, + "step": 222280 + }, + { + "epoch": 17.22577395482196, + "grad_norm": 1.5711255497009384, + "learning_rate": 8.613220706757595e-07, + "loss": 0.8639, + "step": 222290 + }, + { + "epoch": 17.226548878298267, + "grad_norm": 1.6968646985167157, + "learning_rate": 8.613608183508989e-07, + "loss": 0.8424, + "step": 222300 + }, + { + "epoch": 17.227323801774574, + "grad_norm": 1.7316616411236017, + "learning_rate": 8.613995660260386e-07, + "loss": 0.8397, + "step": 222310 + }, + { + "epoch": 17.22809872525088, + "grad_norm": 1.6218117854177012, + "learning_rate": 8.61438313701178e-07, + "loss": 0.8227, + "step": 222320 + }, + { + "epoch": 17.228873648727188, + "grad_norm": 1.6276712426269009, + "learning_rate": 8.614770613763175e-07, + "loss": 0.8478, + "step": 222330 + }, + { + "epoch": 17.229648572203494, + "grad_norm": 1.566257689735368, + "learning_rate": 8.615158090514569e-07, + "loss": 0.8333, + "step": 222340 + }, + { + "epoch": 17.2304234956798, + "grad_norm": 1.5848526772115221, + "learning_rate": 8.615545567265964e-07, + "loss": 0.8384, + "step": 222350 + }, + { + "epoch": 17.231198419156108, + "grad_norm": 1.556864884788297, + "learning_rate": 8.615933044017359e-07, + "loss": 0.8459, + "step": 222360 + }, + { + "epoch": 17.231973342632415, + "grad_norm": 1.7035330281513978, + "learning_rate": 8.616320520768755e-07, + "loss": 0.8473, + "step": 222370 + }, + { + "epoch": 17.23274826610872, + "grad_norm": 1.7810174291332295, + "learning_rate": 8.616707997520149e-07, + "loss": 0.8419, + "step": 222380 + }, + { + "epoch": 17.23352318958503, + "grad_norm": 1.582968918269541, + "learning_rate": 8.617095474271544e-07, + "loss": 0.8243, + "step": 222390 + }, + { + "epoch": 17.234298113061335, + "grad_norm": 1.6951607770760784, + "learning_rate": 8.617482951022938e-07, + "loss": 0.8468, + "step": 222400 + }, + { + "epoch": 17.235073036537642, + "grad_norm": 1.6909445718931686, + "learning_rate": 8.617870427774335e-07, + "loss": 0.8524, + "step": 222410 + }, + { + "epoch": 17.23584796001395, + "grad_norm": 1.750205480680284, + "learning_rate": 8.618257904525729e-07, + "loss": 0.8497, + "step": 222420 + }, + { + "epoch": 17.236622883490256, + "grad_norm": 1.6539011586326222, + "learning_rate": 8.618645381277124e-07, + "loss": 0.8352, + "step": 222430 + }, + { + "epoch": 17.237397806966563, + "grad_norm": 1.7714336688733296, + "learning_rate": 8.619032858028518e-07, + "loss": 0.8384, + "step": 222440 + }, + { + "epoch": 17.23817273044287, + "grad_norm": 1.6462437981895424, + "learning_rate": 8.619420334779915e-07, + "loss": 0.8402, + "step": 222450 + }, + { + "epoch": 17.238947653919176, + "grad_norm": 1.6340954664404075, + "learning_rate": 8.619807811531309e-07, + "loss": 0.8564, + "step": 222460 + }, + { + "epoch": 17.239722577395483, + "grad_norm": 1.7363231839272897, + "learning_rate": 8.620195288282704e-07, + "loss": 0.85, + "step": 222470 + }, + { + "epoch": 17.24049750087179, + "grad_norm": 1.6032384332100154, + "learning_rate": 8.620582765034098e-07, + "loss": 0.8519, + "step": 222480 + }, + { + "epoch": 17.241272424348097, + "grad_norm": 1.582596744772214, + "learning_rate": 8.620970241785493e-07, + "loss": 0.8529, + "step": 222490 + }, + { + "epoch": 17.242047347824403, + "grad_norm": 1.6230570993217903, + "learning_rate": 8.621357718536887e-07, + "loss": 0.8567, + "step": 222500 + }, + { + "epoch": 17.242047347824403, + "eval_loss": 0.8897494077682495, + "eval_runtime": 329.106, + "eval_samples_per_second": 34.855, + "eval_steps_per_second": 8.715, + "step": 222500 + }, + { + "epoch": 17.24282227130071, + "grad_norm": 1.623786212874591, + "learning_rate": 8.621745195288284e-07, + "loss": 0.8704, + "step": 222510 + }, + { + "epoch": 17.243597194777017, + "grad_norm": 1.6349031749947387, + "learning_rate": 8.622132672039678e-07, + "loss": 0.8492, + "step": 222520 + }, + { + "epoch": 17.244372118253324, + "grad_norm": 1.6542668592935958, + "learning_rate": 8.622520148791073e-07, + "loss": 0.8488, + "step": 222530 + }, + { + "epoch": 17.24514704172963, + "grad_norm": 1.6173719720204003, + "learning_rate": 8.622907625542467e-07, + "loss": 0.8426, + "step": 222540 + }, + { + "epoch": 17.245921965205937, + "grad_norm": 1.754588320247158, + "learning_rate": 8.623295102293864e-07, + "loss": 0.861, + "step": 222550 + }, + { + "epoch": 17.246696888682244, + "grad_norm": 1.6207992318779474, + "learning_rate": 8.623682579045258e-07, + "loss": 0.856, + "step": 222560 + }, + { + "epoch": 17.24747181215855, + "grad_norm": 1.6797089702817998, + "learning_rate": 8.624070055796653e-07, + "loss": 0.8495, + "step": 222570 + }, + { + "epoch": 17.248246735634854, + "grad_norm": 1.532478717447235, + "learning_rate": 8.624457532548047e-07, + "loss": 0.8502, + "step": 222580 + }, + { + "epoch": 17.24902165911116, + "grad_norm": 1.7997212510332707, + "learning_rate": 8.624845009299443e-07, + "loss": 0.8497, + "step": 222590 + }, + { + "epoch": 17.249796582587468, + "grad_norm": 1.682957886985198, + "learning_rate": 8.625232486050838e-07, + "loss": 0.8531, + "step": 222600 + }, + { + "epoch": 17.250571506063775, + "grad_norm": 1.7555453721732643, + "learning_rate": 8.625619962802233e-07, + "loss": 0.8418, + "step": 222610 + }, + { + "epoch": 17.25134642954008, + "grad_norm": 1.6944745833810126, + "learning_rate": 8.626007439553627e-07, + "loss": 0.8491, + "step": 222620 + }, + { + "epoch": 17.25212135301639, + "grad_norm": 1.6520267077730675, + "learning_rate": 8.626394916305022e-07, + "loss": 0.833, + "step": 222630 + }, + { + "epoch": 17.252896276492695, + "grad_norm": 1.6755383087851903, + "learning_rate": 8.626782393056416e-07, + "loss": 0.8479, + "step": 222640 + }, + { + "epoch": 17.253671199969002, + "grad_norm": 1.5627245529431855, + "learning_rate": 8.627169869807813e-07, + "loss": 0.8664, + "step": 222650 + }, + { + "epoch": 17.25444612344531, + "grad_norm": 1.6896403338598962, + "learning_rate": 8.627557346559207e-07, + "loss": 0.8324, + "step": 222660 + }, + { + "epoch": 17.255221046921616, + "grad_norm": 1.6988041487064354, + "learning_rate": 8.627944823310602e-07, + "loss": 0.8453, + "step": 222670 + }, + { + "epoch": 17.255995970397922, + "grad_norm": 1.6926077520343936, + "learning_rate": 8.628332300061996e-07, + "loss": 0.8498, + "step": 222680 + }, + { + "epoch": 17.25677089387423, + "grad_norm": 1.668292924658509, + "learning_rate": 8.628719776813392e-07, + "loss": 0.8423, + "step": 222690 + }, + { + "epoch": 17.257545817350536, + "grad_norm": 1.6837909587382738, + "learning_rate": 8.629107253564787e-07, + "loss": 0.8433, + "step": 222700 + }, + { + "epoch": 17.258320740826843, + "grad_norm": 1.6480572195526306, + "learning_rate": 8.629494730316182e-07, + "loss": 0.8627, + "step": 222710 + }, + { + "epoch": 17.25909566430315, + "grad_norm": 1.6341442785784903, + "learning_rate": 8.629882207067576e-07, + "loss": 0.8462, + "step": 222720 + }, + { + "epoch": 17.259870587779456, + "grad_norm": 1.5594332725724775, + "learning_rate": 8.630269683818972e-07, + "loss": 0.8545, + "step": 222730 + }, + { + "epoch": 17.260645511255763, + "grad_norm": 1.675176550843585, + "learning_rate": 8.630657160570366e-07, + "loss": 0.8629, + "step": 222740 + }, + { + "epoch": 17.26142043473207, + "grad_norm": 1.719880537858093, + "learning_rate": 8.631044637321762e-07, + "loss": 0.8273, + "step": 222750 + }, + { + "epoch": 17.262195358208377, + "grad_norm": 1.5682379420067314, + "learning_rate": 8.631432114073156e-07, + "loss": 0.8406, + "step": 222760 + }, + { + "epoch": 17.262970281684684, + "grad_norm": 1.4408320666849503, + "learning_rate": 8.631819590824551e-07, + "loss": 0.8432, + "step": 222770 + }, + { + "epoch": 17.26374520516099, + "grad_norm": 1.6726535171543564, + "learning_rate": 8.632207067575945e-07, + "loss": 0.8479, + "step": 222780 + }, + { + "epoch": 17.264520128637297, + "grad_norm": 1.4885195315957136, + "learning_rate": 8.632594544327341e-07, + "loss": 0.8319, + "step": 222790 + }, + { + "epoch": 17.265295052113604, + "grad_norm": 2.0272949534055593, + "learning_rate": 8.632982021078736e-07, + "loss": 0.8489, + "step": 222800 + }, + { + "epoch": 17.26606997558991, + "grad_norm": 1.5705899903921159, + "learning_rate": 8.633369497830131e-07, + "loss": 0.8473, + "step": 222810 + }, + { + "epoch": 17.266844899066218, + "grad_norm": 1.6805482623717907, + "learning_rate": 8.633756974581525e-07, + "loss": 0.8656, + "step": 222820 + }, + { + "epoch": 17.267619822542525, + "grad_norm": 1.6003319884178284, + "learning_rate": 8.634144451332921e-07, + "loss": 0.8457, + "step": 222830 + }, + { + "epoch": 17.26839474601883, + "grad_norm": 1.6885995575263377, + "learning_rate": 8.634531928084315e-07, + "loss": 0.8412, + "step": 222840 + }, + { + "epoch": 17.269169669495138, + "grad_norm": 1.7458823430070516, + "learning_rate": 8.634919404835711e-07, + "loss": 0.847, + "step": 222850 + }, + { + "epoch": 17.269944592971445, + "grad_norm": 1.6153410653406801, + "learning_rate": 8.635306881587105e-07, + "loss": 0.8364, + "step": 222860 + }, + { + "epoch": 17.270719516447752, + "grad_norm": 1.7011005122203622, + "learning_rate": 8.635694358338501e-07, + "loss": 0.8631, + "step": 222870 + }, + { + "epoch": 17.27149443992406, + "grad_norm": 1.8128075654076934, + "learning_rate": 8.636081835089895e-07, + "loss": 0.8673, + "step": 222880 + }, + { + "epoch": 17.272269363400365, + "grad_norm": 1.6656379138145778, + "learning_rate": 8.63646931184129e-07, + "loss": 0.8205, + "step": 222890 + }, + { + "epoch": 17.273044286876672, + "grad_norm": 1.631281223495839, + "learning_rate": 8.636856788592685e-07, + "loss": 0.8512, + "step": 222900 + }, + { + "epoch": 17.27381921035298, + "grad_norm": 1.6196889527685276, + "learning_rate": 8.63724426534408e-07, + "loss": 0.8585, + "step": 222910 + }, + { + "epoch": 17.274594133829286, + "grad_norm": 1.5622307748398778, + "learning_rate": 8.637631742095474e-07, + "loss": 0.8393, + "step": 222920 + }, + { + "epoch": 17.275369057305593, + "grad_norm": 1.6458092976359753, + "learning_rate": 8.63801921884687e-07, + "loss": 0.8369, + "step": 222930 + }, + { + "epoch": 17.2761439807819, + "grad_norm": 1.7595731062014761, + "learning_rate": 8.638406695598264e-07, + "loss": 0.8556, + "step": 222940 + }, + { + "epoch": 17.276918904258203, + "grad_norm": 1.6828611616239881, + "learning_rate": 8.63879417234966e-07, + "loss": 0.8576, + "step": 222950 + }, + { + "epoch": 17.27769382773451, + "grad_norm": 1.7158880819906779, + "learning_rate": 8.639181649101054e-07, + "loss": 0.8471, + "step": 222960 + }, + { + "epoch": 17.278468751210816, + "grad_norm": 1.7245794811212396, + "learning_rate": 8.63956912585245e-07, + "loss": 0.8492, + "step": 222970 + }, + { + "epoch": 17.279243674687123, + "grad_norm": 1.6175460928375174, + "learning_rate": 8.639956602603844e-07, + "loss": 0.8516, + "step": 222980 + }, + { + "epoch": 17.28001859816343, + "grad_norm": 1.704742095640831, + "learning_rate": 8.64034407935524e-07, + "loss": 0.8362, + "step": 222990 + }, + { + "epoch": 17.280793521639737, + "grad_norm": 1.6736857018871645, + "learning_rate": 8.640731556106634e-07, + "loss": 0.854, + "step": 223000 + }, + { + "epoch": 17.280793521639737, + "eval_loss": 0.8897544145584106, + "eval_runtime": 333.1983, + "eval_samples_per_second": 34.427, + "eval_steps_per_second": 8.607, + "step": 223000 + }, + { + "epoch": 17.281568445116044, + "grad_norm": 1.6491724403483325, + "learning_rate": 8.641119032858029e-07, + "loss": 0.8565, + "step": 223010 + }, + { + "epoch": 17.28234336859235, + "grad_norm": 1.6003162268127715, + "learning_rate": 8.641506509609424e-07, + "loss": 0.8543, + "step": 223020 + }, + { + "epoch": 17.283118292068657, + "grad_norm": 1.7088663871329883, + "learning_rate": 8.641893986360819e-07, + "loss": 0.8339, + "step": 223030 + }, + { + "epoch": 17.283893215544964, + "grad_norm": 1.7164174804723031, + "learning_rate": 8.642281463112213e-07, + "loss": 0.8519, + "step": 223040 + }, + { + "epoch": 17.28466813902127, + "grad_norm": 1.7969284852715546, + "learning_rate": 8.642668939863609e-07, + "loss": 0.8358, + "step": 223050 + }, + { + "epoch": 17.285443062497578, + "grad_norm": 1.6335653358783087, + "learning_rate": 8.643056416615003e-07, + "loss": 0.8392, + "step": 223060 + }, + { + "epoch": 17.286217985973884, + "grad_norm": 1.7686648028724077, + "learning_rate": 8.643443893366399e-07, + "loss": 0.8514, + "step": 223070 + }, + { + "epoch": 17.28699290945019, + "grad_norm": 1.6166046589086867, + "learning_rate": 8.643831370117793e-07, + "loss": 0.8437, + "step": 223080 + }, + { + "epoch": 17.287767832926498, + "grad_norm": 1.7939016310583782, + "learning_rate": 8.644218846869189e-07, + "loss": 0.8475, + "step": 223090 + }, + { + "epoch": 17.288542756402805, + "grad_norm": 1.663692162643579, + "learning_rate": 8.644606323620583e-07, + "loss": 0.8357, + "step": 223100 + }, + { + "epoch": 17.28931767987911, + "grad_norm": 1.6407290295935362, + "learning_rate": 8.644993800371979e-07, + "loss": 0.8447, + "step": 223110 + }, + { + "epoch": 17.29009260335542, + "grad_norm": 1.6270639693531124, + "learning_rate": 8.645381277123373e-07, + "loss": 0.855, + "step": 223120 + }, + { + "epoch": 17.290867526831725, + "grad_norm": 1.621422653606517, + "learning_rate": 8.645768753874768e-07, + "loss": 0.8387, + "step": 223130 + }, + { + "epoch": 17.291642450308032, + "grad_norm": 1.6310986572382051, + "learning_rate": 8.646156230626162e-07, + "loss": 0.8437, + "step": 223140 + }, + { + "epoch": 17.29241737378434, + "grad_norm": 1.6479923123200038, + "learning_rate": 8.646543707377558e-07, + "loss": 0.8472, + "step": 223150 + }, + { + "epoch": 17.293192297260646, + "grad_norm": 1.654096930714857, + "learning_rate": 8.646931184128953e-07, + "loss": 0.8397, + "step": 223160 + }, + { + "epoch": 17.293967220736953, + "grad_norm": 1.6368735351696893, + "learning_rate": 8.647318660880348e-07, + "loss": 0.8433, + "step": 223170 + }, + { + "epoch": 17.29474214421326, + "grad_norm": 1.6023151062536223, + "learning_rate": 8.647706137631742e-07, + "loss": 0.8444, + "step": 223180 + }, + { + "epoch": 17.295517067689566, + "grad_norm": 1.7884173978143831, + "learning_rate": 8.648093614383138e-07, + "loss": 0.8437, + "step": 223190 + }, + { + "epoch": 17.296291991165873, + "grad_norm": 1.5591604990916859, + "learning_rate": 8.648481091134532e-07, + "loss": 0.8328, + "step": 223200 + }, + { + "epoch": 17.29706691464218, + "grad_norm": 1.5586723815162484, + "learning_rate": 8.648868567885928e-07, + "loss": 0.8406, + "step": 223210 + }, + { + "epoch": 17.297841838118487, + "grad_norm": 1.5613233904742043, + "learning_rate": 8.649256044637322e-07, + "loss": 0.8612, + "step": 223220 + }, + { + "epoch": 17.298616761594793, + "grad_norm": 1.5480507584659466, + "learning_rate": 8.649643521388717e-07, + "loss": 0.8335, + "step": 223230 + }, + { + "epoch": 17.2993916850711, + "grad_norm": 1.8079246814029275, + "learning_rate": 8.650030998140112e-07, + "loss": 0.8651, + "step": 223240 + }, + { + "epoch": 17.300166608547407, + "grad_norm": 1.714631675878963, + "learning_rate": 8.650418474891508e-07, + "loss": 0.8442, + "step": 223250 + }, + { + "epoch": 17.300941532023714, + "grad_norm": 1.6716617417400879, + "learning_rate": 8.650805951642902e-07, + "loss": 0.8613, + "step": 223260 + }, + { + "epoch": 17.30171645550002, + "grad_norm": 1.667157058268115, + "learning_rate": 8.651193428394297e-07, + "loss": 0.8509, + "step": 223270 + }, + { + "epoch": 17.302491378976327, + "grad_norm": 1.7304738466433822, + "learning_rate": 8.651580905145691e-07, + "loss": 0.8456, + "step": 223280 + }, + { + "epoch": 17.303266302452634, + "grad_norm": 1.7025108004132663, + "learning_rate": 8.651968381897087e-07, + "loss": 0.863, + "step": 223290 + }, + { + "epoch": 17.30404122592894, + "grad_norm": 1.6450985058351522, + "learning_rate": 8.652355858648482e-07, + "loss": 0.8427, + "step": 223300 + }, + { + "epoch": 17.304816149405248, + "grad_norm": 1.639934688604859, + "learning_rate": 8.652743335399877e-07, + "loss": 0.8406, + "step": 223310 + }, + { + "epoch": 17.305591072881555, + "grad_norm": 1.5729755541883172, + "learning_rate": 8.653130812151271e-07, + "loss": 0.8475, + "step": 223320 + }, + { + "epoch": 17.306365996357858, + "grad_norm": 1.656877594199705, + "learning_rate": 8.653518288902666e-07, + "loss": 0.8467, + "step": 223330 + }, + { + "epoch": 17.307140919834165, + "grad_norm": 1.7022938867086739, + "learning_rate": 8.653905765654061e-07, + "loss": 0.8487, + "step": 223340 + }, + { + "epoch": 17.30791584331047, + "grad_norm": 1.6649880559108425, + "learning_rate": 8.654293242405457e-07, + "loss": 0.8457, + "step": 223350 + }, + { + "epoch": 17.30869076678678, + "grad_norm": 1.961198092117871, + "learning_rate": 8.654680719156851e-07, + "loss": 0.866, + "step": 223360 + }, + { + "epoch": 17.309465690263085, + "grad_norm": 1.6585241420769667, + "learning_rate": 8.655068195908246e-07, + "loss": 0.828, + "step": 223370 + }, + { + "epoch": 17.310240613739392, + "grad_norm": 1.666602431174678, + "learning_rate": 8.65545567265964e-07, + "loss": 0.8607, + "step": 223380 + }, + { + "epoch": 17.3110155372157, + "grad_norm": 1.6650163236106665, + "learning_rate": 8.655843149411037e-07, + "loss": 0.871, + "step": 223390 + }, + { + "epoch": 17.311790460692006, + "grad_norm": 1.7411528899259758, + "learning_rate": 8.656230626162431e-07, + "loss": 0.8502, + "step": 223400 + }, + { + "epoch": 17.312565384168312, + "grad_norm": 1.5776329526046053, + "learning_rate": 8.656618102913826e-07, + "loss": 0.8487, + "step": 223410 + }, + { + "epoch": 17.31334030764462, + "grad_norm": 1.707507409149356, + "learning_rate": 8.65700557966522e-07, + "loss": 0.8707, + "step": 223420 + }, + { + "epoch": 17.314115231120926, + "grad_norm": 1.7734403632378697, + "learning_rate": 8.657393056416615e-07, + "loss": 0.8436, + "step": 223430 + }, + { + "epoch": 17.314890154597233, + "grad_norm": 1.5320097940986008, + "learning_rate": 8.657780533168011e-07, + "loss": 0.8428, + "step": 223440 + }, + { + "epoch": 17.31566507807354, + "grad_norm": 1.7027449734760969, + "learning_rate": 8.658168009919406e-07, + "loss": 0.8475, + "step": 223450 + }, + { + "epoch": 17.316440001549847, + "grad_norm": 1.5279997910951861, + "learning_rate": 8.6585554866708e-07, + "loss": 0.8337, + "step": 223460 + }, + { + "epoch": 17.317214925026153, + "grad_norm": 1.757855704982786, + "learning_rate": 8.658942963422195e-07, + "loss": 0.8503, + "step": 223470 + }, + { + "epoch": 17.31798984850246, + "grad_norm": 1.709872531779401, + "learning_rate": 8.659330440173589e-07, + "loss": 0.8358, + "step": 223480 + }, + { + "epoch": 17.318764771978767, + "grad_norm": 1.635273411385255, + "learning_rate": 8.659717916924986e-07, + "loss": 0.845, + "step": 223490 + }, + { + "epoch": 17.319539695455074, + "grad_norm": 1.6090989449551532, + "learning_rate": 8.66010539367638e-07, + "loss": 0.8454, + "step": 223500 + }, + { + "epoch": 17.319539695455074, + "eval_loss": 0.8895756602287292, + "eval_runtime": 331.4333, + "eval_samples_per_second": 34.61, + "eval_steps_per_second": 8.653, + "step": 223500 + }, + { + "epoch": 17.32031461893138, + "grad_norm": 1.6805840694334764, + "learning_rate": 8.660492870427775e-07, + "loss": 0.8528, + "step": 223510 + }, + { + "epoch": 17.321089542407687, + "grad_norm": 1.6801654959098953, + "learning_rate": 8.660880347179169e-07, + "loss": 0.8343, + "step": 223520 + }, + { + "epoch": 17.321864465883994, + "grad_norm": 1.6453405500148581, + "learning_rate": 8.661267823930565e-07, + "loss": 0.8492, + "step": 223530 + }, + { + "epoch": 17.3226393893603, + "grad_norm": 1.7076017709461866, + "learning_rate": 8.66165530068196e-07, + "loss": 0.8334, + "step": 223540 + }, + { + "epoch": 17.323414312836608, + "grad_norm": 1.6673012908400673, + "learning_rate": 8.662042777433355e-07, + "loss": 0.8339, + "step": 223550 + }, + { + "epoch": 17.324189236312915, + "grad_norm": 1.8129737109774633, + "learning_rate": 8.662430254184749e-07, + "loss": 0.8654, + "step": 223560 + }, + { + "epoch": 17.32496415978922, + "grad_norm": 1.635771446599036, + "learning_rate": 8.662817730936144e-07, + "loss": 0.8466, + "step": 223570 + }, + { + "epoch": 17.32573908326553, + "grad_norm": 1.70307697557474, + "learning_rate": 8.663205207687538e-07, + "loss": 0.8553, + "step": 223580 + }, + { + "epoch": 17.326514006741835, + "grad_norm": 1.5811177882298915, + "learning_rate": 8.663592684438935e-07, + "loss": 0.8457, + "step": 223590 + }, + { + "epoch": 17.327288930218142, + "grad_norm": 1.7282181479772585, + "learning_rate": 8.663980161190329e-07, + "loss": 0.8425, + "step": 223600 + }, + { + "epoch": 17.32806385369445, + "grad_norm": 1.6719050170421192, + "learning_rate": 8.664367637941724e-07, + "loss": 0.8544, + "step": 223610 + }, + { + "epoch": 17.328838777170755, + "grad_norm": 1.5867776557155713, + "learning_rate": 8.664755114693118e-07, + "loss": 0.8526, + "step": 223620 + }, + { + "epoch": 17.329613700647062, + "grad_norm": 1.7061155171984366, + "learning_rate": 8.665142591444514e-07, + "loss": 0.8554, + "step": 223630 + }, + { + "epoch": 17.33038862412337, + "grad_norm": 1.6191124238641388, + "learning_rate": 8.665530068195909e-07, + "loss": 0.842, + "step": 223640 + }, + { + "epoch": 17.331163547599676, + "grad_norm": 1.5595818803925263, + "learning_rate": 8.665917544947304e-07, + "loss": 0.8441, + "step": 223650 + }, + { + "epoch": 17.331938471075983, + "grad_norm": 1.59411570711331, + "learning_rate": 8.666305021698698e-07, + "loss": 0.8339, + "step": 223660 + }, + { + "epoch": 17.33271339455229, + "grad_norm": 1.5133159745322853, + "learning_rate": 8.666692498450094e-07, + "loss": 0.8203, + "step": 223670 + }, + { + "epoch": 17.333488318028596, + "grad_norm": 1.7304818481008033, + "learning_rate": 8.667079975201488e-07, + "loss": 0.8506, + "step": 223680 + }, + { + "epoch": 17.3342632415049, + "grad_norm": 1.6323148147474162, + "learning_rate": 8.667467451952884e-07, + "loss": 0.8501, + "step": 223690 + }, + { + "epoch": 17.335038164981206, + "grad_norm": 1.5705214176628155, + "learning_rate": 8.667854928704278e-07, + "loss": 0.8497, + "step": 223700 + }, + { + "epoch": 17.335813088457513, + "grad_norm": 1.7433399956147646, + "learning_rate": 8.668242405455673e-07, + "loss": 0.8452, + "step": 223710 + }, + { + "epoch": 17.33658801193382, + "grad_norm": 1.7532779692106655, + "learning_rate": 8.668629882207067e-07, + "loss": 0.8562, + "step": 223720 + }, + { + "epoch": 17.337362935410127, + "grad_norm": 1.7272037049343238, + "learning_rate": 8.669017358958464e-07, + "loss": 0.8362, + "step": 223730 + }, + { + "epoch": 17.338137858886434, + "grad_norm": 1.7530925019946768, + "learning_rate": 8.669404835709858e-07, + "loss": 0.8526, + "step": 223740 + }, + { + "epoch": 17.33891278236274, + "grad_norm": 1.6929744785199805, + "learning_rate": 8.669792312461253e-07, + "loss": 0.8349, + "step": 223750 + }, + { + "epoch": 17.339687705839047, + "grad_norm": 1.683643879108877, + "learning_rate": 8.670179789212647e-07, + "loss": 0.8523, + "step": 223760 + }, + { + "epoch": 17.340462629315354, + "grad_norm": 1.5899266336306235, + "learning_rate": 8.670567265964043e-07, + "loss": 0.8439, + "step": 223770 + }, + { + "epoch": 17.34123755279166, + "grad_norm": 1.7177345306492577, + "learning_rate": 8.670954742715437e-07, + "loss": 0.8639, + "step": 223780 + }, + { + "epoch": 17.342012476267968, + "grad_norm": 1.6724357467391866, + "learning_rate": 8.671342219466833e-07, + "loss": 0.8542, + "step": 223790 + }, + { + "epoch": 17.342787399744275, + "grad_norm": 1.639844082949553, + "learning_rate": 8.671729696218227e-07, + "loss": 0.8478, + "step": 223800 + }, + { + "epoch": 17.34356232322058, + "grad_norm": 1.708840120282928, + "learning_rate": 8.672117172969623e-07, + "loss": 0.8608, + "step": 223810 + }, + { + "epoch": 17.344337246696888, + "grad_norm": 1.7344041523065352, + "learning_rate": 8.672504649721017e-07, + "loss": 0.8485, + "step": 223820 + }, + { + "epoch": 17.345112170173195, + "grad_norm": 1.6576751043001887, + "learning_rate": 8.672892126472413e-07, + "loss": 0.8392, + "step": 223830 + }, + { + "epoch": 17.3458870936495, + "grad_norm": 1.589643045913441, + "learning_rate": 8.673279603223807e-07, + "loss": 0.8664, + "step": 223840 + }, + { + "epoch": 17.34666201712581, + "grad_norm": 1.782002935241426, + "learning_rate": 8.673667079975202e-07, + "loss": 0.8393, + "step": 223850 + }, + { + "epoch": 17.347436940602115, + "grad_norm": 1.8247326053070925, + "learning_rate": 8.674054556726596e-07, + "loss": 0.8434, + "step": 223860 + }, + { + "epoch": 17.348211864078422, + "grad_norm": 1.6727885607696467, + "learning_rate": 8.674442033477992e-07, + "loss": 0.8551, + "step": 223870 + }, + { + "epoch": 17.34898678755473, + "grad_norm": 1.5405624121824482, + "learning_rate": 8.674829510229387e-07, + "loss": 0.861, + "step": 223880 + }, + { + "epoch": 17.349761711031036, + "grad_norm": 1.572174729379926, + "learning_rate": 8.675216986980782e-07, + "loss": 0.8681, + "step": 223890 + }, + { + "epoch": 17.350536634507343, + "grad_norm": 1.7536280035441068, + "learning_rate": 8.675604463732176e-07, + "loss": 0.8481, + "step": 223900 + }, + { + "epoch": 17.35131155798365, + "grad_norm": 1.7427694158976237, + "learning_rate": 8.675991940483572e-07, + "loss": 0.8411, + "step": 223910 + }, + { + "epoch": 17.352086481459956, + "grad_norm": 1.656216605829334, + "learning_rate": 8.676379417234966e-07, + "loss": 0.8304, + "step": 223920 + }, + { + "epoch": 17.352861404936263, + "grad_norm": 1.7585429975849765, + "learning_rate": 8.676766893986362e-07, + "loss": 0.8528, + "step": 223930 + }, + { + "epoch": 17.35363632841257, + "grad_norm": 1.6488169353096633, + "learning_rate": 8.677154370737756e-07, + "loss": 0.8513, + "step": 223940 + }, + { + "epoch": 17.354411251888877, + "grad_norm": 1.6142815623242193, + "learning_rate": 8.677541847489152e-07, + "loss": 0.8517, + "step": 223950 + }, + { + "epoch": 17.355186175365183, + "grad_norm": 1.8236089798200763, + "learning_rate": 8.677929324240546e-07, + "loss": 0.8478, + "step": 223960 + }, + { + "epoch": 17.35596109884149, + "grad_norm": 1.616093248841766, + "learning_rate": 8.678316800991941e-07, + "loss": 0.8331, + "step": 223970 + }, + { + "epoch": 17.356736022317797, + "grad_norm": 1.6225100711007832, + "learning_rate": 8.678704277743336e-07, + "loss": 0.845, + "step": 223980 + }, + { + "epoch": 17.357510945794104, + "grad_norm": 1.7688716817385441, + "learning_rate": 8.679091754494731e-07, + "loss": 0.8331, + "step": 223990 + }, + { + "epoch": 17.35828586927041, + "grad_norm": 1.6776090878259577, + "learning_rate": 8.679479231246125e-07, + "loss": 0.8476, + "step": 224000 + }, + { + "epoch": 17.35828586927041, + "eval_loss": 0.8897639513015747, + "eval_runtime": 332.1212, + "eval_samples_per_second": 34.539, + "eval_steps_per_second": 8.635, + "step": 224000 + }, + { + "epoch": 17.359060792746718, + "grad_norm": 1.5598008080596977, + "learning_rate": 8.679866707997521e-07, + "loss": 0.8492, + "step": 224010 + }, + { + "epoch": 17.359835716223024, + "grad_norm": 1.7574899496801908, + "learning_rate": 8.680254184748915e-07, + "loss": 0.8463, + "step": 224020 + }, + { + "epoch": 17.36061063969933, + "grad_norm": 1.7594018155517293, + "learning_rate": 8.680641661500311e-07, + "loss": 0.8385, + "step": 224030 + }, + { + "epoch": 17.361385563175638, + "grad_norm": 1.73634059835606, + "learning_rate": 8.681029138251705e-07, + "loss": 0.851, + "step": 224040 + }, + { + "epoch": 17.362160486651945, + "grad_norm": 1.6373808058908315, + "learning_rate": 8.681416615003101e-07, + "loss": 0.853, + "step": 224050 + }, + { + "epoch": 17.36293541012825, + "grad_norm": 1.6009280324213992, + "learning_rate": 8.681804091754495e-07, + "loss": 0.8316, + "step": 224060 + }, + { + "epoch": 17.363710333604555, + "grad_norm": 1.7725698866612136, + "learning_rate": 8.68219156850589e-07, + "loss": 0.8525, + "step": 224070 + }, + { + "epoch": 17.36448525708086, + "grad_norm": 1.740063518390895, + "learning_rate": 8.682579045257285e-07, + "loss": 0.853, + "step": 224080 + }, + { + "epoch": 17.36526018055717, + "grad_norm": 1.645706161868523, + "learning_rate": 8.682966522008681e-07, + "loss": 0.8623, + "step": 224090 + }, + { + "epoch": 17.366035104033475, + "grad_norm": 1.7038504639168206, + "learning_rate": 8.683353998760075e-07, + "loss": 0.8475, + "step": 224100 + }, + { + "epoch": 17.366810027509782, + "grad_norm": 1.6351153331615496, + "learning_rate": 8.68374147551147e-07, + "loss": 0.8579, + "step": 224110 + }, + { + "epoch": 17.36758495098609, + "grad_norm": 1.6671856257774649, + "learning_rate": 8.684128952262864e-07, + "loss": 0.8243, + "step": 224120 + }, + { + "epoch": 17.368359874462396, + "grad_norm": 1.687345424224812, + "learning_rate": 8.68451642901426e-07, + "loss": 0.8296, + "step": 224130 + }, + { + "epoch": 17.369134797938703, + "grad_norm": 1.7737791238853784, + "learning_rate": 8.684903905765654e-07, + "loss": 0.8389, + "step": 224140 + }, + { + "epoch": 17.36990972141501, + "grad_norm": 1.6470835829038875, + "learning_rate": 8.68529138251705e-07, + "loss": 0.858, + "step": 224150 + }, + { + "epoch": 17.370684644891316, + "grad_norm": 1.6166735837959851, + "learning_rate": 8.685678859268444e-07, + "loss": 0.8489, + "step": 224160 + }, + { + "epoch": 17.371459568367623, + "grad_norm": 1.848718632923758, + "learning_rate": 8.686066336019839e-07, + "loss": 0.8409, + "step": 224170 + }, + { + "epoch": 17.37223449184393, + "grad_norm": 1.6377364076460548, + "learning_rate": 8.686453812771234e-07, + "loss": 0.8489, + "step": 224180 + }, + { + "epoch": 17.373009415320237, + "grad_norm": 1.605138439558711, + "learning_rate": 8.68684128952263e-07, + "loss": 0.8512, + "step": 224190 + }, + { + "epoch": 17.373784338796543, + "grad_norm": 1.6368528506607523, + "learning_rate": 8.687228766274024e-07, + "loss": 0.8453, + "step": 224200 + }, + { + "epoch": 17.37455926227285, + "grad_norm": 1.5926399527911275, + "learning_rate": 8.687616243025419e-07, + "loss": 0.8401, + "step": 224210 + }, + { + "epoch": 17.375334185749157, + "grad_norm": 1.719392625837788, + "learning_rate": 8.688003719776813e-07, + "loss": 0.866, + "step": 224220 + }, + { + "epoch": 17.376109109225464, + "grad_norm": 1.6988721986872948, + "learning_rate": 8.68839119652821e-07, + "loss": 0.8537, + "step": 224230 + }, + { + "epoch": 17.37688403270177, + "grad_norm": 1.602091718513227, + "learning_rate": 8.688778673279604e-07, + "loss": 0.8437, + "step": 224240 + }, + { + "epoch": 17.377658956178077, + "grad_norm": 1.5737561996055414, + "learning_rate": 8.689166150030999e-07, + "loss": 0.8531, + "step": 224250 + }, + { + "epoch": 17.378433879654384, + "grad_norm": 1.5859419579650018, + "learning_rate": 8.689553626782393e-07, + "loss": 0.8452, + "step": 224260 + }, + { + "epoch": 17.37920880313069, + "grad_norm": 1.6941344681853512, + "learning_rate": 8.689941103533788e-07, + "loss": 0.8544, + "step": 224270 + }, + { + "epoch": 17.379983726606998, + "grad_norm": 1.7557868106983059, + "learning_rate": 8.690328580285183e-07, + "loss": 0.8586, + "step": 224280 + }, + { + "epoch": 17.380758650083305, + "grad_norm": 1.6095775331278082, + "learning_rate": 8.690716057036579e-07, + "loss": 0.8564, + "step": 224290 + }, + { + "epoch": 17.38153357355961, + "grad_norm": 1.5454194860981136, + "learning_rate": 8.691103533787973e-07, + "loss": 0.8451, + "step": 224300 + }, + { + "epoch": 17.38230849703592, + "grad_norm": 1.7041139002763495, + "learning_rate": 8.691491010539368e-07, + "loss": 0.8296, + "step": 224310 + }, + { + "epoch": 17.383083420512225, + "grad_norm": 1.6738883690504873, + "learning_rate": 8.691878487290762e-07, + "loss": 0.8333, + "step": 224320 + }, + { + "epoch": 17.383858343988532, + "grad_norm": 1.7332653138033423, + "learning_rate": 8.692265964042159e-07, + "loss": 0.8632, + "step": 224330 + }, + { + "epoch": 17.38463326746484, + "grad_norm": 1.568619894161251, + "learning_rate": 8.692653440793553e-07, + "loss": 0.8558, + "step": 224340 + }, + { + "epoch": 17.385408190941146, + "grad_norm": 1.5685689656138166, + "learning_rate": 8.693040917544948e-07, + "loss": 0.8365, + "step": 224350 + }, + { + "epoch": 17.386183114417452, + "grad_norm": 1.6407706967447866, + "learning_rate": 8.693428394296342e-07, + "loss": 0.8664, + "step": 224360 + }, + { + "epoch": 17.38695803789376, + "grad_norm": 1.5679360194355036, + "learning_rate": 8.693815871047739e-07, + "loss": 0.8553, + "step": 224370 + }, + { + "epoch": 17.387732961370066, + "grad_norm": 1.5348631622086943, + "learning_rate": 8.694203347799133e-07, + "loss": 0.8402, + "step": 224380 + }, + { + "epoch": 17.388507884846373, + "grad_norm": 1.7783616695846602, + "learning_rate": 8.694590824550528e-07, + "loss": 0.8678, + "step": 224390 + }, + { + "epoch": 17.38928280832268, + "grad_norm": 1.615267159996828, + "learning_rate": 8.694978301301922e-07, + "loss": 0.8433, + "step": 224400 + }, + { + "epoch": 17.390057731798986, + "grad_norm": 1.5962672040080692, + "learning_rate": 8.695365778053317e-07, + "loss": 0.8236, + "step": 224410 + }, + { + "epoch": 17.390832655275293, + "grad_norm": 1.604469509697241, + "learning_rate": 8.695753254804711e-07, + "loss": 0.851, + "step": 224420 + }, + { + "epoch": 17.3916075787516, + "grad_norm": 1.5551302579120574, + "learning_rate": 8.696140731556108e-07, + "loss": 0.85, + "step": 224430 + }, + { + "epoch": 17.392382502227903, + "grad_norm": 1.7427010769200608, + "learning_rate": 8.696528208307502e-07, + "loss": 0.8512, + "step": 224440 + }, + { + "epoch": 17.39315742570421, + "grad_norm": 2.1275125164966675, + "learning_rate": 8.696915685058897e-07, + "loss": 0.8312, + "step": 224450 + }, + { + "epoch": 17.393932349180517, + "grad_norm": 1.5889185289664152, + "learning_rate": 8.697303161810291e-07, + "loss": 0.8518, + "step": 224460 + }, + { + "epoch": 17.394707272656824, + "grad_norm": 1.7478280310160983, + "learning_rate": 8.697690638561688e-07, + "loss": 0.8665, + "step": 224470 + }, + { + "epoch": 17.39548219613313, + "grad_norm": 1.7287618086609866, + "learning_rate": 8.698078115313082e-07, + "loss": 0.8439, + "step": 224480 + }, + { + "epoch": 17.396257119609437, + "grad_norm": 1.7045107295162876, + "learning_rate": 8.698465592064477e-07, + "loss": 0.8611, + "step": 224490 + }, + { + "epoch": 17.397032043085744, + "grad_norm": 1.7459625670957826, + "learning_rate": 8.698853068815871e-07, + "loss": 0.8486, + "step": 224500 + }, + { + "epoch": 17.397032043085744, + "eval_loss": 0.889834463596344, + "eval_runtime": 325.8021, + "eval_samples_per_second": 35.208, + "eval_steps_per_second": 8.803, + "step": 224500 + }, + { + "epoch": 17.39780696656205, + "grad_norm": 1.6805940558043682, + "learning_rate": 8.699240545567266e-07, + "loss": 0.8683, + "step": 224510 + }, + { + "epoch": 17.398581890038358, + "grad_norm": 1.6171141632921509, + "learning_rate": 8.699628022318662e-07, + "loss": 0.8495, + "step": 224520 + }, + { + "epoch": 17.399356813514665, + "grad_norm": 1.562360178907797, + "learning_rate": 8.700015499070057e-07, + "loss": 0.833, + "step": 224530 + }, + { + "epoch": 17.40013173699097, + "grad_norm": 1.6675768160796465, + "learning_rate": 8.700402975821451e-07, + "loss": 0.8389, + "step": 224540 + }, + { + "epoch": 17.400906660467278, + "grad_norm": 1.592629767018269, + "learning_rate": 8.700790452572846e-07, + "loss": 0.8579, + "step": 224550 + }, + { + "epoch": 17.401681583943585, + "grad_norm": 1.641593181566823, + "learning_rate": 8.70117792932424e-07, + "loss": 0.8345, + "step": 224560 + }, + { + "epoch": 17.402456507419892, + "grad_norm": 1.6134676814137519, + "learning_rate": 8.701565406075637e-07, + "loss": 0.8577, + "step": 224570 + }, + { + "epoch": 17.4032314308962, + "grad_norm": 1.5235231222117995, + "learning_rate": 8.701952882827031e-07, + "loss": 0.8489, + "step": 224580 + }, + { + "epoch": 17.404006354372505, + "grad_norm": 1.591902617252256, + "learning_rate": 8.702340359578426e-07, + "loss": 0.8378, + "step": 224590 + }, + { + "epoch": 17.404781277848812, + "grad_norm": 1.6547891291722456, + "learning_rate": 8.70272783632982e-07, + "loss": 0.8476, + "step": 224600 + }, + { + "epoch": 17.40555620132512, + "grad_norm": 1.8139869378233118, + "learning_rate": 8.703115313081216e-07, + "loss": 0.8412, + "step": 224610 + }, + { + "epoch": 17.406331124801426, + "grad_norm": 1.6454549022582017, + "learning_rate": 8.703502789832611e-07, + "loss": 0.8332, + "step": 224620 + }, + { + "epoch": 17.407106048277733, + "grad_norm": 1.6244400908271193, + "learning_rate": 8.703890266584006e-07, + "loss": 0.8349, + "step": 224630 + }, + { + "epoch": 17.40788097175404, + "grad_norm": 1.6312825960448556, + "learning_rate": 8.7042777433354e-07, + "loss": 0.8616, + "step": 224640 + }, + { + "epoch": 17.408655895230346, + "grad_norm": 1.529303571455956, + "learning_rate": 8.704665220086795e-07, + "loss": 0.8414, + "step": 224650 + }, + { + "epoch": 17.409430818706653, + "grad_norm": 1.6257458885700538, + "learning_rate": 8.70505269683819e-07, + "loss": 0.8456, + "step": 224660 + }, + { + "epoch": 17.41020574218296, + "grad_norm": 1.6603182689815807, + "learning_rate": 8.705440173589586e-07, + "loss": 0.8471, + "step": 224670 + }, + { + "epoch": 17.410980665659267, + "grad_norm": 1.6186009791279745, + "learning_rate": 8.70582765034098e-07, + "loss": 0.8545, + "step": 224680 + }, + { + "epoch": 17.411755589135574, + "grad_norm": 1.6543907588790878, + "learning_rate": 8.706215127092375e-07, + "loss": 0.8455, + "step": 224690 + }, + { + "epoch": 17.41253051261188, + "grad_norm": 1.6887169490681624, + "learning_rate": 8.706602603843769e-07, + "loss": 0.841, + "step": 224700 + }, + { + "epoch": 17.413305436088187, + "grad_norm": 1.6393382028227097, + "learning_rate": 8.706990080595165e-07, + "loss": 0.8621, + "step": 224710 + }, + { + "epoch": 17.414080359564494, + "grad_norm": 1.6556784262214623, + "learning_rate": 8.70737755734656e-07, + "loss": 0.8543, + "step": 224720 + }, + { + "epoch": 17.4148552830408, + "grad_norm": 1.652146313450646, + "learning_rate": 8.707765034097955e-07, + "loss": 0.8507, + "step": 224730 + }, + { + "epoch": 17.415630206517108, + "grad_norm": 1.6505843859780942, + "learning_rate": 8.708152510849349e-07, + "loss": 0.848, + "step": 224740 + }, + { + "epoch": 17.416405129993414, + "grad_norm": 1.6435466659401803, + "learning_rate": 8.708539987600745e-07, + "loss": 0.8416, + "step": 224750 + }, + { + "epoch": 17.41718005346972, + "grad_norm": 1.6978296652167237, + "learning_rate": 8.708927464352139e-07, + "loss": 0.8447, + "step": 224760 + }, + { + "epoch": 17.417954976946028, + "grad_norm": 1.7144552425970296, + "learning_rate": 8.709314941103535e-07, + "loss": 0.8407, + "step": 224770 + }, + { + "epoch": 17.418729900422335, + "grad_norm": 1.7472740501064963, + "learning_rate": 8.709702417854929e-07, + "loss": 0.8635, + "step": 224780 + }, + { + "epoch": 17.41950482389864, + "grad_norm": 1.5261288101949024, + "learning_rate": 8.710089894606324e-07, + "loss": 0.842, + "step": 224790 + }, + { + "epoch": 17.42027974737495, + "grad_norm": 1.7122647110831317, + "learning_rate": 8.710477371357719e-07, + "loss": 0.8462, + "step": 224800 + }, + { + "epoch": 17.42105467085125, + "grad_norm": 1.7005532165216508, + "learning_rate": 8.710864848109114e-07, + "loss": 0.8515, + "step": 224810 + }, + { + "epoch": 17.42182959432756, + "grad_norm": 1.61904181380234, + "learning_rate": 8.711252324860509e-07, + "loss": 0.8711, + "step": 224820 + }, + { + "epoch": 17.422604517803865, + "grad_norm": 1.7034616836714371, + "learning_rate": 8.711639801611904e-07, + "loss": 0.8569, + "step": 224830 + }, + { + "epoch": 17.423379441280172, + "grad_norm": 1.6338166878877496, + "learning_rate": 8.712027278363298e-07, + "loss": 0.8316, + "step": 224840 + }, + { + "epoch": 17.42415436475648, + "grad_norm": 1.5693320150521373, + "learning_rate": 8.712414755114694e-07, + "loss": 0.8438, + "step": 224850 + }, + { + "epoch": 17.424929288232786, + "grad_norm": 1.6455565880866498, + "learning_rate": 8.712802231866088e-07, + "loss": 0.8731, + "step": 224860 + }, + { + "epoch": 17.425704211709093, + "grad_norm": 1.6825400423096761, + "learning_rate": 8.713189708617484e-07, + "loss": 0.8305, + "step": 224870 + }, + { + "epoch": 17.4264791351854, + "grad_norm": 1.5658461741167187, + "learning_rate": 8.713577185368878e-07, + "loss": 0.8411, + "step": 224880 + }, + { + "epoch": 17.427254058661706, + "grad_norm": 1.606367567769249, + "learning_rate": 8.713964662120274e-07, + "loss": 0.8294, + "step": 224890 + }, + { + "epoch": 17.428028982138013, + "grad_norm": 1.7640192123737666, + "learning_rate": 8.714352138871668e-07, + "loss": 0.8538, + "step": 224900 + }, + { + "epoch": 17.42880390561432, + "grad_norm": 1.7416978406698633, + "learning_rate": 8.714739615623063e-07, + "loss": 0.8607, + "step": 224910 + }, + { + "epoch": 17.429578829090627, + "grad_norm": 1.5185723717722381, + "learning_rate": 8.715127092374458e-07, + "loss": 0.8367, + "step": 224920 + }, + { + "epoch": 17.430353752566933, + "grad_norm": 1.6516196404034666, + "learning_rate": 8.715514569125853e-07, + "loss": 0.8412, + "step": 224930 + }, + { + "epoch": 17.43112867604324, + "grad_norm": 1.7323930483014425, + "learning_rate": 8.715902045877248e-07, + "loss": 0.8408, + "step": 224940 + }, + { + "epoch": 17.431903599519547, + "grad_norm": 1.5033323969927253, + "learning_rate": 8.716289522628643e-07, + "loss": 0.8414, + "step": 224950 + }, + { + "epoch": 17.432678522995854, + "grad_norm": 1.6672383384376332, + "learning_rate": 8.716676999380037e-07, + "loss": 0.8505, + "step": 224960 + }, + { + "epoch": 17.43345344647216, + "grad_norm": 1.6521271553126893, + "learning_rate": 8.717064476131433e-07, + "loss": 0.856, + "step": 224970 + }, + { + "epoch": 17.434228369948467, + "grad_norm": 1.7042802102069585, + "learning_rate": 8.717451952882827e-07, + "loss": 0.853, + "step": 224980 + }, + { + "epoch": 17.435003293424774, + "grad_norm": 1.5914783479243835, + "learning_rate": 8.717839429634223e-07, + "loss": 0.8492, + "step": 224990 + }, + { + "epoch": 17.43577821690108, + "grad_norm": 1.5507412706202996, + "learning_rate": 8.718226906385617e-07, + "loss": 0.8696, + "step": 225000 + }, + { + "epoch": 17.43577821690108, + "eval_loss": 0.8897844552993774, + "eval_runtime": 331.3176, + "eval_samples_per_second": 34.622, + "eval_steps_per_second": 8.656, + "step": 225000 + }, + { + "epoch": 17.436553140377388, + "grad_norm": 1.694258055295411, + "learning_rate": 8.718614383137012e-07, + "loss": 0.8474, + "step": 225010 + }, + { + "epoch": 17.437328063853695, + "grad_norm": 1.5975100456625577, + "learning_rate": 8.719001859888407e-07, + "loss": 0.8255, + "step": 225020 + }, + { + "epoch": 17.43810298733, + "grad_norm": 1.692195085881148, + "learning_rate": 8.719389336639803e-07, + "loss": 0.8498, + "step": 225030 + }, + { + "epoch": 17.43887791080631, + "grad_norm": 1.712812026661728, + "learning_rate": 8.719776813391197e-07, + "loss": 0.8479, + "step": 225040 + }, + { + "epoch": 17.439652834282615, + "grad_norm": 1.6911737916047769, + "learning_rate": 8.720164290142592e-07, + "loss": 0.8497, + "step": 225050 + }, + { + "epoch": 17.440427757758922, + "grad_norm": 1.6559194699932063, + "learning_rate": 8.720551766893986e-07, + "loss": 0.8548, + "step": 225060 + }, + { + "epoch": 17.44120268123523, + "grad_norm": 1.541786821933045, + "learning_rate": 8.720939243645382e-07, + "loss": 0.8343, + "step": 225070 + }, + { + "epoch": 17.441977604711536, + "grad_norm": 1.6407940052930707, + "learning_rate": 8.721326720396776e-07, + "loss": 0.8395, + "step": 225080 + }, + { + "epoch": 17.442752528187842, + "grad_norm": 1.7153600742679356, + "learning_rate": 8.721714197148172e-07, + "loss": 0.8556, + "step": 225090 + }, + { + "epoch": 17.44352745166415, + "grad_norm": 1.6036169278364825, + "learning_rate": 8.722101673899566e-07, + "loss": 0.8581, + "step": 225100 + }, + { + "epoch": 17.444302375140456, + "grad_norm": 1.6240723073738645, + "learning_rate": 8.722489150650961e-07, + "loss": 0.8607, + "step": 225110 + }, + { + "epoch": 17.445077298616763, + "grad_norm": 1.5306677140226477, + "learning_rate": 8.722876627402356e-07, + "loss": 0.8387, + "step": 225120 + }, + { + "epoch": 17.44585222209307, + "grad_norm": 1.7040761186778148, + "learning_rate": 8.723264104153752e-07, + "loss": 0.8567, + "step": 225130 + }, + { + "epoch": 17.446627145569376, + "grad_norm": 1.7008489367881212, + "learning_rate": 8.723651580905146e-07, + "loss": 0.8204, + "step": 225140 + }, + { + "epoch": 17.447402069045683, + "grad_norm": 1.6052322056759072, + "learning_rate": 8.724039057656541e-07, + "loss": 0.8469, + "step": 225150 + }, + { + "epoch": 17.44817699252199, + "grad_norm": 1.5746989624026888, + "learning_rate": 8.724426534407935e-07, + "loss": 0.8426, + "step": 225160 + }, + { + "epoch": 17.448951915998297, + "grad_norm": 1.6209591179501892, + "learning_rate": 8.724814011159332e-07, + "loss": 0.8527, + "step": 225170 + }, + { + "epoch": 17.4497268394746, + "grad_norm": 1.5156272936615875, + "learning_rate": 8.725201487910726e-07, + "loss": 0.8237, + "step": 225180 + }, + { + "epoch": 17.450501762950907, + "grad_norm": 1.6126049474707396, + "learning_rate": 8.725588964662121e-07, + "loss": 0.8361, + "step": 225190 + }, + { + "epoch": 17.451276686427214, + "grad_norm": 1.6855118809985965, + "learning_rate": 8.725976441413515e-07, + "loss": 0.8499, + "step": 225200 + }, + { + "epoch": 17.45205160990352, + "grad_norm": 1.7042354072553823, + "learning_rate": 8.72636391816491e-07, + "loss": 0.8635, + "step": 225210 + }, + { + "epoch": 17.452826533379827, + "grad_norm": 1.63680667676488, + "learning_rate": 8.726751394916305e-07, + "loss": 0.8364, + "step": 225220 + }, + { + "epoch": 17.453601456856134, + "grad_norm": 1.6064928783671784, + "learning_rate": 8.727138871667701e-07, + "loss": 0.8504, + "step": 225230 + }, + { + "epoch": 17.45437638033244, + "grad_norm": 1.6053804566004586, + "learning_rate": 8.727526348419095e-07, + "loss": 0.8527, + "step": 225240 + }, + { + "epoch": 17.455151303808748, + "grad_norm": 1.5698283368868264, + "learning_rate": 8.72791382517049e-07, + "loss": 0.8477, + "step": 225250 + }, + { + "epoch": 17.455926227285055, + "grad_norm": 1.6590737772039956, + "learning_rate": 8.728301301921885e-07, + "loss": 0.8546, + "step": 225260 + }, + { + "epoch": 17.45670115076136, + "grad_norm": 1.6302100569136013, + "learning_rate": 8.728688778673281e-07, + "loss": 0.8273, + "step": 225270 + }, + { + "epoch": 17.45747607423767, + "grad_norm": 1.6521647197652274, + "learning_rate": 8.729076255424675e-07, + "loss": 0.8438, + "step": 225280 + }, + { + "epoch": 17.458250997713975, + "grad_norm": 1.5927849458156562, + "learning_rate": 8.72946373217607e-07, + "loss": 0.8407, + "step": 225290 + }, + { + "epoch": 17.459025921190282, + "grad_norm": 1.679159409442493, + "learning_rate": 8.729851208927464e-07, + "loss": 0.8455, + "step": 225300 + }, + { + "epoch": 17.45980084466659, + "grad_norm": 1.7222911516869706, + "learning_rate": 8.730238685678861e-07, + "loss": 0.857, + "step": 225310 + }, + { + "epoch": 17.460575768142895, + "grad_norm": 1.6803970729566204, + "learning_rate": 8.730626162430255e-07, + "loss": 0.8448, + "step": 225320 + }, + { + "epoch": 17.461350691619202, + "grad_norm": 1.6552140278492355, + "learning_rate": 8.73101363918165e-07, + "loss": 0.8517, + "step": 225330 + }, + { + "epoch": 17.46212561509551, + "grad_norm": 1.5902790573065562, + "learning_rate": 8.731401115933044e-07, + "loss": 0.8431, + "step": 225340 + }, + { + "epoch": 17.462900538571816, + "grad_norm": 1.5658308056214432, + "learning_rate": 8.731788592684439e-07, + "loss": 0.8331, + "step": 225350 + }, + { + "epoch": 17.463675462048123, + "grad_norm": 1.6191707637870445, + "learning_rate": 8.732176069435834e-07, + "loss": 0.8475, + "step": 225360 + }, + { + "epoch": 17.46445038552443, + "grad_norm": 1.5152976874060498, + "learning_rate": 8.73256354618723e-07, + "loss": 0.8448, + "step": 225370 + }, + { + "epoch": 17.465225309000736, + "grad_norm": 1.555938898924301, + "learning_rate": 8.732951022938624e-07, + "loss": 0.8419, + "step": 225380 + }, + { + "epoch": 17.466000232477043, + "grad_norm": 1.6241324126581629, + "learning_rate": 8.733338499690019e-07, + "loss": 0.8685, + "step": 225390 + }, + { + "epoch": 17.46677515595335, + "grad_norm": 1.7842693924967408, + "learning_rate": 8.733725976441413e-07, + "loss": 0.8501, + "step": 225400 + }, + { + "epoch": 17.467550079429657, + "grad_norm": 1.6045204208920256, + "learning_rate": 8.73411345319281e-07, + "loss": 0.8476, + "step": 225410 + }, + { + "epoch": 17.468325002905964, + "grad_norm": 1.6727402558454234, + "learning_rate": 8.734500929944204e-07, + "loss": 0.8475, + "step": 225420 + }, + { + "epoch": 17.46909992638227, + "grad_norm": 1.5999090409313148, + "learning_rate": 8.734888406695599e-07, + "loss": 0.8354, + "step": 225430 + }, + { + "epoch": 17.469874849858577, + "grad_norm": 1.5813047860688212, + "learning_rate": 8.735275883446993e-07, + "loss": 0.8564, + "step": 225440 + }, + { + "epoch": 17.470649773334884, + "grad_norm": 1.6273737329942861, + "learning_rate": 8.735663360198389e-07, + "loss": 0.8529, + "step": 225450 + }, + { + "epoch": 17.47142469681119, + "grad_norm": 1.674990471235353, + "learning_rate": 8.736050836949784e-07, + "loss": 0.8558, + "step": 225460 + }, + { + "epoch": 17.472199620287498, + "grad_norm": 2.169951944491587, + "learning_rate": 8.736438313701179e-07, + "loss": 0.8703, + "step": 225470 + }, + { + "epoch": 17.472974543763804, + "grad_norm": 1.6140112919145253, + "learning_rate": 8.736825790452573e-07, + "loss": 0.8305, + "step": 225480 + }, + { + "epoch": 17.47374946724011, + "grad_norm": 1.5449193499179887, + "learning_rate": 8.737213267203968e-07, + "loss": 0.8428, + "step": 225490 + }, + { + "epoch": 17.474524390716418, + "grad_norm": 1.780355947183946, + "learning_rate": 8.737600743955362e-07, + "loss": 0.8576, + "step": 225500 + }, + { + "epoch": 17.474524390716418, + "eval_loss": 0.8892439007759094, + "eval_runtime": 331.9057, + "eval_samples_per_second": 34.561, + "eval_steps_per_second": 8.641, + "step": 225500 + }, + { + "epoch": 17.475299314192725, + "grad_norm": 1.5594905943963981, + "learning_rate": 8.737988220706759e-07, + "loss": 0.8594, + "step": 225510 + }, + { + "epoch": 17.47607423766903, + "grad_norm": 1.6711748862304618, + "learning_rate": 8.738375697458153e-07, + "loss": 0.845, + "step": 225520 + }, + { + "epoch": 17.47684916114534, + "grad_norm": 1.6053947978977807, + "learning_rate": 8.738763174209548e-07, + "loss": 0.8469, + "step": 225530 + }, + { + "epoch": 17.477624084621645, + "grad_norm": 1.5984217466424402, + "learning_rate": 8.739150650960942e-07, + "loss": 0.8242, + "step": 225540 + }, + { + "epoch": 17.478399008097952, + "grad_norm": 1.6983747199775934, + "learning_rate": 8.739538127712338e-07, + "loss": 0.8473, + "step": 225550 + }, + { + "epoch": 17.479173931574255, + "grad_norm": 1.6190201278827225, + "learning_rate": 8.739925604463733e-07, + "loss": 0.8471, + "step": 225560 + }, + { + "epoch": 17.479948855050562, + "grad_norm": 1.5762295541770315, + "learning_rate": 8.740313081215128e-07, + "loss": 0.8471, + "step": 225570 + }, + { + "epoch": 17.48072377852687, + "grad_norm": 1.6061562599505395, + "learning_rate": 8.740700557966522e-07, + "loss": 0.8286, + "step": 225580 + }, + { + "epoch": 17.481498702003176, + "grad_norm": 1.6002771600789092, + "learning_rate": 8.741088034717918e-07, + "loss": 0.8292, + "step": 225590 + }, + { + "epoch": 17.482273625479483, + "grad_norm": 1.5423769914048835, + "learning_rate": 8.741475511469312e-07, + "loss": 0.8409, + "step": 225600 + }, + { + "epoch": 17.48304854895579, + "grad_norm": 1.7282526922101384, + "learning_rate": 8.741862988220708e-07, + "loss": 0.8469, + "step": 225610 + }, + { + "epoch": 17.483823472432096, + "grad_norm": 1.6771521713517301, + "learning_rate": 8.742250464972102e-07, + "loss": 0.8488, + "step": 225620 + }, + { + "epoch": 17.484598395908403, + "grad_norm": 1.7232344539944524, + "learning_rate": 8.742637941723497e-07, + "loss": 0.8632, + "step": 225630 + }, + { + "epoch": 17.48537331938471, + "grad_norm": 1.707818157361295, + "learning_rate": 8.743025418474891e-07, + "loss": 0.8643, + "step": 225640 + }, + { + "epoch": 17.486148242861017, + "grad_norm": 1.7297022486295688, + "learning_rate": 8.743412895226287e-07, + "loss": 0.8426, + "step": 225650 + }, + { + "epoch": 17.486923166337323, + "grad_norm": 1.7360139607621086, + "learning_rate": 8.743800371977682e-07, + "loss": 0.8779, + "step": 225660 + }, + { + "epoch": 17.48769808981363, + "grad_norm": 1.736463365663499, + "learning_rate": 8.744187848729077e-07, + "loss": 0.8496, + "step": 225670 + }, + { + "epoch": 17.488473013289937, + "grad_norm": 1.657293911482415, + "learning_rate": 8.744575325480471e-07, + "loss": 0.8494, + "step": 225680 + }, + { + "epoch": 17.489247936766244, + "grad_norm": 1.6359771792697153, + "learning_rate": 8.744962802231867e-07, + "loss": 0.8398, + "step": 225690 + }, + { + "epoch": 17.49002286024255, + "grad_norm": 1.6968524064276316, + "learning_rate": 8.745350278983261e-07, + "loss": 0.8559, + "step": 225700 + }, + { + "epoch": 17.490797783718858, + "grad_norm": 1.688523435172778, + "learning_rate": 8.745737755734657e-07, + "loss": 0.8421, + "step": 225710 + }, + { + "epoch": 17.491572707195164, + "grad_norm": 1.6076112910034586, + "learning_rate": 8.746125232486051e-07, + "loss": 0.8608, + "step": 225720 + }, + { + "epoch": 17.49234763067147, + "grad_norm": 1.5356128563765088, + "learning_rate": 8.746512709237447e-07, + "loss": 0.8343, + "step": 225730 + }, + { + "epoch": 17.493122554147778, + "grad_norm": 1.7716537731192923, + "learning_rate": 8.746900185988841e-07, + "loss": 0.8481, + "step": 225740 + }, + { + "epoch": 17.493897477624085, + "grad_norm": 1.7867738933505168, + "learning_rate": 8.747287662740237e-07, + "loss": 0.8363, + "step": 225750 + }, + { + "epoch": 17.49467240110039, + "grad_norm": 1.743264645941835, + "learning_rate": 8.747675139491631e-07, + "loss": 0.842, + "step": 225760 + }, + { + "epoch": 17.4954473245767, + "grad_norm": 1.5243516607472107, + "learning_rate": 8.748062616243026e-07, + "loss": 0.8619, + "step": 225770 + }, + { + "epoch": 17.496222248053005, + "grad_norm": 1.7776416466316425, + "learning_rate": 8.74845009299442e-07, + "loss": 0.8487, + "step": 225780 + }, + { + "epoch": 17.496997171529312, + "grad_norm": 1.7715954143362254, + "learning_rate": 8.748837569745816e-07, + "loss": 0.8436, + "step": 225790 + }, + { + "epoch": 17.49777209500562, + "grad_norm": 1.595310545507262, + "learning_rate": 8.74922504649721e-07, + "loss": 0.8636, + "step": 225800 + }, + { + "epoch": 17.498547018481926, + "grad_norm": 1.6155904108032735, + "learning_rate": 8.749612523248606e-07, + "loss": 0.8611, + "step": 225810 + }, + { + "epoch": 17.499321941958232, + "grad_norm": 1.7949651372629472, + "learning_rate": 8.75e-07, + "loss": 0.841, + "step": 225820 + }, + { + "epoch": 17.50009686543454, + "grad_norm": 1.8644020370996508, + "learning_rate": 8.750387476751396e-07, + "loss": 0.8504, + "step": 225830 + }, + { + "epoch": 17.500871788910846, + "grad_norm": 1.6030982660025255, + "learning_rate": 8.75077495350279e-07, + "loss": 0.8594, + "step": 225840 + }, + { + "epoch": 17.501646712387153, + "grad_norm": 1.6613072944742169, + "learning_rate": 8.751162430254186e-07, + "loss": 0.8543, + "step": 225850 + }, + { + "epoch": 17.50242163586346, + "grad_norm": 1.7554744160056774, + "learning_rate": 8.75154990700558e-07, + "loss": 0.8547, + "step": 225860 + }, + { + "epoch": 17.503196559339766, + "grad_norm": 1.6263445051924856, + "learning_rate": 8.751937383756975e-07, + "loss": 0.8488, + "step": 225870 + }, + { + "epoch": 17.503971482816073, + "grad_norm": 1.742222801650065, + "learning_rate": 8.75232486050837e-07, + "loss": 0.8704, + "step": 225880 + }, + { + "epoch": 17.50474640629238, + "grad_norm": 1.7532664935231306, + "learning_rate": 8.752712337259765e-07, + "loss": 0.8572, + "step": 225890 + }, + { + "epoch": 17.505521329768687, + "grad_norm": 1.5944713274369595, + "learning_rate": 8.75309981401116e-07, + "loss": 0.8479, + "step": 225900 + }, + { + "epoch": 17.506296253244994, + "grad_norm": 1.6472620417536674, + "learning_rate": 8.753487290762555e-07, + "loss": 0.8543, + "step": 225910 + }, + { + "epoch": 17.507071176721297, + "grad_norm": 1.558424738791647, + "learning_rate": 8.753874767513949e-07, + "loss": 0.8582, + "step": 225920 + }, + { + "epoch": 17.507846100197604, + "grad_norm": 1.613670523466858, + "learning_rate": 8.754262244265345e-07, + "loss": 0.8667, + "step": 225930 + }, + { + "epoch": 17.50862102367391, + "grad_norm": 1.5031325198906464, + "learning_rate": 8.754649721016739e-07, + "loss": 0.8556, + "step": 225940 + }, + { + "epoch": 17.509395947150217, + "grad_norm": 1.7193164287727527, + "learning_rate": 8.755037197768135e-07, + "loss": 0.8407, + "step": 225950 + }, + { + "epoch": 17.510170870626524, + "grad_norm": 1.5810202516016654, + "learning_rate": 8.755424674519529e-07, + "loss": 0.8321, + "step": 225960 + }, + { + "epoch": 17.51094579410283, + "grad_norm": 1.5792218886297584, + "learning_rate": 8.755812151270925e-07, + "loss": 0.8351, + "step": 225970 + }, + { + "epoch": 17.511720717579138, + "grad_norm": 1.5589518449209785, + "learning_rate": 8.756199628022319e-07, + "loss": 0.8519, + "step": 225980 + }, + { + "epoch": 17.512495641055445, + "grad_norm": 1.590667079222195, + "learning_rate": 8.756587104773714e-07, + "loss": 0.8602, + "step": 225990 + }, + { + "epoch": 17.51327056453175, + "grad_norm": 1.668425433681378, + "learning_rate": 8.756974581525109e-07, + "loss": 0.8616, + "step": 226000 + }, + { + "epoch": 17.51327056453175, + "eval_loss": 0.8893086910247803, + "eval_runtime": 330.9441, + "eval_samples_per_second": 34.661, + "eval_steps_per_second": 8.666, + "step": 226000 + }, + { + "epoch": 17.51404548800806, + "grad_norm": 1.5794092845935412, + "learning_rate": 8.757362058276504e-07, + "loss": 0.8459, + "step": 226010 + }, + { + "epoch": 17.514820411484365, + "grad_norm": 1.5593768316503631, + "learning_rate": 8.757749535027899e-07, + "loss": 0.8603, + "step": 226020 + }, + { + "epoch": 17.515595334960672, + "grad_norm": 1.5827219260621201, + "learning_rate": 8.758137011779294e-07, + "loss": 0.8549, + "step": 226030 + }, + { + "epoch": 17.51637025843698, + "grad_norm": 1.5913269361955489, + "learning_rate": 8.758524488530688e-07, + "loss": 0.8418, + "step": 226040 + }, + { + "epoch": 17.517145181913286, + "grad_norm": 1.6160922463653828, + "learning_rate": 8.758911965282084e-07, + "loss": 0.8369, + "step": 226050 + }, + { + "epoch": 17.517920105389592, + "grad_norm": 1.7661867585295101, + "learning_rate": 8.759299442033478e-07, + "loss": 0.8679, + "step": 226060 + }, + { + "epoch": 17.5186950288659, + "grad_norm": 1.547706704675642, + "learning_rate": 8.759686918784874e-07, + "loss": 0.8531, + "step": 226070 + }, + { + "epoch": 17.519469952342206, + "grad_norm": 1.697333623085481, + "learning_rate": 8.760074395536268e-07, + "loss": 0.8453, + "step": 226080 + }, + { + "epoch": 17.520244875818513, + "grad_norm": 1.6266770535176904, + "learning_rate": 8.760461872287663e-07, + "loss": 0.8432, + "step": 226090 + }, + { + "epoch": 17.52101979929482, + "grad_norm": 1.5569022012344733, + "learning_rate": 8.760849349039058e-07, + "loss": 0.838, + "step": 226100 + }, + { + "epoch": 17.521794722771126, + "grad_norm": 1.6915852802201157, + "learning_rate": 8.761236825790454e-07, + "loss": 0.8522, + "step": 226110 + }, + { + "epoch": 17.522569646247433, + "grad_norm": 1.6304318093210943, + "learning_rate": 8.761624302541848e-07, + "loss": 0.8389, + "step": 226120 + }, + { + "epoch": 17.52334456972374, + "grad_norm": 1.6635872658123698, + "learning_rate": 8.762011779293243e-07, + "loss": 0.8544, + "step": 226130 + }, + { + "epoch": 17.524119493200047, + "grad_norm": 1.6375275968420193, + "learning_rate": 8.762399256044637e-07, + "loss": 0.834, + "step": 226140 + }, + { + "epoch": 17.524894416676354, + "grad_norm": 1.7206445148558493, + "learning_rate": 8.762786732796033e-07, + "loss": 0.846, + "step": 226150 + }, + { + "epoch": 17.52566934015266, + "grad_norm": 1.7251645718954787, + "learning_rate": 8.763174209547428e-07, + "loss": 0.8567, + "step": 226160 + }, + { + "epoch": 17.526444263628967, + "grad_norm": 1.6560055747293536, + "learning_rate": 8.763561686298823e-07, + "loss": 0.8492, + "step": 226170 + }, + { + "epoch": 17.527219187105274, + "grad_norm": 1.6649789371516985, + "learning_rate": 8.763949163050217e-07, + "loss": 0.8564, + "step": 226180 + }, + { + "epoch": 17.52799411058158, + "grad_norm": 1.6986989939252146, + "learning_rate": 8.764336639801612e-07, + "loss": 0.8479, + "step": 226190 + }, + { + "epoch": 17.528769034057888, + "grad_norm": 1.725672644655192, + "learning_rate": 8.764724116553007e-07, + "loss": 0.8547, + "step": 226200 + }, + { + "epoch": 17.529543957534194, + "grad_norm": 1.6478391660484442, + "learning_rate": 8.765111593304403e-07, + "loss": 0.8412, + "step": 226210 + }, + { + "epoch": 17.5303188810105, + "grad_norm": 1.6025963312972957, + "learning_rate": 8.765499070055797e-07, + "loss": 0.8581, + "step": 226220 + }, + { + "epoch": 17.531093804486808, + "grad_norm": 1.7806677990782118, + "learning_rate": 8.765886546807192e-07, + "loss": 0.8632, + "step": 226230 + }, + { + "epoch": 17.531868727963115, + "grad_norm": 1.657242362298086, + "learning_rate": 8.766274023558586e-07, + "loss": 0.8557, + "step": 226240 + }, + { + "epoch": 17.53264365143942, + "grad_norm": 1.5830641837618706, + "learning_rate": 8.766661500309983e-07, + "loss": 0.848, + "step": 226250 + }, + { + "epoch": 17.53341857491573, + "grad_norm": 1.842562964342666, + "learning_rate": 8.767048977061377e-07, + "loss": 0.8521, + "step": 226260 + }, + { + "epoch": 17.534193498392035, + "grad_norm": 1.646860664904995, + "learning_rate": 8.767436453812772e-07, + "loss": 0.8592, + "step": 226270 + }, + { + "epoch": 17.534968421868342, + "grad_norm": 1.6099637723838636, + "learning_rate": 8.767823930564166e-07, + "loss": 0.8602, + "step": 226280 + }, + { + "epoch": 17.53574334534465, + "grad_norm": 1.488478070932253, + "learning_rate": 8.768211407315561e-07, + "loss": 0.8649, + "step": 226290 + }, + { + "epoch": 17.536518268820952, + "grad_norm": 1.6417986221956058, + "learning_rate": 8.768598884066957e-07, + "loss": 0.8312, + "step": 226300 + }, + { + "epoch": 17.53729319229726, + "grad_norm": 1.7008034004064125, + "learning_rate": 8.768986360818352e-07, + "loss": 0.8534, + "step": 226310 + }, + { + "epoch": 17.538068115773566, + "grad_norm": 1.4872652743001946, + "learning_rate": 8.769373837569746e-07, + "loss": 0.8496, + "step": 226320 + }, + { + "epoch": 17.538843039249873, + "grad_norm": 1.654229636837183, + "learning_rate": 8.769761314321141e-07, + "loss": 0.8488, + "step": 226330 + }, + { + "epoch": 17.53961796272618, + "grad_norm": 1.6289436461897056, + "learning_rate": 8.770148791072535e-07, + "loss": 0.8478, + "step": 226340 + }, + { + "epoch": 17.540392886202486, + "grad_norm": 1.6711685131582814, + "learning_rate": 8.770536267823932e-07, + "loss": 0.8529, + "step": 226350 + }, + { + "epoch": 17.541167809678793, + "grad_norm": 1.6163610276689293, + "learning_rate": 8.770923744575326e-07, + "loss": 0.8342, + "step": 226360 + }, + { + "epoch": 17.5419427331551, + "grad_norm": 1.5795875491901152, + "learning_rate": 8.771311221326721e-07, + "loss": 0.8661, + "step": 226370 + }, + { + "epoch": 17.542717656631407, + "grad_norm": 1.6034028160913778, + "learning_rate": 8.771698698078115e-07, + "loss": 0.8615, + "step": 226380 + }, + { + "epoch": 17.543492580107714, + "grad_norm": 1.5806816535773625, + "learning_rate": 8.772086174829512e-07, + "loss": 0.8714, + "step": 226390 + }, + { + "epoch": 17.54426750358402, + "grad_norm": 1.7299551812680969, + "learning_rate": 8.772473651580906e-07, + "loss": 0.8456, + "step": 226400 + }, + { + "epoch": 17.545042427060327, + "grad_norm": 1.7010548801807228, + "learning_rate": 8.772861128332301e-07, + "loss": 0.8441, + "step": 226410 + }, + { + "epoch": 17.545817350536634, + "grad_norm": 1.6059733697655703, + "learning_rate": 8.773248605083695e-07, + "loss": 0.8645, + "step": 226420 + }, + { + "epoch": 17.54659227401294, + "grad_norm": 1.7726037982902592, + "learning_rate": 8.77363608183509e-07, + "loss": 0.8414, + "step": 226430 + }, + { + "epoch": 17.547367197489248, + "grad_norm": 1.6725606586012118, + "learning_rate": 8.774023558586485e-07, + "loss": 0.836, + "step": 226440 + }, + { + "epoch": 17.548142120965554, + "grad_norm": 1.7688665033534892, + "learning_rate": 8.774411035337881e-07, + "loss": 0.8599, + "step": 226450 + }, + { + "epoch": 17.54891704444186, + "grad_norm": 1.659117633291169, + "learning_rate": 8.774798512089275e-07, + "loss": 0.8509, + "step": 226460 + }, + { + "epoch": 17.549691967918168, + "grad_norm": 1.6895024906236065, + "learning_rate": 8.77518598884067e-07, + "loss": 0.8432, + "step": 226470 + }, + { + "epoch": 17.550466891394475, + "grad_norm": 1.6668392591594534, + "learning_rate": 8.775573465592064e-07, + "loss": 0.8423, + "step": 226480 + }, + { + "epoch": 17.55124181487078, + "grad_norm": 1.7350183392239544, + "learning_rate": 8.775960942343461e-07, + "loss": 0.8432, + "step": 226490 + }, + { + "epoch": 17.55201673834709, + "grad_norm": 1.576919008401679, + "learning_rate": 8.776348419094855e-07, + "loss": 0.8414, + "step": 226500 + }, + { + "epoch": 17.55201673834709, + "eval_loss": 0.889241099357605, + "eval_runtime": 331.2602, + "eval_samples_per_second": 34.628, + "eval_steps_per_second": 8.658, + "step": 226500 + }, + { + "epoch": 17.552791661823395, + "grad_norm": 1.5994759914950003, + "learning_rate": 8.77673589584625e-07, + "loss": 0.8381, + "step": 226510 + }, + { + "epoch": 17.553566585299702, + "grad_norm": 1.8143494237610833, + "learning_rate": 8.777123372597644e-07, + "loss": 0.8488, + "step": 226520 + }, + { + "epoch": 17.55434150877601, + "grad_norm": 1.5232188936367204, + "learning_rate": 8.77751084934904e-07, + "loss": 0.8559, + "step": 226530 + }, + { + "epoch": 17.555116432252316, + "grad_norm": 1.6736235091407325, + "learning_rate": 8.777898326100435e-07, + "loss": 0.8544, + "step": 226540 + }, + { + "epoch": 17.555891355728622, + "grad_norm": 1.6984526538418094, + "learning_rate": 8.77828580285183e-07, + "loss": 0.8528, + "step": 226550 + }, + { + "epoch": 17.55666627920493, + "grad_norm": 1.5811825277417848, + "learning_rate": 8.778673279603224e-07, + "loss": 0.8375, + "step": 226560 + }, + { + "epoch": 17.557441202681236, + "grad_norm": 1.6942116046760753, + "learning_rate": 8.779060756354619e-07, + "loss": 0.8451, + "step": 226570 + }, + { + "epoch": 17.558216126157543, + "grad_norm": 1.728167427957996, + "learning_rate": 8.779448233106013e-07, + "loss": 0.8394, + "step": 226580 + }, + { + "epoch": 17.55899104963385, + "grad_norm": 1.738446726437668, + "learning_rate": 8.77983570985741e-07, + "loss": 0.8477, + "step": 226590 + }, + { + "epoch": 17.559765973110157, + "grad_norm": 1.6992086738967072, + "learning_rate": 8.780223186608804e-07, + "loss": 0.846, + "step": 226600 + }, + { + "epoch": 17.560540896586463, + "grad_norm": 1.7079212870233262, + "learning_rate": 8.780610663360199e-07, + "loss": 0.8514, + "step": 226610 + }, + { + "epoch": 17.56131582006277, + "grad_norm": 1.7246037676070265, + "learning_rate": 8.780998140111593e-07, + "loss": 0.8569, + "step": 226620 + }, + { + "epoch": 17.562090743539077, + "grad_norm": 1.552968084133509, + "learning_rate": 8.781385616862989e-07, + "loss": 0.8566, + "step": 226630 + }, + { + "epoch": 17.562865667015384, + "grad_norm": 1.618373851751955, + "learning_rate": 8.781773093614384e-07, + "loss": 0.8555, + "step": 226640 + }, + { + "epoch": 17.56364059049169, + "grad_norm": 1.6610814838927852, + "learning_rate": 8.782160570365779e-07, + "loss": 0.8499, + "step": 226650 + }, + { + "epoch": 17.564415513967994, + "grad_norm": 1.6659659222918932, + "learning_rate": 8.782548047117173e-07, + "loss": 0.8532, + "step": 226660 + }, + { + "epoch": 17.5651904374443, + "grad_norm": 1.6879856919974896, + "learning_rate": 8.782935523868569e-07, + "loss": 0.8498, + "step": 226670 + }, + { + "epoch": 17.565965360920607, + "grad_norm": 1.5674782558566331, + "learning_rate": 8.783323000619963e-07, + "loss": 0.8514, + "step": 226680 + }, + { + "epoch": 17.566740284396914, + "grad_norm": 1.572144865078225, + "learning_rate": 8.783710477371359e-07, + "loss": 0.857, + "step": 226690 + }, + { + "epoch": 17.56751520787322, + "grad_norm": 1.7327311226963076, + "learning_rate": 8.784097954122753e-07, + "loss": 0.841, + "step": 226700 + }, + { + "epoch": 17.568290131349528, + "grad_norm": 1.6099813451577472, + "learning_rate": 8.784485430874148e-07, + "loss": 0.8374, + "step": 226710 + }, + { + "epoch": 17.569065054825835, + "grad_norm": 1.5948910672294252, + "learning_rate": 8.784872907625542e-07, + "loss": 0.8361, + "step": 226720 + }, + { + "epoch": 17.56983997830214, + "grad_norm": 1.7323883657466308, + "learning_rate": 8.785260384376938e-07, + "loss": 0.8503, + "step": 226730 + }, + { + "epoch": 17.57061490177845, + "grad_norm": 1.6436781854233953, + "learning_rate": 8.785647861128333e-07, + "loss": 0.8431, + "step": 226740 + }, + { + "epoch": 17.571389825254755, + "grad_norm": 1.766850763453985, + "learning_rate": 8.786035337879728e-07, + "loss": 0.8385, + "step": 226750 + }, + { + "epoch": 17.572164748731062, + "grad_norm": 1.6786333695667395, + "learning_rate": 8.786422814631122e-07, + "loss": 0.8379, + "step": 226760 + }, + { + "epoch": 17.57293967220737, + "grad_norm": 1.601536054463861, + "learning_rate": 8.786810291382518e-07, + "loss": 0.8552, + "step": 226770 + }, + { + "epoch": 17.573714595683676, + "grad_norm": 1.6616713006151005, + "learning_rate": 8.787197768133912e-07, + "loss": 0.8422, + "step": 226780 + }, + { + "epoch": 17.574489519159982, + "grad_norm": 1.611385729745291, + "learning_rate": 8.787585244885308e-07, + "loss": 0.8489, + "step": 226790 + }, + { + "epoch": 17.57526444263629, + "grad_norm": 1.6570687224257867, + "learning_rate": 8.787972721636702e-07, + "loss": 0.8553, + "step": 226800 + }, + { + "epoch": 17.576039366112596, + "grad_norm": 1.7356488353169897, + "learning_rate": 8.788360198388098e-07, + "loss": 0.847, + "step": 226810 + }, + { + "epoch": 17.576814289588903, + "grad_norm": 1.634072663420703, + "learning_rate": 8.788747675139492e-07, + "loss": 0.8777, + "step": 226820 + }, + { + "epoch": 17.57758921306521, + "grad_norm": 1.6003401140715894, + "learning_rate": 8.789135151890887e-07, + "loss": 0.8591, + "step": 226830 + }, + { + "epoch": 17.578364136541516, + "grad_norm": 1.6744914766795065, + "learning_rate": 8.789522628642282e-07, + "loss": 0.8309, + "step": 226840 + }, + { + "epoch": 17.579139060017823, + "grad_norm": 1.5893452881252113, + "learning_rate": 8.789910105393677e-07, + "loss": 0.8528, + "step": 226850 + }, + { + "epoch": 17.57991398349413, + "grad_norm": 1.7471347111331215, + "learning_rate": 8.790297582145071e-07, + "loss": 0.8564, + "step": 226860 + }, + { + "epoch": 17.580688906970437, + "grad_norm": 1.7452831625891694, + "learning_rate": 8.790685058896467e-07, + "loss": 0.8575, + "step": 226870 + }, + { + "epoch": 17.581463830446744, + "grad_norm": 1.4721191241168727, + "learning_rate": 8.791072535647861e-07, + "loss": 0.8349, + "step": 226880 + }, + { + "epoch": 17.58223875392305, + "grad_norm": 1.6168142174043412, + "learning_rate": 8.791460012399257e-07, + "loss": 0.827, + "step": 226890 + }, + { + "epoch": 17.583013677399357, + "grad_norm": 1.6965269358207935, + "learning_rate": 8.791847489150651e-07, + "loss": 0.8619, + "step": 226900 + }, + { + "epoch": 17.583788600875664, + "grad_norm": 1.6167585968516425, + "learning_rate": 8.792234965902047e-07, + "loss": 0.8413, + "step": 226910 + }, + { + "epoch": 17.58456352435197, + "grad_norm": 1.67326477249949, + "learning_rate": 8.792622442653441e-07, + "loss": 0.8508, + "step": 226920 + }, + { + "epoch": 17.585338447828278, + "grad_norm": 1.585613299859433, + "learning_rate": 8.793009919404836e-07, + "loss": 0.8566, + "step": 226930 + }, + { + "epoch": 17.586113371304585, + "grad_norm": 1.5559255883093723, + "learning_rate": 8.793397396156231e-07, + "loss": 0.8442, + "step": 226940 + }, + { + "epoch": 17.58688829478089, + "grad_norm": 1.8253412911825397, + "learning_rate": 8.793784872907627e-07, + "loss": 0.8594, + "step": 226950 + }, + { + "epoch": 17.587663218257198, + "grad_norm": 1.7170063445763224, + "learning_rate": 8.794172349659021e-07, + "loss": 0.8448, + "step": 226960 + }, + { + "epoch": 17.588438141733505, + "grad_norm": 1.5676567907496528, + "learning_rate": 8.794559826410416e-07, + "loss": 0.8404, + "step": 226970 + }, + { + "epoch": 17.58921306520981, + "grad_norm": 1.6206391187525213, + "learning_rate": 8.79494730316181e-07, + "loss": 0.846, + "step": 226980 + }, + { + "epoch": 17.58998798868612, + "grad_norm": 1.6953740708559517, + "learning_rate": 8.795334779913206e-07, + "loss": 0.8423, + "step": 226990 + }, + { + "epoch": 17.590762912162425, + "grad_norm": 1.5889936132822287, + "learning_rate": 8.7957222566646e-07, + "loss": 0.8377, + "step": 227000 + }, + { + "epoch": 17.590762912162425, + "eval_loss": 0.8891258835792542, + "eval_runtime": 329.6458, + "eval_samples_per_second": 34.798, + "eval_steps_per_second": 8.7, + "step": 227000 + }, + { + "epoch": 17.591537835638732, + "grad_norm": 1.580480044031217, + "learning_rate": 8.796109733415996e-07, + "loss": 0.8612, + "step": 227010 + }, + { + "epoch": 17.59231275911504, + "grad_norm": 1.5846303434638034, + "learning_rate": 8.79649721016739e-07, + "loss": 0.837, + "step": 227020 + }, + { + "epoch": 17.593087682591346, + "grad_norm": 1.6898995845264442, + "learning_rate": 8.796884686918785e-07, + "loss": 0.8468, + "step": 227030 + }, + { + "epoch": 17.593862606067653, + "grad_norm": 1.6858665741368997, + "learning_rate": 8.79727216367018e-07, + "loss": 0.8285, + "step": 227040 + }, + { + "epoch": 17.594637529543956, + "grad_norm": 1.6386255027897538, + "learning_rate": 8.797659640421576e-07, + "loss": 0.8296, + "step": 227050 + }, + { + "epoch": 17.595412453020263, + "grad_norm": 1.5878641313877633, + "learning_rate": 8.79804711717297e-07, + "loss": 0.8447, + "step": 227060 + }, + { + "epoch": 17.59618737649657, + "grad_norm": 1.6643581001872223, + "learning_rate": 8.798434593924365e-07, + "loss": 0.8466, + "step": 227070 + }, + { + "epoch": 17.596962299972876, + "grad_norm": 1.5788382445468274, + "learning_rate": 8.798822070675759e-07, + "loss": 0.837, + "step": 227080 + }, + { + "epoch": 17.597737223449183, + "grad_norm": 1.5816833805057646, + "learning_rate": 8.799209547427156e-07, + "loss": 0.8262, + "step": 227090 + }, + { + "epoch": 17.59851214692549, + "grad_norm": 1.6214296153421202, + "learning_rate": 8.79959702417855e-07, + "loss": 0.8537, + "step": 227100 + }, + { + "epoch": 17.599287070401797, + "grad_norm": 1.777747050880772, + "learning_rate": 8.799984500929945e-07, + "loss": 0.8436, + "step": 227110 + }, + { + "epoch": 17.600061993878104, + "grad_norm": 1.63145818655414, + "learning_rate": 8.800371977681339e-07, + "loss": 0.8305, + "step": 227120 + }, + { + "epoch": 17.60083691735441, + "grad_norm": 1.5495742339398813, + "learning_rate": 8.800759454432734e-07, + "loss": 0.8484, + "step": 227130 + }, + { + "epoch": 17.601611840830717, + "grad_norm": 1.5874343861103262, + "learning_rate": 8.801146931184129e-07, + "loss": 0.8535, + "step": 227140 + }, + { + "epoch": 17.602386764307024, + "grad_norm": 1.7304523134805416, + "learning_rate": 8.801534407935525e-07, + "loss": 0.8556, + "step": 227150 + }, + { + "epoch": 17.60316168778333, + "grad_norm": 1.4542615174023013, + "learning_rate": 8.801921884686919e-07, + "loss": 0.8332, + "step": 227160 + }, + { + "epoch": 17.603936611259638, + "grad_norm": 1.6239487392091794, + "learning_rate": 8.802309361438314e-07, + "loss": 0.8357, + "step": 227170 + }, + { + "epoch": 17.604711534735944, + "grad_norm": 1.8432100204059696, + "learning_rate": 8.802696838189708e-07, + "loss": 0.845, + "step": 227180 + }, + { + "epoch": 17.60548645821225, + "grad_norm": 1.7892096453695894, + "learning_rate": 8.803084314941105e-07, + "loss": 0.8558, + "step": 227190 + }, + { + "epoch": 17.606261381688558, + "grad_norm": 1.7169245257414152, + "learning_rate": 8.803471791692499e-07, + "loss": 0.8368, + "step": 227200 + }, + { + "epoch": 17.607036305164865, + "grad_norm": 1.5886867775869846, + "learning_rate": 8.803859268443894e-07, + "loss": 0.837, + "step": 227210 + }, + { + "epoch": 17.60781122864117, + "grad_norm": 1.5946758251882662, + "learning_rate": 8.804246745195288e-07, + "loss": 0.8339, + "step": 227220 + }, + { + "epoch": 17.60858615211748, + "grad_norm": 1.60811933892744, + "learning_rate": 8.804634221946685e-07, + "loss": 0.8286, + "step": 227230 + }, + { + "epoch": 17.609361075593785, + "grad_norm": 1.6969270375413372, + "learning_rate": 8.805021698698079e-07, + "loss": 0.8492, + "step": 227240 + }, + { + "epoch": 17.610135999070092, + "grad_norm": 1.602867030467308, + "learning_rate": 8.805409175449474e-07, + "loss": 0.8404, + "step": 227250 + }, + { + "epoch": 17.6109109225464, + "grad_norm": 1.6517738869320588, + "learning_rate": 8.805796652200868e-07, + "loss": 0.8405, + "step": 227260 + }, + { + "epoch": 17.611685846022706, + "grad_norm": 1.7076010795115346, + "learning_rate": 8.806184128952263e-07, + "loss": 0.8277, + "step": 227270 + }, + { + "epoch": 17.612460769499013, + "grad_norm": 1.6763865837338363, + "learning_rate": 8.806571605703657e-07, + "loss": 0.8466, + "step": 227280 + }, + { + "epoch": 17.61323569297532, + "grad_norm": 1.6747594637122896, + "learning_rate": 8.806959082455054e-07, + "loss": 0.8483, + "step": 227290 + }, + { + "epoch": 17.614010616451626, + "grad_norm": 1.63863644708562, + "learning_rate": 8.807346559206448e-07, + "loss": 0.8591, + "step": 227300 + }, + { + "epoch": 17.614785539927933, + "grad_norm": 1.586415288653749, + "learning_rate": 8.807734035957843e-07, + "loss": 0.8448, + "step": 227310 + }, + { + "epoch": 17.61556046340424, + "grad_norm": 1.5623374734962656, + "learning_rate": 8.808121512709237e-07, + "loss": 0.841, + "step": 227320 + }, + { + "epoch": 17.616335386880547, + "grad_norm": 1.6843763997514969, + "learning_rate": 8.808508989460634e-07, + "loss": 0.8529, + "step": 227330 + }, + { + "epoch": 17.617110310356853, + "grad_norm": 1.6659291012301056, + "learning_rate": 8.808896466212028e-07, + "loss": 0.8532, + "step": 227340 + }, + { + "epoch": 17.61788523383316, + "grad_norm": 1.5527730665608828, + "learning_rate": 8.809283942963423e-07, + "loss": 0.8541, + "step": 227350 + }, + { + "epoch": 17.618660157309467, + "grad_norm": 1.6082189295796554, + "learning_rate": 8.809671419714817e-07, + "loss": 0.8483, + "step": 227360 + }, + { + "epoch": 17.619435080785774, + "grad_norm": 1.662928481574201, + "learning_rate": 8.810058896466212e-07, + "loss": 0.8593, + "step": 227370 + }, + { + "epoch": 17.62021000426208, + "grad_norm": 1.6680806129242445, + "learning_rate": 8.810446373217608e-07, + "loss": 0.8427, + "step": 227380 + }, + { + "epoch": 17.620984927738387, + "grad_norm": 1.67680492773435, + "learning_rate": 8.810833849969003e-07, + "loss": 0.8442, + "step": 227390 + }, + { + "epoch": 17.621759851214694, + "grad_norm": 1.6385437119124986, + "learning_rate": 8.811221326720397e-07, + "loss": 0.8319, + "step": 227400 + }, + { + "epoch": 17.622534774690997, + "grad_norm": 1.6841572370807771, + "learning_rate": 8.811608803471792e-07, + "loss": 0.8602, + "step": 227410 + }, + { + "epoch": 17.623309698167304, + "grad_norm": 1.6920541983951103, + "learning_rate": 8.811996280223186e-07, + "loss": 0.8421, + "step": 227420 + }, + { + "epoch": 17.62408462164361, + "grad_norm": 1.5170080938469024, + "learning_rate": 8.812383756974583e-07, + "loss": 0.8496, + "step": 227430 + }, + { + "epoch": 17.624859545119918, + "grad_norm": 1.8527887703103803, + "learning_rate": 8.812771233725977e-07, + "loss": 0.8414, + "step": 227440 + }, + { + "epoch": 17.625634468596225, + "grad_norm": 1.5466990817243247, + "learning_rate": 8.813158710477372e-07, + "loss": 0.8449, + "step": 227450 + }, + { + "epoch": 17.62640939207253, + "grad_norm": 1.564993343584306, + "learning_rate": 8.813546187228766e-07, + "loss": 0.8655, + "step": 227460 + }, + { + "epoch": 17.62718431554884, + "grad_norm": 1.576141187275956, + "learning_rate": 8.813933663980162e-07, + "loss": 0.8551, + "step": 227470 + }, + { + "epoch": 17.627959239025145, + "grad_norm": 1.6090728628086015, + "learning_rate": 8.814321140731557e-07, + "loss": 0.8645, + "step": 227480 + }, + { + "epoch": 17.628734162501452, + "grad_norm": 1.6170961847248895, + "learning_rate": 8.814708617482952e-07, + "loss": 0.8786, + "step": 227490 + }, + { + "epoch": 17.62950908597776, + "grad_norm": 1.8126581865449352, + "learning_rate": 8.815096094234346e-07, + "loss": 0.8459, + "step": 227500 + }, + { + "epoch": 17.62950908597776, + "eval_loss": 0.8885133862495422, + "eval_runtime": 328.0577, + "eval_samples_per_second": 34.966, + "eval_steps_per_second": 8.742, + "step": 227500 + }, + { + "epoch": 17.630284009454066, + "grad_norm": 1.605024197373196, + "learning_rate": 8.815483570985741e-07, + "loss": 0.8445, + "step": 227510 + }, + { + "epoch": 17.631058932930372, + "grad_norm": 1.630109389967211, + "learning_rate": 8.815871047737136e-07, + "loss": 0.8524, + "step": 227520 + }, + { + "epoch": 17.63183385640668, + "grad_norm": 1.642371602129831, + "learning_rate": 8.816258524488532e-07, + "loss": 0.8364, + "step": 227530 + }, + { + "epoch": 17.632608779882986, + "grad_norm": 1.7186126846496945, + "learning_rate": 8.816646001239926e-07, + "loss": 0.8733, + "step": 227540 + }, + { + "epoch": 17.633383703359293, + "grad_norm": 1.5887836178137313, + "learning_rate": 8.817033477991321e-07, + "loss": 0.8532, + "step": 227550 + }, + { + "epoch": 17.6341586268356, + "grad_norm": 1.6657086319271848, + "learning_rate": 8.817420954742715e-07, + "loss": 0.837, + "step": 227560 + }, + { + "epoch": 17.634933550311906, + "grad_norm": 1.553574414932561, + "learning_rate": 8.817808431494111e-07, + "loss": 0.8217, + "step": 227570 + }, + { + "epoch": 17.635708473788213, + "grad_norm": 1.6771414474081656, + "learning_rate": 8.818195908245506e-07, + "loss": 0.8426, + "step": 227580 + }, + { + "epoch": 17.63648339726452, + "grad_norm": 1.6733166829462198, + "learning_rate": 8.818583384996901e-07, + "loss": 0.8443, + "step": 227590 + }, + { + "epoch": 17.637258320740827, + "grad_norm": 1.7013692819783315, + "learning_rate": 8.818970861748295e-07, + "loss": 0.8517, + "step": 227600 + }, + { + "epoch": 17.638033244217134, + "grad_norm": 1.8635894403205149, + "learning_rate": 8.819358338499691e-07, + "loss": 0.8628, + "step": 227610 + }, + { + "epoch": 17.63880816769344, + "grad_norm": 1.6530043258374525, + "learning_rate": 8.819745815251085e-07, + "loss": 0.8348, + "step": 227620 + }, + { + "epoch": 17.639583091169747, + "grad_norm": 1.667444268273544, + "learning_rate": 8.820133292002481e-07, + "loss": 0.8319, + "step": 227630 + }, + { + "epoch": 17.640358014646054, + "grad_norm": 1.623361197436227, + "learning_rate": 8.820520768753875e-07, + "loss": 0.8451, + "step": 227640 + }, + { + "epoch": 17.64113293812236, + "grad_norm": 1.6868728591525395, + "learning_rate": 8.82090824550527e-07, + "loss": 0.8487, + "step": 227650 + }, + { + "epoch": 17.641907861598668, + "grad_norm": 1.648828475151752, + "learning_rate": 8.821295722256665e-07, + "loss": 0.841, + "step": 227660 + }, + { + "epoch": 17.642682785074975, + "grad_norm": 1.6069944298340018, + "learning_rate": 8.82168319900806e-07, + "loss": 0.8648, + "step": 227670 + }, + { + "epoch": 17.64345770855128, + "grad_norm": 1.6291926148917775, + "learning_rate": 8.822070675759455e-07, + "loss": 0.8468, + "step": 227680 + }, + { + "epoch": 17.644232632027588, + "grad_norm": 1.6175953216820504, + "learning_rate": 8.82245815251085e-07, + "loss": 0.8549, + "step": 227690 + }, + { + "epoch": 17.645007555503895, + "grad_norm": 1.630961330329225, + "learning_rate": 8.822845629262244e-07, + "loss": 0.8373, + "step": 227700 + }, + { + "epoch": 17.645782478980202, + "grad_norm": 1.6128019080302167, + "learning_rate": 8.82323310601364e-07, + "loss": 0.8666, + "step": 227710 + }, + { + "epoch": 17.64655740245651, + "grad_norm": 1.5203074912086039, + "learning_rate": 8.823620582765034e-07, + "loss": 0.8583, + "step": 227720 + }, + { + "epoch": 17.647332325932815, + "grad_norm": 1.6032967933582853, + "learning_rate": 8.82400805951643e-07, + "loss": 0.8576, + "step": 227730 + }, + { + "epoch": 17.648107249409122, + "grad_norm": 1.5731056985529328, + "learning_rate": 8.824395536267824e-07, + "loss": 0.8404, + "step": 227740 + }, + { + "epoch": 17.64888217288543, + "grad_norm": 1.6620149903540293, + "learning_rate": 8.82478301301922e-07, + "loss": 0.8535, + "step": 227750 + }, + { + "epoch": 17.649657096361736, + "grad_norm": 1.7016268471725386, + "learning_rate": 8.825170489770614e-07, + "loss": 0.8376, + "step": 227760 + }, + { + "epoch": 17.650432019838043, + "grad_norm": 1.6131455242869843, + "learning_rate": 8.82555796652201e-07, + "loss": 0.8531, + "step": 227770 + }, + { + "epoch": 17.65120694331435, + "grad_norm": 1.686427049312705, + "learning_rate": 8.825945443273404e-07, + "loss": 0.8435, + "step": 227780 + }, + { + "epoch": 17.651981866790653, + "grad_norm": 1.6354578920683032, + "learning_rate": 8.826332920024799e-07, + "loss": 0.8587, + "step": 227790 + }, + { + "epoch": 17.65275679026696, + "grad_norm": 1.6226549519551137, + "learning_rate": 8.826720396776194e-07, + "loss": 0.8592, + "step": 227800 + }, + { + "epoch": 17.653531713743266, + "grad_norm": 1.6600689147792076, + "learning_rate": 8.827107873527589e-07, + "loss": 0.8404, + "step": 227810 + }, + { + "epoch": 17.654306637219573, + "grad_norm": 1.7224146016751192, + "learning_rate": 8.827495350278983e-07, + "loss": 0.858, + "step": 227820 + }, + { + "epoch": 17.65508156069588, + "grad_norm": 1.5329480569698366, + "learning_rate": 8.827882827030379e-07, + "loss": 0.8455, + "step": 227830 + }, + { + "epoch": 17.655856484172187, + "grad_norm": 1.5613712681133054, + "learning_rate": 8.828270303781773e-07, + "loss": 0.8405, + "step": 227840 + }, + { + "epoch": 17.656631407648494, + "grad_norm": 1.6526732340314085, + "learning_rate": 8.828657780533169e-07, + "loss": 0.8383, + "step": 227850 + }, + { + "epoch": 17.6574063311248, + "grad_norm": 1.7391103480227617, + "learning_rate": 8.829045257284563e-07, + "loss": 0.8396, + "step": 227860 + }, + { + "epoch": 17.658181254601107, + "grad_norm": 1.7196368658948713, + "learning_rate": 8.829432734035959e-07, + "loss": 0.8502, + "step": 227870 + }, + { + "epoch": 17.658956178077414, + "grad_norm": 1.841404353341216, + "learning_rate": 8.829820210787353e-07, + "loss": 0.8386, + "step": 227880 + }, + { + "epoch": 17.65973110155372, + "grad_norm": 1.767654850516087, + "learning_rate": 8.830207687538749e-07, + "loss": 0.8431, + "step": 227890 + }, + { + "epoch": 17.660506025030028, + "grad_norm": 1.6145458678906475, + "learning_rate": 8.830595164290143e-07, + "loss": 0.8437, + "step": 227900 + }, + { + "epoch": 17.661280948506334, + "grad_norm": 1.7666935533161077, + "learning_rate": 8.830982641041538e-07, + "loss": 0.8851, + "step": 227910 + }, + { + "epoch": 17.66205587198264, + "grad_norm": 1.7291181487093024, + "learning_rate": 8.831370117792932e-07, + "loss": 0.8498, + "step": 227920 + }, + { + "epoch": 17.662830795458948, + "grad_norm": 1.7211992321956893, + "learning_rate": 8.831757594544328e-07, + "loss": 0.8569, + "step": 227930 + }, + { + "epoch": 17.663605718935255, + "grad_norm": 1.6285597205047415, + "learning_rate": 8.832145071295722e-07, + "loss": 0.8366, + "step": 227940 + }, + { + "epoch": 17.66438064241156, + "grad_norm": 1.592062313217477, + "learning_rate": 8.832532548047118e-07, + "loss": 0.8495, + "step": 227950 + }, + { + "epoch": 17.66515556588787, + "grad_norm": 1.668152231820608, + "learning_rate": 8.832920024798512e-07, + "loss": 0.8557, + "step": 227960 + }, + { + "epoch": 17.665930489364175, + "grad_norm": 1.5693093402610698, + "learning_rate": 8.833307501549908e-07, + "loss": 0.8524, + "step": 227970 + }, + { + "epoch": 17.666705412840482, + "grad_norm": 1.8113471960734095, + "learning_rate": 8.833694978301302e-07, + "loss": 0.856, + "step": 227980 + }, + { + "epoch": 17.66748033631679, + "grad_norm": 1.6446387345368945, + "learning_rate": 8.834082455052698e-07, + "loss": 0.8497, + "step": 227990 + }, + { + "epoch": 17.668255259793096, + "grad_norm": 1.5294546715121922, + "learning_rate": 8.834469931804092e-07, + "loss": 0.8449, + "step": 228000 + }, + { + "epoch": 17.668255259793096, + "eval_loss": 0.8889694809913635, + "eval_runtime": 328.4595, + "eval_samples_per_second": 34.924, + "eval_steps_per_second": 8.732, + "step": 228000 + }, + { + "epoch": 17.669030183269403, + "grad_norm": 1.6502743280571615, + "learning_rate": 8.834857408555487e-07, + "loss": 0.854, + "step": 228010 + }, + { + "epoch": 17.66980510674571, + "grad_norm": 1.6466090347398106, + "learning_rate": 8.835244885306882e-07, + "loss": 0.8433, + "step": 228020 + }, + { + "epoch": 17.670580030222016, + "grad_norm": 1.5827640724642131, + "learning_rate": 8.835632362058278e-07, + "loss": 0.856, + "step": 228030 + }, + { + "epoch": 17.671354953698323, + "grad_norm": 1.6082689012847664, + "learning_rate": 8.836019838809672e-07, + "loss": 0.8506, + "step": 228040 + }, + { + "epoch": 17.67212987717463, + "grad_norm": 1.6841285821975578, + "learning_rate": 8.836407315561067e-07, + "loss": 0.8416, + "step": 228050 + }, + { + "epoch": 17.672904800650937, + "grad_norm": 1.6546793083634683, + "learning_rate": 8.836794792312461e-07, + "loss": 0.8654, + "step": 228060 + }, + { + "epoch": 17.673679724127243, + "grad_norm": 1.6275374564260612, + "learning_rate": 8.837182269063857e-07, + "loss": 0.8418, + "step": 228070 + }, + { + "epoch": 17.67445464760355, + "grad_norm": 1.6822000928772434, + "learning_rate": 8.837569745815251e-07, + "loss": 0.8365, + "step": 228080 + }, + { + "epoch": 17.675229571079857, + "grad_norm": 1.6375879997605216, + "learning_rate": 8.837957222566647e-07, + "loss": 0.8381, + "step": 228090 + }, + { + "epoch": 17.676004494556164, + "grad_norm": 1.6427416074883916, + "learning_rate": 8.838344699318041e-07, + "loss": 0.8424, + "step": 228100 + }, + { + "epoch": 17.67677941803247, + "grad_norm": 1.6175513181994692, + "learning_rate": 8.838732176069436e-07, + "loss": 0.8294, + "step": 228110 + }, + { + "epoch": 17.677554341508777, + "grad_norm": 1.5425320912988307, + "learning_rate": 8.839119652820831e-07, + "loss": 0.8447, + "step": 228120 + }, + { + "epoch": 17.678329264985084, + "grad_norm": 1.7755395445328537, + "learning_rate": 8.839507129572227e-07, + "loss": 0.8471, + "step": 228130 + }, + { + "epoch": 17.67910418846139, + "grad_norm": 1.5831264843160555, + "learning_rate": 8.839894606323621e-07, + "loss": 0.8327, + "step": 228140 + }, + { + "epoch": 17.679879111937694, + "grad_norm": 1.5589881416881353, + "learning_rate": 8.840282083075016e-07, + "loss": 0.8512, + "step": 228150 + }, + { + "epoch": 17.680654035414, + "grad_norm": 1.6336403223848197, + "learning_rate": 8.84066955982641e-07, + "loss": 0.8496, + "step": 228160 + }, + { + "epoch": 17.681428958890308, + "grad_norm": 1.5357262878049982, + "learning_rate": 8.841057036577807e-07, + "loss": 0.8452, + "step": 228170 + }, + { + "epoch": 17.682203882366615, + "grad_norm": 1.6393611749663644, + "learning_rate": 8.841444513329201e-07, + "loss": 0.8556, + "step": 228180 + }, + { + "epoch": 17.68297880584292, + "grad_norm": 1.604102764805199, + "learning_rate": 8.841831990080596e-07, + "loss": 0.8699, + "step": 228190 + }, + { + "epoch": 17.68375372931923, + "grad_norm": 1.6673134190301968, + "learning_rate": 8.84221946683199e-07, + "loss": 0.8321, + "step": 228200 + }, + { + "epoch": 17.684528652795535, + "grad_norm": 1.5869782050828385, + "learning_rate": 8.842606943583385e-07, + "loss": 0.8557, + "step": 228210 + }, + { + "epoch": 17.685303576271842, + "grad_norm": 1.623059135943947, + "learning_rate": 8.84299442033478e-07, + "loss": 0.8513, + "step": 228220 + }, + { + "epoch": 17.68607849974815, + "grad_norm": 1.6220577268279477, + "learning_rate": 8.843381897086176e-07, + "loss": 0.8472, + "step": 228230 + }, + { + "epoch": 17.686853423224456, + "grad_norm": 1.6751787213539793, + "learning_rate": 8.84376937383757e-07, + "loss": 0.842, + "step": 228240 + }, + { + "epoch": 17.687628346700762, + "grad_norm": 1.683735521864269, + "learning_rate": 8.844156850588965e-07, + "loss": 0.8548, + "step": 228250 + }, + { + "epoch": 17.68840327017707, + "grad_norm": 1.6739982216932596, + "learning_rate": 8.844544327340359e-07, + "loss": 0.8506, + "step": 228260 + }, + { + "epoch": 17.689178193653376, + "grad_norm": 1.622377086889807, + "learning_rate": 8.844931804091756e-07, + "loss": 0.8497, + "step": 228270 + }, + { + "epoch": 17.689953117129683, + "grad_norm": 1.6909750576031575, + "learning_rate": 8.84531928084315e-07, + "loss": 0.8406, + "step": 228280 + }, + { + "epoch": 17.69072804060599, + "grad_norm": 1.7146672556599287, + "learning_rate": 8.845706757594545e-07, + "loss": 0.85, + "step": 228290 + }, + { + "epoch": 17.691502964082297, + "grad_norm": 1.5458960113226938, + "learning_rate": 8.846094234345939e-07, + "loss": 0.8419, + "step": 228300 + }, + { + "epoch": 17.692277887558603, + "grad_norm": 1.6694373943536602, + "learning_rate": 8.846481711097335e-07, + "loss": 0.8592, + "step": 228310 + }, + { + "epoch": 17.69305281103491, + "grad_norm": 1.6721960433835854, + "learning_rate": 8.84686918784873e-07, + "loss": 0.847, + "step": 228320 + }, + { + "epoch": 17.693827734511217, + "grad_norm": 1.7669389361862768, + "learning_rate": 8.847256664600125e-07, + "loss": 0.8352, + "step": 228330 + }, + { + "epoch": 17.694602657987524, + "grad_norm": 1.5994277653564088, + "learning_rate": 8.847644141351519e-07, + "loss": 0.8568, + "step": 228340 + }, + { + "epoch": 17.69537758146383, + "grad_norm": 1.6030016355641916, + "learning_rate": 8.848031618102914e-07, + "loss": 0.8446, + "step": 228350 + }, + { + "epoch": 17.696152504940137, + "grad_norm": 1.5804818501727143, + "learning_rate": 8.848419094854308e-07, + "loss": 0.8398, + "step": 228360 + }, + { + "epoch": 17.696927428416444, + "grad_norm": 1.6290284817163032, + "learning_rate": 8.848806571605705e-07, + "loss": 0.8241, + "step": 228370 + }, + { + "epoch": 17.69770235189275, + "grad_norm": 1.8118492071757826, + "learning_rate": 8.849194048357099e-07, + "loss": 0.8479, + "step": 228380 + }, + { + "epoch": 17.698477275369058, + "grad_norm": 1.622190325678727, + "learning_rate": 8.849581525108494e-07, + "loss": 0.8503, + "step": 228390 + }, + { + "epoch": 17.699252198845365, + "grad_norm": 1.6424100398398216, + "learning_rate": 8.849969001859888e-07, + "loss": 0.8383, + "step": 228400 + }, + { + "epoch": 17.70002712232167, + "grad_norm": 1.7476768732488859, + "learning_rate": 8.850356478611285e-07, + "loss": 0.8616, + "step": 228410 + }, + { + "epoch": 17.700802045797978, + "grad_norm": 1.730410112412794, + "learning_rate": 8.850743955362679e-07, + "loss": 0.8287, + "step": 228420 + }, + { + "epoch": 17.701576969274285, + "grad_norm": 1.5997279435264673, + "learning_rate": 8.851131432114074e-07, + "loss": 0.8554, + "step": 228430 + }, + { + "epoch": 17.702351892750592, + "grad_norm": 1.7398371887145936, + "learning_rate": 8.851518908865468e-07, + "loss": 0.8344, + "step": 228440 + }, + { + "epoch": 17.7031268162269, + "grad_norm": 1.6445356917062177, + "learning_rate": 8.851906385616864e-07, + "loss": 0.821, + "step": 228450 + }, + { + "epoch": 17.703901739703205, + "grad_norm": 1.6107068138557208, + "learning_rate": 8.852293862368258e-07, + "loss": 0.8329, + "step": 228460 + }, + { + "epoch": 17.704676663179512, + "grad_norm": 1.7283053863472697, + "learning_rate": 8.852681339119654e-07, + "loss": 0.8529, + "step": 228470 + }, + { + "epoch": 17.70545158665582, + "grad_norm": 1.5825816094659204, + "learning_rate": 8.853068815871048e-07, + "loss": 0.8655, + "step": 228480 + }, + { + "epoch": 17.706226510132126, + "grad_norm": 1.7078781581650662, + "learning_rate": 8.853456292622443e-07, + "loss": 0.8428, + "step": 228490 + }, + { + "epoch": 17.707001433608433, + "grad_norm": 1.609864004888087, + "learning_rate": 8.853843769373837e-07, + "loss": 0.8503, + "step": 228500 + }, + { + "epoch": 17.707001433608433, + "eval_loss": 0.8890738487243652, + "eval_runtime": 330.1583, + "eval_samples_per_second": 34.744, + "eval_steps_per_second": 8.687, + "step": 228500 + }, + { + "epoch": 17.70777635708474, + "grad_norm": 1.633500560923521, + "learning_rate": 8.854231246125234e-07, + "loss": 0.8614, + "step": 228510 + }, + { + "epoch": 17.708551280561046, + "grad_norm": 1.665347764678652, + "learning_rate": 8.854618722876628e-07, + "loss": 0.8433, + "step": 228520 + }, + { + "epoch": 17.709326204037353, + "grad_norm": 1.6707288646892064, + "learning_rate": 8.855006199628023e-07, + "loss": 0.8452, + "step": 228530 + }, + { + "epoch": 17.710101127513656, + "grad_norm": 1.5842581416420904, + "learning_rate": 8.855393676379417e-07, + "loss": 0.8408, + "step": 228540 + }, + { + "epoch": 17.710876050989963, + "grad_norm": 1.6602314670873695, + "learning_rate": 8.855781153130813e-07, + "loss": 0.8502, + "step": 228550 + }, + { + "epoch": 17.71165097446627, + "grad_norm": 1.7785796719228177, + "learning_rate": 8.856168629882208e-07, + "loss": 0.8498, + "step": 228560 + }, + { + "epoch": 17.712425897942577, + "grad_norm": 1.496610058660053, + "learning_rate": 8.856556106633603e-07, + "loss": 0.8654, + "step": 228570 + }, + { + "epoch": 17.713200821418884, + "grad_norm": 1.5909349467202036, + "learning_rate": 8.856943583384997e-07, + "loss": 0.8644, + "step": 228580 + }, + { + "epoch": 17.71397574489519, + "grad_norm": 1.6255228431722353, + "learning_rate": 8.857331060136393e-07, + "loss": 0.857, + "step": 228590 + }, + { + "epoch": 17.714750668371497, + "grad_norm": 1.777196281441445, + "learning_rate": 8.857718536887787e-07, + "loss": 0.8548, + "step": 228600 + }, + { + "epoch": 17.715525591847804, + "grad_norm": 1.6894689116922568, + "learning_rate": 8.858106013639183e-07, + "loss": 0.8274, + "step": 228610 + }, + { + "epoch": 17.71630051532411, + "grad_norm": 1.6468171158225131, + "learning_rate": 8.858493490390577e-07, + "loss": 0.8525, + "step": 228620 + }, + { + "epoch": 17.717075438800418, + "grad_norm": 1.6516059618932386, + "learning_rate": 8.858880967141972e-07, + "loss": 0.8556, + "step": 228630 + }, + { + "epoch": 17.717850362276724, + "grad_norm": 1.7427667494404895, + "learning_rate": 8.859268443893366e-07, + "loss": 0.8606, + "step": 228640 + }, + { + "epoch": 17.71862528575303, + "grad_norm": 1.6589889475259574, + "learning_rate": 8.859655920644762e-07, + "loss": 0.8557, + "step": 228650 + }, + { + "epoch": 17.719400209229338, + "grad_norm": 1.7489697645118103, + "learning_rate": 8.860043397396157e-07, + "loss": 0.8462, + "step": 228660 + }, + { + "epoch": 17.720175132705645, + "grad_norm": 1.5742205260362365, + "learning_rate": 8.860430874147552e-07, + "loss": 0.8431, + "step": 228670 + }, + { + "epoch": 17.72095005618195, + "grad_norm": 1.6055272042465822, + "learning_rate": 8.860818350898946e-07, + "loss": 0.8427, + "step": 228680 + }, + { + "epoch": 17.72172497965826, + "grad_norm": 1.6658256851455353, + "learning_rate": 8.861205827650342e-07, + "loss": 0.8466, + "step": 228690 + }, + { + "epoch": 17.722499903134565, + "grad_norm": 1.565770912304112, + "learning_rate": 8.861593304401736e-07, + "loss": 0.832, + "step": 228700 + }, + { + "epoch": 17.723274826610872, + "grad_norm": 1.6336935296977744, + "learning_rate": 8.861980781153132e-07, + "loss": 0.8431, + "step": 228710 + }, + { + "epoch": 17.72404975008718, + "grad_norm": 1.7566131621282477, + "learning_rate": 8.862368257904526e-07, + "loss": 0.8736, + "step": 228720 + }, + { + "epoch": 17.724824673563486, + "grad_norm": 1.6108828514855889, + "learning_rate": 8.862755734655922e-07, + "loss": 0.8648, + "step": 228730 + }, + { + "epoch": 17.725599597039793, + "grad_norm": 1.7374022141245853, + "learning_rate": 8.863143211407316e-07, + "loss": 0.8309, + "step": 228740 + }, + { + "epoch": 17.7263745205161, + "grad_norm": 1.61737648082339, + "learning_rate": 8.863530688158711e-07, + "loss": 0.8528, + "step": 228750 + }, + { + "epoch": 17.727149443992406, + "grad_norm": 1.6980966444077368, + "learning_rate": 8.863918164910106e-07, + "loss": 0.8429, + "step": 228760 + }, + { + "epoch": 17.727924367468713, + "grad_norm": 1.6715769717016458, + "learning_rate": 8.864305641661501e-07, + "loss": 0.8394, + "step": 228770 + }, + { + "epoch": 17.72869929094502, + "grad_norm": 1.6454427406631533, + "learning_rate": 8.864693118412895e-07, + "loss": 0.8425, + "step": 228780 + }, + { + "epoch": 17.729474214421327, + "grad_norm": 1.6942748101970861, + "learning_rate": 8.865080595164291e-07, + "loss": 0.8511, + "step": 228790 + }, + { + "epoch": 17.730249137897633, + "grad_norm": 1.5714204676827197, + "learning_rate": 8.865468071915685e-07, + "loss": 0.85, + "step": 228800 + }, + { + "epoch": 17.73102406137394, + "grad_norm": 1.5947793911628714, + "learning_rate": 8.865855548667081e-07, + "loss": 0.8476, + "step": 228810 + }, + { + "epoch": 17.731798984850247, + "grad_norm": 1.7501157783104027, + "learning_rate": 8.866243025418475e-07, + "loss": 0.859, + "step": 228820 + }, + { + "epoch": 17.732573908326554, + "grad_norm": 1.6138164786423614, + "learning_rate": 8.866630502169871e-07, + "loss": 0.8519, + "step": 228830 + }, + { + "epoch": 17.73334883180286, + "grad_norm": 1.6091940214686207, + "learning_rate": 8.867017978921265e-07, + "loss": 0.8553, + "step": 228840 + }, + { + "epoch": 17.734123755279168, + "grad_norm": 1.6329904137897202, + "learning_rate": 8.86740545567266e-07, + "loss": 0.8424, + "step": 228850 + }, + { + "epoch": 17.734898678755474, + "grad_norm": 1.5947536489611938, + "learning_rate": 8.867792932424055e-07, + "loss": 0.8351, + "step": 228860 + }, + { + "epoch": 17.73567360223178, + "grad_norm": 1.739732319229272, + "learning_rate": 8.86818040917545e-07, + "loss": 0.8521, + "step": 228870 + }, + { + "epoch": 17.736448525708088, + "grad_norm": 1.643486698624077, + "learning_rate": 8.868567885926845e-07, + "loss": 0.8453, + "step": 228880 + }, + { + "epoch": 17.737223449184395, + "grad_norm": 1.6897763114365802, + "learning_rate": 8.86895536267824e-07, + "loss": 0.8501, + "step": 228890 + }, + { + "epoch": 17.737998372660698, + "grad_norm": 1.639935017328498, + "learning_rate": 8.869342839429634e-07, + "loss": 0.8408, + "step": 228900 + }, + { + "epoch": 17.738773296137005, + "grad_norm": 1.6605556458448507, + "learning_rate": 8.86973031618103e-07, + "loss": 0.8496, + "step": 228910 + }, + { + "epoch": 17.73954821961331, + "grad_norm": 1.6380618899941475, + "learning_rate": 8.870117792932424e-07, + "loss": 0.848, + "step": 228920 + }, + { + "epoch": 17.74032314308962, + "grad_norm": 1.7418897143541463, + "learning_rate": 8.87050526968382e-07, + "loss": 0.8526, + "step": 228930 + }, + { + "epoch": 17.741098066565925, + "grad_norm": 1.775183250286149, + "learning_rate": 8.870892746435214e-07, + "loss": 0.8643, + "step": 228940 + }, + { + "epoch": 17.741872990042232, + "grad_norm": 1.624699817683975, + "learning_rate": 8.871280223186609e-07, + "loss": 0.8231, + "step": 228950 + }, + { + "epoch": 17.74264791351854, + "grad_norm": 1.7142152356339282, + "learning_rate": 8.871667699938004e-07, + "loss": 0.8607, + "step": 228960 + }, + { + "epoch": 17.743422836994846, + "grad_norm": 1.6820321461507737, + "learning_rate": 8.8720551766894e-07, + "loss": 0.8431, + "step": 228970 + }, + { + "epoch": 17.744197760471152, + "grad_norm": 1.7216771263722324, + "learning_rate": 8.872442653440794e-07, + "loss": 0.8367, + "step": 228980 + }, + { + "epoch": 17.74497268394746, + "grad_norm": 1.60781799839807, + "learning_rate": 8.872830130192189e-07, + "loss": 0.8642, + "step": 228990 + }, + { + "epoch": 17.745747607423766, + "grad_norm": 1.5735245321512121, + "learning_rate": 8.873217606943583e-07, + "loss": 0.8563, + "step": 229000 + }, + { + "epoch": 17.745747607423766, + "eval_loss": 0.8884791135787964, + "eval_runtime": 329.4418, + "eval_samples_per_second": 34.82, + "eval_steps_per_second": 8.706, + "step": 229000 + }, + { + "epoch": 17.746522530900073, + "grad_norm": 1.5929292648943674, + "learning_rate": 8.873605083694979e-07, + "loss": 0.8421, + "step": 229010 + }, + { + "epoch": 17.74729745437638, + "grad_norm": 1.7160615311283682, + "learning_rate": 8.873992560446374e-07, + "loss": 0.8538, + "step": 229020 + }, + { + "epoch": 17.748072377852687, + "grad_norm": 1.614170761002644, + "learning_rate": 8.874380037197769e-07, + "loss": 0.8496, + "step": 229030 + }, + { + "epoch": 17.748847301328993, + "grad_norm": 1.8167452890682307, + "learning_rate": 8.874767513949163e-07, + "loss": 0.8446, + "step": 229040 + }, + { + "epoch": 17.7496222248053, + "grad_norm": 1.671401064841677, + "learning_rate": 8.875154990700558e-07, + "loss": 0.8629, + "step": 229050 + }, + { + "epoch": 17.750397148281607, + "grad_norm": 1.7219877352178903, + "learning_rate": 8.875542467451953e-07, + "loss": 0.8625, + "step": 229060 + }, + { + "epoch": 17.751172071757914, + "grad_norm": 1.6482975119390022, + "learning_rate": 8.875929944203349e-07, + "loss": 0.8688, + "step": 229070 + }, + { + "epoch": 17.75194699523422, + "grad_norm": 1.7422346375248479, + "learning_rate": 8.876317420954743e-07, + "loss": 0.8386, + "step": 229080 + }, + { + "epoch": 17.752721918710527, + "grad_norm": 1.6719421615763463, + "learning_rate": 8.876704897706138e-07, + "loss": 0.8388, + "step": 229090 + }, + { + "epoch": 17.753496842186834, + "grad_norm": 1.7563060331878708, + "learning_rate": 8.877092374457532e-07, + "loss": 0.8478, + "step": 229100 + }, + { + "epoch": 17.75427176566314, + "grad_norm": 1.6201720390202594, + "learning_rate": 8.877479851208929e-07, + "loss": 0.8306, + "step": 229110 + }, + { + "epoch": 17.755046689139448, + "grad_norm": 1.5804396795665248, + "learning_rate": 8.877867327960323e-07, + "loss": 0.8484, + "step": 229120 + }, + { + "epoch": 17.755821612615755, + "grad_norm": 1.745114575008032, + "learning_rate": 8.878254804711718e-07, + "loss": 0.8502, + "step": 229130 + }, + { + "epoch": 17.75659653609206, + "grad_norm": 1.5866022660232344, + "learning_rate": 8.878642281463112e-07, + "loss": 0.8476, + "step": 229140 + }, + { + "epoch": 17.75737145956837, + "grad_norm": 1.690434431651738, + "learning_rate": 8.879029758214507e-07, + "loss": 0.8399, + "step": 229150 + }, + { + "epoch": 17.758146383044675, + "grad_norm": 1.7022753563381088, + "learning_rate": 8.879417234965903e-07, + "loss": 0.8565, + "step": 229160 + }, + { + "epoch": 17.758921306520982, + "grad_norm": 1.6056178304657869, + "learning_rate": 8.879804711717298e-07, + "loss": 0.8578, + "step": 229170 + }, + { + "epoch": 17.75969622999729, + "grad_norm": 1.6475293380730243, + "learning_rate": 8.880192188468692e-07, + "loss": 0.8461, + "step": 229180 + }, + { + "epoch": 17.760471153473596, + "grad_norm": 1.606393308987929, + "learning_rate": 8.880579665220087e-07, + "loss": 0.8496, + "step": 229190 + }, + { + "epoch": 17.761246076949902, + "grad_norm": 1.6035317242715725, + "learning_rate": 8.880967141971481e-07, + "loss": 0.8375, + "step": 229200 + }, + { + "epoch": 17.76202100042621, + "grad_norm": 1.6781820900282267, + "learning_rate": 8.881354618722878e-07, + "loss": 0.8514, + "step": 229210 + }, + { + "epoch": 17.762795923902516, + "grad_norm": 1.64640440169653, + "learning_rate": 8.881742095474272e-07, + "loss": 0.8565, + "step": 229220 + }, + { + "epoch": 17.763570847378823, + "grad_norm": 1.4943851942789577, + "learning_rate": 8.882129572225667e-07, + "loss": 0.8532, + "step": 229230 + }, + { + "epoch": 17.76434577085513, + "grad_norm": 1.7111308406318915, + "learning_rate": 8.882517048977061e-07, + "loss": 0.8303, + "step": 229240 + }, + { + "epoch": 17.765120694331436, + "grad_norm": 1.6134736635763174, + "learning_rate": 8.882904525728458e-07, + "loss": 0.8414, + "step": 229250 + }, + { + "epoch": 17.765895617807743, + "grad_norm": 1.597779176968822, + "learning_rate": 8.883292002479852e-07, + "loss": 0.8598, + "step": 229260 + }, + { + "epoch": 17.76667054128405, + "grad_norm": 1.587072672863288, + "learning_rate": 8.883679479231247e-07, + "loss": 0.8415, + "step": 229270 + }, + { + "epoch": 17.767445464760353, + "grad_norm": 1.6087211719849461, + "learning_rate": 8.884066955982641e-07, + "loss": 0.848, + "step": 229280 + }, + { + "epoch": 17.76822038823666, + "grad_norm": 1.6633119564448084, + "learning_rate": 8.884454432734036e-07, + "loss": 0.8522, + "step": 229290 + }, + { + "epoch": 17.768995311712967, + "grad_norm": 1.605050124046115, + "learning_rate": 8.884841909485432e-07, + "loss": 0.8504, + "step": 229300 + }, + { + "epoch": 17.769770235189274, + "grad_norm": 1.730322450344454, + "learning_rate": 8.885229386236827e-07, + "loss": 0.8197, + "step": 229310 + }, + { + "epoch": 17.77054515866558, + "grad_norm": 1.5523811307214634, + "learning_rate": 8.885616862988221e-07, + "loss": 0.8459, + "step": 229320 + }, + { + "epoch": 17.771320082141887, + "grad_norm": 1.6015132174255173, + "learning_rate": 8.886004339739616e-07, + "loss": 0.8533, + "step": 229330 + }, + { + "epoch": 17.772095005618194, + "grad_norm": 1.5906105605728642, + "learning_rate": 8.88639181649101e-07, + "loss": 0.8672, + "step": 229340 + }, + { + "epoch": 17.7728699290945, + "grad_norm": 1.6016021587199243, + "learning_rate": 8.886779293242407e-07, + "loss": 0.8335, + "step": 229350 + }, + { + "epoch": 17.773644852570808, + "grad_norm": 1.629866453465662, + "learning_rate": 8.887166769993801e-07, + "loss": 0.837, + "step": 229360 + }, + { + "epoch": 17.774419776047115, + "grad_norm": 1.5961027084693538, + "learning_rate": 8.887554246745196e-07, + "loss": 0.8486, + "step": 229370 + }, + { + "epoch": 17.77519469952342, + "grad_norm": 1.7343574781865487, + "learning_rate": 8.88794172349659e-07, + "loss": 0.8531, + "step": 229380 + }, + { + "epoch": 17.775969622999728, + "grad_norm": 1.6407090126349495, + "learning_rate": 8.888329200247986e-07, + "loss": 0.8447, + "step": 229390 + }, + { + "epoch": 17.776744546476035, + "grad_norm": 1.5412298533214288, + "learning_rate": 8.888716676999381e-07, + "loss": 0.856, + "step": 229400 + }, + { + "epoch": 17.777519469952342, + "grad_norm": 1.6048323426807791, + "learning_rate": 8.889104153750776e-07, + "loss": 0.8558, + "step": 229410 + }, + { + "epoch": 17.77829439342865, + "grad_norm": 1.6738959284858652, + "learning_rate": 8.88949163050217e-07, + "loss": 0.8426, + "step": 229420 + }, + { + "epoch": 17.779069316904955, + "grad_norm": 1.6373512229736693, + "learning_rate": 8.889879107253565e-07, + "loss": 0.8477, + "step": 229430 + }, + { + "epoch": 17.779844240381262, + "grad_norm": 1.573194632424116, + "learning_rate": 8.890266584004959e-07, + "loss": 0.8456, + "step": 229440 + }, + { + "epoch": 17.78061916385757, + "grad_norm": 1.5705846329373616, + "learning_rate": 8.890654060756356e-07, + "loss": 0.8506, + "step": 229450 + }, + { + "epoch": 17.781394087333876, + "grad_norm": 1.8344082961095032, + "learning_rate": 8.89104153750775e-07, + "loss": 0.8281, + "step": 229460 + }, + { + "epoch": 17.782169010810183, + "grad_norm": 1.5197043479832921, + "learning_rate": 8.891429014259145e-07, + "loss": 0.8549, + "step": 229470 + }, + { + "epoch": 17.78294393428649, + "grad_norm": 1.5864873885397601, + "learning_rate": 8.891816491010539e-07, + "loss": 0.8674, + "step": 229480 + }, + { + "epoch": 17.783718857762796, + "grad_norm": 1.7137911540219994, + "learning_rate": 8.892203967761935e-07, + "loss": 0.842, + "step": 229490 + }, + { + "epoch": 17.784493781239103, + "grad_norm": 1.6517982251301393, + "learning_rate": 8.89259144451333e-07, + "loss": 0.8547, + "step": 229500 + }, + { + "epoch": 17.784493781239103, + "eval_loss": 0.8883308172225952, + "eval_runtime": 329.6118, + "eval_samples_per_second": 34.802, + "eval_steps_per_second": 8.701, + "step": 229500 + }, + { + "epoch": 17.78526870471541, + "grad_norm": 1.6912937804033215, + "learning_rate": 8.892978921264725e-07, + "loss": 0.8621, + "step": 229510 + }, + { + "epoch": 17.786043628191717, + "grad_norm": 1.77955644592867, + "learning_rate": 8.893366398016119e-07, + "loss": 0.8556, + "step": 229520 + }, + { + "epoch": 17.786818551668024, + "grad_norm": 1.6983992772211731, + "learning_rate": 8.893753874767515e-07, + "loss": 0.8671, + "step": 229530 + }, + { + "epoch": 17.78759347514433, + "grad_norm": 1.7194969311263137, + "learning_rate": 8.894141351518909e-07, + "loss": 0.8519, + "step": 229540 + }, + { + "epoch": 17.788368398620637, + "grad_norm": 1.6400727169222062, + "learning_rate": 8.894528828270305e-07, + "loss": 0.8473, + "step": 229550 + }, + { + "epoch": 17.789143322096944, + "grad_norm": 1.65820429068243, + "learning_rate": 8.894916305021699e-07, + "loss": 0.8573, + "step": 229560 + }, + { + "epoch": 17.78991824557325, + "grad_norm": 1.5549210993682496, + "learning_rate": 8.895303781773094e-07, + "loss": 0.8577, + "step": 229570 + }, + { + "epoch": 17.790693169049558, + "grad_norm": 1.6130297276736691, + "learning_rate": 8.895691258524488e-07, + "loss": 0.856, + "step": 229580 + }, + { + "epoch": 17.791468092525864, + "grad_norm": 1.6404395231183575, + "learning_rate": 8.896078735275884e-07, + "loss": 0.8521, + "step": 229590 + }, + { + "epoch": 17.79224301600217, + "grad_norm": 1.6001082053277487, + "learning_rate": 8.896466212027279e-07, + "loss": 0.8507, + "step": 229600 + }, + { + "epoch": 17.793017939478478, + "grad_norm": 1.548572500762838, + "learning_rate": 8.896853688778674e-07, + "loss": 0.8312, + "step": 229610 + }, + { + "epoch": 17.793792862954785, + "grad_norm": 1.59306439353532, + "learning_rate": 8.897241165530068e-07, + "loss": 0.8667, + "step": 229620 + }, + { + "epoch": 17.79456778643109, + "grad_norm": 1.5990618404055692, + "learning_rate": 8.897628642281464e-07, + "loss": 0.8634, + "step": 229630 + }, + { + "epoch": 17.795342709907395, + "grad_norm": 1.6612707870367656, + "learning_rate": 8.898016119032858e-07, + "loss": 0.8685, + "step": 229640 + }, + { + "epoch": 17.7961176333837, + "grad_norm": 1.630746689293729, + "learning_rate": 8.898403595784254e-07, + "loss": 0.8505, + "step": 229650 + }, + { + "epoch": 17.79689255686001, + "grad_norm": 1.6444177448948252, + "learning_rate": 8.898791072535648e-07, + "loss": 0.8578, + "step": 229660 + }, + { + "epoch": 17.797667480336315, + "grad_norm": 1.6457700910933237, + "learning_rate": 8.899178549287044e-07, + "loss": 0.849, + "step": 229670 + }, + { + "epoch": 17.798442403812622, + "grad_norm": 1.6530653479322346, + "learning_rate": 8.899566026038438e-07, + "loss": 0.8423, + "step": 229680 + }, + { + "epoch": 17.79921732728893, + "grad_norm": 1.8776745337504481, + "learning_rate": 8.899953502789833e-07, + "loss": 0.8742, + "step": 229690 + }, + { + "epoch": 17.799992250765236, + "grad_norm": 1.6907819620669273, + "learning_rate": 8.900340979541228e-07, + "loss": 0.8297, + "step": 229700 + }, + { + "epoch": 17.800767174241543, + "grad_norm": 1.79014658394313, + "learning_rate": 8.900728456292623e-07, + "loss": 0.8486, + "step": 229710 + }, + { + "epoch": 17.80154209771785, + "grad_norm": 1.7317176824231308, + "learning_rate": 8.901115933044017e-07, + "loss": 0.8476, + "step": 229720 + }, + { + "epoch": 17.802317021194156, + "grad_norm": 1.5884387374813316, + "learning_rate": 8.901503409795413e-07, + "loss": 0.846, + "step": 229730 + }, + { + "epoch": 17.803091944670463, + "grad_norm": 1.6417961157372507, + "learning_rate": 8.901890886546807e-07, + "loss": 0.8651, + "step": 229740 + }, + { + "epoch": 17.80386686814677, + "grad_norm": 1.7103622841374644, + "learning_rate": 8.902278363298203e-07, + "loss": 0.8576, + "step": 229750 + }, + { + "epoch": 17.804641791623077, + "grad_norm": 1.6186921987494856, + "learning_rate": 8.902665840049597e-07, + "loss": 0.849, + "step": 229760 + }, + { + "epoch": 17.805416715099383, + "grad_norm": 1.6925007910485126, + "learning_rate": 8.903053316800993e-07, + "loss": 0.8581, + "step": 229770 + }, + { + "epoch": 17.80619163857569, + "grad_norm": 1.5510126450041708, + "learning_rate": 8.903440793552387e-07, + "loss": 0.8173, + "step": 229780 + }, + { + "epoch": 17.806966562051997, + "grad_norm": 1.6187059316328973, + "learning_rate": 8.903828270303782e-07, + "loss": 0.8588, + "step": 229790 + }, + { + "epoch": 17.807741485528304, + "grad_norm": 1.6156526885584015, + "learning_rate": 8.904215747055177e-07, + "loss": 0.8678, + "step": 229800 + }, + { + "epoch": 17.80851640900461, + "grad_norm": 1.525554402640669, + "learning_rate": 8.904603223806573e-07, + "loss": 0.844, + "step": 229810 + }, + { + "epoch": 17.809291332480917, + "grad_norm": 1.6192931984146217, + "learning_rate": 8.904990700557967e-07, + "loss": 0.8277, + "step": 229820 + }, + { + "epoch": 17.810066255957224, + "grad_norm": 1.7925395101927022, + "learning_rate": 8.905378177309362e-07, + "loss": 0.8425, + "step": 229830 + }, + { + "epoch": 17.81084117943353, + "grad_norm": 1.5950354163085303, + "learning_rate": 8.905765654060756e-07, + "loss": 0.8461, + "step": 229840 + }, + { + "epoch": 17.811616102909838, + "grad_norm": 1.6940782822834515, + "learning_rate": 8.906153130812152e-07, + "loss": 0.8547, + "step": 229850 + }, + { + "epoch": 17.812391026386145, + "grad_norm": 1.627701008465844, + "learning_rate": 8.906540607563546e-07, + "loss": 0.8723, + "step": 229860 + }, + { + "epoch": 17.81316594986245, + "grad_norm": 1.5807586476403843, + "learning_rate": 8.906928084314942e-07, + "loss": 0.838, + "step": 229870 + }, + { + "epoch": 17.81394087333876, + "grad_norm": 1.7058380033255756, + "learning_rate": 8.907315561066336e-07, + "loss": 0.8482, + "step": 229880 + }, + { + "epoch": 17.814715796815065, + "grad_norm": 1.4530493552499166, + "learning_rate": 8.907703037817732e-07, + "loss": 0.8469, + "step": 229890 + }, + { + "epoch": 17.815490720291372, + "grad_norm": 1.579418407534758, + "learning_rate": 8.908090514569126e-07, + "loss": 0.8531, + "step": 229900 + }, + { + "epoch": 17.81626564376768, + "grad_norm": 1.6925200703235708, + "learning_rate": 8.908477991320522e-07, + "loss": 0.8611, + "step": 229910 + }, + { + "epoch": 17.817040567243986, + "grad_norm": 1.5117076454776734, + "learning_rate": 8.908865468071916e-07, + "loss": 0.8512, + "step": 229920 + }, + { + "epoch": 17.817815490720292, + "grad_norm": 1.7041180396602775, + "learning_rate": 8.909252944823311e-07, + "loss": 0.8489, + "step": 229930 + }, + { + "epoch": 17.8185904141966, + "grad_norm": 1.5474296233334073, + "learning_rate": 8.909640421574705e-07, + "loss": 0.836, + "step": 229940 + }, + { + "epoch": 17.819365337672906, + "grad_norm": 1.6360926626594565, + "learning_rate": 8.910027898326102e-07, + "loss": 0.8448, + "step": 229950 + }, + { + "epoch": 17.820140261149213, + "grad_norm": 1.6995280942662665, + "learning_rate": 8.910415375077496e-07, + "loss": 0.8447, + "step": 229960 + }, + { + "epoch": 17.82091518462552, + "grad_norm": 1.6454275215837793, + "learning_rate": 8.910802851828891e-07, + "loss": 0.8419, + "step": 229970 + }, + { + "epoch": 17.821690108101826, + "grad_norm": 1.540930253671509, + "learning_rate": 8.911190328580285e-07, + "loss": 0.8273, + "step": 229980 + }, + { + "epoch": 17.822465031578133, + "grad_norm": 1.6486953878069872, + "learning_rate": 8.91157780533168e-07, + "loss": 0.8471, + "step": 229990 + }, + { + "epoch": 17.82323995505444, + "grad_norm": 1.640103206830258, + "learning_rate": 8.911965282083075e-07, + "loss": 0.8535, + "step": 230000 + }, + { + "epoch": 17.82323995505444, + "eval_loss": 0.8882104754447937, + "eval_runtime": 329.5248, + "eval_samples_per_second": 34.811, + "eval_steps_per_second": 8.703, + "step": 230000 + }, + { + "epoch": 17.824014878530747, + "grad_norm": 1.6498001928940407, + "learning_rate": 8.912352758834471e-07, + "loss": 0.8446, + "step": 230010 + }, + { + "epoch": 17.82478980200705, + "grad_norm": 1.7080922261204126, + "learning_rate": 8.912740235585865e-07, + "loss": 0.8346, + "step": 230020 + }, + { + "epoch": 17.825564725483357, + "grad_norm": 1.566427896031055, + "learning_rate": 8.91312771233726e-07, + "loss": 0.8441, + "step": 230030 + }, + { + "epoch": 17.826339648959664, + "grad_norm": 1.7319486947841254, + "learning_rate": 8.913515189088655e-07, + "loss": 0.8433, + "step": 230040 + }, + { + "epoch": 17.82711457243597, + "grad_norm": 1.6247159794879502, + "learning_rate": 8.913902665840051e-07, + "loss": 0.859, + "step": 230050 + }, + { + "epoch": 17.827889495912277, + "grad_norm": 1.7595931694554086, + "learning_rate": 8.914290142591445e-07, + "loss": 0.8473, + "step": 230060 + }, + { + "epoch": 17.828664419388584, + "grad_norm": 1.5918754264538062, + "learning_rate": 8.91467761934284e-07, + "loss": 0.856, + "step": 230070 + }, + { + "epoch": 17.82943934286489, + "grad_norm": 1.6419365703271522, + "learning_rate": 8.915065096094234e-07, + "loss": 0.852, + "step": 230080 + }, + { + "epoch": 17.830214266341198, + "grad_norm": 1.7171225521788887, + "learning_rate": 8.915452572845631e-07, + "loss": 0.8258, + "step": 230090 + }, + { + "epoch": 17.830989189817505, + "grad_norm": 1.680752113386224, + "learning_rate": 8.915840049597025e-07, + "loss": 0.8648, + "step": 230100 + }, + { + "epoch": 17.83176411329381, + "grad_norm": 1.6638028192928147, + "learning_rate": 8.91622752634842e-07, + "loss": 0.8612, + "step": 230110 + }, + { + "epoch": 17.832539036770118, + "grad_norm": 1.5869790935256467, + "learning_rate": 8.916615003099814e-07, + "loss": 0.8327, + "step": 230120 + }, + { + "epoch": 17.833313960246425, + "grad_norm": 1.67367424537967, + "learning_rate": 8.917002479851209e-07, + "loss": 0.8454, + "step": 230130 + }, + { + "epoch": 17.834088883722732, + "grad_norm": 1.5812604412539373, + "learning_rate": 8.917389956602604e-07, + "loss": 0.8514, + "step": 230140 + }, + { + "epoch": 17.83486380719904, + "grad_norm": 1.6009899118796664, + "learning_rate": 8.917777433354e-07, + "loss": 0.8512, + "step": 230150 + }, + { + "epoch": 17.835638730675345, + "grad_norm": 1.960263702468375, + "learning_rate": 8.918164910105394e-07, + "loss": 0.8557, + "step": 230160 + }, + { + "epoch": 17.836413654151652, + "grad_norm": 1.651584406766894, + "learning_rate": 8.918552386856789e-07, + "loss": 0.8627, + "step": 230170 + }, + { + "epoch": 17.83718857762796, + "grad_norm": 1.5541636599546889, + "learning_rate": 8.918939863608183e-07, + "loss": 0.8531, + "step": 230180 + }, + { + "epoch": 17.837963501104266, + "grad_norm": 1.7438654269983183, + "learning_rate": 8.91932734035958e-07, + "loss": 0.8403, + "step": 230190 + }, + { + "epoch": 17.838738424580573, + "grad_norm": 1.6508425312412296, + "learning_rate": 8.919714817110974e-07, + "loss": 0.843, + "step": 230200 + }, + { + "epoch": 17.83951334805688, + "grad_norm": 1.6544246606608735, + "learning_rate": 8.920102293862369e-07, + "loss": 0.8502, + "step": 230210 + }, + { + "epoch": 17.840288271533186, + "grad_norm": 1.5722476213574363, + "learning_rate": 8.920489770613763e-07, + "loss": 0.8566, + "step": 230220 + }, + { + "epoch": 17.841063195009493, + "grad_norm": 1.5372353220846011, + "learning_rate": 8.920877247365159e-07, + "loss": 0.8454, + "step": 230230 + }, + { + "epoch": 17.8418381184858, + "grad_norm": 1.5535104598229958, + "learning_rate": 8.921264724116554e-07, + "loss": 0.8416, + "step": 230240 + }, + { + "epoch": 17.842613041962107, + "grad_norm": 1.589674296369334, + "learning_rate": 8.921652200867949e-07, + "loss": 0.8607, + "step": 230250 + }, + { + "epoch": 17.843387965438414, + "grad_norm": 1.6142811728202229, + "learning_rate": 8.922039677619343e-07, + "loss": 0.8358, + "step": 230260 + }, + { + "epoch": 17.84416288891472, + "grad_norm": 1.5097712829759817, + "learning_rate": 8.922427154370738e-07, + "loss": 0.851, + "step": 230270 + }, + { + "epoch": 17.844937812391027, + "grad_norm": 1.592083344086325, + "learning_rate": 8.922814631122132e-07, + "loss": 0.8635, + "step": 230280 + }, + { + "epoch": 17.845712735867334, + "grad_norm": 1.6302739850807697, + "learning_rate": 8.923202107873529e-07, + "loss": 0.8626, + "step": 230290 + }, + { + "epoch": 17.84648765934364, + "grad_norm": 1.6718478465038948, + "learning_rate": 8.923589584624923e-07, + "loss": 0.8437, + "step": 230300 + }, + { + "epoch": 17.847262582819948, + "grad_norm": 1.621023024216165, + "learning_rate": 8.923977061376318e-07, + "loss": 0.8355, + "step": 230310 + }, + { + "epoch": 17.848037506296254, + "grad_norm": 1.6043218944286468, + "learning_rate": 8.924364538127712e-07, + "loss": 0.8511, + "step": 230320 + }, + { + "epoch": 17.84881242977256, + "grad_norm": 1.5840597334102355, + "learning_rate": 8.924752014879108e-07, + "loss": 0.8538, + "step": 230330 + }, + { + "epoch": 17.849587353248868, + "grad_norm": 1.6348168782344104, + "learning_rate": 8.925139491630503e-07, + "loss": 0.8407, + "step": 230340 + }, + { + "epoch": 17.850362276725175, + "grad_norm": 1.6894769618522563, + "learning_rate": 8.925526968381898e-07, + "loss": 0.8548, + "step": 230350 + }, + { + "epoch": 17.85113720020148, + "grad_norm": 1.687747829877015, + "learning_rate": 8.925914445133292e-07, + "loss": 0.8492, + "step": 230360 + }, + { + "epoch": 17.85191212367779, + "grad_norm": 1.6472234436675617, + "learning_rate": 8.926301921884687e-07, + "loss": 0.8435, + "step": 230370 + }, + { + "epoch": 17.85268704715409, + "grad_norm": 1.7156834550194633, + "learning_rate": 8.926689398636082e-07, + "loss": 0.841, + "step": 230380 + }, + { + "epoch": 17.8534619706304, + "grad_norm": 1.720339920117951, + "learning_rate": 8.927076875387478e-07, + "loss": 0.8526, + "step": 230390 + }, + { + "epoch": 17.854236894106705, + "grad_norm": 1.755247197310395, + "learning_rate": 8.927464352138872e-07, + "loss": 0.8638, + "step": 230400 + }, + { + "epoch": 17.855011817583012, + "grad_norm": 1.4600543832565935, + "learning_rate": 8.927851828890267e-07, + "loss": 0.8638, + "step": 230410 + }, + { + "epoch": 17.85578674105932, + "grad_norm": 1.6129677051179419, + "learning_rate": 8.928239305641661e-07, + "loss": 0.8467, + "step": 230420 + }, + { + "epoch": 17.856561664535626, + "grad_norm": 1.6747461182560093, + "learning_rate": 8.928626782393057e-07, + "loss": 0.852, + "step": 230430 + }, + { + "epoch": 17.857336588011933, + "grad_norm": 1.5769699265272683, + "learning_rate": 8.929014259144452e-07, + "loss": 0.8407, + "step": 230440 + }, + { + "epoch": 17.85811151148824, + "grad_norm": 1.7211224378776404, + "learning_rate": 8.929401735895847e-07, + "loss": 0.8511, + "step": 230450 + }, + { + "epoch": 17.858886434964546, + "grad_norm": 1.6176427868375307, + "learning_rate": 8.929789212647241e-07, + "loss": 0.8388, + "step": 230460 + }, + { + "epoch": 17.859661358440853, + "grad_norm": 1.7003669300904125, + "learning_rate": 8.930176689398637e-07, + "loss": 0.8339, + "step": 230470 + }, + { + "epoch": 17.86043628191716, + "grad_norm": 1.6161605818247273, + "learning_rate": 8.930564166150031e-07, + "loss": 0.8511, + "step": 230480 + }, + { + "epoch": 17.861211205393467, + "grad_norm": 1.5502603844510847, + "learning_rate": 8.930951642901427e-07, + "loss": 0.8329, + "step": 230490 + }, + { + "epoch": 17.861986128869773, + "grad_norm": 1.6318366286969144, + "learning_rate": 8.931339119652821e-07, + "loss": 0.8474, + "step": 230500 + }, + { + "epoch": 17.861986128869773, + "eval_loss": 0.8879110217094421, + "eval_runtime": 328.3778, + "eval_samples_per_second": 34.932, + "eval_steps_per_second": 8.734, + "step": 230500 + }, + { + "epoch": 17.86276105234608, + "grad_norm": 1.6557339636606203, + "learning_rate": 8.931726596404216e-07, + "loss": 0.8574, + "step": 230510 + }, + { + "epoch": 17.863535975822387, + "grad_norm": 1.581573885802168, + "learning_rate": 8.932114073155611e-07, + "loss": 0.8481, + "step": 230520 + }, + { + "epoch": 17.864310899298694, + "grad_norm": 1.7162722289151477, + "learning_rate": 8.932501549907007e-07, + "loss": 0.8498, + "step": 230530 + }, + { + "epoch": 17.865085822775, + "grad_norm": 1.5844898265831842, + "learning_rate": 8.932889026658401e-07, + "loss": 0.8499, + "step": 230540 + }, + { + "epoch": 17.865860746251307, + "grad_norm": 1.663937687817391, + "learning_rate": 8.933276503409796e-07, + "loss": 0.8464, + "step": 230550 + }, + { + "epoch": 17.866635669727614, + "grad_norm": 1.5369840680165519, + "learning_rate": 8.93366398016119e-07, + "loss": 0.8498, + "step": 230560 + }, + { + "epoch": 17.86741059320392, + "grad_norm": 1.7763643305133672, + "learning_rate": 8.934051456912586e-07, + "loss": 0.8472, + "step": 230570 + }, + { + "epoch": 17.868185516680228, + "grad_norm": 1.6241600540926238, + "learning_rate": 8.93443893366398e-07, + "loss": 0.8296, + "step": 230580 + }, + { + "epoch": 17.868960440156535, + "grad_norm": 1.6667400979834668, + "learning_rate": 8.934826410415376e-07, + "loss": 0.8566, + "step": 230590 + }, + { + "epoch": 17.86973536363284, + "grad_norm": 1.6369271175464557, + "learning_rate": 8.93521388716677e-07, + "loss": 0.8448, + "step": 230600 + }, + { + "epoch": 17.87051028710915, + "grad_norm": 1.5606620579857273, + "learning_rate": 8.935601363918166e-07, + "loss": 0.8439, + "step": 230610 + }, + { + "epoch": 17.871285210585455, + "grad_norm": 1.622683637311672, + "learning_rate": 8.93598884066956e-07, + "loss": 0.8499, + "step": 230620 + }, + { + "epoch": 17.872060134061762, + "grad_norm": 1.7273749156593443, + "learning_rate": 8.936376317420956e-07, + "loss": 0.8471, + "step": 230630 + }, + { + "epoch": 17.87283505753807, + "grad_norm": 1.5150039263842368, + "learning_rate": 8.93676379417235e-07, + "loss": 0.85, + "step": 230640 + }, + { + "epoch": 17.873609981014376, + "grad_norm": 1.7041881349315802, + "learning_rate": 8.937151270923745e-07, + "loss": 0.8684, + "step": 230650 + }, + { + "epoch": 17.874384904490682, + "grad_norm": 1.67797138234274, + "learning_rate": 8.93753874767514e-07, + "loss": 0.8587, + "step": 230660 + }, + { + "epoch": 17.87515982796699, + "grad_norm": 1.5751685705040013, + "learning_rate": 8.937926224426535e-07, + "loss": 0.8418, + "step": 230670 + }, + { + "epoch": 17.875934751443296, + "grad_norm": 1.5746379854958898, + "learning_rate": 8.93831370117793e-07, + "loss": 0.8756, + "step": 230680 + }, + { + "epoch": 17.876709674919603, + "grad_norm": 1.705728928175762, + "learning_rate": 8.938701177929325e-07, + "loss": 0.8481, + "step": 230690 + }, + { + "epoch": 17.87748459839591, + "grad_norm": 1.628938619426208, + "learning_rate": 8.939088654680719e-07, + "loss": 0.866, + "step": 230700 + }, + { + "epoch": 17.878259521872216, + "grad_norm": 1.6099111489026667, + "learning_rate": 8.939476131432115e-07, + "loss": 0.8486, + "step": 230710 + }, + { + "epoch": 17.879034445348523, + "grad_norm": 1.6866442221607694, + "learning_rate": 8.939863608183509e-07, + "loss": 0.8429, + "step": 230720 + }, + { + "epoch": 17.87980936882483, + "grad_norm": 1.606579092107319, + "learning_rate": 8.940251084934905e-07, + "loss": 0.8291, + "step": 230730 + }, + { + "epoch": 17.880584292301137, + "grad_norm": 1.5251685104278525, + "learning_rate": 8.940638561686299e-07, + "loss": 0.8365, + "step": 230740 + }, + { + "epoch": 17.881359215777444, + "grad_norm": 1.8153300498500184, + "learning_rate": 8.941026038437695e-07, + "loss": 0.8339, + "step": 230750 + }, + { + "epoch": 17.88213413925375, + "grad_norm": 1.7367976273970491, + "learning_rate": 8.941413515189089e-07, + "loss": 0.8499, + "step": 230760 + }, + { + "epoch": 17.882909062730054, + "grad_norm": 1.653107204784218, + "learning_rate": 8.941800991940484e-07, + "loss": 0.8533, + "step": 230770 + }, + { + "epoch": 17.88368398620636, + "grad_norm": 1.613465310376871, + "learning_rate": 8.942188468691879e-07, + "loss": 0.8456, + "step": 230780 + }, + { + "epoch": 17.884458909682667, + "grad_norm": 1.6675475083954778, + "learning_rate": 8.942575945443274e-07, + "loss": 0.8472, + "step": 230790 + }, + { + "epoch": 17.885233833158974, + "grad_norm": 1.8467296519414256, + "learning_rate": 8.942963422194669e-07, + "loss": 0.8401, + "step": 230800 + }, + { + "epoch": 17.88600875663528, + "grad_norm": 1.6868936978441338, + "learning_rate": 8.943350898946064e-07, + "loss": 0.8392, + "step": 230810 + }, + { + "epoch": 17.886783680111588, + "grad_norm": 1.6700799260933334, + "learning_rate": 8.943738375697458e-07, + "loss": 0.8351, + "step": 230820 + }, + { + "epoch": 17.887558603587895, + "grad_norm": 1.6875247862872342, + "learning_rate": 8.944125852448854e-07, + "loss": 0.8485, + "step": 230830 + }, + { + "epoch": 17.8883335270642, + "grad_norm": 1.5369180161463138, + "learning_rate": 8.944513329200248e-07, + "loss": 0.8438, + "step": 230840 + }, + { + "epoch": 17.88910845054051, + "grad_norm": 1.7669127433977874, + "learning_rate": 8.944900805951644e-07, + "loss": 0.8532, + "step": 230850 + }, + { + "epoch": 17.889883374016815, + "grad_norm": 1.6055060211658547, + "learning_rate": 8.945288282703038e-07, + "loss": 0.8562, + "step": 230860 + }, + { + "epoch": 17.890658297493122, + "grad_norm": 1.6175125576913272, + "learning_rate": 8.945675759454433e-07, + "loss": 0.8507, + "step": 230870 + }, + { + "epoch": 17.89143322096943, + "grad_norm": 1.5714005049000632, + "learning_rate": 8.946063236205828e-07, + "loss": 0.8399, + "step": 230880 + }, + { + "epoch": 17.892208144445735, + "grad_norm": 1.6160592779193255, + "learning_rate": 8.946450712957224e-07, + "loss": 0.8514, + "step": 230890 + }, + { + "epoch": 17.892983067922042, + "grad_norm": 1.7432759458461269, + "learning_rate": 8.946838189708618e-07, + "loss": 0.8712, + "step": 230900 + }, + { + "epoch": 17.89375799139835, + "grad_norm": 1.7079844688637391, + "learning_rate": 8.947225666460013e-07, + "loss": 0.8415, + "step": 230910 + }, + { + "epoch": 17.894532914874656, + "grad_norm": 1.7995805787937336, + "learning_rate": 8.947613143211407e-07, + "loss": 0.8565, + "step": 230920 + }, + { + "epoch": 17.895307838350963, + "grad_norm": 1.5766212477755426, + "learning_rate": 8.948000619962803e-07, + "loss": 0.8407, + "step": 230930 + }, + { + "epoch": 17.89608276182727, + "grad_norm": 1.5780615448321191, + "learning_rate": 8.948388096714197e-07, + "loss": 0.8403, + "step": 230940 + }, + { + "epoch": 17.896857685303576, + "grad_norm": 1.6497000018095402, + "learning_rate": 8.948775573465593e-07, + "loss": 0.8425, + "step": 230950 + }, + { + "epoch": 17.897632608779883, + "grad_norm": 1.724909722803827, + "learning_rate": 8.949163050216987e-07, + "loss": 0.8394, + "step": 230960 + }, + { + "epoch": 17.89840753225619, + "grad_norm": 1.6035534474772184, + "learning_rate": 8.949550526968382e-07, + "loss": 0.8613, + "step": 230970 + }, + { + "epoch": 17.899182455732497, + "grad_norm": 1.6825775796761757, + "learning_rate": 8.949938003719777e-07, + "loss": 0.8466, + "step": 230980 + }, + { + "epoch": 17.899957379208804, + "grad_norm": 1.7435679623365523, + "learning_rate": 8.950325480471173e-07, + "loss": 0.8395, + "step": 230990 + }, + { + "epoch": 17.90073230268511, + "grad_norm": 1.7353497428181808, + "learning_rate": 8.950712957222567e-07, + "loss": 0.8399, + "step": 231000 + }, + { + "epoch": 17.90073230268511, + "eval_loss": 0.8882753252983093, + "eval_runtime": 332.4699, + "eval_samples_per_second": 34.502, + "eval_steps_per_second": 8.626, + "step": 231000 + }, + { + "epoch": 17.901507226161417, + "grad_norm": 1.9762343696065534, + "learning_rate": 8.951100433973962e-07, + "loss": 0.8583, + "step": 231010 + }, + { + "epoch": 17.902282149637724, + "grad_norm": 1.5697405995741645, + "learning_rate": 8.951487910725356e-07, + "loss": 0.8616, + "step": 231020 + }, + { + "epoch": 17.90305707311403, + "grad_norm": 1.6445923593711602, + "learning_rate": 8.951875387476753e-07, + "loss": 0.8579, + "step": 231030 + }, + { + "epoch": 17.903831996590338, + "grad_norm": 1.6099624497322556, + "learning_rate": 8.952262864228147e-07, + "loss": 0.8272, + "step": 231040 + }, + { + "epoch": 17.904606920066644, + "grad_norm": 1.6368458060128086, + "learning_rate": 8.952650340979542e-07, + "loss": 0.8582, + "step": 231050 + }, + { + "epoch": 17.90538184354295, + "grad_norm": 1.7934128448232773, + "learning_rate": 8.953037817730936e-07, + "loss": 0.8582, + "step": 231060 + }, + { + "epoch": 17.906156767019258, + "grad_norm": 1.6305840572926713, + "learning_rate": 8.953425294482331e-07, + "loss": 0.841, + "step": 231070 + }, + { + "epoch": 17.906931690495565, + "grad_norm": 1.643989553763036, + "learning_rate": 8.953812771233726e-07, + "loss": 0.8631, + "step": 231080 + }, + { + "epoch": 17.90770661397187, + "grad_norm": 1.543145150687577, + "learning_rate": 8.954200247985122e-07, + "loss": 0.8524, + "step": 231090 + }, + { + "epoch": 17.90848153744818, + "grad_norm": 1.674132726438768, + "learning_rate": 8.954587724736516e-07, + "loss": 0.8415, + "step": 231100 + }, + { + "epoch": 17.909256460924485, + "grad_norm": 1.6000786464350105, + "learning_rate": 8.954975201487911e-07, + "loss": 0.8518, + "step": 231110 + }, + { + "epoch": 17.910031384400792, + "grad_norm": 1.695056016619527, + "learning_rate": 8.955362678239305e-07, + "loss": 0.8275, + "step": 231120 + }, + { + "epoch": 17.910806307877095, + "grad_norm": 1.666429683260396, + "learning_rate": 8.955750154990702e-07, + "loss": 0.8468, + "step": 231130 + }, + { + "epoch": 17.911581231353402, + "grad_norm": 1.6226056888576275, + "learning_rate": 8.956137631742096e-07, + "loss": 0.8412, + "step": 231140 + }, + { + "epoch": 17.91235615482971, + "grad_norm": 1.6308614113286652, + "learning_rate": 8.956525108493491e-07, + "loss": 0.8416, + "step": 231150 + }, + { + "epoch": 17.913131078306016, + "grad_norm": 1.671678466180736, + "learning_rate": 8.956912585244885e-07, + "loss": 0.8572, + "step": 231160 + }, + { + "epoch": 17.913906001782323, + "grad_norm": 1.6359536372643178, + "learning_rate": 8.957300061996282e-07, + "loss": 0.8371, + "step": 231170 + }, + { + "epoch": 17.91468092525863, + "grad_norm": 1.5826845198337227, + "learning_rate": 8.957687538747676e-07, + "loss": 0.8514, + "step": 231180 + }, + { + "epoch": 17.915455848734936, + "grad_norm": 1.5856251253243097, + "learning_rate": 8.958075015499071e-07, + "loss": 0.8389, + "step": 231190 + }, + { + "epoch": 17.916230772211243, + "grad_norm": 1.5978039101455446, + "learning_rate": 8.958462492250465e-07, + "loss": 0.8622, + "step": 231200 + }, + { + "epoch": 17.91700569568755, + "grad_norm": 1.5958116153644035, + "learning_rate": 8.95884996900186e-07, + "loss": 0.8555, + "step": 231210 + }, + { + "epoch": 17.917780619163857, + "grad_norm": 1.5545397001824237, + "learning_rate": 8.959237445753254e-07, + "loss": 0.8521, + "step": 231220 + }, + { + "epoch": 17.918555542640163, + "grad_norm": 1.527276993126569, + "learning_rate": 8.959624922504651e-07, + "loss": 0.8527, + "step": 231230 + }, + { + "epoch": 17.91933046611647, + "grad_norm": 1.6547127320377293, + "learning_rate": 8.960012399256045e-07, + "loss": 0.8452, + "step": 231240 + }, + { + "epoch": 17.920105389592777, + "grad_norm": 1.636461926500864, + "learning_rate": 8.96039987600744e-07, + "loss": 0.8777, + "step": 231250 + }, + { + "epoch": 17.920880313069084, + "grad_norm": 1.5907273516826326, + "learning_rate": 8.960787352758834e-07, + "loss": 0.8404, + "step": 231260 + }, + { + "epoch": 17.92165523654539, + "grad_norm": 1.6476981057625992, + "learning_rate": 8.961174829510231e-07, + "loss": 0.8446, + "step": 231270 + }, + { + "epoch": 17.922430160021698, + "grad_norm": 1.7845148604718883, + "learning_rate": 8.961562306261625e-07, + "loss": 0.8492, + "step": 231280 + }, + { + "epoch": 17.923205083498004, + "grad_norm": 1.66048548381395, + "learning_rate": 8.96194978301302e-07, + "loss": 0.8256, + "step": 231290 + }, + { + "epoch": 17.92398000697431, + "grad_norm": 1.6557885989577783, + "learning_rate": 8.962337259764414e-07, + "loss": 0.8506, + "step": 231300 + }, + { + "epoch": 17.924754930450618, + "grad_norm": 1.5314315764837763, + "learning_rate": 8.96272473651581e-07, + "loss": 0.8461, + "step": 231310 + }, + { + "epoch": 17.925529853926925, + "grad_norm": 1.5695663433426013, + "learning_rate": 8.963112213267205e-07, + "loss": 0.8401, + "step": 231320 + }, + { + "epoch": 17.92630477740323, + "grad_norm": 1.570598456493717, + "learning_rate": 8.9634996900186e-07, + "loss": 0.8456, + "step": 231330 + }, + { + "epoch": 17.92707970087954, + "grad_norm": 1.5798593070850422, + "learning_rate": 8.963887166769994e-07, + "loss": 0.8408, + "step": 231340 + }, + { + "epoch": 17.927854624355845, + "grad_norm": 1.5223804316877554, + "learning_rate": 8.964274643521389e-07, + "loss": 0.8415, + "step": 231350 + }, + { + "epoch": 17.928629547832152, + "grad_norm": 1.6077989927197298, + "learning_rate": 8.964662120272783e-07, + "loss": 0.8398, + "step": 231360 + }, + { + "epoch": 17.92940447130846, + "grad_norm": 1.6619203930882849, + "learning_rate": 8.96504959702418e-07, + "loss": 0.8292, + "step": 231370 + }, + { + "epoch": 17.930179394784766, + "grad_norm": 1.672284787981525, + "learning_rate": 8.965437073775574e-07, + "loss": 0.8549, + "step": 231380 + }, + { + "epoch": 17.930954318261072, + "grad_norm": 1.5821784814921342, + "learning_rate": 8.965824550526969e-07, + "loss": 0.8535, + "step": 231390 + }, + { + "epoch": 17.93172924173738, + "grad_norm": 1.7897446382005184, + "learning_rate": 8.966212027278363e-07, + "loss": 0.8691, + "step": 231400 + }, + { + "epoch": 17.932504165213686, + "grad_norm": 1.5828245091146584, + "learning_rate": 8.966599504029759e-07, + "loss": 0.8317, + "step": 231410 + }, + { + "epoch": 17.933279088689993, + "grad_norm": 1.7074291770337444, + "learning_rate": 8.966986980781154e-07, + "loss": 0.8375, + "step": 231420 + }, + { + "epoch": 17.9340540121663, + "grad_norm": 1.5534091659584437, + "learning_rate": 8.967374457532549e-07, + "loss": 0.8593, + "step": 231430 + }, + { + "epoch": 17.934828935642606, + "grad_norm": 1.4695168202514455, + "learning_rate": 8.967761934283943e-07, + "loss": 0.8634, + "step": 231440 + }, + { + "epoch": 17.935603859118913, + "grad_norm": 1.720192079046523, + "learning_rate": 8.968149411035339e-07, + "loss": 0.8587, + "step": 231450 + }, + { + "epoch": 17.93637878259522, + "grad_norm": 1.570167191168235, + "learning_rate": 8.968536887786733e-07, + "loss": 0.8423, + "step": 231460 + }, + { + "epoch": 17.937153706071527, + "grad_norm": 1.7454815389651712, + "learning_rate": 8.968924364538129e-07, + "loss": 0.8354, + "step": 231470 + }, + { + "epoch": 17.937928629547834, + "grad_norm": 1.6843781463152863, + "learning_rate": 8.969311841289523e-07, + "loss": 0.8447, + "step": 231480 + }, + { + "epoch": 17.93870355302414, + "grad_norm": 1.64606351014982, + "learning_rate": 8.969699318040918e-07, + "loss": 0.8453, + "step": 231490 + }, + { + "epoch": 17.939478476500447, + "grad_norm": 1.6081778494870993, + "learning_rate": 8.970086794792312e-07, + "loss": 0.8601, + "step": 231500 + }, + { + "epoch": 17.939478476500447, + "eval_loss": 0.8879019021987915, + "eval_runtime": 332.2617, + "eval_samples_per_second": 34.524, + "eval_steps_per_second": 8.632, + "step": 231500 + } + ], + "logging_steps": 10, + "max_steps": 25808000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2000, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7969797920784384.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}