{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0006826715212198, "eval_steps": 275, "global_step": 1099, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009102286949596086, "grad_norm": 0.419871062040329, "learning_rate": 2e-05, "loss": 2.8811, "step": 1 }, { "epoch": 0.0009102286949596086, "eval_loss": 2.6547484397888184, "eval_runtime": 203.8787, "eval_samples_per_second": 9.079, "eval_steps_per_second": 4.542, "step": 1 }, { "epoch": 0.0018204573899192173, "grad_norm": 0.47298941016197205, "learning_rate": 4e-05, "loss": 2.8541, "step": 2 }, { "epoch": 0.0027306860848788257, "grad_norm": 0.48272326588630676, "learning_rate": 6e-05, "loss": 2.6363, "step": 3 }, { "epoch": 0.0036409147798384346, "grad_norm": 0.4156210422515869, "learning_rate": 8e-05, "loss": 2.666, "step": 4 }, { "epoch": 0.004551143474798043, "grad_norm": 0.375728964805603, "learning_rate": 0.0001, "loss": 2.7432, "step": 5 }, { "epoch": 0.0054613721697576514, "grad_norm": 0.3113997280597687, "learning_rate": 0.00012, "loss": 2.7398, "step": 6 }, { "epoch": 0.00637160086471726, "grad_norm": 0.4520895183086395, "learning_rate": 0.00014, "loss": 2.7017, "step": 7 }, { "epoch": 0.007281829559676869, "grad_norm": 0.5530128479003906, "learning_rate": 0.00016, "loss": 2.6855, "step": 8 }, { "epoch": 0.008192058254636477, "grad_norm": 0.43422171473503113, "learning_rate": 0.00018, "loss": 2.6454, "step": 9 }, { "epoch": 0.009102286949596085, "grad_norm": 0.3173878490924835, "learning_rate": 0.0002, "loss": 2.6638, "step": 10 }, { "epoch": 0.010012515644555695, "grad_norm": 0.3479019105434418, "learning_rate": 0.00019999958388469571, "loss": 2.5689, "step": 11 }, { "epoch": 0.010922744339515303, "grad_norm": 0.36122018098831177, "learning_rate": 0.00019999833554224577, "loss": 2.4933, "step": 12 }, { "epoch": 0.011832973034474913, "grad_norm": 0.3206821382045746, "learning_rate": 0.00019999625498303932, "loss": 2.5661, "step": 13 }, { "epoch": 0.01274320172943452, "grad_norm": 0.38150081038475037, "learning_rate": 0.00019999334222439147, "loss": 2.5545, "step": 14 }, { "epoch": 0.013653430424394129, "grad_norm": 0.3939971625804901, "learning_rate": 0.00019998959729054295, "loss": 2.5737, "step": 15 }, { "epoch": 0.014563659119353738, "grad_norm": 0.3015269339084625, "learning_rate": 0.0001999850202126604, "loss": 2.524, "step": 16 }, { "epoch": 0.015473887814313346, "grad_norm": 0.3177410662174225, "learning_rate": 0.00019997961102883552, "loss": 2.4203, "step": 17 }, { "epoch": 0.016384116509272954, "grad_norm": 0.3253481388092041, "learning_rate": 0.00019997336978408531, "loss": 2.5163, "step": 18 }, { "epoch": 0.017294345204232564, "grad_norm": 0.3166387975215912, "learning_rate": 0.00019996629653035126, "loss": 2.2021, "step": 19 }, { "epoch": 0.01820457389919217, "grad_norm": 0.32843250036239624, "learning_rate": 0.00019995839132649917, "loss": 2.4009, "step": 20 }, { "epoch": 0.01911480259415178, "grad_norm": 0.36471301317214966, "learning_rate": 0.00019994965423831854, "loss": 2.4734, "step": 21 }, { "epoch": 0.02002503128911139, "grad_norm": 0.36941981315612793, "learning_rate": 0.0001999400853385221, "loss": 2.4731, "step": 22 }, { "epoch": 0.020935259984071, "grad_norm": 0.3467971682548523, "learning_rate": 0.0001999296847067452, "loss": 2.4294, "step": 23 }, { "epoch": 0.021845488679030606, "grad_norm": 0.337288498878479, "learning_rate": 0.00019991845242954505, "loss": 2.0426, "step": 24 }, { "epoch": 0.022755717373990215, "grad_norm": 0.36415189504623413, "learning_rate": 0.00019990638860040006, "loss": 2.2803, "step": 25 }, { "epoch": 0.023665946068949825, "grad_norm": 0.37340012192726135, "learning_rate": 0.00019989349331970923, "loss": 2.2841, "step": 26 }, { "epoch": 0.02457617476390943, "grad_norm": 0.3849464952945709, "learning_rate": 0.00019987976669479088, "loss": 2.206, "step": 27 }, { "epoch": 0.02548640345886904, "grad_norm": 0.41922691464424133, "learning_rate": 0.00019986520883988232, "loss": 2.3736, "step": 28 }, { "epoch": 0.02639663215382865, "grad_norm": 0.41973674297332764, "learning_rate": 0.0001998498198761384, "loss": 2.3605, "step": 29 }, { "epoch": 0.027306860848788257, "grad_norm": 0.3731151223182678, "learning_rate": 0.00019983359993163078, "loss": 2.0799, "step": 30 }, { "epoch": 0.028217089543747867, "grad_norm": 0.41079527139663696, "learning_rate": 0.00019981654914134686, "loss": 2.3176, "step": 31 }, { "epoch": 0.029127318238707477, "grad_norm": 0.43309271335601807, "learning_rate": 0.00019979866764718843, "loss": 2.3306, "step": 32 }, { "epoch": 0.030037546933667083, "grad_norm": 0.45209765434265137, "learning_rate": 0.0001997799555979709, "loss": 2.3582, "step": 33 }, { "epoch": 0.030947775628626693, "grad_norm": 0.4335600435733795, "learning_rate": 0.00019976041314942155, "loss": 2.1077, "step": 34 }, { "epoch": 0.0318580043235863, "grad_norm": 0.43455106019973755, "learning_rate": 0.0001997400404641787, "loss": 2.2497, "step": 35 }, { "epoch": 0.03276823301854591, "grad_norm": 0.4201047718524933, "learning_rate": 0.00019971883771179003, "loss": 2.1547, "step": 36 }, { "epoch": 0.03367846171350552, "grad_norm": 0.43784937262535095, "learning_rate": 0.00019969680506871137, "loss": 2.3883, "step": 37 }, { "epoch": 0.03458869040846513, "grad_norm": 0.4830430746078491, "learning_rate": 0.00019967394271830504, "loss": 2.5713, "step": 38 }, { "epoch": 0.03549891910342474, "grad_norm": 0.4729761779308319, "learning_rate": 0.00019965025085083858, "loss": 2.3565, "step": 39 }, { "epoch": 0.03640914779838434, "grad_norm": 0.43940603733062744, "learning_rate": 0.000199625729663483, "loss": 2.1644, "step": 40 }, { "epoch": 0.03731937649334395, "grad_norm": 0.5039120316505432, "learning_rate": 0.00019960037936031104, "loss": 2.3641, "step": 41 }, { "epoch": 0.03822960518830356, "grad_norm": 0.487318217754364, "learning_rate": 0.00019957420015229572, "loss": 2.4128, "step": 42 }, { "epoch": 0.03913983388326317, "grad_norm": 0.5216456651687622, "learning_rate": 0.00019954719225730847, "loss": 2.3659, "step": 43 }, { "epoch": 0.04005006257822278, "grad_norm": 0.4877496659755707, "learning_rate": 0.00019951935590011718, "loss": 2.2571, "step": 44 }, { "epoch": 0.04096029127318239, "grad_norm": 0.4974242150783539, "learning_rate": 0.0001994906913123846, "loss": 2.1847, "step": 45 }, { "epoch": 0.041870519968142, "grad_norm": 0.5708010196685791, "learning_rate": 0.00019946119873266613, "loss": 2.6954, "step": 46 }, { "epoch": 0.0427807486631016, "grad_norm": 0.5892224311828613, "learning_rate": 0.00019943087840640814, "loss": 2.5149, "step": 47 }, { "epoch": 0.04369097735806121, "grad_norm": 0.6740038394927979, "learning_rate": 0.0001993997305859456, "loss": 2.6992, "step": 48 }, { "epoch": 0.04460120605302082, "grad_norm": 0.7869267463684082, "learning_rate": 0.0001993677555305002, "loss": 2.5103, "step": 49 }, { "epoch": 0.04551143474798043, "grad_norm": 1.244491457939148, "learning_rate": 0.00019933495350617813, "loss": 2.5167, "step": 50 }, { "epoch": 0.04642166344294004, "grad_norm": 1.1311120986938477, "learning_rate": 0.00019930132478596796, "loss": 2.7147, "step": 51 }, { "epoch": 0.04733189213789965, "grad_norm": 0.5104111433029175, "learning_rate": 0.00019926686964973813, "loss": 2.7642, "step": 52 }, { "epoch": 0.04824212083285925, "grad_norm": 0.4251229465007782, "learning_rate": 0.00019923158838423482, "loss": 2.4562, "step": 53 }, { "epoch": 0.04915234952781886, "grad_norm": 0.45901840925216675, "learning_rate": 0.00019919548128307954, "loss": 2.6106, "step": 54 }, { "epoch": 0.05006257822277847, "grad_norm": 0.4177067279815674, "learning_rate": 0.00019915854864676664, "loss": 2.4952, "step": 55 }, { "epoch": 0.05097280691773808, "grad_norm": 0.411510705947876, "learning_rate": 0.00019912079078266085, "loss": 2.6141, "step": 56 }, { "epoch": 0.05188303561269769, "grad_norm": 0.42673829197883606, "learning_rate": 0.0001990822080049946, "loss": 2.5089, "step": 57 }, { "epoch": 0.0527932643076573, "grad_norm": 0.3956531882286072, "learning_rate": 0.0001990428006348656, "loss": 2.4539, "step": 58 }, { "epoch": 0.053703493002616905, "grad_norm": 0.3687964975833893, "learning_rate": 0.00019900256900023413, "loss": 2.3752, "step": 59 }, { "epoch": 0.054613721697576514, "grad_norm": 0.34918078780174255, "learning_rate": 0.00019896151343592008, "loss": 2.2978, "step": 60 }, { "epoch": 0.055523950392536124, "grad_norm": 0.3795469403266907, "learning_rate": 0.00019891963428360043, "loss": 2.4324, "step": 61 }, { "epoch": 0.056434179087495734, "grad_norm": 0.37362977862358093, "learning_rate": 0.00019887693189180633, "loss": 2.3966, "step": 62 }, { "epoch": 0.057344407782455344, "grad_norm": 0.3695915639400482, "learning_rate": 0.00019883340661592015, "loss": 2.3188, "step": 63 }, { "epoch": 0.05825463647741495, "grad_norm": 0.376068651676178, "learning_rate": 0.00019878905881817252, "loss": 2.3933, "step": 64 }, { "epoch": 0.059164865172374556, "grad_norm": 0.3845527470111847, "learning_rate": 0.00019874388886763944, "loss": 2.5612, "step": 65 }, { "epoch": 0.060075093867334166, "grad_norm": 0.3678928315639496, "learning_rate": 0.00019869789714023906, "loss": 2.4174, "step": 66 }, { "epoch": 0.060985322562293776, "grad_norm": 0.3534240126609802, "learning_rate": 0.00019865108401872857, "loss": 2.2843, "step": 67 }, { "epoch": 0.061895551257253385, "grad_norm": 0.3559145927429199, "learning_rate": 0.00019860344989270113, "loss": 2.0911, "step": 68 }, { "epoch": 0.062805779952213, "grad_norm": 0.3940396308898926, "learning_rate": 0.0001985549951585825, "loss": 2.2796, "step": 69 }, { "epoch": 0.0637160086471726, "grad_norm": 0.39386069774627686, "learning_rate": 0.00019850572021962788, "loss": 2.1712, "step": 70 }, { "epoch": 0.06462623734213221, "grad_norm": 0.3923673629760742, "learning_rate": 0.00019845562548591826, "loss": 2.1665, "step": 71 }, { "epoch": 0.06553646603709182, "grad_norm": 0.3916044533252716, "learning_rate": 0.00019840471137435746, "loss": 2.267, "step": 72 }, { "epoch": 0.06644669473205143, "grad_norm": 0.4030390679836273, "learning_rate": 0.00019835297830866826, "loss": 2.2662, "step": 73 }, { "epoch": 0.06735692342701104, "grad_norm": 0.4568188786506653, "learning_rate": 0.00019830042671938904, "loss": 2.4341, "step": 74 }, { "epoch": 0.06826715212197064, "grad_norm": 0.4151420593261719, "learning_rate": 0.00019824705704387028, "loss": 2.2547, "step": 75 }, { "epoch": 0.06917738081693026, "grad_norm": 0.4141649007797241, "learning_rate": 0.00019819286972627066, "loss": 2.157, "step": 76 }, { "epoch": 0.07008760951188986, "grad_norm": 0.3980599343776703, "learning_rate": 0.00019813786521755372, "loss": 2.0681, "step": 77 }, { "epoch": 0.07099783820684948, "grad_norm": 0.41139495372772217, "learning_rate": 0.00019808204397548377, "loss": 2.2297, "step": 78 }, { "epoch": 0.07190806690180908, "grad_norm": 0.4236723482608795, "learning_rate": 0.0001980254064646223, "loss": 2.1775, "step": 79 }, { "epoch": 0.07281829559676868, "grad_norm": 0.413194864988327, "learning_rate": 0.00019796795315632395, "loss": 2.1926, "step": 80 }, { "epoch": 0.0737285242917283, "grad_norm": 0.4185364246368408, "learning_rate": 0.0001979096845287328, "loss": 2.0727, "step": 81 }, { "epoch": 0.0746387529866879, "grad_norm": 0.41983237862586975, "learning_rate": 0.00019785060106677818, "loss": 2.3075, "step": 82 }, { "epoch": 0.07554898168164752, "grad_norm": 0.44238418340682983, "learning_rate": 0.00019779070326217074, "loss": 2.2824, "step": 83 }, { "epoch": 0.07645921037660712, "grad_norm": 0.45874810218811035, "learning_rate": 0.00019772999161339833, "loss": 2.2601, "step": 84 }, { "epoch": 0.07736943907156674, "grad_norm": 0.4516693651676178, "learning_rate": 0.00019766846662572191, "loss": 2.314, "step": 85 }, { "epoch": 0.07827966776652634, "grad_norm": 0.43325892090797424, "learning_rate": 0.00019760612881117125, "loss": 2.1802, "step": 86 }, { "epoch": 0.07918989646148594, "grad_norm": 0.4844377636909485, "learning_rate": 0.00019754297868854073, "loss": 2.3255, "step": 87 }, { "epoch": 0.08010012515644556, "grad_norm": 0.44133615493774414, "learning_rate": 0.00019747901678338496, "loss": 2.2873, "step": 88 }, { "epoch": 0.08101035385140516, "grad_norm": 0.5170537829399109, "learning_rate": 0.00019741424362801452, "loss": 2.3539, "step": 89 }, { "epoch": 0.08192058254636478, "grad_norm": 0.46766072511672974, "learning_rate": 0.00019734865976149145, "loss": 2.2522, "step": 90 }, { "epoch": 0.08283081124132438, "grad_norm": 0.4890718162059784, "learning_rate": 0.00019728226572962473, "loss": 2.4453, "step": 91 }, { "epoch": 0.083741039936284, "grad_norm": 0.481082946062088, "learning_rate": 0.00019721506208496585, "loss": 2.2735, "step": 92 }, { "epoch": 0.0846512686312436, "grad_norm": 0.4898594319820404, "learning_rate": 0.00019714704938680408, "loss": 2.223, "step": 93 }, { "epoch": 0.0855614973262032, "grad_norm": 0.48203563690185547, "learning_rate": 0.00019707822820116193, "loss": 2.2461, "step": 94 }, { "epoch": 0.08647172602116282, "grad_norm": 0.5068606734275818, "learning_rate": 0.00019700859910079036, "loss": 2.1878, "step": 95 }, { "epoch": 0.08738195471612242, "grad_norm": 0.5902338027954102, "learning_rate": 0.00019693816266516407, "loss": 2.6452, "step": 96 }, { "epoch": 0.08829218341108204, "grad_norm": 0.6118764281272888, "learning_rate": 0.00019686691948047664, "loss": 2.4923, "step": 97 }, { "epoch": 0.08920241210604164, "grad_norm": 0.6774008274078369, "learning_rate": 0.00019679487013963564, "loss": 2.7092, "step": 98 }, { "epoch": 0.09011264080100125, "grad_norm": 0.8197199106216431, "learning_rate": 0.00019672201524225776, "loss": 2.7662, "step": 99 }, { "epoch": 0.09102286949596086, "grad_norm": 1.4529683589935303, "learning_rate": 0.0001966483553946637, "loss": 2.7347, "step": 100 }, { "epoch": 0.09193309819092046, "grad_norm": 1.3071473836898804, "learning_rate": 0.00019657389120987333, "loss": 2.8331, "step": 101 }, { "epoch": 0.09284332688588008, "grad_norm": 0.5390121340751648, "learning_rate": 0.00019649862330760036, "loss": 2.724, "step": 102 }, { "epoch": 0.09375355558083968, "grad_norm": 0.43149107694625854, "learning_rate": 0.00019642255231424729, "loss": 2.5445, "step": 103 }, { "epoch": 0.0946637842757993, "grad_norm": 0.4529167115688324, "learning_rate": 0.00019634567886290025, "loss": 2.4586, "step": 104 }, { "epoch": 0.0955740129707589, "grad_norm": 0.4370836317539215, "learning_rate": 0.00019626800359332362, "loss": 2.5472, "step": 105 }, { "epoch": 0.0964842416657185, "grad_norm": 0.43084716796875, "learning_rate": 0.00019618952715195475, "loss": 2.5307, "step": 106 }, { "epoch": 0.09739447036067812, "grad_norm": 0.4851230978965759, "learning_rate": 0.0001961102501918986, "loss": 2.3863, "step": 107 }, { "epoch": 0.09830469905563773, "grad_norm": 0.39606913924217224, "learning_rate": 0.00019603017337292236, "loss": 2.4334, "step": 108 }, { "epoch": 0.09921492775059734, "grad_norm": 0.3680226504802704, "learning_rate": 0.00019594929736144976, "loss": 2.3358, "step": 109 }, { "epoch": 0.10012515644555695, "grad_norm": 0.43170568346977234, "learning_rate": 0.00019586762283055573, "loss": 2.3502, "step": 110 }, { "epoch": 0.10103538514051655, "grad_norm": 0.4067252576351166, "learning_rate": 0.00019578515045996073, "loss": 2.4522, "step": 111 }, { "epoch": 0.10194561383547616, "grad_norm": 0.3645339906215668, "learning_rate": 0.0001957018809360251, "loss": 2.3924, "step": 112 }, { "epoch": 0.10285584253043577, "grad_norm": 0.4669637084007263, "learning_rate": 0.00019561781495174328, "loss": 2.3151, "step": 113 }, { "epoch": 0.10376607122539538, "grad_norm": 0.39779335260391235, "learning_rate": 0.00019553295320673807, "loss": 2.286, "step": 114 }, { "epoch": 0.10467629992035499, "grad_norm": 0.41239792108535767, "learning_rate": 0.00019544729640725498, "loss": 2.4233, "step": 115 }, { "epoch": 0.1055865286153146, "grad_norm": 0.3813844621181488, "learning_rate": 0.0001953608452661561, "loss": 2.3497, "step": 116 }, { "epoch": 0.1064967573102742, "grad_norm": 0.3577946424484253, "learning_rate": 0.0001952736005029142, "loss": 2.2817, "step": 117 }, { "epoch": 0.10740698600523381, "grad_norm": 0.3643464744091034, "learning_rate": 0.00019518556284360696, "loss": 2.2296, "step": 118 }, { "epoch": 0.10831721470019343, "grad_norm": 0.38557112216949463, "learning_rate": 0.00019509673302091075, "loss": 2.3039, "step": 119 }, { "epoch": 0.10922744339515303, "grad_norm": 0.39795589447021484, "learning_rate": 0.00019500711177409454, "loss": 2.3028, "step": 120 }, { "epoch": 0.11013767209011265, "grad_norm": 0.3612115681171417, "learning_rate": 0.00019491669984901379, "loss": 2.0678, "step": 121 }, { "epoch": 0.11104790078507225, "grad_norm": 0.36631131172180176, "learning_rate": 0.00019482549799810413, "loss": 2.2505, "step": 122 }, { "epoch": 0.11195812948003186, "grad_norm": 0.383908748626709, "learning_rate": 0.00019473350698037535, "loss": 2.1246, "step": 123 }, { "epoch": 0.11286835817499147, "grad_norm": 0.4124641716480255, "learning_rate": 0.00019464072756140486, "loss": 2.2511, "step": 124 }, { "epoch": 0.11377858686995107, "grad_norm": 0.3917684555053711, "learning_rate": 0.00019454716051333135, "loss": 2.2266, "step": 125 }, { "epoch": 0.11468881556491069, "grad_norm": 0.40930524468421936, "learning_rate": 0.00019445280661484847, "loss": 2.1455, "step": 126 }, { "epoch": 0.11559904425987029, "grad_norm": 0.4211941063404083, "learning_rate": 0.0001943576666511982, "loss": 2.1991, "step": 127 }, { "epoch": 0.1165092729548299, "grad_norm": 0.4424692392349243, "learning_rate": 0.00019426174141416448, "loss": 2.1868, "step": 128 }, { "epoch": 0.11741950164978951, "grad_norm": 0.41721439361572266, "learning_rate": 0.00019416503170206645, "loss": 2.3098, "step": 129 }, { "epoch": 0.11832973034474911, "grad_norm": 0.40203043818473816, "learning_rate": 0.00019406753831975203, "loss": 2.1059, "step": 130 }, { "epoch": 0.11923995903970873, "grad_norm": 0.4373216927051544, "learning_rate": 0.00019396926207859084, "loss": 2.0548, "step": 131 }, { "epoch": 0.12015018773466833, "grad_norm": 0.46566876769065857, "learning_rate": 0.00019387020379646797, "loss": 2.2075, "step": 132 }, { "epoch": 0.12106041642962795, "grad_norm": 0.4372277557849884, "learning_rate": 0.00019377036429777672, "loss": 2.1329, "step": 133 }, { "epoch": 0.12197064512458755, "grad_norm": 0.42452472448349, "learning_rate": 0.0001936697444134119, "loss": 2.1308, "step": 134 }, { "epoch": 0.12288087381954717, "grad_norm": 0.4379199743270874, "learning_rate": 0.0001935683449807631, "loss": 2.2978, "step": 135 }, { "epoch": 0.12379110251450677, "grad_norm": 0.46032705903053284, "learning_rate": 0.0001934661668437073, "loss": 2.2349, "step": 136 }, { "epoch": 0.12470133120946637, "grad_norm": 0.4644859731197357, "learning_rate": 0.00019336321085260236, "loss": 2.2485, "step": 137 }, { "epoch": 0.125611559904426, "grad_norm": 0.4860181510448456, "learning_rate": 0.00019325947786427952, "loss": 2.2059, "step": 138 }, { "epoch": 0.1265217885993856, "grad_norm": 0.46559709310531616, "learning_rate": 0.0001931549687420364, "loss": 2.2564, "step": 139 }, { "epoch": 0.1274320172943452, "grad_norm": 0.47857019305229187, "learning_rate": 0.00019304968435562993, "loss": 2.1526, "step": 140 }, { "epoch": 0.12834224598930483, "grad_norm": 0.5288405418395996, "learning_rate": 0.00019294362558126905, "loss": 2.399, "step": 141 }, { "epoch": 0.12925247468426443, "grad_norm": 0.4651063084602356, "learning_rate": 0.00019283679330160726, "loss": 2.3092, "step": 142 }, { "epoch": 0.13016270337922403, "grad_norm": 0.4940643310546875, "learning_rate": 0.00019272918840573558, "loss": 2.2838, "step": 143 }, { "epoch": 0.13107293207418363, "grad_norm": 0.5267408490180969, "learning_rate": 0.00019262081178917482, "loss": 2.3948, "step": 144 }, { "epoch": 0.13198316076914324, "grad_norm": 0.5252528190612793, "learning_rate": 0.0001925116643538684, "loss": 2.3055, "step": 145 }, { "epoch": 0.13289338946410287, "grad_norm": 0.5565192699432373, "learning_rate": 0.00019240174700817464, "loss": 2.2392, "step": 146 }, { "epoch": 0.13380361815906247, "grad_norm": 0.6170883774757385, "learning_rate": 0.00019229106066685937, "loss": 2.4212, "step": 147 }, { "epoch": 0.13471384685402207, "grad_norm": 0.623285710811615, "learning_rate": 0.0001921796062510882, "loss": 2.5778, "step": 148 }, { "epoch": 0.13562407554898168, "grad_norm": 0.7146514654159546, "learning_rate": 0.0001920673846884189, "loss": 2.5517, "step": 149 }, { "epoch": 0.13653430424394128, "grad_norm": 1.1342458724975586, "learning_rate": 0.00019195439691279363, "loss": 2.7753, "step": 150 }, { "epoch": 0.1374445329389009, "grad_norm": 0.5309027433395386, "learning_rate": 0.00019184064386453128, "loss": 2.6207, "step": 151 }, { "epoch": 0.1383547616338605, "grad_norm": 0.5145514607429504, "learning_rate": 0.00019172612649031952, "loss": 2.605, "step": 152 }, { "epoch": 0.13926499032882012, "grad_norm": 0.41491782665252686, "learning_rate": 0.00019161084574320696, "loss": 2.3562, "step": 153 }, { "epoch": 0.14017521902377972, "grad_norm": 0.4266800880432129, "learning_rate": 0.00019149480258259533, "loss": 2.5725, "step": 154 }, { "epoch": 0.14108544771873932, "grad_norm": 0.3961375951766968, "learning_rate": 0.00019137799797423126, "loss": 2.4722, "step": 155 }, { "epoch": 0.14199567641369895, "grad_norm": 0.41011351346969604, "learning_rate": 0.00019126043289019852, "loss": 2.5218, "step": 156 }, { "epoch": 0.14290590510865855, "grad_norm": 0.4531441032886505, "learning_rate": 0.00019114210830890969, "loss": 2.5233, "step": 157 }, { "epoch": 0.14381613380361816, "grad_norm": 0.447412371635437, "learning_rate": 0.00019102302521509815, "loss": 2.624, "step": 158 }, { "epoch": 0.14472636249857776, "grad_norm": 0.41504496335983276, "learning_rate": 0.00019090318459980986, "loss": 2.5468, "step": 159 }, { "epoch": 0.14563659119353736, "grad_norm": 0.3992937505245209, "learning_rate": 0.00019078258746039507, "loss": 2.5191, "step": 160 }, { "epoch": 0.146546819888497, "grad_norm": 0.37622761726379395, "learning_rate": 0.00019066123480050015, "loss": 2.3164, "step": 161 }, { "epoch": 0.1474570485834566, "grad_norm": 0.3903200626373291, "learning_rate": 0.00019053912763005907, "loss": 2.1591, "step": 162 }, { "epoch": 0.1483672772784162, "grad_norm": 0.4018140733242035, "learning_rate": 0.00019041626696528503, "loss": 2.3273, "step": 163 }, { "epoch": 0.1492775059733758, "grad_norm": 0.4058266878128052, "learning_rate": 0.00019029265382866214, "loss": 2.4121, "step": 164 }, { "epoch": 0.15018773466833543, "grad_norm": 0.41517823934555054, "learning_rate": 0.0001901682892489367, "loss": 2.3822, "step": 165 }, { "epoch": 0.15109796336329503, "grad_norm": 0.40526044368743896, "learning_rate": 0.0001900431742611089, "loss": 2.3957, "step": 166 }, { "epoch": 0.15200819205825464, "grad_norm": 0.39370712637901306, "learning_rate": 0.00018991730990642388, "loss": 2.2458, "step": 167 }, { "epoch": 0.15291842075321424, "grad_norm": 0.37955179810523987, "learning_rate": 0.00018979069723236333, "loss": 2.246, "step": 168 }, { "epoch": 0.15382864944817384, "grad_norm": 0.3796885907649994, "learning_rate": 0.00018966333729263674, "loss": 2.2288, "step": 169 }, { "epoch": 0.15473887814313347, "grad_norm": 0.37720954418182373, "learning_rate": 0.00018953523114717245, "loss": 2.2816, "step": 170 }, { "epoch": 0.15564910683809308, "grad_norm": 0.3945274353027344, "learning_rate": 0.00018940637986210906, "loss": 2.1191, "step": 171 }, { "epoch": 0.15655933553305268, "grad_norm": 0.42212241888046265, "learning_rate": 0.0001892767845097864, "loss": 2.3253, "step": 172 }, { "epoch": 0.15746956422801228, "grad_norm": 0.3840961456298828, "learning_rate": 0.00018914644616873657, "loss": 2.0736, "step": 173 }, { "epoch": 0.15837979292297188, "grad_norm": 0.3896063268184662, "learning_rate": 0.0001890153659236753, "loss": 2.1033, "step": 174 }, { "epoch": 0.15929002161793152, "grad_norm": 0.3891961872577667, "learning_rate": 0.00018888354486549237, "loss": 2.1627, "step": 175 }, { "epoch": 0.16020025031289112, "grad_norm": 0.4236443042755127, "learning_rate": 0.00018875098409124302, "loss": 2.2984, "step": 176 }, { "epoch": 0.16111047900785072, "grad_norm": 0.39355891942977905, "learning_rate": 0.0001886176847041386, "loss": 2.1823, "step": 177 }, { "epoch": 0.16202070770281032, "grad_norm": 0.4226863384246826, "learning_rate": 0.00018848364781353744, "loss": 2.2797, "step": 178 }, { "epoch": 0.16293093639776993, "grad_norm": 0.4181986451148987, "learning_rate": 0.0001883488745349355, "loss": 2.1999, "step": 179 }, { "epoch": 0.16384116509272956, "grad_norm": 0.4138600528240204, "learning_rate": 0.0001882133659899573, "loss": 1.8991, "step": 180 }, { "epoch": 0.16475139378768916, "grad_norm": 0.43699535727500916, "learning_rate": 0.00018807712330634642, "loss": 2.4304, "step": 181 }, { "epoch": 0.16566162248264876, "grad_norm": 0.4426632523536682, "learning_rate": 0.0001879401476179562, "loss": 2.2588, "step": 182 }, { "epoch": 0.16657185117760837, "grad_norm": 0.4601946473121643, "learning_rate": 0.0001878024400647402, "loss": 2.4086, "step": 183 }, { "epoch": 0.167482079872568, "grad_norm": 0.41164615750312805, "learning_rate": 0.00018766400179274286, "loss": 1.9165, "step": 184 }, { "epoch": 0.1683923085675276, "grad_norm": 0.42600390315055847, "learning_rate": 0.00018752483395408987, "loss": 2.0585, "step": 185 }, { "epoch": 0.1693025372624872, "grad_norm": 0.4481484889984131, "learning_rate": 0.00018738493770697852, "loss": 2.2189, "step": 186 }, { "epoch": 0.1702127659574468, "grad_norm": 0.47090041637420654, "learning_rate": 0.00018724431421566823, "loss": 2.3016, "step": 187 }, { "epoch": 0.1711229946524064, "grad_norm": 0.4611455798149109, "learning_rate": 0.00018710296465047075, "loss": 2.2475, "step": 188 }, { "epoch": 0.17203322334736604, "grad_norm": 0.4549846053123474, "learning_rate": 0.0001869608901877404, "loss": 2.0331, "step": 189 }, { "epoch": 0.17294345204232564, "grad_norm": 0.4108966886997223, "learning_rate": 0.0001868180920098644, "loss": 2.0985, "step": 190 }, { "epoch": 0.17385368073728524, "grad_norm": 0.505251944065094, "learning_rate": 0.00018667457130525284, "loss": 2.1995, "step": 191 }, { "epoch": 0.17476390943224485, "grad_norm": 0.4776213467121124, "learning_rate": 0.00018653032926832896, "loss": 2.2254, "step": 192 }, { "epoch": 0.17567413812720445, "grad_norm": 0.4876101016998291, "learning_rate": 0.00018638536709951917, "loss": 2.2885, "step": 193 }, { "epoch": 0.17658436682216408, "grad_norm": 0.49354955554008484, "learning_rate": 0.000186239686005243, "loss": 2.2939, "step": 194 }, { "epoch": 0.17749459551712368, "grad_norm": 0.5280092358589172, "learning_rate": 0.0001860932871979031, "loss": 2.3555, "step": 195 }, { "epoch": 0.17840482421208329, "grad_norm": 0.5320273637771606, "learning_rate": 0.00018594617189587512, "loss": 2.2854, "step": 196 }, { "epoch": 0.1793150529070429, "grad_norm": 0.6043667793273926, "learning_rate": 0.00018579834132349772, "loss": 2.6826, "step": 197 }, { "epoch": 0.1802252816020025, "grad_norm": 0.6498908996582031, "learning_rate": 0.0001856497967110621, "loss": 2.5815, "step": 198 }, { "epoch": 0.18113551029696212, "grad_norm": 0.8005975484848022, "learning_rate": 0.00018550053929480202, "loss": 2.7034, "step": 199 }, { "epoch": 0.18204573899192172, "grad_norm": 1.9555100202560425, "learning_rate": 0.00018535057031688335, "loss": 2.9439, "step": 200 }, { "epoch": 0.18295596768688133, "grad_norm": 0.65727299451828, "learning_rate": 0.0001851998910253939, "loss": 2.695, "step": 201 }, { "epoch": 0.18386619638184093, "grad_norm": 0.6414365172386169, "learning_rate": 0.0001850485026743328, "loss": 2.5447, "step": 202 }, { "epoch": 0.18477642507680053, "grad_norm": 0.5610425472259521, "learning_rate": 0.00018489640652360022, "loss": 2.4502, "step": 203 }, { "epoch": 0.18568665377176016, "grad_norm": 0.4109022319316864, "learning_rate": 0.00018474360383898694, "loss": 2.5165, "step": 204 }, { "epoch": 0.18659688246671977, "grad_norm": 0.4291420578956604, "learning_rate": 0.00018459009589216364, "loss": 2.4464, "step": 205 }, { "epoch": 0.18750711116167937, "grad_norm": 0.44230878353118896, "learning_rate": 0.0001844358839606705, "loss": 2.4365, "step": 206 }, { "epoch": 0.18841733985663897, "grad_norm": 0.4131537675857544, "learning_rate": 0.00018428096932790632, "loss": 2.5135, "step": 207 }, { "epoch": 0.1893275685515986, "grad_norm": 0.41846412420272827, "learning_rate": 0.00018412535328311814, "loss": 2.3571, "step": 208 }, { "epoch": 0.1902377972465582, "grad_norm": 0.4208964407444, "learning_rate": 0.0001839690371213903, "loss": 2.4642, "step": 209 }, { "epoch": 0.1911480259415178, "grad_norm": 0.40538257360458374, "learning_rate": 0.0001838120221436338, "loss": 2.4407, "step": 210 }, { "epoch": 0.1920582546364774, "grad_norm": 0.4170033633708954, "learning_rate": 0.00018365430965657526, "loss": 2.4844, "step": 211 }, { "epoch": 0.192968483331437, "grad_norm": 0.3793712258338928, "learning_rate": 0.00018349590097274632, "loss": 2.2296, "step": 212 }, { "epoch": 0.19387871202639664, "grad_norm": 0.38766035437583923, "learning_rate": 0.00018333679741047254, "loss": 2.2268, "step": 213 }, { "epoch": 0.19478894072135625, "grad_norm": 0.3814389109611511, "learning_rate": 0.00018317700029386245, "loss": 2.0952, "step": 214 }, { "epoch": 0.19569916941631585, "grad_norm": 0.4213506579399109, "learning_rate": 0.00018301651095279655, "loss": 2.3572, "step": 215 }, { "epoch": 0.19660939811127545, "grad_norm": 0.41608527302742004, "learning_rate": 0.0001828553307229163, "loss": 2.2262, "step": 216 }, { "epoch": 0.19751962680623505, "grad_norm": 0.3709307312965393, "learning_rate": 0.0001826934609456129, "loss": 2.2569, "step": 217 }, { "epoch": 0.19842985550119469, "grad_norm": 0.400329053401947, "learning_rate": 0.00018253090296801614, "loss": 2.2614, "step": 218 }, { "epoch": 0.1993400841961543, "grad_norm": 0.38953447341918945, "learning_rate": 0.0001823676581429833, "loss": 2.1335, "step": 219 }, { "epoch": 0.2002503128911139, "grad_norm": 0.39479631185531616, "learning_rate": 0.00018220372782908777, "loss": 2.2293, "step": 220 }, { "epoch": 0.2011605415860735, "grad_norm": 0.3911544382572174, "learning_rate": 0.00018203911339060783, "loss": 2.0099, "step": 221 }, { "epoch": 0.2020707702810331, "grad_norm": 0.40512001514434814, "learning_rate": 0.00018187381619751516, "loss": 2.2282, "step": 222 }, { "epoch": 0.20298099897599273, "grad_norm": 0.42339497804641724, "learning_rate": 0.00018170783762546365, "loss": 2.1722, "step": 223 }, { "epoch": 0.20389122767095233, "grad_norm": 0.4218563139438629, "learning_rate": 0.00018154117905577776, "loss": 2.1781, "step": 224 }, { "epoch": 0.20480145636591193, "grad_norm": 0.39831289649009705, "learning_rate": 0.00018137384187544116, "loss": 2.1663, "step": 225 }, { "epoch": 0.20571168506087154, "grad_norm": 0.39963746070861816, "learning_rate": 0.00018120582747708502, "loss": 1.97, "step": 226 }, { "epoch": 0.20662191375583117, "grad_norm": 0.38624411821365356, "learning_rate": 0.0001810371372589766, "loss": 1.9793, "step": 227 }, { "epoch": 0.20753214245079077, "grad_norm": 0.4173285961151123, "learning_rate": 0.0001808677726250076, "loss": 2.1832, "step": 228 }, { "epoch": 0.20844237114575037, "grad_norm": 0.393627405166626, "learning_rate": 0.00018069773498468223, "loss": 2.003, "step": 229 }, { "epoch": 0.20935259984070997, "grad_norm": 0.4344862699508667, "learning_rate": 0.00018052702575310588, "loss": 2.235, "step": 230 }, { "epoch": 0.21026282853566958, "grad_norm": 0.42050907015800476, "learning_rate": 0.00018035564635097298, "loss": 2.0272, "step": 231 }, { "epoch": 0.2111730572306292, "grad_norm": 0.45956531167030334, "learning_rate": 0.00018018359820455536, "loss": 2.277, "step": 232 }, { "epoch": 0.2120832859255888, "grad_norm": 0.4257940649986267, "learning_rate": 0.00018001088274569038, "loss": 2.1714, "step": 233 }, { "epoch": 0.2129935146205484, "grad_norm": 0.4153771996498108, "learning_rate": 0.00017983750141176895, "loss": 2.0942, "step": 234 }, { "epoch": 0.21390374331550802, "grad_norm": 0.44307154417037964, "learning_rate": 0.0001796634556457236, "loss": 2.1548, "step": 235 }, { "epoch": 0.21481397201046762, "grad_norm": 0.45095738768577576, "learning_rate": 0.0001794887468960165, "loss": 2.1294, "step": 236 }, { "epoch": 0.21572420070542725, "grad_norm": 0.4348011612892151, "learning_rate": 0.00017931337661662727, "loss": 2.1176, "step": 237 }, { "epoch": 0.21663442940038685, "grad_norm": 0.46650028228759766, "learning_rate": 0.0001791373462670411, "loss": 2.1333, "step": 238 }, { "epoch": 0.21754465809534645, "grad_norm": 0.49439772963523865, "learning_rate": 0.00017896065731223644, "loss": 2.1169, "step": 239 }, { "epoch": 0.21845488679030606, "grad_norm": 0.4834192395210266, "learning_rate": 0.00017878331122267284, "loss": 2.3213, "step": 240 }, { "epoch": 0.21936511548526566, "grad_norm": 0.49015435576438904, "learning_rate": 0.00017860530947427875, "loss": 2.3145, "step": 241 }, { "epoch": 0.2202753441802253, "grad_norm": 0.4994793236255646, "learning_rate": 0.00017842665354843922, "loss": 2.1586, "step": 242 }, { "epoch": 0.2211855728751849, "grad_norm": 0.5262652635574341, "learning_rate": 0.0001782473449319835, "loss": 2.3597, "step": 243 }, { "epoch": 0.2220958015701445, "grad_norm": 0.5197060704231262, "learning_rate": 0.0001780673851171728, "loss": 2.2091, "step": 244 }, { "epoch": 0.2230060302651041, "grad_norm": 0.5736340284347534, "learning_rate": 0.00017788677560168784, "loss": 2.489, "step": 245 }, { "epoch": 0.22391625896006373, "grad_norm": 0.5602148771286011, "learning_rate": 0.0001777055178886162, "loss": 2.4159, "step": 246 }, { "epoch": 0.22482648765502333, "grad_norm": 0.6122560501098633, "learning_rate": 0.0001775236134864401, "loss": 2.5168, "step": 247 }, { "epoch": 0.22573671634998294, "grad_norm": 0.6337876915931702, "learning_rate": 0.00017734106390902366, "loss": 2.522, "step": 248 }, { "epoch": 0.22664694504494254, "grad_norm": 0.8038336634635925, "learning_rate": 0.0001771578706756003, "loss": 2.5793, "step": 249 }, { "epoch": 0.22755717373990214, "grad_norm": 1.3161811828613281, "learning_rate": 0.0001769740353107602, "loss": 2.3232, "step": 250 }, { "epoch": 0.22846740243486177, "grad_norm": 0.5344598293304443, "learning_rate": 0.00017678955934443758, "loss": 2.5626, "step": 251 }, { "epoch": 0.22937763112982137, "grad_norm": 0.5922054052352905, "learning_rate": 0.0001766044443118978, "loss": 2.5528, "step": 252 }, { "epoch": 0.23028785982478098, "grad_norm": 0.4852460026741028, "learning_rate": 0.00017641869175372493, "loss": 2.4348, "step": 253 }, { "epoch": 0.23119808851974058, "grad_norm": 0.40997761487960815, "learning_rate": 0.00017623230321580854, "loss": 2.5167, "step": 254 }, { "epoch": 0.23210831721470018, "grad_norm": 0.40511611104011536, "learning_rate": 0.00017604528024933115, "loss": 2.479, "step": 255 }, { "epoch": 0.2330185459096598, "grad_norm": 0.3989182114601135, "learning_rate": 0.00017585762441075503, "loss": 2.3482, "step": 256 }, { "epoch": 0.23392877460461942, "grad_norm": 0.4280647039413452, "learning_rate": 0.00017566933726180964, "loss": 2.526, "step": 257 }, { "epoch": 0.23483900329957902, "grad_norm": 0.43045175075531006, "learning_rate": 0.0001754804203694782, "loss": 2.4734, "step": 258 }, { "epoch": 0.23574923199453862, "grad_norm": 0.41747456789016724, "learning_rate": 0.0001752908753059849, "loss": 2.3361, "step": 259 }, { "epoch": 0.23665946068949822, "grad_norm": 0.4000650942325592, "learning_rate": 0.00017510070364878177, "loss": 2.4496, "step": 260 }, { "epoch": 0.23756968938445785, "grad_norm": 0.3866179287433624, "learning_rate": 0.00017490990698053563, "loss": 2.3222, "step": 261 }, { "epoch": 0.23847991807941746, "grad_norm": 0.4083816409111023, "learning_rate": 0.00017471848688911464, "loss": 2.3619, "step": 262 }, { "epoch": 0.23939014677437706, "grad_norm": 0.4181024432182312, "learning_rate": 0.0001745264449675755, "loss": 2.3515, "step": 263 }, { "epoch": 0.24030037546933666, "grad_norm": 0.3845669627189636, "learning_rate": 0.00017433378281414975, "loss": 2.3866, "step": 264 }, { "epoch": 0.24121060416429627, "grad_norm": 0.3917085826396942, "learning_rate": 0.0001741405020322309, "loss": 2.4155, "step": 265 }, { "epoch": 0.2421208328592559, "grad_norm": 0.3883097171783447, "learning_rate": 0.00017394660423036075, "loss": 2.1034, "step": 266 }, { "epoch": 0.2430310615542155, "grad_norm": 0.37689700722694397, "learning_rate": 0.00017375209102221613, "loss": 2.1676, "step": 267 }, { "epoch": 0.2439412902491751, "grad_norm": 0.38591262698173523, "learning_rate": 0.00017355696402659548, "loss": 2.3066, "step": 268 }, { "epoch": 0.2448515189441347, "grad_norm": 0.3865531384944916, "learning_rate": 0.00017336122486740548, "loss": 2.1387, "step": 269 }, { "epoch": 0.24576174763909434, "grad_norm": 0.38778188824653625, "learning_rate": 0.00017316487517364721, "loss": 2.2369, "step": 270 }, { "epoch": 0.24667197633405394, "grad_norm": 0.38550639152526855, "learning_rate": 0.000172967916579403, "loss": 2.2013, "step": 271 }, { "epoch": 0.24758220502901354, "grad_norm": 0.41831621527671814, "learning_rate": 0.00017277035072382253, "loss": 2.1315, "step": 272 }, { "epoch": 0.24849243372397314, "grad_norm": 0.386983722448349, "learning_rate": 0.00017257217925110933, "loss": 2.1582, "step": 273 }, { "epoch": 0.24940266241893275, "grad_norm": 0.39023295044898987, "learning_rate": 0.00017237340381050703, "loss": 2.0061, "step": 274 }, { "epoch": 0.2503128911138924, "grad_norm": 0.4187031686306, "learning_rate": 0.00017217402605628572, "loss": 2.0814, "step": 275 }, { "epoch": 0.2503128911138924, "eval_loss": 2.2949306964874268, "eval_runtime": 205.2875, "eval_samples_per_second": 9.017, "eval_steps_per_second": 4.511, "step": 275 }, { "epoch": 0.251223119808852, "grad_norm": 0.39927324652671814, "learning_rate": 0.00017197404764772805, "loss": 2.1982, "step": 276 }, { "epoch": 0.2521333485038116, "grad_norm": 0.4287830889225006, "learning_rate": 0.00017177347024911562, "loss": 2.2733, "step": 277 }, { "epoch": 0.2530435771987712, "grad_norm": 0.3960012197494507, "learning_rate": 0.00017157229552971487, "loss": 2.1884, "step": 278 }, { "epoch": 0.2539538058937308, "grad_norm": 0.40106138586997986, "learning_rate": 0.00017137052516376345, "loss": 2.1207, "step": 279 }, { "epoch": 0.2548640345886904, "grad_norm": 0.4410867393016815, "learning_rate": 0.00017116816083045602, "loss": 2.3589, "step": 280 }, { "epoch": 0.25577426328365, "grad_norm": 0.4092939794063568, "learning_rate": 0.0001709652042139306, "loss": 2.0842, "step": 281 }, { "epoch": 0.25668449197860965, "grad_norm": 0.40820494294166565, "learning_rate": 0.0001707616570032542, "loss": 2.1658, "step": 282 }, { "epoch": 0.25759472067356926, "grad_norm": 0.41664186120033264, "learning_rate": 0.00017055752089240907, "loss": 2.1389, "step": 283 }, { "epoch": 0.25850494936852886, "grad_norm": 0.4125240445137024, "learning_rate": 0.00017035279758027832, "loss": 2.0615, "step": 284 }, { "epoch": 0.25941517806348846, "grad_norm": 0.42702898383140564, "learning_rate": 0.00017014748877063214, "loss": 2.017, "step": 285 }, { "epoch": 0.26032540675844806, "grad_norm": 0.44943541288375854, "learning_rate": 0.00016994159617211317, "loss": 2.1901, "step": 286 }, { "epoch": 0.26123563545340767, "grad_norm": 0.4286860227584839, "learning_rate": 0.00016973512149822274, "loss": 2.0643, "step": 287 }, { "epoch": 0.26214586414836727, "grad_norm": 0.44938111305236816, "learning_rate": 0.0001695280664673062, "loss": 2.1539, "step": 288 }, { "epoch": 0.26305609284332687, "grad_norm": 0.4638296067714691, "learning_rate": 0.0001693204328025389, "loss": 2.291, "step": 289 }, { "epoch": 0.2639663215382865, "grad_norm": 0.49295714497566223, "learning_rate": 0.00016911222223191182, "loss": 2.2538, "step": 290 }, { "epoch": 0.2648765502332461, "grad_norm": 0.48185715079307556, "learning_rate": 0.00016890343648821697, "loss": 2.2792, "step": 291 }, { "epoch": 0.26578677892820574, "grad_norm": 0.4750272035598755, "learning_rate": 0.0001686940773090333, "loss": 2.2774, "step": 292 }, { "epoch": 0.26669700762316534, "grad_norm": 0.5073033571243286, "learning_rate": 0.00016848414643671195, "loss": 2.3261, "step": 293 }, { "epoch": 0.26760723631812494, "grad_norm": 0.5343424081802368, "learning_rate": 0.00016827364561836187, "loss": 2.4097, "step": 294 }, { "epoch": 0.26851746501308454, "grad_norm": 0.5311369895935059, "learning_rate": 0.00016806257660583534, "loss": 2.3821, "step": 295 }, { "epoch": 0.26942769370804415, "grad_norm": 0.5551429986953735, "learning_rate": 0.00016785094115571322, "loss": 2.3795, "step": 296 }, { "epoch": 0.27033792240300375, "grad_norm": 0.6279783248901367, "learning_rate": 0.0001676387410292906, "loss": 2.435, "step": 297 }, { "epoch": 0.27124815109796335, "grad_norm": 0.7317250967025757, "learning_rate": 0.00016742597799256182, "loss": 2.6991, "step": 298 }, { "epoch": 0.27215837979292296, "grad_norm": 0.8485302329063416, "learning_rate": 0.000167212653816206, "loss": 2.7005, "step": 299 }, { "epoch": 0.27306860848788256, "grad_norm": 1.5959185361862183, "learning_rate": 0.00016699877027557226, "loss": 2.7536, "step": 300 }, { "epoch": 0.2739788371828422, "grad_norm": 0.4755174219608307, "learning_rate": 0.00016678432915066488, "loss": 2.5907, "step": 301 }, { "epoch": 0.2748890658778018, "grad_norm": 0.45389342308044434, "learning_rate": 0.00016656933222612854, "loss": 2.4622, "step": 302 }, { "epoch": 0.2757992945727614, "grad_norm": 0.4949435591697693, "learning_rate": 0.00016635378129123342, "loss": 2.4185, "step": 303 }, { "epoch": 0.276709523267721, "grad_norm": 0.4521631896495819, "learning_rate": 0.00016613767813986044, "loss": 2.4918, "step": 304 }, { "epoch": 0.2776197519626806, "grad_norm": 0.4228963553905487, "learning_rate": 0.0001659210245704861, "loss": 2.4194, "step": 305 }, { "epoch": 0.27852998065764023, "grad_norm": 0.4170341491699219, "learning_rate": 0.00016570382238616777, "loss": 2.4185, "step": 306 }, { "epoch": 0.27944020935259983, "grad_norm": 0.4123315215110779, "learning_rate": 0.00016548607339452853, "loss": 2.3737, "step": 307 }, { "epoch": 0.28035043804755944, "grad_norm": 0.4320162832736969, "learning_rate": 0.00016526777940774204, "loss": 2.3317, "step": 308 }, { "epoch": 0.28126066674251904, "grad_norm": 0.4118390381336212, "learning_rate": 0.00016504894224251778, "loss": 2.3786, "step": 309 }, { "epoch": 0.28217089543747864, "grad_norm": 0.39763331413269043, "learning_rate": 0.0001648295637200856, "loss": 2.2968, "step": 310 }, { "epoch": 0.2830811241324383, "grad_norm": 0.4391527473926544, "learning_rate": 0.0001646096456661807, "loss": 2.3764, "step": 311 }, { "epoch": 0.2839913528273979, "grad_norm": 0.43077877163887024, "learning_rate": 0.00016438918991102842, "loss": 2.2013, "step": 312 }, { "epoch": 0.2849015815223575, "grad_norm": 0.43149155378341675, "learning_rate": 0.000164168198289329, "loss": 2.3097, "step": 313 }, { "epoch": 0.2858118102173171, "grad_norm": 0.40134817361831665, "learning_rate": 0.00016394667264024246, "loss": 2.3306, "step": 314 }, { "epoch": 0.2867220389122767, "grad_norm": 0.4056681990623474, "learning_rate": 0.00016372461480737297, "loss": 2.3146, "step": 315 }, { "epoch": 0.2876322676072363, "grad_norm": 0.41738027334213257, "learning_rate": 0.00016350202663875386, "loss": 1.9997, "step": 316 }, { "epoch": 0.2885424963021959, "grad_norm": 0.38182246685028076, "learning_rate": 0.00016327890998683192, "loss": 2.0466, "step": 317 }, { "epoch": 0.2894527249971555, "grad_norm": 0.39759719371795654, "learning_rate": 0.00016305526670845226, "loss": 2.1788, "step": 318 }, { "epoch": 0.2903629536921151, "grad_norm": 0.3982352614402771, "learning_rate": 0.0001628310986648427, "loss": 2.2115, "step": 319 }, { "epoch": 0.2912731823870747, "grad_norm": 0.41679051518440247, "learning_rate": 0.0001626064077215983, "loss": 2.3036, "step": 320 }, { "epoch": 0.2921834110820344, "grad_norm": 0.40436604619026184, "learning_rate": 0.00016238119574866588, "loss": 2.1493, "step": 321 }, { "epoch": 0.293093639776994, "grad_norm": 0.4502476751804352, "learning_rate": 0.0001621554646203284, "loss": 1.8572, "step": 322 }, { "epoch": 0.2940038684719536, "grad_norm": 0.44303473830223083, "learning_rate": 0.00016192921621518944, "loss": 2.1832, "step": 323 }, { "epoch": 0.2949140971669132, "grad_norm": 0.4064692258834839, "learning_rate": 0.0001617024524161574, "loss": 2.2656, "step": 324 }, { "epoch": 0.2958243258618728, "grad_norm": 0.4479392170906067, "learning_rate": 0.0001614751751104301, "loss": 2.2462, "step": 325 }, { "epoch": 0.2967345545568324, "grad_norm": 0.4629363715648651, "learning_rate": 0.0001612473861894788, "loss": 1.9715, "step": 326 }, { "epoch": 0.297644783251792, "grad_norm": 0.3991665542125702, "learning_rate": 0.00016101908754903268, "loss": 2.0642, "step": 327 }, { "epoch": 0.2985550119467516, "grad_norm": 0.42503711581230164, "learning_rate": 0.00016079028108906282, "loss": 2.1403, "step": 328 }, { "epoch": 0.2994652406417112, "grad_norm": 0.4499455392360687, "learning_rate": 0.00016056096871376667, "loss": 2.0534, "step": 329 }, { "epoch": 0.30037546933667086, "grad_norm": 0.4549277424812317, "learning_rate": 0.00016033115233155202, "loss": 2.2083, "step": 330 }, { "epoch": 0.30128569803163047, "grad_norm": 0.3974262773990631, "learning_rate": 0.0001601008338550211, "loss": 2.0156, "step": 331 }, { "epoch": 0.30219592672659007, "grad_norm": 0.43566057085990906, "learning_rate": 0.00015987001520095478, "loss": 2.1801, "step": 332 }, { "epoch": 0.3031061554215497, "grad_norm": 0.47677701711654663, "learning_rate": 0.00015963869829029658, "loss": 2.1415, "step": 333 }, { "epoch": 0.3040163841165093, "grad_norm": 0.4603672921657562, "learning_rate": 0.00015940688504813662, "loss": 2.2967, "step": 334 }, { "epoch": 0.3049266128114689, "grad_norm": 0.4428515136241913, "learning_rate": 0.00015917457740369565, "loss": 2.1447, "step": 335 }, { "epoch": 0.3058368415064285, "grad_norm": 0.4379275441169739, "learning_rate": 0.000158941777290309, "loss": 2.0957, "step": 336 }, { "epoch": 0.3067470702013881, "grad_norm": 0.4831966459751129, "learning_rate": 0.00015870848664541044, "loss": 2.2457, "step": 337 }, { "epoch": 0.3076572988963477, "grad_norm": 0.45160865783691406, "learning_rate": 0.00015847470741051618, "loss": 2.1441, "step": 338 }, { "epoch": 0.3085675275913073, "grad_norm": 0.44453370571136475, "learning_rate": 0.00015824044153120852, "loss": 2.1073, "step": 339 }, { "epoch": 0.30947775628626695, "grad_norm": 0.49965375661849976, "learning_rate": 0.00015800569095711982, "loss": 2.1574, "step": 340 }, { "epoch": 0.31038798498122655, "grad_norm": 0.48138341307640076, "learning_rate": 0.00015777045764191625, "loss": 2.0205, "step": 341 }, { "epoch": 0.31129821367618615, "grad_norm": 0.5034924745559692, "learning_rate": 0.00015753474354328142, "loss": 2.2319, "step": 342 }, { "epoch": 0.31220844237114576, "grad_norm": 0.5034711956977844, "learning_rate": 0.00015729855062290022, "loss": 2.4066, "step": 343 }, { "epoch": 0.31311867106610536, "grad_norm": 0.5409703254699707, "learning_rate": 0.00015706188084644242, "loss": 2.2435, "step": 344 }, { "epoch": 0.31402889976106496, "grad_norm": 0.544597327709198, "learning_rate": 0.00015682473618354635, "loss": 2.2625, "step": 345 }, { "epoch": 0.31493912845602456, "grad_norm": 0.6114000082015991, "learning_rate": 0.0001565871186078025, "loss": 2.4302, "step": 346 }, { "epoch": 0.31584935715098417, "grad_norm": 0.6364843845367432, "learning_rate": 0.00015634903009673705, "loss": 2.5153, "step": 347 }, { "epoch": 0.31675958584594377, "grad_norm": 0.7510351538658142, "learning_rate": 0.00015611047263179548, "loss": 2.5605, "step": 348 }, { "epoch": 0.31766981454090343, "grad_norm": 0.8501291275024414, "learning_rate": 0.000155871448198326, "loss": 2.6519, "step": 349 }, { "epoch": 0.31858004323586303, "grad_norm": 1.7441632747650146, "learning_rate": 0.0001556319587855631, "loss": 2.7517, "step": 350 }, { "epoch": 0.31949027193082263, "grad_norm": 0.5301811695098877, "learning_rate": 0.00015539200638661104, "loss": 2.6647, "step": 351 }, { "epoch": 0.32040050062578224, "grad_norm": 0.5063616633415222, "learning_rate": 0.00015515159299842707, "loss": 2.4961, "step": 352 }, { "epoch": 0.32131072932074184, "grad_norm": 0.4843781590461731, "learning_rate": 0.00015491072062180503, "loss": 2.496, "step": 353 }, { "epoch": 0.32222095801570144, "grad_norm": 0.4524553716182709, "learning_rate": 0.00015466939126135856, "loss": 2.448, "step": 354 }, { "epoch": 0.32313118671066104, "grad_norm": 0.43678200244903564, "learning_rate": 0.00015442760692550443, "loss": 2.2687, "step": 355 }, { "epoch": 0.32404141540562065, "grad_norm": 0.4301970303058624, "learning_rate": 0.00015418536962644592, "loss": 2.4826, "step": 356 }, { "epoch": 0.32495164410058025, "grad_norm": 0.42540326714515686, "learning_rate": 0.00015394268138015598, "loss": 2.4205, "step": 357 }, { "epoch": 0.32586187279553985, "grad_norm": 0.4173906445503235, "learning_rate": 0.00015369954420636048, "loss": 2.394, "step": 358 }, { "epoch": 0.3267721014904995, "grad_norm": 0.43184736371040344, "learning_rate": 0.00015345596012852138, "loss": 2.3504, "step": 359 }, { "epoch": 0.3276823301854591, "grad_norm": 0.4002053141593933, "learning_rate": 0.00015321193117381996, "loss": 2.2951, "step": 360 }, { "epoch": 0.3285925588804187, "grad_norm": 0.39067134261131287, "learning_rate": 0.00015296745937313987, "loss": 2.2768, "step": 361 }, { "epoch": 0.3295027875753783, "grad_norm": 0.40051525831222534, "learning_rate": 0.00015272254676105025, "loss": 2.2235, "step": 362 }, { "epoch": 0.3304130162703379, "grad_norm": 0.3954068422317505, "learning_rate": 0.00015247719537578883, "loss": 2.2502, "step": 363 }, { "epoch": 0.3313232449652975, "grad_norm": 0.4123362600803375, "learning_rate": 0.00015223140725924495, "loss": 2.3309, "step": 364 }, { "epoch": 0.33223347366025713, "grad_norm": 0.4138774871826172, "learning_rate": 0.00015198518445694255, "loss": 2.4107, "step": 365 }, { "epoch": 0.33314370235521673, "grad_norm": 0.3983847498893738, "learning_rate": 0.0001517385290180231, "loss": 2.2718, "step": 366 }, { "epoch": 0.33405393105017633, "grad_norm": 0.36962834000587463, "learning_rate": 0.00015149144299522873, "loss": 2.1744, "step": 367 }, { "epoch": 0.334964159745136, "grad_norm": 0.37924104928970337, "learning_rate": 0.0001512439284448849, "loss": 2.1451, "step": 368 }, { "epoch": 0.3358743884400956, "grad_norm": 0.39990487694740295, "learning_rate": 0.0001509959874268835, "loss": 2.2508, "step": 369 }, { "epoch": 0.3367846171350552, "grad_norm": 0.3862214684486389, "learning_rate": 0.00015074762200466556, "loss": 2.1483, "step": 370 }, { "epoch": 0.3376948458300148, "grad_norm": 0.4037676751613617, "learning_rate": 0.00015049883424520414, "loss": 2.2179, "step": 371 }, { "epoch": 0.3386050745249744, "grad_norm": 0.40439948439598083, "learning_rate": 0.00015024962621898715, "loss": 2.2054, "step": 372 }, { "epoch": 0.339515303219934, "grad_norm": 0.3871942460536957, "learning_rate": 0.00015000000000000001, "loss": 2.129, "step": 373 }, { "epoch": 0.3404255319148936, "grad_norm": 0.4091387093067169, "learning_rate": 0.00014974995766570855, "loss": 2.1395, "step": 374 }, { "epoch": 0.3413357606098532, "grad_norm": 0.4097527265548706, "learning_rate": 0.00014949950129704162, "loss": 2.1789, "step": 375 }, { "epoch": 0.3422459893048128, "grad_norm": 0.4139934480190277, "learning_rate": 0.00014924863297837378, "loss": 2.0611, "step": 376 }, { "epoch": 0.3431562179997724, "grad_norm": 0.4146927297115326, "learning_rate": 0.00014899735479750794, "loss": 2.2488, "step": 377 }, { "epoch": 0.3440664466947321, "grad_norm": 0.4194958209991455, "learning_rate": 0.00014874566884565807, "loss": 2.0164, "step": 378 }, { "epoch": 0.3449766753896917, "grad_norm": 0.41280898451805115, "learning_rate": 0.00014849357721743168, "loss": 2.1503, "step": 379 }, { "epoch": 0.3458869040846513, "grad_norm": 0.4133208692073822, "learning_rate": 0.00014824108201081247, "loss": 2.0895, "step": 380 }, { "epoch": 0.3467971327796109, "grad_norm": 0.41347819566726685, "learning_rate": 0.00014798818532714279, "loss": 2.0479, "step": 381 }, { "epoch": 0.3477073614745705, "grad_norm": 0.43102580308914185, "learning_rate": 0.00014773488927110633, "loss": 2.1458, "step": 382 }, { "epoch": 0.3486175901695301, "grad_norm": 0.41427451372146606, "learning_rate": 0.00014748119595071034, "loss": 1.9396, "step": 383 }, { "epoch": 0.3495278188644897, "grad_norm": 0.46386152505874634, "learning_rate": 0.0001472271074772683, "loss": 2.2446, "step": 384 }, { "epoch": 0.3504380475594493, "grad_norm": 0.4310764670372009, "learning_rate": 0.00014697262596538227, "loss": 2.2144, "step": 385 }, { "epoch": 0.3513482762544089, "grad_norm": 0.4956878423690796, "learning_rate": 0.00014671775353292525, "loss": 2.1875, "step": 386 }, { "epoch": 0.35225850494936856, "grad_norm": 0.4793931543827057, "learning_rate": 0.00014646249230102366, "loss": 2.2733, "step": 387 }, { "epoch": 0.35316873364432816, "grad_norm": 0.46217313408851624, "learning_rate": 0.00014620684439403962, "loss": 2.2812, "step": 388 }, { "epoch": 0.35407896233928776, "grad_norm": 0.4721885323524475, "learning_rate": 0.00014595081193955324, "loss": 2.1223, "step": 389 }, { "epoch": 0.35498919103424736, "grad_norm": 0.49550965428352356, "learning_rate": 0.000145694397068345, "loss": 2.156, "step": 390 }, { "epoch": 0.35589941972920697, "grad_norm": 0.5109139084815979, "learning_rate": 0.0001454376019143779, "loss": 2.1494, "step": 391 }, { "epoch": 0.35680964842416657, "grad_norm": 0.4725574553012848, "learning_rate": 0.00014518042861477986, "loss": 2.1793, "step": 392 }, { "epoch": 0.3577198771191262, "grad_norm": 0.4739914536476135, "learning_rate": 0.00014492287930982576, "loss": 2.1763, "step": 393 }, { "epoch": 0.3586301058140858, "grad_norm": 0.5420114994049072, "learning_rate": 0.00014466495614291977, "loss": 2.4521, "step": 394 }, { "epoch": 0.3595403345090454, "grad_norm": 0.5225427150726318, "learning_rate": 0.00014440666126057744, "loss": 2.372, "step": 395 }, { "epoch": 0.360450563204005, "grad_norm": 0.5337964296340942, "learning_rate": 0.0001441479968124078, "loss": 2.397, "step": 396 }, { "epoch": 0.36136079189896464, "grad_norm": 0.5906230807304382, "learning_rate": 0.0001438889649510956, "loss": 2.506, "step": 397 }, { "epoch": 0.36227102059392424, "grad_norm": 0.6578875780105591, "learning_rate": 0.00014362956783238324, "loss": 2.6408, "step": 398 }, { "epoch": 0.36318124928888385, "grad_norm": 0.7982918620109558, "learning_rate": 0.00014336980761505297, "loss": 2.6612, "step": 399 }, { "epoch": 0.36409147798384345, "grad_norm": 1.4390262365341187, "learning_rate": 0.00014310968646090883, "loss": 2.7073, "step": 400 }, { "epoch": 0.36500170667880305, "grad_norm": 0.5260487198829651, "learning_rate": 0.00014284920653475866, "loss": 2.6269, "step": 401 }, { "epoch": 0.36591193537376265, "grad_norm": 0.4492892026901245, "learning_rate": 0.00014258837000439618, "loss": 2.3863, "step": 402 }, { "epoch": 0.36682216406872226, "grad_norm": 0.4619944095611572, "learning_rate": 0.0001423271790405828, "loss": 2.4595, "step": 403 }, { "epoch": 0.36773239276368186, "grad_norm": 0.4437786638736725, "learning_rate": 0.00014206563581702964, "loss": 2.3674, "step": 404 }, { "epoch": 0.36864262145864146, "grad_norm": 0.4789164364337921, "learning_rate": 0.0001418037425103795, "loss": 2.5203, "step": 405 }, { "epoch": 0.36955285015360106, "grad_norm": 0.44783228635787964, "learning_rate": 0.00014154150130018866, "loss": 2.5183, "step": 406 }, { "epoch": 0.3704630788485607, "grad_norm": 0.40067169070243835, "learning_rate": 0.00014127891436890868, "loss": 2.3846, "step": 407 }, { "epoch": 0.3713733075435203, "grad_norm": 0.3978015184402466, "learning_rate": 0.0001410159839018684, "loss": 2.3146, "step": 408 }, { "epoch": 0.37228353623847993, "grad_norm": 0.4096076190471649, "learning_rate": 0.0001407527120872557, "loss": 2.3617, "step": 409 }, { "epoch": 0.37319376493343953, "grad_norm": 0.4160764217376709, "learning_rate": 0.00014048910111609915, "loss": 2.2909, "step": 410 }, { "epoch": 0.37410399362839913, "grad_norm": 0.3976461887359619, "learning_rate": 0.0001402251531822499, "loss": 2.3111, "step": 411 }, { "epoch": 0.37501422232335874, "grad_norm": 0.3890199065208435, "learning_rate": 0.00013996087048236358, "loss": 2.0969, "step": 412 }, { "epoch": 0.37592445101831834, "grad_norm": 0.4157082140445709, "learning_rate": 0.00013969625521588158, "loss": 2.3205, "step": 413 }, { "epoch": 0.37683467971327794, "grad_norm": 0.4103608727455139, "learning_rate": 0.00013943130958501317, "loss": 2.2622, "step": 414 }, { "epoch": 0.37774490840823755, "grad_norm": 0.40916207432746887, "learning_rate": 0.00013916603579471705, "loss": 2.3585, "step": 415 }, { "epoch": 0.3786551371031972, "grad_norm": 0.39642858505249023, "learning_rate": 0.00013890043605268283, "loss": 2.2196, "step": 416 }, { "epoch": 0.3795653657981568, "grad_norm": 0.3851282596588135, "learning_rate": 0.00013863451256931287, "loss": 2.0298, "step": 417 }, { "epoch": 0.3804755944931164, "grad_norm": 0.38890305161476135, "learning_rate": 0.00013836826755770384, "loss": 2.1601, "step": 418 }, { "epoch": 0.381385823188076, "grad_norm": 0.41382652521133423, "learning_rate": 0.00013810170323362816, "loss": 2.2656, "step": 419 }, { "epoch": 0.3822960518830356, "grad_norm": 0.3820722699165344, "learning_rate": 0.0001378348218155158, "loss": 2.0094, "step": 420 }, { "epoch": 0.3832062805779952, "grad_norm": 0.4150048494338989, "learning_rate": 0.00013756762552443553, "loss": 2.2529, "step": 421 }, { "epoch": 0.3841165092729548, "grad_norm": 0.452776700258255, "learning_rate": 0.00013730011658407676, "loss": 2.1972, "step": 422 }, { "epoch": 0.3850267379679144, "grad_norm": 0.4173040986061096, "learning_rate": 0.00013703229722073065, "loss": 2.1502, "step": 423 }, { "epoch": 0.385936966662874, "grad_norm": 0.4115488529205322, "learning_rate": 0.000136764169663272, "loss": 1.9828, "step": 424 }, { "epoch": 0.38684719535783363, "grad_norm": 0.4060666561126709, "learning_rate": 0.00013649573614314044, "loss": 2.267, "step": 425 }, { "epoch": 0.3877574240527933, "grad_norm": 0.4049409031867981, "learning_rate": 0.00013622699889432184, "loss": 2.2044, "step": 426 }, { "epoch": 0.3886676527477529, "grad_norm": 0.40970832109451294, "learning_rate": 0.00013595796015332984, "loss": 2.0984, "step": 427 }, { "epoch": 0.3895778814427125, "grad_norm": 0.4141111671924591, "learning_rate": 0.00013568862215918717, "loss": 2.109, "step": 428 }, { "epoch": 0.3904881101376721, "grad_norm": 0.43404263257980347, "learning_rate": 0.00013541898715340716, "loss": 2.1763, "step": 429 }, { "epoch": 0.3913983388326317, "grad_norm": 0.41949963569641113, "learning_rate": 0.00013514905737997473, "loss": 2.3086, "step": 430 }, { "epoch": 0.3923085675275913, "grad_norm": 0.41665390133857727, "learning_rate": 0.00013487883508532815, "loss": 2.0726, "step": 431 }, { "epoch": 0.3932187962225509, "grad_norm": 0.4305708110332489, "learning_rate": 0.00013460832251834011, "loss": 2.1975, "step": 432 }, { "epoch": 0.3941290249175105, "grad_norm": 0.44775405526161194, "learning_rate": 0.00013433752193029886, "loss": 2.1503, "step": 433 }, { "epoch": 0.3950392536124701, "grad_norm": 0.44451820850372314, "learning_rate": 0.0001340664355748899, "loss": 2.1004, "step": 434 }, { "epoch": 0.39594948230742977, "grad_norm": 0.44242945313453674, "learning_rate": 0.0001337950657081768, "loss": 2.1074, "step": 435 }, { "epoch": 0.39685971100238937, "grad_norm": 0.4649699926376343, "learning_rate": 0.00013352341458858265, "loss": 2.2468, "step": 436 }, { "epoch": 0.397769939697349, "grad_norm": 0.4718558192253113, "learning_rate": 0.00013325148447687125, "loss": 2.225, "step": 437 }, { "epoch": 0.3986801683923086, "grad_norm": 0.44748789072036743, "learning_rate": 0.0001329792776361282, "loss": 2.0243, "step": 438 }, { "epoch": 0.3995903970872682, "grad_norm": 0.4730619192123413, "learning_rate": 0.00013270679633174218, "loss": 2.0262, "step": 439 }, { "epoch": 0.4005006257822278, "grad_norm": 0.4742071032524109, "learning_rate": 0.00013243404283138597, "loss": 2.1171, "step": 440 }, { "epoch": 0.4014108544771874, "grad_norm": 0.4963454306125641, "learning_rate": 0.00013216101940499768, "loss": 2.051, "step": 441 }, { "epoch": 0.402321083172147, "grad_norm": 0.5127780437469482, "learning_rate": 0.00013188772832476188, "loss": 2.1664, "step": 442 }, { "epoch": 0.4032313118671066, "grad_norm": 0.5129209756851196, "learning_rate": 0.00013161417186509052, "loss": 2.2272, "step": 443 }, { "epoch": 0.4041415405620662, "grad_norm": 0.5068848133087158, "learning_rate": 0.00013134035230260427, "loss": 2.1007, "step": 444 }, { "epoch": 0.40505176925702585, "grad_norm": 0.5721228718757629, "learning_rate": 0.00013106627191611332, "loss": 2.255, "step": 445 }, { "epoch": 0.40596199795198545, "grad_norm": 0.6085918545722961, "learning_rate": 0.0001307919329865985, "loss": 2.456, "step": 446 }, { "epoch": 0.40687222664694506, "grad_norm": 0.6652196645736694, "learning_rate": 0.00013051733779719234, "loss": 2.5504, "step": 447 }, { "epoch": 0.40778245534190466, "grad_norm": 0.7234418392181396, "learning_rate": 0.00013024248863316012, "loss": 2.5796, "step": 448 }, { "epoch": 0.40869268403686426, "grad_norm": 0.8588744401931763, "learning_rate": 0.00012996738778188067, "loss": 2.5756, "step": 449 }, { "epoch": 0.40960291273182386, "grad_norm": 1.2627683877944946, "learning_rate": 0.0001296920375328275, "loss": 2.203, "step": 450 }, { "epoch": 0.41051314142678347, "grad_norm": 0.4838164746761322, "learning_rate": 0.00012941644017754964, "loss": 2.434, "step": 451 }, { "epoch": 0.41142337012174307, "grad_norm": 0.44005534052848816, "learning_rate": 0.00012914059800965268, "loss": 2.55, "step": 452 }, { "epoch": 0.4123335988167027, "grad_norm": 0.4343414604663849, "learning_rate": 0.0001288645133247795, "loss": 2.432, "step": 453 }, { "epoch": 0.41324382751166233, "grad_norm": 0.4588654339313507, "learning_rate": 0.00012858818842059145, "loss": 2.4434, "step": 454 }, { "epoch": 0.41415405620662193, "grad_norm": 0.4294244647026062, "learning_rate": 0.00012831162559674887, "loss": 2.4241, "step": 455 }, { "epoch": 0.41506428490158154, "grad_norm": 0.40034809708595276, "learning_rate": 0.0001280348271548923, "loss": 2.3191, "step": 456 }, { "epoch": 0.41597451359654114, "grad_norm": 0.40817153453826904, "learning_rate": 0.00012775779539862304, "loss": 2.589, "step": 457 }, { "epoch": 0.41688474229150074, "grad_norm": 0.40605810284614563, "learning_rate": 0.0001274805326334842, "loss": 2.3445, "step": 458 }, { "epoch": 0.41779497098646035, "grad_norm": 0.4386533200740814, "learning_rate": 0.00012720304116694138, "loss": 2.4002, "step": 459 }, { "epoch": 0.41870519968141995, "grad_norm": 0.40985172986984253, "learning_rate": 0.00012692532330836346, "loss": 2.3964, "step": 460 }, { "epoch": 0.41961542837637955, "grad_norm": 0.4220562279224396, "learning_rate": 0.00012664738136900348, "loss": 2.3145, "step": 461 }, { "epoch": 0.42052565707133915, "grad_norm": 0.4068267047405243, "learning_rate": 0.00012636921766197943, "loss": 2.3274, "step": 462 }, { "epoch": 0.42143588576629876, "grad_norm": 0.3973187208175659, "learning_rate": 0.0001260908345022547, "loss": 2.1801, "step": 463 }, { "epoch": 0.4223461144612584, "grad_norm": 0.432224303483963, "learning_rate": 0.00012581223420661913, "loss": 2.4079, "step": 464 }, { "epoch": 0.423256343156218, "grad_norm": 0.3939046859741211, "learning_rate": 0.00012553341909366978, "loss": 2.0749, "step": 465 }, { "epoch": 0.4241665718511776, "grad_norm": 0.36949658393859863, "learning_rate": 0.00012525439148379128, "loss": 2.1471, "step": 466 }, { "epoch": 0.4250768005461372, "grad_norm": 0.3828236758708954, "learning_rate": 0.00012497515369913685, "loss": 2.0466, "step": 467 }, { "epoch": 0.4259870292410968, "grad_norm": 0.3874993920326233, "learning_rate": 0.00012469570806360875, "loss": 2.1605, "step": 468 }, { "epoch": 0.42689725793605643, "grad_norm": 0.3854924738407135, "learning_rate": 0.00012441605690283915, "loss": 2.0584, "step": 469 }, { "epoch": 0.42780748663101603, "grad_norm": 0.40301740169525146, "learning_rate": 0.00012413620254417057, "loss": 2.1481, "step": 470 }, { "epoch": 0.42871771532597563, "grad_norm": 0.3891369104385376, "learning_rate": 0.00012385614731663666, "loss": 2.1968, "step": 471 }, { "epoch": 0.42962794402093524, "grad_norm": 0.4305795729160309, "learning_rate": 0.00012357589355094275, "loss": 2.0421, "step": 472 }, { "epoch": 0.4305381727158949, "grad_norm": 0.44661635160446167, "learning_rate": 0.0001232954435794464, "loss": 2.3347, "step": 473 }, { "epoch": 0.4314484014108545, "grad_norm": 0.3984116315841675, "learning_rate": 0.00012301479973613822, "loss": 2.1093, "step": 474 }, { "epoch": 0.4323586301058141, "grad_norm": 0.4153747856616974, "learning_rate": 0.00012273396435662212, "loss": 2.0698, "step": 475 }, { "epoch": 0.4332688588007737, "grad_norm": 0.4589189887046814, "learning_rate": 0.00012245293977809605, "loss": 2.1707, "step": 476 }, { "epoch": 0.4341790874957333, "grad_norm": 0.43936577439308167, "learning_rate": 0.0001221717283393326, "loss": 2.2608, "step": 477 }, { "epoch": 0.4350893161906929, "grad_norm": 0.4170132279396057, "learning_rate": 0.0001218903323806595, "loss": 2.0813, "step": 478 }, { "epoch": 0.4359995448856525, "grad_norm": 0.43124523758888245, "learning_rate": 0.00012160875424393996, "loss": 2.1674, "step": 479 }, { "epoch": 0.4369097735806121, "grad_norm": 0.4394627511501312, "learning_rate": 0.00012132699627255347, "loss": 2.1904, "step": 480 }, { "epoch": 0.4378200022755717, "grad_norm": 0.4404590427875519, "learning_rate": 0.00012104506081137608, "loss": 2.1313, "step": 481 }, { "epoch": 0.4387302309705313, "grad_norm": 0.4580220878124237, "learning_rate": 0.00012076295020676103, "loss": 2.16, "step": 482 }, { "epoch": 0.439640459665491, "grad_norm": 0.4533630311489105, "learning_rate": 0.00012048066680651908, "loss": 2.1153, "step": 483 }, { "epoch": 0.4405506883604506, "grad_norm": 0.47520536184310913, "learning_rate": 0.00012019821295989912, "loss": 2.2152, "step": 484 }, { "epoch": 0.4414609170554102, "grad_norm": 0.44196072220802307, "learning_rate": 0.00011991559101756852, "loss": 2.1375, "step": 485 }, { "epoch": 0.4423711457503698, "grad_norm": 0.43681493401527405, "learning_rate": 0.00011963280333159358, "loss": 2.0552, "step": 486 }, { "epoch": 0.4432813744453294, "grad_norm": 0.4537602961063385, "learning_rate": 0.00011934985225541998, "loss": 2.1473, "step": 487 }, { "epoch": 0.444191603140289, "grad_norm": 0.4935773015022278, "learning_rate": 0.00011906674014385318, "loss": 2.0623, "step": 488 }, { "epoch": 0.4451018318352486, "grad_norm": 0.4802737236022949, "learning_rate": 0.00011878346935303883, "loss": 2.2908, "step": 489 }, { "epoch": 0.4460120605302082, "grad_norm": 0.5020537376403809, "learning_rate": 0.00011850004224044315, "loss": 2.3101, "step": 490 }, { "epoch": 0.4469222892251678, "grad_norm": 0.5106056332588196, "learning_rate": 0.00011821646116483335, "loss": 2.2838, "step": 491 }, { "epoch": 0.44783251792012746, "grad_norm": 0.473910391330719, "learning_rate": 0.00011793272848625797, "loss": 2.0599, "step": 492 }, { "epoch": 0.44874274661508706, "grad_norm": 0.5086584091186523, "learning_rate": 0.0001176488465660271, "loss": 2.1578, "step": 493 }, { "epoch": 0.44965297531004667, "grad_norm": 0.5282394886016846, "learning_rate": 0.00011736481776669306, "loss": 2.2965, "step": 494 }, { "epoch": 0.45056320400500627, "grad_norm": 0.5987780094146729, "learning_rate": 0.00011708064445203042, "loss": 2.3542, "step": 495 }, { "epoch": 0.45147343269996587, "grad_norm": 0.5943189859390259, "learning_rate": 0.00011679632898701649, "loss": 2.4294, "step": 496 }, { "epoch": 0.4523836613949255, "grad_norm": 0.6443737149238586, "learning_rate": 0.0001165118737378116, "loss": 2.605, "step": 497 }, { "epoch": 0.4532938900898851, "grad_norm": 0.7082577347755432, "learning_rate": 0.00011622728107173946, "loss": 2.4254, "step": 498 }, { "epoch": 0.4542041187848447, "grad_norm": 0.8503845930099487, "learning_rate": 0.00011594255335726724, "loss": 2.5187, "step": 499 }, { "epoch": 0.4551143474798043, "grad_norm": 1.6775977611541748, "learning_rate": 0.00011565769296398618, "loss": 2.6669, "step": 500 }, { "epoch": 0.4560245761747639, "grad_norm": 0.45572495460510254, "learning_rate": 0.00011537270226259169, "loss": 2.5806, "step": 501 }, { "epoch": 0.45693480486972354, "grad_norm": 0.45138293504714966, "learning_rate": 0.00011508758362486358, "loss": 2.3935, "step": 502 }, { "epoch": 0.45784503356468315, "grad_norm": 0.4548013210296631, "learning_rate": 0.00011480233942364645, "loss": 2.321, "step": 503 }, { "epoch": 0.45875526225964275, "grad_norm": 0.434442400932312, "learning_rate": 0.00011451697203282982, "loss": 2.375, "step": 504 }, { "epoch": 0.45966549095460235, "grad_norm": 0.4139295816421509, "learning_rate": 0.00011423148382732853, "loss": 2.3997, "step": 505 }, { "epoch": 0.46057571964956195, "grad_norm": 0.46020230650901794, "learning_rate": 0.00011394587718306275, "loss": 2.5745, "step": 506 }, { "epoch": 0.46148594834452156, "grad_norm": 0.4194343090057373, "learning_rate": 0.00011366015447693837, "loss": 2.2597, "step": 507 }, { "epoch": 0.46239617703948116, "grad_norm": 0.43983832001686096, "learning_rate": 0.0001133743180868273, "loss": 2.3511, "step": 508 }, { "epoch": 0.46330640573444076, "grad_norm": 0.41047292947769165, "learning_rate": 0.00011308837039154739, "loss": 2.2614, "step": 509 }, { "epoch": 0.46421663442940037, "grad_norm": 0.4110110104084015, "learning_rate": 0.0001128023137708429, "loss": 2.2719, "step": 510 }, { "epoch": 0.46512686312435997, "grad_norm": 0.41848358511924744, "learning_rate": 0.0001125161506053646, "loss": 2.3872, "step": 511 }, { "epoch": 0.4660370918193196, "grad_norm": 0.39852631092071533, "learning_rate": 0.00011222988327664997, "loss": 2.2001, "step": 512 }, { "epoch": 0.46694732051427923, "grad_norm": 0.4060978293418884, "learning_rate": 0.00011194351416710324, "loss": 2.2474, "step": 513 }, { "epoch": 0.46785754920923883, "grad_norm": 0.4010358452796936, "learning_rate": 0.00011165704565997593, "loss": 2.1262, "step": 514 }, { "epoch": 0.46876777790419843, "grad_norm": 0.4063378572463989, "learning_rate": 0.00011137048013934656, "loss": 2.1583, "step": 515 }, { "epoch": 0.46967800659915804, "grad_norm": 0.40287846326828003, "learning_rate": 0.00011108381999010111, "loss": 2.2351, "step": 516 }, { "epoch": 0.47058823529411764, "grad_norm": 0.3861018717288971, "learning_rate": 0.00011079706759791311, "loss": 2.195, "step": 517 }, { "epoch": 0.47149846398907724, "grad_norm": 0.38855546712875366, "learning_rate": 0.00011051022534922371, "loss": 2.1575, "step": 518 }, { "epoch": 0.47240869268403685, "grad_norm": 0.3941628038883209, "learning_rate": 0.00011022329563122191, "loss": 2.2324, "step": 519 }, { "epoch": 0.47331892137899645, "grad_norm": 0.40604814887046814, "learning_rate": 0.00010993628083182467, "loss": 2.1641, "step": 520 }, { "epoch": 0.4742291500739561, "grad_norm": 0.407815158367157, "learning_rate": 0.000109649183339657, "loss": 2.1648, "step": 521 }, { "epoch": 0.4751393787689157, "grad_norm": 0.400680810213089, "learning_rate": 0.00010936200554403209, "loss": 2.1939, "step": 522 }, { "epoch": 0.4760496074638753, "grad_norm": 0.416537344455719, "learning_rate": 0.00010907474983493144, "loss": 2.1694, "step": 523 }, { "epoch": 0.4769598361588349, "grad_norm": 0.4097869396209717, "learning_rate": 0.00010878741860298503, "loss": 2.1785, "step": 524 }, { "epoch": 0.4778700648537945, "grad_norm": 0.4243004024028778, "learning_rate": 0.00010850001423945126, "loss": 1.9963, "step": 525 }, { "epoch": 0.4787802935487541, "grad_norm": 0.41958731412887573, "learning_rate": 0.00010821253913619726, "loss": 2.1629, "step": 526 }, { "epoch": 0.4796905222437137, "grad_norm": 0.4177284240722656, "learning_rate": 0.00010792499568567884, "loss": 2.1276, "step": 527 }, { "epoch": 0.4806007509386733, "grad_norm": 0.41077664494514465, "learning_rate": 0.00010763738628092062, "loss": 2.0852, "step": 528 }, { "epoch": 0.48151097963363293, "grad_norm": 0.4098223149776459, "learning_rate": 0.00010734971331549603, "loss": 1.9977, "step": 529 }, { "epoch": 0.48242120832859253, "grad_norm": 0.42255935072898865, "learning_rate": 0.00010706197918350758, "loss": 1.9822, "step": 530 }, { "epoch": 0.4833314370235522, "grad_norm": 0.45597127079963684, "learning_rate": 0.0001067741862795668, "loss": 2.1072, "step": 531 }, { "epoch": 0.4842416657185118, "grad_norm": 0.4538208544254303, "learning_rate": 0.0001064863369987743, "loss": 2.41, "step": 532 }, { "epoch": 0.4851518944134714, "grad_norm": 0.4586673676967621, "learning_rate": 0.00010619843373669993, "loss": 2.1736, "step": 533 }, { "epoch": 0.486062123108431, "grad_norm": 0.4433608055114746, "learning_rate": 0.00010591047888936274, "loss": 2.1324, "step": 534 }, { "epoch": 0.4869723518033906, "grad_norm": 0.4421234428882599, "learning_rate": 0.00010562247485321115, "loss": 2.0689, "step": 535 }, { "epoch": 0.4878825804983502, "grad_norm": 0.46843069791793823, "learning_rate": 0.00010533442402510284, "loss": 2.2252, "step": 536 }, { "epoch": 0.4887928091933098, "grad_norm": 0.4747142493724823, "learning_rate": 0.00010504632880228498, "loss": 2.2503, "step": 537 }, { "epoch": 0.4897030378882694, "grad_norm": 0.46643224358558655, "learning_rate": 0.00010475819158237425, "loss": 2.2628, "step": 538 }, { "epoch": 0.490613266583229, "grad_norm": 0.47085490822792053, "learning_rate": 0.00010447001476333673, "loss": 2.0888, "step": 539 }, { "epoch": 0.49152349527818867, "grad_norm": 0.5102598071098328, "learning_rate": 0.00010418180074346815, "loss": 2.2736, "step": 540 }, { "epoch": 0.4924337239731483, "grad_norm": 0.49878573417663574, "learning_rate": 0.00010389355192137377, "loss": 2.1107, "step": 541 }, { "epoch": 0.4933439526681079, "grad_norm": 0.5236616134643555, "learning_rate": 0.00010360527069594859, "loss": 2.4099, "step": 542 }, { "epoch": 0.4942541813630675, "grad_norm": 0.49875032901763916, "learning_rate": 0.00010331695946635708, "loss": 2.1381, "step": 543 }, { "epoch": 0.4951644100580271, "grad_norm": 0.5333012938499451, "learning_rate": 0.00010302862063201367, "loss": 2.2274, "step": 544 }, { "epoch": 0.4960746387529867, "grad_norm": 0.5504993200302124, "learning_rate": 0.00010274025659256232, "loss": 2.2348, "step": 545 }, { "epoch": 0.4969848674479463, "grad_norm": 0.5924202799797058, "learning_rate": 0.00010245186974785685, "loss": 2.3686, "step": 546 }, { "epoch": 0.4978950961429059, "grad_norm": 0.6003567576408386, "learning_rate": 0.00010216346249794087, "loss": 2.3336, "step": 547 }, { "epoch": 0.4988053248378655, "grad_norm": 0.6700019836425781, "learning_rate": 0.00010187503724302776, "loss": 2.4446, "step": 548 }, { "epoch": 0.4997155535328251, "grad_norm": 0.8171781897544861, "learning_rate": 0.00010158659638348081, "loss": 2.4278, "step": 549 }, { "epoch": 0.5006257822277848, "grad_norm": 1.4212020635604858, "learning_rate": 0.0001012981423197931, "loss": 2.6229, "step": 550 }, { "epoch": 0.5006257822277848, "eval_loss": 2.2479705810546875, "eval_runtime": 205.3622, "eval_samples_per_second": 9.013, "eval_steps_per_second": 4.509, "step": 550 }, { "epoch": 0.5015360109227444, "grad_norm": 0.4380887746810913, "learning_rate": 0.00010100967745256766, "loss": 2.4596, "step": 551 }, { "epoch": 0.502446239617704, "grad_norm": 0.4522015154361725, "learning_rate": 0.00010072120418249745, "loss": 2.3217, "step": 552 }, { "epoch": 0.5033564683126636, "grad_norm": 0.4581010043621063, "learning_rate": 0.00010043272491034523, "loss": 2.4948, "step": 553 }, { "epoch": 0.5042666970076232, "grad_norm": 0.4353744685649872, "learning_rate": 0.00010014424203692388, "loss": 2.3769, "step": 554 }, { "epoch": 0.5051769257025828, "grad_norm": 0.4150161147117615, "learning_rate": 9.985575796307615e-05, "loss": 2.3557, "step": 555 }, { "epoch": 0.5060871543975424, "grad_norm": 0.4532707631587982, "learning_rate": 9.956727508965481e-05, "loss": 2.3114, "step": 556 }, { "epoch": 0.506997383092502, "grad_norm": 0.42450255155563354, "learning_rate": 9.927879581750259e-05, "loss": 2.2911, "step": 557 }, { "epoch": 0.5079076117874616, "grad_norm": 0.42910221219062805, "learning_rate": 9.899032254743235e-05, "loss": 2.3062, "step": 558 }, { "epoch": 0.5088178404824212, "grad_norm": 0.42122408747673035, "learning_rate": 9.870185768020693e-05, "loss": 2.3294, "step": 559 }, { "epoch": 0.5097280691773808, "grad_norm": 0.4562203884124756, "learning_rate": 9.84134036165192e-05, "loss": 2.2674, "step": 560 }, { "epoch": 0.5106382978723404, "grad_norm": 0.3905525207519531, "learning_rate": 9.812496275697226e-05, "loss": 2.1259, "step": 561 }, { "epoch": 0.5115485265673, "grad_norm": 0.41641199588775635, "learning_rate": 9.783653750205915e-05, "loss": 2.2191, "step": 562 }, { "epoch": 0.5124587552622596, "grad_norm": 0.4090450704097748, "learning_rate": 9.754813025214317e-05, "loss": 2.2478, "step": 563 }, { "epoch": 0.5133689839572193, "grad_norm": 0.4293293356895447, "learning_rate": 9.725974340743769e-05, "loss": 2.3854, "step": 564 }, { "epoch": 0.5142792126521789, "grad_norm": 0.4138126075267792, "learning_rate": 9.697137936798634e-05, "loss": 2.2903, "step": 565 }, { "epoch": 0.5151894413471385, "grad_norm": 0.3979492783546448, "learning_rate": 9.668304053364294e-05, "loss": 2.1878, "step": 566 }, { "epoch": 0.5160996700420981, "grad_norm": 0.38530561327934265, "learning_rate": 9.639472930405143e-05, "loss": 2.1464, "step": 567 }, { "epoch": 0.5170098987370577, "grad_norm": 0.4263143837451935, "learning_rate": 9.610644807862625e-05, "loss": 2.1856, "step": 568 }, { "epoch": 0.5179201274320173, "grad_norm": 0.41127926111221313, "learning_rate": 9.581819925653188e-05, "loss": 2.198, "step": 569 }, { "epoch": 0.5188303561269769, "grad_norm": 0.3917299509048462, "learning_rate": 9.552998523666326e-05, "loss": 2.1325, "step": 570 }, { "epoch": 0.5197405848219365, "grad_norm": 0.39180508255958557, "learning_rate": 9.524180841762577e-05, "loss": 2.0702, "step": 571 }, { "epoch": 0.5206508135168961, "grad_norm": 0.3829837739467621, "learning_rate": 9.495367119771503e-05, "loss": 1.8913, "step": 572 }, { "epoch": 0.5215610422118557, "grad_norm": 0.412706196308136, "learning_rate": 9.46655759748972e-05, "loss": 2.1221, "step": 573 }, { "epoch": 0.5224712709068153, "grad_norm": 0.39748141169548035, "learning_rate": 9.437752514678887e-05, "loss": 2.0427, "step": 574 }, { "epoch": 0.5233814996017749, "grad_norm": 0.42854538559913635, "learning_rate": 9.408952111063727e-05, "loss": 2.121, "step": 575 }, { "epoch": 0.5242917282967345, "grad_norm": 0.414654016494751, "learning_rate": 9.380156626330009e-05, "loss": 2.038, "step": 576 }, { "epoch": 0.5252019569916941, "grad_norm": 0.4241427183151245, "learning_rate": 9.35136630012257e-05, "loss": 2.1312, "step": 577 }, { "epoch": 0.5261121856866537, "grad_norm": 0.42928779125213623, "learning_rate": 9.322581372043321e-05, "loss": 2.1875, "step": 578 }, { "epoch": 0.5270224143816133, "grad_norm": 0.4133308231830597, "learning_rate": 9.293802081649243e-05, "loss": 2.0477, "step": 579 }, { "epoch": 0.527932643076573, "grad_norm": 0.427898645401001, "learning_rate": 9.265028668450402e-05, "loss": 2.0833, "step": 580 }, { "epoch": 0.5288428717715326, "grad_norm": 0.4321751892566681, "learning_rate": 9.23626137190794e-05, "loss": 2.1698, "step": 581 }, { "epoch": 0.5297531004664922, "grad_norm": 0.4715782105922699, "learning_rate": 9.207500431432115e-05, "loss": 2.1347, "step": 582 }, { "epoch": 0.5306633291614519, "grad_norm": 0.45599547028541565, "learning_rate": 9.178746086380275e-05, "loss": 2.1469, "step": 583 }, { "epoch": 0.5315735578564115, "grad_norm": 0.45286545157432556, "learning_rate": 9.149998576054874e-05, "loss": 2.2013, "step": 584 }, { "epoch": 0.5324837865513711, "grad_norm": 0.47089457511901855, "learning_rate": 9.121258139701502e-05, "loss": 2.2125, "step": 585 }, { "epoch": 0.5333940152463307, "grad_norm": 0.46750229597091675, "learning_rate": 9.092525016506858e-05, "loss": 2.1186, "step": 586 }, { "epoch": 0.5343042439412903, "grad_norm": 0.4931905269622803, "learning_rate": 9.063799445596795e-05, "loss": 2.2185, "step": 587 }, { "epoch": 0.5352144726362499, "grad_norm": 0.48538026213645935, "learning_rate": 9.035081666034304e-05, "loss": 2.2369, "step": 588 }, { "epoch": 0.5361247013312095, "grad_norm": 0.4944066107273102, "learning_rate": 9.006371916817534e-05, "loss": 2.2771, "step": 589 }, { "epoch": 0.5370349300261691, "grad_norm": 0.4564894139766693, "learning_rate": 8.977670436877811e-05, "loss": 2.0879, "step": 590 }, { "epoch": 0.5379451587211287, "grad_norm": 0.5046347379684448, "learning_rate": 8.948977465077632e-05, "loss": 2.2197, "step": 591 }, { "epoch": 0.5388553874160883, "grad_norm": 0.49683472514152527, "learning_rate": 8.920293240208694e-05, "loss": 2.2152, "step": 592 }, { "epoch": 0.5397656161110479, "grad_norm": 0.5223331451416016, "learning_rate": 8.891618000989891e-05, "loss": 2.3358, "step": 593 }, { "epoch": 0.5406758448060075, "grad_norm": 0.5552563667297363, "learning_rate": 8.862951986065345e-05, "loss": 2.1608, "step": 594 }, { "epoch": 0.5415860735009671, "grad_norm": 0.5853347778320312, "learning_rate": 8.83429543400241e-05, "loss": 2.3679, "step": 595 }, { "epoch": 0.5424963021959267, "grad_norm": 0.5858141183853149, "learning_rate": 8.805648583289674e-05, "loss": 2.341, "step": 596 }, { "epoch": 0.5434065308908863, "grad_norm": 0.6405509114265442, "learning_rate": 8.777011672335008e-05, "loss": 2.4773, "step": 597 }, { "epoch": 0.5443167595858459, "grad_norm": 0.7342801094055176, "learning_rate": 8.748384939463543e-05, "loss": 2.557, "step": 598 }, { "epoch": 0.5452269882808055, "grad_norm": 0.8813995122909546, "learning_rate": 8.719768622915714e-05, "loss": 2.5595, "step": 599 }, { "epoch": 0.5461372169757651, "grad_norm": 1.722114086151123, "learning_rate": 8.691162960845264e-05, "loss": 2.7211, "step": 600 }, { "epoch": 0.5470474456707247, "grad_norm": 0.424265056848526, "learning_rate": 8.662568191317273e-05, "loss": 2.3728, "step": 601 }, { "epoch": 0.5479576743656844, "grad_norm": 0.45933809876441956, "learning_rate": 8.633984552306164e-05, "loss": 2.4234, "step": 602 }, { "epoch": 0.548867903060644, "grad_norm": 0.45455530285835266, "learning_rate": 8.605412281693727e-05, "loss": 2.5062, "step": 603 }, { "epoch": 0.5497781317556036, "grad_norm": 0.4334143400192261, "learning_rate": 8.57685161726715e-05, "loss": 2.3087, "step": 604 }, { "epoch": 0.5506883604505632, "grad_norm": 0.4537433385848999, "learning_rate": 8.548302796717019e-05, "loss": 2.395, "step": 605 }, { "epoch": 0.5515985891455228, "grad_norm": 0.43673837184906006, "learning_rate": 8.519766057635355e-05, "loss": 2.3855, "step": 606 }, { "epoch": 0.5525088178404824, "grad_norm": 0.43078145384788513, "learning_rate": 8.491241637513644e-05, "loss": 2.2559, "step": 607 }, { "epoch": 0.553419046535442, "grad_norm": 0.4094640612602234, "learning_rate": 8.462729773740832e-05, "loss": 2.295, "step": 608 }, { "epoch": 0.5543292752304017, "grad_norm": 0.4126126170158386, "learning_rate": 8.434230703601384e-05, "loss": 2.2019, "step": 609 }, { "epoch": 0.5552395039253613, "grad_norm": 0.4372231066226959, "learning_rate": 8.405744664273278e-05, "loss": 2.4243, "step": 610 }, { "epoch": 0.5561497326203209, "grad_norm": 0.42160138487815857, "learning_rate": 8.37727189282606e-05, "loss": 2.2805, "step": 611 }, { "epoch": 0.5570599613152805, "grad_norm": 0.4336857795715332, "learning_rate": 8.34881262621884e-05, "loss": 2.4811, "step": 612 }, { "epoch": 0.5579701900102401, "grad_norm": 0.40520837903022766, "learning_rate": 8.320367101298351e-05, "loss": 2.1731, "step": 613 }, { "epoch": 0.5588804187051997, "grad_norm": 0.42664197087287903, "learning_rate": 8.291935554796962e-05, "loss": 2.3403, "step": 614 }, { "epoch": 0.5597906474001593, "grad_norm": 0.4109039902687073, "learning_rate": 8.263518223330697e-05, "loss": 2.2425, "step": 615 }, { "epoch": 0.5607008760951189, "grad_norm": 0.4032575786113739, "learning_rate": 8.235115343397295e-05, "loss": 2.2593, "step": 616 }, { "epoch": 0.5616111047900785, "grad_norm": 0.3929396867752075, "learning_rate": 8.206727151374207e-05, "loss": 2.0896, "step": 617 }, { "epoch": 0.5625213334850381, "grad_norm": 0.38567835092544556, "learning_rate": 8.178353883516664e-05, "loss": 2.0715, "step": 618 }, { "epoch": 0.5634315621799977, "grad_norm": 0.405369371175766, "learning_rate": 8.149995775955686e-05, "loss": 2.2249, "step": 619 }, { "epoch": 0.5643417908749573, "grad_norm": 0.3889697790145874, "learning_rate": 8.121653064696118e-05, "loss": 2.0797, "step": 620 }, { "epoch": 0.565252019569917, "grad_norm": 0.4065384864807129, "learning_rate": 8.093325985614685e-05, "loss": 2.2012, "step": 621 }, { "epoch": 0.5661622482648766, "grad_norm": 0.4066416323184967, "learning_rate": 8.065014774458003e-05, "loss": 2.1183, "step": 622 }, { "epoch": 0.5670724769598362, "grad_norm": 0.40575870871543884, "learning_rate": 8.036719666840647e-05, "loss": 2.0258, "step": 623 }, { "epoch": 0.5679827056547958, "grad_norm": 0.42911243438720703, "learning_rate": 8.008440898243149e-05, "loss": 2.1186, "step": 624 }, { "epoch": 0.5688929343497554, "grad_norm": 0.4009549021720886, "learning_rate": 7.980178704010089e-05, "loss": 2.0049, "step": 625 }, { "epoch": 0.569803163044715, "grad_norm": 0.41156989336013794, "learning_rate": 7.951933319348095e-05, "loss": 2.0272, "step": 626 }, { "epoch": 0.5707133917396746, "grad_norm": 0.4248954653739929, "learning_rate": 7.923704979323899e-05, "loss": 2.077, "step": 627 }, { "epoch": 0.5716236204346342, "grad_norm": 0.45484524965286255, "learning_rate": 7.895493918862396e-05, "loss": 2.2255, "step": 628 }, { "epoch": 0.5725338491295938, "grad_norm": 0.4571921229362488, "learning_rate": 7.867300372744657e-05, "loss": 2.1373, "step": 629 }, { "epoch": 0.5734440778245534, "grad_norm": 0.44238901138305664, "learning_rate": 7.839124575606004e-05, "loss": 2.1147, "step": 630 }, { "epoch": 0.574354306519513, "grad_norm": 0.4206310510635376, "learning_rate": 7.810966761934053e-05, "loss": 2.0508, "step": 631 }, { "epoch": 0.5752645352144726, "grad_norm": 0.43381330370903015, "learning_rate": 7.782827166066739e-05, "loss": 2.0847, "step": 632 }, { "epoch": 0.5761747639094322, "grad_norm": 0.4460139572620392, "learning_rate": 7.754706022190398e-05, "loss": 2.1288, "step": 633 }, { "epoch": 0.5770849926043918, "grad_norm": 0.4371720850467682, "learning_rate": 7.726603564337791e-05, "loss": 2.0476, "step": 634 }, { "epoch": 0.5779952212993514, "grad_norm": 0.4623599052429199, "learning_rate": 7.69852002638618e-05, "loss": 2.2858, "step": 635 }, { "epoch": 0.578905449994311, "grad_norm": 0.4422992765903473, "learning_rate": 7.670455642055361e-05, "loss": 2.1072, "step": 636 }, { "epoch": 0.5798156786892706, "grad_norm": 0.4804936647415161, "learning_rate": 7.642410644905726e-05, "loss": 2.2218, "step": 637 }, { "epoch": 0.5807259073842302, "grad_norm": 0.48249900341033936, "learning_rate": 7.614385268336336e-05, "loss": 2.2916, "step": 638 }, { "epoch": 0.5816361360791898, "grad_norm": 0.46635982394218445, "learning_rate": 7.586379745582944e-05, "loss": 2.1636, "step": 639 }, { "epoch": 0.5825463647741494, "grad_norm": 0.4670505225658417, "learning_rate": 7.558394309716088e-05, "loss": 2.2052, "step": 640 }, { "epoch": 0.5834565934691092, "grad_norm": 0.49475541710853577, "learning_rate": 7.530429193639128e-05, "loss": 2.18, "step": 641 }, { "epoch": 0.5843668221640688, "grad_norm": 0.5231596231460571, "learning_rate": 7.502484630086318e-05, "loss": 2.2095, "step": 642 }, { "epoch": 0.5852770508590284, "grad_norm": 0.5045900344848633, "learning_rate": 7.474560851620873e-05, "loss": 2.053, "step": 643 }, { "epoch": 0.586187279553988, "grad_norm": 0.5511046051979065, "learning_rate": 7.446658090633026e-05, "loss": 2.2706, "step": 644 }, { "epoch": 0.5870975082489476, "grad_norm": 0.5700446963310242, "learning_rate": 7.41877657933809e-05, "loss": 2.3999, "step": 645 }, { "epoch": 0.5880077369439072, "grad_norm": 0.5792605876922607, "learning_rate": 7.390916549774536e-05, "loss": 2.2391, "step": 646 }, { "epoch": 0.5889179656388668, "grad_norm": 0.6770455241203308, "learning_rate": 7.363078233802063e-05, "loss": 2.6564, "step": 647 }, { "epoch": 0.5898281943338264, "grad_norm": 0.7092955708503723, "learning_rate": 7.335261863099651e-05, "loss": 2.4722, "step": 648 }, { "epoch": 0.590738423028786, "grad_norm": 0.8125056028366089, "learning_rate": 7.307467669163655e-05, "loss": 2.3581, "step": 649 }, { "epoch": 0.5916486517237456, "grad_norm": 1.5941253900527954, "learning_rate": 7.279695883305866e-05, "loss": 2.16, "step": 650 }, { "epoch": 0.5925588804187052, "grad_norm": 0.44398507475852966, "learning_rate": 7.251946736651582e-05, "loss": 2.4689, "step": 651 }, { "epoch": 0.5934691091136648, "grad_norm": 0.4264509975910187, "learning_rate": 7.224220460137701e-05, "loss": 2.4081, "step": 652 }, { "epoch": 0.5943793378086244, "grad_norm": 0.4222484529018402, "learning_rate": 7.196517284510773e-05, "loss": 2.3827, "step": 653 }, { "epoch": 0.595289566503584, "grad_norm": 0.4516051411628723, "learning_rate": 7.168837440325114e-05, "loss": 2.399, "step": 654 }, { "epoch": 0.5961997951985436, "grad_norm": 0.4370306730270386, "learning_rate": 7.141181157940859e-05, "loss": 2.3837, "step": 655 }, { "epoch": 0.5971100238935032, "grad_norm": 0.4236253798007965, "learning_rate": 7.11354866752205e-05, "loss": 2.3066, "step": 656 }, { "epoch": 0.5980202525884628, "grad_norm": 0.41718846559524536, "learning_rate": 7.085940199034735e-05, "loss": 2.3841, "step": 657 }, { "epoch": 0.5989304812834224, "grad_norm": 0.4299750030040741, "learning_rate": 7.058355982245037e-05, "loss": 2.3842, "step": 658 }, { "epoch": 0.599840709978382, "grad_norm": 0.4180915057659149, "learning_rate": 7.030796246717255e-05, "loss": 2.0758, "step": 659 }, { "epoch": 0.6007509386733417, "grad_norm": 0.45195114612579346, "learning_rate": 7.003261221811934e-05, "loss": 2.4826, "step": 660 }, { "epoch": 0.6016611673683013, "grad_norm": 0.4253404140472412, "learning_rate": 6.97575113668399e-05, "loss": 2.3705, "step": 661 }, { "epoch": 0.6025713960632609, "grad_norm": 0.4198931157588959, "learning_rate": 6.948266220280771e-05, "loss": 2.3396, "step": 662 }, { "epoch": 0.6034816247582205, "grad_norm": 0.43457460403442383, "learning_rate": 6.920806701340155e-05, "loss": 2.1447, "step": 663 }, { "epoch": 0.6043918534531801, "grad_norm": 0.40161874890327454, "learning_rate": 6.893372808388675e-05, "loss": 2.2443, "step": 664 }, { "epoch": 0.6053020821481397, "grad_norm": 0.4039609432220459, "learning_rate": 6.865964769739575e-05, "loss": 2.1815, "step": 665 }, { "epoch": 0.6062123108430993, "grad_norm": 0.4061351716518402, "learning_rate": 6.838582813490947e-05, "loss": 2.1073, "step": 666 }, { "epoch": 0.607122539538059, "grad_norm": 0.4206211268901825, "learning_rate": 6.811227167523815e-05, "loss": 2.2549, "step": 667 }, { "epoch": 0.6080327682330186, "grad_norm": 0.3936857283115387, "learning_rate": 6.783898059500233e-05, "loss": 2.1373, "step": 668 }, { "epoch": 0.6089429969279782, "grad_norm": 0.3954029083251953, "learning_rate": 6.756595716861407e-05, "loss": 2.1001, "step": 669 }, { "epoch": 0.6098532256229378, "grad_norm": 0.407713919878006, "learning_rate": 6.729320366825784e-05, "loss": 2.0967, "step": 670 }, { "epoch": 0.6107634543178974, "grad_norm": 0.41096213459968567, "learning_rate": 6.702072236387182e-05, "loss": 2.0899, "step": 671 }, { "epoch": 0.611673683012857, "grad_norm": 0.40465790033340454, "learning_rate": 6.674851552312878e-05, "loss": 2.089, "step": 672 }, { "epoch": 0.6125839117078166, "grad_norm": 0.3991434574127197, "learning_rate": 6.647658541141735e-05, "loss": 1.9788, "step": 673 }, { "epoch": 0.6134941404027762, "grad_norm": 0.42327383160591125, "learning_rate": 6.620493429182323e-05, "loss": 2.1672, "step": 674 }, { "epoch": 0.6144043690977358, "grad_norm": 0.4061299264431, "learning_rate": 6.593356442511015e-05, "loss": 2.1617, "step": 675 }, { "epoch": 0.6153145977926954, "grad_norm": 0.4082658588886261, "learning_rate": 6.566247806970119e-05, "loss": 2.0112, "step": 676 }, { "epoch": 0.616224826487655, "grad_norm": 0.43016669154167175, "learning_rate": 6.539167748165994e-05, "loss": 2.0, "step": 677 }, { "epoch": 0.6171350551826146, "grad_norm": 0.43142226338386536, "learning_rate": 6.512116491467185e-05, "loss": 2.1585, "step": 678 }, { "epoch": 0.6180452838775743, "grad_norm": 0.4271491467952728, "learning_rate": 6.485094262002529e-05, "loss": 1.9628, "step": 679 }, { "epoch": 0.6189555125725339, "grad_norm": 0.44002971053123474, "learning_rate": 6.458101284659286e-05, "loss": 2.2214, "step": 680 }, { "epoch": 0.6198657412674935, "grad_norm": 0.4215126931667328, "learning_rate": 6.431137784081282e-05, "loss": 2.0377, "step": 681 }, { "epoch": 0.6207759699624531, "grad_norm": 0.46792343258857727, "learning_rate": 6.404203984667019e-05, "loss": 2.029, "step": 682 }, { "epoch": 0.6216861986574127, "grad_norm": 0.45737308263778687, "learning_rate": 6.377300110567821e-05, "loss": 2.2375, "step": 683 }, { "epoch": 0.6225964273523723, "grad_norm": 0.4526033401489258, "learning_rate": 6.350426385685957e-05, "loss": 2.2562, "step": 684 }, { "epoch": 0.6235066560473319, "grad_norm": 0.45917776226997375, "learning_rate": 6.323583033672799e-05, "loss": 2.1321, "step": 685 }, { "epoch": 0.6244168847422915, "grad_norm": 0.4713301658630371, "learning_rate": 6.296770277926937e-05, "loss": 2.07, "step": 686 }, { "epoch": 0.6253271134372511, "grad_norm": 0.5036799907684326, "learning_rate": 6.269988341592328e-05, "loss": 2.1103, "step": 687 }, { "epoch": 0.6262373421322107, "grad_norm": 0.4843004643917084, "learning_rate": 6.243237447556449e-05, "loss": 2.0936, "step": 688 }, { "epoch": 0.6271475708271703, "grad_norm": 0.4738497734069824, "learning_rate": 6.216517818448423e-05, "loss": 2.1004, "step": 689 }, { "epoch": 0.6280577995221299, "grad_norm": 0.5081862211227417, "learning_rate": 6.189829676637182e-05, "loss": 2.2177, "step": 690 }, { "epoch": 0.6289680282170895, "grad_norm": 0.5060831904411316, "learning_rate": 6.163173244229619e-05, "loss": 2.1342, "step": 691 }, { "epoch": 0.6298782569120491, "grad_norm": 0.5047332644462585, "learning_rate": 6.136548743068713e-05, "loss": 2.0727, "step": 692 }, { "epoch": 0.6307884856070087, "grad_norm": 0.5174648761749268, "learning_rate": 6.109956394731722e-05, "loss": 2.0623, "step": 693 }, { "epoch": 0.6316987143019683, "grad_norm": 0.5705139636993408, "learning_rate": 6.083396420528298e-05, "loss": 2.4454, "step": 694 }, { "epoch": 0.6326089429969279, "grad_norm": 0.5653088092803955, "learning_rate": 6.056869041498687e-05, "loss": 2.2071, "step": 695 }, { "epoch": 0.6335191716918875, "grad_norm": 0.6105946898460388, "learning_rate": 6.030374478411847e-05, "loss": 2.3081, "step": 696 }, { "epoch": 0.6344294003868471, "grad_norm": 0.6362658143043518, "learning_rate": 6.0039129517636435e-05, "loss": 2.3426, "step": 697 }, { "epoch": 0.6353396290818069, "grad_norm": 0.7242766618728638, "learning_rate": 5.9774846817750105e-05, "loss": 2.4877, "step": 698 }, { "epoch": 0.6362498577767665, "grad_norm": 0.9446219205856323, "learning_rate": 5.951089888390087e-05, "loss": 2.7741, "step": 699 }, { "epoch": 0.6371600864717261, "grad_norm": 1.5826219320297241, "learning_rate": 5.924728791274432e-05, "loss": 2.533, "step": 700 }, { "epoch": 0.6380703151666857, "grad_norm": 0.4994019567966461, "learning_rate": 5.89840160981316e-05, "loss": 2.4388, "step": 701 }, { "epoch": 0.6389805438616453, "grad_norm": 0.45213326811790466, "learning_rate": 5.872108563109131e-05, "loss": 2.3644, "step": 702 }, { "epoch": 0.6398907725566049, "grad_norm": 0.45655617117881775, "learning_rate": 5.845849869981137e-05, "loss": 2.5202, "step": 703 }, { "epoch": 0.6408010012515645, "grad_norm": 0.41640329360961914, "learning_rate": 5.819625748962049e-05, "loss": 2.3097, "step": 704 }, { "epoch": 0.6417112299465241, "grad_norm": 0.43625307083129883, "learning_rate": 5.79343641829704e-05, "loss": 2.3931, "step": 705 }, { "epoch": 0.6426214586414837, "grad_norm": 0.44178506731987, "learning_rate": 5.7672820959417254e-05, "loss": 2.3195, "step": 706 }, { "epoch": 0.6435316873364433, "grad_norm": 0.4416089951992035, "learning_rate": 5.741162999560386e-05, "loss": 2.2446, "step": 707 }, { "epoch": 0.6444419160314029, "grad_norm": 0.4419204890727997, "learning_rate": 5.7150793465241346e-05, "loss": 2.34, "step": 708 }, { "epoch": 0.6453521447263625, "grad_norm": 0.45422717928886414, "learning_rate": 5.68903135390912e-05, "loss": 2.1915, "step": 709 }, { "epoch": 0.6462623734213221, "grad_norm": 0.41635921597480774, "learning_rate": 5.663019238494704e-05, "loss": 2.3147, "step": 710 }, { "epoch": 0.6471726021162817, "grad_norm": 0.4240402579307556, "learning_rate": 5.637043216761678e-05, "loss": 2.1693, "step": 711 }, { "epoch": 0.6480828308112413, "grad_norm": 0.41989627480506897, "learning_rate": 5.611103504890444e-05, "loss": 2.2087, "step": 712 }, { "epoch": 0.6489930595062009, "grad_norm": 0.4187220335006714, "learning_rate": 5.5852003187592226e-05, "loss": 2.3818, "step": 713 }, { "epoch": 0.6499032882011605, "grad_norm": 0.43209201097488403, "learning_rate": 5.559333873942259e-05, "loss": 2.3176, "step": 714 }, { "epoch": 0.6508135168961201, "grad_norm": 0.416291207075119, "learning_rate": 5.533504385708024e-05, "loss": 2.2397, "step": 715 }, { "epoch": 0.6517237455910797, "grad_norm": 0.411857932806015, "learning_rate": 5.5077120690174246e-05, "loss": 2.1142, "step": 716 }, { "epoch": 0.6526339742860394, "grad_norm": 0.4142412543296814, "learning_rate": 5.481957138522018e-05, "loss": 2.2226, "step": 717 }, { "epoch": 0.653544202980999, "grad_norm": 0.4322018325328827, "learning_rate": 5.456239808562209e-05, "loss": 2.2078, "step": 718 }, { "epoch": 0.6544544316759586, "grad_norm": 0.42210081219673157, "learning_rate": 5.4305602931655045e-05, "loss": 2.0579, "step": 719 }, { "epoch": 0.6553646603709182, "grad_norm": 0.4076862335205078, "learning_rate": 5.404918806044679e-05, "loss": 2.1348, "step": 720 }, { "epoch": 0.6562748890658778, "grad_norm": 0.43610820174217224, "learning_rate": 5.379315560596038e-05, "loss": 2.2212, "step": 721 }, { "epoch": 0.6571851177608374, "grad_norm": 0.41793620586395264, "learning_rate": 5.3537507698976365e-05, "loss": 1.9606, "step": 722 }, { "epoch": 0.658095346455797, "grad_norm": 0.40680915117263794, "learning_rate": 5.328224646707479e-05, "loss": 2.0167, "step": 723 }, { "epoch": 0.6590055751507566, "grad_norm": 0.424699991941452, "learning_rate": 5.3027374034617785e-05, "loss": 2.1065, "step": 724 }, { "epoch": 0.6599158038457162, "grad_norm": 0.43258509039878845, "learning_rate": 5.277289252273174e-05, "loss": 2.0974, "step": 725 }, { "epoch": 0.6608260325406758, "grad_norm": 0.4318636655807495, "learning_rate": 5.251880404928971e-05, "loss": 2.3214, "step": 726 }, { "epoch": 0.6617362612356354, "grad_norm": 0.4147786498069763, "learning_rate": 5.226511072889371e-05, "loss": 2.1223, "step": 727 }, { "epoch": 0.662646489930595, "grad_norm": 0.4131387770175934, "learning_rate": 5.201181467285723e-05, "loss": 1.8335, "step": 728 }, { "epoch": 0.6635567186255547, "grad_norm": 0.456827849149704, "learning_rate": 5.175891798918757e-05, "loss": 2.1428, "step": 729 }, { "epoch": 0.6644669473205143, "grad_norm": 0.47478604316711426, "learning_rate": 5.1506422782568345e-05, "loss": 2.0526, "step": 730 }, { "epoch": 0.6653771760154739, "grad_norm": 0.4357281029224396, "learning_rate": 5.125433115434197e-05, "loss": 1.8949, "step": 731 }, { "epoch": 0.6662874047104335, "grad_norm": 0.45749080181121826, "learning_rate": 5.100264520249205e-05, "loss": 2.1637, "step": 732 }, { "epoch": 0.6671976334053931, "grad_norm": 0.47157686948776245, "learning_rate": 5.0751367021626215e-05, "loss": 2.1036, "step": 733 }, { "epoch": 0.6681078621003527, "grad_norm": 0.4490397274494171, "learning_rate": 5.050049870295841e-05, "loss": 1.9553, "step": 734 }, { "epoch": 0.6690180907953123, "grad_norm": 0.4798765480518341, "learning_rate": 5.025004233429145e-05, "loss": 2.0954, "step": 735 }, { "epoch": 0.669928319490272, "grad_norm": 0.5034172534942627, "learning_rate": 5.000000000000002e-05, "loss": 2.2027, "step": 736 }, { "epoch": 0.6708385481852316, "grad_norm": 0.481141060590744, "learning_rate": 4.9750373781012885e-05, "loss": 2.0822, "step": 737 }, { "epoch": 0.6717487768801912, "grad_norm": 0.49731481075286865, "learning_rate": 4.950116575479586e-05, "loss": 2.0196, "step": 738 }, { "epoch": 0.6726590055751508, "grad_norm": 0.48321208357810974, "learning_rate": 4.9252377995334444e-05, "loss": 1.9995, "step": 739 }, { "epoch": 0.6735692342701104, "grad_norm": 0.5173296332359314, "learning_rate": 4.90040125731165e-05, "loss": 2.2078, "step": 740 }, { "epoch": 0.67447946296507, "grad_norm": 0.5107961297035217, "learning_rate": 4.87560715551151e-05, "loss": 2.0684, "step": 741 }, { "epoch": 0.6753896916600296, "grad_norm": 0.522492527961731, "learning_rate": 4.85085570047713e-05, "loss": 2.2168, "step": 742 }, { "epoch": 0.6762999203549892, "grad_norm": 0.5417767763137817, "learning_rate": 4.826147098197691e-05, "loss": 2.2379, "step": 743 }, { "epoch": 0.6772101490499488, "grad_norm": 0.5735164284706116, "learning_rate": 4.8014815543057475e-05, "loss": 2.2132, "step": 744 }, { "epoch": 0.6781203777449084, "grad_norm": 0.5819071531295776, "learning_rate": 4.776859274075506e-05, "loss": 2.2469, "step": 745 }, { "epoch": 0.679030606439868, "grad_norm": 0.6113678216934204, "learning_rate": 4.752280462421117e-05, "loss": 2.3064, "step": 746 }, { "epoch": 0.6799408351348276, "grad_norm": 0.6506679654121399, "learning_rate": 4.727745323894976e-05, "loss": 2.3311, "step": 747 }, { "epoch": 0.6808510638297872, "grad_norm": 0.7372251152992249, "learning_rate": 4.703254062686017e-05, "loss": 2.575, "step": 748 }, { "epoch": 0.6817612925247468, "grad_norm": 0.8235337734222412, "learning_rate": 4.678806882618003e-05, "loss": 2.4711, "step": 749 }, { "epoch": 0.6826715212197064, "grad_norm": 1.295682430267334, "learning_rate": 4.654403987147865e-05, "loss": 2.5713, "step": 750 }, { "epoch": 0.683581749914666, "grad_norm": 0.46585023403167725, "learning_rate": 4.630045579363957e-05, "loss": 2.4203, "step": 751 }, { "epoch": 0.6844919786096256, "grad_norm": 0.4163340628147125, "learning_rate": 4.605731861984401e-05, "loss": 2.261, "step": 752 }, { "epoch": 0.6854022073045852, "grad_norm": 0.4268229007720947, "learning_rate": 4.5814630373554115e-05, "loss": 2.3078, "step": 753 }, { "epoch": 0.6863124359995448, "grad_norm": 0.43664565682411194, "learning_rate": 4.557239307449561e-05, "loss": 2.5044, "step": 754 }, { "epoch": 0.6872226646945045, "grad_norm": 0.44144201278686523, "learning_rate": 4.5330608738641486e-05, "loss": 2.4192, "step": 755 }, { "epoch": 0.6881328933894642, "grad_norm": 0.40867024660110474, "learning_rate": 4.508927937819499e-05, "loss": 2.1908, "step": 756 }, { "epoch": 0.6890431220844238, "grad_norm": 0.4147084057331085, "learning_rate": 4.484840700157295e-05, "loss": 2.2864, "step": 757 }, { "epoch": 0.6899533507793834, "grad_norm": 0.413703054189682, "learning_rate": 4.4607993613388976e-05, "loss": 2.2436, "step": 758 }, { "epoch": 0.690863579474343, "grad_norm": 0.42309460043907166, "learning_rate": 4.436804121443689e-05, "loss": 2.3444, "step": 759 }, { "epoch": 0.6917738081693026, "grad_norm": 0.4229698181152344, "learning_rate": 4.412855180167406e-05, "loss": 2.3264, "step": 760 }, { "epoch": 0.6926840368642622, "grad_norm": 0.4247763156890869, "learning_rate": 4.388952736820453e-05, "loss": 2.257, "step": 761 }, { "epoch": 0.6935942655592218, "grad_norm": 0.42383337020874023, "learning_rate": 4.365096990326297e-05, "loss": 2.1349, "step": 762 }, { "epoch": 0.6945044942541814, "grad_norm": 0.4181368350982666, "learning_rate": 4.3412881392197526e-05, "loss": 2.2587, "step": 763 }, { "epoch": 0.695414722949141, "grad_norm": 0.4086921811103821, "learning_rate": 4.317526381645363e-05, "loss": 2.2378, "step": 764 }, { "epoch": 0.6963249516441006, "grad_norm": 0.4001654088497162, "learning_rate": 4.293811915355761e-05, "loss": 2.1708, "step": 765 }, { "epoch": 0.6972351803390602, "grad_norm": 0.42960214614868164, "learning_rate": 4.270144937709981e-05, "loss": 2.1556, "step": 766 }, { "epoch": 0.6981454090340198, "grad_norm": 0.42000848054885864, "learning_rate": 4.2465256456718615e-05, "loss": 2.1182, "step": 767 }, { "epoch": 0.6990556377289794, "grad_norm": 0.394368052482605, "learning_rate": 4.222954235808378e-05, "loss": 2.0486, "step": 768 }, { "epoch": 0.699965866423939, "grad_norm": 0.3934192359447479, "learning_rate": 4.19943090428802e-05, "loss": 1.9222, "step": 769 }, { "epoch": 0.7008760951188986, "grad_norm": 0.44399651885032654, "learning_rate": 4.175955846879151e-05, "loss": 2.1621, "step": 770 }, { "epoch": 0.7017863238138582, "grad_norm": 0.4081316888332367, "learning_rate": 4.1525292589483843e-05, "loss": 1.9534, "step": 771 }, { "epoch": 0.7026965525088178, "grad_norm": 0.42008188366889954, "learning_rate": 4.129151335458957e-05, "loss": 1.9773, "step": 772 }, { "epoch": 0.7036067812037774, "grad_norm": 0.4209793508052826, "learning_rate": 4.105822270969102e-05, "loss": 2.0403, "step": 773 }, { "epoch": 0.7045170098987371, "grad_norm": 0.43969592452049255, "learning_rate": 4.0825422596304396e-05, "loss": 2.1796, "step": 774 }, { "epoch": 0.7054272385936967, "grad_norm": 0.4333605468273163, "learning_rate": 4.059311495186338e-05, "loss": 2.1115, "step": 775 }, { "epoch": 0.7063374672886563, "grad_norm": 0.42669251561164856, "learning_rate": 4.036130170970341e-05, "loss": 2.1563, "step": 776 }, { "epoch": 0.7072476959836159, "grad_norm": 0.45064857602119446, "learning_rate": 4.012998479904525e-05, "loss": 2.2003, "step": 777 }, { "epoch": 0.7081579246785755, "grad_norm": 0.4487575888633728, "learning_rate": 3.9899166144978904e-05, "loss": 2.1332, "step": 778 }, { "epoch": 0.7090681533735351, "grad_norm": 0.4423072636127472, "learning_rate": 3.966884766844803e-05, "loss": 2.1449, "step": 779 }, { "epoch": 0.7099783820684947, "grad_norm": 0.4436761736869812, "learning_rate": 3.943903128623335e-05, "loss": 2.0025, "step": 780 }, { "epoch": 0.7108886107634543, "grad_norm": 0.4364500045776367, "learning_rate": 3.920971891093718e-05, "loss": 1.9834, "step": 781 }, { "epoch": 0.7117988394584139, "grad_norm": 0.44009071588516235, "learning_rate": 3.8980912450967366e-05, "loss": 2.0204, "step": 782 }, { "epoch": 0.7127090681533735, "grad_norm": 0.4498206079006195, "learning_rate": 3.875261381052121e-05, "loss": 2.0348, "step": 783 }, { "epoch": 0.7136192968483331, "grad_norm": 0.48029908537864685, "learning_rate": 3.852482488956992e-05, "loss": 2.0383, "step": 784 }, { "epoch": 0.7145295255432927, "grad_norm": 0.4986077845096588, "learning_rate": 3.829754758384262e-05, "loss": 2.3006, "step": 785 }, { "epoch": 0.7154397542382523, "grad_norm": 0.5001522302627563, "learning_rate": 3.807078378481059e-05, "loss": 2.3416, "step": 786 }, { "epoch": 0.716349982933212, "grad_norm": 0.5049505829811096, "learning_rate": 3.784453537967161e-05, "loss": 2.1652, "step": 787 }, { "epoch": 0.7172602116281716, "grad_norm": 0.5048404932022095, "learning_rate": 3.761880425133413e-05, "loss": 2.1345, "step": 788 }, { "epoch": 0.7181704403231312, "grad_norm": 0.4869529604911804, "learning_rate": 3.7393592278401704e-05, "loss": 2.0906, "step": 789 }, { "epoch": 0.7190806690180908, "grad_norm": 0.5454188585281372, "learning_rate": 3.7168901335157315e-05, "loss": 2.4214, "step": 790 }, { "epoch": 0.7199908977130504, "grad_norm": 0.5238876938819885, "learning_rate": 3.694473329154778e-05, "loss": 1.9798, "step": 791 }, { "epoch": 0.72090112640801, "grad_norm": 0.5545910596847534, "learning_rate": 3.672109001316809e-05, "loss": 2.4726, "step": 792 }, { "epoch": 0.7218113551029697, "grad_norm": 0.542072594165802, "learning_rate": 3.649797336124615e-05, "loss": 2.016, "step": 793 }, { "epoch": 0.7227215837979293, "grad_norm": 0.5355279445648193, "learning_rate": 3.6275385192627056e-05, "loss": 2.1041, "step": 794 }, { "epoch": 0.7236318124928889, "grad_norm": 0.5673330426216125, "learning_rate": 3.6053327359757535e-05, "loss": 2.1006, "step": 795 }, { "epoch": 0.7245420411878485, "grad_norm": 0.6170483231544495, "learning_rate": 3.583180171067101e-05, "loss": 2.3275, "step": 796 }, { "epoch": 0.7254522698828081, "grad_norm": 0.6877503991127014, "learning_rate": 3.5610810088971625e-05, "loss": 2.4504, "step": 797 }, { "epoch": 0.7263624985777677, "grad_norm": 0.7676892280578613, "learning_rate": 3.5390354333819344e-05, "loss": 2.6627, "step": 798 }, { "epoch": 0.7272727272727273, "grad_norm": 0.9272985458374023, "learning_rate": 3.517043627991441e-05, "loss": 2.5253, "step": 799 }, { "epoch": 0.7281829559676869, "grad_norm": 1.9746160507202148, "learning_rate": 3.4951057757482205e-05, "loss": 2.5993, "step": 800 }, { "epoch": 0.7290931846626465, "grad_norm": 0.4283389449119568, "learning_rate": 3.4732220592257946e-05, "loss": 2.5104, "step": 801 }, { "epoch": 0.7300034133576061, "grad_norm": 0.43243661522865295, "learning_rate": 3.45139266054715e-05, "loss": 2.1707, "step": 802 }, { "epoch": 0.7309136420525657, "grad_norm": 0.4262010455131531, "learning_rate": 3.429617761383222e-05, "loss": 2.2513, "step": 803 }, { "epoch": 0.7318238707475253, "grad_norm": 0.4308786392211914, "learning_rate": 3.40789754295139e-05, "loss": 2.4042, "step": 804 }, { "epoch": 0.7327340994424849, "grad_norm": 0.42787450551986694, "learning_rate": 3.3862321860139576e-05, "loss": 2.4259, "step": 805 }, { "epoch": 0.7336443281374445, "grad_norm": 0.4381856620311737, "learning_rate": 3.364621870876659e-05, "loss": 2.4046, "step": 806 }, { "epoch": 0.7345545568324041, "grad_norm": 0.4300837814807892, "learning_rate": 3.343066777387148e-05, "loss": 2.3713, "step": 807 }, { "epoch": 0.7354647855273637, "grad_norm": 0.4228179156780243, "learning_rate": 3.3215670849335155e-05, "loss": 2.2606, "step": 808 }, { "epoch": 0.7363750142223233, "grad_norm": 0.4267946779727936, "learning_rate": 3.300122972442773e-05, "loss": 2.3377, "step": 809 }, { "epoch": 0.7372852429172829, "grad_norm": 0.41019585728645325, "learning_rate": 3.278734618379402e-05, "loss": 2.1903, "step": 810 }, { "epoch": 0.7381954716122425, "grad_norm": 0.42949211597442627, "learning_rate": 3.257402200743821e-05, "loss": 2.3309, "step": 811 }, { "epoch": 0.7391057003072021, "grad_norm": 0.4410031735897064, "learning_rate": 3.2361258970709397e-05, "loss": 2.3924, "step": 812 }, { "epoch": 0.7400159290021618, "grad_norm": 0.4223659336566925, "learning_rate": 3.21490588442868e-05, "loss": 2.2614, "step": 813 }, { "epoch": 0.7409261576971214, "grad_norm": 0.43348926305770874, "learning_rate": 3.19374233941647e-05, "loss": 2.4255, "step": 814 }, { "epoch": 0.741836386392081, "grad_norm": 0.42184650897979736, "learning_rate": 3.172635438163816e-05, "loss": 2.2794, "step": 815 }, { "epoch": 0.7427466150870407, "grad_norm": 0.4127393066883087, "learning_rate": 3.1515853563288076e-05, "loss": 2.1242, "step": 816 }, { "epoch": 0.7436568437820003, "grad_norm": 0.40478646755218506, "learning_rate": 3.130592269096671e-05, "loss": 2.035, "step": 817 }, { "epoch": 0.7445670724769599, "grad_norm": 0.41883495450019836, "learning_rate": 3.1096563511783014e-05, "loss": 2.1427, "step": 818 }, { "epoch": 0.7454773011719195, "grad_norm": 0.39868757128715515, "learning_rate": 3.08877777680882e-05, "loss": 2.0169, "step": 819 }, { "epoch": 0.7463875298668791, "grad_norm": 0.392415851354599, "learning_rate": 3.0679567197461134e-05, "loss": 2.0969, "step": 820 }, { "epoch": 0.7472977585618387, "grad_norm": 0.4181436598300934, "learning_rate": 3.047193353269382e-05, "loss": 2.1766, "step": 821 }, { "epoch": 0.7482079872567983, "grad_norm": 0.41824692487716675, "learning_rate": 3.0264878501777306e-05, "loss": 2.0897, "step": 822 }, { "epoch": 0.7491182159517579, "grad_norm": 0.442640095949173, "learning_rate": 3.005840382788685e-05, "loss": 2.0851, "step": 823 }, { "epoch": 0.7500284446467175, "grad_norm": 0.43662169575691223, "learning_rate": 2.9852511229367865e-05, "loss": 2.1546, "step": 824 }, { "epoch": 0.7509386733416771, "grad_norm": 0.44712746143341064, "learning_rate": 2.9647202419721687e-05, "loss": 2.2304, "step": 825 }, { "epoch": 0.7509386733416771, "eval_loss": 2.2184386253356934, "eval_runtime": 205.4094, "eval_samples_per_second": 9.011, "eval_steps_per_second": 4.508, "step": 825 }, { "epoch": 0.7518489020366367, "grad_norm": 0.4428364336490631, "learning_rate": 2.944247910759097e-05, "loss": 2.1631, "step": 826 }, { "epoch": 0.7527591307315963, "grad_norm": 0.43727412819862366, "learning_rate": 2.9238342996745817e-05, "loss": 2.1495, "step": 827 }, { "epoch": 0.7536693594265559, "grad_norm": 0.45698437094688416, "learning_rate": 2.9034795786069436e-05, "loss": 2.1497, "step": 828 }, { "epoch": 0.7545795881215155, "grad_norm": 0.423408567905426, "learning_rate": 2.8831839169543996e-05, "loss": 1.9607, "step": 829 }, { "epoch": 0.7554898168164751, "grad_norm": 0.4666843116283417, "learning_rate": 2.862947483623659e-05, "loss": 2.1271, "step": 830 }, { "epoch": 0.7564000455114347, "grad_norm": 0.46026623249053955, "learning_rate": 2.8427704470285144e-05, "loss": 2.1943, "step": 831 }, { "epoch": 0.7573102742063944, "grad_norm": 0.459824800491333, "learning_rate": 2.8226529750884402e-05, "loss": 2.0793, "step": 832 }, { "epoch": 0.758220502901354, "grad_norm": 0.4680033326148987, "learning_rate": 2.8025952352271958e-05, "loss": 2.1652, "step": 833 }, { "epoch": 0.7591307315963136, "grad_norm": 0.47054019570350647, "learning_rate": 2.7825973943714335e-05, "loss": 2.1526, "step": 834 }, { "epoch": 0.7600409602912732, "grad_norm": 0.465668648481369, "learning_rate": 2.7626596189492983e-05, "loss": 2.0476, "step": 835 }, { "epoch": 0.7609511889862328, "grad_norm": 0.47716715931892395, "learning_rate": 2.7427820748890685e-05, "loss": 2.1511, "step": 836 }, { "epoch": 0.7618614176811924, "grad_norm": 0.468532532453537, "learning_rate": 2.7229649276177503e-05, "loss": 2.1065, "step": 837 }, { "epoch": 0.762771646376152, "grad_norm": 0.4883919358253479, "learning_rate": 2.7032083420597e-05, "loss": 2.2111, "step": 838 }, { "epoch": 0.7636818750711116, "grad_norm": 0.5037781000137329, "learning_rate": 2.683512482635281e-05, "loss": 2.1824, "step": 839 }, { "epoch": 0.7645921037660712, "grad_norm": 0.5097115635871887, "learning_rate": 2.6638775132594553e-05, "loss": 2.2818, "step": 840 }, { "epoch": 0.7655023324610308, "grad_norm": 0.5003491640090942, "learning_rate": 2.6443035973404496e-05, "loss": 2.112, "step": 841 }, { "epoch": 0.7664125611559904, "grad_norm": 0.5198303461074829, "learning_rate": 2.624790897778391e-05, "loss": 2.1864, "step": 842 }, { "epoch": 0.76732278985095, "grad_norm": 0.5522372722625732, "learning_rate": 2.605339576963929e-05, "loss": 2.3857, "step": 843 }, { "epoch": 0.7682330185459096, "grad_norm": 0.5393935441970825, "learning_rate": 2.585949796776912e-05, "loss": 2.2549, "step": 844 }, { "epoch": 0.7691432472408692, "grad_norm": 0.573794424533844, "learning_rate": 2.5666217185850262e-05, "loss": 2.3236, "step": 845 }, { "epoch": 0.7700534759358288, "grad_norm": 0.5689824819564819, "learning_rate": 2.5473555032424533e-05, "loss": 2.1463, "step": 846 }, { "epoch": 0.7709637046307884, "grad_norm": 0.6545232534408569, "learning_rate": 2.528151311088537e-05, "loss": 2.3964, "step": 847 }, { "epoch": 0.771873933325748, "grad_norm": 0.7613667845726013, "learning_rate": 2.50900930194644e-05, "loss": 2.7365, "step": 848 }, { "epoch": 0.7727841620207077, "grad_norm": 0.8592699766159058, "learning_rate": 2.4899296351218227e-05, "loss": 2.3281, "step": 849 }, { "epoch": 0.7736943907156673, "grad_norm": 1.6767549514770508, "learning_rate": 2.4709124694015116e-05, "loss": 2.4329, "step": 850 }, { "epoch": 0.774604619410627, "grad_norm": 0.42721498012542725, "learning_rate": 2.451957963052185e-05, "loss": 2.4287, "step": 851 }, { "epoch": 0.7755148481055866, "grad_norm": 0.4105132818222046, "learning_rate": 2.433066273819037e-05, "loss": 2.3069, "step": 852 }, { "epoch": 0.7764250768005462, "grad_norm": 0.42730897665023804, "learning_rate": 2.4142375589244957e-05, "loss": 2.3786, "step": 853 }, { "epoch": 0.7773353054955058, "grad_norm": 0.41130363941192627, "learning_rate": 2.3954719750668907e-05, "loss": 2.1378, "step": 854 }, { "epoch": 0.7782455341904654, "grad_norm": 0.41424882411956787, "learning_rate": 2.3767696784191463e-05, "loss": 2.2526, "step": 855 }, { "epoch": 0.779155762885425, "grad_norm": 0.42201122641563416, "learning_rate": 2.3581308246275103e-05, "loss": 2.3379, "step": 856 }, { "epoch": 0.7800659915803846, "grad_norm": 0.4150107204914093, "learning_rate": 2.339555568810221e-05, "loss": 2.1722, "step": 857 }, { "epoch": 0.7809762202753442, "grad_norm": 0.432271271944046, "learning_rate": 2.321044065556246e-05, "loss": 2.4875, "step": 858 }, { "epoch": 0.7818864489703038, "grad_norm": 0.4255993068218231, "learning_rate": 2.302596468923981e-05, "loss": 2.3043, "step": 859 }, { "epoch": 0.7827966776652634, "grad_norm": 0.4193149507045746, "learning_rate": 2.284212932439972e-05, "loss": 2.3238, "step": 860 }, { "epoch": 0.783706906360223, "grad_norm": 0.3979727625846863, "learning_rate": 2.265893609097637e-05, "loss": 2.0908, "step": 861 }, { "epoch": 0.7846171350551826, "grad_norm": 0.42481502890586853, "learning_rate": 2.247638651355991e-05, "loss": 2.3404, "step": 862 }, { "epoch": 0.7855273637501422, "grad_norm": 0.41932496428489685, "learning_rate": 2.229448211138382e-05, "loss": 2.3529, "step": 863 }, { "epoch": 0.7864375924451018, "grad_norm": 0.4188045263290405, "learning_rate": 2.211322439831218e-05, "loss": 2.1973, "step": 864 }, { "epoch": 0.7873478211400614, "grad_norm": 0.41575223207473755, "learning_rate": 2.1932614882827197e-05, "loss": 2.2664, "step": 865 }, { "epoch": 0.788258049835021, "grad_norm": 0.41103559732437134, "learning_rate": 2.1752655068016515e-05, "loss": 2.1176, "step": 866 }, { "epoch": 0.7891682785299806, "grad_norm": 0.3994426727294922, "learning_rate": 2.1573346451560794e-05, "loss": 2.0824, "step": 867 }, { "epoch": 0.7900785072249402, "grad_norm": 0.40480148792266846, "learning_rate": 2.139469052572127e-05, "loss": 1.9797, "step": 868 }, { "epoch": 0.7909887359198998, "grad_norm": 0.4224672317504883, "learning_rate": 2.1216688777327154e-05, "loss": 2.0783, "step": 869 }, { "epoch": 0.7918989646148595, "grad_norm": 0.4260886013507843, "learning_rate": 2.1039342687763586e-05, "loss": 2.203, "step": 870 }, { "epoch": 0.7928091933098191, "grad_norm": 0.41183462738990784, "learning_rate": 2.0862653732958915e-05, "loss": 1.9724, "step": 871 }, { "epoch": 0.7937194220047787, "grad_norm": 0.43447592854499817, "learning_rate": 2.0686623383372715e-05, "loss": 2.1632, "step": 872 }, { "epoch": 0.7946296506997383, "grad_norm": 0.4297522008419037, "learning_rate": 2.051125310398353e-05, "loss": 2.0486, "step": 873 }, { "epoch": 0.795539879394698, "grad_norm": 0.45072224736213684, "learning_rate": 2.03365443542764e-05, "loss": 2.1973, "step": 874 }, { "epoch": 0.7964501080896575, "grad_norm": 0.4462050199508667, "learning_rate": 2.016249858823106e-05, "loss": 2.0274, "step": 875 }, { "epoch": 0.7973603367846172, "grad_norm": 0.4606810212135315, "learning_rate": 1.998911725430963e-05, "loss": 2.1616, "step": 876 }, { "epoch": 0.7982705654795768, "grad_norm": 0.44487303495407104, "learning_rate": 1.981640179544466e-05, "loss": 2.323, "step": 877 }, { "epoch": 0.7991807941745364, "grad_norm": 0.45202627778053284, "learning_rate": 1.964435364902705e-05, "loss": 2.1361, "step": 878 }, { "epoch": 0.800091022869496, "grad_norm": 0.44588690996170044, "learning_rate": 1.947297424689414e-05, "loss": 2.1173, "step": 879 }, { "epoch": 0.8010012515644556, "grad_norm": 0.46819573640823364, "learning_rate": 1.93022650153178e-05, "loss": 2.0187, "step": 880 }, { "epoch": 0.8019114802594152, "grad_norm": 0.44944408535957336, "learning_rate": 1.913222737499243e-05, "loss": 2.0103, "step": 881 }, { "epoch": 0.8028217089543748, "grad_norm": 0.44194296002388, "learning_rate": 1.8962862741023423e-05, "loss": 1.9489, "step": 882 }, { "epoch": 0.8037319376493344, "grad_norm": 0.4707835614681244, "learning_rate": 1.879417252291502e-05, "loss": 2.1982, "step": 883 }, { "epoch": 0.804642166344294, "grad_norm": 0.4707585573196411, "learning_rate": 1.8626158124558858e-05, "loss": 2.1049, "step": 884 }, { "epoch": 0.8055523950392536, "grad_norm": 0.4964425265789032, "learning_rate": 1.8458820944222255e-05, "loss": 2.2127, "step": 885 }, { "epoch": 0.8064626237342132, "grad_norm": 0.4742617607116699, "learning_rate": 1.829216237453637e-05, "loss": 2.1019, "step": 886 }, { "epoch": 0.8073728524291728, "grad_norm": 0.49655184149742126, "learning_rate": 1.8126183802484865e-05, "loss": 2.2403, "step": 887 }, { "epoch": 0.8082830811241324, "grad_norm": 0.4954749643802643, "learning_rate": 1.7960886609392214e-05, "loss": 2.0321, "step": 888 }, { "epoch": 0.8091933098190921, "grad_norm": 0.4694468379020691, "learning_rate": 1.7796272170912253e-05, "loss": 1.817, "step": 889 }, { "epoch": 0.8101035385140517, "grad_norm": 0.5026715397834778, "learning_rate": 1.763234185701673e-05, "loss": 2.2038, "step": 890 }, { "epoch": 0.8110137672090113, "grad_norm": 0.5050073862075806, "learning_rate": 1.7469097031983893e-05, "loss": 2.0861, "step": 891 }, { "epoch": 0.8119239959039709, "grad_norm": 0.5078185796737671, "learning_rate": 1.730653905438714e-05, "loss": 2.0672, "step": 892 }, { "epoch": 0.8128342245989305, "grad_norm": 0.525215744972229, "learning_rate": 1.7144669277083712e-05, "loss": 2.1502, "step": 893 }, { "epoch": 0.8137444532938901, "grad_norm": 0.5429519414901733, "learning_rate": 1.6983489047203483e-05, "loss": 2.0935, "step": 894 }, { "epoch": 0.8146546819888497, "grad_norm": 0.5544317960739136, "learning_rate": 1.6822999706137567e-05, "loss": 2.0943, "step": 895 }, { "epoch": 0.8155649106838093, "grad_norm": 0.6273201107978821, "learning_rate": 1.6663202589527473e-05, "loss": 2.3608, "step": 896 }, { "epoch": 0.8164751393787689, "grad_norm": 0.7101454734802246, "learning_rate": 1.6504099027253706e-05, "loss": 2.4168, "step": 897 }, { "epoch": 0.8173853680737285, "grad_norm": 0.7550842761993408, "learning_rate": 1.634569034342476e-05, "loss": 2.5798, "step": 898 }, { "epoch": 0.8182955967686881, "grad_norm": 0.8533863425254822, "learning_rate": 1.6187977856366253e-05, "loss": 2.5575, "step": 899 }, { "epoch": 0.8192058254636477, "grad_norm": 1.34774911403656, "learning_rate": 1.6030962878609725e-05, "loss": 2.4134, "step": 900 }, { "epoch": 0.8201160541586073, "grad_norm": 0.4584032893180847, "learning_rate": 1.587464671688187e-05, "loss": 2.4781, "step": 901 }, { "epoch": 0.8210262828535669, "grad_norm": 0.43342748284339905, "learning_rate": 1.5719030672093717e-05, "loss": 2.3685, "step": 902 }, { "epoch": 0.8219365115485265, "grad_norm": 0.4225307106971741, "learning_rate": 1.5564116039329545e-05, "loss": 2.2022, "step": 903 }, { "epoch": 0.8228467402434861, "grad_norm": 0.43026039004325867, "learning_rate": 1.5409904107836358e-05, "loss": 2.2817, "step": 904 }, { "epoch": 0.8237569689384457, "grad_norm": 0.4114493131637573, "learning_rate": 1.5256396161013075e-05, "loss": 2.3298, "step": 905 }, { "epoch": 0.8246671976334053, "grad_norm": 0.42313718795776367, "learning_rate": 1.5103593476399791e-05, "loss": 2.3211, "step": 906 }, { "epoch": 0.825577426328365, "grad_norm": 0.4246841371059418, "learning_rate": 1.495149732566723e-05, "loss": 2.2385, "step": 907 }, { "epoch": 0.8264876550233247, "grad_norm": 0.4131985008716583, "learning_rate": 1.4800108974606119e-05, "loss": 2.2873, "step": 908 }, { "epoch": 0.8273978837182843, "grad_norm": 0.42265599966049194, "learning_rate": 1.4649429683116644e-05, "loss": 2.1486, "step": 909 }, { "epoch": 0.8283081124132439, "grad_norm": 0.4338424801826477, "learning_rate": 1.4499460705197998e-05, "loss": 2.2365, "step": 910 }, { "epoch": 0.8292183411082035, "grad_norm": 0.4278540015220642, "learning_rate": 1.4350203288937936e-05, "loss": 2.36, "step": 911 }, { "epoch": 0.8301285698031631, "grad_norm": 0.41379448771476746, "learning_rate": 1.4201658676502294e-05, "loss": 2.184, "step": 912 }, { "epoch": 0.8310387984981227, "grad_norm": 0.42351198196411133, "learning_rate": 1.4053828104124867e-05, "loss": 2.2505, "step": 913 }, { "epoch": 0.8319490271930823, "grad_norm": 0.40783679485321045, "learning_rate": 1.3906712802096933e-05, "loss": 2.0255, "step": 914 }, { "epoch": 0.8328592558880419, "grad_norm": 0.4174416661262512, "learning_rate": 1.3760313994757001e-05, "loss": 2.2376, "step": 915 }, { "epoch": 0.8337694845830015, "grad_norm": 0.41884645819664, "learning_rate": 1.361463290048085e-05, "loss": 2.0206, "step": 916 }, { "epoch": 0.8346797132779611, "grad_norm": 0.399498850107193, "learning_rate": 1.3469670731671046e-05, "loss": 2.063, "step": 917 }, { "epoch": 0.8355899419729207, "grad_norm": 0.40431055426597595, "learning_rate": 1.3325428694747177e-05, "loss": 2.0053, "step": 918 }, { "epoch": 0.8365001706678803, "grad_norm": 0.40479356050491333, "learning_rate": 1.3181907990135622e-05, "loss": 2.0693, "step": 919 }, { "epoch": 0.8374103993628399, "grad_norm": 0.4056653678417206, "learning_rate": 1.3039109812259598e-05, "loss": 2.0361, "step": 920 }, { "epoch": 0.8383206280577995, "grad_norm": 0.4257088005542755, "learning_rate": 1.2897035349529263e-05, "loss": 2.0589, "step": 921 }, { "epoch": 0.8392308567527591, "grad_norm": 0.43024080991744995, "learning_rate": 1.2755685784331783e-05, "loss": 2.0419, "step": 922 }, { "epoch": 0.8401410854477187, "grad_norm": 0.42889195680618286, "learning_rate": 1.2615062293021507e-05, "loss": 2.0515, "step": 923 }, { "epoch": 0.8410513141426783, "grad_norm": 0.4491952061653137, "learning_rate": 1.2475166045910159e-05, "loss": 2.2535, "step": 924 }, { "epoch": 0.8419615428376379, "grad_norm": 0.43797358870506287, "learning_rate": 1.2335998207257137e-05, "loss": 2.1338, "step": 925 }, { "epoch": 0.8428717715325975, "grad_norm": 0.4491622745990753, "learning_rate": 1.2197559935259795e-05, "loss": 2.2059, "step": 926 }, { "epoch": 0.8437820002275572, "grad_norm": 0.43628188967704773, "learning_rate": 1.20598523820438e-05, "loss": 1.8784, "step": 927 }, { "epoch": 0.8446922289225168, "grad_norm": 0.45739004015922546, "learning_rate": 1.1922876693653585e-05, "loss": 2.0433, "step": 928 }, { "epoch": 0.8456024576174764, "grad_norm": 0.44873446226119995, "learning_rate": 1.1786634010042719e-05, "loss": 1.9578, "step": 929 }, { "epoch": 0.846512686312436, "grad_norm": 0.43957433104515076, "learning_rate": 1.1651125465064516e-05, "loss": 2.0078, "step": 930 }, { "epoch": 0.8474229150073956, "grad_norm": 0.4639342129230499, "learning_rate": 1.1516352186462586e-05, "loss": 2.0714, "step": 931 }, { "epoch": 0.8483331437023552, "grad_norm": 0.44638022780418396, "learning_rate": 1.13823152958614e-05, "loss": 1.8991, "step": 932 }, { "epoch": 0.8492433723973148, "grad_norm": 0.4596819579601288, "learning_rate": 1.1249015908756998e-05, "loss": 1.9595, "step": 933 }, { "epoch": 0.8501536010922744, "grad_norm": 0.47656434774398804, "learning_rate": 1.1116455134507664e-05, "loss": 2.0788, "step": 934 }, { "epoch": 0.851063829787234, "grad_norm": 0.4645254611968994, "learning_rate": 1.098463407632474e-05, "loss": 2.0703, "step": 935 }, { "epoch": 0.8519740584821937, "grad_norm": 0.4659541845321655, "learning_rate": 1.0853553831263418e-05, "loss": 2.0804, "step": 936 }, { "epoch": 0.8528842871771533, "grad_norm": 0.4771886467933655, "learning_rate": 1.0723215490213634e-05, "loss": 2.1124, "step": 937 }, { "epoch": 0.8537945158721129, "grad_norm": 0.49211612343788147, "learning_rate": 1.0593620137890948e-05, "loss": 2.2221, "step": 938 }, { "epoch": 0.8547047445670725, "grad_norm": 0.5174618363380432, "learning_rate": 1.0464768852827545e-05, "loss": 2.1684, "step": 939 }, { "epoch": 0.8556149732620321, "grad_norm": 0.5098733305931091, "learning_rate": 1.0336662707363287e-05, "loss": 2.103, "step": 940 }, { "epoch": 0.8565252019569917, "grad_norm": 0.5197715163230896, "learning_rate": 1.0209302767636664e-05, "loss": 2.2107, "step": 941 }, { "epoch": 0.8574354306519513, "grad_norm": 0.547512412071228, "learning_rate": 1.0082690093576163e-05, "loss": 2.2448, "step": 942 }, { "epoch": 0.8583456593469109, "grad_norm": 0.5418568849563599, "learning_rate": 9.95682573889114e-06, "loss": 2.2423, "step": 943 }, { "epoch": 0.8592558880418705, "grad_norm": 0.5369839072227478, "learning_rate": 9.831710751063283e-06, "loss": 1.9788, "step": 944 }, { "epoch": 0.8601661167368301, "grad_norm": 0.573844313621521, "learning_rate": 9.707346171337894e-06, "loss": 2.2906, "step": 945 }, { "epoch": 0.8610763454317898, "grad_norm": 0.6142247915267944, "learning_rate": 9.583733034714981e-06, "loss": 2.3744, "step": 946 }, { "epoch": 0.8619865741267494, "grad_norm": 0.6646602153778076, "learning_rate": 9.460872369940955e-06, "loss": 2.4641, "step": 947 }, { "epoch": 0.862896802821709, "grad_norm": 0.727783739566803, "learning_rate": 9.338765199499854e-06, "loss": 2.4612, "step": 948 }, { "epoch": 0.8638070315166686, "grad_norm": 0.851578950881958, "learning_rate": 9.217412539604942e-06, "loss": 2.6441, "step": 949 }, { "epoch": 0.8647172602116282, "grad_norm": 1.461125373840332, "learning_rate": 9.096815400190172e-06, "loss": 2.4248, "step": 950 }, { "epoch": 0.8656274889065878, "grad_norm": 0.4408392608165741, "learning_rate": 8.97697478490188e-06, "loss": 2.5431, "step": 951 }, { "epoch": 0.8665377176015474, "grad_norm": 0.4102359414100647, "learning_rate": 8.857891691090337e-06, "loss": 2.3448, "step": 952 }, { "epoch": 0.867447946296507, "grad_norm": 0.4374777674674988, "learning_rate": 8.739567109801494e-06, "loss": 2.3647, "step": 953 }, { "epoch": 0.8683581749914666, "grad_norm": 0.40009114146232605, "learning_rate": 8.62200202576875e-06, "loss": 2.2401, "step": 954 }, { "epoch": 0.8692684036864262, "grad_norm": 0.42013484239578247, "learning_rate": 8.505197417404687e-06, "loss": 2.1772, "step": 955 }, { "epoch": 0.8701786323813858, "grad_norm": 0.43588119745254517, "learning_rate": 8.38915425679304e-06, "loss": 2.4605, "step": 956 }, { "epoch": 0.8710888610763454, "grad_norm": 0.4295041561126709, "learning_rate": 8.273873509680519e-06, "loss": 2.4302, "step": 957 }, { "epoch": 0.871999089771305, "grad_norm": 0.4430733621120453, "learning_rate": 8.15935613546872e-06, "loss": 2.3014, "step": 958 }, { "epoch": 0.8729093184662646, "grad_norm": 0.4275224804878235, "learning_rate": 8.045603087206388e-06, "loss": 2.251, "step": 959 }, { "epoch": 0.8738195471612242, "grad_norm": 0.4218734800815582, "learning_rate": 7.932615311581126e-06, "loss": 2.2841, "step": 960 }, { "epoch": 0.8747297758561838, "grad_norm": 0.4275785982608795, "learning_rate": 7.820393748911791e-06, "loss": 2.2751, "step": 961 }, { "epoch": 0.8756400045511434, "grad_norm": 0.40714067220687866, "learning_rate": 7.708939333140642e-06, "loss": 2.2023, "step": 962 }, { "epoch": 0.876550233246103, "grad_norm": 0.4284750521183014, "learning_rate": 7.598252991825372e-06, "loss": 2.1991, "step": 963 }, { "epoch": 0.8774604619410626, "grad_norm": 0.40348193049430847, "learning_rate": 7.488335646131628e-06, "loss": 2.1214, "step": 964 }, { "epoch": 0.8783706906360224, "grad_norm": 0.4067203998565674, "learning_rate": 7.3791882108251945e-06, "loss": 2.0977, "step": 965 }, { "epoch": 0.879280919330982, "grad_norm": 0.40969371795654297, "learning_rate": 7.270811594264437e-06, "loss": 2.1751, "step": 966 }, { "epoch": 0.8801911480259416, "grad_norm": 0.39071908593177795, "learning_rate": 7.163206698392744e-06, "loss": 2.0464, "step": 967 }, { "epoch": 0.8811013767209012, "grad_norm": 0.4038424789905548, "learning_rate": 7.056374418730971e-06, "loss": 2.1137, "step": 968 }, { "epoch": 0.8820116054158608, "grad_norm": 0.38801443576812744, "learning_rate": 6.950315644370075e-06, "loss": 1.883, "step": 969 }, { "epoch": 0.8829218341108204, "grad_norm": 0.3895006477832794, "learning_rate": 6.845031257963619e-06, "loss": 2.0169, "step": 970 }, { "epoch": 0.88383206280578, "grad_norm": 0.413171648979187, "learning_rate": 6.740522135720517e-06, "loss": 2.2054, "step": 971 }, { "epoch": 0.8847422915007396, "grad_norm": 0.4204188585281372, "learning_rate": 6.636789147397637e-06, "loss": 2.1765, "step": 972 }, { "epoch": 0.8856525201956992, "grad_norm": 0.4209098517894745, "learning_rate": 6.533833156292679e-06, "loss": 1.9617, "step": 973 }, { "epoch": 0.8865627488906588, "grad_norm": 0.4256611168384552, "learning_rate": 6.431655019236948e-06, "loss": 2.108, "step": 974 }, { "epoch": 0.8874729775856184, "grad_norm": 0.43669816851615906, "learning_rate": 6.3302555865880965e-06, "loss": 2.0991, "step": 975 }, { "epoch": 0.888383206280578, "grad_norm": 0.44833648204803467, "learning_rate": 6.229635702223324e-06, "loss": 2.2335, "step": 976 }, { "epoch": 0.8892934349755376, "grad_norm": 0.45070621371269226, "learning_rate": 6.129796203532057e-06, "loss": 2.2487, "step": 977 }, { "epoch": 0.8902036636704972, "grad_norm": 0.4609052538871765, "learning_rate": 6.030737921409169e-06, "loss": 2.1147, "step": 978 }, { "epoch": 0.8911138923654568, "grad_norm": 0.4470416307449341, "learning_rate": 5.932461680248014e-06, "loss": 2.0615, "step": 979 }, { "epoch": 0.8920241210604164, "grad_norm": 0.42171233892440796, "learning_rate": 5.834968297933541e-06, "loss": 2.0669, "step": 980 }, { "epoch": 0.892934349755376, "grad_norm": 0.4385877251625061, "learning_rate": 5.738258585835532e-06, "loss": 1.9846, "step": 981 }, { "epoch": 0.8938445784503356, "grad_norm": 0.4574371576309204, "learning_rate": 5.6423333488018095e-06, "loss": 2.114, "step": 982 }, { "epoch": 0.8947548071452952, "grad_norm": 0.46896499395370483, "learning_rate": 5.547193385151561e-06, "loss": 2.0444, "step": 983 }, { "epoch": 0.8956650358402549, "grad_norm": 0.45737412571907043, "learning_rate": 5.45283948666866e-06, "loss": 2.0976, "step": 984 }, { "epoch": 0.8965752645352145, "grad_norm": 0.47739726305007935, "learning_rate": 5.359272438595153e-06, "loss": 2.1393, "step": 985 }, { "epoch": 0.8974854932301741, "grad_norm": 0.47124338150024414, "learning_rate": 5.266493019624663e-06, "loss": 2.0509, "step": 986 }, { "epoch": 0.8983957219251337, "grad_norm": 0.4660322070121765, "learning_rate": 5.1745020018958866e-06, "loss": 1.9704, "step": 987 }, { "epoch": 0.8993059506200933, "grad_norm": 0.48330241441726685, "learning_rate": 5.083300150986259e-06, "loss": 2.021, "step": 988 }, { "epoch": 0.9002161793150529, "grad_norm": 0.51470547914505, "learning_rate": 4.992888225905468e-06, "loss": 2.1097, "step": 989 }, { "epoch": 0.9011264080100125, "grad_norm": 0.516373336315155, "learning_rate": 4.903266979089249e-06, "loss": 2.1694, "step": 990 }, { "epoch": 0.9020366367049721, "grad_norm": 0.5256400108337402, "learning_rate": 4.8144371563930476e-06, "loss": 2.2843, "step": 991 }, { "epoch": 0.9029468653999317, "grad_norm": 0.5559744238853455, "learning_rate": 4.726399497085832e-06, "loss": 2.2733, "step": 992 }, { "epoch": 0.9038570940948913, "grad_norm": 0.5462202429771423, "learning_rate": 4.6391547338439536e-06, "loss": 2.1758, "step": 993 }, { "epoch": 0.904767322789851, "grad_norm": 0.5769087672233582, "learning_rate": 4.552703592745033e-06, "loss": 2.2552, "step": 994 }, { "epoch": 0.9056775514848106, "grad_norm": 0.5632253289222717, "learning_rate": 4.467046793261931e-06, "loss": 2.2402, "step": 995 }, { "epoch": 0.9065877801797702, "grad_norm": 0.610163688659668, "learning_rate": 4.3821850482567595e-06, "loss": 2.4484, "step": 996 }, { "epoch": 0.9074980088747298, "grad_norm": 0.6236492395401001, "learning_rate": 4.298119063974914e-06, "loss": 2.1914, "step": 997 }, { "epoch": 0.9084082375696894, "grad_norm": 0.7362584471702576, "learning_rate": 4.214849540039267e-06, "loss": 2.5582, "step": 998 }, { "epoch": 0.909318466264649, "grad_norm": 0.8780522346496582, "learning_rate": 4.132377169444279e-06, "loss": 2.5269, "step": 999 }, { "epoch": 0.9102286949596086, "grad_norm": 1.4850975275039673, "learning_rate": 4.050702638550275e-06, "loss": 2.3857, "step": 1000 }, { "epoch": 0.9111389236545682, "grad_norm": 0.4350475072860718, "learning_rate": 3.969826627077655e-06, "loss": 2.5653, "step": 1001 }, { "epoch": 0.9120491523495278, "grad_norm": 0.44277626276016235, "learning_rate": 3.889749808101395e-06, "loss": 2.3969, "step": 1002 }, { "epoch": 0.9129593810444874, "grad_norm": 0.44005268812179565, "learning_rate": 3.810472848045266e-06, "loss": 2.6065, "step": 1003 }, { "epoch": 0.9138696097394471, "grad_norm": 0.41925248503685, "learning_rate": 3.7319964066763858e-06, "loss": 2.3878, "step": 1004 }, { "epoch": 0.9147798384344067, "grad_norm": 0.4320535957813263, "learning_rate": 3.6543211370997587e-06, "loss": 2.3829, "step": 1005 }, { "epoch": 0.9156900671293663, "grad_norm": 0.43817150592803955, "learning_rate": 3.5774476857527107e-06, "loss": 2.3854, "step": 1006 }, { "epoch": 0.9166002958243259, "grad_norm": 0.42210131883621216, "learning_rate": 3.5013766923996604e-06, "loss": 2.2874, "step": 1007 }, { "epoch": 0.9175105245192855, "grad_norm": 0.41610825061798096, "learning_rate": 3.426108790126681e-06, "loss": 2.3301, "step": 1008 }, { "epoch": 0.9184207532142451, "grad_norm": 0.42343541979789734, "learning_rate": 3.3516446053363015e-06, "loss": 2.2083, "step": 1009 }, { "epoch": 0.9193309819092047, "grad_norm": 0.4323045015335083, "learning_rate": 3.2779847577422697e-06, "loss": 2.2401, "step": 1010 }, { "epoch": 0.9202412106041643, "grad_norm": 0.4198078513145447, "learning_rate": 3.2051298603643753e-06, "loss": 2.0988, "step": 1011 }, { "epoch": 0.9211514392991239, "grad_norm": 0.4277539551258087, "learning_rate": 3.133080519523368e-06, "loss": 2.3482, "step": 1012 }, { "epoch": 0.9220616679940835, "grad_norm": 0.42749837040901184, "learning_rate": 3.0618373348359264e-06, "loss": 2.3242, "step": 1013 }, { "epoch": 0.9229718966890431, "grad_norm": 0.4157456159591675, "learning_rate": 2.991400899209651e-06, "loss": 2.11, "step": 1014 }, { "epoch": 0.9238821253840027, "grad_norm": 0.41514283418655396, "learning_rate": 2.921771798838069e-06, "loss": 2.0979, "step": 1015 }, { "epoch": 0.9247923540789623, "grad_norm": 0.4146190285682678, "learning_rate": 2.852950613195915e-06, "loss": 2.1057, "step": 1016 }, { "epoch": 0.9257025827739219, "grad_norm": 0.4031788110733032, "learning_rate": 2.784937915034169e-06, "loss": 2.1094, "step": 1017 }, { "epoch": 0.9266128114688815, "grad_norm": 0.4135347604751587, "learning_rate": 2.717734270375272e-06, "loss": 2.2154, "step": 1018 }, { "epoch": 0.9275230401638411, "grad_norm": 0.40153443813323975, "learning_rate": 2.6513402385085704e-06, "loss": 2.0342, "step": 1019 }, { "epoch": 0.9284332688588007, "grad_norm": 0.4011882543563843, "learning_rate": 2.585756371985493e-06, "loss": 1.9751, "step": 1020 }, { "epoch": 0.9293434975537603, "grad_norm": 0.4032374322414398, "learning_rate": 2.520983216615047e-06, "loss": 2.0868, "step": 1021 }, { "epoch": 0.9302537262487199, "grad_norm": 0.4052782952785492, "learning_rate": 2.4570213114592954e-06, "loss": 2.0716, "step": 1022 }, { "epoch": 0.9311639549436797, "grad_norm": 0.4261015057563782, "learning_rate": 2.393871188828767e-06, "loss": 2.1153, "step": 1023 }, { "epoch": 0.9320741836386393, "grad_norm": 0.4141393303871155, "learning_rate": 2.3315333742780942e-06, "loss": 2.0839, "step": 1024 }, { "epoch": 0.9329844123335989, "grad_norm": 0.41185298562049866, "learning_rate": 2.270008386601685e-06, "loss": 2.0686, "step": 1025 }, { "epoch": 0.9338946410285585, "grad_norm": 0.41932380199432373, "learning_rate": 2.2092967378292915e-06, "loss": 2.0688, "step": 1026 }, { "epoch": 0.9348048697235181, "grad_norm": 0.430480033159256, "learning_rate": 2.1493989332218468e-06, "loss": 2.1202, "step": 1027 }, { "epoch": 0.9357150984184777, "grad_norm": 0.4182969629764557, "learning_rate": 2.0903154712672237e-06, "loss": 1.8457, "step": 1028 }, { "epoch": 0.9366253271134373, "grad_norm": 0.44270989298820496, "learning_rate": 2.032046843676061e-06, "loss": 2.2296, "step": 1029 }, { "epoch": 0.9375355558083969, "grad_norm": 0.4312398433685303, "learning_rate": 1.974593535377722e-06, "loss": 1.9802, "step": 1030 }, { "epoch": 0.9384457845033565, "grad_norm": 0.4557861387729645, "learning_rate": 1.917956024516243e-06, "loss": 1.9306, "step": 1031 }, { "epoch": 0.9393560131983161, "grad_norm": 0.4993920624256134, "learning_rate": 1.8621347824462787e-06, "loss": 2.275, "step": 1032 }, { "epoch": 0.9402662418932757, "grad_norm": 0.4622466266155243, "learning_rate": 1.8071302737293295e-06, "loss": 2.1966, "step": 1033 }, { "epoch": 0.9411764705882353, "grad_norm": 0.4710349142551422, "learning_rate": 1.752942956129744e-06, "loss": 2.1353, "step": 1034 }, { "epoch": 0.9420866992831949, "grad_norm": 0.4602985680103302, "learning_rate": 1.6995732806109554e-06, "loss": 2.0855, "step": 1035 }, { "epoch": 0.9429969279781545, "grad_norm": 0.4736422002315521, "learning_rate": 1.6470216913317626e-06, "loss": 2.081, "step": 1036 }, { "epoch": 0.9439071566731141, "grad_norm": 0.4875909984111786, "learning_rate": 1.5952886256425547e-06, "loss": 2.1607, "step": 1037 }, { "epoch": 0.9448173853680737, "grad_norm": 0.5093516111373901, "learning_rate": 1.5443745140817366e-06, "loss": 2.2025, "step": 1038 }, { "epoch": 0.9457276140630333, "grad_norm": 0.5034651160240173, "learning_rate": 1.4942797803721543e-06, "loss": 1.9985, "step": 1039 }, { "epoch": 0.9466378427579929, "grad_norm": 0.5074111819267273, "learning_rate": 1.4450048414174854e-06, "loss": 2.1175, "step": 1040 }, { "epoch": 0.9475480714529525, "grad_norm": 0.5183742046356201, "learning_rate": 1.3965501072988663e-06, "loss": 2.0718, "step": 1041 }, { "epoch": 0.9484583001479122, "grad_norm": 0.5284718871116638, "learning_rate": 1.348915981271437e-06, "loss": 2.1586, "step": 1042 }, { "epoch": 0.9493685288428718, "grad_norm": 0.545464813709259, "learning_rate": 1.3021028597609675e-06, "loss": 2.2445, "step": 1043 }, { "epoch": 0.9502787575378314, "grad_norm": 0.5713001489639282, "learning_rate": 1.2561111323605712e-06, "loss": 2.1888, "step": 1044 }, { "epoch": 0.951188986232791, "grad_norm": 0.5774447321891785, "learning_rate": 1.2109411818274852e-06, "loss": 2.2029, "step": 1045 }, { "epoch": 0.9520992149277506, "grad_norm": 0.6209971308708191, "learning_rate": 1.1665933840798838e-06, "loss": 2.1735, "step": 1046 }, { "epoch": 0.9530094436227102, "grad_norm": 0.6675162315368652, "learning_rate": 1.1230681081936923e-06, "loss": 2.4231, "step": 1047 }, { "epoch": 0.9539196723176698, "grad_norm": 0.7409051060676575, "learning_rate": 1.0803657163995895e-06, "loss": 2.45, "step": 1048 }, { "epoch": 0.9548299010126294, "grad_norm": 0.8856377601623535, "learning_rate": 1.0384865640799435e-06, "loss": 2.4769, "step": 1049 }, { "epoch": 0.955740129707589, "grad_norm": 1.4897712469100952, "learning_rate": 9.974309997658915e-07, "loss": 2.6822, "step": 1050 }, { "epoch": 0.9566503584025486, "grad_norm": 0.4162799119949341, "learning_rate": 9.57199365134387e-07, "loss": 2.493, "step": 1051 }, { "epoch": 0.9575605870975082, "grad_norm": 0.4336461126804352, "learning_rate": 9.177919950054237e-07, "loss": 2.4071, "step": 1052 }, { "epoch": 0.9584708157924678, "grad_norm": 0.42869681119918823, "learning_rate": 8.792092173391831e-07, "loss": 2.3585, "step": 1053 }, { "epoch": 0.9593810444874274, "grad_norm": 0.4060511887073517, "learning_rate": 8.41451353233369e-07, "loss": 2.3244, "step": 1054 }, { "epoch": 0.960291273182387, "grad_norm": 0.42351028323173523, "learning_rate": 8.04518716920466e-07, "loss": 2.4194, "step": 1055 }, { "epoch": 0.9612015018773467, "grad_norm": 0.42555585503578186, "learning_rate": 7.684116157651966e-07, "loss": 2.368, "step": 1056 }, { "epoch": 0.9621117305723063, "grad_norm": 0.4169003665447235, "learning_rate": 7.331303502618903e-07, "loss": 2.3947, "step": 1057 }, { "epoch": 0.9630219592672659, "grad_norm": 0.43154704570770264, "learning_rate": 6.986752140320518e-07, "loss": 2.2809, "step": 1058 }, { "epoch": 0.9639321879622255, "grad_norm": 0.40992870926856995, "learning_rate": 6.650464938218637e-07, "loss": 2.2827, "step": 1059 }, { "epoch": 0.9648424166571851, "grad_norm": 0.4168105125427246, "learning_rate": 6.322444694998319e-07, "loss": 2.3334, "step": 1060 }, { "epoch": 0.9657526453521448, "grad_norm": 0.43256238102912903, "learning_rate": 6.002694140544329e-07, "loss": 2.2669, "step": 1061 }, { "epoch": 0.9666628740471044, "grad_norm": 0.42290446162223816, "learning_rate": 5.691215935918815e-07, "loss": 2.1027, "step": 1062 }, { "epoch": 0.967573102742064, "grad_norm": 0.4109112024307251, "learning_rate": 5.388012673338661e-07, "loss": 2.15, "step": 1063 }, { "epoch": 0.9684833314370236, "grad_norm": 0.4292824864387512, "learning_rate": 5.093086876154174e-07, "loss": 2.2394, "step": 1064 }, { "epoch": 0.9693935601319832, "grad_norm": 0.4136911928653717, "learning_rate": 4.80644099882821e-07, "loss": 2.1904, "step": 1065 }, { "epoch": 0.9703037888269428, "grad_norm": 0.4183078110218048, "learning_rate": 4.5280774269154115e-07, "loss": 2.1499, "step": 1066 }, { "epoch": 0.9712140175219024, "grad_norm": 0.41571420431137085, "learning_rate": 4.2579984770426686e-07, "loss": 2.0975, "step": 1067 }, { "epoch": 0.972124246216862, "grad_norm": 0.39653652906417847, "learning_rate": 3.99620639688969e-07, "loss": 1.9686, "step": 1068 }, { "epoch": 0.9730344749118216, "grad_norm": 0.4133754372596741, "learning_rate": 3.742703365170241e-07, "loss": 2.0801, "step": 1069 }, { "epoch": 0.9739447036067812, "grad_norm": 0.41434839367866516, "learning_rate": 3.497491491614158e-07, "loss": 1.9311, "step": 1070 }, { "epoch": 0.9748549323017408, "grad_norm": 0.41751164197921753, "learning_rate": 3.260572816949692e-07, "loss": 2.1841, "step": 1071 }, { "epoch": 0.9757651609967004, "grad_norm": 0.41821038722991943, "learning_rate": 3.0319493128866396e-07, "loss": 2.0719, "step": 1072 }, { "epoch": 0.97667538969166, "grad_norm": 0.4422387480735779, "learning_rate": 2.8116228820997957e-07, "loss": 2.2655, "step": 1073 }, { "epoch": 0.9775856183866196, "grad_norm": 0.4252420961856842, "learning_rate": 2.5995953582130804e-07, "loss": 2.098, "step": 1074 }, { "epoch": 0.9784958470815792, "grad_norm": 0.4352913498878479, "learning_rate": 2.395868505784438e-07, "loss": 2.1038, "step": 1075 }, { "epoch": 0.9794060757765388, "grad_norm": 0.43940603733062744, "learning_rate": 2.2004440202911814e-07, "loss": 2.1152, "step": 1076 }, { "epoch": 0.9803163044714984, "grad_norm": 0.43396398425102234, "learning_rate": 2.0133235281156736e-07, "loss": 2.0175, "step": 1077 }, { "epoch": 0.981226533166458, "grad_norm": 0.43348294496536255, "learning_rate": 1.83450858653178e-07, "loss": 2.0593, "step": 1078 }, { "epoch": 0.9821367618614176, "grad_norm": 0.43314129114151, "learning_rate": 1.664000683692324e-07, "loss": 1.9414, "step": 1079 }, { "epoch": 0.9830469905563773, "grad_norm": 0.47113651037216187, "learning_rate": 1.5018012386162072e-07, "loss": 2.3204, "step": 1080 }, { "epoch": 0.983957219251337, "grad_norm": 0.43835383653640747, "learning_rate": 1.3479116011769767e-07, "loss": 2.0271, "step": 1081 }, { "epoch": 0.9848674479462965, "grad_norm": 0.42517396807670593, "learning_rate": 1.2023330520911646e-07, "loss": 1.8366, "step": 1082 }, { "epoch": 0.9857776766412562, "grad_norm": 0.4593140482902527, "learning_rate": 1.0650668029079658e-07, "loss": 2.0962, "step": 1083 }, { "epoch": 0.9866879053362158, "grad_norm": 0.4892318546772003, "learning_rate": 9.361139959993549e-08, "loss": 2.144, "step": 1084 }, { "epoch": 0.9875981340311754, "grad_norm": 0.48190560936927795, "learning_rate": 8.154757045497619e-08, "loss": 2.1081, "step": 1085 }, { "epoch": 0.988508362726135, "grad_norm": 0.4542143940925598, "learning_rate": 7.0315293254819e-08, "loss": 1.9731, "step": 1086 }, { "epoch": 0.9894185914210946, "grad_norm": 0.4879550337791443, "learning_rate": 5.991466147791113e-08, "loss": 2.0791, "step": 1087 }, { "epoch": 0.9903288201160542, "grad_norm": 0.5068708062171936, "learning_rate": 5.0345761681491746e-08, "loss": 2.181, "step": 1088 }, { "epoch": 0.9912390488110138, "grad_norm": 0.5021325945854187, "learning_rate": 4.1608673500859175e-08, "loss": 2.1546, "step": 1089 }, { "epoch": 0.9921492775059734, "grad_norm": 0.5045585036277771, "learning_rate": 3.370346964876036e-08, "loss": 2.0335, "step": 1090 }, { "epoch": 0.993059506200933, "grad_norm": 0.5066413879394531, "learning_rate": 2.6630215914702495e-08, "loss": 2.0639, "step": 1091 }, { "epoch": 0.9939697348958926, "grad_norm": 0.5761440992355347, "learning_rate": 2.038897116447558e-08, "loss": 2.3702, "step": 1092 }, { "epoch": 0.9948799635908522, "grad_norm": 0.5774243474006653, "learning_rate": 1.4979787339619578e-08, "loss": 2.1865, "step": 1093 }, { "epoch": 0.9957901922858118, "grad_norm": 0.6156805157661438, "learning_rate": 1.0402709457035808e-08, "loss": 2.3407, "step": 1094 }, { "epoch": 0.9967004209807714, "grad_norm": 0.6527782082557678, "learning_rate": 6.657775608553962e-09, "loss": 2.3115, "step": 1095 }, { "epoch": 0.997610649675731, "grad_norm": 0.7234971523284912, "learning_rate": 3.745016960665648e-09, "loss": 2.5377, "step": 1096 }, { "epoch": 0.9985208783706906, "grad_norm": 0.7837737798690796, "learning_rate": 1.6644577542357375e-09, "loss": 2.4761, "step": 1097 }, { "epoch": 0.9994311070656502, "grad_norm": 1.1138982772827148, "learning_rate": 4.1611530431362453e-10, "loss": 2.4967, "step": 1098 }, { "epoch": 1.0006826715212198, "grad_norm": 3.5543136596679688, "learning_rate": 0.0, "loss": 4.7799, "step": 1099 } ], "logging_steps": 1, "max_steps": 1099, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 275, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.0148357964895355e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }