{ "best_metric": 0.4468539357185364, "best_model_checkpoint": "/scicore/home/graber0001/kakooe0000/tts_rk_main/storage_nobackup/finetuned_models/2024_06_16__speecht5_finetuned_on_swissdial__hp_i_8/checkpoint-10000", "epoch": 843.8818565400844, "eval_steps": 10000, "global_step": 50000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.42, "grad_norm": 2.223052978515625, "learning_rate": 4.8e-05, "loss": 0.9472, "step": 25 }, { "epoch": 0.84, "grad_norm": 3.6971209049224854, "learning_rate": 9.8e-05, "loss": 0.6658, "step": 50 }, { "epoch": 1.27, "grad_norm": 1.570608377456665, "learning_rate": 9.9975987993997e-05, "loss": 0.585, "step": 75 }, { "epoch": 1.69, "grad_norm": 5.4782023429870605, "learning_rate": 9.995097548774388e-05, "loss": 0.5442, "step": 100 }, { "epoch": 2.11, "grad_norm": 2.785654306411743, "learning_rate": 9.992596298149075e-05, "loss": 0.5399, "step": 125 }, { "epoch": 2.53, "grad_norm": 1.7270556688308716, "learning_rate": 9.990095047523762e-05, "loss": 0.5322, "step": 150 }, { "epoch": 2.95, "grad_norm": 2.075948476791382, "learning_rate": 9.987593796898451e-05, "loss": 0.5179, "step": 175 }, { "epoch": 3.38, "grad_norm": 1.785163164138794, "learning_rate": 9.985092546273137e-05, "loss": 0.5139, "step": 200 }, { "epoch": 3.8, "grad_norm": 2.4875519275665283, "learning_rate": 9.982591295647824e-05, "loss": 0.5028, "step": 225 }, { "epoch": 4.22, "grad_norm": 1.295557975769043, "learning_rate": 9.980090045022513e-05, "loss": 0.5014, "step": 250 }, { "epoch": 4.64, "grad_norm": 1.439899206161499, "learning_rate": 9.977588794397198e-05, "loss": 0.4925, "step": 275 }, { "epoch": 5.06, "grad_norm": 1.7063899040222168, "learning_rate": 9.975087543771886e-05, "loss": 0.4923, "step": 300 }, { "epoch": 5.49, "grad_norm": 1.5692998170852661, "learning_rate": 9.972586293146574e-05, "loss": 0.4942, "step": 325 }, { "epoch": 5.91, "grad_norm": 2.4030182361602783, "learning_rate": 9.970085042521262e-05, "loss": 0.4931, "step": 350 }, { "epoch": 6.33, "grad_norm": 1.3197492361068726, "learning_rate": 9.967583791895947e-05, "loss": 0.4875, "step": 375 }, { "epoch": 6.75, "grad_norm": 1.907379388809204, "learning_rate": 9.965082541270636e-05, "loss": 0.487, "step": 400 }, { "epoch": 7.17, "grad_norm": 1.527170181274414, "learning_rate": 9.962581290645323e-05, "loss": 0.4898, "step": 425 }, { "epoch": 7.59, "grad_norm": 1.8325576782226562, "learning_rate": 9.96008004002001e-05, "loss": 0.4844, "step": 450 }, { "epoch": 8.02, "grad_norm": 2.1454238891601562, "learning_rate": 9.957578789394698e-05, "loss": 0.4793, "step": 475 }, { "epoch": 8.44, "grad_norm": 1.674980878829956, "learning_rate": 9.955077538769385e-05, "loss": 0.4779, "step": 500 }, { "epoch": 8.86, "grad_norm": 1.3077852725982666, "learning_rate": 9.952576288144072e-05, "loss": 0.4753, "step": 525 }, { "epoch": 9.28, "grad_norm": 1.7320929765701294, "learning_rate": 9.95007503751876e-05, "loss": 0.4737, "step": 550 }, { "epoch": 9.7, "grad_norm": 2.366737127304077, "learning_rate": 9.947573786893447e-05, "loss": 0.4784, "step": 575 }, { "epoch": 10.13, "grad_norm": 1.4668282270431519, "learning_rate": 9.945072536268134e-05, "loss": 0.4692, "step": 600 }, { "epoch": 10.55, "grad_norm": 2.044022560119629, "learning_rate": 9.942571285642821e-05, "loss": 0.4713, "step": 625 }, { "epoch": 10.97, "grad_norm": 4.148468494415283, "learning_rate": 9.940070035017509e-05, "loss": 0.4701, "step": 650 }, { "epoch": 11.39, "grad_norm": 3.074256420135498, 
"learning_rate": 9.937568784392196e-05, "loss": 0.4743, "step": 675 }, { "epoch": 11.81, "grad_norm": 1.8847897052764893, "learning_rate": 9.935067533766885e-05, "loss": 0.4778, "step": 700 }, { "epoch": 12.24, "grad_norm": 2.165544271469116, "learning_rate": 9.93256628314157e-05, "loss": 0.4637, "step": 725 }, { "epoch": 12.66, "grad_norm": 1.545711636543274, "learning_rate": 9.930065032516258e-05, "loss": 0.4694, "step": 750 }, { "epoch": 13.08, "grad_norm": 1.3466665744781494, "learning_rate": 9.927563781890946e-05, "loss": 0.4712, "step": 775 }, { "epoch": 13.5, "grad_norm": 2.915973663330078, "learning_rate": 9.925062531265634e-05, "loss": 0.4686, "step": 800 }, { "epoch": 13.92, "grad_norm": 1.3854703903198242, "learning_rate": 9.92256128064032e-05, "loss": 0.464, "step": 825 }, { "epoch": 14.35, "grad_norm": 1.7382938861846924, "learning_rate": 9.920060030015008e-05, "loss": 0.458, "step": 850 }, { "epoch": 14.77, "grad_norm": 3.0275321006774902, "learning_rate": 9.917558779389696e-05, "loss": 0.4631, "step": 875 }, { "epoch": 15.19, "grad_norm": 1.2588684558868408, "learning_rate": 9.915057528764383e-05, "loss": 0.4621, "step": 900 }, { "epoch": 15.61, "grad_norm": 1.7899609804153442, "learning_rate": 9.91255627813907e-05, "loss": 0.4605, "step": 925 }, { "epoch": 16.03, "grad_norm": 2.1078569889068604, "learning_rate": 9.910055027513757e-05, "loss": 0.4582, "step": 950 }, { "epoch": 16.46, "grad_norm": 1.9400877952575684, "learning_rate": 9.907553776888445e-05, "loss": 0.4585, "step": 975 }, { "epoch": 16.88, "grad_norm": 1.6047954559326172, "learning_rate": 9.905052526263132e-05, "loss": 0.4625, "step": 1000 }, { "epoch": 17.3, "grad_norm": 1.7604362964630127, "learning_rate": 9.902551275637819e-05, "loss": 0.4557, "step": 1025 }, { "epoch": 17.72, "grad_norm": 1.9329571723937988, "learning_rate": 9.900050025012508e-05, "loss": 0.4574, "step": 1050 }, { "epoch": 18.14, "grad_norm": 2.052088499069214, "learning_rate": 9.897548774387194e-05, "loss": 0.4524, "step": 1075 }, { "epoch": 18.57, "grad_norm": 2.4279603958129883, "learning_rate": 9.895047523761881e-05, "loss": 0.4526, "step": 1100 }, { "epoch": 18.99, "grad_norm": 1.7969927787780762, "learning_rate": 9.89254627313657e-05, "loss": 0.4525, "step": 1125 }, { "epoch": 19.41, "grad_norm": 1.2721341848373413, "learning_rate": 9.890045022511257e-05, "loss": 0.4497, "step": 1150 }, { "epoch": 19.83, "grad_norm": 2.87554931640625, "learning_rate": 9.887543771885943e-05, "loss": 0.4553, "step": 1175 }, { "epoch": 20.25, "grad_norm": 4.699853420257568, "learning_rate": 9.885042521260631e-05, "loss": 0.4582, "step": 1200 }, { "epoch": 20.68, "grad_norm": 1.8292546272277832, "learning_rate": 9.882541270635319e-05, "loss": 0.452, "step": 1225 }, { "epoch": 21.1, "grad_norm": 1.0467218160629272, "learning_rate": 9.880040020010006e-05, "loss": 0.4525, "step": 1250 }, { "epoch": 21.52, "grad_norm": 1.2708683013916016, "learning_rate": 9.877538769384693e-05, "loss": 0.4512, "step": 1275 }, { "epoch": 21.94, "grad_norm": 1.5680289268493652, "learning_rate": 9.87503751875938e-05, "loss": 0.4469, "step": 1300 }, { "epoch": 22.36, "grad_norm": 0.9932234287261963, "learning_rate": 9.872536268134068e-05, "loss": 0.4493, "step": 1325 }, { "epoch": 22.78, "grad_norm": 0.9426058530807495, "learning_rate": 9.870035017508755e-05, "loss": 0.4464, "step": 1350 }, { "epoch": 23.21, "grad_norm": 1.8259228467941284, "learning_rate": 9.867533766883442e-05, "loss": 0.4491, "step": 1375 }, { "epoch": 23.63, "grad_norm": 2.100245952606201, "learning_rate": 
9.86503251625813e-05, "loss": 0.4464, "step": 1400 }, { "epoch": 24.05, "grad_norm": 1.6127253770828247, "learning_rate": 9.862531265632817e-05, "loss": 0.4468, "step": 1425 }, { "epoch": 24.47, "grad_norm": 1.1558961868286133, "learning_rate": 9.860030015007504e-05, "loss": 0.4436, "step": 1450 }, { "epoch": 24.89, "grad_norm": 1.6495578289031982, "learning_rate": 9.857528764382191e-05, "loss": 0.4477, "step": 1475 }, { "epoch": 25.32, "grad_norm": 1.2517154216766357, "learning_rate": 9.85502751375688e-05, "loss": 0.444, "step": 1500 }, { "epoch": 25.74, "grad_norm": 2.046708345413208, "learning_rate": 9.852526263131566e-05, "loss": 0.4401, "step": 1525 }, { "epoch": 26.16, "grad_norm": 1.5881081819534302, "learning_rate": 9.850025012506253e-05, "loss": 0.443, "step": 1550 }, { "epoch": 26.58, "grad_norm": 1.0406062602996826, "learning_rate": 9.847523761880942e-05, "loss": 0.4403, "step": 1575 }, { "epoch": 27.0, "grad_norm": 1.5876116752624512, "learning_rate": 9.845022511255629e-05, "loss": 0.4427, "step": 1600 }, { "epoch": 27.43, "grad_norm": 1.264975666999817, "learning_rate": 9.842521260630315e-05, "loss": 0.4458, "step": 1625 }, { "epoch": 27.85, "grad_norm": 1.59874427318573, "learning_rate": 9.840020010005003e-05, "loss": 0.4407, "step": 1650 }, { "epoch": 28.27, "grad_norm": 1.3375675678253174, "learning_rate": 9.837518759379691e-05, "loss": 0.4374, "step": 1675 }, { "epoch": 28.69, "grad_norm": 1.0930531024932861, "learning_rate": 9.835017508754377e-05, "loss": 0.4415, "step": 1700 }, { "epoch": 29.11, "grad_norm": 1.4564019441604614, "learning_rate": 9.832516258129065e-05, "loss": 0.444, "step": 1725 }, { "epoch": 29.54, "grad_norm": 3.7895617485046387, "learning_rate": 9.830015007503753e-05, "loss": 0.4378, "step": 1750 }, { "epoch": 29.96, "grad_norm": 1.5943691730499268, "learning_rate": 9.82751375687844e-05, "loss": 0.4413, "step": 1775 }, { "epoch": 30.38, "grad_norm": 1.272739052772522, "learning_rate": 9.825012506253127e-05, "loss": 0.4361, "step": 1800 }, { "epoch": 30.8, "grad_norm": 1.432316541671753, "learning_rate": 9.822511255627814e-05, "loss": 0.4363, "step": 1825 }, { "epoch": 31.22, "grad_norm": 1.60966157913208, "learning_rate": 9.820010005002502e-05, "loss": 0.4376, "step": 1850 }, { "epoch": 31.65, "grad_norm": 2.3171615600585938, "learning_rate": 9.817508754377189e-05, "loss": 0.4353, "step": 1875 }, { "epoch": 32.07, "grad_norm": 1.2164942026138306, "learning_rate": 9.815007503751876e-05, "loss": 0.4357, "step": 1900 }, { "epoch": 32.49, "grad_norm": 2.2668657302856445, "learning_rate": 9.812506253126563e-05, "loss": 0.4332, "step": 1925 }, { "epoch": 32.91, "grad_norm": 1.4237200021743774, "learning_rate": 9.810005002501252e-05, "loss": 0.4359, "step": 1950 }, { "epoch": 33.33, "grad_norm": 1.3995754718780518, "learning_rate": 9.807503751875938e-05, "loss": 0.4331, "step": 1975 }, { "epoch": 33.76, "grad_norm": 1.39100182056427, "learning_rate": 9.805002501250625e-05, "loss": 0.4341, "step": 2000 }, { "epoch": 34.18, "grad_norm": 4.893013954162598, "learning_rate": 9.802501250625314e-05, "loss": 0.4322, "step": 2025 }, { "epoch": 34.6, "grad_norm": 1.3059648275375366, "learning_rate": 9.8e-05, "loss": 0.437, "step": 2050 }, { "epoch": 35.02, "grad_norm": 0.9458933472633362, "learning_rate": 9.797498749374688e-05, "loss": 0.4308, "step": 2075 }, { "epoch": 35.44, "grad_norm": 3.0024828910827637, "learning_rate": 9.794997498749376e-05, "loss": 0.4352, "step": 2100 }, { "epoch": 35.86, "grad_norm": 1.5498872995376587, "learning_rate": 9.792496248124063e-05, 
"loss": 0.4335, "step": 2125 }, { "epoch": 36.29, "grad_norm": 1.3766555786132812, "learning_rate": 9.78999499749875e-05, "loss": 0.4326, "step": 2150 }, { "epoch": 36.71, "grad_norm": 2.3811893463134766, "learning_rate": 9.787493746873437e-05, "loss": 0.4271, "step": 2175 }, { "epoch": 37.13, "grad_norm": 1.664067029953003, "learning_rate": 9.784992496248125e-05, "loss": 0.4294, "step": 2200 }, { "epoch": 37.55, "grad_norm": 1.6994428634643555, "learning_rate": 9.782491245622812e-05, "loss": 0.4286, "step": 2225 }, { "epoch": 37.97, "grad_norm": 1.6317942142486572, "learning_rate": 9.779989994997499e-05, "loss": 0.439, "step": 2250 }, { "epoch": 38.4, "grad_norm": 2.2735595703125, "learning_rate": 9.777488744372186e-05, "loss": 0.4348, "step": 2275 }, { "epoch": 38.82, "grad_norm": 2.0650172233581543, "learning_rate": 9.774987493746875e-05, "loss": 0.4344, "step": 2300 }, { "epoch": 39.24, "grad_norm": 1.1089218854904175, "learning_rate": 9.772486243121561e-05, "loss": 0.4268, "step": 2325 }, { "epoch": 39.66, "grad_norm": 1.6442523002624512, "learning_rate": 9.769984992496248e-05, "loss": 0.4294, "step": 2350 }, { "epoch": 40.08, "grad_norm": 1.2587043046951294, "learning_rate": 9.767483741870937e-05, "loss": 0.4282, "step": 2375 }, { "epoch": 40.51, "grad_norm": 1.428941249847412, "learning_rate": 9.764982491245623e-05, "loss": 0.4328, "step": 2400 }, { "epoch": 40.93, "grad_norm": 3.2557156085968018, "learning_rate": 9.76248124062031e-05, "loss": 0.43, "step": 2425 }, { "epoch": 41.35, "grad_norm": 1.0433619022369385, "learning_rate": 9.759979989994999e-05, "loss": 0.4332, "step": 2450 }, { "epoch": 41.77, "grad_norm": 1.0124989748001099, "learning_rate": 9.757478739369686e-05, "loss": 0.4277, "step": 2475 }, { "epoch": 42.19, "grad_norm": 1.6991461515426636, "learning_rate": 9.754977488744372e-05, "loss": 0.429, "step": 2500 }, { "epoch": 42.62, "grad_norm": 1.4574916362762451, "learning_rate": 9.75247623811906e-05, "loss": 0.4232, "step": 2525 }, { "epoch": 43.04, "grad_norm": 1.575637698173523, "learning_rate": 9.749974987493748e-05, "loss": 0.4318, "step": 2550 }, { "epoch": 43.46, "grad_norm": 1.1992729902267456, "learning_rate": 9.747473736868435e-05, "loss": 0.4252, "step": 2575 }, { "epoch": 43.88, "grad_norm": 1.2502025365829468, "learning_rate": 9.744972486243122e-05, "loss": 0.4245, "step": 2600 }, { "epoch": 44.3, "grad_norm": 1.0429006814956665, "learning_rate": 9.74247123561781e-05, "loss": 0.422, "step": 2625 }, { "epoch": 44.73, "grad_norm": 2.088129997253418, "learning_rate": 9.739969984992497e-05, "loss": 0.4236, "step": 2650 }, { "epoch": 45.15, "grad_norm": 2.181934356689453, "learning_rate": 9.737468734367184e-05, "loss": 0.4277, "step": 2675 }, { "epoch": 45.57, "grad_norm": 1.410612940788269, "learning_rate": 9.734967483741871e-05, "loss": 0.4251, "step": 2700 }, { "epoch": 45.99, "grad_norm": 1.718837857246399, "learning_rate": 9.732466233116559e-05, "loss": 0.4213, "step": 2725 }, { "epoch": 46.41, "grad_norm": 2.458649158477783, "learning_rate": 9.729964982491246e-05, "loss": 0.4307, "step": 2750 }, { "epoch": 46.84, "grad_norm": 1.368427038192749, "learning_rate": 9.727463731865933e-05, "loss": 0.4234, "step": 2775 }, { "epoch": 47.26, "grad_norm": 1.4321160316467285, "learning_rate": 9.72496248124062e-05, "loss": 0.4201, "step": 2800 }, { "epoch": 47.68, "grad_norm": 1.2676990032196045, "learning_rate": 9.722461230615309e-05, "loss": 0.4217, "step": 2825 }, { "epoch": 48.1, "grad_norm": 2.0436410903930664, "learning_rate": 9.719959979989995e-05, "loss": 
0.4229, "step": 2850 }, { "epoch": 48.52, "grad_norm": 2.609133005142212, "learning_rate": 9.717458729364682e-05, "loss": 0.4237, "step": 2875 }, { "epoch": 48.95, "grad_norm": 1.7475526332855225, "learning_rate": 9.714957478739371e-05, "loss": 0.4227, "step": 2900 }, { "epoch": 49.37, "grad_norm": 1.5024336576461792, "learning_rate": 9.712456228114058e-05, "loss": 0.4232, "step": 2925 }, { "epoch": 49.79, "grad_norm": 1.4283182621002197, "learning_rate": 9.709954977488744e-05, "loss": 0.4203, "step": 2950 }, { "epoch": 50.21, "grad_norm": 0.9029533267021179, "learning_rate": 9.707453726863433e-05, "loss": 0.4228, "step": 2975 }, { "epoch": 50.63, "grad_norm": 1.3081523180007935, "learning_rate": 9.70495247623812e-05, "loss": 0.4195, "step": 3000 }, { "epoch": 51.05, "grad_norm": 0.9352352619171143, "learning_rate": 9.702451225612806e-05, "loss": 0.4207, "step": 3025 }, { "epoch": 51.48, "grad_norm": 1.2184823751449585, "learning_rate": 9.699949974987494e-05, "loss": 0.4194, "step": 3050 }, { "epoch": 51.9, "grad_norm": 1.5286707878112793, "learning_rate": 9.697448724362182e-05, "loss": 0.4204, "step": 3075 }, { "epoch": 52.32, "grad_norm": 1.3245766162872314, "learning_rate": 9.694947473736869e-05, "loss": 0.4201, "step": 3100 }, { "epoch": 52.74, "grad_norm": 1.4812688827514648, "learning_rate": 9.692446223111556e-05, "loss": 0.4252, "step": 3125 }, { "epoch": 53.16, "grad_norm": 1.4877307415008545, "learning_rate": 9.689944972486243e-05, "loss": 0.4179, "step": 3150 }, { "epoch": 53.59, "grad_norm": 1.8212426900863647, "learning_rate": 9.687443721860931e-05, "loss": 0.4137, "step": 3175 }, { "epoch": 54.01, "grad_norm": 2.150912284851074, "learning_rate": 9.684942471235618e-05, "loss": 0.4165, "step": 3200 }, { "epoch": 54.43, "grad_norm": 1.9625086784362793, "learning_rate": 9.682441220610305e-05, "loss": 0.413, "step": 3225 }, { "epoch": 54.85, "grad_norm": 1.849048376083374, "learning_rate": 9.679939969984992e-05, "loss": 0.421, "step": 3250 }, { "epoch": 55.27, "grad_norm": 1.7287253141403198, "learning_rate": 9.677438719359681e-05, "loss": 0.4209, "step": 3275 }, { "epoch": 55.7, "grad_norm": 1.8260107040405273, "learning_rate": 9.674937468734367e-05, "loss": 0.4133, "step": 3300 }, { "epoch": 56.12, "grad_norm": 1.7626959085464478, "learning_rate": 9.672436218109056e-05, "loss": 0.4157, "step": 3325 }, { "epoch": 56.54, "grad_norm": 2.6238913536071777, "learning_rate": 9.669934967483743e-05, "loss": 0.4166, "step": 3350 }, { "epoch": 56.96, "grad_norm": 2.7537014484405518, "learning_rate": 9.667433716858429e-05, "loss": 0.4133, "step": 3375 }, { "epoch": 57.38, "grad_norm": 1.0706117153167725, "learning_rate": 9.664932466233117e-05, "loss": 0.4192, "step": 3400 }, { "epoch": 57.81, "grad_norm": 1.2265653610229492, "learning_rate": 9.662431215607805e-05, "loss": 0.4138, "step": 3425 }, { "epoch": 58.23, "grad_norm": 1.219117283821106, "learning_rate": 9.659929964982492e-05, "loss": 0.4244, "step": 3450 }, { "epoch": 58.65, "grad_norm": 1.2219154834747314, "learning_rate": 9.657428714357179e-05, "loss": 0.4144, "step": 3475 }, { "epoch": 59.07, "grad_norm": 2.0221805572509766, "learning_rate": 9.654927463731866e-05, "loss": 0.4153, "step": 3500 }, { "epoch": 59.49, "grad_norm": 1.4994875192642212, "learning_rate": 9.652426213106554e-05, "loss": 0.414, "step": 3525 }, { "epoch": 59.92, "grad_norm": 1.497728943824768, "learning_rate": 9.649924962481241e-05, "loss": 0.4113, "step": 3550 }, { "epoch": 60.34, "grad_norm": 1.1986409425735474, "learning_rate": 9.647423711855928e-05, 
"loss": 0.4127, "step": 3575 }, { "epoch": 60.76, "grad_norm": 1.1296961307525635, "learning_rate": 9.644922461230616e-05, "loss": 0.4126, "step": 3600 }, { "epoch": 61.18, "grad_norm": 1.4701635837554932, "learning_rate": 9.642421210605304e-05, "loss": 0.4118, "step": 3625 }, { "epoch": 61.6, "grad_norm": 1.1018095016479492, "learning_rate": 9.63991995997999e-05, "loss": 0.4135, "step": 3650 }, { "epoch": 62.03, "grad_norm": 1.7052215337753296, "learning_rate": 9.637418709354677e-05, "loss": 0.4147, "step": 3675 }, { "epoch": 62.45, "grad_norm": 1.9233372211456299, "learning_rate": 9.634917458729366e-05, "loss": 0.4104, "step": 3700 }, { "epoch": 62.87, "grad_norm": 1.402578353881836, "learning_rate": 9.632416208104052e-05, "loss": 0.4112, "step": 3725 }, { "epoch": 63.29, "grad_norm": 2.6864452362060547, "learning_rate": 9.629914957478739e-05, "loss": 0.4125, "step": 3750 }, { "epoch": 63.71, "grad_norm": 1.6179677248001099, "learning_rate": 9.627413706853428e-05, "loss": 0.416, "step": 3775 }, { "epoch": 64.14, "grad_norm": 1.6383166313171387, "learning_rate": 9.624912456228115e-05, "loss": 0.4121, "step": 3800 }, { "epoch": 64.56, "grad_norm": 1.8313114643096924, "learning_rate": 9.622411205602801e-05, "loss": 0.4119, "step": 3825 }, { "epoch": 64.98, "grad_norm": 1.6697313785552979, "learning_rate": 9.61990995497749e-05, "loss": 0.4216, "step": 3850 }, { "epoch": 65.4, "grad_norm": 0.8759865760803223, "learning_rate": 9.617408704352177e-05, "loss": 0.4174, "step": 3875 }, { "epoch": 65.82, "grad_norm": 0.9433050155639648, "learning_rate": 9.614907453726864e-05, "loss": 0.409, "step": 3900 }, { "epoch": 66.24, "grad_norm": 2.1291754245758057, "learning_rate": 9.612406203101551e-05, "loss": 0.4099, "step": 3925 }, { "epoch": 66.67, "grad_norm": 1.3182588815689087, "learning_rate": 9.609904952476239e-05, "loss": 0.4111, "step": 3950 }, { "epoch": 67.09, "grad_norm": 1.76958167552948, "learning_rate": 9.607403701850926e-05, "loss": 0.412, "step": 3975 }, { "epoch": 67.51, "grad_norm": 1.4913214445114136, "learning_rate": 9.604902451225613e-05, "loss": 0.4171, "step": 4000 }, { "epoch": 67.93, "grad_norm": 0.861371636390686, "learning_rate": 9.6024012006003e-05, "loss": 0.405, "step": 4025 }, { "epoch": 68.35, "grad_norm": 1.4924416542053223, "learning_rate": 9.599899949974988e-05, "loss": 0.4092, "step": 4050 }, { "epoch": 68.78, "grad_norm": 1.1858751773834229, "learning_rate": 9.597398699349675e-05, "loss": 0.4072, "step": 4075 }, { "epoch": 69.2, "grad_norm": 0.8791980743408203, "learning_rate": 9.594897448724362e-05, "loss": 0.407, "step": 4100 }, { "epoch": 69.62, "grad_norm": 1.267388105392456, "learning_rate": 9.59239619809905e-05, "loss": 0.4073, "step": 4125 }, { "epoch": 70.04, "grad_norm": 1.0340478420257568, "learning_rate": 9.589894947473738e-05, "loss": 0.4081, "step": 4150 }, { "epoch": 70.46, "grad_norm": 0.9826291799545288, "learning_rate": 9.587493746873437e-05, "loss": 0.408, "step": 4175 }, { "epoch": 70.89, "grad_norm": 0.9153789281845093, "learning_rate": 9.584992496248124e-05, "loss": 0.4111, "step": 4200 }, { "epoch": 71.31, "grad_norm": 0.9967362284660339, "learning_rate": 9.582491245622812e-05, "loss": 0.4088, "step": 4225 }, { "epoch": 71.73, "grad_norm": 1.6984307765960693, "learning_rate": 9.5799899949975e-05, "loss": 0.4074, "step": 4250 }, { "epoch": 72.15, "grad_norm": 1.173675775527954, "learning_rate": 9.577488744372186e-05, "loss": 0.4043, "step": 4275 }, { "epoch": 72.57, "grad_norm": 1.0799657106399536, "learning_rate": 9.574987493746874e-05, "loss": 
0.4049, "step": 4300 }, { "epoch": 73.0, "grad_norm": 0.927243709564209, "learning_rate": 9.572486243121562e-05, "loss": 0.4086, "step": 4325 }, { "epoch": 73.42, "grad_norm": 2.179476737976074, "learning_rate": 9.569984992496249e-05, "loss": 0.4107, "step": 4350 }, { "epoch": 73.84, "grad_norm": 3.4262351989746094, "learning_rate": 9.567483741870936e-05, "loss": 0.4093, "step": 4375 }, { "epoch": 74.26, "grad_norm": 0.8927581310272217, "learning_rate": 9.564982491245623e-05, "loss": 0.4097, "step": 4400 }, { "epoch": 74.68, "grad_norm": 1.7713688611984253, "learning_rate": 9.56248124062031e-05, "loss": 0.407, "step": 4425 }, { "epoch": 75.11, "grad_norm": 1.1457606554031372, "learning_rate": 9.559979989994998e-05, "loss": 0.4083, "step": 4450 }, { "epoch": 75.53, "grad_norm": 1.396812081336975, "learning_rate": 9.557478739369685e-05, "loss": 0.4065, "step": 4475 }, { "epoch": 75.95, "grad_norm": 3.076221227645874, "learning_rate": 9.554977488744372e-05, "loss": 0.4158, "step": 4500 }, { "epoch": 76.37, "grad_norm": 1.1549460887908936, "learning_rate": 9.55247623811906e-05, "loss": 0.4067, "step": 4525 }, { "epoch": 76.79, "grad_norm": 1.3812801837921143, "learning_rate": 9.549974987493747e-05, "loss": 0.4054, "step": 4550 }, { "epoch": 77.22, "grad_norm": 1.218714952468872, "learning_rate": 9.547473736868434e-05, "loss": 0.4049, "step": 4575 }, { "epoch": 77.64, "grad_norm": 1.9122352600097656, "learning_rate": 9.544972486243123e-05, "loss": 0.4074, "step": 4600 }, { "epoch": 78.06, "grad_norm": 1.435003399848938, "learning_rate": 9.542471235617809e-05, "loss": 0.4072, "step": 4625 }, { "epoch": 78.48, "grad_norm": 0.9949221611022949, "learning_rate": 9.539969984992496e-05, "loss": 0.4043, "step": 4650 }, { "epoch": 78.9, "grad_norm": 2.491838216781616, "learning_rate": 9.537468734367185e-05, "loss": 0.4062, "step": 4675 }, { "epoch": 79.32, "grad_norm": 1.572746992111206, "learning_rate": 9.534967483741872e-05, "loss": 0.4016, "step": 4700 }, { "epoch": 79.75, "grad_norm": 1.0013829469680786, "learning_rate": 9.532466233116559e-05, "loss": 0.4015, "step": 4725 }, { "epoch": 80.17, "grad_norm": 1.227575421333313, "learning_rate": 9.529964982491246e-05, "loss": 0.4027, "step": 4750 }, { "epoch": 80.59, "grad_norm": 1.5029401779174805, "learning_rate": 9.527463731865934e-05, "loss": 0.3999, "step": 4775 }, { "epoch": 81.01, "grad_norm": 1.391097903251648, "learning_rate": 9.524962481240621e-05, "loss": 0.4035, "step": 4800 }, { "epoch": 81.43, "grad_norm": 1.0933318138122559, "learning_rate": 9.522461230615308e-05, "loss": 0.4014, "step": 4825 }, { "epoch": 81.86, "grad_norm": 1.4575203657150269, "learning_rate": 9.519959979989995e-05, "loss": 0.4024, "step": 4850 }, { "epoch": 82.28, "grad_norm": 1.2732388973236084, "learning_rate": 9.517458729364683e-05, "loss": 0.4065, "step": 4875 }, { "epoch": 82.7, "grad_norm": 1.6714991331100464, "learning_rate": 9.51495747873937e-05, "loss": 0.4064, "step": 4900 }, { "epoch": 83.12, "grad_norm": 0.9786789417266846, "learning_rate": 9.512456228114057e-05, "loss": 0.4022, "step": 4925 }, { "epoch": 83.54, "grad_norm": 1.5005425214767456, "learning_rate": 9.509954977488746e-05, "loss": 0.4008, "step": 4950 }, { "epoch": 83.97, "grad_norm": 1.8007619380950928, "learning_rate": 9.507453726863432e-05, "loss": 0.4049, "step": 4975 }, { "epoch": 84.39, "grad_norm": 0.90321284532547, "learning_rate": 9.504952476238119e-05, "loss": 0.3975, "step": 5000 }, { "epoch": 84.81, "grad_norm": 1.9648014307022095, "learning_rate": 9.502451225612808e-05, "loss": 
0.4026, "step": 5025 }, { "epoch": 85.23, "grad_norm": 1.3669122457504272, "learning_rate": 9.499949974987495e-05, "loss": 0.4044, "step": 5050 }, { "epoch": 85.65, "grad_norm": 1.1994420289993286, "learning_rate": 9.497448724362181e-05, "loss": 0.4029, "step": 5075 }, { "epoch": 86.08, "grad_norm": 2.5947866439819336, "learning_rate": 9.49494747373687e-05, "loss": 0.3998, "step": 5100 }, { "epoch": 86.5, "grad_norm": 1.8498542308807373, "learning_rate": 9.492446223111557e-05, "loss": 0.4022, "step": 5125 }, { "epoch": 86.92, "grad_norm": 1.458785057067871, "learning_rate": 9.489944972486243e-05, "loss": 0.4004, "step": 5150 }, { "epoch": 87.34, "grad_norm": 1.534063696861267, "learning_rate": 9.487443721860931e-05, "loss": 0.4009, "step": 5175 }, { "epoch": 87.76, "grad_norm": 1.468458652496338, "learning_rate": 9.484942471235619e-05, "loss": 0.4004, "step": 5200 }, { "epoch": 88.19, "grad_norm": 1.266542911529541, "learning_rate": 9.482441220610306e-05, "loss": 0.4013, "step": 5225 }, { "epoch": 88.61, "grad_norm": 1.6502550840377808, "learning_rate": 9.479939969984993e-05, "loss": 0.4005, "step": 5250 }, { "epoch": 89.03, "grad_norm": 0.8878002166748047, "learning_rate": 9.47743871935968e-05, "loss": 0.3996, "step": 5275 }, { "epoch": 89.45, "grad_norm": 0.7238607406616211, "learning_rate": 9.474937468734368e-05, "loss": 0.3977, "step": 5300 }, { "epoch": 89.87, "grad_norm": 2.6406147480010986, "learning_rate": 9.472436218109055e-05, "loss": 0.4011, "step": 5325 }, { "epoch": 90.3, "grad_norm": 1.0892853736877441, "learning_rate": 9.469934967483742e-05, "loss": 0.4037, "step": 5350 }, { "epoch": 90.72, "grad_norm": 1.2022901773452759, "learning_rate": 9.46743371685843e-05, "loss": 0.4109, "step": 5375 }, { "epoch": 91.14, "grad_norm": 0.7769300937652588, "learning_rate": 9.464932466233118e-05, "loss": 0.4, "step": 5400 }, { "epoch": 91.56, "grad_norm": 0.833206057548523, "learning_rate": 9.462431215607804e-05, "loss": 0.396, "step": 5425 }, { "epoch": 91.98, "grad_norm": 1.156129002571106, "learning_rate": 9.459929964982491e-05, "loss": 0.3966, "step": 5450 }, { "epoch": 92.41, "grad_norm": 0.8081735372543335, "learning_rate": 9.45742871435718e-05, "loss": 0.3974, "step": 5475 }, { "epoch": 92.83, "grad_norm": 1.3812929391860962, "learning_rate": 9.454927463731866e-05, "loss": 0.3987, "step": 5500 }, { "epoch": 93.25, "grad_norm": 1.0805424451828003, "learning_rate": 9.452426213106553e-05, "loss": 0.3985, "step": 5525 }, { "epoch": 93.67, "grad_norm": 1.1991263628005981, "learning_rate": 9.449924962481242e-05, "loss": 0.4023, "step": 5550 }, { "epoch": 94.09, "grad_norm": 1.6367698907852173, "learning_rate": 9.447423711855929e-05, "loss": 0.4027, "step": 5575 }, { "epoch": 94.51, "grad_norm": 1.6391679048538208, "learning_rate": 9.444922461230615e-05, "loss": 0.4053, "step": 5600 }, { "epoch": 94.94, "grad_norm": 1.3066658973693848, "learning_rate": 9.442421210605303e-05, "loss": 0.4022, "step": 5625 }, { "epoch": 95.36, "grad_norm": 1.1946823596954346, "learning_rate": 9.43991995997999e-05, "loss": 0.3975, "step": 5650 }, { "epoch": 95.78, "grad_norm": 1.632548213005066, "learning_rate": 9.437418709354678e-05, "loss": 0.3954, "step": 5675 }, { "epoch": 96.2, "grad_norm": 1.0016242265701294, "learning_rate": 9.434917458729365e-05, "loss": 0.3935, "step": 5700 }, { "epoch": 96.62, "grad_norm": 1.5791822671890259, "learning_rate": 9.432416208104052e-05, "loss": 0.3968, "step": 5725 }, { "epoch": 97.05, "grad_norm": 1.3178212642669678, "learning_rate": 9.42991495747874e-05, "loss": 0.3968, 
"step": 5750 }, { "epoch": 97.47, "grad_norm": 2.419029474258423, "learning_rate": 9.427413706853427e-05, "loss": 0.3957, "step": 5775 }, { "epoch": 97.89, "grad_norm": 0.996629536151886, "learning_rate": 9.424912456228114e-05, "loss": 0.399, "step": 5800 }, { "epoch": 98.31, "grad_norm": 1.574242353439331, "learning_rate": 9.422411205602801e-05, "loss": 0.3984, "step": 5825 }, { "epoch": 98.73, "grad_norm": 1.3833301067352295, "learning_rate": 9.419909954977489e-05, "loss": 0.3964, "step": 5850 }, { "epoch": 99.16, "grad_norm": 1.6443040370941162, "learning_rate": 9.417408704352176e-05, "loss": 0.3928, "step": 5875 }, { "epoch": 99.58, "grad_norm": 1.1492656469345093, "learning_rate": 9.414907453726863e-05, "loss": 0.3981, "step": 5900 }, { "epoch": 100.0, "grad_norm": 1.0491492748260498, "learning_rate": 9.412406203101552e-05, "loss": 0.395, "step": 5925 }, { "epoch": 100.42, "grad_norm": 0.8829823136329651, "learning_rate": 9.409904952476238e-05, "loss": 0.3951, "step": 5950 }, { "epoch": 100.84, "grad_norm": 0.9524397850036621, "learning_rate": 9.407403701850926e-05, "loss": 0.3967, "step": 5975 }, { "epoch": 101.27, "grad_norm": 0.9339599013328552, "learning_rate": 9.404902451225614e-05, "loss": 0.3974, "step": 6000 }, { "epoch": 101.69, "grad_norm": 1.2430776357650757, "learning_rate": 9.402401200600301e-05, "loss": 0.3944, "step": 6025 }, { "epoch": 102.11, "grad_norm": 1.2382580041885376, "learning_rate": 9.399899949974988e-05, "loss": 0.3974, "step": 6050 }, { "epoch": 102.53, "grad_norm": 1.051041603088379, "learning_rate": 9.397398699349676e-05, "loss": 0.3973, "step": 6075 }, { "epoch": 102.95, "grad_norm": 1.61953866481781, "learning_rate": 9.394897448724363e-05, "loss": 0.3945, "step": 6100 }, { "epoch": 103.38, "grad_norm": 1.3366836309432983, "learning_rate": 9.39239619809905e-05, "loss": 0.3964, "step": 6125 }, { "epoch": 103.8, "grad_norm": 1.1625866889953613, "learning_rate": 9.389894947473737e-05, "loss": 0.3967, "step": 6150 }, { "epoch": 104.22, "grad_norm": 1.3993914127349854, "learning_rate": 9.387393696848425e-05, "loss": 0.396, "step": 6175 }, { "epoch": 104.64, "grad_norm": 1.0538663864135742, "learning_rate": 9.384892446223112e-05, "loss": 0.391, "step": 6200 }, { "epoch": 105.06, "grad_norm": 1.7789326906204224, "learning_rate": 9.382391195597799e-05, "loss": 0.3938, "step": 6225 }, { "epoch": 105.49, "grad_norm": 1.047176718711853, "learning_rate": 9.379889944972486e-05, "loss": 0.3922, "step": 6250 }, { "epoch": 105.91, "grad_norm": 0.8343138694763184, "learning_rate": 9.377388694347175e-05, "loss": 0.3906, "step": 6275 }, { "epoch": 106.33, "grad_norm": 1.7591880559921265, "learning_rate": 9.374887443721861e-05, "loss": 0.3958, "step": 6300 }, { "epoch": 106.75, "grad_norm": 2.4220738410949707, "learning_rate": 9.372386193096548e-05, "loss": 0.3938, "step": 6325 }, { "epoch": 107.17, "grad_norm": 1.5252169370651245, "learning_rate": 9.369884942471237e-05, "loss": 0.3959, "step": 6350 }, { "epoch": 107.59, "grad_norm": 1.0653201341629028, "learning_rate": 9.367383691845924e-05, "loss": 0.3941, "step": 6375 }, { "epoch": 108.02, "grad_norm": 0.9116130471229553, "learning_rate": 9.36488244122061e-05, "loss": 0.3953, "step": 6400 }, { "epoch": 108.44, "grad_norm": 1.8353023529052734, "learning_rate": 9.362381190595299e-05, "loss": 0.393, "step": 6425 }, { "epoch": 108.86, "grad_norm": 0.7851668000221252, "learning_rate": 9.359879939969986e-05, "loss": 0.3972, "step": 6450 }, { "epoch": 109.28, "grad_norm": 2.212912082672119, "learning_rate": 
9.357378689344673e-05, "loss": 0.3968, "step": 6475 }, { "epoch": 109.7, "grad_norm": 2.1535210609436035, "learning_rate": 9.35487743871936e-05, "loss": 0.3961, "step": 6500 }, { "epoch": 110.13, "grad_norm": 1.0224483013153076, "learning_rate": 9.352376188094048e-05, "loss": 0.3931, "step": 6525 }, { "epoch": 110.55, "grad_norm": 1.033724069595337, "learning_rate": 9.349874937468735e-05, "loss": 0.3932, "step": 6550 }, { "epoch": 110.97, "grad_norm": 1.3831344842910767, "learning_rate": 9.347373686843422e-05, "loss": 0.3946, "step": 6575 }, { "epoch": 111.39, "grad_norm": 1.1083067655563354, "learning_rate": 9.34487243621811e-05, "loss": 0.391, "step": 6600 }, { "epoch": 111.81, "grad_norm": 1.352753758430481, "learning_rate": 9.342371185592797e-05, "loss": 0.3963, "step": 6625 }, { "epoch": 112.24, "grad_norm": 1.2544857263565063, "learning_rate": 9.339869934967484e-05, "loss": 0.3918, "step": 6650 }, { "epoch": 112.66, "grad_norm": 1.2931941747665405, "learning_rate": 9.337368684342171e-05, "loss": 0.3926, "step": 6675 }, { "epoch": 113.08, "grad_norm": 1.0582399368286133, "learning_rate": 9.334867433716858e-05, "loss": 0.3889, "step": 6700 }, { "epoch": 113.5, "grad_norm": 1.153674602508545, "learning_rate": 9.332366183091547e-05, "loss": 0.3911, "step": 6725 }, { "epoch": 113.92, "grad_norm": 0.8844830393791199, "learning_rate": 9.329864932466233e-05, "loss": 0.3921, "step": 6750 }, { "epoch": 114.35, "grad_norm": 0.8720075488090515, "learning_rate": 9.32736368184092e-05, "loss": 0.3945, "step": 6775 }, { "epoch": 114.77, "grad_norm": 0.9805063605308533, "learning_rate": 9.324862431215609e-05, "loss": 0.3888, "step": 6800 }, { "epoch": 115.19, "grad_norm": 2.2352120876312256, "learning_rate": 9.322361180590296e-05, "loss": 0.3947, "step": 6825 }, { "epoch": 115.61, "grad_norm": 0.8556495308876038, "learning_rate": 9.319859929964982e-05, "loss": 0.3923, "step": 6850 }, { "epoch": 116.03, "grad_norm": 0.7891411185264587, "learning_rate": 9.317358679339671e-05, "loss": 0.3882, "step": 6875 }, { "epoch": 116.46, "grad_norm": 0.9211451411247253, "learning_rate": 9.314857428714358e-05, "loss": 0.3921, "step": 6900 }, { "epoch": 116.88, "grad_norm": 1.5606588125228882, "learning_rate": 9.312356178089044e-05, "loss": 0.3925, "step": 6925 }, { "epoch": 117.3, "grad_norm": 1.576579213142395, "learning_rate": 9.309854927463732e-05, "loss": 0.3894, "step": 6950 }, { "epoch": 117.72, "grad_norm": 1.4606884717941284, "learning_rate": 9.30735367683842e-05, "loss": 0.3905, "step": 6975 }, { "epoch": 118.14, "grad_norm": 1.0755809545516968, "learning_rate": 9.304852426213107e-05, "loss": 0.3891, "step": 7000 }, { "epoch": 118.57, "grad_norm": 0.8471797108650208, "learning_rate": 9.302351175587794e-05, "loss": 0.391, "step": 7025 }, { "epoch": 118.99, "grad_norm": 1.881227731704712, "learning_rate": 9.299849924962482e-05, "loss": 0.3899, "step": 7050 }, { "epoch": 119.41, "grad_norm": 1.5719435214996338, "learning_rate": 9.297348674337169e-05, "loss": 0.3998, "step": 7075 }, { "epoch": 119.83, "grad_norm": 1.2843365669250488, "learning_rate": 9.294847423711856e-05, "loss": 0.3898, "step": 7100 }, { "epoch": 120.25, "grad_norm": 0.8780379295349121, "learning_rate": 9.292346173086543e-05, "loss": 0.3903, "step": 7125 }, { "epoch": 120.68, "grad_norm": 1.330159068107605, "learning_rate": 9.289844922461232e-05, "loss": 0.3907, "step": 7150 }, { "epoch": 121.1, "grad_norm": 0.9735729694366455, "learning_rate": 9.287343671835919e-05, "loss": 0.389, "step": 7175 }, { "epoch": 121.52, "grad_norm": 
1.7269222736358643, "learning_rate": 9.284842421210605e-05, "loss": 0.386, "step": 7200 }, { "epoch": 121.94, "grad_norm": 1.6191986799240112, "learning_rate": 9.282341170585294e-05, "loss": 0.3896, "step": 7225 }, { "epoch": 122.36, "grad_norm": 0.9164265394210815, "learning_rate": 9.279839919959981e-05, "loss": 0.3921, "step": 7250 }, { "epoch": 122.78, "grad_norm": 1.2279402017593384, "learning_rate": 9.277338669334667e-05, "loss": 0.3863, "step": 7275 }, { "epoch": 123.21, "grad_norm": 1.214256763458252, "learning_rate": 9.274837418709356e-05, "loss": 0.3876, "step": 7300 }, { "epoch": 123.63, "grad_norm": 0.860033392906189, "learning_rate": 9.272336168084043e-05, "loss": 0.3888, "step": 7325 }, { "epoch": 124.05, "grad_norm": 1.3516547679901123, "learning_rate": 9.26983491745873e-05, "loss": 0.3889, "step": 7350 }, { "epoch": 124.47, "grad_norm": 1.1745275259017944, "learning_rate": 9.267333666833417e-05, "loss": 0.3891, "step": 7375 }, { "epoch": 124.89, "grad_norm": 1.6997791528701782, "learning_rate": 9.264832416208105e-05, "loss": 0.3908, "step": 7400 }, { "epoch": 125.32, "grad_norm": 1.1872189044952393, "learning_rate": 9.262331165582792e-05, "loss": 0.3906, "step": 7425 }, { "epoch": 125.74, "grad_norm": 1.726406216621399, "learning_rate": 9.259829914957479e-05, "loss": 0.3923, "step": 7450 }, { "epoch": 126.16, "grad_norm": 1.3803889751434326, "learning_rate": 9.257328664332166e-05, "loss": 0.3924, "step": 7475 }, { "epoch": 126.58, "grad_norm": 1.3201857805252075, "learning_rate": 9.254827413706854e-05, "loss": 0.3876, "step": 7500 }, { "epoch": 127.0, "grad_norm": 1.3295527696609497, "learning_rate": 9.252326163081542e-05, "loss": 0.3878, "step": 7525 }, { "epoch": 127.43, "grad_norm": 1.228672742843628, "learning_rate": 9.249824912456228e-05, "loss": 0.3881, "step": 7550 }, { "epoch": 127.85, "grad_norm": 1.1604822874069214, "learning_rate": 9.247323661830915e-05, "loss": 0.3855, "step": 7575 }, { "epoch": 128.27, "grad_norm": 0.9629095196723938, "learning_rate": 9.244822411205604e-05, "loss": 0.3887, "step": 7600 }, { "epoch": 128.69, "grad_norm": 1.2206631898880005, "learning_rate": 9.24232116058029e-05, "loss": 0.3916, "step": 7625 }, { "epoch": 129.11, "grad_norm": 1.2908071279525757, "learning_rate": 9.239819909954977e-05, "loss": 0.3891, "step": 7650 }, { "epoch": 129.54, "grad_norm": 1.3963984251022339, "learning_rate": 9.237318659329666e-05, "loss": 0.3897, "step": 7675 }, { "epoch": 129.96, "grad_norm": 1.6635634899139404, "learning_rate": 9.234817408704353e-05, "loss": 0.3864, "step": 7700 }, { "epoch": 130.38, "grad_norm": 0.9927945137023926, "learning_rate": 9.232316158079039e-05, "loss": 0.3893, "step": 7725 }, { "epoch": 130.8, "grad_norm": 0.9489107131958008, "learning_rate": 9.229814907453728e-05, "loss": 0.3851, "step": 7750 }, { "epoch": 131.22, "grad_norm": 0.9850747585296631, "learning_rate": 9.227313656828415e-05, "loss": 0.3869, "step": 7775 }, { "epoch": 131.65, "grad_norm": 0.9255672693252563, "learning_rate": 9.224812406203102e-05, "loss": 0.3885, "step": 7800 }, { "epoch": 132.07, "grad_norm": 1.2656984329223633, "learning_rate": 9.22231115557779e-05, "loss": 0.3882, "step": 7825 }, { "epoch": 132.49, "grad_norm": 1.1444544792175293, "learning_rate": 9.219809904952477e-05, "loss": 0.3887, "step": 7850 }, { "epoch": 132.91, "grad_norm": 1.3210780620574951, "learning_rate": 9.217308654327164e-05, "loss": 0.3892, "step": 7875 }, { "epoch": 133.33, "grad_norm": 1.138453483581543, "learning_rate": 9.214807403701851e-05, "loss": 0.3853, "step": 7900 }, { 
"epoch": 133.76, "grad_norm": 0.8066700100898743, "learning_rate": 9.212306153076539e-05, "loss": 0.3889, "step": 7925 }, { "epoch": 134.18, "grad_norm": 0.8088155388832092, "learning_rate": 9.209804902451226e-05, "loss": 0.3871, "step": 7950 }, { "epoch": 134.6, "grad_norm": 1.0074987411499023, "learning_rate": 9.207303651825913e-05, "loss": 0.3897, "step": 7975 }, { "epoch": 135.02, "grad_norm": 0.960695207118988, "learning_rate": 9.2048024012006e-05, "loss": 0.3853, "step": 8000 }, { "epoch": 135.44, "grad_norm": 0.9010441899299622, "learning_rate": 9.202301150575288e-05, "loss": 0.3856, "step": 8025 }, { "epoch": 135.86, "grad_norm": 1.3599485158920288, "learning_rate": 9.199799899949976e-05, "loss": 0.3859, "step": 8050 }, { "epoch": 136.29, "grad_norm": 0.9082503318786621, "learning_rate": 9.197298649324662e-05, "loss": 0.383, "step": 8075 }, { "epoch": 136.71, "grad_norm": 2.013598680496216, "learning_rate": 9.19479739869935e-05, "loss": 0.388, "step": 8100 }, { "epoch": 137.13, "grad_norm": 1.3489696979522705, "learning_rate": 9.192296148074038e-05, "loss": 0.3904, "step": 8125 }, { "epoch": 137.55, "grad_norm": 1.196683406829834, "learning_rate": 9.189794897448725e-05, "loss": 0.3892, "step": 8150 }, { "epoch": 137.97, "grad_norm": 1.0592209100723267, "learning_rate": 9.187293646823411e-05, "loss": 0.385, "step": 8175 }, { "epoch": 138.4, "grad_norm": 1.1428028345108032, "learning_rate": 9.1847923961981e-05, "loss": 0.3857, "step": 8200 }, { "epoch": 138.82, "grad_norm": 0.6936319470405579, "learning_rate": 9.182291145572787e-05, "loss": 0.3882, "step": 8225 }, { "epoch": 139.24, "grad_norm": 1.6234769821166992, "learning_rate": 9.179789894947473e-05, "loss": 0.3826, "step": 8250 }, { "epoch": 139.66, "grad_norm": 1.502873182296753, "learning_rate": 9.177288644322162e-05, "loss": 0.3857, "step": 8275 }, { "epoch": 140.08, "grad_norm": 1.4641088247299194, "learning_rate": 9.174787393696849e-05, "loss": 0.3843, "step": 8300 }, { "epoch": 140.51, "grad_norm": 0.9584894776344299, "learning_rate": 9.172286143071536e-05, "loss": 0.387, "step": 8325 }, { "epoch": 140.93, "grad_norm": 0.9996629357337952, "learning_rate": 9.169784892446223e-05, "loss": 0.3844, "step": 8350 }, { "epoch": 141.35, "grad_norm": 1.0110065937042236, "learning_rate": 9.167283641820911e-05, "loss": 0.3847, "step": 8375 }, { "epoch": 141.77, "grad_norm": 1.1155205965042114, "learning_rate": 9.164782391195599e-05, "loss": 0.3855, "step": 8400 }, { "epoch": 142.19, "grad_norm": 1.196765422821045, "learning_rate": 9.162281140570285e-05, "loss": 0.385, "step": 8425 }, { "epoch": 142.62, "grad_norm": 0.8360599279403687, "learning_rate": 9.159779889944972e-05, "loss": 0.3861, "step": 8450 }, { "epoch": 143.04, "grad_norm": 0.9518325328826904, "learning_rate": 9.157278639319661e-05, "loss": 0.3856, "step": 8475 }, { "epoch": 143.46, "grad_norm": 0.7514486908912659, "learning_rate": 9.154777388694348e-05, "loss": 0.3852, "step": 8500 }, { "epoch": 143.88, "grad_norm": 0.8790159821510315, "learning_rate": 9.152276138069034e-05, "loss": 0.3894, "step": 8525 }, { "epoch": 144.3, "grad_norm": 1.1827722787857056, "learning_rate": 9.149774887443723e-05, "loss": 0.3866, "step": 8550 }, { "epoch": 144.73, "grad_norm": 1.1853820085525513, "learning_rate": 9.14727363681841e-05, "loss": 0.386, "step": 8575 }, { "epoch": 145.15, "grad_norm": 0.9088813066482544, "learning_rate": 9.144772386193096e-05, "loss": 0.3871, "step": 8600 }, { "epoch": 145.57, "grad_norm": 1.097094178199768, "learning_rate": 9.142271135567785e-05, "loss": 
0.3843, "step": 8625 }, { "epoch": 145.99, "grad_norm": 1.0891205072402954, "learning_rate": 9.139769884942472e-05, "loss": 0.3868, "step": 8650 }, { "epoch": 146.41, "grad_norm": 0.8463541269302368, "learning_rate": 9.137268634317159e-05, "loss": 0.3813, "step": 8675 }, { "epoch": 146.84, "grad_norm": 1.317942500114441, "learning_rate": 9.134767383691846e-05, "loss": 0.3848, "step": 8700 }, { "epoch": 147.26, "grad_norm": 1.430842638015747, "learning_rate": 9.132266133066534e-05, "loss": 0.3828, "step": 8725 }, { "epoch": 147.68, "grad_norm": 1.0514400005340576, "learning_rate": 9.129764882441221e-05, "loss": 0.3844, "step": 8750 }, { "epoch": 148.1, "grad_norm": 1.0628875494003296, "learning_rate": 9.127263631815908e-05, "loss": 0.3844, "step": 8775 }, { "epoch": 148.52, "grad_norm": 2.214803695678711, "learning_rate": 9.124762381190596e-05, "loss": 0.3821, "step": 8800 }, { "epoch": 148.95, "grad_norm": 2.360417127609253, "learning_rate": 9.122361180590295e-05, "loss": 0.3872, "step": 8825 }, { "epoch": 149.37, "grad_norm": 1.1153593063354492, "learning_rate": 9.119859929964984e-05, "loss": 0.3823, "step": 8850 }, { "epoch": 149.79, "grad_norm": 0.8461281657218933, "learning_rate": 9.11735867933967e-05, "loss": 0.3822, "step": 8875 }, { "epoch": 150.21, "grad_norm": 1.3187837600708008, "learning_rate": 9.114857428714357e-05, "loss": 0.3837, "step": 8900 }, { "epoch": 150.63, "grad_norm": 0.8668797016143799, "learning_rate": 9.112356178089046e-05, "loss": 0.3846, "step": 8925 }, { "epoch": 151.05, "grad_norm": 1.3176252841949463, "learning_rate": 9.109854927463732e-05, "loss": 0.3793, "step": 8950 }, { "epoch": 151.48, "grad_norm": 0.8691707849502563, "learning_rate": 9.107353676838419e-05, "loss": 0.3811, "step": 8975 }, { "epoch": 151.9, "grad_norm": 1.6451836824417114, "learning_rate": 9.104852426213108e-05, "loss": 0.3857, "step": 9000 }, { "epoch": 152.32, "grad_norm": 0.8959950804710388, "learning_rate": 9.102351175587795e-05, "loss": 0.381, "step": 9025 }, { "epoch": 152.74, "grad_norm": 1.2829391956329346, "learning_rate": 9.099849924962481e-05, "loss": 0.3824, "step": 9050 }, { "epoch": 153.16, "grad_norm": 1.238585352897644, "learning_rate": 9.09734867433717e-05, "loss": 0.3872, "step": 9075 }, { "epoch": 153.59, "grad_norm": 1.5655521154403687, "learning_rate": 9.094847423711857e-05, "loss": 0.3807, "step": 9100 }, { "epoch": 154.01, "grad_norm": 1.2101978063583374, "learning_rate": 9.092346173086544e-05, "loss": 0.383, "step": 9125 }, { "epoch": 154.43, "grad_norm": 1.039608359336853, "learning_rate": 9.089844922461231e-05, "loss": 0.3816, "step": 9150 }, { "epoch": 154.85, "grad_norm": 0.8065314888954163, "learning_rate": 9.087343671835918e-05, "loss": 0.379, "step": 9175 }, { "epoch": 155.27, "grad_norm": 0.9454945921897888, "learning_rate": 9.084842421210606e-05, "loss": 0.3767, "step": 9200 }, { "epoch": 155.7, "grad_norm": 2.018127679824829, "learning_rate": 9.082341170585293e-05, "loss": 0.3825, "step": 9225 }, { "epoch": 156.12, "grad_norm": 0.7749385833740234, "learning_rate": 9.07983991995998e-05, "loss": 0.3808, "step": 9250 }, { "epoch": 156.54, "grad_norm": 0.8578135371208191, "learning_rate": 9.077338669334668e-05, "loss": 0.3813, "step": 9275 }, { "epoch": 156.96, "grad_norm": 0.7724294662475586, "learning_rate": 9.074837418709355e-05, "loss": 0.3804, "step": 9300 }, { "epoch": 157.38, "grad_norm": 1.237192153930664, "learning_rate": 9.072336168084042e-05, "loss": 0.3802, "step": 9325 }, { "epoch": 157.81, "grad_norm": 2.21505069732666, "learning_rate": 
9.069834917458729e-05, "loss": 0.3806, "step": 9350 }, { "epoch": 158.23, "grad_norm": 1.3279848098754883, "learning_rate": 9.067333666833418e-05, "loss": 0.3794, "step": 9375 }, { "epoch": 158.65, "grad_norm": 1.0339349508285522, "learning_rate": 9.064832416208104e-05, "loss": 0.3802, "step": 9400 }, { "epoch": 159.07, "grad_norm": 1.6238765716552734, "learning_rate": 9.062331165582791e-05, "loss": 0.3805, "step": 9425 }, { "epoch": 159.49, "grad_norm": 0.9215278029441833, "learning_rate": 9.05982991495748e-05, "loss": 0.3791, "step": 9450 }, { "epoch": 159.92, "grad_norm": 1.5440635681152344, "learning_rate": 9.057328664332167e-05, "loss": 0.384, "step": 9475 }, { "epoch": 160.34, "grad_norm": 1.3809701204299927, "learning_rate": 9.054827413706853e-05, "loss": 0.3776, "step": 9500 }, { "epoch": 160.76, "grad_norm": 1.1021963357925415, "learning_rate": 9.052326163081542e-05, "loss": 0.3824, "step": 9525 }, { "epoch": 161.18, "grad_norm": 1.5466406345367432, "learning_rate": 9.049824912456229e-05, "loss": 0.3793, "step": 9550 }, { "epoch": 161.6, "grad_norm": 1.3061929941177368, "learning_rate": 9.047323661830916e-05, "loss": 0.3789, "step": 9575 }, { "epoch": 162.03, "grad_norm": 1.1963791847229004, "learning_rate": 9.044822411205603e-05, "loss": 0.3819, "step": 9600 }, { "epoch": 162.45, "grad_norm": 0.8358122110366821, "learning_rate": 9.04232116058029e-05, "loss": 0.3781, "step": 9625 }, { "epoch": 162.87, "grad_norm": 1.0061856508255005, "learning_rate": 9.039819909954978e-05, "loss": 0.3813, "step": 9650 }, { "epoch": 163.29, "grad_norm": 1.0273261070251465, "learning_rate": 9.037418709354678e-05, "loss": 0.3797, "step": 9675 }, { "epoch": 163.71, "grad_norm": 0.7963196039199829, "learning_rate": 9.034917458729365e-05, "loss": 0.3793, "step": 9700 }, { "epoch": 164.14, "grad_norm": 1.308881163597107, "learning_rate": 9.032416208104052e-05, "loss": 0.381, "step": 9725 }, { "epoch": 164.56, "grad_norm": 1.5577151775360107, "learning_rate": 9.02991495747874e-05, "loss": 0.3828, "step": 9750 }, { "epoch": 164.98, "grad_norm": 0.9121168851852417, "learning_rate": 9.027413706853427e-05, "loss": 0.3785, "step": 9775 }, { "epoch": 165.4, "grad_norm": 1.0625081062316895, "learning_rate": 9.024912456228114e-05, "loss": 0.3814, "step": 9800 }, { "epoch": 165.82, "grad_norm": 1.0669788122177124, "learning_rate": 9.022411205602803e-05, "loss": 0.3832, "step": 9825 }, { "epoch": 166.24, "grad_norm": 1.0348986387252808, "learning_rate": 9.019909954977489e-05, "loss": 0.3821, "step": 9850 }, { "epoch": 166.67, "grad_norm": 0.8561428785324097, "learning_rate": 9.017408704352176e-05, "loss": 0.3793, "step": 9875 }, { "epoch": 167.09, "grad_norm": 3.8151609897613525, "learning_rate": 9.014907453726864e-05, "loss": 0.3766, "step": 9900 }, { "epoch": 167.51, "grad_norm": 1.3710705041885376, "learning_rate": 9.012406203101552e-05, "loss": 0.3767, "step": 9925 }, { "epoch": 167.93, "grad_norm": 1.187401533126831, "learning_rate": 9.009904952476238e-05, "loss": 0.3763, "step": 9950 }, { "epoch": 168.35, "grad_norm": 1.0656594038009644, "learning_rate": 9.007403701850926e-05, "loss": 0.3804, "step": 9975 }, { "epoch": 168.78, "grad_norm": 1.5865756273269653, "learning_rate": 9.004902451225613e-05, "loss": 0.3779, "step": 10000 }, { "epoch": 168.78, "eval_loss": 0.4468539357185364, "eval_runtime": 4.3513, "eval_samples_per_second": 71.014, "eval_steps_per_second": 2.298, "step": 10000 }, { "epoch": 169.2, "grad_norm": 1.7549549341201782, "learning_rate": 9.002401200600301e-05, "loss": 0.3846, "step": 10025 }, 
{ "epoch": 169.62, "grad_norm": 2.4609670639038086, "learning_rate": 8.999899949974988e-05, "loss": 0.3808, "step": 10050 }, { "epoch": 170.04, "grad_norm": 3.1165332794189453, "learning_rate": 8.997398699349675e-05, "loss": 0.3764, "step": 10075 }, { "epoch": 170.46, "grad_norm": 1.539887547492981, "learning_rate": 8.994897448724363e-05, "loss": 0.376, "step": 10100 }, { "epoch": 170.89, "grad_norm": 1.0826008319854736, "learning_rate": 8.99239619809905e-05, "loss": 0.3775, "step": 10125 }, { "epoch": 171.31, "grad_norm": 1.8559746742248535, "learning_rate": 8.989894947473737e-05, "loss": 0.3765, "step": 10150 }, { "epoch": 171.73, "grad_norm": 1.00070321559906, "learning_rate": 8.987393696848426e-05, "loss": 0.3797, "step": 10175 }, { "epoch": 172.15, "grad_norm": 0.9511860013008118, "learning_rate": 8.984892446223112e-05, "loss": 0.376, "step": 10200 }, { "epoch": 172.57, "grad_norm": 0.7921368479728699, "learning_rate": 8.982391195597799e-05, "loss": 0.3782, "step": 10225 }, { "epoch": 173.0, "grad_norm": 0.8468682169914246, "learning_rate": 8.979889944972487e-05, "loss": 0.381, "step": 10250 }, { "epoch": 173.42, "grad_norm": 1.39848792552948, "learning_rate": 8.977388694347175e-05, "loss": 0.3805, "step": 10275 }, { "epoch": 173.84, "grad_norm": 0.8561111092567444, "learning_rate": 8.97488744372186e-05, "loss": 0.3769, "step": 10300 }, { "epoch": 174.26, "grad_norm": 0.9401602149009705, "learning_rate": 8.972386193096549e-05, "loss": 0.377, "step": 10325 }, { "epoch": 174.68, "grad_norm": 0.9716879725456238, "learning_rate": 8.969884942471237e-05, "loss": 0.3802, "step": 10350 }, { "epoch": 175.11, "grad_norm": 0.8382692933082581, "learning_rate": 8.967383691845922e-05, "loss": 0.3809, "step": 10375 }, { "epoch": 175.53, "grad_norm": 1.2170737981796265, "learning_rate": 8.964882441220611e-05, "loss": 0.3755, "step": 10400 }, { "epoch": 175.95, "grad_norm": 1.0501155853271484, "learning_rate": 8.962381190595298e-05, "loss": 0.3779, "step": 10425 }, { "epoch": 176.37, "grad_norm": 0.9492490291595459, "learning_rate": 8.959879939969986e-05, "loss": 0.3742, "step": 10450 }, { "epoch": 176.79, "grad_norm": 1.2152857780456543, "learning_rate": 8.957378689344673e-05, "loss": 0.3768, "step": 10475 }, { "epoch": 177.22, "grad_norm": 0.9907167553901672, "learning_rate": 8.95487743871936e-05, "loss": 0.3766, "step": 10500 }, { "epoch": 177.64, "grad_norm": 2.8770523071289062, "learning_rate": 8.952376188094047e-05, "loss": 0.3813, "step": 10525 }, { "epoch": 178.06, "grad_norm": 1.811869740486145, "learning_rate": 8.949874937468735e-05, "loss": 0.3765, "step": 10550 }, { "epoch": 178.48, "grad_norm": 1.5837658643722534, "learning_rate": 8.947373686843422e-05, "loss": 0.3788, "step": 10575 }, { "epoch": 178.9, "grad_norm": 1.495154857635498, "learning_rate": 8.944872436218109e-05, "loss": 0.3821, "step": 10600 }, { "epoch": 179.32, "grad_norm": 1.0738645792007446, "learning_rate": 8.942371185592798e-05, "loss": 0.3819, "step": 10625 }, { "epoch": 179.75, "grad_norm": 0.9852315187454224, "learning_rate": 8.939869934967484e-05, "loss": 0.3779, "step": 10650 }, { "epoch": 180.17, "grad_norm": 1.1253306865692139, "learning_rate": 8.937368684342171e-05, "loss": 0.3741, "step": 10675 }, { "epoch": 180.59, "grad_norm": 1.330562949180603, "learning_rate": 8.93486743371686e-05, "loss": 0.3771, "step": 10700 }, { "epoch": 181.01, "grad_norm": 2.380424976348877, "learning_rate": 8.932366183091546e-05, "loss": 0.3741, "step": 10725 }, { "epoch": 181.43, "grad_norm": 1.1842645406723022, "learning_rate": 
8.929864932466233e-05, "loss": 0.373, "step": 10750 }, { "epoch": 181.86, "grad_norm": 1.518761157989502, "learning_rate": 8.927363681840921e-05, "loss": 0.376, "step": 10775 }, { "epoch": 182.28, "grad_norm": 1.5670444965362549, "learning_rate": 8.924862431215609e-05, "loss": 0.3747, "step": 10800 }, { "epoch": 182.7, "grad_norm": 2.5558667182922363, "learning_rate": 8.922361180590295e-05, "loss": 0.3769, "step": 10825 }, { "epoch": 183.12, "grad_norm": 1.4926173686981201, "learning_rate": 8.919859929964983e-05, "loss": 0.3766, "step": 10850 }, { "epoch": 183.54, "grad_norm": 1.756583571434021, "learning_rate": 8.91735867933967e-05, "loss": 0.3745, "step": 10875 }, { "epoch": 183.97, "grad_norm": 3.2641735076904297, "learning_rate": 8.914857428714358e-05, "loss": 0.38, "step": 10900 }, { "epoch": 184.39, "grad_norm": 1.3039026260375977, "learning_rate": 8.912356178089045e-05, "loss": 0.3767, "step": 10925 }, { "epoch": 184.81, "grad_norm": 1.1352310180664062, "learning_rate": 8.909854927463732e-05, "loss": 0.377, "step": 10950 }, { "epoch": 185.23, "grad_norm": 0.62491375207901, "learning_rate": 8.90735367683842e-05, "loss": 0.3784, "step": 10975 }, { "epoch": 185.65, "grad_norm": 1.9646579027175903, "learning_rate": 8.904852426213107e-05, "loss": 0.3745, "step": 11000 }, { "epoch": 186.08, "grad_norm": 0.8279591798782349, "learning_rate": 8.902351175587794e-05, "loss": 0.3718, "step": 11025 }, { "epoch": 186.5, "grad_norm": 1.1819419860839844, "learning_rate": 8.899849924962481e-05, "loss": 0.3756, "step": 11050 }, { "epoch": 186.92, "grad_norm": 1.0372825860977173, "learning_rate": 8.897348674337169e-05, "loss": 0.3775, "step": 11075 }, { "epoch": 187.34, "grad_norm": 1.5705856084823608, "learning_rate": 8.894847423711856e-05, "loss": 0.3785, "step": 11100 }, { "epoch": 187.76, "grad_norm": 0.8573264479637146, "learning_rate": 8.892346173086543e-05, "loss": 0.3792, "step": 11125 }, { "epoch": 188.19, "grad_norm": 0.8272411823272705, "learning_rate": 8.889844922461232e-05, "loss": 0.3726, "step": 11150 }, { "epoch": 188.61, "grad_norm": 0.846712589263916, "learning_rate": 8.887343671835918e-05, "loss": 0.3773, "step": 11175 }, { "epoch": 189.03, "grad_norm": 0.7559719681739807, "learning_rate": 8.884842421210606e-05, "loss": 0.3794, "step": 11200 }, { "epoch": 189.45, "grad_norm": 0.9480248093605042, "learning_rate": 8.882341170585294e-05, "loss": 0.3737, "step": 11225 }, { "epoch": 189.87, "grad_norm": 1.0978518724441528, "learning_rate": 8.879839919959981e-05, "loss": 0.3732, "step": 11250 }, { "epoch": 190.3, "grad_norm": 1.04538893699646, "learning_rate": 8.877338669334668e-05, "loss": 0.3744, "step": 11275 }, { "epoch": 190.72, "grad_norm": 2.1915366649627686, "learning_rate": 8.874837418709355e-05, "loss": 0.3741, "step": 11300 }, { "epoch": 191.14, "grad_norm": 1.2433264255523682, "learning_rate": 8.872336168084043e-05, "loss": 0.3736, "step": 11325 }, { "epoch": 191.56, "grad_norm": 1.2210297584533691, "learning_rate": 8.86983491745873e-05, "loss": 0.3744, "step": 11350 }, { "epoch": 191.98, "grad_norm": 0.8658751845359802, "learning_rate": 8.867333666833417e-05, "loss": 0.3733, "step": 11375 }, { "epoch": 192.41, "grad_norm": 0.7908458113670349, "learning_rate": 8.864832416208104e-05, "loss": 0.3759, "step": 11400 }, { "epoch": 192.83, "grad_norm": 0.9013408422470093, "learning_rate": 8.862331165582792e-05, "loss": 0.3736, "step": 11425 }, { "epoch": 193.25, "grad_norm": 1.28169846534729, "learning_rate": 8.859829914957479e-05, "loss": 0.3756, "step": 11450 }, { "epoch": 193.67, 
"grad_norm": 1.1181119680404663, "learning_rate": 8.857328664332166e-05, "loss": 0.3771, "step": 11475 }, { "epoch": 194.09, "grad_norm": 0.9982534050941467, "learning_rate": 8.854827413706855e-05, "loss": 0.3722, "step": 11500 }, { "epoch": 194.51, "grad_norm": 1.7999745607376099, "learning_rate": 8.852326163081541e-05, "loss": 0.373, "step": 11525 }, { "epoch": 194.94, "grad_norm": 0.8343027830123901, "learning_rate": 8.849824912456228e-05, "loss": 0.3749, "step": 11550 }, { "epoch": 195.36, "grad_norm": 0.980638861656189, "learning_rate": 8.847323661830917e-05, "loss": 0.3797, "step": 11575 }, { "epoch": 195.78, "grad_norm": 0.8745217323303223, "learning_rate": 8.844822411205604e-05, "loss": 0.3722, "step": 11600 }, { "epoch": 196.2, "grad_norm": 1.0931674242019653, "learning_rate": 8.84232116058029e-05, "loss": 0.3733, "step": 11625 }, { "epoch": 196.62, "grad_norm": 1.0612292289733887, "learning_rate": 8.839819909954978e-05, "loss": 0.3744, "step": 11650 }, { "epoch": 197.05, "grad_norm": 1.3062249422073364, "learning_rate": 8.837318659329666e-05, "loss": 0.3747, "step": 11675 }, { "epoch": 197.47, "grad_norm": 2.0694363117218018, "learning_rate": 8.834817408704353e-05, "loss": 0.3718, "step": 11700 }, { "epoch": 197.89, "grad_norm": 1.778304934501648, "learning_rate": 8.83231615807904e-05, "loss": 0.3749, "step": 11725 }, { "epoch": 198.31, "grad_norm": 0.7739594578742981, "learning_rate": 8.829814907453727e-05, "loss": 0.373, "step": 11750 }, { "epoch": 198.73, "grad_norm": 1.190172553062439, "learning_rate": 8.827313656828415e-05, "loss": 0.3755, "step": 11775 }, { "epoch": 199.16, "grad_norm": 0.8796009421348572, "learning_rate": 8.824812406203102e-05, "loss": 0.3725, "step": 11800 }, { "epoch": 199.58, "grad_norm": 0.7832244038581848, "learning_rate": 8.822311155577789e-05, "loss": 0.3736, "step": 11825 }, { "epoch": 200.0, "grad_norm": 0.9426383376121521, "learning_rate": 8.819809904952477e-05, "loss": 0.3751, "step": 11850 }, { "epoch": 200.42, "grad_norm": 1.008852481842041, "learning_rate": 8.817308654327164e-05, "loss": 0.3728, "step": 11875 }, { "epoch": 200.84, "grad_norm": 1.148977279663086, "learning_rate": 8.814807403701851e-05, "loss": 0.3711, "step": 11900 }, { "epoch": 201.27, "grad_norm": 0.7076432108879089, "learning_rate": 8.812306153076538e-05, "loss": 0.372, "step": 11925 }, { "epoch": 201.69, "grad_norm": 0.8244609832763672, "learning_rate": 8.809804902451227e-05, "loss": 0.3714, "step": 11950 }, { "epoch": 202.11, "grad_norm": 0.737458348274231, "learning_rate": 8.807303651825913e-05, "loss": 0.3729, "step": 11975 }, { "epoch": 202.53, "grad_norm": 0.9494845271110535, "learning_rate": 8.8048024012006e-05, "loss": 0.3742, "step": 12000 }, { "epoch": 202.95, "grad_norm": 2.04544997215271, "learning_rate": 8.802301150575289e-05, "loss": 0.372, "step": 12025 }, { "epoch": 203.38, "grad_norm": 1.140149474143982, "learning_rate": 8.799799899949976e-05, "loss": 0.3753, "step": 12050 }, { "epoch": 203.8, "grad_norm": 1.2513355016708374, "learning_rate": 8.797298649324662e-05, "loss": 0.3714, "step": 12075 }, { "epoch": 204.22, "grad_norm": 1.0195680856704712, "learning_rate": 8.79479739869935e-05, "loss": 0.3748, "step": 12100 }, { "epoch": 204.64, "grad_norm": 0.8244719505310059, "learning_rate": 8.792296148074038e-05, "loss": 0.3773, "step": 12125 }, { "epoch": 205.06, "grad_norm": 0.827566921710968, "learning_rate": 8.789794897448724e-05, "loss": 0.3754, "step": 12150 }, { "epoch": 205.49, "grad_norm": 0.829836368560791, "learning_rate": 8.787293646823412e-05, 
"loss": 0.3771, "step": 12175 }, { "epoch": 205.91, "grad_norm": 0.887334406375885, "learning_rate": 8.7847923961981e-05, "loss": 0.3725, "step": 12200 }, { "epoch": 206.33, "grad_norm": 1.1143351793289185, "learning_rate": 8.782291145572787e-05, "loss": 0.3731, "step": 12225 }, { "epoch": 206.75, "grad_norm": 0.9924348592758179, "learning_rate": 8.779789894947474e-05, "loss": 0.3777, "step": 12250 }, { "epoch": 207.17, "grad_norm": 1.2308379411697388, "learning_rate": 8.777288644322161e-05, "loss": 0.3723, "step": 12275 }, { "epoch": 207.59, "grad_norm": 1.3781524896621704, "learning_rate": 8.774787393696849e-05, "loss": 0.368, "step": 12300 }, { "epoch": 208.02, "grad_norm": 1.0266764163970947, "learning_rate": 8.772286143071536e-05, "loss": 0.371, "step": 12325 }, { "epoch": 208.44, "grad_norm": 0.7057740092277527, "learning_rate": 8.769784892446223e-05, "loss": 0.3698, "step": 12350 }, { "epoch": 208.86, "grad_norm": 0.7625869512557983, "learning_rate": 8.76728364182091e-05, "loss": 0.371, "step": 12375 }, { "epoch": 209.28, "grad_norm": 2.274927854537964, "learning_rate": 8.764782391195599e-05, "loss": 0.3717, "step": 12400 }, { "epoch": 209.7, "grad_norm": 2.412778854370117, "learning_rate": 8.762281140570285e-05, "loss": 0.3783, "step": 12425 }, { "epoch": 210.13, "grad_norm": 0.6713507175445557, "learning_rate": 8.759779889944974e-05, "loss": 0.3739, "step": 12450 }, { "epoch": 210.55, "grad_norm": 0.9966689348220825, "learning_rate": 8.757278639319661e-05, "loss": 0.3715, "step": 12475 }, { "epoch": 210.97, "grad_norm": 1.130442500114441, "learning_rate": 8.754777388694347e-05, "loss": 0.3733, "step": 12500 }, { "epoch": 211.39, "grad_norm": 1.5019795894622803, "learning_rate": 8.752276138069035e-05, "loss": 0.372, "step": 12525 }, { "epoch": 211.81, "grad_norm": 1.0977762937545776, "learning_rate": 8.749774887443723e-05, "loss": 0.3707, "step": 12550 }, { "epoch": 212.24, "grad_norm": 1.0271332263946533, "learning_rate": 8.74727363681841e-05, "loss": 0.3749, "step": 12575 }, { "epoch": 212.66, "grad_norm": 0.9220431447029114, "learning_rate": 8.744772386193097e-05, "loss": 0.3671, "step": 12600 }, { "epoch": 213.08, "grad_norm": 1.0413817167282104, "learning_rate": 8.742271135567784e-05, "loss": 0.3736, "step": 12625 }, { "epoch": 213.5, "grad_norm": 1.3156681060791016, "learning_rate": 8.739769884942472e-05, "loss": 0.3707, "step": 12650 }, { "epoch": 213.92, "grad_norm": 0.8264446258544922, "learning_rate": 8.737268634317159e-05, "loss": 0.3702, "step": 12675 }, { "epoch": 214.35, "grad_norm": 0.8842846751213074, "learning_rate": 8.734767383691846e-05, "loss": 0.3719, "step": 12700 }, { "epoch": 214.77, "grad_norm": 2.434103488922119, "learning_rate": 8.732266133066534e-05, "loss": 0.3722, "step": 12725 }, { "epoch": 215.19, "grad_norm": 1.3447299003601074, "learning_rate": 8.729764882441222e-05, "loss": 0.3673, "step": 12750 }, { "epoch": 215.61, "grad_norm": 0.9100630879402161, "learning_rate": 8.727263631815908e-05, "loss": 0.3684, "step": 12775 }, { "epoch": 216.03, "grad_norm": 1.509015679359436, "learning_rate": 8.724762381190595e-05, "loss": 0.3696, "step": 12800 }, { "epoch": 216.46, "grad_norm": 1.3275097608566284, "learning_rate": 8.722261130565284e-05, "loss": 0.3695, "step": 12825 }, { "epoch": 216.88, "grad_norm": 1.2652827501296997, "learning_rate": 8.71975987993997e-05, "loss": 0.3702, "step": 12850 }, { "epoch": 217.3, "grad_norm": 0.9624780416488647, "learning_rate": 8.717258629314657e-05, "loss": 0.3755, "step": 12875 }, { "epoch": 217.72, "grad_norm": 
0.6027135252952576, "learning_rate": 8.714757378689346e-05, "loss": 0.3715, "step": 12900 }, { "epoch": 218.14, "grad_norm": 1.5112547874450684, "learning_rate": 8.712256128064033e-05, "loss": 0.3723, "step": 12925 }, { "epoch": 218.57, "grad_norm": 1.403754711151123, "learning_rate": 8.709754877438719e-05, "loss": 0.3691, "step": 12950 }, { "epoch": 218.99, "grad_norm": 1.3901664018630981, "learning_rate": 8.707253626813408e-05, "loss": 0.3695, "step": 12975 }, { "epoch": 219.41, "grad_norm": 2.0888004302978516, "learning_rate": 8.704752376188095e-05, "loss": 0.3698, "step": 13000 }, { "epoch": 219.83, "grad_norm": 1.167575478553772, "learning_rate": 8.702251125562782e-05, "loss": 0.3707, "step": 13025 }, { "epoch": 220.25, "grad_norm": 0.8433003425598145, "learning_rate": 8.699749874937469e-05, "loss": 0.3689, "step": 13050 }, { "epoch": 220.68, "grad_norm": 1.0176352262496948, "learning_rate": 8.697248624312157e-05, "loss": 0.3669, "step": 13075 }, { "epoch": 221.1, "grad_norm": 1.1683886051177979, "learning_rate": 8.694747373686844e-05, "loss": 0.3706, "step": 13100 }, { "epoch": 221.52, "grad_norm": 1.1130783557891846, "learning_rate": 8.692246123061531e-05, "loss": 0.3754, "step": 13125 }, { "epoch": 221.94, "grad_norm": 0.834837794303894, "learning_rate": 8.689744872436218e-05, "loss": 0.3678, "step": 13150 }, { "epoch": 222.36, "grad_norm": 0.7506586909294128, "learning_rate": 8.687243621810906e-05, "loss": 0.3707, "step": 13175 }, { "epoch": 222.78, "grad_norm": 1.8632234334945679, "learning_rate": 8.684742371185593e-05, "loss": 0.3704, "step": 13200 }, { "epoch": 223.21, "grad_norm": 0.9851394295692444, "learning_rate": 8.68224112056028e-05, "loss": 0.3714, "step": 13225 }, { "epoch": 223.63, "grad_norm": 1.0051536560058594, "learning_rate": 8.679739869934967e-05, "loss": 0.3671, "step": 13250 }, { "epoch": 224.05, "grad_norm": 1.5273579359054565, "learning_rate": 8.677238619309656e-05, "loss": 0.3688, "step": 13275 }, { "epoch": 224.47, "grad_norm": 1.0769370794296265, "learning_rate": 8.674737368684342e-05, "loss": 0.3711, "step": 13300 }, { "epoch": 224.89, "grad_norm": 0.9148847460746765, "learning_rate": 8.672236118059029e-05, "loss": 0.3671, "step": 13325 }, { "epoch": 225.32, "grad_norm": 1.974008321762085, "learning_rate": 8.669734867433718e-05, "loss": 0.3668, "step": 13350 }, { "epoch": 225.74, "grad_norm": 0.9108849763870239, "learning_rate": 8.667233616808405e-05, "loss": 0.3691, "step": 13375 }, { "epoch": 226.16, "grad_norm": 0.8851766586303711, "learning_rate": 8.664732366183091e-05, "loss": 0.3672, "step": 13400 }, { "epoch": 226.58, "grad_norm": 1.6407252550125122, "learning_rate": 8.66223111555778e-05, "loss": 0.3689, "step": 13425 }, { "epoch": 227.0, "grad_norm": 0.8674168586730957, "learning_rate": 8.659729864932467e-05, "loss": 0.3685, "step": 13450 }, { "epoch": 227.43, "grad_norm": 0.8802807927131653, "learning_rate": 8.657228614307153e-05, "loss": 0.3671, "step": 13475 }, { "epoch": 227.85, "grad_norm": 1.4781070947647095, "learning_rate": 8.654727363681841e-05, "loss": 0.3661, "step": 13500 }, { "epoch": 228.27, "grad_norm": 0.9886232018470764, "learning_rate": 8.652226113056529e-05, "loss": 0.3709, "step": 13525 }, { "epoch": 228.69, "grad_norm": 0.8588663935661316, "learning_rate": 8.649724862431216e-05, "loss": 0.37, "step": 13550 }, { "epoch": 229.11, "grad_norm": 0.9968315362930298, "learning_rate": 8.647223611805903e-05, "loss": 0.367, "step": 13575 }, { "epoch": 229.54, "grad_norm": 0.9380239844322205, "learning_rate": 8.64472236118059e-05, 
"loss": 0.3669, "step": 13600 }, { "epoch": 229.96, "grad_norm": 0.8430926203727722, "learning_rate": 8.642221110555278e-05, "loss": 0.3664, "step": 13625 }, { "epoch": 230.38, "grad_norm": 1.6085623502731323, "learning_rate": 8.639719859929965e-05, "loss": 0.3692, "step": 13650 }, { "epoch": 230.8, "grad_norm": 1.3879194259643555, "learning_rate": 8.637218609304652e-05, "loss": 0.3688, "step": 13675 }, { "epoch": 231.22, "grad_norm": 0.8363910913467407, "learning_rate": 8.634717358679341e-05, "loss": 0.3659, "step": 13700 }, { "epoch": 231.65, "grad_norm": 1.0508005619049072, "learning_rate": 8.632216108054028e-05, "loss": 0.3672, "step": 13725 }, { "epoch": 232.07, "grad_norm": 0.9068616032600403, "learning_rate": 8.629714857428714e-05, "loss": 0.3681, "step": 13750 }, { "epoch": 232.49, "grad_norm": 1.0732817649841309, "learning_rate": 8.627213606803403e-05, "loss": 0.3661, "step": 13775 }, { "epoch": 232.91, "grad_norm": 1.0774346590042114, "learning_rate": 8.62471235617809e-05, "loss": 0.3684, "step": 13800 }, { "epoch": 233.33, "grad_norm": 0.7777401208877563, "learning_rate": 8.622211105552776e-05, "loss": 0.367, "step": 13825 }, { "epoch": 233.76, "grad_norm": 1.0916180610656738, "learning_rate": 8.619709854927465e-05, "loss": 0.3685, "step": 13850 }, { "epoch": 234.18, "grad_norm": 0.8169525861740112, "learning_rate": 8.617208604302152e-05, "loss": 0.3705, "step": 13875 }, { "epoch": 234.6, "grad_norm": 1.6053544282913208, "learning_rate": 8.614707353676839e-05, "loss": 0.3682, "step": 13900 }, { "epoch": 235.02, "grad_norm": 0.9953093528747559, "learning_rate": 8.612206103051526e-05, "loss": 0.368, "step": 13925 }, { "epoch": 235.44, "grad_norm": 0.9046871066093445, "learning_rate": 8.609804902451226e-05, "loss": 0.3681, "step": 13950 }, { "epoch": 235.86, "grad_norm": 0.9001895785331726, "learning_rate": 8.607303651825913e-05, "loss": 0.368, "step": 13975 }, { "epoch": 236.29, "grad_norm": 1.1987099647521973, "learning_rate": 8.6048024012006e-05, "loss": 0.3686, "step": 14000 }, { "epoch": 236.71, "grad_norm": 1.4472137689590454, "learning_rate": 8.602301150575288e-05, "loss": 0.3679, "step": 14025 }, { "epoch": 237.13, "grad_norm": 1.6880950927734375, "learning_rate": 8.599799899949975e-05, "loss": 0.3682, "step": 14050 }, { "epoch": 237.55, "grad_norm": Infinity, "learning_rate": 8.597398699349676e-05, "loss": 0.3706, "step": 14075 }, { "epoch": 237.97, "grad_norm": 1.2722959518432617, "learning_rate": 8.594897448724362e-05, "loss": 0.3663, "step": 14100 }, { "epoch": 238.4, "grad_norm": 0.9457665681838989, "learning_rate": 8.59239619809905e-05, "loss": 0.3654, "step": 14125 }, { "epoch": 238.82, "grad_norm": 0.817097008228302, "learning_rate": 8.589894947473738e-05, "loss": 0.3678, "step": 14150 }, { "epoch": 239.24, "grad_norm": 1.1400470733642578, "learning_rate": 8.587393696848424e-05, "loss": 0.3666, "step": 14175 }, { "epoch": 239.66, "grad_norm": 1.167304515838623, "learning_rate": 8.584892446223111e-05, "loss": 0.3667, "step": 14200 }, { "epoch": 240.08, "grad_norm": 0.8194668292999268, "learning_rate": 8.5823911955978e-05, "loss": 0.3667, "step": 14225 }, { "epoch": 240.51, "grad_norm": 0.9471569657325745, "learning_rate": 8.579889944972487e-05, "loss": 0.3679, "step": 14250 }, { "epoch": 240.93, "grad_norm": 0.8145942091941833, "learning_rate": 8.577388694347173e-05, "loss": 0.368, "step": 14275 }, { "epoch": 241.35, "grad_norm": 0.9779414534568787, "learning_rate": 8.574887443721862e-05, "loss": 0.3657, "step": 14300 }, { "epoch": 241.77, "grad_norm": 
0.8919757008552551, "learning_rate": 8.572386193096549e-05, "loss": 0.3681, "step": 14325 }, { "epoch": 242.19, "grad_norm": 0.9472792744636536, "learning_rate": 8.569884942471236e-05, "loss": 0.3673, "step": 14350 }, { "epoch": 242.62, "grad_norm": 0.7073681950569153, "learning_rate": 8.567383691845924e-05, "loss": 0.3651, "step": 14375 }, { "epoch": 243.04, "grad_norm": 1.0986143350601196, "learning_rate": 8.564882441220611e-05, "loss": 0.3691, "step": 14400 }, { "epoch": 243.46, "grad_norm": 1.026163101196289, "learning_rate": 8.562381190595298e-05, "loss": 0.3695, "step": 14425 }, { "epoch": 243.88, "grad_norm": 0.616768479347229, "learning_rate": 8.559879939969985e-05, "loss": 0.3651, "step": 14450 }, { "epoch": 244.3, "grad_norm": 1.8516837358474731, "learning_rate": 8.557378689344673e-05, "loss": 0.3668, "step": 14475 }, { "epoch": 244.73, "grad_norm": 0.9715454578399658, "learning_rate": 8.55487743871936e-05, "loss": 0.369, "step": 14500 }, { "epoch": 245.15, "grad_norm": 1.7141095399856567, "learning_rate": 8.552376188094047e-05, "loss": 0.3659, "step": 14525 }, { "epoch": 245.57, "grad_norm": 0.8824822902679443, "learning_rate": 8.549874937468734e-05, "loss": 0.3662, "step": 14550 }, { "epoch": 245.99, "grad_norm": 1.1573299169540405, "learning_rate": 8.547373686843422e-05, "loss": 0.3656, "step": 14575 }, { "epoch": 246.41, "grad_norm": 1.152058720588684, "learning_rate": 8.54487243621811e-05, "loss": 0.3692, "step": 14600 }, { "epoch": 246.84, "grad_norm": 1.2361758947372437, "learning_rate": 8.542371185592796e-05, "loss": 0.3661, "step": 14625 }, { "epoch": 247.26, "grad_norm": 1.447004795074463, "learning_rate": 8.539869934967483e-05, "loss": 0.3658, "step": 14650 }, { "epoch": 247.68, "grad_norm": 0.9416847229003906, "learning_rate": 8.537368684342172e-05, "loss": 0.3655, "step": 14675 }, { "epoch": 248.1, "grad_norm": 0.9440823197364807, "learning_rate": 8.53486743371686e-05, "loss": 0.3687, "step": 14700 }, { "epoch": 248.52, "grad_norm": 0.8416768908500671, "learning_rate": 8.532366183091545e-05, "loss": 0.371, "step": 14725 }, { "epoch": 248.95, "grad_norm": 0.8656642436981201, "learning_rate": 8.529864932466234e-05, "loss": 0.3656, "step": 14750 }, { "epoch": 249.37, "grad_norm": 0.7759775519371033, "learning_rate": 8.527363681840921e-05, "loss": 0.3628, "step": 14775 }, { "epoch": 249.79, "grad_norm": 0.6862407326698303, "learning_rate": 8.524862431215608e-05, "loss": 0.369, "step": 14800 }, { "epoch": 250.21, "grad_norm": 1.1237455606460571, "learning_rate": 8.522361180590296e-05, "loss": 0.3662, "step": 14825 }, { "epoch": 250.63, "grad_norm": 0.8318279385566711, "learning_rate": 8.519859929964983e-05, "loss": 0.3664, "step": 14850 }, { "epoch": 251.05, "grad_norm": 1.173898458480835, "learning_rate": 8.51735867933967e-05, "loss": 0.3657, "step": 14875 }, { "epoch": 251.48, "grad_norm": 1.8022644519805908, "learning_rate": 8.514857428714357e-05, "loss": 0.3651, "step": 14900 }, { "epoch": 251.9, "grad_norm": 1.1750956773757935, "learning_rate": 8.512356178089045e-05, "loss": 0.3674, "step": 14925 }, { "epoch": 252.32, "grad_norm": 1.9424494504928589, "learning_rate": 8.509854927463732e-05, "loss": 0.3654, "step": 14950 }, { "epoch": 252.74, "grad_norm": 0.9755445122718811, "learning_rate": 8.507353676838419e-05, "loss": 0.3684, "step": 14975 }, { "epoch": 253.16, "grad_norm": 2.24971342086792, "learning_rate": 8.504852426213107e-05, "loss": 0.3664, "step": 15000 }, { "epoch": 253.59, "grad_norm": 0.7831035852432251, "learning_rate": 8.502351175587794e-05, "loss": 
0.3658, "step": 15025 }, { "epoch": 254.01, "grad_norm": 1.495391607284546, "learning_rate": 8.499849924962482e-05, "loss": 0.3676, "step": 15050 }, { "epoch": 254.43, "grad_norm": 0.9528107643127441, "learning_rate": 8.497348674337168e-05, "loss": 0.3648, "step": 15075 }, { "epoch": 254.85, "grad_norm": 1.5344595909118652, "learning_rate": 8.494847423711856e-05, "loss": 0.3679, "step": 15100 }, { "epoch": 255.27, "grad_norm": 1.0981906652450562, "learning_rate": 8.492346173086544e-05, "loss": 0.3634, "step": 15125 }, { "epoch": 255.7, "grad_norm": 0.8158636093139648, "learning_rate": 8.489844922461232e-05, "loss": 0.3634, "step": 15150 }, { "epoch": 256.12, "grad_norm": 0.8795320391654968, "learning_rate": 8.487343671835917e-05, "loss": 0.3663, "step": 15175 }, { "epoch": 256.54, "grad_norm": 1.1062347888946533, "learning_rate": 8.484842421210606e-05, "loss": 0.3632, "step": 15200 }, { "epoch": 256.96, "grad_norm": 0.8237234354019165, "learning_rate": 8.482341170585293e-05, "loss": 0.3674, "step": 15225 }, { "epoch": 257.38, "grad_norm": 1.5648434162139893, "learning_rate": 8.47983991995998e-05, "loss": 0.3645, "step": 15250 }, { "epoch": 257.81, "grad_norm": 0.6461232900619507, "learning_rate": 8.477338669334668e-05, "loss": 0.3636, "step": 15275 }, { "epoch": 258.23, "grad_norm": 0.8710100054740906, "learning_rate": 8.474837418709355e-05, "loss": 0.3657, "step": 15300 }, { "epoch": 258.65, "grad_norm": 0.8583847284317017, "learning_rate": 8.472336168084042e-05, "loss": 0.3661, "step": 15325 }, { "epoch": 259.07, "grad_norm": 1.7212473154067993, "learning_rate": 8.46983491745873e-05, "loss": 0.3696, "step": 15350 }, { "epoch": 259.49, "grad_norm": 0.9687917828559875, "learning_rate": 8.467333666833417e-05, "loss": 0.3616, "step": 15375 }, { "epoch": 259.92, "grad_norm": 1.223983645439148, "learning_rate": 8.464832416208106e-05, "loss": 0.3667, "step": 15400 }, { "epoch": 260.34, "grad_norm": 2.318817377090454, "learning_rate": 8.462331165582791e-05, "loss": 0.3679, "step": 15425 }, { "epoch": 260.76, "grad_norm": 0.9399811029434204, "learning_rate": 8.459829914957479e-05, "loss": 0.3635, "step": 15450 }, { "epoch": 261.18, "grad_norm": 1.2399027347564697, "learning_rate": 8.457328664332167e-05, "loss": 0.3661, "step": 15475 }, { "epoch": 261.6, "grad_norm": 0.7314602136611938, "learning_rate": 8.454827413706855e-05, "loss": 0.369, "step": 15500 }, { "epoch": 262.03, "grad_norm": 1.2635565996170044, "learning_rate": 8.45232616308154e-05, "loss": 0.362, "step": 15525 }, { "epoch": 262.45, "grad_norm": 1.499050498008728, "learning_rate": 8.449824912456229e-05, "loss": 0.3641, "step": 15550 }, { "epoch": 262.87, "grad_norm": 1.1472296714782715, "learning_rate": 8.447323661830916e-05, "loss": 0.3675, "step": 15575 }, { "epoch": 263.29, "grad_norm": 1.165958285331726, "learning_rate": 8.444822411205602e-05, "loss": 0.3615, "step": 15600 }, { "epoch": 263.71, "grad_norm": 0.7624589800834656, "learning_rate": 8.442321160580291e-05, "loss": 0.3676, "step": 15625 }, { "epoch": 264.14, "grad_norm": 1.6784964799880981, "learning_rate": 8.439819909954978e-05, "loss": 0.3663, "step": 15650 }, { "epoch": 264.56, "grad_norm": 1.0375725030899048, "learning_rate": 8.437318659329665e-05, "loss": 0.3637, "step": 15675 }, { "epoch": 264.98, "grad_norm": 1.3697712421417236, "learning_rate": 8.434817408704353e-05, "loss": 0.3635, "step": 15700 }, { "epoch": 265.4, "grad_norm": 0.8125913143157959, "learning_rate": 8.43231615807904e-05, "loss": 0.3644, "step": 15725 }, { "epoch": 265.82, "grad_norm": 
0.6654360294342041, "learning_rate": 8.429814907453727e-05, "loss": 0.3724, "step": 15750 }, { "epoch": 266.24, "grad_norm": 0.904873788356781, "learning_rate": 8.427313656828414e-05, "loss": 0.3629, "step": 15775 }, { "epoch": 266.67, "grad_norm": 0.8573073148727417, "learning_rate": 8.424812406203102e-05, "loss": 0.3682, "step": 15800 }, { "epoch": 267.09, "grad_norm": 1.1901441812515259, "learning_rate": 8.422311155577789e-05, "loss": 0.3616, "step": 15825 }, { "epoch": 267.51, "grad_norm": 1.7021052837371826, "learning_rate": 8.419809904952478e-05, "loss": 0.3658, "step": 15850 }, { "epoch": 267.93, "grad_norm": 1.5687404870986938, "learning_rate": 8.417308654327164e-05, "loss": 0.3653, "step": 15875 }, { "epoch": 268.35, "grad_norm": 1.7491443157196045, "learning_rate": 8.414807403701851e-05, "loss": 0.3615, "step": 15900 }, { "epoch": 268.78, "grad_norm": 0.9064961075782776, "learning_rate": 8.41230615307654e-05, "loss": 0.3616, "step": 15925 }, { "epoch": 269.2, "grad_norm": 0.8707164525985718, "learning_rate": 8.409804902451225e-05, "loss": 0.3612, "step": 15950 }, { "epoch": 269.62, "grad_norm": 0.8710625171661377, "learning_rate": 8.407303651825913e-05, "loss": 0.3653, "step": 15975 }, { "epoch": 270.04, "grad_norm": 0.9726456999778748, "learning_rate": 8.404802401200601e-05, "loss": 0.362, "step": 16000 }, { "epoch": 270.46, "grad_norm": 0.8125610947608948, "learning_rate": 8.402301150575288e-05, "loss": 0.3615, "step": 16025 }, { "epoch": 270.89, "grad_norm": 0.8257951140403748, "learning_rate": 8.399799899949974e-05, "loss": 0.3654, "step": 16050 }, { "epoch": 271.31, "grad_norm": 1.304132103919983, "learning_rate": 8.397298649324663e-05, "loss": 0.3642, "step": 16075 }, { "epoch": 271.73, "grad_norm": 1.4190698862075806, "learning_rate": 8.39479739869935e-05, "loss": 0.3641, "step": 16100 }, { "epoch": 272.15, "grad_norm": 1.065217137336731, "learning_rate": 8.392296148074038e-05, "loss": 0.3641, "step": 16125 }, { "epoch": 272.57, "grad_norm": 0.7278902530670166, "learning_rate": 8.389794897448725e-05, "loss": 0.3674, "step": 16150 }, { "epoch": 273.0, "grad_norm": 1.0129950046539307, "learning_rate": 8.387293646823412e-05, "loss": 0.3625, "step": 16175 }, { "epoch": 273.42, "grad_norm": 2.0785419940948486, "learning_rate": 8.3847923961981e-05, "loss": 0.3639, "step": 16200 }, { "epoch": 273.84, "grad_norm": 1.0368536710739136, "learning_rate": 8.382291145572787e-05, "loss": 0.3626, "step": 16225 }, { "epoch": 274.26, "grad_norm": 0.7643749117851257, "learning_rate": 8.379789894947474e-05, "loss": 0.3652, "step": 16250 }, { "epoch": 274.68, "grad_norm": 0.8678352236747742, "learning_rate": 8.377288644322161e-05, "loss": 0.3623, "step": 16275 }, { "epoch": 275.11, "grad_norm": 0.7379581332206726, "learning_rate": 8.374787393696848e-05, "loss": 0.361, "step": 16300 }, { "epoch": 275.53, "grad_norm": 0.795144259929657, "learning_rate": 8.372286143071536e-05, "loss": 0.363, "step": 16325 }, { "epoch": 275.95, "grad_norm": 1.510482668876648, "learning_rate": 8.369784892446223e-05, "loss": 0.3635, "step": 16350 }, { "epoch": 276.37, "grad_norm": 1.0431444644927979, "learning_rate": 8.367283641820912e-05, "loss": 0.3682, "step": 16375 }, { "epoch": 276.79, "grad_norm": 1.334295630455017, "learning_rate": 8.364782391195597e-05, "loss": 0.3666, "step": 16400 }, { "epoch": 277.22, "grad_norm": 0.7187738418579102, "learning_rate": 8.362281140570285e-05, "loss": 0.3643, "step": 16425 }, { "epoch": 277.64, "grad_norm": 0.686591386795044, "learning_rate": 8.359779889944973e-05, "loss": 
0.3612, "step": 16450 }, { "epoch": 278.06, "grad_norm": 0.9762983918190002, "learning_rate": 8.35727863931966e-05, "loss": 0.3708, "step": 16475 }, { "epoch": 278.48, "grad_norm": 1.3147562742233276, "learning_rate": 8.354777388694348e-05, "loss": 0.3644, "step": 16500 }, { "epoch": 278.9, "grad_norm": 0.7898246645927429, "learning_rate": 8.352276138069035e-05, "loss": 0.3602, "step": 16525 }, { "epoch": 279.32, "grad_norm": 2.062628984451294, "learning_rate": 8.349774887443722e-05, "loss": 0.3638, "step": 16550 }, { "epoch": 279.75, "grad_norm": 2.230180501937866, "learning_rate": 8.34727363681841e-05, "loss": 0.3624, "step": 16575 }, { "epoch": 280.17, "grad_norm": 1.4661195278167725, "learning_rate": 8.344772386193097e-05, "loss": 0.3602, "step": 16600 }, { "epoch": 280.59, "grad_norm": 1.1526679992675781, "learning_rate": 8.342271135567784e-05, "loss": 0.3634, "step": 16625 }, { "epoch": 281.01, "grad_norm": 0.8428257703781128, "learning_rate": 8.339769884942471e-05, "loss": 0.3626, "step": 16650 }, { "epoch": 281.43, "grad_norm": 0.7157553434371948, "learning_rate": 8.337268634317159e-05, "loss": 0.36, "step": 16675 }, { "epoch": 281.86, "grad_norm": 0.7688283920288086, "learning_rate": 8.334767383691846e-05, "loss": 0.3622, "step": 16700 }, { "epoch": 282.28, "grad_norm": 0.9348931908607483, "learning_rate": 8.332266133066535e-05, "loss": 0.3611, "step": 16725 }, { "epoch": 282.7, "grad_norm": 1.3814972639083862, "learning_rate": 8.32976488244122e-05, "loss": 0.3609, "step": 16750 }, { "epoch": 283.12, "grad_norm": 1.0073652267456055, "learning_rate": 8.327263631815908e-05, "loss": 0.3577, "step": 16775 }, { "epoch": 283.54, "grad_norm": 1.3115264177322388, "learning_rate": 8.324762381190596e-05, "loss": 0.3581, "step": 16800 }, { "epoch": 283.97, "grad_norm": 1.1650562286376953, "learning_rate": 8.322261130565284e-05, "loss": 0.3619, "step": 16825 }, { "epoch": 284.39, "grad_norm": 1.2670111656188965, "learning_rate": 8.31975987993997e-05, "loss": 0.3647, "step": 16850 }, { "epoch": 284.81, "grad_norm": 0.704855740070343, "learning_rate": 8.317258629314658e-05, "loss": 0.3664, "step": 16875 }, { "epoch": 285.23, "grad_norm": 1.7489445209503174, "learning_rate": 8.314757378689345e-05, "loss": 0.3594, "step": 16900 }, { "epoch": 285.65, "grad_norm": 1.8801852464675903, "learning_rate": 8.312256128064033e-05, "loss": 0.3692, "step": 16925 }, { "epoch": 286.08, "grad_norm": 1.594809651374817, "learning_rate": 8.30975487743872e-05, "loss": 0.3676, "step": 16950 }, { "epoch": 286.5, "grad_norm": 1.3559497594833374, "learning_rate": 8.307253626813407e-05, "loss": 0.3602, "step": 16975 }, { "epoch": 286.92, "grad_norm": 0.9619849920272827, "learning_rate": 8.304752376188095e-05, "loss": 0.3604, "step": 17000 }, { "epoch": 287.34, "grad_norm": 1.132236361503601, "learning_rate": 8.302251125562782e-05, "loss": 0.3633, "step": 17025 }, { "epoch": 287.76, "grad_norm": 3.0619399547576904, "learning_rate": 8.299749874937469e-05, "loss": 0.3594, "step": 17050 }, { "epoch": 288.19, "grad_norm": 1.0723527669906616, "learning_rate": 8.297248624312156e-05, "loss": 0.3615, "step": 17075 }, { "epoch": 288.61, "grad_norm": 0.7416146993637085, "learning_rate": 8.294747373686844e-05, "loss": 0.3611, "step": 17100 }, { "epoch": 289.03, "grad_norm": 0.6300391554832458, "learning_rate": 8.292246123061531e-05, "loss": 0.3609, "step": 17125 }, { "epoch": 289.45, "grad_norm": 1.019946813583374, "learning_rate": 8.289744872436218e-05, "loss": 0.3632, "step": 17150 }, { "epoch": 289.87, "grad_norm": 
1.1269943714141846, "learning_rate": 8.287243621810907e-05, "loss": 0.3608, "step": 17175 }, { "epoch": 290.3, "grad_norm": 0.7735075950622559, "learning_rate": 8.284742371185593e-05, "loss": 0.3617, "step": 17200 }, { "epoch": 290.72, "grad_norm": 1.0721397399902344, "learning_rate": 8.28224112056028e-05, "loss": 0.3598, "step": 17225 }, { "epoch": 291.14, "grad_norm": 1.506663203239441, "learning_rate": 8.279739869934969e-05, "loss": 0.3636, "step": 17250 }, { "epoch": 291.56, "grad_norm": 0.9027799963951111, "learning_rate": 8.277238619309656e-05, "loss": 0.3648, "step": 17275 }, { "epoch": 291.98, "grad_norm": 0.8493388295173645, "learning_rate": 8.274737368684342e-05, "loss": 0.3611, "step": 17300 }, { "epoch": 292.41, "grad_norm": 1.0340256690979004, "learning_rate": 8.27223611805903e-05, "loss": 0.3586, "step": 17325 }, { "epoch": 292.83, "grad_norm": 1.3038016557693481, "learning_rate": 8.269734867433718e-05, "loss": 0.3637, "step": 17350 }, { "epoch": 293.25, "grad_norm": 1.826208472251892, "learning_rate": 8.267233616808404e-05, "loss": 0.3598, "step": 17375 }, { "epoch": 293.67, "grad_norm": 0.9971561431884766, "learning_rate": 8.264732366183092e-05, "loss": 0.3594, "step": 17400 }, { "epoch": 294.09, "grad_norm": 0.9726247191429138, "learning_rate": 8.26223111555778e-05, "loss": 0.3594, "step": 17425 }, { "epoch": 294.51, "grad_norm": 0.7021756768226624, "learning_rate": 8.259729864932467e-05, "loss": 0.3599, "step": 17450 }, { "epoch": 294.94, "grad_norm": 1.4075045585632324, "learning_rate": 8.257228614307154e-05, "loss": 0.3556, "step": 17475 }, { "epoch": 295.36, "grad_norm": 1.0259191989898682, "learning_rate": 8.254727363681841e-05, "loss": 0.3601, "step": 17500 }, { "epoch": 295.78, "grad_norm": 0.7884707450866699, "learning_rate": 8.252226113056528e-05, "loss": 0.3605, "step": 17525 }, { "epoch": 296.2, "grad_norm": 1.1988799571990967, "learning_rate": 8.249724862431216e-05, "loss": 0.3587, "step": 17550 }, { "epoch": 296.62, "grad_norm": 0.8160145878791809, "learning_rate": 8.247223611805903e-05, "loss": 0.3601, "step": 17575 }, { "epoch": 297.05, "grad_norm": 1.1184805631637573, "learning_rate": 8.24472236118059e-05, "loss": 0.3605, "step": 17600 }, { "epoch": 297.47, "grad_norm": 1.5595628023147583, "learning_rate": 8.242221110555279e-05, "loss": 0.3592, "step": 17625 }, { "epoch": 297.89, "grad_norm": 1.3668043613433838, "learning_rate": 8.239719859929965e-05, "loss": 0.3593, "step": 17650 }, { "epoch": 298.31, "grad_norm": 1.3418548107147217, "learning_rate": 8.237218609304652e-05, "loss": 0.3635, "step": 17675 }, { "epoch": 298.73, "grad_norm": 1.0980030298233032, "learning_rate": 8.234717358679341e-05, "loss": 0.3603, "step": 17700 }, { "epoch": 299.16, "grad_norm": 1.5013407468795776, "learning_rate": 8.232216108054027e-05, "loss": 0.3579, "step": 17725 }, { "epoch": 299.58, "grad_norm": 1.1061452627182007, "learning_rate": 8.229714857428715e-05, "loss": 0.361, "step": 17750 }, { "epoch": 300.0, "grad_norm": 0.6798700094223022, "learning_rate": 8.227213606803402e-05, "loss": 0.3581, "step": 17775 }, { "epoch": 300.42, "grad_norm": 0.9691147804260254, "learning_rate": 8.22471235617809e-05, "loss": 0.3585, "step": 17800 }, { "epoch": 300.84, "grad_norm": 1.0259848833084106, "learning_rate": 8.222211105552777e-05, "loss": 0.3584, "step": 17825 }, { "epoch": 301.27, "grad_norm": 1.409461498260498, "learning_rate": 8.219709854927464e-05, "loss": 0.3608, "step": 17850 }, { "epoch": 301.69, "grad_norm": 1.0659101009368896, "learning_rate": 8.217208604302152e-05, 
"loss": 0.3605, "step": 17875 }, { "epoch": 302.11, "grad_norm": 1.4667974710464478, "learning_rate": 8.214707353676839e-05, "loss": 0.3671, "step": 17900 }, { "epoch": 302.53, "grad_norm": 1.235718846321106, "learning_rate": 8.212206103051526e-05, "loss": 0.3648, "step": 17925 }, { "epoch": 302.95, "grad_norm": 1.196365237236023, "learning_rate": 8.209704852426213e-05, "loss": 0.361, "step": 17950 }, { "epoch": 303.38, "grad_norm": 0.9447464942932129, "learning_rate": 8.207203601800902e-05, "loss": 0.3602, "step": 17975 }, { "epoch": 303.8, "grad_norm": 1.7616301774978638, "learning_rate": 8.204702351175588e-05, "loss": 0.3594, "step": 18000 }, { "epoch": 304.22, "grad_norm": 0.982491672039032, "learning_rate": 8.202201100550275e-05, "loss": 0.363, "step": 18025 }, { "epoch": 304.64, "grad_norm": 1.9217392206192017, "learning_rate": 8.199699849924964e-05, "loss": 0.3592, "step": 18050 }, { "epoch": 305.06, "grad_norm": 0.7750915288925171, "learning_rate": 8.197298649324662e-05, "loss": 0.3574, "step": 18075 }, { "epoch": 305.49, "grad_norm": 0.8260890245437622, "learning_rate": 8.19479739869935e-05, "loss": 0.3609, "step": 18100 }, { "epoch": 305.91, "grad_norm": 1.706742286682129, "learning_rate": 8.192296148074038e-05, "loss": 0.359, "step": 18125 }, { "epoch": 306.33, "grad_norm": 1.1978380680084229, "learning_rate": 8.189794897448725e-05, "loss": 0.3599, "step": 18150 }, { "epoch": 306.75, "grad_norm": 1.0595779418945312, "learning_rate": 8.187293646823411e-05, "loss": 0.359, "step": 18175 }, { "epoch": 307.17, "grad_norm": 1.3222228288650513, "learning_rate": 8.1847923961981e-05, "loss": 0.3611, "step": 18200 }, { "epoch": 307.59, "grad_norm": 1.8920141458511353, "learning_rate": 8.182291145572787e-05, "loss": 0.3602, "step": 18225 }, { "epoch": 308.02, "grad_norm": 0.7246122360229492, "learning_rate": 8.179789894947474e-05, "loss": 0.361, "step": 18250 }, { "epoch": 308.44, "grad_norm": 0.7427545785903931, "learning_rate": 8.177288644322162e-05, "loss": 0.3601, "step": 18275 }, { "epoch": 308.86, "grad_norm": 1.1107434034347534, "learning_rate": 8.174787393696849e-05, "loss": 0.3569, "step": 18300 }, { "epoch": 309.28, "grad_norm": 0.7275181412696838, "learning_rate": 8.172286143071536e-05, "loss": 0.3576, "step": 18325 }, { "epoch": 309.7, "grad_norm": 1.3516052961349487, "learning_rate": 8.169784892446224e-05, "loss": 0.3581, "step": 18350 }, { "epoch": 310.13, "grad_norm": 0.7587239146232605, "learning_rate": 8.167283641820911e-05, "loss": 0.3619, "step": 18375 }, { "epoch": 310.55, "grad_norm": 1.5297181606292725, "learning_rate": 8.164782391195598e-05, "loss": 0.3603, "step": 18400 }, { "epoch": 310.97, "grad_norm": 1.0974851846694946, "learning_rate": 8.162281140570285e-05, "loss": 0.3608, "step": 18425 }, { "epoch": 311.39, "grad_norm": 1.1951693296432495, "learning_rate": 8.159779889944973e-05, "loss": 0.3579, "step": 18450 }, { "epoch": 311.81, "grad_norm": 0.9509335160255432, "learning_rate": 8.15727863931966e-05, "loss": 0.3587, "step": 18475 }, { "epoch": 312.24, "grad_norm": 0.8640209436416626, "learning_rate": 8.154777388694348e-05, "loss": 0.3596, "step": 18500 }, { "epoch": 312.66, "grad_norm": 0.8381843566894531, "learning_rate": 8.152276138069034e-05, "loss": 0.3557, "step": 18525 }, { "epoch": 313.08, "grad_norm": 1.0143928527832031, "learning_rate": 8.149774887443722e-05, "loss": 0.3562, "step": 18550 }, { "epoch": 313.5, "grad_norm": 1.125887393951416, "learning_rate": 8.14727363681841e-05, "loss": 0.3598, "step": 18575 }, { "epoch": 313.92, "grad_norm": 
0.8904116749763489, "learning_rate": 8.144772386193098e-05, "loss": 0.3583, "step": 18600 }, { "epoch": 314.35, "grad_norm": 0.8131029605865479, "learning_rate": 8.142271135567783e-05, "loss": 0.3583, "step": 18625 }, { "epoch": 314.77, "grad_norm": 0.6751884818077087, "learning_rate": 8.139769884942472e-05, "loss": 0.3583, "step": 18650 }, { "epoch": 315.19, "grad_norm": 0.9295366406440735, "learning_rate": 8.137268634317159e-05, "loss": 0.3578, "step": 18675 }, { "epoch": 315.61, "grad_norm": 1.733073115348816, "learning_rate": 8.134767383691845e-05, "loss": 0.3606, "step": 18700 }, { "epoch": 316.03, "grad_norm": 0.653156042098999, "learning_rate": 8.132266133066534e-05, "loss": 0.3594, "step": 18725 }, { "epoch": 316.46, "grad_norm": 0.7915719151496887, "learning_rate": 8.129764882441221e-05, "loss": 0.355, "step": 18750 }, { "epoch": 316.88, "grad_norm": 2.3239645957946777, "learning_rate": 8.127263631815908e-05, "loss": 0.3578, "step": 18775 }, { "epoch": 317.3, "grad_norm": 0.803396999835968, "learning_rate": 8.124762381190596e-05, "loss": 0.3593, "step": 18800 }, { "epoch": 317.72, "grad_norm": 1.1010541915893555, "learning_rate": 8.122261130565283e-05, "loss": 0.3569, "step": 18825 }, { "epoch": 318.14, "grad_norm": 1.3979346752166748, "learning_rate": 8.11975987993997e-05, "loss": 0.3574, "step": 18850 }, { "epoch": 318.57, "grad_norm": 1.1808573007583618, "learning_rate": 8.117258629314657e-05, "loss": 0.3552, "step": 18875 }, { "epoch": 318.99, "grad_norm": 0.9665441513061523, "learning_rate": 8.114757378689345e-05, "loss": 0.3599, "step": 18900 }, { "epoch": 319.41, "grad_norm": 1.1946570873260498, "learning_rate": 8.112256128064032e-05, "loss": 0.3602, "step": 18925 }, { "epoch": 319.83, "grad_norm": 1.1368430852890015, "learning_rate": 8.10975487743872e-05, "loss": 0.3591, "step": 18950 }, { "epoch": 320.25, "grad_norm": 0.7300906777381897, "learning_rate": 8.107253626813406e-05, "loss": 0.3569, "step": 18975 }, { "epoch": 320.68, "grad_norm": 1.307708978652954, "learning_rate": 8.104752376188094e-05, "loss": 0.3598, "step": 19000 }, { "epoch": 321.1, "grad_norm": 1.0256810188293457, "learning_rate": 8.102251125562782e-05, "loss": 0.3565, "step": 19025 }, { "epoch": 321.52, "grad_norm": 0.8039768934249878, "learning_rate": 8.099749874937468e-05, "loss": 0.3559, "step": 19050 }, { "epoch": 321.94, "grad_norm": 1.2809311151504517, "learning_rate": 8.097248624312156e-05, "loss": 0.3575, "step": 19075 }, { "epoch": 322.36, "grad_norm": 0.7176210880279541, "learning_rate": 8.094747373686844e-05, "loss": 0.3606, "step": 19100 }, { "epoch": 322.78, "grad_norm": 0.5972779989242554, "learning_rate": 8.092246123061531e-05, "loss": 0.3565, "step": 19125 }, { "epoch": 323.21, "grad_norm": 1.601108193397522, "learning_rate": 8.089744872436219e-05, "loss": 0.3591, "step": 19150 }, { "epoch": 323.63, "grad_norm": 1.590501070022583, "learning_rate": 8.087343671835919e-05, "loss": 0.3575, "step": 19175 }, { "epoch": 324.05, "grad_norm": 0.6564643383026123, "learning_rate": 8.084842421210606e-05, "loss": 0.358, "step": 19200 }, { "epoch": 324.47, "grad_norm": 1.347285270690918, "learning_rate": 8.082341170585293e-05, "loss": 0.3571, "step": 19225 }, { "epoch": 324.89, "grad_norm": 1.171204924583435, "learning_rate": 8.07983991995998e-05, "loss": 0.3567, "step": 19250 }, { "epoch": 325.32, "grad_norm": 0.8175528645515442, "learning_rate": 8.077338669334668e-05, "loss": 0.3545, "step": 19275 }, { "epoch": 325.74, "grad_norm": 1.1238757371902466, "learning_rate": 8.074837418709356e-05, "loss": 
0.3598, "step": 19300 }, { "epoch": 326.16, "grad_norm": 1.1881730556488037, "learning_rate": 8.072336168084042e-05, "loss": 0.3588, "step": 19325 }, { "epoch": 326.58, "grad_norm": 1.769948959350586, "learning_rate": 8.06983491745873e-05, "loss": 0.3566, "step": 19350 }, { "epoch": 327.0, "grad_norm": 1.304895043373108, "learning_rate": 8.067333666833418e-05, "loss": 0.3565, "step": 19375 }, { "epoch": 327.43, "grad_norm": 1.1384321451187134, "learning_rate": 8.064832416208104e-05, "loss": 0.3566, "step": 19400 }, { "epoch": 327.85, "grad_norm": 0.6794611215591431, "learning_rate": 8.062331165582791e-05, "loss": 0.3579, "step": 19425 }, { "epoch": 328.27, "grad_norm": 1.3435585498809814, "learning_rate": 8.05982991495748e-05, "loss": 0.3602, "step": 19450 }, { "epoch": 328.69, "grad_norm": 0.6176559329032898, "learning_rate": 8.057328664332167e-05, "loss": 0.3559, "step": 19475 }, { "epoch": 329.11, "grad_norm": 1.315812587738037, "learning_rate": 8.054827413706853e-05, "loss": 0.3573, "step": 19500 }, { "epoch": 329.54, "grad_norm": 1.4913958311080933, "learning_rate": 8.052326163081542e-05, "loss": 0.3564, "step": 19525 }, { "epoch": 329.96, "grad_norm": 1.6352897882461548, "learning_rate": 8.049824912456229e-05, "loss": 0.36, "step": 19550 }, { "epoch": 330.38, "grad_norm": 1.4940515756607056, "learning_rate": 8.047323661830916e-05, "loss": 0.3547, "step": 19575 }, { "epoch": 330.8, "grad_norm": 1.456263542175293, "learning_rate": 8.044822411205603e-05, "loss": 0.3546, "step": 19600 }, { "epoch": 331.22, "grad_norm": 1.4726428985595703, "learning_rate": 8.04232116058029e-05, "loss": 0.3593, "step": 19625 }, { "epoch": 331.65, "grad_norm": 0.7690699696540833, "learning_rate": 8.039819909954978e-05, "loss": 0.3539, "step": 19650 }, { "epoch": 332.07, "grad_norm": 1.371355652809143, "learning_rate": 8.037318659329665e-05, "loss": 0.3569, "step": 19675 }, { "epoch": 332.49, "grad_norm": 0.6450801491737366, "learning_rate": 8.034817408704352e-05, "loss": 0.3545, "step": 19700 }, { "epoch": 332.91, "grad_norm": 1.1788725852966309, "learning_rate": 8.03231615807904e-05, "loss": 0.3562, "step": 19725 }, { "epoch": 333.33, "grad_norm": 0.6701567769050598, "learning_rate": 8.029814907453727e-05, "loss": 0.3586, "step": 19750 }, { "epoch": 333.76, "grad_norm": 1.3523973226547241, "learning_rate": 8.027313656828414e-05, "loss": 0.3534, "step": 19775 }, { "epoch": 334.18, "grad_norm": 0.8015347719192505, "learning_rate": 8.024812406203102e-05, "loss": 0.3551, "step": 19800 }, { "epoch": 334.6, "grad_norm": 0.8040794134140015, "learning_rate": 8.02231115557779e-05, "loss": 0.358, "step": 19825 }, { "epoch": 335.02, "grad_norm": 0.8004847168922424, "learning_rate": 8.019809904952476e-05, "loss": 0.3582, "step": 19850 }, { "epoch": 335.44, "grad_norm": 0.8439458608627319, "learning_rate": 8.017308654327163e-05, "loss": 0.3558, "step": 19875 }, { "epoch": 335.86, "grad_norm": 1.7248244285583496, "learning_rate": 8.014807403701852e-05, "loss": 0.357, "step": 19900 }, { "epoch": 336.29, "grad_norm": 1.6659387350082397, "learning_rate": 8.012306153076539e-05, "loss": 0.3559, "step": 19925 }, { "epoch": 336.71, "grad_norm": 0.6850571632385254, "learning_rate": 8.009804902451225e-05, "loss": 0.3555, "step": 19950 }, { "epoch": 337.13, "grad_norm": 1.2804358005523682, "learning_rate": 8.007303651825914e-05, "loss": 0.3561, "step": 19975 }, { "epoch": 337.55, "grad_norm": 0.701424777507782, "learning_rate": 8.004802401200601e-05, "loss": 0.3538, "step": 20000 }, { "epoch": 337.55, "eval_loss": 
0.45124801993370056, "eval_runtime": 4.2246, "eval_samples_per_second": 73.142, "eval_steps_per_second": 2.367, "step": 20000 }, { "epoch": 337.97, "grad_norm": 1.2011425495147705, "learning_rate": 8.002301150575288e-05, "loss": 0.3545, "step": 20025 }, { "epoch": 338.4, "grad_norm": 0.9771451354026794, "learning_rate": 7.999799899949976e-05, "loss": 0.3543, "step": 20050 }, { "epoch": 338.82, "grad_norm": 0.9714786410331726, "learning_rate": 7.997298649324663e-05, "loss": 0.3621, "step": 20075 }, { "epoch": 339.24, "grad_norm": 0.7581168413162231, "learning_rate": 7.99479739869935e-05, "loss": 0.3566, "step": 20100 }, { "epoch": 339.66, "grad_norm": 1.7439080476760864, "learning_rate": 7.992296148074037e-05, "loss": 0.3564, "step": 20125 }, { "epoch": 340.08, "grad_norm": 1.8293099403381348, "learning_rate": 7.989794897448725e-05, "loss": 0.358, "step": 20150 }, { "epoch": 340.51, "grad_norm": 0.9277634024620056, "learning_rate": 7.987293646823412e-05, "loss": 0.3547, "step": 20175 }, { "epoch": 340.93, "grad_norm": 0.7737613916397095, "learning_rate": 7.984792396198099e-05, "loss": 0.3548, "step": 20200 }, { "epoch": 341.35, "grad_norm": 0.678411602973938, "learning_rate": 7.982291145572786e-05, "loss": 0.3552, "step": 20225 }, { "epoch": 341.77, "grad_norm": 1.2307754755020142, "learning_rate": 7.979789894947474e-05, "loss": 0.3572, "step": 20250 }, { "epoch": 342.19, "grad_norm": 1.110528826713562, "learning_rate": 7.977288644322162e-05, "loss": 0.3555, "step": 20275 }, { "epoch": 342.62, "grad_norm": 0.9352234601974487, "learning_rate": 7.974787393696848e-05, "loss": 0.3567, "step": 20300 }, { "epoch": 343.04, "grad_norm": 1.6342604160308838, "learning_rate": 7.972286143071535e-05, "loss": 0.3565, "step": 20325 }, { "epoch": 343.46, "grad_norm": 2.2916953563690186, "learning_rate": 7.969784892446224e-05, "loss": 0.3596, "step": 20350 }, { "epoch": 343.88, "grad_norm": 1.5183382034301758, "learning_rate": 7.967283641820911e-05, "loss": 0.3566, "step": 20375 }, { "epoch": 344.3, "grad_norm": 0.7948129773139954, "learning_rate": 7.964782391195597e-05, "loss": 0.354, "step": 20400 }, { "epoch": 344.73, "grad_norm": 1.0421210527420044, "learning_rate": 7.962281140570286e-05, "loss": 0.3528, "step": 20425 }, { "epoch": 345.15, "grad_norm": 1.1933951377868652, "learning_rate": 7.959779889944973e-05, "loss": 0.356, "step": 20450 }, { "epoch": 345.57, "grad_norm": 1.3854933977127075, "learning_rate": 7.957278639319659e-05, "loss": 0.3514, "step": 20475 }, { "epoch": 345.99, "grad_norm": 0.5547246932983398, "learning_rate": 7.954777388694348e-05, "loss": 0.3548, "step": 20500 }, { "epoch": 346.41, "grad_norm": 0.6879237294197083, "learning_rate": 7.952276138069035e-05, "loss": 0.3544, "step": 20525 }, { "epoch": 346.84, "grad_norm": 0.8372142314910889, "learning_rate": 7.949774887443722e-05, "loss": 0.352, "step": 20550 }, { "epoch": 347.26, "grad_norm": 1.3571369647979736, "learning_rate": 7.94727363681841e-05, "loss": 0.3551, "step": 20575 }, { "epoch": 347.68, "grad_norm": 1.3200024366378784, "learning_rate": 7.944772386193097e-05, "loss": 0.3524, "step": 20600 }, { "epoch": 348.1, "grad_norm": 1.3835551738739014, "learning_rate": 7.942271135567785e-05, "loss": 0.3551, "step": 20625 }, { "epoch": 348.52, "grad_norm": 0.9660641551017761, "learning_rate": 7.939769884942471e-05, "loss": 0.3545, "step": 20650 }, { "epoch": 348.95, "grad_norm": 0.917105495929718, "learning_rate": 7.937268634317159e-05, "loss": 0.3525, "step": 20675 }, { "epoch": 349.37, "grad_norm": 0.7840372920036316, 
"learning_rate": 7.934767383691847e-05, "loss": 0.3533, "step": 20700 }, { "epoch": 349.79, "grad_norm": 1.8720412254333496, "learning_rate": 7.932266133066534e-05, "loss": 0.3568, "step": 20725 }, { "epoch": 350.21, "grad_norm": 1.3614581823349, "learning_rate": 7.92976488244122e-05, "loss": 0.3539, "step": 20750 }, { "epoch": 350.63, "grad_norm": 0.652937114238739, "learning_rate": 7.927263631815909e-05, "loss": 0.3535, "step": 20775 }, { "epoch": 351.05, "grad_norm": 1.2418371438980103, "learning_rate": 7.924762381190596e-05, "loss": 0.353, "step": 20800 }, { "epoch": 351.48, "grad_norm": 1.9765123128890991, "learning_rate": 7.922261130565282e-05, "loss": 0.3553, "step": 20825 }, { "epoch": 351.9, "grad_norm": 1.1545944213867188, "learning_rate": 7.919759879939971e-05, "loss": 0.3585, "step": 20850 }, { "epoch": 352.32, "grad_norm": 2.0497570037841797, "learning_rate": 7.917258629314658e-05, "loss": 0.3535, "step": 20875 }, { "epoch": 352.74, "grad_norm": 1.918831706047058, "learning_rate": 7.914757378689345e-05, "loss": 0.3559, "step": 20900 }, { "epoch": 353.16, "grad_norm": 0.9588321447372437, "learning_rate": 7.912256128064033e-05, "loss": 0.3571, "step": 20925 }, { "epoch": 353.59, "grad_norm": 1.0339457988739014, "learning_rate": 7.90975487743872e-05, "loss": 0.3535, "step": 20950 }, { "epoch": 354.01, "grad_norm": 1.0793962478637695, "learning_rate": 7.907253626813407e-05, "loss": 0.3561, "step": 20975 }, { "epoch": 354.43, "grad_norm": 0.7535501718521118, "learning_rate": 7.904752376188094e-05, "loss": 0.3499, "step": 21000 }, { "epoch": 354.85, "grad_norm": 0.6596577763557434, "learning_rate": 7.902251125562782e-05, "loss": 0.3541, "step": 21025 }, { "epoch": 355.27, "grad_norm": 0.590787947177887, "learning_rate": 7.899749874937469e-05, "loss": 0.3547, "step": 21050 }, { "epoch": 355.7, "grad_norm": 1.3444268703460693, "learning_rate": 7.897248624312157e-05, "loss": 0.356, "step": 21075 }, { "epoch": 356.12, "grad_norm": 2.0330324172973633, "learning_rate": 7.894747373686843e-05, "loss": 0.3548, "step": 21100 }, { "epoch": 356.54, "grad_norm": 1.694438099861145, "learning_rate": 7.89224612306153e-05, "loss": 0.3579, "step": 21125 }, { "epoch": 356.96, "grad_norm": 1.0667181015014648, "learning_rate": 7.889744872436219e-05, "loss": 0.3566, "step": 21150 }, { "epoch": 357.38, "grad_norm": 1.1954700946807861, "learning_rate": 7.887243621810905e-05, "loss": 0.3521, "step": 21175 }, { "epoch": 357.81, "grad_norm": 0.7466026544570923, "learning_rate": 7.884742371185592e-05, "loss": 0.3526, "step": 21200 }, { "epoch": 358.23, "grad_norm": 1.0273816585540771, "learning_rate": 7.882241120560281e-05, "loss": 0.3543, "step": 21225 }, { "epoch": 358.65, "grad_norm": 1.0627511739730835, "learning_rate": 7.879739869934968e-05, "loss": 0.3571, "step": 21250 }, { "epoch": 359.07, "grad_norm": 0.8342685103416443, "learning_rate": 7.877238619309654e-05, "loss": 0.354, "step": 21275 }, { "epoch": 359.49, "grad_norm": 1.7476876974105835, "learning_rate": 7.874737368684343e-05, "loss": 0.3525, "step": 21300 }, { "epoch": 359.92, "grad_norm": 0.9430789947509766, "learning_rate": 7.87223611805903e-05, "loss": 0.3542, "step": 21325 }, { "epoch": 360.34, "grad_norm": 0.7588370442390442, "learning_rate": 7.869734867433717e-05, "loss": 0.3533, "step": 21350 }, { "epoch": 360.76, "grad_norm": 1.0649206638336182, "learning_rate": 7.867233616808405e-05, "loss": 0.3514, "step": 21375 }, { "epoch": 361.18, "grad_norm": 0.8731920123100281, "learning_rate": 7.864732366183092e-05, "loss": 0.3528, "step": 21400 
}, { "epoch": 361.6, "grad_norm": 1.1676230430603027, "learning_rate": 7.862231115557779e-05, "loss": 0.3537, "step": 21425 }, { "epoch": 362.03, "grad_norm": 0.861766517162323, "learning_rate": 7.859729864932466e-05, "loss": 0.3531, "step": 21450 }, { "epoch": 362.45, "grad_norm": 1.6182026863098145, "learning_rate": 7.857228614307154e-05, "loss": 0.3523, "step": 21475 }, { "epoch": 362.87, "grad_norm": 1.333633542060852, "learning_rate": 7.854727363681841e-05, "loss": 0.353, "step": 21500 }, { "epoch": 363.29, "grad_norm": 1.7726867198944092, "learning_rate": 7.852226113056528e-05, "loss": 0.3548, "step": 21525 }, { "epoch": 363.71, "grad_norm": 0.9420325756072998, "learning_rate": 7.849724862431216e-05, "loss": 0.3518, "step": 21550 }, { "epoch": 364.14, "grad_norm": 1.1037617921829224, "learning_rate": 7.847223611805903e-05, "loss": 0.3515, "step": 21575 }, { "epoch": 364.56, "grad_norm": 0.9805176854133606, "learning_rate": 7.844722361180591e-05, "loss": 0.3535, "step": 21600 }, { "epoch": 364.98, "grad_norm": 1.0911164283752441, "learning_rate": 7.842221110555277e-05, "loss": 0.354, "step": 21625 }, { "epoch": 365.4, "grad_norm": 0.8757855296134949, "learning_rate": 7.839719859929965e-05, "loss": 0.352, "step": 21650 }, { "epoch": 365.82, "grad_norm": 1.072257399559021, "learning_rate": 7.837218609304653e-05, "loss": 0.3551, "step": 21675 }, { "epoch": 366.24, "grad_norm": 0.868468165397644, "learning_rate": 7.83471735867934e-05, "loss": 0.3563, "step": 21700 }, { "epoch": 366.67, "grad_norm": 1.1142504215240479, "learning_rate": 7.832216108054026e-05, "loss": 0.3506, "step": 21725 }, { "epoch": 367.09, "grad_norm": 0.7158011794090271, "learning_rate": 7.829714857428715e-05, "loss": 0.3543, "step": 21750 }, { "epoch": 367.51, "grad_norm": 0.9219402074813843, "learning_rate": 7.827213606803402e-05, "loss": 0.3502, "step": 21775 }, { "epoch": 367.93, "grad_norm": 0.822273850440979, "learning_rate": 7.82471235617809e-05, "loss": 0.3518, "step": 21800 }, { "epoch": 368.35, "grad_norm": 0.6698904037475586, "learning_rate": 7.822211105552777e-05, "loss": 0.3532, "step": 21825 }, { "epoch": 368.78, "grad_norm": 1.118901014328003, "learning_rate": 7.819709854927464e-05, "loss": 0.3537, "step": 21850 }, { "epoch": 369.2, "grad_norm": 0.9635286927223206, "learning_rate": 7.817208604302151e-05, "loss": 0.3549, "step": 21875 }, { "epoch": 369.62, "grad_norm": 0.8210596442222595, "learning_rate": 7.814707353676839e-05, "loss": 0.3513, "step": 21900 }, { "epoch": 370.04, "grad_norm": 1.1113007068634033, "learning_rate": 7.812206103051526e-05, "loss": 0.3502, "step": 21925 }, { "epoch": 370.46, "grad_norm": 0.9070974588394165, "learning_rate": 7.809704852426214e-05, "loss": 0.3513, "step": 21950 }, { "epoch": 370.89, "grad_norm": 1.1311630010604858, "learning_rate": 7.8072036018009e-05, "loss": 0.3546, "step": 21975 }, { "epoch": 371.31, "grad_norm": 1.254599928855896, "learning_rate": 7.804702351175588e-05, "loss": 0.3539, "step": 22000 }, { "epoch": 371.73, "grad_norm": 1.189945936203003, "learning_rate": 7.802201100550276e-05, "loss": 0.3512, "step": 22025 }, { "epoch": 372.15, "grad_norm": 1.8577960729599, "learning_rate": 7.799699849924964e-05, "loss": 0.3529, "step": 22050 }, { "epoch": 372.57, "grad_norm": 1.0196740627288818, "learning_rate": 7.79719859929965e-05, "loss": 0.3548, "step": 22075 }, { "epoch": 373.0, "grad_norm": 0.653826117515564, "learning_rate": 7.794697348674338e-05, "loss": 0.3492, "step": 22100 }, { "epoch": 373.42, "grad_norm": 0.9578218460083008, "learning_rate": 
7.792196098049025e-05, "loss": 0.3501, "step": 22125 }, { "epoch": 373.84, "grad_norm": 0.8028138279914856, "learning_rate": 7.789694847423713e-05, "loss": 0.3504, "step": 22150 }, { "epoch": 374.26, "grad_norm": 1.6895277500152588, "learning_rate": 7.7871935967984e-05, "loss": 0.3531, "step": 22175 }, { "epoch": 374.68, "grad_norm": 1.0742213726043701, "learning_rate": 7.784692346173087e-05, "loss": 0.3568, "step": 22200 }, { "epoch": 375.11, "grad_norm": 0.7653369307518005, "learning_rate": 7.782191095547774e-05, "loss": 0.3541, "step": 22225 }, { "epoch": 375.53, "grad_norm": 0.8772682547569275, "learning_rate": 7.779689844922462e-05, "loss": 0.3532, "step": 22250 }, { "epoch": 375.95, "grad_norm": 1.630244493484497, "learning_rate": 7.777188594297149e-05, "loss": 0.3501, "step": 22275 }, { "epoch": 376.37, "grad_norm": 0.8820977210998535, "learning_rate": 7.774687343671836e-05, "loss": 0.354, "step": 22300 }, { "epoch": 376.79, "grad_norm": 1.2248945236206055, "learning_rate": 7.772186093046523e-05, "loss": 0.3609, "step": 22325 }, { "epoch": 377.22, "grad_norm": 1.2037432193756104, "learning_rate": 7.769684842421211e-05, "loss": 0.3554, "step": 22350 }, { "epoch": 377.64, "grad_norm": 0.7431740760803223, "learning_rate": 7.767183591795898e-05, "loss": 0.3542, "step": 22375 }, { "epoch": 378.06, "grad_norm": 0.650550365447998, "learning_rate": 7.764682341170587e-05, "loss": 0.3524, "step": 22400 }, { "epoch": 378.48, "grad_norm": 0.7304168343544006, "learning_rate": 7.762181090545273e-05, "loss": 0.3519, "step": 22425 }, { "epoch": 378.9, "grad_norm": 0.8286672830581665, "learning_rate": 7.75967983991996e-05, "loss": 0.3512, "step": 22450 }, { "epoch": 379.32, "grad_norm": 0.8695940971374512, "learning_rate": 7.757178589294648e-05, "loss": 0.3538, "step": 22475 }, { "epoch": 379.75, "grad_norm": 2.256375312805176, "learning_rate": 7.754677338669336e-05, "loss": 0.3565, "step": 22500 }, { "epoch": 380.17, "grad_norm": 0.8737481236457825, "learning_rate": 7.752176088044022e-05, "loss": 0.3526, "step": 22525 }, { "epoch": 380.59, "grad_norm": 1.0597388744354248, "learning_rate": 7.74967483741871e-05, "loss": 0.3534, "step": 22550 }, { "epoch": 381.01, "grad_norm": 0.8384460806846619, "learning_rate": 7.747173586793397e-05, "loss": 0.3525, "step": 22575 }, { "epoch": 381.43, "grad_norm": 0.8394002318382263, "learning_rate": 7.744672336168083e-05, "loss": 0.3494, "step": 22600 }, { "epoch": 381.86, "grad_norm": 1.5352838039398193, "learning_rate": 7.742171085542772e-05, "loss": 0.3509, "step": 22625 }, { "epoch": 382.28, "grad_norm": 0.7739511132240295, "learning_rate": 7.739669834917459e-05, "loss": 0.3513, "step": 22650 }, { "epoch": 382.7, "grad_norm": 0.8553645014762878, "learning_rate": 7.737168584292147e-05, "loss": 0.3502, "step": 22675 }, { "epoch": 383.12, "grad_norm": 1.0268446207046509, "learning_rate": 7.734667333666834e-05, "loss": 0.3508, "step": 22700 }, { "epoch": 383.54, "grad_norm": 0.7202825546264648, "learning_rate": 7.732166083041521e-05, "loss": 0.352, "step": 22725 }, { "epoch": 383.97, "grad_norm": 0.6662220358848572, "learning_rate": 7.729664832416208e-05, "loss": 0.35, "step": 22750 }, { "epoch": 384.39, "grad_norm": 1.3664321899414062, "learning_rate": 7.727163581790896e-05, "loss": 0.3472, "step": 22775 }, { "epoch": 384.81, "grad_norm": 1.6508921384811401, "learning_rate": 7.724662331165583e-05, "loss": 0.3508, "step": 22800 }, { "epoch": 385.23, "grad_norm": 1.4635356664657593, "learning_rate": 7.72216108054027e-05, "loss": 0.3506, "step": 22825 }, { "epoch": 
385.65, "grad_norm": 1.1121479272842407, "learning_rate": 7.719659829914959e-05, "loss": 0.3516, "step": 22850 }, { "epoch": 386.08, "grad_norm": 0.8379659056663513, "learning_rate": 7.717158579289645e-05, "loss": 0.3507, "step": 22875 }, { "epoch": 386.5, "grad_norm": 0.7516236901283264, "learning_rate": 7.714657328664332e-05, "loss": 0.3516, "step": 22900 }, { "epoch": 386.92, "grad_norm": 0.7105032801628113, "learning_rate": 7.71215607803902e-05, "loss": 0.3525, "step": 22925 }, { "epoch": 387.34, "grad_norm": 0.8189804553985596, "learning_rate": 7.709654827413706e-05, "loss": 0.3497, "step": 22950 }, { "epoch": 387.76, "grad_norm": 0.9634209275245667, "learning_rate": 7.707153576788395e-05, "loss": 0.3473, "step": 22975 }, { "epoch": 388.19, "grad_norm": 1.1896082162857056, "learning_rate": 7.704652326163082e-05, "loss": 0.3516, "step": 23000 }, { "epoch": 388.61, "grad_norm": 1.1155837774276733, "learning_rate": 7.70215107553777e-05, "loss": 0.3506, "step": 23025 }, { "epoch": 389.03, "grad_norm": 1.2098743915557861, "learning_rate": 7.699649824912457e-05, "loss": 0.3538, "step": 23050 }, { "epoch": 389.45, "grad_norm": 1.1511611938476562, "learning_rate": 7.697148574287144e-05, "loss": 0.3495, "step": 23075 }, { "epoch": 389.87, "grad_norm": 0.6323677897453308, "learning_rate": 7.694647323661831e-05, "loss": 0.3497, "step": 23100 }, { "epoch": 390.3, "grad_norm": 0.8068463206291199, "learning_rate": 7.692146073036519e-05, "loss": 0.3511, "step": 23125 }, { "epoch": 390.72, "grad_norm": 0.8583862781524658, "learning_rate": 7.689644822411206e-05, "loss": 0.351, "step": 23150 }, { "epoch": 391.14, "grad_norm": 1.6985912322998047, "learning_rate": 7.687143571785893e-05, "loss": 0.3503, "step": 23175 }, { "epoch": 391.56, "grad_norm": 0.7498642206192017, "learning_rate": 7.684642321160582e-05, "loss": 0.3517, "step": 23200 }, { "epoch": 391.98, "grad_norm": 1.8771913051605225, "learning_rate": 7.682141070535268e-05, "loss": 0.3501, "step": 23225 }, { "epoch": 392.41, "grad_norm": 1.1459307670593262, "learning_rate": 7.679639819909955e-05, "loss": 0.351, "step": 23250 }, { "epoch": 392.83, "grad_norm": 1.3706300258636475, "learning_rate": 7.677138569284644e-05, "loss": 0.3546, "step": 23275 }, { "epoch": 393.25, "grad_norm": 0.742113471031189, "learning_rate": 7.67463731865933e-05, "loss": 0.3507, "step": 23300 }, { "epoch": 393.67, "grad_norm": 0.9109113812446594, "learning_rate": 7.672136068034017e-05, "loss": 0.3532, "step": 23325 }, { "epoch": 394.09, "grad_norm": 0.7976974844932556, "learning_rate": 7.669634817408705e-05, "loss": 0.3498, "step": 23350 }, { "epoch": 394.51, "grad_norm": 0.8725335001945496, "learning_rate": 7.667133566783393e-05, "loss": 0.3507, "step": 23375 }, { "epoch": 394.94, "grad_norm": 1.270808219909668, "learning_rate": 7.664632316158079e-05, "loss": 0.3505, "step": 23400 }, { "epoch": 395.36, "grad_norm": 0.9856877326965332, "learning_rate": 7.662131065532767e-05, "loss": 0.351, "step": 23425 }, { "epoch": 395.78, "grad_norm": 1.3601658344268799, "learning_rate": 7.659629814907454e-05, "loss": 0.3492, "step": 23450 }, { "epoch": 396.2, "grad_norm": 0.7440332174301147, "learning_rate": 7.657128564282142e-05, "loss": 0.3514, "step": 23475 }, { "epoch": 396.62, "grad_norm": 0.7078417539596558, "learning_rate": 7.654627313656829e-05, "loss": 0.3488, "step": 23500 }, { "epoch": 397.05, "grad_norm": 1.236758828163147, "learning_rate": 7.652126063031516e-05, "loss": 0.3502, "step": 23525 }, { "epoch": 397.47, "grad_norm": 1.003463864326477, "learning_rate": 
7.649624812406204e-05, "loss": 0.3538, "step": 23550 }, { "epoch": 397.89, "grad_norm": 0.7717064619064331, "learning_rate": 7.647123561780891e-05, "loss": 0.3496, "step": 23575 }, { "epoch": 398.31, "grad_norm": 1.1512540578842163, "learning_rate": 7.644622311155578e-05, "loss": 0.3517, "step": 23600 }, { "epoch": 398.73, "grad_norm": 2.2473130226135254, "learning_rate": 7.642121060530265e-05, "loss": 0.3506, "step": 23625 }, { "epoch": 399.16, "grad_norm": 0.7197448015213013, "learning_rate": 7.639619809904953e-05, "loss": 0.3506, "step": 23650 }, { "epoch": 399.58, "grad_norm": 0.7906684279441833, "learning_rate": 7.63711855927964e-05, "loss": 0.3508, "step": 23675 }, { "epoch": 400.0, "grad_norm": 1.1543790102005005, "learning_rate": 7.634617308654327e-05, "loss": 0.35, "step": 23700 }, { "epoch": 400.42, "grad_norm": 0.8989548087120056, "learning_rate": 7.632116058029016e-05, "loss": 0.3498, "step": 23725 }, { "epoch": 400.84, "grad_norm": 1.3787543773651123, "learning_rate": 7.629614807403702e-05, "loss": 0.3509, "step": 23750 }, { "epoch": 401.27, "grad_norm": 2.556903600692749, "learning_rate": 7.627113556778389e-05, "loss": 0.3494, "step": 23775 }, { "epoch": 401.69, "grad_norm": 0.7353131771087646, "learning_rate": 7.624612306153078e-05, "loss": 0.3496, "step": 23800 }, { "epoch": 402.11, "grad_norm": 0.6474384069442749, "learning_rate": 7.622111055527765e-05, "loss": 0.3478, "step": 23825 }, { "epoch": 402.53, "grad_norm": 1.209834098815918, "learning_rate": 7.619609804902451e-05, "loss": 0.3505, "step": 23850 }, { "epoch": 402.95, "grad_norm": 0.7472580671310425, "learning_rate": 7.617108554277139e-05, "loss": 0.3478, "step": 23875 }, { "epoch": 403.38, "grad_norm": 0.9473714232444763, "learning_rate": 7.614607303651827e-05, "loss": 0.3476, "step": 23900 }, { "epoch": 403.8, "grad_norm": 0.7292631268501282, "learning_rate": 7.612106053026512e-05, "loss": 0.351, "step": 23925 }, { "epoch": 404.22, "grad_norm": 1.0364242792129517, "learning_rate": 7.609604802401201e-05, "loss": 0.3483, "step": 23950 }, { "epoch": 404.64, "grad_norm": 1.222464680671692, "learning_rate": 7.607103551775888e-05, "loss": 0.3458, "step": 23975 }, { "epoch": 405.06, "grad_norm": 1.0845186710357666, "learning_rate": 7.604602301150576e-05, "loss": 0.3475, "step": 24000 }, { "epoch": 405.49, "grad_norm": 0.7935957908630371, "learning_rate": 7.602101050525263e-05, "loss": 0.3475, "step": 24025 }, { "epoch": 405.91, "grad_norm": 0.7458210587501526, "learning_rate": 7.59959979989995e-05, "loss": 0.347, "step": 24050 }, { "epoch": 406.33, "grad_norm": 1.182937741279602, "learning_rate": 7.597098549274637e-05, "loss": 0.3514, "step": 24075 }, { "epoch": 406.75, "grad_norm": 1.2656177282333374, "learning_rate": 7.594597298649325e-05, "loss": 0.3515, "step": 24100 }, { "epoch": 407.17, "grad_norm": 1.1706290245056152, "learning_rate": 7.592096048024012e-05, "loss": 0.3526, "step": 24125 }, { "epoch": 407.59, "grad_norm": 0.8373987078666687, "learning_rate": 7.589594797398699e-05, "loss": 0.354, "step": 24150 }, { "epoch": 408.02, "grad_norm": 0.9236277937889099, "learning_rate": 7.587093546773388e-05, "loss": 0.3504, "step": 24175 }, { "epoch": 408.44, "grad_norm": 0.7903749346733093, "learning_rate": 7.584592296148074e-05, "loss": 0.3536, "step": 24200 }, { "epoch": 408.86, "grad_norm": 0.8512540459632874, "learning_rate": 7.582091045522762e-05, "loss": 0.3498, "step": 24225 }, { "epoch": 409.28, "grad_norm": 0.957252562046051, "learning_rate": 7.57958979489745e-05, "loss": 0.3528, "step": 24250 }, { "epoch": 
409.7, "grad_norm": 0.7266885638237, "learning_rate": 7.577088544272136e-05, "loss": 0.3507, "step": 24275 }, { "epoch": 410.13, "grad_norm": 0.755660891532898, "learning_rate": 7.574587293646824e-05, "loss": 0.3504, "step": 24300 }, { "epoch": 410.55, "grad_norm": 0.6371527314186096, "learning_rate": 7.572086043021511e-05, "loss": 0.3499, "step": 24325 }, { "epoch": 410.97, "grad_norm": 1.0103071928024292, "learning_rate": 7.569584792396199e-05, "loss": 0.3555, "step": 24350 }, { "epoch": 411.39, "grad_norm": 0.6725526452064514, "learning_rate": 7.567083541770886e-05, "loss": 0.3531, "step": 24375 }, { "epoch": 411.81, "grad_norm": 2.2260618209838867, "learning_rate": 7.564582291145573e-05, "loss": 0.3494, "step": 24400 }, { "epoch": 412.24, "grad_norm": 0.9249132871627808, "learning_rate": 7.56208104052026e-05, "loss": 0.3464, "step": 24425 }, { "epoch": 412.66, "grad_norm": 1.1805235147476196, "learning_rate": 7.559579789894948e-05, "loss": 0.3487, "step": 24450 }, { "epoch": 413.08, "grad_norm": 0.692814290523529, "learning_rate": 7.557078539269635e-05, "loss": 0.3483, "step": 24475 }, { "epoch": 413.5, "grad_norm": 1.8117377758026123, "learning_rate": 7.554577288644322e-05, "loss": 0.3512, "step": 24500 }, { "epoch": 413.92, "grad_norm": 1.1622651815414429, "learning_rate": 7.552076038019011e-05, "loss": 0.3512, "step": 24525 }, { "epoch": 414.35, "grad_norm": 1.6757347583770752, "learning_rate": 7.549574787393697e-05, "loss": 0.3483, "step": 24550 }, { "epoch": 414.77, "grad_norm": 1.9494218826293945, "learning_rate": 7.547073536768384e-05, "loss": 0.3485, "step": 24575 }, { "epoch": 415.19, "grad_norm": 0.8337185382843018, "learning_rate": 7.544572286143073e-05, "loss": 0.3478, "step": 24600 }, { "epoch": 415.61, "grad_norm": 1.034110188484192, "learning_rate": 7.542071035517759e-05, "loss": 0.3461, "step": 24625 }, { "epoch": 416.03, "grad_norm": 1.373704433441162, "learning_rate": 7.539569784892446e-05, "loss": 0.3469, "step": 24650 }, { "epoch": 416.46, "grad_norm": 2.0701894760131836, "learning_rate": 7.537068534267135e-05, "loss": 0.35, "step": 24675 }, { "epoch": 416.88, "grad_norm": 0.7142254114151001, "learning_rate": 7.534567283641822e-05, "loss": 0.3495, "step": 24700 }, { "epoch": 417.3, "grad_norm": 1.1682052612304688, "learning_rate": 7.532066033016508e-05, "loss": 0.3531, "step": 24725 }, { "epoch": 417.72, "grad_norm": 1.1505166292190552, "learning_rate": 7.529564782391196e-05, "loss": 0.3523, "step": 24750 }, { "epoch": 418.14, "grad_norm": 0.8945952653884888, "learning_rate": 7.527063531765884e-05, "loss": 0.3481, "step": 24775 }, { "epoch": 418.57, "grad_norm": 1.270726203918457, "learning_rate": 7.524562281140571e-05, "loss": 0.3512, "step": 24800 }, { "epoch": 418.99, "grad_norm": 1.198731780052185, "learning_rate": 7.522061030515258e-05, "loss": 0.3493, "step": 24825 }, { "epoch": 419.41, "grad_norm": 0.859939694404602, "learning_rate": 7.519559779889945e-05, "loss": 0.3503, "step": 24850 }, { "epoch": 419.83, "grad_norm": 0.8972505927085876, "learning_rate": 7.517058529264633e-05, "loss": 0.3473, "step": 24875 }, { "epoch": 420.25, "grad_norm": 0.8447808027267456, "learning_rate": 7.51455727863932e-05, "loss": 0.3496, "step": 24900 }, { "epoch": 420.68, "grad_norm": 0.7411611080169678, "learning_rate": 7.512056028014007e-05, "loss": 0.3488, "step": 24925 }, { "epoch": 421.1, "grad_norm": 1.642919898033142, "learning_rate": 7.509554777388694e-05, "loss": 0.3498, "step": 24950 }, { "epoch": 421.52, "grad_norm": 0.7631085515022278, "learning_rate": 
7.507053526763382e-05, "loss": 0.3484, "step": 24975 }, { "epoch": 421.94, "grad_norm": 0.6194952726364136, "learning_rate": 7.504552276138069e-05, "loss": 0.3527, "step": 25000 }, { "epoch": 422.36, "grad_norm": 2.585062026977539, "learning_rate": 7.502051025512756e-05, "loss": 0.3486, "step": 25025 }, { "epoch": 422.78, "grad_norm": 1.322021722793579, "learning_rate": 7.499549774887445e-05, "loss": 0.3481, "step": 25050 }, { "epoch": 423.21, "grad_norm": 0.7388932704925537, "learning_rate": 7.497048524262131e-05, "loss": 0.3483, "step": 25075 }, { "epoch": 423.63, "grad_norm": 0.9445899128913879, "learning_rate": 7.494547273636818e-05, "loss": 0.3488, "step": 25100 }, { "epoch": 424.05, "grad_norm": 0.7326875329017639, "learning_rate": 7.492046023011507e-05, "loss": 0.3471, "step": 25125 }, { "epoch": 424.47, "grad_norm": 0.8528832197189331, "learning_rate": 7.489544772386194e-05, "loss": 0.3483, "step": 25150 }, { "epoch": 424.89, "grad_norm": 0.8951069116592407, "learning_rate": 7.48704352176088e-05, "loss": 0.3515, "step": 25175 }, { "epoch": 425.32, "grad_norm": 1.1460777521133423, "learning_rate": 7.484542271135568e-05, "loss": 0.3491, "step": 25200 }, { "epoch": 425.74, "grad_norm": 0.9503458142280579, "learning_rate": 7.482041020510256e-05, "loss": 0.3486, "step": 25225 }, { "epoch": 426.16, "grad_norm": 0.9234389662742615, "learning_rate": 7.479539769884943e-05, "loss": 0.3508, "step": 25250 }, { "epoch": 426.58, "grad_norm": 0.7630634903907776, "learning_rate": 7.47703851925963e-05, "loss": 0.3466, "step": 25275 }, { "epoch": 427.0, "grad_norm": 0.6740624308586121, "learning_rate": 7.474537268634317e-05, "loss": 0.3466, "step": 25300 }, { "epoch": 427.43, "grad_norm": 0.7224277257919312, "learning_rate": 7.472036018009005e-05, "loss": 0.3466, "step": 25325 }, { "epoch": 427.85, "grad_norm": 0.908371090888977, "learning_rate": 7.469534767383692e-05, "loss": 0.348, "step": 25350 }, { "epoch": 428.27, "grad_norm": 1.1080868244171143, "learning_rate": 7.467033516758379e-05, "loss": 0.3489, "step": 25375 }, { "epoch": 428.69, "grad_norm": 0.8420177698135376, "learning_rate": 7.464532266133068e-05, "loss": 0.3448, "step": 25400 }, { "epoch": 429.11, "grad_norm": 0.8518602848052979, "learning_rate": 7.462031015507754e-05, "loss": 0.3476, "step": 25425 }, { "epoch": 429.54, "grad_norm": 0.9447184205055237, "learning_rate": 7.459529764882441e-05, "loss": 0.3486, "step": 25450 }, { "epoch": 429.96, "grad_norm": 1.6682593822479248, "learning_rate": 7.45702851425713e-05, "loss": 0.3462, "step": 25475 }, { "epoch": 430.38, "grad_norm": 0.7236553430557251, "learning_rate": 7.454527263631817e-05, "loss": 0.3478, "step": 25500 }, { "epoch": 430.8, "grad_norm": 1.0580646991729736, "learning_rate": 7.452026013006503e-05, "loss": 0.3463, "step": 25525 }, { "epoch": 431.22, "grad_norm": 0.7624313235282898, "learning_rate": 7.449524762381191e-05, "loss": 0.3442, "step": 25550 }, { "epoch": 431.65, "grad_norm": 1.4893125295639038, "learning_rate": 7.447023511755879e-05, "loss": 0.3488, "step": 25575 }, { "epoch": 432.07, "grad_norm": 0.8386535048484802, "learning_rate": 7.444522261130566e-05, "loss": 0.3463, "step": 25600 }, { "epoch": 432.49, "grad_norm": 0.7559728622436523, "learning_rate": 7.442021010505253e-05, "loss": 0.3475, "step": 25625 }, { "epoch": 432.91, "grad_norm": 2.2599916458129883, "learning_rate": 7.43951975987994e-05, "loss": 0.3528, "step": 25650 }, { "epoch": 433.33, "grad_norm": 1.3162493705749512, "learning_rate": 7.437018509254628e-05, "loss": 0.3456, "step": 25675 }, { 
"epoch": 433.76, "grad_norm": 0.6015697717666626, "learning_rate": 7.434517258629315e-05, "loss": 0.3457, "step": 25700 }, { "epoch": 434.18, "grad_norm": 0.8339949250221252, "learning_rate": 7.432016008004002e-05, "loss": 0.3545, "step": 25725 }, { "epoch": 434.6, "grad_norm": 1.8403244018554688, "learning_rate": 7.42951475737869e-05, "loss": 0.3468, "step": 25750 }, { "epoch": 435.02, "grad_norm": 0.9677891135215759, "learning_rate": 7.427013506753377e-05, "loss": 0.3464, "step": 25775 }, { "epoch": 435.44, "grad_norm": 0.6871969699859619, "learning_rate": 7.424512256128064e-05, "loss": 0.348, "step": 25800 }, { "epoch": 435.86, "grad_norm": 0.7382087111473083, "learning_rate": 7.422011005502751e-05, "loss": 0.3482, "step": 25825 }, { "epoch": 436.29, "grad_norm": 1.1275956630706787, "learning_rate": 7.41950975487744e-05, "loss": 0.3494, "step": 25850 }, { "epoch": 436.71, "grad_norm": 1.573341727256775, "learning_rate": 7.417008504252126e-05, "loss": 0.3451, "step": 25875 }, { "epoch": 437.13, "grad_norm": 0.8995547890663147, "learning_rate": 7.414507253626813e-05, "loss": 0.3454, "step": 25900 }, { "epoch": 437.55, "grad_norm": 1.1728099584579468, "learning_rate": 7.412006003001502e-05, "loss": 0.3475, "step": 25925 }, { "epoch": 437.97, "grad_norm": 0.7371329069137573, "learning_rate": 7.409504752376189e-05, "loss": 0.3454, "step": 25950 }, { "epoch": 438.4, "grad_norm": 1.9534075260162354, "learning_rate": 7.407003501750875e-05, "loss": 0.3493, "step": 25975 }, { "epoch": 438.82, "grad_norm": 0.702603280544281, "learning_rate": 7.404502251125564e-05, "loss": 0.3491, "step": 26000 }, { "epoch": 439.24, "grad_norm": 1.1245557069778442, "learning_rate": 7.402001000500251e-05, "loss": 0.3477, "step": 26025 }, { "epoch": 439.66, "grad_norm": 1.8109171390533447, "learning_rate": 7.399499749874937e-05, "loss": 0.3491, "step": 26050 }, { "epoch": 440.08, "grad_norm": 0.9915460348129272, "learning_rate": 7.396998499249625e-05, "loss": 0.3503, "step": 26075 }, { "epoch": 440.51, "grad_norm": 0.618349552154541, "learning_rate": 7.394497248624313e-05, "loss": 0.3474, "step": 26100 }, { "epoch": 440.93, "grad_norm": 1.0291370153427124, "learning_rate": 7.391995997999e-05, "loss": 0.3479, "step": 26125 }, { "epoch": 441.35, "grad_norm": 1.2550098896026611, "learning_rate": 7.389494747373687e-05, "loss": 0.3521, "step": 26150 }, { "epoch": 441.77, "grad_norm": 0.7164062857627869, "learning_rate": 7.386993496748374e-05, "loss": 0.3494, "step": 26175 }, { "epoch": 442.19, "grad_norm": 2.092674732208252, "learning_rate": 7.384492246123062e-05, "loss": 0.3514, "step": 26200 }, { "epoch": 442.62, "grad_norm": 0.7797991633415222, "learning_rate": 7.381990995497749e-05, "loss": 0.3452, "step": 26225 }, { "epoch": 443.04, "grad_norm": 1.0066286325454712, "learning_rate": 7.379489744872436e-05, "loss": 0.348, "step": 26250 }, { "epoch": 443.46, "grad_norm": 0.6428666114807129, "learning_rate": 7.376988494247124e-05, "loss": 0.3448, "step": 26275 }, { "epoch": 443.88, "grad_norm": 2.1024796962738037, "learning_rate": 7.374487243621812e-05, "loss": 0.3476, "step": 26300 }, { "epoch": 444.3, "grad_norm": 1.4155186414718628, "learning_rate": 7.371985992996498e-05, "loss": 0.3478, "step": 26325 }, { "epoch": 444.73, "grad_norm": 1.0554453134536743, "learning_rate": 7.369484742371185e-05, "loss": 0.3446, "step": 26350 }, { "epoch": 445.15, "grad_norm": 0.630517840385437, "learning_rate": 7.366983491745874e-05, "loss": 0.3452, "step": 26375 }, { "epoch": 445.57, "grad_norm": 1.1623594760894775, "learning_rate": 
7.36448224112056e-05, "loss": 0.3469, "step": 26400 }, { "epoch": 445.99, "grad_norm": 0.6913239359855652, "learning_rate": 7.361980990495247e-05, "loss": 0.3452, "step": 26425 }, { "epoch": 446.41, "grad_norm": 0.7332262992858887, "learning_rate": 7.359479739869936e-05, "loss": 0.3482, "step": 26450 }, { "epoch": 446.84, "grad_norm": 0.6684566140174866, "learning_rate": 7.356978489244623e-05, "loss": 0.3438, "step": 26475 }, { "epoch": 447.26, "grad_norm": 1.5724890232086182, "learning_rate": 7.354477238619309e-05, "loss": 0.3469, "step": 26500 }, { "epoch": 447.68, "grad_norm": 1.081941843032837, "learning_rate": 7.351975987993998e-05, "loss": 0.3479, "step": 26525 }, { "epoch": 448.1, "grad_norm": 1.2339354753494263, "learning_rate": 7.349474737368685e-05, "loss": 0.3489, "step": 26550 }, { "epoch": 448.52, "grad_norm": 0.8950616121292114, "learning_rate": 7.346973486743372e-05, "loss": 0.3477, "step": 26575 }, { "epoch": 448.95, "grad_norm": 0.8539718985557556, "learning_rate": 7.34447223611806e-05, "loss": 0.3471, "step": 26600 }, { "epoch": 449.37, "grad_norm": 2.4147543907165527, "learning_rate": 7.341970985492747e-05, "loss": 0.3462, "step": 26625 }, { "epoch": 449.79, "grad_norm": 0.7275307178497314, "learning_rate": 7.339469734867435e-05, "loss": 0.3488, "step": 26650 }, { "epoch": 450.21, "grad_norm": 0.7583906054496765, "learning_rate": 7.336968484242121e-05, "loss": 0.3444, "step": 26675 }, { "epoch": 450.63, "grad_norm": 0.8080020546913147, "learning_rate": 7.334467233616808e-05, "loss": 0.3438, "step": 26700 }, { "epoch": 451.05, "grad_norm": 1.099989891052246, "learning_rate": 7.331965982991497e-05, "loss": 0.3496, "step": 26725 }, { "epoch": 451.48, "grad_norm": 1.3805567026138306, "learning_rate": 7.329464732366183e-05, "loss": 0.3462, "step": 26750 }, { "epoch": 451.9, "grad_norm": 1.597778558731079, "learning_rate": 7.32696348174087e-05, "loss": 0.3454, "step": 26775 }, { "epoch": 452.32, "grad_norm": 0.7864617705345154, "learning_rate": 7.324462231115559e-05, "loss": 0.3449, "step": 26800 }, { "epoch": 452.74, "grad_norm": 1.312760353088379, "learning_rate": 7.321960980490246e-05, "loss": 0.347, "step": 26825 }, { "epoch": 453.16, "grad_norm": 0.9649230241775513, "learning_rate": 7.319459729864932e-05, "loss": 0.3458, "step": 26850 }, { "epoch": 453.59, "grad_norm": 2.123690605163574, "learning_rate": 7.31695847923962e-05, "loss": 0.3438, "step": 26875 }, { "epoch": 454.01, "grad_norm": 0.5515177249908447, "learning_rate": 7.314457228614308e-05, "loss": 0.3494, "step": 26900 }, { "epoch": 454.43, "grad_norm": 1.092910885810852, "learning_rate": 7.311955977988995e-05, "loss": 0.3474, "step": 26925 }, { "epoch": 454.85, "grad_norm": 0.9788797497749329, "learning_rate": 7.309454727363682e-05, "loss": 0.3463, "step": 26950 }, { "epoch": 455.27, "grad_norm": 0.9070157408714294, "learning_rate": 7.30695347673837e-05, "loss": 0.348, "step": 26975 }, { "epoch": 455.7, "grad_norm": 1.222312092781067, "learning_rate": 7.304452226113057e-05, "loss": 0.3455, "step": 27000 }, { "epoch": 456.12, "grad_norm": 0.7826572060585022, "learning_rate": 7.301950975487744e-05, "loss": 0.3449, "step": 27025 }, { "epoch": 456.54, "grad_norm": 0.5664150714874268, "learning_rate": 7.299449724862431e-05, "loss": 0.3432, "step": 27050 }, { "epoch": 456.96, "grad_norm": 1.144474744796753, "learning_rate": 7.296948474237119e-05, "loss": 0.3488, "step": 27075 }, { "epoch": 457.38, "grad_norm": 1.2186014652252197, "learning_rate": 7.294447223611806e-05, "loss": 0.3464, "step": 27100 }, { "epoch": 
457.81, "grad_norm": 0.5577250719070435, "learning_rate": 7.291945972986493e-05, "loss": 0.3458, "step": 27125 }, { "epoch": 458.23, "grad_norm": 0.7322788834571838, "learning_rate": 7.28944472236118e-05, "loss": 0.3444, "step": 27150 }, { "epoch": 458.65, "grad_norm": 0.8182567358016968, "learning_rate": 7.286943471735869e-05, "loss": 0.3444, "step": 27175 }, { "epoch": 459.07, "grad_norm": 0.8245006203651428, "learning_rate": 7.284442221110555e-05, "loss": 0.3444, "step": 27200 }, { "epoch": 459.49, "grad_norm": 0.9580527544021606, "learning_rate": 7.281940970485242e-05, "loss": 0.344, "step": 27225 }, { "epoch": 459.92, "grad_norm": 0.6516417860984802, "learning_rate": 7.279439719859931e-05, "loss": 0.3468, "step": 27250 }, { "epoch": 460.34, "grad_norm": 0.8485473990440369, "learning_rate": 7.276938469234618e-05, "loss": 0.3491, "step": 27275 }, { "epoch": 460.76, "grad_norm": 1.5644819736480713, "learning_rate": 7.274437218609304e-05, "loss": 0.346, "step": 27300 }, { "epoch": 461.18, "grad_norm": 0.8502500057220459, "learning_rate": 7.271935967983993e-05, "loss": 0.3439, "step": 27325 }, { "epoch": 461.6, "grad_norm": 0.991686999797821, "learning_rate": 7.269534767383693e-05, "loss": 0.3433, "step": 27350 }, { "epoch": 462.03, "grad_norm": 0.7654645442962646, "learning_rate": 7.26703351675838e-05, "loss": 0.3429, "step": 27375 }, { "epoch": 462.45, "grad_norm": 0.7543670535087585, "learning_rate": 7.264532266133067e-05, "loss": 0.3456, "step": 27400 }, { "epoch": 462.87, "grad_norm": 1.0972864627838135, "learning_rate": 7.262031015507754e-05, "loss": 0.3464, "step": 27425 }, { "epoch": 463.29, "grad_norm": 0.78289794921875, "learning_rate": 7.259629814907454e-05, "loss": 0.3458, "step": 27450 }, { "epoch": 463.71, "grad_norm": 1.2184090614318848, "learning_rate": 7.257128564282141e-05, "loss": 0.3453, "step": 27475 }, { "epoch": 464.14, "grad_norm": 0.6328375935554504, "learning_rate": 7.254627313656829e-05, "loss": 0.349, "step": 27500 }, { "epoch": 464.56, "grad_norm": 0.6412923336029053, "learning_rate": 7.252126063031516e-05, "loss": 0.3461, "step": 27525 }, { "epoch": 464.98, "grad_norm": 1.419706106185913, "learning_rate": 7.249624812406203e-05, "loss": 0.3454, "step": 27550 }, { "epoch": 465.4, "grad_norm": 0.7411277890205383, "learning_rate": 7.24712356178089e-05, "loss": 0.3474, "step": 27575 }, { "epoch": 465.82, "grad_norm": 1.253311038017273, "learning_rate": 7.244622311155578e-05, "loss": 0.3479, "step": 27600 }, { "epoch": 466.24, "grad_norm": 0.6767467856407166, "learning_rate": 7.242121060530266e-05, "loss": 0.3463, "step": 27625 }, { "epoch": 466.67, "grad_norm": 0.9300587177276611, "learning_rate": 7.239619809904952e-05, "loss": 0.3429, "step": 27650 }, { "epoch": 467.09, "grad_norm": 1.1075583696365356, "learning_rate": 7.23711855927964e-05, "loss": 0.345, "step": 27675 }, { "epoch": 467.51, "grad_norm": 1.3273155689239502, "learning_rate": 7.234617308654328e-05, "loss": 0.3458, "step": 27700 }, { "epoch": 467.93, "grad_norm": 1.16497802734375, "learning_rate": 7.232116058029015e-05, "loss": 0.3477, "step": 27725 }, { "epoch": 468.35, "grad_norm": 1.0642740726470947, "learning_rate": 7.229614807403701e-05, "loss": 0.3436, "step": 27750 }, { "epoch": 468.78, "grad_norm": 0.9870076179504395, "learning_rate": 7.22711355677839e-05, "loss": 0.3441, "step": 27775 }, { "epoch": 469.2, "grad_norm": 0.6433194279670715, "learning_rate": 7.224612306153077e-05, "loss": 0.345, "step": 27800 }, { "epoch": 469.62, "grad_norm": 1.2803293466567993, "learning_rate": 
7.222111055527763e-05, "loss": 0.3438, "step": 27825 }, { "epoch": 470.04, "grad_norm": 0.8786343336105347, "learning_rate": 7.219609804902452e-05, "loss": 0.3467, "step": 27850 }, { "epoch": 470.46, "grad_norm": 0.8384103775024414, "learning_rate": 7.217108554277139e-05, "loss": 0.3459, "step": 27875 }, { "epoch": 470.89, "grad_norm": 0.6292517185211182, "learning_rate": 7.214607303651826e-05, "loss": 0.3469, "step": 27900 }, { "epoch": 471.31, "grad_norm": 0.7226274609565735, "learning_rate": 7.212106053026514e-05, "loss": 0.3453, "step": 27925 }, { "epoch": 471.73, "grad_norm": 1.099734902381897, "learning_rate": 7.209604802401201e-05, "loss": 0.3436, "step": 27950 }, { "epoch": 472.15, "grad_norm": 1.2019050121307373, "learning_rate": 7.207103551775888e-05, "loss": 0.3441, "step": 27975 }, { "epoch": 472.57, "grad_norm": 0.8609626293182373, "learning_rate": 7.204602301150575e-05, "loss": 0.3485, "step": 28000 }, { "epoch": 473.0, "grad_norm": 0.8070003986358643, "learning_rate": 7.202101050525263e-05, "loss": 0.3442, "step": 28025 }, { "epoch": 473.42, "grad_norm": 0.7357681393623352, "learning_rate": 7.19959979989995e-05, "loss": 0.3412, "step": 28050 }, { "epoch": 473.84, "grad_norm": 0.7148904800415039, "learning_rate": 7.197098549274639e-05, "loss": 0.3436, "step": 28075 }, { "epoch": 474.26, "grad_norm": 1.1431502103805542, "learning_rate": 7.194597298649324e-05, "loss": 0.3454, "step": 28100 }, { "epoch": 474.68, "grad_norm": 3.0286049842834473, "learning_rate": 7.192096048024012e-05, "loss": 0.3454, "step": 28125 }, { "epoch": 475.11, "grad_norm": 1.5722769498825073, "learning_rate": 7.1895947973987e-05, "loss": 0.3437, "step": 28150 }, { "epoch": 475.53, "grad_norm": 1.0324088335037231, "learning_rate": 7.187093546773386e-05, "loss": 0.3424, "step": 28175 }, { "epoch": 475.95, "grad_norm": 0.8513792157173157, "learning_rate": 7.184592296148074e-05, "loss": 0.347, "step": 28200 }, { "epoch": 476.37, "grad_norm": 0.9819296598434448, "learning_rate": 7.182091045522762e-05, "loss": 0.343, "step": 28225 }, { "epoch": 476.79, "grad_norm": 1.2247273921966553, "learning_rate": 7.17958979489745e-05, "loss": 0.3459, "step": 28250 }, { "epoch": 477.22, "grad_norm": 1.0204728841781616, "learning_rate": 7.177088544272137e-05, "loss": 0.3453, "step": 28275 }, { "epoch": 477.64, "grad_norm": 0.6648063063621521, "learning_rate": 7.174587293646824e-05, "loss": 0.3462, "step": 28300 }, { "epoch": 478.06, "grad_norm": 0.8275918960571289, "learning_rate": 7.172086043021511e-05, "loss": 0.344, "step": 28325 }, { "epoch": 478.48, "grad_norm": 1.0491819381713867, "learning_rate": 7.169584792396198e-05, "loss": 0.3439, "step": 28350 }, { "epoch": 478.9, "grad_norm": 1.2293540239334106, "learning_rate": 7.167083541770886e-05, "loss": 0.3437, "step": 28375 }, { "epoch": 479.32, "grad_norm": 0.8363987803459167, "learning_rate": 7.164582291145573e-05, "loss": 0.3443, "step": 28400 }, { "epoch": 479.75, "grad_norm": 1.0335724353790283, "learning_rate": 7.162081040520262e-05, "loss": 0.3466, "step": 28425 }, { "epoch": 480.17, "grad_norm": 1.0848238468170166, "learning_rate": 7.159579789894948e-05, "loss": 0.3426, "step": 28450 }, { "epoch": 480.59, "grad_norm": 0.7144072651863098, "learning_rate": 7.157078539269635e-05, "loss": 0.3424, "step": 28475 }, { "epoch": 481.01, "grad_norm": 0.8419021368026733, "learning_rate": 7.154577288644323e-05, "loss": 0.3432, "step": 28500 }, { "epoch": 481.43, "grad_norm": 0.9820536971092224, "learning_rate": 7.15207603801901e-05, "loss": 0.3411, "step": 28525 }, { 
"epoch": 481.86, "grad_norm": 0.49246159195899963, "learning_rate": 7.149574787393697e-05, "loss": 0.3407, "step": 28550 }, { "epoch": 482.28, "grad_norm": 0.9092519283294678, "learning_rate": 7.147073536768385e-05, "loss": 0.3442, "step": 28575 }, { "epoch": 482.7, "grad_norm": 0.7360043525695801, "learning_rate": 7.144572286143072e-05, "loss": 0.3465, "step": 28600 }, { "epoch": 483.12, "grad_norm": 1.4036813974380493, "learning_rate": 7.142071035517758e-05, "loss": 0.3426, "step": 28625 }, { "epoch": 483.54, "grad_norm": 1.4704846143722534, "learning_rate": 7.139569784892447e-05, "loss": 0.3433, "step": 28650 }, { "epoch": 483.97, "grad_norm": 0.7687681913375854, "learning_rate": 7.137068534267134e-05, "loss": 0.3422, "step": 28675 }, { "epoch": 484.39, "grad_norm": 1.1586804389953613, "learning_rate": 7.134567283641822e-05, "loss": 0.3418, "step": 28700 }, { "epoch": 484.81, "grad_norm": 0.7006085515022278, "learning_rate": 7.132066033016509e-05, "loss": 0.3427, "step": 28725 }, { "epoch": 485.23, "grad_norm": 1.1023319959640503, "learning_rate": 7.129564782391196e-05, "loss": 0.3437, "step": 28750 }, { "epoch": 485.65, "grad_norm": 0.7163932919502258, "learning_rate": 7.127063531765883e-05, "loss": 0.3432, "step": 28775 }, { "epoch": 486.08, "grad_norm": 0.7462373971939087, "learning_rate": 7.12456228114057e-05, "loss": 0.3421, "step": 28800 }, { "epoch": 486.5, "grad_norm": 0.8255560994148254, "learning_rate": 7.122061030515258e-05, "loss": 0.3427, "step": 28825 }, { "epoch": 486.92, "grad_norm": 0.9154626131057739, "learning_rate": 7.119559779889945e-05, "loss": 0.3452, "step": 28850 }, { "epoch": 487.34, "grad_norm": 1.038651704788208, "learning_rate": 7.117058529264632e-05, "loss": 0.3418, "step": 28875 }, { "epoch": 487.76, "grad_norm": 1.0090618133544922, "learning_rate": 7.11455727863932e-05, "loss": 0.3497, "step": 28900 }, { "epoch": 488.19, "grad_norm": 0.8017352819442749, "learning_rate": 7.112056028014007e-05, "loss": 0.3443, "step": 28925 }, { "epoch": 488.61, "grad_norm": 1.1634854078292847, "learning_rate": 7.109554777388696e-05, "loss": 0.3445, "step": 28950 }, { "epoch": 489.03, "grad_norm": 0.6503928303718567, "learning_rate": 7.107053526763381e-05, "loss": 0.3421, "step": 28975 }, { "epoch": 489.45, "grad_norm": 1.2449018955230713, "learning_rate": 7.104552276138069e-05, "loss": 0.342, "step": 29000 }, { "epoch": 489.87, "grad_norm": 0.6309882998466492, "learning_rate": 7.102051025512757e-05, "loss": 0.3438, "step": 29025 }, { "epoch": 490.3, "grad_norm": 0.9536778330802917, "learning_rate": 7.099549774887445e-05, "loss": 0.3445, "step": 29050 }, { "epoch": 490.72, "grad_norm": 1.489776849746704, "learning_rate": 7.09704852426213e-05, "loss": 0.3433, "step": 29075 }, { "epoch": 491.14, "grad_norm": 1.0106620788574219, "learning_rate": 7.094547273636819e-05, "loss": 0.3476, "step": 29100 }, { "epoch": 491.56, "grad_norm": 0.9181020259857178, "learning_rate": 7.092046023011506e-05, "loss": 0.3441, "step": 29125 }, { "epoch": 491.98, "grad_norm": 0.8944607973098755, "learning_rate": 7.089544772386192e-05, "loss": 0.3429, "step": 29150 }, { "epoch": 492.41, "grad_norm": 0.7976635098457336, "learning_rate": 7.087043521760881e-05, "loss": 0.3439, "step": 29175 }, { "epoch": 492.83, "grad_norm": 1.0343310832977295, "learning_rate": 7.084542271135568e-05, "loss": 0.3415, "step": 29200 }, { "epoch": 493.25, "grad_norm": 1.6515558958053589, "learning_rate": 7.082041020510255e-05, "loss": 0.3448, "step": 29225 }, { "epoch": 493.67, "grad_norm": 0.711284339427948, 
"learning_rate": 7.079539769884943e-05, "loss": 0.3448, "step": 29250 }, { "epoch": 494.09, "grad_norm": 0.5497289896011353, "learning_rate": 7.07703851925963e-05, "loss": 0.3431, "step": 29275 }, { "epoch": 494.51, "grad_norm": 1.828613042831421, "learning_rate": 7.074537268634317e-05, "loss": 0.3438, "step": 29300 }, { "epoch": 494.94, "grad_norm": 0.8029699921607971, "learning_rate": 7.072036018009005e-05, "loss": 0.3428, "step": 29325 }, { "epoch": 495.36, "grad_norm": 1.4300602674484253, "learning_rate": 7.069534767383692e-05, "loss": 0.3448, "step": 29350 }, { "epoch": 495.78, "grad_norm": 0.6828479766845703, "learning_rate": 7.067033516758379e-05, "loss": 0.3416, "step": 29375 }, { "epoch": 496.2, "grad_norm": 0.5957367420196533, "learning_rate": 7.064532266133068e-05, "loss": 0.3433, "step": 29400 }, { "epoch": 496.62, "grad_norm": 0.708058774471283, "learning_rate": 7.062031015507754e-05, "loss": 0.3457, "step": 29425 }, { "epoch": 497.05, "grad_norm": 1.1386007070541382, "learning_rate": 7.059529764882442e-05, "loss": 0.3429, "step": 29450 }, { "epoch": 497.47, "grad_norm": 0.9724351167678833, "learning_rate": 7.05702851425713e-05, "loss": 0.3465, "step": 29475 }, { "epoch": 497.89, "grad_norm": 0.7893127799034119, "learning_rate": 7.054527263631815e-05, "loss": 0.3416, "step": 29500 }, { "epoch": 498.31, "grad_norm": 0.6965900659561157, "learning_rate": 7.052026013006504e-05, "loss": 0.3421, "step": 29525 }, { "epoch": 498.73, "grad_norm": 0.8730905055999756, "learning_rate": 7.049524762381191e-05, "loss": 0.3437, "step": 29550 }, { "epoch": 499.16, "grad_norm": 1.3555381298065186, "learning_rate": 7.047023511755879e-05, "loss": 0.3427, "step": 29575 }, { "epoch": 499.58, "grad_norm": 0.8411984443664551, "learning_rate": 7.044522261130566e-05, "loss": 0.3407, "step": 29600 }, { "epoch": 500.0, "grad_norm": 0.7559008002281189, "learning_rate": 7.042021010505253e-05, "loss": 0.3425, "step": 29625 }, { "epoch": 500.42, "grad_norm": 2.1265361309051514, "learning_rate": 7.03951975987994e-05, "loss": 0.3436, "step": 29650 }, { "epoch": 500.84, "grad_norm": 1.3348963260650635, "learning_rate": 7.037018509254628e-05, "loss": 0.3418, "step": 29675 }, { "epoch": 501.27, "grad_norm": 0.6080963611602783, "learning_rate": 7.034517258629315e-05, "loss": 0.3424, "step": 29700 }, { "epoch": 501.69, "grad_norm": 0.8030952215194702, "learning_rate": 7.032016008004002e-05, "loss": 0.3424, "step": 29725 }, { "epoch": 502.11, "grad_norm": 0.9044283628463745, "learning_rate": 7.029514757378691e-05, "loss": 0.3454, "step": 29750 }, { "epoch": 502.53, "grad_norm": 2.8454859256744385, "learning_rate": 7.027113556778389e-05, "loss": 0.3434, "step": 29775 }, { "epoch": 502.95, "grad_norm": 0.8920661211013794, "learning_rate": 7.024612306153076e-05, "loss": 0.3446, "step": 29800 }, { "epoch": 503.38, "grad_norm": 0.7959532141685486, "learning_rate": 7.022111055527765e-05, "loss": 0.3426, "step": 29825 }, { "epoch": 503.8, "grad_norm": 0.7092351913452148, "learning_rate": 7.019609804902451e-05, "loss": 0.3413, "step": 29850 }, { "epoch": 504.22, "grad_norm": 0.9638544917106628, "learning_rate": 7.017108554277138e-05, "loss": 0.3419, "step": 29875 }, { "epoch": 504.64, "grad_norm": 0.8417400121688843, "learning_rate": 7.014607303651827e-05, "loss": 0.3435, "step": 29900 }, { "epoch": 505.06, "grad_norm": 0.9539660215377808, "learning_rate": 7.012106053026514e-05, "loss": 0.341, "step": 29925 }, { "epoch": 505.49, "grad_norm": 0.8168984055519104, "learning_rate": 7.0096048024012e-05, "loss": 0.3425, "step": 
29950 }, { "epoch": 505.91, "grad_norm": 0.8086312413215637, "learning_rate": 7.007103551775889e-05, "loss": 0.3454, "step": 29975 }, { "epoch": 506.33, "grad_norm": 0.6626878380775452, "learning_rate": 7.004602301150576e-05, "loss": 0.3438, "step": 30000 }, { "epoch": 506.33, "eval_loss": 0.456908643245697, "eval_runtime": 3.9371, "eval_samples_per_second": 78.484, "eval_steps_per_second": 2.54, "step": 30000 }, { "epoch": 506.75, "grad_norm": 1.1000964641571045, "learning_rate": 7.002101050525263e-05, "loss": 0.3411, "step": 30025 }, { "epoch": 507.17, "grad_norm": 0.7044007182121277, "learning_rate": 6.99959979989995e-05, "loss": 0.3409, "step": 30050 }, { "epoch": 507.59, "grad_norm": 2.188924551010132, "learning_rate": 6.997098549274638e-05, "loss": 0.3407, "step": 30075 }, { "epoch": 508.02, "grad_norm": 1.273219108581543, "learning_rate": 6.994597298649325e-05, "loss": 0.3441, "step": 30100 }, { "epoch": 508.44, "grad_norm": 0.9229422807693481, "learning_rate": 6.992096048024012e-05, "loss": 0.3457, "step": 30125 }, { "epoch": 508.86, "grad_norm": 1.3884137868881226, "learning_rate": 6.9895947973987e-05, "loss": 0.3425, "step": 30150 }, { "epoch": 509.28, "grad_norm": 0.7064346671104431, "learning_rate": 6.987093546773387e-05, "loss": 0.3404, "step": 30175 }, { "epoch": 509.7, "grad_norm": 0.6624059677124023, "learning_rate": 6.984592296148074e-05, "loss": 0.3412, "step": 30200 }, { "epoch": 510.13, "grad_norm": 0.9021841287612915, "learning_rate": 6.982091045522761e-05, "loss": 0.3435, "step": 30225 }, { "epoch": 510.55, "grad_norm": 0.7052242755889893, "learning_rate": 6.979589794897449e-05, "loss": 0.3428, "step": 30250 }, { "epoch": 510.97, "grad_norm": 0.7692121267318726, "learning_rate": 6.977088544272137e-05, "loss": 0.341, "step": 30275 }, { "epoch": 511.39, "grad_norm": 0.7966721057891846, "learning_rate": 6.974587293646823e-05, "loss": 0.341, "step": 30300 }, { "epoch": 511.81, "grad_norm": 0.7105217576026917, "learning_rate": 6.97208604302151e-05, "loss": 0.338, "step": 30325 }, { "epoch": 512.24, "grad_norm": 0.7257682681083679, "learning_rate": 6.969584792396199e-05, "loss": 0.3447, "step": 30350 }, { "epoch": 512.66, "grad_norm": 0.6014304161071777, "learning_rate": 6.967083541770886e-05, "loss": 0.3403, "step": 30375 }, { "epoch": 513.08, "grad_norm": 0.6871097683906555, "learning_rate": 6.964582291145572e-05, "loss": 0.3396, "step": 30400 }, { "epoch": 513.5, "grad_norm": 0.8703510761260986, "learning_rate": 6.962081040520261e-05, "loss": 0.3395, "step": 30425 }, { "epoch": 513.92, "grad_norm": 0.7840629816055298, "learning_rate": 6.959579789894948e-05, "loss": 0.3408, "step": 30450 }, { "epoch": 514.35, "grad_norm": 1.2237838506698608, "learning_rate": 6.957078539269635e-05, "loss": 0.3434, "step": 30475 }, { "epoch": 514.77, "grad_norm": 0.5721182227134705, "learning_rate": 6.954577288644323e-05, "loss": 0.3437, "step": 30500 }, { "epoch": 515.19, "grad_norm": 0.8388057947158813, "learning_rate": 6.95207603801901e-05, "loss": 0.3424, "step": 30525 }, { "epoch": 515.61, "grad_norm": 0.8748851418495178, "learning_rate": 6.949574787393697e-05, "loss": 0.3416, "step": 30550 }, { "epoch": 516.03, "grad_norm": 0.6280677914619446, "learning_rate": 6.947073536768384e-05, "loss": 0.3401, "step": 30575 }, { "epoch": 516.46, "grad_norm": 2.2619357109069824, "learning_rate": 6.944572286143072e-05, "loss": 0.3455, "step": 30600 }, { "epoch": 516.88, "grad_norm": 1.2499990463256836, "learning_rate": 6.942071035517759e-05, "loss": 0.3406, "step": 30625 }, { "epoch": 517.3, 
"grad_norm": 0.7758705615997314, "learning_rate": 6.939569784892446e-05, "loss": 0.3412, "step": 30650 }, { "epoch": 517.72, "grad_norm": 1.0110139846801758, "learning_rate": 6.937068534267133e-05, "loss": 0.3401, "step": 30675 }, { "epoch": 518.14, "grad_norm": 0.858392059803009, "learning_rate": 6.934567283641821e-05, "loss": 0.344, "step": 30700 }, { "epoch": 518.57, "grad_norm": 0.7145842909812927, "learning_rate": 6.93206603301651e-05, "loss": 0.3446, "step": 30725 }, { "epoch": 518.99, "grad_norm": 0.7288022637367249, "learning_rate": 6.929564782391195e-05, "loss": 0.3395, "step": 30750 }, { "epoch": 519.41, "grad_norm": 0.775013267993927, "learning_rate": 6.927063531765883e-05, "loss": 0.3408, "step": 30775 }, { "epoch": 519.83, "grad_norm": 1.1766911745071411, "learning_rate": 6.924562281140571e-05, "loss": 0.3427, "step": 30800 }, { "epoch": 520.25, "grad_norm": 1.5686672925949097, "learning_rate": 6.922061030515258e-05, "loss": 0.344, "step": 30825 }, { "epoch": 520.68, "grad_norm": 0.937619149684906, "learning_rate": 6.919559779889944e-05, "loss": 0.3429, "step": 30850 }, { "epoch": 521.1, "grad_norm": 0.9144926071166992, "learning_rate": 6.917058529264633e-05, "loss": 0.3453, "step": 30875 }, { "epoch": 521.52, "grad_norm": 0.6888349056243896, "learning_rate": 6.91455727863932e-05, "loss": 0.3401, "step": 30900 }, { "epoch": 521.94, "grad_norm": 0.8494904041290283, "learning_rate": 6.912056028014007e-05, "loss": 0.3419, "step": 30925 }, { "epoch": 522.36, "grad_norm": 0.6665810346603394, "learning_rate": 6.909554777388695e-05, "loss": 0.3413, "step": 30950 }, { "epoch": 522.78, "grad_norm": 0.8109012246131897, "learning_rate": 6.907053526763382e-05, "loss": 0.3437, "step": 30975 }, { "epoch": 523.21, "grad_norm": 0.9691833853721619, "learning_rate": 6.904552276138069e-05, "loss": 0.3414, "step": 31000 }, { "epoch": 523.63, "grad_norm": 0.5574228763580322, "learning_rate": 6.902051025512757e-05, "loss": 0.3409, "step": 31025 }, { "epoch": 524.05, "grad_norm": 1.331937313079834, "learning_rate": 6.899549774887444e-05, "loss": 0.3395, "step": 31050 }, { "epoch": 524.47, "grad_norm": 0.6329209804534912, "learning_rate": 6.897048524262132e-05, "loss": 0.3435, "step": 31075 }, { "epoch": 524.89, "grad_norm": 0.5089101195335388, "learning_rate": 6.894547273636818e-05, "loss": 0.3431, "step": 31100 }, { "epoch": 525.32, "grad_norm": 1.0673028230667114, "learning_rate": 6.892046023011506e-05, "loss": 0.339, "step": 31125 }, { "epoch": 525.74, "grad_norm": 0.8072704672813416, "learning_rate": 6.889544772386194e-05, "loss": 0.341, "step": 31150 }, { "epoch": 526.16, "grad_norm": 0.8207008242607117, "learning_rate": 6.887043521760881e-05, "loss": 0.3422, "step": 31175 }, { "epoch": 526.58, "grad_norm": 1.4817074537277222, "learning_rate": 6.884542271135567e-05, "loss": 0.3403, "step": 31200 }, { "epoch": 527.0, "grad_norm": 0.6983245611190796, "learning_rate": 6.882041020510256e-05, "loss": 0.3418, "step": 31225 }, { "epoch": 527.43, "grad_norm": 1.0061943531036377, "learning_rate": 6.879539769884943e-05, "loss": 0.3437, "step": 31250 }, { "epoch": 527.85, "grad_norm": 0.7658777236938477, "learning_rate": 6.877038519259629e-05, "loss": 0.3414, "step": 31275 }, { "epoch": 528.27, "grad_norm": 1.3661540746688843, "learning_rate": 6.874537268634318e-05, "loss": 0.3457, "step": 31300 }, { "epoch": 528.69, "grad_norm": 1.806186556816101, "learning_rate": 6.872036018009005e-05, "loss": 0.3433, "step": 31325 }, { "epoch": 529.11, "grad_norm": 0.6236609220504761, "learning_rate": 
6.869534767383692e-05, "loss": 0.3399, "step": 31350 }, { "epoch": 529.54, "grad_norm": 1.7417633533477783, "learning_rate": 6.86703351675838e-05, "loss": 0.3453, "step": 31375 }, { "epoch": 529.96, "grad_norm": 0.6702211499214172, "learning_rate": 6.864532266133067e-05, "loss": 0.3406, "step": 31400 }, { "epoch": 530.38, "grad_norm": 0.9189289212226868, "learning_rate": 6.862031015507754e-05, "loss": 0.3412, "step": 31425 }, { "epoch": 530.8, "grad_norm": 0.7000417709350586, "learning_rate": 6.859529764882441e-05, "loss": 0.3437, "step": 31450 }, { "epoch": 531.22, "grad_norm": 1.5933595895767212, "learning_rate": 6.857028514257129e-05, "loss": 0.3395, "step": 31475 }, { "epoch": 531.65, "grad_norm": 0.7347859144210815, "learning_rate": 6.854527263631816e-05, "loss": 0.3444, "step": 31500 }, { "epoch": 532.07, "grad_norm": 0.9339766502380371, "learning_rate": 6.852026013006505e-05, "loss": 0.3428, "step": 31525 }, { "epoch": 532.49, "grad_norm": 1.0176125764846802, "learning_rate": 6.84952476238119e-05, "loss": 0.3413, "step": 31550 }, { "epoch": 532.91, "grad_norm": 0.8498849868774414, "learning_rate": 6.847023511755878e-05, "loss": 0.341, "step": 31575 }, { "epoch": 533.33, "grad_norm": 0.809094250202179, "learning_rate": 6.844522261130566e-05, "loss": 0.3409, "step": 31600 }, { "epoch": 533.76, "grad_norm": 0.8556786179542542, "learning_rate": 6.842021010505252e-05, "loss": 0.3418, "step": 31625 }, { "epoch": 534.18, "grad_norm": 1.3783509731292725, "learning_rate": 6.83951975987994e-05, "loss": 0.345, "step": 31650 }, { "epoch": 534.6, "grad_norm": 0.7995312213897705, "learning_rate": 6.837018509254628e-05, "loss": 0.3412, "step": 31675 }, { "epoch": 535.02, "grad_norm": 0.6847935914993286, "learning_rate": 6.834517258629315e-05, "loss": 0.3414, "step": 31700 }, { "epoch": 535.44, "grad_norm": 1.8557566404342651, "learning_rate": 6.832016008004001e-05, "loss": 0.3406, "step": 31725 }, { "epoch": 535.86, "grad_norm": 1.7112691402435303, "learning_rate": 6.82951475737869e-05, "loss": 0.3422, "step": 31750 }, { "epoch": 536.29, "grad_norm": 0.6920132637023926, "learning_rate": 6.827013506753377e-05, "loss": 0.342, "step": 31775 }, { "epoch": 536.71, "grad_norm": 0.7069850564002991, "learning_rate": 6.824512256128064e-05, "loss": 0.3406, "step": 31800 }, { "epoch": 537.13, "grad_norm": 0.647543728351593, "learning_rate": 6.822011005502752e-05, "loss": 0.3409, "step": 31825 }, { "epoch": 537.55, "grad_norm": 0.875781774520874, "learning_rate": 6.819509754877439e-05, "loss": 0.3407, "step": 31850 }, { "epoch": 537.97, "grad_norm": 0.8960343599319458, "learning_rate": 6.817008504252126e-05, "loss": 0.344, "step": 31875 }, { "epoch": 538.4, "grad_norm": 0.9137542843818665, "learning_rate": 6.814507253626814e-05, "loss": 0.3408, "step": 31900 }, { "epoch": 538.82, "grad_norm": 1.1292606592178345, "learning_rate": 6.812006003001501e-05, "loss": 0.3462, "step": 31925 }, { "epoch": 539.24, "grad_norm": 1.1129761934280396, "learning_rate": 6.809504752376188e-05, "loss": 0.3427, "step": 31950 }, { "epoch": 539.66, "grad_norm": 0.5662965774536133, "learning_rate": 6.807003501750875e-05, "loss": 0.3423, "step": 31975 }, { "epoch": 540.08, "grad_norm": 0.9578555226325989, "learning_rate": 6.804502251125563e-05, "loss": 0.3425, "step": 32000 }, { "epoch": 540.51, "grad_norm": 1.463629961013794, "learning_rate": 6.80200100050025e-05, "loss": 0.3422, "step": 32025 }, { "epoch": 540.93, "grad_norm": 0.7364199757575989, "learning_rate": 6.799499749874938e-05, "loss": 0.3381, "step": 32050 }, { "epoch": 
541.35, "grad_norm": 0.9504633545875549, "learning_rate": 6.796998499249624e-05, "loss": 0.3443, "step": 32075 }, { "epoch": 541.77, "grad_norm": 0.9967548847198486, "learning_rate": 6.794497248624313e-05, "loss": 0.3397, "step": 32100 }, { "epoch": 542.19, "grad_norm": 0.6801573634147644, "learning_rate": 6.791995997999e-05, "loss": 0.3438, "step": 32125 }, { "epoch": 542.62, "grad_norm": 1.1596285104751587, "learning_rate": 6.789494747373688e-05, "loss": 0.3427, "step": 32150 }, { "epoch": 543.04, "grad_norm": 0.881472647190094, "learning_rate": 6.786993496748375e-05, "loss": 0.3411, "step": 32175 }, { "epoch": 543.46, "grad_norm": 0.8397149443626404, "learning_rate": 6.784492246123062e-05, "loss": 0.3382, "step": 32200 }, { "epoch": 543.88, "grad_norm": 0.6837425827980042, "learning_rate": 6.78199099549775e-05, "loss": 0.3422, "step": 32225 }, { "epoch": 544.3, "grad_norm": 1.192498803138733, "learning_rate": 6.779489744872437e-05, "loss": 0.3402, "step": 32250 }, { "epoch": 544.73, "grad_norm": 1.0894124507904053, "learning_rate": 6.776988494247124e-05, "loss": 0.3414, "step": 32275 }, { "epoch": 545.15, "grad_norm": 0.8591002821922302, "learning_rate": 6.774487243621811e-05, "loss": 0.3417, "step": 32300 }, { "epoch": 545.57, "grad_norm": 1.1633388996124268, "learning_rate": 6.771985992996498e-05, "loss": 0.3424, "step": 32325 }, { "epoch": 545.99, "grad_norm": 0.9420775771141052, "learning_rate": 6.769484742371186e-05, "loss": 0.3452, "step": 32350 }, { "epoch": 546.41, "grad_norm": 1.085902452468872, "learning_rate": 6.766983491745873e-05, "loss": 0.3401, "step": 32375 }, { "epoch": 546.84, "grad_norm": 0.6947980523109436, "learning_rate": 6.764482241120562e-05, "loss": 0.3406, "step": 32400 }, { "epoch": 547.26, "grad_norm": 0.5836385488510132, "learning_rate": 6.761980990495247e-05, "loss": 0.3415, "step": 32425 }, { "epoch": 547.68, "grad_norm": 1.2648353576660156, "learning_rate": 6.759479739869935e-05, "loss": 0.3409, "step": 32450 }, { "epoch": 548.1, "grad_norm": 0.8536266088485718, "learning_rate": 6.756978489244623e-05, "loss": 0.339, "step": 32475 }, { "epoch": 548.52, "grad_norm": 0.5980303287506104, "learning_rate": 6.75447723861931e-05, "loss": 0.3391, "step": 32500 }, { "epoch": 548.95, "grad_norm": 0.7351404428482056, "learning_rate": 6.751975987993997e-05, "loss": 0.3381, "step": 32525 }, { "epoch": 549.37, "grad_norm": 0.7535175681114197, "learning_rate": 6.749474737368685e-05, "loss": 0.3412, "step": 32550 }, { "epoch": 549.79, "grad_norm": 0.8227791786193848, "learning_rate": 6.746973486743372e-05, "loss": 0.3428, "step": 32575 }, { "epoch": 550.21, "grad_norm": 1.2879763841629028, "learning_rate": 6.74447223611806e-05, "loss": 0.3404, "step": 32600 }, { "epoch": 550.63, "grad_norm": 0.6368476748466492, "learning_rate": 6.741970985492747e-05, "loss": 0.3423, "step": 32625 }, { "epoch": 551.05, "grad_norm": 1.1966255903244019, "learning_rate": 6.739469734867434e-05, "loss": 0.3416, "step": 32650 }, { "epoch": 551.48, "grad_norm": 0.7135118842124939, "learning_rate": 6.736968484242121e-05, "loss": 0.3388, "step": 32675 }, { "epoch": 551.9, "grad_norm": 0.7546285390853882, "learning_rate": 6.734467233616809e-05, "loss": 0.3385, "step": 32700 }, { "epoch": 552.32, "grad_norm": 0.7926015853881836, "learning_rate": 6.731965982991496e-05, "loss": 0.3388, "step": 32725 }, { "epoch": 552.74, "grad_norm": 2.283576011657715, "learning_rate": 6.729464732366183e-05, "loss": 0.3405, "step": 32750 }, { "epoch": 553.16, "grad_norm": 1.3942337036132812, "learning_rate": 
6.72696348174087e-05, "loss": 0.3411, "step": 32775 }, { "epoch": 553.59, "grad_norm": 0.515426754951477, "learning_rate": 6.724462231115558e-05, "loss": 0.3395, "step": 32800 }, { "epoch": 554.01, "grad_norm": 0.6492491960525513, "learning_rate": 6.721960980490245e-05, "loss": 0.3386, "step": 32825 }, { "epoch": 554.43, "grad_norm": 1.293582558631897, "learning_rate": 6.719459729864934e-05, "loss": 0.3403, "step": 32850 }, { "epoch": 554.85, "grad_norm": 1.0767749547958374, "learning_rate": 6.71695847923962e-05, "loss": 0.3407, "step": 32875 }, { "epoch": 555.27, "grad_norm": 0.8864172101020813, "learning_rate": 6.714457228614307e-05, "loss": 0.3396, "step": 32900 }, { "epoch": 555.7, "grad_norm": 0.5895919799804688, "learning_rate": 6.711955977988995e-05, "loss": 0.3413, "step": 32925 }, { "epoch": 556.12, "grad_norm": 0.9285296201705933, "learning_rate": 6.709554777388695e-05, "loss": 0.3385, "step": 32950 }, { "epoch": 556.54, "grad_norm": 0.9765423536300659, "learning_rate": 6.707053526763381e-05, "loss": 0.3418, "step": 32975 }, { "epoch": 556.96, "grad_norm": 0.6705697178840637, "learning_rate": 6.70455227613807e-05, "loss": 0.3402, "step": 33000 }, { "epoch": 557.38, "grad_norm": 0.5522589683532715, "learning_rate": 6.702051025512757e-05, "loss": 0.3364, "step": 33025 }, { "epoch": 557.81, "grad_norm": 1.5293906927108765, "learning_rate": 6.699549774887443e-05, "loss": 0.3415, "step": 33050 }, { "epoch": 558.23, "grad_norm": 0.7968081831932068, "learning_rate": 6.697048524262132e-05, "loss": 0.3397, "step": 33075 }, { "epoch": 558.65, "grad_norm": 0.647320032119751, "learning_rate": 6.694547273636819e-05, "loss": 0.3381, "step": 33100 }, { "epoch": 559.07, "grad_norm": 0.6968383193016052, "learning_rate": 6.692046023011506e-05, "loss": 0.337, "step": 33125 }, { "epoch": 559.49, "grad_norm": 0.6038305759429932, "learning_rate": 6.689544772386193e-05, "loss": 0.3385, "step": 33150 }, { "epoch": 559.92, "grad_norm": 0.7903083562850952, "learning_rate": 6.687043521760881e-05, "loss": 0.34, "step": 33175 }, { "epoch": 560.34, "grad_norm": 0.778022289276123, "learning_rate": 6.684542271135568e-05, "loss": 0.3401, "step": 33200 }, { "epoch": 560.76, "grad_norm": 0.8600345253944397, "learning_rate": 6.682041020510255e-05, "loss": 0.3425, "step": 33225 }, { "epoch": 561.18, "grad_norm": 1.0592252016067505, "learning_rate": 6.679539769884942e-05, "loss": 0.3378, "step": 33250 }, { "epoch": 561.6, "grad_norm": 0.8037461638450623, "learning_rate": 6.67703851925963e-05, "loss": 0.3387, "step": 33275 }, { "epoch": 562.03, "grad_norm": 0.6939969062805176, "learning_rate": 6.674537268634318e-05, "loss": 0.3376, "step": 33300 }, { "epoch": 562.45, "grad_norm": 1.5102074146270752, "learning_rate": 6.672036018009004e-05, "loss": 0.3395, "step": 33325 }, { "epoch": 562.87, "grad_norm": 0.6127755641937256, "learning_rate": 6.669534767383692e-05, "loss": 0.3366, "step": 33350 }, { "epoch": 563.29, "grad_norm": 1.0411701202392578, "learning_rate": 6.66703351675838e-05, "loss": 0.3427, "step": 33375 }, { "epoch": 563.71, "grad_norm": 0.6837058067321777, "learning_rate": 6.664532266133066e-05, "loss": 0.3406, "step": 33400 }, { "epoch": 564.14, "grad_norm": 0.6834747791290283, "learning_rate": 6.662031015507753e-05, "loss": 0.3382, "step": 33425 }, { "epoch": 564.56, "grad_norm": 1.846628189086914, "learning_rate": 6.659529764882442e-05, "loss": 0.3384, "step": 33450 }, { "epoch": 564.98, "grad_norm": 0.6468905806541443, "learning_rate": 6.657028514257129e-05, "loss": 0.3397, "step": 33475 }, { "epoch": 
565.4, "grad_norm": 0.6426169872283936, "learning_rate": 6.654527263631816e-05, "loss": 0.3418, "step": 33500 }, { "epoch": 565.82, "grad_norm": 0.6368653774261475, "learning_rate": 6.652026013006504e-05, "loss": 0.334, "step": 33525 }, { "epoch": 566.24, "grad_norm": 1.74380362033844, "learning_rate": 6.649524762381191e-05, "loss": 0.339, "step": 33550 }, { "epoch": 566.67, "grad_norm": 0.7097263932228088, "learning_rate": 6.647023511755878e-05, "loss": 0.3377, "step": 33575 }, { "epoch": 567.09, "grad_norm": 0.7277435064315796, "learning_rate": 6.644522261130566e-05, "loss": 0.3363, "step": 33600 }, { "epoch": 567.51, "grad_norm": 0.6395201086997986, "learning_rate": 6.642021010505253e-05, "loss": 0.3367, "step": 33625 }, { "epoch": 567.93, "grad_norm": 0.7567791938781738, "learning_rate": 6.639519759879941e-05, "loss": 0.3388, "step": 33650 }, { "epoch": 568.35, "grad_norm": 0.7046810984611511, "learning_rate": 6.637018509254627e-05, "loss": 0.3391, "step": 33675 }, { "epoch": 568.78, "grad_norm": 0.6516266465187073, "learning_rate": 6.634517258629315e-05, "loss": 0.3402, "step": 33700 }, { "epoch": 569.2, "grad_norm": 0.9617276191711426, "learning_rate": 6.632016008004003e-05, "loss": 0.3385, "step": 33725 }, { "epoch": 569.62, "grad_norm": 0.7392562031745911, "learning_rate": 6.629514757378689e-05, "loss": 0.3374, "step": 33750 }, { "epoch": 570.04, "grad_norm": 0.9317861795425415, "learning_rate": 6.627013506753376e-05, "loss": 0.339, "step": 33775 }, { "epoch": 570.46, "grad_norm": 0.7316109538078308, "learning_rate": 6.624512256128065e-05, "loss": 0.3361, "step": 33800 }, { "epoch": 570.89, "grad_norm": 1.202375888824463, "learning_rate": 6.622011005502752e-05, "loss": 0.3399, "step": 33825 }, { "epoch": 571.31, "grad_norm": 0.9978744387626648, "learning_rate": 6.619509754877438e-05, "loss": 0.3384, "step": 33850 }, { "epoch": 571.73, "grad_norm": 1.0897681713104248, "learning_rate": 6.617008504252127e-05, "loss": 0.3444, "step": 33875 }, { "epoch": 572.15, "grad_norm": 1.3599427938461304, "learning_rate": 6.614507253626814e-05, "loss": 0.3363, "step": 33900 }, { "epoch": 572.57, "grad_norm": 0.9392386674880981, "learning_rate": 6.612006003001501e-05, "loss": 0.3386, "step": 33925 }, { "epoch": 573.0, "grad_norm": 1.0890257358551025, "learning_rate": 6.609504752376189e-05, "loss": 0.3426, "step": 33950 }, { "epoch": 573.42, "grad_norm": 1.0148147344589233, "learning_rate": 6.607003501750876e-05, "loss": 0.3409, "step": 33975 }, { "epoch": 573.84, "grad_norm": 0.632052481174469, "learning_rate": 6.604502251125563e-05, "loss": 0.3386, "step": 34000 }, { "epoch": 574.26, "grad_norm": 0.7889218330383301, "learning_rate": 6.60200100050025e-05, "loss": 0.3369, "step": 34025 }, { "epoch": 574.68, "grad_norm": 0.6896076202392578, "learning_rate": 6.599499749874938e-05, "loss": 0.3388, "step": 34050 }, { "epoch": 575.11, "grad_norm": 0.8707833886146545, "learning_rate": 6.596998499249625e-05, "loss": 0.3398, "step": 34075 }, { "epoch": 575.53, "grad_norm": 0.8455397486686707, "learning_rate": 6.594497248624312e-05, "loss": 0.3389, "step": 34100 }, { "epoch": 575.95, "grad_norm": 0.8498758673667908, "learning_rate": 6.591995997999e-05, "loss": 0.3372, "step": 34125 }, { "epoch": 576.37, "grad_norm": 0.7174918055534363, "learning_rate": 6.589494747373687e-05, "loss": 0.3356, "step": 34150 }, { "epoch": 576.79, "grad_norm": 0.7529613375663757, "learning_rate": 6.586993496748375e-05, "loss": 0.3362, "step": 34175 }, { "epoch": 577.22, "grad_norm": 1.5480111837387085, "learning_rate": 
6.584492246123061e-05, "loss": 0.3391, "step": 34200 }, { "epoch": 577.64, "grad_norm": 0.6244324445724487, "learning_rate": 6.581990995497749e-05, "loss": 0.3444, "step": 34225 }, { "epoch": 578.06, "grad_norm": 0.8089304566383362, "learning_rate": 6.579489744872437e-05, "loss": 0.3374, "step": 34250 }, { "epoch": 578.48, "grad_norm": 0.7356294989585876, "learning_rate": 6.576988494247124e-05, "loss": 0.3379, "step": 34275 }, { "epoch": 578.9, "grad_norm": 0.8512611985206604, "learning_rate": 6.57448724362181e-05, "loss": 0.3383, "step": 34300 }, { "epoch": 579.32, "grad_norm": 0.9146127700805664, "learning_rate": 6.571985992996499e-05, "loss": 0.3373, "step": 34325 }, { "epoch": 579.75, "grad_norm": 0.9658306837081909, "learning_rate": 6.569484742371186e-05, "loss": 0.3386, "step": 34350 }, { "epoch": 580.17, "grad_norm": 0.7263357639312744, "learning_rate": 6.566983491745872e-05, "loss": 0.3401, "step": 34375 }, { "epoch": 580.59, "grad_norm": 0.8097271919250488, "learning_rate": 6.564482241120561e-05, "loss": 0.3391, "step": 34400 }, { "epoch": 581.01, "grad_norm": 0.7183808088302612, "learning_rate": 6.561980990495248e-05, "loss": 0.339, "step": 34425 }, { "epoch": 581.43, "grad_norm": 0.6586313247680664, "learning_rate": 6.559479739869935e-05, "loss": 0.337, "step": 34450 }, { "epoch": 581.86, "grad_norm": 1.879492163658142, "learning_rate": 6.556978489244623e-05, "loss": 0.3356, "step": 34475 }, { "epoch": 582.28, "grad_norm": 0.8550243377685547, "learning_rate": 6.55447723861931e-05, "loss": 0.3392, "step": 34500 }, { "epoch": 582.7, "grad_norm": 1.023224115371704, "learning_rate": 6.551975987993997e-05, "loss": 0.3388, "step": 34525 }, { "epoch": 583.12, "grad_norm": 1.3497540950775146, "learning_rate": 6.549474737368684e-05, "loss": 0.3382, "step": 34550 }, { "epoch": 583.54, "grad_norm": 0.6649655699729919, "learning_rate": 6.546973486743372e-05, "loss": 0.3394, "step": 34575 }, { "epoch": 583.97, "grad_norm": 0.8057993054389954, "learning_rate": 6.544472236118059e-05, "loss": 0.3369, "step": 34600 }, { "epoch": 584.39, "grad_norm": 0.6757055521011353, "learning_rate": 6.541970985492747e-05, "loss": 0.3337, "step": 34625 }, { "epoch": 584.81, "grad_norm": 0.684173583984375, "learning_rate": 6.539469734867433e-05, "loss": 0.3393, "step": 34650 }, { "epoch": 585.23, "grad_norm": 0.7044485807418823, "learning_rate": 6.53696848424212e-05, "loss": 0.3414, "step": 34675 }, { "epoch": 585.65, "grad_norm": 0.8414024710655212, "learning_rate": 6.534467233616809e-05, "loss": 0.3375, "step": 34700 }, { "epoch": 586.08, "grad_norm": 0.7063931822776794, "learning_rate": 6.531965982991495e-05, "loss": 0.3374, "step": 34725 }, { "epoch": 586.5, "grad_norm": 0.7179595232009888, "learning_rate": 6.529464732366184e-05, "loss": 0.3387, "step": 34750 }, { "epoch": 586.92, "grad_norm": 1.063738226890564, "learning_rate": 6.526963481740871e-05, "loss": 0.3387, "step": 34775 }, { "epoch": 587.34, "grad_norm": 0.5038682222366333, "learning_rate": 6.524462231115558e-05, "loss": 0.336, "step": 34800 }, { "epoch": 587.76, "grad_norm": 1.0449435710906982, "learning_rate": 6.521960980490246e-05, "loss": 0.335, "step": 34825 }, { "epoch": 588.19, "grad_norm": 1.1155606508255005, "learning_rate": 6.519459729864933e-05, "loss": 0.3389, "step": 34850 }, { "epoch": 588.61, "grad_norm": 0.8447085022926331, "learning_rate": 6.51695847923962e-05, "loss": 0.3383, "step": 34875 }, { "epoch": 589.03, "grad_norm": 1.4619184732437134, "learning_rate": 6.514457228614307e-05, "loss": 0.3372, "step": 34900 }, { "epoch": 
589.45, "grad_norm": 0.7480948567390442, "learning_rate": 6.511955977988995e-05, "loss": 0.3384, "step": 34925 }, { "epoch": 589.87, "grad_norm": 1.1654319763183594, "learning_rate": 6.509454727363682e-05, "loss": 0.3375, "step": 34950 }, { "epoch": 590.3, "grad_norm": 2.2086071968078613, "learning_rate": 6.50695347673837e-05, "loss": 0.3472, "step": 34975 }, { "epoch": 590.72, "grad_norm": 0.9646621346473694, "learning_rate": 6.504452226113056e-05, "loss": 0.3388, "step": 35000 }, { "epoch": 591.14, "grad_norm": 0.9167196154594421, "learning_rate": 6.501950975487744e-05, "loss": 0.3411, "step": 35025 }, { "epoch": 591.56, "grad_norm": 0.8082008957862854, "learning_rate": 6.499449724862432e-05, "loss": 0.3358, "step": 35050 }, { "epoch": 591.98, "grad_norm": 1.171654224395752, "learning_rate": 6.496948474237118e-05, "loss": 0.3396, "step": 35075 }, { "epoch": 592.41, "grad_norm": 1.05853271484375, "learning_rate": 6.494447223611806e-05, "loss": 0.3389, "step": 35100 }, { "epoch": 592.83, "grad_norm": 0.7572023868560791, "learning_rate": 6.491945972986494e-05, "loss": 0.3376, "step": 35125 }, { "epoch": 593.25, "grad_norm": 1.065799355506897, "learning_rate": 6.489444722361181e-05, "loss": 0.3387, "step": 35150 }, { "epoch": 593.67, "grad_norm": 0.7906450033187866, "learning_rate": 6.48704352176088e-05, "loss": 0.3373, "step": 35175 }, { "epoch": 594.09, "grad_norm": 1.0465545654296875, "learning_rate": 6.484542271135569e-05, "loss": 0.3401, "step": 35200 }, { "epoch": 594.51, "grad_norm": 1.086724877357483, "learning_rate": 6.482041020510256e-05, "loss": 0.3389, "step": 35225 }, { "epoch": 594.94, "grad_norm": 0.7371805310249329, "learning_rate": 6.479539769884943e-05, "loss": 0.3368, "step": 35250 }, { "epoch": 595.36, "grad_norm": 0.8596795201301575, "learning_rate": 6.47703851925963e-05, "loss": 0.3365, "step": 35275 }, { "epoch": 595.78, "grad_norm": 0.5421687364578247, "learning_rate": 6.474537268634318e-05, "loss": 0.3418, "step": 35300 }, { "epoch": 596.2, "grad_norm": 0.7373797297477722, "learning_rate": 6.472036018009005e-05, "loss": 0.3377, "step": 35325 }, { "epoch": 596.62, "grad_norm": 0.6927942633628845, "learning_rate": 6.469534767383692e-05, "loss": 0.3385, "step": 35350 }, { "epoch": 597.05, "grad_norm": 0.7038558125495911, "learning_rate": 6.46703351675838e-05, "loss": 0.3376, "step": 35375 }, { "epoch": 597.47, "grad_norm": 0.7141275405883789, "learning_rate": 6.464532266133067e-05, "loss": 0.3356, "step": 35400 }, { "epoch": 597.89, "grad_norm": 1.1170212030410767, "learning_rate": 6.462031015507754e-05, "loss": 0.3365, "step": 35425 }, { "epoch": 598.31, "grad_norm": 1.4306530952453613, "learning_rate": 6.459529764882441e-05, "loss": 0.3381, "step": 35450 }, { "epoch": 598.73, "grad_norm": 0.8636388182640076, "learning_rate": 6.457028514257128e-05, "loss": 0.3372, "step": 35475 }, { "epoch": 599.16, "grad_norm": 0.8073099255561829, "learning_rate": 6.454527263631817e-05, "loss": 0.3377, "step": 35500 }, { "epoch": 599.58, "grad_norm": 0.7793017029762268, "learning_rate": 6.452026013006503e-05, "loss": 0.338, "step": 35525 }, { "epoch": 600.0, "grad_norm": 0.9898437857627869, "learning_rate": 6.44952476238119e-05, "loss": 0.3371, "step": 35550 }, { "epoch": 600.42, "grad_norm": 1.5062886476516724, "learning_rate": 6.447023511755879e-05, "loss": 0.3357, "step": 35575 }, { "epoch": 600.84, "grad_norm": 1.0383638143539429, "learning_rate": 6.444522261130566e-05, "loss": 0.337, "step": 35600 }, { "epoch": 601.27, "grad_norm": 0.7490107417106628, "learning_rate": 
6.442021010505252e-05, "loss": 0.3351, "step": 35625 }, { "epoch": 601.69, "grad_norm": 1.25948166847229, "learning_rate": 6.43951975987994e-05, "loss": 0.339, "step": 35650 }, { "epoch": 602.11, "grad_norm": 0.6261009573936462, "learning_rate": 6.437018509254628e-05, "loss": 0.3356, "step": 35675 }, { "epoch": 602.53, "grad_norm": 0.8432815670967102, "learning_rate": 6.434517258629315e-05, "loss": 0.3382, "step": 35700 }, { "epoch": 602.95, "grad_norm": 0.7696906924247742, "learning_rate": 6.432016008004002e-05, "loss": 0.3386, "step": 35725 }, { "epoch": 603.38, "grad_norm": 0.8120480179786682, "learning_rate": 6.42951475737869e-05, "loss": 0.3407, "step": 35750 }, { "epoch": 603.8, "grad_norm": 0.7904940843582153, "learning_rate": 6.427013506753377e-05, "loss": 0.3398, "step": 35775 }, { "epoch": 604.22, "grad_norm": 0.8209044933319092, "learning_rate": 6.424512256128064e-05, "loss": 0.3395, "step": 35800 }, { "epoch": 604.64, "grad_norm": 0.8756649494171143, "learning_rate": 6.422011005502752e-05, "loss": 0.3392, "step": 35825 }, { "epoch": 605.06, "grad_norm": 0.6818459033966064, "learning_rate": 6.419509754877439e-05, "loss": 0.337, "step": 35850 }, { "epoch": 605.49, "grad_norm": 0.6980791687965393, "learning_rate": 6.417008504252126e-05, "loss": 0.3398, "step": 35875 }, { "epoch": 605.91, "grad_norm": 1.1855535507202148, "learning_rate": 6.414507253626813e-05, "loss": 0.3369, "step": 35900 }, { "epoch": 606.33, "grad_norm": 0.6705577969551086, "learning_rate": 6.4120060030015e-05, "loss": 0.3364, "step": 35925 }, { "epoch": 606.75, "grad_norm": 0.8017124533653259, "learning_rate": 6.409504752376189e-05, "loss": 0.339, "step": 35950 }, { "epoch": 607.17, "grad_norm": 0.6904909014701843, "learning_rate": 6.407003501750875e-05, "loss": 0.337, "step": 35975 }, { "epoch": 607.59, "grad_norm": 0.7090895175933838, "learning_rate": 6.404502251125562e-05, "loss": 0.3375, "step": 36000 }, { "epoch": 608.02, "grad_norm": 0.6796347498893738, "learning_rate": 6.402001000500251e-05, "loss": 0.3377, "step": 36025 }, { "epoch": 608.44, "grad_norm": 0.7823631167411804, "learning_rate": 6.399499749874938e-05, "loss": 0.336, "step": 36050 }, { "epoch": 608.86, "grad_norm": 0.8033694624900818, "learning_rate": 6.396998499249624e-05, "loss": 0.3377, "step": 36075 }, { "epoch": 609.28, "grad_norm": 0.8108952641487122, "learning_rate": 6.394497248624313e-05, "loss": 0.3385, "step": 36100 }, { "epoch": 609.7, "grad_norm": 1.8784852027893066, "learning_rate": 6.391995997999e-05, "loss": 0.3357, "step": 36125 }, { "epoch": 610.13, "grad_norm": 0.821980357170105, "learning_rate": 6.389494747373687e-05, "loss": 0.3377, "step": 36150 }, { "epoch": 610.55, "grad_norm": 0.599168062210083, "learning_rate": 6.386993496748375e-05, "loss": 0.3423, "step": 36175 }, { "epoch": 610.97, "grad_norm": 0.7002922296524048, "learning_rate": 6.384492246123062e-05, "loss": 0.3361, "step": 36200 }, { "epoch": 611.39, "grad_norm": 1.7148188352584839, "learning_rate": 6.381990995497749e-05, "loss": 0.3367, "step": 36225 }, { "epoch": 611.81, "grad_norm": 0.5494518876075745, "learning_rate": 6.379489744872436e-05, "loss": 0.3409, "step": 36250 }, { "epoch": 612.24, "grad_norm": 1.060511827468872, "learning_rate": 6.376988494247124e-05, "loss": 0.34, "step": 36275 }, { "epoch": 612.66, "grad_norm": 1.1141198873519897, "learning_rate": 6.374487243621812e-05, "loss": 0.3344, "step": 36300 }, { "epoch": 613.08, "grad_norm": 0.7488123178482056, "learning_rate": 6.371985992996498e-05, "loss": 0.335, "step": 36325 }, { "epoch": 613.5, 
"grad_norm": 0.6711541414260864, "learning_rate": 6.369484742371185e-05, "loss": 0.3372, "step": 36350 }, { "epoch": 613.92, "grad_norm": 0.8390054702758789, "learning_rate": 6.366983491745874e-05, "loss": 0.3361, "step": 36375 }, { "epoch": 614.35, "grad_norm": 0.6326188445091248, "learning_rate": 6.364482241120561e-05, "loss": 0.3379, "step": 36400 }, { "epoch": 614.77, "grad_norm": 0.8833638429641724, "learning_rate": 6.361980990495247e-05, "loss": 0.3384, "step": 36425 }, { "epoch": 615.19, "grad_norm": 1.15792977809906, "learning_rate": 6.359479739869936e-05, "loss": 0.3374, "step": 36450 }, { "epoch": 615.61, "grad_norm": 0.6561751961708069, "learning_rate": 6.356978489244623e-05, "loss": 0.3375, "step": 36475 }, { "epoch": 616.03, "grad_norm": 0.5406872034072876, "learning_rate": 6.354477238619309e-05, "loss": 0.3367, "step": 36500 }, { "epoch": 616.46, "grad_norm": 1.0396761894226074, "learning_rate": 6.351975987993998e-05, "loss": 0.3364, "step": 36525 }, { "epoch": 616.88, "grad_norm": 1.0573174953460693, "learning_rate": 6.349474737368685e-05, "loss": 0.3385, "step": 36550 }, { "epoch": 617.3, "grad_norm": 0.8727951049804688, "learning_rate": 6.346973486743372e-05, "loss": 0.335, "step": 36575 }, { "epoch": 617.72, "grad_norm": 0.968561589717865, "learning_rate": 6.34447223611806e-05, "loss": 0.3341, "step": 36600 }, { "epoch": 618.14, "grad_norm": 0.9274197220802307, "learning_rate": 6.341970985492747e-05, "loss": 0.3384, "step": 36625 }, { "epoch": 618.57, "grad_norm": 0.6899442672729492, "learning_rate": 6.339469734867434e-05, "loss": 0.3346, "step": 36650 }, { "epoch": 618.99, "grad_norm": 1.122818112373352, "learning_rate": 6.336968484242121e-05, "loss": 0.3362, "step": 36675 }, { "epoch": 619.41, "grad_norm": 0.673625111579895, "learning_rate": 6.334467233616808e-05, "loss": 0.3368, "step": 36700 }, { "epoch": 619.83, "grad_norm": 0.6190720200538635, "learning_rate": 6.331965982991496e-05, "loss": 0.3378, "step": 36725 }, { "epoch": 620.25, "grad_norm": 1.5018521547317505, "learning_rate": 6.329464732366184e-05, "loss": 0.3343, "step": 36750 }, { "epoch": 620.68, "grad_norm": 0.7553064823150635, "learning_rate": 6.32696348174087e-05, "loss": 0.3373, "step": 36775 }, { "epoch": 621.1, "grad_norm": 0.586712658405304, "learning_rate": 6.324462231115558e-05, "loss": 0.336, "step": 36800 }, { "epoch": 621.52, "grad_norm": 1.2413265705108643, "learning_rate": 6.321960980490246e-05, "loss": 0.3411, "step": 36825 }, { "epoch": 621.94, "grad_norm": 0.8846489787101746, "learning_rate": 6.319459729864932e-05, "loss": 0.3372, "step": 36850 }, { "epoch": 622.36, "grad_norm": 0.6217809319496155, "learning_rate": 6.31695847923962e-05, "loss": 0.3332, "step": 36875 }, { "epoch": 622.78, "grad_norm": 0.7208801507949829, "learning_rate": 6.314457228614308e-05, "loss": 0.3367, "step": 36900 }, { "epoch": 623.21, "grad_norm": 1.8619037866592407, "learning_rate": 6.311955977988995e-05, "loss": 0.3409, "step": 36925 }, { "epoch": 623.63, "grad_norm": 1.442118525505066, "learning_rate": 6.309454727363681e-05, "loss": 0.3364, "step": 36950 }, { "epoch": 624.05, "grad_norm": 0.9569216966629028, "learning_rate": 6.30695347673837e-05, "loss": 0.3363, "step": 36975 }, { "epoch": 624.47, "grad_norm": 0.662265956401825, "learning_rate": 6.304452226113057e-05, "loss": 0.337, "step": 37000 }, { "epoch": 624.89, "grad_norm": 0.8474342823028564, "learning_rate": 6.301950975487744e-05, "loss": 0.3338, "step": 37025 }, { "epoch": 625.32, "grad_norm": 0.9669449925422668, "learning_rate": 
6.299449724862432e-05, "loss": 0.3345, "step": 37050 }, { "epoch": 625.74, "grad_norm": 0.9579367637634277, "learning_rate": 6.296948474237119e-05, "loss": 0.3347, "step": 37075 }, { "epoch": 626.16, "grad_norm": 0.6778914928436279, "learning_rate": 6.294447223611806e-05, "loss": 0.3377, "step": 37100 }, { "epoch": 626.58, "grad_norm": 1.2884172201156616, "learning_rate": 6.291945972986493e-05, "loss": 0.3376, "step": 37125 }, { "epoch": 627.0, "grad_norm": 1.0747885704040527, "learning_rate": 6.28944472236118e-05, "loss": 0.3369, "step": 37150 }, { "epoch": 627.43, "grad_norm": 0.6872113943099976, "learning_rate": 6.286943471735868e-05, "loss": 0.3328, "step": 37175 }, { "epoch": 627.85, "grad_norm": 1.1908756494522095, "learning_rate": 6.284442221110555e-05, "loss": 0.3349, "step": 37200 }, { "epoch": 628.27, "grad_norm": 1.6441115140914917, "learning_rate": 6.281940970485242e-05, "loss": 0.336, "step": 37225 }, { "epoch": 628.69, "grad_norm": 0.659174919128418, "learning_rate": 6.27943971985993e-05, "loss": 0.3383, "step": 37250 }, { "epoch": 629.11, "grad_norm": 0.690222978591919, "learning_rate": 6.276938469234618e-05, "loss": 0.3339, "step": 37275 }, { "epoch": 629.54, "grad_norm": 0.7107376456260681, "learning_rate": 6.274437218609304e-05, "loss": 0.3403, "step": 37300 }, { "epoch": 629.96, "grad_norm": 0.8036572933197021, "learning_rate": 6.271935967983991e-05, "loss": 0.3347, "step": 37325 }, { "epoch": 630.38, "grad_norm": 1.1143404245376587, "learning_rate": 6.26943471735868e-05, "loss": 0.3373, "step": 37350 }, { "epoch": 630.8, "grad_norm": 0.6640003323554993, "learning_rate": 6.266933466733367e-05, "loss": 0.3347, "step": 37375 }, { "epoch": 631.22, "grad_norm": 0.6351874470710754, "learning_rate": 6.264432216108055e-05, "loss": 0.338, "step": 37400 }, { "epoch": 631.65, "grad_norm": 0.7418273687362671, "learning_rate": 6.261930965482742e-05, "loss": 0.3361, "step": 37425 }, { "epoch": 632.07, "grad_norm": 1.5347431898117065, "learning_rate": 6.259429714857429e-05, "loss": 0.34, "step": 37450 }, { "epoch": 632.49, "grad_norm": 0.6821726560592651, "learning_rate": 6.257028514257129e-05, "loss": 0.3351, "step": 37475 }, { "epoch": 632.91, "grad_norm": 0.9423748850822449, "learning_rate": 6.254527263631816e-05, "loss": 0.3368, "step": 37500 }, { "epoch": 633.33, "grad_norm": 0.847724437713623, "learning_rate": 6.252026013006504e-05, "loss": 0.3356, "step": 37525 }, { "epoch": 633.76, "grad_norm": 1.1588711738586426, "learning_rate": 6.249524762381191e-05, "loss": 0.3361, "step": 37550 }, { "epoch": 634.18, "grad_norm": 1.0508124828338623, "learning_rate": 6.247023511755878e-05, "loss": 0.3332, "step": 37575 }, { "epoch": 634.6, "grad_norm": 1.5878311395645142, "learning_rate": 6.244522261130565e-05, "loss": 0.3366, "step": 37600 }, { "epoch": 635.02, "grad_norm": 1.2667206525802612, "learning_rate": 6.242021010505254e-05, "loss": 0.335, "step": 37625 }, { "epoch": 635.44, "grad_norm": 0.6477533578872681, "learning_rate": 6.23951975987994e-05, "loss": 0.3373, "step": 37650 }, { "epoch": 635.86, "grad_norm": 0.6078073978424072, "learning_rate": 6.237018509254627e-05, "loss": 0.3387, "step": 37675 }, { "epoch": 636.29, "grad_norm": 0.9732393622398376, "learning_rate": 6.234517258629316e-05, "loss": 0.3327, "step": 37700 }, { "epoch": 636.71, "grad_norm": 1.1307120323181152, "learning_rate": 6.232016008004003e-05, "loss": 0.3319, "step": 37725 }, { "epoch": 637.13, "grad_norm": 0.6256436109542847, "learning_rate": 6.229514757378689e-05, "loss": 0.3346, "step": 37750 }, { "epoch": 
637.55, "grad_norm": 0.5569838285446167, "learning_rate": 6.227013506753378e-05, "loss": 0.3352, "step": 37775 }, { "epoch": 637.97, "grad_norm": 1.2559618949890137, "learning_rate": 6.224512256128065e-05, "loss": 0.336, "step": 37800 }, { "epoch": 638.4, "grad_norm": 0.6684890389442444, "learning_rate": 6.222011005502752e-05, "loss": 0.3361, "step": 37825 }, { "epoch": 638.82, "grad_norm": 0.9154089093208313, "learning_rate": 6.21950975487744e-05, "loss": 0.3344, "step": 37850 }, { "epoch": 639.24, "grad_norm": 0.8222879767417908, "learning_rate": 6.217008504252127e-05, "loss": 0.338, "step": 37875 }, { "epoch": 639.66, "grad_norm": 0.7803704738616943, "learning_rate": 6.214507253626814e-05, "loss": 0.3369, "step": 37900 }, { "epoch": 640.08, "grad_norm": 1.3542786836624146, "learning_rate": 6.212006003001501e-05, "loss": 0.3365, "step": 37925 }, { "epoch": 640.51, "grad_norm": 0.8476645946502686, "learning_rate": 6.209504752376188e-05, "loss": 0.3343, "step": 37950 }, { "epoch": 640.93, "grad_norm": 0.6095691919326782, "learning_rate": 6.207003501750876e-05, "loss": 0.3347, "step": 37975 }, { "epoch": 641.35, "grad_norm": 1.0936555862426758, "learning_rate": 6.204502251125563e-05, "loss": 0.3354, "step": 38000 }, { "epoch": 641.77, "grad_norm": 1.152754306793213, "learning_rate": 6.20200100050025e-05, "loss": 0.3362, "step": 38025 }, { "epoch": 642.19, "grad_norm": 0.5825944542884827, "learning_rate": 6.199499749874937e-05, "loss": 0.3405, "step": 38050 }, { "epoch": 642.62, "grad_norm": 0.567848801612854, "learning_rate": 6.196998499249626e-05, "loss": 0.3353, "step": 38075 }, { "epoch": 643.04, "grad_norm": 0.7997140884399414, "learning_rate": 6.194497248624312e-05, "loss": 0.3359, "step": 38100 }, { "epoch": 643.46, "grad_norm": 1.017397165298462, "learning_rate": 6.191995997998999e-05, "loss": 0.3374, "step": 38125 }, { "epoch": 643.88, "grad_norm": 0.6991272568702698, "learning_rate": 6.189494747373688e-05, "loss": 0.3377, "step": 38150 }, { "epoch": 644.3, "grad_norm": 0.6539539098739624, "learning_rate": 6.186993496748375e-05, "loss": 0.3376, "step": 38175 }, { "epoch": 644.73, "grad_norm": 0.7927981019020081, "learning_rate": 6.184492246123061e-05, "loss": 0.334, "step": 38200 }, { "epoch": 645.15, "grad_norm": 0.6995012760162354, "learning_rate": 6.18199099549775e-05, "loss": 0.3337, "step": 38225 }, { "epoch": 645.57, "grad_norm": 0.6050649285316467, "learning_rate": 6.179489744872437e-05, "loss": 0.3363, "step": 38250 }, { "epoch": 645.99, "grad_norm": 0.9427962303161621, "learning_rate": 6.176988494247123e-05, "loss": 0.3378, "step": 38275 }, { "epoch": 646.41, "grad_norm": 0.908703625202179, "learning_rate": 6.174487243621811e-05, "loss": 0.3338, "step": 38300 }, { "epoch": 646.84, "grad_norm": 0.7177747488021851, "learning_rate": 6.171985992996499e-05, "loss": 0.3372, "step": 38325 }, { "epoch": 647.26, "grad_norm": 1.283485770225525, "learning_rate": 6.169484742371186e-05, "loss": 0.3392, "step": 38350 }, { "epoch": 647.68, "grad_norm": 0.8213033080101013, "learning_rate": 6.166983491745873e-05, "loss": 0.3366, "step": 38375 }, { "epoch": 648.1, "grad_norm": 0.6762648224830627, "learning_rate": 6.16448224112056e-05, "loss": 0.3395, "step": 38400 }, { "epoch": 648.52, "grad_norm": 0.8918651342391968, "learning_rate": 6.161980990495248e-05, "loss": 0.3343, "step": 38425 }, { "epoch": 648.95, "grad_norm": 0.5843579173088074, "learning_rate": 6.159479739869935e-05, "loss": 0.3358, "step": 38450 }, { "epoch": 649.37, "grad_norm": 0.652820885181427, "learning_rate": 
6.156978489244622e-05, "loss": 0.339, "step": 38475 }, { "epoch": 649.79, "grad_norm": 0.8413587808609009, "learning_rate": 6.15447723861931e-05, "loss": 0.3382, "step": 38500 }, { "epoch": 650.21, "grad_norm": 0.6983152031898499, "learning_rate": 6.151975987993998e-05, "loss": 0.3348, "step": 38525 }, { "epoch": 650.63, "grad_norm": 1.1672810316085815, "learning_rate": 6.149474737368684e-05, "loss": 0.336, "step": 38550 }, { "epoch": 651.05, "grad_norm": 0.6260696053504944, "learning_rate": 6.146973486743371e-05, "loss": 0.3342, "step": 38575 }, { "epoch": 651.48, "grad_norm": 0.9090115427970886, "learning_rate": 6.14447223611806e-05, "loss": 0.3382, "step": 38600 }, { "epoch": 651.9, "grad_norm": 0.7545874714851379, "learning_rate": 6.141970985492746e-05, "loss": 0.3361, "step": 38625 }, { "epoch": 652.32, "grad_norm": 0.6558825969696045, "learning_rate": 6.139469734867433e-05, "loss": 0.3325, "step": 38650 }, { "epoch": 652.74, "grad_norm": 0.7328972816467285, "learning_rate": 6.136968484242122e-05, "loss": 0.3344, "step": 38675 }, { "epoch": 653.16, "grad_norm": 1.04045832157135, "learning_rate": 6.134467233616809e-05, "loss": 0.3375, "step": 38700 }, { "epoch": 653.59, "grad_norm": 0.5717912912368774, "learning_rate": 6.131965982991495e-05, "loss": 0.3366, "step": 38725 }, { "epoch": 654.01, "grad_norm": 0.6503519415855408, "learning_rate": 6.129464732366184e-05, "loss": 0.3333, "step": 38750 }, { "epoch": 654.43, "grad_norm": 0.9414656758308411, "learning_rate": 6.126963481740871e-05, "loss": 0.3337, "step": 38775 }, { "epoch": 654.85, "grad_norm": 0.5990950465202332, "learning_rate": 6.124462231115558e-05, "loss": 0.3326, "step": 38800 }, { "epoch": 655.27, "grad_norm": 0.8356573581695557, "learning_rate": 6.121960980490245e-05, "loss": 0.3339, "step": 38825 }, { "epoch": 655.7, "grad_norm": 1.6359909772872925, "learning_rate": 6.119459729864933e-05, "loss": 0.3362, "step": 38850 }, { "epoch": 656.12, "grad_norm": 1.0720454454421997, "learning_rate": 6.116958479239621e-05, "loss": 0.3343, "step": 38875 }, { "epoch": 656.54, "grad_norm": 1.089706301689148, "learning_rate": 6.114457228614307e-05, "loss": 0.3357, "step": 38900 }, { "epoch": 656.96, "grad_norm": 0.5836237072944641, "learning_rate": 6.111955977988994e-05, "loss": 0.3333, "step": 38925 }, { "epoch": 657.38, "grad_norm": 0.7561572194099426, "learning_rate": 6.109554777388696e-05, "loss": 0.3359, "step": 38950 }, { "epoch": 657.81, "grad_norm": 0.570633053779602, "learning_rate": 6.107053526763382e-05, "loss": 0.3384, "step": 38975 }, { "epoch": 658.23, "grad_norm": 0.7189050316810608, "learning_rate": 6.104552276138069e-05, "loss": 0.334, "step": 39000 }, { "epoch": 658.65, "grad_norm": 0.6825759410858154, "learning_rate": 6.1020510255127574e-05, "loss": 0.3347, "step": 39025 }, { "epoch": 659.07, "grad_norm": 1.1351057291030884, "learning_rate": 6.099549774887444e-05, "loss": 0.3371, "step": 39050 }, { "epoch": 659.49, "grad_norm": 1.4017590284347534, "learning_rate": 6.097048524262131e-05, "loss": 0.3323, "step": 39075 }, { "epoch": 659.92, "grad_norm": 0.7010709047317505, "learning_rate": 6.094547273636819e-05, "loss": 0.3332, "step": 39100 }, { "epoch": 660.34, "grad_norm": 1.158513069152832, "learning_rate": 6.0920460230115065e-05, "loss": 0.3361, "step": 39125 }, { "epoch": 660.76, "grad_norm": 0.5579866766929626, "learning_rate": 6.089544772386193e-05, "loss": 0.3327, "step": 39150 }, { "epoch": 661.18, "grad_norm": 0.7707204818725586, "learning_rate": 6.087043521760881e-05, "loss": 0.336, "step": 39175 }, { 
"epoch": 661.6, "grad_norm": 0.8646402359008789, "learning_rate": 6.084542271135568e-05, "loss": 0.3332, "step": 39200 }, { "epoch": 662.03, "grad_norm": 0.6727370023727417, "learning_rate": 6.082041020510255e-05, "loss": 0.3354, "step": 39225 }, { "epoch": 662.45, "grad_norm": 0.6902338266372681, "learning_rate": 6.0795397698849435e-05, "loss": 0.3337, "step": 39250 }, { "epoch": 662.87, "grad_norm": 0.8199294209480286, "learning_rate": 6.07703851925963e-05, "loss": 0.3368, "step": 39275 }, { "epoch": 663.29, "grad_norm": 0.9135434627532959, "learning_rate": 6.0745372686343173e-05, "loss": 0.3351, "step": 39300 }, { "epoch": 663.71, "grad_norm": 1.3065385818481445, "learning_rate": 6.072036018009005e-05, "loss": 0.3344, "step": 39325 }, { "epoch": 664.14, "grad_norm": 0.9308336973190308, "learning_rate": 6.069534767383692e-05, "loss": 0.3346, "step": 39350 }, { "epoch": 664.56, "grad_norm": 1.3821338415145874, "learning_rate": 6.067033516758379e-05, "loss": 0.3367, "step": 39375 }, { "epoch": 664.98, "grad_norm": 3.1479415893554688, "learning_rate": 6.064532266133067e-05, "loss": 0.3369, "step": 39400 }, { "epoch": 665.4, "grad_norm": 0.6173805594444275, "learning_rate": 6.0620310155077543e-05, "loss": 0.3351, "step": 39425 }, { "epoch": 665.82, "grad_norm": 0.6346443891525269, "learning_rate": 6.059529764882441e-05, "loss": 0.3341, "step": 39450 }, { "epoch": 666.24, "grad_norm": 0.733936071395874, "learning_rate": 6.0570285142571296e-05, "loss": 0.3354, "step": 39475 }, { "epoch": 666.67, "grad_norm": 0.7828245162963867, "learning_rate": 6.054527263631816e-05, "loss": 0.3414, "step": 39500 }, { "epoch": 667.09, "grad_norm": 1.023945689201355, "learning_rate": 6.0520260130065034e-05, "loss": 0.3349, "step": 39525 }, { "epoch": 667.51, "grad_norm": 1.7583357095718384, "learning_rate": 6.0495247623811914e-05, "loss": 0.3346, "step": 39550 }, { "epoch": 667.93, "grad_norm": 1.4628708362579346, "learning_rate": 6.047023511755878e-05, "loss": 0.3333, "step": 39575 }, { "epoch": 668.35, "grad_norm": 0.9842578172683716, "learning_rate": 6.044522261130565e-05, "loss": 0.3365, "step": 39600 }, { "epoch": 668.78, "grad_norm": 0.8163949251174927, "learning_rate": 6.042021010505253e-05, "loss": 0.3355, "step": 39625 }, { "epoch": 669.2, "grad_norm": 0.8072527647018433, "learning_rate": 6.0395197598799404e-05, "loss": 0.3328, "step": 39650 }, { "epoch": 669.62, "grad_norm": 0.7240103483200073, "learning_rate": 6.037018509254627e-05, "loss": 0.3353, "step": 39675 }, { "epoch": 670.04, "grad_norm": 0.9170970320701599, "learning_rate": 6.034517258629315e-05, "loss": 0.3355, "step": 39700 }, { "epoch": 670.46, "grad_norm": 1.5089324712753296, "learning_rate": 6.032016008004002e-05, "loss": 0.3349, "step": 39725 }, { "epoch": 670.89, "grad_norm": 0.688605785369873, "learning_rate": 6.0295147573786895e-05, "loss": 0.3356, "step": 39750 }, { "epoch": 671.31, "grad_norm": 0.6999372243881226, "learning_rate": 6.0270135067533774e-05, "loss": 0.3318, "step": 39775 }, { "epoch": 671.73, "grad_norm": 0.6212260127067566, "learning_rate": 6.024512256128064e-05, "loss": 0.3329, "step": 39800 }, { "epoch": 672.15, "grad_norm": 0.7627471089363098, "learning_rate": 6.022011005502751e-05, "loss": 0.3333, "step": 39825 }, { "epoch": 672.57, "grad_norm": 0.5980801582336426, "learning_rate": 6.019509754877439e-05, "loss": 0.3345, "step": 39850 }, { "epoch": 673.0, "grad_norm": 0.625714898109436, "learning_rate": 6.0170085042521265e-05, "loss": 0.333, "step": 39875 }, { "epoch": 673.42, "grad_norm": 1.350926160812378, 
"learning_rate": 6.014507253626813e-05, "loss": 0.3347, "step": 39900 }, { "epoch": 673.84, "grad_norm": 0.9399625062942505, "learning_rate": 6.012006003001501e-05, "loss": 0.3362, "step": 39925 }, { "epoch": 674.26, "grad_norm": 0.863330066204071, "learning_rate": 6.009504752376188e-05, "loss": 0.3342, "step": 39950 }, { "epoch": 674.68, "grad_norm": 0.806898295879364, "learning_rate": 6.0070035017508755e-05, "loss": 0.3337, "step": 39975 }, { "epoch": 675.11, "grad_norm": 0.8541303277015686, "learning_rate": 6.0045022511255635e-05, "loss": 0.3339, "step": 40000 }, { "epoch": 675.11, "eval_loss": 0.463015079498291, "eval_runtime": 3.8348, "eval_samples_per_second": 80.577, "eval_steps_per_second": 2.608, "step": 40000 }, { "epoch": 675.53, "grad_norm": 0.5531277656555176, "learning_rate": 6.00200100050025e-05, "loss": 0.335, "step": 40025 }, { "epoch": 675.95, "grad_norm": 0.7718360424041748, "learning_rate": 5.999499749874937e-05, "loss": 0.3346, "step": 40050 }, { "epoch": 676.37, "grad_norm": 0.914580225944519, "learning_rate": 5.996998499249625e-05, "loss": 0.3347, "step": 40075 }, { "epoch": 676.79, "grad_norm": 0.8583731055259705, "learning_rate": 5.9944972486243125e-05, "loss": 0.3358, "step": 40100 }, { "epoch": 677.22, "grad_norm": 0.6906170845031738, "learning_rate": 5.991995997998999e-05, "loss": 0.3331, "step": 40125 }, { "epoch": 677.64, "grad_norm": 0.7248491048812866, "learning_rate": 5.989494747373687e-05, "loss": 0.3344, "step": 40150 }, { "epoch": 678.06, "grad_norm": 0.7911438941955566, "learning_rate": 5.986993496748374e-05, "loss": 0.3343, "step": 40175 }, { "epoch": 678.48, "grad_norm": 0.9510887265205383, "learning_rate": 5.984492246123062e-05, "loss": 0.3339, "step": 40200 }, { "epoch": 678.9, "grad_norm": 0.7660213708877563, "learning_rate": 5.9819909954977495e-05, "loss": 0.3363, "step": 40225 }, { "epoch": 679.32, "grad_norm": 1.2608420848846436, "learning_rate": 5.979489744872436e-05, "loss": 0.3364, "step": 40250 }, { "epoch": 679.75, "grad_norm": 0.5941702723503113, "learning_rate": 5.976988494247124e-05, "loss": 0.3337, "step": 40275 }, { "epoch": 680.17, "grad_norm": 0.6685301065444946, "learning_rate": 5.9744872436218113e-05, "loss": 0.3312, "step": 40300 }, { "epoch": 680.59, "grad_norm": 0.8607536554336548, "learning_rate": 5.9719859929964986e-05, "loss": 0.3323, "step": 40325 }, { "epoch": 681.01, "grad_norm": 0.7609328627586365, "learning_rate": 5.9694847423711865e-05, "loss": 0.3343, "step": 40350 }, { "epoch": 681.43, "grad_norm": 0.6644548177719116, "learning_rate": 5.966983491745873e-05, "loss": 0.3331, "step": 40375 }, { "epoch": 681.86, "grad_norm": 1.296035885810852, "learning_rate": 5.9644822411205604e-05, "loss": 0.3307, "step": 40400 }, { "epoch": 682.28, "grad_norm": 0.6522417068481445, "learning_rate": 5.9619809904952483e-05, "loss": 0.3353, "step": 40425 }, { "epoch": 682.7, "grad_norm": 0.5869843363761902, "learning_rate": 5.9594797398699356e-05, "loss": 0.3333, "step": 40450 }, { "epoch": 683.12, "grad_norm": 0.8301829099655151, "learning_rate": 5.956978489244622e-05, "loss": 0.334, "step": 40475 }, { "epoch": 683.54, "grad_norm": 0.6969075798988342, "learning_rate": 5.95447723861931e-05, "loss": 0.3305, "step": 40500 }, { "epoch": 683.97, "grad_norm": 1.1046277284622192, "learning_rate": 5.9519759879939974e-05, "loss": 0.335, "step": 40525 }, { "epoch": 684.39, "grad_norm": 0.7820445895195007, "learning_rate": 5.949474737368684e-05, "loss": 0.3329, "step": 40550 }, { "epoch": 684.81, "grad_norm": 0.6696531176567078, "learning_rate": 
5.9469734867433726e-05, "loss": 0.3322, "step": 40575 }, { "epoch": 685.23, "grad_norm": 0.74398273229599, "learning_rate": 5.944472236118059e-05, "loss": 0.337, "step": 40600 }, { "epoch": 685.65, "grad_norm": 0.8388913869857788, "learning_rate": 5.9419709854927465e-05, "loss": 0.3373, "step": 40625 }, { "epoch": 686.08, "grad_norm": 1.0617294311523438, "learning_rate": 5.9394697348674344e-05, "loss": 0.3344, "step": 40650 }, { "epoch": 686.5, "grad_norm": 0.9923056960105896, "learning_rate": 5.936968484242122e-05, "loss": 0.3353, "step": 40675 }, { "epoch": 686.92, "grad_norm": 0.6869209408760071, "learning_rate": 5.934467233616808e-05, "loss": 0.3346, "step": 40700 }, { "epoch": 687.34, "grad_norm": 0.5821365714073181, "learning_rate": 5.931965982991496e-05, "loss": 0.3331, "step": 40725 }, { "epoch": 687.76, "grad_norm": 0.7897204160690308, "learning_rate": 5.9294647323661835e-05, "loss": 0.3297, "step": 40750 }, { "epoch": 688.19, "grad_norm": 0.7303852438926697, "learning_rate": 5.92696348174087e-05, "loss": 0.3335, "step": 40775 }, { "epoch": 688.61, "grad_norm": 0.6346200108528137, "learning_rate": 5.924462231115559e-05, "loss": 0.3325, "step": 40800 }, { "epoch": 689.03, "grad_norm": 0.7150778770446777, "learning_rate": 5.921960980490245e-05, "loss": 0.3306, "step": 40825 }, { "epoch": 689.45, "grad_norm": 0.9798932671546936, "learning_rate": 5.9194597298649325e-05, "loss": 0.3337, "step": 40850 }, { "epoch": 689.87, "grad_norm": 1.0228947401046753, "learning_rate": 5.9169584792396205e-05, "loss": 0.3339, "step": 40875 }, { "epoch": 690.3, "grad_norm": 0.6772608160972595, "learning_rate": 5.914457228614307e-05, "loss": 0.3314, "step": 40900 }, { "epoch": 690.72, "grad_norm": 0.7962575554847717, "learning_rate": 5.911955977988994e-05, "loss": 0.3351, "step": 40925 }, { "epoch": 691.14, "grad_norm": 0.6201666593551636, "learning_rate": 5.909454727363682e-05, "loss": 0.3327, "step": 40950 }, { "epoch": 691.56, "grad_norm": 1.9425045251846313, "learning_rate": 5.9069534767383695e-05, "loss": 0.3366, "step": 40975 }, { "epoch": 691.98, "grad_norm": 0.791454553604126, "learning_rate": 5.904452226113056e-05, "loss": 0.336, "step": 41000 }, { "epoch": 692.41, "grad_norm": 0.6584410071372986, "learning_rate": 5.901950975487745e-05, "loss": 0.3298, "step": 41025 }, { "epoch": 692.83, "grad_norm": 0.9300652742385864, "learning_rate": 5.899449724862431e-05, "loss": 0.3357, "step": 41050 }, { "epoch": 693.25, "grad_norm": 0.9851629734039307, "learning_rate": 5.8969484742371186e-05, "loss": 0.334, "step": 41075 }, { "epoch": 693.67, "grad_norm": 1.210953712463379, "learning_rate": 5.8944472236118065e-05, "loss": 0.333, "step": 41100 }, { "epoch": 694.09, "grad_norm": 0.7146925330162048, "learning_rate": 5.891945972986493e-05, "loss": 0.3342, "step": 41125 }, { "epoch": 694.51, "grad_norm": 0.5388910174369812, "learning_rate": 5.8894447223611804e-05, "loss": 0.3311, "step": 41150 }, { "epoch": 694.94, "grad_norm": 0.8334054946899414, "learning_rate": 5.886943471735868e-05, "loss": 0.3338, "step": 41175 }, { "epoch": 695.36, "grad_norm": 0.86495441198349, "learning_rate": 5.8844422211105556e-05, "loss": 0.332, "step": 41200 }, { "epoch": 695.78, "grad_norm": 0.8317554593086243, "learning_rate": 5.881940970485242e-05, "loss": 0.3325, "step": 41225 }, { "epoch": 696.2, "grad_norm": 0.6701788902282715, "learning_rate": 5.879439719859931e-05, "loss": 0.3328, "step": 41250 }, { "epoch": 696.62, "grad_norm": 0.814325213432312, "learning_rate": 5.8769384692346174e-05, "loss": 0.332, "step": 41275 }, { 
"epoch": 697.05, "grad_norm": 0.5316603779792786, "learning_rate": 5.874437218609305e-05, "loss": 0.3327, "step": 41300 }, { "epoch": 697.47, "grad_norm": 0.6985114216804504, "learning_rate": 5.8719359679839926e-05, "loss": 0.332, "step": 41325 }, { "epoch": 697.89, "grad_norm": 0.6453738808631897, "learning_rate": 5.869434717358679e-05, "loss": 0.3342, "step": 41350 }, { "epoch": 698.31, "grad_norm": 1.1835652589797974, "learning_rate": 5.8669334667333665e-05, "loss": 0.3345, "step": 41375 }, { "epoch": 698.73, "grad_norm": 0.8500351309776306, "learning_rate": 5.8644322161080544e-05, "loss": 0.3366, "step": 41400 }, { "epoch": 699.16, "grad_norm": 1.1051822900772095, "learning_rate": 5.861930965482742e-05, "loss": 0.3344, "step": 41425 }, { "epoch": 699.58, "grad_norm": 1.1580395698547363, "learning_rate": 5.8594297148574296e-05, "loss": 0.3326, "step": 41450 }, { "epoch": 700.0, "grad_norm": 0.5726282596588135, "learning_rate": 5.856928464232116e-05, "loss": 0.3328, "step": 41475 }, { "epoch": 700.42, "grad_norm": 0.7072369456291199, "learning_rate": 5.8544272136068035e-05, "loss": 0.3341, "step": 41500 }, { "epoch": 700.84, "grad_norm": 0.5987697839736938, "learning_rate": 5.8519259629814914e-05, "loss": 0.3331, "step": 41525 }, { "epoch": 701.27, "grad_norm": 0.7492908239364624, "learning_rate": 5.849424712356179e-05, "loss": 0.332, "step": 41550 }, { "epoch": 701.69, "grad_norm": 0.6955615282058716, "learning_rate": 5.846923461730865e-05, "loss": 0.3322, "step": 41575 }, { "epoch": 702.11, "grad_norm": 0.9185752868652344, "learning_rate": 5.844422211105554e-05, "loss": 0.333, "step": 41600 }, { "epoch": 702.53, "grad_norm": 0.8091913461685181, "learning_rate": 5.8419209604802405e-05, "loss": 0.33, "step": 41625 }, { "epoch": 702.95, "grad_norm": 1.1685776710510254, "learning_rate": 5.839419709854928e-05, "loss": 0.3346, "step": 41650 }, { "epoch": 703.38, "grad_norm": 1.6387321949005127, "learning_rate": 5.836918459229616e-05, "loss": 0.3358, "step": 41675 }, { "epoch": 703.8, "grad_norm": 0.8366710543632507, "learning_rate": 5.834417208604302e-05, "loss": 0.3339, "step": 41700 }, { "epoch": 704.22, "grad_norm": 0.5328084826469421, "learning_rate": 5.8319159579789895e-05, "loss": 0.3313, "step": 41725 }, { "epoch": 704.64, "grad_norm": 1.02408766746521, "learning_rate": 5.8294147073536775e-05, "loss": 0.3327, "step": 41750 }, { "epoch": 705.06, "grad_norm": 0.7411752343177795, "learning_rate": 5.826913456728365e-05, "loss": 0.3326, "step": 41775 }, { "epoch": 705.49, "grad_norm": 0.7177361249923706, "learning_rate": 5.824412206103051e-05, "loss": 0.3313, "step": 41800 }, { "epoch": 705.91, "grad_norm": 0.9547587037086487, "learning_rate": 5.821910955477739e-05, "loss": 0.3327, "step": 41825 }, { "epoch": 706.33, "grad_norm": 1.92435884475708, "learning_rate": 5.8194097048524265e-05, "loss": 0.3391, "step": 41850 }, { "epoch": 706.75, "grad_norm": 0.6739429831504822, "learning_rate": 5.816908454227114e-05, "loss": 0.3338, "step": 41875 }, { "epoch": 707.17, "grad_norm": 1.2731108665466309, "learning_rate": 5.814407203601802e-05, "loss": 0.3383, "step": 41900 }, { "epoch": 707.59, "grad_norm": 0.9556517601013184, "learning_rate": 5.811905952976488e-05, "loss": 0.3339, "step": 41925 }, { "epoch": 708.02, "grad_norm": 0.6987408399581909, "learning_rate": 5.8094047023511756e-05, "loss": 0.3315, "step": 41950 }, { "epoch": 708.44, "grad_norm": 0.6027930974960327, "learning_rate": 5.8069034517258635e-05, "loss": 0.3304, "step": 41975 }, { "epoch": 708.86, "grad_norm": 0.797393262386322, 
"learning_rate": 5.804402201100551e-05, "loss": 0.3344, "step": 42000 }, { "epoch": 709.28, "grad_norm": 0.6573100686073303, "learning_rate": 5.8019009504752374e-05, "loss": 0.3352, "step": 42025 }, { "epoch": 709.7, "grad_norm": 0.961092472076416, "learning_rate": 5.799399699849925e-05, "loss": 0.3354, "step": 42050 }, { "epoch": 710.13, "grad_norm": 1.0364923477172852, "learning_rate": 5.7968984492246126e-05, "loss": 0.3283, "step": 42075 }, { "epoch": 710.55, "grad_norm": 1.034196376800537, "learning_rate": 5.7943971985993e-05, "loss": 0.3346, "step": 42100 }, { "epoch": 710.97, "grad_norm": 0.6445721387863159, "learning_rate": 5.791895947973988e-05, "loss": 0.3343, "step": 42125 }, { "epoch": 711.39, "grad_norm": 0.9342503547668457, "learning_rate": 5.7893946973486744e-05, "loss": 0.33, "step": 42150 }, { "epoch": 711.81, "grad_norm": 1.0275731086730957, "learning_rate": 5.7868934467233617e-05, "loss": 0.3283, "step": 42175 }, { "epoch": 712.24, "grad_norm": 0.8965873122215271, "learning_rate": 5.7843921960980496e-05, "loss": 0.3328, "step": 42200 }, { "epoch": 712.66, "grad_norm": 0.6987571716308594, "learning_rate": 5.781890945472737e-05, "loss": 0.3334, "step": 42225 }, { "epoch": 713.08, "grad_norm": 1.0631195306777954, "learning_rate": 5.7793896948474235e-05, "loss": 0.3313, "step": 42250 }, { "epoch": 713.5, "grad_norm": 0.6273359060287476, "learning_rate": 5.7768884442221114e-05, "loss": 0.332, "step": 42275 }, { "epoch": 713.92, "grad_norm": 2.093942403793335, "learning_rate": 5.7743871935967987e-05, "loss": 0.3371, "step": 42300 }, { "epoch": 714.35, "grad_norm": 0.5946274399757385, "learning_rate": 5.771885942971485e-05, "loss": 0.3331, "step": 42325 }, { "epoch": 714.77, "grad_norm": 0.6289260387420654, "learning_rate": 5.769384692346174e-05, "loss": 0.3306, "step": 42350 }, { "epoch": 715.19, "grad_norm": 0.7274248003959656, "learning_rate": 5.7668834417208605e-05, "loss": 0.3339, "step": 42375 }, { "epoch": 715.61, "grad_norm": 0.7944244742393494, "learning_rate": 5.764382191095548e-05, "loss": 0.3303, "step": 42400 }, { "epoch": 716.03, "grad_norm": 0.8237102627754211, "learning_rate": 5.761880940470236e-05, "loss": 0.3305, "step": 42425 }, { "epoch": 716.46, "grad_norm": 0.6971156597137451, "learning_rate": 5.759379689844923e-05, "loss": 0.3329, "step": 42450 }, { "epoch": 716.88, "grad_norm": 0.9237626194953918, "learning_rate": 5.7568784392196095e-05, "loss": 0.3335, "step": 42475 }, { "epoch": 717.3, "grad_norm": 0.8193545341491699, "learning_rate": 5.7543771885942975e-05, "loss": 0.3342, "step": 42500 }, { "epoch": 717.72, "grad_norm": 0.6978764533996582, "learning_rate": 5.751875937968985e-05, "loss": 0.3281, "step": 42525 }, { "epoch": 718.14, "grad_norm": 0.9430092573165894, "learning_rate": 5.749374687343671e-05, "loss": 0.3316, "step": 42550 }, { "epoch": 718.57, "grad_norm": 0.6024510264396667, "learning_rate": 5.74687343671836e-05, "loss": 0.3285, "step": 42575 }, { "epoch": 718.99, "grad_norm": 0.864118218421936, "learning_rate": 5.7443721860930465e-05, "loss": 0.3306, "step": 42600 }, { "epoch": 719.41, "grad_norm": 0.848547637462616, "learning_rate": 5.7418709354677345e-05, "loss": 0.3323, "step": 42625 }, { "epoch": 719.83, "grad_norm": 1.3665473461151123, "learning_rate": 5.739369684842422e-05, "loss": 0.3326, "step": 42650 }, { "epoch": 720.25, "grad_norm": 0.6813738346099854, "learning_rate": 5.736868434217108e-05, "loss": 0.3323, "step": 42675 }, { "epoch": 720.68, "grad_norm": 0.9202477335929871, "learning_rate": 5.734367183591797e-05, "loss": 0.3337, 
"step": 42700 }, { "epoch": 721.1, "grad_norm": 0.6690118312835693, "learning_rate": 5.7318659329664835e-05, "loss": 0.3353, "step": 42725 }, { "epoch": 721.52, "grad_norm": 0.850532054901123, "learning_rate": 5.729364682341171e-05, "loss": 0.328, "step": 42750 }, { "epoch": 721.94, "grad_norm": 0.6994016170501709, "learning_rate": 5.726863431715859e-05, "loss": 0.3306, "step": 42775 }, { "epoch": 722.36, "grad_norm": 1.455108404159546, "learning_rate": 5.724362181090546e-05, "loss": 0.3323, "step": 42800 }, { "epoch": 722.78, "grad_norm": 0.8454045653343201, "learning_rate": 5.7218609304652326e-05, "loss": 0.3329, "step": 42825 }, { "epoch": 723.21, "grad_norm": 0.6842371225357056, "learning_rate": 5.7193596798399205e-05, "loss": 0.3316, "step": 42850 }, { "epoch": 723.63, "grad_norm": 0.7230656147003174, "learning_rate": 5.716858429214608e-05, "loss": 0.3307, "step": 42875 }, { "epoch": 724.05, "grad_norm": 0.5780199766159058, "learning_rate": 5.7143571785892944e-05, "loss": 0.3349, "step": 42900 }, { "epoch": 724.47, "grad_norm": 0.8254960775375366, "learning_rate": 5.711855927963983e-05, "loss": 0.3306, "step": 42925 }, { "epoch": 724.89, "grad_norm": 0.6607139110565186, "learning_rate": 5.7093546773386696e-05, "loss": 0.3335, "step": 42950 }, { "epoch": 725.32, "grad_norm": 1.253084421157837, "learning_rate": 5.706853426713357e-05, "loss": 0.3306, "step": 42975 }, { "epoch": 725.74, "grad_norm": 0.8245763182640076, "learning_rate": 5.704352176088045e-05, "loss": 0.3294, "step": 43000 }, { "epoch": 726.16, "grad_norm": 0.8240205645561218, "learning_rate": 5.701950975487744e-05, "loss": 0.3343, "step": 43025 }, { "epoch": 726.58, "grad_norm": 0.549266517162323, "learning_rate": 5.699449724862431e-05, "loss": 0.3336, "step": 43050 }, { "epoch": 727.0, "grad_norm": 1.1362807750701904, "learning_rate": 5.696948474237119e-05, "loss": 0.3319, "step": 43075 }, { "epoch": 727.43, "grad_norm": 0.7072771787643433, "learning_rate": 5.6944472236118064e-05, "loss": 0.3312, "step": 43100 }, { "epoch": 727.85, "grad_norm": 0.6766901612281799, "learning_rate": 5.691945972986493e-05, "loss": 0.3338, "step": 43125 }, { "epoch": 728.27, "grad_norm": 0.9430785775184631, "learning_rate": 5.6894447223611816e-05, "loss": 0.3321, "step": 43150 }, { "epoch": 728.69, "grad_norm": 0.7124587893486023, "learning_rate": 5.686943471735868e-05, "loss": 0.3315, "step": 43175 }, { "epoch": 729.11, "grad_norm": 1.3141694068908691, "learning_rate": 5.6844422211105555e-05, "loss": 0.3314, "step": 43200 }, { "epoch": 729.54, "grad_norm": 0.7268463969230652, "learning_rate": 5.6819409704852434e-05, "loss": 0.3334, "step": 43225 }, { "epoch": 729.96, "grad_norm": 0.8302236795425415, "learning_rate": 5.67943971985993e-05, "loss": 0.3345, "step": 43250 }, { "epoch": 730.38, "grad_norm": 0.8234114646911621, "learning_rate": 5.676938469234617e-05, "loss": 0.331, "step": 43275 }, { "epoch": 730.8, "grad_norm": 0.9415809512138367, "learning_rate": 5.674437218609305e-05, "loss": 0.3293, "step": 43300 }, { "epoch": 731.22, "grad_norm": 0.6087244153022766, "learning_rate": 5.6719359679839925e-05, "loss": 0.3304, "step": 43325 }, { "epoch": 731.65, "grad_norm": 0.9064160585403442, "learning_rate": 5.669434717358679e-05, "loss": 0.3301, "step": 43350 }, { "epoch": 732.07, "grad_norm": 0.8394178748130798, "learning_rate": 5.666933466733367e-05, "loss": 0.3315, "step": 43375 }, { "epoch": 732.49, "grad_norm": 0.6835237145423889, "learning_rate": 5.664432216108054e-05, "loss": 0.331, "step": 43400 }, { "epoch": 732.91, "grad_norm": 
1.940484642982483, "learning_rate": 5.6619309654827415e-05, "loss": 0.3312, "step": 43425 }, { "epoch": 733.33, "grad_norm": 1.3297643661499023, "learning_rate": 5.6594297148574295e-05, "loss": 0.3286, "step": 43450 }, { "epoch": 733.76, "grad_norm": 0.8291510343551636, "learning_rate": 5.656928464232116e-05, "loss": 0.3312, "step": 43475 }, { "epoch": 734.18, "grad_norm": 0.6278479695320129, "learning_rate": 5.6544272136068033e-05, "loss": 0.3292, "step": 43500 }, { "epoch": 734.6, "grad_norm": 0.8123427629470825, "learning_rate": 5.651925962981491e-05, "loss": 0.3323, "step": 43525 }, { "epoch": 735.02, "grad_norm": 0.6087535619735718, "learning_rate": 5.6494247123561786e-05, "loss": 0.3285, "step": 43550 }, { "epoch": 735.44, "grad_norm": 0.5842786431312561, "learning_rate": 5.646923461730865e-05, "loss": 0.3305, "step": 43575 }, { "epoch": 735.86, "grad_norm": 1.2080456018447876, "learning_rate": 5.644422211105553e-05, "loss": 0.331, "step": 43600 }, { "epoch": 736.29, "grad_norm": 1.8730006217956543, "learning_rate": 5.6419209604802403e-05, "loss": 0.3302, "step": 43625 }, { "epoch": 736.71, "grad_norm": 1.3710256814956665, "learning_rate": 5.6394197098549276e-05, "loss": 0.331, "step": 43650 }, { "epoch": 737.13, "grad_norm": 0.7835788130760193, "learning_rate": 5.6369184592296156e-05, "loss": 0.333, "step": 43675 }, { "epoch": 737.55, "grad_norm": 0.597734808921814, "learning_rate": 5.634417208604302e-05, "loss": 0.3298, "step": 43700 }, { "epoch": 737.97, "grad_norm": 0.9372978806495667, "learning_rate": 5.6319159579789894e-05, "loss": 0.3325, "step": 43725 }, { "epoch": 738.4, "grad_norm": 0.7435552477836609, "learning_rate": 5.6294147073536774e-05, "loss": 0.3318, "step": 43750 }, { "epoch": 738.82, "grad_norm": 0.6531929969787598, "learning_rate": 5.6269134567283646e-05, "loss": 0.3289, "step": 43775 }, { "epoch": 739.24, "grad_norm": 0.6569234728813171, "learning_rate": 5.624412206103051e-05, "loss": 0.3284, "step": 43800 }, { "epoch": 739.66, "grad_norm": 0.7906466722488403, "learning_rate": 5.621910955477739e-05, "loss": 0.3297, "step": 43825 }, { "epoch": 740.08, "grad_norm": 0.6991225481033325, "learning_rate": 5.6194097048524264e-05, "loss": 0.3297, "step": 43850 }, { "epoch": 740.51, "grad_norm": 0.863693356513977, "learning_rate": 5.616908454227113e-05, "loss": 0.3317, "step": 43875 }, { "epoch": 740.93, "grad_norm": 2.4361770153045654, "learning_rate": 5.6144072036018016e-05, "loss": 0.3291, "step": 43900 }, { "epoch": 741.35, "grad_norm": 1.5412043333053589, "learning_rate": 5.611905952976488e-05, "loss": 0.333, "step": 43925 }, { "epoch": 741.77, "grad_norm": 0.7133874297142029, "learning_rate": 5.6094047023511755e-05, "loss": 0.3296, "step": 43950 }, { "epoch": 742.19, "grad_norm": 1.1028324365615845, "learning_rate": 5.6069034517258634e-05, "loss": 0.33, "step": 43975 }, { "epoch": 742.62, "grad_norm": 1.0599080324172974, "learning_rate": 5.604402201100551e-05, "loss": 0.3313, "step": 44000 }, { "epoch": 743.04, "grad_norm": 0.747011125087738, "learning_rate": 5.6019009504752386e-05, "loss": 0.33, "step": 44025 }, { "epoch": 743.46, "grad_norm": 0.5414310097694397, "learning_rate": 5.599399699849925e-05, "loss": 0.3296, "step": 44050 }, { "epoch": 743.88, "grad_norm": 0.7717816829681396, "learning_rate": 5.5968984492246125e-05, "loss": 0.3294, "step": 44075 }, { "epoch": 744.3, "grad_norm": 0.7545495629310608, "learning_rate": 5.5943971985993004e-05, "loss": 0.3304, "step": 44100 }, { "epoch": 744.73, "grad_norm": 0.8181189894676208, "learning_rate": 
5.591895947973988e-05, "loss": 0.3328, "step": 44125 }, { "epoch": 745.15, "grad_norm": 0.8401549458503723, "learning_rate": 5.589394697348674e-05, "loss": 0.3304, "step": 44150 }, { "epoch": 745.57, "grad_norm": 1.3432790040969849, "learning_rate": 5.586893446723362e-05, "loss": 0.3317, "step": 44175 }, { "epoch": 745.99, "grad_norm": 0.7849639058113098, "learning_rate": 5.5843921960980495e-05, "loss": 0.3306, "step": 44200 }, { "epoch": 746.41, "grad_norm": 0.7063305974006653, "learning_rate": 5.581890945472736e-05, "loss": 0.328, "step": 44225 }, { "epoch": 746.84, "grad_norm": 1.5137840509414673, "learning_rate": 5.579389694847425e-05, "loss": 0.3314, "step": 44250 }, { "epoch": 747.26, "grad_norm": 0.8159704804420471, "learning_rate": 5.576888444222111e-05, "loss": 0.3336, "step": 44275 }, { "epoch": 747.68, "grad_norm": 0.9577706456184387, "learning_rate": 5.5743871935967985e-05, "loss": 0.3291, "step": 44300 }, { "epoch": 748.1, "grad_norm": 1.2636228799819946, "learning_rate": 5.5718859429714865e-05, "loss": 0.3305, "step": 44325 }, { "epoch": 748.52, "grad_norm": 0.7239065766334534, "learning_rate": 5.569384692346174e-05, "loss": 0.3286, "step": 44350 }, { "epoch": 748.95, "grad_norm": 0.5101439952850342, "learning_rate": 5.56688344172086e-05, "loss": 0.3296, "step": 44375 }, { "epoch": 749.37, "grad_norm": 0.7440423369407654, "learning_rate": 5.564382191095548e-05, "loss": 0.3295, "step": 44400 }, { "epoch": 749.79, "grad_norm": 1.4046690464019775, "learning_rate": 5.5618809404702355e-05, "loss": 0.3311, "step": 44425 }, { "epoch": 750.21, "grad_norm": 0.8493804335594177, "learning_rate": 5.559379689844922e-05, "loss": 0.3317, "step": 44450 }, { "epoch": 750.63, "grad_norm": 0.8719350695610046, "learning_rate": 5.556878439219611e-05, "loss": 0.329, "step": 44475 }, { "epoch": 751.05, "grad_norm": 0.48865702748298645, "learning_rate": 5.5543771885942973e-05, "loss": 0.3328, "step": 44500 }, { "epoch": 751.48, "grad_norm": 0.6534274220466614, "learning_rate": 5.5518759379689846e-05, "loss": 0.3291, "step": 44525 }, { "epoch": 751.9, "grad_norm": 0.7186347842216492, "learning_rate": 5.5493746873436725e-05, "loss": 0.3318, "step": 44550 }, { "epoch": 752.32, "grad_norm": 0.6397172808647156, "learning_rate": 5.546873436718359e-05, "loss": 0.3288, "step": 44575 }, { "epoch": 752.74, "grad_norm": 0.7008906006813049, "learning_rate": 5.5443721860930464e-05, "loss": 0.333, "step": 44600 }, { "epoch": 753.16, "grad_norm": 0.9828735589981079, "learning_rate": 5.5418709354677343e-05, "loss": 0.3297, "step": 44625 }, { "epoch": 753.59, "grad_norm": 0.6443371176719666, "learning_rate": 5.5393696848424216e-05, "loss": 0.3292, "step": 44650 }, { "epoch": 754.01, "grad_norm": 0.7059643268585205, "learning_rate": 5.536868434217108e-05, "loss": 0.3298, "step": 44675 }, { "epoch": 754.43, "grad_norm": 0.5091596841812134, "learning_rate": 5.534367183591797e-05, "loss": 0.3318, "step": 44700 }, { "epoch": 754.85, "grad_norm": 0.579298734664917, "learning_rate": 5.5318659329664834e-05, "loss": 0.3311, "step": 44725 }, { "epoch": 755.27, "grad_norm": 1.0110981464385986, "learning_rate": 5.529364682341171e-05, "loss": 0.3339, "step": 44750 }, { "epoch": 755.7, "grad_norm": 0.8061883449554443, "learning_rate": 5.5268634317158586e-05, "loss": 0.329, "step": 44775 }, { "epoch": 756.12, "grad_norm": 0.826849639415741, "learning_rate": 5.524362181090545e-05, "loss": 0.33, "step": 44800 }, { "epoch": 756.54, "grad_norm": 0.5831419229507446, "learning_rate": 5.5218609304652325e-05, "loss": 0.3273, "step": 44825 
}, { "epoch": 756.96, "grad_norm": 0.8474140167236328, "learning_rate": 5.5193596798399204e-05, "loss": 0.3287, "step": 44850 }, { "epoch": 757.38, "grad_norm": 0.9393616318702698, "learning_rate": 5.516858429214608e-05, "loss": 0.3333, "step": 44875 }, { "epoch": 757.81, "grad_norm": 0.8512710928916931, "learning_rate": 5.514357178589294e-05, "loss": 0.3285, "step": 44900 }, { "epoch": 758.23, "grad_norm": 0.5563620924949646, "learning_rate": 5.511855927963982e-05, "loss": 0.33, "step": 44925 }, { "epoch": 758.65, "grad_norm": 1.468787670135498, "learning_rate": 5.5093546773386695e-05, "loss": 0.3313, "step": 44950 }, { "epoch": 759.07, "grad_norm": 0.7625210881233215, "learning_rate": 5.506853426713357e-05, "loss": 0.3306, "step": 44975 }, { "epoch": 759.49, "grad_norm": 1.206878662109375, "learning_rate": 5.504352176088045e-05, "loss": 0.3299, "step": 45000 }, { "epoch": 759.92, "grad_norm": 0.47909948229789734, "learning_rate": 5.501850925462731e-05, "loss": 0.3297, "step": 45025 }, { "epoch": 760.34, "grad_norm": 0.5425036549568176, "learning_rate": 5.4993496748374185e-05, "loss": 0.3303, "step": 45050 }, { "epoch": 760.76, "grad_norm": 1.2390602827072144, "learning_rate": 5.4968484242121065e-05, "loss": 0.3301, "step": 45075 }, { "epoch": 761.18, "grad_norm": 0.7026757597923279, "learning_rate": 5.494347173586794e-05, "loss": 0.3281, "step": 45100 }, { "epoch": 761.6, "grad_norm": 0.6976399421691895, "learning_rate": 5.49184592296148e-05, "loss": 0.3336, "step": 45125 }, { "epoch": 762.03, "grad_norm": 0.7280983924865723, "learning_rate": 5.489344672336168e-05, "loss": 0.3337, "step": 45150 }, { "epoch": 762.45, "grad_norm": 0.6944530010223389, "learning_rate": 5.4868434217108555e-05, "loss": 0.3304, "step": 45175 }, { "epoch": 762.87, "grad_norm": 0.5530226230621338, "learning_rate": 5.484342171085543e-05, "loss": 0.3293, "step": 45200 }, { "epoch": 763.29, "grad_norm": 0.9067057967185974, "learning_rate": 5.481840920460231e-05, "loss": 0.3314, "step": 45225 }, { "epoch": 763.71, "grad_norm": 0.9638586044311523, "learning_rate": 5.479339669834917e-05, "loss": 0.3341, "step": 45250 }, { "epoch": 764.14, "grad_norm": 1.2079076766967773, "learning_rate": 5.476838419209605e-05, "loss": 0.3333, "step": 45275 }, { "epoch": 764.56, "grad_norm": 0.46285155415534973, "learning_rate": 5.4743371685842925e-05, "loss": 0.3314, "step": 45300 }, { "epoch": 764.98, "grad_norm": 1.2184970378875732, "learning_rate": 5.47183591795898e-05, "loss": 0.3316, "step": 45325 }, { "epoch": 765.4, "grad_norm": 0.9557583332061768, "learning_rate": 5.469434717358679e-05, "loss": 0.3323, "step": 45350 }, { "epoch": 765.82, "grad_norm": 0.6370512247085571, "learning_rate": 5.466933466733367e-05, "loss": 0.3279, "step": 45375 }, { "epoch": 766.24, "grad_norm": 0.8065276145935059, "learning_rate": 5.464432216108054e-05, "loss": 0.33, "step": 45400 }, { "epoch": 766.67, "grad_norm": 0.8161351680755615, "learning_rate": 5.461930965482741e-05, "loss": 0.3285, "step": 45425 }, { "epoch": 767.09, "grad_norm": 0.5635145902633667, "learning_rate": 5.4594297148574294e-05, "loss": 0.3294, "step": 45450 }, { "epoch": 767.51, "grad_norm": 0.9862470030784607, "learning_rate": 5.456928464232116e-05, "loss": 0.3313, "step": 45475 }, { "epoch": 767.93, "grad_norm": 0.8373903632164001, "learning_rate": 5.454427213606804e-05, "loss": 0.3284, "step": 45500 }, { "epoch": 768.35, "grad_norm": 0.5550042390823364, "learning_rate": 5.451925962981491e-05, "loss": 0.3325, "step": 45525 }, { "epoch": 768.78, "grad_norm": 0.9700167179107666, 
"learning_rate": 5.4494247123561784e-05, "loss": 0.3325, "step": 45550 }, { "epoch": 769.2, "grad_norm": 0.6801298260688782, "learning_rate": 5.4469234617308664e-05, "loss": 0.3285, "step": 45575 }, { "epoch": 769.62, "grad_norm": 0.6233556270599365, "learning_rate": 5.444422211105553e-05, "loss": 0.3303, "step": 45600 }, { "epoch": 770.04, "grad_norm": 1.0683132410049438, "learning_rate": 5.44192096048024e-05, "loss": 0.3298, "step": 45625 }, { "epoch": 770.46, "grad_norm": 0.7685022354125977, "learning_rate": 5.439419709854928e-05, "loss": 0.3293, "step": 45650 }, { "epoch": 770.89, "grad_norm": 0.8598924279212952, "learning_rate": 5.4369184592296154e-05, "loss": 0.3301, "step": 45675 }, { "epoch": 771.31, "grad_norm": 1.0311508178710938, "learning_rate": 5.434417208604302e-05, "loss": 0.3323, "step": 45700 }, { "epoch": 771.73, "grad_norm": 0.976029634475708, "learning_rate": 5.43191595797899e-05, "loss": 0.3298, "step": 45725 }, { "epoch": 772.15, "grad_norm": 0.5889798402786255, "learning_rate": 5.429414707353677e-05, "loss": 0.3353, "step": 45750 }, { "epoch": 772.57, "grad_norm": 0.6744228005409241, "learning_rate": 5.426913456728364e-05, "loss": 0.3278, "step": 45775 }, { "epoch": 773.0, "grad_norm": 0.705880880355835, "learning_rate": 5.4244122061030524e-05, "loss": 0.3285, "step": 45800 }, { "epoch": 773.42, "grad_norm": 0.7362236380577087, "learning_rate": 5.421910955477739e-05, "loss": 0.3304, "step": 45825 }, { "epoch": 773.84, "grad_norm": 0.6983739137649536, "learning_rate": 5.419409704852426e-05, "loss": 0.3296, "step": 45850 }, { "epoch": 774.26, "grad_norm": 1.082069993019104, "learning_rate": 5.416908454227114e-05, "loss": 0.3285, "step": 45875 }, { "epoch": 774.68, "grad_norm": 0.6046013236045837, "learning_rate": 5.4144072036018015e-05, "loss": 0.3299, "step": 45900 }, { "epoch": 775.11, "grad_norm": 0.7222424149513245, "learning_rate": 5.411905952976488e-05, "loss": 0.3298, "step": 45925 }, { "epoch": 775.53, "grad_norm": 0.5157485008239746, "learning_rate": 5.409404702351176e-05, "loss": 0.3315, "step": 45950 }, { "epoch": 775.95, "grad_norm": 0.961480975151062, "learning_rate": 5.406903451725863e-05, "loss": 0.3309, "step": 45975 }, { "epoch": 776.37, "grad_norm": 1.0241334438323975, "learning_rate": 5.40440220110055e-05, "loss": 0.3282, "step": 46000 }, { "epoch": 776.79, "grad_norm": 0.6726638078689575, "learning_rate": 5.4019009504752385e-05, "loss": 0.3302, "step": 46025 }, { "epoch": 777.22, "grad_norm": 3.449467897415161, "learning_rate": 5.399399699849925e-05, "loss": 0.3312, "step": 46050 }, { "epoch": 777.64, "grad_norm": 0.9635124206542969, "learning_rate": 5.3968984492246124e-05, "loss": 0.33, "step": 46075 }, { "epoch": 778.06, "grad_norm": 0.767174482345581, "learning_rate": 5.3943971985993e-05, "loss": 0.3283, "step": 46100 }, { "epoch": 778.48, "grad_norm": 0.6488787531852722, "learning_rate": 5.391895947973987e-05, "loss": 0.331, "step": 46125 }, { "epoch": 778.9, "grad_norm": 0.5738757848739624, "learning_rate": 5.389394697348674e-05, "loss": 0.3275, "step": 46150 }, { "epoch": 779.32, "grad_norm": 2.2046620845794678, "learning_rate": 5.386893446723362e-05, "loss": 0.3289, "step": 46175 }, { "epoch": 779.75, "grad_norm": 1.0812076330184937, "learning_rate": 5.3843921960980494e-05, "loss": 0.3314, "step": 46200 }, { "epoch": 780.17, "grad_norm": 0.579319417476654, "learning_rate": 5.381890945472736e-05, "loss": 0.3268, "step": 46225 }, { "epoch": 780.59, "grad_norm": 0.7650734782218933, "learning_rate": 5.3793896948474246e-05, "loss": 0.329, "step": 
46250 }, { "epoch": 781.01, "grad_norm": 0.6743921041488647, "learning_rate": 5.376888444222111e-05, "loss": 0.328, "step": 46275 }, { "epoch": 781.43, "grad_norm": 0.619248628616333, "learning_rate": 5.3743871935967984e-05, "loss": 0.3299, "step": 46300 }, { "epoch": 781.86, "grad_norm": 1.4663745164871216, "learning_rate": 5.3718859429714864e-05, "loss": 0.3313, "step": 46325 }, { "epoch": 782.28, "grad_norm": 0.6906927227973938, "learning_rate": 5.369384692346173e-05, "loss": 0.3265, "step": 46350 }, { "epoch": 782.7, "grad_norm": 1.7780324220657349, "learning_rate": 5.36688344172086e-05, "loss": 0.3303, "step": 46375 }, { "epoch": 783.12, "grad_norm": 0.7751718163490295, "learning_rate": 5.364382191095548e-05, "loss": 0.3289, "step": 46400 }, { "epoch": 783.54, "grad_norm": 0.9046327471733093, "learning_rate": 5.3618809404702354e-05, "loss": 0.3276, "step": 46425 }, { "epoch": 783.97, "grad_norm": 1.1644396781921387, "learning_rate": 5.359379689844922e-05, "loss": 0.3297, "step": 46450 }, { "epoch": 784.39, "grad_norm": 0.7988095283508301, "learning_rate": 5.3568784392196106e-05, "loss": 0.3281, "step": 46475 }, { "epoch": 784.81, "grad_norm": 0.666279673576355, "learning_rate": 5.354377188594297e-05, "loss": 0.3292, "step": 46500 }, { "epoch": 785.23, "grad_norm": 0.8225098848342896, "learning_rate": 5.3518759379689845e-05, "loss": 0.3313, "step": 46525 }, { "epoch": 785.65, "grad_norm": 0.7245411276817322, "learning_rate": 5.3493746873436724e-05, "loss": 0.3298, "step": 46550 }, { "epoch": 786.08, "grad_norm": 0.8683385252952576, "learning_rate": 5.346873436718359e-05, "loss": 0.3305, "step": 46575 }, { "epoch": 786.5, "grad_norm": 0.905255138874054, "learning_rate": 5.344372186093046e-05, "loss": 0.3293, "step": 46600 }, { "epoch": 786.92, "grad_norm": 0.8227104544639587, "learning_rate": 5.341870935467734e-05, "loss": 0.3286, "step": 46625 }, { "epoch": 787.34, "grad_norm": 0.9021999835968018, "learning_rate": 5.3393696848424215e-05, "loss": 0.3289, "step": 46650 }, { "epoch": 787.76, "grad_norm": 1.1967681646347046, "learning_rate": 5.3368684342171094e-05, "loss": 0.3277, "step": 46675 }, { "epoch": 788.19, "grad_norm": 0.6865958571434021, "learning_rate": 5.334367183591796e-05, "loss": 0.326, "step": 46700 }, { "epoch": 788.61, "grad_norm": 1.1648318767547607, "learning_rate": 5.331865932966483e-05, "loss": 0.3303, "step": 46725 }, { "epoch": 789.03, "grad_norm": 0.9326013326644897, "learning_rate": 5.329364682341171e-05, "loss": 0.3301, "step": 46750 }, { "epoch": 789.45, "grad_norm": 0.7395642399787903, "learning_rate": 5.3268634317158585e-05, "loss": 0.326, "step": 46775 }, { "epoch": 789.87, "grad_norm": 0.6944596171379089, "learning_rate": 5.324362181090545e-05, "loss": 0.3293, "step": 46800 }, { "epoch": 790.3, "grad_norm": 1.1110926866531372, "learning_rate": 5.321860930465234e-05, "loss": 0.3295, "step": 46825 }, { "epoch": 790.72, "grad_norm": 1.3711137771606445, "learning_rate": 5.31935967983992e-05, "loss": 0.3285, "step": 46850 }, { "epoch": 791.14, "grad_norm": 0.751928448677063, "learning_rate": 5.3168584292146076e-05, "loss": 0.33, "step": 46875 }, { "epoch": 791.56, "grad_norm": 0.8743869066238403, "learning_rate": 5.3143571785892955e-05, "loss": 0.3301, "step": 46900 }, { "epoch": 791.98, "grad_norm": 0.7605223655700684, "learning_rate": 5.311855927963982e-05, "loss": 0.33, "step": 46925 }, { "epoch": 792.41, "grad_norm": 0.6485057473182678, "learning_rate": 5.3093546773386694e-05, "loss": 0.3318, "step": 46950 }, { "epoch": 792.83, "grad_norm": 
1.362770676612854, "learning_rate": 5.306853426713357e-05, "loss": 0.3327, "step": 46975 }, { "epoch": 793.25, "grad_norm": 0.8800601363182068, "learning_rate": 5.3043521760880446e-05, "loss": 0.3271, "step": 47000 }, { "epoch": 793.67, "grad_norm": 0.6425147652626038, "learning_rate": 5.301850925462731e-05, "loss": 0.3317, "step": 47025 }, { "epoch": 794.09, "grad_norm": 1.838719367980957, "learning_rate": 5.299349674837419e-05, "loss": 0.3298, "step": 47050 }, { "epoch": 794.51, "grad_norm": 0.8914709687232971, "learning_rate": 5.2968484242121064e-05, "loss": 0.3288, "step": 47075 }, { "epoch": 794.94, "grad_norm": 1.2650481462478638, "learning_rate": 5.2943471735867936e-05, "loss": 0.3287, "step": 47100 }, { "epoch": 795.36, "grad_norm": 0.6978829503059387, "learning_rate": 5.2918459229614816e-05, "loss": 0.3288, "step": 47125 }, { "epoch": 795.78, "grad_norm": 0.7654374241828918, "learning_rate": 5.289344672336168e-05, "loss": 0.3305, "step": 47150 }, { "epoch": 796.2, "grad_norm": 0.6922447085380554, "learning_rate": 5.2868434217108554e-05, "loss": 0.3305, "step": 47175 }, { "epoch": 796.62, "grad_norm": 0.9719737768173218, "learning_rate": 5.2843421710855434e-05, "loss": 0.3323, "step": 47200 }, { "epoch": 797.05, "grad_norm": 0.8489885330200195, "learning_rate": 5.2818409204602306e-05, "loss": 0.328, "step": 47225 }, { "epoch": 797.47, "grad_norm": 0.6230491399765015, "learning_rate": 5.279339669834917e-05, "loss": 0.3274, "step": 47250 }, { "epoch": 797.89, "grad_norm": 0.8637422919273376, "learning_rate": 5.276838419209605e-05, "loss": 0.3281, "step": 47275 }, { "epoch": 798.31, "grad_norm": 0.7426943182945251, "learning_rate": 5.2743371685842924e-05, "loss": 0.33, "step": 47300 }, { "epoch": 798.73, "grad_norm": 0.5537572503089905, "learning_rate": 5.27183591795898e-05, "loss": 0.3295, "step": 47325 }, { "epoch": 799.16, "grad_norm": 0.6883099675178528, "learning_rate": 5.26943471735868e-05, "loss": 0.326, "step": 47350 }, { "epoch": 799.58, "grad_norm": 0.8291670083999634, "learning_rate": 5.266933466733367e-05, "loss": 0.3286, "step": 47375 }, { "epoch": 800.0, "grad_norm": 1.03136146068573, "learning_rate": 5.264432216108054e-05, "loss": 0.331, "step": 47400 }, { "epoch": 800.42, "grad_norm": 0.7519038319587708, "learning_rate": 5.261930965482742e-05, "loss": 0.3289, "step": 47425 }, { "epoch": 800.84, "grad_norm": 0.8344119191169739, "learning_rate": 5.259429714857429e-05, "loss": 0.3261, "step": 47450 }, { "epoch": 801.27, "grad_norm": 1.7398847341537476, "learning_rate": 5.256928464232116e-05, "loss": 0.3294, "step": 47475 }, { "epoch": 801.69, "grad_norm": 0.5780969262123108, "learning_rate": 5.254427213606804e-05, "loss": 0.3272, "step": 47500 }, { "epoch": 802.11, "grad_norm": 0.8206712007522583, "learning_rate": 5.251925962981491e-05, "loss": 0.3283, "step": 47525 }, { "epoch": 802.53, "grad_norm": 0.6790176630020142, "learning_rate": 5.2494247123561776e-05, "loss": 0.3307, "step": 47550 }, { "epoch": 802.95, "grad_norm": 0.8797547221183777, "learning_rate": 5.246923461730866e-05, "loss": 0.3275, "step": 47575 }, { "epoch": 803.38, "grad_norm": 0.619497537612915, "learning_rate": 5.244422211105553e-05, "loss": 0.3285, "step": 47600 }, { "epoch": 803.8, "grad_norm": 0.5010536909103394, "learning_rate": 5.24192096048024e-05, "loss": 0.3271, "step": 47625 }, { "epoch": 804.22, "grad_norm": 0.5864080786705017, "learning_rate": 5.239419709854928e-05, "loss": 0.3284, "step": 47650 }, { "epoch": 804.64, "grad_norm": 0.6308895945549011, "learning_rate": 5.236918459229615e-05, 
"loss": 0.327, "step": 47675 }, { "epoch": 805.06, "grad_norm": 1.6968401670455933, "learning_rate": 5.234417208604302e-05, "loss": 0.3278, "step": 47700 }, { "epoch": 805.49, "grad_norm": 0.7464416027069092, "learning_rate": 5.23191595797899e-05, "loss": 0.3281, "step": 47725 }, { "epoch": 805.91, "grad_norm": 1.2623881101608276, "learning_rate": 5.229414707353677e-05, "loss": 0.3309, "step": 47750 }, { "epoch": 806.33, "grad_norm": 0.6583206057548523, "learning_rate": 5.226913456728364e-05, "loss": 0.3276, "step": 47775 }, { "epoch": 806.75, "grad_norm": 0.6004701256752014, "learning_rate": 5.224412206103052e-05, "loss": 0.3277, "step": 47800 }, { "epoch": 807.17, "grad_norm": 0.5901530981063843, "learning_rate": 5.221910955477739e-05, "loss": 0.3285, "step": 47825 }, { "epoch": 807.59, "grad_norm": 1.0904884338378906, "learning_rate": 5.219409704852426e-05, "loss": 0.3297, "step": 47850 }, { "epoch": 808.02, "grad_norm": 0.649906575679779, "learning_rate": 5.216908454227114e-05, "loss": 0.328, "step": 47875 }, { "epoch": 808.44, "grad_norm": 0.6729979515075684, "learning_rate": 5.214407203601801e-05, "loss": 0.3267, "step": 47900 }, { "epoch": 808.86, "grad_norm": 0.6820018887519836, "learning_rate": 5.211905952976488e-05, "loss": 0.3275, "step": 47925 }, { "epoch": 809.28, "grad_norm": 0.9879968166351318, "learning_rate": 5.209404702351176e-05, "loss": 0.3279, "step": 47950 }, { "epoch": 809.7, "grad_norm": 1.0850261449813843, "learning_rate": 5.206903451725863e-05, "loss": 0.3362, "step": 47975 }, { "epoch": 810.13, "grad_norm": 2.1593525409698486, "learning_rate": 5.20440220110055e-05, "loss": 0.3288, "step": 48000 }, { "epoch": 810.55, "grad_norm": 0.8372746706008911, "learning_rate": 5.2019009504752384e-05, "loss": 0.3285, "step": 48025 }, { "epoch": 810.97, "grad_norm": 0.5342773795127869, "learning_rate": 5.199399699849925e-05, "loss": 0.3283, "step": 48050 }, { "epoch": 811.39, "grad_norm": 0.7059635519981384, "learning_rate": 5.196898449224613e-05, "loss": 0.3264, "step": 48075 }, { "epoch": 811.81, "grad_norm": 1.2866079807281494, "learning_rate": 5.1943971985993e-05, "loss": 0.3262, "step": 48100 }, { "epoch": 812.24, "grad_norm": 0.6505241990089417, "learning_rate": 5.191895947973987e-05, "loss": 0.3298, "step": 48125 }, { "epoch": 812.66, "grad_norm": 1.0041786432266235, "learning_rate": 5.1893946973486754e-05, "loss": 0.3277, "step": 48150 }, { "epoch": 813.08, "grad_norm": 1.0625736713409424, "learning_rate": 5.186893446723362e-05, "loss": 0.3293, "step": 48175 }, { "epoch": 813.5, "grad_norm": 0.8497466444969177, "learning_rate": 5.184392196098049e-05, "loss": 0.3287, "step": 48200 }, { "epoch": 813.92, "grad_norm": 0.5845162272453308, "learning_rate": 5.181890945472737e-05, "loss": 0.3319, "step": 48225 }, { "epoch": 814.35, "grad_norm": 1.0529389381408691, "learning_rate": 5.179389694847424e-05, "loss": 0.3314, "step": 48250 }, { "epoch": 814.77, "grad_norm": 0.4903738796710968, "learning_rate": 5.176888444222111e-05, "loss": 0.3254, "step": 48275 }, { "epoch": 815.19, "grad_norm": 0.78426593542099, "learning_rate": 5.174387193596799e-05, "loss": 0.3272, "step": 48300 }, { "epoch": 815.61, "grad_norm": 0.7036190629005432, "learning_rate": 5.171885942971486e-05, "loss": 0.3283, "step": 48325 }, { "epoch": 816.03, "grad_norm": 0.7354899644851685, "learning_rate": 5.169384692346173e-05, "loss": 0.3268, "step": 48350 }, { "epoch": 816.46, "grad_norm": 0.7232124209403992, "learning_rate": 5.1668834417208615e-05, "loss": 0.3293, "step": 48375 }, { "epoch": 816.88, 
"grad_norm": 1.4516865015029907, "learning_rate": 5.164382191095548e-05, "loss": 0.3299, "step": 48400 }, { "epoch": 817.3, "grad_norm": 0.6733472347259521, "learning_rate": 5.161880940470235e-05, "loss": 0.3292, "step": 48425 }, { "epoch": 817.72, "grad_norm": 0.5897396802902222, "learning_rate": 5.159379689844923e-05, "loss": 0.3297, "step": 48450 }, { "epoch": 818.14, "grad_norm": 1.400040864944458, "learning_rate": 5.15687843921961e-05, "loss": 0.3337, "step": 48475 }, { "epoch": 818.57, "grad_norm": 0.5100235342979431, "learning_rate": 5.154377188594297e-05, "loss": 0.3333, "step": 48500 }, { "epoch": 818.99, "grad_norm": 0.9582157731056213, "learning_rate": 5.151875937968985e-05, "loss": 0.3263, "step": 48525 }, { "epoch": 819.41, "grad_norm": 0.7835460901260376, "learning_rate": 5.149374687343672e-05, "loss": 0.3305, "step": 48550 }, { "epoch": 819.83, "grad_norm": 0.6779845952987671, "learning_rate": 5.146873436718359e-05, "loss": 0.3257, "step": 48575 }, { "epoch": 820.25, "grad_norm": 0.7173312902450562, "learning_rate": 5.144372186093047e-05, "loss": 0.3244, "step": 48600 }, { "epoch": 820.68, "grad_norm": 0.7807551622390747, "learning_rate": 5.141870935467734e-05, "loss": 0.3309, "step": 48625 }, { "epoch": 821.1, "grad_norm": 1.0499476194381714, "learning_rate": 5.1393696848424214e-05, "loss": 0.3329, "step": 48650 }, { "epoch": 821.52, "grad_norm": 0.8777140974998474, "learning_rate": 5.136868434217109e-05, "loss": 0.3291, "step": 48675 }, { "epoch": 821.94, "grad_norm": 1.3248114585876465, "learning_rate": 5.134367183591796e-05, "loss": 0.3278, "step": 48700 }, { "epoch": 822.36, "grad_norm": 0.7956157326698303, "learning_rate": 5.131865932966483e-05, "loss": 0.3322, "step": 48725 }, { "epoch": 822.78, "grad_norm": 0.8334755897521973, "learning_rate": 5.129364682341171e-05, "loss": 0.3271, "step": 48750 }, { "epoch": 823.21, "grad_norm": 0.674849808216095, "learning_rate": 5.1268634317158584e-05, "loss": 0.3275, "step": 48775 }, { "epoch": 823.63, "grad_norm": 0.806717574596405, "learning_rate": 5.124362181090545e-05, "loss": 0.3284, "step": 48800 }, { "epoch": 824.05, "grad_norm": 0.8122471570968628, "learning_rate": 5.121860930465233e-05, "loss": 0.3272, "step": 48825 }, { "epoch": 824.47, "grad_norm": 1.0856282711029053, "learning_rate": 5.11935967983992e-05, "loss": 0.3307, "step": 48850 }, { "epoch": 824.89, "grad_norm": 0.9283085465431213, "learning_rate": 5.1168584292146074e-05, "loss": 0.3276, "step": 48875 }, { "epoch": 825.32, "grad_norm": 1.1887524127960205, "learning_rate": 5.1143571785892954e-05, "loss": 0.3305, "step": 48900 }, { "epoch": 825.74, "grad_norm": 0.723471999168396, "learning_rate": 5.111855927963982e-05, "loss": 0.3284, "step": 48925 }, { "epoch": 826.16, "grad_norm": 0.74093097448349, "learning_rate": 5.109354677338669e-05, "loss": 0.3271, "step": 48950 }, { "epoch": 826.58, "grad_norm": 0.5709764361381531, "learning_rate": 5.106853426713357e-05, "loss": 0.3284, "step": 48975 }, { "epoch": 827.0, "grad_norm": 1.0489660501480103, "learning_rate": 5.1043521760880444e-05, "loss": 0.3297, "step": 49000 }, { "epoch": 827.43, "grad_norm": 0.8669236302375793, "learning_rate": 5.101850925462731e-05, "loss": 0.3263, "step": 49025 }, { "epoch": 827.85, "grad_norm": 0.5710304379463196, "learning_rate": 5.099349674837419e-05, "loss": 0.3273, "step": 49050 }, { "epoch": 828.27, "grad_norm": 0.7643998265266418, "learning_rate": 5.096848424212106e-05, "loss": 0.3301, "step": 49075 }, { "epoch": 828.69, "grad_norm": 0.7842844724655151, "learning_rate": 
5.094347173586793e-05, "loss": 0.3288, "step": 49100 }, { "epoch": 829.11, "grad_norm": 0.5112960934638977, "learning_rate": 5.0918459229614814e-05, "loss": 0.3253, "step": 49125 }, { "epoch": 829.54, "grad_norm": 0.5777252316474915, "learning_rate": 5.089344672336168e-05, "loss": 0.3304, "step": 49150 }, { "epoch": 829.96, "grad_norm": 0.8494207859039307, "learning_rate": 5.086843421710855e-05, "loss": 0.3278, "step": 49175 }, { "epoch": 830.38, "grad_norm": 0.6506612300872803, "learning_rate": 5.084342171085543e-05, "loss": 0.3261, "step": 49200 }, { "epoch": 830.8, "grad_norm": 0.6846392750740051, "learning_rate": 5.0818409204602305e-05, "loss": 0.3277, "step": 49225 }, { "epoch": 831.22, "grad_norm": 0.9390199780464172, "learning_rate": 5.079339669834917e-05, "loss": 0.3289, "step": 49250 }, { "epoch": 831.65, "grad_norm": 0.9364516735076904, "learning_rate": 5.076838419209605e-05, "loss": 0.3262, "step": 49275 }, { "epoch": 832.07, "grad_norm": 0.5847158432006836, "learning_rate": 5.074337168584292e-05, "loss": 0.326, "step": 49300 }, { "epoch": 832.49, "grad_norm": 0.6708875894546509, "learning_rate": 5.07183591795898e-05, "loss": 0.328, "step": 49325 }, { "epoch": 832.91, "grad_norm": 0.8488514423370361, "learning_rate": 5.0693346673336675e-05, "loss": 0.3278, "step": 49350 }, { "epoch": 833.33, "grad_norm": 0.7885183095932007, "learning_rate": 5.066833416708354e-05, "loss": 0.327, "step": 49375 }, { "epoch": 833.76, "grad_norm": 0.8777099251747131, "learning_rate": 5.064332166083042e-05, "loss": 0.3278, "step": 49400 }, { "epoch": 834.18, "grad_norm": 0.727043628692627, "learning_rate": 5.061830915457729e-05, "loss": 0.3275, "step": 49425 }, { "epoch": 834.6, "grad_norm": NaN, "learning_rate": 5.0594297148574285e-05, "loss": 0.329, "step": 49450 }, { "epoch": 835.02, "grad_norm": 0.5716123580932617, "learning_rate": 5.056928464232116e-05, "loss": 0.3276, "step": 49475 }, { "epoch": 835.44, "grad_norm": 0.5085082054138184, "learning_rate": 5.054427213606804e-05, "loss": 0.3237, "step": 49500 }, { "epoch": 835.86, "grad_norm": 0.6669608354568481, "learning_rate": 5.051925962981491e-05, "loss": 0.3265, "step": 49525 }, { "epoch": 836.29, "grad_norm": 0.7317723631858826, "learning_rate": 5.049424712356179e-05, "loss": 0.3288, "step": 49550 }, { "epoch": 836.71, "grad_norm": 0.6707614660263062, "learning_rate": 5.046923461730866e-05, "loss": 0.3245, "step": 49575 }, { "epoch": 837.13, "grad_norm": 0.5244936347007751, "learning_rate": 5.044422211105553e-05, "loss": 0.3263, "step": 49600 }, { "epoch": 837.55, "grad_norm": 0.6073471903800964, "learning_rate": 5.041920960480241e-05, "loss": 0.3282, "step": 49625 }, { "epoch": 837.97, "grad_norm": 0.5308436751365662, "learning_rate": 5.039419709854928e-05, "loss": 0.3249, "step": 49650 }, { "epoch": 838.4, "grad_norm": 1.0538098812103271, "learning_rate": 5.0369184592296145e-05, "loss": 0.3279, "step": 49675 }, { "epoch": 838.82, "grad_norm": 0.6898013353347778, "learning_rate": 5.034417208604303e-05, "loss": 0.3272, "step": 49700 }, { "epoch": 839.24, "grad_norm": 1.3229933977127075, "learning_rate": 5.03191595797899e-05, "loss": 0.3291, "step": 49725 }, { "epoch": 839.66, "grad_norm": 0.5678596496582031, "learning_rate": 5.029414707353677e-05, "loss": 0.3264, "step": 49750 }, { "epoch": 840.08, "grad_norm": 1.1706764698028564, "learning_rate": 5.026913456728365e-05, "loss": 0.3281, "step": 49775 }, { "epoch": 840.51, "grad_norm": 1.20086669921875, "learning_rate": 5.0244122061030515e-05, "loss": 0.3269, "step": 49800 }, { "epoch": 840.93, 
"grad_norm": 1.1949619054794312, "learning_rate": 5.021910955477739e-05, "loss": 0.3254, "step": 49825 }, { "epoch": 841.35, "grad_norm": 1.171925663948059, "learning_rate": 5.019409704852427e-05, "loss": 0.3259, "step": 49850 }, { "epoch": 841.77, "grad_norm": 0.6672475934028625, "learning_rate": 5.016908454227114e-05, "loss": 0.3276, "step": 49875 }, { "epoch": 842.19, "grad_norm": 0.7140427827835083, "learning_rate": 5.0144072036018006e-05, "loss": 0.3261, "step": 49900 }, { "epoch": 842.62, "grad_norm": 0.6159445643424988, "learning_rate": 5.011905952976489e-05, "loss": 0.3278, "step": 49925 }, { "epoch": 843.04, "grad_norm": 0.8772084712982178, "learning_rate": 5.009404702351176e-05, "loss": 0.3261, "step": 49950 }, { "epoch": 843.46, "grad_norm": 0.7444985508918762, "learning_rate": 5.006903451725863e-05, "loss": 0.3221, "step": 49975 }, { "epoch": 843.88, "grad_norm": 0.6548954844474792, "learning_rate": 5.004402201100551e-05, "loss": 0.3262, "step": 50000 }, { "epoch": 843.88, "eval_loss": 0.4606074392795563, "eval_runtime": 3.8255, "eval_samples_per_second": 80.774, "eval_steps_per_second": 2.614, "step": 50000 } ], "logging_steps": 25, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 1695, "save_steps": 10000, "total_flos": 3.19229069295554e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }