{ "best_metric": 0.5852221250534058, "best_model_checkpoint": "/ephemeral/models/qwen-describe_tasks/checkpoint-200", "epoch": 6.0, "eval_steps": 200, "global_step": 1344, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004464285714285714, "grad_norm": 10.935125350952148, "learning_rate": 0.0, "loss": 1.3545, "step": 1 }, { "epoch": 0.008928571428571428, "grad_norm": 7.386569976806641, "learning_rate": 2.626495350371936e-06, "loss": 1.0726, "step": 2 }, { "epoch": 0.013392857142857142, "grad_norm": 7.35675048828125, "learning_rate": 4.162896638657993e-06, "loss": 1.0713, "step": 3 }, { "epoch": 0.017857142857142856, "grad_norm": 6.851252555847168, "learning_rate": 5.252990700743872e-06, "loss": 0.9934, "step": 4 }, { "epoch": 0.022321428571428572, "grad_norm": 6.426003456115723, "learning_rate": 6.098533345119624e-06, "loss": 0.7561, "step": 5 }, { "epoch": 0.026785714285714284, "grad_norm": 4.757523536682129, "learning_rate": 6.7893919890299284e-06, "loss": 0.7267, "step": 6 }, { "epoch": 0.03125, "grad_norm": 3.9246585369110107, "learning_rate": 7.373504649628066e-06, "loss": 0.6757, "step": 7 }, { "epoch": 0.03571428571428571, "grad_norm": 5.09559965133667, "learning_rate": 7.879486051115807e-06, "loss": 0.7255, "step": 8 }, { "epoch": 0.04017857142857143, "grad_norm": 4.388647079467773, "learning_rate": 8.325793277315987e-06, "loss": 0.6737, "step": 9 }, { "epoch": 0.044642857142857144, "grad_norm": 3.693603754043579, "learning_rate": 8.72502869549156e-06, "loss": 0.6406, "step": 10 }, { "epoch": 0.049107142857142856, "grad_norm": 3.9717981815338135, "learning_rate": 9.086181061280522e-06, "loss": 0.587, "step": 11 }, { "epoch": 0.05357142857142857, "grad_norm": 3.3187167644500732, "learning_rate": 9.415887339401865e-06, "loss": 0.5724, "step": 12 }, { "epoch": 0.05803571428571429, "grad_norm": 3.3204233646392822, "learning_rate": 9.719187714029216e-06, "loss": 0.6456, "step": 13 }, { "epoch": 0.0625, "grad_norm": 3.1636106967926025, "learning_rate": 1e-05, "loss": 0.573, "step": 14 }, { "epoch": 0.06696428571428571, "grad_norm": 3.5804100036621094, "learning_rate": 9.999986052613417e-06, "loss": 0.5863, "step": 15 }, { "epoch": 0.07142857142857142, "grad_norm": 3.142025947570801, "learning_rate": 9.99994421053148e-06, "loss": 0.6012, "step": 16 }, { "epoch": 0.07589285714285714, "grad_norm": 2.6348273754119873, "learning_rate": 9.999874473987653e-06, "loss": 0.5631, "step": 17 }, { "epoch": 0.08035714285714286, "grad_norm": 2.788102149963379, "learning_rate": 9.999776843371027e-06, "loss": 0.6074, "step": 18 }, { "epoch": 0.08482142857142858, "grad_norm": 2.5704562664031982, "learning_rate": 9.99965131922634e-06, "loss": 0.5218, "step": 19 }, { "epoch": 0.08928571428571429, "grad_norm": 2.773559331893921, "learning_rate": 9.999497902253949e-06, "loss": 0.5921, "step": 20 }, { "epoch": 0.09375, "grad_norm": 2.585707664489746, "learning_rate": 9.999316593309849e-06, "loss": 0.603, "step": 21 }, { "epoch": 0.09821428571428571, "grad_norm": 2.896451711654663, "learning_rate": 9.999107393405655e-06, "loss": 0.588, "step": 22 }, { "epoch": 0.10267857142857142, "grad_norm": 2.606309652328491, "learning_rate": 9.998870303708601e-06, "loss": 0.4943, "step": 23 }, { "epoch": 0.10714285714285714, "grad_norm": 2.6549859046936035, "learning_rate": 9.998605325541531e-06, "loss": 0.5824, "step": 24 }, { "epoch": 0.11160714285714286, "grad_norm": 2.7567014694213867, "learning_rate": 9.998312460382895e-06, "loss": 0.5915, "step": 25 }, { "epoch": 0.11607142857142858, "grad_norm": 2.674490451812744, "learning_rate": 9.997991709866738e-06, "loss": 0.5946, "step": 26 }, { "epoch": 0.12053571428571429, "grad_norm": 2.0534379482269287, "learning_rate": 9.997643075782691e-06, "loss": 0.4713, "step": 27 }, { "epoch": 0.125, "grad_norm": 2.5476534366607666, "learning_rate": 9.997266560075961e-06, "loss": 0.5565, "step": 28 }, { "epoch": 0.12946428571428573, "grad_norm": 3.0594515800476074, "learning_rate": 9.996862164847323e-06, "loss": 0.5195, "step": 29 }, { "epoch": 0.13392857142857142, "grad_norm": 2.6829309463500977, "learning_rate": 9.996429892353107e-06, "loss": 0.4949, "step": 30 }, { "epoch": 0.13839285714285715, "grad_norm": 2.6954853534698486, "learning_rate": 9.99596974500518e-06, "loss": 0.5276, "step": 31 }, { "epoch": 0.14285714285714285, "grad_norm": 2.8353872299194336, "learning_rate": 9.995481725370941e-06, "loss": 0.5713, "step": 32 }, { "epoch": 0.14732142857142858, "grad_norm": 2.503706693649292, "learning_rate": 9.994965836173303e-06, "loss": 0.5171, "step": 33 }, { "epoch": 0.15178571428571427, "grad_norm": 2.876476526260376, "learning_rate": 9.994422080290675e-06, "loss": 0.5046, "step": 34 }, { "epoch": 0.15625, "grad_norm": 2.6266987323760986, "learning_rate": 9.99385046075695e-06, "loss": 0.5366, "step": 35 }, { "epoch": 0.16071428571428573, "grad_norm": 2.4998559951782227, "learning_rate": 9.993250980761487e-06, "loss": 0.5082, "step": 36 }, { "epoch": 0.16517857142857142, "grad_norm": 2.6557369232177734, "learning_rate": 9.99262364364909e-06, "loss": 0.5126, "step": 37 }, { "epoch": 0.16964285714285715, "grad_norm": 2.4919795989990234, "learning_rate": 9.991968452919999e-06, "loss": 0.459, "step": 38 }, { "epoch": 0.17410714285714285, "grad_norm": 2.501042604446411, "learning_rate": 9.991285412229854e-06, "loss": 0.5021, "step": 39 }, { "epoch": 0.17857142857142858, "grad_norm": 2.462800979614258, "learning_rate": 9.99057452538969e-06, "loss": 0.5227, "step": 40 }, { "epoch": 0.18303571428571427, "grad_norm": 2.2987539768218994, "learning_rate": 9.989835796365911e-06, "loss": 0.491, "step": 41 }, { "epoch": 0.1875, "grad_norm": 2.5660648345947266, "learning_rate": 9.989069229280264e-06, "loss": 0.5264, "step": 42 }, { "epoch": 0.19196428571428573, "grad_norm": 2.3724122047424316, "learning_rate": 9.988274828409821e-06, "loss": 0.5362, "step": 43 }, { "epoch": 0.19642857142857142, "grad_norm": 2.1337292194366455, "learning_rate": 9.987452598186947e-06, "loss": 0.4827, "step": 44 }, { "epoch": 0.20089285714285715, "grad_norm": 2.4473042488098145, "learning_rate": 9.986602543199292e-06, "loss": 0.5034, "step": 45 }, { "epoch": 0.20535714285714285, "grad_norm": 2.3975181579589844, "learning_rate": 9.985724668189744e-06, "loss": 0.5155, "step": 46 }, { "epoch": 0.20982142857142858, "grad_norm": 2.3558027744293213, "learning_rate": 9.98481897805642e-06, "loss": 0.5271, "step": 47 }, { "epoch": 0.21428571428571427, "grad_norm": 2.3652427196502686, "learning_rate": 9.983885477852628e-06, "loss": 0.5966, "step": 48 }, { "epoch": 0.21875, "grad_norm": 2.551260471343994, "learning_rate": 9.982924172786847e-06, "loss": 0.5586, "step": 49 }, { "epoch": 0.22321428571428573, "grad_norm": 2.4399120807647705, "learning_rate": 9.981935068222687e-06, "loss": 0.5133, "step": 50 }, { "epoch": 0.22767857142857142, "grad_norm": 2.097429037094116, "learning_rate": 9.980918169678872e-06, "loss": 0.439, "step": 51 }, { "epoch": 0.23214285714285715, "grad_norm": 3.4427826404571533, "learning_rate": 9.979873482829199e-06, "loss": 0.5122, "step": 52 }, { "epoch": 0.23660714285714285, "grad_norm": 2.506448268890381, "learning_rate": 9.978801013502511e-06, "loss": 0.5187, "step": 53 }, { "epoch": 0.24107142857142858, "grad_norm": 2.153078556060791, "learning_rate": 9.977700767682665e-06, "loss": 0.4699, "step": 54 }, { "epoch": 0.24553571428571427, "grad_norm": 2.5463476181030273, "learning_rate": 9.976572751508497e-06, "loss": 0.5425, "step": 55 }, { "epoch": 0.25, "grad_norm": 2.3144137859344482, "learning_rate": 9.975416971273787e-06, "loss": 0.4973, "step": 56 }, { "epoch": 0.2544642857142857, "grad_norm": 2.373551607131958, "learning_rate": 9.974233433427222e-06, "loss": 0.4538, "step": 57 }, { "epoch": 0.25892857142857145, "grad_norm": 1.888371467590332, "learning_rate": 9.97302214457237e-06, "loss": 0.4525, "step": 58 }, { "epoch": 0.26339285714285715, "grad_norm": 2.0926015377044678, "learning_rate": 9.971783111467635e-06, "loss": 0.4793, "step": 59 }, { "epoch": 0.26785714285714285, "grad_norm": 2.090359926223755, "learning_rate": 9.970516341026211e-06, "loss": 0.4818, "step": 60 }, { "epoch": 0.27232142857142855, "grad_norm": 2.268264055252075, "learning_rate": 9.969221840316066e-06, "loss": 0.5479, "step": 61 }, { "epoch": 0.2767857142857143, "grad_norm": 2.044856309890747, "learning_rate": 9.967899616559879e-06, "loss": 0.4726, "step": 62 }, { "epoch": 0.28125, "grad_norm": 2.361753225326538, "learning_rate": 9.966549677135015e-06, "loss": 0.4901, "step": 63 }, { "epoch": 0.2857142857142857, "grad_norm": 2.3921494483947754, "learning_rate": 9.965172029573479e-06, "loss": 0.4856, "step": 64 }, { "epoch": 0.29017857142857145, "grad_norm": 2.134155750274658, "learning_rate": 9.96376668156187e-06, "loss": 0.4621, "step": 65 }, { "epoch": 0.29464285714285715, "grad_norm": 2.1714282035827637, "learning_rate": 9.962333640941349e-06, "loss": 0.5009, "step": 66 }, { "epoch": 0.29910714285714285, "grad_norm": 2.107659339904785, "learning_rate": 9.960872915707582e-06, "loss": 0.5125, "step": 67 }, { "epoch": 0.30357142857142855, "grad_norm": 1.874833583831787, "learning_rate": 9.959384514010703e-06, "loss": 0.4568, "step": 68 }, { "epoch": 0.3080357142857143, "grad_norm": 2.18188214302063, "learning_rate": 9.95786844415527e-06, "loss": 0.5336, "step": 69 }, { "epoch": 0.3125, "grad_norm": 1.9995635747909546, "learning_rate": 9.956324714600212e-06, "loss": 0.507, "step": 70 }, { "epoch": 0.3169642857142857, "grad_norm": 2.198190212249756, "learning_rate": 9.95475333395879e-06, "loss": 0.4801, "step": 71 }, { "epoch": 0.32142857142857145, "grad_norm": 2.370695114135742, "learning_rate": 9.95315431099854e-06, "loss": 0.4944, "step": 72 }, { "epoch": 0.32589285714285715, "grad_norm": 2.126605272293091, "learning_rate": 9.951527654641231e-06, "loss": 0.4874, "step": 73 }, { "epoch": 0.33035714285714285, "grad_norm": 2.093745231628418, "learning_rate": 9.949873373962814e-06, "loss": 0.5515, "step": 74 }, { "epoch": 0.33482142857142855, "grad_norm": 1.9955790042877197, "learning_rate": 9.948191478193365e-06, "loss": 0.4524, "step": 75 }, { "epoch": 0.3392857142857143, "grad_norm": 1.8829865455627441, "learning_rate": 9.946481976717046e-06, "loss": 0.4387, "step": 76 }, { "epoch": 0.34375, "grad_norm": 1.8707062005996704, "learning_rate": 9.944744879072043e-06, "loss": 0.4467, "step": 77 }, { "epoch": 0.3482142857142857, "grad_norm": 2.0644657611846924, "learning_rate": 9.942980194950511e-06, "loss": 0.5096, "step": 78 }, { "epoch": 0.35267857142857145, "grad_norm": 2.6129536628723145, "learning_rate": 9.941187934198528e-06, "loss": 0.5458, "step": 79 }, { "epoch": 0.35714285714285715, "grad_norm": 2.0972604751586914, "learning_rate": 9.939368106816038e-06, "loss": 0.4918, "step": 80 }, { "epoch": 0.36160714285714285, "grad_norm": 2.2218029499053955, "learning_rate": 9.937520722956789e-06, "loss": 0.5811, "step": 81 }, { "epoch": 0.36607142857142855, "grad_norm": 2.017685890197754, "learning_rate": 9.93564579292828e-06, "loss": 0.4626, "step": 82 }, { "epoch": 0.3705357142857143, "grad_norm": 2.521484613418579, "learning_rate": 9.933743327191711e-06, "loss": 0.4818, "step": 83 }, { "epoch": 0.375, "grad_norm": 2.1667404174804688, "learning_rate": 9.93181333636191e-06, "loss": 0.5324, "step": 84 }, { "epoch": 0.3794642857142857, "grad_norm": 2.0817134380340576, "learning_rate": 9.929855831207288e-06, "loss": 0.4595, "step": 85 }, { "epoch": 0.38392857142857145, "grad_norm": 1.9740232229232788, "learning_rate": 9.92787082264977e-06, "loss": 0.4228, "step": 86 }, { "epoch": 0.38839285714285715, "grad_norm": 1.985465407371521, "learning_rate": 9.925858321764733e-06, "loss": 0.4971, "step": 87 }, { "epoch": 0.39285714285714285, "grad_norm": 2.02755069732666, "learning_rate": 9.923818339780954e-06, "loss": 0.5322, "step": 88 }, { "epoch": 0.39732142857142855, "grad_norm": 2.2033133506774902, "learning_rate": 9.921750888080534e-06, "loss": 0.4646, "step": 89 }, { "epoch": 0.4017857142857143, "grad_norm": 2.1710963249206543, "learning_rate": 9.91965597819885e-06, "loss": 0.4665, "step": 90 }, { "epoch": 0.40625, "grad_norm": 1.907015085220337, "learning_rate": 9.917533621824476e-06, "loss": 0.408, "step": 91 }, { "epoch": 0.4107142857142857, "grad_norm": 1.8381679058074951, "learning_rate": 9.915383830799129e-06, "loss": 0.4057, "step": 92 }, { "epoch": 0.41517857142857145, "grad_norm": 2.105541229248047, "learning_rate": 9.91320661711759e-06, "loss": 0.5089, "step": 93 }, { "epoch": 0.41964285714285715, "grad_norm": 2.172125816345215, "learning_rate": 9.911001992927655e-06, "loss": 0.4856, "step": 94 }, { "epoch": 0.42410714285714285, "grad_norm": 2.0441055297851562, "learning_rate": 9.908769970530049e-06, "loss": 0.505, "step": 95 }, { "epoch": 0.42857142857142855, "grad_norm": 1.9920746088027954, "learning_rate": 9.90651056237837e-06, "loss": 0.4962, "step": 96 }, { "epoch": 0.4330357142857143, "grad_norm": 2.0216598510742188, "learning_rate": 9.904223781079017e-06, "loss": 0.5041, "step": 97 }, { "epoch": 0.4375, "grad_norm": 2.212711811065674, "learning_rate": 9.901909639391111e-06, "loss": 0.5078, "step": 98 }, { "epoch": 0.4419642857142857, "grad_norm": 2.3241405487060547, "learning_rate": 9.899568150226435e-06, "loss": 0.4316, "step": 99 }, { "epoch": 0.44642857142857145, "grad_norm": 2.092745542526245, "learning_rate": 9.897199326649362e-06, "loss": 0.4653, "step": 100 }, { "epoch": 0.45089285714285715, "grad_norm": 1.8443938493728638, "learning_rate": 9.894803181876765e-06, "loss": 0.4527, "step": 101 }, { "epoch": 0.45535714285714285, "grad_norm": 1.827662467956543, "learning_rate": 9.892379729277972e-06, "loss": 0.455, "step": 102 }, { "epoch": 0.45982142857142855, "grad_norm": 1.8256633281707764, "learning_rate": 9.889928982374663e-06, "loss": 0.369, "step": 103 }, { "epoch": 0.4642857142857143, "grad_norm": 2.305311918258667, "learning_rate": 9.887450954840812e-06, "loss": 0.4713, "step": 104 }, { "epoch": 0.46875, "grad_norm": 2.3441269397735596, "learning_rate": 9.884945660502607e-06, "loss": 0.4571, "step": 105 }, { "epoch": 0.4732142857142857, "grad_norm": 1.9434335231781006, "learning_rate": 9.882413113338364e-06, "loss": 0.4814, "step": 106 }, { "epoch": 0.47767857142857145, "grad_norm": 2.1630849838256836, "learning_rate": 9.879853327478465e-06, "loss": 0.4856, "step": 107 }, { "epoch": 0.48214285714285715, "grad_norm": 2.32119083404541, "learning_rate": 9.877266317205268e-06, "loss": 0.4614, "step": 108 }, { "epoch": 0.48660714285714285, "grad_norm": 2.141681671142578, "learning_rate": 9.874652096953028e-06, "loss": 0.4633, "step": 109 }, { "epoch": 0.49107142857142855, "grad_norm": 2.1267635822296143, "learning_rate": 9.872010681307821e-06, "loss": 0.5275, "step": 110 }, { "epoch": 0.4955357142857143, "grad_norm": 2.1307332515716553, "learning_rate": 9.869342085007458e-06, "loss": 0.4224, "step": 111 }, { "epoch": 0.5, "grad_norm": 1.9681659936904907, "learning_rate": 9.866646322941405e-06, "loss": 0.4645, "step": 112 }, { "epoch": 0.5044642857142857, "grad_norm": 1.8705610036849976, "learning_rate": 9.863923410150704e-06, "loss": 0.4672, "step": 113 }, { "epoch": 0.5089285714285714, "grad_norm": 2.2226390838623047, "learning_rate": 9.861173361827876e-06, "loss": 0.4418, "step": 114 }, { "epoch": 0.5133928571428571, "grad_norm": 1.979615569114685, "learning_rate": 9.858396193316853e-06, "loss": 0.4622, "step": 115 }, { "epoch": 0.5178571428571429, "grad_norm": 2.0155093669891357, "learning_rate": 9.855591920112883e-06, "loss": 0.4585, "step": 116 }, { "epoch": 0.5223214285714286, "grad_norm": 1.7425578832626343, "learning_rate": 9.85276055786244e-06, "loss": 0.4938, "step": 117 }, { "epoch": 0.5267857142857143, "grad_norm": 2.0626533031463623, "learning_rate": 9.849902122363148e-06, "loss": 0.458, "step": 118 }, { "epoch": 0.53125, "grad_norm": 1.8312275409698486, "learning_rate": 9.847016629563683e-06, "loss": 0.4615, "step": 119 }, { "epoch": 0.5357142857142857, "grad_norm": 2.0553085803985596, "learning_rate": 9.844104095563689e-06, "loss": 0.5286, "step": 120 }, { "epoch": 0.5401785714285714, "grad_norm": 1.95139741897583, "learning_rate": 9.841164536613685e-06, "loss": 0.4863, "step": 121 }, { "epoch": 0.5446428571428571, "grad_norm": 1.9694424867630005, "learning_rate": 9.83819796911498e-06, "loss": 0.5279, "step": 122 }, { "epoch": 0.5491071428571429, "grad_norm": 1.9648712873458862, "learning_rate": 9.83520440961957e-06, "loss": 0.3807, "step": 123 }, { "epoch": 0.5535714285714286, "grad_norm": 1.8197687864303589, "learning_rate": 9.83218387483006e-06, "loss": 0.4943, "step": 124 }, { "epoch": 0.5580357142857143, "grad_norm": 1.864090919494629, "learning_rate": 9.829136381599563e-06, "loss": 0.4804, "step": 125 }, { "epoch": 0.5625, "grad_norm": 1.827278971672058, "learning_rate": 9.826061946931605e-06, "loss": 0.4173, "step": 126 }, { "epoch": 0.5669642857142857, "grad_norm": 1.8508820533752441, "learning_rate": 9.822960587980034e-06, "loss": 0.4831, "step": 127 }, { "epoch": 0.5714285714285714, "grad_norm": 1.801448941230774, "learning_rate": 9.81983232204892e-06, "loss": 0.4823, "step": 128 }, { "epoch": 0.5758928571428571, "grad_norm": 1.9353251457214355, "learning_rate": 9.816677166592462e-06, "loss": 0.4074, "step": 129 }, { "epoch": 0.5803571428571429, "grad_norm": 1.9287128448486328, "learning_rate": 9.81349513921489e-06, "loss": 0.4495, "step": 130 }, { "epoch": 0.5848214285714286, "grad_norm": 1.5913102626800537, "learning_rate": 9.810286257670365e-06, "loss": 0.4161, "step": 131 }, { "epoch": 0.5892857142857143, "grad_norm": 1.9423521757125854, "learning_rate": 9.807050539862884e-06, "loss": 0.4254, "step": 132 }, { "epoch": 0.59375, "grad_norm": 1.7807602882385254, "learning_rate": 9.803788003846175e-06, "loss": 0.4541, "step": 133 }, { "epoch": 0.5982142857142857, "grad_norm": 1.9615179300308228, "learning_rate": 9.800498667823595e-06, "loss": 0.4375, "step": 134 }, { "epoch": 0.6026785714285714, "grad_norm": 1.7444758415222168, "learning_rate": 9.797182550148039e-06, "loss": 0.4568, "step": 135 }, { "epoch": 0.6071428571428571, "grad_norm": 1.9967138767242432, "learning_rate": 9.793839669321828e-06, "loss": 0.4152, "step": 136 }, { "epoch": 0.6116071428571429, "grad_norm": 1.8935024738311768, "learning_rate": 9.790470043996604e-06, "loss": 0.4807, "step": 137 }, { "epoch": 0.6160714285714286, "grad_norm": 1.8592950105667114, "learning_rate": 9.78707369297324e-06, "loss": 0.4849, "step": 138 }, { "epoch": 0.6205357142857143, "grad_norm": 1.8688433170318604, "learning_rate": 9.783650635201714e-06, "loss": 0.5193, "step": 139 }, { "epoch": 0.625, "grad_norm": 1.806913137435913, "learning_rate": 9.780200889781021e-06, "loss": 0.4224, "step": 140 }, { "epoch": 0.6294642857142857, "grad_norm": 1.8969337940216064, "learning_rate": 9.776724475959061e-06, "loss": 0.4986, "step": 141 }, { "epoch": 0.6339285714285714, "grad_norm": 1.5755013227462769, "learning_rate": 9.773221413132525e-06, "loss": 0.4343, "step": 142 }, { "epoch": 0.6383928571428571, "grad_norm": 1.70070481300354, "learning_rate": 9.769691720846801e-06, "loss": 0.4163, "step": 143 }, { "epoch": 0.6428571428571429, "grad_norm": 1.9572802782058716, "learning_rate": 9.766135418795848e-06, "loss": 0.4618, "step": 144 }, { "epoch": 0.6473214285714286, "grad_norm": 1.9200794696807861, "learning_rate": 9.762552526822098e-06, "loss": 0.4258, "step": 145 }, { "epoch": 0.6517857142857143, "grad_norm": 1.9915560483932495, "learning_rate": 9.758943064916342e-06, "loss": 0.4185, "step": 146 }, { "epoch": 0.65625, "grad_norm": 1.6791561841964722, "learning_rate": 9.755307053217622e-06, "loss": 0.4074, "step": 147 }, { "epoch": 0.6607142857142857, "grad_norm": 1.984868049621582, "learning_rate": 9.751644512013106e-06, "loss": 0.461, "step": 148 }, { "epoch": 0.6651785714285714, "grad_norm": 1.925027847290039, "learning_rate": 9.74795546173799e-06, "loss": 0.4601, "step": 149 }, { "epoch": 0.6696428571428571, "grad_norm": 2.0234742164611816, "learning_rate": 9.744239922975377e-06, "loss": 0.4874, "step": 150 }, { "epoch": 0.6741071428571429, "grad_norm": 1.8171749114990234, "learning_rate": 9.740497916456163e-06, "loss": 0.4806, "step": 151 }, { "epoch": 0.6785714285714286, "grad_norm": 1.8766131401062012, "learning_rate": 9.736729463058921e-06, "loss": 0.5144, "step": 152 }, { "epoch": 0.6830357142857143, "grad_norm": 1.7700015306472778, "learning_rate": 9.732934583809782e-06, "loss": 0.4958, "step": 153 }, { "epoch": 0.6875, "grad_norm": 1.7756420373916626, "learning_rate": 9.729113299882324e-06, "loss": 0.4685, "step": 154 }, { "epoch": 0.6919642857142857, "grad_norm": 1.7557669878005981, "learning_rate": 9.725265632597448e-06, "loss": 0.4736, "step": 155 }, { "epoch": 0.6964285714285714, "grad_norm": 1.7786169052124023, "learning_rate": 9.721391603423263e-06, "loss": 0.4582, "step": 156 }, { "epoch": 0.7008928571428571, "grad_norm": 1.7477728128433228, "learning_rate": 9.717491233974962e-06, "loss": 0.4399, "step": 157 }, { "epoch": 0.7053571428571429, "grad_norm": 1.8034288883209229, "learning_rate": 9.713564546014707e-06, "loss": 0.4933, "step": 158 }, { "epoch": 0.7098214285714286, "grad_norm": 1.7433615922927856, "learning_rate": 9.7096115614515e-06, "loss": 0.4367, "step": 159 }, { "epoch": 0.7142857142857143, "grad_norm": 1.9310708045959473, "learning_rate": 9.705632302341073e-06, "loss": 0.4674, "step": 160 }, { "epoch": 0.71875, "grad_norm": 1.709022879600525, "learning_rate": 9.701626790885749e-06, "loss": 0.4361, "step": 161 }, { "epoch": 0.7232142857142857, "grad_norm": 1.7623347043991089, "learning_rate": 9.69759504943433e-06, "loss": 0.4037, "step": 162 }, { "epoch": 0.7276785714285714, "grad_norm": 1.6316179037094116, "learning_rate": 9.69353710048197e-06, "loss": 0.4707, "step": 163 }, { "epoch": 0.7321428571428571, "grad_norm": 1.966737151145935, "learning_rate": 9.68945296667004e-06, "loss": 0.4493, "step": 164 }, { "epoch": 0.7366071428571429, "grad_norm": 1.7973566055297852, "learning_rate": 9.685342670786025e-06, "loss": 0.5024, "step": 165 }, { "epoch": 0.7410714285714286, "grad_norm": 2.026155710220337, "learning_rate": 9.681206235763367e-06, "loss": 0.4879, "step": 166 }, { "epoch": 0.7455357142857143, "grad_norm": 1.679463267326355, "learning_rate": 9.677043684681358e-06, "loss": 0.4495, "step": 167 }, { "epoch": 0.75, "grad_norm": 1.7095201015472412, "learning_rate": 9.672855040765006e-06, "loss": 0.3991, "step": 168 }, { "epoch": 0.7544642857142857, "grad_norm": 1.7846328020095825, "learning_rate": 9.668640327384899e-06, "loss": 0.4653, "step": 169 }, { "epoch": 0.7589285714285714, "grad_norm": 1.9439399242401123, "learning_rate": 9.664399568057087e-06, "loss": 0.4811, "step": 170 }, { "epoch": 0.7633928571428571, "grad_norm": 1.8142772912979126, "learning_rate": 9.660132786442937e-06, "loss": 0.4638, "step": 171 }, { "epoch": 0.7678571428571429, "grad_norm": 1.8977925777435303, "learning_rate": 9.655840006349014e-06, "loss": 0.4698, "step": 172 }, { "epoch": 0.7723214285714286, "grad_norm": 1.624567985534668, "learning_rate": 9.651521251726936e-06, "loss": 0.4651, "step": 173 }, { "epoch": 0.7767857142857143, "grad_norm": 1.6974815130233765, "learning_rate": 9.64717654667325e-06, "loss": 0.5096, "step": 174 }, { "epoch": 0.78125, "grad_norm": 1.6671240329742432, "learning_rate": 9.642805915429291e-06, "loss": 0.4255, "step": 175 }, { "epoch": 0.7857142857142857, "grad_norm": 1.703273057937622, "learning_rate": 9.638409382381052e-06, "loss": 0.4436, "step": 176 }, { "epoch": 0.7901785714285714, "grad_norm": 1.924634575843811, "learning_rate": 9.633986972059047e-06, "loss": 0.4679, "step": 177 }, { "epoch": 0.7946428571428571, "grad_norm": 1.6996525526046753, "learning_rate": 9.629538709138166e-06, "loss": 0.4836, "step": 178 }, { "epoch": 0.7991071428571429, "grad_norm": 1.6877120733261108, "learning_rate": 9.625064618437549e-06, "loss": 0.4473, "step": 179 }, { "epoch": 0.8035714285714286, "grad_norm": 1.6341931819915771, "learning_rate": 9.620564724920443e-06, "loss": 0.4279, "step": 180 }, { "epoch": 0.8080357142857143, "grad_norm": 1.5455591678619385, "learning_rate": 9.616039053694058e-06, "loss": 0.4124, "step": 181 }, { "epoch": 0.8125, "grad_norm": 1.6947259902954102, "learning_rate": 9.611487630009436e-06, "loss": 0.4061, "step": 182 }, { "epoch": 0.8169642857142857, "grad_norm": 1.7641195058822632, "learning_rate": 9.606910479261301e-06, "loss": 0.4839, "step": 183 }, { "epoch": 0.8214285714285714, "grad_norm": 1.513904333114624, "learning_rate": 9.602307626987925e-06, "loss": 0.403, "step": 184 }, { "epoch": 0.8258928571428571, "grad_norm": 1.791146993637085, "learning_rate": 9.597679098870978e-06, "loss": 0.4782, "step": 185 }, { "epoch": 0.8303571428571429, "grad_norm": 1.7051180601119995, "learning_rate": 9.593024920735393e-06, "loss": 0.4754, "step": 186 }, { "epoch": 0.8348214285714286, "grad_norm": 1.749476671218872, "learning_rate": 9.588345118549214e-06, "loss": 0.435, "step": 187 }, { "epoch": 0.8392857142857143, "grad_norm": 1.661120057106018, "learning_rate": 9.583639718423457e-06, "loss": 0.4536, "step": 188 }, { "epoch": 0.84375, "grad_norm": 1.4928163290023804, "learning_rate": 9.57890874661196e-06, "loss": 0.375, "step": 189 }, { "epoch": 0.8482142857142857, "grad_norm": 1.6453092098236084, "learning_rate": 9.57415222951124e-06, "loss": 0.445, "step": 190 }, { "epoch": 0.8526785714285714, "grad_norm": 1.6496193408966064, "learning_rate": 9.569370193660348e-06, "loss": 0.4098, "step": 191 }, { "epoch": 0.8571428571428571, "grad_norm": 1.779951810836792, "learning_rate": 9.564562665740708e-06, "loss": 0.5191, "step": 192 }, { "epoch": 0.8616071428571429, "grad_norm": 1.5148218870162964, "learning_rate": 9.559729672575985e-06, "loss": 0.4001, "step": 193 }, { "epoch": 0.8660714285714286, "grad_norm": 1.5799369812011719, "learning_rate": 9.554871241131923e-06, "loss": 0.4807, "step": 194 }, { "epoch": 0.8705357142857143, "grad_norm": 1.6424708366394043, "learning_rate": 9.549987398516206e-06, "loss": 0.4851, "step": 195 }, { "epoch": 0.875, "grad_norm": 1.766772985458374, "learning_rate": 9.54507817197829e-06, "loss": 0.4295, "step": 196 }, { "epoch": 0.8794642857142857, "grad_norm": 1.6645870208740234, "learning_rate": 9.540143588909268e-06, "loss": 0.426, "step": 197 }, { "epoch": 0.8839285714285714, "grad_norm": 1.8015894889831543, "learning_rate": 9.535183676841709e-06, "loss": 0.4602, "step": 198 }, { "epoch": 0.8883928571428571, "grad_norm": 1.8687186241149902, "learning_rate": 9.530198463449507e-06, "loss": 0.4674, "step": 199 }, { "epoch": 0.8928571428571429, "grad_norm": 1.8257641792297363, "learning_rate": 9.525187976547718e-06, "loss": 0.4584, "step": 200 }, { "epoch": 0.8928571428571429, "eval_loss": 0.5852221250534058, "eval_runtime": 5.367, "eval_samples_per_second": 11.18, "eval_steps_per_second": 0.745, "step": 200 }, { "epoch": 0.8973214285714286, "grad_norm": 1.5470904111862183, "learning_rate": 9.520152244092421e-06, "loss": 0.3601, "step": 201 }, { "epoch": 0.9017857142857143, "grad_norm": 1.608096957206726, "learning_rate": 9.515091294180546e-06, "loss": 0.3992, "step": 202 }, { "epoch": 0.90625, "grad_norm": 1.8043137788772583, "learning_rate": 9.510005155049729e-06, "loss": 0.4782, "step": 203 }, { "epoch": 0.9107142857142857, "grad_norm": 1.6916441917419434, "learning_rate": 9.504893855078144e-06, "loss": 0.3765, "step": 204 }, { "epoch": 0.9151785714285714, "grad_norm": 1.5904515981674194, "learning_rate": 9.499757422784358e-06, "loss": 0.4424, "step": 205 }, { "epoch": 0.9196428571428571, "grad_norm": 1.6705337762832642, "learning_rate": 9.494595886827157e-06, "loss": 0.4984, "step": 206 }, { "epoch": 0.9241071428571429, "grad_norm": 1.657344937324524, "learning_rate": 9.489409276005393e-06, "loss": 0.4511, "step": 207 }, { "epoch": 0.9285714285714286, "grad_norm": 1.6433309316635132, "learning_rate": 9.48419761925783e-06, "loss": 0.5035, "step": 208 }, { "epoch": 0.9330357142857143, "grad_norm": 1.81150484085083, "learning_rate": 9.478960945662974e-06, "loss": 0.3958, "step": 209 }, { "epoch": 0.9375, "grad_norm": 1.6642935276031494, "learning_rate": 9.473699284438908e-06, "loss": 0.4011, "step": 210 }, { "epoch": 0.9419642857142857, "grad_norm": 1.8117976188659668, "learning_rate": 9.468412664943137e-06, "loss": 0.5123, "step": 211 }, { "epoch": 0.9464285714285714, "grad_norm": 1.5370533466339111, "learning_rate": 9.463101116672423e-06, "loss": 0.4226, "step": 212 }, { "epoch": 0.9508928571428571, "grad_norm": 2.033900499343872, "learning_rate": 9.457764669262615e-06, "loss": 0.4902, "step": 213 }, { "epoch": 0.9553571428571429, "grad_norm": 1.672672986984253, "learning_rate": 9.452403352488488e-06, "loss": 0.442, "step": 214 }, { "epoch": 0.9598214285714286, "grad_norm": 1.9266510009765625, "learning_rate": 9.447017196263578e-06, "loss": 0.4805, "step": 215 }, { "epoch": 0.9642857142857143, "grad_norm": 1.629258155822754, "learning_rate": 9.441606230640012e-06, "loss": 0.444, "step": 216 }, { "epoch": 0.96875, "grad_norm": 1.6848729848861694, "learning_rate": 9.436170485808338e-06, "loss": 0.4444, "step": 217 }, { "epoch": 0.9732142857142857, "grad_norm": 1.693801999092102, "learning_rate": 9.430709992097364e-06, "loss": 0.4913, "step": 218 }, { "epoch": 0.9776785714285714, "grad_norm": 1.5817019939422607, "learning_rate": 9.425224779973986e-06, "loss": 0.4554, "step": 219 }, { "epoch": 0.9821428571428571, "grad_norm": 1.7226394414901733, "learning_rate": 9.41971488004301e-06, "loss": 0.5002, "step": 220 }, { "epoch": 0.9866071428571429, "grad_norm": 1.7014926671981812, "learning_rate": 9.414180323046991e-06, "loss": 0.4582, "step": 221 }, { "epoch": 0.9910714285714286, "grad_norm": 1.5330203771591187, "learning_rate": 9.408621139866067e-06, "loss": 0.3967, "step": 222 }, { "epoch": 0.9955357142857143, "grad_norm": 1.6649432182312012, "learning_rate": 9.403037361517762e-06, "loss": 0.4442, "step": 223 }, { "epoch": 1.0, "grad_norm": 1.5434422492980957, "learning_rate": 9.397429019156841e-06, "loss": 0.4281, "step": 224 }, { "epoch": 1.0044642857142858, "grad_norm": 1.4803626537322998, "learning_rate": 9.391796144075123e-06, "loss": 0.3408, "step": 225 }, { "epoch": 1.0089285714285714, "grad_norm": 1.7202813625335693, "learning_rate": 9.386138767701306e-06, "loss": 0.3212, "step": 226 }, { "epoch": 1.0133928571428572, "grad_norm": 1.5383710861206055, "learning_rate": 9.380456921600785e-06, "loss": 0.3628, "step": 227 }, { "epoch": 1.0178571428571428, "grad_norm": 1.598041296005249, "learning_rate": 9.374750637475499e-06, "loss": 0.3354, "step": 228 }, { "epoch": 1.0223214285714286, "grad_norm": 1.483928918838501, "learning_rate": 9.36901994716373e-06, "loss": 0.3572, "step": 229 }, { "epoch": 1.0267857142857142, "grad_norm": 1.4985754489898682, "learning_rate": 9.363264882639936e-06, "loss": 0.3117, "step": 230 }, { "epoch": 1.03125, "grad_norm": 1.6541988849639893, "learning_rate": 9.357485476014573e-06, "loss": 0.3235, "step": 231 }, { "epoch": 1.0357142857142858, "grad_norm": 1.459572672843933, "learning_rate": 9.351681759533914e-06, "loss": 0.2768, "step": 232 }, { "epoch": 1.0401785714285714, "grad_norm": 1.7430468797683716, "learning_rate": 9.345853765579865e-06, "loss": 0.2995, "step": 233 }, { "epoch": 1.0446428571428572, "grad_norm": 1.4982185363769531, "learning_rate": 9.340001526669794e-06, "loss": 0.2818, "step": 234 }, { "epoch": 1.0491071428571428, "grad_norm": 1.6810327768325806, "learning_rate": 9.33412507545634e-06, "loss": 0.3382, "step": 235 }, { "epoch": 1.0535714285714286, "grad_norm": 1.8060071468353271, "learning_rate": 9.32822444472724e-06, "loss": 0.343, "step": 236 }, { "epoch": 1.0580357142857142, "grad_norm": 1.786448359489441, "learning_rate": 9.322299667405134e-06, "loss": 0.3411, "step": 237 }, { "epoch": 1.0625, "grad_norm": 1.776330590248108, "learning_rate": 9.31635077654739e-06, "loss": 0.3569, "step": 238 }, { "epoch": 1.0669642857142858, "grad_norm": 1.5425933599472046, "learning_rate": 9.310377805345926e-06, "loss": 0.2962, "step": 239 }, { "epoch": 1.0714285714285714, "grad_norm": 1.6801799535751343, "learning_rate": 9.304380787127003e-06, "loss": 0.3163, "step": 240 }, { "epoch": 1.0758928571428572, "grad_norm": 1.6421353816986084, "learning_rate": 9.298359755351065e-06, "loss": 0.3159, "step": 241 }, { "epoch": 1.0803571428571428, "grad_norm": 1.7834713459014893, "learning_rate": 9.29231474361253e-06, "loss": 0.3795, "step": 242 }, { "epoch": 1.0848214285714286, "grad_norm": 1.6435611248016357, "learning_rate": 9.28624578563962e-06, "loss": 0.3161, "step": 243 }, { "epoch": 1.0892857142857142, "grad_norm": 1.579264760017395, "learning_rate": 9.280152915294162e-06, "loss": 0.3916, "step": 244 }, { "epoch": 1.09375, "grad_norm": 1.5122257471084595, "learning_rate": 9.274036166571402e-06, "loss": 0.3642, "step": 245 }, { "epoch": 1.0982142857142858, "grad_norm": 1.4102823734283447, "learning_rate": 9.267895573599819e-06, "loss": 0.3147, "step": 246 }, { "epoch": 1.1026785714285714, "grad_norm": 1.5816137790679932, "learning_rate": 9.261731170640923e-06, "loss": 0.3346, "step": 247 }, { "epoch": 1.1071428571428572, "grad_norm": 1.426952838897705, "learning_rate": 9.255542992089086e-06, "loss": 0.3033, "step": 248 }, { "epoch": 1.1116071428571428, "grad_norm": 1.437669038772583, "learning_rate": 9.24933107247132e-06, "loss": 0.3195, "step": 249 }, { "epoch": 1.1160714285714286, "grad_norm": 1.5356261730194092, "learning_rate": 9.243095446447113e-06, "loss": 0.306, "step": 250 }, { "epoch": 1.1205357142857142, "grad_norm": 1.4559781551361084, "learning_rate": 9.23683614880822e-06, "loss": 0.3088, "step": 251 }, { "epoch": 1.125, "grad_norm": 1.5992493629455566, "learning_rate": 9.230553214478469e-06, "loss": 0.3614, "step": 252 }, { "epoch": 1.1294642857142858, "grad_norm": 1.5381258726119995, "learning_rate": 9.224246678513569e-06, "loss": 0.322, "step": 253 }, { "epoch": 1.1339285714285714, "grad_norm": 1.6599974632263184, "learning_rate": 9.217916576100922e-06, "loss": 0.3098, "step": 254 }, { "epoch": 1.1383928571428572, "grad_norm": 1.5781593322753906, "learning_rate": 9.211562942559408e-06, "loss": 0.3397, "step": 255 }, { "epoch": 1.1428571428571428, "grad_norm": 1.5014089345932007, "learning_rate": 9.20518581333921e-06, "loss": 0.3195, "step": 256 }, { "epoch": 1.1473214285714286, "grad_norm": 1.5395718812942505, "learning_rate": 9.1987852240216e-06, "loss": 0.3319, "step": 257 }, { "epoch": 1.1517857142857142, "grad_norm": 1.4616154432296753, "learning_rate": 9.192361210318745e-06, "loss": 0.3141, "step": 258 }, { "epoch": 1.15625, "grad_norm": 1.5390483140945435, "learning_rate": 9.185913808073513e-06, "loss": 0.3062, "step": 259 }, { "epoch": 1.1607142857142858, "grad_norm": 1.6422654390335083, "learning_rate": 9.179443053259263e-06, "loss": 0.3551, "step": 260 }, { "epoch": 1.1651785714285714, "grad_norm": 1.5900368690490723, "learning_rate": 9.172948981979654e-06, "loss": 0.3655, "step": 261 }, { "epoch": 1.1696428571428572, "grad_norm": 1.582152009010315, "learning_rate": 9.166431630468438e-06, "loss": 0.3418, "step": 262 }, { "epoch": 1.1741071428571428, "grad_norm": 1.4746930599212646, "learning_rate": 9.159891035089262e-06, "loss": 0.2786, "step": 263 }, { "epoch": 1.1785714285714286, "grad_norm": 1.5723954439163208, "learning_rate": 9.153327232335455e-06, "loss": 0.3347, "step": 264 }, { "epoch": 1.1830357142857142, "grad_norm": 1.374165415763855, "learning_rate": 9.146740258829844e-06, "loss": 0.2811, "step": 265 }, { "epoch": 1.1875, "grad_norm": 1.6520895957946777, "learning_rate": 9.140130151324526e-06, "loss": 0.3786, "step": 266 }, { "epoch": 1.1919642857142858, "grad_norm": 1.3879092931747437, "learning_rate": 9.13349694670068e-06, "loss": 0.3063, "step": 267 }, { "epoch": 1.1964285714285714, "grad_norm": 1.3920384645462036, "learning_rate": 9.126840681968357e-06, "loss": 0.3191, "step": 268 }, { "epoch": 1.2008928571428572, "grad_norm": 1.6087769269943237, "learning_rate": 9.120161394266266e-06, "loss": 0.3101, "step": 269 }, { "epoch": 1.2053571428571428, "grad_norm": 1.465286135673523, "learning_rate": 9.113459120861579e-06, "loss": 0.2833, "step": 270 }, { "epoch": 1.2098214285714286, "grad_norm": 1.681369423866272, "learning_rate": 9.106733899149715e-06, "loss": 0.3389, "step": 271 }, { "epoch": 1.2142857142857142, "grad_norm": 1.5745911598205566, "learning_rate": 9.099985766654132e-06, "loss": 0.3352, "step": 272 }, { "epoch": 1.21875, "grad_norm": 1.3523564338684082, "learning_rate": 9.093214761026121e-06, "loss": 0.2964, "step": 273 }, { "epoch": 1.2232142857142858, "grad_norm": 1.2804710865020752, "learning_rate": 9.08642092004459e-06, "loss": 0.2992, "step": 274 }, { "epoch": 1.2276785714285714, "grad_norm": 1.521187424659729, "learning_rate": 9.079604281615868e-06, "loss": 0.3212, "step": 275 }, { "epoch": 1.2321428571428572, "grad_norm": 1.3694134950637817, "learning_rate": 9.072764883773464e-06, "loss": 0.3195, "step": 276 }, { "epoch": 1.2366071428571428, "grad_norm": 1.4687600135803223, "learning_rate": 9.065902764677897e-06, "loss": 0.2878, "step": 277 }, { "epoch": 1.2410714285714286, "grad_norm": 1.5896199941635132, "learning_rate": 9.059017962616435e-06, "loss": 0.3482, "step": 278 }, { "epoch": 1.2455357142857142, "grad_norm": 1.6721631288528442, "learning_rate": 9.052110516002925e-06, "loss": 0.3629, "step": 279 }, { "epoch": 1.25, "grad_norm": 1.703574299812317, "learning_rate": 9.04518046337755e-06, "loss": 0.3618, "step": 280 }, { "epoch": 1.2544642857142856, "grad_norm": 1.704490065574646, "learning_rate": 9.038227843406628e-06, "loss": 0.3397, "step": 281 }, { "epoch": 1.2589285714285714, "grad_norm": 1.508264422416687, "learning_rate": 9.031252694882386e-06, "loss": 0.3405, "step": 282 }, { "epoch": 1.2633928571428572, "grad_norm": 1.4581125974655151, "learning_rate": 9.024255056722753e-06, "loss": 0.3227, "step": 283 }, { "epoch": 1.2678571428571428, "grad_norm": 1.4935333728790283, "learning_rate": 9.017234967971143e-06, "loss": 0.3248, "step": 284 }, { "epoch": 1.2723214285714286, "grad_norm": 1.5179234743118286, "learning_rate": 9.010192467796228e-06, "loss": 0.3257, "step": 285 }, { "epoch": 1.2767857142857144, "grad_norm": 1.5567351579666138, "learning_rate": 9.003127595491723e-06, "loss": 0.3393, "step": 286 }, { "epoch": 1.28125, "grad_norm": 1.6843305826187134, "learning_rate": 8.996040390476177e-06, "loss": 0.365, "step": 287 }, { "epoch": 1.2857142857142856, "grad_norm": 1.558060646057129, "learning_rate": 8.988930892292738e-06, "loss": 0.3617, "step": 288 }, { "epoch": 1.2901785714285714, "grad_norm": 1.5220246315002441, "learning_rate": 8.981799140608938e-06, "loss": 0.3399, "step": 289 }, { "epoch": 1.2946428571428572, "grad_norm": 1.5785568952560425, "learning_rate": 8.974645175216478e-06, "loss": 0.3267, "step": 290 }, { "epoch": 1.2991071428571428, "grad_norm": 1.5936439037322998, "learning_rate": 8.967469036030996e-06, "loss": 0.3337, "step": 291 }, { "epoch": 1.3035714285714286, "grad_norm": 1.7652866840362549, "learning_rate": 8.960270763091853e-06, "loss": 0.3109, "step": 292 }, { "epoch": 1.3080357142857144, "grad_norm": 1.3734923601150513, "learning_rate": 8.953050396561904e-06, "loss": 0.3026, "step": 293 }, { "epoch": 1.3125, "grad_norm": 1.570879578590393, "learning_rate": 8.94580797672727e-06, "loss": 0.3305, "step": 294 }, { "epoch": 1.3169642857142856, "grad_norm": 1.5227439403533936, "learning_rate": 8.938543543997129e-06, "loss": 0.2904, "step": 295 }, { "epoch": 1.3214285714285714, "grad_norm": 1.756388783454895, "learning_rate": 8.931257138903474e-06, "loss": 0.3017, "step": 296 }, { "epoch": 1.3258928571428572, "grad_norm": 1.714189052581787, "learning_rate": 8.923948802100891e-06, "loss": 0.3486, "step": 297 }, { "epoch": 1.3303571428571428, "grad_norm": 1.7315165996551514, "learning_rate": 8.916618574366338e-06, "loss": 0.3542, "step": 298 }, { "epoch": 1.3348214285714286, "grad_norm": 1.5031179189682007, "learning_rate": 8.909266496598917e-06, "loss": 0.3139, "step": 299 }, { "epoch": 1.3392857142857144, "grad_norm": 1.3531993627548218, "learning_rate": 8.901892609819632e-06, "loss": 0.2723, "step": 300 }, { "epoch": 1.34375, "grad_norm": 1.5571861267089844, "learning_rate": 8.894496955171182e-06, "loss": 0.36, "step": 301 }, { "epoch": 1.3482142857142856, "grad_norm": 1.662018060684204, "learning_rate": 8.887079573917713e-06, "loss": 0.3305, "step": 302 }, { "epoch": 1.3526785714285714, "grad_norm": 1.696288824081421, "learning_rate": 8.879640507444598e-06, "loss": 0.3286, "step": 303 }, { "epoch": 1.3571428571428572, "grad_norm": 1.5394744873046875, "learning_rate": 8.872179797258202e-06, "loss": 0.3549, "step": 304 }, { "epoch": 1.3616071428571428, "grad_norm": 1.6615936756134033, "learning_rate": 8.86469748498565e-06, "loss": 0.3198, "step": 305 }, { "epoch": 1.3660714285714286, "grad_norm": 1.4944754838943481, "learning_rate": 8.8571936123746e-06, "loss": 0.3068, "step": 306 }, { "epoch": 1.3705357142857144, "grad_norm": 1.4707030057907104, "learning_rate": 8.849668221293e-06, "loss": 0.3293, "step": 307 }, { "epoch": 1.375, "grad_norm": 1.4484243392944336, "learning_rate": 8.842121353728867e-06, "loss": 0.3217, "step": 308 }, { "epoch": 1.3794642857142856, "grad_norm": 1.544541835784912, "learning_rate": 8.834553051790044e-06, "loss": 0.3514, "step": 309 }, { "epoch": 1.3839285714285714, "grad_norm": 1.3725217580795288, "learning_rate": 8.826963357703964e-06, "loss": 0.2865, "step": 310 }, { "epoch": 1.3883928571428572, "grad_norm": 1.586159110069275, "learning_rate": 8.819352313817424e-06, "loss": 0.3376, "step": 311 }, { "epoch": 1.3928571428571428, "grad_norm": 1.4718273878097534, "learning_rate": 8.811719962596338e-06, "loss": 0.3243, "step": 312 }, { "epoch": 1.3973214285714286, "grad_norm": 1.41029953956604, "learning_rate": 8.804066346625506e-06, "loss": 0.3142, "step": 313 }, { "epoch": 1.4017857142857144, "grad_norm": 1.6441102027893066, "learning_rate": 8.796391508608372e-06, "loss": 0.362, "step": 314 }, { "epoch": 1.40625, "grad_norm": 1.501712441444397, "learning_rate": 8.788695491366795e-06, "loss": 0.3175, "step": 315 }, { "epoch": 1.4107142857142856, "grad_norm": 1.4669523239135742, "learning_rate": 8.780978337840796e-06, "loss": 0.3059, "step": 316 }, { "epoch": 1.4151785714285714, "grad_norm": 1.505470871925354, "learning_rate": 8.773240091088335e-06, "loss": 0.3163, "step": 317 }, { "epoch": 1.4196428571428572, "grad_norm": 1.5221257209777832, "learning_rate": 8.765480794285054e-06, "loss": 0.3329, "step": 318 }, { "epoch": 1.4241071428571428, "grad_norm": 1.4438836574554443, "learning_rate": 8.757700490724046e-06, "loss": 0.3109, "step": 319 }, { "epoch": 1.4285714285714286, "grad_norm": 1.5612000226974487, "learning_rate": 8.749899223815618e-06, "loss": 0.3512, "step": 320 }, { "epoch": 1.4330357142857144, "grad_norm": 1.447957158088684, "learning_rate": 8.742077037087032e-06, "loss": 0.3244, "step": 321 }, { "epoch": 1.4375, "grad_norm": 1.4214884042739868, "learning_rate": 8.734233974182276e-06, "loss": 0.3149, "step": 322 }, { "epoch": 1.4419642857142856, "grad_norm": 1.732344388961792, "learning_rate": 8.726370078861825e-06, "loss": 0.341, "step": 323 }, { "epoch": 1.4464285714285714, "grad_norm": 1.5918134450912476, "learning_rate": 8.718485395002377e-06, "loss": 0.3636, "step": 324 }, { "epoch": 1.4508928571428572, "grad_norm": 1.486258625984192, "learning_rate": 8.710579966596625e-06, "loss": 0.3428, "step": 325 }, { "epoch": 1.4553571428571428, "grad_norm": 1.4833279848098755, "learning_rate": 8.702653837753005e-06, "loss": 0.3205, "step": 326 }, { "epoch": 1.4598214285714286, "grad_norm": 1.620224118232727, "learning_rate": 8.694707052695459e-06, "loss": 0.3322, "step": 327 }, { "epoch": 1.4642857142857144, "grad_norm": 1.5433918237686157, "learning_rate": 8.686739655763166e-06, "loss": 0.3492, "step": 328 }, { "epoch": 1.46875, "grad_norm": 1.4457906484603882, "learning_rate": 8.678751691410323e-06, "loss": 0.3102, "step": 329 }, { "epoch": 1.4732142857142856, "grad_norm": 1.4180353879928589, "learning_rate": 8.670743204205875e-06, "loss": 0.3432, "step": 330 }, { "epoch": 1.4776785714285714, "grad_norm": 1.3930200338363647, "learning_rate": 8.662714238833278e-06, "loss": 0.294, "step": 331 }, { "epoch": 1.4821428571428572, "grad_norm": 1.436591625213623, "learning_rate": 8.654664840090247e-06, "loss": 0.3666, "step": 332 }, { "epoch": 1.4866071428571428, "grad_norm": 1.6145936250686646, "learning_rate": 8.6465950528885e-06, "loss": 0.323, "step": 333 }, { "epoch": 1.4910714285714286, "grad_norm": 1.4172477722167969, "learning_rate": 8.638504922253518e-06, "loss": 0.3138, "step": 334 }, { "epoch": 1.4955357142857144, "grad_norm": 1.4870661497116089, "learning_rate": 8.63039449332429e-06, "loss": 0.3337, "step": 335 }, { "epoch": 1.5, "grad_norm": 1.487033724784851, "learning_rate": 8.62226381135305e-06, "loss": 0.3157, "step": 336 }, { "epoch": 1.5044642857142856, "grad_norm": 1.5234925746917725, "learning_rate": 8.614112921705045e-06, "loss": 0.3067, "step": 337 }, { "epoch": 1.5089285714285714, "grad_norm": 1.5859482288360596, "learning_rate": 8.605941869858265e-06, "loss": 0.3364, "step": 338 }, { "epoch": 1.5133928571428572, "grad_norm": 1.5042221546173096, "learning_rate": 8.597750701403197e-06, "loss": 0.3187, "step": 339 }, { "epoch": 1.5178571428571428, "grad_norm": 1.379291296005249, "learning_rate": 8.589539462042566e-06, "loss": 0.3073, "step": 340 }, { "epoch": 1.5223214285714286, "grad_norm": 1.4755321741104126, "learning_rate": 8.581308197591088e-06, "loss": 0.309, "step": 341 }, { "epoch": 1.5267857142857144, "grad_norm": 1.5573457479476929, "learning_rate": 8.573056953975208e-06, "loss": 0.3501, "step": 342 }, { "epoch": 1.53125, "grad_norm": 1.4575328826904297, "learning_rate": 8.56478577723284e-06, "loss": 0.3452, "step": 343 }, { "epoch": 1.5357142857142856, "grad_norm": 1.4823976755142212, "learning_rate": 8.556494713513123e-06, "loss": 0.3331, "step": 344 }, { "epoch": 1.5401785714285714, "grad_norm": 1.4269355535507202, "learning_rate": 8.548183809076146e-06, "loss": 0.3107, "step": 345 }, { "epoch": 1.5446428571428572, "grad_norm": 1.4220494031906128, "learning_rate": 8.539853110292708e-06, "loss": 0.3319, "step": 346 }, { "epoch": 1.5491071428571428, "grad_norm": 1.5970823764801025, "learning_rate": 8.531502663644046e-06, "loss": 0.3548, "step": 347 }, { "epoch": 1.5535714285714286, "grad_norm": 1.4499046802520752, "learning_rate": 8.523132515721586e-06, "loss": 0.3369, "step": 348 }, { "epoch": 1.5580357142857144, "grad_norm": 1.3973332643508911, "learning_rate": 8.51474271322667e-06, "loss": 0.3464, "step": 349 }, { "epoch": 1.5625, "grad_norm": 1.7323225736618042, "learning_rate": 8.506333302970306e-06, "loss": 0.3218, "step": 350 }, { "epoch": 1.5669642857142856, "grad_norm": 1.3297864198684692, "learning_rate": 8.497904331872909e-06, "loss": 0.301, "step": 351 }, { "epoch": 1.5714285714285714, "grad_norm": 1.5161700248718262, "learning_rate": 8.489455846964027e-06, "loss": 0.3468, "step": 352 }, { "epoch": 1.5758928571428572, "grad_norm": 1.4331910610198975, "learning_rate": 8.480987895382086e-06, "loss": 0.3613, "step": 353 }, { "epoch": 1.5803571428571428, "grad_norm": 1.5763874053955078, "learning_rate": 8.472500524374129e-06, "loss": 0.357, "step": 354 }, { "epoch": 1.5848214285714286, "grad_norm": 1.50728440284729, "learning_rate": 8.463993781295552e-06, "loss": 0.3252, "step": 355 }, { "epoch": 1.5892857142857144, "grad_norm": 1.4221165180206299, "learning_rate": 8.45546771360983e-06, "loss": 0.3036, "step": 356 }, { "epoch": 1.59375, "grad_norm": 1.430577278137207, "learning_rate": 8.44692236888827e-06, "loss": 0.3187, "step": 357 }, { "epoch": 1.5982142857142856, "grad_norm": 1.316603422164917, "learning_rate": 8.43835779480973e-06, "loss": 0.3275, "step": 358 }, { "epoch": 1.6026785714285714, "grad_norm": 1.6066330671310425, "learning_rate": 8.429774039160355e-06, "loss": 0.336, "step": 359 }, { "epoch": 1.6071428571428572, "grad_norm": 1.6857646703720093, "learning_rate": 8.421171149833322e-06, "loss": 0.3346, "step": 360 }, { "epoch": 1.6116071428571428, "grad_norm": 1.6161949634552002, "learning_rate": 8.412549174828558e-06, "loss": 0.3806, "step": 361 }, { "epoch": 1.6160714285714286, "grad_norm": 1.5804469585418701, "learning_rate": 8.403908162252481e-06, "loss": 0.3332, "step": 362 }, { "epoch": 1.6205357142857144, "grad_norm": 1.568248987197876, "learning_rate": 8.395248160317728e-06, "loss": 0.3175, "step": 363 }, { "epoch": 1.625, "grad_norm": 1.4972864389419556, "learning_rate": 8.386569217342893e-06, "loss": 0.3233, "step": 364 }, { "epoch": 1.6294642857142856, "grad_norm": 1.5257365703582764, "learning_rate": 8.377871381752246e-06, "loss": 0.3273, "step": 365 }, { "epoch": 1.6339285714285714, "grad_norm": 1.4747686386108398, "learning_rate": 8.369154702075466e-06, "loss": 0.3382, "step": 366 }, { "epoch": 1.6383928571428572, "grad_norm": 1.5610405206680298, "learning_rate": 8.360419226947383e-06, "loss": 0.3203, "step": 367 }, { "epoch": 1.6428571428571428, "grad_norm": 1.4387973546981812, "learning_rate": 8.351665005107686e-06, "loss": 0.3271, "step": 368 }, { "epoch": 1.6473214285714286, "grad_norm": 1.5783442258834839, "learning_rate": 8.34289208540067e-06, "loss": 0.3576, "step": 369 }, { "epoch": 1.6517857142857144, "grad_norm": 1.5295450687408447, "learning_rate": 8.334100516774946e-06, "loss": 0.3488, "step": 370 }, { "epoch": 1.65625, "grad_norm": 1.5279836654663086, "learning_rate": 8.325290348283186e-06, "loss": 0.358, "step": 371 }, { "epoch": 1.6607142857142856, "grad_norm": 1.5376560688018799, "learning_rate": 8.316461629081833e-06, "loss": 0.3025, "step": 372 }, { "epoch": 1.6651785714285714, "grad_norm": 1.4480183124542236, "learning_rate": 8.307614408430839e-06, "loss": 0.3609, "step": 373 }, { "epoch": 1.6696428571428572, "grad_norm": 1.5485166311264038, "learning_rate": 8.298748735693382e-06, "loss": 0.3405, "step": 374 }, { "epoch": 1.6741071428571428, "grad_norm": 1.408075213432312, "learning_rate": 8.289864660335595e-06, "loss": 0.322, "step": 375 }, { "epoch": 1.6785714285714286, "grad_norm": 1.5506187677383423, "learning_rate": 8.280962231926288e-06, "loss": 0.3385, "step": 376 }, { "epoch": 1.6830357142857144, "grad_norm": 1.6020299196243286, "learning_rate": 8.27204150013667e-06, "loss": 0.3353, "step": 377 }, { "epoch": 1.6875, "grad_norm": 1.4672306776046753, "learning_rate": 8.263102514740082e-06, "loss": 0.3086, "step": 378 }, { "epoch": 1.6919642857142856, "grad_norm": 1.5637385845184326, "learning_rate": 8.2541453256117e-06, "loss": 0.3136, "step": 379 }, { "epoch": 1.6964285714285714, "grad_norm": 1.446553111076355, "learning_rate": 8.245169982728276e-06, "loss": 0.3389, "step": 380 }, { "epoch": 1.7008928571428572, "grad_norm": 1.4007227420806885, "learning_rate": 8.23617653616785e-06, "loss": 0.3095, "step": 381 }, { "epoch": 1.7053571428571428, "grad_norm": 1.454406976699829, "learning_rate": 8.227165036109468e-06, "loss": 0.329, "step": 382 }, { "epoch": 1.7098214285714286, "grad_norm": 1.7223042249679565, "learning_rate": 8.218135532832909e-06, "loss": 0.3662, "step": 383 }, { "epoch": 1.7142857142857144, "grad_norm": 1.5587764978408813, "learning_rate": 8.209088076718398e-06, "loss": 0.3348, "step": 384 }, { "epoch": 1.71875, "grad_norm": 1.602912187576294, "learning_rate": 8.20002271824633e-06, "loss": 0.3736, "step": 385 }, { "epoch": 1.7232142857142856, "grad_norm": 1.5459792613983154, "learning_rate": 8.190939507996992e-06, "loss": 0.3559, "step": 386 }, { "epoch": 1.7276785714285714, "grad_norm": 1.5487347841262817, "learning_rate": 8.181838496650266e-06, "loss": 0.3466, "step": 387 }, { "epoch": 1.7321428571428572, "grad_norm": 1.5751219987869263, "learning_rate": 8.17271973498536e-06, "loss": 0.3541, "step": 388 }, { "epoch": 1.7366071428571428, "grad_norm": 1.5197848081588745, "learning_rate": 8.163583273880519e-06, "loss": 0.3276, "step": 389 }, { "epoch": 1.7410714285714286, "grad_norm": 1.4693080186843872, "learning_rate": 8.154429164312742e-06, "loss": 0.3051, "step": 390 }, { "epoch": 1.7455357142857144, "grad_norm": 1.4455931186676025, "learning_rate": 8.145257457357502e-06, "loss": 0.328, "step": 391 }, { "epoch": 1.75, "grad_norm": 1.5701878070831299, "learning_rate": 8.136068204188448e-06, "loss": 0.3301, "step": 392 }, { "epoch": 1.7544642857142856, "grad_norm": 1.358162522315979, "learning_rate": 8.12686145607714e-06, "loss": 0.3074, "step": 393 }, { "epoch": 1.7589285714285714, "grad_norm": 1.8741075992584229, "learning_rate": 8.11763726439274e-06, "loss": 0.3883, "step": 394 }, { "epoch": 1.7633928571428572, "grad_norm": 1.5605037212371826, "learning_rate": 8.108395680601742e-06, "loss": 0.3213, "step": 395 }, { "epoch": 1.7678571428571428, "grad_norm": 1.5065466165542603, "learning_rate": 8.099136756267682e-06, "loss": 0.3213, "step": 396 }, { "epoch": 1.7723214285714286, "grad_norm": 1.3987689018249512, "learning_rate": 8.089860543050843e-06, "loss": 0.324, "step": 397 }, { "epoch": 1.7767857142857144, "grad_norm": 1.6072906255722046, "learning_rate": 8.080567092707973e-06, "loss": 0.344, "step": 398 }, { "epoch": 1.78125, "grad_norm": 1.4562393426895142, "learning_rate": 8.071256457091994e-06, "loss": 0.3173, "step": 399 }, { "epoch": 1.7857142857142856, "grad_norm": 1.4061006307601929, "learning_rate": 8.06192868815172e-06, "loss": 0.321, "step": 400 }, { "epoch": 1.7857142857142856, "eval_loss": 0.5984869003295898, "eval_runtime": 4.436, "eval_samples_per_second": 13.526, "eval_steps_per_second": 0.902, "step": 400 }, { "epoch": 1.7901785714285714, "grad_norm": 1.521924614906311, "learning_rate": 8.05258383793155e-06, "loss": 0.3542, "step": 401 }, { "epoch": 1.7946428571428572, "grad_norm": 1.4396589994430542, "learning_rate": 8.043221958571193e-06, "loss": 0.3303, "step": 402 }, { "epoch": 1.7991071428571428, "grad_norm": 1.5028449296951294, "learning_rate": 8.033843102305376e-06, "loss": 0.3436, "step": 403 }, { "epoch": 1.8035714285714286, "grad_norm": 1.3993929624557495, "learning_rate": 8.024447321463545e-06, "loss": 0.3207, "step": 404 }, { "epoch": 1.8080357142857144, "grad_norm": 1.4850343465805054, "learning_rate": 8.015034668469576e-06, "loss": 0.3495, "step": 405 }, { "epoch": 1.8125, "grad_norm": 1.5270286798477173, "learning_rate": 8.005605195841485e-06, "loss": 0.3224, "step": 406 }, { "epoch": 1.8169642857142856, "grad_norm": 1.462645411491394, "learning_rate": 7.996158956191135e-06, "loss": 0.2767, "step": 407 }, { "epoch": 1.8214285714285714, "grad_norm": 1.292880654335022, "learning_rate": 7.986696002223936e-06, "loss": 0.2946, "step": 408 }, { "epoch": 1.8258928571428572, "grad_norm": 1.4748491048812866, "learning_rate": 7.97721638673856e-06, "loss": 0.3342, "step": 409 }, { "epoch": 1.8303571428571428, "grad_norm": 1.4022036790847778, "learning_rate": 7.967720162626643e-06, "loss": 0.333, "step": 410 }, { "epoch": 1.8348214285714286, "grad_norm": 1.479123830795288, "learning_rate": 7.958207382872486e-06, "loss": 0.3424, "step": 411 }, { "epoch": 1.8392857142857144, "grad_norm": 1.455062747001648, "learning_rate": 7.94867810055276e-06, "loss": 0.304, "step": 412 }, { "epoch": 1.84375, "grad_norm": 1.506971001625061, "learning_rate": 7.93913236883622e-06, "loss": 0.3409, "step": 413 }, { "epoch": 1.8482142857142856, "grad_norm": 1.5958027839660645, "learning_rate": 7.929570240983393e-06, "loss": 0.3264, "step": 414 }, { "epoch": 1.8526785714285714, "grad_norm": 1.3807499408721924, "learning_rate": 7.919991770346295e-06, "loss": 0.3436, "step": 415 }, { "epoch": 1.8571428571428572, "grad_norm": 1.3814480304718018, "learning_rate": 7.910397010368122e-06, "loss": 0.3288, "step": 416 }, { "epoch": 1.8616071428571428, "grad_norm": 1.412415862083435, "learning_rate": 7.900786014582957e-06, "loss": 0.353, "step": 417 }, { "epoch": 1.8660714285714286, "grad_norm": 1.3772637844085693, "learning_rate": 7.891158836615472e-06, "loss": 0.2992, "step": 418 }, { "epoch": 1.8705357142857144, "grad_norm": 1.425160527229309, "learning_rate": 7.881515530180629e-06, "loss": 0.3536, "step": 419 }, { "epoch": 1.875, "grad_norm": 1.4070899486541748, "learning_rate": 7.871856149083377e-06, "loss": 0.3252, "step": 420 }, { "epoch": 1.8794642857142856, "grad_norm": 1.3919873237609863, "learning_rate": 7.862180747218354e-06, "loss": 0.3378, "step": 421 }, { "epoch": 1.8839285714285714, "grad_norm": 1.4947540760040283, "learning_rate": 7.852489378569588e-06, "loss": 0.3383, "step": 422 }, { "epoch": 1.8883928571428572, "grad_norm": 1.500719428062439, "learning_rate": 7.84278209721019e-06, "loss": 0.3414, "step": 423 }, { "epoch": 1.8928571428571428, "grad_norm": 1.202958583831787, "learning_rate": 7.83305895730206e-06, "loss": 0.2683, "step": 424 }, { "epoch": 1.8973214285714286, "grad_norm": 1.518190860748291, "learning_rate": 7.823320013095578e-06, "loss": 0.3315, "step": 425 }, { "epoch": 1.9017857142857144, "grad_norm": 1.435086965560913, "learning_rate": 7.81356531892931e-06, "loss": 0.3238, "step": 426 }, { "epoch": 1.90625, "grad_norm": 1.4547661542892456, "learning_rate": 7.803794929229689e-06, "loss": 0.3171, "step": 427 }, { "epoch": 1.9107142857142856, "grad_norm": 1.5060296058654785, "learning_rate": 7.794008898510731e-06, "loss": 0.348, "step": 428 }, { "epoch": 1.9151785714285714, "grad_norm": 1.5503387451171875, "learning_rate": 7.784207281373716e-06, "loss": 0.3573, "step": 429 }, { "epoch": 1.9196428571428572, "grad_norm": 1.4911384582519531, "learning_rate": 7.774390132506892e-06, "loss": 0.3234, "step": 430 }, { "epoch": 1.9241071428571428, "grad_norm": 1.5123614072799683, "learning_rate": 7.764557506685162e-06, "loss": 0.3838, "step": 431 }, { "epoch": 1.9285714285714286, "grad_norm": 1.529229760169983, "learning_rate": 7.754709458769787e-06, "loss": 0.3425, "step": 432 }, { "epoch": 1.9330357142857144, "grad_norm": 1.4191179275512695, "learning_rate": 7.744846043708076e-06, "loss": 0.3175, "step": 433 }, { "epoch": 1.9375, "grad_norm": 1.5373347997665405, "learning_rate": 7.734967316533074e-06, "loss": 0.3379, "step": 434 }, { "epoch": 1.9419642857142856, "grad_norm": 1.562549114227295, "learning_rate": 7.725073332363265e-06, "loss": 0.3521, "step": 435 }, { "epoch": 1.9464285714285714, "grad_norm": 1.4550033807754517, "learning_rate": 7.715164146402259e-06, "loss": 0.3314, "step": 436 }, { "epoch": 1.9508928571428572, "grad_norm": 1.5194437503814697, "learning_rate": 7.705239813938486e-06, "loss": 0.3369, "step": 437 }, { "epoch": 1.9553571428571428, "grad_norm": 1.331009030342102, "learning_rate": 7.69530039034488e-06, "loss": 0.3156, "step": 438 }, { "epoch": 1.9598214285714286, "grad_norm": 1.4136159420013428, "learning_rate": 7.685345931078579e-06, "loss": 0.352, "step": 439 }, { "epoch": 1.9642857142857144, "grad_norm": 1.4941082000732422, "learning_rate": 7.675376491680617e-06, "loss": 0.3487, "step": 440 }, { "epoch": 1.96875, "grad_norm": 1.5224647521972656, "learning_rate": 7.665392127775605e-06, "loss": 0.3472, "step": 441 }, { "epoch": 1.9732142857142856, "grad_norm": 1.389153242111206, "learning_rate": 7.65539289507143e-06, "loss": 0.3188, "step": 442 }, { "epoch": 1.9776785714285714, "grad_norm": 1.3954931497573853, "learning_rate": 7.645378849358931e-06, "loss": 0.2805, "step": 443 }, { "epoch": 1.9821428571428572, "grad_norm": 1.4153132438659668, "learning_rate": 7.635350046511609e-06, "loss": 0.3351, "step": 444 }, { "epoch": 1.9866071428571428, "grad_norm": 1.4916757345199585, "learning_rate": 7.625306542485289e-06, "loss": 0.3388, "step": 445 }, { "epoch": 1.9910714285714286, "grad_norm": 1.395013689994812, "learning_rate": 7.615248393317833e-06, "loss": 0.3422, "step": 446 }, { "epoch": 1.9955357142857144, "grad_norm": 1.6745942831039429, "learning_rate": 7.605175655128809e-06, "loss": 0.3205, "step": 447 }, { "epoch": 2.0, "grad_norm": 1.5079350471496582, "learning_rate": 7.595088384119186e-06, "loss": 0.3489, "step": 448 }, { "epoch": 2.0044642857142856, "grad_norm": 1.3217769861221313, "learning_rate": 7.58498663657102e-06, "loss": 0.2103, "step": 449 }, { "epoch": 2.0089285714285716, "grad_norm": 1.3270319700241089, "learning_rate": 7.57487046884714e-06, "loss": 0.2176, "step": 450 }, { "epoch": 2.013392857142857, "grad_norm": 1.1652079820632935, "learning_rate": 7.5647399373908296e-06, "loss": 0.1737, "step": 451 }, { "epoch": 2.017857142857143, "grad_norm": 1.2553536891937256, "learning_rate": 7.554595098725515e-06, "loss": 0.2092, "step": 452 }, { "epoch": 2.0223214285714284, "grad_norm": 1.1758341789245605, "learning_rate": 7.544436009454454e-06, "loss": 0.1817, "step": 453 }, { "epoch": 2.0267857142857144, "grad_norm": 1.359291434288025, "learning_rate": 7.534262726260413e-06, "loss": 0.2095, "step": 454 }, { "epoch": 2.03125, "grad_norm": 1.4187179803848267, "learning_rate": 7.524075305905351e-06, "loss": 0.199, "step": 455 }, { "epoch": 2.0357142857142856, "grad_norm": 1.6442433595657349, "learning_rate": 7.513873805230111e-06, "loss": 0.2029, "step": 456 }, { "epoch": 2.0401785714285716, "grad_norm": 1.8652024269104004, "learning_rate": 7.5036582811540935e-06, "loss": 0.2194, "step": 457 }, { "epoch": 2.044642857142857, "grad_norm": 1.7254689931869507, "learning_rate": 7.493428790674943e-06, "loss": 0.1686, "step": 458 }, { "epoch": 2.049107142857143, "grad_norm": 1.8289271593093872, "learning_rate": 7.483185390868232e-06, "loss": 0.1941, "step": 459 }, { "epoch": 2.0535714285714284, "grad_norm": 1.7519882917404175, "learning_rate": 7.472928138887134e-06, "loss": 0.1833, "step": 460 }, { "epoch": 2.0580357142857144, "grad_norm": 1.9071825742721558, "learning_rate": 7.462657091962122e-06, "loss": 0.202, "step": 461 }, { "epoch": 2.0625, "grad_norm": 1.690094232559204, "learning_rate": 7.452372307400626e-06, "loss": 0.1903, "step": 462 }, { "epoch": 2.0669642857142856, "grad_norm": 1.6332119703292847, "learning_rate": 7.442073842586733e-06, "loss": 0.1868, "step": 463 }, { "epoch": 2.0714285714285716, "grad_norm": 1.5547043085098267, "learning_rate": 7.43176175498086e-06, "loss": 0.17, "step": 464 }, { "epoch": 2.075892857142857, "grad_norm": 1.6427596807479858, "learning_rate": 7.421436102119427e-06, "loss": 0.2107, "step": 465 }, { "epoch": 2.080357142857143, "grad_norm": 1.4404045343399048, "learning_rate": 7.411096941614543e-06, "loss": 0.1707, "step": 466 }, { "epoch": 2.0848214285714284, "grad_norm": 1.3924144506454468, "learning_rate": 7.400744331153684e-06, "loss": 0.1847, "step": 467 }, { "epoch": 2.0892857142857144, "grad_norm": 1.2086669206619263, "learning_rate": 7.390378328499372e-06, "loss": 0.1564, "step": 468 }, { "epoch": 2.09375, "grad_norm": 1.4519152641296387, "learning_rate": 7.3799989914888506e-06, "loss": 0.1997, "step": 469 }, { "epoch": 2.0982142857142856, "grad_norm": 1.478129506111145, "learning_rate": 7.3696063780337566e-06, "loss": 0.1925, "step": 470 }, { "epoch": 2.1026785714285716, "grad_norm": 1.650312900543213, "learning_rate": 7.359200546119813e-06, "loss": 0.1883, "step": 471 }, { "epoch": 2.107142857142857, "grad_norm": 1.4365777969360352, "learning_rate": 7.3487815538064865e-06, "loss": 0.1868, "step": 472 }, { "epoch": 2.111607142857143, "grad_norm": 1.342756748199463, "learning_rate": 7.338349459226678e-06, "loss": 0.1692, "step": 473 }, { "epoch": 2.1160714285714284, "grad_norm": 1.5859627723693848, "learning_rate": 7.327904320586387e-06, "loss": 0.201, "step": 474 }, { "epoch": 2.1205357142857144, "grad_norm": 1.499635100364685, "learning_rate": 7.3174461961644e-06, "loss": 0.1817, "step": 475 }, { "epoch": 2.125, "grad_norm": 2.32686710357666, "learning_rate": 7.3069751443119505e-06, "loss": 0.1704, "step": 476 }, { "epoch": 2.1294642857142856, "grad_norm": 1.412901759147644, "learning_rate": 7.296491223452407e-06, "loss": 0.1828, "step": 477 }, { "epoch": 2.1339285714285716, "grad_norm": 1.5171948671340942, "learning_rate": 7.285994492080934e-06, "loss": 0.1953, "step": 478 }, { "epoch": 2.138392857142857, "grad_norm": 1.5599428415298462, "learning_rate": 7.275485008764183e-06, "loss": 0.1923, "step": 479 }, { "epoch": 2.142857142857143, "grad_norm": 1.5545156002044678, "learning_rate": 7.2649628321399415e-06, "loss": 0.19, "step": 480 }, { "epoch": 2.1473214285714284, "grad_norm": 1.479979157447815, "learning_rate": 7.254428020916829e-06, "loss": 0.1862, "step": 481 }, { "epoch": 2.1517857142857144, "grad_norm": 1.4323021173477173, "learning_rate": 7.243880633873957e-06, "loss": 0.1968, "step": 482 }, { "epoch": 2.15625, "grad_norm": 1.5759376287460327, "learning_rate": 7.2333207298606075e-06, "loss": 0.1912, "step": 483 }, { "epoch": 2.1607142857142856, "grad_norm": 1.4067718982696533, "learning_rate": 7.222748367795892e-06, "loss": 0.1968, "step": 484 }, { "epoch": 2.1651785714285716, "grad_norm": 1.4289413690567017, "learning_rate": 7.212163606668442e-06, "loss": 0.181, "step": 485 }, { "epoch": 2.169642857142857, "grad_norm": 1.386183738708496, "learning_rate": 7.201566505536065e-06, "loss": 0.1859, "step": 486 }, { "epoch": 2.174107142857143, "grad_norm": 1.5737900733947754, "learning_rate": 7.190957123525417e-06, "loss": 0.1975, "step": 487 }, { "epoch": 2.1785714285714284, "grad_norm": 1.4346133470535278, "learning_rate": 7.180335519831685e-06, "loss": 0.1988, "step": 488 }, { "epoch": 2.1830357142857144, "grad_norm": 1.6407471895217896, "learning_rate": 7.169701753718232e-06, "loss": 0.1942, "step": 489 }, { "epoch": 2.1875, "grad_norm": 1.4272671937942505, "learning_rate": 7.159055884516297e-06, "loss": 0.1709, "step": 490 }, { "epoch": 2.1919642857142856, "grad_norm": 1.4495371580123901, "learning_rate": 7.148397971624636e-06, "loss": 0.1857, "step": 491 }, { "epoch": 2.1964285714285716, "grad_norm": 1.474555253982544, "learning_rate": 7.137728074509211e-06, "loss": 0.1994, "step": 492 }, { "epoch": 2.200892857142857, "grad_norm": 1.3996185064315796, "learning_rate": 7.127046252702847e-06, "loss": 0.1976, "step": 493 }, { "epoch": 2.205357142857143, "grad_norm": 1.4525631666183472, "learning_rate": 7.116352565804904e-06, "loss": 0.1781, "step": 494 }, { "epoch": 2.2098214285714284, "grad_norm": 1.3915441036224365, "learning_rate": 7.105647073480939e-06, "loss": 0.1934, "step": 495 }, { "epoch": 2.2142857142857144, "grad_norm": 1.548956274986267, "learning_rate": 7.0949298354623855e-06, "loss": 0.1956, "step": 496 }, { "epoch": 2.21875, "grad_norm": 1.67276930809021, "learning_rate": 7.084200911546205e-06, "loss": 0.2031, "step": 497 }, { "epoch": 2.2232142857142856, "grad_norm": 1.4230951070785522, "learning_rate": 7.073460361594565e-06, "loss": 0.1747, "step": 498 }, { "epoch": 2.2276785714285716, "grad_norm": 1.3589422702789307, "learning_rate": 7.0627082455344984e-06, "loss": 0.1697, "step": 499 }, { "epoch": 2.232142857142857, "grad_norm": 1.5158501863479614, "learning_rate": 7.0519446233575715e-06, "loss": 0.1931, "step": 500 }, { "epoch": 2.236607142857143, "grad_norm": 1.584936499595642, "learning_rate": 7.041169555119552e-06, "loss": 0.1835, "step": 501 }, { "epoch": 2.2410714285714284, "grad_norm": 1.75810968875885, "learning_rate": 7.030383100940068e-06, "loss": 0.2101, "step": 502 }, { "epoch": 2.2455357142857144, "grad_norm": 1.5574495792388916, "learning_rate": 7.019585321002276e-06, "loss": 0.1967, "step": 503 }, { "epoch": 2.25, "grad_norm": 1.4675954580307007, "learning_rate": 7.008776275552522e-06, "loss": 0.1835, "step": 504 }, { "epoch": 2.2544642857142856, "grad_norm": 1.5428667068481445, "learning_rate": 6.997956024900014e-06, "loss": 0.1758, "step": 505 }, { "epoch": 2.2589285714285716, "grad_norm": 1.7386614084243774, "learning_rate": 6.9871246294164775e-06, "loss": 0.2004, "step": 506 }, { "epoch": 2.263392857142857, "grad_norm": 1.6915313005447388, "learning_rate": 6.9762821495358194e-06, "loss": 0.2123, "step": 507 }, { "epoch": 2.267857142857143, "grad_norm": 1.512141227722168, "learning_rate": 6.965428645753792e-06, "loss": 0.1849, "step": 508 }, { "epoch": 2.2723214285714284, "grad_norm": 1.3899809122085571, "learning_rate": 6.954564178627655e-06, "loss": 0.1755, "step": 509 }, { "epoch": 2.2767857142857144, "grad_norm": 1.446179747581482, "learning_rate": 6.943688808775843e-06, "loss": 0.2002, "step": 510 }, { "epoch": 2.28125, "grad_norm": 1.4694817066192627, "learning_rate": 6.9328025968776155e-06, "loss": 0.1733, "step": 511 }, { "epoch": 2.2857142857142856, "grad_norm": 1.4220669269561768, "learning_rate": 6.921905603672733e-06, "loss": 0.1865, "step": 512 }, { "epoch": 2.2901785714285716, "grad_norm": 1.5075308084487915, "learning_rate": 6.910997889961098e-06, "loss": 0.2068, "step": 513 }, { "epoch": 2.294642857142857, "grad_norm": 1.433173418045044, "learning_rate": 6.900079516602445e-06, "loss": 0.1646, "step": 514 }, { "epoch": 2.299107142857143, "grad_norm": 1.5466556549072266, "learning_rate": 6.889150544515972e-06, "loss": 0.1876, "step": 515 }, { "epoch": 2.3035714285714284, "grad_norm": 1.5720268487930298, "learning_rate": 6.8782110346800155e-06, "loss": 0.1895, "step": 516 }, { "epoch": 2.3080357142857144, "grad_norm": 1.5228114128112793, "learning_rate": 6.867261048131712e-06, "loss": 0.1775, "step": 517 }, { "epoch": 2.3125, "grad_norm": 1.5427687168121338, "learning_rate": 6.856300645966645e-06, "loss": 0.1949, "step": 518 }, { "epoch": 2.3169642857142856, "grad_norm": 1.4784200191497803, "learning_rate": 6.845329889338519e-06, "loss": 0.1972, "step": 519 }, { "epoch": 2.3214285714285716, "grad_norm": 1.5751314163208008, "learning_rate": 6.834348839458806e-06, "loss": 0.192, "step": 520 }, { "epoch": 2.325892857142857, "grad_norm": 1.4170998334884644, "learning_rate": 6.823357557596416e-06, "loss": 0.1739, "step": 521 }, { "epoch": 2.330357142857143, "grad_norm": 1.5813453197479248, "learning_rate": 6.81235610507734e-06, "loss": 0.1973, "step": 522 }, { "epoch": 2.3348214285714284, "grad_norm": 1.4130178689956665, "learning_rate": 6.801344543284324e-06, "loss": 0.1784, "step": 523 }, { "epoch": 2.3392857142857144, "grad_norm": 1.503487229347229, "learning_rate": 6.790322933656515e-06, "loss": 0.1758, "step": 524 }, { "epoch": 2.34375, "grad_norm": 1.5043838024139404, "learning_rate": 6.779291337689122e-06, "loss": 0.1931, "step": 525 }, { "epoch": 2.3482142857142856, "grad_norm": 1.5936574935913086, "learning_rate": 6.768249816933074e-06, "loss": 0.2102, "step": 526 }, { "epoch": 2.3526785714285716, "grad_norm": 1.39701247215271, "learning_rate": 6.757198432994674e-06, "loss": 0.1888, "step": 527 }, { "epoch": 2.357142857142857, "grad_norm": 1.4147759675979614, "learning_rate": 6.7461372475352585e-06, "loss": 0.1728, "step": 528 }, { "epoch": 2.361607142857143, "grad_norm": 1.5155245065689087, "learning_rate": 6.73506632227085e-06, "loss": 0.2029, "step": 529 }, { "epoch": 2.3660714285714284, "grad_norm": 1.3971562385559082, "learning_rate": 6.723985718971818e-06, "loss": 0.1712, "step": 530 }, { "epoch": 2.3705357142857144, "grad_norm": 1.4848732948303223, "learning_rate": 6.712895499462527e-06, "loss": 0.1778, "step": 531 }, { "epoch": 2.375, "grad_norm": 1.3537763357162476, "learning_rate": 6.701795725620995e-06, "loss": 0.1807, "step": 532 }, { "epoch": 2.3794642857142856, "grad_norm": 1.278257131576538, "learning_rate": 6.69068645937855e-06, "loss": 0.1739, "step": 533 }, { "epoch": 2.3839285714285716, "grad_norm": 1.4302947521209717, "learning_rate": 6.6795677627194835e-06, "loss": 0.1757, "step": 534 }, { "epoch": 2.388392857142857, "grad_norm": 1.5175752639770508, "learning_rate": 6.668439697680704e-06, "loss": 0.1744, "step": 535 }, { "epoch": 2.392857142857143, "grad_norm": 1.582108974456787, "learning_rate": 6.65730232635139e-06, "loss": 0.2061, "step": 536 }, { "epoch": 2.3973214285714284, "grad_norm": 1.3695740699768066, "learning_rate": 6.6461557108726435e-06, "loss": 0.1861, "step": 537 }, { "epoch": 2.4017857142857144, "grad_norm": 1.4187901020050049, "learning_rate": 6.634999913437148e-06, "loss": 0.1646, "step": 538 }, { "epoch": 2.40625, "grad_norm": 1.5142133235931396, "learning_rate": 6.623834996288816e-06, "loss": 0.1878, "step": 539 }, { "epoch": 2.4107142857142856, "grad_norm": 1.4513888359069824, "learning_rate": 6.6126610217224405e-06, "loss": 0.2083, "step": 540 }, { "epoch": 2.4151785714285716, "grad_norm": 1.6881784200668335, "learning_rate": 6.601478052083356e-06, "loss": 0.215, "step": 541 }, { "epoch": 2.419642857142857, "grad_norm": 1.5719410181045532, "learning_rate": 6.59028614976708e-06, "loss": 0.2007, "step": 542 }, { "epoch": 2.424107142857143, "grad_norm": 1.5767160654067993, "learning_rate": 6.579085377218973e-06, "loss": 0.1993, "step": 543 }, { "epoch": 2.4285714285714284, "grad_norm": 1.4855055809020996, "learning_rate": 6.567875796933888e-06, "loss": 0.1962, "step": 544 }, { "epoch": 2.4330357142857144, "grad_norm": 1.4299588203430176, "learning_rate": 6.556657471455817e-06, "loss": 0.1904, "step": 545 }, { "epoch": 2.4375, "grad_norm": 1.4422001838684082, "learning_rate": 6.54543046337755e-06, "loss": 0.211, "step": 546 }, { "epoch": 2.4419642857142856, "grad_norm": 1.6163188219070435, "learning_rate": 6.534194835340321e-06, "loss": 0.2123, "step": 547 }, { "epoch": 2.4464285714285716, "grad_norm": 1.485174298286438, "learning_rate": 6.522950650033454e-06, "loss": 0.1834, "step": 548 }, { "epoch": 2.450892857142857, "grad_norm": 1.5538384914398193, "learning_rate": 6.511697970194024e-06, "loss": 0.1982, "step": 549 }, { "epoch": 2.455357142857143, "grad_norm": 1.3437016010284424, "learning_rate": 6.500436858606501e-06, "loss": 0.1809, "step": 550 }, { "epoch": 2.4598214285714284, "grad_norm": 1.5390422344207764, "learning_rate": 6.4891673781023975e-06, "loss": 0.1861, "step": 551 }, { "epoch": 2.4642857142857144, "grad_norm": 1.5996623039245605, "learning_rate": 6.477889591559926e-06, "loss": 0.2132, "step": 552 }, { "epoch": 2.46875, "grad_norm": 1.3555296659469604, "learning_rate": 6.466603561903633e-06, "loss": 0.1709, "step": 553 }, { "epoch": 2.4732142857142856, "grad_norm": 1.4055936336517334, "learning_rate": 6.455309352104065e-06, "loss": 0.1931, "step": 554 }, { "epoch": 2.4776785714285716, "grad_norm": 1.490340232849121, "learning_rate": 6.444007025177407e-06, "loss": 0.1923, "step": 555 }, { "epoch": 2.482142857142857, "grad_norm": 1.451197862625122, "learning_rate": 6.4326966441851355e-06, "loss": 0.1822, "step": 556 }, { "epoch": 2.486607142857143, "grad_norm": 1.33744215965271, "learning_rate": 6.4213782722336625e-06, "loss": 0.1557, "step": 557 }, { "epoch": 2.4910714285714284, "grad_norm": 1.7766042947769165, "learning_rate": 6.4100519724739875e-06, "loss": 0.2113, "step": 558 }, { "epoch": 2.4955357142857144, "grad_norm": 1.5189027786254883, "learning_rate": 6.3987178081013446e-06, "loss": 0.1917, "step": 559 }, { "epoch": 2.5, "grad_norm": 1.4922457933425903, "learning_rate": 6.387375842354843e-06, "loss": 0.1928, "step": 560 }, { "epoch": 2.5044642857142856, "grad_norm": 1.5204309225082397, "learning_rate": 6.376026138517125e-06, "loss": 0.1852, "step": 561 }, { "epoch": 2.508928571428571, "grad_norm": 1.5281089544296265, "learning_rate": 6.364668759914005e-06, "loss": 0.1955, "step": 562 }, { "epoch": 2.513392857142857, "grad_norm": 1.4325025081634521, "learning_rate": 6.353303769914121e-06, "loss": 0.2076, "step": 563 }, { "epoch": 2.517857142857143, "grad_norm": 1.4212669134140015, "learning_rate": 6.341931231928577e-06, "loss": 0.1674, "step": 564 }, { "epoch": 2.522321428571429, "grad_norm": 1.6153303384780884, "learning_rate": 6.330551209410593e-06, "loss": 0.1878, "step": 565 }, { "epoch": 2.5267857142857144, "grad_norm": 1.956329345703125, "learning_rate": 6.319163765855146e-06, "loss": 0.2096, "step": 566 }, { "epoch": 2.53125, "grad_norm": 1.5983258485794067, "learning_rate": 6.307768964798623e-06, "loss": 0.1859, "step": 567 }, { "epoch": 2.5357142857142856, "grad_norm": 1.6349304914474487, "learning_rate": 6.296366869818458e-06, "loss": 0.2049, "step": 568 }, { "epoch": 2.540178571428571, "grad_norm": 1.6352767944335938, "learning_rate": 6.284957544532783e-06, "loss": 0.2139, "step": 569 }, { "epoch": 2.544642857142857, "grad_norm": 1.6097818613052368, "learning_rate": 6.273541052600074e-06, "loss": 0.2227, "step": 570 }, { "epoch": 2.549107142857143, "grad_norm": 1.4204213619232178, "learning_rate": 6.2621174577187895e-06, "loss": 0.1626, "step": 571 }, { "epoch": 2.553571428571429, "grad_norm": 1.5108147859573364, "learning_rate": 6.250686823627022e-06, "loss": 0.1906, "step": 572 }, { "epoch": 2.5580357142857144, "grad_norm": 1.318894863128662, "learning_rate": 6.239249214102139e-06, "loss": 0.1644, "step": 573 }, { "epoch": 2.5625, "grad_norm": 1.4823791980743408, "learning_rate": 6.2278046929604265e-06, "loss": 0.1953, "step": 574 }, { "epoch": 2.5669642857142856, "grad_norm": 1.440727949142456, "learning_rate": 6.216353324056732e-06, "loss": 0.182, "step": 575 }, { "epoch": 2.571428571428571, "grad_norm": 1.3238186836242676, "learning_rate": 6.204895171284115e-06, "loss": 0.1706, "step": 576 }, { "epoch": 2.575892857142857, "grad_norm": 1.4335119724273682, "learning_rate": 6.193430298573481e-06, "loss": 0.2052, "step": 577 }, { "epoch": 2.580357142857143, "grad_norm": 1.4563027620315552, "learning_rate": 6.181958769893234e-06, "loss": 0.2076, "step": 578 }, { "epoch": 2.584821428571429, "grad_norm": 1.4558268785476685, "learning_rate": 6.17048064924891e-06, "loss": 0.2074, "step": 579 }, { "epoch": 2.5892857142857144, "grad_norm": 1.4916455745697021, "learning_rate": 6.158996000682829e-06, "loss": 0.1868, "step": 580 }, { "epoch": 2.59375, "grad_norm": 1.5160123109817505, "learning_rate": 6.147504888273736e-06, "loss": 0.1952, "step": 581 }, { "epoch": 2.5982142857142856, "grad_norm": 1.518829107284546, "learning_rate": 6.136007376136429e-06, "loss": 0.2029, "step": 582 }, { "epoch": 2.602678571428571, "grad_norm": 1.2285252809524536, "learning_rate": 6.124503528421429e-06, "loss": 0.1513, "step": 583 }, { "epoch": 2.607142857142857, "grad_norm": 1.465366244316101, "learning_rate": 6.112993409314594e-06, "loss": 0.183, "step": 584 }, { "epoch": 2.611607142857143, "grad_norm": 1.3672759532928467, "learning_rate": 6.101477083036783e-06, "loss": 0.1763, "step": 585 }, { "epoch": 2.616071428571429, "grad_norm": 1.5228221416473389, "learning_rate": 6.0899546138434785e-06, "loss": 0.2012, "step": 586 }, { "epoch": 2.6205357142857144, "grad_norm": 1.5724462270736694, "learning_rate": 6.0784260660244475e-06, "loss": 0.1689, "step": 587 }, { "epoch": 2.625, "grad_norm": 1.5473616123199463, "learning_rate": 6.066891503903363e-06, "loss": 0.1887, "step": 588 }, { "epoch": 2.6294642857142856, "grad_norm": 1.4898332357406616, "learning_rate": 6.0553509918374635e-06, "loss": 0.1752, "step": 589 }, { "epoch": 2.633928571428571, "grad_norm": 1.443292498588562, "learning_rate": 6.0438045942171775e-06, "loss": 0.1846, "step": 590 }, { "epoch": 2.638392857142857, "grad_norm": 1.4720582962036133, "learning_rate": 6.032252375465778e-06, "loss": 0.1795, "step": 591 }, { "epoch": 2.642857142857143, "grad_norm": 1.4213638305664062, "learning_rate": 6.020694400039017e-06, "loss": 0.1745, "step": 592 }, { "epoch": 2.647321428571429, "grad_norm": 1.5590949058532715, "learning_rate": 6.009130732424758e-06, "loss": 0.1866, "step": 593 }, { "epoch": 2.6517857142857144, "grad_norm": 1.6477134227752686, "learning_rate": 5.997561437142636e-06, "loss": 0.2065, "step": 594 }, { "epoch": 2.65625, "grad_norm": 12.8923921585083, "learning_rate": 5.985986578743676e-06, "loss": 0.1789, "step": 595 }, { "epoch": 2.6607142857142856, "grad_norm": 1.4159936904907227, "learning_rate": 5.974406221809949e-06, "loss": 0.1875, "step": 596 }, { "epoch": 2.665178571428571, "grad_norm": 1.6385328769683838, "learning_rate": 5.962820430954198e-06, "loss": 0.2118, "step": 597 }, { "epoch": 2.669642857142857, "grad_norm": 1.4674583673477173, "learning_rate": 5.951229270819494e-06, "loss": 0.1992, "step": 598 }, { "epoch": 2.674107142857143, "grad_norm": 1.3399938344955444, "learning_rate": 5.9396328060788576e-06, "loss": 0.1903, "step": 599 }, { "epoch": 2.678571428571429, "grad_norm": 1.516242265701294, "learning_rate": 5.928031101434908e-06, "loss": 0.1835, "step": 600 }, { "epoch": 2.678571428571429, "eval_loss": 0.6832678914070129, "eval_runtime": 4.5972, "eval_samples_per_second": 13.051, "eval_steps_per_second": 0.87, "step": 600 }, { "epoch": 2.6830357142857144, "grad_norm": 1.591635823249817, "learning_rate": 5.916424221619507e-06, "loss": 0.2111, "step": 601 }, { "epoch": 2.6875, "grad_norm": 1.3842867612838745, "learning_rate": 5.904812231393383e-06, "loss": 0.1948, "step": 602 }, { "epoch": 2.6919642857142856, "grad_norm": 1.2690483331680298, "learning_rate": 5.893195195545784e-06, "loss": 0.1747, "step": 603 }, { "epoch": 2.696428571428571, "grad_norm": 1.5160295963287354, "learning_rate": 5.8815731788941064e-06, "loss": 0.185, "step": 604 }, { "epoch": 2.700892857142857, "grad_norm": 1.5125572681427002, "learning_rate": 5.86994624628354e-06, "loss": 0.1879, "step": 605 }, { "epoch": 2.705357142857143, "grad_norm": 1.4671415090560913, "learning_rate": 5.858314462586697e-06, "loss": 0.1826, "step": 606 }, { "epoch": 2.709821428571429, "grad_norm": 1.5814452171325684, "learning_rate": 5.846677892703268e-06, "loss": 0.2095, "step": 607 }, { "epoch": 2.7142857142857144, "grad_norm": 1.3826875686645508, "learning_rate": 5.835036601559634e-06, "loss": 0.1791, "step": 608 }, { "epoch": 2.71875, "grad_norm": 1.4754812717437744, "learning_rate": 5.82339065410853e-06, "loss": 0.1866, "step": 609 }, { "epoch": 2.7232142857142856, "grad_norm": 1.541463017463684, "learning_rate": 5.811740115328665e-06, "loss": 0.1812, "step": 610 }, { "epoch": 2.727678571428571, "grad_norm": 1.5529158115386963, "learning_rate": 5.800085050224367e-06, "loss": 0.198, "step": 611 }, { "epoch": 2.732142857142857, "grad_norm": 1.5426830053329468, "learning_rate": 5.7884255238252175e-06, "loss": 0.1981, "step": 612 }, { "epoch": 2.736607142857143, "grad_norm": 1.610823154449463, "learning_rate": 5.776761601185692e-06, "loss": 0.1886, "step": 613 }, { "epoch": 2.741071428571429, "grad_norm": 1.5132354497909546, "learning_rate": 5.765093347384793e-06, "loss": 0.1957, "step": 614 }, { "epoch": 2.7455357142857144, "grad_norm": 1.4942512512207031, "learning_rate": 5.753420827525691e-06, "loss": 0.2006, "step": 615 }, { "epoch": 2.75, "grad_norm": 1.5573617219924927, "learning_rate": 5.741744106735354e-06, "loss": 0.1947, "step": 616 }, { "epoch": 2.7544642857142856, "grad_norm": 1.4934524297714233, "learning_rate": 5.730063250164196e-06, "loss": 0.177, "step": 617 }, { "epoch": 2.758928571428571, "grad_norm": 1.6269088983535767, "learning_rate": 5.718378322985702e-06, "loss": 0.1986, "step": 618 }, { "epoch": 2.763392857142857, "grad_norm": 1.3660807609558105, "learning_rate": 5.70668939039607e-06, "loss": 0.1755, "step": 619 }, { "epoch": 2.767857142857143, "grad_norm": 1.5547926425933838, "learning_rate": 5.694996517613847e-06, "loss": 0.2123, "step": 620 }, { "epoch": 2.772321428571429, "grad_norm": 1.579438328742981, "learning_rate": 5.683299769879562e-06, "loss": 0.1965, "step": 621 }, { "epoch": 2.7767857142857144, "grad_norm": 1.6027774810791016, "learning_rate": 5.6715992124553685e-06, "loss": 0.1928, "step": 622 }, { "epoch": 2.78125, "grad_norm": 1.556585669517517, "learning_rate": 5.6598949106246734e-06, "loss": 0.203, "step": 623 }, { "epoch": 2.7857142857142856, "grad_norm": 1.7546970844268799, "learning_rate": 5.648186929691776e-06, "loss": 0.2128, "step": 624 }, { "epoch": 2.790178571428571, "grad_norm": 1.5303380489349365, "learning_rate": 5.6364753349815035e-06, "loss": 0.1956, "step": 625 }, { "epoch": 2.794642857142857, "grad_norm": 1.4398573637008667, "learning_rate": 5.624760191838845e-06, "loss": 0.1818, "step": 626 }, { "epoch": 2.799107142857143, "grad_norm": 1.4977809190750122, "learning_rate": 5.61304156562859e-06, "loss": 0.2104, "step": 627 }, { "epoch": 2.803571428571429, "grad_norm": 1.465651512145996, "learning_rate": 5.60131952173496e-06, "loss": 0.1905, "step": 628 }, { "epoch": 2.8080357142857144, "grad_norm": 1.5513821840286255, "learning_rate": 5.589594125561246e-06, "loss": 0.201, "step": 629 }, { "epoch": 2.8125, "grad_norm": 1.4582335948944092, "learning_rate": 5.577865442529447e-06, "loss": 0.2003, "step": 630 }, { "epoch": 2.8169642857142856, "grad_norm": 1.513297438621521, "learning_rate": 5.566133538079893e-06, "loss": 0.1771, "step": 631 }, { "epoch": 2.821428571428571, "grad_norm": 1.5092986822128296, "learning_rate": 5.554398477670895e-06, "loss": 0.185, "step": 632 }, { "epoch": 2.825892857142857, "grad_norm": 1.3965249061584473, "learning_rate": 5.54266032677837e-06, "loss": 0.1931, "step": 633 }, { "epoch": 2.830357142857143, "grad_norm": 1.5133389234542847, "learning_rate": 5.53091915089548e-06, "loss": 0.1805, "step": 634 }, { "epoch": 2.834821428571429, "grad_norm": 1.5035574436187744, "learning_rate": 5.5191750155322595e-06, "loss": 0.1936, "step": 635 }, { "epoch": 2.8392857142857144, "grad_norm": 1.4027115106582642, "learning_rate": 5.507427986215265e-06, "loss": 0.188, "step": 636 }, { "epoch": 2.84375, "grad_norm": 1.6537582874298096, "learning_rate": 5.49567812848719e-06, "loss": 0.1989, "step": 637 }, { "epoch": 2.8482142857142856, "grad_norm": 1.6560035943984985, "learning_rate": 5.483925507906514e-06, "loss": 0.2003, "step": 638 }, { "epoch": 2.852678571428571, "grad_norm": 1.5221431255340576, "learning_rate": 5.4721701900471335e-06, "loss": 0.1898, "step": 639 }, { "epoch": 2.857142857142857, "grad_norm": 1.5042264461517334, "learning_rate": 5.460412240497993e-06, "loss": 0.1964, "step": 640 }, { "epoch": 2.861607142857143, "grad_norm": 1.5381810665130615, "learning_rate": 5.448651724862716e-06, "loss": 0.185, "step": 641 }, { "epoch": 2.866071428571429, "grad_norm": 1.4257649183273315, "learning_rate": 5.436888708759253e-06, "loss": 0.189, "step": 642 }, { "epoch": 2.8705357142857144, "grad_norm": 1.5205702781677246, "learning_rate": 5.425123257819494e-06, "loss": 0.2022, "step": 643 }, { "epoch": 2.875, "grad_norm": 1.4278076887130737, "learning_rate": 5.413355437688926e-06, "loss": 0.1724, "step": 644 }, { "epoch": 2.8794642857142856, "grad_norm": 1.529142141342163, "learning_rate": 5.401585314026248e-06, "loss": 0.1946, "step": 645 }, { "epoch": 2.883928571428571, "grad_norm": 1.5609850883483887, "learning_rate": 5.3898129525030105e-06, "loss": 0.2121, "step": 646 }, { "epoch": 2.888392857142857, "grad_norm": 1.690317153930664, "learning_rate": 5.378038418803256e-06, "loss": 0.2023, "step": 647 }, { "epoch": 2.892857142857143, "grad_norm": 1.5702646970748901, "learning_rate": 5.366261778623143e-06, "loss": 0.2036, "step": 648 }, { "epoch": 2.897321428571429, "grad_norm": 1.6084643602371216, "learning_rate": 5.354483097670584e-06, "loss": 0.2009, "step": 649 }, { "epoch": 2.9017857142857144, "grad_norm": 1.7028254270553589, "learning_rate": 5.342702441664875e-06, "loss": 0.2093, "step": 650 }, { "epoch": 2.90625, "grad_norm": 1.5291582345962524, "learning_rate": 5.3309198763363345e-06, "loss": 0.2031, "step": 651 }, { "epoch": 2.9107142857142856, "grad_norm": 1.5348578691482544, "learning_rate": 5.319135467425937e-06, "loss": 0.199, "step": 652 }, { "epoch": 2.915178571428571, "grad_norm": 1.3724780082702637, "learning_rate": 5.3073492806849405e-06, "loss": 0.1765, "step": 653 }, { "epoch": 2.919642857142857, "grad_norm": 1.4268510341644287, "learning_rate": 5.295561381874518e-06, "loss": 0.17, "step": 654 }, { "epoch": 2.924107142857143, "grad_norm": 1.462509036064148, "learning_rate": 5.2837718367654036e-06, "loss": 0.1968, "step": 655 }, { "epoch": 2.928571428571429, "grad_norm": 1.5478267669677734, "learning_rate": 5.2719807111375096e-06, "loss": 0.2011, "step": 656 }, { "epoch": 2.9330357142857144, "grad_norm": 1.6338170766830444, "learning_rate": 5.260188070779573e-06, "loss": 0.1974, "step": 657 }, { "epoch": 2.9375, "grad_norm": 1.4555277824401855, "learning_rate": 5.248393981488777e-06, "loss": 0.1859, "step": 658 }, { "epoch": 2.9419642857142856, "grad_norm": 1.3970043659210205, "learning_rate": 5.236598509070389e-06, "loss": 0.1768, "step": 659 }, { "epoch": 2.946428571428571, "grad_norm": 1.533681869506836, "learning_rate": 5.2248017193374e-06, "loss": 0.2143, "step": 660 }, { "epoch": 2.950892857142857, "grad_norm": 1.6132566928863525, "learning_rate": 5.2130036781101455e-06, "loss": 0.1895, "step": 661 }, { "epoch": 2.955357142857143, "grad_norm": 1.5554327964782715, "learning_rate": 5.201204451215945e-06, "loss": 0.1883, "step": 662 }, { "epoch": 2.959821428571429, "grad_norm": 1.3858691453933716, "learning_rate": 5.18940410448873e-06, "loss": 0.1775, "step": 663 }, { "epoch": 2.9642857142857144, "grad_norm": 1.49596107006073, "learning_rate": 5.1776027037686895e-06, "loss": 0.1754, "step": 664 }, { "epoch": 2.96875, "grad_norm": 1.5804927349090576, "learning_rate": 5.165800314901883e-06, "loss": 0.1922, "step": 665 }, { "epoch": 2.9732142857142856, "grad_norm": 1.5328700542449951, "learning_rate": 5.15399700373989e-06, "loss": 0.1994, "step": 666 }, { "epoch": 2.977678571428571, "grad_norm": 1.593167781829834, "learning_rate": 5.142192836139432e-06, "loss": 0.1943, "step": 667 }, { "epoch": 2.982142857142857, "grad_norm": 1.4839582443237305, "learning_rate": 5.130387877962012e-06, "loss": 0.1697, "step": 668 }, { "epoch": 2.986607142857143, "grad_norm": 1.5925782918930054, "learning_rate": 5.118582195073542e-06, "loss": 0.1986, "step": 669 }, { "epoch": 2.991071428571429, "grad_norm": 1.4743068218231201, "learning_rate": 5.1067758533439804e-06, "loss": 0.1959, "step": 670 }, { "epoch": 2.9955357142857144, "grad_norm": 1.3711611032485962, "learning_rate": 5.094968918646954e-06, "loss": 0.1737, "step": 671 }, { "epoch": 3.0, "grad_norm": 1.3573774099349976, "learning_rate": 5.0831614568594105e-06, "loss": 0.1787, "step": 672 }, { "epoch": 3.0044642857142856, "grad_norm": 1.1417394876480103, "learning_rate": 5.071353533861225e-06, "loss": 0.114, "step": 673 }, { "epoch": 3.0089285714285716, "grad_norm": 1.105907678604126, "learning_rate": 5.059545215534859e-06, "loss": 0.1086, "step": 674 }, { "epoch": 3.013392857142857, "grad_norm": 1.189517617225647, "learning_rate": 5.047736567764967e-06, "loss": 0.1138, "step": 675 }, { "epoch": 3.017857142857143, "grad_norm": 1.0811058282852173, "learning_rate": 5.0359276564380514e-06, "loss": 0.1067, "step": 676 }, { "epoch": 3.0223214285714284, "grad_norm": 1.2044225931167603, "learning_rate": 5.024118547442083e-06, "loss": 0.0996, "step": 677 }, { "epoch": 3.0267857142857144, "grad_norm": 1.1959599256515503, "learning_rate": 5.012309306666129e-06, "loss": 0.0986, "step": 678 }, { "epoch": 3.03125, "grad_norm": 1.1417187452316284, "learning_rate": 5.000500000000001e-06, "loss": 0.0816, "step": 679 }, { "epoch": 3.0357142857142856, "grad_norm": 1.2518037557601929, "learning_rate": 4.988690693333873e-06, "loss": 0.0855, "step": 680 }, { "epoch": 3.0401785714285716, "grad_norm": 1.4060600996017456, "learning_rate": 4.97688145255792e-06, "loss": 0.0997, "step": 681 }, { "epoch": 3.044642857142857, "grad_norm": 1.3264249563217163, "learning_rate": 4.965072343561948e-06, "loss": 0.0907, "step": 682 }, { "epoch": 3.049107142857143, "grad_norm": 1.5176544189453125, "learning_rate": 4.953263432235034e-06, "loss": 0.0882, "step": 683 }, { "epoch": 3.0535714285714284, "grad_norm": 1.6541138887405396, "learning_rate": 4.941454784465144e-06, "loss": 0.0876, "step": 684 }, { "epoch": 3.0580357142857144, "grad_norm": 1.5412651300430298, "learning_rate": 4.929646466138777e-06, "loss": 0.0923, "step": 685 }, { "epoch": 3.0625, "grad_norm": 1.4274966716766357, "learning_rate": 4.917838543140591e-06, "loss": 0.0732, "step": 686 }, { "epoch": 3.0669642857142856, "grad_norm": 1.5422570705413818, "learning_rate": 4.906031081353047e-06, "loss": 0.0882, "step": 687 }, { "epoch": 3.0714285714285716, "grad_norm": 1.7544273138046265, "learning_rate": 4.8942241466560226e-06, "loss": 0.0925, "step": 688 }, { "epoch": 3.075892857142857, "grad_norm": 1.6329741477966309, "learning_rate": 4.882417804926457e-06, "loss": 0.0908, "step": 689 }, { "epoch": 3.080357142857143, "grad_norm": 1.8030346632003784, "learning_rate": 4.870612122037989e-06, "loss": 0.1051, "step": 690 }, { "epoch": 3.0848214285714284, "grad_norm": 1.8434675931930542, "learning_rate": 4.858807163860569e-06, "loss": 0.1039, "step": 691 }, { "epoch": 3.0892857142857144, "grad_norm": 1.7246928215026855, "learning_rate": 4.847002996260113e-06, "loss": 0.0913, "step": 692 }, { "epoch": 3.09375, "grad_norm": 1.6196149587631226, "learning_rate": 4.835199685098117e-06, "loss": 0.1038, "step": 693 }, { "epoch": 3.0982142857142856, "grad_norm": 1.6847678422927856, "learning_rate": 4.823397296231313e-06, "loss": 0.0855, "step": 694 }, { "epoch": 3.1026785714285716, "grad_norm": 1.7434442043304443, "learning_rate": 4.8115958955112715e-06, "loss": 0.094, "step": 695 }, { "epoch": 3.107142857142857, "grad_norm": 1.6175901889801025, "learning_rate": 4.799795548784058e-06, "loss": 0.0908, "step": 696 }, { "epoch": 3.111607142857143, "grad_norm": 1.6020538806915283, "learning_rate": 4.787996321889856e-06, "loss": 0.0943, "step": 697 }, { "epoch": 3.1160714285714284, "grad_norm": 1.4542748928070068, "learning_rate": 4.7761982806626015e-06, "loss": 0.098, "step": 698 }, { "epoch": 3.1205357142857144, "grad_norm": 1.4568634033203125, "learning_rate": 4.764401490929613e-06, "loss": 0.0884, "step": 699 }, { "epoch": 3.125, "grad_norm": 1.4934730529785156, "learning_rate": 4.752606018511225e-06, "loss": 0.0941, "step": 700 }, { "epoch": 3.1294642857142856, "grad_norm": 1.4770476818084717, "learning_rate": 4.740811929220429e-06, "loss": 0.1118, "step": 701 }, { "epoch": 3.1339285714285716, "grad_norm": 1.416353464126587, "learning_rate": 4.729019288862492e-06, "loss": 0.0996, "step": 702 }, { "epoch": 3.138392857142857, "grad_norm": 1.4401273727416992, "learning_rate": 4.717228163234599e-06, "loss": 0.092, "step": 703 }, { "epoch": 3.142857142857143, "grad_norm": 1.1891121864318848, "learning_rate": 4.705438618125482e-06, "loss": 0.0741, "step": 704 }, { "epoch": 3.1473214285714284, "grad_norm": 1.386759877204895, "learning_rate": 4.693650719315062e-06, "loss": 0.0998, "step": 705 }, { "epoch": 3.1517857142857144, "grad_norm": 1.3926823139190674, "learning_rate": 4.681864532574064e-06, "loss": 0.1015, "step": 706 }, { "epoch": 3.15625, "grad_norm": 1.3910696506500244, "learning_rate": 4.670080123663668e-06, "loss": 0.1002, "step": 707 }, { "epoch": 3.1607142857142856, "grad_norm": 1.2177248001098633, "learning_rate": 4.658297558335127e-06, "loss": 0.0903, "step": 708 }, { "epoch": 3.1651785714285716, "grad_norm": 1.434577465057373, "learning_rate": 4.64651690232942e-06, "loss": 0.0956, "step": 709 }, { "epoch": 3.169642857142857, "grad_norm": 1.2175168991088867, "learning_rate": 4.634738221376858e-06, "loss": 0.0805, "step": 710 }, { "epoch": 3.174107142857143, "grad_norm": 1.2530947923660278, "learning_rate": 4.622961581196743e-06, "loss": 0.0804, "step": 711 }, { "epoch": 3.1785714285714284, "grad_norm": 1.273105502128601, "learning_rate": 4.611187047496989e-06, "loss": 0.0912, "step": 712 }, { "epoch": 3.1830357142857144, "grad_norm": 1.1457451581954956, "learning_rate": 4.599414685973754e-06, "loss": 0.0794, "step": 713 }, { "epoch": 3.1875, "grad_norm": 1.20325767993927, "learning_rate": 4.587644562311077e-06, "loss": 0.0809, "step": 714 }, { "epoch": 3.1919642857142856, "grad_norm": 1.5332928895950317, "learning_rate": 4.575876742180506e-06, "loss": 0.0959, "step": 715 }, { "epoch": 3.1964285714285716, "grad_norm": 1.1909514665603638, "learning_rate": 4.56411129124075e-06, "loss": 0.0843, "step": 716 }, { "epoch": 3.200892857142857, "grad_norm": 1.472267508506775, "learning_rate": 4.552348275137285e-06, "loss": 0.1156, "step": 717 }, { "epoch": 3.205357142857143, "grad_norm": 1.532912254333496, "learning_rate": 4.54058775950201e-06, "loss": 0.0935, "step": 718 }, { "epoch": 3.2098214285714284, "grad_norm": 1.4942517280578613, "learning_rate": 4.528829809952867e-06, "loss": 0.0945, "step": 719 }, { "epoch": 3.2142857142857144, "grad_norm": 1.5307639837265015, "learning_rate": 4.517074492093487e-06, "loss": 0.1106, "step": 720 }, { "epoch": 3.21875, "grad_norm": 1.477367639541626, "learning_rate": 4.505321871512813e-06, "loss": 0.1024, "step": 721 }, { "epoch": 3.2232142857142856, "grad_norm": 1.6967626810073853, "learning_rate": 4.493572013784737e-06, "loss": 0.1039, "step": 722 }, { "epoch": 3.2276785714285716, "grad_norm": 1.4869396686553955, "learning_rate": 4.481824984467742e-06, "loss": 0.0929, "step": 723 }, { "epoch": 3.232142857142857, "grad_norm": 1.466221570968628, "learning_rate": 4.470080849104521e-06, "loss": 0.1008, "step": 724 }, { "epoch": 3.236607142857143, "grad_norm": 1.7747437953948975, "learning_rate": 4.458339673221631e-06, "loss": 0.1099, "step": 725 }, { "epoch": 3.2410714285714284, "grad_norm": 1.5948566198349, "learning_rate": 4.446601522329105e-06, "loss": 0.0876, "step": 726 }, { "epoch": 3.2455357142857144, "grad_norm": 1.3861619234085083, "learning_rate": 4.434866461920108e-06, "loss": 0.093, "step": 727 }, { "epoch": 3.25, "grad_norm": 1.7178670167922974, "learning_rate": 4.4231345574705555e-06, "loss": 0.1074, "step": 728 }, { "epoch": 3.2544642857142856, "grad_norm": 1.6227957010269165, "learning_rate": 4.4114058744387535e-06, "loss": 0.0984, "step": 729 }, { "epoch": 3.2589285714285716, "grad_norm": 1.5968222618103027, "learning_rate": 4.399680478265042e-06, "loss": 0.1015, "step": 730 }, { "epoch": 3.263392857142857, "grad_norm": 1.6373909711837769, "learning_rate": 4.387958434371413e-06, "loss": 0.099, "step": 731 }, { "epoch": 3.267857142857143, "grad_norm": 1.3308444023132324, "learning_rate": 4.376239808161157e-06, "loss": 0.0819, "step": 732 }, { "epoch": 3.2723214285714284, "grad_norm": 1.4825462102890015, "learning_rate": 4.364524665018496e-06, "loss": 0.0872, "step": 733 }, { "epoch": 3.2767857142857144, "grad_norm": 1.5847879648208618, "learning_rate": 4.3528130703082245e-06, "loss": 0.0873, "step": 734 }, { "epoch": 3.28125, "grad_norm": 1.3668209314346313, "learning_rate": 4.341105089375328e-06, "loss": 0.0843, "step": 735 }, { "epoch": 3.2857142857142856, "grad_norm": 1.4361255168914795, "learning_rate": 4.329400787544633e-06, "loss": 0.093, "step": 736 }, { "epoch": 3.2901785714285716, "grad_norm": 1.2126491069793701, "learning_rate": 4.317700230120438e-06, "loss": 0.0825, "step": 737 }, { "epoch": 3.294642857142857, "grad_norm": 1.4545817375183105, "learning_rate": 4.306003482386156e-06, "loss": 0.1061, "step": 738 }, { "epoch": 3.299107142857143, "grad_norm": 1.3296757936477661, "learning_rate": 4.2943106096039315e-06, "loss": 0.0888, "step": 739 }, { "epoch": 3.3035714285714284, "grad_norm": 1.4436862468719482, "learning_rate": 4.282621677014299e-06, "loss": 0.0955, "step": 740 }, { "epoch": 3.3080357142857144, "grad_norm": 1.313293218612671, "learning_rate": 4.270936749835805e-06, "loss": 0.0979, "step": 741 }, { "epoch": 3.3125, "grad_norm": 1.2750566005706787, "learning_rate": 4.259255893264647e-06, "loss": 0.0907, "step": 742 }, { "epoch": 3.3169642857142856, "grad_norm": 1.6833667755126953, "learning_rate": 4.247579172474312e-06, "loss": 0.1025, "step": 743 }, { "epoch": 3.3214285714285716, "grad_norm": 1.4311575889587402, "learning_rate": 4.235906652615207e-06, "loss": 0.1006, "step": 744 }, { "epoch": 3.325892857142857, "grad_norm": 1.5903573036193848, "learning_rate": 4.224238398814309e-06, "loss": 0.0953, "step": 745 }, { "epoch": 3.330357142857143, "grad_norm": 1.3978747129440308, "learning_rate": 4.212574476174784e-06, "loss": 0.095, "step": 746 }, { "epoch": 3.3348214285714284, "grad_norm": 1.4329924583435059, "learning_rate": 4.2009149497756355e-06, "loss": 0.1007, "step": 747 }, { "epoch": 3.3392857142857144, "grad_norm": 1.2830747365951538, "learning_rate": 4.189259884671336e-06, "loss": 0.0862, "step": 748 }, { "epoch": 3.34375, "grad_norm": 1.3609569072723389, "learning_rate": 4.177609345891472e-06, "loss": 0.0875, "step": 749 }, { "epoch": 3.3482142857142856, "grad_norm": 1.3331035375595093, "learning_rate": 4.165963398440368e-06, "loss": 0.1016, "step": 750 }, { "epoch": 3.3526785714285716, "grad_norm": 1.4151555299758911, "learning_rate": 4.1543221072967334e-06, "loss": 0.0975, "step": 751 }, { "epoch": 3.357142857142857, "grad_norm": 1.490128755569458, "learning_rate": 4.142685537413303e-06, "loss": 0.0915, "step": 752 }, { "epoch": 3.361607142857143, "grad_norm": 1.3559775352478027, "learning_rate": 4.1310537537164615e-06, "loss": 0.0979, "step": 753 }, { "epoch": 3.3660714285714284, "grad_norm": 1.4377248287200928, "learning_rate": 4.119426821105895e-06, "loss": 0.0935, "step": 754 }, { "epoch": 3.3705357142857144, "grad_norm": 1.4320459365844727, "learning_rate": 4.107804804454215e-06, "loss": 0.0976, "step": 755 }, { "epoch": 3.375, "grad_norm": 1.473273754119873, "learning_rate": 4.096187768606617e-06, "loss": 0.0949, "step": 756 }, { "epoch": 3.3794642857142856, "grad_norm": 1.240604043006897, "learning_rate": 4.084575778380495e-06, "loss": 0.077, "step": 757 }, { "epoch": 3.3839285714285716, "grad_norm": 1.6010382175445557, "learning_rate": 4.072968898565094e-06, "loss": 0.0987, "step": 758 }, { "epoch": 3.388392857142857, "grad_norm": 1.5959616899490356, "learning_rate": 4.061367193921145e-06, "loss": 0.1125, "step": 759 }, { "epoch": 3.392857142857143, "grad_norm": 1.5466870069503784, "learning_rate": 4.049770729180508e-06, "loss": 0.0985, "step": 760 }, { "epoch": 3.3973214285714284, "grad_norm": 1.485848307609558, "learning_rate": 4.038179569045803e-06, "loss": 0.0942, "step": 761 }, { "epoch": 3.4017857142857144, "grad_norm": 1.4044053554534912, "learning_rate": 4.026593778190052e-06, "loss": 0.096, "step": 762 }, { "epoch": 3.40625, "grad_norm": 1.2948501110076904, "learning_rate": 4.015013421256324e-06, "loss": 0.0852, "step": 763 }, { "epoch": 3.4107142857142856, "grad_norm": 1.2491077184677124, "learning_rate": 4.0034385628573655e-06, "loss": 0.0814, "step": 764 }, { "epoch": 3.4151785714285716, "grad_norm": 1.4824570417404175, "learning_rate": 3.991869267575243e-06, "loss": 0.0848, "step": 765 }, { "epoch": 3.419642857142857, "grad_norm": 1.6389641761779785, "learning_rate": 3.9803055999609855e-06, "loss": 0.0929, "step": 766 }, { "epoch": 3.424107142857143, "grad_norm": 1.5796787738800049, "learning_rate": 3.9687476245342234e-06, "loss": 0.1006, "step": 767 }, { "epoch": 3.4285714285714284, "grad_norm": 1.4374067783355713, "learning_rate": 3.957195405782824e-06, "loss": 0.0938, "step": 768 }, { "epoch": 3.4330357142857144, "grad_norm": 1.5482842922210693, "learning_rate": 3.9456490081625396e-06, "loss": 0.1003, "step": 769 }, { "epoch": 3.4375, "grad_norm": 1.366696834564209, "learning_rate": 3.934108496096638e-06, "loss": 0.0896, "step": 770 }, { "epoch": 3.4419642857142856, "grad_norm": 1.4534419775009155, "learning_rate": 3.922573933975555e-06, "loss": 0.0954, "step": 771 }, { "epoch": 3.4464285714285716, "grad_norm": 1.3738148212432861, "learning_rate": 3.911045386156523e-06, "loss": 0.0986, "step": 772 }, { "epoch": 3.450892857142857, "grad_norm": 1.4549438953399658, "learning_rate": 3.899522916963219e-06, "loss": 0.0937, "step": 773 }, { "epoch": 3.455357142857143, "grad_norm": 1.5725458860397339, "learning_rate": 3.888006590685407e-06, "loss": 0.0884, "step": 774 }, { "epoch": 3.4598214285714284, "grad_norm": 1.409229040145874, "learning_rate": 3.876496471578572e-06, "loss": 0.0985, "step": 775 }, { "epoch": 3.4642857142857144, "grad_norm": 1.4481651782989502, "learning_rate": 3.864992623863572e-06, "loss": 0.0993, "step": 776 }, { "epoch": 3.46875, "grad_norm": 1.3321104049682617, "learning_rate": 3.853495111726265e-06, "loss": 0.0938, "step": 777 }, { "epoch": 3.4732142857142856, "grad_norm": 1.501019835472107, "learning_rate": 3.84200399931717e-06, "loss": 0.0955, "step": 778 }, { "epoch": 3.4776785714285716, "grad_norm": 1.3454251289367676, "learning_rate": 3.8305193507510905e-06, "loss": 0.0955, "step": 779 }, { "epoch": 3.482142857142857, "grad_norm": 1.5015486478805542, "learning_rate": 3.819041230106768e-06, "loss": 0.1041, "step": 780 }, { "epoch": 3.486607142857143, "grad_norm": 1.1266005039215088, "learning_rate": 3.807569701426519e-06, "loss": 0.0815, "step": 781 }, { "epoch": 3.4910714285714284, "grad_norm": 1.322966456413269, "learning_rate": 3.7961048287158865e-06, "loss": 0.088, "step": 782 }, { "epoch": 3.4955357142857144, "grad_norm": 1.31028151512146, "learning_rate": 3.784646675943269e-06, "loss": 0.0905, "step": 783 }, { "epoch": 3.5, "grad_norm": 1.4570220708847046, "learning_rate": 3.773195307039574e-06, "loss": 0.0966, "step": 784 }, { "epoch": 3.5044642857142856, "grad_norm": 1.4365588426589966, "learning_rate": 3.7617507858978615e-06, "loss": 0.0924, "step": 785 }, { "epoch": 3.508928571428571, "grad_norm": 1.2722663879394531, "learning_rate": 3.7503131763729785e-06, "loss": 0.0861, "step": 786 }, { "epoch": 3.513392857142857, "grad_norm": 1.4443727731704712, "learning_rate": 3.738882542281212e-06, "loss": 0.0943, "step": 787 }, { "epoch": 3.517857142857143, "grad_norm": 1.4455920457839966, "learning_rate": 3.727458947399927e-06, "loss": 0.0924, "step": 788 }, { "epoch": 3.522321428571429, "grad_norm": 1.2739661931991577, "learning_rate": 3.7160424554672187e-06, "loss": 0.0938, "step": 789 }, { "epoch": 3.5267857142857144, "grad_norm": 1.3610420227050781, "learning_rate": 3.7046331301815435e-06, "loss": 0.0928, "step": 790 }, { "epoch": 3.53125, "grad_norm": 1.360996127128601, "learning_rate": 3.6932310352013796e-06, "loss": 0.0848, "step": 791 }, { "epoch": 3.5357142857142856, "grad_norm": 1.3822091817855835, "learning_rate": 3.6818362341448545e-06, "loss": 0.0937, "step": 792 }, { "epoch": 3.540178571428571, "grad_norm": 1.5511473417282104, "learning_rate": 3.670448790589408e-06, "loss": 0.0964, "step": 793 }, { "epoch": 3.544642857142857, "grad_norm": 1.322980284690857, "learning_rate": 3.659068768071425e-06, "loss": 0.0828, "step": 794 }, { "epoch": 3.549107142857143, "grad_norm": 1.4569833278656006, "learning_rate": 3.6476962300858793e-06, "loss": 0.0982, "step": 795 }, { "epoch": 3.553571428571429, "grad_norm": 1.2515519857406616, "learning_rate": 3.6363312400859963e-06, "loss": 0.0874, "step": 796 }, { "epoch": 3.5580357142857144, "grad_norm": 1.5618796348571777, "learning_rate": 3.6249738614828765e-06, "loss": 0.0951, "step": 797 }, { "epoch": 3.5625, "grad_norm": 1.530705213546753, "learning_rate": 3.613624157645159e-06, "loss": 0.0983, "step": 798 }, { "epoch": 3.5669642857142856, "grad_norm": 1.6114871501922607, "learning_rate": 3.6022821918986563e-06, "loss": 0.0946, "step": 799 }, { "epoch": 3.571428571428571, "grad_norm": 1.43599271774292, "learning_rate": 3.590948027526012e-06, "loss": 0.0989, "step": 800 }, { "epoch": 3.571428571428571, "eval_loss": 0.843691885471344, "eval_runtime": 4.4852, "eval_samples_per_second": 13.377, "eval_steps_per_second": 0.892, "step": 800 }, { "epoch": 3.575892857142857, "grad_norm": 1.2312313318252563, "learning_rate": 3.579621727766339e-06, "loss": 0.079, "step": 801 }, { "epoch": 3.580357142857143, "grad_norm": 1.3309111595153809, "learning_rate": 3.568303355814867e-06, "loss": 0.0926, "step": 802 }, { "epoch": 3.584821428571429, "grad_norm": 1.293211817741394, "learning_rate": 3.5569929748225945e-06, "loss": 0.0879, "step": 803 }, { "epoch": 3.5892857142857144, "grad_norm": 1.532378911972046, "learning_rate": 3.5456906478959367e-06, "loss": 0.1051, "step": 804 }, { "epoch": 3.59375, "grad_norm": 1.4125474691390991, "learning_rate": 3.534396438096369e-06, "loss": 0.0953, "step": 805 }, { "epoch": 3.5982142857142856, "grad_norm": 1.3551851511001587, "learning_rate": 3.5231104084400745e-06, "loss": 0.0889, "step": 806 }, { "epoch": 3.602678571428571, "grad_norm": 1.4057469367980957, "learning_rate": 3.5118326218976013e-06, "loss": 0.0955, "step": 807 }, { "epoch": 3.607142857142857, "grad_norm": 1.5146610736846924, "learning_rate": 3.5005631413935006e-06, "loss": 0.0932, "step": 808 }, { "epoch": 3.611607142857143, "grad_norm": 1.3674921989440918, "learning_rate": 3.4893020298059784e-06, "loss": 0.0831, "step": 809 }, { "epoch": 3.616071428571429, "grad_norm": 1.6528156995773315, "learning_rate": 3.4780493499665478e-06, "loss": 0.103, "step": 810 }, { "epoch": 3.6205357142857144, "grad_norm": 1.5097229480743408, "learning_rate": 3.4668051646596825e-06, "loss": 0.1007, "step": 811 }, { "epoch": 3.625, "grad_norm": 1.6443347930908203, "learning_rate": 3.455569536622451e-06, "loss": 0.0968, "step": 812 }, { "epoch": 3.6294642857142856, "grad_norm": 1.417252779006958, "learning_rate": 3.4443425285441847e-06, "loss": 0.0944, "step": 813 }, { "epoch": 3.633928571428571, "grad_norm": 1.4208124876022339, "learning_rate": 3.433124203066113e-06, "loss": 0.0938, "step": 814 }, { "epoch": 3.638392857142857, "grad_norm": 1.4724245071411133, "learning_rate": 3.421914622781028e-06, "loss": 0.1021, "step": 815 }, { "epoch": 3.642857142857143, "grad_norm": 1.5548927783966064, "learning_rate": 3.4107138502329225e-06, "loss": 0.1037, "step": 816 }, { "epoch": 3.647321428571429, "grad_norm": 1.5089542865753174, "learning_rate": 3.399521947916646e-06, "loss": 0.1108, "step": 817 }, { "epoch": 3.6517857142857144, "grad_norm": 1.24465012550354, "learning_rate": 3.3883389782775604e-06, "loss": 0.0848, "step": 818 }, { "epoch": 3.65625, "grad_norm": 1.3363871574401855, "learning_rate": 3.377165003711185e-06, "loss": 0.0811, "step": 819 }, { "epoch": 3.6607142857142856, "grad_norm": 1.3324394226074219, "learning_rate": 3.3660000865628523e-06, "loss": 0.0767, "step": 820 }, { "epoch": 3.665178571428571, "grad_norm": 1.3909485340118408, "learning_rate": 3.3548442891273553e-06, "loss": 0.0924, "step": 821 }, { "epoch": 3.669642857142857, "grad_norm": 1.3440436124801636, "learning_rate": 3.343697673648611e-06, "loss": 0.0909, "step": 822 }, { "epoch": 3.674107142857143, "grad_norm": 1.4336508512496948, "learning_rate": 3.332560302319297e-06, "loss": 0.1151, "step": 823 }, { "epoch": 3.678571428571429, "grad_norm": 1.3243669271469116, "learning_rate": 3.321432237280518e-06, "loss": 0.0892, "step": 824 }, { "epoch": 3.6830357142857144, "grad_norm": 1.288888692855835, "learning_rate": 3.3103135406214506e-06, "loss": 0.0858, "step": 825 }, { "epoch": 3.6875, "grad_norm": 1.386723279953003, "learning_rate": 3.2992042743790055e-06, "loss": 0.0881, "step": 826 }, { "epoch": 3.6919642857142856, "grad_norm": 1.3573461771011353, "learning_rate": 3.2881045005374747e-06, "loss": 0.087, "step": 827 }, { "epoch": 3.696428571428571, "grad_norm": 1.379010558128357, "learning_rate": 3.277014281028181e-06, "loss": 0.0835, "step": 828 }, { "epoch": 3.700892857142857, "grad_norm": 1.4972076416015625, "learning_rate": 3.2659336777291497e-06, "loss": 0.1027, "step": 829 }, { "epoch": 3.705357142857143, "grad_norm": 1.3756992816925049, "learning_rate": 3.254862752464743e-06, "loss": 0.0912, "step": 830 }, { "epoch": 3.709821428571429, "grad_norm": 1.473894476890564, "learning_rate": 3.243801567005329e-06, "loss": 0.1016, "step": 831 }, { "epoch": 3.7142857142857144, "grad_norm": 1.4770474433898926, "learning_rate": 3.232750183066928e-06, "loss": 0.0974, "step": 832 }, { "epoch": 3.71875, "grad_norm": 1.5533777475357056, "learning_rate": 3.2217086623108796e-06, "loss": 0.0994, "step": 833 }, { "epoch": 3.7232142857142856, "grad_norm": 1.3398988246917725, "learning_rate": 3.2106770663434867e-06, "loss": 0.0838, "step": 834 }, { "epoch": 3.727678571428571, "grad_norm": 1.4815514087677002, "learning_rate": 3.1996554567156774e-06, "loss": 0.0977, "step": 835 }, { "epoch": 3.732142857142857, "grad_norm": 1.1235636472702026, "learning_rate": 3.18864389492266e-06, "loss": 0.0828, "step": 836 }, { "epoch": 3.736607142857143, "grad_norm": 1.524712085723877, "learning_rate": 3.1776424424035857e-06, "loss": 0.1056, "step": 837 }, { "epoch": 3.741071428571429, "grad_norm": 1.3527683019638062, "learning_rate": 3.1666511605411947e-06, "loss": 0.0901, "step": 838 }, { "epoch": 3.7455357142857144, "grad_norm": 1.340248703956604, "learning_rate": 3.155670110661482e-06, "loss": 0.0885, "step": 839 }, { "epoch": 3.75, "grad_norm": 1.3833794593811035, "learning_rate": 3.144699354033356e-06, "loss": 0.086, "step": 840 }, { "epoch": 3.7544642857142856, "grad_norm": 1.4830819368362427, "learning_rate": 3.1337389518682894e-06, "loss": 0.0985, "step": 841 }, { "epoch": 3.758928571428571, "grad_norm": 1.5664781332015991, "learning_rate": 3.122788965319985e-06, "loss": 0.0789, "step": 842 }, { "epoch": 3.763392857142857, "grad_norm": 1.232924222946167, "learning_rate": 3.1118494554840284e-06, "loss": 0.0832, "step": 843 }, { "epoch": 3.767857142857143, "grad_norm": 1.2607110738754272, "learning_rate": 3.100920483397556e-06, "loss": 0.0773, "step": 844 }, { "epoch": 3.772321428571429, "grad_norm": 1.5223504304885864, "learning_rate": 3.090002110038903e-06, "loss": 0.0961, "step": 845 }, { "epoch": 3.7767857142857144, "grad_norm": 1.514897108078003, "learning_rate": 3.0790943963272697e-06, "loss": 0.0978, "step": 846 }, { "epoch": 3.78125, "grad_norm": 1.4400715827941895, "learning_rate": 3.0681974031223854e-06, "loss": 0.0971, "step": 847 }, { "epoch": 3.7857142857142856, "grad_norm": 1.479879379272461, "learning_rate": 3.0573111912241575e-06, "loss": 0.0862, "step": 848 }, { "epoch": 3.790178571428571, "grad_norm": 1.3477786779403687, "learning_rate": 3.0464358213723455e-06, "loss": 0.0891, "step": 849 }, { "epoch": 3.794642857142857, "grad_norm": 1.3734490871429443, "learning_rate": 3.0355713542462086e-06, "loss": 0.0995, "step": 850 }, { "epoch": 3.799107142857143, "grad_norm": 1.3643180131912231, "learning_rate": 3.024717850464181e-06, "loss": 0.0954, "step": 851 }, { "epoch": 3.803571428571429, "grad_norm": 1.619181752204895, "learning_rate": 3.0138753705835234e-06, "loss": 0.1008, "step": 852 }, { "epoch": 3.8080357142857144, "grad_norm": 1.4963918924331665, "learning_rate": 3.003043975099988e-06, "loss": 0.091, "step": 853 }, { "epoch": 3.8125, "grad_norm": 1.2831984758377075, "learning_rate": 2.99222372444748e-06, "loss": 0.0788, "step": 854 }, { "epoch": 3.8169642857142856, "grad_norm": 1.3784717321395874, "learning_rate": 2.9814146789977278e-06, "loss": 0.09, "step": 855 }, { "epoch": 3.821428571428571, "grad_norm": 1.6730880737304688, "learning_rate": 2.970616899059934e-06, "loss": 0.0965, "step": 856 }, { "epoch": 3.825892857142857, "grad_norm": 1.5248148441314697, "learning_rate": 2.959830444880447e-06, "loss": 0.0909, "step": 857 }, { "epoch": 3.830357142857143, "grad_norm": 1.5161914825439453, "learning_rate": 2.949055376642428e-06, "loss": 0.0933, "step": 858 }, { "epoch": 3.834821428571429, "grad_norm": 1.4900877475738525, "learning_rate": 2.9382917544655025e-06, "loss": 0.0886, "step": 859 }, { "epoch": 3.8392857142857144, "grad_norm": 1.3055517673492432, "learning_rate": 2.9275396384054373e-06, "loss": 0.083, "step": 860 }, { "epoch": 3.84375, "grad_norm": 1.5641956329345703, "learning_rate": 2.9167990884537943e-06, "loss": 0.097, "step": 861 }, { "epoch": 3.8482142857142856, "grad_norm": 1.521967887878418, "learning_rate": 2.906070164537616e-06, "loss": 0.1076, "step": 862 }, { "epoch": 3.852678571428571, "grad_norm": 1.445181131362915, "learning_rate": 2.8953529265190618e-06, "loss": 0.0904, "step": 863 }, { "epoch": 3.857142857142857, "grad_norm": 1.4985555410385132, "learning_rate": 2.884647434195099e-06, "loss": 0.0992, "step": 864 }, { "epoch": 3.861607142857143, "grad_norm": 1.4231829643249512, "learning_rate": 2.873953747297153e-06, "loss": 0.0799, "step": 865 }, { "epoch": 3.866071428571429, "grad_norm": 1.472952961921692, "learning_rate": 2.863271925490791e-06, "loss": 0.1004, "step": 866 }, { "epoch": 3.8705357142857144, "grad_norm": 1.5234057903289795, "learning_rate": 2.8526020283753658e-06, "loss": 0.0958, "step": 867 }, { "epoch": 3.875, "grad_norm": 1.4517230987548828, "learning_rate": 2.841944115483703e-06, "loss": 0.0918, "step": 868 }, { "epoch": 3.8794642857142856, "grad_norm": 1.4124717712402344, "learning_rate": 2.8312982462817686e-06, "loss": 0.0946, "step": 869 }, { "epoch": 3.883928571428571, "grad_norm": 1.6698354482650757, "learning_rate": 2.820664480168317e-06, "loss": 0.1049, "step": 870 }, { "epoch": 3.888392857142857, "grad_norm": 1.4935716390609741, "learning_rate": 2.810042876474584e-06, "loss": 0.1022, "step": 871 }, { "epoch": 3.892857142857143, "grad_norm": 1.5294708013534546, "learning_rate": 2.799433494463935e-06, "loss": 0.1049, "step": 872 }, { "epoch": 3.897321428571429, "grad_norm": 1.5460635423660278, "learning_rate": 2.7888363933315593e-06, "loss": 0.1035, "step": 873 }, { "epoch": 3.9017857142857144, "grad_norm": 1.3909894227981567, "learning_rate": 2.7782516322041087e-06, "loss": 0.084, "step": 874 }, { "epoch": 3.90625, "grad_norm": 1.3862639665603638, "learning_rate": 2.767679270139394e-06, "loss": 0.0936, "step": 875 }, { "epoch": 3.9107142857142856, "grad_norm": 1.602386713027954, "learning_rate": 2.7571193661260427e-06, "loss": 0.1008, "step": 876 }, { "epoch": 3.915178571428571, "grad_norm": 1.4371018409729004, "learning_rate": 2.746571979083172e-06, "loss": 0.0975, "step": 877 }, { "epoch": 3.919642857142857, "grad_norm": 1.3875517845153809, "learning_rate": 2.736037167860061e-06, "loss": 0.0999, "step": 878 }, { "epoch": 3.924107142857143, "grad_norm": 1.4412161111831665, "learning_rate": 2.725514991235818e-06, "loss": 0.099, "step": 879 }, { "epoch": 3.928571428571429, "grad_norm": 1.3014588356018066, "learning_rate": 2.7150055079190663e-06, "loss": 0.0833, "step": 880 }, { "epoch": 3.9330357142857144, "grad_norm": 1.4020464420318604, "learning_rate": 2.704508776547595e-06, "loss": 0.093, "step": 881 }, { "epoch": 3.9375, "grad_norm": 1.3537229299545288, "learning_rate": 2.6940248556880512e-06, "loss": 0.0858, "step": 882 }, { "epoch": 3.9419642857142856, "grad_norm": 1.2978018522262573, "learning_rate": 2.6835538038356017e-06, "loss": 0.0779, "step": 883 }, { "epoch": 3.946428571428571, "grad_norm": 1.2745367288589478, "learning_rate": 2.6730956794136138e-06, "loss": 0.0867, "step": 884 }, { "epoch": 3.950892857142857, "grad_norm": 1.3864595890045166, "learning_rate": 2.6626505407733255e-06, "loss": 0.0926, "step": 885 }, { "epoch": 3.955357142857143, "grad_norm": 1.4206799268722534, "learning_rate": 2.6522184461935153e-06, "loss": 0.0893, "step": 886 }, { "epoch": 3.959821428571429, "grad_norm": 1.637922763824463, "learning_rate": 2.6417994538801882e-06, "loss": 0.1107, "step": 887 }, { "epoch": 3.9642857142857144, "grad_norm": 1.5130659341812134, "learning_rate": 2.6313936219662435e-06, "loss": 0.0949, "step": 888 }, { "epoch": 3.96875, "grad_norm": 1.4699238538742065, "learning_rate": 2.6210010085111507e-06, "loss": 0.0965, "step": 889 }, { "epoch": 3.9732142857142856, "grad_norm": 1.3603074550628662, "learning_rate": 2.6106216715006282e-06, "loss": 0.0914, "step": 890 }, { "epoch": 3.977678571428571, "grad_norm": 1.3753093481063843, "learning_rate": 2.600255668846316e-06, "loss": 0.0817, "step": 891 }, { "epoch": 3.982142857142857, "grad_norm": 1.5180922746658325, "learning_rate": 2.58990305838546e-06, "loss": 0.1041, "step": 892 }, { "epoch": 3.986607142857143, "grad_norm": 1.5158816576004028, "learning_rate": 2.5795638978805755e-06, "loss": 0.0873, "step": 893 }, { "epoch": 3.991071428571429, "grad_norm": 1.4908344745635986, "learning_rate": 2.5692382450191404e-06, "loss": 0.103, "step": 894 }, { "epoch": 3.9955357142857144, "grad_norm": 1.3467048406600952, "learning_rate": 2.558926157413266e-06, "loss": 0.1036, "step": 895 }, { "epoch": 4.0, "grad_norm": 1.4331597089767456, "learning_rate": 2.5486276925993746e-06, "loss": 0.0932, "step": 896 }, { "epoch": 4.004464285714286, "grad_norm": 0.806198239326477, "learning_rate": 2.5383429080378807e-06, "loss": 0.0455, "step": 897 }, { "epoch": 4.008928571428571, "grad_norm": 0.8375071287155151, "learning_rate": 2.5280718611128657e-06, "loss": 0.0485, "step": 898 }, { "epoch": 4.013392857142857, "grad_norm": 0.7727735042572021, "learning_rate": 2.517814609131772e-06, "loss": 0.0439, "step": 899 }, { "epoch": 4.017857142857143, "grad_norm": 0.8304307460784912, "learning_rate": 2.507571209325058e-06, "loss": 0.0482, "step": 900 }, { "epoch": 4.022321428571429, "grad_norm": 0.9408035278320312, "learning_rate": 2.4973417188459074e-06, "loss": 0.048, "step": 901 }, { "epoch": 4.026785714285714, "grad_norm": 0.8631353378295898, "learning_rate": 2.4871261947698892e-06, "loss": 0.0499, "step": 902 }, { "epoch": 4.03125, "grad_norm": 1.2712801694869995, "learning_rate": 2.4769246940946487e-06, "loss": 0.0539, "step": 903 }, { "epoch": 4.035714285714286, "grad_norm": 0.8662086129188538, "learning_rate": 2.4667372737395894e-06, "loss": 0.046, "step": 904 }, { "epoch": 4.040178571428571, "grad_norm": 0.874553918838501, "learning_rate": 2.4565639905455455e-06, "loss": 0.0431, "step": 905 }, { "epoch": 4.044642857142857, "grad_norm": 0.9109020829200745, "learning_rate": 2.446404901274486e-06, "loss": 0.0451, "step": 906 }, { "epoch": 4.049107142857143, "grad_norm": 0.915429413318634, "learning_rate": 2.436260062609173e-06, "loss": 0.0453, "step": 907 }, { "epoch": 4.053571428571429, "grad_norm": 0.9552186131477356, "learning_rate": 2.4261295311528632e-06, "loss": 0.0376, "step": 908 }, { "epoch": 4.058035714285714, "grad_norm": 1.0016251802444458, "learning_rate": 2.4160133634289804e-06, "loss": 0.0448, "step": 909 }, { "epoch": 4.0625, "grad_norm": 1.0590428113937378, "learning_rate": 2.4059116158808147e-06, "loss": 0.0487, "step": 910 }, { "epoch": 4.066964285714286, "grad_norm": 1.1106352806091309, "learning_rate": 2.395824344871193e-06, "loss": 0.0439, "step": 911 }, { "epoch": 4.071428571428571, "grad_norm": 1.1301071643829346, "learning_rate": 2.385751606682167e-06, "loss": 0.047, "step": 912 }, { "epoch": 4.075892857142857, "grad_norm": 1.0470985174179077, "learning_rate": 2.3756934575147117e-06, "loss": 0.04, "step": 913 }, { "epoch": 4.080357142857143, "grad_norm": 1.318620204925537, "learning_rate": 2.365649953488393e-06, "loss": 0.0402, "step": 914 }, { "epoch": 4.084821428571429, "grad_norm": 1.3197895288467407, "learning_rate": 2.3556211506410708e-06, "loss": 0.0381, "step": 915 }, { "epoch": 4.089285714285714, "grad_norm": 1.20299232006073, "learning_rate": 2.3456071049285717e-06, "loss": 0.0475, "step": 916 }, { "epoch": 4.09375, "grad_norm": 1.209742784500122, "learning_rate": 2.3356078722243963e-06, "loss": 0.043, "step": 917 }, { "epoch": 4.098214285714286, "grad_norm": 1.1857240200042725, "learning_rate": 2.325623508319385e-06, "loss": 0.0479, "step": 918 }, { "epoch": 4.102678571428571, "grad_norm": 0.9122520089149475, "learning_rate": 2.3156540689214227e-06, "loss": 0.0338, "step": 919 }, { "epoch": 4.107142857142857, "grad_norm": 1.1389027833938599, "learning_rate": 2.3056996096551228e-06, "loss": 0.0403, "step": 920 }, { "epoch": 4.111607142857143, "grad_norm": 1.3362747430801392, "learning_rate": 2.2957601860615152e-06, "loss": 0.0476, "step": 921 }, { "epoch": 4.116071428571429, "grad_norm": 1.198205828666687, "learning_rate": 2.285835853597742e-06, "loss": 0.0427, "step": 922 }, { "epoch": 4.120535714285714, "grad_norm": 1.4842236042022705, "learning_rate": 2.2759266676367345e-06, "loss": 0.0544, "step": 923 }, { "epoch": 4.125, "grad_norm": 1.2623423337936401, "learning_rate": 2.266032683466928e-06, "loss": 0.04, "step": 924 }, { "epoch": 4.129464285714286, "grad_norm": 1.2799335718154907, "learning_rate": 2.2561539562919265e-06, "loss": 0.0392, "step": 925 }, { "epoch": 4.133928571428571, "grad_norm": 1.0505305528640747, "learning_rate": 2.246290541230214e-06, "loss": 0.044, "step": 926 }, { "epoch": 4.138392857142857, "grad_norm": 1.0607540607452393, "learning_rate": 2.236442493314839e-06, "loss": 0.0408, "step": 927 }, { "epoch": 4.142857142857143, "grad_norm": 1.1474037170410156, "learning_rate": 2.2266098674931094e-06, "loss": 0.0391, "step": 928 }, { "epoch": 4.147321428571429, "grad_norm": 1.168620228767395, "learning_rate": 2.216792718626286e-06, "loss": 0.0479, "step": 929 }, { "epoch": 4.151785714285714, "grad_norm": 1.028488278388977, "learning_rate": 2.2069911014892712e-06, "loss": 0.0389, "step": 930 }, { "epoch": 4.15625, "grad_norm": 1.0126864910125732, "learning_rate": 2.197205070770313e-06, "loss": 0.0322, "step": 931 }, { "epoch": 4.160714285714286, "grad_norm": 1.1278817653656006, "learning_rate": 2.1874346810706925e-06, "loss": 0.0411, "step": 932 }, { "epoch": 4.165178571428571, "grad_norm": 0.9957524538040161, "learning_rate": 2.177679986904422e-06, "loss": 0.0381, "step": 933 }, { "epoch": 4.169642857142857, "grad_norm": 1.0937297344207764, "learning_rate": 2.1679410426979412e-06, "loss": 0.0517, "step": 934 }, { "epoch": 4.174107142857143, "grad_norm": 1.3177183866500854, "learning_rate": 2.1582179027898102e-06, "loss": 0.0497, "step": 935 }, { "epoch": 4.178571428571429, "grad_norm": 1.0554062128067017, "learning_rate": 2.148510621430414e-06, "loss": 0.0414, "step": 936 }, { "epoch": 4.183035714285714, "grad_norm": 0.9176511764526367, "learning_rate": 2.1388192527816472e-06, "loss": 0.0335, "step": 937 }, { "epoch": 4.1875, "grad_norm": 1.0542479753494263, "learning_rate": 2.1291438509166236e-06, "loss": 0.0473, "step": 938 }, { "epoch": 4.191964285714286, "grad_norm": 1.0238994359970093, "learning_rate": 2.119484469819371e-06, "loss": 0.0466, "step": 939 }, { "epoch": 4.196428571428571, "grad_norm": 1.262686014175415, "learning_rate": 2.109841163384528e-06, "loss": 0.0456, "step": 940 }, { "epoch": 4.200892857142857, "grad_norm": 1.0453062057495117, "learning_rate": 2.100213985417045e-06, "loss": 0.04, "step": 941 }, { "epoch": 4.205357142857143, "grad_norm": 1.3999793529510498, "learning_rate": 2.090602989631878e-06, "loss": 0.0408, "step": 942 }, { "epoch": 4.209821428571429, "grad_norm": 1.0657180547714233, "learning_rate": 2.081008229653706e-06, "loss": 0.0416, "step": 943 }, { "epoch": 4.214285714285714, "grad_norm": 1.0057283639907837, "learning_rate": 2.071429759016607e-06, "loss": 0.0419, "step": 944 }, { "epoch": 4.21875, "grad_norm": 1.26461923122406, "learning_rate": 2.061867631163781e-06, "loss": 0.0497, "step": 945 }, { "epoch": 4.223214285714286, "grad_norm": 0.9643348455429077, "learning_rate": 2.0523218994472408e-06, "loss": 0.0354, "step": 946 }, { "epoch": 4.227678571428571, "grad_norm": 1.0940420627593994, "learning_rate": 2.0427926171275157e-06, "loss": 0.0415, "step": 947 }, { "epoch": 4.232142857142857, "grad_norm": 1.087691068649292, "learning_rate": 2.033279837373359e-06, "loss": 0.0457, "step": 948 }, { "epoch": 4.236607142857143, "grad_norm": 0.987881064414978, "learning_rate": 2.023783613261439e-06, "loss": 0.038, "step": 949 }, { "epoch": 4.241071428571429, "grad_norm": 1.2403086423873901, "learning_rate": 2.0143039977760663e-06, "loss": 0.0516, "step": 950 }, { "epoch": 4.245535714285714, "grad_norm": 1.2339435815811157, "learning_rate": 2.0048410438088675e-06, "loss": 0.0547, "step": 951 }, { "epoch": 4.25, "grad_norm": 1.1114577054977417, "learning_rate": 1.995394804158516e-06, "loss": 0.0481, "step": 952 }, { "epoch": 4.254464285714286, "grad_norm": 0.9100790619850159, "learning_rate": 1.9859653315304254e-06, "loss": 0.0408, "step": 953 }, { "epoch": 4.258928571428571, "grad_norm": 1.1362229585647583, "learning_rate": 1.976552678536456e-06, "loss": 0.0463, "step": 954 }, { "epoch": 4.263392857142857, "grad_norm": 1.3644853830337524, "learning_rate": 1.9671568976946257e-06, "loss": 0.0584, "step": 955 }, { "epoch": 4.267857142857143, "grad_norm": 1.1988905668258667, "learning_rate": 1.9577780414288066e-06, "loss": 0.0482, "step": 956 }, { "epoch": 4.272321428571429, "grad_norm": 1.1114509105682373, "learning_rate": 1.9484161620684524e-06, "loss": 0.0503, "step": 957 }, { "epoch": 4.276785714285714, "grad_norm": 1.1508238315582275, "learning_rate": 1.939071311848282e-06, "loss": 0.0467, "step": 958 }, { "epoch": 4.28125, "grad_norm": 1.444758415222168, "learning_rate": 1.9297435429080076e-06, "loss": 0.0572, "step": 959 }, { "epoch": 4.285714285714286, "grad_norm": 1.0676677227020264, "learning_rate": 1.9204329072920285e-06, "loss": 0.0368, "step": 960 }, { "epoch": 4.290178571428571, "grad_norm": 1.0957095623016357, "learning_rate": 1.911139456949158e-06, "loss": 0.0472, "step": 961 }, { "epoch": 4.294642857142857, "grad_norm": 1.1863818168640137, "learning_rate": 1.9018632437323198e-06, "loss": 0.0518, "step": 962 }, { "epoch": 4.299107142857143, "grad_norm": 1.166928768157959, "learning_rate": 1.892604319398259e-06, "loss": 0.051, "step": 963 }, { "epoch": 4.303571428571429, "grad_norm": 1.0898560285568237, "learning_rate": 1.883362735607262e-06, "loss": 0.0412, "step": 964 }, { "epoch": 4.308035714285714, "grad_norm": 1.1429767608642578, "learning_rate": 1.8741385439228616e-06, "loss": 0.0495, "step": 965 }, { "epoch": 4.3125, "grad_norm": 1.1606385707855225, "learning_rate": 1.8649317958115534e-06, "loss": 0.0481, "step": 966 }, { "epoch": 4.316964285714286, "grad_norm": 1.2927366495132446, "learning_rate": 1.8557425426424989e-06, "loss": 0.0535, "step": 967 }, { "epoch": 4.321428571428571, "grad_norm": 1.053138017654419, "learning_rate": 1.8465708356872592e-06, "loss": 0.0379, "step": 968 }, { "epoch": 4.325892857142857, "grad_norm": 1.1609596014022827, "learning_rate": 1.8374167261194826e-06, "loss": 0.0478, "step": 969 }, { "epoch": 4.330357142857143, "grad_norm": 1.1894792318344116, "learning_rate": 1.8282802650146408e-06, "loss": 0.0437, "step": 970 }, { "epoch": 4.334821428571429, "grad_norm": 1.1033045053482056, "learning_rate": 1.8191615033497345e-06, "loss": 0.0417, "step": 971 }, { "epoch": 4.339285714285714, "grad_norm": 1.0112273693084717, "learning_rate": 1.810060492003008e-06, "loss": 0.0431, "step": 972 }, { "epoch": 4.34375, "grad_norm": 1.02801513671875, "learning_rate": 1.800977281753671e-06, "loss": 0.0439, "step": 973 }, { "epoch": 4.348214285714286, "grad_norm": 1.085830569267273, "learning_rate": 1.7919119232816049e-06, "loss": 0.0426, "step": 974 }, { "epoch": 4.352678571428571, "grad_norm": 1.0054593086242676, "learning_rate": 1.7828644671670943e-06, "loss": 0.0403, "step": 975 }, { "epoch": 4.357142857142857, "grad_norm": 1.0667251348495483, "learning_rate": 1.773834963890534e-06, "loss": 0.0441, "step": 976 }, { "epoch": 4.361607142857143, "grad_norm": 1.149455189704895, "learning_rate": 1.764823463832151e-06, "loss": 0.041, "step": 977 }, { "epoch": 4.366071428571429, "grad_norm": 1.0217986106872559, "learning_rate": 1.7558300172717234e-06, "loss": 0.0354, "step": 978 }, { "epoch": 4.370535714285714, "grad_norm": 1.0308144092559814, "learning_rate": 1.7468546743882997e-06, "loss": 0.0432, "step": 979 }, { "epoch": 4.375, "grad_norm": 1.0886852741241455, "learning_rate": 1.7378974852599203e-06, "loss": 0.0379, "step": 980 }, { "epoch": 4.379464285714286, "grad_norm": 1.1773141622543335, "learning_rate": 1.7289584998633307e-06, "loss": 0.0476, "step": 981 }, { "epoch": 4.383928571428571, "grad_norm": 1.2766141891479492, "learning_rate": 1.7200377680737148e-06, "loss": 0.0503, "step": 982 }, { "epoch": 4.388392857142857, "grad_norm": 1.2046881914138794, "learning_rate": 1.7111353396644071e-06, "loss": 0.0448, "step": 983 }, { "epoch": 4.392857142857143, "grad_norm": 1.4284559488296509, "learning_rate": 1.7022512643066196e-06, "loss": 0.0487, "step": 984 }, { "epoch": 4.397321428571429, "grad_norm": 1.4075138568878174, "learning_rate": 1.6933855915691622e-06, "loss": 0.0494, "step": 985 }, { "epoch": 4.401785714285714, "grad_norm": 1.065433144569397, "learning_rate": 1.6845383709181676e-06, "loss": 0.0383, "step": 986 }, { "epoch": 4.40625, "grad_norm": 1.1404274702072144, "learning_rate": 1.675709651716817e-06, "loss": 0.0416, "step": 987 }, { "epoch": 4.410714285714286, "grad_norm": 1.093176245689392, "learning_rate": 1.6668994832250556e-06, "loss": 0.0437, "step": 988 }, { "epoch": 4.415178571428571, "grad_norm": 1.237988829612732, "learning_rate": 1.6581079145993323e-06, "loss": 0.0475, "step": 989 }, { "epoch": 4.419642857142857, "grad_norm": 1.1673163175582886, "learning_rate": 1.649334994892314e-06, "loss": 0.0512, "step": 990 }, { "epoch": 4.424107142857143, "grad_norm": 1.1633106470108032, "learning_rate": 1.640580773052618e-06, "loss": 0.0458, "step": 991 }, { "epoch": 4.428571428571429, "grad_norm": 1.2017102241516113, "learning_rate": 1.6318452979245355e-06, "loss": 0.048, "step": 992 }, { "epoch": 4.433035714285714, "grad_norm": 1.0068191289901733, "learning_rate": 1.6231286182477555e-06, "loss": 0.0446, "step": 993 }, { "epoch": 4.4375, "grad_norm": 1.1095032691955566, "learning_rate": 1.6144307826571085e-06, "loss": 0.0489, "step": 994 }, { "epoch": 4.441964285714286, "grad_norm": 1.0608917474746704, "learning_rate": 1.6057518396822724e-06, "loss": 0.0508, "step": 995 }, { "epoch": 4.446428571428571, "grad_norm": 1.0646238327026367, "learning_rate": 1.5970918377475208e-06, "loss": 0.0497, "step": 996 }, { "epoch": 4.450892857142857, "grad_norm": 1.0634431838989258, "learning_rate": 1.5884508251714436e-06, "loss": 0.0402, "step": 997 }, { "epoch": 4.455357142857143, "grad_norm": 0.9448213577270508, "learning_rate": 1.5798288501666793e-06, "loss": 0.0413, "step": 998 }, { "epoch": 4.459821428571429, "grad_norm": 1.16658353805542, "learning_rate": 1.5712259608396462e-06, "loss": 0.0425, "step": 999 }, { "epoch": 4.464285714285714, "grad_norm": 1.0001455545425415, "learning_rate": 1.5626422051902709e-06, "loss": 0.0403, "step": 1000 }, { "epoch": 4.464285714285714, "eval_loss": 1.0235848426818848, "eval_runtime": 4.4483, "eval_samples_per_second": 13.488, "eval_steps_per_second": 0.899, "step": 1000 }, { "epoch": 4.46875, "grad_norm": 1.3267871141433716, "learning_rate": 1.5540776311117304e-06, "loss": 0.0441, "step": 1001 }, { "epoch": 4.473214285714286, "grad_norm": 1.2346901893615723, "learning_rate": 1.5455322863901704e-06, "loss": 0.0457, "step": 1002 }, { "epoch": 4.477678571428571, "grad_norm": 1.1277903318405151, "learning_rate": 1.5370062187044502e-06, "loss": 0.0493, "step": 1003 }, { "epoch": 4.482142857142857, "grad_norm": 1.0025169849395752, "learning_rate": 1.5284994756258718e-06, "loss": 0.0436, "step": 1004 }, { "epoch": 4.486607142857143, "grad_norm": 1.0304396152496338, "learning_rate": 1.5200121046179151e-06, "loss": 0.0436, "step": 1005 }, { "epoch": 4.491071428571429, "grad_norm": 0.9576307535171509, "learning_rate": 1.511544153035975e-06, "loss": 0.0403, "step": 1006 }, { "epoch": 4.495535714285714, "grad_norm": 0.9636532068252563, "learning_rate": 1.5030956681270903e-06, "loss": 0.039, "step": 1007 }, { "epoch": 4.5, "grad_norm": 1.277954339981079, "learning_rate": 1.4946666970296932e-06, "loss": 0.0375, "step": 1008 }, { "epoch": 4.504464285714286, "grad_norm": 0.9996533393859863, "learning_rate": 1.486257286773331e-06, "loss": 0.0448, "step": 1009 }, { "epoch": 4.508928571428571, "grad_norm": 1.0826925039291382, "learning_rate": 1.4778674842784168e-06, "loss": 0.0477, "step": 1010 }, { "epoch": 4.513392857142857, "grad_norm": 1.0207204818725586, "learning_rate": 1.4694973363559539e-06, "loss": 0.0358, "step": 1011 }, { "epoch": 4.517857142857143, "grad_norm": 0.9995800256729126, "learning_rate": 1.4611468897072933e-06, "loss": 0.0394, "step": 1012 }, { "epoch": 4.522321428571429, "grad_norm": 1.090086579322815, "learning_rate": 1.4528161909238569e-06, "loss": 0.0356, "step": 1013 }, { "epoch": 4.526785714285714, "grad_norm": 1.2063642740249634, "learning_rate": 1.44450528648688e-06, "loss": 0.0521, "step": 1014 }, { "epoch": 4.53125, "grad_norm": 1.2835434675216675, "learning_rate": 1.4362142227671607e-06, "loss": 0.0491, "step": 1015 }, { "epoch": 4.535714285714286, "grad_norm": 1.219610571861267, "learning_rate": 1.427943046024793e-06, "loss": 0.0428, "step": 1016 }, { "epoch": 4.540178571428571, "grad_norm": 1.305721402168274, "learning_rate": 1.4196918024089133e-06, "loss": 0.0505, "step": 1017 }, { "epoch": 4.544642857142857, "grad_norm": 1.181258201599121, "learning_rate": 1.4114605379574342e-06, "loss": 0.0449, "step": 1018 }, { "epoch": 4.549107142857143, "grad_norm": 1.1678040027618408, "learning_rate": 1.4032492985968057e-06, "loss": 0.0507, "step": 1019 }, { "epoch": 4.553571428571429, "grad_norm": 1.034266710281372, "learning_rate": 1.3950581301417365e-06, "loss": 0.0413, "step": 1020 }, { "epoch": 4.558035714285714, "grad_norm": 1.1394208669662476, "learning_rate": 1.3868870782949565e-06, "loss": 0.0484, "step": 1021 }, { "epoch": 4.5625, "grad_norm": 1.031049370765686, "learning_rate": 1.3787361886469509e-06, "loss": 0.0441, "step": 1022 }, { "epoch": 4.566964285714286, "grad_norm": 1.0913993120193481, "learning_rate": 1.3706055066757116e-06, "loss": 0.0497, "step": 1023 }, { "epoch": 4.571428571428571, "grad_norm": 1.1953216791152954, "learning_rate": 1.3624950777464828e-06, "loss": 0.0466, "step": 1024 }, { "epoch": 4.575892857142857, "grad_norm": 1.2053303718566895, "learning_rate": 1.3544049471115017e-06, "loss": 0.0413, "step": 1025 }, { "epoch": 4.580357142857143, "grad_norm": 1.085877537727356, "learning_rate": 1.3463351599097552e-06, "loss": 0.0429, "step": 1026 }, { "epoch": 4.584821428571429, "grad_norm": 1.0058659315109253, "learning_rate": 1.3382857611667233e-06, "loss": 0.0396, "step": 1027 }, { "epoch": 4.589285714285714, "grad_norm": 1.1939631700515747, "learning_rate": 1.3302567957941265e-06, "loss": 0.0467, "step": 1028 }, { "epoch": 4.59375, "grad_norm": 1.085060954093933, "learning_rate": 1.3222483085896786e-06, "loss": 0.0412, "step": 1029 }, { "epoch": 4.598214285714286, "grad_norm": 1.1042296886444092, "learning_rate": 1.3142603442368346e-06, "loss": 0.0484, "step": 1030 }, { "epoch": 4.602678571428571, "grad_norm": 1.2152049541473389, "learning_rate": 1.3062929473045442e-06, "loss": 0.0445, "step": 1031 }, { "epoch": 4.607142857142857, "grad_norm": 1.1404128074645996, "learning_rate": 1.2983461622469953e-06, "loss": 0.047, "step": 1032 }, { "epoch": 4.611607142857143, "grad_norm": 1.044474482536316, "learning_rate": 1.290420033403377e-06, "loss": 0.0417, "step": 1033 }, { "epoch": 4.616071428571429, "grad_norm": 1.0207549333572388, "learning_rate": 1.2825146049976244e-06, "loss": 0.0408, "step": 1034 }, { "epoch": 4.620535714285714, "grad_norm": 1.1905019283294678, "learning_rate": 1.2746299211381755e-06, "loss": 0.0459, "step": 1035 }, { "epoch": 4.625, "grad_norm": 0.9946685433387756, "learning_rate": 1.2667660258177241e-06, "loss": 0.0416, "step": 1036 }, { "epoch": 4.629464285714286, "grad_norm": 0.8535659909248352, "learning_rate": 1.258922962912969e-06, "loss": 0.0366, "step": 1037 }, { "epoch": 4.633928571428571, "grad_norm": 1.1927975416183472, "learning_rate": 1.2511007761843839e-06, "loss": 0.048, "step": 1038 }, { "epoch": 4.638392857142857, "grad_norm": 1.1664050817489624, "learning_rate": 1.2432995092759538e-06, "loss": 0.0462, "step": 1039 }, { "epoch": 4.642857142857143, "grad_norm": 1.0218449831008911, "learning_rate": 1.2355192057149477e-06, "loss": 0.0455, "step": 1040 }, { "epoch": 4.647321428571429, "grad_norm": 1.0675235986709595, "learning_rate": 1.2277599089116662e-06, "loss": 0.0428, "step": 1041 }, { "epoch": 4.651785714285714, "grad_norm": 1.136800765991211, "learning_rate": 1.220021662159204e-06, "loss": 0.0536, "step": 1042 }, { "epoch": 4.65625, "grad_norm": 0.9919513463973999, "learning_rate": 1.2123045086332076e-06, "loss": 0.0417, "step": 1043 }, { "epoch": 4.660714285714286, "grad_norm": 0.9957274198532104, "learning_rate": 1.204608491391628e-06, "loss": 0.038, "step": 1044 }, { "epoch": 4.665178571428571, "grad_norm": 1.0251184701919556, "learning_rate": 1.1969336533744965e-06, "loss": 0.04, "step": 1045 }, { "epoch": 4.669642857142857, "grad_norm": 1.3310158252716064, "learning_rate": 1.1892800374036632e-06, "loss": 0.0517, "step": 1046 }, { "epoch": 4.674107142857143, "grad_norm": 1.0729650259017944, "learning_rate": 1.181647686182576e-06, "loss": 0.0465, "step": 1047 }, { "epoch": 4.678571428571429, "grad_norm": 1.2038968801498413, "learning_rate": 1.174036642296036e-06, "loss": 0.041, "step": 1048 }, { "epoch": 4.683035714285714, "grad_norm": 1.0825115442276, "learning_rate": 1.166446948209957e-06, "loss": 0.0459, "step": 1049 }, { "epoch": 4.6875, "grad_norm": 1.114462971687317, "learning_rate": 1.1588786462711347e-06, "loss": 0.0471, "step": 1050 }, { "epoch": 4.691964285714286, "grad_norm": 0.9582335352897644, "learning_rate": 1.1513317787070002e-06, "loss": 0.0332, "step": 1051 }, { "epoch": 4.696428571428571, "grad_norm": 1.0500428676605225, "learning_rate": 1.1438063876254025e-06, "loss": 0.0417, "step": 1052 }, { "epoch": 4.700892857142857, "grad_norm": 1.2004417181015015, "learning_rate": 1.1363025150143508e-06, "loss": 0.0448, "step": 1053 }, { "epoch": 4.705357142857143, "grad_norm": 1.3093647956848145, "learning_rate": 1.1288202027417996e-06, "loss": 0.0477, "step": 1054 }, { "epoch": 4.709821428571429, "grad_norm": 1.1933882236480713, "learning_rate": 1.1213594925554027e-06, "loss": 0.0509, "step": 1055 }, { "epoch": 4.714285714285714, "grad_norm": 1.1147135496139526, "learning_rate": 1.1139204260822874e-06, "loss": 0.0512, "step": 1056 }, { "epoch": 4.71875, "grad_norm": 1.097452163696289, "learning_rate": 1.1065030448288196e-06, "loss": 0.047, "step": 1057 }, { "epoch": 4.723214285714286, "grad_norm": 1.1414300203323364, "learning_rate": 1.0991073901803692e-06, "loss": 0.049, "step": 1058 }, { "epoch": 4.727678571428571, "grad_norm": 1.1289511919021606, "learning_rate": 1.091733503401085e-06, "loss": 0.0458, "step": 1059 }, { "epoch": 4.732142857142857, "grad_norm": 1.0961730480194092, "learning_rate": 1.0843814256336622e-06, "loss": 0.0435, "step": 1060 }, { "epoch": 4.736607142857143, "grad_norm": 1.0257600545883179, "learning_rate": 1.0770511978991116e-06, "loss": 0.0424, "step": 1061 }, { "epoch": 4.741071428571429, "grad_norm": 1.095285415649414, "learning_rate": 1.0697428610965275e-06, "loss": 0.0457, "step": 1062 }, { "epoch": 4.745535714285714, "grad_norm": 1.035879135131836, "learning_rate": 1.0624564560028723e-06, "loss": 0.0435, "step": 1063 }, { "epoch": 4.75, "grad_norm": 1.1119691133499146, "learning_rate": 1.0551920232727309e-06, "loss": 0.0406, "step": 1064 }, { "epoch": 4.754464285714286, "grad_norm": 1.1761224269866943, "learning_rate": 1.0479496034380988e-06, "loss": 0.0529, "step": 1065 }, { "epoch": 4.758928571428571, "grad_norm": 1.2393746376037598, "learning_rate": 1.0407292369081479e-06, "loss": 0.049, "step": 1066 }, { "epoch": 4.763392857142857, "grad_norm": 1.089432716369629, "learning_rate": 1.0335309639690041e-06, "loss": 0.0469, "step": 1067 }, { "epoch": 4.767857142857143, "grad_norm": 1.0081895589828491, "learning_rate": 1.0263548247835246e-06, "loss": 0.0438, "step": 1068 }, { "epoch": 4.772321428571429, "grad_norm": 0.9703455567359924, "learning_rate": 1.0192008593910643e-06, "loss": 0.048, "step": 1069 }, { "epoch": 4.776785714285714, "grad_norm": 1.0442850589752197, "learning_rate": 1.0120691077072643e-06, "loss": 0.0432, "step": 1070 }, { "epoch": 4.78125, "grad_norm": 1.0315920114517212, "learning_rate": 1.004959609523824e-06, "loss": 0.0485, "step": 1071 }, { "epoch": 4.785714285714286, "grad_norm": 1.118602991104126, "learning_rate": 9.978724045082772e-07, "loss": 0.0393, "step": 1072 }, { "epoch": 4.790178571428571, "grad_norm": 0.991577684879303, "learning_rate": 9.908075322037738e-07, "loss": 0.0418, "step": 1073 }, { "epoch": 4.794642857142857, "grad_norm": 0.9192765951156616, "learning_rate": 9.83765032028858e-07, "loss": 0.0427, "step": 1074 }, { "epoch": 4.799107142857143, "grad_norm": 1.1283397674560547, "learning_rate": 9.767449432772485e-07, "loss": 0.0403, "step": 1075 }, { "epoch": 4.803571428571429, "grad_norm": 1.3338288068771362, "learning_rate": 9.697473051176173e-07, "loss": 0.0537, "step": 1076 }, { "epoch": 4.808035714285714, "grad_norm": 1.03440523147583, "learning_rate": 9.627721565933749e-07, "loss": 0.0419, "step": 1077 }, { "epoch": 4.8125, "grad_norm": 1.070723295211792, "learning_rate": 9.558195366224508e-07, "loss": 0.0444, "step": 1078 }, { "epoch": 4.816964285714286, "grad_norm": 1.0093774795532227, "learning_rate": 9.488894839970758e-07, "loss": 0.0408, "step": 1079 }, { "epoch": 4.821428571428571, "grad_norm": 1.1505217552185059, "learning_rate": 9.419820373835668e-07, "loss": 0.0429, "step": 1080 }, { "epoch": 4.825892857142857, "grad_norm": 1.3309438228607178, "learning_rate": 9.350972353221052e-07, "loss": 0.0432, "step": 1081 }, { "epoch": 4.830357142857143, "grad_norm": 1.3498495817184448, "learning_rate": 9.282351162265363e-07, "loss": 0.0491, "step": 1082 }, { "epoch": 4.834821428571429, "grad_norm": 1.1168887615203857, "learning_rate": 9.213957183841355e-07, "loss": 0.0436, "step": 1083 }, { "epoch": 4.839285714285714, "grad_norm": 1.1466056108474731, "learning_rate": 9.145790799554101e-07, "loss": 0.045, "step": 1084 }, { "epoch": 4.84375, "grad_norm": 0.865831196308136, "learning_rate": 9.077852389738817e-07, "loss": 0.0386, "step": 1085 }, { "epoch": 4.848214285714286, "grad_norm": 1.1055983304977417, "learning_rate": 9.010142333458698e-07, "loss": 0.038, "step": 1086 }, { "epoch": 4.852678571428571, "grad_norm": 1.069543480873108, "learning_rate": 8.942661008502875e-07, "loss": 0.0436, "step": 1087 }, { "epoch": 4.857142857142857, "grad_norm": 1.2057677507400513, "learning_rate": 8.87540879138421e-07, "loss": 0.0481, "step": 1088 }, { "epoch": 4.861607142857143, "grad_norm": 1.28882896900177, "learning_rate": 8.808386057337353e-07, "loss": 0.0495, "step": 1089 }, { "epoch": 4.866071428571429, "grad_norm": 1.4611623287200928, "learning_rate": 8.741593180316439e-07, "loss": 0.0496, "step": 1090 }, { "epoch": 4.870535714285714, "grad_norm": 1.2761002779006958, "learning_rate": 8.675030532993193e-07, "loss": 0.0443, "step": 1091 }, { "epoch": 4.875, "grad_norm": 0.9505082964897156, "learning_rate": 8.608698486754739e-07, "loss": 0.0426, "step": 1092 }, { "epoch": 4.879464285714286, "grad_norm": 1.2187812328338623, "learning_rate": 8.542597411701563e-07, "loss": 0.045, "step": 1093 }, { "epoch": 4.883928571428571, "grad_norm": 0.8786472082138062, "learning_rate": 8.476727676645453e-07, "loss": 0.0393, "step": 1094 }, { "epoch": 4.888392857142857, "grad_norm": 1.0366308689117432, "learning_rate": 8.411089649107396e-07, "loss": 0.0389, "step": 1095 }, { "epoch": 4.892857142857143, "grad_norm": 1.0801434516906738, "learning_rate": 8.345683695315633e-07, "loss": 0.0406, "step": 1096 }, { "epoch": 4.897321428571429, "grad_norm": 1.2877177000045776, "learning_rate": 8.280510180203476e-07, "loss": 0.0495, "step": 1097 }, { "epoch": 4.901785714285714, "grad_norm": 1.1729180812835693, "learning_rate": 8.215569467407386e-07, "loss": 0.0476, "step": 1098 }, { "epoch": 4.90625, "grad_norm": 1.261544942855835, "learning_rate": 8.150861919264887e-07, "loss": 0.0464, "step": 1099 }, { "epoch": 4.910714285714286, "grad_norm": 1.185400128364563, "learning_rate": 8.086387896812546e-07, "loss": 0.0468, "step": 1100 }, { "epoch": 4.915178571428571, "grad_norm": 1.1032450199127197, "learning_rate": 8.022147759784016e-07, "loss": 0.0435, "step": 1101 }, { "epoch": 4.919642857142857, "grad_norm": 1.142734408378601, "learning_rate": 7.958141866607897e-07, "loss": 0.0418, "step": 1102 }, { "epoch": 4.924107142857143, "grad_norm": 1.157031774520874, "learning_rate": 7.894370574405928e-07, "loss": 0.0509, "step": 1103 }, { "epoch": 4.928571428571429, "grad_norm": 1.0427000522613525, "learning_rate": 7.830834238990803e-07, "loss": 0.0427, "step": 1104 }, { "epoch": 4.933035714285714, "grad_norm": 1.059389591217041, "learning_rate": 7.767533214864331e-07, "loss": 0.0464, "step": 1105 }, { "epoch": 4.9375, "grad_norm": 0.9791935086250305, "learning_rate": 7.70446785521533e-07, "loss": 0.039, "step": 1106 }, { "epoch": 4.941964285714286, "grad_norm": 1.1133091449737549, "learning_rate": 7.641638511917806e-07, "loss": 0.043, "step": 1107 }, { "epoch": 4.946428571428571, "grad_norm": 1.1858481168746948, "learning_rate": 7.579045535528878e-07, "loss": 0.0409, "step": 1108 }, { "epoch": 4.950892857142857, "grad_norm": 0.9668167233467102, "learning_rate": 7.516689275286813e-07, "loss": 0.0392, "step": 1109 }, { "epoch": 4.955357142857143, "grad_norm": 1.0199722051620483, "learning_rate": 7.454570079109164e-07, "loss": 0.0414, "step": 1110 }, { "epoch": 4.959821428571429, "grad_norm": 0.9346474409103394, "learning_rate": 7.392688293590767e-07, "loss": 0.0379, "step": 1111 }, { "epoch": 4.964285714285714, "grad_norm": 1.0724751949310303, "learning_rate": 7.331044264001836e-07, "loss": 0.0368, "step": 1112 }, { "epoch": 4.96875, "grad_norm": 1.1779240369796753, "learning_rate": 7.269638334285973e-07, "loss": 0.0391, "step": 1113 }, { "epoch": 4.973214285714286, "grad_norm": 1.2583742141723633, "learning_rate": 7.208470847058387e-07, "loss": 0.0558, "step": 1114 }, { "epoch": 4.977678571428571, "grad_norm": 0.9401357769966125, "learning_rate": 7.147542143603806e-07, "loss": 0.037, "step": 1115 }, { "epoch": 4.982142857142857, "grad_norm": 1.1139758825302124, "learning_rate": 7.086852563874705e-07, "loss": 0.0452, "step": 1116 }, { "epoch": 4.986607142857143, "grad_norm": 1.3145822286605835, "learning_rate": 7.026402446489368e-07, "loss": 0.0417, "step": 1117 }, { "epoch": 4.991071428571429, "grad_norm": 1.0069618225097656, "learning_rate": 6.966192128729969e-07, "loss": 0.0385, "step": 1118 }, { "epoch": 4.995535714285714, "grad_norm": 0.9981904029846191, "learning_rate": 6.906221946540765e-07, "loss": 0.0386, "step": 1119 }, { "epoch": 5.0, "grad_norm": 1.0711932182312012, "learning_rate": 6.846492234526105e-07, "loss": 0.0469, "step": 1120 }, { "epoch": 5.004464285714286, "grad_norm": 0.7480913996696472, "learning_rate": 6.787003325948681e-07, "loss": 0.0244, "step": 1121 }, { "epoch": 5.008928571428571, "grad_norm": 0.4924456775188446, "learning_rate": 6.727755552727618e-07, "loss": 0.021, "step": 1122 }, { "epoch": 5.013392857142857, "grad_norm": 0.608626127243042, "learning_rate": 6.668749245436603e-07, "loss": 0.0251, "step": 1123 }, { "epoch": 5.017857142857143, "grad_norm": 0.5998596549034119, "learning_rate": 6.609984733302069e-07, "loss": 0.0279, "step": 1124 }, { "epoch": 5.022321428571429, "grad_norm": 0.7042911648750305, "learning_rate": 6.551462344201356e-07, "loss": 0.0283, "step": 1125 }, { "epoch": 5.026785714285714, "grad_norm": 0.6839467287063599, "learning_rate": 6.493182404660884e-07, "loss": 0.029, "step": 1126 }, { "epoch": 5.03125, "grad_norm": 0.60224449634552, "learning_rate": 6.435145239854279e-07, "loss": 0.0263, "step": 1127 }, { "epoch": 5.035714285714286, "grad_norm": 0.6765567064285278, "learning_rate": 6.377351173600649e-07, "loss": 0.0286, "step": 1128 }, { "epoch": 5.040178571428571, "grad_norm": 0.6019296646118164, "learning_rate": 6.319800528362713e-07, "loss": 0.0213, "step": 1129 }, { "epoch": 5.044642857142857, "grad_norm": 0.661238968372345, "learning_rate": 6.262493625245017e-07, "loss": 0.0224, "step": 1130 }, { "epoch": 5.049107142857143, "grad_norm": 0.8073251247406006, "learning_rate": 6.205430783992163e-07, "loss": 0.0335, "step": 1131 }, { "epoch": 5.053571428571429, "grad_norm": 0.5287904739379883, "learning_rate": 6.148612322986963e-07, "loss": 0.0215, "step": 1132 }, { "epoch": 5.058035714285714, "grad_norm": 0.7667572498321533, "learning_rate": 6.092038559248772e-07, "loss": 0.026, "step": 1133 }, { "epoch": 5.0625, "grad_norm": 0.6493704319000244, "learning_rate": 6.035709808431585e-07, "loss": 0.0211, "step": 1134 }, { "epoch": 5.066964285714286, "grad_norm": 0.6587660312652588, "learning_rate": 5.979626384822384e-07, "loss": 0.0259, "step": 1135 }, { "epoch": 5.071428571428571, "grad_norm": 0.5626607537269592, "learning_rate": 5.923788601339348e-07, "loss": 0.02, "step": 1136 }, { "epoch": 5.075892857142857, "grad_norm": 0.6060606837272644, "learning_rate": 5.868196769530085e-07, "loss": 0.021, "step": 1137 }, { "epoch": 5.080357142857143, "grad_norm": 0.6305381059646606, "learning_rate": 5.81285119956993e-07, "loss": 0.0206, "step": 1138 }, { "epoch": 5.084821428571429, "grad_norm": 0.6716199517250061, "learning_rate": 5.757752200260156e-07, "loss": 0.0247, "step": 1139 }, { "epoch": 5.089285714285714, "grad_norm": 0.5531538128852844, "learning_rate": 5.702900079026365e-07, "loss": 0.0214, "step": 1140 }, { "epoch": 5.09375, "grad_norm": 0.5417418479919434, "learning_rate": 5.648295141916629e-07, "loss": 0.0207, "step": 1141 }, { "epoch": 5.098214285714286, "grad_norm": 0.7253334522247314, "learning_rate": 5.593937693599892e-07, "loss": 0.0235, "step": 1142 }, { "epoch": 5.102678571428571, "grad_norm": 0.6683683395385742, "learning_rate": 5.539828037364222e-07, "loss": 0.0243, "step": 1143 }, { "epoch": 5.107142857142857, "grad_norm": 0.6907817721366882, "learning_rate": 5.485966475115118e-07, "loss": 0.0261, "step": 1144 }, { "epoch": 5.111607142857143, "grad_norm": 0.6566014289855957, "learning_rate": 5.432353307373865e-07, "loss": 0.025, "step": 1145 }, { "epoch": 5.116071428571429, "grad_norm": 0.8714102506637573, "learning_rate": 5.378988833275772e-07, "loss": 0.0321, "step": 1146 }, { "epoch": 5.120535714285714, "grad_norm": 0.7149608731269836, "learning_rate": 5.325873350568641e-07, "loss": 0.0248, "step": 1147 }, { "epoch": 5.125, "grad_norm": 0.7687610983848572, "learning_rate": 5.27300715561093e-07, "loss": 0.0284, "step": 1148 }, { "epoch": 5.129464285714286, "grad_norm": 0.7787485122680664, "learning_rate": 5.220390543370269e-07, "loss": 0.0267, "step": 1149 }, { "epoch": 5.133928571428571, "grad_norm": 0.6577015519142151, "learning_rate": 5.16802380742169e-07, "loss": 0.0231, "step": 1150 }, { "epoch": 5.138392857142857, "grad_norm": 0.6675891280174255, "learning_rate": 5.115907239946071e-07, "loss": 0.0213, "step": 1151 }, { "epoch": 5.142857142857143, "grad_norm": 0.7006237506866455, "learning_rate": 5.064041131728456e-07, "loss": 0.0224, "step": 1152 }, { "epoch": 5.147321428571429, "grad_norm": 0.7089455127716064, "learning_rate": 5.012425772156433e-07, "loss": 0.0247, "step": 1153 }, { "epoch": 5.151785714285714, "grad_norm": 0.7653855085372925, "learning_rate": 4.961061449218561e-07, "loss": 0.0211, "step": 1154 }, { "epoch": 5.15625, "grad_norm": 0.7013386487960815, "learning_rate": 4.90994844950272e-07, "loss": 0.0242, "step": 1155 }, { "epoch": 5.160714285714286, "grad_norm": 0.743937611579895, "learning_rate": 4.859087058194547e-07, "loss": 0.024, "step": 1156 }, { "epoch": 5.165178571428571, "grad_norm": 0.8444202542304993, "learning_rate": 4.808477559075791e-07, "loss": 0.0253, "step": 1157 }, { "epoch": 5.169642857142857, "grad_norm": 0.8176196217536926, "learning_rate": 4.758120234522819e-07, "loss": 0.0266, "step": 1158 }, { "epoch": 5.174107142857143, "grad_norm": 0.8155895471572876, "learning_rate": 4.708015365504947e-07, "loss": 0.0277, "step": 1159 }, { "epoch": 5.178571428571429, "grad_norm": 0.6969717741012573, "learning_rate": 4.658163231582916e-07, "loss": 0.0218, "step": 1160 }, { "epoch": 5.183035714285714, "grad_norm": 0.776943564414978, "learning_rate": 4.6085641109073313e-07, "loss": 0.0272, "step": 1161 }, { "epoch": 5.1875, "grad_norm": 0.8029648661613464, "learning_rate": 4.559218280217121e-07, "loss": 0.0265, "step": 1162 }, { "epoch": 5.191964285714286, "grad_norm": 0.7203864455223083, "learning_rate": 4.5101260148379735e-07, "loss": 0.0221, "step": 1163 }, { "epoch": 5.196428571428571, "grad_norm": 0.747048556804657, "learning_rate": 4.461287588680783e-07, "loss": 0.0301, "step": 1164 }, { "epoch": 5.200892857142857, "grad_norm": 0.7206503748893738, "learning_rate": 4.4127032742401697e-07, "loss": 0.0258, "step": 1165 }, { "epoch": 5.205357142857143, "grad_norm": 0.7037203311920166, "learning_rate": 4.364373342592935e-07, "loss": 0.0247, "step": 1166 }, { "epoch": 5.209821428571429, "grad_norm": 0.9221356511116028, "learning_rate": 4.316298063396534e-07, "loss": 0.0226, "step": 1167 }, { "epoch": 5.214285714285714, "grad_norm": 0.7772340774536133, "learning_rate": 4.2684777048875913e-07, "loss": 0.028, "step": 1168 }, { "epoch": 5.21875, "grad_norm": 0.9022529721260071, "learning_rate": 4.2209125338804007e-07, "loss": 0.0297, "step": 1169 }, { "epoch": 5.223214285714286, "grad_norm": 0.7532088756561279, "learning_rate": 4.173602815765447e-07, "loss": 0.025, "step": 1170 }, { "epoch": 5.227678571428571, "grad_norm": 0.7878860831260681, "learning_rate": 4.126548814507876e-07, "loss": 0.0214, "step": 1171 }, { "epoch": 5.232142857142857, "grad_norm": 0.7220774292945862, "learning_rate": 4.079750792646085e-07, "loss": 0.0228, "step": 1172 }, { "epoch": 5.236607142857143, "grad_norm": 0.8477165699005127, "learning_rate": 4.0332090112902294e-07, "loss": 0.0291, "step": 1173 }, { "epoch": 5.241071428571429, "grad_norm": 0.7177145481109619, "learning_rate": 3.98692373012076e-07, "loss": 0.0246, "step": 1174 }, { "epoch": 5.245535714285714, "grad_norm": 0.7874996662139893, "learning_rate": 3.940895207387007e-07, "loss": 0.0221, "step": 1175 }, { "epoch": 5.25, "grad_norm": 0.7333866357803345, "learning_rate": 3.89512369990565e-07, "loss": 0.0248, "step": 1176 }, { "epoch": 5.254464285714286, "grad_norm": 0.6481684446334839, "learning_rate": 3.849609463059437e-07, "loss": 0.0211, "step": 1177 }, { "epoch": 5.258928571428571, "grad_norm": 0.7252910733222961, "learning_rate": 3.8043527507955926e-07, "loss": 0.0242, "step": 1178 }, { "epoch": 5.263392857142857, "grad_norm": 1.1857779026031494, "learning_rate": 3.759353815624526e-07, "loss": 0.0276, "step": 1179 }, { "epoch": 5.267857142857143, "grad_norm": 0.8104342222213745, "learning_rate": 3.7146129086183547e-07, "loss": 0.0235, "step": 1180 }, { "epoch": 5.272321428571429, "grad_norm": 0.6407288908958435, "learning_rate": 3.6701302794095416e-07, "loss": 0.0202, "step": 1181 }, { "epoch": 5.276785714285714, "grad_norm": 0.8074800968170166, "learning_rate": 3.625906176189484e-07, "loss": 0.0269, "step": 1182 }, { "epoch": 5.28125, "grad_norm": 0.8074230551719666, "learning_rate": 3.5819408457070893e-07, "loss": 0.024, "step": 1183 }, { "epoch": 5.285714285714286, "grad_norm": 0.7819578051567078, "learning_rate": 3.5382345332675154e-07, "loss": 0.0246, "step": 1184 }, { "epoch": 5.290178571428571, "grad_norm": 0.7379055619239807, "learning_rate": 3.494787482730647e-07, "loss": 0.0213, "step": 1185 }, { "epoch": 5.294642857142857, "grad_norm": 0.8835303783416748, "learning_rate": 3.4515999365098667e-07, "loss": 0.0215, "step": 1186 }, { "epoch": 5.299107142857143, "grad_norm": 0.7571390867233276, "learning_rate": 3.4086721355706303e-07, "loss": 0.0243, "step": 1187 }, { "epoch": 5.303571428571429, "grad_norm": 0.9748966693878174, "learning_rate": 3.366004319429139e-07, "loss": 0.0285, "step": 1188 }, { "epoch": 5.308035714285714, "grad_norm": 0.8343793153762817, "learning_rate": 3.323596726151021e-07, "loss": 0.0257, "step": 1189 }, { "epoch": 5.3125, "grad_norm": 0.7045021057128906, "learning_rate": 3.2814495923499496e-07, "loss": 0.0185, "step": 1190 }, { "epoch": 5.316964285714286, "grad_norm": 0.788077712059021, "learning_rate": 3.239563153186429e-07, "loss": 0.0214, "step": 1191 }, { "epoch": 5.321428571428571, "grad_norm": 0.8326147198677063, "learning_rate": 3.197937642366339e-07, "loss": 0.0285, "step": 1192 }, { "epoch": 5.325892857142857, "grad_norm": 0.7847570180892944, "learning_rate": 3.1565732921397583e-07, "loss": 0.026, "step": 1193 }, { "epoch": 5.330357142857143, "grad_norm": 0.8788312673568726, "learning_rate": 3.115470333299593e-07, "loss": 0.0242, "step": 1194 }, { "epoch": 5.334821428571429, "grad_norm": 0.7230253219604492, "learning_rate": 3.0746289951803197e-07, "loss": 0.0227, "step": 1195 }, { "epoch": 5.339285714285714, "grad_norm": 0.8304932117462158, "learning_rate": 3.03404950565671e-07, "loss": 0.0264, "step": 1196 }, { "epoch": 5.34375, "grad_norm": 0.8365828990936279, "learning_rate": 2.9937320911425226e-07, "loss": 0.0245, "step": 1197 }, { "epoch": 5.348214285714286, "grad_norm": 0.7228520512580872, "learning_rate": 2.953676976589278e-07, "loss": 0.0201, "step": 1198 }, { "epoch": 5.352678571428571, "grad_norm": 0.6695645451545715, "learning_rate": 2.9138843854849964e-07, "loss": 0.0212, "step": 1199 }, { "epoch": 5.357142857142857, "grad_norm": 0.8191896677017212, "learning_rate": 2.8743545398529436e-07, "loss": 0.0206, "step": 1200 }, { "epoch": 5.357142857142857, "eval_loss": 1.1710518598556519, "eval_runtime": 4.4446, "eval_samples_per_second": 13.5, "eval_steps_per_second": 0.9, "step": 1200 }, { "epoch": 5.361607142857143, "grad_norm": 0.6906821131706238, "learning_rate": 2.8350876602503893e-07, "loss": 0.0234, "step": 1201 }, { "epoch": 5.366071428571429, "grad_norm": 0.742746889591217, "learning_rate": 2.7960839657673844e-07, "loss": 0.0249, "step": 1202 }, { "epoch": 5.370535714285714, "grad_norm": 0.7856932282447815, "learning_rate": 2.7573436740255337e-07, "loss": 0.0276, "step": 1203 }, { "epoch": 5.375, "grad_norm": 0.8468236327171326, "learning_rate": 2.7188670011767715e-07, "loss": 0.0295, "step": 1204 }, { "epoch": 5.379464285714286, "grad_norm": 0.8368995785713196, "learning_rate": 2.680654161902189e-07, "loss": 0.0236, "step": 1205 }, { "epoch": 5.383928571428571, "grad_norm": 0.6481898427009583, "learning_rate": 2.6427053694107966e-07, "loss": 0.0171, "step": 1206 }, { "epoch": 5.388392857142857, "grad_norm": 0.6581536531448364, "learning_rate": 2.605020835438375e-07, "loss": 0.0208, "step": 1207 }, { "epoch": 5.392857142857143, "grad_norm": 1.2420670986175537, "learning_rate": 2.567600770246237e-07, "loss": 0.0228, "step": 1208 }, { "epoch": 5.397321428571429, "grad_norm": 0.6781761050224304, "learning_rate": 2.5304453826201084e-07, "loss": 0.0196, "step": 1209 }, { "epoch": 5.401785714285714, "grad_norm": 0.8960991501808167, "learning_rate": 2.493554879868958e-07, "loss": 0.023, "step": 1210 }, { "epoch": 5.40625, "grad_norm": 0.815574049949646, "learning_rate": 2.4569294678237995e-07, "loss": 0.0243, "step": 1211 }, { "epoch": 5.410714285714286, "grad_norm": 0.6534021496772766, "learning_rate": 2.42056935083658e-07, "loss": 0.0209, "step": 1212 }, { "epoch": 5.415178571428571, "grad_norm": 0.7151815891265869, "learning_rate": 2.38447473177903e-07, "loss": 0.0195, "step": 1213 }, { "epoch": 5.419642857142857, "grad_norm": 0.6617888808250427, "learning_rate": 2.3486458120415415e-07, "loss": 0.0215, "step": 1214 }, { "epoch": 5.424107142857143, "grad_norm": 0.6826675534248352, "learning_rate": 2.3130827915320015e-07, "loss": 0.0202, "step": 1215 }, { "epoch": 5.428571428571429, "grad_norm": 0.7722153067588806, "learning_rate": 2.2777858686747495e-07, "loss": 0.0269, "step": 1216 }, { "epoch": 5.433035714285714, "grad_norm": 0.651387095451355, "learning_rate": 2.242755240409399e-07, "loss": 0.0185, "step": 1217 }, { "epoch": 5.4375, "grad_norm": 0.7878575921058655, "learning_rate": 2.20799110218979e-07, "loss": 0.0264, "step": 1218 }, { "epoch": 5.441964285714286, "grad_norm": 0.920578122138977, "learning_rate": 2.173493647982873e-07, "loss": 0.0242, "step": 1219 }, { "epoch": 5.446428571428571, "grad_norm": 0.8522472381591797, "learning_rate": 2.139263070267605e-07, "loss": 0.0261, "step": 1220 }, { "epoch": 5.450892857142857, "grad_norm": 0.7734021544456482, "learning_rate": 2.105299560033954e-07, "loss": 0.0231, "step": 1221 }, { "epoch": 5.455357142857143, "grad_norm": 0.6533644795417786, "learning_rate": 2.0716033067817308e-07, "loss": 0.0235, "step": 1222 }, { "epoch": 5.459821428571429, "grad_norm": 0.7817345261573792, "learning_rate": 2.03817449851962e-07, "loss": 0.0214, "step": 1223 }, { "epoch": 5.464285714285714, "grad_norm": 0.671183168888092, "learning_rate": 2.0050133217640672e-07, "loss": 0.0229, "step": 1224 }, { "epoch": 5.46875, "grad_norm": 0.725639283657074, "learning_rate": 1.972119961538277e-07, "loss": 0.0236, "step": 1225 }, { "epoch": 5.473214285714286, "grad_norm": 0.7955847382545471, "learning_rate": 1.9394946013711787e-07, "loss": 0.0311, "step": 1226 }, { "epoch": 5.477678571428571, "grad_norm": 0.8474836349487305, "learning_rate": 1.9071374232963564e-07, "loss": 0.0263, "step": 1227 }, { "epoch": 5.482142857142857, "grad_norm": 0.8439944982528687, "learning_rate": 1.8750486078511206e-07, "loss": 0.0297, "step": 1228 }, { "epoch": 5.486607142857143, "grad_norm": 0.8103631734848022, "learning_rate": 1.8432283340754e-07, "loss": 0.0256, "step": 1229 }, { "epoch": 5.491071428571429, "grad_norm": 0.8478034734725952, "learning_rate": 1.8116767795108175e-07, "loss": 0.0265, "step": 1230 }, { "epoch": 5.495535714285714, "grad_norm": 0.7951886057853699, "learning_rate": 1.780394120199672e-07, "loss": 0.0232, "step": 1231 }, { "epoch": 5.5, "grad_norm": 0.7319163084030151, "learning_rate": 1.7493805306839532e-07, "loss": 0.029, "step": 1232 }, { "epoch": 5.504464285714286, "grad_norm": 0.8299695253372192, "learning_rate": 1.718636184004378e-07, "loss": 0.0256, "step": 1233 }, { "epoch": 5.508928571428571, "grad_norm": 0.7443589568138123, "learning_rate": 1.688161251699405e-07, "loss": 0.0202, "step": 1234 }, { "epoch": 5.513392857142857, "grad_norm": 0.7679203152656555, "learning_rate": 1.6579559038043186e-07, "loss": 0.0251, "step": 1235 }, { "epoch": 5.517857142857143, "grad_norm": 0.6225228309631348, "learning_rate": 1.6280203088502275e-07, "loss": 0.0222, "step": 1236 }, { "epoch": 5.522321428571429, "grad_norm": 0.6747012734413147, "learning_rate": 1.5983546338631578e-07, "loss": 0.0203, "step": 1237 }, { "epoch": 5.526785714285714, "grad_norm": 0.8515854477882385, "learning_rate": 1.5689590443631178e-07, "loss": 0.0243, "step": 1238 }, { "epoch": 5.53125, "grad_norm": 0.7704442143440247, "learning_rate": 1.5398337043631723e-07, "loss": 0.0222, "step": 1239 }, { "epoch": 5.535714285714286, "grad_norm": 0.715097963809967, "learning_rate": 1.5109787763685323e-07, "loss": 0.024, "step": 1240 }, { "epoch": 5.540178571428571, "grad_norm": 0.70245760679245, "learning_rate": 1.4823944213756056e-07, "loss": 0.0226, "step": 1241 }, { "epoch": 5.544642857142857, "grad_norm": 0.8160264492034912, "learning_rate": 1.4540807988711857e-07, "loss": 0.0249, "step": 1242 }, { "epoch": 5.549107142857143, "grad_norm": 0.7433526515960693, "learning_rate": 1.4260380668314764e-07, "loss": 0.0222, "step": 1243 }, { "epoch": 5.553571428571429, "grad_norm": 0.7798489332199097, "learning_rate": 1.3982663817212475e-07, "loss": 0.0268, "step": 1244 }, { "epoch": 5.558035714285714, "grad_norm": 0.7786112427711487, "learning_rate": 1.3707658984929738e-07, "loss": 0.0252, "step": 1245 }, { "epoch": 5.5625, "grad_norm": 0.8275200128555298, "learning_rate": 1.3435367705859475e-07, "loss": 0.0245, "step": 1246 }, { "epoch": 5.566964285714286, "grad_norm": 0.770211935043335, "learning_rate": 1.3165791499254294e-07, "loss": 0.0254, "step": 1247 }, { "epoch": 5.571428571428571, "grad_norm": 0.9335598945617676, "learning_rate": 1.2898931869218046e-07, "loss": 0.0264, "step": 1248 }, { "epoch": 5.575892857142857, "grad_norm": 0.7596639394760132, "learning_rate": 1.2634790304697283e-07, "loss": 0.0241, "step": 1249 }, { "epoch": 5.580357142857143, "grad_norm": 0.7580435276031494, "learning_rate": 1.2373368279473364e-07, "loss": 0.0217, "step": 1250 }, { "epoch": 5.584821428571429, "grad_norm": 0.8928846120834351, "learning_rate": 1.2114667252153644e-07, "loss": 0.0288, "step": 1251 }, { "epoch": 5.589285714285714, "grad_norm": 0.7398735880851746, "learning_rate": 1.1858688666163752e-07, "loss": 0.0185, "step": 1252 }, { "epoch": 5.59375, "grad_norm": 0.8171510100364685, "learning_rate": 1.1605433949739546e-07, "loss": 0.0287, "step": 1253 }, { "epoch": 5.598214285714286, "grad_norm": 0.71816086769104, "learning_rate": 1.1354904515918834e-07, "loss": 0.0248, "step": 1254 }, { "epoch": 5.602678571428571, "grad_norm": 0.7628265023231506, "learning_rate": 1.1107101762533725e-07, "loss": 0.0242, "step": 1255 }, { "epoch": 5.607142857142857, "grad_norm": 0.620901882648468, "learning_rate": 1.0862027072202796e-07, "loss": 0.0199, "step": 1256 }, { "epoch": 5.611607142857143, "grad_norm": 0.9000399112701416, "learning_rate": 1.0619681812323437e-07, "loss": 0.0303, "step": 1257 }, { "epoch": 5.616071428571429, "grad_norm": 0.7469123005867004, "learning_rate": 1.0380067335064019e-07, "loss": 0.0252, "step": 1258 }, { "epoch": 5.620535714285714, "grad_norm": 0.7491857409477234, "learning_rate": 1.0143184977356513e-07, "loss": 0.0224, "step": 1259 }, { "epoch": 5.625, "grad_norm": 0.7108142971992493, "learning_rate": 9.909036060889063e-08, "loss": 0.0245, "step": 1260 }, { "epoch": 5.629464285714286, "grad_norm": 0.6871334910392761, "learning_rate": 9.677621892098471e-08, "loss": 0.0216, "step": 1261 }, { "epoch": 5.633928571428571, "grad_norm": 0.7799824476242065, "learning_rate": 9.448943762163063e-08, "loss": 0.0258, "step": 1262 }, { "epoch": 5.638392857142857, "grad_norm": 0.6933593153953552, "learning_rate": 9.223002946995223e-08, "loss": 0.0218, "step": 1263 }, { "epoch": 5.642857142857143, "grad_norm": 0.7434856295585632, "learning_rate": 8.999800707234651e-08, "loss": 0.0216, "step": 1264 }, { "epoch": 5.647321428571429, "grad_norm": 0.679129421710968, "learning_rate": 8.77933828824112e-08, "loss": 0.0178, "step": 1265 }, { "epoch": 5.651785714285714, "grad_norm": 0.7621864080429077, "learning_rate": 8.561616920087338e-08, "loss": 0.0188, "step": 1266 }, { "epoch": 5.65625, "grad_norm": 0.8270425796508789, "learning_rate": 8.346637817552435e-08, "loss": 0.0285, "step": 1267 }, { "epoch": 5.660714285714286, "grad_norm": 0.718245267868042, "learning_rate": 8.134402180115097e-08, "loss": 0.0262, "step": 1268 }, { "epoch": 5.665178571428571, "grad_norm": 0.7689396739006042, "learning_rate": 7.924911191946728e-08, "loss": 0.031, "step": 1269 }, { "epoch": 5.669642857142857, "grad_norm": 0.7054774165153503, "learning_rate": 7.718166021904903e-08, "loss": 0.0245, "step": 1270 }, { "epoch": 5.674107142857143, "grad_norm": 0.7122982740402222, "learning_rate": 7.514167823526817e-08, "loss": 0.0266, "step": 1271 }, { "epoch": 5.678571428571429, "grad_norm": 0.8461933732032776, "learning_rate": 7.31291773502313e-08, "loss": 0.0215, "step": 1272 }, { "epoch": 5.683035714285714, "grad_norm": 0.8166012763977051, "learning_rate": 7.114416879271188e-08, "loss": 0.0226, "step": 1273 }, { "epoch": 5.6875, "grad_norm": 0.6390535831451416, "learning_rate": 6.918666363808976e-08, "loss": 0.0203, "step": 1274 }, { "epoch": 5.691964285714286, "grad_norm": 0.8909913301467896, "learning_rate": 6.725667280828959e-08, "loss": 0.026, "step": 1275 }, { "epoch": 5.696428571428571, "grad_norm": 0.6632590889930725, "learning_rate": 6.535420707172025e-08, "loss": 0.0237, "step": 1276 }, { "epoch": 5.700892857142857, "grad_norm": 0.575396716594696, "learning_rate": 6.347927704321335e-08, "loss": 0.018, "step": 1277 }, { "epoch": 5.705357142857143, "grad_norm": 0.7212481498718262, "learning_rate": 6.163189318396261e-08, "loss": 0.0223, "step": 1278 }, { "epoch": 5.709821428571429, "grad_norm": 0.8443297147750854, "learning_rate": 5.981206580147232e-08, "loss": 0.021, "step": 1279 }, { "epoch": 5.714285714285714, "grad_norm": 0.8731318116188049, "learning_rate": 5.8019805049490143e-08, "loss": 0.0276, "step": 1280 }, { "epoch": 5.71875, "grad_norm": 1.091413140296936, "learning_rate": 5.625512092795828e-08, "loss": 0.02, "step": 1281 }, { "epoch": 5.723214285714286, "grad_norm": 0.6113536357879639, "learning_rate": 5.451802328295408e-08, "loss": 0.0197, "step": 1282 }, { "epoch": 5.727678571428571, "grad_norm": 0.897653341293335, "learning_rate": 5.2808521806635646e-08, "loss": 0.0318, "step": 1283 }, { "epoch": 5.732142857142857, "grad_norm": 0.7698624134063721, "learning_rate": 5.1126626037188537e-08, "loss": 0.0226, "step": 1284 }, { "epoch": 5.736607142857143, "grad_norm": 0.6883206963539124, "learning_rate": 4.9472345358769714e-08, "loss": 0.0205, "step": 1285 }, { "epoch": 5.741071428571429, "grad_norm": 0.6476832628250122, "learning_rate": 4.784568900146095e-08, "loss": 0.0199, "step": 1286 }, { "epoch": 5.745535714285714, "grad_norm": 0.671054482460022, "learning_rate": 4.624666604121047e-08, "loss": 0.0202, "step": 1287 }, { "epoch": 5.75, "grad_norm": 0.7022548317909241, "learning_rate": 4.4675285399787523e-08, "loss": 0.021, "step": 1288 }, { "epoch": 5.754464285714286, "grad_norm": 0.801397442817688, "learning_rate": 4.3131555844730135e-08, "loss": 0.023, "step": 1289 }, { "epoch": 5.758928571428571, "grad_norm": 1.183430790901184, "learning_rate": 4.161548598929689e-08, "loss": 0.0221, "step": 1290 }, { "epoch": 5.763392857142857, "grad_norm": 1.0177810192108154, "learning_rate": 4.01270842924191e-08, "loss": 0.0236, "step": 1291 }, { "epoch": 5.767857142857143, "grad_norm": 0.7828777432441711, "learning_rate": 3.8666359058652064e-08, "loss": 0.0213, "step": 1292 }, { "epoch": 5.772321428571429, "grad_norm": 0.7310546040534973, "learning_rate": 3.7233318438130064e-08, "loss": 0.0224, "step": 1293 }, { "epoch": 5.776785714285714, "grad_norm": 0.7637031078338623, "learning_rate": 3.582797042652248e-08, "loss": 0.0254, "step": 1294 }, { "epoch": 5.78125, "grad_norm": 0.9804436564445496, "learning_rate": 3.4450322864986106e-08, "loss": 0.0278, "step": 1295 }, { "epoch": 5.785714285714286, "grad_norm": 0.8916503190994263, "learning_rate": 3.310038344012184e-08, "loss": 0.0256, "step": 1296 }, { "epoch": 5.790178571428571, "grad_norm": 0.8148120641708374, "learning_rate": 3.17781596839347e-08, "loss": 0.0209, "step": 1297 }, { "epoch": 5.794642857142857, "grad_norm": 0.8668297529220581, "learning_rate": 3.0483658973788894e-08, "loss": 0.0228, "step": 1298 }, { "epoch": 5.799107142857143, "grad_norm": 0.6615080833435059, "learning_rate": 2.92168885323662e-08, "loss": 0.0184, "step": 1299 }, { "epoch": 5.803571428571429, "grad_norm": 0.7689619064331055, "learning_rate": 2.797785542762927e-08, "loss": 0.0235, "step": 1300 }, { "epoch": 5.808035714285714, "grad_norm": 0.9581748843193054, "learning_rate": 2.67665665727784e-08, "loss": 0.0265, "step": 1301 }, { "epoch": 5.8125, "grad_norm": 0.7389383316040039, "learning_rate": 2.5583028726215427e-08, "loss": 0.021, "step": 1302 }, { "epoch": 5.816964285714286, "grad_norm": 0.752805233001709, "learning_rate": 2.442724849150487e-08, "loss": 0.024, "step": 1303 }, { "epoch": 5.821428571428571, "grad_norm": 0.8107995986938477, "learning_rate": 2.3299232317335643e-08, "loss": 0.0256, "step": 1304 }, { "epoch": 5.825892857142857, "grad_norm": 0.9850029349327087, "learning_rate": 2.2198986497489963e-08, "loss": 0.0234, "step": 1305 }, { "epoch": 5.830357142857143, "grad_norm": 0.6757587194442749, "learning_rate": 2.112651717080228e-08, "loss": 0.0188, "step": 1306 }, { "epoch": 5.834821428571429, "grad_norm": 0.849671483039856, "learning_rate": 2.0081830321129298e-08, "loss": 0.0245, "step": 1307 }, { "epoch": 5.839285714285714, "grad_norm": 0.7331900000572205, "learning_rate": 1.906493177731391e-08, "loss": 0.0203, "step": 1308 }, { "epoch": 5.84375, "grad_norm": 0.7909244894981384, "learning_rate": 1.80758272131541e-08, "loss": 0.0239, "step": 1309 }, { "epoch": 5.848214285714286, "grad_norm": 0.6593401432037354, "learning_rate": 1.711452214737187e-08, "loss": 0.0192, "step": 1310 }, { "epoch": 5.852678571428571, "grad_norm": 0.8781512975692749, "learning_rate": 1.6181021943580477e-08, "loss": 0.0226, "step": 1311 }, { "epoch": 5.857142857142857, "grad_norm": 0.7089523077011108, "learning_rate": 1.5275331810256708e-08, "loss": 0.0216, "step": 1312 }, { "epoch": 5.861607142857143, "grad_norm": 0.9708623886108398, "learning_rate": 1.439745680070921e-08, "loss": 0.0273, "step": 1313 }, { "epoch": 5.866071428571429, "grad_norm": 0.7651482224464417, "learning_rate": 1.3547401813053533e-08, "loss": 0.0234, "step": 1314 }, { "epoch": 5.870535714285714, "grad_norm": 0.721098780632019, "learning_rate": 1.2725171590181043e-08, "loss": 0.0199, "step": 1315 }, { "epoch": 5.875, "grad_norm": 0.8044819831848145, "learning_rate": 1.1930770719736715e-08, "loss": 0.0258, "step": 1316 }, { "epoch": 5.879464285714286, "grad_norm": 0.6233481168746948, "learning_rate": 1.1164203634089718e-08, "loss": 0.0193, "step": 1317 }, { "epoch": 5.883928571428571, "grad_norm": 0.881504237651825, "learning_rate": 1.0425474610310654e-08, "loss": 0.028, "step": 1318 }, { "epoch": 5.888392857142857, "grad_norm": 0.6835102438926697, "learning_rate": 9.714587770147148e-09, "loss": 0.0201, "step": 1319 }, { "epoch": 5.892857142857143, "grad_norm": 0.6504446268081665, "learning_rate": 9.031547080002185e-09, "loss": 0.0197, "step": 1320 }, { "epoch": 5.897321428571429, "grad_norm": 0.6710385084152222, "learning_rate": 8.376356350909694e-09, "loss": 0.0208, "step": 1321 }, { "epoch": 5.901785714285714, "grad_norm": 0.8464110493659973, "learning_rate": 7.749019238513461e-09, "loss": 0.0312, "step": 1322 }, { "epoch": 5.90625, "grad_norm": 0.8968429565429688, "learning_rate": 7.149539243050468e-09, "loss": 0.0272, "step": 1323 }, { "epoch": 5.910714285714286, "grad_norm": 1.0725476741790771, "learning_rate": 6.577919709325367e-09, "loss": 0.0222, "step": 1324 }, { "epoch": 5.915178571428571, "grad_norm": 0.6525650024414062, "learning_rate": 6.034163826697711e-09, "loss": 0.0204, "step": 1325 }, { "epoch": 5.919642857142857, "grad_norm": 0.8145922422409058, "learning_rate": 5.518274629059199e-09, "loss": 0.023, "step": 1326 }, { "epoch": 5.924107142857143, "grad_norm": 0.7454310059547424, "learning_rate": 5.030254994820907e-09, "loss": 0.0241, "step": 1327 }, { "epoch": 5.928571428571429, "grad_norm": 0.7932132482528687, "learning_rate": 4.570107646894414e-09, "loss": 0.0249, "step": 1328 }, { "epoch": 5.933035714285714, "grad_norm": 0.8640964031219482, "learning_rate": 4.137835152677938e-09, "loss": 0.0223, "step": 1329 }, { "epoch": 5.9375, "grad_norm": 0.904984176158905, "learning_rate": 3.7334399240402185e-09, "loss": 0.0231, "step": 1330 }, { "epoch": 5.941964285714286, "grad_norm": 0.849513590335846, "learning_rate": 3.356924217310546e-09, "loss": 0.0276, "step": 1331 }, { "epoch": 5.946428571428571, "grad_norm": 0.7117705345153809, "learning_rate": 3.008290133262653e-09, "loss": 0.019, "step": 1332 }, { "epoch": 5.950892857142857, "grad_norm": 0.7012656927108765, "learning_rate": 2.687539617105282e-09, "loss": 0.0193, "step": 1333 }, { "epoch": 5.955357142857143, "grad_norm": 0.7271556854248047, "learning_rate": 2.3946744584688626e-09, "loss": 0.0241, "step": 1334 }, { "epoch": 5.959821428571429, "grad_norm": 0.6695514917373657, "learning_rate": 2.1296962913994105e-09, "loss": 0.0197, "step": 1335 }, { "epoch": 5.964285714285714, "grad_norm": 0.7411221265792847, "learning_rate": 1.892606594345199e-09, "loss": 0.0201, "step": 1336 }, { "epoch": 5.96875, "grad_norm": 0.8356890678405762, "learning_rate": 1.6834066901512136e-09, "loss": 0.0249, "step": 1337 }, { "epoch": 5.973214285714286, "grad_norm": 0.7158775329589844, "learning_rate": 1.5020977460513809e-09, "loss": 0.0232, "step": 1338 }, { "epoch": 5.977678571428571, "grad_norm": 0.7861185669898987, "learning_rate": 1.3486807736613498e-09, "loss": 0.0214, "step": 1339 }, { "epoch": 5.982142857142857, "grad_norm": 0.770765483379364, "learning_rate": 1.2231566289723888e-09, "loss": 0.0168, "step": 1340 }, { "epoch": 5.986607142857143, "grad_norm": 0.6375583410263062, "learning_rate": 1.1255260123486095e-09, "loss": 0.0216, "step": 1341 }, { "epoch": 5.991071428571429, "grad_norm": 0.795700192451477, "learning_rate": 1.0557894685208617e-09, "loss": 0.0239, "step": 1342 }, { "epoch": 5.995535714285714, "grad_norm": 0.7411868572235107, "learning_rate": 1.013947386585067e-09, "loss": 0.0249, "step": 1343 }, { "epoch": 6.0, "grad_norm": 0.7516708970069885, "learning_rate": 1e-09, "loss": 0.02, "step": 1344 }, { "epoch": 6.0, "step": 1344, "total_flos": 2.429040165805621e+18, "train_loss": 0.1964568888388818, "train_runtime": 10784.1142, "train_samples_per_second": 1.993, "train_steps_per_second": 0.125 } ], "logging_steps": 1.0, "max_steps": 1344, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 200, "total_flos": 2.429040165805621e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }