diff --git "a/20_23/trainer_state.json" "b/20_23/trainer_state.json" new file mode 100644--- /dev/null +++ "b/20_23/trainer_state.json" @@ -0,0 +1,5343 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 759, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 1.4626648426055908, + "learning_rate": 4.347826086956522e-06, + "loss": 0.3776, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 1.4091928005218506, + "learning_rate": 8.695652173913044e-06, + "loss": 0.3922, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 1.363326072692871, + "learning_rate": 1.3043478260869566e-05, + "loss": 0.3599, + "step": 3 + }, + { + "epoch": 0.01, + "grad_norm": 1.1503901481628418, + "learning_rate": 1.739130434782609e-05, + "loss": 0.318, + "step": 4 + }, + { + "epoch": 0.01, + "grad_norm": 0.7365157604217529, + "learning_rate": 2.173913043478261e-05, + "loss": 0.2927, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 0.6607586145401001, + "learning_rate": 2.608695652173913e-05, + "loss": 0.2404, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 0.5922650098800659, + "learning_rate": 3.0434782608695656e-05, + "loss": 0.1948, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 0.3794650733470917, + "learning_rate": 3.478260869565218e-05, + "loss": 0.15, + "step": 8 + }, + { + "epoch": 0.01, + "grad_norm": 0.3533678948879242, + "learning_rate": 3.91304347826087e-05, + "loss": 0.1375, + "step": 9 + }, + { + "epoch": 0.01, + "grad_norm": 0.28944671154022217, + "learning_rate": 4.347826086956522e-05, + "loss": 0.1285, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 0.22203072905540466, + "learning_rate": 4.782608695652174e-05, + "loss": 0.1202, + "step": 11 + }, + { + "epoch": 0.02, + "grad_norm": 0.20047953724861145, + "learning_rate": 5.217391304347826e-05, + "loss": 0.1136, + "step": 12 + }, + { + "epoch": 0.02, + "grad_norm": 0.16146406531333923, + "learning_rate": 5.652173913043478e-05, + "loss": 0.0967, + "step": 13 + }, + { + "epoch": 0.02, + "grad_norm": 0.22107285261154175, + "learning_rate": 6.086956521739131e-05, + "loss": 0.1121, + "step": 14 + }, + { + "epoch": 0.02, + "grad_norm": 0.22693370282649994, + "learning_rate": 6.521739130434783e-05, + "loss": 0.1157, + "step": 15 + }, + { + "epoch": 0.02, + "grad_norm": 0.2420782446861267, + "learning_rate": 6.956521739130436e-05, + "loss": 0.1019, + "step": 16 + }, + { + "epoch": 0.02, + "grad_norm": 0.2712867259979248, + "learning_rate": 7.391304347826086e-05, + "loss": 0.0927, + "step": 17 + }, + { + "epoch": 0.02, + "grad_norm": 0.16125057637691498, + "learning_rate": 7.82608695652174e-05, + "loss": 0.0956, + "step": 18 + }, + { + "epoch": 0.03, + "grad_norm": 0.1539854109287262, + "learning_rate": 8.260869565217392e-05, + "loss": 0.0887, + "step": 19 + }, + { + "epoch": 0.03, + "grad_norm": 0.17831459641456604, + "learning_rate": 8.695652173913044e-05, + "loss": 0.0993, + "step": 20 + }, + { + "epoch": 0.03, + "grad_norm": 0.17226484417915344, + "learning_rate": 9.130434782608696e-05, + "loss": 0.0974, + "step": 21 + }, + { + "epoch": 0.03, + "grad_norm": 0.17678162455558777, + "learning_rate": 9.565217391304348e-05, + "loss": 0.0938, + "step": 22 + }, + { + "epoch": 0.03, + "grad_norm": 0.18184365332126617, + "learning_rate": 0.0001, + "loss": 0.0952, + "step": 23 + }, + { + "epoch": 0.03, + "grad_norm": 0.20284950733184814, + "learning_rate": 9.99995445051553e-05, + "loss": 0.0948, + "step": 24 + }, + { + "epoch": 0.03, + "grad_norm": 0.1573178917169571, + "learning_rate": 9.99981780289202e-05, + "loss": 0.0927, + "step": 25 + }, + { + "epoch": 0.03, + "grad_norm": 0.19211460649967194, + "learning_rate": 9.999590059619164e-05, + "loss": 0.0909, + "step": 26 + }, + { + "epoch": 0.04, + "grad_norm": 0.13000018894672394, + "learning_rate": 9.999271224846396e-05, + "loss": 0.0935, + "step": 27 + }, + { + "epoch": 0.04, + "grad_norm": 0.10931742191314697, + "learning_rate": 9.998861304382818e-05, + "loss": 0.0875, + "step": 28 + }, + { + "epoch": 0.04, + "grad_norm": 0.15605789422988892, + "learning_rate": 9.998360305697101e-05, + "loss": 0.1009, + "step": 29 + }, + { + "epoch": 0.04, + "grad_norm": 0.09958553314208984, + "learning_rate": 9.997768237917333e-05, + "loss": 0.0859, + "step": 30 + }, + { + "epoch": 0.04, + "grad_norm": 0.16153687238693237, + "learning_rate": 9.99708511183087e-05, + "loss": 0.0928, + "step": 31 + }, + { + "epoch": 0.04, + "grad_norm": 0.13266044855117798, + "learning_rate": 9.996310939884128e-05, + "loss": 0.0884, + "step": 32 + }, + { + "epoch": 0.04, + "grad_norm": 0.14477793872356415, + "learning_rate": 9.995445736182358e-05, + "loss": 0.1003, + "step": 33 + }, + { + "epoch": 0.04, + "grad_norm": 0.16262026131153107, + "learning_rate": 9.994489516489396e-05, + "loss": 0.0959, + "step": 34 + }, + { + "epoch": 0.05, + "grad_norm": 0.1339733898639679, + "learning_rate": 9.993442298227365e-05, + "loss": 0.0834, + "step": 35 + }, + { + "epoch": 0.05, + "grad_norm": 0.1648261696100235, + "learning_rate": 9.992304100476367e-05, + "loss": 0.091, + "step": 36 + }, + { + "epoch": 0.05, + "grad_norm": 0.10690194368362427, + "learning_rate": 9.991074943974131e-05, + "loss": 0.0917, + "step": 37 + }, + { + "epoch": 0.05, + "grad_norm": 0.13092245161533356, + "learning_rate": 9.989754851115635e-05, + "loss": 0.0875, + "step": 38 + }, + { + "epoch": 0.05, + "grad_norm": 0.11928656697273254, + "learning_rate": 9.988343845952697e-05, + "loss": 0.0908, + "step": 39 + }, + { + "epoch": 0.05, + "grad_norm": 0.14026062190532684, + "learning_rate": 9.98684195419354e-05, + "loss": 0.0885, + "step": 40 + }, + { + "epoch": 0.05, + "grad_norm": 0.16105924546718597, + "learning_rate": 9.985249203202324e-05, + "loss": 0.0836, + "step": 41 + }, + { + "epoch": 0.06, + "grad_norm": 0.1550360769033432, + "learning_rate": 9.983565621998644e-05, + "loss": 0.0865, + "step": 42 + }, + { + "epoch": 0.06, + "grad_norm": 0.1308027058839798, + "learning_rate": 9.981791241257e-05, + "loss": 0.0924, + "step": 43 + }, + { + "epoch": 0.06, + "grad_norm": 0.15918859839439392, + "learning_rate": 9.979926093306245e-05, + "loss": 0.0935, + "step": 44 + }, + { + "epoch": 0.06, + "grad_norm": 0.1623181700706482, + "learning_rate": 9.97797021212899e-05, + "loss": 0.0873, + "step": 45 + }, + { + "epoch": 0.06, + "grad_norm": 0.11046040058135986, + "learning_rate": 9.975923633360985e-05, + "loss": 0.0887, + "step": 46 + }, + { + "epoch": 0.06, + "grad_norm": 0.14533914625644684, + "learning_rate": 9.973786394290474e-05, + "loss": 0.0928, + "step": 47 + }, + { + "epoch": 0.06, + "grad_norm": 0.10755528509616852, + "learning_rate": 9.971558533857515e-05, + "loss": 0.0811, + "step": 48 + }, + { + "epoch": 0.06, + "grad_norm": 0.13999031484127045, + "learning_rate": 9.969240092653261e-05, + "loss": 0.0872, + "step": 49 + }, + { + "epoch": 0.07, + "grad_norm": 0.11199218034744263, + "learning_rate": 9.966831112919235e-05, + "loss": 0.083, + "step": 50 + }, + { + "epoch": 0.07, + "grad_norm": 0.11688799411058426, + "learning_rate": 9.96433163854655e-05, + "loss": 0.0916, + "step": 51 + }, + { + "epoch": 0.07, + "grad_norm": 0.1127270981669426, + "learning_rate": 9.961741715075115e-05, + "loss": 0.0908, + "step": 52 + }, + { + "epoch": 0.07, + "grad_norm": 0.1565464586019516, + "learning_rate": 9.9590613896928e-05, + "loss": 0.0869, + "step": 53 + }, + { + "epoch": 0.07, + "grad_norm": 0.10714145749807358, + "learning_rate": 9.956290711234584e-05, + "loss": 0.086, + "step": 54 + }, + { + "epoch": 0.07, + "grad_norm": 0.09617923945188522, + "learning_rate": 9.953429730181653e-05, + "loss": 0.0911, + "step": 55 + }, + { + "epoch": 0.07, + "grad_norm": 0.1050075963139534, + "learning_rate": 9.950478498660496e-05, + "loss": 0.0869, + "step": 56 + }, + { + "epoch": 0.08, + "grad_norm": 0.11093985289335251, + "learning_rate": 9.947437070441938e-05, + "loss": 0.0896, + "step": 57 + }, + { + "epoch": 0.08, + "grad_norm": 0.10569228231906891, + "learning_rate": 9.944305500940178e-05, + "loss": 0.0954, + "step": 58 + }, + { + "epoch": 0.08, + "grad_norm": 0.10598912090063095, + "learning_rate": 9.941083847211765e-05, + "loss": 0.0909, + "step": 59 + }, + { + "epoch": 0.08, + "grad_norm": 0.1131819486618042, + "learning_rate": 9.937772167954564e-05, + "loss": 0.0962, + "step": 60 + }, + { + "epoch": 0.08, + "grad_norm": 0.1235487088561058, + "learning_rate": 9.93437052350669e-05, + "loss": 0.096, + "step": 61 + }, + { + "epoch": 0.08, + "grad_norm": 0.13566148281097412, + "learning_rate": 9.930878975845406e-05, + "loss": 0.0917, + "step": 62 + }, + { + "epoch": 0.08, + "grad_norm": 0.10908336937427521, + "learning_rate": 9.927297588585984e-05, + "loss": 0.0853, + "step": 63 + }, + { + "epoch": 0.08, + "grad_norm": 0.10724148899316788, + "learning_rate": 9.923626426980567e-05, + "loss": 0.0853, + "step": 64 + }, + { + "epoch": 0.09, + "grad_norm": 0.1097191870212555, + "learning_rate": 9.91986555791696e-05, + "loss": 0.0821, + "step": 65 + }, + { + "epoch": 0.09, + "grad_norm": 0.09474969655275345, + "learning_rate": 9.91601504991742e-05, + "loss": 0.079, + "step": 66 + }, + { + "epoch": 0.09, + "grad_norm": 0.1255607008934021, + "learning_rate": 9.912074973137412e-05, + "loss": 0.0877, + "step": 67 + }, + { + "epoch": 0.09, + "grad_norm": 0.08756373077630997, + "learning_rate": 9.908045399364322e-05, + "loss": 0.0859, + "step": 68 + }, + { + "epoch": 0.09, + "grad_norm": 0.13312621414661407, + "learning_rate": 9.903926402016153e-05, + "loss": 0.0924, + "step": 69 + }, + { + "epoch": 0.09, + "grad_norm": 0.10092984139919281, + "learning_rate": 9.899718056140186e-05, + "loss": 0.0791, + "step": 70 + }, + { + "epoch": 0.09, + "grad_norm": 0.11156034469604492, + "learning_rate": 9.895420438411616e-05, + "loss": 0.0794, + "step": 71 + }, + { + "epoch": 0.09, + "grad_norm": 0.09780649840831757, + "learning_rate": 9.891033627132148e-05, + "loss": 0.0881, + "step": 72 + }, + { + "epoch": 0.1, + "grad_norm": 0.10802754014730453, + "learning_rate": 9.886557702228587e-05, + "loss": 0.0863, + "step": 73 + }, + { + "epoch": 0.1, + "grad_norm": 0.09974031895399094, + "learning_rate": 9.881992745251356e-05, + "loss": 0.0781, + "step": 74 + }, + { + "epoch": 0.1, + "grad_norm": 0.09702256321907043, + "learning_rate": 9.877338839373032e-05, + "loss": 0.0831, + "step": 75 + }, + { + "epoch": 0.1, + "grad_norm": 0.11543964594602585, + "learning_rate": 9.872596069386817e-05, + "loss": 0.0894, + "step": 76 + }, + { + "epoch": 0.1, + "grad_norm": 0.09120846539735794, + "learning_rate": 9.867764521705005e-05, + "loss": 0.0875, + "step": 77 + }, + { + "epoch": 0.1, + "grad_norm": 0.09430748224258423, + "learning_rate": 9.8628442843574e-05, + "loss": 0.0743, + "step": 78 + }, + { + "epoch": 0.1, + "grad_norm": 0.09428028017282486, + "learning_rate": 9.857835446989707e-05, + "loss": 0.0807, + "step": 79 + }, + { + "epoch": 0.11, + "grad_norm": 0.08958426862955093, + "learning_rate": 9.852738100861916e-05, + "loss": 0.092, + "step": 80 + }, + { + "epoch": 0.11, + "grad_norm": 0.10480056703090668, + "learning_rate": 9.847552338846617e-05, + "loss": 0.0864, + "step": 81 + }, + { + "epoch": 0.11, + "grad_norm": 0.1178659051656723, + "learning_rate": 9.842278255427327e-05, + "loss": 0.079, + "step": 82 + }, + { + "epoch": 0.11, + "grad_norm": 0.15165001153945923, + "learning_rate": 9.836915946696759e-05, + "loss": 0.0948, + "step": 83 + }, + { + "epoch": 0.11, + "grad_norm": 0.12370532751083374, + "learning_rate": 9.831465510355069e-05, + "loss": 0.0782, + "step": 84 + }, + { + "epoch": 0.11, + "grad_norm": 0.1311461627483368, + "learning_rate": 9.825927045708088e-05, + "loss": 0.0865, + "step": 85 + }, + { + "epoch": 0.11, + "grad_norm": 0.10418644547462463, + "learning_rate": 9.820300653665495e-05, + "loss": 0.0789, + "step": 86 + }, + { + "epoch": 0.11, + "grad_norm": 0.11643680185079575, + "learning_rate": 9.814586436738998e-05, + "loss": 0.0846, + "step": 87 + }, + { + "epoch": 0.12, + "grad_norm": 0.12964823842048645, + "learning_rate": 9.808784499040445e-05, + "loss": 0.0789, + "step": 88 + }, + { + "epoch": 0.12, + "grad_norm": 0.1188376173377037, + "learning_rate": 9.80289494627995e-05, + "loss": 0.0903, + "step": 89 + }, + { + "epoch": 0.12, + "grad_norm": 0.08869314193725586, + "learning_rate": 9.796917885763946e-05, + "loss": 0.0791, + "step": 90 + }, + { + "epoch": 0.12, + "grad_norm": 0.09060724079608917, + "learning_rate": 9.790853426393245e-05, + "loss": 0.0795, + "step": 91 + }, + { + "epoch": 0.12, + "grad_norm": 0.09172531962394714, + "learning_rate": 9.784701678661045e-05, + "loss": 0.0826, + "step": 92 + }, + { + "epoch": 0.12, + "grad_norm": 0.08181829005479813, + "learning_rate": 9.778462754650921e-05, + "loss": 0.0847, + "step": 93 + }, + { + "epoch": 0.12, + "grad_norm": 0.0949179083108902, + "learning_rate": 9.772136768034785e-05, + "loss": 0.0813, + "step": 94 + }, + { + "epoch": 0.13, + "grad_norm": 0.0956520140171051, + "learning_rate": 9.765723834070804e-05, + "loss": 0.086, + "step": 95 + }, + { + "epoch": 0.13, + "grad_norm": 0.1323326826095581, + "learning_rate": 9.759224069601317e-05, + "loss": 0.0838, + "step": 96 + }, + { + "epoch": 0.13, + "grad_norm": 0.1346251666545868, + "learning_rate": 9.752637593050689e-05, + "loss": 0.0841, + "step": 97 + }, + { + "epoch": 0.13, + "grad_norm": 0.09667576849460602, + "learning_rate": 9.745964524423165e-05, + "loss": 0.0802, + "step": 98 + }, + { + "epoch": 0.13, + "grad_norm": 0.13813923299312592, + "learning_rate": 9.739204985300679e-05, + "loss": 0.0814, + "step": 99 + }, + { + "epoch": 0.13, + "grad_norm": 0.11109049618244171, + "learning_rate": 9.732359098840642e-05, + "loss": 0.0923, + "step": 100 + }, + { + "epoch": 0.13, + "grad_norm": 0.10362949222326279, + "learning_rate": 9.725426989773693e-05, + "loss": 0.0843, + "step": 101 + }, + { + "epoch": 0.13, + "grad_norm": 0.11478770524263382, + "learning_rate": 9.718408784401427e-05, + "loss": 0.0864, + "step": 102 + }, + { + "epoch": 0.14, + "grad_norm": 0.11098852008581161, + "learning_rate": 9.711304610594104e-05, + "loss": 0.0906, + "step": 103 + }, + { + "epoch": 0.14, + "grad_norm": 0.1379733830690384, + "learning_rate": 9.7041145977883e-05, + "loss": 0.0802, + "step": 104 + }, + { + "epoch": 0.14, + "grad_norm": 0.12902683019638062, + "learning_rate": 9.696838876984569e-05, + "loss": 0.0805, + "step": 105 + }, + { + "epoch": 0.14, + "grad_norm": 0.097101591527462, + "learning_rate": 9.689477580745042e-05, + "loss": 0.0841, + "step": 106 + }, + { + "epoch": 0.14, + "grad_norm": 0.12172501534223557, + "learning_rate": 9.682030843191022e-05, + "loss": 0.0837, + "step": 107 + }, + { + "epoch": 0.14, + "grad_norm": 0.11309538781642914, + "learning_rate": 9.674498800000526e-05, + "loss": 0.0857, + "step": 108 + }, + { + "epoch": 0.14, + "grad_norm": 0.10910997539758682, + "learning_rate": 9.666881588405832e-05, + "loss": 0.0939, + "step": 109 + }, + { + "epoch": 0.14, + "grad_norm": 0.12583476305007935, + "learning_rate": 9.659179347190963e-05, + "loss": 0.0914, + "step": 110 + }, + { + "epoch": 0.15, + "grad_norm": 0.10188722610473633, + "learning_rate": 9.651392216689165e-05, + "loss": 0.0838, + "step": 111 + }, + { + "epoch": 0.15, + "grad_norm": 0.08564438670873642, + "learning_rate": 9.643520338780354e-05, + "loss": 0.0773, + "step": 112 + }, + { + "epoch": 0.15, + "grad_norm": 0.08970914781093597, + "learning_rate": 9.635563856888517e-05, + "loss": 0.0916, + "step": 113 + }, + { + "epoch": 0.15, + "grad_norm": 0.09510532021522522, + "learning_rate": 9.627522915979114e-05, + "loss": 0.0861, + "step": 114 + }, + { + "epoch": 0.15, + "grad_norm": 0.10671927779912949, + "learning_rate": 9.619397662556435e-05, + "loss": 0.087, + "step": 115 + }, + { + "epoch": 0.15, + "grad_norm": 0.10593091696500778, + "learning_rate": 9.611188244660915e-05, + "loss": 0.0815, + "step": 116 + }, + { + "epoch": 0.15, + "grad_norm": 0.14427386224269867, + "learning_rate": 9.60289481186646e-05, + "loss": 0.0847, + "step": 117 + }, + { + "epoch": 0.16, + "grad_norm": 0.0999630019068718, + "learning_rate": 9.594517515277705e-05, + "loss": 0.081, + "step": 118 + }, + { + "epoch": 0.16, + "grad_norm": 0.10400121659040451, + "learning_rate": 9.586056507527266e-05, + "loss": 0.0778, + "step": 119 + }, + { + "epoch": 0.16, + "grad_norm": 0.09455764293670654, + "learning_rate": 9.577511942772957e-05, + "loss": 0.0838, + "step": 120 + }, + { + "epoch": 0.16, + "grad_norm": 0.10111963003873825, + "learning_rate": 9.568883976694989e-05, + "loss": 0.0807, + "step": 121 + }, + { + "epoch": 0.16, + "grad_norm": 0.10892399400472641, + "learning_rate": 9.560172766493122e-05, + "loss": 0.0731, + "step": 122 + }, + { + "epoch": 0.16, + "grad_norm": 0.1193283423781395, + "learning_rate": 9.551378470883812e-05, + "loss": 0.0787, + "step": 123 + }, + { + "epoch": 0.16, + "grad_norm": 0.11868227273225784, + "learning_rate": 9.542501250097309e-05, + "loss": 0.0794, + "step": 124 + }, + { + "epoch": 0.16, + "grad_norm": 0.11394625902175903, + "learning_rate": 9.533541265874749e-05, + "loss": 0.0806, + "step": 125 + }, + { + "epoch": 0.17, + "grad_norm": 0.11192644387483597, + "learning_rate": 9.524498681465191e-05, + "loss": 0.084, + "step": 126 + }, + { + "epoch": 0.17, + "grad_norm": 0.1109413430094719, + "learning_rate": 9.515373661622664e-05, + "loss": 0.0783, + "step": 127 + }, + { + "epoch": 0.17, + "grad_norm": 0.12146401405334473, + "learning_rate": 9.506166372603144e-05, + "loss": 0.0799, + "step": 128 + }, + { + "epoch": 0.17, + "grad_norm": 0.09204913675785065, + "learning_rate": 9.496876982161542e-05, + "loss": 0.0795, + "step": 129 + }, + { + "epoch": 0.17, + "grad_norm": 0.13367091119289398, + "learning_rate": 9.487505659548633e-05, + "loss": 0.0865, + "step": 130 + }, + { + "epoch": 0.17, + "grad_norm": 0.11176060140132904, + "learning_rate": 9.478052575507982e-05, + "loss": 0.0832, + "step": 131 + }, + { + "epoch": 0.17, + "grad_norm": 0.0970437228679657, + "learning_rate": 9.468517902272836e-05, + "loss": 0.0853, + "step": 132 + }, + { + "epoch": 0.18, + "grad_norm": 0.10132851451635361, + "learning_rate": 9.45890181356297e-05, + "loss": 0.0906, + "step": 133 + }, + { + "epoch": 0.18, + "grad_norm": 0.12170128524303436, + "learning_rate": 9.449204484581539e-05, + "loss": 0.0869, + "step": 134 + }, + { + "epoch": 0.18, + "grad_norm": 0.10223282128572464, + "learning_rate": 9.439426092011875e-05, + "loss": 0.0772, + "step": 135 + }, + { + "epoch": 0.18, + "grad_norm": 0.11369918286800385, + "learning_rate": 9.429566814014281e-05, + "loss": 0.0819, + "step": 136 + }, + { + "epoch": 0.18, + "grad_norm": 0.12759996950626373, + "learning_rate": 9.419626830222762e-05, + "loss": 0.0837, + "step": 137 + }, + { + "epoch": 0.18, + "grad_norm": 0.11163049191236496, + "learning_rate": 9.409606321741775e-05, + "loss": 0.0868, + "step": 138 + }, + { + "epoch": 0.18, + "grad_norm": 0.10981953144073486, + "learning_rate": 9.39950547114292e-05, + "loss": 0.079, + "step": 139 + }, + { + "epoch": 0.18, + "grad_norm": 0.09947255253791809, + "learning_rate": 9.38932446246161e-05, + "loss": 0.0803, + "step": 140 + }, + { + "epoch": 0.19, + "grad_norm": 0.10793828219175339, + "learning_rate": 9.379063481193725e-05, + "loss": 0.0888, + "step": 141 + }, + { + "epoch": 0.19, + "grad_norm": 0.10283903777599335, + "learning_rate": 9.368722714292227e-05, + "loss": 0.082, + "step": 142 + }, + { + "epoch": 0.19, + "grad_norm": 0.11633747816085815, + "learning_rate": 9.358302350163757e-05, + "loss": 0.0737, + "step": 143 + }, + { + "epoch": 0.19, + "grad_norm": 0.11395063251256943, + "learning_rate": 9.347802578665199e-05, + "loss": 0.0781, + "step": 144 + }, + { + "epoch": 0.19, + "grad_norm": 0.11062236875295639, + "learning_rate": 9.337223591100228e-05, + "loss": 0.0848, + "step": 145 + }, + { + "epoch": 0.19, + "grad_norm": 0.1253620684146881, + "learning_rate": 9.326565580215811e-05, + "loss": 0.0907, + "step": 146 + }, + { + "epoch": 0.19, + "grad_norm": 0.1102418303489685, + "learning_rate": 9.315828740198714e-05, + "loss": 0.0814, + "step": 147 + }, + { + "epoch": 0.19, + "grad_norm": 0.11026754230260849, + "learning_rate": 9.305013266671945e-05, + "loss": 0.0759, + "step": 148 + }, + { + "epoch": 0.2, + "grad_norm": 0.12312046438455582, + "learning_rate": 9.2941193566912e-05, + "loss": 0.0853, + "step": 149 + }, + { + "epoch": 0.2, + "grad_norm": 0.1312018185853958, + "learning_rate": 9.283147208741276e-05, + "loss": 0.0862, + "step": 150 + }, + { + "epoch": 0.2, + "grad_norm": 0.14230753481388092, + "learning_rate": 9.272097022732443e-05, + "loss": 0.0913, + "step": 151 + }, + { + "epoch": 0.2, + "grad_norm": 0.1690019816160202, + "learning_rate": 9.260968999996813e-05, + "loss": 0.0892, + "step": 152 + }, + { + "epoch": 0.2, + "grad_norm": 0.08823404461145401, + "learning_rate": 9.249763343284665e-05, + "loss": 0.0788, + "step": 153 + }, + { + "epoch": 0.2, + "grad_norm": 0.09784504026174545, + "learning_rate": 9.238480256760755e-05, + "loss": 0.0799, + "step": 154 + }, + { + "epoch": 0.2, + "grad_norm": 0.11563264578580856, + "learning_rate": 9.22711994600059e-05, + "loss": 0.0839, + "step": 155 + }, + { + "epoch": 0.21, + "grad_norm": 0.09026098251342773, + "learning_rate": 9.21568261798669e-05, + "loss": 0.0812, + "step": 156 + }, + { + "epoch": 0.21, + "grad_norm": 0.09114386886358261, + "learning_rate": 9.204168481104815e-05, + "loss": 0.0818, + "step": 157 + }, + { + "epoch": 0.21, + "grad_norm": 0.09261880069971085, + "learning_rate": 9.192577745140164e-05, + "loss": 0.0775, + "step": 158 + }, + { + "epoch": 0.21, + "grad_norm": 0.11318907141685486, + "learning_rate": 9.180910621273555e-05, + "loss": 0.0804, + "step": 159 + }, + { + "epoch": 0.21, + "grad_norm": 0.1209554672241211, + "learning_rate": 9.169167322077578e-05, + "loss": 0.0854, + "step": 160 + }, + { + "epoch": 0.21, + "grad_norm": 0.12757565081119537, + "learning_rate": 9.157348061512727e-05, + "loss": 0.0886, + "step": 161 + }, + { + "epoch": 0.21, + "grad_norm": 0.11294035613536835, + "learning_rate": 9.145453054923487e-05, + "loss": 0.0941, + "step": 162 + }, + { + "epoch": 0.21, + "grad_norm": 0.11256886273622513, + "learning_rate": 9.133482519034428e-05, + "loss": 0.0771, + "step": 163 + }, + { + "epoch": 0.22, + "grad_norm": 0.11136170476675034, + "learning_rate": 9.121436671946247e-05, + "loss": 0.0873, + "step": 164 + }, + { + "epoch": 0.22, + "grad_norm": 0.12420880794525146, + "learning_rate": 9.109315733131791e-05, + "loss": 0.0808, + "step": 165 + }, + { + "epoch": 0.22, + "grad_norm": 0.11222650110721588, + "learning_rate": 9.097119923432066e-05, + "loss": 0.0855, + "step": 166 + }, + { + "epoch": 0.22, + "grad_norm": 0.09688053280115128, + "learning_rate": 9.08484946505221e-05, + "loss": 0.0795, + "step": 167 + }, + { + "epoch": 0.22, + "grad_norm": 0.10351915657520294, + "learning_rate": 9.072504581557446e-05, + "loss": 0.0864, + "step": 168 + }, + { + "epoch": 0.22, + "grad_norm": 0.10438606888055801, + "learning_rate": 9.060085497869004e-05, + "loss": 0.0805, + "step": 169 + }, + { + "epoch": 0.22, + "grad_norm": 0.11572164297103882, + "learning_rate": 9.047592440260029e-05, + "loss": 0.0786, + "step": 170 + }, + { + "epoch": 0.23, + "grad_norm": 0.10311150550842285, + "learning_rate": 9.035025636351452e-05, + "loss": 0.0813, + "step": 171 + }, + { + "epoch": 0.23, + "grad_norm": 0.0920066237449646, + "learning_rate": 9.022385315107853e-05, + "loss": 0.0836, + "step": 172 + }, + { + "epoch": 0.23, + "grad_norm": 0.12571555376052856, + "learning_rate": 9.009671706833276e-05, + "loss": 0.0908, + "step": 173 + }, + { + "epoch": 0.23, + "grad_norm": 0.12330035865306854, + "learning_rate": 8.99688504316704e-05, + "loss": 0.0832, + "step": 174 + }, + { + "epoch": 0.23, + "grad_norm": 0.11306159198284149, + "learning_rate": 8.984025557079523e-05, + "loss": 0.0804, + "step": 175 + }, + { + "epoch": 0.23, + "grad_norm": 0.0854019969701767, + "learning_rate": 8.97109348286791e-05, + "loss": 0.0878, + "step": 176 + }, + { + "epoch": 0.23, + "grad_norm": 0.09134180843830109, + "learning_rate": 8.958089056151924e-05, + "loss": 0.086, + "step": 177 + }, + { + "epoch": 0.23, + "grad_norm": 0.1045374944806099, + "learning_rate": 8.945012513869542e-05, + "loss": 0.0812, + "step": 178 + }, + { + "epoch": 0.24, + "grad_norm": 0.10817347466945648, + "learning_rate": 8.931864094272663e-05, + "loss": 0.0733, + "step": 179 + }, + { + "epoch": 0.24, + "grad_norm": 0.11727076768875122, + "learning_rate": 8.918644036922783e-05, + "loss": 0.0791, + "step": 180 + }, + { + "epoch": 0.24, + "grad_norm": 0.09995684027671814, + "learning_rate": 8.905352582686622e-05, + "loss": 0.0784, + "step": 181 + }, + { + "epoch": 0.24, + "grad_norm": 0.10814005136489868, + "learning_rate": 8.891989973731735e-05, + "loss": 0.0801, + "step": 182 + }, + { + "epoch": 0.24, + "grad_norm": 0.11438959091901779, + "learning_rate": 8.8785564535221e-05, + "loss": 0.0825, + "step": 183 + }, + { + "epoch": 0.24, + "grad_norm": 0.09477780759334564, + "learning_rate": 8.865052266813685e-05, + "loss": 0.0781, + "step": 184 + }, + { + "epoch": 0.24, + "grad_norm": 0.12783315777778625, + "learning_rate": 8.851477659649989e-05, + "loss": 0.0852, + "step": 185 + }, + { + "epoch": 0.25, + "grad_norm": 0.13230524957180023, + "learning_rate": 8.837832879357555e-05, + "loss": 0.0855, + "step": 186 + }, + { + "epoch": 0.25, + "grad_norm": 0.11520255357027054, + "learning_rate": 8.824118174541464e-05, + "loss": 0.0799, + "step": 187 + }, + { + "epoch": 0.25, + "grad_norm": 0.11846806108951569, + "learning_rate": 8.810333795080812e-05, + "loss": 0.0837, + "step": 188 + }, + { + "epoch": 0.25, + "grad_norm": 0.1257033497095108, + "learning_rate": 8.79647999212415e-05, + "loss": 0.0812, + "step": 189 + }, + { + "epoch": 0.25, + "grad_norm": 0.1521902084350586, + "learning_rate": 8.782557018084908e-05, + "loss": 0.0767, + "step": 190 + }, + { + "epoch": 0.25, + "grad_norm": 0.11576004326343536, + "learning_rate": 8.768565126636806e-05, + "loss": 0.0792, + "step": 191 + }, + { + "epoch": 0.25, + "grad_norm": 0.1163419857621193, + "learning_rate": 8.754504572709219e-05, + "loss": 0.0762, + "step": 192 + }, + { + "epoch": 0.25, + "grad_norm": 0.11233483254909515, + "learning_rate": 8.740375612482541e-05, + "loss": 0.0874, + "step": 193 + }, + { + "epoch": 0.26, + "grad_norm": 0.11611688137054443, + "learning_rate": 8.726178503383513e-05, + "loss": 0.0858, + "step": 194 + }, + { + "epoch": 0.26, + "grad_norm": 0.10915149748325348, + "learning_rate": 8.711913504080534e-05, + "loss": 0.0814, + "step": 195 + }, + { + "epoch": 0.26, + "grad_norm": 0.11281640082597733, + "learning_rate": 8.697580874478952e-05, + "loss": 0.0827, + "step": 196 + }, + { + "epoch": 0.26, + "grad_norm": 0.08252295106649399, + "learning_rate": 8.683180875716322e-05, + "loss": 0.0791, + "step": 197 + }, + { + "epoch": 0.26, + "grad_norm": 0.09433229267597198, + "learning_rate": 8.668713770157652e-05, + "loss": 0.0769, + "step": 198 + }, + { + "epoch": 0.26, + "grad_norm": 0.11094626039266586, + "learning_rate": 8.654179821390621e-05, + "loss": 0.0786, + "step": 199 + }, + { + "epoch": 0.26, + "grad_norm": 0.11672339588403702, + "learning_rate": 8.639579294220779e-05, + "loss": 0.0744, + "step": 200 + }, + { + "epoch": 0.26, + "grad_norm": 0.0986831933259964, + "learning_rate": 8.624912454666723e-05, + "loss": 0.0759, + "step": 201 + }, + { + "epoch": 0.27, + "grad_norm": 0.13550534844398499, + "learning_rate": 8.610179569955238e-05, + "loss": 0.0746, + "step": 202 + }, + { + "epoch": 0.27, + "grad_norm": 0.0924728587269783, + "learning_rate": 8.595380908516454e-05, + "loss": 0.0794, + "step": 203 + }, + { + "epoch": 0.27, + "grad_norm": 0.09090446680784225, + "learning_rate": 8.580516739978926e-05, + "loss": 0.0822, + "step": 204 + }, + { + "epoch": 0.27, + "grad_norm": 0.1201312392950058, + "learning_rate": 8.56558733516474e-05, + "loss": 0.0856, + "step": 205 + }, + { + "epoch": 0.27, + "grad_norm": 0.10289566963911057, + "learning_rate": 8.550592966084573e-05, + "loss": 0.0798, + "step": 206 + }, + { + "epoch": 0.27, + "grad_norm": 0.1051202043890953, + "learning_rate": 8.535533905932738e-05, + "loss": 0.0688, + "step": 207 + }, + { + "epoch": 0.27, + "grad_norm": 0.09235143661499023, + "learning_rate": 8.520410429082206e-05, + "loss": 0.0793, + "step": 208 + }, + { + "epoch": 0.28, + "grad_norm": 0.11254550516605377, + "learning_rate": 8.505222811079608e-05, + "loss": 0.079, + "step": 209 + }, + { + "epoch": 0.28, + "grad_norm": 0.11825918406248093, + "learning_rate": 8.489971328640207e-05, + "loss": 0.0744, + "step": 210 + }, + { + "epoch": 0.28, + "grad_norm": 0.13841897249221802, + "learning_rate": 8.474656259642873e-05, + "loss": 0.089, + "step": 211 + }, + { + "epoch": 0.28, + "grad_norm": 0.109561987221241, + "learning_rate": 8.459277883125004e-05, + "loss": 0.0777, + "step": 212 + }, + { + "epoch": 0.28, + "grad_norm": 0.16192643344402313, + "learning_rate": 8.443836479277449e-05, + "loss": 0.0894, + "step": 213 + }, + { + "epoch": 0.28, + "grad_norm": 0.10433942824602127, + "learning_rate": 8.428332329439399e-05, + "loss": 0.079, + "step": 214 + }, + { + "epoch": 0.28, + "grad_norm": 0.10524985939264297, + "learning_rate": 8.412765716093272e-05, + "loss": 0.0675, + "step": 215 + }, + { + "epoch": 0.28, + "grad_norm": 0.08769605308771133, + "learning_rate": 8.397136922859548e-05, + "loss": 0.0823, + "step": 216 + }, + { + "epoch": 0.29, + "grad_norm": 0.10349096357822418, + "learning_rate": 8.381446234491618e-05, + "loss": 0.0847, + "step": 217 + }, + { + "epoch": 0.29, + "grad_norm": 0.11507713049650192, + "learning_rate": 8.365693936870594e-05, + "loss": 0.0777, + "step": 218 + }, + { + "epoch": 0.29, + "grad_norm": 0.120513916015625, + "learning_rate": 8.349880317000082e-05, + "loss": 0.0887, + "step": 219 + }, + { + "epoch": 0.29, + "grad_norm": 0.09667017310857773, + "learning_rate": 8.334005663000982e-05, + "loss": 0.0702, + "step": 220 + }, + { + "epoch": 0.29, + "grad_norm": 0.09578403830528259, + "learning_rate": 8.318070264106213e-05, + "loss": 0.0796, + "step": 221 + }, + { + "epoch": 0.29, + "grad_norm": 0.08959231525659561, + "learning_rate": 8.302074410655455e-05, + "loss": 0.0857, + "step": 222 + }, + { + "epoch": 0.29, + "grad_norm": 0.10325565189123154, + "learning_rate": 8.286018394089863e-05, + "loss": 0.0794, + "step": 223 + }, + { + "epoch": 0.3, + "grad_norm": 0.1005704402923584, + "learning_rate": 8.269902506946746e-05, + "loss": 0.0821, + "step": 224 + }, + { + "epoch": 0.3, + "grad_norm": 0.09767957776784897, + "learning_rate": 8.253727042854245e-05, + "loss": 0.0776, + "step": 225 + }, + { + "epoch": 0.3, + "grad_norm": 0.11237328499555588, + "learning_rate": 8.237492296525981e-05, + "loss": 0.0735, + "step": 226 + }, + { + "epoch": 0.3, + "grad_norm": 0.13158953189849854, + "learning_rate": 8.221198563755682e-05, + "loss": 0.0896, + "step": 227 + }, + { + "epoch": 0.3, + "grad_norm": 0.10914699733257294, + "learning_rate": 8.204846141411801e-05, + "loss": 0.0753, + "step": 228 + }, + { + "epoch": 0.3, + "grad_norm": 0.1349821537733078, + "learning_rate": 8.1884353274321e-05, + "loss": 0.0806, + "step": 229 + }, + { + "epoch": 0.3, + "grad_norm": 0.09405103325843811, + "learning_rate": 8.171966420818228e-05, + "loss": 0.083, + "step": 230 + }, + { + "epoch": 0.3, + "grad_norm": 0.09107044339179993, + "learning_rate": 8.155439721630264e-05, + "loss": 0.076, + "step": 231 + }, + { + "epoch": 0.31, + "grad_norm": 0.1389627307653427, + "learning_rate": 8.138855530981262e-05, + "loss": 0.078, + "step": 232 + }, + { + "epoch": 0.31, + "grad_norm": 0.12448082119226456, + "learning_rate": 8.122214151031753e-05, + "loss": 0.0859, + "step": 233 + }, + { + "epoch": 0.31, + "grad_norm": 0.08903878182172775, + "learning_rate": 8.10551588498425e-05, + "loss": 0.0805, + "step": 234 + }, + { + "epoch": 0.31, + "grad_norm": 0.12410246580839157, + "learning_rate": 8.088761037077718e-05, + "loss": 0.0823, + "step": 235 + }, + { + "epoch": 0.31, + "grad_norm": 0.09901540726423264, + "learning_rate": 8.071949912582029e-05, + "loss": 0.0835, + "step": 236 + }, + { + "epoch": 0.31, + "grad_norm": 0.10345863550901413, + "learning_rate": 8.055082817792403e-05, + "loss": 0.0723, + "step": 237 + }, + { + "epoch": 0.31, + "grad_norm": 0.11597669124603271, + "learning_rate": 8.038160060023834e-05, + "loss": 0.0772, + "step": 238 + }, + { + "epoch": 0.31, + "grad_norm": 0.12257598340511322, + "learning_rate": 8.021181947605473e-05, + "loss": 0.079, + "step": 239 + }, + { + "epoch": 0.32, + "grad_norm": 0.12233386933803558, + "learning_rate": 8.00414878987503e-05, + "loss": 0.084, + "step": 240 + }, + { + "epoch": 0.32, + "grad_norm": 0.09252316504716873, + "learning_rate": 7.987060897173128e-05, + "loss": 0.0812, + "step": 241 + }, + { + "epoch": 0.32, + "grad_norm": 0.1188056468963623, + "learning_rate": 7.969918580837648e-05, + "loss": 0.0797, + "step": 242 + }, + { + "epoch": 0.32, + "grad_norm": 0.10174442082643509, + "learning_rate": 7.952722153198054e-05, + "loss": 0.0736, + "step": 243 + }, + { + "epoch": 0.32, + "grad_norm": 0.10943493247032166, + "learning_rate": 7.935471927569718e-05, + "loss": 0.0831, + "step": 244 + }, + { + "epoch": 0.32, + "grad_norm": 0.10414671897888184, + "learning_rate": 7.918168218248188e-05, + "loss": 0.0842, + "step": 245 + }, + { + "epoch": 0.32, + "grad_norm": 0.11411801725625992, + "learning_rate": 7.900811340503483e-05, + "loss": 0.0782, + "step": 246 + }, + { + "epoch": 0.33, + "grad_norm": 0.10560184717178345, + "learning_rate": 7.883401610574336e-05, + "loss": 0.0774, + "step": 247 + }, + { + "epoch": 0.33, + "grad_norm": 0.13932396471500397, + "learning_rate": 7.865939345662436e-05, + "loss": 0.0871, + "step": 248 + }, + { + "epoch": 0.33, + "grad_norm": 0.10407272726297379, + "learning_rate": 7.848424863926649e-05, + "loss": 0.0754, + "step": 249 + }, + { + "epoch": 0.33, + "grad_norm": 0.09210895001888275, + "learning_rate": 7.830858484477218e-05, + "loss": 0.0744, + "step": 250 + }, + { + "epoch": 0.33, + "grad_norm": 0.1021440252661705, + "learning_rate": 7.813240527369959e-05, + "loss": 0.0852, + "step": 251 + }, + { + "epoch": 0.33, + "grad_norm": 0.11797258257865906, + "learning_rate": 7.795571313600411e-05, + "loss": 0.076, + "step": 252 + }, + { + "epoch": 0.33, + "grad_norm": 0.1269652247428894, + "learning_rate": 7.777851165098012e-05, + "loss": 0.0838, + "step": 253 + }, + { + "epoch": 0.33, + "grad_norm": 0.1156778559088707, + "learning_rate": 7.760080404720209e-05, + "loss": 0.081, + "step": 254 + }, + { + "epoch": 0.34, + "grad_norm": 0.10935825854539871, + "learning_rate": 7.742259356246593e-05, + "loss": 0.0821, + "step": 255 + }, + { + "epoch": 0.34, + "grad_norm": 0.10187061131000519, + "learning_rate": 7.724388344372995e-05, + "loss": 0.0782, + "step": 256 + }, + { + "epoch": 0.34, + "grad_norm": 0.10703828930854797, + "learning_rate": 7.706467694705561e-05, + "loss": 0.0779, + "step": 257 + }, + { + "epoch": 0.34, + "grad_norm": 0.09682856500148773, + "learning_rate": 7.688497733754835e-05, + "loss": 0.0798, + "step": 258 + }, + { + "epoch": 0.34, + "grad_norm": 0.098836250603199, + "learning_rate": 7.670478788929802e-05, + "loss": 0.0852, + "step": 259 + }, + { + "epoch": 0.34, + "grad_norm": 0.08768448978662491, + "learning_rate": 7.652411188531917e-05, + "loss": 0.0725, + "step": 260 + }, + { + "epoch": 0.34, + "grad_norm": 0.10361611843109131, + "learning_rate": 7.634295261749135e-05, + "loss": 0.0749, + "step": 261 + }, + { + "epoch": 0.35, + "grad_norm": 0.08791149407625198, + "learning_rate": 7.616131338649907e-05, + "loss": 0.075, + "step": 262 + }, + { + "epoch": 0.35, + "grad_norm": 0.08269870281219482, + "learning_rate": 7.597919750177168e-05, + "loss": 0.0706, + "step": 263 + }, + { + "epoch": 0.35, + "grad_norm": 0.09004026651382446, + "learning_rate": 7.579660828142301e-05, + "loss": 0.083, + "step": 264 + }, + { + "epoch": 0.35, + "grad_norm": 0.12535926699638367, + "learning_rate": 7.561354905219102e-05, + "loss": 0.0891, + "step": 265 + }, + { + "epoch": 0.35, + "grad_norm": 0.10626240819692612, + "learning_rate": 7.543002314937712e-05, + "loss": 0.0793, + "step": 266 + }, + { + "epoch": 0.35, + "grad_norm": 0.10296738147735596, + "learning_rate": 7.524603391678541e-05, + "loss": 0.0873, + "step": 267 + }, + { + "epoch": 0.35, + "grad_norm": 0.12124089896678925, + "learning_rate": 7.506158470666174e-05, + "loss": 0.0764, + "step": 268 + }, + { + "epoch": 0.35, + "grad_norm": 0.09938234090805054, + "learning_rate": 7.487667887963273e-05, + "loss": 0.066, + "step": 269 + }, + { + "epoch": 0.36, + "grad_norm": 0.1229834258556366, + "learning_rate": 7.469131980464439e-05, + "loss": 0.0764, + "step": 270 + }, + { + "epoch": 0.36, + "grad_norm": 0.12650369107723236, + "learning_rate": 7.450551085890087e-05, + "loss": 0.0837, + "step": 271 + }, + { + "epoch": 0.36, + "grad_norm": 0.12816506624221802, + "learning_rate": 7.431925542780281e-05, + "loss": 0.0804, + "step": 272 + }, + { + "epoch": 0.36, + "grad_norm": 0.11941610276699066, + "learning_rate": 7.413255690488577e-05, + "loss": 0.087, + "step": 273 + }, + { + "epoch": 0.36, + "grad_norm": 0.11334256082773209, + "learning_rate": 7.394541869175834e-05, + "loss": 0.0742, + "step": 274 + }, + { + "epoch": 0.36, + "grad_norm": 0.14086034893989563, + "learning_rate": 7.375784419804019e-05, + "loss": 0.0859, + "step": 275 + }, + { + "epoch": 0.36, + "grad_norm": 0.09905703365802765, + "learning_rate": 7.35698368412999e-05, + "loss": 0.0745, + "step": 276 + }, + { + "epoch": 0.36, + "grad_norm": 0.10592259466648102, + "learning_rate": 7.338140004699272e-05, + "loss": 0.0762, + "step": 277 + }, + { + "epoch": 0.37, + "grad_norm": 0.08352551609277725, + "learning_rate": 7.319253724839821e-05, + "loss": 0.0711, + "step": 278 + }, + { + "epoch": 0.37, + "grad_norm": 0.09426555037498474, + "learning_rate": 7.300325188655761e-05, + "loss": 0.0623, + "step": 279 + }, + { + "epoch": 0.37, + "grad_norm": 0.1022004559636116, + "learning_rate": 7.281354741021118e-05, + "loss": 0.0815, + "step": 280 + }, + { + "epoch": 0.37, + "grad_norm": 0.11823340505361557, + "learning_rate": 7.262342727573535e-05, + "loss": 0.0753, + "step": 281 + }, + { + "epoch": 0.37, + "grad_norm": 0.10613666474819183, + "learning_rate": 7.24328949470798e-05, + "loss": 0.0805, + "step": 282 + }, + { + "epoch": 0.37, + "grad_norm": 0.08761774003505707, + "learning_rate": 7.224195389570422e-05, + "loss": 0.0763, + "step": 283 + }, + { + "epoch": 0.37, + "grad_norm": 0.09059295803308487, + "learning_rate": 7.205060760051522e-05, + "loss": 0.0736, + "step": 284 + }, + { + "epoch": 0.38, + "grad_norm": 0.09126615524291992, + "learning_rate": 7.185885954780282e-05, + "loss": 0.0745, + "step": 285 + }, + { + "epoch": 0.38, + "grad_norm": 0.09419333934783936, + "learning_rate": 7.166671323117703e-05, + "loss": 0.0706, + "step": 286 + }, + { + "epoch": 0.38, + "grad_norm": 0.10592428594827652, + "learning_rate": 7.14741721515041e-05, + "loss": 0.0804, + "step": 287 + }, + { + "epoch": 0.38, + "grad_norm": 0.10718122124671936, + "learning_rate": 7.128123981684279e-05, + "loss": 0.076, + "step": 288 + }, + { + "epoch": 0.38, + "grad_norm": 0.09602747857570648, + "learning_rate": 7.108791974238047e-05, + "loss": 0.0722, + "step": 289 + }, + { + "epoch": 0.38, + "grad_norm": 0.10073014348745346, + "learning_rate": 7.0894215450369e-05, + "loss": 0.0704, + "step": 290 + }, + { + "epoch": 0.38, + "grad_norm": 0.10062811523675919, + "learning_rate": 7.070013047006068e-05, + "loss": 0.074, + "step": 291 + }, + { + "epoch": 0.38, + "grad_norm": 0.10420974344015121, + "learning_rate": 7.05056683376438e-05, + "loss": 0.0811, + "step": 292 + }, + { + "epoch": 0.39, + "grad_norm": 0.10123053938150406, + "learning_rate": 7.031083259617832e-05, + "loss": 0.0737, + "step": 293 + }, + { + "epoch": 0.39, + "grad_norm": 0.1051117554306984, + "learning_rate": 7.011562679553126e-05, + "loss": 0.0795, + "step": 294 + }, + { + "epoch": 0.39, + "grad_norm": 0.10867037624120712, + "learning_rate": 6.992005449231208e-05, + "loss": 0.0779, + "step": 295 + }, + { + "epoch": 0.39, + "grad_norm": 0.10807811468839645, + "learning_rate": 6.972411924980778e-05, + "loss": 0.0796, + "step": 296 + }, + { + "epoch": 0.39, + "grad_norm": 0.11884191632270813, + "learning_rate": 6.952782463791813e-05, + "loss": 0.0843, + "step": 297 + }, + { + "epoch": 0.39, + "grad_norm": 0.1229209154844284, + "learning_rate": 6.933117423309042e-05, + "loss": 0.0839, + "step": 298 + }, + { + "epoch": 0.39, + "grad_norm": 0.11353997886180878, + "learning_rate": 6.91341716182545e-05, + "loss": 0.0703, + "step": 299 + }, + { + "epoch": 0.4, + "grad_norm": 0.10744056850671768, + "learning_rate": 6.893682038275738e-05, + "loss": 0.0769, + "step": 300 + }, + { + "epoch": 0.4, + "grad_norm": 0.1081230416893959, + "learning_rate": 6.873912412229788e-05, + "loss": 0.0848, + "step": 301 + }, + { + "epoch": 0.4, + "grad_norm": 0.11869249492883682, + "learning_rate": 6.854108643886113e-05, + "loss": 0.0857, + "step": 302 + }, + { + "epoch": 0.4, + "grad_norm": 0.10422538965940475, + "learning_rate": 6.834271094065283e-05, + "loss": 0.0773, + "step": 303 + }, + { + "epoch": 0.4, + "grad_norm": 0.09786193817853928, + "learning_rate": 6.814400124203369e-05, + "loss": 0.0701, + "step": 304 + }, + { + "epoch": 0.4, + "grad_norm": 0.10703827440738678, + "learning_rate": 6.794496096345341e-05, + "loss": 0.0838, + "step": 305 + }, + { + "epoch": 0.4, + "grad_norm": 0.10062588006258011, + "learning_rate": 6.774559373138484e-05, + "loss": 0.0772, + "step": 306 + }, + { + "epoch": 0.4, + "grad_norm": 0.09322860091924667, + "learning_rate": 6.754590317825785e-05, + "loss": 0.0706, + "step": 307 + }, + { + "epoch": 0.41, + "grad_norm": 0.11063066124916077, + "learning_rate": 6.734589294239311e-05, + "loss": 0.0791, + "step": 308 + }, + { + "epoch": 0.41, + "grad_norm": 0.10088267177343369, + "learning_rate": 6.71455666679359e-05, + "loss": 0.0751, + "step": 309 + }, + { + "epoch": 0.41, + "grad_norm": 0.10203511267900467, + "learning_rate": 6.69449280047896e-05, + "loss": 0.0763, + "step": 310 + }, + { + "epoch": 0.41, + "grad_norm": 0.09792084246873856, + "learning_rate": 6.674398060854931e-05, + "loss": 0.0785, + "step": 311 + }, + { + "epoch": 0.41, + "grad_norm": 0.09468895196914673, + "learning_rate": 6.654272814043514e-05, + "loss": 0.0788, + "step": 312 + }, + { + "epoch": 0.41, + "grad_norm": 0.09165459871292114, + "learning_rate": 6.634117426722556e-05, + "loss": 0.0767, + "step": 313 + }, + { + "epoch": 0.41, + "grad_norm": 0.10596252977848053, + "learning_rate": 6.613932266119056e-05, + "loss": 0.0796, + "step": 314 + }, + { + "epoch": 0.42, + "grad_norm": 0.1063828319311142, + "learning_rate": 6.59371770000248e-05, + "loss": 0.0755, + "step": 315 + }, + { + "epoch": 0.42, + "grad_norm": 0.11581625789403915, + "learning_rate": 6.573474096678052e-05, + "loss": 0.0732, + "step": 316 + }, + { + "epoch": 0.42, + "grad_norm": 0.11985045671463013, + "learning_rate": 6.553201824980054e-05, + "loss": 0.0781, + "step": 317 + }, + { + "epoch": 0.42, + "grad_norm": 0.12919990718364716, + "learning_rate": 6.532901254265093e-05, + "loss": 0.0855, + "step": 318 + }, + { + "epoch": 0.42, + "grad_norm": 0.14869804680347443, + "learning_rate": 6.51257275440538e-05, + "loss": 0.0772, + "step": 319 + }, + { + "epoch": 0.42, + "grad_norm": 0.10909485816955566, + "learning_rate": 6.492216695781992e-05, + "loss": 0.0676, + "step": 320 + }, + { + "epoch": 0.42, + "grad_norm": 0.14061373472213745, + "learning_rate": 6.47183344927812e-05, + "loss": 0.0804, + "step": 321 + }, + { + "epoch": 0.42, + "grad_norm": 0.12443160265684128, + "learning_rate": 6.451423386272312e-05, + "loss": 0.0708, + "step": 322 + }, + { + "epoch": 0.43, + "grad_norm": 0.1312866508960724, + "learning_rate": 6.430986878631707e-05, + "loss": 0.0776, + "step": 323 + }, + { + "epoch": 0.43, + "grad_norm": 0.11382008343935013, + "learning_rate": 6.41052429870526e-05, + "loss": 0.0709, + "step": 324 + }, + { + "epoch": 0.43, + "grad_norm": 0.11452754586935043, + "learning_rate": 6.390036019316956e-05, + "loss": 0.0797, + "step": 325 + }, + { + "epoch": 0.43, + "grad_norm": 0.09669509530067444, + "learning_rate": 6.369522413759022e-05, + "loss": 0.0661, + "step": 326 + }, + { + "epoch": 0.43, + "grad_norm": 0.09662206470966339, + "learning_rate": 6.348983855785121e-05, + "loss": 0.0709, + "step": 327 + }, + { + "epoch": 0.43, + "grad_norm": 0.10699667036533356, + "learning_rate": 6.328420719603546e-05, + "loss": 0.0809, + "step": 328 + }, + { + "epoch": 0.43, + "grad_norm": 0.1173032596707344, + "learning_rate": 6.307833379870394e-05, + "loss": 0.0792, + "step": 329 + }, + { + "epoch": 0.43, + "grad_norm": 0.1062864363193512, + "learning_rate": 6.287222211682751e-05, + "loss": 0.0801, + "step": 330 + }, + { + "epoch": 0.44, + "grad_norm": 0.10499227792024612, + "learning_rate": 6.266587590571852e-05, + "loss": 0.076, + "step": 331 + }, + { + "epoch": 0.44, + "grad_norm": 0.08642653375864029, + "learning_rate": 6.245929892496239e-05, + "loss": 0.0778, + "step": 332 + }, + { + "epoch": 0.44, + "grad_norm": 0.09800557792186737, + "learning_rate": 6.225249493834911e-05, + "loss": 0.0802, + "step": 333 + }, + { + "epoch": 0.44, + "grad_norm": 0.11565027385950089, + "learning_rate": 6.204546771380462e-05, + "loss": 0.0788, + "step": 334 + }, + { + "epoch": 0.44, + "grad_norm": 0.09418751299381256, + "learning_rate": 6.183822102332234e-05, + "loss": 0.0722, + "step": 335 + }, + { + "epoch": 0.44, + "grad_norm": 0.11058859527111053, + "learning_rate": 6.16307586428942e-05, + "loss": 0.0788, + "step": 336 + }, + { + "epoch": 0.44, + "grad_norm": 0.10711957514286041, + "learning_rate": 6.142308435244195e-05, + "loss": 0.074, + "step": 337 + }, + { + "epoch": 0.45, + "grad_norm": 0.1091398298740387, + "learning_rate": 6.121520193574841e-05, + "loss": 0.0718, + "step": 338 + }, + { + "epoch": 0.45, + "grad_norm": 0.11050156503915787, + "learning_rate": 6.1007115180388285e-05, + "loss": 0.0827, + "step": 339 + }, + { + "epoch": 0.45, + "grad_norm": 0.10380526632070541, + "learning_rate": 6.079882787765938e-05, + "loss": 0.0808, + "step": 340 + }, + { + "epoch": 0.45, + "grad_norm": 0.10426554083824158, + "learning_rate": 6.059034382251338e-05, + "loss": 0.0693, + "step": 341 + }, + { + "epoch": 0.45, + "grad_norm": 0.1098361536860466, + "learning_rate": 6.0381666813486795e-05, + "loss": 0.0741, + "step": 342 + }, + { + "epoch": 0.45, + "grad_norm": 0.1143401488661766, + "learning_rate": 6.01728006526317e-05, + "loss": 0.0738, + "step": 343 + }, + { + "epoch": 0.45, + "grad_norm": 0.104662224650383, + "learning_rate": 5.9963749145446444e-05, + "loss": 0.0764, + "step": 344 + }, + { + "epoch": 0.45, + "grad_norm": 0.10660408437252045, + "learning_rate": 5.9754516100806423e-05, + "loss": 0.0762, + "step": 345 + }, + { + "epoch": 0.46, + "grad_norm": 0.1031242236495018, + "learning_rate": 5.9545105330894526e-05, + "loss": 0.0793, + "step": 346 + }, + { + "epoch": 0.46, + "grad_norm": 0.11632685363292694, + "learning_rate": 5.9335520651131814e-05, + "loss": 0.0683, + "step": 347 + }, + { + "epoch": 0.46, + "grad_norm": 0.11623501777648926, + "learning_rate": 5.912576588010795e-05, + "loss": 0.0699, + "step": 348 + }, + { + "epoch": 0.46, + "grad_norm": 0.12420337647199631, + "learning_rate": 5.891584483951157e-05, + "loss": 0.0801, + "step": 349 + }, + { + "epoch": 0.46, + "grad_norm": 0.11674680560827255, + "learning_rate": 5.870576135406077e-05, + "loss": 0.0755, + "step": 350 + }, + { + "epoch": 0.46, + "grad_norm": 0.12480311095714569, + "learning_rate": 5.849551925143334e-05, + "loss": 0.0642, + "step": 351 + }, + { + "epoch": 0.46, + "grad_norm": 0.12658356130123138, + "learning_rate": 5.828512236219701e-05, + "loss": 0.0763, + "step": 352 + }, + { + "epoch": 0.47, + "grad_norm": 0.10800620913505554, + "learning_rate": 5.807457451973975e-05, + "loss": 0.0688, + "step": 353 + }, + { + "epoch": 0.47, + "grad_norm": 0.12109076231718063, + "learning_rate": 5.78638795601998e-05, + "loss": 0.0744, + "step": 354 + }, + { + "epoch": 0.47, + "grad_norm": 0.1361728012561798, + "learning_rate": 5.7653041322395895e-05, + "loss": 0.0715, + "step": 355 + }, + { + "epoch": 0.47, + "grad_norm": 0.11848083138465881, + "learning_rate": 5.744206364775724e-05, + "loss": 0.0803, + "step": 356 + }, + { + "epoch": 0.47, + "grad_norm": 0.12507352232933044, + "learning_rate": 5.723095038025356e-05, + "loss": 0.072, + "step": 357 + }, + { + "epoch": 0.47, + "grad_norm": 0.1153586283326149, + "learning_rate": 5.701970536632507e-05, + "loss": 0.071, + "step": 358 + }, + { + "epoch": 0.47, + "grad_norm": 0.13595929741859436, + "learning_rate": 5.680833245481234e-05, + "loss": 0.0762, + "step": 359 + }, + { + "epoch": 0.47, + "grad_norm": 0.1186838150024414, + "learning_rate": 5.659683549688623e-05, + "loss": 0.0746, + "step": 360 + }, + { + "epoch": 0.48, + "grad_norm": 0.14524149894714355, + "learning_rate": 5.638521834597774e-05, + "loss": 0.0819, + "step": 361 + }, + { + "epoch": 0.48, + "grad_norm": 0.10339554399251938, + "learning_rate": 5.617348485770767e-05, + "loss": 0.0745, + "step": 362 + }, + { + "epoch": 0.48, + "grad_norm": 0.12478932738304138, + "learning_rate": 5.596163888981656e-05, + "loss": 0.0847, + "step": 363 + }, + { + "epoch": 0.48, + "grad_norm": 0.0925443097949028, + "learning_rate": 5.574968430209423e-05, + "loss": 0.0697, + "step": 364 + }, + { + "epoch": 0.48, + "grad_norm": 0.11409781873226166, + "learning_rate": 5.553762495630956e-05, + "loss": 0.0693, + "step": 365 + }, + { + "epoch": 0.48, + "grad_norm": 0.08425464481115341, + "learning_rate": 5.532546471614012e-05, + "loss": 0.076, + "step": 366 + }, + { + "epoch": 0.48, + "grad_norm": 0.09573796391487122, + "learning_rate": 5.511320744710171e-05, + "loss": 0.0663, + "step": 367 + }, + { + "epoch": 0.48, + "grad_norm": 0.10230566561222076, + "learning_rate": 5.490085701647805e-05, + "loss": 0.0789, + "step": 368 + }, + { + "epoch": 0.49, + "grad_norm": 0.10512962937355042, + "learning_rate": 5.468841729325014e-05, + "loss": 0.0795, + "step": 369 + }, + { + "epoch": 0.49, + "grad_norm": 0.09100647270679474, + "learning_rate": 5.447589214802594e-05, + "loss": 0.0724, + "step": 370 + }, + { + "epoch": 0.49, + "grad_norm": 0.08053728193044662, + "learning_rate": 5.4263285452969806e-05, + "loss": 0.0759, + "step": 371 + }, + { + "epoch": 0.49, + "grad_norm": 0.08965244889259338, + "learning_rate": 5.4050601081731845e-05, + "loss": 0.0757, + "step": 372 + }, + { + "epoch": 0.49, + "grad_norm": 0.08649550378322601, + "learning_rate": 5.383784290937747e-05, + "loss": 0.0725, + "step": 373 + }, + { + "epoch": 0.49, + "grad_norm": 0.09037112444639206, + "learning_rate": 5.36250148123167e-05, + "loss": 0.075, + "step": 374 + }, + { + "epoch": 0.49, + "grad_norm": 0.10361938178539276, + "learning_rate": 5.341212066823355e-05, + "loss": 0.0747, + "step": 375 + }, + { + "epoch": 0.5, + "grad_norm": 0.09907560050487518, + "learning_rate": 5.319916435601546e-05, + "loss": 0.0739, + "step": 376 + }, + { + "epoch": 0.5, + "grad_norm": 0.10159469395875931, + "learning_rate": 5.298614975568249e-05, + "loss": 0.0695, + "step": 377 + }, + { + "epoch": 0.5, + "grad_norm": 0.11597546190023422, + "learning_rate": 5.277308074831677e-05, + "loss": 0.0702, + "step": 378 + }, + { + "epoch": 0.5, + "grad_norm": 0.11498542129993439, + "learning_rate": 5.255996121599167e-05, + "loss": 0.0749, + "step": 379 + }, + { + "epoch": 0.5, + "grad_norm": 0.11171206086874008, + "learning_rate": 5.2346795041701077e-05, + "loss": 0.0732, + "step": 380 + }, + { + "epoch": 0.5, + "grad_norm": 0.11158174276351929, + "learning_rate": 5.213358610928878e-05, + "loss": 0.0811, + "step": 381 + }, + { + "epoch": 0.5, + "grad_norm": 0.11858642846345901, + "learning_rate": 5.192033830337754e-05, + "loss": 0.0758, + "step": 382 + }, + { + "epoch": 0.5, + "grad_norm": 0.12075002491474152, + "learning_rate": 5.1707055509298396e-05, + "loss": 0.0752, + "step": 383 + }, + { + "epoch": 0.51, + "grad_norm": 0.08938009291887283, + "learning_rate": 5.1493741613019906e-05, + "loss": 0.0653, + "step": 384 + }, + { + "epoch": 0.51, + "grad_norm": 0.09382277727127075, + "learning_rate": 5.128040050107724e-05, + "loss": 0.076, + "step": 385 + }, + { + "epoch": 0.51, + "grad_norm": 0.12058842927217484, + "learning_rate": 5.1067036060501486e-05, + "loss": 0.0753, + "step": 386 + }, + { + "epoch": 0.51, + "grad_norm": 0.10002080351114273, + "learning_rate": 5.0853652178748746e-05, + "loss": 0.0803, + "step": 387 + }, + { + "epoch": 0.51, + "grad_norm": 0.10245774686336517, + "learning_rate": 5.0640252743629326e-05, + "loss": 0.0729, + "step": 388 + }, + { + "epoch": 0.51, + "grad_norm": 0.11456061154603958, + "learning_rate": 5.042684164323698e-05, + "loss": 0.0675, + "step": 389 + }, + { + "epoch": 0.51, + "grad_norm": 0.11435601860284805, + "learning_rate": 5.021342276587787e-05, + "loss": 0.0727, + "step": 390 + }, + { + "epoch": 0.52, + "grad_norm": 0.10582521557807922, + "learning_rate": 5e-05, + "loss": 0.0694, + "step": 391 + }, + { + "epoch": 0.52, + "grad_norm": 0.10909827798604965, + "learning_rate": 4.978657723412212e-05, + "loss": 0.0724, + "step": 392 + }, + { + "epoch": 0.52, + "grad_norm": 0.12171333283185959, + "learning_rate": 4.957315835676305e-05, + "loss": 0.0817, + "step": 393 + }, + { + "epoch": 0.52, + "grad_norm": 0.10484538972377777, + "learning_rate": 4.935974725637068e-05, + "loss": 0.072, + "step": 394 + }, + { + "epoch": 0.52, + "grad_norm": 0.13289706408977509, + "learning_rate": 4.9146347821251266e-05, + "loss": 0.0682, + "step": 395 + }, + { + "epoch": 0.52, + "grad_norm": 0.11009448766708374, + "learning_rate": 4.893296393949854e-05, + "loss": 0.0785, + "step": 396 + }, + { + "epoch": 0.52, + "grad_norm": 0.13469281792640686, + "learning_rate": 4.871959949892277e-05, + "loss": 0.0715, + "step": 397 + }, + { + "epoch": 0.52, + "grad_norm": 0.11217113584280014, + "learning_rate": 4.8506258386980106e-05, + "loss": 0.0668, + "step": 398 + }, + { + "epoch": 0.53, + "grad_norm": 0.12297201156616211, + "learning_rate": 4.829294449070161e-05, + "loss": 0.0669, + "step": 399 + }, + { + "epoch": 0.53, + "grad_norm": 0.12945881485939026, + "learning_rate": 4.807966169662249e-05, + "loss": 0.0813, + "step": 400 + }, + { + "epoch": 0.53, + "grad_norm": 0.1016998440027237, + "learning_rate": 4.786641389071123e-05, + "loss": 0.072, + "step": 401 + }, + { + "epoch": 0.53, + "grad_norm": 0.13026472926139832, + "learning_rate": 4.765320495829893e-05, + "loss": 0.0798, + "step": 402 + }, + { + "epoch": 0.53, + "grad_norm": 0.12311934679746628, + "learning_rate": 4.744003878400835e-05, + "loss": 0.0777, + "step": 403 + }, + { + "epoch": 0.53, + "grad_norm": 0.12245490401983261, + "learning_rate": 4.7226919251683235e-05, + "loss": 0.0721, + "step": 404 + }, + { + "epoch": 0.53, + "grad_norm": 0.10729845613241196, + "learning_rate": 4.70138502443175e-05, + "loss": 0.0658, + "step": 405 + }, + { + "epoch": 0.53, + "grad_norm": 0.10610393434762955, + "learning_rate": 4.6800835643984564e-05, + "loss": 0.0688, + "step": 406 + }, + { + "epoch": 0.54, + "grad_norm": 0.10243082046508789, + "learning_rate": 4.658787933176646e-05, + "loss": 0.0767, + "step": 407 + }, + { + "epoch": 0.54, + "grad_norm": 0.12154947221279144, + "learning_rate": 4.637498518768331e-05, + "loss": 0.0792, + "step": 408 + }, + { + "epoch": 0.54, + "grad_norm": 0.12794026732444763, + "learning_rate": 4.616215709062253e-05, + "loss": 0.0787, + "step": 409 + }, + { + "epoch": 0.54, + "grad_norm": 0.08822939544916153, + "learning_rate": 4.5949398918268166e-05, + "loss": 0.0697, + "step": 410 + }, + { + "epoch": 0.54, + "grad_norm": 0.09519528597593307, + "learning_rate": 4.57367145470302e-05, + "loss": 0.0668, + "step": 411 + }, + { + "epoch": 0.54, + "grad_norm": 0.10057291388511658, + "learning_rate": 4.5524107851974056e-05, + "loss": 0.0707, + "step": 412 + }, + { + "epoch": 0.54, + "grad_norm": 0.09450813382863998, + "learning_rate": 4.531158270674989e-05, + "loss": 0.0809, + "step": 413 + }, + { + "epoch": 0.55, + "grad_norm": 0.08685977756977081, + "learning_rate": 4.509914298352197e-05, + "loss": 0.0754, + "step": 414 + }, + { + "epoch": 0.55, + "grad_norm": 0.09042025357484818, + "learning_rate": 4.4886792552898286e-05, + "loss": 0.0689, + "step": 415 + }, + { + "epoch": 0.55, + "grad_norm": 0.10866349190473557, + "learning_rate": 4.46745352838599e-05, + "loss": 0.0746, + "step": 416 + }, + { + "epoch": 0.55, + "grad_norm": 0.09778650850057602, + "learning_rate": 4.446237504369045e-05, + "loss": 0.0698, + "step": 417 + }, + { + "epoch": 0.55, + "grad_norm": 0.09621346741914749, + "learning_rate": 4.4250315697905777e-05, + "loss": 0.0726, + "step": 418 + }, + { + "epoch": 0.55, + "grad_norm": 0.09709804505109787, + "learning_rate": 4.403836111018346e-05, + "loss": 0.0663, + "step": 419 + }, + { + "epoch": 0.55, + "grad_norm": 0.10436049848794937, + "learning_rate": 4.382651514229234e-05, + "loss": 0.075, + "step": 420 + }, + { + "epoch": 0.55, + "grad_norm": 0.11697012186050415, + "learning_rate": 4.3614781654022276e-05, + "loss": 0.0724, + "step": 421 + }, + { + "epoch": 0.56, + "grad_norm": 0.10285527259111404, + "learning_rate": 4.340316450311376e-05, + "loss": 0.0714, + "step": 422 + }, + { + "epoch": 0.56, + "grad_norm": 0.1052049994468689, + "learning_rate": 4.319166754518768e-05, + "loss": 0.0709, + "step": 423 + }, + { + "epoch": 0.56, + "grad_norm": 0.10365312546491623, + "learning_rate": 4.2980294633674946e-05, + "loss": 0.083, + "step": 424 + }, + { + "epoch": 0.56, + "grad_norm": 0.11218774318695068, + "learning_rate": 4.2769049619746446e-05, + "loss": 0.069, + "step": 425 + }, + { + "epoch": 0.56, + "grad_norm": 0.11706428229808807, + "learning_rate": 4.2557936352242784e-05, + "loss": 0.0758, + "step": 426 + }, + { + "epoch": 0.56, + "grad_norm": 0.1439225971698761, + "learning_rate": 4.234695867760412e-05, + "loss": 0.0836, + "step": 427 + }, + { + "epoch": 0.56, + "grad_norm": 0.12518368661403656, + "learning_rate": 4.21361204398002e-05, + "loss": 0.0754, + "step": 428 + }, + { + "epoch": 0.57, + "grad_norm": 0.1182391494512558, + "learning_rate": 4.1925425480260266e-05, + "loss": 0.0768, + "step": 429 + }, + { + "epoch": 0.57, + "grad_norm": 0.10238274186849594, + "learning_rate": 4.1714877637803e-05, + "loss": 0.0655, + "step": 430 + }, + { + "epoch": 0.57, + "grad_norm": 0.09843271970748901, + "learning_rate": 4.150448074856667e-05, + "loss": 0.0665, + "step": 431 + }, + { + "epoch": 0.57, + "grad_norm": 0.10684695839881897, + "learning_rate": 4.129423864593923e-05, + "loss": 0.0771, + "step": 432 + }, + { + "epoch": 0.57, + "grad_norm": 0.11525686830282211, + "learning_rate": 4.108415516048845e-05, + "loss": 0.0716, + "step": 433 + }, + { + "epoch": 0.57, + "grad_norm": 0.11386000365018845, + "learning_rate": 4.087423411989206e-05, + "loss": 0.0698, + "step": 434 + }, + { + "epoch": 0.57, + "grad_norm": 0.12380651384592056, + "learning_rate": 4.066447934886819e-05, + "loss": 0.0723, + "step": 435 + }, + { + "epoch": 0.57, + "grad_norm": 0.1446354240179062, + "learning_rate": 4.045489466910549e-05, + "loss": 0.0731, + "step": 436 + }, + { + "epoch": 0.58, + "grad_norm": 0.10570847988128662, + "learning_rate": 4.0245483899193595e-05, + "loss": 0.0714, + "step": 437 + }, + { + "epoch": 0.58, + "grad_norm": 0.12854741513729095, + "learning_rate": 4.0036250854553554e-05, + "loss": 0.0702, + "step": 438 + }, + { + "epoch": 0.58, + "grad_norm": 0.12250388413667679, + "learning_rate": 3.982719934736832e-05, + "loss": 0.0735, + "step": 439 + }, + { + "epoch": 0.58, + "grad_norm": 0.13413843512535095, + "learning_rate": 3.9618333186513216e-05, + "loss": 0.0647, + "step": 440 + }, + { + "epoch": 0.58, + "grad_norm": 0.10849061608314514, + "learning_rate": 3.940965617748662e-05, + "loss": 0.0603, + "step": 441 + }, + { + "epoch": 0.58, + "grad_norm": 0.12001483887434006, + "learning_rate": 3.9201172122340645e-05, + "loss": 0.0749, + "step": 442 + }, + { + "epoch": 0.58, + "grad_norm": 0.11212868988513947, + "learning_rate": 3.899288481961173e-05, + "loss": 0.0718, + "step": 443 + }, + { + "epoch": 0.58, + "grad_norm": 0.1401149332523346, + "learning_rate": 3.87847980642516e-05, + "loss": 0.0736, + "step": 444 + }, + { + "epoch": 0.59, + "grad_norm": 0.12652656435966492, + "learning_rate": 3.857691564755805e-05, + "loss": 0.063, + "step": 445 + }, + { + "epoch": 0.59, + "grad_norm": 0.11927732825279236, + "learning_rate": 3.8369241357105835e-05, + "loss": 0.0686, + "step": 446 + }, + { + "epoch": 0.59, + "grad_norm": 0.1603834480047226, + "learning_rate": 3.8161778976677666e-05, + "loss": 0.0698, + "step": 447 + }, + { + "epoch": 0.59, + "grad_norm": 0.13165497779846191, + "learning_rate": 3.795453228619537e-05, + "loss": 0.0706, + "step": 448 + }, + { + "epoch": 0.59, + "grad_norm": 0.1464444249868393, + "learning_rate": 3.774750506165092e-05, + "loss": 0.0828, + "step": 449 + }, + { + "epoch": 0.59, + "grad_norm": 0.1456499546766281, + "learning_rate": 3.7540701075037626e-05, + "loss": 0.0767, + "step": 450 + }, + { + "epoch": 0.59, + "grad_norm": 0.10524341464042664, + "learning_rate": 3.733412409428148e-05, + "loss": 0.0692, + "step": 451 + }, + { + "epoch": 0.6, + "grad_norm": 0.12726537883281708, + "learning_rate": 3.712777788317251e-05, + "loss": 0.0687, + "step": 452 + }, + { + "epoch": 0.6, + "grad_norm": 0.12855078279972076, + "learning_rate": 3.692166620129608e-05, + "loss": 0.0725, + "step": 453 + }, + { + "epoch": 0.6, + "grad_norm": 0.10352146625518799, + "learning_rate": 3.671579280396455e-05, + "loss": 0.0694, + "step": 454 + }, + { + "epoch": 0.6, + "grad_norm": 0.10154904425144196, + "learning_rate": 3.651016144214878e-05, + "loss": 0.0682, + "step": 455 + }, + { + "epoch": 0.6, + "grad_norm": 0.09369635581970215, + "learning_rate": 3.63047758624098e-05, + "loss": 0.0659, + "step": 456 + }, + { + "epoch": 0.6, + "grad_norm": 0.1108255609869957, + "learning_rate": 3.6099639806830455e-05, + "loss": 0.0672, + "step": 457 + }, + { + "epoch": 0.6, + "grad_norm": 0.11742108315229416, + "learning_rate": 3.589475701294741e-05, + "loss": 0.0607, + "step": 458 + }, + { + "epoch": 0.6, + "grad_norm": 0.1176430806517601, + "learning_rate": 3.5690131213682944e-05, + "loss": 0.0854, + "step": 459 + }, + { + "epoch": 0.61, + "grad_norm": 0.1096600741147995, + "learning_rate": 3.5485766137276894e-05, + "loss": 0.0711, + "step": 460 + }, + { + "epoch": 0.61, + "grad_norm": 0.12446652352809906, + "learning_rate": 3.52816655072188e-05, + "loss": 0.0744, + "step": 461 + }, + { + "epoch": 0.61, + "grad_norm": 0.11120939254760742, + "learning_rate": 3.5077833042180094e-05, + "loss": 0.0739, + "step": 462 + }, + { + "epoch": 0.61, + "grad_norm": 0.14443209767341614, + "learning_rate": 3.487427245594622e-05, + "loss": 0.0698, + "step": 463 + }, + { + "epoch": 0.61, + "grad_norm": 0.13654518127441406, + "learning_rate": 3.467098745734909e-05, + "loss": 0.0658, + "step": 464 + }, + { + "epoch": 0.61, + "grad_norm": 0.10497821122407913, + "learning_rate": 3.446798175019948e-05, + "loss": 0.0683, + "step": 465 + }, + { + "epoch": 0.61, + "grad_norm": 0.11521188169717789, + "learning_rate": 3.426525903321949e-05, + "loss": 0.0602, + "step": 466 + }, + { + "epoch": 0.62, + "grad_norm": 0.1282206028699875, + "learning_rate": 3.406282299997521e-05, + "loss": 0.0694, + "step": 467 + }, + { + "epoch": 0.62, + "grad_norm": 0.14426809549331665, + "learning_rate": 3.386067733880944e-05, + "loss": 0.0652, + "step": 468 + }, + { + "epoch": 0.62, + "grad_norm": 0.17491450905799866, + "learning_rate": 3.3658825732774456e-05, + "loss": 0.0718, + "step": 469 + }, + { + "epoch": 0.62, + "grad_norm": 0.12897461652755737, + "learning_rate": 3.3457271859564864e-05, + "loss": 0.0716, + "step": 470 + }, + { + "epoch": 0.62, + "grad_norm": 0.12860888242721558, + "learning_rate": 3.325601939145069e-05, + "loss": 0.0792, + "step": 471 + }, + { + "epoch": 0.62, + "grad_norm": 0.14141874015331268, + "learning_rate": 3.305507199521041e-05, + "loss": 0.0739, + "step": 472 + }, + { + "epoch": 0.62, + "grad_norm": 0.13235457241535187, + "learning_rate": 3.2854433332064114e-05, + "loss": 0.0735, + "step": 473 + }, + { + "epoch": 0.62, + "grad_norm": 0.12121675908565521, + "learning_rate": 3.265410705760689e-05, + "loss": 0.0639, + "step": 474 + }, + { + "epoch": 0.63, + "grad_norm": 0.12805382907390594, + "learning_rate": 3.245409682174217e-05, + "loss": 0.0741, + "step": 475 + }, + { + "epoch": 0.63, + "grad_norm": 0.11965460330247879, + "learning_rate": 3.225440626861517e-05, + "loss": 0.0717, + "step": 476 + }, + { + "epoch": 0.63, + "grad_norm": 0.12619951367378235, + "learning_rate": 3.2055039036546596e-05, + "loss": 0.0646, + "step": 477 + }, + { + "epoch": 0.63, + "grad_norm": 0.11961790174245834, + "learning_rate": 3.1855998757966324e-05, + "loss": 0.0686, + "step": 478 + }, + { + "epoch": 0.63, + "grad_norm": 0.1319866180419922, + "learning_rate": 3.165728905934718e-05, + "loss": 0.065, + "step": 479 + }, + { + "epoch": 0.63, + "grad_norm": 0.12313670665025711, + "learning_rate": 3.1458913561138884e-05, + "loss": 0.0676, + "step": 480 + }, + { + "epoch": 0.63, + "grad_norm": 0.13332146406173706, + "learning_rate": 3.126087587770211e-05, + "loss": 0.0716, + "step": 481 + }, + { + "epoch": 0.64, + "grad_norm": 0.11296407878398895, + "learning_rate": 3.106317961724263e-05, + "loss": 0.0658, + "step": 482 + }, + { + "epoch": 0.64, + "grad_norm": 0.15619413554668427, + "learning_rate": 3.086582838174551e-05, + "loss": 0.0726, + "step": 483 + }, + { + "epoch": 0.64, + "grad_norm": 0.12882710993289948, + "learning_rate": 3.0668825766909595e-05, + "loss": 0.0691, + "step": 484 + }, + { + "epoch": 0.64, + "grad_norm": 0.12875738739967346, + "learning_rate": 3.0472175362081897e-05, + "loss": 0.0658, + "step": 485 + }, + { + "epoch": 0.64, + "grad_norm": 0.12345566600561142, + "learning_rate": 3.0275880750192227e-05, + "loss": 0.0678, + "step": 486 + }, + { + "epoch": 0.64, + "grad_norm": 0.1387387365102768, + "learning_rate": 3.007994550768793e-05, + "loss": 0.0734, + "step": 487 + }, + { + "epoch": 0.64, + "grad_norm": 0.14710651338100433, + "learning_rate": 2.9884373204468753e-05, + "loss": 0.0784, + "step": 488 + }, + { + "epoch": 0.64, + "grad_norm": 0.11002452671527863, + "learning_rate": 2.9689167403821694e-05, + "loss": 0.0646, + "step": 489 + }, + { + "epoch": 0.65, + "grad_norm": 0.12409970909357071, + "learning_rate": 2.94943316623562e-05, + "loss": 0.0677, + "step": 490 + }, + { + "epoch": 0.65, + "grad_norm": 0.1445932388305664, + "learning_rate": 2.929986952993933e-05, + "loss": 0.0614, + "step": 491 + }, + { + "epoch": 0.65, + "grad_norm": 0.11631357669830322, + "learning_rate": 2.910578454963101e-05, + "loss": 0.0692, + "step": 492 + }, + { + "epoch": 0.65, + "grad_norm": 0.1650647670030594, + "learning_rate": 2.8912080257619544e-05, + "loss": 0.0692, + "step": 493 + }, + { + "epoch": 0.65, + "grad_norm": 0.14293146133422852, + "learning_rate": 2.871876018315721e-05, + "loss": 0.068, + "step": 494 + }, + { + "epoch": 0.65, + "grad_norm": 0.12101910263299942, + "learning_rate": 2.8525827848495913e-05, + "loss": 0.0622, + "step": 495 + }, + { + "epoch": 0.65, + "grad_norm": 0.12193334102630615, + "learning_rate": 2.833328676882297e-05, + "loss": 0.0691, + "step": 496 + }, + { + "epoch": 0.65, + "grad_norm": 0.11649112403392792, + "learning_rate": 2.814114045219717e-05, + "loss": 0.0751, + "step": 497 + }, + { + "epoch": 0.66, + "grad_norm": 0.13954710960388184, + "learning_rate": 2.7949392399484792e-05, + "loss": 0.0636, + "step": 498 + }, + { + "epoch": 0.66, + "grad_norm": 0.137295663356781, + "learning_rate": 2.7758046104295797e-05, + "loss": 0.0785, + "step": 499 + }, + { + "epoch": 0.66, + "grad_norm": 0.13648007810115814, + "learning_rate": 2.7567105052920216e-05, + "loss": 0.0732, + "step": 500 + }, + { + "epoch": 0.66, + "grad_norm": 0.1289539486169815, + "learning_rate": 2.7376572724264644e-05, + "loss": 0.0754, + "step": 501 + }, + { + "epoch": 0.66, + "grad_norm": 0.1078784316778183, + "learning_rate": 2.7186452589788835e-05, + "loss": 0.0641, + "step": 502 + }, + { + "epoch": 0.66, + "grad_norm": 0.13168664276599884, + "learning_rate": 2.6996748113442394e-05, + "loss": 0.0677, + "step": 503 + }, + { + "epoch": 0.66, + "grad_norm": 0.12153467535972595, + "learning_rate": 2.680746275160179e-05, + "loss": 0.0708, + "step": 504 + }, + { + "epoch": 0.67, + "grad_norm": 0.13471120595932007, + "learning_rate": 2.6618599953007296e-05, + "loss": 0.0766, + "step": 505 + }, + { + "epoch": 0.67, + "grad_norm": 0.14139395952224731, + "learning_rate": 2.6430163158700115e-05, + "loss": 0.0715, + "step": 506 + }, + { + "epoch": 0.67, + "grad_norm": 0.1328204870223999, + "learning_rate": 2.624215580195981e-05, + "loss": 0.0723, + "step": 507 + }, + { + "epoch": 0.67, + "grad_norm": 0.11794441193342209, + "learning_rate": 2.605458130824166e-05, + "loss": 0.0671, + "step": 508 + }, + { + "epoch": 0.67, + "grad_norm": 0.20007918775081635, + "learning_rate": 2.5867443095114248e-05, + "loss": 0.076, + "step": 509 + }, + { + "epoch": 0.67, + "grad_norm": 0.12216662615537643, + "learning_rate": 2.56807445721972e-05, + "loss": 0.0656, + "step": 510 + }, + { + "epoch": 0.67, + "grad_norm": 0.12289852648973465, + "learning_rate": 2.5494489141099153e-05, + "loss": 0.0692, + "step": 511 + }, + { + "epoch": 0.67, + "grad_norm": 0.16199322044849396, + "learning_rate": 2.5308680195355616e-05, + "loss": 0.0749, + "step": 512 + }, + { + "epoch": 0.68, + "grad_norm": 0.09642549604177475, + "learning_rate": 2.5123321120367273e-05, + "loss": 0.071, + "step": 513 + }, + { + "epoch": 0.68, + "grad_norm": 0.10821285098791122, + "learning_rate": 2.4938415293338253e-05, + "loss": 0.0724, + "step": 514 + }, + { + "epoch": 0.68, + "grad_norm": 0.10590888559818268, + "learning_rate": 2.4753966083214615e-05, + "loss": 0.0695, + "step": 515 + }, + { + "epoch": 0.68, + "grad_norm": 0.09137357026338577, + "learning_rate": 2.4569976850622888e-05, + "loss": 0.0553, + "step": 516 + }, + { + "epoch": 0.68, + "grad_norm": 0.1135525181889534, + "learning_rate": 2.4386450947808974e-05, + "loss": 0.0787, + "step": 517 + }, + { + "epoch": 0.68, + "grad_norm": 0.12290289998054504, + "learning_rate": 2.4203391718576994e-05, + "loss": 0.0666, + "step": 518 + }, + { + "epoch": 0.68, + "grad_norm": 0.11229589581489563, + "learning_rate": 2.4020802498228335e-05, + "loss": 0.0669, + "step": 519 + }, + { + "epoch": 0.69, + "grad_norm": 0.1318112462759018, + "learning_rate": 2.3838686613500927e-05, + "loss": 0.0706, + "step": 520 + }, + { + "epoch": 0.69, + "grad_norm": 0.12568672001361847, + "learning_rate": 2.3657047382508664e-05, + "loss": 0.0767, + "step": 521 + }, + { + "epoch": 0.69, + "grad_norm": 0.11129189282655716, + "learning_rate": 2.3475888114680845e-05, + "loss": 0.0647, + "step": 522 + }, + { + "epoch": 0.69, + "grad_norm": 0.14194755256175995, + "learning_rate": 2.329521211070199e-05, + "loss": 0.0646, + "step": 523 + }, + { + "epoch": 0.69, + "grad_norm": 0.1168050616979599, + "learning_rate": 2.3115022662451642e-05, + "loss": 0.0718, + "step": 524 + }, + { + "epoch": 0.69, + "grad_norm": 0.109759122133255, + "learning_rate": 2.29353230529444e-05, + "loss": 0.0703, + "step": 525 + }, + { + "epoch": 0.69, + "grad_norm": 0.11648988723754883, + "learning_rate": 2.275611655627006e-05, + "loss": 0.0715, + "step": 526 + }, + { + "epoch": 0.69, + "grad_norm": 0.15601898729801178, + "learning_rate": 2.2577406437534054e-05, + "loss": 0.0734, + "step": 527 + }, + { + "epoch": 0.7, + "grad_norm": 0.13575918972492218, + "learning_rate": 2.2399195952797913e-05, + "loss": 0.067, + "step": 528 + }, + { + "epoch": 0.7, + "grad_norm": 0.16341762244701385, + "learning_rate": 2.2221488349019903e-05, + "loss": 0.0682, + "step": 529 + }, + { + "epoch": 0.7, + "grad_norm": 0.12766920030117035, + "learning_rate": 2.2044286863995893e-05, + "loss": 0.0696, + "step": 530 + }, + { + "epoch": 0.7, + "grad_norm": 0.12745466828346252, + "learning_rate": 2.186759472630045e-05, + "loss": 0.0754, + "step": 531 + }, + { + "epoch": 0.7, + "grad_norm": 0.15519979596138, + "learning_rate": 2.1691415155227835e-05, + "loss": 0.0718, + "step": 532 + }, + { + "epoch": 0.7, + "grad_norm": 0.12283770740032196, + "learning_rate": 2.151575136073353e-05, + "loss": 0.0637, + "step": 533 + }, + { + "epoch": 0.7, + "grad_norm": 0.11813485622406006, + "learning_rate": 2.1340606543375664e-05, + "loss": 0.0627, + "step": 534 + }, + { + "epoch": 0.7, + "grad_norm": 0.13709497451782227, + "learning_rate": 2.1165983894256647e-05, + "loss": 0.0744, + "step": 535 + }, + { + "epoch": 0.71, + "grad_norm": 0.1242271438241005, + "learning_rate": 2.099188659496517e-05, + "loss": 0.0674, + "step": 536 + }, + { + "epoch": 0.71, + "grad_norm": 0.11997799575328827, + "learning_rate": 2.0818317817518118e-05, + "loss": 0.0621, + "step": 537 + }, + { + "epoch": 0.71, + "grad_norm": 0.13129732012748718, + "learning_rate": 2.0645280724302836e-05, + "loss": 0.0721, + "step": 538 + }, + { + "epoch": 0.71, + "grad_norm": 0.14182625710964203, + "learning_rate": 2.0472778468019454e-05, + "loss": 0.0718, + "step": 539 + }, + { + "epoch": 0.71, + "grad_norm": 0.12494273483753204, + "learning_rate": 2.030081419162354e-05, + "loss": 0.0611, + "step": 540 + }, + { + "epoch": 0.71, + "grad_norm": 0.1131092980504036, + "learning_rate": 2.0129391028268735e-05, + "loss": 0.0685, + "step": 541 + }, + { + "epoch": 0.71, + "grad_norm": 0.11226702481508255, + "learning_rate": 1.9958512101249705e-05, + "loss": 0.0622, + "step": 542 + }, + { + "epoch": 0.72, + "grad_norm": 0.11921816319227219, + "learning_rate": 1.9788180523945277e-05, + "loss": 0.0666, + "step": 543 + }, + { + "epoch": 0.72, + "grad_norm": 0.1638924926519394, + "learning_rate": 1.9618399399761688e-05, + "loss": 0.0757, + "step": 544 + }, + { + "epoch": 0.72, + "grad_norm": 0.13155657052993774, + "learning_rate": 1.9449171822075973e-05, + "loss": 0.0651, + "step": 545 + }, + { + "epoch": 0.72, + "grad_norm": 0.1587192714214325, + "learning_rate": 1.9280500874179726e-05, + "loss": 0.0711, + "step": 546 + }, + { + "epoch": 0.72, + "grad_norm": 0.13725396990776062, + "learning_rate": 1.9112389629222823e-05, + "loss": 0.0629, + "step": 547 + }, + { + "epoch": 0.72, + "grad_norm": 0.13107489049434662, + "learning_rate": 1.89448411501575e-05, + "loss": 0.0667, + "step": 548 + }, + { + "epoch": 0.72, + "grad_norm": 0.12860113382339478, + "learning_rate": 1.8777858489682466e-05, + "loss": 0.0696, + "step": 549 + }, + { + "epoch": 0.72, + "grad_norm": 0.1347755342721939, + "learning_rate": 1.8611444690187392e-05, + "loss": 0.063, + "step": 550 + }, + { + "epoch": 0.73, + "grad_norm": 0.14524155855178833, + "learning_rate": 1.8445602783697374e-05, + "loss": 0.0709, + "step": 551 + }, + { + "epoch": 0.73, + "grad_norm": 0.12448085844516754, + "learning_rate": 1.8280335791817733e-05, + "loss": 0.0657, + "step": 552 + }, + { + "epoch": 0.73, + "grad_norm": 0.12951478362083435, + "learning_rate": 1.8115646725678997e-05, + "loss": 0.069, + "step": 553 + }, + { + "epoch": 0.73, + "grad_norm": 0.1278984248638153, + "learning_rate": 1.7951538585882007e-05, + "loss": 0.0753, + "step": 554 + }, + { + "epoch": 0.73, + "grad_norm": 0.12768521904945374, + "learning_rate": 1.778801436244319e-05, + "loss": 0.0677, + "step": 555 + }, + { + "epoch": 0.73, + "grad_norm": 0.1416548192501068, + "learning_rate": 1.7625077034740195e-05, + "loss": 0.0666, + "step": 556 + }, + { + "epoch": 0.73, + "grad_norm": 0.11569023132324219, + "learning_rate": 1.7462729571457558e-05, + "loss": 0.0662, + "step": 557 + }, + { + "epoch": 0.74, + "grad_norm": 0.14115728437900543, + "learning_rate": 1.7300974930532542e-05, + "loss": 0.0741, + "step": 558 + }, + { + "epoch": 0.74, + "grad_norm": 0.12663856148719788, + "learning_rate": 1.713981605910137e-05, + "loss": 0.0694, + "step": 559 + }, + { + "epoch": 0.74, + "grad_norm": 0.14533084630966187, + "learning_rate": 1.6979255893445462e-05, + "loss": 0.0771, + "step": 560 + }, + { + "epoch": 0.74, + "grad_norm": 0.1276610940694809, + "learning_rate": 1.68192973589379e-05, + "loss": 0.0685, + "step": 561 + }, + { + "epoch": 0.74, + "grad_norm": 0.12586858868598938, + "learning_rate": 1.66599433699902e-05, + "loss": 0.0695, + "step": 562 + }, + { + "epoch": 0.74, + "grad_norm": 0.11848849058151245, + "learning_rate": 1.650119682999918e-05, + "loss": 0.0754, + "step": 563 + }, + { + "epoch": 0.74, + "grad_norm": 0.11971968412399292, + "learning_rate": 1.6343060631294083e-05, + "loss": 0.0664, + "step": 564 + }, + { + "epoch": 0.74, + "grad_norm": 0.10469724982976913, + "learning_rate": 1.6185537655083817e-05, + "loss": 0.0643, + "step": 565 + }, + { + "epoch": 0.75, + "grad_norm": 0.11540230363607407, + "learning_rate": 1.6028630771404523e-05, + "loss": 0.0651, + "step": 566 + }, + { + "epoch": 0.75, + "grad_norm": 0.09816712141036987, + "learning_rate": 1.5872342839067306e-05, + "loss": 0.0634, + "step": 567 + }, + { + "epoch": 0.75, + "grad_norm": 0.1148160770535469, + "learning_rate": 1.571667670560601e-05, + "loss": 0.0705, + "step": 568 + }, + { + "epoch": 0.75, + "grad_norm": 0.11048528552055359, + "learning_rate": 1.5561635207225515e-05, + "loss": 0.0741, + "step": 569 + }, + { + "epoch": 0.75, + "grad_norm": 0.1149090901017189, + "learning_rate": 1.5407221168749967e-05, + "loss": 0.0694, + "step": 570 + }, + { + "epoch": 0.75, + "grad_norm": 0.12481275945901871, + "learning_rate": 1.525343740357128e-05, + "loss": 0.063, + "step": 571 + }, + { + "epoch": 0.75, + "grad_norm": 0.12050320208072662, + "learning_rate": 1.5100286713597938e-05, + "loss": 0.0684, + "step": 572 + }, + { + "epoch": 0.75, + "grad_norm": 0.11793509125709534, + "learning_rate": 1.4947771889203938e-05, + "loss": 0.0673, + "step": 573 + }, + { + "epoch": 0.76, + "grad_norm": 0.14838404953479767, + "learning_rate": 1.4795895709177954e-05, + "loss": 0.068, + "step": 574 + }, + { + "epoch": 0.76, + "grad_norm": 0.12349526584148407, + "learning_rate": 1.4644660940672627e-05, + "loss": 0.0691, + "step": 575 + }, + { + "epoch": 0.76, + "grad_norm": 0.152385875582695, + "learning_rate": 1.4494070339154276e-05, + "loss": 0.0737, + "step": 576 + }, + { + "epoch": 0.76, + "grad_norm": 0.11645952612161636, + "learning_rate": 1.4344126648352618e-05, + "loss": 0.0729, + "step": 577 + }, + { + "epoch": 0.76, + "grad_norm": 0.11911028623580933, + "learning_rate": 1.419483260021075e-05, + "loss": 0.067, + "step": 578 + }, + { + "epoch": 0.76, + "grad_norm": 0.11739668995141983, + "learning_rate": 1.404619091483546e-05, + "loss": 0.0703, + "step": 579 + }, + { + "epoch": 0.76, + "grad_norm": 0.11539848148822784, + "learning_rate": 1.3898204300447616e-05, + "loss": 0.0684, + "step": 580 + }, + { + "epoch": 0.77, + "grad_norm": 0.1286860853433609, + "learning_rate": 1.3750875453332801e-05, + "loss": 0.0694, + "step": 581 + }, + { + "epoch": 0.77, + "grad_norm": 0.12313937395811081, + "learning_rate": 1.3604207057792218e-05, + "loss": 0.0647, + "step": 582 + }, + { + "epoch": 0.77, + "grad_norm": 0.11173125356435776, + "learning_rate": 1.3458201786093794e-05, + "loss": 0.0675, + "step": 583 + }, + { + "epoch": 0.77, + "grad_norm": 0.12815962731838226, + "learning_rate": 1.3312862298423496e-05, + "loss": 0.0685, + "step": 584 + }, + { + "epoch": 0.77, + "grad_norm": 0.11466740816831589, + "learning_rate": 1.3168191242836786e-05, + "loss": 0.0646, + "step": 585 + }, + { + "epoch": 0.77, + "grad_norm": 0.1391080915927887, + "learning_rate": 1.3024191255210477e-05, + "loss": 0.0674, + "step": 586 + }, + { + "epoch": 0.77, + "grad_norm": 0.13418692350387573, + "learning_rate": 1.2880864959194665e-05, + "loss": 0.0683, + "step": 587 + }, + { + "epoch": 0.77, + "grad_norm": 0.10707702487707138, + "learning_rate": 1.273821496616488e-05, + "loss": 0.0645, + "step": 588 + }, + { + "epoch": 0.78, + "grad_norm": 0.13759590685367584, + "learning_rate": 1.2596243875174591e-05, + "loss": 0.0764, + "step": 589 + }, + { + "epoch": 0.78, + "grad_norm": 0.1846940666437149, + "learning_rate": 1.2454954272907815e-05, + "loss": 0.0679, + "step": 590 + }, + { + "epoch": 0.78, + "grad_norm": 0.11831660568714142, + "learning_rate": 1.2314348733631959e-05, + "loss": 0.0655, + "step": 591 + }, + { + "epoch": 0.78, + "grad_norm": 0.11042218655347824, + "learning_rate": 1.2174429819150929e-05, + "loss": 0.0616, + "step": 592 + }, + { + "epoch": 0.78, + "grad_norm": 0.12103278189897537, + "learning_rate": 1.2035200078758519e-05, + "loss": 0.0724, + "step": 593 + }, + { + "epoch": 0.78, + "grad_norm": 0.1172623485326767, + "learning_rate": 1.1896662049191898e-05, + "loss": 0.0653, + "step": 594 + }, + { + "epoch": 0.78, + "grad_norm": 0.11154913157224655, + "learning_rate": 1.1758818254585369e-05, + "loss": 0.0621, + "step": 595 + }, + { + "epoch": 0.79, + "grad_norm": 0.17942464351654053, + "learning_rate": 1.1621671206424462e-05, + "loss": 0.0735, + "step": 596 + }, + { + "epoch": 0.79, + "grad_norm": 0.12177997827529907, + "learning_rate": 1.1485223403500117e-05, + "loss": 0.0718, + "step": 597 + }, + { + "epoch": 0.79, + "grad_norm": 0.12012594193220139, + "learning_rate": 1.134947733186315e-05, + "loss": 0.0682, + "step": 598 + }, + { + "epoch": 0.79, + "grad_norm": 0.11858519166707993, + "learning_rate": 1.1214435464779006e-05, + "loss": 0.0674, + "step": 599 + }, + { + "epoch": 0.79, + "grad_norm": 0.13078168034553528, + "learning_rate": 1.1080100262682663e-05, + "loss": 0.0636, + "step": 600 + }, + { + "epoch": 0.79, + "grad_norm": 0.14218783378601074, + "learning_rate": 1.0946474173133792e-05, + "loss": 0.0654, + "step": 601 + }, + { + "epoch": 0.79, + "grad_norm": 0.11505080759525299, + "learning_rate": 1.0813559630772173e-05, + "loss": 0.0615, + "step": 602 + }, + { + "epoch": 0.79, + "grad_norm": 0.12753449380397797, + "learning_rate": 1.0681359057273388e-05, + "loss": 0.0754, + "step": 603 + }, + { + "epoch": 0.8, + "grad_norm": 0.13185451924800873, + "learning_rate": 1.0549874861304605e-05, + "loss": 0.06, + "step": 604 + }, + { + "epoch": 0.8, + "grad_norm": 0.13252925872802734, + "learning_rate": 1.0419109438480762e-05, + "loss": 0.0668, + "step": 605 + }, + { + "epoch": 0.8, + "grad_norm": 0.12701675295829773, + "learning_rate": 1.0289065171320905e-05, + "loss": 0.0724, + "step": 606 + }, + { + "epoch": 0.8, + "grad_norm": 0.13044829666614532, + "learning_rate": 1.0159744429204777e-05, + "loss": 0.0659, + "step": 607 + }, + { + "epoch": 0.8, + "grad_norm": 0.1393629014492035, + "learning_rate": 1.00311495683296e-05, + "loss": 0.0566, + "step": 608 + }, + { + "epoch": 0.8, + "grad_norm": 0.12651897966861725, + "learning_rate": 9.903282931667246e-06, + "loss": 0.0659, + "step": 609 + }, + { + "epoch": 0.8, + "grad_norm": 0.11385196447372437, + "learning_rate": 9.776146848921475e-06, + "loss": 0.0696, + "step": 610 + }, + { + "epoch": 0.81, + "grad_norm": 0.13774777948856354, + "learning_rate": 9.64974363648548e-06, + "loss": 0.0725, + "step": 611 + }, + { + "epoch": 0.81, + "grad_norm": 0.14081576466560364, + "learning_rate": 9.524075597399718e-06, + "loss": 0.0696, + "step": 612 + }, + { + "epoch": 0.81, + "grad_norm": 0.13455650210380554, + "learning_rate": 9.399145021309974e-06, + "loss": 0.068, + "step": 613 + }, + { + "epoch": 0.81, + "grad_norm": 0.1522856503725052, + "learning_rate": 9.27495418442555e-06, + "loss": 0.0649, + "step": 614 + }, + { + "epoch": 0.81, + "grad_norm": 0.13210617005825043, + "learning_rate": 9.151505349477902e-06, + "loss": 0.0699, + "step": 615 + }, + { + "epoch": 0.81, + "grad_norm": 0.13013535737991333, + "learning_rate": 9.028800765679347e-06, + "loss": 0.0626, + "step": 616 + }, + { + "epoch": 0.81, + "grad_norm": 0.230327770113945, + "learning_rate": 8.906842668682102e-06, + "loss": 0.0625, + "step": 617 + }, + { + "epoch": 0.81, + "grad_norm": 0.16013884544372559, + "learning_rate": 8.785633280537537e-06, + "loss": 0.0706, + "step": 618 + }, + { + "epoch": 0.82, + "grad_norm": 0.14744172990322113, + "learning_rate": 8.665174809655708e-06, + "loss": 0.065, + "step": 619 + }, + { + "epoch": 0.82, + "grad_norm": 0.11363288015127182, + "learning_rate": 8.54546945076513e-06, + "loss": 0.0553, + "step": 620 + }, + { + "epoch": 0.82, + "grad_norm": 0.30628418922424316, + "learning_rate": 8.426519384872733e-06, + "loss": 0.0682, + "step": 621 + }, + { + "epoch": 0.82, + "grad_norm": 0.18932583928108215, + "learning_rate": 8.308326779224218e-06, + "loss": 0.0712, + "step": 622 + }, + { + "epoch": 0.82, + "grad_norm": 0.14569301903247833, + "learning_rate": 8.19089378726447e-06, + "loss": 0.0648, + "step": 623 + }, + { + "epoch": 0.82, + "grad_norm": 0.14561575651168823, + "learning_rate": 8.074222548598371e-06, + "loss": 0.0643, + "step": 624 + }, + { + "epoch": 0.82, + "grad_norm": 0.13920505344867706, + "learning_rate": 7.958315188951848e-06, + "loss": 0.0624, + "step": 625 + }, + { + "epoch": 0.82, + "grad_norm": 0.12166207283735275, + "learning_rate": 7.843173820133104e-06, + "loss": 0.0705, + "step": 626 + }, + { + "epoch": 0.83, + "grad_norm": 0.1481250822544098, + "learning_rate": 7.728800539994113e-06, + "loss": 0.0646, + "step": 627 + }, + { + "epoch": 0.83, + "grad_norm": 0.1408376544713974, + "learning_rate": 7.615197432392462e-06, + "loss": 0.0573, + "step": 628 + }, + { + "epoch": 0.83, + "grad_norm": 0.1506175696849823, + "learning_rate": 7.502366567153346e-06, + "loss": 0.066, + "step": 629 + }, + { + "epoch": 0.83, + "grad_norm": 0.15556824207305908, + "learning_rate": 7.390310000031875e-06, + "loss": 0.0761, + "step": 630 + }, + { + "epoch": 0.83, + "grad_norm": 0.11223191022872925, + "learning_rate": 7.2790297726755716e-06, + "loss": 0.0588, + "step": 631 + }, + { + "epoch": 0.83, + "grad_norm": 0.1281258761882782, + "learning_rate": 7.168527912587253e-06, + "loss": 0.0634, + "step": 632 + }, + { + "epoch": 0.83, + "grad_norm": 0.1385982483625412, + "learning_rate": 7.058806433088011e-06, + "loss": 0.0584, + "step": 633 + }, + { + "epoch": 0.84, + "grad_norm": 0.1930031180381775, + "learning_rate": 6.949867333280569e-06, + "loss": 0.0723, + "step": 634 + }, + { + "epoch": 0.84, + "grad_norm": 0.16700227558612823, + "learning_rate": 6.8417125980128675e-06, + "loss": 0.0725, + "step": 635 + }, + { + "epoch": 0.84, + "grad_norm": 0.13950631022453308, + "learning_rate": 6.734344197841891e-06, + "loss": 0.0663, + "step": 636 + }, + { + "epoch": 0.84, + "grad_norm": 0.17406101524829865, + "learning_rate": 6.627764088997734e-06, + "loss": 0.0795, + "step": 637 + }, + { + "epoch": 0.84, + "grad_norm": 0.15877845883369446, + "learning_rate": 6.5219742133480085e-06, + "loss": 0.0649, + "step": 638 + }, + { + "epoch": 0.84, + "grad_norm": 0.1270444542169571, + "learning_rate": 6.416976498362432e-06, + "loss": 0.0652, + "step": 639 + }, + { + "epoch": 0.84, + "grad_norm": 0.12452571839094162, + "learning_rate": 6.3127728570777365e-06, + "loss": 0.0664, + "step": 640 + }, + { + "epoch": 0.84, + "grad_norm": 0.16081424057483673, + "learning_rate": 6.209365188062749e-06, + "loss": 0.0652, + "step": 641 + }, + { + "epoch": 0.85, + "grad_norm": 0.1544044464826584, + "learning_rate": 6.106755375383904e-06, + "loss": 0.0643, + "step": 642 + }, + { + "epoch": 0.85, + "grad_norm": 0.12766015529632568, + "learning_rate": 6.004945288570813e-06, + "loss": 0.0672, + "step": 643 + }, + { + "epoch": 0.85, + "grad_norm": 0.1414952278137207, + "learning_rate": 5.903936782582253e-06, + "loss": 0.0659, + "step": 644 + }, + { + "epoch": 0.85, + "grad_norm": 0.12395791709423065, + "learning_rate": 5.803731697772391e-06, + "loss": 0.0717, + "step": 645 + }, + { + "epoch": 0.85, + "grad_norm": 0.12516911327838898, + "learning_rate": 5.7043318598572094e-06, + "loss": 0.064, + "step": 646 + }, + { + "epoch": 0.85, + "grad_norm": 0.1448909342288971, + "learning_rate": 5.605739079881239e-06, + "loss": 0.0734, + "step": 647 + }, + { + "epoch": 0.85, + "grad_norm": 0.1468668133020401, + "learning_rate": 5.507955154184619e-06, + "loss": 0.0647, + "step": 648 + }, + { + "epoch": 0.86, + "grad_norm": 0.13697059452533722, + "learning_rate": 5.4109818643703124e-06, + "loss": 0.0696, + "step": 649 + }, + { + "epoch": 0.86, + "grad_norm": 0.13059355318546295, + "learning_rate": 5.314820977271645e-06, + "loss": 0.0587, + "step": 650 + }, + { + "epoch": 0.86, + "grad_norm": 0.12498347461223602, + "learning_rate": 5.219474244920164e-06, + "loss": 0.0648, + "step": 651 + }, + { + "epoch": 0.86, + "grad_norm": 0.11734464764595032, + "learning_rate": 5.1249434045136765e-06, + "loss": 0.0602, + "step": 652 + }, + { + "epoch": 0.86, + "grad_norm": 0.14948828518390656, + "learning_rate": 5.031230178384594e-06, + "loss": 0.0765, + "step": 653 + }, + { + "epoch": 0.86, + "grad_norm": 0.1507301926612854, + "learning_rate": 4.938336273968558e-06, + "loss": 0.0672, + "step": 654 + }, + { + "epoch": 0.86, + "grad_norm": 0.1803879290819168, + "learning_rate": 4.846263383773364e-06, + "loss": 0.0604, + "step": 655 + }, + { + "epoch": 0.86, + "grad_norm": 0.1707804799079895, + "learning_rate": 4.755013185348095e-06, + "loss": 0.0734, + "step": 656 + }, + { + "epoch": 0.87, + "grad_norm": 0.15067826211452484, + "learning_rate": 4.66458734125253e-06, + "loss": 0.0646, + "step": 657 + }, + { + "epoch": 0.87, + "grad_norm": 0.12395960837602615, + "learning_rate": 4.574987499026912e-06, + "loss": 0.0595, + "step": 658 + }, + { + "epoch": 0.87, + "grad_norm": 0.13461753726005554, + "learning_rate": 4.4862152911618934e-06, + "loss": 0.06, + "step": 659 + }, + { + "epoch": 0.87, + "grad_norm": 0.14375466108322144, + "learning_rate": 4.398272335068787e-06, + "loss": 0.0613, + "step": 660 + }, + { + "epoch": 0.87, + "grad_norm": 0.11830610781908035, + "learning_rate": 4.311160233050121e-06, + "loss": 0.0637, + "step": 661 + }, + { + "epoch": 0.87, + "grad_norm": 0.13027916848659515, + "learning_rate": 4.224880572270434e-06, + "loss": 0.0647, + "step": 662 + }, + { + "epoch": 0.87, + "grad_norm": 0.13159357011318207, + "learning_rate": 4.139434924727359e-06, + "loss": 0.0663, + "step": 663 + }, + { + "epoch": 0.87, + "grad_norm": 0.12546558678150177, + "learning_rate": 4.054824847222949e-06, + "loss": 0.0603, + "step": 664 + }, + { + "epoch": 0.88, + "grad_norm": 0.13789647817611694, + "learning_rate": 3.971051881335391e-06, + "loss": 0.0561, + "step": 665 + }, + { + "epoch": 0.88, + "grad_norm": 0.142785906791687, + "learning_rate": 3.888117553390852e-06, + "loss": 0.0667, + "step": 666 + }, + { + "epoch": 0.88, + "grad_norm": 0.1499502956867218, + "learning_rate": 3.8060233744356633e-06, + "loss": 0.0675, + "step": 667 + }, + { + "epoch": 0.88, + "grad_norm": 0.1378251016139984, + "learning_rate": 3.724770840208852e-06, + "loss": 0.0628, + "step": 668 + }, + { + "epoch": 0.88, + "grad_norm": 0.12548185884952545, + "learning_rate": 3.6443614311148456e-06, + "loss": 0.0689, + "step": 669 + }, + { + "epoch": 0.88, + "grad_norm": 0.13463036715984344, + "learning_rate": 3.564796612196475e-06, + "loss": 0.0667, + "step": 670 + }, + { + "epoch": 0.88, + "grad_norm": 0.14040209352970123, + "learning_rate": 3.486077833108342e-06, + "loss": 0.071, + "step": 671 + }, + { + "epoch": 0.89, + "grad_norm": 0.11872732639312744, + "learning_rate": 3.408206528090374e-06, + "loss": 0.0589, + "step": 672 + }, + { + "epoch": 0.89, + "grad_norm": 0.13672368228435516, + "learning_rate": 3.3311841159416936e-06, + "loss": 0.0605, + "step": 673 + }, + { + "epoch": 0.89, + "grad_norm": 0.16072207689285278, + "learning_rate": 3.25501199999475e-06, + "loss": 0.0637, + "step": 674 + }, + { + "epoch": 0.89, + "grad_norm": 0.12611547112464905, + "learning_rate": 3.1796915680897988e-06, + "loss": 0.0614, + "step": 675 + }, + { + "epoch": 0.89, + "grad_norm": 0.15802842378616333, + "learning_rate": 3.1052241925495885e-06, + "loss": 0.0651, + "step": 676 + }, + { + "epoch": 0.89, + "grad_norm": 0.15073926746845245, + "learning_rate": 3.031611230154324e-06, + "loss": 0.0657, + "step": 677 + }, + { + "epoch": 0.89, + "grad_norm": 0.16130346059799194, + "learning_rate": 2.958854022117008e-06, + "loss": 0.0745, + "step": 678 + }, + { + "epoch": 0.89, + "grad_norm": 0.15508459508419037, + "learning_rate": 2.88695389405898e-06, + "loss": 0.0749, + "step": 679 + }, + { + "epoch": 0.9, + "grad_norm": 0.1521391123533249, + "learning_rate": 2.8159121559857284e-06, + "loss": 0.0599, + "step": 680 + }, + { + "epoch": 0.9, + "grad_norm": 0.12179268896579742, + "learning_rate": 2.745730102263078e-06, + "loss": 0.0588, + "step": 681 + }, + { + "epoch": 0.9, + "grad_norm": 0.1358071267604828, + "learning_rate": 2.6764090115935834e-06, + "loss": 0.0648, + "step": 682 + }, + { + "epoch": 0.9, + "grad_norm": 0.17397892475128174, + "learning_rate": 2.6079501469932156e-06, + "loss": 0.0618, + "step": 683 + }, + { + "epoch": 0.9, + "grad_norm": 0.15012788772583008, + "learning_rate": 2.540354755768365e-06, + "loss": 0.0605, + "step": 684 + }, + { + "epoch": 0.9, + "grad_norm": 0.15713779628276825, + "learning_rate": 2.4736240694931247e-06, + "loss": 0.0675, + "step": 685 + }, + { + "epoch": 0.9, + "grad_norm": 0.1717958003282547, + "learning_rate": 2.407759303986845e-06, + "loss": 0.0679, + "step": 686 + }, + { + "epoch": 0.91, + "grad_norm": 0.15394960343837738, + "learning_rate": 2.342761659291959e-06, + "loss": 0.071, + "step": 687 + }, + { + "epoch": 0.91, + "grad_norm": 0.17352795600891113, + "learning_rate": 2.2786323196521575e-06, + "loss": 0.0674, + "step": 688 + }, + { + "epoch": 0.91, + "grad_norm": 0.16719239950180054, + "learning_rate": 2.215372453490788e-06, + "loss": 0.071, + "step": 689 + }, + { + "epoch": 0.91, + "grad_norm": 0.1454174518585205, + "learning_rate": 2.152983213389559e-06, + "loss": 0.0736, + "step": 690 + }, + { + "epoch": 0.91, + "grad_norm": 0.1298518180847168, + "learning_rate": 2.0914657360675606e-06, + "loss": 0.0617, + "step": 691 + }, + { + "epoch": 0.91, + "grad_norm": 0.14243261516094208, + "learning_rate": 2.0308211423605505e-06, + "loss": 0.0622, + "step": 692 + }, + { + "epoch": 0.91, + "grad_norm": 0.1646115630865097, + "learning_rate": 1.971050537200514e-06, + "loss": 0.0685, + "step": 693 + }, + { + "epoch": 0.91, + "grad_norm": 0.127439484000206, + "learning_rate": 1.912155009595551e-06, + "loss": 0.0667, + "step": 694 + }, + { + "epoch": 0.92, + "grad_norm": 0.14966559410095215, + "learning_rate": 1.8541356326100433e-06, + "loss": 0.066, + "step": 695 + }, + { + "epoch": 0.92, + "grad_norm": 0.1566605120897293, + "learning_rate": 1.7969934633450535e-06, + "loss": 0.0646, + "step": 696 + }, + { + "epoch": 0.92, + "grad_norm": 0.14571478962898254, + "learning_rate": 1.740729542919134e-06, + "loss": 0.0679, + "step": 697 + }, + { + "epoch": 0.92, + "grad_norm": 0.1392918974161148, + "learning_rate": 1.685344896449309e-06, + "loss": 0.0674, + "step": 698 + }, + { + "epoch": 0.92, + "grad_norm": 0.1348922848701477, + "learning_rate": 1.6308405330324294e-06, + "loss": 0.0638, + "step": 699 + }, + { + "epoch": 0.92, + "grad_norm": 0.16134820878505707, + "learning_rate": 1.5772174457267362e-06, + "loss": 0.0681, + "step": 700 + }, + { + "epoch": 0.92, + "grad_norm": 0.1294383555650711, + "learning_rate": 1.5244766115338372e-06, + "loss": 0.0581, + "step": 701 + }, + { + "epoch": 0.92, + "grad_norm": 0.17935150861740112, + "learning_rate": 1.472618991380853e-06, + "loss": 0.074, + "step": 702 + }, + { + "epoch": 0.93, + "grad_norm": 0.13329242169857025, + "learning_rate": 1.4216455301029275e-06, + "loss": 0.0617, + "step": 703 + }, + { + "epoch": 0.93, + "grad_norm": 0.171311616897583, + "learning_rate": 1.3715571564260122e-06, + "loss": 0.0694, + "step": 704 + }, + { + "epoch": 0.93, + "grad_norm": 0.1370270550251007, + "learning_rate": 1.3223547829499527e-06, + "loss": 0.06, + "step": 705 + }, + { + "epoch": 0.93, + "grad_norm": 0.1376296430826187, + "learning_rate": 1.2740393061318357e-06, + "loss": 0.0654, + "step": 706 + }, + { + "epoch": 0.93, + "grad_norm": 0.14175420999526978, + "learning_rate": 1.2266116062696953e-06, + "loss": 0.0737, + "step": 707 + }, + { + "epoch": 0.93, + "grad_norm": 0.16615577042102814, + "learning_rate": 1.1800725474864438e-06, + "loss": 0.0682, + "step": 708 + }, + { + "epoch": 0.93, + "grad_norm": 0.1245865449309349, + "learning_rate": 1.1344229777141336e-06, + "loss": 0.0624, + "step": 709 + }, + { + "epoch": 0.94, + "grad_norm": 0.1388980895280838, + "learning_rate": 1.0896637286785084e-06, + "loss": 0.0683, + "step": 710 + }, + { + "epoch": 0.94, + "grad_norm": 0.14993278682231903, + "learning_rate": 1.0457956158838544e-06, + "loss": 0.0679, + "step": 711 + }, + { + "epoch": 0.94, + "grad_norm": 0.13806217908859253, + "learning_rate": 1.0028194385981515e-06, + "loss": 0.0599, + "step": 712 + }, + { + "epoch": 0.94, + "grad_norm": 0.17131811380386353, + "learning_rate": 9.607359798384785e-07, + "loss": 0.0624, + "step": 713 + }, + { + "epoch": 0.94, + "grad_norm": 0.14887043833732605, + "learning_rate": 9.195460063567807e-07, + "loss": 0.0652, + "step": 714 + }, + { + "epoch": 0.94, + "grad_norm": 0.1282467544078827, + "learning_rate": 8.792502686258752e-07, + "loss": 0.056, + "step": 715 + }, + { + "epoch": 0.94, + "grad_norm": 0.1668287068605423, + "learning_rate": 8.398495008257956e-07, + "loss": 0.0629, + "step": 716 + }, + { + "epoch": 0.94, + "grad_norm": 0.14125637710094452, + "learning_rate": 8.013444208304133e-07, + "loss": 0.0719, + "step": 717 + }, + { + "epoch": 0.95, + "grad_norm": 0.14389286935329437, + "learning_rate": 7.637357301943371e-07, + "loss": 0.0634, + "step": 718 + }, + { + "epoch": 0.95, + "grad_norm": 0.15357066690921783, + "learning_rate": 7.270241141401569e-07, + "loss": 0.063, + "step": 719 + }, + { + "epoch": 0.95, + "grad_norm": 0.127652108669281, + "learning_rate": 6.912102415459476e-07, + "loss": 0.0592, + "step": 720 + }, + { + "epoch": 0.95, + "grad_norm": 0.15074937045574188, + "learning_rate": 6.562947649330853e-07, + "loss": 0.0665, + "step": 721 + }, + { + "epoch": 0.95, + "grad_norm": 0.14497500658035278, + "learning_rate": 6.222783204543559e-07, + "loss": 0.0679, + "step": 722 + }, + { + "epoch": 0.95, + "grad_norm": 0.16830851137638092, + "learning_rate": 5.891615278823537e-07, + "loss": 0.0662, + "step": 723 + }, + { + "epoch": 0.95, + "grad_norm": 0.11228172481060028, + "learning_rate": 5.569449905982239e-07, + "loss": 0.0621, + "step": 724 + }, + { + "epoch": 0.96, + "grad_norm": 0.15398009121418, + "learning_rate": 5.256292955806208e-07, + "loss": 0.0641, + "step": 725 + }, + { + "epoch": 0.96, + "grad_norm": 0.14682018756866455, + "learning_rate": 4.952150133950506e-07, + "loss": 0.0676, + "step": 726 + }, + { + "epoch": 0.96, + "grad_norm": 0.13635100424289703, + "learning_rate": 4.6570269818346224e-07, + "loss": 0.0661, + "step": 727 + }, + { + "epoch": 0.96, + "grad_norm": 0.1787886917591095, + "learning_rate": 4.370928876541613e-07, + "loss": 0.0606, + "step": 728 + }, + { + "epoch": 0.96, + "grad_norm": 0.16212987899780273, + "learning_rate": 4.093861030719903e-07, + "loss": 0.0672, + "step": 729 + }, + { + "epoch": 0.96, + "grad_norm": 0.13791510462760925, + "learning_rate": 3.8258284924885255e-07, + "loss": 0.064, + "step": 730 + }, + { + "epoch": 0.96, + "grad_norm": 0.1542445421218872, + "learning_rate": 3.5668361453450317e-07, + "loss": 0.0647, + "step": 731 + }, + { + "epoch": 0.96, + "grad_norm": 0.14709891378879547, + "learning_rate": 3.3168887080766154e-07, + "loss": 0.0615, + "step": 732 + }, + { + "epoch": 0.97, + "grad_norm": 0.16037517786026, + "learning_rate": 3.075990734674017e-07, + "loss": 0.0605, + "step": 733 + }, + { + "epoch": 0.97, + "grad_norm": 0.14704789221286774, + "learning_rate": 2.8441466142486994e-07, + "loss": 0.0702, + "step": 734 + }, + { + "epoch": 0.97, + "grad_norm": 0.14369893074035645, + "learning_rate": 2.62136057095258e-07, + "loss": 0.0671, + "step": 735 + }, + { + "epoch": 0.97, + "grad_norm": 0.14050166308879852, + "learning_rate": 2.407636663901591e-07, + "loss": 0.0585, + "step": 736 + }, + { + "epoch": 0.97, + "grad_norm": 0.13352462649345398, + "learning_rate": 2.2029787871010733e-07, + "loss": 0.0657, + "step": 737 + }, + { + "epoch": 0.97, + "grad_norm": 0.14897920191287994, + "learning_rate": 2.007390669375553e-07, + "loss": 0.0675, + "step": 738 + }, + { + "epoch": 0.97, + "grad_norm": 0.12918329238891602, + "learning_rate": 1.820875874300021e-07, + "loss": 0.0559, + "step": 739 + }, + { + "epoch": 0.97, + "grad_norm": 0.15355056524276733, + "learning_rate": 1.64343780013565e-07, + "loss": 0.0653, + "step": 740 + }, + { + "epoch": 0.98, + "grad_norm": 0.15641310811042786, + "learning_rate": 1.4750796797675658e-07, + "loss": 0.0674, + "step": 741 + }, + { + "epoch": 0.98, + "grad_norm": 0.1416761726140976, + "learning_rate": 1.3158045806460073e-07, + "loss": 0.0593, + "step": 742 + }, + { + "epoch": 0.98, + "grad_norm": 0.17423132061958313, + "learning_rate": 1.1656154047303691e-07, + "loss": 0.0657, + "step": 743 + }, + { + "epoch": 0.98, + "grad_norm": 0.14585049450397491, + "learning_rate": 1.024514888436634e-07, + "loss": 0.0667, + "step": 744 + }, + { + "epoch": 0.98, + "grad_norm": 0.12190530449151993, + "learning_rate": 8.925056025869128e-08, + "loss": 0.0575, + "step": 745 + }, + { + "epoch": 0.98, + "grad_norm": 0.1758907437324524, + "learning_rate": 7.695899523633143e-08, + "loss": 0.0665, + "step": 746 + }, + { + "epoch": 0.98, + "grad_norm": 0.11777140945196152, + "learning_rate": 6.557701772635372e-08, + "loss": 0.0658, + "step": 747 + }, + { + "epoch": 0.99, + "grad_norm": 0.14637833833694458, + "learning_rate": 5.5104835106051245e-08, + "loss": 0.0643, + "step": 748 + }, + { + "epoch": 0.99, + "grad_norm": 0.13879886269569397, + "learning_rate": 4.5542638176421237e-08, + "loss": 0.0643, + "step": 749 + }, + { + "epoch": 0.99, + "grad_norm": 0.15284717082977295, + "learning_rate": 3.689060115872889e-08, + "loss": 0.0705, + "step": 750 + }, + { + "epoch": 0.99, + "grad_norm": 0.14697931706905365, + "learning_rate": 2.9148881691298813e-08, + "loss": 0.0698, + "step": 751 + }, + { + "epoch": 0.99, + "grad_norm": 0.17319975793361664, + "learning_rate": 2.231762082666733e-08, + "loss": 0.0683, + "step": 752 + }, + { + "epoch": 0.99, + "grad_norm": 0.13041572272777557, + "learning_rate": 1.6396943028995638e-08, + "loss": 0.0646, + "step": 753 + }, + { + "epoch": 0.99, + "grad_norm": 0.16726787388324738, + "learning_rate": 1.1386956171816066e-08, + "loss": 0.0589, + "step": 754 + }, + { + "epoch": 0.99, + "grad_norm": 0.15242543816566467, + "learning_rate": 7.287751536050324e-09, + "loss": 0.0652, + "step": 755 + }, + { + "epoch": 1.0, + "grad_norm": 0.1539144217967987, + "learning_rate": 4.099403808366376e-09, + "loss": 0.0694, + "step": 756 + }, + { + "epoch": 1.0, + "grad_norm": 0.13503475487232208, + "learning_rate": 1.821971079796203e-09, + "loss": 0.0682, + "step": 757 + }, + { + "epoch": 1.0, + "grad_norm": 0.16742639243602753, + "learning_rate": 4.5549484470330807e-10, + "loss": 0.072, + "step": 758 + }, + { + "epoch": 1.0, + "grad_norm": 0.12691205739974976, + "learning_rate": 0.0, + "loss": 0.0598, + "step": 759 + }, + { + "epoch": 1.0, + "step": 759, + "total_flos": 1.6072338418759107e+18, + "train_loss": 0.07730967932833513, + "train_runtime": 10109.3463, + "train_samples_per_second": 9.612, + "train_steps_per_second": 0.075 + } + ], + "logging_steps": 1.0, + "max_steps": 759, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 1.6072338418759107e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}