diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4686 @@ +{ + "best_metric": 0.014902754686772823, + "best_model_checkpoint": "/home/paperspace/Data/models/akoul_whitehorseliquidity_25c/llm3br256/checkpoint-400", + "epoch": 5.0, + "eval_steps": 5, + "global_step": 540, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.009259259259259259, + "grad_norm": 0.29716095328330994, + "learning_rate": 1.8518518518518519e-06, + "loss": 0.1002, + "step": 1 + }, + { + "epoch": 0.018518518518518517, + "grad_norm": 0.2648535370826721, + "learning_rate": 3.7037037037037037e-06, + "loss": 0.0936, + "step": 2 + }, + { + "epoch": 0.027777777777777776, + "grad_norm": 0.24819649755954742, + "learning_rate": 5.555555555555556e-06, + "loss": 0.0898, + "step": 3 + }, + { + "epoch": 0.037037037037037035, + "grad_norm": 0.23442289233207703, + "learning_rate": 7.4074074074074075e-06, + "loss": 0.087, + "step": 4 + }, + { + "epoch": 0.046296296296296294, + "grad_norm": 0.26300737261772156, + "learning_rate": 9.259259259259259e-06, + "loss": 0.0904, + "step": 5 + }, + { + "epoch": 0.046296296296296294, + "eval_loss": 0.0950983464717865, + "eval_runtime": 11.9584, + "eval_samples_per_second": 4.181, + "eval_steps_per_second": 1.087, + "step": 5 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 0.18399731814861298, + "learning_rate": 1.1111111111111112e-05, + "loss": 0.0805, + "step": 6 + }, + { + "epoch": 0.06481481481481481, + "grad_norm": 0.19827856123447418, + "learning_rate": 1.2962962962962962e-05, + "loss": 0.0782, + "step": 7 + }, + { + "epoch": 0.07407407407407407, + "grad_norm": 0.13050280511379242, + "learning_rate": 1.4814814814814815e-05, + "loss": 0.0636, + "step": 8 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 0.12110771238803864, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.056, + "step": 9 + }, + { + "epoch": 0.09259259259259259, + "grad_norm": 0.1111820638179779, + "learning_rate": 1.8518518518518518e-05, + "loss": 0.053, + "step": 10 + }, + { + "epoch": 0.09259259259259259, + "eval_loss": 0.04887561500072479, + "eval_runtime": 9.1057, + "eval_samples_per_second": 5.491, + "eval_steps_per_second": 1.428, + "step": 10 + }, + { + "epoch": 0.10185185185185185, + "grad_norm": 0.0779903382062912, + "learning_rate": 2.037037037037037e-05, + "loss": 0.0538, + "step": 11 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.08193033933639526, + "learning_rate": 2.2222222222222223e-05, + "loss": 0.0398, + "step": 12 + }, + { + "epoch": 0.12037037037037036, + "grad_norm": 0.0821649506688118, + "learning_rate": 2.4074074074074074e-05, + "loss": 0.0473, + "step": 13 + }, + { + "epoch": 0.12962962962962962, + "grad_norm": 0.07107188552618027, + "learning_rate": 2.5925925925925925e-05, + "loss": 0.0386, + "step": 14 + }, + { + "epoch": 0.1388888888888889, + "grad_norm": 0.05971238389611244, + "learning_rate": 2.777777777777778e-05, + "loss": 0.0417, + "step": 15 + }, + { + "epoch": 0.1388888888888889, + "eval_loss": 0.04156189784407616, + "eval_runtime": 9.1211, + "eval_samples_per_second": 5.482, + "eval_steps_per_second": 1.425, + "step": 15 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 0.05262186750769615, + "learning_rate": 2.962962962962963e-05, + "loss": 0.0384, + "step": 16 + }, + { + "epoch": 0.1574074074074074, + "grad_norm": 0.05361900106072426, + "learning_rate": 3.148148148148148e-05, + "loss": 0.0378, + "step": 17 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.05355929210782051, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.0399, + "step": 18 + }, + { + "epoch": 0.17592592592592593, + "grad_norm": 0.04563885182142258, + "learning_rate": 3.518518518518519e-05, + "loss": 0.0368, + "step": 19 + }, + { + "epoch": 0.18518518518518517, + "grad_norm": 0.060624465346336365, + "learning_rate": 3.7037037037037037e-05, + "loss": 0.0396, + "step": 20 + }, + { + "epoch": 0.18518518518518517, + "eval_loss": 0.03584723547101021, + "eval_runtime": 9.1162, + "eval_samples_per_second": 5.485, + "eval_steps_per_second": 1.426, + "step": 20 + }, + { + "epoch": 0.19444444444444445, + "grad_norm": 0.0525534488260746, + "learning_rate": 3.888888888888889e-05, + "loss": 0.0364, + "step": 21 + }, + { + "epoch": 0.2037037037037037, + "grad_norm": 0.041657958179712296, + "learning_rate": 4.074074074074074e-05, + "loss": 0.034, + "step": 22 + }, + { + "epoch": 0.21296296296296297, + "grad_norm": 0.04589791223406792, + "learning_rate": 4.259259259259259e-05, + "loss": 0.0317, + "step": 23 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.04220304638147354, + "learning_rate": 4.4444444444444447e-05, + "loss": 0.0339, + "step": 24 + }, + { + "epoch": 0.23148148148148148, + "grad_norm": 0.03630352392792702, + "learning_rate": 4.62962962962963e-05, + "loss": 0.029, + "step": 25 + }, + { + "epoch": 0.23148148148148148, + "eval_loss": 0.03286580368876457, + "eval_runtime": 9.1191, + "eval_samples_per_second": 5.483, + "eval_steps_per_second": 1.426, + "step": 25 + }, + { + "epoch": 0.24074074074074073, + "grad_norm": 0.04235522821545601, + "learning_rate": 4.814814814814815e-05, + "loss": 0.0326, + "step": 26 + }, + { + "epoch": 0.25, + "grad_norm": 0.04675336927175522, + "learning_rate": 5e-05, + "loss": 0.03, + "step": 27 + }, + { + "epoch": 0.25925925925925924, + "grad_norm": 0.039461418986320496, + "learning_rate": 5.185185185185185e-05, + "loss": 0.0328, + "step": 28 + }, + { + "epoch": 0.26851851851851855, + "grad_norm": 0.044042930006980896, + "learning_rate": 5.370370370370371e-05, + "loss": 0.0294, + "step": 29 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 0.044502489268779755, + "learning_rate": 5.555555555555556e-05, + "loss": 0.0311, + "step": 30 + }, + { + "epoch": 0.2777777777777778, + "eval_loss": 0.030865700915455818, + "eval_runtime": 9.1099, + "eval_samples_per_second": 5.489, + "eval_steps_per_second": 1.427, + "step": 30 + }, + { + "epoch": 0.28703703703703703, + "grad_norm": 0.04979817569255829, + "learning_rate": 5.740740740740741e-05, + "loss": 0.0292, + "step": 31 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 0.04573828727006912, + "learning_rate": 5.925925925925926e-05, + "loss": 0.0346, + "step": 32 + }, + { + "epoch": 0.3055555555555556, + "grad_norm": 0.0410350002348423, + "learning_rate": 6.111111111111112e-05, + "loss": 0.0295, + "step": 33 + }, + { + "epoch": 0.3148148148148148, + "grad_norm": 0.0416686087846756, + "learning_rate": 6.296296296296296e-05, + "loss": 0.0267, + "step": 34 + }, + { + "epoch": 0.32407407407407407, + "grad_norm": 0.042319901287555695, + "learning_rate": 6.481481481481482e-05, + "loss": 0.0295, + "step": 35 + }, + { + "epoch": 0.32407407407407407, + "eval_loss": 0.028042705729603767, + "eval_runtime": 9.1376, + "eval_samples_per_second": 5.472, + "eval_steps_per_second": 1.423, + "step": 35 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.037845220416784286, + "learning_rate": 6.666666666666667e-05, + "loss": 0.0319, + "step": 36 + }, + { + "epoch": 0.3425925925925926, + "grad_norm": 0.03568718954920769, + "learning_rate": 6.851851851851852e-05, + "loss": 0.0346, + "step": 37 + }, + { + "epoch": 0.35185185185185186, + "grad_norm": 0.037281136959791183, + "learning_rate": 7.037037037037038e-05, + "loss": 0.031, + "step": 38 + }, + { + "epoch": 0.3611111111111111, + "grad_norm": 0.03607446327805519, + "learning_rate": 7.222222222222222e-05, + "loss": 0.0335, + "step": 39 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 0.03654631972312927, + "learning_rate": 7.407407407407407e-05, + "loss": 0.0262, + "step": 40 + }, + { + "epoch": 0.37037037037037035, + "eval_loss": 0.026602942496538162, + "eval_runtime": 9.1124, + "eval_samples_per_second": 5.487, + "eval_steps_per_second": 1.427, + "step": 40 + }, + { + "epoch": 0.37962962962962965, + "grad_norm": 0.039490777999162674, + "learning_rate": 7.592592592592593e-05, + "loss": 0.0252, + "step": 41 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 0.036680739372968674, + "learning_rate": 7.777777777777778e-05, + "loss": 0.0242, + "step": 42 + }, + { + "epoch": 0.39814814814814814, + "grad_norm": 0.040739599615335464, + "learning_rate": 7.962962962962964e-05, + "loss": 0.025, + "step": 43 + }, + { + "epoch": 0.4074074074074074, + "grad_norm": 0.04679260402917862, + "learning_rate": 8.148148148148148e-05, + "loss": 0.0212, + "step": 44 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.04656214639544487, + "learning_rate": 8.333333333333334e-05, + "loss": 0.0272, + "step": 45 + }, + { + "epoch": 0.4166666666666667, + "eval_loss": 0.02608887106180191, + "eval_runtime": 9.1343, + "eval_samples_per_second": 5.474, + "eval_steps_per_second": 1.423, + "step": 45 + }, + { + "epoch": 0.42592592592592593, + "grad_norm": 0.04525485262274742, + "learning_rate": 8.518518518518518e-05, + "loss": 0.0274, + "step": 46 + }, + { + "epoch": 0.4351851851851852, + "grad_norm": 0.03210742771625519, + "learning_rate": 8.703703703703704e-05, + "loss": 0.0283, + "step": 47 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.03675089031457901, + "learning_rate": 8.888888888888889e-05, + "loss": 0.0242, + "step": 48 + }, + { + "epoch": 0.4537037037037037, + "grad_norm": 0.03396710753440857, + "learning_rate": 9.074074074074075e-05, + "loss": 0.0239, + "step": 49 + }, + { + "epoch": 0.46296296296296297, + "grad_norm": 0.02745971269905567, + "learning_rate": 9.25925925925926e-05, + "loss": 0.0224, + "step": 50 + }, + { + "epoch": 0.46296296296296297, + "eval_loss": 0.02490057609975338, + "eval_runtime": 9.1102, + "eval_samples_per_second": 5.488, + "eval_steps_per_second": 1.427, + "step": 50 + }, + { + "epoch": 0.4722222222222222, + "grad_norm": 0.04084627702832222, + "learning_rate": 9.444444444444444e-05, + "loss": 0.0252, + "step": 51 + }, + { + "epoch": 0.48148148148148145, + "grad_norm": 0.033021993935108185, + "learning_rate": 9.62962962962963e-05, + "loss": 0.0228, + "step": 52 + }, + { + "epoch": 0.49074074074074076, + "grad_norm": 0.034785784780979156, + "learning_rate": 9.814814814814815e-05, + "loss": 0.0259, + "step": 53 + }, + { + "epoch": 0.5, + "grad_norm": 0.03407888114452362, + "learning_rate": 0.0001, + "loss": 0.0239, + "step": 54 + }, + { + "epoch": 0.5092592592592593, + "grad_norm": 0.03268973529338837, + "learning_rate": 9.99989553622803e-05, + "loss": 0.0229, + "step": 55 + }, + { + "epoch": 0.5092592592592593, + "eval_loss": 0.02450372651219368, + "eval_runtime": 9.1421, + "eval_samples_per_second": 5.469, + "eval_steps_per_second": 1.422, + "step": 55 + }, + { + "epoch": 0.5185185185185185, + "grad_norm": 0.032378531992435455, + "learning_rate": 9.999582149277187e-05, + "loss": 0.0219, + "step": 56 + }, + { + "epoch": 0.5277777777777778, + "grad_norm": 0.03997437283396721, + "learning_rate": 9.999059852242507e-05, + "loss": 0.0248, + "step": 57 + }, + { + "epoch": 0.5370370370370371, + "grad_norm": 0.04024836793541908, + "learning_rate": 9.998328666948438e-05, + "loss": 0.0194, + "step": 58 + }, + { + "epoch": 0.5462962962962963, + "grad_norm": 0.03850249573588371, + "learning_rate": 9.997388623947928e-05, + "loss": 0.0251, + "step": 59 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.03326913341879845, + "learning_rate": 9.996239762521151e-05, + "loss": 0.0233, + "step": 60 + }, + { + "epoch": 0.5555555555555556, + "eval_loss": 0.023316912353038788, + "eval_runtime": 9.1353, + "eval_samples_per_second": 5.473, + "eval_steps_per_second": 1.423, + "step": 60 + }, + { + "epoch": 0.5648148148148148, + "grad_norm": 0.034179024398326874, + "learning_rate": 9.994882130673868e-05, + "loss": 0.0222, + "step": 61 + }, + { + "epoch": 0.5740740740740741, + "grad_norm": 0.031797800213098526, + "learning_rate": 9.993315785135416e-05, + "loss": 0.0272, + "step": 62 + }, + { + "epoch": 0.5833333333333334, + "grad_norm": 0.03183833882212639, + "learning_rate": 9.991540791356342e-05, + "loss": 0.0241, + "step": 63 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 0.025173548609018326, + "learning_rate": 9.989557223505661e-05, + "loss": 0.0216, + "step": 64 + }, + { + "epoch": 0.6018518518518519, + "grad_norm": 0.04935009032487869, + "learning_rate": 9.987365164467767e-05, + "loss": 0.0217, + "step": 65 + }, + { + "epoch": 0.6018518518518519, + "eval_loss": 0.02255990356206894, + "eval_runtime": 9.1207, + "eval_samples_per_second": 5.482, + "eval_steps_per_second": 1.425, + "step": 65 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 0.02904060110449791, + "learning_rate": 9.98496470583896e-05, + "loss": 0.0213, + "step": 66 + }, + { + "epoch": 0.6203703703703703, + "grad_norm": 0.046014755964279175, + "learning_rate": 9.982355947923629e-05, + "loss": 0.018, + "step": 67 + }, + { + "epoch": 0.6296296296296297, + "grad_norm": 0.0354795977473259, + "learning_rate": 9.979538999730047e-05, + "loss": 0.0199, + "step": 68 + }, + { + "epoch": 0.6388888888888888, + "grad_norm": 0.03308796137571335, + "learning_rate": 9.976513978965829e-05, + "loss": 0.0239, + "step": 69 + }, + { + "epoch": 0.6481481481481481, + "grad_norm": 0.03860899433493614, + "learning_rate": 9.973281012033007e-05, + "loss": 0.0247, + "step": 70 + }, + { + "epoch": 0.6481481481481481, + "eval_loss": 0.022898558527231216, + "eval_runtime": 9.1074, + "eval_samples_per_second": 5.49, + "eval_steps_per_second": 1.427, + "step": 70 + }, + { + "epoch": 0.6574074074074074, + "grad_norm": 0.028213078156113625, + "learning_rate": 9.969840234022749e-05, + "loss": 0.0197, + "step": 71 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.024581043049693108, + "learning_rate": 9.966191788709716e-05, + "loss": 0.0207, + "step": 72 + }, + { + "epoch": 0.6759259259259259, + "grad_norm": 0.026658454909920692, + "learning_rate": 9.962335828546048e-05, + "loss": 0.0214, + "step": 73 + }, + { + "epoch": 0.6851851851851852, + "grad_norm": 0.034941576421260834, + "learning_rate": 9.958272514655006e-05, + "loss": 0.0205, + "step": 74 + }, + { + "epoch": 0.6944444444444444, + "grad_norm": 0.03060038387775421, + "learning_rate": 9.954002016824227e-05, + "loss": 0.0193, + "step": 75 + }, + { + "epoch": 0.6944444444444444, + "eval_loss": 0.02283317781984806, + "eval_runtime": 9.1512, + "eval_samples_per_second": 5.464, + "eval_steps_per_second": 1.421, + "step": 75 + }, + { + "epoch": 0.7037037037037037, + "grad_norm": 0.0313015952706337, + "learning_rate": 9.949524513498636e-05, + "loss": 0.0206, + "step": 76 + }, + { + "epoch": 0.7129629629629629, + "grad_norm": 0.03317766636610031, + "learning_rate": 9.944840191772987e-05, + "loss": 0.0217, + "step": 77 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 0.027911782264709473, + "learning_rate": 9.939949247384046e-05, + "loss": 0.0196, + "step": 78 + }, + { + "epoch": 0.7314814814814815, + "grad_norm": 0.028807291761040688, + "learning_rate": 9.934851884702414e-05, + "loss": 0.0223, + "step": 79 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.03152855485677719, + "learning_rate": 9.929548316723982e-05, + "loss": 0.0173, + "step": 80 + }, + { + "epoch": 0.7407407407407407, + "eval_loss": 0.021335698664188385, + "eval_runtime": 9.1689, + "eval_samples_per_second": 5.453, + "eval_steps_per_second": 1.418, + "step": 80 + }, + { + "epoch": 0.75, + "grad_norm": 0.03250882402062416, + "learning_rate": 9.924038765061042e-05, + "loss": 0.0231, + "step": 81 + }, + { + "epoch": 0.7592592592592593, + "grad_norm": 0.030853938311338425, + "learning_rate": 9.918323459933005e-05, + "loss": 0.0224, + "step": 82 + }, + { + "epoch": 0.7685185185185185, + "grad_norm": 0.03431202098727226, + "learning_rate": 9.912402640156811e-05, + "loss": 0.0223, + "step": 83 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.027050426229834557, + "learning_rate": 9.906276553136923e-05, + "loss": 0.0198, + "step": 84 + }, + { + "epoch": 0.7870370370370371, + "grad_norm": 0.03224191442131996, + "learning_rate": 9.899945454855006e-05, + "loss": 0.0207, + "step": 85 + }, + { + "epoch": 0.7870370370370371, + "eval_loss": 0.020375357940793037, + "eval_runtime": 9.1362, + "eval_samples_per_second": 5.473, + "eval_steps_per_second": 1.423, + "step": 85 + }, + { + "epoch": 0.7962962962962963, + "grad_norm": 0.028706278651952744, + "learning_rate": 9.893409609859222e-05, + "loss": 0.0197, + "step": 86 + }, + { + "epoch": 0.8055555555555556, + "grad_norm": 0.02814578451216221, + "learning_rate": 9.88666929125318e-05, + "loss": 0.0199, + "step": 87 + }, + { + "epoch": 0.8148148148148148, + "grad_norm": 0.028775395825505257, + "learning_rate": 9.879724780684519e-05, + "loss": 0.0169, + "step": 88 + }, + { + "epoch": 0.8240740740740741, + "grad_norm": 0.030078047886490822, + "learning_rate": 9.872576368333151e-05, + "loss": 0.0209, + "step": 89 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.031860969960689545, + "learning_rate": 9.865224352899119e-05, + "loss": 0.0213, + "step": 90 + }, + { + "epoch": 0.8333333333333334, + "eval_loss": 0.019939038902521133, + "eval_runtime": 9.1287, + "eval_samples_per_second": 5.477, + "eval_steps_per_second": 1.424, + "step": 90 + }, + { + "epoch": 0.8425925925925926, + "grad_norm": 0.03415157273411751, + "learning_rate": 9.857669041590134e-05, + "loss": 0.021, + "step": 91 + }, + { + "epoch": 0.8518518518518519, + "grad_norm": 0.032674115151166916, + "learning_rate": 9.849910750108717e-05, + "loss": 0.0207, + "step": 92 + }, + { + "epoch": 0.8611111111111112, + "grad_norm": 0.02941475249826908, + "learning_rate": 9.84194980263903e-05, + "loss": 0.0196, + "step": 93 + }, + { + "epoch": 0.8703703703703703, + "grad_norm": 0.036115583032369614, + "learning_rate": 9.83378653183331e-05, + "loss": 0.0178, + "step": 94 + }, + { + "epoch": 0.8796296296296297, + "grad_norm": 0.03358744457364082, + "learning_rate": 9.825421278797983e-05, + "loss": 0.0199, + "step": 95 + }, + { + "epoch": 0.8796296296296297, + "eval_loss": 0.020193172618746758, + "eval_runtime": 9.1141, + "eval_samples_per_second": 5.486, + "eval_steps_per_second": 1.426, + "step": 95 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.029014358296990395, + "learning_rate": 9.816854393079403e-05, + "loss": 0.0219, + "step": 96 + }, + { + "epoch": 0.8981481481481481, + "grad_norm": 0.042931754142045975, + "learning_rate": 9.808086232649246e-05, + "loss": 0.0185, + "step": 97 + }, + { + "epoch": 0.9074074074074074, + "grad_norm": 0.029089825227856636, + "learning_rate": 9.799117163889559e-05, + "loss": 0.021, + "step": 98 + }, + { + "epoch": 0.9166666666666666, + "grad_norm": 0.03154176101088524, + "learning_rate": 9.789947561577445e-05, + "loss": 0.02, + "step": 99 + }, + { + "epoch": 0.9259259259259259, + "grad_norm": 0.027786221355199814, + "learning_rate": 9.780577808869398e-05, + "loss": 0.0188, + "step": 100 + }, + { + "epoch": 0.9259259259259259, + "eval_loss": 0.02070247381925583, + "eval_runtime": 9.1159, + "eval_samples_per_second": 5.485, + "eval_steps_per_second": 1.426, + "step": 100 + }, + { + "epoch": 0.9351851851851852, + "grad_norm": 0.030518539249897003, + "learning_rate": 9.771008297285307e-05, + "loss": 0.0218, + "step": 101 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 0.024817178025841713, + "learning_rate": 9.761239426692077e-05, + "loss": 0.0202, + "step": 102 + }, + { + "epoch": 0.9537037037037037, + "grad_norm": 0.025192229077219963, + "learning_rate": 9.751271605286941e-05, + "loss": 0.0197, + "step": 103 + }, + { + "epoch": 0.9629629629629629, + "grad_norm": 0.02538897655904293, + "learning_rate": 9.741105249580383e-05, + "loss": 0.02, + "step": 104 + }, + { + "epoch": 0.9722222222222222, + "grad_norm": 0.025440450757741928, + "learning_rate": 9.730740784378753e-05, + "loss": 0.0193, + "step": 105 + }, + { + "epoch": 0.9722222222222222, + "eval_loss": 0.020300446078181267, + "eval_runtime": 9.126, + "eval_samples_per_second": 5.479, + "eval_steps_per_second": 1.425, + "step": 105 + }, + { + "epoch": 0.9814814814814815, + "grad_norm": 0.02362542785704136, + "learning_rate": 9.7201786427665e-05, + "loss": 0.0202, + "step": 106 + }, + { + "epoch": 0.9907407407407407, + "grad_norm": 0.022390421479940414, + "learning_rate": 9.709419266088086e-05, + "loss": 0.0188, + "step": 107 + }, + { + "epoch": 1.0, + "grad_norm": 0.026193244382739067, + "learning_rate": 9.698463103929542e-05, + "loss": 0.022, + "step": 108 + }, + { + "epoch": 1.0092592592592593, + "grad_norm": 0.028253022581338882, + "learning_rate": 9.687310614099675e-05, + "loss": 0.0159, + "step": 109 + }, + { + "epoch": 1.0185185185185186, + "grad_norm": 0.02241157554090023, + "learning_rate": 9.67596226261095e-05, + "loss": 0.016, + "step": 110 + }, + { + "epoch": 1.0185185185185186, + "eval_loss": 0.01969613879919052, + "eval_runtime": 9.1053, + "eval_samples_per_second": 5.491, + "eval_steps_per_second": 1.428, + "step": 110 + }, + { + "epoch": 1.0277777777777777, + "grad_norm": 0.027405373752117157, + "learning_rate": 9.664418523660004e-05, + "loss": 0.014, + "step": 111 + }, + { + "epoch": 1.037037037037037, + "grad_norm": 0.032646384090185165, + "learning_rate": 9.652679879607843e-05, + "loss": 0.0172, + "step": 112 + }, + { + "epoch": 1.0462962962962963, + "grad_norm": 0.02552163228392601, + "learning_rate": 9.640746820959684e-05, + "loss": 0.014, + "step": 113 + }, + { + "epoch": 1.0555555555555556, + "grad_norm": 0.022228199988603592, + "learning_rate": 9.628619846344454e-05, + "loss": 0.0172, + "step": 114 + }, + { + "epoch": 1.0648148148148149, + "grad_norm": 0.028009962290525436, + "learning_rate": 9.616299462493952e-05, + "loss": 0.0166, + "step": 115 + }, + { + "epoch": 1.0648148148148149, + "eval_loss": 0.019864549860358238, + "eval_runtime": 9.122, + "eval_samples_per_second": 5.481, + "eval_steps_per_second": 1.425, + "step": 115 + }, + { + "epoch": 1.074074074074074, + "grad_norm": 0.025030331686139107, + "learning_rate": 9.603786184221693e-05, + "loss": 0.0195, + "step": 116 + }, + { + "epoch": 1.0833333333333333, + "grad_norm": 0.030586065724492073, + "learning_rate": 9.591080534401371e-05, + "loss": 0.015, + "step": 117 + }, + { + "epoch": 1.0925925925925926, + "grad_norm": 0.02425476722419262, + "learning_rate": 9.57818304394503e-05, + "loss": 0.0183, + "step": 118 + }, + { + "epoch": 1.1018518518518519, + "grad_norm": 0.03203345090150833, + "learning_rate": 9.565094251780871e-05, + "loss": 0.0172, + "step": 119 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.03028124012053013, + "learning_rate": 9.551814704830734e-05, + "loss": 0.0189, + "step": 120 + }, + { + "epoch": 1.1111111111111112, + "eval_loss": 0.019504941999912262, + "eval_runtime": 9.1171, + "eval_samples_per_second": 5.484, + "eval_steps_per_second": 1.426, + "step": 120 + }, + { + "epoch": 1.1203703703703705, + "grad_norm": 0.026934562250971794, + "learning_rate": 9.538344957987244e-05, + "loss": 0.0132, + "step": 121 + }, + { + "epoch": 1.1296296296296295, + "grad_norm": 0.02392655238509178, + "learning_rate": 9.524685574090627e-05, + "loss": 0.0184, + "step": 122 + }, + { + "epoch": 1.1388888888888888, + "grad_norm": 0.02336742728948593, + "learning_rate": 9.51083712390519e-05, + "loss": 0.0155, + "step": 123 + }, + { + "epoch": 1.1481481481481481, + "grad_norm": 0.025306498631834984, + "learning_rate": 9.496800186095466e-05, + "loss": 0.0156, + "step": 124 + }, + { + "epoch": 1.1574074074074074, + "grad_norm": 0.02764940820634365, + "learning_rate": 9.482575347202047e-05, + "loss": 0.0211, + "step": 125 + }, + { + "epoch": 1.1574074074074074, + "eval_loss": 0.018362991511821747, + "eval_runtime": 9.1297, + "eval_samples_per_second": 5.477, + "eval_steps_per_second": 1.424, + "step": 125 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.02213912270963192, + "learning_rate": 9.468163201617062e-05, + "loss": 0.0178, + "step": 126 + }, + { + "epoch": 1.175925925925926, + "grad_norm": 0.03320689871907234, + "learning_rate": 9.453564351559348e-05, + "loss": 0.0148, + "step": 127 + }, + { + "epoch": 1.1851851851851851, + "grad_norm": 0.023370925337076187, + "learning_rate": 9.438779407049281e-05, + "loss": 0.0174, + "step": 128 + }, + { + "epoch": 1.1944444444444444, + "grad_norm": 0.02848099358379841, + "learning_rate": 9.423808985883289e-05, + "loss": 0.0174, + "step": 129 + }, + { + "epoch": 1.2037037037037037, + "grad_norm": 0.02608056552708149, + "learning_rate": 9.40865371360804e-05, + "loss": 0.0171, + "step": 130 + }, + { + "epoch": 1.2037037037037037, + "eval_loss": 0.018851976841688156, + "eval_runtime": 9.1046, + "eval_samples_per_second": 5.492, + "eval_steps_per_second": 1.428, + "step": 130 + }, + { + "epoch": 1.212962962962963, + "grad_norm": 0.02152630314230919, + "learning_rate": 9.393314223494296e-05, + "loss": 0.0172, + "step": 131 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.02550230175256729, + "learning_rate": 9.377791156510455e-05, + "loss": 0.016, + "step": 132 + }, + { + "epoch": 1.2314814814814814, + "grad_norm": 0.025004474446177483, + "learning_rate": 9.362085161295769e-05, + "loss": 0.0163, + "step": 133 + }, + { + "epoch": 1.2407407407407407, + "grad_norm": 0.026416007429361343, + "learning_rate": 9.346196894133239e-05, + "loss": 0.0165, + "step": 134 + }, + { + "epoch": 1.25, + "grad_norm": 0.029432326555252075, + "learning_rate": 9.330127018922194e-05, + "loss": 0.0191, + "step": 135 + }, + { + "epoch": 1.25, + "eval_loss": 0.019194327294826508, + "eval_runtime": 9.1131, + "eval_samples_per_second": 5.487, + "eval_steps_per_second": 1.427, + "step": 135 + }, + { + "epoch": 1.2592592592592593, + "grad_norm": 0.03440408781170845, + "learning_rate": 9.313876207150543e-05, + "loss": 0.0165, + "step": 136 + }, + { + "epoch": 1.2685185185185186, + "grad_norm": 0.025614989921450615, + "learning_rate": 9.297445137866727e-05, + "loss": 0.0162, + "step": 137 + }, + { + "epoch": 1.2777777777777777, + "grad_norm": 0.02456337958574295, + "learning_rate": 9.280834497651334e-05, + "loss": 0.0192, + "step": 138 + }, + { + "epoch": 1.287037037037037, + "grad_norm": 0.051101330667734146, + "learning_rate": 9.264044980588416e-05, + "loss": 0.015, + "step": 139 + }, + { + "epoch": 1.2962962962962963, + "grad_norm": 0.03369716554880142, + "learning_rate": 9.247077288236488e-05, + "loss": 0.0184, + "step": 140 + }, + { + "epoch": 1.2962962962962963, + "eval_loss": 0.018648317083716393, + "eval_runtime": 9.1079, + "eval_samples_per_second": 5.49, + "eval_steps_per_second": 1.427, + "step": 140 + }, + { + "epoch": 1.3055555555555556, + "grad_norm": 0.024168213829398155, + "learning_rate": 9.229932129599205e-05, + "loss": 0.0166, + "step": 141 + }, + { + "epoch": 1.3148148148148149, + "grad_norm": 0.027960045263171196, + "learning_rate": 9.212610221095748e-05, + "loss": 0.0157, + "step": 142 + }, + { + "epoch": 1.324074074074074, + "grad_norm": 0.023985836654901505, + "learning_rate": 9.195112286530873e-05, + "loss": 0.0178, + "step": 143 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.026084545999765396, + "learning_rate": 9.177439057064683e-05, + "loss": 0.0164, + "step": 144 + }, + { + "epoch": 1.3425925925925926, + "grad_norm": 0.022582337260246277, + "learning_rate": 9.159591271182058e-05, + "loss": 0.0162, + "step": 145 + }, + { + "epoch": 1.3425925925925926, + "eval_loss": 0.018656810745596886, + "eval_runtime": 9.1149, + "eval_samples_per_second": 5.485, + "eval_steps_per_second": 1.426, + "step": 145 + }, + { + "epoch": 1.3518518518518519, + "grad_norm": 0.030290907248854637, + "learning_rate": 9.141569674661817e-05, + "loss": 0.021, + "step": 146 + }, + { + "epoch": 1.3611111111111112, + "grad_norm": 0.026109322905540466, + "learning_rate": 9.123375020545535e-05, + "loss": 0.0162, + "step": 147 + }, + { + "epoch": 1.3703703703703702, + "grad_norm": 0.02652176469564438, + "learning_rate": 9.105008069106093e-05, + "loss": 0.0169, + "step": 148 + }, + { + "epoch": 1.3796296296296298, + "grad_norm": 0.024147020652890205, + "learning_rate": 9.086469587815904e-05, + "loss": 0.0162, + "step": 149 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.021294649690389633, + "learning_rate": 9.067760351314838e-05, + "loss": 0.0165, + "step": 150 + }, + { + "epoch": 1.3888888888888888, + "eval_loss": 0.018213987350463867, + "eval_runtime": 9.1247, + "eval_samples_per_second": 5.48, + "eval_steps_per_second": 1.425, + "step": 150 + }, + { + "epoch": 1.3981481481481481, + "grad_norm": 0.02462903782725334, + "learning_rate": 9.048881141377863e-05, + "loss": 0.0204, + "step": 151 + }, + { + "epoch": 1.4074074074074074, + "grad_norm": 0.024652326479554176, + "learning_rate": 9.029832746882371e-05, + "loss": 0.0164, + "step": 152 + }, + { + "epoch": 1.4166666666666667, + "grad_norm": 0.026834659278392792, + "learning_rate": 9.01061596377522e-05, + "loss": 0.018, + "step": 153 + }, + { + "epoch": 1.425925925925926, + "grad_norm": 0.02342064492404461, + "learning_rate": 8.991231595039465e-05, + "loss": 0.0156, + "step": 154 + }, + { + "epoch": 1.4351851851851851, + "grad_norm": 0.026441222056746483, + "learning_rate": 8.97168045066082e-05, + "loss": 0.0157, + "step": 155 + }, + { + "epoch": 1.4351851851851851, + "eval_loss": 0.01855114847421646, + "eval_runtime": 9.124, + "eval_samples_per_second": 5.48, + "eval_steps_per_second": 1.425, + "step": 155 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.01796615496277809, + "learning_rate": 8.951963347593797e-05, + "loss": 0.0165, + "step": 156 + }, + { + "epoch": 1.4537037037037037, + "grad_norm": 0.02256671153008938, + "learning_rate": 8.932081109727582e-05, + "loss": 0.0201, + "step": 157 + }, + { + "epoch": 1.462962962962963, + "grad_norm": 0.028528334572911263, + "learning_rate": 8.912034567851599e-05, + "loss": 0.0182, + "step": 158 + }, + { + "epoch": 1.4722222222222223, + "grad_norm": 0.029104968532919884, + "learning_rate": 8.891824559620801e-05, + "loss": 0.0153, + "step": 159 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 0.02003669925034046, + "learning_rate": 8.871451929520663e-05, + "loss": 0.0159, + "step": 160 + }, + { + "epoch": 1.4814814814814814, + "eval_loss": 0.01888095587491989, + "eval_runtime": 9.1172, + "eval_samples_per_second": 5.484, + "eval_steps_per_second": 1.426, + "step": 160 + }, + { + "epoch": 1.4907407407407407, + "grad_norm": 0.019447356462478638, + "learning_rate": 8.850917528831899e-05, + "loss": 0.0163, + "step": 161 + }, + { + "epoch": 1.5, + "grad_norm": 0.03438901901245117, + "learning_rate": 8.83022221559489e-05, + "loss": 0.0125, + "step": 162 + }, + { + "epoch": 1.5092592592592593, + "grad_norm": 0.026535626500844955, + "learning_rate": 8.809366854573831e-05, + "loss": 0.0175, + "step": 163 + }, + { + "epoch": 1.5185185185185186, + "grad_norm": 0.029025647789239883, + "learning_rate": 8.78835231722059e-05, + "loss": 0.0164, + "step": 164 + }, + { + "epoch": 1.5277777777777777, + "grad_norm": 0.025528129190206528, + "learning_rate": 8.767179481638303e-05, + "loss": 0.0174, + "step": 165 + }, + { + "epoch": 1.5277777777777777, + "eval_loss": 0.018690049648284912, + "eval_runtime": 9.1481, + "eval_samples_per_second": 5.466, + "eval_steps_per_second": 1.421, + "step": 165 + }, + { + "epoch": 1.5370370370370372, + "grad_norm": 0.025675086304545403, + "learning_rate": 8.745849232544681e-05, + "loss": 0.0179, + "step": 166 + }, + { + "epoch": 1.5462962962962963, + "grad_norm": 0.027451254427433014, + "learning_rate": 8.724362461235029e-05, + "loss": 0.0169, + "step": 167 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.026652028784155846, + "learning_rate": 8.702720065545024e-05, + "loss": 0.0168, + "step": 168 + }, + { + "epoch": 1.5648148148148149, + "grad_norm": 0.030202018097043037, + "learning_rate": 8.680922949813178e-05, + "loss": 0.0162, + "step": 169 + }, + { + "epoch": 1.574074074074074, + "grad_norm": 0.027389824390411377, + "learning_rate": 8.658972024843062e-05, + "loss": 0.0184, + "step": 170 + }, + { + "epoch": 1.574074074074074, + "eval_loss": 0.018272995948791504, + "eval_runtime": 9.1448, + "eval_samples_per_second": 5.468, + "eval_steps_per_second": 1.422, + "step": 170 + }, + { + "epoch": 1.5833333333333335, + "grad_norm": 0.025648167356848717, + "learning_rate": 8.636868207865244e-05, + "loss": 0.0152, + "step": 171 + }, + { + "epoch": 1.5925925925925926, + "grad_norm": 0.02472120150923729, + "learning_rate": 8.614612422498964e-05, + "loss": 0.0153, + "step": 172 + }, + { + "epoch": 1.6018518518518519, + "grad_norm": 0.020042769610881805, + "learning_rate": 8.592205598713539e-05, + "loss": 0.017, + "step": 173 + }, + { + "epoch": 1.6111111111111112, + "grad_norm": 0.029423648491501808, + "learning_rate": 8.569648672789497e-05, + "loss": 0.0158, + "step": 174 + }, + { + "epoch": 1.6203703703703702, + "grad_norm": 0.02159775421023369, + "learning_rate": 8.546942587279465e-05, + "loss": 0.0165, + "step": 175 + }, + { + "epoch": 1.6203703703703702, + "eval_loss": 0.018273252993822098, + "eval_runtime": 9.118, + "eval_samples_per_second": 5.484, + "eval_steps_per_second": 1.426, + "step": 175 + }, + { + "epoch": 1.6296296296296298, + "grad_norm": 0.024837305769324303, + "learning_rate": 8.524088290968781e-05, + "loss": 0.0187, + "step": 176 + }, + { + "epoch": 1.6388888888888888, + "grad_norm": 0.02383432537317276, + "learning_rate": 8.501086738835843e-05, + "loss": 0.0181, + "step": 177 + }, + { + "epoch": 1.6481481481481481, + "grad_norm": 0.025743911042809486, + "learning_rate": 8.47793889201221e-05, + "loss": 0.0171, + "step": 178 + }, + { + "epoch": 1.6574074074074074, + "grad_norm": 0.023100929334759712, + "learning_rate": 8.45464571774244e-05, + "loss": 0.021, + "step": 179 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.02667200192809105, + "learning_rate": 8.43120818934367e-05, + "loss": 0.0173, + "step": 180 + }, + { + "epoch": 1.6666666666666665, + "eval_loss": 0.01778573729097843, + "eval_runtime": 9.1324, + "eval_samples_per_second": 5.475, + "eval_steps_per_second": 1.424, + "step": 180 + }, + { + "epoch": 1.675925925925926, + "grad_norm": 0.02880384773015976, + "learning_rate": 8.407627286164948e-05, + "loss": 0.015, + "step": 181 + }, + { + "epoch": 1.6851851851851851, + "grad_norm": 0.030301645398139954, + "learning_rate": 8.383903993546311e-05, + "loss": 0.0157, + "step": 182 + }, + { + "epoch": 1.6944444444444444, + "grad_norm": 0.021445374935865402, + "learning_rate": 8.360039302777612e-05, + "loss": 0.0181, + "step": 183 + }, + { + "epoch": 1.7037037037037037, + "grad_norm": 0.023577649146318436, + "learning_rate": 8.336034211057098e-05, + "loss": 0.0153, + "step": 184 + }, + { + "epoch": 1.7129629629629628, + "grad_norm": 0.02492811530828476, + "learning_rate": 8.31188972144974e-05, + "loss": 0.0131, + "step": 185 + }, + { + "epoch": 1.7129629629629628, + "eval_loss": 0.017187727615237236, + "eval_runtime": 9.1252, + "eval_samples_per_second": 5.479, + "eval_steps_per_second": 1.425, + "step": 185 + }, + { + "epoch": 1.7222222222222223, + "grad_norm": 0.023155970498919487, + "learning_rate": 8.28760684284532e-05, + "loss": 0.0162, + "step": 186 + }, + { + "epoch": 1.7314814814814814, + "grad_norm": 0.02491271123290062, + "learning_rate": 8.263186589916273e-05, + "loss": 0.0137, + "step": 187 + }, + { + "epoch": 1.7407407407407407, + "grad_norm": 0.02165275253355503, + "learning_rate": 8.238629983075294e-05, + "loss": 0.0143, + "step": 188 + }, + { + "epoch": 1.75, + "grad_norm": 0.024284129962325096, + "learning_rate": 8.213938048432697e-05, + "loss": 0.0144, + "step": 189 + }, + { + "epoch": 1.7592592592592593, + "grad_norm": 0.027395077049732208, + "learning_rate": 8.18911181775353e-05, + "loss": 0.0132, + "step": 190 + }, + { + "epoch": 1.7592592592592593, + "eval_loss": 0.018012873828411102, + "eval_runtime": 9.1149, + "eval_samples_per_second": 5.486, + "eval_steps_per_second": 1.426, + "step": 190 + }, + { + "epoch": 1.7685185185185186, + "grad_norm": 0.02639261819422245, + "learning_rate": 8.164152328414476e-05, + "loss": 0.0156, + "step": 191 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.02319464646279812, + "learning_rate": 8.139060623360493e-05, + "loss": 0.0121, + "step": 192 + }, + { + "epoch": 1.7870370370370372, + "grad_norm": 0.020444169640541077, + "learning_rate": 8.113837751061246e-05, + "loss": 0.0156, + "step": 193 + }, + { + "epoch": 1.7962962962962963, + "grad_norm": 0.03843529522418976, + "learning_rate": 8.088484765467286e-05, + "loss": 0.0202, + "step": 194 + }, + { + "epoch": 1.8055555555555556, + "grad_norm": 0.03014414757490158, + "learning_rate": 8.063002725966015e-05, + "loss": 0.0157, + "step": 195 + }, + { + "epoch": 1.8055555555555556, + "eval_loss": 0.018071575090289116, + "eval_runtime": 9.1428, + "eval_samples_per_second": 5.469, + "eval_steps_per_second": 1.422, + "step": 195 + }, + { + "epoch": 1.8148148148148149, + "grad_norm": 0.028225911781191826, + "learning_rate": 8.037392697337418e-05, + "loss": 0.0152, + "step": 196 + }, + { + "epoch": 1.824074074074074, + "grad_norm": 0.022350864484906197, + "learning_rate": 8.011655749709575e-05, + "loss": 0.0147, + "step": 197 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.023073699325323105, + "learning_rate": 7.985792958513931e-05, + "loss": 0.0142, + "step": 198 + }, + { + "epoch": 1.8425925925925926, + "grad_norm": 0.027160046622157097, + "learning_rate": 7.95980540444038e-05, + "loss": 0.0181, + "step": 199 + }, + { + "epoch": 1.8518518518518519, + "grad_norm": 0.02501911297440529, + "learning_rate": 7.93369417339209e-05, + "loss": 0.0154, + "step": 200 + }, + { + "epoch": 1.8518518518518519, + "eval_loss": 0.01711750030517578, + "eval_runtime": 9.1469, + "eval_samples_per_second": 5.466, + "eval_steps_per_second": 1.421, + "step": 200 + }, + { + "epoch": 1.8611111111111112, + "grad_norm": 0.02209513448178768, + "learning_rate": 7.907460356440133e-05, + "loss": 0.0156, + "step": 201 + }, + { + "epoch": 1.8703703703703702, + "grad_norm": 0.022372853010892868, + "learning_rate": 7.881105049777901e-05, + "loss": 0.0182, + "step": 202 + }, + { + "epoch": 1.8796296296296298, + "grad_norm": 0.02874351665377617, + "learning_rate": 7.854629354675291e-05, + "loss": 0.0145, + "step": 203 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.025754928588867188, + "learning_rate": 7.828034377432693e-05, + "loss": 0.0161, + "step": 204 + }, + { + "epoch": 1.8981481481481481, + "grad_norm": 0.023868247866630554, + "learning_rate": 7.801321229334764e-05, + "loss": 0.0139, + "step": 205 + }, + { + "epoch": 1.8981481481481481, + "eval_loss": 0.01687374897301197, + "eval_runtime": 9.1148, + "eval_samples_per_second": 5.486, + "eval_steps_per_second": 1.426, + "step": 205 + }, + { + "epoch": 1.9074074074074074, + "grad_norm": 0.02167942002415657, + "learning_rate": 7.774491026603985e-05, + "loss": 0.0172, + "step": 206 + }, + { + "epoch": 1.9166666666666665, + "grad_norm": 0.028955647721886635, + "learning_rate": 7.74754489035403e-05, + "loss": 0.0182, + "step": 207 + }, + { + "epoch": 1.925925925925926, + "grad_norm": 0.023490311577916145, + "learning_rate": 7.720483946542914e-05, + "loss": 0.0176, + "step": 208 + }, + { + "epoch": 1.9351851851851851, + "grad_norm": 0.02635806053876877, + "learning_rate": 7.69330932592594e-05, + "loss": 0.0149, + "step": 209 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.02554040215909481, + "learning_rate": 7.666022164008457e-05, + "loss": 0.0169, + "step": 210 + }, + { + "epoch": 1.9444444444444444, + "eval_loss": 0.016974864527583122, + "eval_runtime": 9.1008, + "eval_samples_per_second": 5.494, + "eval_steps_per_second": 1.428, + "step": 210 + }, + { + "epoch": 1.9537037037037037, + "grad_norm": 0.02924305759370327, + "learning_rate": 7.63862360099841e-05, + "loss": 0.0148, + "step": 211 + }, + { + "epoch": 1.9629629629629628, + "grad_norm": 0.020948631688952446, + "learning_rate": 7.611114781758692e-05, + "loss": 0.0158, + "step": 212 + }, + { + "epoch": 1.9722222222222223, + "grad_norm": 0.021703558042645454, + "learning_rate": 7.583496855759316e-05, + "loss": 0.0172, + "step": 213 + }, + { + "epoch": 1.9814814814814814, + "grad_norm": 0.022922605276107788, + "learning_rate": 7.555770977029367e-05, + "loss": 0.0149, + "step": 214 + }, + { + "epoch": 1.9907407407407407, + "grad_norm": 0.025769095867872238, + "learning_rate": 7.527938304108795e-05, + "loss": 0.0158, + "step": 215 + }, + { + "epoch": 1.9907407407407407, + "eval_loss": 0.017042405903339386, + "eval_runtime": 9.1168, + "eval_samples_per_second": 5.484, + "eval_steps_per_second": 1.426, + "step": 215 + }, + { + "epoch": 2.0, + "grad_norm": 0.03371057286858559, + "learning_rate": 7.500000000000001e-05, + "loss": 0.0126, + "step": 216 + }, + { + "epoch": 2.009259259259259, + "grad_norm": 0.01711084321141243, + "learning_rate": 7.471957232119234e-05, + "loss": 0.0142, + "step": 217 + }, + { + "epoch": 2.0185185185185186, + "grad_norm": 0.023618614301085472, + "learning_rate": 7.443811172247821e-05, + "loss": 0.0151, + "step": 218 + }, + { + "epoch": 2.0277777777777777, + "grad_norm": 0.02181304432451725, + "learning_rate": 7.415562996483192e-05, + "loss": 0.0132, + "step": 219 + }, + { + "epoch": 2.037037037037037, + "grad_norm": 0.020521776750683784, + "learning_rate": 7.387213885189746e-05, + "loss": 0.0139, + "step": 220 + }, + { + "epoch": 2.037037037037037, + "eval_loss": 0.01702064275741577, + "eval_runtime": 9.1369, + "eval_samples_per_second": 5.472, + "eval_steps_per_second": 1.423, + "step": 220 + }, + { + "epoch": 2.0462962962962963, + "grad_norm": 0.022209780290722847, + "learning_rate": 7.358765022949519e-05, + "loss": 0.0152, + "step": 221 + }, + { + "epoch": 2.0555555555555554, + "grad_norm": 0.02240665629506111, + "learning_rate": 7.330217598512695e-05, + "loss": 0.0136, + "step": 222 + }, + { + "epoch": 2.064814814814815, + "grad_norm": 0.024021176621317863, + "learning_rate": 7.30157280474793e-05, + "loss": 0.0134, + "step": 223 + }, + { + "epoch": 2.074074074074074, + "grad_norm": 0.022297382354736328, + "learning_rate": 7.272831838592503e-05, + "loss": 0.0158, + "step": 224 + }, + { + "epoch": 2.0833333333333335, + "grad_norm": 0.023189576342701912, + "learning_rate": 7.243995901002312e-05, + "loss": 0.0146, + "step": 225 + }, + { + "epoch": 2.0833333333333335, + "eval_loss": 0.017011733725667, + "eval_runtime": 9.1385, + "eval_samples_per_second": 5.471, + "eval_steps_per_second": 1.423, + "step": 225 + }, + { + "epoch": 2.0925925925925926, + "grad_norm": 0.02641259878873825, + "learning_rate": 7.215066196901676e-05, + "loss": 0.0149, + "step": 226 + }, + { + "epoch": 2.1018518518518516, + "grad_norm": 0.02105395309627056, + "learning_rate": 7.186043935133005e-05, + "loss": 0.0105, + "step": 227 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 0.020818866789340973, + "learning_rate": 7.156930328406268e-05, + "loss": 0.0144, + "step": 228 + }, + { + "epoch": 2.1203703703703702, + "grad_norm": 0.028699271380901337, + "learning_rate": 7.127726593248337e-05, + "loss": 0.0134, + "step": 229 + }, + { + "epoch": 2.1296296296296298, + "grad_norm": 0.025844816118478775, + "learning_rate": 7.098433949952146e-05, + "loss": 0.0115, + "step": 230 + }, + { + "epoch": 2.1296296296296298, + "eval_loss": 0.017404422163963318, + "eval_runtime": 9.1138, + "eval_samples_per_second": 5.486, + "eval_steps_per_second": 1.426, + "step": 230 + }, + { + "epoch": 2.138888888888889, + "grad_norm": 0.02628181129693985, + "learning_rate": 7.069053622525696e-05, + "loss": 0.0135, + "step": 231 + }, + { + "epoch": 2.148148148148148, + "grad_norm": 0.03826741501688957, + "learning_rate": 7.039586838640919e-05, + "loss": 0.013, + "step": 232 + }, + { + "epoch": 2.1574074074074074, + "grad_norm": 0.02549687772989273, + "learning_rate": 7.01003482958237e-05, + "loss": 0.0112, + "step": 233 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.02850032038986683, + "learning_rate": 6.980398830195785e-05, + "loss": 0.0114, + "step": 234 + }, + { + "epoch": 2.175925925925926, + "grad_norm": 0.028789905831217766, + "learning_rate": 6.950680078836474e-05, + "loss": 0.0138, + "step": 235 + }, + { + "epoch": 2.175925925925926, + "eval_loss": 0.016838619485497475, + "eval_runtime": 9.1141, + "eval_samples_per_second": 5.486, + "eval_steps_per_second": 1.426, + "step": 235 + }, + { + "epoch": 2.185185185185185, + "grad_norm": 0.024276968091726303, + "learning_rate": 6.920879817317589e-05, + "loss": 0.0156, + "step": 236 + }, + { + "epoch": 2.1944444444444446, + "grad_norm": 0.02652347832918167, + "learning_rate": 6.890999290858214e-05, + "loss": 0.0111, + "step": 237 + }, + { + "epoch": 2.2037037037037037, + "grad_norm": 0.03363705053925514, + "learning_rate": 6.861039748031351e-05, + "loss": 0.0155, + "step": 238 + }, + { + "epoch": 2.212962962962963, + "grad_norm": 0.025364842265844345, + "learning_rate": 6.83100244071174e-05, + "loss": 0.0127, + "step": 239 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.024912815541028976, + "learning_rate": 6.800888624023553e-05, + "loss": 0.0138, + "step": 240 + }, + { + "epoch": 2.2222222222222223, + "eval_loss": 0.017057882621884346, + "eval_runtime": 9.1505, + "eval_samples_per_second": 5.464, + "eval_steps_per_second": 1.421, + "step": 240 + }, + { + "epoch": 2.2314814814814814, + "grad_norm": 0.031296826899051666, + "learning_rate": 6.770699556287939e-05, + "loss": 0.0138, + "step": 241 + }, + { + "epoch": 2.240740740740741, + "grad_norm": 0.03207860141992569, + "learning_rate": 6.740436498970452e-05, + "loss": 0.0128, + "step": 242 + }, + { + "epoch": 2.25, + "grad_norm": 0.027626443654298782, + "learning_rate": 6.710100716628344e-05, + "loss": 0.0142, + "step": 243 + }, + { + "epoch": 2.259259259259259, + "grad_norm": 0.025963863357901573, + "learning_rate": 6.679693476857711e-05, + "loss": 0.0137, + "step": 244 + }, + { + "epoch": 2.2685185185185186, + "grad_norm": 0.022552739828824997, + "learning_rate": 6.649216050240539e-05, + "loss": 0.0134, + "step": 245 + }, + { + "epoch": 2.2685185185185186, + "eval_loss": 0.016679909080266953, + "eval_runtime": 9.1095, + "eval_samples_per_second": 5.489, + "eval_steps_per_second": 1.427, + "step": 245 + }, + { + "epoch": 2.2777777777777777, + "grad_norm": 0.0247825738042593, + "learning_rate": 6.618669710291606e-05, + "loss": 0.0116, + "step": 246 + }, + { + "epoch": 2.287037037037037, + "grad_norm": 0.021808508783578873, + "learning_rate": 6.588055733405266e-05, + "loss": 0.014, + "step": 247 + }, + { + "epoch": 2.2962962962962963, + "grad_norm": 0.025087367743253708, + "learning_rate": 6.557375398802123e-05, + "loss": 0.0167, + "step": 248 + }, + { + "epoch": 2.3055555555555554, + "grad_norm": 0.022722622379660606, + "learning_rate": 6.526629988475567e-05, + "loss": 0.013, + "step": 249 + }, + { + "epoch": 2.314814814814815, + "grad_norm": 0.023495636880397797, + "learning_rate": 6.495820787138209e-05, + "loss": 0.0167, + "step": 250 + }, + { + "epoch": 2.314814814814815, + "eval_loss": 0.016377143561840057, + "eval_runtime": 9.1133, + "eval_samples_per_second": 5.486, + "eval_steps_per_second": 1.426, + "step": 250 + }, + { + "epoch": 2.324074074074074, + "grad_norm": 0.021211953833699226, + "learning_rate": 6.464949082168204e-05, + "loss": 0.0125, + "step": 251 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.022748148068785667, + "learning_rate": 6.434016163555452e-05, + "loss": 0.0121, + "step": 252 + }, + { + "epoch": 2.3425925925925926, + "grad_norm": 0.021960506215691566, + "learning_rate": 6.403023323847695e-05, + "loss": 0.0159, + "step": 253 + }, + { + "epoch": 2.351851851851852, + "grad_norm": 0.02572719193994999, + "learning_rate": 6.371971858096508e-05, + "loss": 0.0137, + "step": 254 + }, + { + "epoch": 2.361111111111111, + "grad_norm": 0.027611717581748962, + "learning_rate": 6.340863063803188e-05, + "loss": 0.0123, + "step": 255 + }, + { + "epoch": 2.361111111111111, + "eval_loss": 0.016414109617471695, + "eval_runtime": 9.1093, + "eval_samples_per_second": 5.489, + "eval_steps_per_second": 1.427, + "step": 255 + }, + { + "epoch": 2.3703703703703702, + "grad_norm": 0.026147907599806786, + "learning_rate": 6.30969824086453e-05, + "loss": 0.012, + "step": 256 + }, + { + "epoch": 2.3796296296296298, + "grad_norm": 0.026667073369026184, + "learning_rate": 6.27847869151852e-05, + "loss": 0.0127, + "step": 257 + }, + { + "epoch": 2.388888888888889, + "grad_norm": 0.023840012028813362, + "learning_rate": 6.247205720289907e-05, + "loss": 0.0141, + "step": 258 + }, + { + "epoch": 2.398148148148148, + "grad_norm": 0.028697500005364418, + "learning_rate": 6.215880633935708e-05, + "loss": 0.0135, + "step": 259 + }, + { + "epoch": 2.4074074074074074, + "grad_norm": 0.029124466702342033, + "learning_rate": 6.184504741390596e-05, + "loss": 0.0139, + "step": 260 + }, + { + "epoch": 2.4074074074074074, + "eval_loss": 0.016279693692922592, + "eval_runtime": 9.1162, + "eval_samples_per_second": 5.485, + "eval_steps_per_second": 1.426, + "step": 260 + }, + { + "epoch": 2.4166666666666665, + "grad_norm": 0.020265506580471992, + "learning_rate": 6.153079353712201e-05, + "loss": 0.0129, + "step": 261 + }, + { + "epoch": 2.425925925925926, + "grad_norm": 0.020486822351813316, + "learning_rate": 6.121605784026339e-05, + "loss": 0.0114, + "step": 262 + }, + { + "epoch": 2.435185185185185, + "grad_norm": 0.02432914823293686, + "learning_rate": 6.09008534747213e-05, + "loss": 0.0138, + "step": 263 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.027614833787083626, + "learning_rate": 6.058519361147055e-05, + "loss": 0.0118, + "step": 264 + }, + { + "epoch": 2.4537037037037037, + "grad_norm": 0.03493235632777214, + "learning_rate": 6.02690914405191e-05, + "loss": 0.0125, + "step": 265 + }, + { + "epoch": 2.4537037037037037, + "eval_loss": 0.016143780201673508, + "eval_runtime": 9.2054, + "eval_samples_per_second": 5.432, + "eval_steps_per_second": 1.412, + "step": 265 + }, + { + "epoch": 2.462962962962963, + "grad_norm": 0.024250265210866928, + "learning_rate": 5.995256017035703e-05, + "loss": 0.0139, + "step": 266 + }, + { + "epoch": 2.4722222222222223, + "grad_norm": 0.022808292880654335, + "learning_rate": 5.963561302740449e-05, + "loss": 0.0162, + "step": 267 + }, + { + "epoch": 2.4814814814814814, + "grad_norm": 0.03109206259250641, + "learning_rate": 5.9318263255459116e-05, + "loss": 0.0123, + "step": 268 + }, + { + "epoch": 2.490740740740741, + "grad_norm": 0.02985144406557083, + "learning_rate": 5.900052411514257e-05, + "loss": 0.015, + "step": 269 + }, + { + "epoch": 2.5, + "grad_norm": 0.024866314604878426, + "learning_rate": 5.868240888334653e-05, + "loss": 0.0126, + "step": 270 + }, + { + "epoch": 2.5, + "eval_loss": 0.016046511009335518, + "eval_runtime": 9.1128, + "eval_samples_per_second": 5.487, + "eval_steps_per_second": 1.427, + "step": 270 + }, + { + "epoch": 2.5092592592592595, + "grad_norm": 0.0215854924172163, + "learning_rate": 5.836393085267776e-05, + "loss": 0.0133, + "step": 271 + }, + { + "epoch": 2.5185185185185186, + "grad_norm": 0.02321489341557026, + "learning_rate": 5.804510333090287e-05, + "loss": 0.0175, + "step": 272 + }, + { + "epoch": 2.5277777777777777, + "grad_norm": 0.024908283725380898, + "learning_rate": 5.772593964039203e-05, + "loss": 0.0116, + "step": 273 + }, + { + "epoch": 2.537037037037037, + "grad_norm": 0.02571980282664299, + "learning_rate": 5.740645311756245e-05, + "loss": 0.0125, + "step": 274 + }, + { + "epoch": 2.5462962962962963, + "grad_norm": 0.022897284477949142, + "learning_rate": 5.708665711232103e-05, + "loss": 0.0138, + "step": 275 + }, + { + "epoch": 2.5462962962962963, + "eval_loss": 0.016013609245419502, + "eval_runtime": 9.1743, + "eval_samples_per_second": 5.45, + "eval_steps_per_second": 1.417, + "step": 275 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 0.023732876405119896, + "learning_rate": 5.6766564987506566e-05, + "loss": 0.0136, + "step": 276 + }, + { + "epoch": 2.564814814814815, + "grad_norm": 0.024980880320072174, + "learning_rate": 5.644619011833133e-05, + "loss": 0.0131, + "step": 277 + }, + { + "epoch": 2.574074074074074, + "grad_norm": 0.023262949660420418, + "learning_rate": 5.6125545891822274e-05, + "loss": 0.0143, + "step": 278 + }, + { + "epoch": 2.5833333333333335, + "grad_norm": 0.024468230083584785, + "learning_rate": 5.5804645706261514e-05, + "loss": 0.0148, + "step": 279 + }, + { + "epoch": 2.5925925925925926, + "grad_norm": 0.020350055769085884, + "learning_rate": 5.548350297062659e-05, + "loss": 0.0125, + "step": 280 + }, + { + "epoch": 2.5925925925925926, + "eval_loss": 0.015153205953538418, + "eval_runtime": 9.1126, + "eval_samples_per_second": 5.487, + "eval_steps_per_second": 1.427, + "step": 280 + }, + { + "epoch": 2.601851851851852, + "grad_norm": 0.027165360748767853, + "learning_rate": 5.516213110403009e-05, + "loss": 0.0093, + "step": 281 + }, + { + "epoch": 2.611111111111111, + "grad_norm": 0.021070580929517746, + "learning_rate": 5.484054353515896e-05, + "loss": 0.0138, + "step": 282 + }, + { + "epoch": 2.6203703703703702, + "grad_norm": 0.025997430086135864, + "learning_rate": 5.451875370171341e-05, + "loss": 0.0121, + "step": 283 + }, + { + "epoch": 2.6296296296296298, + "grad_norm": 0.02517426759004593, + "learning_rate": 5.419677504984534e-05, + "loss": 0.0126, + "step": 284 + }, + { + "epoch": 2.638888888888889, + "grad_norm": 0.025812286883592606, + "learning_rate": 5.387462103359655e-05, + "loss": 0.0133, + "step": 285 + }, + { + "epoch": 2.638888888888889, + "eval_loss": 0.016152961179614067, + "eval_runtime": 9.1127, + "eval_samples_per_second": 5.487, + "eval_steps_per_second": 1.427, + "step": 285 + }, + { + "epoch": 2.648148148148148, + "grad_norm": 0.02393972873687744, + "learning_rate": 5.355230511433651e-05, + "loss": 0.0136, + "step": 286 + }, + { + "epoch": 2.6574074074074074, + "grad_norm": 0.021706297993659973, + "learning_rate": 5.32298407601999e-05, + "loss": 0.0133, + "step": 287 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.026299407705664635, + "learning_rate": 5.290724144552379e-05, + "loss": 0.0143, + "step": 288 + }, + { + "epoch": 2.675925925925926, + "grad_norm": 0.030511364340782166, + "learning_rate": 5.258452065028473e-05, + "loss": 0.0137, + "step": 289 + }, + { + "epoch": 2.685185185185185, + "grad_norm": 0.024854540824890137, + "learning_rate": 5.226169185953532e-05, + "loss": 0.0125, + "step": 290 + }, + { + "epoch": 2.685185185185185, + "eval_loss": 0.016076602041721344, + "eval_runtime": 9.1632, + "eval_samples_per_second": 5.457, + "eval_steps_per_second": 1.419, + "step": 290 + }, + { + "epoch": 2.6944444444444446, + "grad_norm": 0.022800520062446594, + "learning_rate": 5.193876856284085e-05, + "loss": 0.012, + "step": 291 + }, + { + "epoch": 2.7037037037037037, + "grad_norm": 0.021870015189051628, + "learning_rate": 5.1615764253715536e-05, + "loss": 0.0136, + "step": 292 + }, + { + "epoch": 2.712962962962963, + "grad_norm": 0.020156167447566986, + "learning_rate": 5.129269242905882e-05, + "loss": 0.012, + "step": 293 + }, + { + "epoch": 2.7222222222222223, + "grad_norm": 0.019064266234636307, + "learning_rate": 5.096956658859122e-05, + "loss": 0.0137, + "step": 294 + }, + { + "epoch": 2.7314814814814814, + "grad_norm": 0.027288921177387238, + "learning_rate": 5.064640023429043e-05, + "loss": 0.0147, + "step": 295 + }, + { + "epoch": 2.7314814814814814, + "eval_loss": 0.01584070920944214, + "eval_runtime": 9.1151, + "eval_samples_per_second": 5.485, + "eval_steps_per_second": 1.426, + "step": 295 + }, + { + "epoch": 2.7407407407407405, + "grad_norm": 0.02484748885035515, + "learning_rate": 5.0323206869826966e-05, + "loss": 0.0111, + "step": 296 + }, + { + "epoch": 2.75, + "grad_norm": 0.02521962858736515, + "learning_rate": 5e-05, + "loss": 0.0134, + "step": 297 + }, + { + "epoch": 2.7592592592592595, + "grad_norm": 0.023346634581685066, + "learning_rate": 4.967679313017303e-05, + "loss": 0.0124, + "step": 298 + }, + { + "epoch": 2.7685185185185186, + "grad_norm": 0.021654650568962097, + "learning_rate": 4.9353599765709584e-05, + "loss": 0.0144, + "step": 299 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.021227596327662468, + "learning_rate": 4.903043341140879e-05, + "loss": 0.0134, + "step": 300 + }, + { + "epoch": 2.7777777777777777, + "eval_loss": 0.016122175380587578, + "eval_runtime": 9.1019, + "eval_samples_per_second": 5.493, + "eval_steps_per_second": 1.428, + "step": 300 + }, + { + "epoch": 2.787037037037037, + "grad_norm": 0.024656914174556732, + "learning_rate": 4.870730757094121e-05, + "loss": 0.0123, + "step": 301 + }, + { + "epoch": 2.7962962962962963, + "grad_norm": 0.02583468146622181, + "learning_rate": 4.8384235746284476e-05, + "loss": 0.015, + "step": 302 + }, + { + "epoch": 2.8055555555555554, + "grad_norm": 0.022909915074706078, + "learning_rate": 4.806123143715916e-05, + "loss": 0.0142, + "step": 303 + }, + { + "epoch": 2.814814814814815, + "grad_norm": 0.02014041878283024, + "learning_rate": 4.7738308140464685e-05, + "loss": 0.0131, + "step": 304 + }, + { + "epoch": 2.824074074074074, + "grad_norm": 0.022683143615722656, + "learning_rate": 4.7415479349715275e-05, + "loss": 0.0124, + "step": 305 + }, + { + "epoch": 2.824074074074074, + "eval_loss": 0.015797268599271774, + "eval_runtime": 9.1281, + "eval_samples_per_second": 5.478, + "eval_steps_per_second": 1.424, + "step": 305 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 0.025906002148985863, + "learning_rate": 4.709275855447621e-05, + "loss": 0.0154, + "step": 306 + }, + { + "epoch": 2.8425925925925926, + "grad_norm": 0.027820315212011337, + "learning_rate": 4.677015923980011e-05, + "loss": 0.0138, + "step": 307 + }, + { + "epoch": 2.851851851851852, + "grad_norm": 0.023744860664010048, + "learning_rate": 4.6447694885663514e-05, + "loss": 0.0124, + "step": 308 + }, + { + "epoch": 2.861111111111111, + "grad_norm": 0.026518192142248154, + "learning_rate": 4.612537896640346e-05, + "loss": 0.0155, + "step": 309 + }, + { + "epoch": 2.8703703703703702, + "grad_norm": 0.020426657050848007, + "learning_rate": 4.5803224950154656e-05, + "loss": 0.0132, + "step": 310 + }, + { + "epoch": 2.8703703703703702, + "eval_loss": 0.015400240197777748, + "eval_runtime": 9.1185, + "eval_samples_per_second": 5.483, + "eval_steps_per_second": 1.426, + "step": 310 + }, + { + "epoch": 2.8796296296296298, + "grad_norm": 0.022766800597310066, + "learning_rate": 4.54812462982866e-05, + "loss": 0.0139, + "step": 311 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.021728193387389183, + "learning_rate": 4.515945646484105e-05, + "loss": 0.0133, + "step": 312 + }, + { + "epoch": 2.898148148148148, + "grad_norm": 0.0226016603410244, + "learning_rate": 4.4837868895969936e-05, + "loss": 0.0126, + "step": 313 + }, + { + "epoch": 2.9074074074074074, + "grad_norm": 0.027723975479602814, + "learning_rate": 4.451649702937342e-05, + "loss": 0.0106, + "step": 314 + }, + { + "epoch": 2.9166666666666665, + "grad_norm": 0.01856391504406929, + "learning_rate": 4.4195354293738484e-05, + "loss": 0.0146, + "step": 315 + }, + { + "epoch": 2.9166666666666665, + "eval_loss": 0.015166966244578362, + "eval_runtime": 9.1172, + "eval_samples_per_second": 5.484, + "eval_steps_per_second": 1.426, + "step": 315 + }, + { + "epoch": 2.925925925925926, + "grad_norm": 0.019857853651046753, + "learning_rate": 4.387445410817774e-05, + "loss": 0.0124, + "step": 316 + }, + { + "epoch": 2.935185185185185, + "grad_norm": 0.025410892441868782, + "learning_rate": 4.355380988166867e-05, + "loss": 0.0119, + "step": 317 + }, + { + "epoch": 2.9444444444444446, + "grad_norm": 0.02312655746936798, + "learning_rate": 4.323343501249346e-05, + "loss": 0.0144, + "step": 318 + }, + { + "epoch": 2.9537037037037037, + "grad_norm": 0.022076064720749855, + "learning_rate": 4.2913342887678985e-05, + "loss": 0.0117, + "step": 319 + }, + { + "epoch": 2.962962962962963, + "grad_norm": 0.023769903928041458, + "learning_rate": 4.259354688243757e-05, + "loss": 0.014, + "step": 320 + }, + { + "epoch": 2.962962962962963, + "eval_loss": 0.014957955107092857, + "eval_runtime": 9.1101, + "eval_samples_per_second": 5.488, + "eval_steps_per_second": 1.427, + "step": 320 + }, + { + "epoch": 2.9722222222222223, + "grad_norm": 0.023904340341687202, + "learning_rate": 4.227406035960798e-05, + "loss": 0.0121, + "step": 321 + }, + { + "epoch": 2.9814814814814814, + "grad_norm": 0.02383498102426529, + "learning_rate": 4.195489666909713e-05, + "loss": 0.0119, + "step": 322 + }, + { + "epoch": 2.9907407407407405, + "grad_norm": 0.03048449568450451, + "learning_rate": 4.1636069147322246e-05, + "loss": 0.0136, + "step": 323 + }, + { + "epoch": 3.0, + "grad_norm": 0.023879334330558777, + "learning_rate": 4.131759111665349e-05, + "loss": 0.0137, + "step": 324 + }, + { + "epoch": 3.009259259259259, + "grad_norm": 0.025208691135048866, + "learning_rate": 4.099947588485744e-05, + "loss": 0.0122, + "step": 325 + }, + { + "epoch": 3.009259259259259, + "eval_loss": 0.015089023858308792, + "eval_runtime": 9.116, + "eval_samples_per_second": 5.485, + "eval_steps_per_second": 1.426, + "step": 325 + }, + { + "epoch": 3.0185185185185186, + "grad_norm": 0.020718788728117943, + "learning_rate": 4.06817367445409e-05, + "loss": 0.0095, + "step": 326 + }, + { + "epoch": 3.0277777777777777, + "grad_norm": 0.024810951203107834, + "learning_rate": 4.036438697259551e-05, + "loss": 0.0134, + "step": 327 + }, + { + "epoch": 3.037037037037037, + "grad_norm": 0.019842958077788353, + "learning_rate": 4.004743982964298e-05, + "loss": 0.0122, + "step": 328 + }, + { + "epoch": 3.0462962962962963, + "grad_norm": 0.01818239875137806, + "learning_rate": 3.97309085594809e-05, + "loss": 0.0101, + "step": 329 + }, + { + "epoch": 3.0555555555555554, + "grad_norm": 0.022604303434491158, + "learning_rate": 3.941480638852948e-05, + "loss": 0.0118, + "step": 330 + }, + { + "epoch": 3.0555555555555554, + "eval_loss": 0.015503546223044395, + "eval_runtime": 9.1063, + "eval_samples_per_second": 5.491, + "eval_steps_per_second": 1.428, + "step": 330 + }, + { + "epoch": 3.064814814814815, + "grad_norm": 0.024690452963113785, + "learning_rate": 3.909914652527871e-05, + "loss": 0.0109, + "step": 331 + }, + { + "epoch": 3.074074074074074, + "grad_norm": 0.02343621291220188, + "learning_rate": 3.878394215973663e-05, + "loss": 0.0123, + "step": 332 + }, + { + "epoch": 3.0833333333333335, + "grad_norm": 0.026170087978243828, + "learning_rate": 3.846920646287799e-05, + "loss": 0.0122, + "step": 333 + }, + { + "epoch": 3.0925925925925926, + "grad_norm": 0.024799769744277, + "learning_rate": 3.815495258609404e-05, + "loss": 0.0125, + "step": 334 + }, + { + "epoch": 3.1018518518518516, + "grad_norm": 0.02072787657380104, + "learning_rate": 3.784119366064293e-05, + "loss": 0.0108, + "step": 335 + }, + { + "epoch": 3.1018518518518516, + "eval_loss": 0.0155374426394701, + "eval_runtime": 9.1152, + "eval_samples_per_second": 5.485, + "eval_steps_per_second": 1.426, + "step": 335 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 0.021989421918988228, + "learning_rate": 3.752794279710094e-05, + "loss": 0.0114, + "step": 336 + }, + { + "epoch": 3.1203703703703702, + "grad_norm": 0.03829918056726456, + "learning_rate": 3.721521308481482e-05, + "loss": 0.0101, + "step": 337 + }, + { + "epoch": 3.1296296296296298, + "grad_norm": 0.029835987836122513, + "learning_rate": 3.6903017591354706e-05, + "loss": 0.0107, + "step": 338 + }, + { + "epoch": 3.138888888888889, + "grad_norm": 0.02231847681105137, + "learning_rate": 3.6591369361968124e-05, + "loss": 0.012, + "step": 339 + }, + { + "epoch": 3.148148148148148, + "grad_norm": 0.02263280376791954, + "learning_rate": 3.628028141903493e-05, + "loss": 0.0103, + "step": 340 + }, + { + "epoch": 3.148148148148148, + "eval_loss": 0.01546421181410551, + "eval_runtime": 9.1199, + "eval_samples_per_second": 5.483, + "eval_steps_per_second": 1.425, + "step": 340 + }, + { + "epoch": 3.1574074074074074, + "grad_norm": 0.023618226870894432, + "learning_rate": 3.596976676152306e-05, + "loss": 0.0116, + "step": 341 + }, + { + "epoch": 3.1666666666666665, + "grad_norm": 0.02577986940741539, + "learning_rate": 3.5659838364445505e-05, + "loss": 0.0108, + "step": 342 + }, + { + "epoch": 3.175925925925926, + "grad_norm": 0.026071948930621147, + "learning_rate": 3.535050917831797e-05, + "loss": 0.0108, + "step": 343 + }, + { + "epoch": 3.185185185185185, + "grad_norm": 0.038238752633333206, + "learning_rate": 3.5041792128617927e-05, + "loss": 0.0094, + "step": 344 + }, + { + "epoch": 3.1944444444444446, + "grad_norm": 0.029051663354039192, + "learning_rate": 3.473370011524435e-05, + "loss": 0.0099, + "step": 345 + }, + { + "epoch": 3.1944444444444446, + "eval_loss": 0.015372861176729202, + "eval_runtime": 9.1378, + "eval_samples_per_second": 5.472, + "eval_steps_per_second": 1.423, + "step": 345 + }, + { + "epoch": 3.2037037037037037, + "grad_norm": 0.022384386509656906, + "learning_rate": 3.442624601197877e-05, + "loss": 0.0096, + "step": 346 + }, + { + "epoch": 3.212962962962963, + "grad_norm": 0.024341940879821777, + "learning_rate": 3.4119442665947344e-05, + "loss": 0.0094, + "step": 347 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.02119499258697033, + "learning_rate": 3.381330289708396e-05, + "loss": 0.011, + "step": 348 + }, + { + "epoch": 3.2314814814814814, + "grad_norm": 0.025269504636526108, + "learning_rate": 3.350783949759462e-05, + "loss": 0.0105, + "step": 349 + }, + { + "epoch": 3.240740740740741, + "grad_norm": 0.02428189478814602, + "learning_rate": 3.3203065231422904e-05, + "loss": 0.0115, + "step": 350 + }, + { + "epoch": 3.240740740740741, + "eval_loss": 0.015474287793040276, + "eval_runtime": 9.1142, + "eval_samples_per_second": 5.486, + "eval_steps_per_second": 1.426, + "step": 350 + }, + { + "epoch": 3.25, + "grad_norm": 0.027830710634589195, + "learning_rate": 3.289899283371657e-05, + "loss": 0.014, + "step": 351 + }, + { + "epoch": 3.259259259259259, + "grad_norm": 0.026644067838788033, + "learning_rate": 3.2595635010295475e-05, + "loss": 0.0132, + "step": 352 + }, + { + "epoch": 3.2685185185185186, + "grad_norm": 0.028307707980275154, + "learning_rate": 3.2293004437120624e-05, + "loss": 0.0093, + "step": 353 + }, + { + "epoch": 3.2777777777777777, + "grad_norm": 0.03480321913957596, + "learning_rate": 3.199111375976449e-05, + "loss": 0.0107, + "step": 354 + }, + { + "epoch": 3.287037037037037, + "grad_norm": 0.029546814039349556, + "learning_rate": 3.1689975592882603e-05, + "loss": 0.0099, + "step": 355 + }, + { + "epoch": 3.287037037037037, + "eval_loss": 0.015444349497556686, + "eval_runtime": 9.1458, + "eval_samples_per_second": 5.467, + "eval_steps_per_second": 1.421, + "step": 355 + }, + { + "epoch": 3.2962962962962963, + "grad_norm": 0.02437739446759224, + "learning_rate": 3.1389602519686515e-05, + "loss": 0.0118, + "step": 356 + }, + { + "epoch": 3.3055555555555554, + "grad_norm": 0.029530519619584084, + "learning_rate": 3.109000709141788e-05, + "loss": 0.0121, + "step": 357 + }, + { + "epoch": 3.314814814814815, + "grad_norm": 0.029449855908751488, + "learning_rate": 3.079120182682412e-05, + "loss": 0.0099, + "step": 358 + }, + { + "epoch": 3.324074074074074, + "grad_norm": 0.020589128136634827, + "learning_rate": 3.049319921163526e-05, + "loss": 0.0119, + "step": 359 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.02450876496732235, + "learning_rate": 3.019601169804216e-05, + "loss": 0.0129, + "step": 360 + }, + { + "epoch": 3.3333333333333335, + "eval_loss": 0.0157760102301836, + "eval_runtime": 9.1103, + "eval_samples_per_second": 5.488, + "eval_steps_per_second": 1.427, + "step": 360 + }, + { + "epoch": 3.3425925925925926, + "grad_norm": 0.0208604596555233, + "learning_rate": 2.9899651704176325e-05, + "loss": 0.011, + "step": 361 + }, + { + "epoch": 3.351851851851852, + "grad_norm": 0.025153055787086487, + "learning_rate": 2.9604131613590824e-05, + "loss": 0.0109, + "step": 362 + }, + { + "epoch": 3.361111111111111, + "grad_norm": 0.021455859765410423, + "learning_rate": 2.9309463774743046e-05, + "loss": 0.0122, + "step": 363 + }, + { + "epoch": 3.3703703703703702, + "grad_norm": 0.01964252069592476, + "learning_rate": 2.901566050047855e-05, + "loss": 0.0113, + "step": 364 + }, + { + "epoch": 3.3796296296296298, + "grad_norm": 0.020809266716241837, + "learning_rate": 2.872273406751664e-05, + "loss": 0.0105, + "step": 365 + }, + { + "epoch": 3.3796296296296298, + "eval_loss": 0.015391937457025051, + "eval_runtime": 9.111, + "eval_samples_per_second": 5.488, + "eval_steps_per_second": 1.427, + "step": 365 + }, + { + "epoch": 3.388888888888889, + "grad_norm": 0.025048566982150078, + "learning_rate": 2.8430696715937337e-05, + "loss": 0.0107, + "step": 366 + }, + { + "epoch": 3.398148148148148, + "grad_norm": 0.024674881249666214, + "learning_rate": 2.8139560648669962e-05, + "loss": 0.0113, + "step": 367 + }, + { + "epoch": 3.4074074074074074, + "grad_norm": 0.025468124076724052, + "learning_rate": 2.7849338030983257e-05, + "loss": 0.012, + "step": 368 + }, + { + "epoch": 3.4166666666666665, + "grad_norm": 0.022864418104290962, + "learning_rate": 2.7560040989976892e-05, + "loss": 0.01, + "step": 369 + }, + { + "epoch": 3.425925925925926, + "grad_norm": 0.02258789725601673, + "learning_rate": 2.7271681614074973e-05, + "loss": 0.0121, + "step": 370 + }, + { + "epoch": 3.425925925925926, + "eval_loss": 0.015503110364079475, + "eval_runtime": 9.1077, + "eval_samples_per_second": 5.49, + "eval_steps_per_second": 1.427, + "step": 370 + }, + { + "epoch": 3.435185185185185, + "grad_norm": 0.025097696110606194, + "learning_rate": 2.6984271952520722e-05, + "loss": 0.0104, + "step": 371 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.028177309781312943, + "learning_rate": 2.6697824014873075e-05, + "loss": 0.0132, + "step": 372 + }, + { + "epoch": 3.4537037037037037, + "grad_norm": 0.026587417349219322, + "learning_rate": 2.641234977050484e-05, + "loss": 0.0085, + "step": 373 + }, + { + "epoch": 3.462962962962963, + "grad_norm": 0.0189076978713274, + "learning_rate": 2.612786114810255e-05, + "loss": 0.0096, + "step": 374 + }, + { + "epoch": 3.4722222222222223, + "grad_norm": 0.029332995414733887, + "learning_rate": 2.5844370035168073e-05, + "loss": 0.0096, + "step": 375 + }, + { + "epoch": 3.4722222222222223, + "eval_loss": 0.015461472794413567, + "eval_runtime": 9.1144, + "eval_samples_per_second": 5.486, + "eval_steps_per_second": 1.426, + "step": 375 + }, + { + "epoch": 3.4814814814814814, + "grad_norm": 0.02185731939971447, + "learning_rate": 2.5561888277521794e-05, + "loss": 0.0098, + "step": 376 + }, + { + "epoch": 3.490740740740741, + "grad_norm": 0.026887575164437294, + "learning_rate": 2.528042767880766e-05, + "loss": 0.0114, + "step": 377 + }, + { + "epoch": 3.5, + "grad_norm": 0.023131586611270905, + "learning_rate": 2.500000000000001e-05, + "loss": 0.0112, + "step": 378 + }, + { + "epoch": 3.5092592592592595, + "grad_norm": 0.028937749564647675, + "learning_rate": 2.4720616958912053e-05, + "loss": 0.0121, + "step": 379 + }, + { + "epoch": 3.5185185185185186, + "grad_norm": 0.032668791711330414, + "learning_rate": 2.4442290229706344e-05, + "loss": 0.0112, + "step": 380 + }, + { + "epoch": 3.5185185185185186, + "eval_loss": 0.015212837606668472, + "eval_runtime": 9.1177, + "eval_samples_per_second": 5.484, + "eval_steps_per_second": 1.426, + "step": 380 + }, + { + "epoch": 3.5277777777777777, + "grad_norm": 0.02449023723602295, + "learning_rate": 2.4165031442406855e-05, + "loss": 0.0117, + "step": 381 + }, + { + "epoch": 3.537037037037037, + "grad_norm": 0.025157004594802856, + "learning_rate": 2.3888852182413085e-05, + "loss": 0.0091, + "step": 382 + }, + { + "epoch": 3.5462962962962963, + "grad_norm": 0.03108743578195572, + "learning_rate": 2.361376399001592e-05, + "loss": 0.0108, + "step": 383 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 0.021932488307356834, + "learning_rate": 2.333977835991545e-05, + "loss": 0.0093, + "step": 384 + }, + { + "epoch": 3.564814814814815, + "grad_norm": 0.026496881619095802, + "learning_rate": 2.3066906740740623e-05, + "loss": 0.0118, + "step": 385 + }, + { + "epoch": 3.564814814814815, + "eval_loss": 0.01467986311763525, + "eval_runtime": 9.1127, + "eval_samples_per_second": 5.487, + "eval_steps_per_second": 1.427, + "step": 385 + }, + { + "epoch": 3.574074074074074, + "grad_norm": 0.024211710318922997, + "learning_rate": 2.2795160534570864e-05, + "loss": 0.0086, + "step": 386 + }, + { + "epoch": 3.5833333333333335, + "grad_norm": 0.023977207019925117, + "learning_rate": 2.25245510964597e-05, + "loss": 0.0128, + "step": 387 + }, + { + "epoch": 3.5925925925925926, + "grad_norm": 0.02136526070535183, + "learning_rate": 2.225508973396016e-05, + "loss": 0.0121, + "step": 388 + }, + { + "epoch": 3.601851851851852, + "grad_norm": 0.026328187435865402, + "learning_rate": 2.198678770665238e-05, + "loss": 0.0108, + "step": 389 + }, + { + "epoch": 3.611111111111111, + "grad_norm": 0.02159940078854561, + "learning_rate": 2.171965622567308e-05, + "loss": 0.0082, + "step": 390 + }, + { + "epoch": 3.611111111111111, + "eval_loss": 0.014544774778187275, + "eval_runtime": 9.1133, + "eval_samples_per_second": 5.487, + "eval_steps_per_second": 1.426, + "step": 390 + }, + { + "epoch": 3.6203703703703702, + "grad_norm": 0.02303987927734852, + "learning_rate": 2.1453706453247087e-05, + "loss": 0.0092, + "step": 391 + }, + { + "epoch": 3.6296296296296298, + "grad_norm": 0.027734337374567986, + "learning_rate": 2.1188949502220983e-05, + "loss": 0.0101, + "step": 392 + }, + { + "epoch": 3.638888888888889, + "grad_norm": 0.02069096453487873, + "learning_rate": 2.0925396435598664e-05, + "loss": 0.0111, + "step": 393 + }, + { + "epoch": 3.648148148148148, + "grad_norm": 0.02777431532740593, + "learning_rate": 2.066305826607911e-05, + "loss": 0.0091, + "step": 394 + }, + { + "epoch": 3.6574074074074074, + "grad_norm": 0.02333620935678482, + "learning_rate": 2.0401945955596206e-05, + "loss": 0.0112, + "step": 395 + }, + { + "epoch": 3.6574074074074074, + "eval_loss": 0.01460795197635889, + "eval_runtime": 9.1059, + "eval_samples_per_second": 5.491, + "eval_steps_per_second": 1.428, + "step": 395 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.022142188623547554, + "learning_rate": 2.0142070414860704e-05, + "loss": 0.01, + "step": 396 + }, + { + "epoch": 3.675925925925926, + "grad_norm": 0.01749616675078869, + "learning_rate": 1.9883442502904283e-05, + "loss": 0.0095, + "step": 397 + }, + { + "epoch": 3.685185185185185, + "grad_norm": 0.02393367514014244, + "learning_rate": 1.9626073026625818e-05, + "loss": 0.0095, + "step": 398 + }, + { + "epoch": 3.6944444444444446, + "grad_norm": 0.023465050384402275, + "learning_rate": 1.936997274033986e-05, + "loss": 0.0108, + "step": 399 + }, + { + "epoch": 3.7037037037037037, + "grad_norm": 0.023157304152846336, + "learning_rate": 1.9115152345327152e-05, + "loss": 0.0086, + "step": 400 + }, + { + "epoch": 3.7037037037037037, + "eval_loss": 0.014902754686772823, + "eval_runtime": 9.1616, + "eval_samples_per_second": 5.458, + "eval_steps_per_second": 1.419, + "step": 400 + }, + { + "epoch": 3.712962962962963, + "grad_norm": 0.021799901500344276, + "learning_rate": 1.8861622489387555e-05, + "loss": 0.0128, + "step": 401 + }, + { + "epoch": 3.7222222222222223, + "grad_norm": 0.03070679120719433, + "learning_rate": 1.8609393766395085e-05, + "loss": 0.0123, + "step": 402 + }, + { + "epoch": 3.7314814814814814, + "grad_norm": 0.02543518878519535, + "learning_rate": 1.835847671585526e-05, + "loss": 0.0114, + "step": 403 + }, + { + "epoch": 3.7407407407407405, + "grad_norm": 0.027585655450820923, + "learning_rate": 1.8108881822464696e-05, + "loss": 0.0099, + "step": 404 + }, + { + "epoch": 3.75, + "grad_norm": 0.02352389506995678, + "learning_rate": 1.7860619515673033e-05, + "loss": 0.0102, + "step": 405 + }, + { + "epoch": 3.75, + "eval_loss": 0.014981208369135857, + "eval_runtime": 9.1106, + "eval_samples_per_second": 5.488, + "eval_steps_per_second": 1.427, + "step": 405 + }, + { + "epoch": 3.7592592592592595, + "grad_norm": 0.02560283988714218, + "learning_rate": 1.7613700169247056e-05, + "loss": 0.012, + "step": 406 + }, + { + "epoch": 3.7685185185185186, + "grad_norm": 0.026089752092957497, + "learning_rate": 1.7368134100837287e-05, + "loss": 0.0088, + "step": 407 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 0.030365899205207825, + "learning_rate": 1.7123931571546827e-05, + "loss": 0.0119, + "step": 408 + }, + { + "epoch": 3.787037037037037, + "grad_norm": 0.031558796763420105, + "learning_rate": 1.6881102785502616e-05, + "loss": 0.011, + "step": 409 + }, + { + "epoch": 3.7962962962962963, + "grad_norm": 0.030366325750947, + "learning_rate": 1.6639657889429018e-05, + "loss": 0.0116, + "step": 410 + }, + { + "epoch": 3.7962962962962963, + "eval_loss": 0.014859426766633987, + "eval_runtime": 9.1059, + "eval_samples_per_second": 5.491, + "eval_steps_per_second": 1.428, + "step": 410 + }, + { + "epoch": 3.8055555555555554, + "grad_norm": 0.025008074939250946, + "learning_rate": 1.639960697222388e-05, + "loss": 0.0106, + "step": 411 + }, + { + "epoch": 3.814814814814815, + "grad_norm": 0.028196556493639946, + "learning_rate": 1.6160960064536908e-05, + "loss": 0.0113, + "step": 412 + }, + { + "epoch": 3.824074074074074, + "grad_norm": 0.02165764756500721, + "learning_rate": 1.592372713835055e-05, + "loss": 0.0115, + "step": 413 + }, + { + "epoch": 3.8333333333333335, + "grad_norm": 0.020175475627183914, + "learning_rate": 1.5687918106563326e-05, + "loss": 0.0112, + "step": 414 + }, + { + "epoch": 3.8425925925925926, + "grad_norm": 0.027304671704769135, + "learning_rate": 1.545354282257562e-05, + "loss": 0.0126, + "step": 415 + }, + { + "epoch": 3.8425925925925926, + "eval_loss": 0.014735485427081585, + "eval_runtime": 9.198, + "eval_samples_per_second": 5.436, + "eval_steps_per_second": 1.413, + "step": 415 + }, + { + "epoch": 3.851851851851852, + "grad_norm": 0.026429716497659683, + "learning_rate": 1.52206110798779e-05, + "loss": 0.0103, + "step": 416 + }, + { + "epoch": 3.861111111111111, + "grad_norm": 0.02409077063202858, + "learning_rate": 1.4989132611641576e-05, + "loss": 0.012, + "step": 417 + }, + { + "epoch": 3.8703703703703702, + "grad_norm": 0.02310461364686489, + "learning_rate": 1.4759117090312197e-05, + "loss": 0.0096, + "step": 418 + }, + { + "epoch": 3.8796296296296298, + "grad_norm": 0.026219584047794342, + "learning_rate": 1.453057412720536e-05, + "loss": 0.0094, + "step": 419 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.027541201561689377, + "learning_rate": 1.4303513272105057e-05, + "loss": 0.0112, + "step": 420 + }, + { + "epoch": 3.888888888888889, + "eval_loss": 0.014594363048672676, + "eval_runtime": 9.1304, + "eval_samples_per_second": 5.476, + "eval_steps_per_second": 1.424, + "step": 420 + }, + { + "epoch": 3.898148148148148, + "grad_norm": 0.024942217394709587, + "learning_rate": 1.4077944012864636e-05, + "loss": 0.0093, + "step": 421 + }, + { + "epoch": 3.9074074074074074, + "grad_norm": 0.018137283623218536, + "learning_rate": 1.3853875775010355e-05, + "loss": 0.0102, + "step": 422 + }, + { + "epoch": 3.9166666666666665, + "grad_norm": 0.021817779168486595, + "learning_rate": 1.3631317921347563e-05, + "loss": 0.0084, + "step": 423 + }, + { + "epoch": 3.925925925925926, + "grad_norm": 0.023799235001206398, + "learning_rate": 1.3410279751569399e-05, + "loss": 0.0122, + "step": 424 + }, + { + "epoch": 3.935185185185185, + "grad_norm": 0.030764896422624588, + "learning_rate": 1.3190770501868243e-05, + "loss": 0.0107, + "step": 425 + }, + { + "epoch": 3.935185185185185, + "eval_loss": 0.014631365425884724, + "eval_runtime": 9.1149, + "eval_samples_per_second": 5.486, + "eval_steps_per_second": 1.426, + "step": 425 + }, + { + "epoch": 3.9444444444444446, + "grad_norm": 0.022886106744408607, + "learning_rate": 1.297279934454978e-05, + "loss": 0.0096, + "step": 426 + }, + { + "epoch": 3.9537037037037037, + "grad_norm": 0.03152737021446228, + "learning_rate": 1.2756375387649716e-05, + "loss": 0.0124, + "step": 427 + }, + { + "epoch": 3.962962962962963, + "grad_norm": 0.02872036211192608, + "learning_rate": 1.25415076745532e-05, + "loss": 0.0091, + "step": 428 + }, + { + "epoch": 3.9722222222222223, + "grad_norm": 0.021184636279940605, + "learning_rate": 1.2328205183616965e-05, + "loss": 0.0105, + "step": 429 + }, + { + "epoch": 3.9814814814814814, + "grad_norm": 0.02112959884107113, + "learning_rate": 1.2116476827794104e-05, + "loss": 0.0113, + "step": 430 + }, + { + "epoch": 3.9814814814814814, + "eval_loss": 0.01471536885946989, + "eval_runtime": 9.116, + "eval_samples_per_second": 5.485, + "eval_steps_per_second": 1.426, + "step": 430 + }, + { + "epoch": 3.9907407407407405, + "grad_norm": 0.019945990294218063, + "learning_rate": 1.1906331454261704e-05, + "loss": 0.0093, + "step": 431 + }, + { + "epoch": 4.0, + "grad_norm": 0.023910805583000183, + "learning_rate": 1.1697777844051105e-05, + "loss": 0.011, + "step": 432 + }, + { + "epoch": 4.0092592592592595, + "grad_norm": 0.01957758143544197, + "learning_rate": 1.1490824711681025e-05, + "loss": 0.0094, + "step": 433 + }, + { + "epoch": 4.018518518518518, + "grad_norm": 0.02563118375837803, + "learning_rate": 1.1285480704793377e-05, + "loss": 0.0093, + "step": 434 + }, + { + "epoch": 4.027777777777778, + "grad_norm": 0.026251764968037605, + "learning_rate": 1.1081754403791999e-05, + "loss": 0.0091, + "step": 435 + }, + { + "epoch": 4.027777777777778, + "eval_loss": 0.014734329655766487, + "eval_runtime": 9.1592, + "eval_samples_per_second": 5.459, + "eval_steps_per_second": 1.419, + "step": 435 + }, + { + "epoch": 4.037037037037037, + "grad_norm": 0.025834446772933006, + "learning_rate": 1.0879654321484012e-05, + "loss": 0.0067, + "step": 436 + }, + { + "epoch": 4.046296296296297, + "grad_norm": 0.0185233224183321, + "learning_rate": 1.0679188902724191e-05, + "loss": 0.0108, + "step": 437 + }, + { + "epoch": 4.055555555555555, + "grad_norm": 0.021918736398220062, + "learning_rate": 1.0480366524062042e-05, + "loss": 0.0088, + "step": 438 + }, + { + "epoch": 4.064814814814815, + "grad_norm": 0.03142661973834038, + "learning_rate": 1.0283195493391823e-05, + "loss": 0.0103, + "step": 439 + }, + { + "epoch": 4.074074074074074, + "grad_norm": 0.023410873487591743, + "learning_rate": 1.008768404960535e-05, + "loss": 0.0094, + "step": 440 + }, + { + "epoch": 4.074074074074074, + "eval_loss": 0.014965096488595009, + "eval_runtime": 9.1135, + "eval_samples_per_second": 5.486, + "eval_steps_per_second": 1.426, + "step": 440 + }, + { + "epoch": 4.083333333333333, + "grad_norm": 0.02943902276456356, + "learning_rate": 9.893840362247809e-06, + "loss": 0.0056, + "step": 441 + }, + { + "epoch": 4.092592592592593, + "grad_norm": 0.021431270986795425, + "learning_rate": 9.701672531176286e-06, + "loss": 0.0089, + "step": 442 + }, + { + "epoch": 4.101851851851852, + "grad_norm": 0.02797669917345047, + "learning_rate": 9.511188586221376e-06, + "loss": 0.0092, + "step": 443 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 0.02437691204249859, + "learning_rate": 9.322396486851626e-06, + "loss": 0.0104, + "step": 444 + }, + { + "epoch": 4.12037037037037, + "grad_norm": 0.024811841547489166, + "learning_rate": 9.135304121840976e-06, + "loss": 0.0096, + "step": 445 + }, + { + "epoch": 4.12037037037037, + "eval_loss": 0.014996801503002644, + "eval_runtime": 9.1094, + "eval_samples_per_second": 5.489, + "eval_steps_per_second": 1.427, + "step": 445 + }, + { + "epoch": 4.12962962962963, + "grad_norm": 0.0309213325381279, + "learning_rate": 8.949919308939082e-06, + "loss": 0.0109, + "step": 446 + }, + { + "epoch": 4.138888888888889, + "grad_norm": 0.023763932287693024, + "learning_rate": 8.766249794544662e-06, + "loss": 0.0073, + "step": 447 + }, + { + "epoch": 4.148148148148148, + "grad_norm": 0.023741643875837326, + "learning_rate": 8.584303253381847e-06, + "loss": 0.0105, + "step": 448 + }, + { + "epoch": 4.157407407407407, + "grad_norm": 0.02090543322265148, + "learning_rate": 8.404087288179424e-06, + "loss": 0.0096, + "step": 449 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.026315612718462944, + "learning_rate": 8.225609429353187e-06, + "loss": 0.0091, + "step": 450 + }, + { + "epoch": 4.166666666666667, + "eval_loss": 0.015186839736998081, + "eval_runtime": 9.1241, + "eval_samples_per_second": 5.48, + "eval_steps_per_second": 1.425, + "step": 450 + }, + { + "epoch": 4.175925925925926, + "grad_norm": 0.023099206387996674, + "learning_rate": 8.048877134691268e-06, + "loss": 0.0091, + "step": 451 + }, + { + "epoch": 4.185185185185185, + "grad_norm": 0.027901167050004005, + "learning_rate": 7.873897789042523e-06, + "loss": 0.0092, + "step": 452 + }, + { + "epoch": 4.194444444444445, + "grad_norm": 0.025486482307314873, + "learning_rate": 7.700678704007947e-06, + "loss": 0.0077, + "step": 453 + }, + { + "epoch": 4.203703703703703, + "grad_norm": 0.0233286302536726, + "learning_rate": 7.529227117635135e-06, + "loss": 0.0077, + "step": 454 + }, + { + "epoch": 4.212962962962963, + "grad_norm": 0.023314587771892548, + "learning_rate": 7.35955019411585e-06, + "loss": 0.0089, + "step": 455 + }, + { + "epoch": 4.212962962962963, + "eval_loss": 0.015497377142310143, + "eval_runtime": 9.1064, + "eval_samples_per_second": 5.491, + "eval_steps_per_second": 1.428, + "step": 455 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 0.021640775725245476, + "learning_rate": 7.191655023486682e-06, + "loss": 0.01, + "step": 456 + }, + { + "epoch": 4.231481481481482, + "grad_norm": 0.027831410989165306, + "learning_rate": 7.02554862133275e-06, + "loss": 0.0105, + "step": 457 + }, + { + "epoch": 4.2407407407407405, + "grad_norm": 0.023242153227329254, + "learning_rate": 6.861237928494579e-06, + "loss": 0.009, + "step": 458 + }, + { + "epoch": 4.25, + "grad_norm": 0.02775505743920803, + "learning_rate": 6.698729810778065e-06, + "loss": 0.0102, + "step": 459 + }, + { + "epoch": 4.2592592592592595, + "grad_norm": 0.0267843846231699, + "learning_rate": 6.53803105866761e-06, + "loss": 0.0063, + "step": 460 + }, + { + "epoch": 4.2592592592592595, + "eval_loss": 0.01563325710594654, + "eval_runtime": 9.111, + "eval_samples_per_second": 5.488, + "eval_steps_per_second": 1.427, + "step": 460 + }, + { + "epoch": 4.268518518518518, + "grad_norm": 0.02488654851913452, + "learning_rate": 6.379148387042316e-06, + "loss": 0.01, + "step": 461 + }, + { + "epoch": 4.277777777777778, + "grad_norm": 0.024208445101976395, + "learning_rate": 6.222088434895462e-06, + "loss": 0.0072, + "step": 462 + }, + { + "epoch": 4.287037037037037, + "grad_norm": 0.023147890344262123, + "learning_rate": 6.066857765057055e-06, + "loss": 0.0088, + "step": 463 + }, + { + "epoch": 4.296296296296296, + "grad_norm": 0.029451172798871994, + "learning_rate": 5.9134628639196e-06, + "loss": 0.0085, + "step": 464 + }, + { + "epoch": 4.305555555555555, + "grad_norm": 0.02764413133263588, + "learning_rate": 5.7619101411671095e-06, + "loss": 0.0099, + "step": 465 + }, + { + "epoch": 4.305555555555555, + "eval_loss": 0.015693385154008865, + "eval_runtime": 9.1176, + "eval_samples_per_second": 5.484, + "eval_steps_per_second": 1.426, + "step": 465 + }, + { + "epoch": 4.314814814814815, + "grad_norm": 0.021906448528170586, + "learning_rate": 5.6122059295072085e-06, + "loss": 0.0096, + "step": 466 + }, + { + "epoch": 4.324074074074074, + "grad_norm": 0.02385389618575573, + "learning_rate": 5.464356484406535e-06, + "loss": 0.0072, + "step": 467 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 0.026357507333159447, + "learning_rate": 5.318367983829392e-06, + "loss": 0.0079, + "step": 468 + }, + { + "epoch": 4.342592592592593, + "grad_norm": 0.026002187281847, + "learning_rate": 5.174246527979531e-06, + "loss": 0.0095, + "step": 469 + }, + { + "epoch": 4.351851851851852, + "grad_norm": 0.02679777517914772, + "learning_rate": 5.031998139045352e-06, + "loss": 0.0085, + "step": 470 + }, + { + "epoch": 4.351851851851852, + "eval_loss": 0.015615792945027351, + "eval_runtime": 9.1365, + "eval_samples_per_second": 5.473, + "eval_steps_per_second": 1.423, + "step": 470 + }, + { + "epoch": 4.361111111111111, + "grad_norm": 0.023431269451975822, + "learning_rate": 4.891628760948114e-06, + "loss": 0.009, + "step": 471 + }, + { + "epoch": 4.37037037037037, + "grad_norm": 0.02848837524652481, + "learning_rate": 4.7531442590937335e-06, + "loss": 0.0102, + "step": 472 + }, + { + "epoch": 4.37962962962963, + "grad_norm": 0.026586227118968964, + "learning_rate": 4.616550420127563e-06, + "loss": 0.0078, + "step": 473 + }, + { + "epoch": 4.388888888888889, + "grad_norm": 0.025660747662186623, + "learning_rate": 4.4818529516926726e-06, + "loss": 0.0086, + "step": 474 + }, + { + "epoch": 4.398148148148148, + "grad_norm": 0.02436869405210018, + "learning_rate": 4.349057482191299e-06, + "loss": 0.011, + "step": 475 + }, + { + "epoch": 4.398148148148148, + "eval_loss": 0.015554042533040047, + "eval_runtime": 9.1142, + "eval_samples_per_second": 5.486, + "eval_steps_per_second": 1.426, + "step": 475 + }, + { + "epoch": 4.407407407407407, + "grad_norm": 0.02513139322400093, + "learning_rate": 4.218169560549706e-06, + "loss": 0.0108, + "step": 476 + }, + { + "epoch": 4.416666666666667, + "grad_norm": 0.027343349531292915, + "learning_rate": 4.089194655986306e-06, + "loss": 0.0099, + "step": 477 + }, + { + "epoch": 4.425925925925926, + "grad_norm": 0.02374204248189926, + "learning_rate": 3.962138157783085e-06, + "loss": 0.0095, + "step": 478 + }, + { + "epoch": 4.435185185185185, + "grad_norm": 0.04114212468266487, + "learning_rate": 3.837005375060482e-06, + "loss": 0.0089, + "step": 479 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.024016965180635452, + "learning_rate": 3.7138015365554833e-06, + "loss": 0.0067, + "step": 480 + }, + { + "epoch": 4.444444444444445, + "eval_loss": 0.01539613213390112, + "eval_runtime": 9.1246, + "eval_samples_per_second": 5.48, + "eval_steps_per_second": 1.425, + "step": 480 + }, + { + "epoch": 4.453703703703704, + "grad_norm": 0.02901994250714779, + "learning_rate": 3.5925317904031587e-06, + "loss": 0.0087, + "step": 481 + }, + { + "epoch": 4.462962962962963, + "grad_norm": 0.020981522276997566, + "learning_rate": 3.4732012039215776e-06, + "loss": 0.011, + "step": 482 + }, + { + "epoch": 4.472222222222222, + "grad_norm": 0.023783011361956596, + "learning_rate": 3.3558147633999728e-06, + "loss": 0.0096, + "step": 483 + }, + { + "epoch": 4.481481481481482, + "grad_norm": 0.02081628330051899, + "learning_rate": 3.2403773738905187e-06, + "loss": 0.0087, + "step": 484 + }, + { + "epoch": 4.4907407407407405, + "grad_norm": 0.024986054748296738, + "learning_rate": 3.126893859003249e-06, + "loss": 0.0092, + "step": 485 + }, + { + "epoch": 4.4907407407407405, + "eval_loss": 0.015287145972251892, + "eval_runtime": 9.1097, + "eval_samples_per_second": 5.489, + "eval_steps_per_second": 1.427, + "step": 485 + }, + { + "epoch": 4.5, + "grad_norm": 0.032323963940143585, + "learning_rate": 3.0153689607045845e-06, + "loss": 0.0086, + "step": 486 + }, + { + "epoch": 4.5092592592592595, + "grad_norm": 0.02963520959019661, + "learning_rate": 2.9058073391191375e-06, + "loss": 0.0068, + "step": 487 + }, + { + "epoch": 4.518518518518518, + "grad_norm": 0.035344675183296204, + "learning_rate": 2.798213572335001e-06, + "loss": 0.0062, + "step": 488 + }, + { + "epoch": 4.527777777777778, + "grad_norm": 0.026800939813256264, + "learning_rate": 2.692592156212487e-06, + "loss": 0.0092, + "step": 489 + }, + { + "epoch": 4.537037037037037, + "grad_norm": 0.024116506800055504, + "learning_rate": 2.5889475041961765e-06, + "loss": 0.0072, + "step": 490 + }, + { + "epoch": 4.537037037037037, + "eval_loss": 0.015211592428386211, + "eval_runtime": 9.1184, + "eval_samples_per_second": 5.483, + "eval_steps_per_second": 1.426, + "step": 490 + }, + { + "epoch": 4.546296296296296, + "grad_norm": 0.027498748153448105, + "learning_rate": 2.4872839471306084e-06, + "loss": 0.0082, + "step": 491 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 0.026998436078429222, + "learning_rate": 2.3876057330792346e-06, + "loss": 0.008, + "step": 492 + }, + { + "epoch": 4.564814814814815, + "grad_norm": 0.023703446611762047, + "learning_rate": 2.2899170271469428e-06, + "loss": 0.011, + "step": 493 + }, + { + "epoch": 4.574074074074074, + "grad_norm": 0.019968930631875992, + "learning_rate": 2.1942219113060212e-06, + "loss": 0.0075, + "step": 494 + }, + { + "epoch": 4.583333333333333, + "grad_norm": 0.02214980125427246, + "learning_rate": 2.100524384225555e-06, + "loss": 0.0078, + "step": 495 + }, + { + "epoch": 4.583333333333333, + "eval_loss": 0.015181516297161579, + "eval_runtime": 9.1214, + "eval_samples_per_second": 5.482, + "eval_steps_per_second": 1.425, + "step": 495 + }, + { + "epoch": 4.592592592592593, + "grad_norm": 0.025330157950520515, + "learning_rate": 2.0088283611044036e-06, + "loss": 0.0062, + "step": 496 + }, + { + "epoch": 4.601851851851852, + "grad_norm": 0.019013626500964165, + "learning_rate": 1.9191376735075427e-06, + "loss": 0.0088, + "step": 497 + }, + { + "epoch": 4.611111111111111, + "grad_norm": 0.022145694121718407, + "learning_rate": 1.8314560692059835e-06, + "loss": 0.0089, + "step": 498 + }, + { + "epoch": 4.62037037037037, + "grad_norm": 0.023724934086203575, + "learning_rate": 1.7457872120201779e-06, + "loss": 0.0086, + "step": 499 + }, + { + "epoch": 4.62962962962963, + "grad_norm": 0.020578699186444283, + "learning_rate": 1.6621346816668992e-06, + "loss": 0.0091, + "step": 500 + }, + { + "epoch": 4.62962962962963, + "eval_loss": 0.015207822434604168, + "eval_runtime": 9.1136, + "eval_samples_per_second": 5.486, + "eval_steps_per_second": 1.426, + "step": 500 + }, + { + "epoch": 4.638888888888889, + "grad_norm": 0.024306217208504677, + "learning_rate": 1.5805019736097104e-06, + "loss": 0.009, + "step": 501 + }, + { + "epoch": 4.648148148148148, + "grad_norm": 0.020744021981954575, + "learning_rate": 1.5008924989128258e-06, + "loss": 0.0089, + "step": 502 + }, + { + "epoch": 4.657407407407407, + "grad_norm": 0.02516799047589302, + "learning_rate": 1.4233095840986753e-06, + "loss": 0.0093, + "step": 503 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.024567998945713043, + "learning_rate": 1.3477564710088098e-06, + "loss": 0.0094, + "step": 504 + }, + { + "epoch": 4.675925925925926, + "grad_norm": 0.024358859285712242, + "learning_rate": 1.2742363166685034e-06, + "loss": 0.007, + "step": 505 + }, + { + "epoch": 4.675925925925926, + "eval_loss": 0.015200878493487835, + "eval_runtime": 9.1155, + "eval_samples_per_second": 5.485, + "eval_steps_per_second": 1.426, + "step": 505 + }, + { + "epoch": 4.685185185185185, + "grad_norm": 0.023163504898548126, + "learning_rate": 1.2027521931548214e-06, + "loss": 0.0074, + "step": 506 + }, + { + "epoch": 4.694444444444445, + "grad_norm": 0.023604586720466614, + "learning_rate": 1.1333070874682216e-06, + "loss": 0.0093, + "step": 507 + }, + { + "epoch": 4.703703703703704, + "grad_norm": 0.02068418823182583, + "learning_rate": 1.0659039014077944e-06, + "loss": 0.0084, + "step": 508 + }, + { + "epoch": 4.712962962962963, + "grad_norm": 0.02598651312291622, + "learning_rate": 1.0005454514499414e-06, + "loss": 0.0088, + "step": 509 + }, + { + "epoch": 4.722222222222222, + "grad_norm": 0.02512424811720848, + "learning_rate": 9.372344686307655e-07, + "loss": 0.0064, + "step": 510 + }, + { + "epoch": 4.722222222222222, + "eval_loss": 0.01521637849509716, + "eval_runtime": 9.1143, + "eval_samples_per_second": 5.486, + "eval_steps_per_second": 1.426, + "step": 510 + }, + { + "epoch": 4.731481481481482, + "grad_norm": 0.021041063591837883, + "learning_rate": 8.759735984318895e-07, + "loss": 0.0096, + "step": 511 + }, + { + "epoch": 4.7407407407407405, + "grad_norm": 0.025718161836266518, + "learning_rate": 8.167654006699443e-07, + "loss": 0.0077, + "step": 512 + }, + { + "epoch": 4.75, + "grad_norm": 0.02913082391023636, + "learning_rate": 7.596123493895991e-07, + "loss": 0.0072, + "step": 513 + }, + { + "epoch": 4.7592592592592595, + "grad_norm": 0.026588505133986473, + "learning_rate": 7.04516832760177e-07, + "loss": 0.0094, + "step": 514 + }, + { + "epoch": 4.768518518518518, + "grad_norm": 0.023728126659989357, + "learning_rate": 6.514811529758747e-07, + "loss": 0.0099, + "step": 515 + }, + { + "epoch": 4.768518518518518, + "eval_loss": 0.01521516963839531, + "eval_runtime": 9.1511, + "eval_samples_per_second": 5.464, + "eval_steps_per_second": 1.421, + "step": 515 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 0.03438512608408928, + "learning_rate": 6.005075261595494e-07, + "loss": 0.0086, + "step": 516 + }, + { + "epoch": 4.787037037037037, + "grad_norm": 0.019554298371076584, + "learning_rate": 5.515980822701439e-07, + "loss": 0.0092, + "step": 517 + }, + { + "epoch": 4.796296296296296, + "grad_norm": 0.0235204566270113, + "learning_rate": 5.047548650136513e-07, + "loss": 0.009, + "step": 518 + }, + { + "epoch": 4.805555555555555, + "grad_norm": 0.023747643455863, + "learning_rate": 4.5997983175773417e-07, + "loss": 0.0092, + "step": 519 + }, + { + "epoch": 4.814814814814815, + "grad_norm": 0.02751827985048294, + "learning_rate": 4.1727485344994486e-07, + "loss": 0.0088, + "step": 520 + }, + { + "epoch": 4.814814814814815, + "eval_loss": 0.015235532075166702, + "eval_runtime": 9.1256, + "eval_samples_per_second": 5.479, + "eval_steps_per_second": 1.425, + "step": 520 + }, + { + "epoch": 4.824074074074074, + "grad_norm": 0.026621591299772263, + "learning_rate": 3.766417145395218e-07, + "loss": 0.0086, + "step": 521 + }, + { + "epoch": 4.833333333333333, + "grad_norm": 0.01991841197013855, + "learning_rate": 3.380821129028489e-07, + "loss": 0.0084, + "step": 522 + }, + { + "epoch": 4.842592592592593, + "grad_norm": 0.023508219048380852, + "learning_rate": 3.0159765977250673e-07, + "loss": 0.0103, + "step": 523 + }, + { + "epoch": 4.851851851851852, + "grad_norm": 0.02976732887327671, + "learning_rate": 2.671898796699268e-07, + "loss": 0.0084, + "step": 524 + }, + { + "epoch": 4.861111111111111, + "grad_norm": 0.02255621738731861, + "learning_rate": 2.3486021034170857e-07, + "loss": 0.0089, + "step": 525 + }, + { + "epoch": 4.861111111111111, + "eval_loss": 0.015216498635709286, + "eval_runtime": 9.1106, + "eval_samples_per_second": 5.488, + "eval_steps_per_second": 1.427, + "step": 525 + }, + { + "epoch": 4.87037037037037, + "grad_norm": 0.025215914472937584, + "learning_rate": 2.0461000269953456e-07, + "loss": 0.0075, + "step": 526 + }, + { + "epoch": 4.87962962962963, + "grad_norm": 0.02554066851735115, + "learning_rate": 1.7644052076371542e-07, + "loss": 0.0083, + "step": 527 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.02162836864590645, + "learning_rate": 1.503529416103988e-07, + "loss": 0.009, + "step": 528 + }, + { + "epoch": 4.898148148148148, + "grad_norm": 0.02335723116993904, + "learning_rate": 1.2634835532233657e-07, + "loss": 0.0093, + "step": 529 + }, + { + "epoch": 4.907407407407407, + "grad_norm": 0.02844967506825924, + "learning_rate": 1.044277649433989e-07, + "loss": 0.0083, + "step": 530 + }, + { + "epoch": 4.907407407407407, + "eval_loss": 0.015229844488203526, + "eval_runtime": 9.1406, + "eval_samples_per_second": 5.47, + "eval_steps_per_second": 1.422, + "step": 530 + }, + { + "epoch": 4.916666666666667, + "grad_norm": 0.02188325859606266, + "learning_rate": 8.459208643659122e-08, + "loss": 0.0084, + "step": 531 + }, + { + "epoch": 4.925925925925926, + "grad_norm": 0.026782654225826263, + "learning_rate": 6.684214864584038e-08, + "loss": 0.009, + "step": 532 + }, + { + "epoch": 4.935185185185185, + "grad_norm": 0.024010982364416122, + "learning_rate": 5.11786932613223e-08, + "loss": 0.0055, + "step": 533 + }, + { + "epoch": 4.944444444444445, + "grad_norm": 0.02621973119676113, + "learning_rate": 3.760237478849793e-08, + "loss": 0.0093, + "step": 534 + }, + { + "epoch": 4.953703703703704, + "grad_norm": 0.02257387712597847, + "learning_rate": 2.6113760520735108e-08, + "loss": 0.0103, + "step": 535 + }, + { + "epoch": 4.953703703703704, + "eval_loss": 0.015256751328706741, + "eval_runtime": 9.1156, + "eval_samples_per_second": 5.485, + "eval_steps_per_second": 1.426, + "step": 535 + }, + { + "epoch": 4.962962962962963, + "grad_norm": 0.02289225161075592, + "learning_rate": 1.6713330515627513e-08, + "loss": 0.0106, + "step": 536 + }, + { + "epoch": 4.972222222222222, + "grad_norm": 0.032289694994688034, + "learning_rate": 9.401477574932926e-09, + "loss": 0.0074, + "step": 537 + }, + { + "epoch": 4.981481481481482, + "grad_norm": 0.0215620007365942, + "learning_rate": 4.178507228136397e-09, + "loss": 0.0082, + "step": 538 + }, + { + "epoch": 4.9907407407407405, + "grad_norm": 0.02391226962208748, + "learning_rate": 1.0446377197104173e-09, + "loss": 0.0085, + "step": 539 + }, + { + "epoch": 5.0, + "grad_norm": 0.0241775494068861, + "learning_rate": 0.0, + "loss": 0.0092, + "step": 540 + }, + { + "epoch": 5.0, + "eval_loss": 0.01526525616645813, + "eval_runtime": 9.1149, + "eval_samples_per_second": 5.486, + "eval_steps_per_second": 1.426, + "step": 540 + }, + { + "epoch": 5.0, + "step": 540, + "total_flos": 1.2254685925518213e+18, + "train_loss": 0.016027936152251506, + "train_runtime": 9839.9649, + "train_samples_per_second": 1.756, + "train_steps_per_second": 0.055 + } + ], + "logging_steps": 1, + "max_steps": 540, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.2254685925518213e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}