diff --git "a/checkpoints/Qwen2.5-7B/babylm_shuffle_control_10M_seed0/runs/checkpoint-1290/trainer_state.json" "b/checkpoints/Qwen2.5-7B/babylm_shuffle_control_10M_seed0/runs/checkpoint-1290/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoints/Qwen2.5-7B/babylm_shuffle_control_10M_seed0/runs/checkpoint-1290/trainer_state.json" @@ -0,0 +1,10095 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 10, + "global_step": 1290, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0015503875968992248, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6553, + "step": 1 + }, + { + "epoch": 0.0031007751937984496, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6785, + "step": 2 + }, + { + "epoch": 0.004651162790697674, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6452, + "step": 3 + }, + { + "epoch": 0.006201550387596899, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6734, + "step": 4 + }, + { + "epoch": 0.007751937984496124, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6731, + "step": 5 + }, + { + "epoch": 0.009302325581395349, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.671, + "step": 6 + }, + { + "epoch": 0.010852713178294573, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.7004, + "step": 7 + }, + { + "epoch": 0.012403100775193798, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6789, + "step": 8 + }, + { + "epoch": 0.013953488372093023, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6501, + "step": 9 + }, + { + "epoch": 0.015503875968992248, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6589, + "step": 10 + }, + { + "epoch": 0.015503875968992248, + "eval_loss": 1.6840144395828247, + "eval_runtime": 44.2414, + "eval_samples_per_second": 22.603, + "eval_steps_per_second": 1.424, + "step": 10 + }, + { + "epoch": 0.017054263565891473, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6732, + "step": 11 + }, + { + "epoch": 0.018604651162790697, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.7091, + "step": 12 + }, + { + "epoch": 0.020155038759689922, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6712, + "step": 13 + }, + { + "epoch": 0.021705426356589147, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6595, + "step": 14 + }, + { + "epoch": 0.023255813953488372, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6723, + "step": 15 + }, + { + "epoch": 0.024806201550387597, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6729, + "step": 16 + }, + { + "epoch": 0.02635658914728682, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6801, + "step": 17 + }, + { + "epoch": 0.027906976744186046, + "grad_norm": 4.608963489532471, + "learning_rate": 3.875968992248062e-08, + "loss": 1.6527, + "step": 18 + }, + { + "epoch": 0.02945736434108527, + "grad_norm": 4.420685291290283, + "learning_rate": 7.751937984496124e-08, + "loss": 1.6801, + "step": 19 + }, + { + "epoch": 0.031007751937984496, + "grad_norm": 3.385024309158325, + "learning_rate": 1.1627906976744187e-07, + "loss": 1.6978, + "step": 20 + }, + { + "epoch": 0.031007751937984496, + "eval_loss": 1.6839702129364014, + "eval_runtime": 44.132, + "eval_samples_per_second": 22.659, + "eval_steps_per_second": 1.428, + "step": 20 + }, + { + "epoch": 0.03255813953488372, + "grad_norm": 4.05656099319458, + "learning_rate": 1.5503875968992249e-07, + "loss": 1.6369, + "step": 21 + }, + { + "epoch": 0.034108527131782945, + "grad_norm": 4.343766689300537, + "learning_rate": 1.9379844961240311e-07, + "loss": 1.6709, + "step": 22 + }, + { + "epoch": 0.03565891472868217, + "grad_norm": 4.232193946838379, + "learning_rate": 2.3255813953488374e-07, + "loss": 1.6855, + "step": 23 + }, + { + "epoch": 0.037209302325581395, + "grad_norm": 3.5586187839508057, + "learning_rate": 2.7131782945736437e-07, + "loss": 1.6812, + "step": 24 + }, + { + "epoch": 0.03875968992248062, + "grad_norm": 3.9483323097229004, + "learning_rate": 3.1007751937984497e-07, + "loss": 1.6795, + "step": 25 + }, + { + "epoch": 0.040310077519379844, + "grad_norm": 2.9243829250335693, + "learning_rate": 3.488372093023256e-07, + "loss": 1.6395, + "step": 26 + }, + { + "epoch": 0.04186046511627907, + "grad_norm": 3.643986701965332, + "learning_rate": 3.8759689922480623e-07, + "loss": 1.6803, + "step": 27 + }, + { + "epoch": 0.043410852713178294, + "grad_norm": 3.2554931640625, + "learning_rate": 4.2635658914728683e-07, + "loss": 1.6848, + "step": 28 + }, + { + "epoch": 0.04496124031007752, + "grad_norm": 3.6439590454101562, + "learning_rate": 4.651162790697675e-07, + "loss": 1.6376, + "step": 29 + }, + { + "epoch": 0.046511627906976744, + "grad_norm": 4.260690689086914, + "learning_rate": 5.038759689922481e-07, + "loss": 1.6478, + "step": 30 + }, + { + "epoch": 0.046511627906976744, + "eval_loss": 1.6623069047927856, + "eval_runtime": 45.3876, + "eval_samples_per_second": 22.032, + "eval_steps_per_second": 1.388, + "step": 30 + }, + { + "epoch": 0.04806201550387597, + "grad_norm": 3.7455801963806152, + "learning_rate": 5.426356589147287e-07, + "loss": 1.6443, + "step": 31 + }, + { + "epoch": 0.04961240310077519, + "grad_norm": 5.081115245819092, + "learning_rate": 5.813953488372094e-07, + "loss": 1.6436, + "step": 32 + }, + { + "epoch": 0.05116279069767442, + "grad_norm": 1.788765788078308, + "learning_rate": 6.201550387596899e-07, + "loss": 1.645, + "step": 33 + }, + { + "epoch": 0.05271317829457364, + "grad_norm": 1.6885591745376587, + "learning_rate": 6.589147286821707e-07, + "loss": 1.6139, + "step": 34 + }, + { + "epoch": 0.05426356589147287, + "grad_norm": 2.4668631553649902, + "learning_rate": 6.976744186046513e-07, + "loss": 1.6299, + "step": 35 + }, + { + "epoch": 0.05581395348837209, + "grad_norm": 2.387413740158081, + "learning_rate": 7.364341085271319e-07, + "loss": 1.64, + "step": 36 + }, + { + "epoch": 0.05736434108527132, + "grad_norm": 2.5546491146087646, + "learning_rate": 7.751937984496125e-07, + "loss": 1.6272, + "step": 37 + }, + { + "epoch": 0.05891472868217054, + "grad_norm": 1.8689916133880615, + "learning_rate": 8.139534883720931e-07, + "loss": 1.5905, + "step": 38 + }, + { + "epoch": 0.06046511627906977, + "grad_norm": 2.584873914718628, + "learning_rate": 8.527131782945737e-07, + "loss": 1.607, + "step": 39 + }, + { + "epoch": 0.06201550387596899, + "grad_norm": 2.200226306915283, + "learning_rate": 8.914728682170544e-07, + "loss": 1.6105, + "step": 40 + }, + { + "epoch": 0.06201550387596899, + "eval_loss": 1.616817593574524, + "eval_runtime": 46.2754, + "eval_samples_per_second": 21.61, + "eval_steps_per_second": 1.361, + "step": 40 + }, + { + "epoch": 0.06356589147286822, + "grad_norm": 1.4535961151123047, + "learning_rate": 9.30232558139535e-07, + "loss": 1.6013, + "step": 41 + }, + { + "epoch": 0.06511627906976744, + "grad_norm": 3.4520232677459717, + "learning_rate": 9.689922480620157e-07, + "loss": 1.5835, + "step": 42 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 2.159172534942627, + "learning_rate": 1.0077519379844962e-06, + "loss": 1.5887, + "step": 43 + }, + { + "epoch": 0.06821705426356589, + "grad_norm": 1.8451075553894043, + "learning_rate": 1.0465116279069768e-06, + "loss": 1.5939, + "step": 44 + }, + { + "epoch": 0.06976744186046512, + "grad_norm": 1.8367409706115723, + "learning_rate": 1.0852713178294575e-06, + "loss": 1.5711, + "step": 45 + }, + { + "epoch": 0.07131782945736434, + "grad_norm": 2.565183401107788, + "learning_rate": 1.1240310077519381e-06, + "loss": 1.585, + "step": 46 + }, + { + "epoch": 0.07286821705426356, + "grad_norm": 3.1191582679748535, + "learning_rate": 1.1627906976744188e-06, + "loss": 1.6068, + "step": 47 + }, + { + "epoch": 0.07441860465116279, + "grad_norm": 2.8720552921295166, + "learning_rate": 1.2015503875968994e-06, + "loss": 1.5557, + "step": 48 + }, + { + "epoch": 0.07596899224806201, + "grad_norm": 2.1380953788757324, + "learning_rate": 1.2403100775193799e-06, + "loss": 1.566, + "step": 49 + }, + { + "epoch": 0.07751937984496124, + "grad_norm": 2.3946728706359863, + "learning_rate": 1.2790697674418605e-06, + "loss": 1.5299, + "step": 50 + }, + { + "epoch": 0.07751937984496124, + "eval_loss": 1.5494492053985596, + "eval_runtime": 46.4926, + "eval_samples_per_second": 21.509, + "eval_steps_per_second": 1.355, + "step": 50 + }, + { + "epoch": 0.07906976744186046, + "grad_norm": 2.220069646835327, + "learning_rate": 1.3178294573643414e-06, + "loss": 1.5401, + "step": 51 + }, + { + "epoch": 0.08062015503875969, + "grad_norm": 1.8800451755523682, + "learning_rate": 1.3565891472868216e-06, + "loss": 1.5078, + "step": 52 + }, + { + "epoch": 0.08217054263565891, + "grad_norm": 1.897739052772522, + "learning_rate": 1.3953488372093025e-06, + "loss": 1.4914, + "step": 53 + }, + { + "epoch": 0.08372093023255814, + "grad_norm": 2.7918992042541504, + "learning_rate": 1.4341085271317832e-06, + "loss": 1.4992, + "step": 54 + }, + { + "epoch": 0.08527131782945736, + "grad_norm": 2.1271169185638428, + "learning_rate": 1.4728682170542638e-06, + "loss": 1.4418, + "step": 55 + }, + { + "epoch": 0.08682170542635659, + "grad_norm": 2.3548240661621094, + "learning_rate": 1.5116279069767443e-06, + "loss": 1.4575, + "step": 56 + }, + { + "epoch": 0.08837209302325581, + "grad_norm": 2.4243152141571045, + "learning_rate": 1.550387596899225e-06, + "loss": 1.4292, + "step": 57 + }, + { + "epoch": 0.08992248062015504, + "grad_norm": 1.9162991046905518, + "learning_rate": 1.5891472868217056e-06, + "loss": 1.4401, + "step": 58 + }, + { + "epoch": 0.09147286821705426, + "grad_norm": 2.9491307735443115, + "learning_rate": 1.6279069767441862e-06, + "loss": 1.4409, + "step": 59 + }, + { + "epoch": 0.09302325581395349, + "grad_norm": 2.5566985607147217, + "learning_rate": 1.6666666666666667e-06, + "loss": 1.3898, + "step": 60 + }, + { + "epoch": 0.09302325581395349, + "eval_loss": 1.4331709146499634, + "eval_runtime": 46.6366, + "eval_samples_per_second": 21.442, + "eval_steps_per_second": 1.351, + "step": 60 + }, + { + "epoch": 0.09457364341085271, + "grad_norm": 1.8447020053863525, + "learning_rate": 1.7054263565891473e-06, + "loss": 1.4007, + "step": 61 + }, + { + "epoch": 0.09612403100775194, + "grad_norm": 3.000507354736328, + "learning_rate": 1.7441860465116282e-06, + "loss": 1.3862, + "step": 62 + }, + { + "epoch": 0.09767441860465116, + "grad_norm": 2.1395022869110107, + "learning_rate": 1.7829457364341088e-06, + "loss": 1.4188, + "step": 63 + }, + { + "epoch": 0.09922480620155039, + "grad_norm": 2.0232176780700684, + "learning_rate": 1.8217054263565893e-06, + "loss": 1.369, + "step": 64 + }, + { + "epoch": 0.10077519379844961, + "grad_norm": 3.6658763885498047, + "learning_rate": 1.86046511627907e-06, + "loss": 1.3842, + "step": 65 + }, + { + "epoch": 0.10232558139534884, + "grad_norm": 1.7414005994796753, + "learning_rate": 1.8992248062015506e-06, + "loss": 1.3289, + "step": 66 + }, + { + "epoch": 0.10387596899224806, + "grad_norm": 1.593327522277832, + "learning_rate": 1.9379844961240315e-06, + "loss": 1.3289, + "step": 67 + }, + { + "epoch": 0.10542635658914729, + "grad_norm": 1.774498462677002, + "learning_rate": 1.976744186046512e-06, + "loss": 1.3283, + "step": 68 + }, + { + "epoch": 0.10697674418604651, + "grad_norm": 1.7857030630111694, + "learning_rate": 2.0155038759689923e-06, + "loss": 1.3127, + "step": 69 + }, + { + "epoch": 0.10852713178294573, + "grad_norm": 1.4791711568832397, + "learning_rate": 2.054263565891473e-06, + "loss": 1.3534, + "step": 70 + }, + { + "epoch": 0.10852713178294573, + "eval_loss": 1.3698604106903076, + "eval_runtime": 46.6094, + "eval_samples_per_second": 21.455, + "eval_steps_per_second": 1.352, + "step": 70 + }, + { + "epoch": 0.11007751937984496, + "grad_norm": 1.0254175662994385, + "learning_rate": 2.0930232558139536e-06, + "loss": 1.3053, + "step": 71 + }, + { + "epoch": 0.11162790697674418, + "grad_norm": 1.492088794708252, + "learning_rate": 2.131782945736434e-06, + "loss": 1.3374, + "step": 72 + }, + { + "epoch": 0.11317829457364341, + "grad_norm": 1.8035080432891846, + "learning_rate": 2.170542635658915e-06, + "loss": 1.3438, + "step": 73 + }, + { + "epoch": 0.11472868217054263, + "grad_norm": 2.005145311355591, + "learning_rate": 2.2093023255813954e-06, + "loss": 1.277, + "step": 74 + }, + { + "epoch": 0.11627906976744186, + "grad_norm": 2.1278674602508545, + "learning_rate": 2.2480620155038763e-06, + "loss": 1.3063, + "step": 75 + }, + { + "epoch": 0.11782945736434108, + "grad_norm": 2.4920172691345215, + "learning_rate": 2.2868217054263567e-06, + "loss": 1.2928, + "step": 76 + }, + { + "epoch": 0.11937984496124031, + "grad_norm": 1.801375389099121, + "learning_rate": 2.3255813953488376e-06, + "loss": 1.2927, + "step": 77 + }, + { + "epoch": 0.12093023255813953, + "grad_norm": 1.4796046018600464, + "learning_rate": 2.364341085271318e-06, + "loss": 1.2427, + "step": 78 + }, + { + "epoch": 0.12248062015503876, + "grad_norm": 1.9576448202133179, + "learning_rate": 2.403100775193799e-06, + "loss": 1.244, + "step": 79 + }, + { + "epoch": 0.12403100775193798, + "grad_norm": 2.2607786655426025, + "learning_rate": 2.4418604651162793e-06, + "loss": 1.2798, + "step": 80 + }, + { + "epoch": 0.12403100775193798, + "eval_loss": 1.3285717964172363, + "eval_runtime": 46.5854, + "eval_samples_per_second": 21.466, + "eval_steps_per_second": 1.352, + "step": 80 + }, + { + "epoch": 0.12558139534883722, + "grad_norm": 1.7419644594192505, + "learning_rate": 2.4806201550387598e-06, + "loss": 1.2767, + "step": 81 + }, + { + "epoch": 0.12713178294573643, + "grad_norm": 1.6084612607955933, + "learning_rate": 2.5193798449612406e-06, + "loss": 1.2692, + "step": 82 + }, + { + "epoch": 0.12868217054263567, + "grad_norm": 1.8201138973236084, + "learning_rate": 2.558139534883721e-06, + "loss": 1.259, + "step": 83 + }, + { + "epoch": 0.13023255813953488, + "grad_norm": 1.7329964637756348, + "learning_rate": 2.596899224806202e-06, + "loss": 1.2547, + "step": 84 + }, + { + "epoch": 0.13178294573643412, + "grad_norm": 1.6506623029708862, + "learning_rate": 2.635658914728683e-06, + "loss": 1.2642, + "step": 85 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 2.2526042461395264, + "learning_rate": 2.674418604651163e-06, + "loss": 1.2583, + "step": 86 + }, + { + "epoch": 0.13488372093023257, + "grad_norm": 2.1409525871276855, + "learning_rate": 2.7131782945736433e-06, + "loss": 1.2749, + "step": 87 + }, + { + "epoch": 0.13643410852713178, + "grad_norm": 2.350247621536255, + "learning_rate": 2.751937984496124e-06, + "loss": 1.2073, + "step": 88 + }, + { + "epoch": 0.13798449612403102, + "grad_norm": 2.1530516147613525, + "learning_rate": 2.790697674418605e-06, + "loss": 1.2357, + "step": 89 + }, + { + "epoch": 0.13953488372093023, + "grad_norm": 2.1129376888275146, + "learning_rate": 2.8294573643410855e-06, + "loss": 1.2374, + "step": 90 + }, + { + "epoch": 0.13953488372093023, + "eval_loss": 1.296906590461731, + "eval_runtime": 46.5431, + "eval_samples_per_second": 21.485, + "eval_steps_per_second": 1.354, + "step": 90 + }, + { + "epoch": 0.14108527131782947, + "grad_norm": 1.8323308229446411, + "learning_rate": 2.8682170542635663e-06, + "loss": 1.2455, + "step": 91 + }, + { + "epoch": 0.14263565891472868, + "grad_norm": 2.5223309993743896, + "learning_rate": 2.9069767441860468e-06, + "loss": 1.241, + "step": 92 + }, + { + "epoch": 0.14418604651162792, + "grad_norm": 1.8278683423995972, + "learning_rate": 2.9457364341085276e-06, + "loss": 1.1999, + "step": 93 + }, + { + "epoch": 0.14573643410852713, + "grad_norm": 1.683199167251587, + "learning_rate": 2.9844961240310076e-06, + "loss": 1.2185, + "step": 94 + }, + { + "epoch": 0.14728682170542637, + "grad_norm": 1.664718747138977, + "learning_rate": 3.0232558139534885e-06, + "loss": 1.2202, + "step": 95 + }, + { + "epoch": 0.14883720930232558, + "grad_norm": 1.8569575548171997, + "learning_rate": 3.062015503875969e-06, + "loss": 1.2114, + "step": 96 + }, + { + "epoch": 0.15038759689922482, + "grad_norm": 1.8585015535354614, + "learning_rate": 3.10077519379845e-06, + "loss": 1.1885, + "step": 97 + }, + { + "epoch": 0.15193798449612403, + "grad_norm": 1.98724365234375, + "learning_rate": 3.1395348837209307e-06, + "loss": 1.2183, + "step": 98 + }, + { + "epoch": 0.15348837209302327, + "grad_norm": 1.7224115133285522, + "learning_rate": 3.178294573643411e-06, + "loss": 1.2109, + "step": 99 + }, + { + "epoch": 0.15503875968992248, + "grad_norm": 1.634524941444397, + "learning_rate": 3.217054263565892e-06, + "loss": 1.2071, + "step": 100 + }, + { + "epoch": 0.15503875968992248, + "eval_loss": 1.2693296670913696, + "eval_runtime": 46.5124, + "eval_samples_per_second": 21.5, + "eval_steps_per_second": 1.354, + "step": 100 + }, + { + "epoch": 0.15658914728682172, + "grad_norm": 1.557700514793396, + "learning_rate": 3.2558139534883724e-06, + "loss": 1.2432, + "step": 101 + }, + { + "epoch": 0.15813953488372093, + "grad_norm": 1.700620174407959, + "learning_rate": 3.294573643410853e-06, + "loss": 1.2114, + "step": 102 + }, + { + "epoch": 0.15968992248062017, + "grad_norm": 1.8116543292999268, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.2158, + "step": 103 + }, + { + "epoch": 0.16124031007751938, + "grad_norm": 1.6625306606292725, + "learning_rate": 3.372093023255814e-06, + "loss": 1.2171, + "step": 104 + }, + { + "epoch": 0.16279069767441862, + "grad_norm": 2.052786111831665, + "learning_rate": 3.4108527131782946e-06, + "loss": 1.2303, + "step": 105 + }, + { + "epoch": 0.16434108527131783, + "grad_norm": 2.397324800491333, + "learning_rate": 3.4496124031007755e-06, + "loss": 1.2279, + "step": 106 + }, + { + "epoch": 0.16589147286821707, + "grad_norm": 2.131593942642212, + "learning_rate": 3.4883720930232564e-06, + "loss": 1.1918, + "step": 107 + }, + { + "epoch": 0.16744186046511628, + "grad_norm": 1.8036249876022339, + "learning_rate": 3.527131782945737e-06, + "loss": 1.1777, + "step": 108 + }, + { + "epoch": 0.16899224806201552, + "grad_norm": 1.5316169261932373, + "learning_rate": 3.5658914728682177e-06, + "loss": 1.2162, + "step": 109 + }, + { + "epoch": 0.17054263565891473, + "grad_norm": 2.1200644969940186, + "learning_rate": 3.6046511627906977e-06, + "loss": 1.1771, + "step": 110 + }, + { + "epoch": 0.17054263565891473, + "eval_loss": 1.2527676820755005, + "eval_runtime": 46.5002, + "eval_samples_per_second": 21.505, + "eval_steps_per_second": 1.355, + "step": 110 + }, + { + "epoch": 0.17209302325581396, + "grad_norm": 2.45514178276062, + "learning_rate": 3.6434108527131786e-06, + "loss": 1.1977, + "step": 111 + }, + { + "epoch": 0.17364341085271318, + "grad_norm": 1.8457828760147095, + "learning_rate": 3.682170542635659e-06, + "loss": 1.1834, + "step": 112 + }, + { + "epoch": 0.17519379844961241, + "grad_norm": 2.7428853511810303, + "learning_rate": 3.72093023255814e-06, + "loss": 1.1401, + "step": 113 + }, + { + "epoch": 0.17674418604651163, + "grad_norm": 2.583146095275879, + "learning_rate": 3.7596899224806203e-06, + "loss": 1.1642, + "step": 114 + }, + { + "epoch": 0.17829457364341086, + "grad_norm": 2.205864667892456, + "learning_rate": 3.798449612403101e-06, + "loss": 1.1595, + "step": 115 + }, + { + "epoch": 0.17984496124031008, + "grad_norm": 1.9899259805679321, + "learning_rate": 3.837209302325582e-06, + "loss": 1.1645, + "step": 116 + }, + { + "epoch": 0.1813953488372093, + "grad_norm": 1.7081761360168457, + "learning_rate": 3.875968992248063e-06, + "loss": 1.1369, + "step": 117 + }, + { + "epoch": 0.18294573643410852, + "grad_norm": 2.1556506156921387, + "learning_rate": 3.914728682170543e-06, + "loss": 1.148, + "step": 118 + }, + { + "epoch": 0.18449612403100776, + "grad_norm": 1.763687014579773, + "learning_rate": 3.953488372093024e-06, + "loss": 1.1631, + "step": 119 + }, + { + "epoch": 0.18604651162790697, + "grad_norm": 1.9201045036315918, + "learning_rate": 3.992248062015504e-06, + "loss": 1.1472, + "step": 120 + }, + { + "epoch": 0.18604651162790697, + "eval_loss": 1.2374286651611328, + "eval_runtime": 46.5809, + "eval_samples_per_second": 21.468, + "eval_steps_per_second": 1.352, + "step": 120 + }, + { + "epoch": 0.1875968992248062, + "grad_norm": 2.8066470623016357, + "learning_rate": 4.031007751937985e-06, + "loss": 1.1611, + "step": 121 + }, + { + "epoch": 0.18914728682170542, + "grad_norm": 1.8357489109039307, + "learning_rate": 4.0697674418604655e-06, + "loss": 1.1304, + "step": 122 + }, + { + "epoch": 0.19069767441860466, + "grad_norm": 2.2851312160491943, + "learning_rate": 4.108527131782946e-06, + "loss": 1.161, + "step": 123 + }, + { + "epoch": 0.19224806201550387, + "grad_norm": 1.841111660003662, + "learning_rate": 4.1472868217054264e-06, + "loss": 1.1538, + "step": 124 + }, + { + "epoch": 0.1937984496124031, + "grad_norm": 2.4325246810913086, + "learning_rate": 4.186046511627907e-06, + "loss": 1.1469, + "step": 125 + }, + { + "epoch": 0.19534883720930232, + "grad_norm": 1.4917737245559692, + "learning_rate": 4.224806201550387e-06, + "loss": 1.1266, + "step": 126 + }, + { + "epoch": 0.19689922480620156, + "grad_norm": 2.5839462280273438, + "learning_rate": 4.263565891472868e-06, + "loss": 1.1439, + "step": 127 + }, + { + "epoch": 0.19844961240310077, + "grad_norm": 2.911651611328125, + "learning_rate": 4.302325581395349e-06, + "loss": 1.1537, + "step": 128 + }, + { + "epoch": 0.2, + "grad_norm": 2.2323522567749023, + "learning_rate": 4.34108527131783e-06, + "loss": 1.1498, + "step": 129 + }, + { + "epoch": 0.20155038759689922, + "grad_norm": 1.976426124572754, + "learning_rate": 4.379844961240311e-06, + "loss": 1.1103, + "step": 130 + }, + { + "epoch": 0.20155038759689922, + "eval_loss": 1.2214908599853516, + "eval_runtime": 46.6151, + "eval_samples_per_second": 21.452, + "eval_steps_per_second": 1.351, + "step": 130 + }, + { + "epoch": 0.20310077519379846, + "grad_norm": 1.4584643840789795, + "learning_rate": 4.418604651162791e-06, + "loss": 1.1413, + "step": 131 + }, + { + "epoch": 0.20465116279069767, + "grad_norm": 1.827351689338684, + "learning_rate": 4.457364341085272e-06, + "loss": 1.142, + "step": 132 + }, + { + "epoch": 0.2062015503875969, + "grad_norm": 2.4622914791107178, + "learning_rate": 4.4961240310077525e-06, + "loss": 1.1604, + "step": 133 + }, + { + "epoch": 0.20775193798449612, + "grad_norm": 1.939195156097412, + "learning_rate": 4.5348837209302326e-06, + "loss": 1.0974, + "step": 134 + }, + { + "epoch": 0.20930232558139536, + "grad_norm": 1.4621402025222778, + "learning_rate": 4.573643410852713e-06, + "loss": 1.1114, + "step": 135 + }, + { + "epoch": 0.21085271317829457, + "grad_norm": 2.018838882446289, + "learning_rate": 4.612403100775194e-06, + "loss": 1.1252, + "step": 136 + }, + { + "epoch": 0.2124031007751938, + "grad_norm": 2.0258100032806396, + "learning_rate": 4.651162790697675e-06, + "loss": 1.1045, + "step": 137 + }, + { + "epoch": 0.21395348837209302, + "grad_norm": 1.714446783065796, + "learning_rate": 4.689922480620155e-06, + "loss": 1.1026, + "step": 138 + }, + { + "epoch": 0.21550387596899226, + "grad_norm": 1.330955147743225, + "learning_rate": 4.728682170542636e-06, + "loss": 1.096, + "step": 139 + }, + { + "epoch": 0.21705426356589147, + "grad_norm": 2.121669054031372, + "learning_rate": 4.767441860465117e-06, + "loss": 1.0984, + "step": 140 + }, + { + "epoch": 0.21705426356589147, + "eval_loss": 1.2069424390792847, + "eval_runtime": 46.2613, + "eval_samples_per_second": 21.616, + "eval_steps_per_second": 1.362, + "step": 140 + }, + { + "epoch": 0.2186046511627907, + "grad_norm": 1.4863253831863403, + "learning_rate": 4.806201550387598e-06, + "loss": 1.1096, + "step": 141 + }, + { + "epoch": 0.22015503875968992, + "grad_norm": 2.0480966567993164, + "learning_rate": 4.844961240310078e-06, + "loss": 1.1017, + "step": 142 + }, + { + "epoch": 0.22170542635658916, + "grad_norm": 1.563471794128418, + "learning_rate": 4.883720930232559e-06, + "loss": 1.1047, + "step": 143 + }, + { + "epoch": 0.22325581395348837, + "grad_norm": 1.5344929695129395, + "learning_rate": 4.922480620155039e-06, + "loss": 1.118, + "step": 144 + }, + { + "epoch": 0.2248062015503876, + "grad_norm": 1.8281855583190918, + "learning_rate": 4.9612403100775195e-06, + "loss": 1.0955, + "step": 145 + }, + { + "epoch": 0.22635658914728682, + "grad_norm": 1.6736650466918945, + "learning_rate": 5e-06, + "loss": 1.1399, + "step": 146 + }, + { + "epoch": 0.22790697674418606, + "grad_norm": 2.0063889026641846, + "learning_rate": 4.995693367786392e-06, + "loss": 1.0867, + "step": 147 + }, + { + "epoch": 0.22945736434108527, + "grad_norm": 1.842398762702942, + "learning_rate": 4.991386735572782e-06, + "loss": 1.0799, + "step": 148 + }, + { + "epoch": 0.2310077519379845, + "grad_norm": 1.9986329078674316, + "learning_rate": 4.987080103359174e-06, + "loss": 1.1371, + "step": 149 + }, + { + "epoch": 0.23255813953488372, + "grad_norm": 1.5376241207122803, + "learning_rate": 4.982773471145564e-06, + "loss": 1.0728, + "step": 150 + }, + { + "epoch": 0.23255813953488372, + "eval_loss": 1.1861159801483154, + "eval_runtime": 46.3174, + "eval_samples_per_second": 21.59, + "eval_steps_per_second": 1.36, + "step": 150 + }, + { + "epoch": 0.23410852713178296, + "grad_norm": 2.3728039264678955, + "learning_rate": 4.978466838931956e-06, + "loss": 1.128, + "step": 151 + }, + { + "epoch": 0.23565891472868217, + "grad_norm": 2.1690421104431152, + "learning_rate": 4.974160206718346e-06, + "loss": 1.1249, + "step": 152 + }, + { + "epoch": 0.2372093023255814, + "grad_norm": 1.9691181182861328, + "learning_rate": 4.969853574504738e-06, + "loss": 1.0955, + "step": 153 + }, + { + "epoch": 0.23875968992248062, + "grad_norm": 2.235119581222534, + "learning_rate": 4.965546942291129e-06, + "loss": 1.0875, + "step": 154 + }, + { + "epoch": 0.24031007751937986, + "grad_norm": 1.7278294563293457, + "learning_rate": 4.9612403100775195e-06, + "loss": 1.072, + "step": 155 + }, + { + "epoch": 0.24186046511627907, + "grad_norm": 1.7765733003616333, + "learning_rate": 4.956933677863911e-06, + "loss": 1.0662, + "step": 156 + }, + { + "epoch": 0.2434108527131783, + "grad_norm": 2.1046664714813232, + "learning_rate": 4.9526270456503015e-06, + "loss": 1.0748, + "step": 157 + }, + { + "epoch": 0.24496124031007752, + "grad_norm": 1.7691307067871094, + "learning_rate": 4.948320413436693e-06, + "loss": 1.0809, + "step": 158 + }, + { + "epoch": 0.24651162790697675, + "grad_norm": 1.506629228591919, + "learning_rate": 4.944013781223083e-06, + "loss": 1.0807, + "step": 159 + }, + { + "epoch": 0.24806201550387597, + "grad_norm": 1.8298925161361694, + "learning_rate": 4.939707149009475e-06, + "loss": 1.0976, + "step": 160 + }, + { + "epoch": 0.24806201550387597, + "eval_loss": 1.1819802522659302, + "eval_runtime": 46.1853, + "eval_samples_per_second": 21.652, + "eval_steps_per_second": 1.364, + "step": 160 + }, + { + "epoch": 0.2496124031007752, + "grad_norm": 1.712808609008789, + "learning_rate": 4.935400516795866e-06, + "loss": 1.0967, + "step": 161 + }, + { + "epoch": 0.25116279069767444, + "grad_norm": 2.387460947036743, + "learning_rate": 4.931093884582257e-06, + "loss": 1.1141, + "step": 162 + }, + { + "epoch": 0.2527131782945736, + "grad_norm": 1.8970969915390015, + "learning_rate": 4.926787252368648e-06, + "loss": 1.0605, + "step": 163 + }, + { + "epoch": 0.25426356589147286, + "grad_norm": 1.6015546321868896, + "learning_rate": 4.922480620155039e-06, + "loss": 1.0739, + "step": 164 + }, + { + "epoch": 0.2558139534883721, + "grad_norm": 1.9534488916397095, + "learning_rate": 4.91817398794143e-06, + "loss": 1.0496, + "step": 165 + }, + { + "epoch": 0.25736434108527134, + "grad_norm": 2.064868927001953, + "learning_rate": 4.913867355727821e-06, + "loss": 1.0796, + "step": 166 + }, + { + "epoch": 0.2589147286821705, + "grad_norm": 1.4253895282745361, + "learning_rate": 4.909560723514212e-06, + "loss": 1.0444, + "step": 167 + }, + { + "epoch": 0.26046511627906976, + "grad_norm": 2.0125179290771484, + "learning_rate": 4.905254091300603e-06, + "loss": 1.0418, + "step": 168 + }, + { + "epoch": 0.262015503875969, + "grad_norm": 1.7060673236846924, + "learning_rate": 4.900947459086994e-06, + "loss": 1.1122, + "step": 169 + }, + { + "epoch": 0.26356589147286824, + "grad_norm": 1.7167747020721436, + "learning_rate": 4.896640826873385e-06, + "loss": 1.092, + "step": 170 + }, + { + "epoch": 0.26356589147286824, + "eval_loss": 1.1651350259780884, + "eval_runtime": 46.3335, + "eval_samples_per_second": 21.583, + "eval_steps_per_second": 1.36, + "step": 170 + }, + { + "epoch": 0.2651162790697674, + "grad_norm": 1.3494670391082764, + "learning_rate": 4.892334194659777e-06, + "loss": 1.069, + "step": 171 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 2.1221230030059814, + "learning_rate": 4.888027562446167e-06, + "loss": 1.0827, + "step": 172 + }, + { + "epoch": 0.2682170542635659, + "grad_norm": 1.819272756576538, + "learning_rate": 4.883720930232559e-06, + "loss": 1.0702, + "step": 173 + }, + { + "epoch": 0.26976744186046514, + "grad_norm": 1.7097699642181396, + "learning_rate": 4.879414298018949e-06, + "loss": 1.0715, + "step": 174 + }, + { + "epoch": 0.2713178294573643, + "grad_norm": 2.005596399307251, + "learning_rate": 4.875107665805341e-06, + "loss": 1.0736, + "step": 175 + }, + { + "epoch": 0.27286821705426356, + "grad_norm": 2.6549158096313477, + "learning_rate": 4.870801033591732e-06, + "loss": 1.0681, + "step": 176 + }, + { + "epoch": 0.2744186046511628, + "grad_norm": 1.527632713317871, + "learning_rate": 4.8664944013781225e-06, + "loss": 1.0675, + "step": 177 + }, + { + "epoch": 0.27596899224806204, + "grad_norm": 3.4207332134246826, + "learning_rate": 4.862187769164514e-06, + "loss": 1.0977, + "step": 178 + }, + { + "epoch": 0.2775193798449612, + "grad_norm": 2.768385648727417, + "learning_rate": 4.857881136950905e-06, + "loss": 1.0411, + "step": 179 + }, + { + "epoch": 0.27906976744186046, + "grad_norm": 2.1901512145996094, + "learning_rate": 4.853574504737296e-06, + "loss": 1.0946, + "step": 180 + }, + { + "epoch": 0.27906976744186046, + "eval_loss": 1.1633468866348267, + "eval_runtime": 46.353, + "eval_samples_per_second": 21.574, + "eval_steps_per_second": 1.359, + "step": 180 + }, + { + "epoch": 0.2806201550387597, + "grad_norm": 2.5135326385498047, + "learning_rate": 4.849267872523687e-06, + "loss": 1.0576, + "step": 181 + }, + { + "epoch": 0.28217054263565894, + "grad_norm": 1.8446581363677979, + "learning_rate": 4.844961240310078e-06, + "loss": 1.0539, + "step": 182 + }, + { + "epoch": 0.2837209302325581, + "grad_norm": 1.7093740701675415, + "learning_rate": 4.840654608096469e-06, + "loss": 1.0753, + "step": 183 + }, + { + "epoch": 0.28527131782945736, + "grad_norm": 1.4978317022323608, + "learning_rate": 4.8363479758828606e-06, + "loss": 1.0597, + "step": 184 + }, + { + "epoch": 0.2868217054263566, + "grad_norm": 1.7498767375946045, + "learning_rate": 4.832041343669251e-06, + "loss": 1.0675, + "step": 185 + }, + { + "epoch": 0.28837209302325584, + "grad_norm": 1.6801395416259766, + "learning_rate": 4.8277347114556425e-06, + "loss": 1.0532, + "step": 186 + }, + { + "epoch": 0.289922480620155, + "grad_norm": 2.029369354248047, + "learning_rate": 4.823428079242033e-06, + "loss": 1.075, + "step": 187 + }, + { + "epoch": 0.29147286821705426, + "grad_norm": 1.808666467666626, + "learning_rate": 4.8191214470284244e-06, + "loss": 1.0713, + "step": 188 + }, + { + "epoch": 0.2930232558139535, + "grad_norm": 1.4356886148452759, + "learning_rate": 4.814814814814815e-06, + "loss": 1.0571, + "step": 189 + }, + { + "epoch": 0.29457364341085274, + "grad_norm": 1.743287205696106, + "learning_rate": 4.810508182601206e-06, + "loss": 1.047, + "step": 190 + }, + { + "epoch": 0.29457364341085274, + "eval_loss": 1.1456130743026733, + "eval_runtime": 46.2826, + "eval_samples_per_second": 21.606, + "eval_steps_per_second": 1.361, + "step": 190 + }, + { + "epoch": 0.2961240310077519, + "grad_norm": 1.9248270988464355, + "learning_rate": 4.806201550387598e-06, + "loss": 1.061, + "step": 191 + }, + { + "epoch": 0.29767441860465116, + "grad_norm": 1.4090445041656494, + "learning_rate": 4.801894918173988e-06, + "loss": 1.085, + "step": 192 + }, + { + "epoch": 0.2992248062015504, + "grad_norm": 1.858083724975586, + "learning_rate": 4.79758828596038e-06, + "loss": 1.0412, + "step": 193 + }, + { + "epoch": 0.30077519379844964, + "grad_norm": 1.6754525899887085, + "learning_rate": 4.79328165374677e-06, + "loss": 1.0611, + "step": 194 + }, + { + "epoch": 0.3023255813953488, + "grad_norm": 1.5942410230636597, + "learning_rate": 4.788975021533162e-06, + "loss": 1.0373, + "step": 195 + }, + { + "epoch": 0.30387596899224806, + "grad_norm": 1.6549283266067505, + "learning_rate": 4.784668389319552e-06, + "loss": 1.0343, + "step": 196 + }, + { + "epoch": 0.3054263565891473, + "grad_norm": 2.2277469635009766, + "learning_rate": 4.780361757105944e-06, + "loss": 1.0881, + "step": 197 + }, + { + "epoch": 0.30697674418604654, + "grad_norm": 1.6255255937576294, + "learning_rate": 4.776055124892335e-06, + "loss": 0.9892, + "step": 198 + }, + { + "epoch": 0.3085271317829457, + "grad_norm": 1.761346697807312, + "learning_rate": 4.7717484926787255e-06, + "loss": 1.0547, + "step": 199 + }, + { + "epoch": 0.31007751937984496, + "grad_norm": 1.673264503479004, + "learning_rate": 4.767441860465117e-06, + "loss": 1.0516, + "step": 200 + }, + { + "epoch": 0.31007751937984496, + "eval_loss": 1.141044020652771, + "eval_runtime": 46.3982, + "eval_samples_per_second": 21.553, + "eval_steps_per_second": 1.358, + "step": 200 + }, + { + "epoch": 0.3116279069767442, + "grad_norm": 2.031104803085327, + "learning_rate": 4.7631352282515074e-06, + "loss": 1.0646, + "step": 201 + }, + { + "epoch": 0.31317829457364343, + "grad_norm": 2.710296630859375, + "learning_rate": 4.758828596037899e-06, + "loss": 1.0444, + "step": 202 + }, + { + "epoch": 0.3147286821705426, + "grad_norm": 1.8177646398544312, + "learning_rate": 4.754521963824289e-06, + "loss": 1.05, + "step": 203 + }, + { + "epoch": 0.31627906976744186, + "grad_norm": 1.5122544765472412, + "learning_rate": 4.750215331610681e-06, + "loss": 1.0373, + "step": 204 + }, + { + "epoch": 0.3178294573643411, + "grad_norm": 2.730760335922241, + "learning_rate": 4.745908699397072e-06, + "loss": 1.0868, + "step": 205 + }, + { + "epoch": 0.31937984496124033, + "grad_norm": 2.1081650257110596, + "learning_rate": 4.741602067183463e-06, + "loss": 1.0587, + "step": 206 + }, + { + "epoch": 0.3209302325581395, + "grad_norm": 2.4439713954925537, + "learning_rate": 4.737295434969854e-06, + "loss": 1.0528, + "step": 207 + }, + { + "epoch": 0.32248062015503876, + "grad_norm": 1.6500515937805176, + "learning_rate": 4.732988802756245e-06, + "loss": 1.0446, + "step": 208 + }, + { + "epoch": 0.324031007751938, + "grad_norm": 1.764997959136963, + "learning_rate": 4.728682170542636e-06, + "loss": 1.0973, + "step": 209 + }, + { + "epoch": 0.32558139534883723, + "grad_norm": 1.9934346675872803, + "learning_rate": 4.724375538329027e-06, + "loss": 1.0559, + "step": 210 + }, + { + "epoch": 0.32558139534883723, + "eval_loss": 1.139148473739624, + "eval_runtime": 46.3053, + "eval_samples_per_second": 21.596, + "eval_steps_per_second": 1.361, + "step": 210 + }, + { + "epoch": 0.3271317829457364, + "grad_norm": 2.008603572845459, + "learning_rate": 4.720068906115418e-06, + "loss": 1.0477, + "step": 211 + }, + { + "epoch": 0.32868217054263565, + "grad_norm": 1.631761074066162, + "learning_rate": 4.715762273901809e-06, + "loss": 1.0588, + "step": 212 + }, + { + "epoch": 0.3302325581395349, + "grad_norm": 1.682797908782959, + "learning_rate": 4.7114556416882e-06, + "loss": 1.045, + "step": 213 + }, + { + "epoch": 0.33178294573643413, + "grad_norm": 1.7425205707550049, + "learning_rate": 4.707149009474591e-06, + "loss": 1.0116, + "step": 214 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.903198480606079, + "learning_rate": 4.702842377260982e-06, + "loss": 1.0358, + "step": 215 + }, + { + "epoch": 0.33488372093023255, + "grad_norm": 1.5025352239608765, + "learning_rate": 4.698535745047373e-06, + "loss": 1.0306, + "step": 216 + }, + { + "epoch": 0.3364341085271318, + "grad_norm": 1.7889755964279175, + "learning_rate": 4.694229112833764e-06, + "loss": 1.0451, + "step": 217 + }, + { + "epoch": 0.33798449612403103, + "grad_norm": 1.4736870527267456, + "learning_rate": 4.689922480620155e-06, + "loss": 1.0245, + "step": 218 + }, + { + "epoch": 0.3395348837209302, + "grad_norm": 1.4667205810546875, + "learning_rate": 4.6856158484065466e-06, + "loss": 0.9702, + "step": 219 + }, + { + "epoch": 0.34108527131782945, + "grad_norm": 1.825951337814331, + "learning_rate": 4.681309216192937e-06, + "loss": 1.0135, + "step": 220 + }, + { + "epoch": 0.34108527131782945, + "eval_loss": 1.125650405883789, + "eval_runtime": 46.3561, + "eval_samples_per_second": 21.572, + "eval_steps_per_second": 1.359, + "step": 220 + }, + { + "epoch": 0.3426356589147287, + "grad_norm": 1.6442840099334717, + "learning_rate": 4.6770025839793285e-06, + "loss": 1.0692, + "step": 221 + }, + { + "epoch": 0.34418604651162793, + "grad_norm": 1.994643211364746, + "learning_rate": 4.672695951765719e-06, + "loss": 1.0225, + "step": 222 + }, + { + "epoch": 0.3457364341085271, + "grad_norm": 1.6924593448638916, + "learning_rate": 4.6683893195521104e-06, + "loss": 1.0258, + "step": 223 + }, + { + "epoch": 0.34728682170542635, + "grad_norm": 1.9766777753829956, + "learning_rate": 4.664082687338502e-06, + "loss": 1.0297, + "step": 224 + }, + { + "epoch": 0.3488372093023256, + "grad_norm": 1.4598184823989868, + "learning_rate": 4.659776055124892e-06, + "loss": 1.0646, + "step": 225 + }, + { + "epoch": 0.35038759689922483, + "grad_norm": 1.876902461051941, + "learning_rate": 4.655469422911284e-06, + "loss": 1.0572, + "step": 226 + }, + { + "epoch": 0.351937984496124, + "grad_norm": 1.948635220527649, + "learning_rate": 4.651162790697675e-06, + "loss": 1.0218, + "step": 227 + }, + { + "epoch": 0.35348837209302325, + "grad_norm": 1.4742966890335083, + "learning_rate": 4.646856158484066e-06, + "loss": 1.0234, + "step": 228 + }, + { + "epoch": 0.3550387596899225, + "grad_norm": 1.8568025827407837, + "learning_rate": 4.642549526270457e-06, + "loss": 1.0538, + "step": 229 + }, + { + "epoch": 0.35658914728682173, + "grad_norm": 1.664680004119873, + "learning_rate": 4.638242894056848e-06, + "loss": 1.0414, + "step": 230 + }, + { + "epoch": 0.35658914728682173, + "eval_loss": 1.1241024732589722, + "eval_runtime": 46.3527, + "eval_samples_per_second": 21.574, + "eval_steps_per_second": 1.359, + "step": 230 + }, + { + "epoch": 0.3581395348837209, + "grad_norm": 1.617343544960022, + "learning_rate": 4.633936261843239e-06, + "loss": 1.0035, + "step": 231 + }, + { + "epoch": 0.35968992248062015, + "grad_norm": 1.8938863277435303, + "learning_rate": 4.62962962962963e-06, + "loss": 1.0488, + "step": 232 + }, + { + "epoch": 0.3612403100775194, + "grad_norm": 1.4788931608200073, + "learning_rate": 4.625322997416021e-06, + "loss": 1.0032, + "step": 233 + }, + { + "epoch": 0.3627906976744186, + "grad_norm": 1.4180240631103516, + "learning_rate": 4.621016365202412e-06, + "loss": 1.0016, + "step": 234 + }, + { + "epoch": 0.3643410852713178, + "grad_norm": 2.0693907737731934, + "learning_rate": 4.616709732988803e-06, + "loss": 1.0599, + "step": 235 + }, + { + "epoch": 0.36589147286821705, + "grad_norm": 1.6417962312698364, + "learning_rate": 4.612403100775194e-06, + "loss": 1.0194, + "step": 236 + }, + { + "epoch": 0.3674418604651163, + "grad_norm": 2.0166518688201904, + "learning_rate": 4.608096468561586e-06, + "loss": 1.0507, + "step": 237 + }, + { + "epoch": 0.3689922480620155, + "grad_norm": 1.8325533866882324, + "learning_rate": 4.603789836347976e-06, + "loss": 1.0052, + "step": 238 + }, + { + "epoch": 0.3705426356589147, + "grad_norm": 1.4332795143127441, + "learning_rate": 4.599483204134368e-06, + "loss": 1.0225, + "step": 239 + }, + { + "epoch": 0.37209302325581395, + "grad_norm": 2.0295088291168213, + "learning_rate": 4.595176571920759e-06, + "loss": 1.0317, + "step": 240 + }, + { + "epoch": 0.37209302325581395, + "eval_loss": 1.1203081607818604, + "eval_runtime": 46.3802, + "eval_samples_per_second": 21.561, + "eval_steps_per_second": 1.358, + "step": 240 + }, + { + "epoch": 0.3736434108527132, + "grad_norm": 2.3276357650756836, + "learning_rate": 4.5908699397071495e-06, + "loss": 1.0321, + "step": 241 + }, + { + "epoch": 0.3751937984496124, + "grad_norm": 1.6102005243301392, + "learning_rate": 4.586563307493541e-06, + "loss": 1.0207, + "step": 242 + }, + { + "epoch": 0.3767441860465116, + "grad_norm": 2.5230729579925537, + "learning_rate": 4.5822566752799315e-06, + "loss": 1.0534, + "step": 243 + }, + { + "epoch": 0.37829457364341085, + "grad_norm": 2.3337669372558594, + "learning_rate": 4.577950043066323e-06, + "loss": 1.0261, + "step": 244 + }, + { + "epoch": 0.3798449612403101, + "grad_norm": 2.6739656925201416, + "learning_rate": 4.573643410852713e-06, + "loss": 1.0279, + "step": 245 + }, + { + "epoch": 0.3813953488372093, + "grad_norm": 2.375446081161499, + "learning_rate": 4.569336778639105e-06, + "loss": 1.0272, + "step": 246 + }, + { + "epoch": 0.3829457364341085, + "grad_norm": 1.7358253002166748, + "learning_rate": 4.565030146425496e-06, + "loss": 1.0303, + "step": 247 + }, + { + "epoch": 0.38449612403100775, + "grad_norm": 2.24027156829834, + "learning_rate": 4.560723514211887e-06, + "loss": 1.021, + "step": 248 + }, + { + "epoch": 0.386046511627907, + "grad_norm": 2.228729724884033, + "learning_rate": 4.556416881998278e-06, + "loss": 1.0436, + "step": 249 + }, + { + "epoch": 0.3875968992248062, + "grad_norm": 1.5235542058944702, + "learning_rate": 4.552110249784669e-06, + "loss": 1.0248, + "step": 250 + }, + { + "epoch": 0.3875968992248062, + "eval_loss": 1.1150026321411133, + "eval_runtime": 46.3762, + "eval_samples_per_second": 21.563, + "eval_steps_per_second": 1.358, + "step": 250 + }, + { + "epoch": 0.3891472868217054, + "grad_norm": 1.5835521221160889, + "learning_rate": 4.54780361757106e-06, + "loss": 1.0066, + "step": 251 + }, + { + "epoch": 0.39069767441860465, + "grad_norm": 1.6656657457351685, + "learning_rate": 4.543496985357451e-06, + "loss": 1.0438, + "step": 252 + }, + { + "epoch": 0.3922480620155039, + "grad_norm": 2.0052080154418945, + "learning_rate": 4.539190353143842e-06, + "loss": 0.9879, + "step": 253 + }, + { + "epoch": 0.3937984496124031, + "grad_norm": 1.6761670112609863, + "learning_rate": 4.5348837209302326e-06, + "loss": 0.9964, + "step": 254 + }, + { + "epoch": 0.3953488372093023, + "grad_norm": 2.0463616847991943, + "learning_rate": 4.530577088716624e-06, + "loss": 1.0137, + "step": 255 + }, + { + "epoch": 0.39689922480620154, + "grad_norm": 1.684457540512085, + "learning_rate": 4.526270456503015e-06, + "loss": 0.9994, + "step": 256 + }, + { + "epoch": 0.3984496124031008, + "grad_norm": 1.9504594802856445, + "learning_rate": 4.521963824289406e-06, + "loss": 1.008, + "step": 257 + }, + { + "epoch": 0.4, + "grad_norm": 2.0540153980255127, + "learning_rate": 4.517657192075797e-06, + "loss": 1.0145, + "step": 258 + }, + { + "epoch": 0.4015503875968992, + "grad_norm": 2.077800989151001, + "learning_rate": 4.513350559862188e-06, + "loss": 1.0078, + "step": 259 + }, + { + "epoch": 0.40310077519379844, + "grad_norm": 2.054661273956299, + "learning_rate": 4.509043927648579e-06, + "loss": 1.0202, + "step": 260 + }, + { + "epoch": 0.40310077519379844, + "eval_loss": 1.112622857093811, + "eval_runtime": 46.4135, + "eval_samples_per_second": 21.545, + "eval_steps_per_second": 1.357, + "step": 260 + }, + { + "epoch": 0.4046511627906977, + "grad_norm": 1.7232954502105713, + "learning_rate": 4.50473729543497e-06, + "loss": 0.9975, + "step": 261 + }, + { + "epoch": 0.4062015503875969, + "grad_norm": 1.672469973564148, + "learning_rate": 4.500430663221361e-06, + "loss": 1.0262, + "step": 262 + }, + { + "epoch": 0.4077519379844961, + "grad_norm": 1.5814067125320435, + "learning_rate": 4.4961240310077525e-06, + "loss": 0.9977, + "step": 263 + }, + { + "epoch": 0.40930232558139534, + "grad_norm": 2.1820123195648193, + "learning_rate": 4.491817398794143e-06, + "loss": 1.0404, + "step": 264 + }, + { + "epoch": 0.4108527131782946, + "grad_norm": 1.5097086429595947, + "learning_rate": 4.4875107665805345e-06, + "loss": 1.0023, + "step": 265 + }, + { + "epoch": 0.4124031007751938, + "grad_norm": 1.551700472831726, + "learning_rate": 4.483204134366925e-06, + "loss": 0.9823, + "step": 266 + }, + { + "epoch": 0.413953488372093, + "grad_norm": 1.595078945159912, + "learning_rate": 4.478897502153316e-06, + "loss": 1.0101, + "step": 267 + }, + { + "epoch": 0.41550387596899224, + "grad_norm": 1.7391844987869263, + "learning_rate": 4.474590869939707e-06, + "loss": 1.0208, + "step": 268 + }, + { + "epoch": 0.4170542635658915, + "grad_norm": 1.820359706878662, + "learning_rate": 4.470284237726098e-06, + "loss": 0.982, + "step": 269 + }, + { + "epoch": 0.4186046511627907, + "grad_norm": 1.852407693862915, + "learning_rate": 4.46597760551249e-06, + "loss": 0.9752, + "step": 270 + }, + { + "epoch": 0.4186046511627907, + "eval_loss": 1.109637975692749, + "eval_runtime": 46.4539, + "eval_samples_per_second": 21.527, + "eval_steps_per_second": 1.356, + "step": 270 + }, + { + "epoch": 0.4201550387596899, + "grad_norm": 2.0624260902404785, + "learning_rate": 4.46167097329888e-06, + "loss": 0.98, + "step": 271 + }, + { + "epoch": 0.42170542635658914, + "grad_norm": 1.4282116889953613, + "learning_rate": 4.457364341085272e-06, + "loss": 0.9782, + "step": 272 + }, + { + "epoch": 0.4232558139534884, + "grad_norm": 2.189063549041748, + "learning_rate": 4.453057708871662e-06, + "loss": 1.0447, + "step": 273 + }, + { + "epoch": 0.4248062015503876, + "grad_norm": 1.8906320333480835, + "learning_rate": 4.448751076658054e-06, + "loss": 1.0013, + "step": 274 + }, + { + "epoch": 0.4263565891472868, + "grad_norm": 1.7382593154907227, + "learning_rate": 4.444444444444444e-06, + "loss": 0.9955, + "step": 275 + }, + { + "epoch": 0.42790697674418604, + "grad_norm": 1.7929259538650513, + "learning_rate": 4.4401378122308355e-06, + "loss": 1.0087, + "step": 276 + }, + { + "epoch": 0.4294573643410853, + "grad_norm": 1.4211113452911377, + "learning_rate": 4.435831180017227e-06, + "loss": 1.0109, + "step": 277 + }, + { + "epoch": 0.4310077519379845, + "grad_norm": 2.430835008621216, + "learning_rate": 4.4315245478036175e-06, + "loss": 1.0401, + "step": 278 + }, + { + "epoch": 0.4325581395348837, + "grad_norm": 2.3752498626708984, + "learning_rate": 4.427217915590009e-06, + "loss": 1.0202, + "step": 279 + }, + { + "epoch": 0.43410852713178294, + "grad_norm": 1.6907297372817993, + "learning_rate": 4.4229112833764e-06, + "loss": 1.001, + "step": 280 + }, + { + "epoch": 0.43410852713178294, + "eval_loss": 1.098842978477478, + "eval_runtime": 46.5116, + "eval_samples_per_second": 21.5, + "eval_steps_per_second": 1.355, + "step": 280 + }, + { + "epoch": 0.4356589147286822, + "grad_norm": 1.5076638460159302, + "learning_rate": 4.418604651162791e-06, + "loss": 1.0402, + "step": 281 + }, + { + "epoch": 0.4372093023255814, + "grad_norm": 1.7280958890914917, + "learning_rate": 4.414298018949182e-06, + "loss": 1.0115, + "step": 282 + }, + { + "epoch": 0.4387596899224806, + "grad_norm": 1.711249589920044, + "learning_rate": 4.409991386735573e-06, + "loss": 0.9946, + "step": 283 + }, + { + "epoch": 0.44031007751937984, + "grad_norm": 1.9347134828567505, + "learning_rate": 4.405684754521964e-06, + "loss": 1.0077, + "step": 284 + }, + { + "epoch": 0.4418604651162791, + "grad_norm": 1.5475000143051147, + "learning_rate": 4.4013781223083555e-06, + "loss": 1.0163, + "step": 285 + }, + { + "epoch": 0.4434108527131783, + "grad_norm": 1.4094364643096924, + "learning_rate": 4.397071490094746e-06, + "loss": 0.9963, + "step": 286 + }, + { + "epoch": 0.4449612403100775, + "grad_norm": 1.7702579498291016, + "learning_rate": 4.3927648578811375e-06, + "loss": 0.9867, + "step": 287 + }, + { + "epoch": 0.44651162790697674, + "grad_norm": 2.4476609230041504, + "learning_rate": 4.388458225667529e-06, + "loss": 1.0049, + "step": 288 + }, + { + "epoch": 0.448062015503876, + "grad_norm": 1.6922342777252197, + "learning_rate": 4.384151593453919e-06, + "loss": 1.0192, + "step": 289 + }, + { + "epoch": 0.4496124031007752, + "grad_norm": 1.8502883911132812, + "learning_rate": 4.379844961240311e-06, + "loss": 1.0031, + "step": 290 + }, + { + "epoch": 0.4496124031007752, + "eval_loss": 1.095705270767212, + "eval_runtime": 46.5015, + "eval_samples_per_second": 21.505, + "eval_steps_per_second": 1.355, + "step": 290 + }, + { + "epoch": 0.4511627906976744, + "grad_norm": 1.6261955499649048, + "learning_rate": 4.375538329026701e-06, + "loss": 0.9904, + "step": 291 + }, + { + "epoch": 0.45271317829457364, + "grad_norm": 1.3905696868896484, + "learning_rate": 4.371231696813093e-06, + "loss": 0.9788, + "step": 292 + }, + { + "epoch": 0.4542635658914729, + "grad_norm": 1.8039098978042603, + "learning_rate": 4.366925064599484e-06, + "loss": 0.9855, + "step": 293 + }, + { + "epoch": 0.4558139534883721, + "grad_norm": 1.7904952764511108, + "learning_rate": 4.362618432385875e-06, + "loss": 1.0097, + "step": 294 + }, + { + "epoch": 0.4573643410852713, + "grad_norm": 1.918278455734253, + "learning_rate": 4.358311800172266e-06, + "loss": 0.9839, + "step": 295 + }, + { + "epoch": 0.45891472868217054, + "grad_norm": 1.6572740077972412, + "learning_rate": 4.354005167958657e-06, + "loss": 1.0167, + "step": 296 + }, + { + "epoch": 0.4604651162790698, + "grad_norm": 1.6543735265731812, + "learning_rate": 4.349698535745048e-06, + "loss": 1.0181, + "step": 297 + }, + { + "epoch": 0.462015503875969, + "grad_norm": 1.5788277387619019, + "learning_rate": 4.345391903531439e-06, + "loss": 1.013, + "step": 298 + }, + { + "epoch": 0.4635658914728682, + "grad_norm": 1.4497977495193481, + "learning_rate": 4.34108527131783e-06, + "loss": 0.9837, + "step": 299 + }, + { + "epoch": 0.46511627906976744, + "grad_norm": 1.2367212772369385, + "learning_rate": 4.336778639104221e-06, + "loss": 0.9611, + "step": 300 + }, + { + "epoch": 0.46511627906976744, + "eval_loss": 1.0975078344345093, + "eval_runtime": 46.5651, + "eval_samples_per_second": 21.475, + "eval_steps_per_second": 1.353, + "step": 300 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 1.7050374746322632, + "learning_rate": 4.332472006890612e-06, + "loss": 1.0096, + "step": 301 + }, + { + "epoch": 0.4682170542635659, + "grad_norm": 1.4360929727554321, + "learning_rate": 4.328165374677003e-06, + "loss": 0.9569, + "step": 302 + }, + { + "epoch": 0.4697674418604651, + "grad_norm": 1.49595308303833, + "learning_rate": 4.323858742463394e-06, + "loss": 1.0097, + "step": 303 + }, + { + "epoch": 0.47131782945736433, + "grad_norm": 1.9372719526290894, + "learning_rate": 4.319552110249785e-06, + "loss": 0.9716, + "step": 304 + }, + { + "epoch": 0.4728682170542636, + "grad_norm": 1.4308205842971802, + "learning_rate": 4.3152454780361766e-06, + "loss": 0.9773, + "step": 305 + }, + { + "epoch": 0.4744186046511628, + "grad_norm": 1.724716067314148, + "learning_rate": 4.310938845822567e-06, + "loss": 0.972, + "step": 306 + }, + { + "epoch": 0.475968992248062, + "grad_norm": 1.7052260637283325, + "learning_rate": 4.3066322136089585e-06, + "loss": 0.9538, + "step": 307 + }, + { + "epoch": 0.47751937984496123, + "grad_norm": 1.9146831035614014, + "learning_rate": 4.302325581395349e-06, + "loss": 0.995, + "step": 308 + }, + { + "epoch": 0.4790697674418605, + "grad_norm": 1.581652283668518, + "learning_rate": 4.2980189491817404e-06, + "loss": 0.9966, + "step": 309 + }, + { + "epoch": 0.4806201550387597, + "grad_norm": 1.852246642112732, + "learning_rate": 4.293712316968131e-06, + "loss": 1.0029, + "step": 310 + }, + { + "epoch": 0.4806201550387597, + "eval_loss": 1.0914039611816406, + "eval_runtime": 46.5191, + "eval_samples_per_second": 21.497, + "eval_steps_per_second": 1.354, + "step": 310 + }, + { + "epoch": 0.4821705426356589, + "grad_norm": 1.5872552394866943, + "learning_rate": 4.289405684754522e-06, + "loss": 1.052, + "step": 311 + }, + { + "epoch": 0.48372093023255813, + "grad_norm": 2.0670878887176514, + "learning_rate": 4.285099052540914e-06, + "loss": 0.99, + "step": 312 + }, + { + "epoch": 0.48527131782945737, + "grad_norm": 1.722896695137024, + "learning_rate": 4.280792420327304e-06, + "loss": 1.0338, + "step": 313 + }, + { + "epoch": 0.4868217054263566, + "grad_norm": 1.4839717149734497, + "learning_rate": 4.276485788113696e-06, + "loss": 0.9857, + "step": 314 + }, + { + "epoch": 0.4883720930232558, + "grad_norm": 1.6284306049346924, + "learning_rate": 4.272179155900086e-06, + "loss": 1.003, + "step": 315 + }, + { + "epoch": 0.48992248062015503, + "grad_norm": 1.7428269386291504, + "learning_rate": 4.267872523686478e-06, + "loss": 0.9737, + "step": 316 + }, + { + "epoch": 0.49147286821705427, + "grad_norm": 1.281343698501587, + "learning_rate": 4.263565891472868e-06, + "loss": 0.9812, + "step": 317 + }, + { + "epoch": 0.4930232558139535, + "grad_norm": 1.4639809131622314, + "learning_rate": 4.2592592592592596e-06, + "loss": 0.9783, + "step": 318 + }, + { + "epoch": 0.4945736434108527, + "grad_norm": 1.636660099029541, + "learning_rate": 4.25495262704565e-06, + "loss": 0.9923, + "step": 319 + }, + { + "epoch": 0.49612403100775193, + "grad_norm": 1.8043773174285889, + "learning_rate": 4.2506459948320415e-06, + "loss": 0.9543, + "step": 320 + }, + { + "epoch": 0.49612403100775193, + "eval_loss": 1.0884485244750977, + "eval_runtime": 46.5738, + "eval_samples_per_second": 21.471, + "eval_steps_per_second": 1.353, + "step": 320 + }, + { + "epoch": 0.49767441860465117, + "grad_norm": 1.836240530014038, + "learning_rate": 4.246339362618433e-06, + "loss": 1.0107, + "step": 321 + }, + { + "epoch": 0.4992248062015504, + "grad_norm": 1.5559934377670288, + "learning_rate": 4.2420327304048234e-06, + "loss": 0.9607, + "step": 322 + }, + { + "epoch": 0.5007751937984496, + "grad_norm": 1.523479700088501, + "learning_rate": 4.237726098191215e-06, + "loss": 0.9307, + "step": 323 + }, + { + "epoch": 0.5023255813953489, + "grad_norm": 1.6808844804763794, + "learning_rate": 4.233419465977605e-06, + "loss": 1.0017, + "step": 324 + }, + { + "epoch": 0.5038759689922481, + "grad_norm": 1.4419201612472534, + "learning_rate": 4.229112833763997e-06, + "loss": 0.9715, + "step": 325 + }, + { + "epoch": 0.5054263565891473, + "grad_norm": 1.8381807804107666, + "learning_rate": 4.224806201550387e-06, + "loss": 1.0087, + "step": 326 + }, + { + "epoch": 0.5069767441860465, + "grad_norm": 1.410925030708313, + "learning_rate": 4.220499569336779e-06, + "loss": 0.996, + "step": 327 + }, + { + "epoch": 0.5085271317829457, + "grad_norm": 1.5510520935058594, + "learning_rate": 4.21619293712317e-06, + "loss": 1.0014, + "step": 328 + }, + { + "epoch": 0.5100775193798449, + "grad_norm": 1.7308863401412964, + "learning_rate": 4.211886304909561e-06, + "loss": 1.0047, + "step": 329 + }, + { + "epoch": 0.5116279069767442, + "grad_norm": 1.627509355545044, + "learning_rate": 4.207579672695952e-06, + "loss": 0.9879, + "step": 330 + }, + { + "epoch": 0.5116279069767442, + "eval_loss": 1.0827207565307617, + "eval_runtime": 46.6671, + "eval_samples_per_second": 21.428, + "eval_steps_per_second": 1.35, + "step": 330 + }, + { + "epoch": 0.5131782945736434, + "grad_norm": 1.7517614364624023, + "learning_rate": 4.203273040482343e-06, + "loss": 0.9946, + "step": 331 + }, + { + "epoch": 0.5147286821705427, + "grad_norm": 1.7542222738265991, + "learning_rate": 4.198966408268734e-06, + "loss": 0.977, + "step": 332 + }, + { + "epoch": 0.5162790697674419, + "grad_norm": 1.5376570224761963, + "learning_rate": 4.194659776055125e-06, + "loss": 0.9587, + "step": 333 + }, + { + "epoch": 0.517829457364341, + "grad_norm": 1.613654613494873, + "learning_rate": 4.190353143841516e-06, + "loss": 0.9844, + "step": 334 + }, + { + "epoch": 0.5193798449612403, + "grad_norm": 1.5097827911376953, + "learning_rate": 4.186046511627907e-06, + "loss": 0.9881, + "step": 335 + }, + { + "epoch": 0.5209302325581395, + "grad_norm": 1.6402108669281006, + "learning_rate": 4.181739879414298e-06, + "loss": 0.9919, + "step": 336 + }, + { + "epoch": 0.5224806201550387, + "grad_norm": 1.4518394470214844, + "learning_rate": 4.177433247200689e-06, + "loss": 0.9837, + "step": 337 + }, + { + "epoch": 0.524031007751938, + "grad_norm": 1.3409701585769653, + "learning_rate": 4.173126614987081e-06, + "loss": 0.9892, + "step": 338 + }, + { + "epoch": 0.5255813953488372, + "grad_norm": 1.3539661169052124, + "learning_rate": 4.168819982773471e-06, + "loss": 0.9845, + "step": 339 + }, + { + "epoch": 0.5271317829457365, + "grad_norm": 1.314976692199707, + "learning_rate": 4.1645133505598626e-06, + "loss": 1.0105, + "step": 340 + }, + { + "epoch": 0.5271317829457365, + "eval_loss": 1.0878815650939941, + "eval_runtime": 46.7206, + "eval_samples_per_second": 21.404, + "eval_steps_per_second": 1.348, + "step": 340 + }, + { + "epoch": 0.5286821705426357, + "grad_norm": 1.6854596138000488, + "learning_rate": 4.160206718346254e-06, + "loss": 0.9637, + "step": 341 + }, + { + "epoch": 0.5302325581395348, + "grad_norm": 1.7850494384765625, + "learning_rate": 4.1559000861326445e-06, + "loss": 0.9895, + "step": 342 + }, + { + "epoch": 0.5317829457364341, + "grad_norm": 1.2481727600097656, + "learning_rate": 4.151593453919036e-06, + "loss": 0.9999, + "step": 343 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 1.3739581108093262, + "learning_rate": 4.1472868217054264e-06, + "loss": 0.9676, + "step": 344 + }, + { + "epoch": 0.5348837209302325, + "grad_norm": 2.001826763153076, + "learning_rate": 4.142980189491818e-06, + "loss": 1.0295, + "step": 345 + }, + { + "epoch": 0.5364341085271318, + "grad_norm": 1.5063499212265015, + "learning_rate": 4.138673557278209e-06, + "loss": 0.9818, + "step": 346 + }, + { + "epoch": 0.537984496124031, + "grad_norm": 1.5834412574768066, + "learning_rate": 4.1343669250646e-06, + "loss": 0.9757, + "step": 347 + }, + { + "epoch": 0.5395348837209303, + "grad_norm": 1.3693667650222778, + "learning_rate": 4.130060292850991e-06, + "loss": 0.9734, + "step": 348 + }, + { + "epoch": 0.5410852713178295, + "grad_norm": 1.5714665651321411, + "learning_rate": 4.1257536606373825e-06, + "loss": 0.9877, + "step": 349 + }, + { + "epoch": 0.5426356589147286, + "grad_norm": 1.579288125038147, + "learning_rate": 4.121447028423773e-06, + "loss": 0.9917, + "step": 350 + }, + { + "epoch": 0.5426356589147286, + "eval_loss": 1.0909483432769775, + "eval_runtime": 46.7187, + "eval_samples_per_second": 21.405, + "eval_steps_per_second": 1.348, + "step": 350 + }, + { + "epoch": 0.5441860465116279, + "grad_norm": 1.3985519409179688, + "learning_rate": 4.1171403962101645e-06, + "loss": 0.9619, + "step": 351 + }, + { + "epoch": 0.5457364341085271, + "grad_norm": 1.4393198490142822, + "learning_rate": 4.112833763996555e-06, + "loss": 0.9612, + "step": 352 + }, + { + "epoch": 0.5472868217054263, + "grad_norm": 1.7534122467041016, + "learning_rate": 4.108527131782946e-06, + "loss": 0.9831, + "step": 353 + }, + { + "epoch": 0.5488372093023256, + "grad_norm": 1.56703782081604, + "learning_rate": 4.104220499569337e-06, + "loss": 1.015, + "step": 354 + }, + { + "epoch": 0.5503875968992248, + "grad_norm": 1.3634308576583862, + "learning_rate": 4.099913867355728e-06, + "loss": 0.9443, + "step": 355 + }, + { + "epoch": 0.5519379844961241, + "grad_norm": 1.6318840980529785, + "learning_rate": 4.09560723514212e-06, + "loss": 0.9574, + "step": 356 + }, + { + "epoch": 0.5534883720930233, + "grad_norm": 1.642081618309021, + "learning_rate": 4.09130060292851e-06, + "loss": 0.9887, + "step": 357 + }, + { + "epoch": 0.5550387596899224, + "grad_norm": 1.4372543096542358, + "learning_rate": 4.086993970714902e-06, + "loss": 0.991, + "step": 358 + }, + { + "epoch": 0.5565891472868217, + "grad_norm": 1.6540495157241821, + "learning_rate": 4.082687338501292e-06, + "loss": 0.9589, + "step": 359 + }, + { + "epoch": 0.5581395348837209, + "grad_norm": 1.7728432416915894, + "learning_rate": 4.078380706287684e-06, + "loss": 0.9945, + "step": 360 + }, + { + "epoch": 0.5581395348837209, + "eval_loss": 1.084891676902771, + "eval_runtime": 46.7029, + "eval_samples_per_second": 21.412, + "eval_steps_per_second": 1.349, + "step": 360 + }, + { + "epoch": 0.5596899224806201, + "grad_norm": 1.5775972604751587, + "learning_rate": 4.074074074074074e-06, + "loss": 0.9392, + "step": 361 + }, + { + "epoch": 0.5612403100775194, + "grad_norm": 1.696594476699829, + "learning_rate": 4.0697674418604655e-06, + "loss": 1.0223, + "step": 362 + }, + { + "epoch": 0.5627906976744186, + "grad_norm": 1.5510598421096802, + "learning_rate": 4.065460809646857e-06, + "loss": 0.9487, + "step": 363 + }, + { + "epoch": 0.5643410852713179, + "grad_norm": 1.3794918060302734, + "learning_rate": 4.0611541774332475e-06, + "loss": 0.964, + "step": 364 + }, + { + "epoch": 0.5658914728682171, + "grad_norm": 1.5407869815826416, + "learning_rate": 4.056847545219639e-06, + "loss": 0.9757, + "step": 365 + }, + { + "epoch": 0.5674418604651162, + "grad_norm": 1.3653312921524048, + "learning_rate": 4.052540913006029e-06, + "loss": 0.9175, + "step": 366 + }, + { + "epoch": 0.5689922480620155, + "grad_norm": 1.6502870321273804, + "learning_rate": 4.048234280792421e-06, + "loss": 0.9658, + "step": 367 + }, + { + "epoch": 0.5705426356589147, + "grad_norm": 1.4073630571365356, + "learning_rate": 4.043927648578811e-06, + "loss": 0.9482, + "step": 368 + }, + { + "epoch": 0.5720930232558139, + "grad_norm": 1.6393091678619385, + "learning_rate": 4.039621016365203e-06, + "loss": 0.9714, + "step": 369 + }, + { + "epoch": 0.5736434108527132, + "grad_norm": 1.6095997095108032, + "learning_rate": 4.035314384151594e-06, + "loss": 0.9513, + "step": 370 + }, + { + "epoch": 0.5736434108527132, + "eval_loss": 1.0774425268173218, + "eval_runtime": 46.7096, + "eval_samples_per_second": 21.409, + "eval_steps_per_second": 1.349, + "step": 370 + }, + { + "epoch": 0.5751937984496124, + "grad_norm": 1.783840298652649, + "learning_rate": 4.031007751937985e-06, + "loss": 0.9564, + "step": 371 + }, + { + "epoch": 0.5767441860465117, + "grad_norm": 1.4101550579071045, + "learning_rate": 4.026701119724376e-06, + "loss": 0.9904, + "step": 372 + }, + { + "epoch": 0.5782945736434109, + "grad_norm": 1.8022546768188477, + "learning_rate": 4.022394487510767e-06, + "loss": 0.9826, + "step": 373 + }, + { + "epoch": 0.57984496124031, + "grad_norm": 2.191577672958374, + "learning_rate": 4.018087855297158e-06, + "loss": 0.973, + "step": 374 + }, + { + "epoch": 0.5813953488372093, + "grad_norm": 1.8315773010253906, + "learning_rate": 4.0137812230835486e-06, + "loss": 0.9809, + "step": 375 + }, + { + "epoch": 0.5829457364341085, + "grad_norm": 1.692712664604187, + "learning_rate": 4.00947459086994e-06, + "loss": 0.9805, + "step": 376 + }, + { + "epoch": 0.5844961240310077, + "grad_norm": 1.888137698173523, + "learning_rate": 4.005167958656331e-06, + "loss": 1.0037, + "step": 377 + }, + { + "epoch": 0.586046511627907, + "grad_norm": 1.4804847240447998, + "learning_rate": 4.000861326442722e-06, + "loss": 0.9519, + "step": 378 + }, + { + "epoch": 0.5875968992248062, + "grad_norm": 1.3487441539764404, + "learning_rate": 3.996554694229113e-06, + "loss": 0.9736, + "step": 379 + }, + { + "epoch": 0.5891472868217055, + "grad_norm": 1.4220670461654663, + "learning_rate": 3.992248062015504e-06, + "loss": 0.9745, + "step": 380 + }, + { + "epoch": 0.5891472868217055, + "eval_loss": 1.078254222869873, + "eval_runtime": 46.7579, + "eval_samples_per_second": 21.387, + "eval_steps_per_second": 1.347, + "step": 380 + }, + { + "epoch": 0.5906976744186047, + "grad_norm": 1.7034496068954468, + "learning_rate": 3.987941429801895e-06, + "loss": 0.9566, + "step": 381 + }, + { + "epoch": 0.5922480620155038, + "grad_norm": 1.5833885669708252, + "learning_rate": 3.983634797588286e-06, + "loss": 0.9712, + "step": 382 + }, + { + "epoch": 0.5937984496124031, + "grad_norm": 1.4043247699737549, + "learning_rate": 3.979328165374677e-06, + "loss": 0.9919, + "step": 383 + }, + { + "epoch": 0.5953488372093023, + "grad_norm": 1.7453659772872925, + "learning_rate": 3.9750215331610685e-06, + "loss": 1.0067, + "step": 384 + }, + { + "epoch": 0.5968992248062015, + "grad_norm": 1.2731187343597412, + "learning_rate": 3.970714900947459e-06, + "loss": 0.9521, + "step": 385 + }, + { + "epoch": 0.5984496124031008, + "grad_norm": 1.5030323266983032, + "learning_rate": 3.9664082687338505e-06, + "loss": 0.9613, + "step": 386 + }, + { + "epoch": 0.6, + "grad_norm": 1.53596031665802, + "learning_rate": 3.962101636520241e-06, + "loss": 0.9654, + "step": 387 + }, + { + "epoch": 0.6015503875968993, + "grad_norm": 1.6132758855819702, + "learning_rate": 3.957795004306632e-06, + "loss": 0.9967, + "step": 388 + }, + { + "epoch": 0.6031007751937985, + "grad_norm": 1.3358467817306519, + "learning_rate": 3.953488372093024e-06, + "loss": 0.9542, + "step": 389 + }, + { + "epoch": 0.6046511627906976, + "grad_norm": 1.7046970129013062, + "learning_rate": 3.949181739879414e-06, + "loss": 0.9607, + "step": 390 + }, + { + "epoch": 0.6046511627906976, + "eval_loss": 1.076040267944336, + "eval_runtime": 46.6238, + "eval_samples_per_second": 21.448, + "eval_steps_per_second": 1.351, + "step": 390 + }, + { + "epoch": 0.6062015503875969, + "grad_norm": 1.3822777271270752, + "learning_rate": 3.944875107665806e-06, + "loss": 0.9484, + "step": 391 + }, + { + "epoch": 0.6077519379844961, + "grad_norm": 2.166247844696045, + "learning_rate": 3.940568475452196e-06, + "loss": 0.9447, + "step": 392 + }, + { + "epoch": 0.6093023255813953, + "grad_norm": 1.4651894569396973, + "learning_rate": 3.936261843238588e-06, + "loss": 0.9858, + "step": 393 + }, + { + "epoch": 0.6108527131782946, + "grad_norm": 1.5112606287002563, + "learning_rate": 3.931955211024979e-06, + "loss": 0.9934, + "step": 394 + }, + { + "epoch": 0.6124031007751938, + "grad_norm": 1.657082200050354, + "learning_rate": 3.92764857881137e-06, + "loss": 0.9742, + "step": 395 + }, + { + "epoch": 0.6139534883720931, + "grad_norm": 1.5030896663665771, + "learning_rate": 3.923341946597761e-06, + "loss": 0.9566, + "step": 396 + }, + { + "epoch": 0.6155038759689923, + "grad_norm": 1.536012887954712, + "learning_rate": 3.919035314384152e-06, + "loss": 0.986, + "step": 397 + }, + { + "epoch": 0.6170542635658914, + "grad_norm": 1.7443193197250366, + "learning_rate": 3.914728682170543e-06, + "loss": 0.9722, + "step": 398 + }, + { + "epoch": 0.6186046511627907, + "grad_norm": 1.4055274724960327, + "learning_rate": 3.910422049956934e-06, + "loss": 0.9561, + "step": 399 + }, + { + "epoch": 0.6201550387596899, + "grad_norm": 1.6770274639129639, + "learning_rate": 3.906115417743325e-06, + "loss": 0.9506, + "step": 400 + }, + { + "epoch": 0.6201550387596899, + "eval_loss": 1.0693814754486084, + "eval_runtime": 46.7609, + "eval_samples_per_second": 21.385, + "eval_steps_per_second": 1.347, + "step": 400 + }, + { + "epoch": 0.6217054263565891, + "grad_norm": 1.9705766439437866, + "learning_rate": 3.901808785529716e-06, + "loss": 0.9313, + "step": 401 + }, + { + "epoch": 0.6232558139534884, + "grad_norm": 1.6670687198638916, + "learning_rate": 3.897502153316108e-06, + "loss": 0.9685, + "step": 402 + }, + { + "epoch": 0.6248062015503876, + "grad_norm": 1.290456771850586, + "learning_rate": 3.893195521102498e-06, + "loss": 0.9516, + "step": 403 + }, + { + "epoch": 0.6263565891472869, + "grad_norm": 1.2610697746276855, + "learning_rate": 3.88888888888889e-06, + "loss": 0.958, + "step": 404 + }, + { + "epoch": 0.627906976744186, + "grad_norm": 1.427687406539917, + "learning_rate": 3.88458225667528e-06, + "loss": 0.9522, + "step": 405 + }, + { + "epoch": 0.6294573643410852, + "grad_norm": 1.5878995656967163, + "learning_rate": 3.8802756244616715e-06, + "loss": 0.9856, + "step": 406 + }, + { + "epoch": 0.6310077519379845, + "grad_norm": 1.3636142015457153, + "learning_rate": 3.875968992248063e-06, + "loss": 0.9166, + "step": 407 + }, + { + "epoch": 0.6325581395348837, + "grad_norm": 1.675515055656433, + "learning_rate": 3.8716623600344535e-06, + "loss": 0.9417, + "step": 408 + }, + { + "epoch": 0.6341085271317829, + "grad_norm": 1.4354209899902344, + "learning_rate": 3.867355727820845e-06, + "loss": 0.9434, + "step": 409 + }, + { + "epoch": 0.6356589147286822, + "grad_norm": 1.6979807615280151, + "learning_rate": 3.863049095607235e-06, + "loss": 0.9692, + "step": 410 + }, + { + "epoch": 0.6356589147286822, + "eval_loss": 1.0683481693267822, + "eval_runtime": 46.7785, + "eval_samples_per_second": 21.377, + "eval_steps_per_second": 1.347, + "step": 410 + }, + { + "epoch": 0.6372093023255814, + "grad_norm": 1.300763726234436, + "learning_rate": 3.858742463393627e-06, + "loss": 0.9572, + "step": 411 + }, + { + "epoch": 0.6387596899224807, + "grad_norm": 1.6949375867843628, + "learning_rate": 3.854435831180017e-06, + "loss": 0.9986, + "step": 412 + }, + { + "epoch": 0.6403100775193798, + "grad_norm": 1.6742664575576782, + "learning_rate": 3.850129198966409e-06, + "loss": 1.0145, + "step": 413 + }, + { + "epoch": 0.641860465116279, + "grad_norm": 1.3767681121826172, + "learning_rate": 3.8458225667528e-06, + "loss": 0.9573, + "step": 414 + }, + { + "epoch": 0.6434108527131783, + "grad_norm": 1.656080722808838, + "learning_rate": 3.841515934539191e-06, + "loss": 0.9311, + "step": 415 + }, + { + "epoch": 0.6449612403100775, + "grad_norm": 1.4878498315811157, + "learning_rate": 3.837209302325582e-06, + "loss": 0.9467, + "step": 416 + }, + { + "epoch": 0.6465116279069767, + "grad_norm": 2.324364185333252, + "learning_rate": 3.832902670111973e-06, + "loss": 0.9442, + "step": 417 + }, + { + "epoch": 0.648062015503876, + "grad_norm": 1.2944444417953491, + "learning_rate": 3.828596037898364e-06, + "loss": 0.9664, + "step": 418 + }, + { + "epoch": 0.6496124031007752, + "grad_norm": 1.3722344636917114, + "learning_rate": 3.8242894056847545e-06, + "loss": 0.9458, + "step": 419 + }, + { + "epoch": 0.6511627906976745, + "grad_norm": 2.1579277515411377, + "learning_rate": 3.819982773471146e-06, + "loss": 0.9934, + "step": 420 + }, + { + "epoch": 0.6511627906976745, + "eval_loss": 1.0679476261138916, + "eval_runtime": 46.7186, + "eval_samples_per_second": 21.405, + "eval_steps_per_second": 1.348, + "step": 420 + }, + { + "epoch": 0.6527131782945736, + "grad_norm": 2.1869912147521973, + "learning_rate": 3.815676141257537e-06, + "loss": 0.922, + "step": 421 + }, + { + "epoch": 0.6542635658914728, + "grad_norm": 1.5521175861358643, + "learning_rate": 3.811369509043928e-06, + "loss": 0.9917, + "step": 422 + }, + { + "epoch": 0.6558139534883721, + "grad_norm": 1.3776301145553589, + "learning_rate": 3.8070628768303192e-06, + "loss": 0.9766, + "step": 423 + }, + { + "epoch": 0.6573643410852713, + "grad_norm": 2.0059685707092285, + "learning_rate": 3.8027562446167098e-06, + "loss": 0.9768, + "step": 424 + }, + { + "epoch": 0.6589147286821705, + "grad_norm": 1.6401809453964233, + "learning_rate": 3.798449612403101e-06, + "loss": 0.9773, + "step": 425 + }, + { + "epoch": 0.6604651162790698, + "grad_norm": 1.5309780836105347, + "learning_rate": 3.7941429801894917e-06, + "loss": 0.9605, + "step": 426 + }, + { + "epoch": 0.662015503875969, + "grad_norm": 1.6547430753707886, + "learning_rate": 3.789836347975883e-06, + "loss": 0.9674, + "step": 427 + }, + { + "epoch": 0.6635658914728683, + "grad_norm": 1.8590940237045288, + "learning_rate": 3.7855297157622745e-06, + "loss": 0.994, + "step": 428 + }, + { + "epoch": 0.6651162790697674, + "grad_norm": 1.347604751586914, + "learning_rate": 3.781223083548665e-06, + "loss": 0.9635, + "step": 429 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.4679491519927979, + "learning_rate": 3.7769164513350564e-06, + "loss": 0.9318, + "step": 430 + }, + { + "epoch": 0.6666666666666666, + "eval_loss": 1.0620640516281128, + "eval_runtime": 46.7063, + "eval_samples_per_second": 21.41, + "eval_steps_per_second": 1.349, + "step": 430 + }, + { + "epoch": 0.6682170542635659, + "grad_norm": 1.8079434633255005, + "learning_rate": 3.772609819121447e-06, + "loss": 0.9651, + "step": 431 + }, + { + "epoch": 0.6697674418604651, + "grad_norm": 1.7883085012435913, + "learning_rate": 3.7683031869078384e-06, + "loss": 0.9598, + "step": 432 + }, + { + "epoch": 0.6713178294573643, + "grad_norm": 1.6297731399536133, + "learning_rate": 3.7639965546942293e-06, + "loss": 1.0005, + "step": 433 + }, + { + "epoch": 0.6728682170542636, + "grad_norm": 1.4521540403366089, + "learning_rate": 3.7596899224806203e-06, + "loss": 0.9638, + "step": 434 + }, + { + "epoch": 0.6744186046511628, + "grad_norm": 1.7151950597763062, + "learning_rate": 3.7553832902670117e-06, + "loss": 0.9498, + "step": 435 + }, + { + "epoch": 0.6759689922480621, + "grad_norm": 1.7151180505752563, + "learning_rate": 3.7510766580534027e-06, + "loss": 0.9506, + "step": 436 + }, + { + "epoch": 0.6775193798449612, + "grad_norm": 1.6881719827651978, + "learning_rate": 3.7467700258397936e-06, + "loss": 0.9792, + "step": 437 + }, + { + "epoch": 0.6790697674418604, + "grad_norm": 1.6680631637573242, + "learning_rate": 3.7424633936261846e-06, + "loss": 0.9621, + "step": 438 + }, + { + "epoch": 0.6806201550387597, + "grad_norm": 1.5197463035583496, + "learning_rate": 3.7381567614125756e-06, + "loss": 0.9458, + "step": 439 + }, + { + "epoch": 0.6821705426356589, + "grad_norm": 1.3536181449890137, + "learning_rate": 3.7338501291989665e-06, + "loss": 0.9585, + "step": 440 + }, + { + "epoch": 0.6821705426356589, + "eval_loss": 1.0652538537979126, + "eval_runtime": 46.6998, + "eval_samples_per_second": 21.413, + "eval_steps_per_second": 1.349, + "step": 440 + }, + { + "epoch": 0.6837209302325581, + "grad_norm": 1.3938905000686646, + "learning_rate": 3.729543496985358e-06, + "loss": 0.9654, + "step": 441 + }, + { + "epoch": 0.6852713178294574, + "grad_norm": 1.677477478981018, + "learning_rate": 3.725236864771749e-06, + "loss": 0.9453, + "step": 442 + }, + { + "epoch": 0.6868217054263566, + "grad_norm": 1.654832363128662, + "learning_rate": 3.72093023255814e-06, + "loss": 0.9384, + "step": 443 + }, + { + "epoch": 0.6883720930232559, + "grad_norm": 1.330737829208374, + "learning_rate": 3.7166236003445313e-06, + "loss": 0.9682, + "step": 444 + }, + { + "epoch": 0.689922480620155, + "grad_norm": 1.4447482824325562, + "learning_rate": 3.712316968130922e-06, + "loss": 0.9465, + "step": 445 + }, + { + "epoch": 0.6914728682170542, + "grad_norm": 1.5042719841003418, + "learning_rate": 3.708010335917313e-06, + "loss": 0.9519, + "step": 446 + }, + { + "epoch": 0.6930232558139535, + "grad_norm": 1.5131895542144775, + "learning_rate": 3.7037037037037037e-06, + "loss": 0.9435, + "step": 447 + }, + { + "epoch": 0.6945736434108527, + "grad_norm": 1.4704760313034058, + "learning_rate": 3.699397071490095e-06, + "loss": 0.9643, + "step": 448 + }, + { + "epoch": 0.6961240310077519, + "grad_norm": 1.6629536151885986, + "learning_rate": 3.6950904392764865e-06, + "loss": 0.9813, + "step": 449 + }, + { + "epoch": 0.6976744186046512, + "grad_norm": 1.8209941387176514, + "learning_rate": 3.690783807062877e-06, + "loss": 0.9872, + "step": 450 + }, + { + "epoch": 0.6976744186046512, + "eval_loss": 1.0628501176834106, + "eval_runtime": 46.7022, + "eval_samples_per_second": 21.412, + "eval_steps_per_second": 1.349, + "step": 450 + }, + { + "epoch": 0.6992248062015504, + "grad_norm": 1.903017282485962, + "learning_rate": 3.6864771748492685e-06, + "loss": 0.9505, + "step": 451 + }, + { + "epoch": 0.7007751937984497, + "grad_norm": 1.5136739015579224, + "learning_rate": 3.682170542635659e-06, + "loss": 0.9826, + "step": 452 + }, + { + "epoch": 0.7023255813953488, + "grad_norm": 1.440664291381836, + "learning_rate": 3.6778639104220504e-06, + "loss": 0.9321, + "step": 453 + }, + { + "epoch": 0.703875968992248, + "grad_norm": 1.4127347469329834, + "learning_rate": 3.673557278208441e-06, + "loss": 0.9422, + "step": 454 + }, + { + "epoch": 0.7054263565891473, + "grad_norm": 1.6021867990493774, + "learning_rate": 3.6692506459948323e-06, + "loss": 0.9645, + "step": 455 + }, + { + "epoch": 0.7069767441860465, + "grad_norm": 1.1911089420318604, + "learning_rate": 3.6649440137812233e-06, + "loss": 0.9296, + "step": 456 + }, + { + "epoch": 0.7085271317829457, + "grad_norm": 1.9586585760116577, + "learning_rate": 3.6606373815676143e-06, + "loss": 0.9404, + "step": 457 + }, + { + "epoch": 0.710077519379845, + "grad_norm": 1.6152875423431396, + "learning_rate": 3.6563307493540057e-06, + "loss": 0.9489, + "step": 458 + }, + { + "epoch": 0.7116279069767442, + "grad_norm": 1.417580246925354, + "learning_rate": 3.652024117140396e-06, + "loss": 0.9628, + "step": 459 + }, + { + "epoch": 0.7131782945736435, + "grad_norm": 1.3339707851409912, + "learning_rate": 3.6477174849267876e-06, + "loss": 0.9415, + "step": 460 + }, + { + "epoch": 0.7131782945736435, + "eval_loss": 1.061155080795288, + "eval_runtime": 46.6993, + "eval_samples_per_second": 21.414, + "eval_steps_per_second": 1.349, + "step": 460 + }, + { + "epoch": 0.7147286821705426, + "grad_norm": 1.5678375959396362, + "learning_rate": 3.6434108527131786e-06, + "loss": 0.9344, + "step": 461 + }, + { + "epoch": 0.7162790697674418, + "grad_norm": 1.6713725328445435, + "learning_rate": 3.6391042204995695e-06, + "loss": 0.9387, + "step": 462 + }, + { + "epoch": 0.7178294573643411, + "grad_norm": 1.3815056085586548, + "learning_rate": 3.6347975882859605e-06, + "loss": 0.9666, + "step": 463 + }, + { + "epoch": 0.7193798449612403, + "grad_norm": 1.466451644897461, + "learning_rate": 3.630490956072352e-06, + "loss": 0.9766, + "step": 464 + }, + { + "epoch": 0.7209302325581395, + "grad_norm": 1.3906329870224, + "learning_rate": 3.626184323858743e-06, + "loss": 0.9517, + "step": 465 + }, + { + "epoch": 0.7224806201550388, + "grad_norm": 1.4931296110153198, + "learning_rate": 3.621877691645134e-06, + "loss": 0.9618, + "step": 466 + }, + { + "epoch": 0.724031007751938, + "grad_norm": 1.3734408617019653, + "learning_rate": 3.617571059431525e-06, + "loss": 0.9456, + "step": 467 + }, + { + "epoch": 0.7255813953488373, + "grad_norm": 1.6479203701019287, + "learning_rate": 3.6132644272179158e-06, + "loss": 0.9547, + "step": 468 + }, + { + "epoch": 0.7271317829457364, + "grad_norm": 1.3253997564315796, + "learning_rate": 3.608957795004307e-06, + "loss": 0.9598, + "step": 469 + }, + { + "epoch": 0.7286821705426356, + "grad_norm": 1.624961018562317, + "learning_rate": 3.6046511627906977e-06, + "loss": 0.9769, + "step": 470 + }, + { + "epoch": 0.7286821705426356, + "eval_loss": 1.0569216012954712, + "eval_runtime": 46.6486, + "eval_samples_per_second": 21.437, + "eval_steps_per_second": 1.351, + "step": 470 + }, + { + "epoch": 0.7302325581395349, + "grad_norm": 1.2979047298431396, + "learning_rate": 3.600344530577089e-06, + "loss": 0.9171, + "step": 471 + }, + { + "epoch": 0.7317829457364341, + "grad_norm": 1.3191477060317993, + "learning_rate": 3.5960378983634805e-06, + "loss": 0.943, + "step": 472 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 1.6449686288833618, + "learning_rate": 3.591731266149871e-06, + "loss": 0.9488, + "step": 473 + }, + { + "epoch": 0.7348837209302326, + "grad_norm": 1.4530247449874878, + "learning_rate": 3.5874246339362624e-06, + "loss": 0.9479, + "step": 474 + }, + { + "epoch": 0.7364341085271318, + "grad_norm": 1.400500774383545, + "learning_rate": 3.583118001722653e-06, + "loss": 0.9182, + "step": 475 + }, + { + "epoch": 0.737984496124031, + "grad_norm": 1.5714648962020874, + "learning_rate": 3.5788113695090443e-06, + "loss": 0.9756, + "step": 476 + }, + { + "epoch": 0.7395348837209302, + "grad_norm": 1.4749916791915894, + "learning_rate": 3.574504737295435e-06, + "loss": 0.9322, + "step": 477 + }, + { + "epoch": 0.7410852713178294, + "grad_norm": 1.3581645488739014, + "learning_rate": 3.5701981050818263e-06, + "loss": 0.9456, + "step": 478 + }, + { + "epoch": 0.7426356589147287, + "grad_norm": 1.58745276927948, + "learning_rate": 3.5658914728682177e-06, + "loss": 0.9197, + "step": 479 + }, + { + "epoch": 0.7441860465116279, + "grad_norm": 1.3937815427780151, + "learning_rate": 3.5615848406546082e-06, + "loss": 0.9271, + "step": 480 + }, + { + "epoch": 0.7441860465116279, + "eval_loss": 1.0562697649002075, + "eval_runtime": 46.7957, + "eval_samples_per_second": 21.369, + "eval_steps_per_second": 1.346, + "step": 480 + }, + { + "epoch": 0.7457364341085271, + "grad_norm": 1.3200627565383911, + "learning_rate": 3.5572782084409996e-06, + "loss": 0.9454, + "step": 481 + }, + { + "epoch": 0.7472868217054264, + "grad_norm": 1.427779197692871, + "learning_rate": 3.55297157622739e-06, + "loss": 0.9604, + "step": 482 + }, + { + "epoch": 0.7488372093023256, + "grad_norm": 1.8657872676849365, + "learning_rate": 3.5486649440137815e-06, + "loss": 0.9359, + "step": 483 + }, + { + "epoch": 0.7503875968992249, + "grad_norm": 2.0786523818969727, + "learning_rate": 3.5443583118001725e-06, + "loss": 0.9995, + "step": 484 + }, + { + "epoch": 0.751937984496124, + "grad_norm": 1.6857866048812866, + "learning_rate": 3.5400516795865635e-06, + "loss": 0.9657, + "step": 485 + }, + { + "epoch": 0.7534883720930232, + "grad_norm": 1.380631923675537, + "learning_rate": 3.535745047372955e-06, + "loss": 0.9625, + "step": 486 + }, + { + "epoch": 0.7550387596899225, + "grad_norm": 1.4253772497177124, + "learning_rate": 3.5314384151593454e-06, + "loss": 0.9499, + "step": 487 + }, + { + "epoch": 0.7565891472868217, + "grad_norm": 1.6862024068832397, + "learning_rate": 3.527131782945737e-06, + "loss": 0.9358, + "step": 488 + }, + { + "epoch": 0.7581395348837209, + "grad_norm": 1.6502622365951538, + "learning_rate": 3.5228251507321278e-06, + "loss": 0.926, + "step": 489 + }, + { + "epoch": 0.7596899224806202, + "grad_norm": 1.6885582208633423, + "learning_rate": 3.5185185185185187e-06, + "loss": 0.9426, + "step": 490 + }, + { + "epoch": 0.7596899224806202, + "eval_loss": 1.0555542707443237, + "eval_runtime": 46.8684, + "eval_samples_per_second": 21.336, + "eval_steps_per_second": 1.344, + "step": 490 + }, + { + "epoch": 0.7612403100775194, + "grad_norm": 1.6672579050064087, + "learning_rate": 3.5142118863049097e-06, + "loss": 0.9195, + "step": 491 + }, + { + "epoch": 0.7627906976744186, + "grad_norm": 1.3683370351791382, + "learning_rate": 3.509905254091301e-06, + "loss": 0.9301, + "step": 492 + }, + { + "epoch": 0.7643410852713178, + "grad_norm": 1.5962402820587158, + "learning_rate": 3.505598621877692e-06, + "loss": 0.9496, + "step": 493 + }, + { + "epoch": 0.765891472868217, + "grad_norm": 1.45559561252594, + "learning_rate": 3.501291989664083e-06, + "loss": 0.9263, + "step": 494 + }, + { + "epoch": 0.7674418604651163, + "grad_norm": 1.8540050983428955, + "learning_rate": 3.496985357450474e-06, + "loss": 0.9307, + "step": 495 + }, + { + "epoch": 0.7689922480620155, + "grad_norm": 2.0742030143737793, + "learning_rate": 3.492678725236865e-06, + "loss": 0.9206, + "step": 496 + }, + { + "epoch": 0.7705426356589147, + "grad_norm": 1.6275883913040161, + "learning_rate": 3.4883720930232564e-06, + "loss": 0.9237, + "step": 497 + }, + { + "epoch": 0.772093023255814, + "grad_norm": 1.5503264665603638, + "learning_rate": 3.484065460809647e-06, + "loss": 0.9468, + "step": 498 + }, + { + "epoch": 0.7736434108527132, + "grad_norm": 2.0982398986816406, + "learning_rate": 3.4797588285960383e-06, + "loss": 0.9697, + "step": 499 + }, + { + "epoch": 0.7751937984496124, + "grad_norm": 1.5442216396331787, + "learning_rate": 3.4754521963824293e-06, + "loss": 0.988, + "step": 500 + }, + { + "epoch": 0.7751937984496124, + "eval_loss": 1.0530461072921753, + "eval_runtime": 46.7519, + "eval_samples_per_second": 21.389, + "eval_steps_per_second": 1.348, + "step": 500 + }, + { + "epoch": 0.7767441860465116, + "grad_norm": 1.2036796808242798, + "learning_rate": 3.4711455641688202e-06, + "loss": 0.9044, + "step": 501 + }, + { + "epoch": 0.7782945736434108, + "grad_norm": 1.345171332359314, + "learning_rate": 3.4668389319552116e-06, + "loss": 0.9189, + "step": 502 + }, + { + "epoch": 0.7798449612403101, + "grad_norm": 1.998370885848999, + "learning_rate": 3.462532299741602e-06, + "loss": 0.94, + "step": 503 + }, + { + "epoch": 0.7813953488372093, + "grad_norm": 1.5674575567245483, + "learning_rate": 3.4582256675279936e-06, + "loss": 0.9254, + "step": 504 + }, + { + "epoch": 0.7829457364341085, + "grad_norm": 1.315887689590454, + "learning_rate": 3.453919035314384e-06, + "loss": 0.9419, + "step": 505 + }, + { + "epoch": 0.7844961240310078, + "grad_norm": 1.5615918636322021, + "learning_rate": 3.4496124031007755e-06, + "loss": 0.9518, + "step": 506 + }, + { + "epoch": 0.786046511627907, + "grad_norm": 1.6197636127471924, + "learning_rate": 3.445305770887167e-06, + "loss": 0.9401, + "step": 507 + }, + { + "epoch": 0.7875968992248062, + "grad_norm": 1.2421844005584717, + "learning_rate": 3.4409991386735574e-06, + "loss": 0.934, + "step": 508 + }, + { + "epoch": 0.7891472868217054, + "grad_norm": 1.2320061922073364, + "learning_rate": 3.436692506459949e-06, + "loss": 0.9228, + "step": 509 + }, + { + "epoch": 0.7906976744186046, + "grad_norm": 1.4907335042953491, + "learning_rate": 3.4323858742463394e-06, + "loss": 0.9741, + "step": 510 + }, + { + "epoch": 0.7906976744186046, + "eval_loss": 1.0517873764038086, + "eval_runtime": 46.9265, + "eval_samples_per_second": 21.31, + "eval_steps_per_second": 1.343, + "step": 510 + }, + { + "epoch": 0.7922480620155039, + "grad_norm": 1.1737737655639648, + "learning_rate": 3.4280792420327308e-06, + "loss": 0.8744, + "step": 511 + }, + { + "epoch": 0.7937984496124031, + "grad_norm": 1.3589445352554321, + "learning_rate": 3.4237726098191217e-06, + "loss": 0.9366, + "step": 512 + }, + { + "epoch": 0.7953488372093023, + "grad_norm": 1.3620688915252686, + "learning_rate": 3.4194659776055127e-06, + "loss": 0.9422, + "step": 513 + }, + { + "epoch": 0.7968992248062016, + "grad_norm": 1.8479820489883423, + "learning_rate": 3.415159345391904e-06, + "loss": 0.9338, + "step": 514 + }, + { + "epoch": 0.7984496124031008, + "grad_norm": 1.673426866531372, + "learning_rate": 3.4108527131782946e-06, + "loss": 0.9237, + "step": 515 + }, + { + "epoch": 0.8, + "grad_norm": 1.6327366828918457, + "learning_rate": 3.406546080964686e-06, + "loss": 0.937, + "step": 516 + }, + { + "epoch": 0.8015503875968992, + "grad_norm": 1.3908108472824097, + "learning_rate": 3.402239448751077e-06, + "loss": 0.9524, + "step": 517 + }, + { + "epoch": 0.8031007751937984, + "grad_norm": 1.5284862518310547, + "learning_rate": 3.397932816537468e-06, + "loss": 0.9818, + "step": 518 + }, + { + "epoch": 0.8046511627906977, + "grad_norm": 1.657374620437622, + "learning_rate": 3.393626184323859e-06, + "loss": 0.8977, + "step": 519 + }, + { + "epoch": 0.8062015503875969, + "grad_norm": 1.6133347749710083, + "learning_rate": 3.38931955211025e-06, + "loss": 0.9226, + "step": 520 + }, + { + "epoch": 0.8062015503875969, + "eval_loss": 1.0512946844100952, + "eval_runtime": 46.8983, + "eval_samples_per_second": 21.323, + "eval_steps_per_second": 1.343, + "step": 520 + }, + { + "epoch": 0.8077519379844961, + "grad_norm": 1.406281590461731, + "learning_rate": 3.3850129198966413e-06, + "loss": 0.9272, + "step": 521 + }, + { + "epoch": 0.8093023255813954, + "grad_norm": 1.2945131063461304, + "learning_rate": 3.3807062876830323e-06, + "loss": 0.9195, + "step": 522 + }, + { + "epoch": 0.8108527131782945, + "grad_norm": 1.5606355667114258, + "learning_rate": 3.3763996554694232e-06, + "loss": 0.919, + "step": 523 + }, + { + "epoch": 0.8124031007751938, + "grad_norm": 1.3578014373779297, + "learning_rate": 3.372093023255814e-06, + "loss": 0.9443, + "step": 524 + }, + { + "epoch": 0.813953488372093, + "grad_norm": 1.62376868724823, + "learning_rate": 3.3677863910422056e-06, + "loss": 0.9629, + "step": 525 + }, + { + "epoch": 0.8155038759689922, + "grad_norm": 1.5871691703796387, + "learning_rate": 3.363479758828596e-06, + "loss": 0.9555, + "step": 526 + }, + { + "epoch": 0.8170542635658915, + "grad_norm": 1.765234112739563, + "learning_rate": 3.3591731266149875e-06, + "loss": 0.9506, + "step": 527 + }, + { + "epoch": 0.8186046511627907, + "grad_norm": 1.445127248764038, + "learning_rate": 3.354866494401378e-06, + "loss": 0.9583, + "step": 528 + }, + { + "epoch": 0.8201550387596899, + "grad_norm": 1.3839683532714844, + "learning_rate": 3.3505598621877695e-06, + "loss": 0.9735, + "step": 529 + }, + { + "epoch": 0.8217054263565892, + "grad_norm": 1.2828640937805176, + "learning_rate": 3.346253229974161e-06, + "loss": 0.917, + "step": 530 + }, + { + "epoch": 0.8217054263565892, + "eval_loss": 1.0487127304077148, + "eval_runtime": 46.8274, + "eval_samples_per_second": 21.355, + "eval_steps_per_second": 1.345, + "step": 530 + }, + { + "epoch": 0.8232558139534883, + "grad_norm": 1.6673675775527954, + "learning_rate": 3.3419465977605514e-06, + "loss": 0.9419, + "step": 531 + }, + { + "epoch": 0.8248062015503876, + "grad_norm": 1.508750319480896, + "learning_rate": 3.3376399655469428e-06, + "loss": 0.9363, + "step": 532 + }, + { + "epoch": 0.8263565891472868, + "grad_norm": 1.4697679281234741, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.9531, + "step": 533 + }, + { + "epoch": 0.827906976744186, + "grad_norm": 1.2485036849975586, + "learning_rate": 3.3290267011197247e-06, + "loss": 0.938, + "step": 534 + }, + { + "epoch": 0.8294573643410853, + "grad_norm": 1.6659992933273315, + "learning_rate": 3.3247200689061153e-06, + "loss": 0.9419, + "step": 535 + }, + { + "epoch": 0.8310077519379845, + "grad_norm": 1.207901120185852, + "learning_rate": 3.3204134366925067e-06, + "loss": 0.9582, + "step": 536 + }, + { + "epoch": 0.8325581395348837, + "grad_norm": 1.456590175628662, + "learning_rate": 3.316106804478898e-06, + "loss": 0.9355, + "step": 537 + }, + { + "epoch": 0.834108527131783, + "grad_norm": 1.2961267232894897, + "learning_rate": 3.3118001722652886e-06, + "loss": 0.9317, + "step": 538 + }, + { + "epoch": 0.8356589147286821, + "grad_norm": 1.304347276687622, + "learning_rate": 3.30749354005168e-06, + "loss": 0.9133, + "step": 539 + }, + { + "epoch": 0.8372093023255814, + "grad_norm": 1.2878105640411377, + "learning_rate": 3.3031869078380705e-06, + "loss": 0.908, + "step": 540 + }, + { + "epoch": 0.8372093023255814, + "eval_loss": 1.047052264213562, + "eval_runtime": 46.8479, + "eval_samples_per_second": 21.346, + "eval_steps_per_second": 1.345, + "step": 540 + }, + { + "epoch": 0.8387596899224806, + "grad_norm": 1.663776159286499, + "learning_rate": 3.298880275624462e-06, + "loss": 0.8975, + "step": 541 + }, + { + "epoch": 0.8403100775193798, + "grad_norm": 1.3324629068374634, + "learning_rate": 3.294573643410853e-06, + "loss": 0.9524, + "step": 542 + }, + { + "epoch": 0.8418604651162791, + "grad_norm": 1.5450448989868164, + "learning_rate": 3.290267011197244e-06, + "loss": 0.9771, + "step": 543 + }, + { + "epoch": 0.8434108527131783, + "grad_norm": 1.4546128511428833, + "learning_rate": 3.2859603789836352e-06, + "loss": 0.9168, + "step": 544 + }, + { + "epoch": 0.8449612403100775, + "grad_norm": 1.623872995376587, + "learning_rate": 3.281653746770026e-06, + "loss": 0.9637, + "step": 545 + }, + { + "epoch": 0.8465116279069768, + "grad_norm": 1.4465365409851074, + "learning_rate": 3.277347114556417e-06, + "loss": 0.9399, + "step": 546 + }, + { + "epoch": 0.8480620155038759, + "grad_norm": 1.4867907762527466, + "learning_rate": 3.273040482342808e-06, + "loss": 0.9274, + "step": 547 + }, + { + "epoch": 0.8496124031007752, + "grad_norm": 1.429142713546753, + "learning_rate": 3.268733850129199e-06, + "loss": 0.9566, + "step": 548 + }, + { + "epoch": 0.8511627906976744, + "grad_norm": 1.2755072116851807, + "learning_rate": 3.26442721791559e-06, + "loss": 0.9417, + "step": 549 + }, + { + "epoch": 0.8527131782945736, + "grad_norm": 1.5570671558380127, + "learning_rate": 3.2601205857019815e-06, + "loss": 0.9316, + "step": 550 + }, + { + "epoch": 0.8527131782945736, + "eval_loss": 1.0439643859863281, + "eval_runtime": 46.7656, + "eval_samples_per_second": 21.383, + "eval_steps_per_second": 1.347, + "step": 550 + }, + { + "epoch": 0.8542635658914729, + "grad_norm": 1.3648735284805298, + "learning_rate": 3.2558139534883724e-06, + "loss": 0.9174, + "step": 551 + }, + { + "epoch": 0.8558139534883721, + "grad_norm": 1.1797237396240234, + "learning_rate": 3.2515073212747634e-06, + "loss": 0.9101, + "step": 552 + }, + { + "epoch": 0.8573643410852713, + "grad_norm": 1.3258795738220215, + "learning_rate": 3.247200689061155e-06, + "loss": 0.9288, + "step": 553 + }, + { + "epoch": 0.8589147286821706, + "grad_norm": 1.6303619146347046, + "learning_rate": 3.2428940568475453e-06, + "loss": 0.9293, + "step": 554 + }, + { + "epoch": 0.8604651162790697, + "grad_norm": 1.412011981010437, + "learning_rate": 3.2385874246339367e-06, + "loss": 0.9597, + "step": 555 + }, + { + "epoch": 0.862015503875969, + "grad_norm": 1.3949530124664307, + "learning_rate": 3.2342807924203273e-06, + "loss": 0.8872, + "step": 556 + }, + { + "epoch": 0.8635658914728682, + "grad_norm": 1.3831912279129028, + "learning_rate": 3.2299741602067187e-06, + "loss": 0.9067, + "step": 557 + }, + { + "epoch": 0.8651162790697674, + "grad_norm": 1.4866950511932373, + "learning_rate": 3.22566752799311e-06, + "loss": 0.9392, + "step": 558 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 1.6705503463745117, + "learning_rate": 3.2213608957795006e-06, + "loss": 0.9563, + "step": 559 + }, + { + "epoch": 0.8682170542635659, + "grad_norm": 1.3490782976150513, + "learning_rate": 3.217054263565892e-06, + "loss": 0.9131, + "step": 560 + }, + { + "epoch": 0.8682170542635659, + "eval_loss": 1.0431301593780518, + "eval_runtime": 46.8914, + "eval_samples_per_second": 21.326, + "eval_steps_per_second": 1.344, + "step": 560 + }, + { + "epoch": 0.8697674418604651, + "grad_norm": 1.2459760904312134, + "learning_rate": 3.2127476313522825e-06, + "loss": 0.9982, + "step": 561 + }, + { + "epoch": 0.8713178294573644, + "grad_norm": 1.5161173343658447, + "learning_rate": 3.208440999138674e-06, + "loss": 0.9763, + "step": 562 + }, + { + "epoch": 0.8728682170542635, + "grad_norm": 1.6294832229614258, + "learning_rate": 3.2041343669250645e-06, + "loss": 0.9324, + "step": 563 + }, + { + "epoch": 0.8744186046511628, + "grad_norm": 1.414649248123169, + "learning_rate": 3.199827734711456e-06, + "loss": 0.9336, + "step": 564 + }, + { + "epoch": 0.875968992248062, + "grad_norm": 1.5806972980499268, + "learning_rate": 3.1955211024978473e-06, + "loss": 0.9293, + "step": 565 + }, + { + "epoch": 0.8775193798449612, + "grad_norm": 1.524552345275879, + "learning_rate": 3.191214470284238e-06, + "loss": 0.9531, + "step": 566 + }, + { + "epoch": 0.8790697674418605, + "grad_norm": 1.7706551551818848, + "learning_rate": 3.186907838070629e-06, + "loss": 0.9478, + "step": 567 + }, + { + "epoch": 0.8806201550387597, + "grad_norm": 1.6926461458206177, + "learning_rate": 3.1826012058570197e-06, + "loss": 0.9311, + "step": 568 + }, + { + "epoch": 0.8821705426356589, + "grad_norm": 1.43769109249115, + "learning_rate": 3.178294573643411e-06, + "loss": 0.9509, + "step": 569 + }, + { + "epoch": 0.8837209302325582, + "grad_norm": 1.8943246603012085, + "learning_rate": 3.173987941429802e-06, + "loss": 0.9301, + "step": 570 + }, + { + "epoch": 0.8837209302325582, + "eval_loss": 1.0409941673278809, + "eval_runtime": 46.854, + "eval_samples_per_second": 21.343, + "eval_steps_per_second": 1.345, + "step": 570 + }, + { + "epoch": 0.8852713178294573, + "grad_norm": 1.3987243175506592, + "learning_rate": 3.169681309216193e-06, + "loss": 0.9351, + "step": 571 + }, + { + "epoch": 0.8868217054263566, + "grad_norm": 1.3229787349700928, + "learning_rate": 3.1653746770025845e-06, + "loss": 0.9429, + "step": 572 + }, + { + "epoch": 0.8883720930232558, + "grad_norm": 1.3845443725585938, + "learning_rate": 3.1610680447889754e-06, + "loss": 0.931, + "step": 573 + }, + { + "epoch": 0.889922480620155, + "grad_norm": 1.3660778999328613, + "learning_rate": 3.1567614125753664e-06, + "loss": 0.9146, + "step": 574 + }, + { + "epoch": 0.8914728682170543, + "grad_norm": 1.6824007034301758, + "learning_rate": 3.1524547803617574e-06, + "loss": 0.9355, + "step": 575 + }, + { + "epoch": 0.8930232558139535, + "grad_norm": 1.283627986907959, + "learning_rate": 3.1481481481481483e-06, + "loss": 0.9401, + "step": 576 + }, + { + "epoch": 0.8945736434108527, + "grad_norm": 1.3251144886016846, + "learning_rate": 3.1438415159345393e-06, + "loss": 0.9103, + "step": 577 + }, + { + "epoch": 0.896124031007752, + "grad_norm": 1.3561856746673584, + "learning_rate": 3.1395348837209307e-06, + "loss": 0.9096, + "step": 578 + }, + { + "epoch": 0.8976744186046511, + "grad_norm": 1.5303910970687866, + "learning_rate": 3.1352282515073217e-06, + "loss": 0.9187, + "step": 579 + }, + { + "epoch": 0.8992248062015504, + "grad_norm": 1.2881441116333008, + "learning_rate": 3.1309216192937126e-06, + "loss": 0.9671, + "step": 580 + }, + { + "epoch": 0.8992248062015504, + "eval_loss": 1.0374441146850586, + "eval_runtime": 46.9153, + "eval_samples_per_second": 21.315, + "eval_steps_per_second": 1.343, + "step": 580 + }, + { + "epoch": 0.9007751937984496, + "grad_norm": 1.260320782661438, + "learning_rate": 3.126614987080104e-06, + "loss": 0.9375, + "step": 581 + }, + { + "epoch": 0.9023255813953488, + "grad_norm": 1.4502344131469727, + "learning_rate": 3.1223083548664946e-06, + "loss": 0.9049, + "step": 582 + }, + { + "epoch": 0.9038759689922481, + "grad_norm": 1.310264229774475, + "learning_rate": 3.118001722652886e-06, + "loss": 0.926, + "step": 583 + }, + { + "epoch": 0.9054263565891473, + "grad_norm": 1.3460558652877808, + "learning_rate": 3.1136950904392765e-06, + "loss": 0.9237, + "step": 584 + }, + { + "epoch": 0.9069767441860465, + "grad_norm": 1.5001479387283325, + "learning_rate": 3.109388458225668e-06, + "loss": 0.9161, + "step": 585 + }, + { + "epoch": 0.9085271317829458, + "grad_norm": 1.394601583480835, + "learning_rate": 3.1050818260120593e-06, + "loss": 0.9684, + "step": 586 + }, + { + "epoch": 0.9100775193798449, + "grad_norm": 1.2460273504257202, + "learning_rate": 3.10077519379845e-06, + "loss": 0.9369, + "step": 587 + }, + { + "epoch": 0.9116279069767442, + "grad_norm": 1.4341247081756592, + "learning_rate": 3.096468561584841e-06, + "loss": 0.9552, + "step": 588 + }, + { + "epoch": 0.9131782945736434, + "grad_norm": 1.6534271240234375, + "learning_rate": 3.0921619293712318e-06, + "loss": 0.8792, + "step": 589 + }, + { + "epoch": 0.9147286821705426, + "grad_norm": 2.0037407875061035, + "learning_rate": 3.087855297157623e-06, + "loss": 0.9157, + "step": 590 + }, + { + "epoch": 0.9147286821705426, + "eval_loss": 1.0375549793243408, + "eval_runtime": 46.8146, + "eval_samples_per_second": 21.361, + "eval_steps_per_second": 1.346, + "step": 590 + }, + { + "epoch": 0.9162790697674419, + "grad_norm": 1.5423766374588013, + "learning_rate": 3.0835486649440137e-06, + "loss": 0.95, + "step": 591 + }, + { + "epoch": 0.9178294573643411, + "grad_norm": 1.272499918937683, + "learning_rate": 3.079242032730405e-06, + "loss": 0.9288, + "step": 592 + }, + { + "epoch": 0.9193798449612403, + "grad_norm": 1.304702639579773, + "learning_rate": 3.074935400516796e-06, + "loss": 0.9131, + "step": 593 + }, + { + "epoch": 0.9209302325581395, + "grad_norm": 1.9163035154342651, + "learning_rate": 3.070628768303187e-06, + "loss": 0.8756, + "step": 594 + }, + { + "epoch": 0.9224806201550387, + "grad_norm": 1.6911531686782837, + "learning_rate": 3.0663221360895784e-06, + "loss": 0.9391, + "step": 595 + }, + { + "epoch": 0.924031007751938, + "grad_norm": 1.4596829414367676, + "learning_rate": 3.062015503875969e-06, + "loss": 0.9413, + "step": 596 + }, + { + "epoch": 0.9255813953488372, + "grad_norm": 1.4826016426086426, + "learning_rate": 3.0577088716623603e-06, + "loss": 0.9136, + "step": 597 + }, + { + "epoch": 0.9271317829457364, + "grad_norm": 1.2671222686767578, + "learning_rate": 3.0534022394487513e-06, + "loss": 0.8991, + "step": 598 + }, + { + "epoch": 0.9286821705426357, + "grad_norm": 1.6153379678726196, + "learning_rate": 3.0490956072351423e-06, + "loss": 0.925, + "step": 599 + }, + { + "epoch": 0.9302325581395349, + "grad_norm": 1.5550309419631958, + "learning_rate": 3.0447889750215332e-06, + "loss": 0.9235, + "step": 600 + }, + { + "epoch": 0.9302325581395349, + "eval_loss": 1.0385956764221191, + "eval_runtime": 46.8361, + "eval_samples_per_second": 21.351, + "eval_steps_per_second": 1.345, + "step": 600 + }, + { + "epoch": 0.931782945736434, + "grad_norm": 1.2835649251937866, + "learning_rate": 3.0404823428079242e-06, + "loss": 0.9586, + "step": 601 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 1.3722798824310303, + "learning_rate": 3.0361757105943156e-06, + "loss": 0.9018, + "step": 602 + }, + { + "epoch": 0.9348837209302325, + "grad_norm": 1.4283156394958496, + "learning_rate": 3.0318690783807066e-06, + "loss": 0.939, + "step": 603 + }, + { + "epoch": 0.9364341085271318, + "grad_norm": 1.3454813957214355, + "learning_rate": 3.0275624461670975e-06, + "loss": 0.9296, + "step": 604 + }, + { + "epoch": 0.937984496124031, + "grad_norm": 1.2258172035217285, + "learning_rate": 3.0232558139534885e-06, + "loss": 0.9141, + "step": 605 + }, + { + "epoch": 0.9395348837209302, + "grad_norm": 1.4662123918533325, + "learning_rate": 3.01894918173988e-06, + "loss": 0.9382, + "step": 606 + }, + { + "epoch": 0.9410852713178295, + "grad_norm": 1.8197379112243652, + "learning_rate": 3.0146425495262704e-06, + "loss": 0.9361, + "step": 607 + }, + { + "epoch": 0.9426356589147287, + "grad_norm": 1.3216370344161987, + "learning_rate": 3.010335917312662e-06, + "loss": 0.9353, + "step": 608 + }, + { + "epoch": 0.9441860465116279, + "grad_norm": 1.6560941934585571, + "learning_rate": 3.006029285099053e-06, + "loss": 0.9102, + "step": 609 + }, + { + "epoch": 0.9457364341085271, + "grad_norm": 1.6229398250579834, + "learning_rate": 3.0017226528854438e-06, + "loss": 0.9177, + "step": 610 + }, + { + "epoch": 0.9457364341085271, + "eval_loss": 1.0402629375457764, + "eval_runtime": 46.9417, + "eval_samples_per_second": 21.303, + "eval_steps_per_second": 1.342, + "step": 610 + }, + { + "epoch": 0.9472868217054263, + "grad_norm": 1.4512161016464233, + "learning_rate": 2.997416020671835e-06, + "loss": 0.9386, + "step": 611 + }, + { + "epoch": 0.9488372093023256, + "grad_norm": 1.578696370124817, + "learning_rate": 2.9931093884582257e-06, + "loss": 0.919, + "step": 612 + }, + { + "epoch": 0.9503875968992248, + "grad_norm": 1.3147693872451782, + "learning_rate": 2.988802756244617e-06, + "loss": 0.9037, + "step": 613 + }, + { + "epoch": 0.951937984496124, + "grad_norm": 1.4229583740234375, + "learning_rate": 2.9844961240310076e-06, + "loss": 0.924, + "step": 614 + }, + { + "epoch": 0.9534883720930233, + "grad_norm": 1.5262490510940552, + "learning_rate": 2.980189491817399e-06, + "loss": 0.9287, + "step": 615 + }, + { + "epoch": 0.9550387596899225, + "grad_norm": 1.342050552368164, + "learning_rate": 2.9758828596037904e-06, + "loss": 0.9327, + "step": 616 + }, + { + "epoch": 0.9565891472868217, + "grad_norm": 1.3471406698226929, + "learning_rate": 2.971576227390181e-06, + "loss": 0.9113, + "step": 617 + }, + { + "epoch": 0.958139534883721, + "grad_norm": 1.7015011310577393, + "learning_rate": 2.9672695951765724e-06, + "loss": 0.9062, + "step": 618 + }, + { + "epoch": 0.9596899224806201, + "grad_norm": 1.6812702417373657, + "learning_rate": 2.962962962962963e-06, + "loss": 0.9307, + "step": 619 + }, + { + "epoch": 0.9612403100775194, + "grad_norm": 1.5313730239868164, + "learning_rate": 2.9586563307493543e-06, + "loss": 0.9149, + "step": 620 + }, + { + "epoch": 0.9612403100775194, + "eval_loss": 1.0406851768493652, + "eval_runtime": 46.8759, + "eval_samples_per_second": 21.333, + "eval_steps_per_second": 1.344, + "step": 620 + }, + { + "epoch": 0.9627906976744186, + "grad_norm": 1.806265950202942, + "learning_rate": 2.954349698535745e-06, + "loss": 0.9579, + "step": 621 + }, + { + "epoch": 0.9643410852713178, + "grad_norm": 1.4483739137649536, + "learning_rate": 2.9500430663221362e-06, + "loss": 0.9086, + "step": 622 + }, + { + "epoch": 0.9658914728682171, + "grad_norm": 1.275719165802002, + "learning_rate": 2.9457364341085276e-06, + "loss": 0.8883, + "step": 623 + }, + { + "epoch": 0.9674418604651163, + "grad_norm": 1.3039225339889526, + "learning_rate": 2.941429801894918e-06, + "loss": 0.9115, + "step": 624 + }, + { + "epoch": 0.9689922480620154, + "grad_norm": 1.658772587776184, + "learning_rate": 2.9371231696813096e-06, + "loss": 0.8889, + "step": 625 + }, + { + "epoch": 0.9705426356589147, + "grad_norm": 1.480093240737915, + "learning_rate": 2.9328165374677005e-06, + "loss": 0.9302, + "step": 626 + }, + { + "epoch": 0.9720930232558139, + "grad_norm": 1.272149682044983, + "learning_rate": 2.9285099052540915e-06, + "loss": 0.9357, + "step": 627 + }, + { + "epoch": 0.9736434108527132, + "grad_norm": 1.2398171424865723, + "learning_rate": 2.9242032730404825e-06, + "loss": 0.9148, + "step": 628 + }, + { + "epoch": 0.9751937984496124, + "grad_norm": 1.6542110443115234, + "learning_rate": 2.9198966408268734e-06, + "loss": 0.9419, + "step": 629 + }, + { + "epoch": 0.9767441860465116, + "grad_norm": 1.2841962575912476, + "learning_rate": 2.915590008613265e-06, + "loss": 0.9098, + "step": 630 + }, + { + "epoch": 0.9767441860465116, + "eval_loss": 1.039470911026001, + "eval_runtime": 46.948, + "eval_samples_per_second": 21.3, + "eval_steps_per_second": 1.342, + "step": 630 + }, + { + "epoch": 0.9782945736434109, + "grad_norm": 1.546913743019104, + "learning_rate": 2.911283376399656e-06, + "loss": 0.8966, + "step": 631 + }, + { + "epoch": 0.9798449612403101, + "grad_norm": 1.6431787014007568, + "learning_rate": 2.9069767441860468e-06, + "loss": 0.9415, + "step": 632 + }, + { + "epoch": 0.9813953488372092, + "grad_norm": 1.4569215774536133, + "learning_rate": 2.9026701119724377e-06, + "loss": 0.9119, + "step": 633 + }, + { + "epoch": 0.9829457364341085, + "grad_norm": 1.4622957706451416, + "learning_rate": 2.898363479758829e-06, + "loss": 0.9187, + "step": 634 + }, + { + "epoch": 0.9844961240310077, + "grad_norm": 1.2801949977874756, + "learning_rate": 2.8940568475452197e-06, + "loss": 0.9243, + "step": 635 + }, + { + "epoch": 0.986046511627907, + "grad_norm": 1.4466568231582642, + "learning_rate": 2.889750215331611e-06, + "loss": 0.9346, + "step": 636 + }, + { + "epoch": 0.9875968992248062, + "grad_norm": 1.5166065692901611, + "learning_rate": 2.885443583118002e-06, + "loss": 0.9137, + "step": 637 + }, + { + "epoch": 0.9891472868217054, + "grad_norm": 1.5546586513519287, + "learning_rate": 2.881136950904393e-06, + "loss": 0.9281, + "step": 638 + }, + { + "epoch": 0.9906976744186047, + "grad_norm": 1.2714097499847412, + "learning_rate": 2.8768303186907844e-06, + "loss": 0.9187, + "step": 639 + }, + { + "epoch": 0.9922480620155039, + "grad_norm": 1.4462742805480957, + "learning_rate": 2.872523686477175e-06, + "loss": 0.9156, + "step": 640 + }, + { + "epoch": 0.9922480620155039, + "eval_loss": 1.0363060235977173, + "eval_runtime": 46.8096, + "eval_samples_per_second": 21.363, + "eval_steps_per_second": 1.346, + "step": 640 + }, + { + "epoch": 0.993798449612403, + "grad_norm": 1.4602237939834595, + "learning_rate": 2.8682170542635663e-06, + "loss": 0.9291, + "step": 641 + }, + { + "epoch": 0.9953488372093023, + "grad_norm": 1.36737859249115, + "learning_rate": 2.863910422049957e-06, + "loss": 0.8979, + "step": 642 + }, + { + "epoch": 0.9968992248062015, + "grad_norm": 1.4275497198104858, + "learning_rate": 2.8596037898363483e-06, + "loss": 0.8784, + "step": 643 + }, + { + "epoch": 0.9984496124031008, + "grad_norm": 1.1453689336776733, + "learning_rate": 2.8552971576227396e-06, + "loss": 0.904, + "step": 644 + }, + { + "epoch": 1.0, + "grad_norm": 1.5238977670669556, + "learning_rate": 2.85099052540913e-06, + "loss": 0.8926, + "step": 645 + }, + { + "epoch": 1.0015503875968992, + "grad_norm": 1.5500783920288086, + "learning_rate": 2.8466838931955216e-06, + "loss": 0.9153, + "step": 646 + }, + { + "epoch": 1.0031007751937984, + "grad_norm": 1.5735803842544556, + "learning_rate": 2.842377260981912e-06, + "loss": 0.9061, + "step": 647 + }, + { + "epoch": 1.0046511627906978, + "grad_norm": 1.289521336555481, + "learning_rate": 2.8380706287683035e-06, + "loss": 0.9081, + "step": 648 + }, + { + "epoch": 1.006201550387597, + "grad_norm": 1.4659291505813599, + "learning_rate": 2.833763996554694e-06, + "loss": 0.8713, + "step": 649 + }, + { + "epoch": 1.0077519379844961, + "grad_norm": 1.3533378839492798, + "learning_rate": 2.8294573643410855e-06, + "loss": 0.9052, + "step": 650 + }, + { + "epoch": 1.0077519379844961, + "eval_loss": 1.0426983833312988, + "eval_runtime": 46.7738, + "eval_samples_per_second": 21.379, + "eval_steps_per_second": 1.347, + "step": 650 + }, + { + "epoch": 1.0093023255813953, + "grad_norm": 1.3357157707214355, + "learning_rate": 2.825150732127477e-06, + "loss": 0.8823, + "step": 651 + }, + { + "epoch": 1.0108527131782945, + "grad_norm": 1.4741071462631226, + "learning_rate": 2.8208440999138674e-06, + "loss": 0.8922, + "step": 652 + }, + { + "epoch": 1.012403100775194, + "grad_norm": 1.5935308933258057, + "learning_rate": 2.8165374677002588e-06, + "loss": 0.8688, + "step": 653 + }, + { + "epoch": 1.013953488372093, + "grad_norm": 1.4742423295974731, + "learning_rate": 2.8122308354866497e-06, + "loss": 0.8912, + "step": 654 + }, + { + "epoch": 1.0155038759689923, + "grad_norm": 1.3270450830459595, + "learning_rate": 2.8079242032730407e-06, + "loss": 0.8977, + "step": 655 + }, + { + "epoch": 1.0170542635658915, + "grad_norm": 1.7434755563735962, + "learning_rate": 2.8036175710594317e-06, + "loss": 0.912, + "step": 656 + }, + { + "epoch": 1.0186046511627906, + "grad_norm": 1.4647204875946045, + "learning_rate": 2.7993109388458226e-06, + "loss": 0.8749, + "step": 657 + }, + { + "epoch": 1.0201550387596898, + "grad_norm": 1.4478235244750977, + "learning_rate": 2.795004306632214e-06, + "loss": 0.9043, + "step": 658 + }, + { + "epoch": 1.0217054263565892, + "grad_norm": 1.345246434211731, + "learning_rate": 2.790697674418605e-06, + "loss": 0.9033, + "step": 659 + }, + { + "epoch": 1.0232558139534884, + "grad_norm": 1.8008910417556763, + "learning_rate": 2.786391042204996e-06, + "loss": 0.9009, + "step": 660 + }, + { + "epoch": 1.0232558139534884, + "eval_loss": 1.0404847860336304, + "eval_runtime": 46.7813, + "eval_samples_per_second": 21.376, + "eval_steps_per_second": 1.347, + "step": 660 + }, + { + "epoch": 1.0248062015503876, + "grad_norm": 1.5372025966644287, + "learning_rate": 2.782084409991387e-06, + "loss": 0.8794, + "step": 661 + }, + { + "epoch": 1.0263565891472868, + "grad_norm": 1.5077435970306396, + "learning_rate": 2.7777777777777783e-06, + "loss": 0.8922, + "step": 662 + }, + { + "epoch": 1.027906976744186, + "grad_norm": 1.6719175577163696, + "learning_rate": 2.773471145564169e-06, + "loss": 0.9135, + "step": 663 + }, + { + "epoch": 1.0294573643410854, + "grad_norm": 1.5047252178192139, + "learning_rate": 2.7691645133505603e-06, + "loss": 0.8653, + "step": 664 + }, + { + "epoch": 1.0310077519379846, + "grad_norm": 1.3442022800445557, + "learning_rate": 2.764857881136951e-06, + "loss": 0.9235, + "step": 665 + }, + { + "epoch": 1.0325581395348837, + "grad_norm": 1.690727949142456, + "learning_rate": 2.760551248923342e-06, + "loss": 0.8707, + "step": 666 + }, + { + "epoch": 1.034108527131783, + "grad_norm": 1.3899257183074951, + "learning_rate": 2.7562446167097336e-06, + "loss": 0.8895, + "step": 667 + }, + { + "epoch": 1.035658914728682, + "grad_norm": 1.6989372968673706, + "learning_rate": 2.751937984496124e-06, + "loss": 0.8967, + "step": 668 + }, + { + "epoch": 1.0372093023255813, + "grad_norm": 1.4262644052505493, + "learning_rate": 2.7476313522825155e-06, + "loss": 0.8856, + "step": 669 + }, + { + "epoch": 1.0387596899224807, + "grad_norm": 1.456101894378662, + "learning_rate": 2.743324720068906e-06, + "loss": 0.8853, + "step": 670 + }, + { + "epoch": 1.0387596899224807, + "eval_loss": 1.0409016609191895, + "eval_runtime": 46.7499, + "eval_samples_per_second": 21.39, + "eval_steps_per_second": 1.348, + "step": 670 + }, + { + "epoch": 1.0403100775193799, + "grad_norm": 1.3542068004608154, + "learning_rate": 2.7390180878552975e-06, + "loss": 0.8617, + "step": 671 + }, + { + "epoch": 1.041860465116279, + "grad_norm": 1.4485602378845215, + "learning_rate": 2.734711455641688e-06, + "loss": 0.8755, + "step": 672 + }, + { + "epoch": 1.0434108527131782, + "grad_norm": 1.6059707403182983, + "learning_rate": 2.7304048234280794e-06, + "loss": 0.8995, + "step": 673 + }, + { + "epoch": 1.0449612403100774, + "grad_norm": 1.3006627559661865, + "learning_rate": 2.726098191214471e-06, + "loss": 0.8453, + "step": 674 + }, + { + "epoch": 1.0465116279069768, + "grad_norm": 1.3277314901351929, + "learning_rate": 2.7217915590008613e-06, + "loss": 0.894, + "step": 675 + }, + { + "epoch": 1.048062015503876, + "grad_norm": 1.4030394554138184, + "learning_rate": 2.7174849267872527e-06, + "loss": 0.8726, + "step": 676 + }, + { + "epoch": 1.0496124031007752, + "grad_norm": 1.308321237564087, + "learning_rate": 2.7131782945736433e-06, + "loss": 0.9121, + "step": 677 + }, + { + "epoch": 1.0511627906976744, + "grad_norm": 1.4298022985458374, + "learning_rate": 2.7088716623600347e-06, + "loss": 0.8867, + "step": 678 + }, + { + "epoch": 1.0527131782945736, + "grad_norm": 1.473902702331543, + "learning_rate": 2.7045650301464256e-06, + "loss": 0.8777, + "step": 679 + }, + { + "epoch": 1.054263565891473, + "grad_norm": 1.275970697402954, + "learning_rate": 2.7002583979328166e-06, + "loss": 0.884, + "step": 680 + }, + { + "epoch": 1.054263565891473, + "eval_loss": 1.0358476638793945, + "eval_runtime": 46.7815, + "eval_samples_per_second": 21.376, + "eval_steps_per_second": 1.347, + "step": 680 + }, + { + "epoch": 1.0558139534883721, + "grad_norm": 1.3834108114242554, + "learning_rate": 2.695951765719208e-06, + "loss": 0.902, + "step": 681 + }, + { + "epoch": 1.0573643410852713, + "grad_norm": 1.5840026140213013, + "learning_rate": 2.691645133505599e-06, + "loss": 0.8921, + "step": 682 + }, + { + "epoch": 1.0589147286821705, + "grad_norm": 1.3392385244369507, + "learning_rate": 2.68733850129199e-06, + "loss": 0.8747, + "step": 683 + }, + { + "epoch": 1.0604651162790697, + "grad_norm": 1.3050808906555176, + "learning_rate": 2.683031869078381e-06, + "loss": 0.9219, + "step": 684 + }, + { + "epoch": 1.062015503875969, + "grad_norm": 1.5907018184661865, + "learning_rate": 2.678725236864772e-06, + "loss": 0.9024, + "step": 685 + }, + { + "epoch": 1.0635658914728683, + "grad_norm": 1.4619678258895874, + "learning_rate": 2.674418604651163e-06, + "loss": 0.8858, + "step": 686 + }, + { + "epoch": 1.0651162790697675, + "grad_norm": 1.3546361923217773, + "learning_rate": 2.6701119724375542e-06, + "loss": 0.8739, + "step": 687 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 1.4255887269973755, + "learning_rate": 2.665805340223945e-06, + "loss": 0.8941, + "step": 688 + }, + { + "epoch": 1.0682170542635658, + "grad_norm": 1.3893107175827026, + "learning_rate": 2.661498708010336e-06, + "loss": 0.8999, + "step": 689 + }, + { + "epoch": 1.069767441860465, + "grad_norm": 1.3497520685195923, + "learning_rate": 2.657192075796727e-06, + "loss": 0.8921, + "step": 690 + }, + { + "epoch": 1.069767441860465, + "eval_loss": 1.0340094566345215, + "eval_runtime": 46.7821, + "eval_samples_per_second": 21.376, + "eval_steps_per_second": 1.347, + "step": 690 + }, + { + "epoch": 1.0713178294573644, + "grad_norm": 1.3812423944473267, + "learning_rate": 2.652885443583118e-06, + "loss": 0.8989, + "step": 691 + }, + { + "epoch": 1.0728682170542636, + "grad_norm": 1.6061986684799194, + "learning_rate": 2.6485788113695095e-06, + "loss": 0.8508, + "step": 692 + }, + { + "epoch": 1.0744186046511628, + "grad_norm": 1.4533159732818604, + "learning_rate": 2.6442721791559e-06, + "loss": 0.8722, + "step": 693 + }, + { + "epoch": 1.075968992248062, + "grad_norm": 1.398797869682312, + "learning_rate": 2.6399655469422914e-06, + "loss": 0.9048, + "step": 694 + }, + { + "epoch": 1.0775193798449612, + "grad_norm": 1.404852271080017, + "learning_rate": 2.635658914728683e-06, + "loss": 0.8665, + "step": 695 + }, + { + "epoch": 1.0790697674418606, + "grad_norm": 1.5127512216567993, + "learning_rate": 2.6313522825150734e-06, + "loss": 0.8915, + "step": 696 + }, + { + "epoch": 1.0806201550387597, + "grad_norm": 1.6629828214645386, + "learning_rate": 2.6270456503014647e-06, + "loss": 0.8929, + "step": 697 + }, + { + "epoch": 1.082170542635659, + "grad_norm": 1.7313421964645386, + "learning_rate": 2.6227390180878553e-06, + "loss": 0.8704, + "step": 698 + }, + { + "epoch": 1.083720930232558, + "grad_norm": 1.555357813835144, + "learning_rate": 2.6184323858742467e-06, + "loss": 0.9208, + "step": 699 + }, + { + "epoch": 1.0852713178294573, + "grad_norm": 1.4509795904159546, + "learning_rate": 2.6141257536606372e-06, + "loss": 0.9248, + "step": 700 + }, + { + "epoch": 1.0852713178294573, + "eval_loss": 1.0356652736663818, + "eval_runtime": 46.7938, + "eval_samples_per_second": 21.37, + "eval_steps_per_second": 1.346, + "step": 700 + }, + { + "epoch": 1.0868217054263565, + "grad_norm": 1.5227144956588745, + "learning_rate": 2.6098191214470286e-06, + "loss": 0.8902, + "step": 701 + }, + { + "epoch": 1.0883720930232559, + "grad_norm": 1.8152211904525757, + "learning_rate": 2.60551248923342e-06, + "loss": 0.9044, + "step": 702 + }, + { + "epoch": 1.089922480620155, + "grad_norm": 1.3922104835510254, + "learning_rate": 2.6012058570198106e-06, + "loss": 0.9128, + "step": 703 + }, + { + "epoch": 1.0914728682170542, + "grad_norm": 1.2599694728851318, + "learning_rate": 2.596899224806202e-06, + "loss": 0.9084, + "step": 704 + }, + { + "epoch": 1.0930232558139534, + "grad_norm": 1.395390510559082, + "learning_rate": 2.5925925925925925e-06, + "loss": 0.923, + "step": 705 + }, + { + "epoch": 1.0945736434108526, + "grad_norm": 1.374895691871643, + "learning_rate": 2.588285960378984e-06, + "loss": 0.8996, + "step": 706 + }, + { + "epoch": 1.096124031007752, + "grad_norm": 1.4008183479309082, + "learning_rate": 2.583979328165375e-06, + "loss": 0.9291, + "step": 707 + }, + { + "epoch": 1.0976744186046512, + "grad_norm": 1.3724210262298584, + "learning_rate": 2.579672695951766e-06, + "loss": 0.8805, + "step": 708 + }, + { + "epoch": 1.0992248062015504, + "grad_norm": 1.417680025100708, + "learning_rate": 2.575366063738157e-06, + "loss": 0.8795, + "step": 709 + }, + { + "epoch": 1.1007751937984496, + "grad_norm": 1.5281082391738892, + "learning_rate": 2.5710594315245478e-06, + "loss": 0.9098, + "step": 710 + }, + { + "epoch": 1.1007751937984496, + "eval_loss": 1.036144733428955, + "eval_runtime": 46.7339, + "eval_samples_per_second": 21.398, + "eval_steps_per_second": 1.348, + "step": 710 + }, + { + "epoch": 1.1023255813953488, + "grad_norm": 1.631238341331482, + "learning_rate": 2.566752799310939e-06, + "loss": 0.9207, + "step": 711 + }, + { + "epoch": 1.1038759689922482, + "grad_norm": 1.405727744102478, + "learning_rate": 2.56244616709733e-06, + "loss": 0.8988, + "step": 712 + }, + { + "epoch": 1.1054263565891473, + "grad_norm": 1.3681135177612305, + "learning_rate": 2.558139534883721e-06, + "loss": 0.8691, + "step": 713 + }, + { + "epoch": 1.1069767441860465, + "grad_norm": 1.588548183441162, + "learning_rate": 2.553832902670112e-06, + "loss": 0.9006, + "step": 714 + }, + { + "epoch": 1.1085271317829457, + "grad_norm": 1.4774502515792847, + "learning_rate": 2.5495262704565034e-06, + "loss": 0.8987, + "step": 715 + }, + { + "epoch": 1.110077519379845, + "grad_norm": 1.4388251304626465, + "learning_rate": 2.5452196382428944e-06, + "loss": 0.8796, + "step": 716 + }, + { + "epoch": 1.1116279069767443, + "grad_norm": 1.5053632259368896, + "learning_rate": 2.5409130060292854e-06, + "loss": 0.9051, + "step": 717 + }, + { + "epoch": 1.1131782945736435, + "grad_norm": 1.7834973335266113, + "learning_rate": 2.5366063738156763e-06, + "loss": 0.9509, + "step": 718 + }, + { + "epoch": 1.1147286821705427, + "grad_norm": 1.351125717163086, + "learning_rate": 2.5322997416020673e-06, + "loss": 0.9065, + "step": 719 + }, + { + "epoch": 1.1162790697674418, + "grad_norm": 1.4976617097854614, + "learning_rate": 2.5279931093884587e-06, + "loss": 0.8987, + "step": 720 + }, + { + "epoch": 1.1162790697674418, + "eval_loss": 1.0367296934127808, + "eval_runtime": 46.7237, + "eval_samples_per_second": 21.402, + "eval_steps_per_second": 1.348, + "step": 720 + }, + { + "epoch": 1.117829457364341, + "grad_norm": 1.302541732788086, + "learning_rate": 2.5236864771748492e-06, + "loss": 0.8542, + "step": 721 + }, + { + "epoch": 1.1193798449612402, + "grad_norm": 1.36167311668396, + "learning_rate": 2.5193798449612406e-06, + "loss": 0.886, + "step": 722 + }, + { + "epoch": 1.1209302325581396, + "grad_norm": 1.4094293117523193, + "learning_rate": 2.515073212747632e-06, + "loss": 0.912, + "step": 723 + }, + { + "epoch": 1.1224806201550388, + "grad_norm": 1.5545191764831543, + "learning_rate": 2.5107665805340226e-06, + "loss": 0.9054, + "step": 724 + }, + { + "epoch": 1.124031007751938, + "grad_norm": 1.5490257740020752, + "learning_rate": 2.506459948320414e-06, + "loss": 0.9226, + "step": 725 + }, + { + "epoch": 1.1255813953488372, + "grad_norm": 1.3758277893066406, + "learning_rate": 2.5021533161068045e-06, + "loss": 0.8985, + "step": 726 + }, + { + "epoch": 1.1271317829457363, + "grad_norm": 1.403664231300354, + "learning_rate": 2.497846683893196e-06, + "loss": 0.8851, + "step": 727 + }, + { + "epoch": 1.1286821705426358, + "grad_norm": 1.4780570268630981, + "learning_rate": 2.493540051679587e-06, + "loss": 0.932, + "step": 728 + }, + { + "epoch": 1.130232558139535, + "grad_norm": 1.527536153793335, + "learning_rate": 2.489233419465978e-06, + "loss": 0.8963, + "step": 729 + }, + { + "epoch": 1.1317829457364341, + "grad_norm": 1.3349652290344238, + "learning_rate": 2.484926787252369e-06, + "loss": 0.8905, + "step": 730 + }, + { + "epoch": 1.1317829457364341, + "eval_loss": 1.031362533569336, + "eval_runtime": 46.6489, + "eval_samples_per_second": 21.437, + "eval_steps_per_second": 1.351, + "step": 730 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 1.4726243019104004, + "learning_rate": 2.4806201550387598e-06, + "loss": 0.9037, + "step": 731 + }, + { + "epoch": 1.1348837209302325, + "grad_norm": 1.2975237369537354, + "learning_rate": 2.4763135228251507e-06, + "loss": 0.9131, + "step": 732 + }, + { + "epoch": 1.1364341085271317, + "grad_norm": 1.412916660308838, + "learning_rate": 2.4720068906115417e-06, + "loss": 0.9124, + "step": 733 + }, + { + "epoch": 1.137984496124031, + "grad_norm": 1.5443305969238281, + "learning_rate": 2.467700258397933e-06, + "loss": 0.9019, + "step": 734 + }, + { + "epoch": 1.1395348837209303, + "grad_norm": 1.422319769859314, + "learning_rate": 2.463393626184324e-06, + "loss": 0.8924, + "step": 735 + }, + { + "epoch": 1.1410852713178294, + "grad_norm": 1.3379539251327515, + "learning_rate": 2.459086993970715e-06, + "loss": 0.8707, + "step": 736 + }, + { + "epoch": 1.1426356589147286, + "grad_norm": 1.4538874626159668, + "learning_rate": 2.454780361757106e-06, + "loss": 0.8878, + "step": 737 + }, + { + "epoch": 1.1441860465116278, + "grad_norm": 1.3966622352600098, + "learning_rate": 2.450473729543497e-06, + "loss": 0.8923, + "step": 738 + }, + { + "epoch": 1.1457364341085272, + "grad_norm": 1.3471720218658447, + "learning_rate": 2.4461670973298884e-06, + "loss": 0.8864, + "step": 739 + }, + { + "epoch": 1.1472868217054264, + "grad_norm": 1.378870964050293, + "learning_rate": 2.4418604651162793e-06, + "loss": 0.8591, + "step": 740 + }, + { + "epoch": 1.1472868217054264, + "eval_loss": 1.0319344997406006, + "eval_runtime": 46.729, + "eval_samples_per_second": 21.4, + "eval_steps_per_second": 1.348, + "step": 740 + }, + { + "epoch": 1.1488372093023256, + "grad_norm": 1.4001867771148682, + "learning_rate": 2.4375538329026703e-06, + "loss": 0.8649, + "step": 741 + }, + { + "epoch": 1.1503875968992248, + "grad_norm": 1.3273779153823853, + "learning_rate": 2.4332472006890613e-06, + "loss": 0.8754, + "step": 742 + }, + { + "epoch": 1.151937984496124, + "grad_norm": 1.8371070623397827, + "learning_rate": 2.4289405684754527e-06, + "loss": 0.9154, + "step": 743 + }, + { + "epoch": 1.1534883720930234, + "grad_norm": 1.433976411819458, + "learning_rate": 2.4246339362618436e-06, + "loss": 0.883, + "step": 744 + }, + { + "epoch": 1.1550387596899225, + "grad_norm": 1.651013731956482, + "learning_rate": 2.4203273040482346e-06, + "loss": 0.8659, + "step": 745 + }, + { + "epoch": 1.1565891472868217, + "grad_norm": 1.5998989343643188, + "learning_rate": 2.4160206718346256e-06, + "loss": 0.8754, + "step": 746 + }, + { + "epoch": 1.158139534883721, + "grad_norm": 1.4122543334960938, + "learning_rate": 2.4117140396210165e-06, + "loss": 0.8901, + "step": 747 + }, + { + "epoch": 1.15968992248062, + "grad_norm": 1.4213175773620605, + "learning_rate": 2.4074074074074075e-06, + "loss": 0.8925, + "step": 748 + }, + { + "epoch": 1.1612403100775195, + "grad_norm": 1.3378446102142334, + "learning_rate": 2.403100775193799e-06, + "loss": 0.9257, + "step": 749 + }, + { + "epoch": 1.1627906976744187, + "grad_norm": 1.4897722005844116, + "learning_rate": 2.39879414298019e-06, + "loss": 0.8783, + "step": 750 + }, + { + "epoch": 1.1627906976744187, + "eval_loss": 1.0316088199615479, + "eval_runtime": 46.8059, + "eval_samples_per_second": 21.365, + "eval_steps_per_second": 1.346, + "step": 750 + }, + { + "epoch": 1.1643410852713179, + "grad_norm": 1.5811142921447754, + "learning_rate": 2.394487510766581e-06, + "loss": 0.8976, + "step": 751 + }, + { + "epoch": 1.165891472868217, + "grad_norm": 1.561867117881775, + "learning_rate": 2.390180878552972e-06, + "loss": 0.8731, + "step": 752 + }, + { + "epoch": 1.1674418604651162, + "grad_norm": 1.3906325101852417, + "learning_rate": 2.3858742463393628e-06, + "loss": 0.9042, + "step": 753 + }, + { + "epoch": 1.1689922480620156, + "grad_norm": 1.5785614252090454, + "learning_rate": 2.3815676141257537e-06, + "loss": 0.8894, + "step": 754 + }, + { + "epoch": 1.1705426356589148, + "grad_norm": 1.5823315382003784, + "learning_rate": 2.3772609819121447e-06, + "loss": 0.8894, + "step": 755 + }, + { + "epoch": 1.172093023255814, + "grad_norm": 1.4650275707244873, + "learning_rate": 2.372954349698536e-06, + "loss": 0.9021, + "step": 756 + }, + { + "epoch": 1.1736434108527132, + "grad_norm": 1.4748083353042603, + "learning_rate": 2.368647717484927e-06, + "loss": 0.9031, + "step": 757 + }, + { + "epoch": 1.1751937984496124, + "grad_norm": 1.4308465719223022, + "learning_rate": 2.364341085271318e-06, + "loss": 0.8562, + "step": 758 + }, + { + "epoch": 1.1767441860465115, + "grad_norm": 1.4933222532272339, + "learning_rate": 2.360034453057709e-06, + "loss": 0.8546, + "step": 759 + }, + { + "epoch": 1.178294573643411, + "grad_norm": 1.376835823059082, + "learning_rate": 2.3557278208441e-06, + "loss": 0.8787, + "step": 760 + }, + { + "epoch": 1.178294573643411, + "eval_loss": 1.0326775312423706, + "eval_runtime": 46.6601, + "eval_samples_per_second": 21.432, + "eval_steps_per_second": 1.35, + "step": 760 + }, + { + "epoch": 1.1798449612403101, + "grad_norm": 1.7554876804351807, + "learning_rate": 2.351421188630491e-06, + "loss": 0.9169, + "step": 761 + }, + { + "epoch": 1.1813953488372093, + "grad_norm": 1.345408320426941, + "learning_rate": 2.347114556416882e-06, + "loss": 0.8787, + "step": 762 + }, + { + "epoch": 1.1829457364341085, + "grad_norm": 1.4579490423202515, + "learning_rate": 2.3428079242032733e-06, + "loss": 0.8528, + "step": 763 + }, + { + "epoch": 1.1844961240310077, + "grad_norm": 1.4105459451675415, + "learning_rate": 2.3385012919896642e-06, + "loss": 0.8845, + "step": 764 + }, + { + "epoch": 1.1860465116279069, + "grad_norm": 1.4601131677627563, + "learning_rate": 2.3341946597760552e-06, + "loss": 0.8702, + "step": 765 + }, + { + "epoch": 1.1875968992248063, + "grad_norm": 1.5710184574127197, + "learning_rate": 2.329888027562446e-06, + "loss": 0.8971, + "step": 766 + }, + { + "epoch": 1.1891472868217055, + "grad_norm": 1.5850234031677246, + "learning_rate": 2.3255813953488376e-06, + "loss": 0.8814, + "step": 767 + }, + { + "epoch": 1.1906976744186046, + "grad_norm": 1.4523247480392456, + "learning_rate": 2.3212747631352285e-06, + "loss": 0.9126, + "step": 768 + }, + { + "epoch": 1.1922480620155038, + "grad_norm": 1.5494409799575806, + "learning_rate": 2.3169681309216195e-06, + "loss": 0.9006, + "step": 769 + }, + { + "epoch": 1.193798449612403, + "grad_norm": 1.81002676486969, + "learning_rate": 2.3126614987080105e-06, + "loss": 0.9018, + "step": 770 + }, + { + "epoch": 1.193798449612403, + "eval_loss": 1.0328283309936523, + "eval_runtime": 46.7572, + "eval_samples_per_second": 21.387, + "eval_steps_per_second": 1.347, + "step": 770 + }, + { + "epoch": 1.1953488372093024, + "grad_norm": 1.527626633644104, + "learning_rate": 2.3083548664944014e-06, + "loss": 0.866, + "step": 771 + }, + { + "epoch": 1.1968992248062016, + "grad_norm": 1.469875693321228, + "learning_rate": 2.304048234280793e-06, + "loss": 0.8965, + "step": 772 + }, + { + "epoch": 1.1984496124031008, + "grad_norm": 1.4807052612304688, + "learning_rate": 2.299741602067184e-06, + "loss": 0.8492, + "step": 773 + }, + { + "epoch": 1.2, + "grad_norm": 1.494244933128357, + "learning_rate": 2.2954349698535748e-06, + "loss": 0.906, + "step": 774 + }, + { + "epoch": 1.2015503875968991, + "grad_norm": 1.5816459655761719, + "learning_rate": 2.2911283376399657e-06, + "loss": 0.8956, + "step": 775 + }, + { + "epoch": 1.2031007751937985, + "grad_norm": 1.479781150817871, + "learning_rate": 2.2868217054263567e-06, + "loss": 0.8492, + "step": 776 + }, + { + "epoch": 1.2046511627906977, + "grad_norm": 1.469319462776184, + "learning_rate": 2.282515073212748e-06, + "loss": 0.8865, + "step": 777 + }, + { + "epoch": 1.206201550387597, + "grad_norm": 1.88156259059906, + "learning_rate": 2.278208440999139e-06, + "loss": 0.9198, + "step": 778 + }, + { + "epoch": 1.207751937984496, + "grad_norm": 1.3706072568893433, + "learning_rate": 2.27390180878553e-06, + "loss": 0.88, + "step": 779 + }, + { + "epoch": 1.2093023255813953, + "grad_norm": 1.4646852016448975, + "learning_rate": 2.269595176571921e-06, + "loss": 0.8898, + "step": 780 + }, + { + "epoch": 1.2093023255813953, + "eval_loss": 1.029195785522461, + "eval_runtime": 46.8464, + "eval_samples_per_second": 21.346, + "eval_steps_per_second": 1.345, + "step": 780 + }, + { + "epoch": 1.2108527131782947, + "grad_norm": 1.5722054243087769, + "learning_rate": 2.265288544358312e-06, + "loss": 0.8598, + "step": 781 + }, + { + "epoch": 1.2124031007751939, + "grad_norm": 1.3713362216949463, + "learning_rate": 2.260981912144703e-06, + "loss": 0.9053, + "step": 782 + }, + { + "epoch": 1.213953488372093, + "grad_norm": 1.4536453485488892, + "learning_rate": 2.256675279931094e-06, + "loss": 0.8515, + "step": 783 + }, + { + "epoch": 1.2155038759689922, + "grad_norm": 1.6506465673446655, + "learning_rate": 2.252368647717485e-06, + "loss": 0.9042, + "step": 784 + }, + { + "epoch": 1.2170542635658914, + "grad_norm": 1.5541293621063232, + "learning_rate": 2.2480620155038763e-06, + "loss": 0.8998, + "step": 785 + }, + { + "epoch": 1.2186046511627908, + "grad_norm": 1.3651714324951172, + "learning_rate": 2.2437553832902672e-06, + "loss": 0.9014, + "step": 786 + }, + { + "epoch": 1.22015503875969, + "grad_norm": 1.7241014242172241, + "learning_rate": 2.239448751076658e-06, + "loss": 0.9098, + "step": 787 + }, + { + "epoch": 1.2217054263565892, + "grad_norm": 1.600130558013916, + "learning_rate": 2.235142118863049e-06, + "loss": 0.8904, + "step": 788 + }, + { + "epoch": 1.2232558139534884, + "grad_norm": 1.5315771102905273, + "learning_rate": 2.23083548664944e-06, + "loss": 0.9212, + "step": 789 + }, + { + "epoch": 1.2248062015503876, + "grad_norm": 1.463247537612915, + "learning_rate": 2.226528854435831e-06, + "loss": 0.8708, + "step": 790 + }, + { + "epoch": 1.2248062015503876, + "eval_loss": 1.0294724702835083, + "eval_runtime": 46.9288, + "eval_samples_per_second": 21.309, + "eval_steps_per_second": 1.342, + "step": 790 + }, + { + "epoch": 1.2263565891472867, + "grad_norm": 1.5448787212371826, + "learning_rate": 2.222222222222222e-06, + "loss": 0.8834, + "step": 791 + }, + { + "epoch": 1.2279069767441861, + "grad_norm": 1.6028432846069336, + "learning_rate": 2.2179155900086135e-06, + "loss": 0.8902, + "step": 792 + }, + { + "epoch": 1.2294573643410853, + "grad_norm": 1.4141221046447754, + "learning_rate": 2.2136089577950044e-06, + "loss": 0.8757, + "step": 793 + }, + { + "epoch": 1.2310077519379845, + "grad_norm": 1.753729224205017, + "learning_rate": 2.2093023255813954e-06, + "loss": 0.9163, + "step": 794 + }, + { + "epoch": 1.2325581395348837, + "grad_norm": 1.4352757930755615, + "learning_rate": 2.2049956933677864e-06, + "loss": 0.8727, + "step": 795 + }, + { + "epoch": 1.2341085271317829, + "grad_norm": 1.4990347623825073, + "learning_rate": 2.2006890611541778e-06, + "loss": 0.8983, + "step": 796 + }, + { + "epoch": 1.235658914728682, + "grad_norm": 1.5358068943023682, + "learning_rate": 2.1963824289405687e-06, + "loss": 0.904, + "step": 797 + }, + { + "epoch": 1.2372093023255815, + "grad_norm": 1.5169130563735962, + "learning_rate": 2.1920757967269597e-06, + "loss": 0.9223, + "step": 798 + }, + { + "epoch": 1.2387596899224806, + "grad_norm": 1.5093134641647339, + "learning_rate": 2.1877691645133507e-06, + "loss": 0.8884, + "step": 799 + }, + { + "epoch": 1.2403100775193798, + "grad_norm": 1.4294668436050415, + "learning_rate": 2.183462532299742e-06, + "loss": 0.8961, + "step": 800 + }, + { + "epoch": 1.2403100775193798, + "eval_loss": 1.0295883417129517, + "eval_runtime": 46.8817, + "eval_samples_per_second": 21.33, + "eval_steps_per_second": 1.344, + "step": 800 + }, + { + "epoch": 1.241860465116279, + "grad_norm": 1.3810713291168213, + "learning_rate": 2.179155900086133e-06, + "loss": 0.8758, + "step": 801 + }, + { + "epoch": 1.2434108527131782, + "grad_norm": 1.5186481475830078, + "learning_rate": 2.174849267872524e-06, + "loss": 0.8674, + "step": 802 + }, + { + "epoch": 1.2449612403100776, + "grad_norm": 1.4637038707733154, + "learning_rate": 2.170542635658915e-06, + "loss": 0.8852, + "step": 803 + }, + { + "epoch": 1.2465116279069768, + "grad_norm": 1.553092122077942, + "learning_rate": 2.166236003445306e-06, + "loss": 0.886, + "step": 804 + }, + { + "epoch": 1.248062015503876, + "grad_norm": 1.5428677797317505, + "learning_rate": 2.161929371231697e-06, + "loss": 0.8984, + "step": 805 + }, + { + "epoch": 1.2496124031007751, + "grad_norm": 1.6426329612731934, + "learning_rate": 2.1576227390180883e-06, + "loss": 0.935, + "step": 806 + }, + { + "epoch": 1.2511627906976743, + "grad_norm": 1.3592965602874756, + "learning_rate": 2.1533161068044793e-06, + "loss": 0.9014, + "step": 807 + }, + { + "epoch": 1.2527131782945737, + "grad_norm": 1.5501923561096191, + "learning_rate": 2.1490094745908702e-06, + "loss": 0.8659, + "step": 808 + }, + { + "epoch": 1.254263565891473, + "grad_norm": 1.3319815397262573, + "learning_rate": 2.144702842377261e-06, + "loss": 0.8818, + "step": 809 + }, + { + "epoch": 1.255813953488372, + "grad_norm": 1.3006103038787842, + "learning_rate": 2.140396210163652e-06, + "loss": 0.8787, + "step": 810 + }, + { + "epoch": 1.255813953488372, + "eval_loss": 1.028241753578186, + "eval_runtime": 46.6041, + "eval_samples_per_second": 21.457, + "eval_steps_per_second": 1.352, + "step": 810 + }, + { + "epoch": 1.2573643410852713, + "grad_norm": 1.4956690073013306, + "learning_rate": 2.136089577950043e-06, + "loss": 0.8905, + "step": 811 + }, + { + "epoch": 1.2589147286821705, + "grad_norm": 1.419439673423767, + "learning_rate": 2.131782945736434e-06, + "loss": 0.8329, + "step": 812 + }, + { + "epoch": 1.2604651162790699, + "grad_norm": 1.302194356918335, + "learning_rate": 2.127476313522825e-06, + "loss": 0.8731, + "step": 813 + }, + { + "epoch": 1.262015503875969, + "grad_norm": 1.4846899509429932, + "learning_rate": 2.1231696813092165e-06, + "loss": 0.8761, + "step": 814 + }, + { + "epoch": 1.2635658914728682, + "grad_norm": 1.4464991092681885, + "learning_rate": 2.1188630490956074e-06, + "loss": 0.8933, + "step": 815 + }, + { + "epoch": 1.2651162790697674, + "grad_norm": 1.411321759223938, + "learning_rate": 2.1145564168819984e-06, + "loss": 0.9096, + "step": 816 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 1.3309530019760132, + "learning_rate": 2.1102497846683894e-06, + "loss": 0.8554, + "step": 817 + }, + { + "epoch": 1.268217054263566, + "grad_norm": 1.4233335256576538, + "learning_rate": 2.1059431524547803e-06, + "loss": 0.9031, + "step": 818 + }, + { + "epoch": 1.2697674418604652, + "grad_norm": 1.4139257669448853, + "learning_rate": 2.1016365202411713e-06, + "loss": 0.8552, + "step": 819 + }, + { + "epoch": 1.2713178294573644, + "grad_norm": 1.3686296939849854, + "learning_rate": 2.0973298880275627e-06, + "loss": 0.8897, + "step": 820 + }, + { + "epoch": 1.2713178294573644, + "eval_loss": 1.0273642539978027, + "eval_runtime": 46.6682, + "eval_samples_per_second": 21.428, + "eval_steps_per_second": 1.35, + "step": 820 + }, + { + "epoch": 1.2728682170542636, + "grad_norm": 1.530033826828003, + "learning_rate": 2.0930232558139536e-06, + "loss": 0.8856, + "step": 821 + }, + { + "epoch": 1.2744186046511627, + "grad_norm": 1.335328221321106, + "learning_rate": 2.0887166236003446e-06, + "loss": 0.8437, + "step": 822 + }, + { + "epoch": 1.2759689922480622, + "grad_norm": 1.6352286338806152, + "learning_rate": 2.0844099913867356e-06, + "loss": 0.9048, + "step": 823 + }, + { + "epoch": 1.2775193798449611, + "grad_norm": 1.414017677307129, + "learning_rate": 2.080103359173127e-06, + "loss": 0.8967, + "step": 824 + }, + { + "epoch": 1.2790697674418605, + "grad_norm": 1.5296484231948853, + "learning_rate": 2.075796726959518e-06, + "loss": 0.8628, + "step": 825 + }, + { + "epoch": 1.2806201550387597, + "grad_norm": 1.4565972089767456, + "learning_rate": 2.071490094745909e-06, + "loss": 0.8846, + "step": 826 + }, + { + "epoch": 1.2821705426356589, + "grad_norm": 1.4244471788406372, + "learning_rate": 2.0671834625323e-06, + "loss": 0.8899, + "step": 827 + }, + { + "epoch": 1.283720930232558, + "grad_norm": 1.3682016134262085, + "learning_rate": 2.0628768303186913e-06, + "loss": 0.8437, + "step": 828 + }, + { + "epoch": 1.2852713178294572, + "grad_norm": 1.3579398393630981, + "learning_rate": 2.0585701981050822e-06, + "loss": 0.8981, + "step": 829 + }, + { + "epoch": 1.2868217054263567, + "grad_norm": 1.6583482027053833, + "learning_rate": 2.054263565891473e-06, + "loss": 0.8743, + "step": 830 + }, + { + "epoch": 1.2868217054263567, + "eval_loss": 1.0267916917800903, + "eval_runtime": 46.7644, + "eval_samples_per_second": 21.384, + "eval_steps_per_second": 1.347, + "step": 830 + }, + { + "epoch": 1.2883720930232558, + "grad_norm": 1.6551933288574219, + "learning_rate": 2.049956933677864e-06, + "loss": 0.9314, + "step": 831 + }, + { + "epoch": 1.289922480620155, + "grad_norm": 1.3006973266601562, + "learning_rate": 2.045650301464255e-06, + "loss": 0.8844, + "step": 832 + }, + { + "epoch": 1.2914728682170542, + "grad_norm": 1.4112263917922974, + "learning_rate": 2.041343669250646e-06, + "loss": 0.9061, + "step": 833 + }, + { + "epoch": 1.2930232558139534, + "grad_norm": 1.4028356075286865, + "learning_rate": 2.037037037037037e-06, + "loss": 0.8872, + "step": 834 + }, + { + "epoch": 1.2945736434108528, + "grad_norm": 1.5988967418670654, + "learning_rate": 2.0327304048234285e-06, + "loss": 0.8994, + "step": 835 + }, + { + "epoch": 1.296124031007752, + "grad_norm": 1.464126706123352, + "learning_rate": 2.0284237726098194e-06, + "loss": 0.8918, + "step": 836 + }, + { + "epoch": 1.2976744186046512, + "grad_norm": 1.4168930053710938, + "learning_rate": 2.0241171403962104e-06, + "loss": 0.9086, + "step": 837 + }, + { + "epoch": 1.2992248062015503, + "grad_norm": 1.6398478746414185, + "learning_rate": 2.0198105081826014e-06, + "loss": 0.88, + "step": 838 + }, + { + "epoch": 1.3007751937984495, + "grad_norm": 1.3792091608047485, + "learning_rate": 2.0155038759689923e-06, + "loss": 0.8951, + "step": 839 + }, + { + "epoch": 1.302325581395349, + "grad_norm": 1.5528695583343506, + "learning_rate": 2.0111972437553833e-06, + "loss": 0.8858, + "step": 840 + }, + { + "epoch": 1.302325581395349, + "eval_loss": 1.026399850845337, + "eval_runtime": 46.7644, + "eval_samples_per_second": 21.384, + "eval_steps_per_second": 1.347, + "step": 840 + }, + { + "epoch": 1.3038759689922481, + "grad_norm": 1.271855115890503, + "learning_rate": 2.0068906115417743e-06, + "loss": 0.8877, + "step": 841 + }, + { + "epoch": 1.3054263565891473, + "grad_norm": 1.2496144771575928, + "learning_rate": 2.0025839793281657e-06, + "loss": 0.8959, + "step": 842 + }, + { + "epoch": 1.3069767441860465, + "grad_norm": 1.5338107347488403, + "learning_rate": 1.9982773471145566e-06, + "loss": 0.865, + "step": 843 + }, + { + "epoch": 1.3085271317829457, + "grad_norm": 1.368303894996643, + "learning_rate": 1.9939707149009476e-06, + "loss": 0.8666, + "step": 844 + }, + { + "epoch": 1.310077519379845, + "grad_norm": 1.3765580654144287, + "learning_rate": 1.9896640826873386e-06, + "loss": 0.8953, + "step": 845 + }, + { + "epoch": 1.3116279069767443, + "grad_norm": 1.5001338720321655, + "learning_rate": 1.9853574504737295e-06, + "loss": 0.8634, + "step": 846 + }, + { + "epoch": 1.3131782945736434, + "grad_norm": 1.650890588760376, + "learning_rate": 1.9810508182601205e-06, + "loss": 0.8917, + "step": 847 + }, + { + "epoch": 1.3147286821705426, + "grad_norm": 1.4128278493881226, + "learning_rate": 1.976744186046512e-06, + "loss": 0.9183, + "step": 848 + }, + { + "epoch": 1.3162790697674418, + "grad_norm": 1.561835527420044, + "learning_rate": 1.972437553832903e-06, + "loss": 0.8975, + "step": 849 + }, + { + "epoch": 1.3178294573643412, + "grad_norm": 1.3987250328063965, + "learning_rate": 1.968130921619294e-06, + "loss": 0.9322, + "step": 850 + }, + { + "epoch": 1.3178294573643412, + "eval_loss": 1.0248056650161743, + "eval_runtime": 46.7823, + "eval_samples_per_second": 21.376, + "eval_steps_per_second": 1.347, + "step": 850 + }, + { + "epoch": 1.3193798449612404, + "grad_norm": 1.349995493888855, + "learning_rate": 1.963824289405685e-06, + "loss": 0.8965, + "step": 851 + }, + { + "epoch": 1.3209302325581396, + "grad_norm": 1.7483686208724976, + "learning_rate": 1.959517657192076e-06, + "loss": 0.88, + "step": 852 + }, + { + "epoch": 1.3224806201550388, + "grad_norm": 1.3280638456344604, + "learning_rate": 1.955211024978467e-06, + "loss": 0.9086, + "step": 853 + }, + { + "epoch": 1.324031007751938, + "grad_norm": 1.4023438692092896, + "learning_rate": 1.950904392764858e-06, + "loss": 0.8778, + "step": 854 + }, + { + "epoch": 1.3255813953488373, + "grad_norm": 1.488553762435913, + "learning_rate": 1.946597760551249e-06, + "loss": 0.9065, + "step": 855 + }, + { + "epoch": 1.3271317829457363, + "grad_norm": 1.4213240146636963, + "learning_rate": 1.94229112833764e-06, + "loss": 0.8722, + "step": 856 + }, + { + "epoch": 1.3286821705426357, + "grad_norm": 1.4626022577285767, + "learning_rate": 1.9379844961240315e-06, + "loss": 0.8851, + "step": 857 + }, + { + "epoch": 1.330232558139535, + "grad_norm": 1.3919224739074707, + "learning_rate": 1.9336778639104224e-06, + "loss": 0.8531, + "step": 858 + }, + { + "epoch": 1.331782945736434, + "grad_norm": 1.4865458011627197, + "learning_rate": 1.9293712316968134e-06, + "loss": 0.897, + "step": 859 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.4093918800354004, + "learning_rate": 1.9250645994832044e-06, + "loss": 0.8791, + "step": 860 + }, + { + "epoch": 1.3333333333333333, + "eval_loss": 1.0247600078582764, + "eval_runtime": 46.7524, + "eval_samples_per_second": 21.389, + "eval_steps_per_second": 1.348, + "step": 860 + }, + { + "epoch": 1.3348837209302324, + "grad_norm": 1.3935306072235107, + "learning_rate": 1.9207579672695953e-06, + "loss": 0.8745, + "step": 861 + }, + { + "epoch": 1.3364341085271318, + "grad_norm": 1.6292296648025513, + "learning_rate": 1.9164513350559863e-06, + "loss": 0.8661, + "step": 862 + }, + { + "epoch": 1.337984496124031, + "grad_norm": 1.5986114740371704, + "learning_rate": 1.9121447028423773e-06, + "loss": 0.8763, + "step": 863 + }, + { + "epoch": 1.3395348837209302, + "grad_norm": 1.5964652299880981, + "learning_rate": 1.9078380706287687e-06, + "loss": 0.8869, + "step": 864 + }, + { + "epoch": 1.3410852713178294, + "grad_norm": 1.3685358762741089, + "learning_rate": 1.9035314384151596e-06, + "loss": 0.8891, + "step": 865 + }, + { + "epoch": 1.3426356589147286, + "grad_norm": 1.6272172927856445, + "learning_rate": 1.8992248062015506e-06, + "loss": 0.8855, + "step": 866 + }, + { + "epoch": 1.344186046511628, + "grad_norm": 1.669884204864502, + "learning_rate": 1.8949181739879416e-06, + "loss": 0.879, + "step": 867 + }, + { + "epoch": 1.3457364341085272, + "grad_norm": 1.4029791355133057, + "learning_rate": 1.8906115417743325e-06, + "loss": 0.8685, + "step": 868 + }, + { + "epoch": 1.3472868217054264, + "grad_norm": 1.5204144716262817, + "learning_rate": 1.8863049095607235e-06, + "loss": 0.9151, + "step": 869 + }, + { + "epoch": 1.3488372093023255, + "grad_norm": 1.6033802032470703, + "learning_rate": 1.8819982773471147e-06, + "loss": 0.8891, + "step": 870 + }, + { + "epoch": 1.3488372093023255, + "eval_loss": 1.0215697288513184, + "eval_runtime": 46.7118, + "eval_samples_per_second": 21.408, + "eval_steps_per_second": 1.349, + "step": 870 + }, + { + "epoch": 1.3503875968992247, + "grad_norm": 1.290747046470642, + "learning_rate": 1.8776916451335059e-06, + "loss": 0.8761, + "step": 871 + }, + { + "epoch": 1.3519379844961241, + "grad_norm": 1.5597420930862427, + "learning_rate": 1.8733850129198968e-06, + "loss": 0.8585, + "step": 872 + }, + { + "epoch": 1.3534883720930233, + "grad_norm": 1.3587156534194946, + "learning_rate": 1.8690783807062878e-06, + "loss": 0.8602, + "step": 873 + }, + { + "epoch": 1.3550387596899225, + "grad_norm": 1.3756153583526611, + "learning_rate": 1.864771748492679e-06, + "loss": 0.9024, + "step": 874 + }, + { + "epoch": 1.3565891472868217, + "grad_norm": 1.3279125690460205, + "learning_rate": 1.86046511627907e-06, + "loss": 0.8898, + "step": 875 + }, + { + "epoch": 1.3581395348837209, + "grad_norm": 1.416416883468628, + "learning_rate": 1.856158484065461e-06, + "loss": 0.8923, + "step": 876 + }, + { + "epoch": 1.3596899224806203, + "grad_norm": 1.5571491718292236, + "learning_rate": 1.8518518518518519e-06, + "loss": 0.9037, + "step": 877 + }, + { + "epoch": 1.3612403100775194, + "grad_norm": 1.5523213148117065, + "learning_rate": 1.8475452196382433e-06, + "loss": 0.8944, + "step": 878 + }, + { + "epoch": 1.3627906976744186, + "grad_norm": 1.4088724851608276, + "learning_rate": 1.8432385874246342e-06, + "loss": 0.9118, + "step": 879 + }, + { + "epoch": 1.3643410852713178, + "grad_norm": 1.3843094110488892, + "learning_rate": 1.8389319552110252e-06, + "loss": 0.8646, + "step": 880 + }, + { + "epoch": 1.3643410852713178, + "eval_loss": 1.0226423740386963, + "eval_runtime": 46.7174, + "eval_samples_per_second": 21.405, + "eval_steps_per_second": 1.349, + "step": 880 + }, + { + "epoch": 1.365891472868217, + "grad_norm": 1.386268138885498, + "learning_rate": 1.8346253229974162e-06, + "loss": 0.8754, + "step": 881 + }, + { + "epoch": 1.3674418604651164, + "grad_norm": 1.3752778768539429, + "learning_rate": 1.8303186907838071e-06, + "loss": 0.8663, + "step": 882 + }, + { + "epoch": 1.3689922480620156, + "grad_norm": 1.56470787525177, + "learning_rate": 1.826012058570198e-06, + "loss": 0.8255, + "step": 883 + }, + { + "epoch": 1.3705426356589148, + "grad_norm": 1.2622406482696533, + "learning_rate": 1.8217054263565893e-06, + "loss": 0.8959, + "step": 884 + }, + { + "epoch": 1.372093023255814, + "grad_norm": 1.473328948020935, + "learning_rate": 1.8173987941429802e-06, + "loss": 0.8822, + "step": 885 + }, + { + "epoch": 1.3736434108527131, + "grad_norm": 1.4207509756088257, + "learning_rate": 1.8130921619293714e-06, + "loss": 0.8792, + "step": 886 + }, + { + "epoch": 1.3751937984496125, + "grad_norm": 1.6432552337646484, + "learning_rate": 1.8087855297157624e-06, + "loss": 0.8789, + "step": 887 + }, + { + "epoch": 1.3767441860465115, + "grad_norm": 1.360485315322876, + "learning_rate": 1.8044788975021536e-06, + "loss": 0.8578, + "step": 888 + }, + { + "epoch": 1.378294573643411, + "grad_norm": 1.2980175018310547, + "learning_rate": 1.8001722652885445e-06, + "loss": 0.9028, + "step": 889 + }, + { + "epoch": 1.37984496124031, + "grad_norm": 1.4118475914001465, + "learning_rate": 1.7958656330749355e-06, + "loss": 0.8923, + "step": 890 + }, + { + "epoch": 1.37984496124031, + "eval_loss": 1.0218552350997925, + "eval_runtime": 46.7621, + "eval_samples_per_second": 21.385, + "eval_steps_per_second": 1.347, + "step": 890 + }, + { + "epoch": 1.3813953488372093, + "grad_norm": 1.4917428493499756, + "learning_rate": 1.7915590008613265e-06, + "loss": 0.865, + "step": 891 + }, + { + "epoch": 1.3829457364341085, + "grad_norm": 1.2792720794677734, + "learning_rate": 1.7872523686477174e-06, + "loss": 0.8735, + "step": 892 + }, + { + "epoch": 1.3844961240310076, + "grad_norm": 1.3006173372268677, + "learning_rate": 1.7829457364341088e-06, + "loss": 0.8578, + "step": 893 + }, + { + "epoch": 1.386046511627907, + "grad_norm": 1.5386368036270142, + "learning_rate": 1.7786391042204998e-06, + "loss": 0.8904, + "step": 894 + }, + { + "epoch": 1.3875968992248062, + "grad_norm": 1.5131858587265015, + "learning_rate": 1.7743324720068908e-06, + "loss": 0.8607, + "step": 895 + }, + { + "epoch": 1.3891472868217054, + "grad_norm": 1.478264570236206, + "learning_rate": 1.7700258397932817e-06, + "loss": 0.8655, + "step": 896 + }, + { + "epoch": 1.3906976744186046, + "grad_norm": 1.4144504070281982, + "learning_rate": 1.7657192075796727e-06, + "loss": 0.8745, + "step": 897 + }, + { + "epoch": 1.3922480620155038, + "grad_norm": 1.3581087589263916, + "learning_rate": 1.7614125753660639e-06, + "loss": 0.8595, + "step": 898 + }, + { + "epoch": 1.3937984496124032, + "grad_norm": 1.3146580457687378, + "learning_rate": 1.7571059431524549e-06, + "loss": 0.8602, + "step": 899 + }, + { + "epoch": 1.3953488372093024, + "grad_norm": 1.5949207544326782, + "learning_rate": 1.752799310938846e-06, + "loss": 0.9105, + "step": 900 + }, + { + "epoch": 1.3953488372093024, + "eval_loss": 1.0218842029571533, + "eval_runtime": 46.6933, + "eval_samples_per_second": 21.416, + "eval_steps_per_second": 1.349, + "step": 900 + }, + { + "epoch": 1.3968992248062015, + "grad_norm": 1.3286627531051636, + "learning_rate": 1.748492678725237e-06, + "loss": 0.8767, + "step": 901 + }, + { + "epoch": 1.3984496124031007, + "grad_norm": 1.3980764150619507, + "learning_rate": 1.7441860465116282e-06, + "loss": 0.8573, + "step": 902 + }, + { + "epoch": 1.4, + "grad_norm": 1.4759293794631958, + "learning_rate": 1.7398794142980192e-06, + "loss": 0.8628, + "step": 903 + }, + { + "epoch": 1.4015503875968993, + "grad_norm": 1.4072186946868896, + "learning_rate": 1.7355727820844101e-06, + "loss": 0.8829, + "step": 904 + }, + { + "epoch": 1.4031007751937985, + "grad_norm": 1.3058583736419678, + "learning_rate": 1.731266149870801e-06, + "loss": 0.8833, + "step": 905 + }, + { + "epoch": 1.4046511627906977, + "grad_norm": 1.5181350708007812, + "learning_rate": 1.726959517657192e-06, + "loss": 0.8492, + "step": 906 + }, + { + "epoch": 1.4062015503875969, + "grad_norm": 1.4261865615844727, + "learning_rate": 1.7226528854435834e-06, + "loss": 0.8881, + "step": 907 + }, + { + "epoch": 1.407751937984496, + "grad_norm": 1.3683359622955322, + "learning_rate": 1.7183462532299744e-06, + "loss": 0.8904, + "step": 908 + }, + { + "epoch": 1.4093023255813955, + "grad_norm": 1.519061803817749, + "learning_rate": 1.7140396210163654e-06, + "loss": 0.8609, + "step": 909 + }, + { + "epoch": 1.4108527131782946, + "grad_norm": 1.5184195041656494, + "learning_rate": 1.7097329888027563e-06, + "loss": 0.8768, + "step": 910 + }, + { + "epoch": 1.4108527131782946, + "eval_loss": 1.0236916542053223, + "eval_runtime": 46.6499, + "eval_samples_per_second": 21.436, + "eval_steps_per_second": 1.35, + "step": 910 + }, + { + "epoch": 1.4124031007751938, + "grad_norm": 1.2919946908950806, + "learning_rate": 1.7054263565891473e-06, + "loss": 0.8799, + "step": 911 + }, + { + "epoch": 1.413953488372093, + "grad_norm": 1.4635928869247437, + "learning_rate": 1.7011197243755385e-06, + "loss": 0.9136, + "step": 912 + }, + { + "epoch": 1.4155038759689922, + "grad_norm": 1.3878158330917358, + "learning_rate": 1.6968130921619295e-06, + "loss": 0.8462, + "step": 913 + }, + { + "epoch": 1.4170542635658916, + "grad_norm": 1.375191569328308, + "learning_rate": 1.6925064599483206e-06, + "loss": 0.8843, + "step": 914 + }, + { + "epoch": 1.4186046511627908, + "grad_norm": 1.5198922157287598, + "learning_rate": 1.6881998277347116e-06, + "loss": 0.842, + "step": 915 + }, + { + "epoch": 1.42015503875969, + "grad_norm": 1.5138405561447144, + "learning_rate": 1.6838931955211028e-06, + "loss": 0.8676, + "step": 916 + }, + { + "epoch": 1.4217054263565891, + "grad_norm": 1.3270615339279175, + "learning_rate": 1.6795865633074938e-06, + "loss": 0.8844, + "step": 917 + }, + { + "epoch": 1.4232558139534883, + "grad_norm": 1.3569897413253784, + "learning_rate": 1.6752799310938847e-06, + "loss": 0.8724, + "step": 918 + }, + { + "epoch": 1.4248062015503877, + "grad_norm": 1.3835643529891968, + "learning_rate": 1.6709732988802757e-06, + "loss": 0.921, + "step": 919 + }, + { + "epoch": 1.4263565891472867, + "grad_norm": 1.406451940536499, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.8555, + "step": 920 + }, + { + "epoch": 1.4263565891472867, + "eval_loss": 1.0234382152557373, + "eval_runtime": 46.7532, + "eval_samples_per_second": 21.389, + "eval_steps_per_second": 1.348, + "step": 920 + }, + { + "epoch": 1.427906976744186, + "grad_norm": 1.3230928182601929, + "learning_rate": 1.6623600344530576e-06, + "loss": 0.8871, + "step": 921 + }, + { + "epoch": 1.4294573643410853, + "grad_norm": 1.4418071508407593, + "learning_rate": 1.658053402239449e-06, + "loss": 0.8918, + "step": 922 + }, + { + "epoch": 1.4310077519379845, + "grad_norm": 1.3506301641464233, + "learning_rate": 1.65374677002584e-06, + "loss": 0.8637, + "step": 923 + }, + { + "epoch": 1.4325581395348836, + "grad_norm": 1.537008285522461, + "learning_rate": 1.649440137812231e-06, + "loss": 0.8866, + "step": 924 + }, + { + "epoch": 1.4341085271317828, + "grad_norm": 1.418975830078125, + "learning_rate": 1.645133505598622e-06, + "loss": 0.9255, + "step": 925 + }, + { + "epoch": 1.4356589147286822, + "grad_norm": 1.5376110076904297, + "learning_rate": 1.640826873385013e-06, + "loss": 0.8596, + "step": 926 + }, + { + "epoch": 1.4372093023255814, + "grad_norm": 1.4103554487228394, + "learning_rate": 1.636520241171404e-06, + "loss": 0.869, + "step": 927 + }, + { + "epoch": 1.4387596899224806, + "grad_norm": 1.6398717164993286, + "learning_rate": 1.632213608957795e-06, + "loss": 0.9124, + "step": 928 + }, + { + "epoch": 1.4403100775193798, + "grad_norm": 1.4364229440689087, + "learning_rate": 1.6279069767441862e-06, + "loss": 0.9052, + "step": 929 + }, + { + "epoch": 1.441860465116279, + "grad_norm": 1.9486956596374512, + "learning_rate": 1.6236003445305774e-06, + "loss": 0.876, + "step": 930 + }, + { + "epoch": 1.441860465116279, + "eval_loss": 1.0196127891540527, + "eval_runtime": 46.6689, + "eval_samples_per_second": 21.428, + "eval_steps_per_second": 1.35, + "step": 930 + }, + { + "epoch": 1.4434108527131784, + "grad_norm": 1.3562145233154297, + "learning_rate": 1.6192937123169684e-06, + "loss": 0.9019, + "step": 931 + }, + { + "epoch": 1.4449612403100776, + "grad_norm": 1.6002224683761597, + "learning_rate": 1.6149870801033593e-06, + "loss": 0.8805, + "step": 932 + }, + { + "epoch": 1.4465116279069767, + "grad_norm": 1.3504170179367065, + "learning_rate": 1.6106804478897503e-06, + "loss": 0.9131, + "step": 933 + }, + { + "epoch": 1.448062015503876, + "grad_norm": 1.3638203144073486, + "learning_rate": 1.6063738156761413e-06, + "loss": 0.8759, + "step": 934 + }, + { + "epoch": 1.449612403100775, + "grad_norm": 1.2600183486938477, + "learning_rate": 1.6020671834625322e-06, + "loss": 0.8757, + "step": 935 + }, + { + "epoch": 1.4511627906976745, + "grad_norm": 1.4572343826293945, + "learning_rate": 1.5977605512489236e-06, + "loss": 0.8628, + "step": 936 + }, + { + "epoch": 1.4527131782945737, + "grad_norm": 1.6208621263504028, + "learning_rate": 1.5934539190353146e-06, + "loss": 0.899, + "step": 937 + }, + { + "epoch": 1.4542635658914729, + "grad_norm": 1.4335441589355469, + "learning_rate": 1.5891472868217056e-06, + "loss": 0.8645, + "step": 938 + }, + { + "epoch": 1.455813953488372, + "grad_norm": 1.396388292312622, + "learning_rate": 1.5848406546080965e-06, + "loss": 0.9317, + "step": 939 + }, + { + "epoch": 1.4573643410852712, + "grad_norm": 1.5560067892074585, + "learning_rate": 1.5805340223944877e-06, + "loss": 0.8483, + "step": 940 + }, + { + "epoch": 1.4573643410852712, + "eval_loss": 1.0217546224594116, + "eval_runtime": 46.6616, + "eval_samples_per_second": 21.431, + "eval_steps_per_second": 1.35, + "step": 940 + }, + { + "epoch": 1.4589147286821706, + "grad_norm": 1.4226781129837036, + "learning_rate": 1.5762273901808787e-06, + "loss": 0.896, + "step": 941 + }, + { + "epoch": 1.4604651162790698, + "grad_norm": 1.2623789310455322, + "learning_rate": 1.5719207579672696e-06, + "loss": 0.875, + "step": 942 + }, + { + "epoch": 1.462015503875969, + "grad_norm": 1.440442442893982, + "learning_rate": 1.5676141257536608e-06, + "loss": 0.8916, + "step": 943 + }, + { + "epoch": 1.4635658914728682, + "grad_norm": 1.4717267751693726, + "learning_rate": 1.563307493540052e-06, + "loss": 0.8684, + "step": 944 + }, + { + "epoch": 1.4651162790697674, + "grad_norm": 1.418413758277893, + "learning_rate": 1.559000861326443e-06, + "loss": 0.8677, + "step": 945 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 1.4560842514038086, + "learning_rate": 1.554694229112834e-06, + "loss": 0.8496, + "step": 946 + }, + { + "epoch": 1.468217054263566, + "grad_norm": 1.7570191621780396, + "learning_rate": 1.550387596899225e-06, + "loss": 0.9065, + "step": 947 + }, + { + "epoch": 1.4697674418604652, + "grad_norm": 1.2934919595718384, + "learning_rate": 1.5460809646856159e-06, + "loss": 0.8618, + "step": 948 + }, + { + "epoch": 1.4713178294573643, + "grad_norm": 1.404919147491455, + "learning_rate": 1.5417743324720068e-06, + "loss": 0.8713, + "step": 949 + }, + { + "epoch": 1.4728682170542635, + "grad_norm": 1.5484105348587036, + "learning_rate": 1.537467700258398e-06, + "loss": 0.8946, + "step": 950 + }, + { + "epoch": 1.4728682170542635, + "eval_loss": 1.0181382894515991, + "eval_runtime": 46.6806, + "eval_samples_per_second": 21.422, + "eval_steps_per_second": 1.35, + "step": 950 + }, + { + "epoch": 1.474418604651163, + "grad_norm": 1.3862688541412354, + "learning_rate": 1.5331610680447892e-06, + "loss": 0.8691, + "step": 951 + }, + { + "epoch": 1.4759689922480619, + "grad_norm": 1.417663335800171, + "learning_rate": 1.5288544358311802e-06, + "loss": 0.8743, + "step": 952 + }, + { + "epoch": 1.4775193798449613, + "grad_norm": 1.374737024307251, + "learning_rate": 1.5245478036175711e-06, + "loss": 0.8856, + "step": 953 + }, + { + "epoch": 1.4790697674418605, + "grad_norm": 1.3962092399597168, + "learning_rate": 1.5202411714039621e-06, + "loss": 0.9071, + "step": 954 + }, + { + "epoch": 1.4806201550387597, + "grad_norm": 1.3754186630249023, + "learning_rate": 1.5159345391903533e-06, + "loss": 0.879, + "step": 955 + }, + { + "epoch": 1.4821705426356588, + "grad_norm": 1.3639583587646484, + "learning_rate": 1.5116279069767443e-06, + "loss": 0.853, + "step": 956 + }, + { + "epoch": 1.483720930232558, + "grad_norm": 1.312952995300293, + "learning_rate": 1.5073212747631352e-06, + "loss": 0.8534, + "step": 957 + }, + { + "epoch": 1.4852713178294574, + "grad_norm": 1.432968258857727, + "learning_rate": 1.5030146425495264e-06, + "loss": 0.8553, + "step": 958 + }, + { + "epoch": 1.4868217054263566, + "grad_norm": 1.5956652164459229, + "learning_rate": 1.4987080103359176e-06, + "loss": 0.8849, + "step": 959 + }, + { + "epoch": 1.4883720930232558, + "grad_norm": 1.5594089031219482, + "learning_rate": 1.4944013781223086e-06, + "loss": 0.8654, + "step": 960 + }, + { + "epoch": 1.4883720930232558, + "eval_loss": 1.017091989517212, + "eval_runtime": 46.7467, + "eval_samples_per_second": 21.392, + "eval_steps_per_second": 1.348, + "step": 960 + }, + { + "epoch": 1.489922480620155, + "grad_norm": 1.3392845392227173, + "learning_rate": 1.4900947459086995e-06, + "loss": 0.882, + "step": 961 + }, + { + "epoch": 1.4914728682170542, + "grad_norm": 1.3368501663208008, + "learning_rate": 1.4857881136950905e-06, + "loss": 0.8967, + "step": 962 + }, + { + "epoch": 1.4930232558139536, + "grad_norm": 1.3974400758743286, + "learning_rate": 1.4814814814814815e-06, + "loss": 0.8687, + "step": 963 + }, + { + "epoch": 1.4945736434108527, + "grad_norm": 1.4317644834518433, + "learning_rate": 1.4771748492678724e-06, + "loss": 0.8867, + "step": 964 + }, + { + "epoch": 1.496124031007752, + "grad_norm": 1.5514066219329834, + "learning_rate": 1.4728682170542638e-06, + "loss": 0.8486, + "step": 965 + }, + { + "epoch": 1.4976744186046511, + "grad_norm": 1.5198034048080444, + "learning_rate": 1.4685615848406548e-06, + "loss": 0.8577, + "step": 966 + }, + { + "epoch": 1.4992248062015503, + "grad_norm": 1.3727871179580688, + "learning_rate": 1.4642549526270457e-06, + "loss": 0.8795, + "step": 967 + }, + { + "epoch": 1.5007751937984497, + "grad_norm": 1.3373262882232666, + "learning_rate": 1.4599483204134367e-06, + "loss": 0.8795, + "step": 968 + }, + { + "epoch": 1.5023255813953489, + "grad_norm": 1.358616590499878, + "learning_rate": 1.455641688199828e-06, + "loss": 0.8924, + "step": 969 + }, + { + "epoch": 1.503875968992248, + "grad_norm": 1.3942570686340332, + "learning_rate": 1.4513350559862189e-06, + "loss": 0.8977, + "step": 970 + }, + { + "epoch": 1.503875968992248, + "eval_loss": 1.018630027770996, + "eval_runtime": 46.7829, + "eval_samples_per_second": 21.375, + "eval_steps_per_second": 1.347, + "step": 970 + }, + { + "epoch": 1.5054263565891473, + "grad_norm": 1.560254454612732, + "learning_rate": 1.4470284237726098e-06, + "loss": 0.8894, + "step": 971 + }, + { + "epoch": 1.5069767441860464, + "grad_norm": 1.3579188585281372, + "learning_rate": 1.442721791559001e-06, + "loss": 0.8544, + "step": 972 + }, + { + "epoch": 1.5085271317829458, + "grad_norm": 1.4697273969650269, + "learning_rate": 1.4384151593453922e-06, + "loss": 0.896, + "step": 973 + }, + { + "epoch": 1.5100775193798448, + "grad_norm": 1.4097820520401, + "learning_rate": 1.4341085271317832e-06, + "loss": 0.8713, + "step": 974 + }, + { + "epoch": 1.5116279069767442, + "grad_norm": 1.326789379119873, + "learning_rate": 1.4298018949181741e-06, + "loss": 0.8309, + "step": 975 + }, + { + "epoch": 1.5131782945736434, + "grad_norm": 1.573042631149292, + "learning_rate": 1.425495262704565e-06, + "loss": 0.8647, + "step": 976 + }, + { + "epoch": 1.5147286821705426, + "grad_norm": 1.374802589416504, + "learning_rate": 1.421188630490956e-06, + "loss": 0.9035, + "step": 977 + }, + { + "epoch": 1.516279069767442, + "grad_norm": 1.363222360610962, + "learning_rate": 1.416881998277347e-06, + "loss": 0.9028, + "step": 978 + }, + { + "epoch": 1.517829457364341, + "grad_norm": 1.6008557081222534, + "learning_rate": 1.4125753660637384e-06, + "loss": 0.9165, + "step": 979 + }, + { + "epoch": 1.5193798449612403, + "grad_norm": 1.4018700122833252, + "learning_rate": 1.4082687338501294e-06, + "loss": 0.879, + "step": 980 + }, + { + "epoch": 1.5193798449612403, + "eval_loss": 1.0206389427185059, + "eval_runtime": 47.0863, + "eval_samples_per_second": 21.238, + "eval_steps_per_second": 1.338, + "step": 980 + }, + { + "epoch": 1.5209302325581395, + "grad_norm": 1.3979579210281372, + "learning_rate": 1.4039621016365204e-06, + "loss": 0.8617, + "step": 981 + }, + { + "epoch": 1.5224806201550387, + "grad_norm": 1.4407587051391602, + "learning_rate": 1.3996554694229113e-06, + "loss": 0.8488, + "step": 982 + }, + { + "epoch": 1.5240310077519381, + "grad_norm": 1.4984155893325806, + "learning_rate": 1.3953488372093025e-06, + "loss": 0.8823, + "step": 983 + }, + { + "epoch": 1.525581395348837, + "grad_norm": 1.3611596822738647, + "learning_rate": 1.3910422049956935e-06, + "loss": 0.8595, + "step": 984 + }, + { + "epoch": 1.5271317829457365, + "grad_norm": 1.49971604347229, + "learning_rate": 1.3867355727820844e-06, + "loss": 0.9004, + "step": 985 + }, + { + "epoch": 1.5286821705426357, + "grad_norm": 1.4561238288879395, + "learning_rate": 1.3824289405684754e-06, + "loss": 0.8699, + "step": 986 + }, + { + "epoch": 1.5302325581395348, + "grad_norm": 1.4849098920822144, + "learning_rate": 1.3781223083548668e-06, + "loss": 0.8434, + "step": 987 + }, + { + "epoch": 1.5317829457364343, + "grad_norm": 1.4471638202667236, + "learning_rate": 1.3738156761412578e-06, + "loss": 0.8643, + "step": 988 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 1.2716907262802124, + "learning_rate": 1.3695090439276487e-06, + "loss": 0.8657, + "step": 989 + }, + { + "epoch": 1.5348837209302326, + "grad_norm": 1.3805012702941895, + "learning_rate": 1.3652024117140397e-06, + "loss": 0.8757, + "step": 990 + }, + { + "epoch": 1.5348837209302326, + "eval_loss": 1.0188567638397217, + "eval_runtime": 46.9241, + "eval_samples_per_second": 21.311, + "eval_steps_per_second": 1.343, + "step": 990 + }, + { + "epoch": 1.5364341085271318, + "grad_norm": 1.333486557006836, + "learning_rate": 1.3608957795004307e-06, + "loss": 0.8503, + "step": 991 + }, + { + "epoch": 1.537984496124031, + "grad_norm": 1.3029228448867798, + "learning_rate": 1.3565891472868216e-06, + "loss": 0.8412, + "step": 992 + }, + { + "epoch": 1.5395348837209304, + "grad_norm": 1.4335917234420776, + "learning_rate": 1.3522825150732128e-06, + "loss": 0.8768, + "step": 993 + }, + { + "epoch": 1.5410852713178294, + "grad_norm": 1.6070386171340942, + "learning_rate": 1.347975882859604e-06, + "loss": 0.8478, + "step": 994 + }, + { + "epoch": 1.5426356589147288, + "grad_norm": 1.4501957893371582, + "learning_rate": 1.343669250645995e-06, + "loss": 0.8681, + "step": 995 + }, + { + "epoch": 1.544186046511628, + "grad_norm": 1.4788450002670288, + "learning_rate": 1.339362618432386e-06, + "loss": 0.8757, + "step": 996 + }, + { + "epoch": 1.5457364341085271, + "grad_norm": 1.4308056831359863, + "learning_rate": 1.3350559862187771e-06, + "loss": 0.8965, + "step": 997 + }, + { + "epoch": 1.5472868217054263, + "grad_norm": 1.5292010307312012, + "learning_rate": 1.330749354005168e-06, + "loss": 0.8452, + "step": 998 + }, + { + "epoch": 1.5488372093023255, + "grad_norm": 1.4352688789367676, + "learning_rate": 1.326442721791559e-06, + "loss": 0.8677, + "step": 999 + }, + { + "epoch": 1.550387596899225, + "grad_norm": 1.3322887420654297, + "learning_rate": 1.32213608957795e-06, + "loss": 0.8788, + "step": 1000 + }, + { + "epoch": 1.550387596899225, + "eval_loss": 1.014811635017395, + "eval_runtime": 46.8871, + "eval_samples_per_second": 21.328, + "eval_steps_per_second": 1.344, + "step": 1000 + }, + { + "epoch": 1.551937984496124, + "grad_norm": 1.272476077079773, + "learning_rate": 1.3178294573643414e-06, + "loss": 0.8553, + "step": 1001 + }, + { + "epoch": 1.5534883720930233, + "grad_norm": 1.3475803136825562, + "learning_rate": 1.3135228251507324e-06, + "loss": 0.9133, + "step": 1002 + }, + { + "epoch": 1.5550387596899224, + "grad_norm": 1.677804708480835, + "learning_rate": 1.3092161929371233e-06, + "loss": 0.8689, + "step": 1003 + }, + { + "epoch": 1.5565891472868216, + "grad_norm": 1.4577577114105225, + "learning_rate": 1.3049095607235143e-06, + "loss": 0.8971, + "step": 1004 + }, + { + "epoch": 1.558139534883721, + "grad_norm": 1.408029317855835, + "learning_rate": 1.3006029285099053e-06, + "loss": 0.8724, + "step": 1005 + }, + { + "epoch": 1.55968992248062, + "grad_norm": 1.579338788986206, + "learning_rate": 1.2962962962962962e-06, + "loss": 0.889, + "step": 1006 + }, + { + "epoch": 1.5612403100775194, + "grad_norm": 1.338676929473877, + "learning_rate": 1.2919896640826874e-06, + "loss": 0.8721, + "step": 1007 + }, + { + "epoch": 1.5627906976744186, + "grad_norm": 1.50413978099823, + "learning_rate": 1.2876830318690786e-06, + "loss": 0.8771, + "step": 1008 + }, + { + "epoch": 1.5643410852713178, + "grad_norm": 1.3786473274230957, + "learning_rate": 1.2833763996554696e-06, + "loss": 0.859, + "step": 1009 + }, + { + "epoch": 1.5658914728682172, + "grad_norm": 1.5475035905838013, + "learning_rate": 1.2790697674418605e-06, + "loss": 0.885, + "step": 1010 + }, + { + "epoch": 1.5658914728682172, + "eval_loss": 1.0137555599212646, + "eval_runtime": 46.6808, + "eval_samples_per_second": 21.422, + "eval_steps_per_second": 1.35, + "step": 1010 + }, + { + "epoch": 1.5674418604651161, + "grad_norm": 1.5142121315002441, + "learning_rate": 1.2747631352282517e-06, + "loss": 0.8622, + "step": 1011 + }, + { + "epoch": 1.5689922480620155, + "grad_norm": 1.3826631307601929, + "learning_rate": 1.2704565030146427e-06, + "loss": 0.8431, + "step": 1012 + }, + { + "epoch": 1.5705426356589147, + "grad_norm": 1.3889191150665283, + "learning_rate": 1.2661498708010337e-06, + "loss": 0.8674, + "step": 1013 + }, + { + "epoch": 1.572093023255814, + "grad_norm": 1.5307679176330566, + "learning_rate": 1.2618432385874246e-06, + "loss": 0.8782, + "step": 1014 + }, + { + "epoch": 1.5736434108527133, + "grad_norm": 1.6250945329666138, + "learning_rate": 1.257536606373816e-06, + "loss": 0.8839, + "step": 1015 + }, + { + "epoch": 1.5751937984496123, + "grad_norm": 1.3860634565353394, + "learning_rate": 1.253229974160207e-06, + "loss": 0.8722, + "step": 1016 + }, + { + "epoch": 1.5767441860465117, + "grad_norm": 1.4336800575256348, + "learning_rate": 1.248923341946598e-06, + "loss": 0.8492, + "step": 1017 + }, + { + "epoch": 1.5782945736434109, + "grad_norm": 1.4651581048965454, + "learning_rate": 1.244616709732989e-06, + "loss": 0.8478, + "step": 1018 + }, + { + "epoch": 1.57984496124031, + "grad_norm": 1.3516453504562378, + "learning_rate": 1.2403100775193799e-06, + "loss": 0.887, + "step": 1019 + }, + { + "epoch": 1.5813953488372094, + "grad_norm": 1.4061412811279297, + "learning_rate": 1.2360034453057709e-06, + "loss": 0.8648, + "step": 1020 + }, + { + "epoch": 1.5813953488372094, + "eval_loss": 1.0146329402923584, + "eval_runtime": 46.4999, + "eval_samples_per_second": 21.505, + "eval_steps_per_second": 1.355, + "step": 1020 + }, + { + "epoch": 1.5829457364341084, + "grad_norm": 1.4939271211624146, + "learning_rate": 1.231696813092162e-06, + "loss": 0.8797, + "step": 1021 + }, + { + "epoch": 1.5844961240310078, + "grad_norm": 1.8291614055633545, + "learning_rate": 1.227390180878553e-06, + "loss": 0.8553, + "step": 1022 + }, + { + "epoch": 1.586046511627907, + "grad_norm": 1.4999667406082153, + "learning_rate": 1.2230835486649442e-06, + "loss": 0.821, + "step": 1023 + }, + { + "epoch": 1.5875968992248062, + "grad_norm": 1.3863446712493896, + "learning_rate": 1.2187769164513351e-06, + "loss": 0.863, + "step": 1024 + }, + { + "epoch": 1.5891472868217056, + "grad_norm": 1.6713688373565674, + "learning_rate": 1.2144702842377263e-06, + "loss": 0.9091, + "step": 1025 + }, + { + "epoch": 1.5906976744186045, + "grad_norm": 1.4036214351654053, + "learning_rate": 1.2101636520241173e-06, + "loss": 0.8798, + "step": 1026 + }, + { + "epoch": 1.592248062015504, + "grad_norm": 1.470150113105774, + "learning_rate": 1.2058570198105083e-06, + "loss": 0.8649, + "step": 1027 + }, + { + "epoch": 1.5937984496124031, + "grad_norm": 1.312363862991333, + "learning_rate": 1.2015503875968994e-06, + "loss": 0.8768, + "step": 1028 + }, + { + "epoch": 1.5953488372093023, + "grad_norm": 1.326096534729004, + "learning_rate": 1.1972437553832904e-06, + "loss": 0.8694, + "step": 1029 + }, + { + "epoch": 1.5968992248062015, + "grad_norm": 1.5062940120697021, + "learning_rate": 1.1929371231696814e-06, + "loss": 0.8576, + "step": 1030 + }, + { + "epoch": 1.5968992248062015, + "eval_loss": 1.0147876739501953, + "eval_runtime": 46.4171, + "eval_samples_per_second": 21.544, + "eval_steps_per_second": 1.357, + "step": 1030 + }, + { + "epoch": 1.5984496124031007, + "grad_norm": 1.51321280002594, + "learning_rate": 1.1886304909560723e-06, + "loss": 0.8853, + "step": 1031 + }, + { + "epoch": 1.6, + "grad_norm": 1.4676376581192017, + "learning_rate": 1.1843238587424635e-06, + "loss": 0.8764, + "step": 1032 + }, + { + "epoch": 1.6015503875968993, + "grad_norm": 1.5837122201919556, + "learning_rate": 1.1800172265288545e-06, + "loss": 0.8573, + "step": 1033 + }, + { + "epoch": 1.6031007751937985, + "grad_norm": 1.3964475393295288, + "learning_rate": 1.1757105943152455e-06, + "loss": 0.8533, + "step": 1034 + }, + { + "epoch": 1.6046511627906976, + "grad_norm": 1.3663526773452759, + "learning_rate": 1.1714039621016366e-06, + "loss": 0.87, + "step": 1035 + }, + { + "epoch": 1.6062015503875968, + "grad_norm": 1.3615703582763672, + "learning_rate": 1.1670973298880276e-06, + "loss": 0.8873, + "step": 1036 + }, + { + "epoch": 1.6077519379844962, + "grad_norm": 1.720268726348877, + "learning_rate": 1.1627906976744188e-06, + "loss": 0.8814, + "step": 1037 + }, + { + "epoch": 1.6093023255813952, + "grad_norm": 1.7937016487121582, + "learning_rate": 1.1584840654608098e-06, + "loss": 0.8723, + "step": 1038 + }, + { + "epoch": 1.6108527131782946, + "grad_norm": 1.2060179710388184, + "learning_rate": 1.1541774332472007e-06, + "loss": 0.8176, + "step": 1039 + }, + { + "epoch": 1.6124031007751938, + "grad_norm": 1.556498408317566, + "learning_rate": 1.149870801033592e-06, + "loss": 0.8536, + "step": 1040 + }, + { + "epoch": 1.6124031007751938, + "eval_loss": 1.0148112773895264, + "eval_runtime": 46.3077, + "eval_samples_per_second": 21.595, + "eval_steps_per_second": 1.36, + "step": 1040 + }, + { + "epoch": 1.613953488372093, + "grad_norm": 1.3427506685256958, + "learning_rate": 1.1455641688199829e-06, + "loss": 0.8651, + "step": 1041 + }, + { + "epoch": 1.6155038759689924, + "grad_norm": 1.2598917484283447, + "learning_rate": 1.141257536606374e-06, + "loss": 0.8207, + "step": 1042 + }, + { + "epoch": 1.6170542635658913, + "grad_norm": 1.4230711460113525, + "learning_rate": 1.136950904392765e-06, + "loss": 0.8898, + "step": 1043 + }, + { + "epoch": 1.6186046511627907, + "grad_norm": 1.5285265445709229, + "learning_rate": 1.132644272179156e-06, + "loss": 0.8667, + "step": 1044 + }, + { + "epoch": 1.62015503875969, + "grad_norm": 1.5158048868179321, + "learning_rate": 1.128337639965547e-06, + "loss": 0.8694, + "step": 1045 + }, + { + "epoch": 1.621705426356589, + "grad_norm": 1.2861136198043823, + "learning_rate": 1.1240310077519381e-06, + "loss": 0.8414, + "step": 1046 + }, + { + "epoch": 1.6232558139534885, + "grad_norm": 1.4673242568969727, + "learning_rate": 1.119724375538329e-06, + "loss": 0.8825, + "step": 1047 + }, + { + "epoch": 1.6248062015503875, + "grad_norm": 1.460770845413208, + "learning_rate": 1.11541774332472e-06, + "loss": 0.8739, + "step": 1048 + }, + { + "epoch": 1.6263565891472869, + "grad_norm": 1.6731219291687012, + "learning_rate": 1.111111111111111e-06, + "loss": 0.8759, + "step": 1049 + }, + { + "epoch": 1.627906976744186, + "grad_norm": 1.4495750665664673, + "learning_rate": 1.1068044788975022e-06, + "loss": 0.8499, + "step": 1050 + }, + { + "epoch": 1.627906976744186, + "eval_loss": 1.014420986175537, + "eval_runtime": 46.4748, + "eval_samples_per_second": 21.517, + "eval_steps_per_second": 1.356, + "step": 1050 + }, + { + "epoch": 1.6294573643410852, + "grad_norm": 1.457659125328064, + "learning_rate": 1.1024978466838932e-06, + "loss": 0.8712, + "step": 1051 + }, + { + "epoch": 1.6310077519379846, + "grad_norm": 1.3786025047302246, + "learning_rate": 1.0981912144702844e-06, + "loss": 0.872, + "step": 1052 + }, + { + "epoch": 1.6325581395348836, + "grad_norm": 1.3944154977798462, + "learning_rate": 1.0938845822566753e-06, + "loss": 0.9201, + "step": 1053 + }, + { + "epoch": 1.634108527131783, + "grad_norm": 1.571994662284851, + "learning_rate": 1.0895779500430665e-06, + "loss": 0.8853, + "step": 1054 + }, + { + "epoch": 1.6356589147286822, + "grad_norm": 1.4556063413619995, + "learning_rate": 1.0852713178294575e-06, + "loss": 0.8827, + "step": 1055 + }, + { + "epoch": 1.6372093023255814, + "grad_norm": 1.5137085914611816, + "learning_rate": 1.0809646856158484e-06, + "loss": 0.8885, + "step": 1056 + }, + { + "epoch": 1.6387596899224808, + "grad_norm": 1.3905104398727417, + "learning_rate": 1.0766580534022396e-06, + "loss": 0.8917, + "step": 1057 + }, + { + "epoch": 1.6403100775193797, + "grad_norm": 1.4482052326202393, + "learning_rate": 1.0723514211886306e-06, + "loss": 0.8912, + "step": 1058 + }, + { + "epoch": 1.6418604651162791, + "grad_norm": 1.3725974559783936, + "learning_rate": 1.0680447889750216e-06, + "loss": 0.8918, + "step": 1059 + }, + { + "epoch": 1.6434108527131783, + "grad_norm": 1.3311225175857544, + "learning_rate": 1.0637381567614125e-06, + "loss": 0.8609, + "step": 1060 + }, + { + "epoch": 1.6434108527131783, + "eval_loss": 1.0134074687957764, + "eval_runtime": 46.4133, + "eval_samples_per_second": 21.546, + "eval_steps_per_second": 1.357, + "step": 1060 + }, + { + "epoch": 1.6449612403100775, + "grad_norm": 1.3237463235855103, + "learning_rate": 1.0594315245478037e-06, + "loss": 0.8416, + "step": 1061 + }, + { + "epoch": 1.6465116279069767, + "grad_norm": 1.3081848621368408, + "learning_rate": 1.0551248923341947e-06, + "loss": 0.8836, + "step": 1062 + }, + { + "epoch": 1.6480620155038759, + "grad_norm": 1.6118297576904297, + "learning_rate": 1.0508182601205856e-06, + "loss": 0.8822, + "step": 1063 + }, + { + "epoch": 1.6496124031007753, + "grad_norm": 1.3589472770690918, + "learning_rate": 1.0465116279069768e-06, + "loss": 0.8715, + "step": 1064 + }, + { + "epoch": 1.6511627906976745, + "grad_norm": 1.4369632005691528, + "learning_rate": 1.0422049956933678e-06, + "loss": 0.8416, + "step": 1065 + }, + { + "epoch": 1.6527131782945736, + "grad_norm": 1.4928234815597534, + "learning_rate": 1.037898363479759e-06, + "loss": 0.8723, + "step": 1066 + }, + { + "epoch": 1.6542635658914728, + "grad_norm": 1.333701729774475, + "learning_rate": 1.03359173126615e-06, + "loss": 0.8692, + "step": 1067 + }, + { + "epoch": 1.655813953488372, + "grad_norm": 1.324978232383728, + "learning_rate": 1.0292850990525411e-06, + "loss": 0.8549, + "step": 1068 + }, + { + "epoch": 1.6573643410852714, + "grad_norm": 1.4882620573043823, + "learning_rate": 1.024978466838932e-06, + "loss": 0.8732, + "step": 1069 + }, + { + "epoch": 1.6589147286821704, + "grad_norm": 1.3324483633041382, + "learning_rate": 1.020671834625323e-06, + "loss": 0.8902, + "step": 1070 + }, + { + "epoch": 1.6589147286821704, + "eval_loss": 1.0144659280776978, + "eval_runtime": 46.3728, + "eval_samples_per_second": 21.564, + "eval_steps_per_second": 1.359, + "step": 1070 + }, + { + "epoch": 1.6604651162790698, + "grad_norm": 1.2734097242355347, + "learning_rate": 1.0163652024117142e-06, + "loss": 0.8381, + "step": 1071 + }, + { + "epoch": 1.662015503875969, + "grad_norm": 1.3319727182388306, + "learning_rate": 1.0120585701981052e-06, + "loss": 0.8616, + "step": 1072 + }, + { + "epoch": 1.6635658914728682, + "grad_norm": 1.4507578611373901, + "learning_rate": 1.0077519379844962e-06, + "loss": 0.8615, + "step": 1073 + }, + { + "epoch": 1.6651162790697676, + "grad_norm": 1.4849218130111694, + "learning_rate": 1.0034453057708871e-06, + "loss": 0.8825, + "step": 1074 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.3075767755508423, + "learning_rate": 9.991386735572783e-07, + "loss": 0.8813, + "step": 1075 + }, + { + "epoch": 1.668217054263566, + "grad_norm": 1.359571933746338, + "learning_rate": 9.948320413436693e-07, + "loss": 0.8874, + "step": 1076 + }, + { + "epoch": 1.669767441860465, + "grad_norm": 1.582699179649353, + "learning_rate": 9.905254091300603e-07, + "loss": 0.8527, + "step": 1077 + }, + { + "epoch": 1.6713178294573643, + "grad_norm": 1.3974062204360962, + "learning_rate": 9.862187769164514e-07, + "loss": 0.8754, + "step": 1078 + }, + { + "epoch": 1.6728682170542637, + "grad_norm": 1.4189597368240356, + "learning_rate": 9.819121447028424e-07, + "loss": 0.84, + "step": 1079 + }, + { + "epoch": 1.6744186046511627, + "grad_norm": 1.387370228767395, + "learning_rate": 9.776055124892336e-07, + "loss": 0.8757, + "step": 1080 + }, + { + "epoch": 1.6744186046511627, + "eval_loss": 1.0135537385940552, + "eval_runtime": 46.4402, + "eval_samples_per_second": 21.533, + "eval_steps_per_second": 1.357, + "step": 1080 + }, + { + "epoch": 1.675968992248062, + "grad_norm": 1.3377106189727783, + "learning_rate": 9.732988802756245e-07, + "loss": 0.8955, + "step": 1081 + }, + { + "epoch": 1.6775193798449612, + "grad_norm": 1.3911293745040894, + "learning_rate": 9.689922480620157e-07, + "loss": 0.8785, + "step": 1082 + }, + { + "epoch": 1.6790697674418604, + "grad_norm": 1.2773281335830688, + "learning_rate": 9.646856158484067e-07, + "loss": 0.8504, + "step": 1083 + }, + { + "epoch": 1.6806201550387598, + "grad_norm": 1.370222806930542, + "learning_rate": 9.603789836347977e-07, + "loss": 0.8879, + "step": 1084 + }, + { + "epoch": 1.6821705426356588, + "grad_norm": 1.3102188110351562, + "learning_rate": 9.560723514211886e-07, + "loss": 0.8193, + "step": 1085 + }, + { + "epoch": 1.6837209302325582, + "grad_norm": 1.5439156293869019, + "learning_rate": 9.517657192075798e-07, + "loss": 0.8803, + "step": 1086 + }, + { + "epoch": 1.6852713178294574, + "grad_norm": 1.530087947845459, + "learning_rate": 9.474590869939708e-07, + "loss": 0.8543, + "step": 1087 + }, + { + "epoch": 1.6868217054263566, + "grad_norm": 1.5727529525756836, + "learning_rate": 9.431524547803617e-07, + "loss": 0.9076, + "step": 1088 + }, + { + "epoch": 1.688372093023256, + "grad_norm": 1.666233777999878, + "learning_rate": 9.388458225667529e-07, + "loss": 0.8781, + "step": 1089 + }, + { + "epoch": 1.689922480620155, + "grad_norm": 1.4620155096054077, + "learning_rate": 9.345391903531439e-07, + "loss": 0.851, + "step": 1090 + }, + { + "epoch": 1.689922480620155, + "eval_loss": 1.0132611989974976, + "eval_runtime": 46.4938, + "eval_samples_per_second": 21.508, + "eval_steps_per_second": 1.355, + "step": 1090 + }, + { + "epoch": 1.6914728682170543, + "grad_norm": 1.3910199403762817, + "learning_rate": 9.30232558139535e-07, + "loss": 0.8891, + "step": 1091 + }, + { + "epoch": 1.6930232558139535, + "grad_norm": 1.347779631614685, + "learning_rate": 9.259259259259259e-07, + "loss": 0.8509, + "step": 1092 + }, + { + "epoch": 1.6945736434108527, + "grad_norm": 1.4723174571990967, + "learning_rate": 9.216192937123171e-07, + "loss": 0.8737, + "step": 1093 + }, + { + "epoch": 1.6961240310077519, + "grad_norm": 1.3850101232528687, + "learning_rate": 9.173126614987081e-07, + "loss": 0.8863, + "step": 1094 + }, + { + "epoch": 1.697674418604651, + "grad_norm": 1.3053226470947266, + "learning_rate": 9.13006029285099e-07, + "loss": 0.8678, + "step": 1095 + }, + { + "epoch": 1.6992248062015505, + "grad_norm": 1.3607691526412964, + "learning_rate": 9.086993970714901e-07, + "loss": 0.8616, + "step": 1096 + }, + { + "epoch": 1.7007751937984497, + "grad_norm": 1.5045592784881592, + "learning_rate": 9.043927648578812e-07, + "loss": 0.8576, + "step": 1097 + }, + { + "epoch": 1.7023255813953488, + "grad_norm": 1.4135942459106445, + "learning_rate": 9.000861326442723e-07, + "loss": 0.8837, + "step": 1098 + }, + { + "epoch": 1.703875968992248, + "grad_norm": 1.3827612400054932, + "learning_rate": 8.957795004306632e-07, + "loss": 0.8609, + "step": 1099 + }, + { + "epoch": 1.7054263565891472, + "grad_norm": 1.3499889373779297, + "learning_rate": 8.914728682170544e-07, + "loss": 0.8283, + "step": 1100 + }, + { + "epoch": 1.7054263565891472, + "eval_loss": 1.0121917724609375, + "eval_runtime": 46.4487, + "eval_samples_per_second": 21.529, + "eval_steps_per_second": 1.356, + "step": 1100 + }, + { + "epoch": 1.7069767441860466, + "grad_norm": 1.363226294517517, + "learning_rate": 8.871662360034454e-07, + "loss": 0.8827, + "step": 1101 + }, + { + "epoch": 1.7085271317829456, + "grad_norm": 1.5012890100479126, + "learning_rate": 8.828596037898364e-07, + "loss": 0.8636, + "step": 1102 + }, + { + "epoch": 1.710077519379845, + "grad_norm": 1.3506569862365723, + "learning_rate": 8.785529715762274e-07, + "loss": 0.8507, + "step": 1103 + }, + { + "epoch": 1.7116279069767442, + "grad_norm": 1.5143163204193115, + "learning_rate": 8.742463393626185e-07, + "loss": 0.8715, + "step": 1104 + }, + { + "epoch": 1.7131782945736433, + "grad_norm": 1.5398318767547607, + "learning_rate": 8.699397071490096e-07, + "loss": 0.823, + "step": 1105 + }, + { + "epoch": 1.7147286821705428, + "grad_norm": 1.3466542959213257, + "learning_rate": 8.656330749354005e-07, + "loss": 0.8737, + "step": 1106 + }, + { + "epoch": 1.7162790697674417, + "grad_norm": 1.4723925590515137, + "learning_rate": 8.613264427217917e-07, + "loss": 0.8641, + "step": 1107 + }, + { + "epoch": 1.7178294573643411, + "grad_norm": 1.493037462234497, + "learning_rate": 8.570198105081827e-07, + "loss": 0.8641, + "step": 1108 + }, + { + "epoch": 1.7193798449612403, + "grad_norm": 1.3498090505599976, + "learning_rate": 8.527131782945737e-07, + "loss": 0.8424, + "step": 1109 + }, + { + "epoch": 1.7209302325581395, + "grad_norm": 1.5638823509216309, + "learning_rate": 8.484065460809647e-07, + "loss": 0.8661, + "step": 1110 + }, + { + "epoch": 1.7209302325581395, + "eval_loss": 1.0125303268432617, + "eval_runtime": 46.5089, + "eval_samples_per_second": 21.501, + "eval_steps_per_second": 1.355, + "step": 1110 + }, + { + "epoch": 1.7224806201550389, + "grad_norm": 1.3510512113571167, + "learning_rate": 8.440999138673558e-07, + "loss": 0.8589, + "step": 1111 + }, + { + "epoch": 1.7240310077519378, + "grad_norm": 1.4735004901885986, + "learning_rate": 8.397932816537469e-07, + "loss": 0.8345, + "step": 1112 + }, + { + "epoch": 1.7255813953488373, + "grad_norm": 1.4031803607940674, + "learning_rate": 8.354866494401378e-07, + "loss": 0.8861, + "step": 1113 + }, + { + "epoch": 1.7271317829457364, + "grad_norm": 1.3703727722167969, + "learning_rate": 8.311800172265288e-07, + "loss": 0.8727, + "step": 1114 + }, + { + "epoch": 1.7286821705426356, + "grad_norm": 1.4153964519500732, + "learning_rate": 8.2687338501292e-07, + "loss": 0.8676, + "step": 1115 + }, + { + "epoch": 1.730232558139535, + "grad_norm": 1.292462944984436, + "learning_rate": 8.22566752799311e-07, + "loss": 0.8722, + "step": 1116 + }, + { + "epoch": 1.731782945736434, + "grad_norm": 1.3069566488265991, + "learning_rate": 8.18260120585702e-07, + "loss": 0.8322, + "step": 1117 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 1.288217544555664, + "learning_rate": 8.139534883720931e-07, + "loss": 0.8583, + "step": 1118 + }, + { + "epoch": 1.7348837209302326, + "grad_norm": 1.555336594581604, + "learning_rate": 8.096468561584842e-07, + "loss": 0.8624, + "step": 1119 + }, + { + "epoch": 1.7364341085271318, + "grad_norm": 1.4711161851882935, + "learning_rate": 8.053402239448752e-07, + "loss": 0.851, + "step": 1120 + }, + { + "epoch": 1.7364341085271318, + "eval_loss": 1.0113039016723633, + "eval_runtime": 46.4504, + "eval_samples_per_second": 21.528, + "eval_steps_per_second": 1.356, + "step": 1120 + }, + { + "epoch": 1.7379844961240312, + "grad_norm": 1.5582923889160156, + "learning_rate": 8.010335917312661e-07, + "loss": 0.872, + "step": 1121 + }, + { + "epoch": 1.7395348837209301, + "grad_norm": 1.3969173431396484, + "learning_rate": 7.967269595176573e-07, + "loss": 0.8279, + "step": 1122 + }, + { + "epoch": 1.7410852713178295, + "grad_norm": 1.5065845251083374, + "learning_rate": 7.924203273040483e-07, + "loss": 0.9051, + "step": 1123 + }, + { + "epoch": 1.7426356589147287, + "grad_norm": 1.520509123802185, + "learning_rate": 7.881136950904393e-07, + "loss": 0.8762, + "step": 1124 + }, + { + "epoch": 1.744186046511628, + "grad_norm": 1.4614084959030151, + "learning_rate": 7.838070628768304e-07, + "loss": 0.8638, + "step": 1125 + }, + { + "epoch": 1.745736434108527, + "grad_norm": 1.3072479963302612, + "learning_rate": 7.795004306632215e-07, + "loss": 0.8482, + "step": 1126 + }, + { + "epoch": 1.7472868217054263, + "grad_norm": 1.5456178188323975, + "learning_rate": 7.751937984496125e-07, + "loss": 0.8306, + "step": 1127 + }, + { + "epoch": 1.7488372093023257, + "grad_norm": 1.354901671409607, + "learning_rate": 7.708871662360034e-07, + "loss": 0.8581, + "step": 1128 + }, + { + "epoch": 1.7503875968992249, + "grad_norm": 1.367995262145996, + "learning_rate": 7.665805340223946e-07, + "loss": 0.8409, + "step": 1129 + }, + { + "epoch": 1.751937984496124, + "grad_norm": 1.4493037462234497, + "learning_rate": 7.622739018087856e-07, + "loss": 0.8783, + "step": 1130 + }, + { + "epoch": 1.751937984496124, + "eval_loss": 1.0115089416503906, + "eval_runtime": 46.5666, + "eval_samples_per_second": 21.475, + "eval_steps_per_second": 1.353, + "step": 1130 + }, + { + "epoch": 1.7534883720930232, + "grad_norm": 1.3681944608688354, + "learning_rate": 7.579672695951766e-07, + "loss": 0.8619, + "step": 1131 + }, + { + "epoch": 1.7550387596899224, + "grad_norm": 1.399470329284668, + "learning_rate": 7.536606373815676e-07, + "loss": 0.8456, + "step": 1132 + }, + { + "epoch": 1.7565891472868218, + "grad_norm": 1.2924598455429077, + "learning_rate": 7.493540051679588e-07, + "loss": 0.8667, + "step": 1133 + }, + { + "epoch": 1.7581395348837208, + "grad_norm": 1.578048825263977, + "learning_rate": 7.450473729543498e-07, + "loss": 0.8768, + "step": 1134 + }, + { + "epoch": 1.7596899224806202, + "grad_norm": 1.6144347190856934, + "learning_rate": 7.407407407407407e-07, + "loss": 0.8443, + "step": 1135 + }, + { + "epoch": 1.7612403100775194, + "grad_norm": 1.5569820404052734, + "learning_rate": 7.364341085271319e-07, + "loss": 0.8826, + "step": 1136 + }, + { + "epoch": 1.7627906976744185, + "grad_norm": 1.437939167022705, + "learning_rate": 7.321274763135229e-07, + "loss": 0.832, + "step": 1137 + }, + { + "epoch": 1.764341085271318, + "grad_norm": 1.333731770515442, + "learning_rate": 7.27820844099914e-07, + "loss": 0.8508, + "step": 1138 + }, + { + "epoch": 1.765891472868217, + "grad_norm": 1.3062009811401367, + "learning_rate": 7.235142118863049e-07, + "loss": 0.8724, + "step": 1139 + }, + { + "epoch": 1.7674418604651163, + "grad_norm": 1.471814513206482, + "learning_rate": 7.192075796726961e-07, + "loss": 0.8718, + "step": 1140 + }, + { + "epoch": 1.7674418604651163, + "eval_loss": 1.0100195407867432, + "eval_runtime": 46.5331, + "eval_samples_per_second": 21.49, + "eval_steps_per_second": 1.354, + "step": 1140 + }, + { + "epoch": 1.7689922480620155, + "grad_norm": 1.6204849481582642, + "learning_rate": 7.149009474590871e-07, + "loss": 0.8556, + "step": 1141 + }, + { + "epoch": 1.7705426356589147, + "grad_norm": 1.3823068141937256, + "learning_rate": 7.10594315245478e-07, + "loss": 0.8851, + "step": 1142 + }, + { + "epoch": 1.772093023255814, + "grad_norm": 1.2534395456314087, + "learning_rate": 7.062876830318692e-07, + "loss": 0.8517, + "step": 1143 + }, + { + "epoch": 1.773643410852713, + "grad_norm": 1.6374818086624146, + "learning_rate": 7.019810508182602e-07, + "loss": 0.8953, + "step": 1144 + }, + { + "epoch": 1.7751937984496124, + "grad_norm": 1.5260299444198608, + "learning_rate": 6.976744186046513e-07, + "loss": 0.8555, + "step": 1145 + }, + { + "epoch": 1.7767441860465116, + "grad_norm": 1.4138376712799072, + "learning_rate": 6.933677863910422e-07, + "loss": 0.8479, + "step": 1146 + }, + { + "epoch": 1.7782945736434108, + "grad_norm": 1.225590467453003, + "learning_rate": 6.890611541774334e-07, + "loss": 0.845, + "step": 1147 + }, + { + "epoch": 1.7798449612403102, + "grad_norm": 1.3658701181411743, + "learning_rate": 6.847545219638244e-07, + "loss": 0.8561, + "step": 1148 + }, + { + "epoch": 1.7813953488372092, + "grad_norm": 1.4470748901367188, + "learning_rate": 6.804478897502153e-07, + "loss": 0.8799, + "step": 1149 + }, + { + "epoch": 1.7829457364341086, + "grad_norm": 1.3451247215270996, + "learning_rate": 6.761412575366064e-07, + "loss": 0.8523, + "step": 1150 + }, + { + "epoch": 1.7829457364341086, + "eval_loss": 1.0102328062057495, + "eval_runtime": 46.5475, + "eval_samples_per_second": 21.483, + "eval_steps_per_second": 1.353, + "step": 1150 + }, + { + "epoch": 1.7844961240310078, + "grad_norm": 1.4519866704940796, + "learning_rate": 6.718346253229975e-07, + "loss": 0.8441, + "step": 1151 + }, + { + "epoch": 1.786046511627907, + "grad_norm": 1.3575239181518555, + "learning_rate": 6.675279931093886e-07, + "loss": 0.8829, + "step": 1152 + }, + { + "epoch": 1.7875968992248064, + "grad_norm": 1.427385926246643, + "learning_rate": 6.632213608957795e-07, + "loss": 0.8407, + "step": 1153 + }, + { + "epoch": 1.7891472868217053, + "grad_norm": 1.4608075618743896, + "learning_rate": 6.589147286821707e-07, + "loss": 0.864, + "step": 1154 + }, + { + "epoch": 1.7906976744186047, + "grad_norm": 1.3200838565826416, + "learning_rate": 6.546080964685617e-07, + "loss": 0.8551, + "step": 1155 + }, + { + "epoch": 1.792248062015504, + "grad_norm": 1.4095115661621094, + "learning_rate": 6.503014642549526e-07, + "loss": 0.8753, + "step": 1156 + }, + { + "epoch": 1.793798449612403, + "grad_norm": 1.334485411643982, + "learning_rate": 6.459948320413437e-07, + "loss": 0.8601, + "step": 1157 + }, + { + "epoch": 1.7953488372093023, + "grad_norm": 1.524133563041687, + "learning_rate": 6.416881998277348e-07, + "loss": 0.8746, + "step": 1158 + }, + { + "epoch": 1.7968992248062015, + "grad_norm": 1.4331588745117188, + "learning_rate": 6.373815676141259e-07, + "loss": 0.8826, + "step": 1159 + }, + { + "epoch": 1.7984496124031009, + "grad_norm": 1.455048680305481, + "learning_rate": 6.330749354005168e-07, + "loss": 0.8734, + "step": 1160 + }, + { + "epoch": 1.7984496124031009, + "eval_loss": 1.009870171546936, + "eval_runtime": 46.6815, + "eval_samples_per_second": 21.422, + "eval_steps_per_second": 1.35, + "step": 1160 + }, + { + "epoch": 1.8, + "grad_norm": 1.3782318830490112, + "learning_rate": 6.28768303186908e-07, + "loss": 0.8566, + "step": 1161 + }, + { + "epoch": 1.8015503875968992, + "grad_norm": 1.4073529243469238, + "learning_rate": 6.24461670973299e-07, + "loss": 0.8946, + "step": 1162 + }, + { + "epoch": 1.8031007751937984, + "grad_norm": 1.3667042255401611, + "learning_rate": 6.201550387596899e-07, + "loss": 0.8499, + "step": 1163 + }, + { + "epoch": 1.8046511627906976, + "grad_norm": 1.4060592651367188, + "learning_rate": 6.15848406546081e-07, + "loss": 0.8715, + "step": 1164 + }, + { + "epoch": 1.806201550387597, + "grad_norm": 1.335383415222168, + "learning_rate": 6.115417743324721e-07, + "loss": 0.8357, + "step": 1165 + }, + { + "epoch": 1.807751937984496, + "grad_norm": 1.4511020183563232, + "learning_rate": 6.072351421188632e-07, + "loss": 0.8505, + "step": 1166 + }, + { + "epoch": 1.8093023255813954, + "grad_norm": 1.4266198873519897, + "learning_rate": 6.029285099052541e-07, + "loss": 0.868, + "step": 1167 + }, + { + "epoch": 1.8108527131782945, + "grad_norm": 1.3813896179199219, + "learning_rate": 5.986218776916452e-07, + "loss": 0.8709, + "step": 1168 + }, + { + "epoch": 1.8124031007751937, + "grad_norm": 1.5237208604812622, + "learning_rate": 5.943152454780362e-07, + "loss": 0.8397, + "step": 1169 + }, + { + "epoch": 1.8139534883720931, + "grad_norm": 1.3090314865112305, + "learning_rate": 5.900086132644272e-07, + "loss": 0.8507, + "step": 1170 + }, + { + "epoch": 1.8139534883720931, + "eval_loss": 1.0094170570373535, + "eval_runtime": 46.5958, + "eval_samples_per_second": 21.461, + "eval_steps_per_second": 1.352, + "step": 1170 + }, + { + "epoch": 1.815503875968992, + "grad_norm": 1.3480397462844849, + "learning_rate": 5.857019810508183e-07, + "loss": 0.8762, + "step": 1171 + }, + { + "epoch": 1.8170542635658915, + "grad_norm": 1.4064971208572388, + "learning_rate": 5.813953488372094e-07, + "loss": 0.8769, + "step": 1172 + }, + { + "epoch": 1.8186046511627907, + "grad_norm": 1.3863223791122437, + "learning_rate": 5.770887166236004e-07, + "loss": 0.864, + "step": 1173 + }, + { + "epoch": 1.8201550387596899, + "grad_norm": 1.4081705808639526, + "learning_rate": 5.727820844099914e-07, + "loss": 0.8843, + "step": 1174 + }, + { + "epoch": 1.8217054263565893, + "grad_norm": 1.417191505432129, + "learning_rate": 5.684754521963825e-07, + "loss": 0.8526, + "step": 1175 + }, + { + "epoch": 1.8232558139534882, + "grad_norm": 1.395699143409729, + "learning_rate": 5.641688199827735e-07, + "loss": 0.8801, + "step": 1176 + }, + { + "epoch": 1.8248062015503876, + "grad_norm": 1.3280203342437744, + "learning_rate": 5.598621877691646e-07, + "loss": 0.8444, + "step": 1177 + }, + { + "epoch": 1.8263565891472868, + "grad_norm": 1.5310155153274536, + "learning_rate": 5.555555555555555e-07, + "loss": 0.8828, + "step": 1178 + }, + { + "epoch": 1.827906976744186, + "grad_norm": 1.2928513288497925, + "learning_rate": 5.512489233419466e-07, + "loss": 0.8882, + "step": 1179 + }, + { + "epoch": 1.8294573643410854, + "grad_norm": 1.4305659532546997, + "learning_rate": 5.469422911283377e-07, + "loss": 0.9073, + "step": 1180 + }, + { + "epoch": 1.8294573643410854, + "eval_loss": 1.0100611448287964, + "eval_runtime": 46.6091, + "eval_samples_per_second": 21.455, + "eval_steps_per_second": 1.352, + "step": 1180 + }, + { + "epoch": 1.8310077519379844, + "grad_norm": 1.4535592794418335, + "learning_rate": 5.426356589147287e-07, + "loss": 0.8876, + "step": 1181 + }, + { + "epoch": 1.8325581395348838, + "grad_norm": 1.358090877532959, + "learning_rate": 5.383290267011198e-07, + "loss": 0.8592, + "step": 1182 + }, + { + "epoch": 1.834108527131783, + "grad_norm": 1.328998327255249, + "learning_rate": 5.340223944875108e-07, + "loss": 0.8607, + "step": 1183 + }, + { + "epoch": 1.8356589147286821, + "grad_norm": 1.3628747463226318, + "learning_rate": 5.297157622739019e-07, + "loss": 0.8563, + "step": 1184 + }, + { + "epoch": 1.8372093023255816, + "grad_norm": 1.3805159330368042, + "learning_rate": 5.254091300602928e-07, + "loss": 0.8252, + "step": 1185 + }, + { + "epoch": 1.8387596899224805, + "grad_norm": 1.4368276596069336, + "learning_rate": 5.211024978466839e-07, + "loss": 0.8873, + "step": 1186 + }, + { + "epoch": 1.84031007751938, + "grad_norm": 1.3603229522705078, + "learning_rate": 5.16795865633075e-07, + "loss": 0.835, + "step": 1187 + }, + { + "epoch": 1.841860465116279, + "grad_norm": 1.41812002658844, + "learning_rate": 5.12489233419466e-07, + "loss": 0.8423, + "step": 1188 + }, + { + "epoch": 1.8434108527131783, + "grad_norm": 1.3339006900787354, + "learning_rate": 5.081826012058571e-07, + "loss": 0.8612, + "step": 1189 + }, + { + "epoch": 1.8449612403100775, + "grad_norm": 1.4747363328933716, + "learning_rate": 5.038759689922481e-07, + "loss": 0.8648, + "step": 1190 + }, + { + "epoch": 1.8449612403100775, + "eval_loss": 1.0100913047790527, + "eval_runtime": 46.6654, + "eval_samples_per_second": 21.429, + "eval_steps_per_second": 1.35, + "step": 1190 + }, + { + "epoch": 1.8465116279069766, + "grad_norm": 1.3146756887435913, + "learning_rate": 4.995693367786392e-07, + "loss": 0.8726, + "step": 1191 + }, + { + "epoch": 1.848062015503876, + "grad_norm": 1.419121265411377, + "learning_rate": 4.952627045650301e-07, + "loss": 0.8612, + "step": 1192 + }, + { + "epoch": 1.8496124031007752, + "grad_norm": 1.4017002582550049, + "learning_rate": 4.909560723514212e-07, + "loss": 0.861, + "step": 1193 + }, + { + "epoch": 1.8511627906976744, + "grad_norm": 1.3471308946609497, + "learning_rate": 4.866494401378123e-07, + "loss": 0.8722, + "step": 1194 + }, + { + "epoch": 1.8527131782945736, + "grad_norm": 1.2866394519805908, + "learning_rate": 4.823428079242033e-07, + "loss": 0.8506, + "step": 1195 + }, + { + "epoch": 1.8542635658914728, + "grad_norm": 1.3998876810073853, + "learning_rate": 4.780361757105943e-07, + "loss": 0.8552, + "step": 1196 + }, + { + "epoch": 1.8558139534883722, + "grad_norm": 1.5448180437088013, + "learning_rate": 4.737295434969854e-07, + "loss": 0.8772, + "step": 1197 + }, + { + "epoch": 1.8573643410852712, + "grad_norm": 1.33297860622406, + "learning_rate": 4.6942291128337646e-07, + "loss": 0.836, + "step": 1198 + }, + { + "epoch": 1.8589147286821706, + "grad_norm": 1.3891242742538452, + "learning_rate": 4.651162790697675e-07, + "loss": 0.8571, + "step": 1199 + }, + { + "epoch": 1.8604651162790697, + "grad_norm": 1.4276305437088013, + "learning_rate": 4.6080964685615856e-07, + "loss": 0.8513, + "step": 1200 + }, + { + "epoch": 1.8604651162790697, + "eval_loss": 1.0112069845199585, + "eval_runtime": 46.6809, + "eval_samples_per_second": 21.422, + "eval_steps_per_second": 1.35, + "step": 1200 + }, + { + "epoch": 1.862015503875969, + "grad_norm": 1.5008084774017334, + "learning_rate": 4.565030146425495e-07, + "loss": 0.8589, + "step": 1201 + }, + { + "epoch": 1.8635658914728683, + "grad_norm": 1.44911789894104, + "learning_rate": 4.521963824289406e-07, + "loss": 0.8717, + "step": 1202 + }, + { + "epoch": 1.8651162790697673, + "grad_norm": 1.4527829885482788, + "learning_rate": 4.478897502153316e-07, + "loss": 0.8712, + "step": 1203 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 1.5256608724594116, + "learning_rate": 4.435831180017227e-07, + "loss": 0.8731, + "step": 1204 + }, + { + "epoch": 1.8682170542635659, + "grad_norm": 1.342320203781128, + "learning_rate": 4.392764857881137e-07, + "loss": 0.8313, + "step": 1205 + }, + { + "epoch": 1.869767441860465, + "grad_norm": 1.4780925512313843, + "learning_rate": 4.349698535745048e-07, + "loss": 0.87, + "step": 1206 + }, + { + "epoch": 1.8713178294573645, + "grad_norm": 1.495001196861267, + "learning_rate": 4.3066322136089586e-07, + "loss": 0.872, + "step": 1207 + }, + { + "epoch": 1.8728682170542634, + "grad_norm": 1.3074004650115967, + "learning_rate": 4.2635658914728683e-07, + "loss": 0.8781, + "step": 1208 + }, + { + "epoch": 1.8744186046511628, + "grad_norm": 1.480075478553772, + "learning_rate": 4.220499569336779e-07, + "loss": 0.8689, + "step": 1209 + }, + { + "epoch": 1.875968992248062, + "grad_norm": 1.5911939144134521, + "learning_rate": 4.177433247200689e-07, + "loss": 0.8646, + "step": 1210 + }, + { + "epoch": 1.875968992248062, + "eval_loss": 1.0095446109771729, + "eval_runtime": 46.6513, + "eval_samples_per_second": 21.436, + "eval_steps_per_second": 1.35, + "step": 1210 + }, + { + "epoch": 1.8775193798449612, + "grad_norm": 1.456148386001587, + "learning_rate": 4.1343669250646e-07, + "loss": 0.8474, + "step": 1211 + }, + { + "epoch": 1.8790697674418606, + "grad_norm": 1.439658284187317, + "learning_rate": 4.09130060292851e-07, + "loss": 0.8873, + "step": 1212 + }, + { + "epoch": 1.8806201550387596, + "grad_norm": 1.3261785507202148, + "learning_rate": 4.048234280792421e-07, + "loss": 0.8622, + "step": 1213 + }, + { + "epoch": 1.882170542635659, + "grad_norm": 1.4313101768493652, + "learning_rate": 4.0051679586563306e-07, + "loss": 0.8569, + "step": 1214 + }, + { + "epoch": 1.8837209302325582, + "grad_norm": 1.4603917598724365, + "learning_rate": 3.9621016365202413e-07, + "loss": 0.8678, + "step": 1215 + }, + { + "epoch": 1.8852713178294573, + "grad_norm": 1.5106719732284546, + "learning_rate": 3.919035314384152e-07, + "loss": 0.8787, + "step": 1216 + }, + { + "epoch": 1.8868217054263567, + "grad_norm": 1.6415821313858032, + "learning_rate": 3.8759689922480623e-07, + "loss": 0.8721, + "step": 1217 + }, + { + "epoch": 1.8883720930232557, + "grad_norm": 1.4079430103302002, + "learning_rate": 3.832902670111973e-07, + "loss": 0.8608, + "step": 1218 + }, + { + "epoch": 1.889922480620155, + "grad_norm": 1.3679237365722656, + "learning_rate": 3.789836347975883e-07, + "loss": 0.8637, + "step": 1219 + }, + { + "epoch": 1.8914728682170543, + "grad_norm": 1.6318389177322388, + "learning_rate": 3.746770025839794e-07, + "loss": 0.8705, + "step": 1220 + }, + { + "epoch": 1.8914728682170543, + "eval_loss": 1.009251594543457, + "eval_runtime": 46.6094, + "eval_samples_per_second": 21.455, + "eval_steps_per_second": 1.352, + "step": 1220 + }, + { + "epoch": 1.8930232558139535, + "grad_norm": 1.377698540687561, + "learning_rate": 3.7037037037037036e-07, + "loss": 0.8041, + "step": 1221 + }, + { + "epoch": 1.8945736434108527, + "grad_norm": 1.3580753803253174, + "learning_rate": 3.6606373815676144e-07, + "loss": 0.8252, + "step": 1222 + }, + { + "epoch": 1.8961240310077518, + "grad_norm": 1.3340038061141968, + "learning_rate": 3.6175710594315246e-07, + "loss": 0.8446, + "step": 1223 + }, + { + "epoch": 1.8976744186046512, + "grad_norm": 1.389021635055542, + "learning_rate": 3.5745047372954353e-07, + "loss": 0.8788, + "step": 1224 + }, + { + "epoch": 1.8992248062015504, + "grad_norm": 1.3582135438919067, + "learning_rate": 3.531438415159346e-07, + "loss": 0.8398, + "step": 1225 + }, + { + "epoch": 1.9007751937984496, + "grad_norm": 1.3302254676818848, + "learning_rate": 3.488372093023256e-07, + "loss": 0.8652, + "step": 1226 + }, + { + "epoch": 1.9023255813953488, + "grad_norm": 1.466200590133667, + "learning_rate": 3.445305770887167e-07, + "loss": 0.8607, + "step": 1227 + }, + { + "epoch": 1.903875968992248, + "grad_norm": 1.7074103355407715, + "learning_rate": 3.4022394487510767e-07, + "loss": 0.8779, + "step": 1228 + }, + { + "epoch": 1.9054263565891474, + "grad_norm": 1.379107117652893, + "learning_rate": 3.3591731266149874e-07, + "loss": 0.8496, + "step": 1229 + }, + { + "epoch": 1.9069767441860463, + "grad_norm": 1.5536657571792603, + "learning_rate": 3.3161068044788976e-07, + "loss": 0.8684, + "step": 1230 + }, + { + "epoch": 1.9069767441860463, + "eval_loss": 1.0089800357818604, + "eval_runtime": 46.8452, + "eval_samples_per_second": 21.347, + "eval_steps_per_second": 1.345, + "step": 1230 + }, + { + "epoch": 1.9085271317829458, + "grad_norm": 1.3638468980789185, + "learning_rate": 3.2730404823428084e-07, + "loss": 0.8937, + "step": 1231 + }, + { + "epoch": 1.910077519379845, + "grad_norm": 1.3530744314193726, + "learning_rate": 3.2299741602067186e-07, + "loss": 0.8746, + "step": 1232 + }, + { + "epoch": 1.9116279069767441, + "grad_norm": 1.3501743078231812, + "learning_rate": 3.1869078380706293e-07, + "loss": 0.8455, + "step": 1233 + }, + { + "epoch": 1.9131782945736435, + "grad_norm": 1.3847023248672485, + "learning_rate": 3.14384151593454e-07, + "loss": 0.8811, + "step": 1234 + }, + { + "epoch": 1.9147286821705425, + "grad_norm": 1.319321870803833, + "learning_rate": 3.1007751937984497e-07, + "loss": 0.8572, + "step": 1235 + }, + { + "epoch": 1.916279069767442, + "grad_norm": 1.5270951986312866, + "learning_rate": 3.0577088716623605e-07, + "loss": 0.8696, + "step": 1236 + }, + { + "epoch": 1.917829457364341, + "grad_norm": 1.324975848197937, + "learning_rate": 3.0146425495262707e-07, + "loss": 0.8656, + "step": 1237 + }, + { + "epoch": 1.9193798449612403, + "grad_norm": 1.3629916906356812, + "learning_rate": 2.971576227390181e-07, + "loss": 0.8662, + "step": 1238 + }, + { + "epoch": 1.9209302325581397, + "grad_norm": 1.3780391216278076, + "learning_rate": 2.9285099052540916e-07, + "loss": 0.8328, + "step": 1239 + }, + { + "epoch": 1.9224806201550386, + "grad_norm": 1.4938024282455444, + "learning_rate": 2.885443583118002e-07, + "loss": 0.8509, + "step": 1240 + }, + { + "epoch": 1.9224806201550386, + "eval_loss": 1.0085103511810303, + "eval_runtime": 46.6396, + "eval_samples_per_second": 21.441, + "eval_steps_per_second": 1.351, + "step": 1240 + }, + { + "epoch": 1.924031007751938, + "grad_norm": 1.4414805173873901, + "learning_rate": 2.8423772609819125e-07, + "loss": 0.8642, + "step": 1241 + }, + { + "epoch": 1.9255813953488372, + "grad_norm": 1.319283366203308, + "learning_rate": 2.799310938845823e-07, + "loss": 0.8324, + "step": 1242 + }, + { + "epoch": 1.9271317829457364, + "grad_norm": 1.5541296005249023, + "learning_rate": 2.756244616709733e-07, + "loss": 0.8245, + "step": 1243 + }, + { + "epoch": 1.9286821705426358, + "grad_norm": 1.3837289810180664, + "learning_rate": 2.7131782945736437e-07, + "loss": 0.8664, + "step": 1244 + }, + { + "epoch": 1.9302325581395348, + "grad_norm": 1.316536784172058, + "learning_rate": 2.670111972437554e-07, + "loss": 0.847, + "step": 1245 + }, + { + "epoch": 1.9317829457364342, + "grad_norm": 1.3872352838516235, + "learning_rate": 2.627045650301464e-07, + "loss": 0.8927, + "step": 1246 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 1.3149522542953491, + "learning_rate": 2.583979328165375e-07, + "loss": 0.8375, + "step": 1247 + }, + { + "epoch": 1.9348837209302325, + "grad_norm": 1.3242137432098389, + "learning_rate": 2.5409130060292856e-07, + "loss": 0.8603, + "step": 1248 + }, + { + "epoch": 1.936434108527132, + "grad_norm": 1.4688844680786133, + "learning_rate": 2.497846683893196e-07, + "loss": 0.866, + "step": 1249 + }, + { + "epoch": 1.937984496124031, + "grad_norm": 1.4166333675384521, + "learning_rate": 2.454780361757106e-07, + "loss": 0.8736, + "step": 1250 + }, + { + "epoch": 1.937984496124031, + "eval_loss": 1.0081682205200195, + "eval_runtime": 46.7379, + "eval_samples_per_second": 21.396, + "eval_steps_per_second": 1.348, + "step": 1250 + }, + { + "epoch": 1.9395348837209303, + "grad_norm": 1.35499107837677, + "learning_rate": 2.411714039621017e-07, + "loss": 0.8823, + "step": 1251 + }, + { + "epoch": 1.9410852713178295, + "grad_norm": 1.3941737413406372, + "learning_rate": 2.368647717484927e-07, + "loss": 0.8673, + "step": 1252 + }, + { + "epoch": 1.9426356589147287, + "grad_norm": 1.3867199420928955, + "learning_rate": 2.3255813953488374e-07, + "loss": 0.8793, + "step": 1253 + }, + { + "epoch": 1.9441860465116279, + "grad_norm": 1.4277222156524658, + "learning_rate": 2.2825150732127476e-07, + "loss": 0.8699, + "step": 1254 + }, + { + "epoch": 1.945736434108527, + "grad_norm": 1.3886282444000244, + "learning_rate": 2.239448751076658e-07, + "loss": 0.8716, + "step": 1255 + }, + { + "epoch": 1.9472868217054264, + "grad_norm": 1.4584248065948486, + "learning_rate": 2.1963824289405686e-07, + "loss": 0.8419, + "step": 1256 + }, + { + "epoch": 1.9488372093023256, + "grad_norm": 1.3909106254577637, + "learning_rate": 2.1533161068044793e-07, + "loss": 0.8941, + "step": 1257 + }, + { + "epoch": 1.9503875968992248, + "grad_norm": 1.4313043355941772, + "learning_rate": 2.1102497846683895e-07, + "loss": 0.8523, + "step": 1258 + }, + { + "epoch": 1.951937984496124, + "grad_norm": 1.36488938331604, + "learning_rate": 2.0671834625323e-07, + "loss": 0.8589, + "step": 1259 + }, + { + "epoch": 1.9534883720930232, + "grad_norm": 1.3644864559173584, + "learning_rate": 2.0241171403962105e-07, + "loss": 0.8304, + "step": 1260 + }, + { + "epoch": 1.9534883720930232, + "eval_loss": 1.0075480937957764, + "eval_runtime": 46.7386, + "eval_samples_per_second": 21.396, + "eval_steps_per_second": 1.348, + "step": 1260 + }, + { + "epoch": 1.9550387596899226, + "grad_norm": 1.4223475456237793, + "learning_rate": 1.9810508182601207e-07, + "loss": 0.8433, + "step": 1261 + }, + { + "epoch": 1.9565891472868215, + "grad_norm": 1.571225881576538, + "learning_rate": 1.9379844961240311e-07, + "loss": 0.8897, + "step": 1262 + }, + { + "epoch": 1.958139534883721, + "grad_norm": 1.5039650201797485, + "learning_rate": 1.8949181739879416e-07, + "loss": 0.865, + "step": 1263 + }, + { + "epoch": 1.9596899224806201, + "grad_norm": 1.4037103652954102, + "learning_rate": 1.8518518518518518e-07, + "loss": 0.8295, + "step": 1264 + }, + { + "epoch": 1.9612403100775193, + "grad_norm": 1.435829520225525, + "learning_rate": 1.8087855297157623e-07, + "loss": 0.8811, + "step": 1265 + }, + { + "epoch": 1.9627906976744187, + "grad_norm": 1.5063098669052124, + "learning_rate": 1.765719207579673e-07, + "loss": 0.8701, + "step": 1266 + }, + { + "epoch": 1.9643410852713177, + "grad_norm": 1.4760342836380005, + "learning_rate": 1.7226528854435835e-07, + "loss": 0.8545, + "step": 1267 + }, + { + "epoch": 1.965891472868217, + "grad_norm": 1.3338667154312134, + "learning_rate": 1.6795865633074937e-07, + "loss": 0.884, + "step": 1268 + }, + { + "epoch": 1.9674418604651163, + "grad_norm": 1.4161882400512695, + "learning_rate": 1.6365202411714042e-07, + "loss": 0.8676, + "step": 1269 + }, + { + "epoch": 1.9689922480620154, + "grad_norm": 1.3632901906967163, + "learning_rate": 1.5934539190353146e-07, + "loss": 0.8498, + "step": 1270 + }, + { + "epoch": 1.9689922480620154, + "eval_loss": 1.0080918073654175, + "eval_runtime": 46.7217, + "eval_samples_per_second": 21.403, + "eval_steps_per_second": 1.348, + "step": 1270 + }, + { + "epoch": 1.9705426356589149, + "grad_norm": 1.2917699813842773, + "learning_rate": 1.5503875968992249e-07, + "loss": 0.8382, + "step": 1271 + }, + { + "epoch": 1.9720930232558138, + "grad_norm": 1.4430114030838013, + "learning_rate": 1.5073212747631353e-07, + "loss": 0.8861, + "step": 1272 + }, + { + "epoch": 1.9736434108527132, + "grad_norm": 1.3696763515472412, + "learning_rate": 1.4642549526270458e-07, + "loss": 0.8619, + "step": 1273 + }, + { + "epoch": 1.9751937984496124, + "grad_norm": 1.266954779624939, + "learning_rate": 1.4211886304909563e-07, + "loss": 0.8384, + "step": 1274 + }, + { + "epoch": 1.9767441860465116, + "grad_norm": 1.4904766082763672, + "learning_rate": 1.3781223083548665e-07, + "loss": 0.8798, + "step": 1275 + }, + { + "epoch": 1.978294573643411, + "grad_norm": 1.3043338060379028, + "learning_rate": 1.335055986218777e-07, + "loss": 0.8552, + "step": 1276 + }, + { + "epoch": 1.97984496124031, + "grad_norm": 1.3325281143188477, + "learning_rate": 1.2919896640826874e-07, + "loss": 0.8717, + "step": 1277 + }, + { + "epoch": 1.9813953488372094, + "grad_norm": 1.4170563220977783, + "learning_rate": 1.248923341946598e-07, + "loss": 0.865, + "step": 1278 + }, + { + "epoch": 1.9829457364341085, + "grad_norm": 1.3400081396102905, + "learning_rate": 1.2058570198105084e-07, + "loss": 0.8657, + "step": 1279 + }, + { + "epoch": 1.9844961240310077, + "grad_norm": 1.3908562660217285, + "learning_rate": 1.1627906976744187e-07, + "loss": 0.8422, + "step": 1280 + }, + { + "epoch": 1.9844961240310077, + "eval_loss": 1.008200764656067, + "eval_runtime": 46.6818, + "eval_samples_per_second": 21.422, + "eval_steps_per_second": 1.35, + "step": 1280 + }, + { + "epoch": 1.9860465116279071, + "grad_norm": 1.3601107597351074, + "learning_rate": 1.119724375538329e-07, + "loss": 0.8516, + "step": 1281 + }, + { + "epoch": 1.987596899224806, + "grad_norm": 1.4389938116073608, + "learning_rate": 1.0766580534022397e-07, + "loss": 0.8759, + "step": 1282 + }, + { + "epoch": 1.9891472868217055, + "grad_norm": 1.3003112077713013, + "learning_rate": 1.03359173126615e-07, + "loss": 0.8829, + "step": 1283 + }, + { + "epoch": 1.9906976744186047, + "grad_norm": 1.5875047445297241, + "learning_rate": 9.905254091300603e-08, + "loss": 0.8766, + "step": 1284 + }, + { + "epoch": 1.9922480620155039, + "grad_norm": 1.4247767925262451, + "learning_rate": 9.474590869939708e-08, + "loss": 0.8557, + "step": 1285 + }, + { + "epoch": 1.993798449612403, + "grad_norm": 1.4395599365234375, + "learning_rate": 9.043927648578811e-08, + "loss": 0.8775, + "step": 1286 + }, + { + "epoch": 1.9953488372093022, + "grad_norm": 1.3539119958877563, + "learning_rate": 8.613264427217917e-08, + "loss": 0.8762, + "step": 1287 + }, + { + "epoch": 1.9968992248062016, + "grad_norm": 1.2914642095565796, + "learning_rate": 8.182601205857021e-08, + "loss": 0.8612, + "step": 1288 + }, + { + "epoch": 1.9984496124031008, + "grad_norm": 1.3129777908325195, + "learning_rate": 7.751937984496124e-08, + "loss": 0.8446, + "step": 1289 + }, + { + "epoch": 2.0, + "grad_norm": 1.5141966342926025, + "learning_rate": 7.321274763135229e-08, + "loss": 0.8649, + "step": 1290 + }, + { + "epoch": 2.0, + "eval_loss": 1.0077102184295654, + "eval_runtime": 46.7171, + "eval_samples_per_second": 21.405, + "eval_steps_per_second": 1.349, + "step": 1290 + } + ], + "logging_steps": 1, + "max_steps": 1290, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3449606952963277e+18, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}