diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9077 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.998067259373792, + "eval_steps": 500, + "global_step": 1292, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0015461925009663702, + "grad_norm": 0.305040568113327, + "learning_rate": 5.000000000000001e-07, + "loss": 0.9801, + "step": 1 + }, + { + "epoch": 0.0030923850019327404, + "grad_norm": 0.3316026031970978, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.0582, + "step": 2 + }, + { + "epoch": 0.004638577502899111, + "grad_norm": 0.321511834859848, + "learning_rate": 1.5e-06, + "loss": 1.0877, + "step": 3 + }, + { + "epoch": 0.006184770003865481, + "grad_norm": 0.32884451746940613, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.1082, + "step": 4 + }, + { + "epoch": 0.007730962504831852, + "grad_norm": 0.3244039714336395, + "learning_rate": 2.5e-06, + "loss": 1.1352, + "step": 5 + }, + { + "epoch": 0.009277155005798222, + "grad_norm": 0.319381445646286, + "learning_rate": 3e-06, + "loss": 1.0808, + "step": 6 + }, + { + "epoch": 0.010823347506764593, + "grad_norm": 0.3265005946159363, + "learning_rate": 3.5000000000000004e-06, + "loss": 1.0554, + "step": 7 + }, + { + "epoch": 0.012369540007730962, + "grad_norm": 0.3248363137245178, + "learning_rate": 4.000000000000001e-06, + "loss": 1.0834, + "step": 8 + }, + { + "epoch": 0.013915732508697333, + "grad_norm": 0.3369300365447998, + "learning_rate": 4.5e-06, + "loss": 1.1304, + "step": 9 + }, + { + "epoch": 0.015461925009663703, + "grad_norm": 0.3302851915359497, + "learning_rate": 5e-06, + "loss": 1.0683, + "step": 10 + }, + { + "epoch": 0.017008117510630073, + "grad_norm": 0.3330378830432892, + "learning_rate": 5.500000000000001e-06, + "loss": 1.1028, + "step": 11 + }, + { + "epoch": 0.018554310011596443, + "grad_norm": 0.332900732755661, + "learning_rate": 6e-06, + "loss": 1.1557, + "step": 12 + }, + { + "epoch": 0.020100502512562814, + "grad_norm": 0.36040395498275757, + "learning_rate": 6.5000000000000004e-06, + "loss": 1.1344, + "step": 13 + }, + { + "epoch": 0.021646695013529185, + "grad_norm": 0.37165355682373047, + "learning_rate": 7.000000000000001e-06, + "loss": 1.1522, + "step": 14 + }, + { + "epoch": 0.023192887514495556, + "grad_norm": 0.35057052969932556, + "learning_rate": 7.5e-06, + "loss": 1.1397, + "step": 15 + }, + { + "epoch": 0.024739080015461924, + "grad_norm": 0.3664647042751312, + "learning_rate": 8.000000000000001e-06, + "loss": 1.1833, + "step": 16 + }, + { + "epoch": 0.026285272516428294, + "grad_norm": 0.38355353474617004, + "learning_rate": 8.500000000000002e-06, + "loss": 1.1925, + "step": 17 + }, + { + "epoch": 0.027831465017394665, + "grad_norm": 0.3568861186504364, + "learning_rate": 9e-06, + "loss": 1.1113, + "step": 18 + }, + { + "epoch": 0.029377657518361036, + "grad_norm": 0.3547118604183197, + "learning_rate": 9.5e-06, + "loss": 1.1403, + "step": 19 + }, + { + "epoch": 0.030923850019327407, + "grad_norm": 0.3746045231819153, + "learning_rate": 1e-05, + "loss": 1.1356, + "step": 20 + }, + { + "epoch": 0.03247004252029378, + "grad_norm": 0.38165998458862305, + "learning_rate": 1.05e-05, + "loss": 1.1239, + "step": 21 + }, + { + "epoch": 0.034016235021260145, + "grad_norm": 0.4228748679161072, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.1263, + "step": 22 + }, + { + "epoch": 0.03556242752222652, + "grad_norm": 0.36566364765167236, + "learning_rate": 1.1500000000000002e-05, + "loss": 1.1362, + "step": 23 + }, + { + "epoch": 0.03710862002319289, + "grad_norm": 0.37338364124298096, + "learning_rate": 1.2e-05, + "loss": 1.1816, + "step": 24 + }, + { + "epoch": 0.038654812524159254, + "grad_norm": 0.3412342667579651, + "learning_rate": 1.25e-05, + "loss": 1.1341, + "step": 25 + }, + { + "epoch": 0.04020100502512563, + "grad_norm": 0.38790470361709595, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.2407, + "step": 26 + }, + { + "epoch": 0.041747197526091996, + "grad_norm": 0.38183456659317017, + "learning_rate": 1.3500000000000001e-05, + "loss": 1.1206, + "step": 27 + }, + { + "epoch": 0.04329339002705837, + "grad_norm": 0.4037930965423584, + "learning_rate": 1.4000000000000001e-05, + "loss": 1.1111, + "step": 28 + }, + { + "epoch": 0.04483958252802474, + "grad_norm": 0.3747173547744751, + "learning_rate": 1.45e-05, + "loss": 1.14, + "step": 29 + }, + { + "epoch": 0.04638577502899111, + "grad_norm": 0.37349933385849, + "learning_rate": 1.5e-05, + "loss": 1.1629, + "step": 30 + }, + { + "epoch": 0.04793196752995748, + "grad_norm": 0.3699789345264435, + "learning_rate": 1.55e-05, + "loss": 1.1417, + "step": 31 + }, + { + "epoch": 0.04947816003092385, + "grad_norm": 0.37245669960975647, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.1563, + "step": 32 + }, + { + "epoch": 0.05102435253189022, + "grad_norm": 0.3655848503112793, + "learning_rate": 1.65e-05, + "loss": 1.1556, + "step": 33 + }, + { + "epoch": 0.05257054503285659, + "grad_norm": 0.3637336194515228, + "learning_rate": 1.7000000000000003e-05, + "loss": 1.1589, + "step": 34 + }, + { + "epoch": 0.05411673753382296, + "grad_norm": 0.35633373260498047, + "learning_rate": 1.75e-05, + "loss": 1.0969, + "step": 35 + }, + { + "epoch": 0.05566293003478933, + "grad_norm": 0.35284438729286194, + "learning_rate": 1.8e-05, + "loss": 1.0921, + "step": 36 + }, + { + "epoch": 0.057209122535755705, + "grad_norm": 0.3530278205871582, + "learning_rate": 1.85e-05, + "loss": 1.0982, + "step": 37 + }, + { + "epoch": 0.05875531503672207, + "grad_norm": 0.3533940315246582, + "learning_rate": 1.9e-05, + "loss": 1.1329, + "step": 38 + }, + { + "epoch": 0.06030150753768844, + "grad_norm": 0.35752803087234497, + "learning_rate": 1.9500000000000003e-05, + "loss": 1.1617, + "step": 39 + }, + { + "epoch": 0.061847700038654814, + "grad_norm": 0.34077584743499756, + "learning_rate": 2e-05, + "loss": 1.0956, + "step": 40 + }, + { + "epoch": 0.06339389253962119, + "grad_norm": 0.36975380778312683, + "learning_rate": 2.05e-05, + "loss": 1.0757, + "step": 41 + }, + { + "epoch": 0.06494008504058756, + "grad_norm": 0.36623820662498474, + "learning_rate": 2.1e-05, + "loss": 1.1465, + "step": 42 + }, + { + "epoch": 0.06648627754155392, + "grad_norm": 0.3988489508628845, + "learning_rate": 2.15e-05, + "loss": 1.1701, + "step": 43 + }, + { + "epoch": 0.06803247004252029, + "grad_norm": 0.37758833169937134, + "learning_rate": 2.2000000000000003e-05, + "loss": 1.0647, + "step": 44 + }, + { + "epoch": 0.06957866254348666, + "grad_norm": 0.4047185182571411, + "learning_rate": 2.25e-05, + "loss": 1.1057, + "step": 45 + }, + { + "epoch": 0.07112485504445304, + "grad_norm": 0.39636239409446716, + "learning_rate": 2.3000000000000003e-05, + "loss": 1.1294, + "step": 46 + }, + { + "epoch": 0.0726710475454194, + "grad_norm": 0.4347302317619324, + "learning_rate": 2.35e-05, + "loss": 1.152, + "step": 47 + }, + { + "epoch": 0.07421724004638577, + "grad_norm": 0.4483806788921356, + "learning_rate": 2.4e-05, + "loss": 1.1676, + "step": 48 + }, + { + "epoch": 0.07576343254735214, + "grad_norm": 0.5394858121871948, + "learning_rate": 2.45e-05, + "loss": 1.2228, + "step": 49 + }, + { + "epoch": 0.07730962504831851, + "grad_norm": 0.7282954454421997, + "learning_rate": 2.5e-05, + "loss": 1.2397, + "step": 50 + }, + { + "epoch": 0.07885581754928489, + "grad_norm": 0.3573731482028961, + "learning_rate": 2.5500000000000003e-05, + "loss": 0.9119, + "step": 51 + }, + { + "epoch": 0.08040201005025126, + "grad_norm": 0.36452800035476685, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.983, + "step": 52 + }, + { + "epoch": 0.08194820255121762, + "grad_norm": 0.3743703067302704, + "learning_rate": 2.6500000000000004e-05, + "loss": 0.9428, + "step": 53 + }, + { + "epoch": 0.08349439505218399, + "grad_norm": 0.3590448498725891, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.9967, + "step": 54 + }, + { + "epoch": 0.08504058755315037, + "grad_norm": 0.35900965332984924, + "learning_rate": 2.7500000000000004e-05, + "loss": 0.9687, + "step": 55 + }, + { + "epoch": 0.08658678005411674, + "grad_norm": 0.3654124140739441, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.9342, + "step": 56 + }, + { + "epoch": 0.08813297255508311, + "grad_norm": 0.3421430289745331, + "learning_rate": 2.8499999999999998e-05, + "loss": 0.9207, + "step": 57 + }, + { + "epoch": 0.08967916505604948, + "grad_norm": 0.3365703523159027, + "learning_rate": 2.9e-05, + "loss": 0.9487, + "step": 58 + }, + { + "epoch": 0.09122535755701584, + "grad_norm": 0.32828226685523987, + "learning_rate": 2.95e-05, + "loss": 0.9236, + "step": 59 + }, + { + "epoch": 0.09277155005798222, + "grad_norm": 0.31865429878234863, + "learning_rate": 3e-05, + "loss": 0.9071, + "step": 60 + }, + { + "epoch": 0.09431774255894859, + "grad_norm": 0.3250292241573334, + "learning_rate": 3.05e-05, + "loss": 0.9609, + "step": 61 + }, + { + "epoch": 0.09586393505991496, + "grad_norm": 0.3177807033061981, + "learning_rate": 3.1e-05, + "loss": 0.9727, + "step": 62 + }, + { + "epoch": 0.09741012756088133, + "grad_norm": 0.3273405134677887, + "learning_rate": 3.15e-05, + "loss": 1.0691, + "step": 63 + }, + { + "epoch": 0.0989563200618477, + "grad_norm": 0.30870744585990906, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.9726, + "step": 64 + }, + { + "epoch": 0.10050251256281408, + "grad_norm": 0.3052951395511627, + "learning_rate": 3.2500000000000004e-05, + "loss": 0.8804, + "step": 65 + }, + { + "epoch": 0.10204870506378044, + "grad_norm": 0.2952975928783417, + "learning_rate": 3.3e-05, + "loss": 0.9563, + "step": 66 + }, + { + "epoch": 0.10359489756474681, + "grad_norm": 0.29250308871269226, + "learning_rate": 3.35e-05, + "loss": 0.9601, + "step": 67 + }, + { + "epoch": 0.10514109006571318, + "grad_norm": 0.2929418683052063, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.8986, + "step": 68 + }, + { + "epoch": 0.10668728256667954, + "grad_norm": 0.28594040870666504, + "learning_rate": 3.45e-05, + "loss": 0.8887, + "step": 69 + }, + { + "epoch": 0.10823347506764593, + "grad_norm": 0.2964857816696167, + "learning_rate": 3.5e-05, + "loss": 0.9407, + "step": 70 + }, + { + "epoch": 0.1097796675686123, + "grad_norm": 0.31084346771240234, + "learning_rate": 3.55e-05, + "loss": 0.9907, + "step": 71 + }, + { + "epoch": 0.11132586006957866, + "grad_norm": 0.31780946254730225, + "learning_rate": 3.6e-05, + "loss": 0.9753, + "step": 72 + }, + { + "epoch": 0.11287205257054503, + "grad_norm": 0.31742170453071594, + "learning_rate": 3.65e-05, + "loss": 0.9411, + "step": 73 + }, + { + "epoch": 0.11441824507151141, + "grad_norm": 0.3252294361591339, + "learning_rate": 3.7e-05, + "loss": 0.9973, + "step": 74 + }, + { + "epoch": 0.11596443757247778, + "grad_norm": 0.3107585608959198, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.9186, + "step": 75 + }, + { + "epoch": 0.11751063007344414, + "grad_norm": 0.32054993510246277, + "learning_rate": 3.8e-05, + "loss": 0.9264, + "step": 76 + }, + { + "epoch": 0.11905682257441051, + "grad_norm": 0.31898629665374756, + "learning_rate": 3.85e-05, + "loss": 0.9958, + "step": 77 + }, + { + "epoch": 0.12060301507537688, + "grad_norm": 0.3400716185569763, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.9793, + "step": 78 + }, + { + "epoch": 0.12214920757634326, + "grad_norm": 0.347918838262558, + "learning_rate": 3.9500000000000005e-05, + "loss": 0.914, + "step": 79 + }, + { + "epoch": 0.12369540007730963, + "grad_norm": 0.33906570076942444, + "learning_rate": 4e-05, + "loss": 0.93, + "step": 80 + }, + { + "epoch": 0.125241592578276, + "grad_norm": 0.31842949986457825, + "learning_rate": 4.05e-05, + "loss": 0.8957, + "step": 81 + }, + { + "epoch": 0.12678778507924238, + "grad_norm": 0.3301747739315033, + "learning_rate": 4.1e-05, + "loss": 0.9606, + "step": 82 + }, + { + "epoch": 0.12833397758020873, + "grad_norm": 0.33865636587142944, + "learning_rate": 4.15e-05, + "loss": 0.9846, + "step": 83 + }, + { + "epoch": 0.1298801700811751, + "grad_norm": 0.31991609930992126, + "learning_rate": 4.2e-05, + "loss": 0.9574, + "step": 84 + }, + { + "epoch": 0.13142636258214146, + "grad_norm": 0.3442661762237549, + "learning_rate": 4.25e-05, + "loss": 0.9398, + "step": 85 + }, + { + "epoch": 0.13297255508310785, + "grad_norm": 0.33919069170951843, + "learning_rate": 4.3e-05, + "loss": 0.9736, + "step": 86 + }, + { + "epoch": 0.13451874758407423, + "grad_norm": 0.3447560966014862, + "learning_rate": 4.35e-05, + "loss": 0.9589, + "step": 87 + }, + { + "epoch": 0.13606494008504058, + "grad_norm": 0.3504810333251953, + "learning_rate": 4.4000000000000006e-05, + "loss": 1.0156, + "step": 88 + }, + { + "epoch": 0.13761113258600696, + "grad_norm": 0.35089215636253357, + "learning_rate": 4.4500000000000004e-05, + "loss": 0.9732, + "step": 89 + }, + { + "epoch": 0.13915732508697332, + "grad_norm": 0.346599280834198, + "learning_rate": 4.5e-05, + "loss": 0.9705, + "step": 90 + }, + { + "epoch": 0.1407035175879397, + "grad_norm": 0.3983485698699951, + "learning_rate": 4.55e-05, + "loss": 1.0236, + "step": 91 + }, + { + "epoch": 0.14224971008890608, + "grad_norm": 0.38158902525901794, + "learning_rate": 4.600000000000001e-05, + "loss": 1.0174, + "step": 92 + }, + { + "epoch": 0.14379590258987243, + "grad_norm": 0.3771804869174957, + "learning_rate": 4.6500000000000005e-05, + "loss": 1.012, + "step": 93 + }, + { + "epoch": 0.1453420950908388, + "grad_norm": 0.38228315114974976, + "learning_rate": 4.7e-05, + "loss": 0.9786, + "step": 94 + }, + { + "epoch": 0.14688828759180517, + "grad_norm": 0.3989880681037903, + "learning_rate": 4.75e-05, + "loss": 0.98, + "step": 95 + }, + { + "epoch": 0.14843448009277155, + "grad_norm": 0.41765791177749634, + "learning_rate": 4.8e-05, + "loss": 1.0502, + "step": 96 + }, + { + "epoch": 0.14998067259373793, + "grad_norm": 0.4242067039012909, + "learning_rate": 4.85e-05, + "loss": 1.0911, + "step": 97 + }, + { + "epoch": 0.15152686509470428, + "grad_norm": 0.4490616023540497, + "learning_rate": 4.9e-05, + "loss": 1.0791, + "step": 98 + }, + { + "epoch": 0.15307305759567066, + "grad_norm": 0.4694664478302002, + "learning_rate": 4.9500000000000004e-05, + "loss": 1.0374, + "step": 99 + }, + { + "epoch": 0.15461925009663702, + "grad_norm": 0.7335464954376221, + "learning_rate": 5e-05, + "loss": 1.093, + "step": 100 + }, + { + "epoch": 0.1561654425976034, + "grad_norm": 0.3335883915424347, + "learning_rate": 4.995805369127517e-05, + "loss": 0.8461, + "step": 101 + }, + { + "epoch": 0.15771163509856978, + "grad_norm": 0.3368653357028961, + "learning_rate": 4.9916107382550336e-05, + "loss": 0.8619, + "step": 102 + }, + { + "epoch": 0.15925782759953613, + "grad_norm": 0.34530943632125854, + "learning_rate": 4.9874161073825505e-05, + "loss": 0.8399, + "step": 103 + }, + { + "epoch": 0.16080402010050251, + "grad_norm": 0.3303908407688141, + "learning_rate": 4.983221476510067e-05, + "loss": 0.891, + "step": 104 + }, + { + "epoch": 0.1623502126014689, + "grad_norm": 0.3104991614818573, + "learning_rate": 4.9790268456375845e-05, + "loss": 0.8851, + "step": 105 + }, + { + "epoch": 0.16389640510243525, + "grad_norm": 0.3138999342918396, + "learning_rate": 4.974832214765101e-05, + "loss": 0.8668, + "step": 106 + }, + { + "epoch": 0.16544259760340163, + "grad_norm": 0.3183957040309906, + "learning_rate": 4.970637583892618e-05, + "loss": 0.8494, + "step": 107 + }, + { + "epoch": 0.16698879010436798, + "grad_norm": 0.337296724319458, + "learning_rate": 4.966442953020135e-05, + "loss": 0.872, + "step": 108 + }, + { + "epoch": 0.16853498260533437, + "grad_norm": 0.29680609703063965, + "learning_rate": 4.962248322147651e-05, + "loss": 0.8473, + "step": 109 + }, + { + "epoch": 0.17008117510630075, + "grad_norm": 0.31526410579681396, + "learning_rate": 4.958053691275168e-05, + "loss": 0.9042, + "step": 110 + }, + { + "epoch": 0.1716273676072671, + "grad_norm": 0.3531287610530853, + "learning_rate": 4.9538590604026845e-05, + "loss": 0.9444, + "step": 111 + }, + { + "epoch": 0.17317356010823348, + "grad_norm": 0.324305921792984, + "learning_rate": 4.9496644295302015e-05, + "loss": 0.8867, + "step": 112 + }, + { + "epoch": 0.17471975260919984, + "grad_norm": 0.32134464383125305, + "learning_rate": 4.945469798657718e-05, + "loss": 0.8575, + "step": 113 + }, + { + "epoch": 0.17626594511016622, + "grad_norm": 0.32409024238586426, + "learning_rate": 4.9412751677852355e-05, + "loss": 0.8937, + "step": 114 + }, + { + "epoch": 0.1778121376111326, + "grad_norm": 0.3491647243499756, + "learning_rate": 4.937080536912752e-05, + "loss": 0.8902, + "step": 115 + }, + { + "epoch": 0.17935833011209895, + "grad_norm": 0.3334656357765198, + "learning_rate": 4.932885906040269e-05, + "loss": 0.8788, + "step": 116 + }, + { + "epoch": 0.18090452261306533, + "grad_norm": 0.3333323299884796, + "learning_rate": 4.928691275167786e-05, + "loss": 0.907, + "step": 117 + }, + { + "epoch": 0.1824507151140317, + "grad_norm": 0.3563230335712433, + "learning_rate": 4.924496644295302e-05, + "loss": 0.9043, + "step": 118 + }, + { + "epoch": 0.18399690761499807, + "grad_norm": 0.35443171858787537, + "learning_rate": 4.920302013422819e-05, + "loss": 0.8966, + "step": 119 + }, + { + "epoch": 0.18554310011596445, + "grad_norm": 0.36155420541763306, + "learning_rate": 4.9161073825503354e-05, + "loss": 0.903, + "step": 120 + }, + { + "epoch": 0.1870892926169308, + "grad_norm": 0.3701721131801605, + "learning_rate": 4.9119127516778524e-05, + "loss": 0.8786, + "step": 121 + }, + { + "epoch": 0.18863548511789718, + "grad_norm": 0.3697488307952881, + "learning_rate": 4.9077181208053694e-05, + "loss": 0.9279, + "step": 122 + }, + { + "epoch": 0.19018167761886354, + "grad_norm": 0.35044190287590027, + "learning_rate": 4.9035234899328864e-05, + "loss": 0.8577, + "step": 123 + }, + { + "epoch": 0.19172787011982992, + "grad_norm": 0.3510192632675171, + "learning_rate": 4.8993288590604034e-05, + "loss": 0.8831, + "step": 124 + }, + { + "epoch": 0.1932740626207963, + "grad_norm": 0.3509821593761444, + "learning_rate": 4.89513422818792e-05, + "loss": 0.8507, + "step": 125 + }, + { + "epoch": 0.19482025512176265, + "grad_norm": 0.3761618137359619, + "learning_rate": 4.890939597315437e-05, + "loss": 0.9151, + "step": 126 + }, + { + "epoch": 0.19636644762272903, + "grad_norm": 0.3758131265640259, + "learning_rate": 4.886744966442953e-05, + "loss": 0.9067, + "step": 127 + }, + { + "epoch": 0.1979126401236954, + "grad_norm": 0.39383259415626526, + "learning_rate": 4.88255033557047e-05, + "loss": 0.9002, + "step": 128 + }, + { + "epoch": 0.19945883262466177, + "grad_norm": 0.37084588408470154, + "learning_rate": 4.878355704697986e-05, + "loss": 0.8459, + "step": 129 + }, + { + "epoch": 0.20100502512562815, + "grad_norm": 0.389466792345047, + "learning_rate": 4.874161073825503e-05, + "loss": 0.9218, + "step": 130 + }, + { + "epoch": 0.2025512176265945, + "grad_norm": 0.38749396800994873, + "learning_rate": 4.86996644295302e-05, + "loss": 0.892, + "step": 131 + }, + { + "epoch": 0.20409741012756089, + "grad_norm": 0.36285966634750366, + "learning_rate": 4.865771812080537e-05, + "loss": 0.8458, + "step": 132 + }, + { + "epoch": 0.20564360262852724, + "grad_norm": 0.38966134190559387, + "learning_rate": 4.861577181208054e-05, + "loss": 0.9189, + "step": 133 + }, + { + "epoch": 0.20718979512949362, + "grad_norm": 0.3792301118373871, + "learning_rate": 4.8573825503355706e-05, + "loss": 0.891, + "step": 134 + }, + { + "epoch": 0.20873598763046, + "grad_norm": 0.39519718289375305, + "learning_rate": 4.8531879194630876e-05, + "loss": 0.8946, + "step": 135 + }, + { + "epoch": 0.21028218013142636, + "grad_norm": 0.399034708738327, + "learning_rate": 4.848993288590604e-05, + "loss": 0.8692, + "step": 136 + }, + { + "epoch": 0.21182837263239274, + "grad_norm": 0.451168954372406, + "learning_rate": 4.844798657718121e-05, + "loss": 0.9005, + "step": 137 + }, + { + "epoch": 0.2133745651333591, + "grad_norm": 0.41019031405448914, + "learning_rate": 4.840604026845638e-05, + "loss": 0.9766, + "step": 138 + }, + { + "epoch": 0.21492075763432547, + "grad_norm": 0.4109886586666107, + "learning_rate": 4.836409395973154e-05, + "loss": 0.9078, + "step": 139 + }, + { + "epoch": 0.21646695013529185, + "grad_norm": 0.4546094834804535, + "learning_rate": 4.832214765100672e-05, + "loss": 0.9621, + "step": 140 + }, + { + "epoch": 0.2180131426362582, + "grad_norm": 0.44215071201324463, + "learning_rate": 4.828020134228188e-05, + "loss": 0.9965, + "step": 141 + }, + { + "epoch": 0.2195593351372246, + "grad_norm": 0.4271162748336792, + "learning_rate": 4.823825503355705e-05, + "loss": 0.9277, + "step": 142 + }, + { + "epoch": 0.22110552763819097, + "grad_norm": 0.43438541889190674, + "learning_rate": 4.8196308724832215e-05, + "loss": 0.9727, + "step": 143 + }, + { + "epoch": 0.22265172013915732, + "grad_norm": 0.45643556118011475, + "learning_rate": 4.8154362416107385e-05, + "loss": 0.9284, + "step": 144 + }, + { + "epoch": 0.2241979126401237, + "grad_norm": 0.5079519748687744, + "learning_rate": 4.8112416107382555e-05, + "loss": 1.004, + "step": 145 + }, + { + "epoch": 0.22574410514109006, + "grad_norm": 0.4708511531352997, + "learning_rate": 4.807046979865772e-05, + "loss": 1.001, + "step": 146 + }, + { + "epoch": 0.22729029764205644, + "grad_norm": 0.510336697101593, + "learning_rate": 4.802852348993289e-05, + "loss": 0.9808, + "step": 147 + }, + { + "epoch": 0.22883649014302282, + "grad_norm": 0.5595572590827942, + "learning_rate": 4.798657718120805e-05, + "loss": 0.9983, + "step": 148 + }, + { + "epoch": 0.23038268264398917, + "grad_norm": 0.5763404965400696, + "learning_rate": 4.794463087248323e-05, + "loss": 0.9841, + "step": 149 + }, + { + "epoch": 0.23192887514495555, + "grad_norm": 0.9527286887168884, + "learning_rate": 4.790268456375839e-05, + "loss": 0.9827, + "step": 150 + }, + { + "epoch": 0.2334750676459219, + "grad_norm": 0.35568949580192566, + "learning_rate": 4.786073825503356e-05, + "loss": 0.8084, + "step": 151 + }, + { + "epoch": 0.2350212601468883, + "grad_norm": 0.3866717517375946, + "learning_rate": 4.7818791946308725e-05, + "loss": 0.8538, + "step": 152 + }, + { + "epoch": 0.23656745264785467, + "grad_norm": 0.3884953260421753, + "learning_rate": 4.7776845637583895e-05, + "loss": 0.853, + "step": 153 + }, + { + "epoch": 0.23811364514882102, + "grad_norm": 0.40036800503730774, + "learning_rate": 4.7734899328859064e-05, + "loss": 0.8604, + "step": 154 + }, + { + "epoch": 0.2396598376497874, + "grad_norm": 0.4019627273082733, + "learning_rate": 4.769295302013423e-05, + "loss": 0.8409, + "step": 155 + }, + { + "epoch": 0.24120603015075376, + "grad_norm": 0.3811728358268738, + "learning_rate": 4.76510067114094e-05, + "loss": 0.8602, + "step": 156 + }, + { + "epoch": 0.24275222265172014, + "grad_norm": 0.39744654297828674, + "learning_rate": 4.760906040268457e-05, + "loss": 0.8011, + "step": 157 + }, + { + "epoch": 0.24429841515268652, + "grad_norm": 0.38136810064315796, + "learning_rate": 4.756711409395974e-05, + "loss": 0.9004, + "step": 158 + }, + { + "epoch": 0.24584460765365287, + "grad_norm": 0.3670859932899475, + "learning_rate": 4.75251677852349e-05, + "loss": 0.8506, + "step": 159 + }, + { + "epoch": 0.24739080015461926, + "grad_norm": 0.4202471375465393, + "learning_rate": 4.748322147651007e-05, + "loss": 0.7832, + "step": 160 + }, + { + "epoch": 0.2489369926555856, + "grad_norm": 0.40722745656967163, + "learning_rate": 4.744127516778524e-05, + "loss": 0.8813, + "step": 161 + }, + { + "epoch": 0.250483185156552, + "grad_norm": 0.4037550389766693, + "learning_rate": 4.7399328859060404e-05, + "loss": 0.8328, + "step": 162 + }, + { + "epoch": 0.25202937765751837, + "grad_norm": 0.4262521266937256, + "learning_rate": 4.7357382550335574e-05, + "loss": 0.9148, + "step": 163 + }, + { + "epoch": 0.25357557015848475, + "grad_norm": 0.40966907143592834, + "learning_rate": 4.731543624161074e-05, + "loss": 0.8689, + "step": 164 + }, + { + "epoch": 0.2551217626594511, + "grad_norm": 0.41350632905960083, + "learning_rate": 4.727348993288591e-05, + "loss": 0.8253, + "step": 165 + }, + { + "epoch": 0.25666795516041746, + "grad_norm": 0.3822355568408966, + "learning_rate": 4.723154362416108e-05, + "loss": 0.8176, + "step": 166 + }, + { + "epoch": 0.25821414766138384, + "grad_norm": 0.4168022572994232, + "learning_rate": 4.718959731543625e-05, + "loss": 0.8821, + "step": 167 + }, + { + "epoch": 0.2597603401623502, + "grad_norm": 0.4205927848815918, + "learning_rate": 4.714765100671141e-05, + "loss": 0.9162, + "step": 168 + }, + { + "epoch": 0.2613065326633166, + "grad_norm": 0.41793638467788696, + "learning_rate": 4.710570469798658e-05, + "loss": 0.9045, + "step": 169 + }, + { + "epoch": 0.26285272516428293, + "grad_norm": 0.41607388854026794, + "learning_rate": 4.706375838926175e-05, + "loss": 0.8474, + "step": 170 + }, + { + "epoch": 0.2643989176652493, + "grad_norm": 0.3936135172843933, + "learning_rate": 4.702181208053691e-05, + "loss": 0.8859, + "step": 171 + }, + { + "epoch": 0.2659451101662157, + "grad_norm": 0.39022210240364075, + "learning_rate": 4.697986577181208e-05, + "loss": 0.8455, + "step": 172 + }, + { + "epoch": 0.2674913026671821, + "grad_norm": 0.40289798378944397, + "learning_rate": 4.6937919463087246e-05, + "loss": 0.9244, + "step": 173 + }, + { + "epoch": 0.26903749516814845, + "grad_norm": 0.4135661721229553, + "learning_rate": 4.6895973154362416e-05, + "loss": 0.8427, + "step": 174 + }, + { + "epoch": 0.2705836876691148, + "grad_norm": 0.4445662796497345, + "learning_rate": 4.6854026845637586e-05, + "loss": 0.8042, + "step": 175 + }, + { + "epoch": 0.27212988017008116, + "grad_norm": 0.42175382375717163, + "learning_rate": 4.6812080536912756e-05, + "loss": 0.8447, + "step": 176 + }, + { + "epoch": 0.27367607267104754, + "grad_norm": 0.42236700654029846, + "learning_rate": 4.6770134228187926e-05, + "loss": 0.8964, + "step": 177 + }, + { + "epoch": 0.2752222651720139, + "grad_norm": 0.44100165367126465, + "learning_rate": 4.672818791946309e-05, + "loss": 0.7989, + "step": 178 + }, + { + "epoch": 0.2767684576729803, + "grad_norm": 0.43265581130981445, + "learning_rate": 4.668624161073826e-05, + "loss": 0.8795, + "step": 179 + }, + { + "epoch": 0.27831465017394663, + "grad_norm": 0.43812161684036255, + "learning_rate": 4.664429530201342e-05, + "loss": 0.8984, + "step": 180 + }, + { + "epoch": 0.279860842674913, + "grad_norm": 0.4735831916332245, + "learning_rate": 4.660234899328859e-05, + "loss": 0.8868, + "step": 181 + }, + { + "epoch": 0.2814070351758794, + "grad_norm": 0.4555661678314209, + "learning_rate": 4.6560402684563755e-05, + "loss": 0.9057, + "step": 182 + }, + { + "epoch": 0.2829532276768458, + "grad_norm": 0.44536617398262024, + "learning_rate": 4.6518456375838925e-05, + "loss": 0.8956, + "step": 183 + }, + { + "epoch": 0.28449942017781216, + "grad_norm": 0.4430801272392273, + "learning_rate": 4.6476510067114095e-05, + "loss": 0.9171, + "step": 184 + }, + { + "epoch": 0.2860456126787785, + "grad_norm": 0.4534691572189331, + "learning_rate": 4.6434563758389265e-05, + "loss": 0.7871, + "step": 185 + }, + { + "epoch": 0.28759180517974486, + "grad_norm": 0.46288105845451355, + "learning_rate": 4.6392617449664435e-05, + "loss": 0.8819, + "step": 186 + }, + { + "epoch": 0.28913799768071125, + "grad_norm": 0.4744343161582947, + "learning_rate": 4.63506711409396e-05, + "loss": 0.9415, + "step": 187 + }, + { + "epoch": 0.2906841901816776, + "grad_norm": 0.46916016936302185, + "learning_rate": 4.630872483221477e-05, + "loss": 0.9436, + "step": 188 + }, + { + "epoch": 0.292230382682644, + "grad_norm": 0.4746655821800232, + "learning_rate": 4.626677852348993e-05, + "loss": 0.9039, + "step": 189 + }, + { + "epoch": 0.29377657518361033, + "grad_norm": 0.48333853483200073, + "learning_rate": 4.62248322147651e-05, + "loss": 0.9466, + "step": 190 + }, + { + "epoch": 0.2953227676845767, + "grad_norm": 0.4735361933708191, + "learning_rate": 4.618288590604027e-05, + "loss": 0.9088, + "step": 191 + }, + { + "epoch": 0.2968689601855431, + "grad_norm": 0.484523206949234, + "learning_rate": 4.6140939597315434e-05, + "loss": 0.909, + "step": 192 + }, + { + "epoch": 0.2984151526865095, + "grad_norm": 0.5111984610557556, + "learning_rate": 4.609899328859061e-05, + "loss": 0.9451, + "step": 193 + }, + { + "epoch": 0.29996134518747586, + "grad_norm": 0.5157277584075928, + "learning_rate": 4.6057046979865774e-05, + "loss": 0.9768, + "step": 194 + }, + { + "epoch": 0.3015075376884422, + "grad_norm": 0.5363951325416565, + "learning_rate": 4.6015100671140944e-05, + "loss": 0.9411, + "step": 195 + }, + { + "epoch": 0.30305373018940857, + "grad_norm": 0.5306119918823242, + "learning_rate": 4.597315436241611e-05, + "loss": 0.9238, + "step": 196 + }, + { + "epoch": 0.30459992269037495, + "grad_norm": 0.5576231479644775, + "learning_rate": 4.593120805369128e-05, + "loss": 0.9866, + "step": 197 + }, + { + "epoch": 0.30614611519134133, + "grad_norm": 0.6054885983467102, + "learning_rate": 4.588926174496645e-05, + "loss": 0.9987, + "step": 198 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 0.6553575396537781, + "learning_rate": 4.584731543624161e-05, + "loss": 1.0413, + "step": 199 + }, + { + "epoch": 0.30923850019327404, + "grad_norm": 1.023345708847046, + "learning_rate": 4.580536912751678e-05, + "loss": 1.0355, + "step": 200 + }, + { + "epoch": 0.3107846926942404, + "grad_norm": 0.4243873059749603, + "learning_rate": 4.576342281879195e-05, + "loss": 0.7642, + "step": 201 + }, + { + "epoch": 0.3123308851952068, + "grad_norm": 0.4183518886566162, + "learning_rate": 4.572147651006712e-05, + "loss": 0.8115, + "step": 202 + }, + { + "epoch": 0.3138770776961732, + "grad_norm": 0.43979495763778687, + "learning_rate": 4.5679530201342284e-05, + "loss": 0.7705, + "step": 203 + }, + { + "epoch": 0.31542327019713956, + "grad_norm": 0.44140326976776123, + "learning_rate": 4.5637583892617453e-05, + "loss": 0.7784, + "step": 204 + }, + { + "epoch": 0.31696946269810594, + "grad_norm": 0.4253062903881073, + "learning_rate": 4.559563758389262e-05, + "loss": 0.8006, + "step": 205 + }, + { + "epoch": 0.31851565519907227, + "grad_norm": 0.41898655891418457, + "learning_rate": 4.5553691275167787e-05, + "loss": 0.8304, + "step": 206 + }, + { + "epoch": 0.32006184770003865, + "grad_norm": 0.3986593186855316, + "learning_rate": 4.5511744966442957e-05, + "loss": 0.7815, + "step": 207 + }, + { + "epoch": 0.32160804020100503, + "grad_norm": 0.42180249094963074, + "learning_rate": 4.546979865771812e-05, + "loss": 0.859, + "step": 208 + }, + { + "epoch": 0.3231542327019714, + "grad_norm": 0.429800808429718, + "learning_rate": 4.542785234899329e-05, + "loss": 0.8188, + "step": 209 + }, + { + "epoch": 0.3247004252029378, + "grad_norm": 0.4582015573978424, + "learning_rate": 4.538590604026846e-05, + "loss": 0.8069, + "step": 210 + }, + { + "epoch": 0.3262466177039041, + "grad_norm": 0.44153645634651184, + "learning_rate": 4.534395973154363e-05, + "loss": 0.8107, + "step": 211 + }, + { + "epoch": 0.3277928102048705, + "grad_norm": 0.4466538429260254, + "learning_rate": 4.530201342281879e-05, + "loss": 0.8076, + "step": 212 + }, + { + "epoch": 0.3293390027058369, + "grad_norm": 0.4263439178466797, + "learning_rate": 4.526006711409396e-05, + "loss": 0.8532, + "step": 213 + }, + { + "epoch": 0.33088519520680326, + "grad_norm": 0.469927042722702, + "learning_rate": 4.521812080536913e-05, + "loss": 0.8264, + "step": 214 + }, + { + "epoch": 0.33243138770776964, + "grad_norm": 0.45597830414772034, + "learning_rate": 4.5176174496644296e-05, + "loss": 0.8125, + "step": 215 + }, + { + "epoch": 0.33397758020873597, + "grad_norm": 0.43289533257484436, + "learning_rate": 4.5134228187919466e-05, + "loss": 0.8223, + "step": 216 + }, + { + "epoch": 0.33552377270970235, + "grad_norm": 0.4476693272590637, + "learning_rate": 4.509228187919463e-05, + "loss": 0.8206, + "step": 217 + }, + { + "epoch": 0.33706996521066873, + "grad_norm": 0.43755078315734863, + "learning_rate": 4.50503355704698e-05, + "loss": 0.8084, + "step": 218 + }, + { + "epoch": 0.3386161577116351, + "grad_norm": 0.4320957064628601, + "learning_rate": 4.500838926174497e-05, + "loss": 0.8127, + "step": 219 + }, + { + "epoch": 0.3401623502126015, + "grad_norm": 0.45105233788490295, + "learning_rate": 4.496644295302014e-05, + "loss": 0.8208, + "step": 220 + }, + { + "epoch": 0.3417085427135678, + "grad_norm": 0.42372819781303406, + "learning_rate": 4.49244966442953e-05, + "loss": 0.8685, + "step": 221 + }, + { + "epoch": 0.3432547352145342, + "grad_norm": 0.4444120526313782, + "learning_rate": 4.488255033557047e-05, + "loss": 0.8501, + "step": 222 + }, + { + "epoch": 0.3448009277155006, + "grad_norm": 0.4355757236480713, + "learning_rate": 4.484060402684564e-05, + "loss": 0.8507, + "step": 223 + }, + { + "epoch": 0.34634712021646696, + "grad_norm": 0.45914098620414734, + "learning_rate": 4.4798657718120805e-05, + "loss": 0.9056, + "step": 224 + }, + { + "epoch": 0.34789331271743335, + "grad_norm": 0.4455774128437042, + "learning_rate": 4.4756711409395975e-05, + "loss": 0.8716, + "step": 225 + }, + { + "epoch": 0.34943950521839967, + "grad_norm": 0.4548904299736023, + "learning_rate": 4.471476510067114e-05, + "loss": 0.7938, + "step": 226 + }, + { + "epoch": 0.35098569771936605, + "grad_norm": 0.44709107279777527, + "learning_rate": 4.467281879194631e-05, + "loss": 0.8248, + "step": 227 + }, + { + "epoch": 0.35253189022033243, + "grad_norm": 0.4717913866043091, + "learning_rate": 4.463087248322148e-05, + "loss": 0.864, + "step": 228 + }, + { + "epoch": 0.3540780827212988, + "grad_norm": 0.48902633786201477, + "learning_rate": 4.458892617449665e-05, + "loss": 0.7545, + "step": 229 + }, + { + "epoch": 0.3556242752222652, + "grad_norm": 0.4466339945793152, + "learning_rate": 4.454697986577182e-05, + "loss": 0.8869, + "step": 230 + }, + { + "epoch": 0.3571704677232315, + "grad_norm": 0.4503350257873535, + "learning_rate": 4.450503355704698e-05, + "loss": 0.8719, + "step": 231 + }, + { + "epoch": 0.3587166602241979, + "grad_norm": 0.4927978217601776, + "learning_rate": 4.446308724832215e-05, + "loss": 0.9214, + "step": 232 + }, + { + "epoch": 0.3602628527251643, + "grad_norm": 0.4751432240009308, + "learning_rate": 4.4421140939597314e-05, + "loss": 0.8735, + "step": 233 + }, + { + "epoch": 0.36180904522613067, + "grad_norm": 0.4933432638645172, + "learning_rate": 4.4379194630872484e-05, + "loss": 0.8353, + "step": 234 + }, + { + "epoch": 0.36335523772709705, + "grad_norm": 0.5034083127975464, + "learning_rate": 4.4337248322147654e-05, + "loss": 0.9168, + "step": 235 + }, + { + "epoch": 0.3649014302280634, + "grad_norm": 0.5039856433868408, + "learning_rate": 4.4295302013422824e-05, + "loss": 0.9183, + "step": 236 + }, + { + "epoch": 0.36644762272902975, + "grad_norm": 0.47999468445777893, + "learning_rate": 4.4253355704697994e-05, + "loss": 0.9402, + "step": 237 + }, + { + "epoch": 0.36799381522999614, + "grad_norm": 0.5011245608329773, + "learning_rate": 4.421140939597316e-05, + "loss": 0.9587, + "step": 238 + }, + { + "epoch": 0.3695400077309625, + "grad_norm": 0.49425458908081055, + "learning_rate": 4.416946308724833e-05, + "loss": 0.8474, + "step": 239 + }, + { + "epoch": 0.3710862002319289, + "grad_norm": 0.514880359172821, + "learning_rate": 4.412751677852349e-05, + "loss": 0.9016, + "step": 240 + }, + { + "epoch": 0.3726323927328952, + "grad_norm": 0.5200977325439453, + "learning_rate": 4.408557046979866e-05, + "loss": 0.9552, + "step": 241 + }, + { + "epoch": 0.3741785852338616, + "grad_norm": 0.5420643091201782, + "learning_rate": 4.4043624161073823e-05, + "loss": 0.9023, + "step": 242 + }, + { + "epoch": 0.375724777734828, + "grad_norm": 0.5034189820289612, + "learning_rate": 4.4001677852348993e-05, + "loss": 0.9255, + "step": 243 + }, + { + "epoch": 0.37727097023579437, + "grad_norm": 0.529698371887207, + "learning_rate": 4.395973154362416e-05, + "loss": 0.9337, + "step": 244 + }, + { + "epoch": 0.37881716273676075, + "grad_norm": 0.5248023867607117, + "learning_rate": 4.391778523489933e-05, + "loss": 0.9626, + "step": 245 + }, + { + "epoch": 0.3803633552377271, + "grad_norm": 0.5809698700904846, + "learning_rate": 4.38758389261745e-05, + "loss": 0.9598, + "step": 246 + }, + { + "epoch": 0.38190954773869346, + "grad_norm": 0.5663429498672485, + "learning_rate": 4.3833892617449666e-05, + "loss": 0.979, + "step": 247 + }, + { + "epoch": 0.38345574023965984, + "grad_norm": 0.6147710680961609, + "learning_rate": 4.3791946308724836e-05, + "loss": 0.9747, + "step": 248 + }, + { + "epoch": 0.3850019327406262, + "grad_norm": 0.7502771615982056, + "learning_rate": 4.375e-05, + "loss": 0.9344, + "step": 249 + }, + { + "epoch": 0.3865481252415926, + "grad_norm": 1.0911471843719482, + "learning_rate": 4.370805369127517e-05, + "loss": 1.0075, + "step": 250 + }, + { + "epoch": 0.3880943177425589, + "grad_norm": 0.4232114255428314, + "learning_rate": 4.366610738255034e-05, + "loss": 0.7643, + "step": 251 + }, + { + "epoch": 0.3896405102435253, + "grad_norm": 0.423627644777298, + "learning_rate": 4.36241610738255e-05, + "loss": 0.7551, + "step": 252 + }, + { + "epoch": 0.3911867027444917, + "grad_norm": 0.4198389947414398, + "learning_rate": 4.358221476510067e-05, + "loss": 0.7888, + "step": 253 + }, + { + "epoch": 0.39273289524545807, + "grad_norm": 0.43714022636413574, + "learning_rate": 4.354026845637584e-05, + "loss": 0.7958, + "step": 254 + }, + { + "epoch": 0.39427908774642445, + "grad_norm": 0.4278389513492584, + "learning_rate": 4.349832214765101e-05, + "loss": 0.7959, + "step": 255 + }, + { + "epoch": 0.3958252802473908, + "grad_norm": 0.45499542355537415, + "learning_rate": 4.3456375838926176e-05, + "loss": 0.7461, + "step": 256 + }, + { + "epoch": 0.39737147274835716, + "grad_norm": 0.46723300218582153, + "learning_rate": 4.3414429530201346e-05, + "loss": 0.8312, + "step": 257 + }, + { + "epoch": 0.39891766524932354, + "grad_norm": 0.44024282693862915, + "learning_rate": 4.337248322147651e-05, + "loss": 0.8517, + "step": 258 + }, + { + "epoch": 0.4004638577502899, + "grad_norm": 0.42471960186958313, + "learning_rate": 4.333053691275168e-05, + "loss": 0.8312, + "step": 259 + }, + { + "epoch": 0.4020100502512563, + "grad_norm": 0.47083520889282227, + "learning_rate": 4.328859060402685e-05, + "loss": 0.8618, + "step": 260 + }, + { + "epoch": 0.4035562427522226, + "grad_norm": 0.45681217312812805, + "learning_rate": 4.324664429530201e-05, + "loss": 0.8986, + "step": 261 + }, + { + "epoch": 0.405102435253189, + "grad_norm": 0.42533236742019653, + "learning_rate": 4.320469798657718e-05, + "loss": 0.824, + "step": 262 + }, + { + "epoch": 0.4066486277541554, + "grad_norm": 0.4372871220111847, + "learning_rate": 4.316275167785235e-05, + "loss": 0.8077, + "step": 263 + }, + { + "epoch": 0.40819482025512177, + "grad_norm": 0.45690760016441345, + "learning_rate": 4.312080536912752e-05, + "loss": 0.8735, + "step": 264 + }, + { + "epoch": 0.40974101275608815, + "grad_norm": 0.46814388036727905, + "learning_rate": 4.3078859060402685e-05, + "loss": 0.8161, + "step": 265 + }, + { + "epoch": 0.4112872052570545, + "grad_norm": 0.4442387521266937, + "learning_rate": 4.3036912751677855e-05, + "loss": 0.8352, + "step": 266 + }, + { + "epoch": 0.41283339775802086, + "grad_norm": 0.4814305305480957, + "learning_rate": 4.2994966442953025e-05, + "loss": 0.7508, + "step": 267 + }, + { + "epoch": 0.41437959025898724, + "grad_norm": 0.4563637673854828, + "learning_rate": 4.295302013422819e-05, + "loss": 0.8639, + "step": 268 + }, + { + "epoch": 0.4159257827599536, + "grad_norm": 0.4439164698123932, + "learning_rate": 4.291107382550336e-05, + "loss": 0.8466, + "step": 269 + }, + { + "epoch": 0.41747197526092, + "grad_norm": 0.45698022842407227, + "learning_rate": 4.286912751677852e-05, + "loss": 0.8784, + "step": 270 + }, + { + "epoch": 0.41901816776188633, + "grad_norm": 0.46446678042411804, + "learning_rate": 4.28271812080537e-05, + "loss": 0.7856, + "step": 271 + }, + { + "epoch": 0.4205643602628527, + "grad_norm": 0.4599681496620178, + "learning_rate": 4.278523489932886e-05, + "loss": 0.8241, + "step": 272 + }, + { + "epoch": 0.4221105527638191, + "grad_norm": 0.4912761449813843, + "learning_rate": 4.274328859060403e-05, + "loss": 0.8697, + "step": 273 + }, + { + "epoch": 0.4236567452647855, + "grad_norm": 0.4875286817550659, + "learning_rate": 4.27013422818792e-05, + "loss": 0.7999, + "step": 274 + }, + { + "epoch": 0.42520293776575185, + "grad_norm": 0.4821873903274536, + "learning_rate": 4.2659395973154364e-05, + "loss": 0.7944, + "step": 275 + }, + { + "epoch": 0.4267491302667182, + "grad_norm": 0.4670597016811371, + "learning_rate": 4.2617449664429534e-05, + "loss": 0.9423, + "step": 276 + }, + { + "epoch": 0.42829532276768456, + "grad_norm": 0.5004227757453918, + "learning_rate": 4.25755033557047e-05, + "loss": 0.8073, + "step": 277 + }, + { + "epoch": 0.42984151526865094, + "grad_norm": 0.510180652141571, + "learning_rate": 4.253355704697987e-05, + "loss": 0.8658, + "step": 278 + }, + { + "epoch": 0.4313877077696173, + "grad_norm": 0.4866536855697632, + "learning_rate": 4.249161073825503e-05, + "loss": 0.7532, + "step": 279 + }, + { + "epoch": 0.4329339002705837, + "grad_norm": 0.5010313987731934, + "learning_rate": 4.244966442953021e-05, + "loss": 0.9368, + "step": 280 + }, + { + "epoch": 0.43448009277155003, + "grad_norm": 0.5455735325813293, + "learning_rate": 4.240771812080537e-05, + "loss": 0.9058, + "step": 281 + }, + { + "epoch": 0.4360262852725164, + "grad_norm": 0.47950518131256104, + "learning_rate": 4.236577181208054e-05, + "loss": 0.8709, + "step": 282 + }, + { + "epoch": 0.4375724777734828, + "grad_norm": 0.5143994092941284, + "learning_rate": 4.232382550335571e-05, + "loss": 0.8528, + "step": 283 + }, + { + "epoch": 0.4391186702744492, + "grad_norm": 0.5421144366264343, + "learning_rate": 4.228187919463087e-05, + "loss": 0.8989, + "step": 284 + }, + { + "epoch": 0.44066486277541556, + "grad_norm": 0.5018422603607178, + "learning_rate": 4.223993288590604e-05, + "loss": 0.8903, + "step": 285 + }, + { + "epoch": 0.44221105527638194, + "grad_norm": 0.4975266754627228, + "learning_rate": 4.2197986577181206e-05, + "loss": 0.9568, + "step": 286 + }, + { + "epoch": 0.44375724777734826, + "grad_norm": 0.4982987642288208, + "learning_rate": 4.2156040268456376e-05, + "loss": 0.946, + "step": 287 + }, + { + "epoch": 0.44530344027831464, + "grad_norm": 0.5241461992263794, + "learning_rate": 4.2114093959731546e-05, + "loss": 0.9359, + "step": 288 + }, + { + "epoch": 0.446849632779281, + "grad_norm": 0.4898720681667328, + "learning_rate": 4.2072147651006716e-05, + "loss": 0.9798, + "step": 289 + }, + { + "epoch": 0.4483958252802474, + "grad_norm": 0.5274215936660767, + "learning_rate": 4.2030201342281886e-05, + "loss": 0.8603, + "step": 290 + }, + { + "epoch": 0.4499420177812138, + "grad_norm": 0.5538789629936218, + "learning_rate": 4.198825503355705e-05, + "loss": 0.8873, + "step": 291 + }, + { + "epoch": 0.4514882102821801, + "grad_norm": 0.5374253392219543, + "learning_rate": 4.194630872483222e-05, + "loss": 1.0174, + "step": 292 + }, + { + "epoch": 0.4530344027831465, + "grad_norm": 0.5286028385162354, + "learning_rate": 4.190436241610738e-05, + "loss": 0.9891, + "step": 293 + }, + { + "epoch": 0.4545805952841129, + "grad_norm": 0.5447548627853394, + "learning_rate": 4.186241610738255e-05, + "loss": 0.9587, + "step": 294 + }, + { + "epoch": 0.45612678778507926, + "grad_norm": 0.5515534281730652, + "learning_rate": 4.1820469798657716e-05, + "loss": 0.9663, + "step": 295 + }, + { + "epoch": 0.45767298028604564, + "grad_norm": 0.6086519956588745, + "learning_rate": 4.1778523489932886e-05, + "loss": 1.0357, + "step": 296 + }, + { + "epoch": 0.45921917278701196, + "grad_norm": 0.6332388520240784, + "learning_rate": 4.1736577181208055e-05, + "loss": 0.9999, + "step": 297 + }, + { + "epoch": 0.46076536528797835, + "grad_norm": 0.6928420066833496, + "learning_rate": 4.1694630872483225e-05, + "loss": 1.0243, + "step": 298 + }, + { + "epoch": 0.4623115577889447, + "grad_norm": 0.7226160168647766, + "learning_rate": 4.1652684563758395e-05, + "loss": 0.9209, + "step": 299 + }, + { + "epoch": 0.4638577502899111, + "grad_norm": 1.0083510875701904, + "learning_rate": 4.161073825503356e-05, + "loss": 0.9493, + "step": 300 + }, + { + "epoch": 0.4654039427908775, + "grad_norm": 0.4649132192134857, + "learning_rate": 4.156879194630873e-05, + "loss": 0.7743, + "step": 301 + }, + { + "epoch": 0.4669501352918438, + "grad_norm": 0.43430233001708984, + "learning_rate": 4.152684563758389e-05, + "loss": 0.8197, + "step": 302 + }, + { + "epoch": 0.4684963277928102, + "grad_norm": 0.44487112760543823, + "learning_rate": 4.148489932885906e-05, + "loss": 0.7632, + "step": 303 + }, + { + "epoch": 0.4700425202937766, + "grad_norm": 0.4654642343521118, + "learning_rate": 4.144295302013423e-05, + "loss": 0.7637, + "step": 304 + }, + { + "epoch": 0.47158871279474296, + "grad_norm": 0.48062166571617126, + "learning_rate": 4.1401006711409395e-05, + "loss": 0.7771, + "step": 305 + }, + { + "epoch": 0.47313490529570934, + "grad_norm": 0.4307418167591095, + "learning_rate": 4.135906040268457e-05, + "loss": 0.8336, + "step": 306 + }, + { + "epoch": 0.47468109779667567, + "grad_norm": 0.4675520658493042, + "learning_rate": 4.1317114093959735e-05, + "loss": 0.7969, + "step": 307 + }, + { + "epoch": 0.47622729029764205, + "grad_norm": 0.48613134026527405, + "learning_rate": 4.1275167785234905e-05, + "loss": 0.8821, + "step": 308 + }, + { + "epoch": 0.47777348279860843, + "grad_norm": 0.46094274520874023, + "learning_rate": 4.123322147651007e-05, + "loss": 0.8103, + "step": 309 + }, + { + "epoch": 0.4793196752995748, + "grad_norm": 0.47018101811408997, + "learning_rate": 4.119127516778524e-05, + "loss": 0.8515, + "step": 310 + }, + { + "epoch": 0.4808658678005412, + "grad_norm": 0.43754515051841736, + "learning_rate": 4.11493288590604e-05, + "loss": 0.8218, + "step": 311 + }, + { + "epoch": 0.4824120603015075, + "grad_norm": 0.49217140674591064, + "learning_rate": 4.110738255033557e-05, + "loss": 0.7866, + "step": 312 + }, + { + "epoch": 0.4839582528024739, + "grad_norm": 0.47612541913986206, + "learning_rate": 4.106543624161074e-05, + "loss": 0.8088, + "step": 313 + }, + { + "epoch": 0.4855044453034403, + "grad_norm": 0.475328654050827, + "learning_rate": 4.1023489932885904e-05, + "loss": 0.8165, + "step": 314 + }, + { + "epoch": 0.48705063780440666, + "grad_norm": 0.4439482092857361, + "learning_rate": 4.098154362416108e-05, + "loss": 0.7833, + "step": 315 + }, + { + "epoch": 0.48859683030537304, + "grad_norm": 0.5019133687019348, + "learning_rate": 4.0939597315436244e-05, + "loss": 0.7942, + "step": 316 + }, + { + "epoch": 0.49014302280633937, + "grad_norm": 0.451492577791214, + "learning_rate": 4.0897651006711414e-05, + "loss": 0.8039, + "step": 317 + }, + { + "epoch": 0.49168921530730575, + "grad_norm": 0.46894291043281555, + "learning_rate": 4.085570469798658e-05, + "loss": 0.843, + "step": 318 + }, + { + "epoch": 0.49323540780827213, + "grad_norm": 0.49084803462028503, + "learning_rate": 4.081375838926175e-05, + "loss": 0.8121, + "step": 319 + }, + { + "epoch": 0.4947816003092385, + "grad_norm": 0.46862879395484924, + "learning_rate": 4.077181208053692e-05, + "loss": 0.901, + "step": 320 + }, + { + "epoch": 0.4963277928102049, + "grad_norm": 0.4881606698036194, + "learning_rate": 4.072986577181208e-05, + "loss": 0.9001, + "step": 321 + }, + { + "epoch": 0.4978739853111712, + "grad_norm": 0.5040379166603088, + "learning_rate": 4.068791946308725e-05, + "loss": 0.801, + "step": 322 + }, + { + "epoch": 0.4994201778121376, + "grad_norm": 0.46498745679855347, + "learning_rate": 4.064597315436241e-05, + "loss": 0.8473, + "step": 323 + }, + { + "epoch": 0.500966370313104, + "grad_norm": 0.4876708388328552, + "learning_rate": 4.060402684563759e-05, + "loss": 0.8418, + "step": 324 + }, + { + "epoch": 0.5025125628140703, + "grad_norm": 0.4962271451950073, + "learning_rate": 4.056208053691275e-05, + "loss": 0.7611, + "step": 325 + }, + { + "epoch": 0.5040587553150367, + "grad_norm": 0.48774364590644836, + "learning_rate": 4.052013422818792e-05, + "loss": 0.8825, + "step": 326 + }, + { + "epoch": 0.5056049478160031, + "grad_norm": 0.5011964440345764, + "learning_rate": 4.047818791946309e-05, + "loss": 0.7659, + "step": 327 + }, + { + "epoch": 0.5071511403169695, + "grad_norm": 0.5307598114013672, + "learning_rate": 4.0436241610738256e-05, + "loss": 0.8629, + "step": 328 + }, + { + "epoch": 0.5086973328179358, + "grad_norm": 0.49158573150634766, + "learning_rate": 4.0394295302013426e-05, + "loss": 0.9071, + "step": 329 + }, + { + "epoch": 0.5102435253189022, + "grad_norm": 0.5126063227653503, + "learning_rate": 4.035234899328859e-05, + "loss": 0.7998, + "step": 330 + }, + { + "epoch": 0.5117897178198686, + "grad_norm": 0.5247028470039368, + "learning_rate": 4.031040268456376e-05, + "loss": 0.8443, + "step": 331 + }, + { + "epoch": 0.5133359103208349, + "grad_norm": 0.5511295199394226, + "learning_rate": 4.026845637583892e-05, + "loss": 0.8946, + "step": 332 + }, + { + "epoch": 0.5148821028218014, + "grad_norm": 0.5266992449760437, + "learning_rate": 4.02265100671141e-05, + "loss": 0.8172, + "step": 333 + }, + { + "epoch": 0.5164282953227677, + "grad_norm": 0.5326645374298096, + "learning_rate": 4.018456375838926e-05, + "loss": 0.8407, + "step": 334 + }, + { + "epoch": 0.517974487823734, + "grad_norm": 0.5065395832061768, + "learning_rate": 4.014261744966443e-05, + "loss": 0.8751, + "step": 335 + }, + { + "epoch": 0.5195206803247004, + "grad_norm": 0.53061443567276, + "learning_rate": 4.01006711409396e-05, + "loss": 0.9204, + "step": 336 + }, + { + "epoch": 0.5210668728256668, + "grad_norm": 0.5223432779312134, + "learning_rate": 4.0058724832214765e-05, + "loss": 0.8933, + "step": 337 + }, + { + "epoch": 0.5226130653266332, + "grad_norm": 0.5290891528129578, + "learning_rate": 4.0016778523489935e-05, + "loss": 0.977, + "step": 338 + }, + { + "epoch": 0.5241592578275995, + "grad_norm": 0.5419663786888123, + "learning_rate": 3.99748322147651e-05, + "loss": 0.9521, + "step": 339 + }, + { + "epoch": 0.5257054503285659, + "grad_norm": 0.5404860973358154, + "learning_rate": 3.993288590604027e-05, + "loss": 0.8907, + "step": 340 + }, + { + "epoch": 0.5272516428295323, + "grad_norm": 0.5783061385154724, + "learning_rate": 3.989093959731544e-05, + "loss": 0.9162, + "step": 341 + }, + { + "epoch": 0.5287978353304986, + "grad_norm": 0.5513641834259033, + "learning_rate": 3.984899328859061e-05, + "loss": 0.8912, + "step": 342 + }, + { + "epoch": 0.5303440278314651, + "grad_norm": 0.5670628547668457, + "learning_rate": 3.980704697986578e-05, + "loss": 0.9502, + "step": 343 + }, + { + "epoch": 0.5318902203324314, + "grad_norm": 0.5960517525672913, + "learning_rate": 3.976510067114094e-05, + "loss": 0.9976, + "step": 344 + }, + { + "epoch": 0.5334364128333977, + "grad_norm": 0.6074213981628418, + "learning_rate": 3.972315436241611e-05, + "loss": 0.9861, + "step": 345 + }, + { + "epoch": 0.5349826053343641, + "grad_norm": 0.6100422143936157, + "learning_rate": 3.9681208053691275e-05, + "loss": 0.9483, + "step": 346 + }, + { + "epoch": 0.5365287978353305, + "grad_norm": 0.6308846473693848, + "learning_rate": 3.9639261744966445e-05, + "loss": 0.9455, + "step": 347 + }, + { + "epoch": 0.5380749903362969, + "grad_norm": 0.6239264607429504, + "learning_rate": 3.959731543624161e-05, + "loss": 0.9551, + "step": 348 + }, + { + "epoch": 0.5396211828372632, + "grad_norm": 0.6669812798500061, + "learning_rate": 3.955536912751678e-05, + "loss": 0.9251, + "step": 349 + }, + { + "epoch": 0.5411673753382296, + "grad_norm": 0.8175333738327026, + "learning_rate": 3.951342281879195e-05, + "loss": 0.9795, + "step": 350 + }, + { + "epoch": 0.542713567839196, + "grad_norm": 0.44942405819892883, + "learning_rate": 3.947147651006712e-05, + "loss": 0.7184, + "step": 351 + }, + { + "epoch": 0.5442597603401623, + "grad_norm": 0.46283286809921265, + "learning_rate": 3.942953020134229e-05, + "loss": 0.7251, + "step": 352 + }, + { + "epoch": 0.5458059528411288, + "grad_norm": 0.47923749685287476, + "learning_rate": 3.938758389261745e-05, + "loss": 0.766, + "step": 353 + }, + { + "epoch": 0.5473521453420951, + "grad_norm": 0.4575538635253906, + "learning_rate": 3.934563758389262e-05, + "loss": 0.7539, + "step": 354 + }, + { + "epoch": 0.5488983378430614, + "grad_norm": 0.48839524388313293, + "learning_rate": 3.9303691275167784e-05, + "loss": 0.7633, + "step": 355 + }, + { + "epoch": 0.5504445303440278, + "grad_norm": 0.45980286598205566, + "learning_rate": 3.9261744966442954e-05, + "loss": 0.774, + "step": 356 + }, + { + "epoch": 0.5519907228449942, + "grad_norm": 0.4767687916755676, + "learning_rate": 3.9219798657718124e-05, + "loss": 0.7588, + "step": 357 + }, + { + "epoch": 0.5535369153459606, + "grad_norm": 0.4907895028591156, + "learning_rate": 3.917785234899329e-05, + "loss": 0.7305, + "step": 358 + }, + { + "epoch": 0.5550831078469269, + "grad_norm": 0.46510565280914307, + "learning_rate": 3.9135906040268464e-05, + "loss": 0.8727, + "step": 359 + }, + { + "epoch": 0.5566293003478933, + "grad_norm": 0.4917638599872589, + "learning_rate": 3.909395973154363e-05, + "loss": 0.8013, + "step": 360 + }, + { + "epoch": 0.5581754928488597, + "grad_norm": 0.49160364270210266, + "learning_rate": 3.90520134228188e-05, + "loss": 0.8871, + "step": 361 + }, + { + "epoch": 0.559721685349826, + "grad_norm": 0.5044489502906799, + "learning_rate": 3.901006711409396e-05, + "loss": 0.7945, + "step": 362 + }, + { + "epoch": 0.5612678778507925, + "grad_norm": 0.49622687697410583, + "learning_rate": 3.896812080536913e-05, + "loss": 0.7616, + "step": 363 + }, + { + "epoch": 0.5628140703517588, + "grad_norm": 0.5114305019378662, + "learning_rate": 3.89261744966443e-05, + "loss": 0.7593, + "step": 364 + }, + { + "epoch": 0.5643602628527251, + "grad_norm": 0.499931663274765, + "learning_rate": 3.888422818791946e-05, + "loss": 0.7935, + "step": 365 + }, + { + "epoch": 0.5659064553536916, + "grad_norm": 0.511566698551178, + "learning_rate": 3.884228187919463e-05, + "loss": 0.7665, + "step": 366 + }, + { + "epoch": 0.5674526478546579, + "grad_norm": 0.49260082840919495, + "learning_rate": 3.8800335570469796e-05, + "loss": 0.8207, + "step": 367 + }, + { + "epoch": 0.5689988403556243, + "grad_norm": 0.48072633147239685, + "learning_rate": 3.875838926174497e-05, + "loss": 0.7739, + "step": 368 + }, + { + "epoch": 0.5705450328565906, + "grad_norm": 0.4969918727874756, + "learning_rate": 3.8716442953020136e-05, + "loss": 0.8155, + "step": 369 + }, + { + "epoch": 0.572091225357557, + "grad_norm": 0.49780312180519104, + "learning_rate": 3.8674496644295306e-05, + "loss": 0.8397, + "step": 370 + }, + { + "epoch": 0.5736374178585234, + "grad_norm": 0.5126276612281799, + "learning_rate": 3.863255033557047e-05, + "loss": 0.8101, + "step": 371 + }, + { + "epoch": 0.5751836103594897, + "grad_norm": 0.48839470744132996, + "learning_rate": 3.859060402684564e-05, + "loss": 0.7629, + "step": 372 + }, + { + "epoch": 0.5767298028604562, + "grad_norm": 0.4904743432998657, + "learning_rate": 3.854865771812081e-05, + "loss": 0.8126, + "step": 373 + }, + { + "epoch": 0.5782759953614225, + "grad_norm": 0.48648014664649963, + "learning_rate": 3.850671140939597e-05, + "loss": 0.845, + "step": 374 + }, + { + "epoch": 0.5798221878623888, + "grad_norm": 0.537968635559082, + "learning_rate": 3.846476510067114e-05, + "loss": 0.863, + "step": 375 + }, + { + "epoch": 0.5813683803633553, + "grad_norm": 0.5034117102622986, + "learning_rate": 3.8422818791946305e-05, + "loss": 0.8333, + "step": 376 + }, + { + "epoch": 0.5829145728643216, + "grad_norm": 0.529244601726532, + "learning_rate": 3.838087248322148e-05, + "loss": 0.7756, + "step": 377 + }, + { + "epoch": 0.584460765365288, + "grad_norm": 0.5105958580970764, + "learning_rate": 3.8338926174496645e-05, + "loss": 0.8372, + "step": 378 + }, + { + "epoch": 0.5860069578662543, + "grad_norm": 0.5209059715270996, + "learning_rate": 3.8296979865771815e-05, + "loss": 0.8347, + "step": 379 + }, + { + "epoch": 0.5875531503672207, + "grad_norm": 0.5022969841957092, + "learning_rate": 3.8255033557046985e-05, + "loss": 0.8657, + "step": 380 + }, + { + "epoch": 0.5890993428681871, + "grad_norm": 0.5265902876853943, + "learning_rate": 3.821308724832215e-05, + "loss": 0.8333, + "step": 381 + }, + { + "epoch": 0.5906455353691534, + "grad_norm": 0.5082066655158997, + "learning_rate": 3.817114093959732e-05, + "loss": 0.8834, + "step": 382 + }, + { + "epoch": 0.5921917278701199, + "grad_norm": 0.5264511704444885, + "learning_rate": 3.812919463087248e-05, + "loss": 0.908, + "step": 383 + }, + { + "epoch": 0.5937379203710862, + "grad_norm": 0.5576880574226379, + "learning_rate": 3.808724832214765e-05, + "loss": 0.9174, + "step": 384 + }, + { + "epoch": 0.5952841128720525, + "grad_norm": 0.5170852541923523, + "learning_rate": 3.804530201342282e-05, + "loss": 0.8983, + "step": 385 + }, + { + "epoch": 0.596830305373019, + "grad_norm": 0.5212023854255676, + "learning_rate": 3.800335570469799e-05, + "loss": 0.8684, + "step": 386 + }, + { + "epoch": 0.5983764978739853, + "grad_norm": 0.5252931714057922, + "learning_rate": 3.7961409395973154e-05, + "loss": 0.9016, + "step": 387 + }, + { + "epoch": 0.5999226903749517, + "grad_norm": 0.5745819807052612, + "learning_rate": 3.7919463087248324e-05, + "loss": 0.8521, + "step": 388 + }, + { + "epoch": 0.601468882875918, + "grad_norm": 0.5259801149368286, + "learning_rate": 3.7877516778523494e-05, + "loss": 0.9101, + "step": 389 + }, + { + "epoch": 0.6030150753768844, + "grad_norm": 0.5453861355781555, + "learning_rate": 3.783557046979866e-05, + "loss": 0.8816, + "step": 390 + }, + { + "epoch": 0.6045612678778508, + "grad_norm": 0.5567570328712463, + "learning_rate": 3.779362416107383e-05, + "loss": 0.9126, + "step": 391 + }, + { + "epoch": 0.6061074603788171, + "grad_norm": 0.5654810070991516, + "learning_rate": 3.775167785234899e-05, + "loss": 0.8147, + "step": 392 + }, + { + "epoch": 0.6076536528797836, + "grad_norm": 0.5371072888374329, + "learning_rate": 3.770973154362416e-05, + "loss": 0.936, + "step": 393 + }, + { + "epoch": 0.6091998453807499, + "grad_norm": 0.6840121746063232, + "learning_rate": 3.766778523489933e-05, + "loss": 1.0121, + "step": 394 + }, + { + "epoch": 0.6107460378817162, + "grad_norm": 0.6007458567619324, + "learning_rate": 3.76258389261745e-05, + "loss": 0.895, + "step": 395 + }, + { + "epoch": 0.6122922303826827, + "grad_norm": 0.6644576191902161, + "learning_rate": 3.758389261744967e-05, + "loss": 0.9978, + "step": 396 + }, + { + "epoch": 0.613838422883649, + "grad_norm": 0.6520909667015076, + "learning_rate": 3.7541946308724834e-05, + "loss": 0.9551, + "step": 397 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.66536945104599, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.9551, + "step": 398 + }, + { + "epoch": 0.6169308078855817, + "grad_norm": 0.7479529976844788, + "learning_rate": 3.745805369127517e-05, + "loss": 1.022, + "step": 399 + }, + { + "epoch": 0.6184770003865481, + "grad_norm": 1.1065740585327148, + "learning_rate": 3.741610738255034e-05, + "loss": 0.9431, + "step": 400 + }, + { + "epoch": 0.6200231928875145, + "grad_norm": 0.4878085255622864, + "learning_rate": 3.7374161073825507e-05, + "loss": 0.728, + "step": 401 + }, + { + "epoch": 0.6215693853884808, + "grad_norm": 0.4753767251968384, + "learning_rate": 3.733221476510067e-05, + "loss": 0.7575, + "step": 402 + }, + { + "epoch": 0.6231155778894473, + "grad_norm": 0.4956800043582916, + "learning_rate": 3.7290268456375846e-05, + "loss": 0.7292, + "step": 403 + }, + { + "epoch": 0.6246617703904136, + "grad_norm": 0.48405370116233826, + "learning_rate": 3.724832214765101e-05, + "loss": 0.7879, + "step": 404 + }, + { + "epoch": 0.62620796289138, + "grad_norm": 0.46832674741744995, + "learning_rate": 3.720637583892618e-05, + "loss": 0.7819, + "step": 405 + }, + { + "epoch": 0.6277541553923464, + "grad_norm": 0.4691636264324188, + "learning_rate": 3.716442953020134e-05, + "loss": 0.7984, + "step": 406 + }, + { + "epoch": 0.6293003478933127, + "grad_norm": 0.5165214538574219, + "learning_rate": 3.712248322147651e-05, + "loss": 0.7296, + "step": 407 + }, + { + "epoch": 0.6308465403942791, + "grad_norm": 0.5001758933067322, + "learning_rate": 3.7080536912751676e-05, + "loss": 0.808, + "step": 408 + }, + { + "epoch": 0.6323927328952454, + "grad_norm": 0.4783785045146942, + "learning_rate": 3.7038590604026846e-05, + "loss": 0.7856, + "step": 409 + }, + { + "epoch": 0.6339389253962119, + "grad_norm": 0.5511770248413086, + "learning_rate": 3.6996644295302016e-05, + "loss": 0.711, + "step": 410 + }, + { + "epoch": 0.6354851178971782, + "grad_norm": 0.5434517860412598, + "learning_rate": 3.695469798657718e-05, + "loss": 0.8498, + "step": 411 + }, + { + "epoch": 0.6370313103981445, + "grad_norm": 0.5018695592880249, + "learning_rate": 3.6912751677852356e-05, + "loss": 0.7898, + "step": 412 + }, + { + "epoch": 0.638577502899111, + "grad_norm": 0.5000431537628174, + "learning_rate": 3.687080536912752e-05, + "loss": 0.7803, + "step": 413 + }, + { + "epoch": 0.6401236954000773, + "grad_norm": 0.5340814590454102, + "learning_rate": 3.682885906040269e-05, + "loss": 0.7671, + "step": 414 + }, + { + "epoch": 0.6416698879010437, + "grad_norm": 0.5046684741973877, + "learning_rate": 3.678691275167785e-05, + "loss": 0.7876, + "step": 415 + }, + { + "epoch": 0.6432160804020101, + "grad_norm": 0.5036927461624146, + "learning_rate": 3.674496644295302e-05, + "loss": 0.8726, + "step": 416 + }, + { + "epoch": 0.6447622729029764, + "grad_norm": 0.5092120170593262, + "learning_rate": 3.670302013422819e-05, + "loss": 0.806, + "step": 417 + }, + { + "epoch": 0.6463084654039428, + "grad_norm": 0.5002840757369995, + "learning_rate": 3.6661073825503355e-05, + "loss": 0.8814, + "step": 418 + }, + { + "epoch": 0.6478546579049091, + "grad_norm": 0.5107703804969788, + "learning_rate": 3.6619127516778525e-05, + "loss": 0.7984, + "step": 419 + }, + { + "epoch": 0.6494008504058756, + "grad_norm": 0.5407206416130066, + "learning_rate": 3.6577181208053695e-05, + "loss": 0.7979, + "step": 420 + }, + { + "epoch": 0.6509470429068419, + "grad_norm": 0.5402006506919861, + "learning_rate": 3.6535234899328865e-05, + "loss": 0.7547, + "step": 421 + }, + { + "epoch": 0.6524932354078082, + "grad_norm": 0.5221443176269531, + "learning_rate": 3.649328859060403e-05, + "loss": 0.8426, + "step": 422 + }, + { + "epoch": 0.6540394279087747, + "grad_norm": 0.5319470167160034, + "learning_rate": 3.64513422818792e-05, + "loss": 0.7866, + "step": 423 + }, + { + "epoch": 0.655585620409741, + "grad_norm": 0.5060355067253113, + "learning_rate": 3.640939597315436e-05, + "loss": 0.8102, + "step": 424 + }, + { + "epoch": 0.6571318129107074, + "grad_norm": 0.5067129731178284, + "learning_rate": 3.636744966442953e-05, + "loss": 0.8316, + "step": 425 + }, + { + "epoch": 0.6586780054116738, + "grad_norm": 0.5815935730934143, + "learning_rate": 3.63255033557047e-05, + "loss": 0.7207, + "step": 426 + }, + { + "epoch": 0.6602241979126401, + "grad_norm": 0.518981397151947, + "learning_rate": 3.6283557046979864e-05, + "loss": 0.7909, + "step": 427 + }, + { + "epoch": 0.6617703904136065, + "grad_norm": 0.5797117948532104, + "learning_rate": 3.6241610738255034e-05, + "loss": 0.8346, + "step": 428 + }, + { + "epoch": 0.6633165829145728, + "grad_norm": 0.565744936466217, + "learning_rate": 3.6199664429530204e-05, + "loss": 0.8366, + "step": 429 + }, + { + "epoch": 0.6648627754155393, + "grad_norm": 0.5538972020149231, + "learning_rate": 3.6157718120805374e-05, + "loss": 0.8346, + "step": 430 + }, + { + "epoch": 0.6664089679165056, + "grad_norm": 0.561184823513031, + "learning_rate": 3.611577181208054e-05, + "loss": 0.8389, + "step": 431 + }, + { + "epoch": 0.6679551604174719, + "grad_norm": 0.5446305871009827, + "learning_rate": 3.607382550335571e-05, + "loss": 0.7981, + "step": 432 + }, + { + "epoch": 0.6695013529184384, + "grad_norm": 0.6099398136138916, + "learning_rate": 3.603187919463088e-05, + "loss": 0.8988, + "step": 433 + }, + { + "epoch": 0.6710475454194047, + "grad_norm": 0.572850227355957, + "learning_rate": 3.598993288590604e-05, + "loss": 0.8981, + "step": 434 + }, + { + "epoch": 0.6725937379203711, + "grad_norm": 0.5837070941925049, + "learning_rate": 3.594798657718121e-05, + "loss": 0.9049, + "step": 435 + }, + { + "epoch": 0.6741399304213375, + "grad_norm": 0.5394341945648193, + "learning_rate": 3.5906040268456373e-05, + "loss": 0.8744, + "step": 436 + }, + { + "epoch": 0.6756861229223038, + "grad_norm": 0.5494775772094727, + "learning_rate": 3.5864093959731543e-05, + "loss": 0.9612, + "step": 437 + }, + { + "epoch": 0.6772323154232702, + "grad_norm": 0.5773348808288574, + "learning_rate": 3.582214765100671e-05, + "loss": 0.9005, + "step": 438 + }, + { + "epoch": 0.6787785079242366, + "grad_norm": 0.5757828950881958, + "learning_rate": 3.578020134228188e-05, + "loss": 0.9121, + "step": 439 + }, + { + "epoch": 0.680324700425203, + "grad_norm": 0.6010167598724365, + "learning_rate": 3.5738255033557046e-05, + "loss": 0.8575, + "step": 440 + }, + { + "epoch": 0.6818708929261693, + "grad_norm": 0.621435284614563, + "learning_rate": 3.5696308724832216e-05, + "loss": 0.9356, + "step": 441 + }, + { + "epoch": 0.6834170854271356, + "grad_norm": 0.5952706933021545, + "learning_rate": 3.5654362416107386e-05, + "loss": 0.9653, + "step": 442 + }, + { + "epoch": 0.6849632779281021, + "grad_norm": 0.59757000207901, + "learning_rate": 3.561241610738255e-05, + "loss": 0.9147, + "step": 443 + }, + { + "epoch": 0.6865094704290684, + "grad_norm": 0.6352919936180115, + "learning_rate": 3.557046979865772e-05, + "loss": 0.8931, + "step": 444 + }, + { + "epoch": 0.6880556629300348, + "grad_norm": 0.5814509987831116, + "learning_rate": 3.552852348993288e-05, + "loss": 0.9158, + "step": 445 + }, + { + "epoch": 0.6896018554310012, + "grad_norm": 0.7028055787086487, + "learning_rate": 3.548657718120805e-05, + "loss": 0.8891, + "step": 446 + }, + { + "epoch": 0.6911480479319675, + "grad_norm": 0.659831166267395, + "learning_rate": 3.544463087248322e-05, + "loss": 0.9629, + "step": 447 + }, + { + "epoch": 0.6926942404329339, + "grad_norm": 0.7297990918159485, + "learning_rate": 3.540268456375839e-05, + "loss": 0.9856, + "step": 448 + }, + { + "epoch": 0.6942404329339003, + "grad_norm": 0.8682158589363098, + "learning_rate": 3.536073825503356e-05, + "loss": 1.0006, + "step": 449 + }, + { + "epoch": 0.6957866254348667, + "grad_norm": 1.2786856889724731, + "learning_rate": 3.5318791946308726e-05, + "loss": 0.9672, + "step": 450 + }, + { + "epoch": 0.697332817935833, + "grad_norm": 0.5570630431175232, + "learning_rate": 3.5276845637583896e-05, + "loss": 0.6907, + "step": 451 + }, + { + "epoch": 0.6988790104367993, + "grad_norm": 0.4869362413883209, + "learning_rate": 3.523489932885906e-05, + "loss": 0.7276, + "step": 452 + }, + { + "epoch": 0.7004252029377658, + "grad_norm": 0.4926219582557678, + "learning_rate": 3.519295302013423e-05, + "loss": 0.7988, + "step": 453 + }, + { + "epoch": 0.7019713954387321, + "grad_norm": 0.49141383171081543, + "learning_rate": 3.51510067114094e-05, + "loss": 0.7625, + "step": 454 + }, + { + "epoch": 0.7035175879396985, + "grad_norm": 0.48099425435066223, + "learning_rate": 3.510906040268457e-05, + "loss": 0.7932, + "step": 455 + }, + { + "epoch": 0.7050637804406649, + "grad_norm": 0.48948779702186584, + "learning_rate": 3.506711409395974e-05, + "loss": 0.725, + "step": 456 + }, + { + "epoch": 0.7066099729416312, + "grad_norm": 0.5123636722564697, + "learning_rate": 3.50251677852349e-05, + "loss": 0.7704, + "step": 457 + }, + { + "epoch": 0.7081561654425976, + "grad_norm": 0.49425482749938965, + "learning_rate": 3.498322147651007e-05, + "loss": 0.8, + "step": 458 + }, + { + "epoch": 0.709702357943564, + "grad_norm": 0.5385423898696899, + "learning_rate": 3.4941275167785235e-05, + "loss": 0.8093, + "step": 459 + }, + { + "epoch": 0.7112485504445304, + "grad_norm": 0.5466816425323486, + "learning_rate": 3.4899328859060405e-05, + "loss": 0.7991, + "step": 460 + }, + { + "epoch": 0.7127947429454967, + "grad_norm": 0.5069779753684998, + "learning_rate": 3.485738255033557e-05, + "loss": 0.7989, + "step": 461 + }, + { + "epoch": 0.714340935446463, + "grad_norm": 0.5083027482032776, + "learning_rate": 3.481543624161074e-05, + "loss": 0.7891, + "step": 462 + }, + { + "epoch": 0.7158871279474295, + "grad_norm": 0.5162233710289001, + "learning_rate": 3.477348993288591e-05, + "loss": 0.7208, + "step": 463 + }, + { + "epoch": 0.7174333204483958, + "grad_norm": 0.5323002338409424, + "learning_rate": 3.473154362416108e-05, + "loss": 0.7471, + "step": 464 + }, + { + "epoch": 0.7189795129493622, + "grad_norm": 0.5144820809364319, + "learning_rate": 3.468959731543625e-05, + "loss": 0.8335, + "step": 465 + }, + { + "epoch": 0.7205257054503286, + "grad_norm": 0.5344799160957336, + "learning_rate": 3.464765100671141e-05, + "loss": 0.7575, + "step": 466 + }, + { + "epoch": 0.7220718979512949, + "grad_norm": 0.5163140296936035, + "learning_rate": 3.460570469798658e-05, + "loss": 0.7471, + "step": 467 + }, + { + "epoch": 0.7236180904522613, + "grad_norm": 0.537284791469574, + "learning_rate": 3.4563758389261744e-05, + "loss": 0.8013, + "step": 468 + }, + { + "epoch": 0.7251642829532277, + "grad_norm": 0.5194945335388184, + "learning_rate": 3.4521812080536914e-05, + "loss": 0.7855, + "step": 469 + }, + { + "epoch": 0.7267104754541941, + "grad_norm": 0.5371651649475098, + "learning_rate": 3.4479865771812084e-05, + "loss": 0.7345, + "step": 470 + }, + { + "epoch": 0.7282566679551604, + "grad_norm": 0.5294081568717957, + "learning_rate": 3.443791946308725e-05, + "loss": 0.854, + "step": 471 + }, + { + "epoch": 0.7298028604561267, + "grad_norm": 0.5522701144218445, + "learning_rate": 3.439597315436242e-05, + "loss": 0.8338, + "step": 472 + }, + { + "epoch": 0.7313490529570932, + "grad_norm": 0.5342947244644165, + "learning_rate": 3.435402684563759e-05, + "loss": 0.8429, + "step": 473 + }, + { + "epoch": 0.7328952454580595, + "grad_norm": 0.5579586029052734, + "learning_rate": 3.431208053691276e-05, + "loss": 0.77, + "step": 474 + }, + { + "epoch": 0.734441437959026, + "grad_norm": 0.5210466384887695, + "learning_rate": 3.427013422818792e-05, + "loss": 0.8538, + "step": 475 + }, + { + "epoch": 0.7359876304599923, + "grad_norm": 0.5218535661697388, + "learning_rate": 3.422818791946309e-05, + "loss": 0.8771, + "step": 476 + }, + { + "epoch": 0.7375338229609586, + "grad_norm": 0.5211417078971863, + "learning_rate": 3.418624161073825e-05, + "loss": 0.827, + "step": 477 + }, + { + "epoch": 0.739080015461925, + "grad_norm": 0.5523113012313843, + "learning_rate": 3.414429530201342e-05, + "loss": 0.8367, + "step": 478 + }, + { + "epoch": 0.7406262079628914, + "grad_norm": 0.5218014717102051, + "learning_rate": 3.410234899328859e-05, + "loss": 0.8319, + "step": 479 + }, + { + "epoch": 0.7421724004638578, + "grad_norm": 0.5514734387397766, + "learning_rate": 3.4060402684563756e-05, + "loss": 0.8703, + "step": 480 + }, + { + "epoch": 0.7437185929648241, + "grad_norm": 0.5442553162574768, + "learning_rate": 3.4018456375838926e-05, + "loss": 0.8094, + "step": 481 + }, + { + "epoch": 0.7452647854657904, + "grad_norm": 0.5472526550292969, + "learning_rate": 3.3976510067114096e-05, + "loss": 0.8419, + "step": 482 + }, + { + "epoch": 0.7468109779667569, + "grad_norm": 0.5330756306648254, + "learning_rate": 3.3934563758389266e-05, + "loss": 0.8497, + "step": 483 + }, + { + "epoch": 0.7483571704677232, + "grad_norm": 0.5805166363716125, + "learning_rate": 3.389261744966443e-05, + "loss": 0.8505, + "step": 484 + }, + { + "epoch": 0.7499033629686896, + "grad_norm": 0.5910758376121521, + "learning_rate": 3.38506711409396e-05, + "loss": 0.8154, + "step": 485 + }, + { + "epoch": 0.751449555469656, + "grad_norm": 0.579165518283844, + "learning_rate": 3.380872483221477e-05, + "loss": 0.8398, + "step": 486 + }, + { + "epoch": 0.7529957479706223, + "grad_norm": 0.5517799258232117, + "learning_rate": 3.376677852348993e-05, + "loss": 0.8426, + "step": 487 + }, + { + "epoch": 0.7545419404715887, + "grad_norm": 0.588138997554779, + "learning_rate": 3.37248322147651e-05, + "loss": 0.8658, + "step": 488 + }, + { + "epoch": 0.7560881329725551, + "grad_norm": 0.5962609648704529, + "learning_rate": 3.3682885906040266e-05, + "loss": 0.9157, + "step": 489 + }, + { + "epoch": 0.7576343254735215, + "grad_norm": 0.5821203589439392, + "learning_rate": 3.3640939597315436e-05, + "loss": 0.9158, + "step": 490 + }, + { + "epoch": 0.7591805179744878, + "grad_norm": 0.5653342008590698, + "learning_rate": 3.3598993288590605e-05, + "loss": 0.924, + "step": 491 + }, + { + "epoch": 0.7607267104754541, + "grad_norm": 0.6114529967308044, + "learning_rate": 3.3557046979865775e-05, + "loss": 0.9249, + "step": 492 + }, + { + "epoch": 0.7622729029764206, + "grad_norm": 0.6291983723640442, + "learning_rate": 3.3515100671140945e-05, + "loss": 0.8665, + "step": 493 + }, + { + "epoch": 0.7638190954773869, + "grad_norm": 0.6541888117790222, + "learning_rate": 3.347315436241611e-05, + "loss": 0.9678, + "step": 494 + }, + { + "epoch": 0.7653652879783533, + "grad_norm": 0.6252798438072205, + "learning_rate": 3.343120805369128e-05, + "loss": 0.8691, + "step": 495 + }, + { + "epoch": 0.7669114804793197, + "grad_norm": 0.6714550852775574, + "learning_rate": 3.338926174496644e-05, + "loss": 0.9581, + "step": 496 + }, + { + "epoch": 0.768457672980286, + "grad_norm": 0.7336750626564026, + "learning_rate": 3.334731543624161e-05, + "loss": 0.9024, + "step": 497 + }, + { + "epoch": 0.7700038654812524, + "grad_norm": 0.7459555864334106, + "learning_rate": 3.3305369127516775e-05, + "loss": 0.9153, + "step": 498 + }, + { + "epoch": 0.7715500579822188, + "grad_norm": 0.7960460186004639, + "learning_rate": 3.326342281879195e-05, + "loss": 0.9776, + "step": 499 + }, + { + "epoch": 0.7730962504831852, + "grad_norm": 1.188610315322876, + "learning_rate": 3.3221476510067115e-05, + "loss": 0.9963, + "step": 500 + }, + { + "epoch": 0.7746424429841515, + "grad_norm": 0.5373630523681641, + "learning_rate": 3.3179530201342285e-05, + "loss": 0.7497, + "step": 501 + }, + { + "epoch": 0.7761886354851179, + "grad_norm": 0.5297138094902039, + "learning_rate": 3.3137583892617455e-05, + "loss": 0.6842, + "step": 502 + }, + { + "epoch": 0.7777348279860843, + "grad_norm": 0.5116965770721436, + "learning_rate": 3.309563758389262e-05, + "loss": 0.7081, + "step": 503 + }, + { + "epoch": 0.7792810204870506, + "grad_norm": 0.523707389831543, + "learning_rate": 3.305369127516779e-05, + "loss": 0.7354, + "step": 504 + }, + { + "epoch": 0.780827212988017, + "grad_norm": 0.5127305388450623, + "learning_rate": 3.301174496644295e-05, + "loss": 0.7478, + "step": 505 + }, + { + "epoch": 0.7823734054889834, + "grad_norm": 0.5249508619308472, + "learning_rate": 3.296979865771812e-05, + "loss": 0.7198, + "step": 506 + }, + { + "epoch": 0.7839195979899497, + "grad_norm": 0.4998358190059662, + "learning_rate": 3.292785234899329e-05, + "loss": 0.7726, + "step": 507 + }, + { + "epoch": 0.7854657904909161, + "grad_norm": 0.48964107036590576, + "learning_rate": 3.288590604026846e-05, + "loss": 0.7643, + "step": 508 + }, + { + "epoch": 0.7870119829918825, + "grad_norm": 0.4985421299934387, + "learning_rate": 3.284395973154363e-05, + "loss": 0.7807, + "step": 509 + }, + { + "epoch": 0.7885581754928489, + "grad_norm": 0.5199535489082336, + "learning_rate": 3.2802013422818794e-05, + "loss": 0.7899, + "step": 510 + }, + { + "epoch": 0.7901043679938152, + "grad_norm": 0.5251364707946777, + "learning_rate": 3.2760067114093964e-05, + "loss": 0.7619, + "step": 511 + }, + { + "epoch": 0.7916505604947816, + "grad_norm": 0.5302333831787109, + "learning_rate": 3.271812080536913e-05, + "loss": 0.7561, + "step": 512 + }, + { + "epoch": 0.793196752995748, + "grad_norm": 0.544916033744812, + "learning_rate": 3.26761744966443e-05, + "loss": 0.7497, + "step": 513 + }, + { + "epoch": 0.7947429454967143, + "grad_norm": 0.5568458437919617, + "learning_rate": 3.263422818791946e-05, + "loss": 0.7872, + "step": 514 + }, + { + "epoch": 0.7962891379976808, + "grad_norm": 0.5136657357215881, + "learning_rate": 3.259228187919463e-05, + "loss": 0.8583, + "step": 515 + }, + { + "epoch": 0.7978353304986471, + "grad_norm": 0.5261164903640747, + "learning_rate": 3.25503355704698e-05, + "loss": 0.7806, + "step": 516 + }, + { + "epoch": 0.7993815229996134, + "grad_norm": 0.5282236933708191, + "learning_rate": 3.250838926174497e-05, + "loss": 0.7391, + "step": 517 + }, + { + "epoch": 0.8009277155005798, + "grad_norm": 0.5518252849578857, + "learning_rate": 3.246644295302014e-05, + "loss": 0.7981, + "step": 518 + }, + { + "epoch": 0.8024739080015462, + "grad_norm": 0.5146390199661255, + "learning_rate": 3.24244966442953e-05, + "loss": 0.7831, + "step": 519 + }, + { + "epoch": 0.8040201005025126, + "grad_norm": 0.5058282017707825, + "learning_rate": 3.238255033557047e-05, + "loss": 0.807, + "step": 520 + }, + { + "epoch": 0.8055662930034789, + "grad_norm": 0.5313502550125122, + "learning_rate": 3.2340604026845636e-05, + "loss": 0.8176, + "step": 521 + }, + { + "epoch": 0.8071124855044453, + "grad_norm": 0.5471131801605225, + "learning_rate": 3.2298657718120806e-05, + "loss": 0.81, + "step": 522 + }, + { + "epoch": 0.8086586780054117, + "grad_norm": 0.5153350830078125, + "learning_rate": 3.2256711409395976e-05, + "loss": 0.8729, + "step": 523 + }, + { + "epoch": 0.810204870506378, + "grad_norm": 0.5148741006851196, + "learning_rate": 3.221476510067114e-05, + "loss": 0.7776, + "step": 524 + }, + { + "epoch": 0.8117510630073445, + "grad_norm": 0.5151890516281128, + "learning_rate": 3.217281879194631e-05, + "loss": 0.8355, + "step": 525 + }, + { + "epoch": 0.8132972555083108, + "grad_norm": 0.5539215207099915, + "learning_rate": 3.213087248322148e-05, + "loss": 0.8074, + "step": 526 + }, + { + "epoch": 0.8148434480092771, + "grad_norm": 0.5416978597640991, + "learning_rate": 3.208892617449665e-05, + "loss": 0.8077, + "step": 527 + }, + { + "epoch": 0.8163896405102435, + "grad_norm": 0.5648258924484253, + "learning_rate": 3.204697986577181e-05, + "loss": 0.7928, + "step": 528 + }, + { + "epoch": 0.8179358330112099, + "grad_norm": 0.549947202205658, + "learning_rate": 3.200503355704698e-05, + "loss": 0.8371, + "step": 529 + }, + { + "epoch": 0.8194820255121763, + "grad_norm": 0.5815473198890686, + "learning_rate": 3.196308724832215e-05, + "loss": 0.8236, + "step": 530 + }, + { + "epoch": 0.8210282180131426, + "grad_norm": 0.5924739241600037, + "learning_rate": 3.1921140939597315e-05, + "loss": 0.7953, + "step": 531 + }, + { + "epoch": 0.822574410514109, + "grad_norm": 0.5691911578178406, + "learning_rate": 3.1879194630872485e-05, + "loss": 0.8312, + "step": 532 + }, + { + "epoch": 0.8241206030150754, + "grad_norm": 0.5568677186965942, + "learning_rate": 3.183724832214765e-05, + "loss": 0.839, + "step": 533 + }, + { + "epoch": 0.8256667955160417, + "grad_norm": 0.584894061088562, + "learning_rate": 3.1795302013422825e-05, + "loss": 0.9014, + "step": 534 + }, + { + "epoch": 0.8272129880170082, + "grad_norm": 0.5695962309837341, + "learning_rate": 3.175335570469799e-05, + "loss": 0.8177, + "step": 535 + }, + { + "epoch": 0.8287591805179745, + "grad_norm": 0.5662206411361694, + "learning_rate": 3.171140939597316e-05, + "loss": 0.8666, + "step": 536 + }, + { + "epoch": 0.8303053730189408, + "grad_norm": 0.6044044494628906, + "learning_rate": 3.166946308724832e-05, + "loss": 0.8757, + "step": 537 + }, + { + "epoch": 0.8318515655199072, + "grad_norm": 0.5584191083908081, + "learning_rate": 3.162751677852349e-05, + "loss": 0.8701, + "step": 538 + }, + { + "epoch": 0.8333977580208736, + "grad_norm": 0.6086747646331787, + "learning_rate": 3.158557046979866e-05, + "loss": 0.9369, + "step": 539 + }, + { + "epoch": 0.83494395052184, + "grad_norm": 0.5706461071968079, + "learning_rate": 3.1543624161073825e-05, + "loss": 0.9749, + "step": 540 + }, + { + "epoch": 0.8364901430228063, + "grad_norm": 0.5996837615966797, + "learning_rate": 3.1501677852348995e-05, + "loss": 0.881, + "step": 541 + }, + { + "epoch": 0.8380363355237727, + "grad_norm": 0.645283043384552, + "learning_rate": 3.145973154362416e-05, + "loss": 0.9201, + "step": 542 + }, + { + "epoch": 0.8395825280247391, + "grad_norm": 0.6247162222862244, + "learning_rate": 3.1417785234899334e-05, + "loss": 0.9159, + "step": 543 + }, + { + "epoch": 0.8411287205257054, + "grad_norm": 0.6042696833610535, + "learning_rate": 3.13758389261745e-05, + "loss": 0.9362, + "step": 544 + }, + { + "epoch": 0.8426749130266719, + "grad_norm": 0.6593182682991028, + "learning_rate": 3.133389261744967e-05, + "loss": 0.9722, + "step": 545 + }, + { + "epoch": 0.8442211055276382, + "grad_norm": 0.665708601474762, + "learning_rate": 3.129194630872484e-05, + "loss": 0.9028, + "step": 546 + }, + { + "epoch": 0.8457672980286045, + "grad_norm": 0.6491990089416504, + "learning_rate": 3.125e-05, + "loss": 0.9169, + "step": 547 + }, + { + "epoch": 0.847313490529571, + "grad_norm": 0.7431784868240356, + "learning_rate": 3.120805369127517e-05, + "loss": 0.9927, + "step": 548 + }, + { + "epoch": 0.8488596830305373, + "grad_norm": 0.8240005970001221, + "learning_rate": 3.1166107382550334e-05, + "loss": 1.0051, + "step": 549 + }, + { + "epoch": 0.8504058755315037, + "grad_norm": 1.1392122507095337, + "learning_rate": 3.1124161073825504e-05, + "loss": 0.9428, + "step": 550 + }, + { + "epoch": 0.85195206803247, + "grad_norm": 0.48860234022140503, + "learning_rate": 3.108221476510067e-05, + "loss": 0.7069, + "step": 551 + }, + { + "epoch": 0.8534982605334364, + "grad_norm": 0.4950990378856659, + "learning_rate": 3.1040268456375844e-05, + "loss": 0.7463, + "step": 552 + }, + { + "epoch": 0.8550444530344028, + "grad_norm": 0.48706600069999695, + "learning_rate": 3.099832214765101e-05, + "loss": 0.786, + "step": 553 + }, + { + "epoch": 0.8565906455353691, + "grad_norm": 0.5027382373809814, + "learning_rate": 3.095637583892618e-05, + "loss": 0.7882, + "step": 554 + }, + { + "epoch": 0.8581368380363356, + "grad_norm": 0.5104815363883972, + "learning_rate": 3.091442953020135e-05, + "loss": 0.7301, + "step": 555 + }, + { + "epoch": 0.8596830305373019, + "grad_norm": 0.49334749579429626, + "learning_rate": 3.087248322147651e-05, + "loss": 0.786, + "step": 556 + }, + { + "epoch": 0.8612292230382682, + "grad_norm": 0.5204259157180786, + "learning_rate": 3.083053691275168e-05, + "loss": 0.7809, + "step": 557 + }, + { + "epoch": 0.8627754155392346, + "grad_norm": 0.5358408093452454, + "learning_rate": 3.078859060402684e-05, + "loss": 0.7887, + "step": 558 + }, + { + "epoch": 0.864321608040201, + "grad_norm": 0.515990674495697, + "learning_rate": 3.074664429530201e-05, + "loss": 0.8452, + "step": 559 + }, + { + "epoch": 0.8658678005411674, + "grad_norm": 0.5604992508888245, + "learning_rate": 3.070469798657718e-05, + "loss": 0.7178, + "step": 560 + }, + { + "epoch": 0.8674139930421337, + "grad_norm": 0.5511056184768677, + "learning_rate": 3.066275167785235e-05, + "loss": 0.7577, + "step": 561 + }, + { + "epoch": 0.8689601855431001, + "grad_norm": 0.5570728778839111, + "learning_rate": 3.062080536912752e-05, + "loss": 0.7799, + "step": 562 + }, + { + "epoch": 0.8705063780440665, + "grad_norm": 0.535253643989563, + "learning_rate": 3.0578859060402686e-05, + "loss": 0.85, + "step": 563 + }, + { + "epoch": 0.8720525705450328, + "grad_norm": 0.5327886343002319, + "learning_rate": 3.0536912751677856e-05, + "loss": 0.8057, + "step": 564 + }, + { + "epoch": 0.8735987630459993, + "grad_norm": 0.5440667867660522, + "learning_rate": 3.0494966442953022e-05, + "loss": 0.7407, + "step": 565 + }, + { + "epoch": 0.8751449555469656, + "grad_norm": 0.5552430152893066, + "learning_rate": 3.045302013422819e-05, + "loss": 0.8022, + "step": 566 + }, + { + "epoch": 0.876691148047932, + "grad_norm": 0.523145854473114, + "learning_rate": 3.0411073825503356e-05, + "loss": 0.7694, + "step": 567 + }, + { + "epoch": 0.8782373405488983, + "grad_norm": 0.5461344122886658, + "learning_rate": 3.0369127516778522e-05, + "loss": 0.7849, + "step": 568 + }, + { + "epoch": 0.8797835330498647, + "grad_norm": 0.548468828201294, + "learning_rate": 3.0327181208053695e-05, + "loss": 0.8127, + "step": 569 + }, + { + "epoch": 0.8813297255508311, + "grad_norm": 0.5320367813110352, + "learning_rate": 3.0285234899328862e-05, + "loss": 0.8243, + "step": 570 + }, + { + "epoch": 0.8828759180517974, + "grad_norm": 0.5147125124931335, + "learning_rate": 3.024328859060403e-05, + "loss": 0.8784, + "step": 571 + }, + { + "epoch": 0.8844221105527639, + "grad_norm": 0.5353872776031494, + "learning_rate": 3.02013422818792e-05, + "loss": 0.7841, + "step": 572 + }, + { + "epoch": 0.8859683030537302, + "grad_norm": 0.5554389953613281, + "learning_rate": 3.0159395973154365e-05, + "loss": 0.8601, + "step": 573 + }, + { + "epoch": 0.8875144955546965, + "grad_norm": 0.5762984752655029, + "learning_rate": 3.011744966442953e-05, + "loss": 0.7359, + "step": 574 + }, + { + "epoch": 0.889060688055663, + "grad_norm": 0.5287367701530457, + "learning_rate": 3.0075503355704698e-05, + "loss": 0.8063, + "step": 575 + }, + { + "epoch": 0.8906068805566293, + "grad_norm": 0.5730248689651489, + "learning_rate": 3.0033557046979865e-05, + "loss": 0.735, + "step": 576 + }, + { + "epoch": 0.8921530730575957, + "grad_norm": 0.557384192943573, + "learning_rate": 2.999161073825503e-05, + "loss": 0.7962, + "step": 577 + }, + { + "epoch": 0.893699265558562, + "grad_norm": 0.5956771373748779, + "learning_rate": 2.9949664429530205e-05, + "loss": 0.8867, + "step": 578 + }, + { + "epoch": 0.8952454580595284, + "grad_norm": 0.5796382427215576, + "learning_rate": 2.990771812080537e-05, + "loss": 0.858, + "step": 579 + }, + { + "epoch": 0.8967916505604948, + "grad_norm": 0.5635401010513306, + "learning_rate": 2.986577181208054e-05, + "loss": 0.7987, + "step": 580 + }, + { + "epoch": 0.8983378430614611, + "grad_norm": 0.6214528679847717, + "learning_rate": 2.9823825503355708e-05, + "loss": 0.8397, + "step": 581 + }, + { + "epoch": 0.8998840355624276, + "grad_norm": 0.5945339202880859, + "learning_rate": 2.9781879194630874e-05, + "loss": 0.8428, + "step": 582 + }, + { + "epoch": 0.9014302280633939, + "grad_norm": 0.5704767107963562, + "learning_rate": 2.973993288590604e-05, + "loss": 0.8291, + "step": 583 + }, + { + "epoch": 0.9029764205643602, + "grad_norm": 0.5803176164627075, + "learning_rate": 2.9697986577181207e-05, + "loss": 0.8792, + "step": 584 + }, + { + "epoch": 0.9045226130653267, + "grad_norm": 0.6039038300514221, + "learning_rate": 2.9656040268456374e-05, + "loss": 0.7788, + "step": 585 + }, + { + "epoch": 0.906068805566293, + "grad_norm": 0.5550165176391602, + "learning_rate": 2.9614093959731544e-05, + "loss": 0.8658, + "step": 586 + }, + { + "epoch": 0.9076149980672594, + "grad_norm": 0.6096512079238892, + "learning_rate": 2.9572147651006714e-05, + "loss": 0.9237, + "step": 587 + }, + { + "epoch": 0.9091611905682258, + "grad_norm": 0.5784430503845215, + "learning_rate": 2.9530201342281884e-05, + "loss": 0.8538, + "step": 588 + }, + { + "epoch": 0.9107073830691921, + "grad_norm": 0.5854966044425964, + "learning_rate": 2.948825503355705e-05, + "loss": 0.9009, + "step": 589 + }, + { + "epoch": 0.9122535755701585, + "grad_norm": 0.5904499292373657, + "learning_rate": 2.9446308724832217e-05, + "loss": 0.9064, + "step": 590 + }, + { + "epoch": 0.9137997680711248, + "grad_norm": 0.6125240325927734, + "learning_rate": 2.9404362416107384e-05, + "loss": 0.8539, + "step": 591 + }, + { + "epoch": 0.9153459605720913, + "grad_norm": 0.6209454536437988, + "learning_rate": 2.936241610738255e-05, + "loss": 0.8864, + "step": 592 + }, + { + "epoch": 0.9168921530730576, + "grad_norm": 0.634355902671814, + "learning_rate": 2.9320469798657717e-05, + "loss": 0.8999, + "step": 593 + }, + { + "epoch": 0.9184383455740239, + "grad_norm": 0.644378125667572, + "learning_rate": 2.9278523489932887e-05, + "loss": 0.9351, + "step": 594 + }, + { + "epoch": 0.9199845380749904, + "grad_norm": 0.638783872127533, + "learning_rate": 2.9236577181208053e-05, + "loss": 0.8522, + "step": 595 + }, + { + "epoch": 0.9215307305759567, + "grad_norm": 0.6960675716400146, + "learning_rate": 2.9194630872483227e-05, + "loss": 0.8183, + "step": 596 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 0.7445054054260254, + "learning_rate": 2.9152684563758393e-05, + "loss": 0.9749, + "step": 597 + }, + { + "epoch": 0.9246231155778895, + "grad_norm": 0.7296366095542908, + "learning_rate": 2.911073825503356e-05, + "loss": 0.9535, + "step": 598 + }, + { + "epoch": 0.9261693080788558, + "grad_norm": 0.8242074251174927, + "learning_rate": 2.9068791946308726e-05, + "loss": 0.9661, + "step": 599 + }, + { + "epoch": 0.9277155005798222, + "grad_norm": 0.983094334602356, + "learning_rate": 2.9026845637583893e-05, + "loss": 0.8454, + "step": 600 + }, + { + "epoch": 0.9292616930807885, + "grad_norm": 0.4916326403617859, + "learning_rate": 2.898489932885906e-05, + "loss": 0.7178, + "step": 601 + }, + { + "epoch": 0.930807885581755, + "grad_norm": 0.5236021876335144, + "learning_rate": 2.894295302013423e-05, + "loss": 0.7586, + "step": 602 + }, + { + "epoch": 0.9323540780827213, + "grad_norm": 0.5226188898086548, + "learning_rate": 2.8901006711409396e-05, + "loss": 0.7287, + "step": 603 + }, + { + "epoch": 0.9339002705836876, + "grad_norm": 0.5189059376716614, + "learning_rate": 2.885906040268457e-05, + "loss": 0.7176, + "step": 604 + }, + { + "epoch": 0.9354464630846541, + "grad_norm": 0.5282127857208252, + "learning_rate": 2.8817114093959736e-05, + "loss": 0.7487, + "step": 605 + }, + { + "epoch": 0.9369926555856204, + "grad_norm": 0.5156176090240479, + "learning_rate": 2.8775167785234902e-05, + "loss": 0.7929, + "step": 606 + }, + { + "epoch": 0.9385388480865868, + "grad_norm": 0.5219593644142151, + "learning_rate": 2.873322147651007e-05, + "loss": 0.7992, + "step": 607 + }, + { + "epoch": 0.9400850405875532, + "grad_norm": 0.5400338768959045, + "learning_rate": 2.8691275167785235e-05, + "loss": 0.7738, + "step": 608 + }, + { + "epoch": 0.9416312330885195, + "grad_norm": 0.5067276954650879, + "learning_rate": 2.8649328859060402e-05, + "loss": 0.8046, + "step": 609 + }, + { + "epoch": 0.9431774255894859, + "grad_norm": 0.5286040902137756, + "learning_rate": 2.8607382550335572e-05, + "loss": 0.6818, + "step": 610 + }, + { + "epoch": 0.9447236180904522, + "grad_norm": 0.5325278043746948, + "learning_rate": 2.856543624161074e-05, + "loss": 0.8112, + "step": 611 + }, + { + "epoch": 0.9462698105914187, + "grad_norm": 0.5434727668762207, + "learning_rate": 2.8523489932885905e-05, + "loss": 0.8245, + "step": 612 + }, + { + "epoch": 0.947816003092385, + "grad_norm": 0.5526994466781616, + "learning_rate": 2.848154362416108e-05, + "loss": 0.7493, + "step": 613 + }, + { + "epoch": 0.9493621955933513, + "grad_norm": 0.5655114054679871, + "learning_rate": 2.8439597315436245e-05, + "loss": 0.7514, + "step": 614 + }, + { + "epoch": 0.9509083880943178, + "grad_norm": 0.5636076331138611, + "learning_rate": 2.839765100671141e-05, + "loss": 0.7992, + "step": 615 + }, + { + "epoch": 0.9524545805952841, + "grad_norm": 0.5688204169273376, + "learning_rate": 2.8355704697986578e-05, + "loss": 0.7747, + "step": 616 + }, + { + "epoch": 0.9540007730962505, + "grad_norm": 0.5534058809280396, + "learning_rate": 2.8313758389261748e-05, + "loss": 0.7511, + "step": 617 + }, + { + "epoch": 0.9555469655972169, + "grad_norm": 0.5112160444259644, + "learning_rate": 2.8271812080536915e-05, + "loss": 0.7875, + "step": 618 + }, + { + "epoch": 0.9570931580981832, + "grad_norm": 0.553830623626709, + "learning_rate": 2.822986577181208e-05, + "loss": 0.8652, + "step": 619 + }, + { + "epoch": 0.9586393505991496, + "grad_norm": 0.5614729523658752, + "learning_rate": 2.8187919463087248e-05, + "loss": 0.8693, + "step": 620 + }, + { + "epoch": 0.960185543100116, + "grad_norm": 0.5519264340400696, + "learning_rate": 2.8145973154362414e-05, + "loss": 0.8102, + "step": 621 + }, + { + "epoch": 0.9617317356010824, + "grad_norm": 0.5544281601905823, + "learning_rate": 2.8104026845637588e-05, + "loss": 0.8263, + "step": 622 + }, + { + "epoch": 0.9632779281020487, + "grad_norm": 0.5747584700584412, + "learning_rate": 2.8062080536912754e-05, + "loss": 0.7718, + "step": 623 + }, + { + "epoch": 0.964824120603015, + "grad_norm": 0.5676540732383728, + "learning_rate": 2.802013422818792e-05, + "loss": 0.7616, + "step": 624 + }, + { + "epoch": 0.9663703131039815, + "grad_norm": 0.5307291150093079, + "learning_rate": 2.797818791946309e-05, + "loss": 0.8401, + "step": 625 + }, + { + "epoch": 0.9679165056049478, + "grad_norm": 0.5527417063713074, + "learning_rate": 2.7936241610738257e-05, + "loss": 0.8818, + "step": 626 + }, + { + "epoch": 0.9694626981059142, + "grad_norm": 0.545058012008667, + "learning_rate": 2.7894295302013424e-05, + "loss": 0.8606, + "step": 627 + }, + { + "epoch": 0.9710088906068806, + "grad_norm": 0.5928349494934082, + "learning_rate": 2.785234899328859e-05, + "loss": 0.7728, + "step": 628 + }, + { + "epoch": 0.9725550831078469, + "grad_norm": 0.5348992943763733, + "learning_rate": 2.7810402684563757e-05, + "loss": 0.8101, + "step": 629 + }, + { + "epoch": 0.9741012756088133, + "grad_norm": 0.583490788936615, + "learning_rate": 2.7768456375838923e-05, + "loss": 0.8648, + "step": 630 + }, + { + "epoch": 0.9756474681097796, + "grad_norm": 0.6235148906707764, + "learning_rate": 2.7726510067114097e-05, + "loss": 0.8425, + "step": 631 + }, + { + "epoch": 0.9771936606107461, + "grad_norm": 0.5755742788314819, + "learning_rate": 2.7684563758389263e-05, + "loss": 0.8765, + "step": 632 + }, + { + "epoch": 0.9787398531117124, + "grad_norm": 0.5636020302772522, + "learning_rate": 2.7642617449664433e-05, + "loss": 0.8114, + "step": 633 + }, + { + "epoch": 0.9802860456126787, + "grad_norm": 0.5665507316589355, + "learning_rate": 2.76006711409396e-05, + "loss": 0.9008, + "step": 634 + }, + { + "epoch": 0.9818322381136452, + "grad_norm": 0.5966489911079407, + "learning_rate": 2.7558724832214766e-05, + "loss": 0.846, + "step": 635 + }, + { + "epoch": 0.9833784306146115, + "grad_norm": 0.5960109829902649, + "learning_rate": 2.7516778523489933e-05, + "loss": 0.8962, + "step": 636 + }, + { + "epoch": 0.9849246231155779, + "grad_norm": 0.5626753568649292, + "learning_rate": 2.74748322147651e-05, + "loss": 0.9062, + "step": 637 + }, + { + "epoch": 0.9864708156165443, + "grad_norm": 0.6764492392539978, + "learning_rate": 2.7432885906040266e-05, + "loss": 0.8961, + "step": 638 + }, + { + "epoch": 0.9880170081175106, + "grad_norm": 0.5922832489013672, + "learning_rate": 2.7390939597315436e-05, + "loss": 0.8525, + "step": 639 + }, + { + "epoch": 0.989563200618477, + "grad_norm": 0.6102508902549744, + "learning_rate": 2.7348993288590606e-05, + "loss": 0.8887, + "step": 640 + }, + { + "epoch": 0.9911093931194433, + "grad_norm": 0.6205296516418457, + "learning_rate": 2.7307046979865776e-05, + "loss": 0.9007, + "step": 641 + }, + { + "epoch": 0.9926555856204098, + "grad_norm": 0.6284985542297363, + "learning_rate": 2.7265100671140943e-05, + "loss": 0.9228, + "step": 642 + }, + { + "epoch": 0.9942017781213761, + "grad_norm": 0.6276938915252686, + "learning_rate": 2.722315436241611e-05, + "loss": 0.9066, + "step": 643 + }, + { + "epoch": 0.9957479706223424, + "grad_norm": 0.6849061250686646, + "learning_rate": 2.7181208053691276e-05, + "loss": 0.9204, + "step": 644 + }, + { + "epoch": 0.9972941631233089, + "grad_norm": 0.7061152458190918, + "learning_rate": 2.7139261744966442e-05, + "loss": 0.9711, + "step": 645 + }, + { + "epoch": 0.9988403556242752, + "grad_norm": 0.7999619841575623, + "learning_rate": 2.709731543624161e-05, + "loss": 0.9722, + "step": 646 + }, + { + "epoch": 1.0007730962504833, + "grad_norm": 1.3448657989501953, + "learning_rate": 2.705536912751678e-05, + "loss": 1.2285, + "step": 647 + }, + { + "epoch": 1.0023192887514496, + "grad_norm": 0.4756757318973541, + "learning_rate": 2.701342281879195e-05, + "loss": 0.7208, + "step": 648 + }, + { + "epoch": 1.003865481252416, + "grad_norm": 0.495257169008255, + "learning_rate": 2.697147651006712e-05, + "loss": 0.7645, + "step": 649 + }, + { + "epoch": 1.0054116737533823, + "grad_norm": 0.5022267699241638, + "learning_rate": 2.6929530201342285e-05, + "loss": 0.7099, + "step": 650 + }, + { + "epoch": 1.0069578662543486, + "grad_norm": 0.5082698464393616, + "learning_rate": 2.6887583892617452e-05, + "loss": 0.7451, + "step": 651 + }, + { + "epoch": 1.0085040587553151, + "grad_norm": 0.5273095369338989, + "learning_rate": 2.6845637583892618e-05, + "loss": 0.7665, + "step": 652 + }, + { + "epoch": 1.0100502512562815, + "grad_norm": 0.5531541109085083, + "learning_rate": 2.6803691275167785e-05, + "loss": 0.7041, + "step": 653 + }, + { + "epoch": 1.0115964437572478, + "grad_norm": 0.5290402173995972, + "learning_rate": 2.6761744966442955e-05, + "loss": 0.812, + "step": 654 + }, + { + "epoch": 1.013142636258214, + "grad_norm": 0.556932270526886, + "learning_rate": 2.671979865771812e-05, + "loss": 0.7414, + "step": 655 + }, + { + "epoch": 1.0146888287591804, + "grad_norm": 0.501987874507904, + "learning_rate": 2.6677852348993288e-05, + "loss": 0.7446, + "step": 656 + }, + { + "epoch": 1.016235021260147, + "grad_norm": 0.5225424766540527, + "learning_rate": 2.663590604026846e-05, + "loss": 0.8439, + "step": 657 + }, + { + "epoch": 1.0177812137611133, + "grad_norm": 0.5627469420433044, + "learning_rate": 2.6593959731543628e-05, + "loss": 0.6966, + "step": 658 + }, + { + "epoch": 1.0193274062620796, + "grad_norm": 0.6055929660797119, + "learning_rate": 2.6552013422818794e-05, + "loss": 0.7859, + "step": 659 + }, + { + "epoch": 1.020873598763046, + "grad_norm": 0.6422880291938782, + "learning_rate": 2.651006711409396e-05, + "loss": 0.7733, + "step": 660 + }, + { + "epoch": 1.0224197912640123, + "grad_norm": 0.5887860059738159, + "learning_rate": 2.6468120805369128e-05, + "loss": 0.7041, + "step": 661 + }, + { + "epoch": 1.0239659837649788, + "grad_norm": 0.5853235125541687, + "learning_rate": 2.6426174496644297e-05, + "loss": 0.7268, + "step": 662 + }, + { + "epoch": 1.0255121762659452, + "grad_norm": 0.5295597910881042, + "learning_rate": 2.6384228187919464e-05, + "loss": 0.7482, + "step": 663 + }, + { + "epoch": 1.0270583687669115, + "grad_norm": 0.5481401085853577, + "learning_rate": 2.634228187919463e-05, + "loss": 0.7548, + "step": 664 + }, + { + "epoch": 1.0286045612678778, + "grad_norm": 0.538827121257782, + "learning_rate": 2.6300335570469797e-05, + "loss": 0.764, + "step": 665 + }, + { + "epoch": 1.0301507537688441, + "grad_norm": 0.577368974685669, + "learning_rate": 2.625838926174497e-05, + "loss": 0.6835, + "step": 666 + }, + { + "epoch": 1.0316969462698107, + "grad_norm": 0.5659049153327942, + "learning_rate": 2.6216442953020137e-05, + "loss": 0.7264, + "step": 667 + }, + { + "epoch": 1.033243138770777, + "grad_norm": 0.5179746150970459, + "learning_rate": 2.6174496644295304e-05, + "loss": 0.7911, + "step": 668 + }, + { + "epoch": 1.0347893312717433, + "grad_norm": 0.5719809532165527, + "learning_rate": 2.613255033557047e-05, + "loss": 0.7479, + "step": 669 + }, + { + "epoch": 1.0363355237727097, + "grad_norm": 0.5943763852119446, + "learning_rate": 2.609060402684564e-05, + "loss": 0.7802, + "step": 670 + }, + { + "epoch": 1.037881716273676, + "grad_norm": 0.5392094850540161, + "learning_rate": 2.6048657718120807e-05, + "loss": 0.7625, + "step": 671 + }, + { + "epoch": 1.0394279087746425, + "grad_norm": 0.5679749250411987, + "learning_rate": 2.6006711409395973e-05, + "loss": 0.7911, + "step": 672 + }, + { + "epoch": 1.0409741012756089, + "grad_norm": 0.5740141272544861, + "learning_rate": 2.596476510067114e-05, + "loss": 0.7477, + "step": 673 + }, + { + "epoch": 1.0425202937765752, + "grad_norm": 0.607397198677063, + "learning_rate": 2.5922818791946306e-05, + "loss": 0.852, + "step": 674 + }, + { + "epoch": 1.0440664862775415, + "grad_norm": 0.5706917643547058, + "learning_rate": 2.588087248322148e-05, + "loss": 0.6778, + "step": 675 + }, + { + "epoch": 1.0456126787785078, + "grad_norm": 0.5882996320724487, + "learning_rate": 2.5838926174496646e-05, + "loss": 0.7071, + "step": 676 + }, + { + "epoch": 1.0471588712794744, + "grad_norm": 0.5978296995162964, + "learning_rate": 2.5796979865771813e-05, + "loss": 0.7593, + "step": 677 + }, + { + "epoch": 1.0487050637804407, + "grad_norm": 0.6237056255340576, + "learning_rate": 2.5755033557046983e-05, + "loss": 0.8099, + "step": 678 + }, + { + "epoch": 1.050251256281407, + "grad_norm": 0.6156934499740601, + "learning_rate": 2.571308724832215e-05, + "loss": 0.8256, + "step": 679 + }, + { + "epoch": 1.0517974487823734, + "grad_norm": 0.6217848062515259, + "learning_rate": 2.5671140939597316e-05, + "loss": 0.8445, + "step": 680 + }, + { + "epoch": 1.0533436412833397, + "grad_norm": 0.6550363898277283, + "learning_rate": 2.5629194630872482e-05, + "loss": 0.7931, + "step": 681 + }, + { + "epoch": 1.0548898337843062, + "grad_norm": 0.6072224378585815, + "learning_rate": 2.558724832214765e-05, + "loss": 0.7985, + "step": 682 + }, + { + "epoch": 1.0564360262852726, + "grad_norm": 0.6478685140609741, + "learning_rate": 2.5545302013422822e-05, + "loss": 0.8055, + "step": 683 + }, + { + "epoch": 1.0579822187862389, + "grad_norm": 0.6033689975738525, + "learning_rate": 2.550335570469799e-05, + "loss": 0.9145, + "step": 684 + }, + { + "epoch": 1.0595284112872052, + "grad_norm": 0.5850486755371094, + "learning_rate": 2.5461409395973155e-05, + "loss": 0.8129, + "step": 685 + }, + { + "epoch": 1.0610746037881715, + "grad_norm": 0.6233928799629211, + "learning_rate": 2.5419463087248325e-05, + "loss": 0.9122, + "step": 686 + }, + { + "epoch": 1.062620796289138, + "grad_norm": 0.6058603525161743, + "learning_rate": 2.5377516778523492e-05, + "loss": 0.7968, + "step": 687 + }, + { + "epoch": 1.0641669887901044, + "grad_norm": 0.6053382754325867, + "learning_rate": 2.533557046979866e-05, + "loss": 0.8349, + "step": 688 + }, + { + "epoch": 1.0657131812910707, + "grad_norm": 0.6354022026062012, + "learning_rate": 2.5293624161073825e-05, + "loss": 0.8879, + "step": 689 + }, + { + "epoch": 1.067259373792037, + "grad_norm": 0.6736825108528137, + "learning_rate": 2.525167785234899e-05, + "loss": 0.8317, + "step": 690 + }, + { + "epoch": 1.0688055662930034, + "grad_norm": 0.6729496717453003, + "learning_rate": 2.5209731543624158e-05, + "loss": 0.8247, + "step": 691 + }, + { + "epoch": 1.07035175879397, + "grad_norm": 0.7000686526298523, + "learning_rate": 2.516778523489933e-05, + "loss": 0.9076, + "step": 692 + }, + { + "epoch": 1.0718979512949363, + "grad_norm": 0.7471379041671753, + "learning_rate": 2.5125838926174498e-05, + "loss": 0.861, + "step": 693 + }, + { + "epoch": 1.0734441437959026, + "grad_norm": 0.7505892515182495, + "learning_rate": 2.5083892617449668e-05, + "loss": 0.9747, + "step": 694 + }, + { + "epoch": 1.074990336296869, + "grad_norm": 0.7872920632362366, + "learning_rate": 2.5041946308724835e-05, + "loss": 0.8623, + "step": 695 + }, + { + "epoch": 1.0765365287978352, + "grad_norm": 0.97450190782547, + "learning_rate": 2.5e-05, + "loss": 0.8574, + "step": 696 + }, + { + "epoch": 1.0780827212988018, + "grad_norm": 0.6834471225738525, + "learning_rate": 2.4958053691275168e-05, + "loss": 0.7284, + "step": 697 + }, + { + "epoch": 1.079628913799768, + "grad_norm": 0.5371273756027222, + "learning_rate": 2.4916107382550334e-05, + "loss": 0.6604, + "step": 698 + }, + { + "epoch": 1.0811751063007344, + "grad_norm": 0.5436398983001709, + "learning_rate": 2.4874161073825504e-05, + "loss": 0.7106, + "step": 699 + }, + { + "epoch": 1.0827212988017008, + "grad_norm": 0.5626257061958313, + "learning_rate": 2.4832214765100674e-05, + "loss": 0.7199, + "step": 700 + }, + { + "epoch": 1.084267491302667, + "grad_norm": 0.5859701037406921, + "learning_rate": 2.479026845637584e-05, + "loss": 0.727, + "step": 701 + }, + { + "epoch": 1.0858136838036336, + "grad_norm": 0.5619367957115173, + "learning_rate": 2.4748322147651007e-05, + "loss": 0.7721, + "step": 702 + }, + { + "epoch": 1.0873598763046, + "grad_norm": 0.5503877997398376, + "learning_rate": 2.4706375838926177e-05, + "loss": 0.6971, + "step": 703 + }, + { + "epoch": 1.0889060688055663, + "grad_norm": 0.5587684512138367, + "learning_rate": 2.4664429530201344e-05, + "loss": 0.7868, + "step": 704 + }, + { + "epoch": 1.0904522613065326, + "grad_norm": 0.5771764516830444, + "learning_rate": 2.462248322147651e-05, + "loss": 0.7831, + "step": 705 + }, + { + "epoch": 1.091998453807499, + "grad_norm": 0.5702334046363831, + "learning_rate": 2.4580536912751677e-05, + "loss": 0.6751, + "step": 706 + }, + { + "epoch": 1.0935446463084655, + "grad_norm": 0.5503116250038147, + "learning_rate": 2.4538590604026847e-05, + "loss": 0.7732, + "step": 707 + }, + { + "epoch": 1.0950908388094318, + "grad_norm": 0.5595600008964539, + "learning_rate": 2.4496644295302017e-05, + "loss": 0.7168, + "step": 708 + }, + { + "epoch": 1.0966370313103981, + "grad_norm": 0.5581438541412354, + "learning_rate": 2.4454697986577183e-05, + "loss": 0.7462, + "step": 709 + }, + { + "epoch": 1.0981832238113645, + "grad_norm": 0.5454738736152649, + "learning_rate": 2.441275167785235e-05, + "loss": 0.8809, + "step": 710 + }, + { + "epoch": 1.0997294163123308, + "grad_norm": 0.5763128399848938, + "learning_rate": 2.4370805369127517e-05, + "loss": 0.6942, + "step": 711 + }, + { + "epoch": 1.1012756088132973, + "grad_norm": 0.5813524127006531, + "learning_rate": 2.4328859060402687e-05, + "loss": 0.6836, + "step": 712 + }, + { + "epoch": 1.1028218013142637, + "grad_norm": 0.5554409623146057, + "learning_rate": 2.4286912751677853e-05, + "loss": 0.7616, + "step": 713 + }, + { + "epoch": 1.10436799381523, + "grad_norm": 0.5576358437538147, + "learning_rate": 2.424496644295302e-05, + "loss": 0.7835, + "step": 714 + }, + { + "epoch": 1.1059141863161963, + "grad_norm": 0.5760726928710938, + "learning_rate": 2.420302013422819e-05, + "loss": 0.7866, + "step": 715 + }, + { + "epoch": 1.1074603788171626, + "grad_norm": 0.5910109877586365, + "learning_rate": 2.416107382550336e-05, + "loss": 0.7893, + "step": 716 + }, + { + "epoch": 1.1090065713181292, + "grad_norm": 0.6322896480560303, + "learning_rate": 2.4119127516778526e-05, + "loss": 0.7235, + "step": 717 + }, + { + "epoch": 1.1105527638190955, + "grad_norm": 0.5939295887947083, + "learning_rate": 2.4077181208053693e-05, + "loss": 0.7816, + "step": 718 + }, + { + "epoch": 1.1120989563200618, + "grad_norm": 0.5953226089477539, + "learning_rate": 2.403523489932886e-05, + "loss": 0.7444, + "step": 719 + }, + { + "epoch": 1.1136451488210282, + "grad_norm": 0.60584956407547, + "learning_rate": 2.3993288590604026e-05, + "loss": 0.7525, + "step": 720 + }, + { + "epoch": 1.1151913413219945, + "grad_norm": 0.6163296699523926, + "learning_rate": 2.3951342281879196e-05, + "loss": 0.772, + "step": 721 + }, + { + "epoch": 1.116737533822961, + "grad_norm": 0.5978072881698608, + "learning_rate": 2.3909395973154362e-05, + "loss": 0.7421, + "step": 722 + }, + { + "epoch": 1.1182837263239274, + "grad_norm": 0.5980417132377625, + "learning_rate": 2.3867449664429532e-05, + "loss": 0.7942, + "step": 723 + }, + { + "epoch": 1.1198299188248937, + "grad_norm": 0.6046664118766785, + "learning_rate": 2.38255033557047e-05, + "loss": 0.8131, + "step": 724 + }, + { + "epoch": 1.12137611132586, + "grad_norm": 0.5973670482635498, + "learning_rate": 2.378355704697987e-05, + "loss": 0.7857, + "step": 725 + }, + { + "epoch": 1.1229223038268263, + "grad_norm": 0.6115438342094421, + "learning_rate": 2.3741610738255035e-05, + "loss": 0.7942, + "step": 726 + }, + { + "epoch": 1.1244684963277929, + "grad_norm": 0.5946105718612671, + "learning_rate": 2.3699664429530202e-05, + "loss": 0.8088, + "step": 727 + }, + { + "epoch": 1.1260146888287592, + "grad_norm": 0.6095959544181824, + "learning_rate": 2.365771812080537e-05, + "loss": 0.8034, + "step": 728 + }, + { + "epoch": 1.1275608813297255, + "grad_norm": 0.6506893634796143, + "learning_rate": 2.361577181208054e-05, + "loss": 0.8469, + "step": 729 + }, + { + "epoch": 1.1291070738306919, + "grad_norm": 0.6179336309432983, + "learning_rate": 2.3573825503355705e-05, + "loss": 0.7918, + "step": 730 + }, + { + "epoch": 1.1306532663316582, + "grad_norm": 0.6253457069396973, + "learning_rate": 2.3531879194630875e-05, + "loss": 0.845, + "step": 731 + }, + { + "epoch": 1.1321994588326247, + "grad_norm": 0.6577156186103821, + "learning_rate": 2.348993288590604e-05, + "loss": 0.7475, + "step": 732 + }, + { + "epoch": 1.133745651333591, + "grad_norm": 0.6993891596794128, + "learning_rate": 2.3447986577181208e-05, + "loss": 0.7988, + "step": 733 + }, + { + "epoch": 1.1352918438345574, + "grad_norm": 0.6977733373641968, + "learning_rate": 2.3406040268456378e-05, + "loss": 0.7899, + "step": 734 + }, + { + "epoch": 1.1368380363355237, + "grad_norm": 0.6664114594459534, + "learning_rate": 2.3364093959731545e-05, + "loss": 0.8609, + "step": 735 + }, + { + "epoch": 1.13838422883649, + "grad_norm": 0.6565979719161987, + "learning_rate": 2.332214765100671e-05, + "loss": 0.8592, + "step": 736 + }, + { + "epoch": 1.1399304213374566, + "grad_norm": 0.6794628500938416, + "learning_rate": 2.3280201342281878e-05, + "loss": 0.8709, + "step": 737 + }, + { + "epoch": 1.141476613838423, + "grad_norm": 0.6833226084709167, + "learning_rate": 2.3238255033557048e-05, + "loss": 0.8121, + "step": 738 + }, + { + "epoch": 1.1430228063393892, + "grad_norm": 0.6899168491363525, + "learning_rate": 2.3196308724832218e-05, + "loss": 0.8362, + "step": 739 + }, + { + "epoch": 1.1445689988403556, + "grad_norm": 0.7028947472572327, + "learning_rate": 2.3154362416107384e-05, + "loss": 0.7995, + "step": 740 + }, + { + "epoch": 1.146115191341322, + "grad_norm": 0.7305999994277954, + "learning_rate": 2.311241610738255e-05, + "loss": 0.832, + "step": 741 + }, + { + "epoch": 1.1476613838422884, + "grad_norm": 0.7404617667198181, + "learning_rate": 2.3070469798657717e-05, + "loss": 0.9275, + "step": 742 + }, + { + "epoch": 1.1492075763432548, + "grad_norm": 0.7118339538574219, + "learning_rate": 2.3028523489932887e-05, + "loss": 0.8852, + "step": 743 + }, + { + "epoch": 1.150753768844221, + "grad_norm": 0.8329970240592957, + "learning_rate": 2.2986577181208054e-05, + "loss": 0.8482, + "step": 744 + }, + { + "epoch": 1.1522999613451874, + "grad_norm": 0.8370991945266724, + "learning_rate": 2.2944630872483224e-05, + "loss": 1.0003, + "step": 745 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 1.0701762437820435, + "learning_rate": 2.290268456375839e-05, + "loss": 1.0102, + "step": 746 + }, + { + "epoch": 1.1553923463471203, + "grad_norm": 0.7740350961685181, + "learning_rate": 2.286073825503356e-05, + "loss": 0.617, + "step": 747 + }, + { + "epoch": 1.1569385388480866, + "grad_norm": 0.5996577739715576, + "learning_rate": 2.2818791946308727e-05, + "loss": 0.7698, + "step": 748 + }, + { + "epoch": 1.158484731349053, + "grad_norm": 0.597896933555603, + "learning_rate": 2.2776845637583893e-05, + "loss": 0.7018, + "step": 749 + }, + { + "epoch": 1.1600309238500193, + "grad_norm": 0.5840704441070557, + "learning_rate": 2.273489932885906e-05, + "loss": 0.7634, + "step": 750 + }, + { + "epoch": 1.1615771163509856, + "grad_norm": 0.5987460613250732, + "learning_rate": 2.269295302013423e-05, + "loss": 0.7973, + "step": 751 + }, + { + "epoch": 1.1631233088519521, + "grad_norm": 0.5817953944206238, + "learning_rate": 2.2651006711409396e-05, + "loss": 0.7703, + "step": 752 + }, + { + "epoch": 1.1646695013529185, + "grad_norm": 0.5792746543884277, + "learning_rate": 2.2609060402684566e-05, + "loss": 0.7425, + "step": 753 + }, + { + "epoch": 1.1662156938538848, + "grad_norm": 0.5840611457824707, + "learning_rate": 2.2567114093959733e-05, + "loss": 0.715, + "step": 754 + }, + { + "epoch": 1.1677618863548511, + "grad_norm": 0.534000039100647, + "learning_rate": 2.25251677852349e-05, + "loss": 0.7764, + "step": 755 + }, + { + "epoch": 1.1693080788558174, + "grad_norm": 0.6020839214324951, + "learning_rate": 2.248322147651007e-05, + "loss": 0.7502, + "step": 756 + }, + { + "epoch": 1.170854271356784, + "grad_norm": 0.5764395594596863, + "learning_rate": 2.2441275167785236e-05, + "loss": 0.7298, + "step": 757 + }, + { + "epoch": 1.1724004638577503, + "grad_norm": 0.5840582251548767, + "learning_rate": 2.2399328859060403e-05, + "loss": 0.7836, + "step": 758 + }, + { + "epoch": 1.1739466563587166, + "grad_norm": 0.5759351253509521, + "learning_rate": 2.235738255033557e-05, + "loss": 0.7646, + "step": 759 + }, + { + "epoch": 1.175492848859683, + "grad_norm": 0.5777841806411743, + "learning_rate": 2.231543624161074e-05, + "loss": 0.8351, + "step": 760 + }, + { + "epoch": 1.1770390413606493, + "grad_norm": 0.6419858932495117, + "learning_rate": 2.227348993288591e-05, + "loss": 0.6833, + "step": 761 + }, + { + "epoch": 1.1785852338616158, + "grad_norm": 0.5851649045944214, + "learning_rate": 2.2231543624161076e-05, + "loss": 0.7559, + "step": 762 + }, + { + "epoch": 1.1801314263625822, + "grad_norm": 0.6243789196014404, + "learning_rate": 2.2189597315436242e-05, + "loss": 0.6587, + "step": 763 + }, + { + "epoch": 1.1816776188635485, + "grad_norm": 0.6107107996940613, + "learning_rate": 2.2147651006711412e-05, + "loss": 0.7724, + "step": 764 + }, + { + "epoch": 1.1832238113645148, + "grad_norm": 0.6164106130599976, + "learning_rate": 2.210570469798658e-05, + "loss": 0.6949, + "step": 765 + }, + { + "epoch": 1.1847700038654811, + "grad_norm": 0.6295919418334961, + "learning_rate": 2.2063758389261745e-05, + "loss": 0.7239, + "step": 766 + }, + { + "epoch": 1.1863161963664477, + "grad_norm": 0.582129955291748, + "learning_rate": 2.2021812080536912e-05, + "loss": 0.7619, + "step": 767 + }, + { + "epoch": 1.187862388867414, + "grad_norm": 0.639700174331665, + "learning_rate": 2.197986577181208e-05, + "loss": 0.7788, + "step": 768 + }, + { + "epoch": 1.1894085813683803, + "grad_norm": 0.6275160908699036, + "learning_rate": 2.193791946308725e-05, + "loss": 0.7457, + "step": 769 + }, + { + "epoch": 1.1909547738693467, + "grad_norm": 0.5969827175140381, + "learning_rate": 2.1895973154362418e-05, + "loss": 0.7981, + "step": 770 + }, + { + "epoch": 1.192500966370313, + "grad_norm": 0.5809218883514404, + "learning_rate": 2.1854026845637585e-05, + "loss": 0.8194, + "step": 771 + }, + { + "epoch": 1.1940471588712795, + "grad_norm": 0.5926761627197266, + "learning_rate": 2.181208053691275e-05, + "loss": 0.7808, + "step": 772 + }, + { + "epoch": 1.1955933513722459, + "grad_norm": 0.6315684914588928, + "learning_rate": 2.177013422818792e-05, + "loss": 0.8342, + "step": 773 + }, + { + "epoch": 1.1971395438732122, + "grad_norm": 0.5824242830276489, + "learning_rate": 2.1728187919463088e-05, + "loss": 0.8041, + "step": 774 + }, + { + "epoch": 1.1986857363741785, + "grad_norm": 0.6280264258384705, + "learning_rate": 2.1686241610738254e-05, + "loss": 0.7976, + "step": 775 + }, + { + "epoch": 1.2002319288751448, + "grad_norm": 0.6388978958129883, + "learning_rate": 2.1644295302013424e-05, + "loss": 0.7253, + "step": 776 + }, + { + "epoch": 1.2017781213761114, + "grad_norm": 0.6550062894821167, + "learning_rate": 2.160234899328859e-05, + "loss": 0.7531, + "step": 777 + }, + { + "epoch": 1.2033243138770777, + "grad_norm": 0.6625634431838989, + "learning_rate": 2.156040268456376e-05, + "loss": 0.7869, + "step": 778 + }, + { + "epoch": 1.204870506378044, + "grad_norm": 0.6421430706977844, + "learning_rate": 2.1518456375838927e-05, + "loss": 0.8198, + "step": 779 + }, + { + "epoch": 1.2064166988790104, + "grad_norm": 0.6213207840919495, + "learning_rate": 2.1476510067114094e-05, + "loss": 0.8565, + "step": 780 + }, + { + "epoch": 1.2079628913799767, + "grad_norm": 0.6527850031852722, + "learning_rate": 2.143456375838926e-05, + "loss": 0.8084, + "step": 781 + }, + { + "epoch": 1.2095090838809432, + "grad_norm": 0.6330167651176453, + "learning_rate": 2.139261744966443e-05, + "loss": 0.8062, + "step": 782 + }, + { + "epoch": 1.2110552763819096, + "grad_norm": 0.6649383902549744, + "learning_rate": 2.13506711409396e-05, + "loss": 0.7586, + "step": 783 + }, + { + "epoch": 1.212601468882876, + "grad_norm": 0.6256992220878601, + "learning_rate": 2.1308724832214767e-05, + "loss": 0.7785, + "step": 784 + }, + { + "epoch": 1.2141476613838422, + "grad_norm": 0.622163712978363, + "learning_rate": 2.1266778523489934e-05, + "loss": 0.9157, + "step": 785 + }, + { + "epoch": 1.2156938538848086, + "grad_norm": 0.6411583423614502, + "learning_rate": 2.1224832214765103e-05, + "loss": 0.8916, + "step": 786 + }, + { + "epoch": 1.217240046385775, + "grad_norm": 0.6612896919250488, + "learning_rate": 2.118288590604027e-05, + "loss": 0.8234, + "step": 787 + }, + { + "epoch": 1.2187862388867414, + "grad_norm": 0.7050024271011353, + "learning_rate": 2.1140939597315437e-05, + "loss": 0.9097, + "step": 788 + }, + { + "epoch": 1.2203324313877078, + "grad_norm": 0.7046983242034912, + "learning_rate": 2.1098993288590603e-05, + "loss": 0.8503, + "step": 789 + }, + { + "epoch": 1.221878623888674, + "grad_norm": 0.7282384634017944, + "learning_rate": 2.1057046979865773e-05, + "loss": 0.8317, + "step": 790 + }, + { + "epoch": 1.2234248163896404, + "grad_norm": 0.7514353394508362, + "learning_rate": 2.1015100671140943e-05, + "loss": 0.8562, + "step": 791 + }, + { + "epoch": 1.224971008890607, + "grad_norm": 0.6923422813415527, + "learning_rate": 2.097315436241611e-05, + "loss": 0.8234, + "step": 792 + }, + { + "epoch": 1.2265172013915733, + "grad_norm": 0.7773630023002625, + "learning_rate": 2.0931208053691276e-05, + "loss": 0.8988, + "step": 793 + }, + { + "epoch": 1.2280633938925396, + "grad_norm": 0.8075311183929443, + "learning_rate": 2.0889261744966443e-05, + "loss": 0.8421, + "step": 794 + }, + { + "epoch": 1.229609586393506, + "grad_norm": 0.8537722826004028, + "learning_rate": 2.0847315436241613e-05, + "loss": 0.835, + "step": 795 + }, + { + "epoch": 1.2311557788944723, + "grad_norm": 1.1471562385559082, + "learning_rate": 2.080536912751678e-05, + "loss": 0.9985, + "step": 796 + }, + { + "epoch": 1.2327019713954388, + "grad_norm": 0.8220142126083374, + "learning_rate": 2.0763422818791946e-05, + "loss": 0.6295, + "step": 797 + }, + { + "epoch": 1.2342481638964051, + "grad_norm": 0.6230762600898743, + "learning_rate": 2.0721476510067116e-05, + "loss": 0.6921, + "step": 798 + }, + { + "epoch": 1.2357943563973715, + "grad_norm": 0.596222996711731, + "learning_rate": 2.0679530201342286e-05, + "loss": 0.7503, + "step": 799 + }, + { + "epoch": 1.2373405488983378, + "grad_norm": 0.5620123744010925, + "learning_rate": 2.0637583892617452e-05, + "loss": 0.6849, + "step": 800 + }, + { + "epoch": 1.238886741399304, + "grad_norm": 0.5710748434066772, + "learning_rate": 2.059563758389262e-05, + "loss": 0.7431, + "step": 801 + }, + { + "epoch": 1.2404329339002707, + "grad_norm": 0.584814727306366, + "learning_rate": 2.0553691275167785e-05, + "loss": 0.721, + "step": 802 + }, + { + "epoch": 1.241979126401237, + "grad_norm": 0.576964795589447, + "learning_rate": 2.0511744966442952e-05, + "loss": 0.6897, + "step": 803 + }, + { + "epoch": 1.2435253189022033, + "grad_norm": 0.6063031554222107, + "learning_rate": 2.0469798657718122e-05, + "loss": 0.7516, + "step": 804 + }, + { + "epoch": 1.2450715114031696, + "grad_norm": 0.596976101398468, + "learning_rate": 2.042785234899329e-05, + "loss": 0.7488, + "step": 805 + }, + { + "epoch": 1.246617703904136, + "grad_norm": 0.6499916911125183, + "learning_rate": 2.038590604026846e-05, + "loss": 0.7554, + "step": 806 + }, + { + "epoch": 1.2481638964051025, + "grad_norm": 0.5964358448982239, + "learning_rate": 2.0343959731543625e-05, + "loss": 0.7417, + "step": 807 + }, + { + "epoch": 1.2497100889060688, + "grad_norm": 0.5935381650924683, + "learning_rate": 2.0302013422818795e-05, + "loss": 0.7308, + "step": 808 + }, + { + "epoch": 1.2512562814070352, + "grad_norm": 0.6293373703956604, + "learning_rate": 2.026006711409396e-05, + "loss": 0.734, + "step": 809 + }, + { + "epoch": 1.2528024739080015, + "grad_norm": 0.592042088508606, + "learning_rate": 2.0218120805369128e-05, + "loss": 0.7412, + "step": 810 + }, + { + "epoch": 1.2543486664089678, + "grad_norm": 0.5861983895301819, + "learning_rate": 2.0176174496644295e-05, + "loss": 0.8101, + "step": 811 + }, + { + "epoch": 1.2558948589099344, + "grad_norm": 0.6101320385932922, + "learning_rate": 2.013422818791946e-05, + "loss": 0.7394, + "step": 812 + }, + { + "epoch": 1.2574410514109007, + "grad_norm": 0.6127662062644958, + "learning_rate": 2.009228187919463e-05, + "loss": 0.7941, + "step": 813 + }, + { + "epoch": 1.258987243911867, + "grad_norm": 0.5885155200958252, + "learning_rate": 2.00503355704698e-05, + "loss": 0.7987, + "step": 814 + }, + { + "epoch": 1.2605334364128333, + "grad_norm": 0.6043751835823059, + "learning_rate": 2.0008389261744968e-05, + "loss": 0.7918, + "step": 815 + }, + { + "epoch": 1.2620796289137997, + "grad_norm": 0.6351797580718994, + "learning_rate": 1.9966442953020134e-05, + "loss": 0.7218, + "step": 816 + }, + { + "epoch": 1.2636258214147662, + "grad_norm": 0.6218336224555969, + "learning_rate": 1.9924496644295304e-05, + "loss": 0.8294, + "step": 817 + }, + { + "epoch": 1.2651720139157325, + "grad_norm": 0.6148021817207336, + "learning_rate": 1.988255033557047e-05, + "loss": 0.7767, + "step": 818 + }, + { + "epoch": 1.2667182064166989, + "grad_norm": 0.6303946375846863, + "learning_rate": 1.9840604026845637e-05, + "loss": 0.7816, + "step": 819 + }, + { + "epoch": 1.2682643989176652, + "grad_norm": 0.6076005101203918, + "learning_rate": 1.9798657718120804e-05, + "loss": 0.8023, + "step": 820 + }, + { + "epoch": 1.2698105914186315, + "grad_norm": 0.6453294157981873, + "learning_rate": 1.9756711409395974e-05, + "loss": 0.7728, + "step": 821 + }, + { + "epoch": 1.271356783919598, + "grad_norm": 0.6279742121696472, + "learning_rate": 1.9714765100671144e-05, + "loss": 0.7564, + "step": 822 + }, + { + "epoch": 1.2729029764205644, + "grad_norm": 0.6434690356254578, + "learning_rate": 1.967281879194631e-05, + "loss": 0.7989, + "step": 823 + }, + { + "epoch": 1.2744491689215307, + "grad_norm": 0.6816707253456116, + "learning_rate": 1.9630872483221477e-05, + "loss": 0.7964, + "step": 824 + }, + { + "epoch": 1.275995361422497, + "grad_norm": 0.6387498378753662, + "learning_rate": 1.9588926174496643e-05, + "loss": 0.7621, + "step": 825 + }, + { + "epoch": 1.2775415539234634, + "grad_norm": 0.6609524488449097, + "learning_rate": 1.9546979865771813e-05, + "loss": 0.8252, + "step": 826 + }, + { + "epoch": 1.27908774642443, + "grad_norm": 0.6324920654296875, + "learning_rate": 1.950503355704698e-05, + "loss": 0.7502, + "step": 827 + }, + { + "epoch": 1.2806339389253962, + "grad_norm": 0.6569236516952515, + "learning_rate": 1.946308724832215e-05, + "loss": 0.8148, + "step": 828 + }, + { + "epoch": 1.2821801314263626, + "grad_norm": 0.625497043132782, + "learning_rate": 1.9421140939597316e-05, + "loss": 0.8663, + "step": 829 + }, + { + "epoch": 1.2837263239273289, + "grad_norm": 0.6833832263946533, + "learning_rate": 1.9379194630872486e-05, + "loss": 0.692, + "step": 830 + }, + { + "epoch": 1.2852725164282952, + "grad_norm": 0.6534375548362732, + "learning_rate": 1.9337248322147653e-05, + "loss": 0.7731, + "step": 831 + }, + { + "epoch": 1.2868187089292618, + "grad_norm": 0.6668411493301392, + "learning_rate": 1.929530201342282e-05, + "loss": 0.758, + "step": 832 + }, + { + "epoch": 1.288364901430228, + "grad_norm": 0.6552398204803467, + "learning_rate": 1.9253355704697986e-05, + "loss": 0.8387, + "step": 833 + }, + { + "epoch": 1.2899110939311944, + "grad_norm": 0.6804989576339722, + "learning_rate": 1.9211409395973153e-05, + "loss": 0.823, + "step": 834 + }, + { + "epoch": 1.2914572864321607, + "grad_norm": 0.6738516092300415, + "learning_rate": 1.9169463087248323e-05, + "loss": 0.8759, + "step": 835 + }, + { + "epoch": 1.293003478933127, + "grad_norm": 0.6957226991653442, + "learning_rate": 1.9127516778523493e-05, + "loss": 0.8465, + "step": 836 + }, + { + "epoch": 1.2945496714340936, + "grad_norm": 0.7255749106407166, + "learning_rate": 1.908557046979866e-05, + "loss": 0.7872, + "step": 837 + }, + { + "epoch": 1.29609586393506, + "grad_norm": 0.7283822894096375, + "learning_rate": 1.9043624161073826e-05, + "loss": 0.8759, + "step": 838 + }, + { + "epoch": 1.2976420564360263, + "grad_norm": 0.7126689553260803, + "learning_rate": 1.9001677852348996e-05, + "loss": 0.8252, + "step": 839 + }, + { + "epoch": 1.2991882489369926, + "grad_norm": 0.7381671071052551, + "learning_rate": 1.8959731543624162e-05, + "loss": 0.9227, + "step": 840 + }, + { + "epoch": 1.300734441437959, + "grad_norm": 0.721825897693634, + "learning_rate": 1.891778523489933e-05, + "loss": 0.9147, + "step": 841 + }, + { + "epoch": 1.3022806339389255, + "grad_norm": 0.719792902469635, + "learning_rate": 1.8875838926174495e-05, + "loss": 0.9038, + "step": 842 + }, + { + "epoch": 1.3038268264398918, + "grad_norm": 0.7600436806678772, + "learning_rate": 1.8833892617449665e-05, + "loss": 0.903, + "step": 843 + }, + { + "epoch": 1.305373018940858, + "grad_norm": 0.743240475654602, + "learning_rate": 1.8791946308724835e-05, + "loss": 0.8799, + "step": 844 + }, + { + "epoch": 1.3069192114418244, + "grad_norm": 0.8364670872688293, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.9059, + "step": 845 + }, + { + "epoch": 1.3084654039427908, + "grad_norm": 0.9684616327285767, + "learning_rate": 1.870805369127517e-05, + "loss": 0.9468, + "step": 846 + }, + { + "epoch": 1.3100115964437573, + "grad_norm": 0.7947605848312378, + "learning_rate": 1.8666107382550335e-05, + "loss": 0.6681, + "step": 847 + }, + { + "epoch": 1.3115577889447236, + "grad_norm": 0.5737844109535217, + "learning_rate": 1.8624161073825505e-05, + "loss": 0.6837, + "step": 848 + }, + { + "epoch": 1.31310398144569, + "grad_norm": 0.5890055298805237, + "learning_rate": 1.858221476510067e-05, + "loss": 0.7115, + "step": 849 + }, + { + "epoch": 1.3146501739466563, + "grad_norm": 0.6102372407913208, + "learning_rate": 1.8540268456375838e-05, + "loss": 0.7131, + "step": 850 + }, + { + "epoch": 1.3161963664476226, + "grad_norm": 0.5825210809707642, + "learning_rate": 1.8498322147651008e-05, + "loss": 0.6873, + "step": 851 + }, + { + "epoch": 1.3177425589485892, + "grad_norm": 0.5857095718383789, + "learning_rate": 1.8456375838926178e-05, + "loss": 0.7435, + "step": 852 + }, + { + "epoch": 1.3192887514495555, + "grad_norm": 0.60005122423172, + "learning_rate": 1.8414429530201344e-05, + "loss": 0.6988, + "step": 853 + }, + { + "epoch": 1.3208349439505218, + "grad_norm": 0.6294332146644592, + "learning_rate": 1.837248322147651e-05, + "loss": 0.7796, + "step": 854 + }, + { + "epoch": 1.3223811364514881, + "grad_norm": 0.6006381511688232, + "learning_rate": 1.8330536912751678e-05, + "loss": 0.7677, + "step": 855 + }, + { + "epoch": 1.3239273289524545, + "grad_norm": 0.5754826664924622, + "learning_rate": 1.8288590604026847e-05, + "loss": 0.7845, + "step": 856 + }, + { + "epoch": 1.325473521453421, + "grad_norm": 0.6118499040603638, + "learning_rate": 1.8246644295302014e-05, + "loss": 0.6819, + "step": 857 + }, + { + "epoch": 1.3270197139543873, + "grad_norm": 0.5924245119094849, + "learning_rate": 1.820469798657718e-05, + "loss": 0.8092, + "step": 858 + }, + { + "epoch": 1.3285659064553537, + "grad_norm": 0.6202556490898132, + "learning_rate": 1.816275167785235e-05, + "loss": 0.7992, + "step": 859 + }, + { + "epoch": 1.33011209895632, + "grad_norm": 0.6356550455093384, + "learning_rate": 1.8120805369127517e-05, + "loss": 0.7731, + "step": 860 + }, + { + "epoch": 1.3316582914572863, + "grad_norm": 0.6205728650093079, + "learning_rate": 1.8078859060402687e-05, + "loss": 0.7951, + "step": 861 + }, + { + "epoch": 1.3332044839582529, + "grad_norm": 0.6306980848312378, + "learning_rate": 1.8036912751677854e-05, + "loss": 0.7904, + "step": 862 + }, + { + "epoch": 1.3347506764592192, + "grad_norm": 0.5935050249099731, + "learning_rate": 1.799496644295302e-05, + "loss": 0.8029, + "step": 863 + }, + { + "epoch": 1.3362968689601855, + "grad_norm": 0.6275442838668823, + "learning_rate": 1.7953020134228187e-05, + "loss": 0.7376, + "step": 864 + }, + { + "epoch": 1.3378430614611518, + "grad_norm": 0.6528346538543701, + "learning_rate": 1.7911073825503357e-05, + "loss": 0.7286, + "step": 865 + }, + { + "epoch": 1.3393892539621182, + "grad_norm": 0.6417142748832703, + "learning_rate": 1.7869127516778523e-05, + "loss": 0.7632, + "step": 866 + }, + { + "epoch": 1.3409354464630847, + "grad_norm": 0.5968343615531921, + "learning_rate": 1.7827181208053693e-05, + "loss": 0.8156, + "step": 867 + }, + { + "epoch": 1.342481638964051, + "grad_norm": 0.5820614695549011, + "learning_rate": 1.778523489932886e-05, + "loss": 0.7957, + "step": 868 + }, + { + "epoch": 1.3440278314650174, + "grad_norm": 0.576564610004425, + "learning_rate": 1.7743288590604026e-05, + "loss": 0.7799, + "step": 869 + }, + { + "epoch": 1.3455740239659837, + "grad_norm": 0.6096657514572144, + "learning_rate": 1.7701342281879196e-05, + "loss": 0.7929, + "step": 870 + }, + { + "epoch": 1.34712021646695, + "grad_norm": 0.6413806080818176, + "learning_rate": 1.7659395973154363e-05, + "loss": 0.7468, + "step": 871 + }, + { + "epoch": 1.3486664089679166, + "grad_norm": 0.6387087106704712, + "learning_rate": 1.761744966442953e-05, + "loss": 0.7342, + "step": 872 + }, + { + "epoch": 1.350212601468883, + "grad_norm": 0.6342037320137024, + "learning_rate": 1.75755033557047e-05, + "loss": 0.829, + "step": 873 + }, + { + "epoch": 1.3517587939698492, + "grad_norm": 0.6201750636100769, + "learning_rate": 1.753355704697987e-05, + "loss": 0.7289, + "step": 874 + }, + { + "epoch": 1.3533049864708155, + "grad_norm": 0.6321792006492615, + "learning_rate": 1.7491610738255036e-05, + "loss": 0.7253, + "step": 875 + }, + { + "epoch": 1.3548511789717819, + "grad_norm": 0.6561968922615051, + "learning_rate": 1.7449664429530202e-05, + "loss": 0.7157, + "step": 876 + }, + { + "epoch": 1.3563973714727484, + "grad_norm": 0.6112854480743408, + "learning_rate": 1.740771812080537e-05, + "loss": 0.7421, + "step": 877 + }, + { + "epoch": 1.3579435639737147, + "grad_norm": 0.6668578386306763, + "learning_rate": 1.736577181208054e-05, + "loss": 0.7849, + "step": 878 + }, + { + "epoch": 1.359489756474681, + "grad_norm": 0.6496410965919495, + "learning_rate": 1.7323825503355705e-05, + "loss": 0.8031, + "step": 879 + }, + { + "epoch": 1.3610359489756474, + "grad_norm": 0.6386198401451111, + "learning_rate": 1.7281879194630872e-05, + "loss": 0.8812, + "step": 880 + }, + { + "epoch": 1.3625821414766137, + "grad_norm": 0.6844744682312012, + "learning_rate": 1.7239932885906042e-05, + "loss": 0.8245, + "step": 881 + }, + { + "epoch": 1.3641283339775803, + "grad_norm": 0.6815952658653259, + "learning_rate": 1.719798657718121e-05, + "loss": 0.8751, + "step": 882 + }, + { + "epoch": 1.3656745264785466, + "grad_norm": 0.6746396422386169, + "learning_rate": 1.715604026845638e-05, + "loss": 0.812, + "step": 883 + }, + { + "epoch": 1.367220718979513, + "grad_norm": 0.6984376907348633, + "learning_rate": 1.7114093959731545e-05, + "loss": 0.8418, + "step": 884 + }, + { + "epoch": 1.3687669114804792, + "grad_norm": 0.6949034929275513, + "learning_rate": 1.707214765100671e-05, + "loss": 0.8032, + "step": 885 + }, + { + "epoch": 1.3703131039814456, + "grad_norm": 0.6689850091934204, + "learning_rate": 1.7030201342281878e-05, + "loss": 0.8363, + "step": 886 + }, + { + "epoch": 1.3718592964824121, + "grad_norm": 0.6734853386878967, + "learning_rate": 1.6988255033557048e-05, + "loss": 0.8643, + "step": 887 + }, + { + "epoch": 1.3734054889833784, + "grad_norm": 0.7207046747207642, + "learning_rate": 1.6946308724832215e-05, + "loss": 0.853, + "step": 888 + }, + { + "epoch": 1.3749516814843448, + "grad_norm": 0.7100691199302673, + "learning_rate": 1.6904362416107385e-05, + "loss": 0.8776, + "step": 889 + }, + { + "epoch": 1.376497873985311, + "grad_norm": 0.7267347574234009, + "learning_rate": 1.686241610738255e-05, + "loss": 0.817, + "step": 890 + }, + { + "epoch": 1.3780440664862774, + "grad_norm": 0.7453994154930115, + "learning_rate": 1.6820469798657718e-05, + "loss": 0.7864, + "step": 891 + }, + { + "epoch": 1.379590258987244, + "grad_norm": 0.7423779368400574, + "learning_rate": 1.6778523489932888e-05, + "loss": 0.8475, + "step": 892 + }, + { + "epoch": 1.3811364514882103, + "grad_norm": 0.7435901165008545, + "learning_rate": 1.6736577181208054e-05, + "loss": 0.8409, + "step": 893 + }, + { + "epoch": 1.3826826439891766, + "grad_norm": 0.8185101747512817, + "learning_rate": 1.669463087248322e-05, + "loss": 0.9255, + "step": 894 + }, + { + "epoch": 1.384228836490143, + "grad_norm": 0.8886857628822327, + "learning_rate": 1.6652684563758387e-05, + "loss": 0.9095, + "step": 895 + }, + { + "epoch": 1.3857750289911093, + "grad_norm": 1.2426859140396118, + "learning_rate": 1.6610738255033557e-05, + "loss": 0.9899, + "step": 896 + }, + { + "epoch": 1.3873212214920758, + "grad_norm": 0.7389244437217712, + "learning_rate": 1.6568791946308727e-05, + "loss": 0.5189, + "step": 897 + }, + { + "epoch": 1.3888674139930421, + "grad_norm": 0.5934030413627625, + "learning_rate": 1.6526845637583894e-05, + "loss": 0.6868, + "step": 898 + }, + { + "epoch": 1.3904136064940085, + "grad_norm": 0.5986191630363464, + "learning_rate": 1.648489932885906e-05, + "loss": 0.7075, + "step": 899 + }, + { + "epoch": 1.3919597989949748, + "grad_norm": 0.6108096241950989, + "learning_rate": 1.644295302013423e-05, + "loss": 0.6855, + "step": 900 + }, + { + "epoch": 1.3935059914959411, + "grad_norm": 0.5924285650253296, + "learning_rate": 1.6401006711409397e-05, + "loss": 0.7459, + "step": 901 + }, + { + "epoch": 1.3950521839969077, + "grad_norm": 0.627100944519043, + "learning_rate": 1.6359060402684563e-05, + "loss": 0.71, + "step": 902 + }, + { + "epoch": 1.396598376497874, + "grad_norm": 0.6151503920555115, + "learning_rate": 1.631711409395973e-05, + "loss": 0.6844, + "step": 903 + }, + { + "epoch": 1.3981445689988403, + "grad_norm": 0.6181209683418274, + "learning_rate": 1.62751677852349e-05, + "loss": 0.7635, + "step": 904 + }, + { + "epoch": 1.3996907614998066, + "grad_norm": 0.5866500735282898, + "learning_rate": 1.623322147651007e-05, + "loss": 0.742, + "step": 905 + }, + { + "epoch": 1.401236954000773, + "grad_norm": 0.6311929225921631, + "learning_rate": 1.6191275167785237e-05, + "loss": 0.734, + "step": 906 + }, + { + "epoch": 1.4027831465017395, + "grad_norm": 0.6104720830917358, + "learning_rate": 1.6149328859060403e-05, + "loss": 0.7187, + "step": 907 + }, + { + "epoch": 1.4043293390027058, + "grad_norm": 0.6044167280197144, + "learning_rate": 1.610738255033557e-05, + "loss": 0.717, + "step": 908 + }, + { + "epoch": 1.4058755315036722, + "grad_norm": 0.5984567999839783, + "learning_rate": 1.606543624161074e-05, + "loss": 0.8021, + "step": 909 + }, + { + "epoch": 1.4074217240046385, + "grad_norm": 0.6355494260787964, + "learning_rate": 1.6023489932885906e-05, + "loss": 0.8507, + "step": 910 + }, + { + "epoch": 1.4089679165056048, + "grad_norm": 0.6177673935890198, + "learning_rate": 1.5981543624161076e-05, + "loss": 0.7205, + "step": 911 + }, + { + "epoch": 1.4105141090065714, + "grad_norm": 0.6486150026321411, + "learning_rate": 1.5939597315436243e-05, + "loss": 0.8078, + "step": 912 + }, + { + "epoch": 1.4120603015075377, + "grad_norm": 0.6070188879966736, + "learning_rate": 1.5897651006711413e-05, + "loss": 0.7067, + "step": 913 + }, + { + "epoch": 1.413606494008504, + "grad_norm": 0.6363996267318726, + "learning_rate": 1.585570469798658e-05, + "loss": 0.7621, + "step": 914 + }, + { + "epoch": 1.4151526865094703, + "grad_norm": 0.6805376410484314, + "learning_rate": 1.5813758389261746e-05, + "loss": 0.7354, + "step": 915 + }, + { + "epoch": 1.4166988790104367, + "grad_norm": 0.6029914617538452, + "learning_rate": 1.5771812080536912e-05, + "loss": 0.7425, + "step": 916 + }, + { + "epoch": 1.4182450715114032, + "grad_norm": 0.6197159290313721, + "learning_rate": 1.572986577181208e-05, + "loss": 0.769, + "step": 917 + }, + { + "epoch": 1.4197912640123695, + "grad_norm": 0.6450138092041016, + "learning_rate": 1.568791946308725e-05, + "loss": 0.7455, + "step": 918 + }, + { + "epoch": 1.4213374565133359, + "grad_norm": 0.6024583578109741, + "learning_rate": 1.564597315436242e-05, + "loss": 0.783, + "step": 919 + }, + { + "epoch": 1.4228836490143022, + "grad_norm": 0.6291201114654541, + "learning_rate": 1.5604026845637585e-05, + "loss": 0.7888, + "step": 920 + }, + { + "epoch": 1.4244298415152685, + "grad_norm": 0.6113951802253723, + "learning_rate": 1.5562080536912752e-05, + "loss": 0.7682, + "step": 921 + }, + { + "epoch": 1.425976034016235, + "grad_norm": 0.6421045064926147, + "learning_rate": 1.5520134228187922e-05, + "loss": 0.7691, + "step": 922 + }, + { + "epoch": 1.4275222265172014, + "grad_norm": 0.6352230906486511, + "learning_rate": 1.547818791946309e-05, + "loss": 0.7902, + "step": 923 + }, + { + "epoch": 1.4290684190181677, + "grad_norm": 0.677621066570282, + "learning_rate": 1.5436241610738255e-05, + "loss": 0.765, + "step": 924 + }, + { + "epoch": 1.430614611519134, + "grad_norm": 0.6393842697143555, + "learning_rate": 1.539429530201342e-05, + "loss": 0.6943, + "step": 925 + }, + { + "epoch": 1.4321608040201004, + "grad_norm": 0.6991260051727295, + "learning_rate": 1.535234899328859e-05, + "loss": 0.7966, + "step": 926 + }, + { + "epoch": 1.433706996521067, + "grad_norm": 0.642305314540863, + "learning_rate": 1.531040268456376e-05, + "loss": 0.8245, + "step": 927 + }, + { + "epoch": 1.4352531890220332, + "grad_norm": 0.6572315692901611, + "learning_rate": 1.5268456375838928e-05, + "loss": 0.7562, + "step": 928 + }, + { + "epoch": 1.4367993815229996, + "grad_norm": 0.6849440336227417, + "learning_rate": 1.5226510067114095e-05, + "loss": 0.8522, + "step": 929 + }, + { + "epoch": 1.4383455740239661, + "grad_norm": 0.7153504490852356, + "learning_rate": 1.5184563758389261e-05, + "loss": 0.7843, + "step": 930 + }, + { + "epoch": 1.4398917665249322, + "grad_norm": 0.6780909895896912, + "learning_rate": 1.5142617449664431e-05, + "loss": 0.8062, + "step": 931 + }, + { + "epoch": 1.4414379590258988, + "grad_norm": 0.6425846815109253, + "learning_rate": 1.51006711409396e-05, + "loss": 0.7866, + "step": 932 + }, + { + "epoch": 1.442984151526865, + "grad_norm": 0.6629062294960022, + "learning_rate": 1.5058724832214766e-05, + "loss": 0.8047, + "step": 933 + }, + { + "epoch": 1.4445303440278314, + "grad_norm": 0.6615095734596252, + "learning_rate": 1.5016778523489932e-05, + "loss": 0.8117, + "step": 934 + }, + { + "epoch": 1.446076536528798, + "grad_norm": 0.718272864818573, + "learning_rate": 1.4974832214765102e-05, + "loss": 0.8416, + "step": 935 + }, + { + "epoch": 1.447622729029764, + "grad_norm": 0.7286227941513062, + "learning_rate": 1.493288590604027e-05, + "loss": 0.8347, + "step": 936 + }, + { + "epoch": 1.4491689215307306, + "grad_norm": 0.7250097990036011, + "learning_rate": 1.4890939597315437e-05, + "loss": 0.8095, + "step": 937 + }, + { + "epoch": 1.450715114031697, + "grad_norm": 0.7696204781532288, + "learning_rate": 1.4848993288590604e-05, + "loss": 0.8611, + "step": 938 + }, + { + "epoch": 1.4522613065326633, + "grad_norm": 0.7375919222831726, + "learning_rate": 1.4807046979865772e-05, + "loss": 0.8903, + "step": 939 + }, + { + "epoch": 1.4538074990336298, + "grad_norm": 0.7759246230125427, + "learning_rate": 1.4765100671140942e-05, + "loss": 0.8005, + "step": 940 + }, + { + "epoch": 1.455353691534596, + "grad_norm": 0.8237205743789673, + "learning_rate": 1.4723154362416108e-05, + "loss": 0.8204, + "step": 941 + }, + { + "epoch": 1.4568998840355625, + "grad_norm": 0.7622566819190979, + "learning_rate": 1.4681208053691275e-05, + "loss": 0.8673, + "step": 942 + }, + { + "epoch": 1.4584460765365288, + "grad_norm": 0.7782172560691833, + "learning_rate": 1.4639261744966443e-05, + "loss": 0.9434, + "step": 943 + }, + { + "epoch": 1.4599922690374951, + "grad_norm": 0.856338381767273, + "learning_rate": 1.4597315436241613e-05, + "loss": 0.9032, + "step": 944 + }, + { + "epoch": 1.4615384615384617, + "grad_norm": 0.9459933042526245, + "learning_rate": 1.455536912751678e-05, + "loss": 0.8599, + "step": 945 + }, + { + "epoch": 1.4630846540394278, + "grad_norm": 1.1424592733383179, + "learning_rate": 1.4513422818791946e-05, + "loss": 0.879, + "step": 946 + }, + { + "epoch": 1.4646308465403943, + "grad_norm": 0.9243542551994324, + "learning_rate": 1.4471476510067115e-05, + "loss": 0.6613, + "step": 947 + }, + { + "epoch": 1.4661770390413607, + "grad_norm": 0.5729179978370667, + "learning_rate": 1.4429530201342285e-05, + "loss": 0.6693, + "step": 948 + }, + { + "epoch": 1.467723231542327, + "grad_norm": 0.6040563583374023, + "learning_rate": 1.4387583892617451e-05, + "loss": 0.663, + "step": 949 + }, + { + "epoch": 1.4692694240432935, + "grad_norm": 0.6559063196182251, + "learning_rate": 1.4345637583892618e-05, + "loss": 0.691, + "step": 950 + }, + { + "epoch": 1.4708156165442596, + "grad_norm": 0.5916107296943665, + "learning_rate": 1.4303691275167786e-05, + "loss": 0.7474, + "step": 951 + }, + { + "epoch": 1.4723618090452262, + "grad_norm": 0.5876263380050659, + "learning_rate": 1.4261744966442953e-05, + "loss": 0.7287, + "step": 952 + }, + { + "epoch": 1.4739080015461925, + "grad_norm": 0.5855628848075867, + "learning_rate": 1.4219798657718122e-05, + "loss": 0.7254, + "step": 953 + }, + { + "epoch": 1.4754541940471588, + "grad_norm": 0.5839198231697083, + "learning_rate": 1.4177852348993289e-05, + "loss": 0.7462, + "step": 954 + }, + { + "epoch": 1.4770003865481254, + "grad_norm": 0.5933511257171631, + "learning_rate": 1.4135906040268457e-05, + "loss": 0.7303, + "step": 955 + }, + { + "epoch": 1.4785465790490915, + "grad_norm": 0.620617687702179, + "learning_rate": 1.4093959731543624e-05, + "loss": 0.7287, + "step": 956 + }, + { + "epoch": 1.480092771550058, + "grad_norm": 0.6549968719482422, + "learning_rate": 1.4052013422818794e-05, + "loss": 0.7426, + "step": 957 + }, + { + "epoch": 1.4816389640510244, + "grad_norm": 0.6639398336410522, + "learning_rate": 1.401006711409396e-05, + "loss": 0.7329, + "step": 958 + }, + { + "epoch": 1.4831851565519907, + "grad_norm": 0.6285966634750366, + "learning_rate": 1.3968120805369129e-05, + "loss": 0.7302, + "step": 959 + }, + { + "epoch": 1.4847313490529572, + "grad_norm": 0.6673871278762817, + "learning_rate": 1.3926174496644295e-05, + "loss": 0.7165, + "step": 960 + }, + { + "epoch": 1.4862775415539233, + "grad_norm": 0.6557399034500122, + "learning_rate": 1.3884228187919462e-05, + "loss": 0.7465, + "step": 961 + }, + { + "epoch": 1.4878237340548899, + "grad_norm": 0.615143358707428, + "learning_rate": 1.3842281879194632e-05, + "loss": 0.7531, + "step": 962 + }, + { + "epoch": 1.4893699265558562, + "grad_norm": 0.6445402503013611, + "learning_rate": 1.38003355704698e-05, + "loss": 0.7088, + "step": 963 + }, + { + "epoch": 1.4909161190568225, + "grad_norm": 0.6428129076957703, + "learning_rate": 1.3758389261744966e-05, + "loss": 0.7011, + "step": 964 + }, + { + "epoch": 1.492462311557789, + "grad_norm": 0.6472491025924683, + "learning_rate": 1.3716442953020133e-05, + "loss": 0.8012, + "step": 965 + }, + { + "epoch": 1.4940085040587552, + "grad_norm": 0.6467485427856445, + "learning_rate": 1.3674496644295303e-05, + "loss": 0.7108, + "step": 966 + }, + { + "epoch": 1.4955546965597217, + "grad_norm": 0.648501455783844, + "learning_rate": 1.3632550335570471e-05, + "loss": 0.7786, + "step": 967 + }, + { + "epoch": 1.497100889060688, + "grad_norm": 0.6614555716514587, + "learning_rate": 1.3590604026845638e-05, + "loss": 0.6745, + "step": 968 + }, + { + "epoch": 1.4986470815616544, + "grad_norm": 0.6633491516113281, + "learning_rate": 1.3548657718120804e-05, + "loss": 0.7064, + "step": 969 + }, + { + "epoch": 1.500193274062621, + "grad_norm": 0.658253014087677, + "learning_rate": 1.3506711409395974e-05, + "loss": 0.7438, + "step": 970 + }, + { + "epoch": 1.501739466563587, + "grad_norm": 0.6717640161514282, + "learning_rate": 1.3464765100671143e-05, + "loss": 0.7792, + "step": 971 + }, + { + "epoch": 1.5032856590645536, + "grad_norm": 0.6208140850067139, + "learning_rate": 1.3422818791946309e-05, + "loss": 0.7973, + "step": 972 + }, + { + "epoch": 1.50483185156552, + "grad_norm": 0.6302463412284851, + "learning_rate": 1.3380872483221477e-05, + "loss": 0.785, + "step": 973 + }, + { + "epoch": 1.5063780440664862, + "grad_norm": 0.6479527950286865, + "learning_rate": 1.3338926174496644e-05, + "loss": 0.8266, + "step": 974 + }, + { + "epoch": 1.5079242365674528, + "grad_norm": 0.6340357661247253, + "learning_rate": 1.3296979865771814e-05, + "loss": 0.7134, + "step": 975 + }, + { + "epoch": 1.5094704290684189, + "grad_norm": 0.6478318572044373, + "learning_rate": 1.325503355704698e-05, + "loss": 0.7638, + "step": 976 + }, + { + "epoch": 1.5110166215693854, + "grad_norm": 0.6769170761108398, + "learning_rate": 1.3213087248322149e-05, + "loss": 0.7754, + "step": 977 + }, + { + "epoch": 1.5125628140703518, + "grad_norm": 0.6835642457008362, + "learning_rate": 1.3171140939597315e-05, + "loss": 0.7856, + "step": 978 + }, + { + "epoch": 1.514109006571318, + "grad_norm": 0.700343668460846, + "learning_rate": 1.3129194630872485e-05, + "loss": 0.7428, + "step": 979 + }, + { + "epoch": 1.5156551990722846, + "grad_norm": 0.6960814595222473, + "learning_rate": 1.3087248322147652e-05, + "loss": 0.7649, + "step": 980 + }, + { + "epoch": 1.5172013915732507, + "grad_norm": 0.6891623735427856, + "learning_rate": 1.304530201342282e-05, + "loss": 0.8444, + "step": 981 + }, + { + "epoch": 1.5187475840742173, + "grad_norm": 0.6881600618362427, + "learning_rate": 1.3003355704697987e-05, + "loss": 0.7956, + "step": 982 + }, + { + "epoch": 1.5202937765751836, + "grad_norm": 0.6770979762077332, + "learning_rate": 1.2961409395973153e-05, + "loss": 0.8173, + "step": 983 + }, + { + "epoch": 1.52183996907615, + "grad_norm": 0.6920093894004822, + "learning_rate": 1.2919463087248323e-05, + "loss": 0.8522, + "step": 984 + }, + { + "epoch": 1.5233861615771165, + "grad_norm": 0.6803298592567444, + "learning_rate": 1.2877516778523491e-05, + "loss": 0.8345, + "step": 985 + }, + { + "epoch": 1.5249323540780826, + "grad_norm": 0.6997124552726746, + "learning_rate": 1.2835570469798658e-05, + "loss": 0.8141, + "step": 986 + }, + { + "epoch": 1.5264785465790491, + "grad_norm": 0.7112342715263367, + "learning_rate": 1.2793624161073825e-05, + "loss": 0.8196, + "step": 987 + }, + { + "epoch": 1.5280247390800155, + "grad_norm": 0.7443994879722595, + "learning_rate": 1.2751677852348994e-05, + "loss": 0.8575, + "step": 988 + }, + { + "epoch": 1.5295709315809818, + "grad_norm": 0.6785229444503784, + "learning_rate": 1.2709731543624163e-05, + "loss": 0.8843, + "step": 989 + }, + { + "epoch": 1.5311171240819483, + "grad_norm": 0.7394726276397705, + "learning_rate": 1.266778523489933e-05, + "loss": 0.9496, + "step": 990 + }, + { + "epoch": 1.5326633165829144, + "grad_norm": 0.738186776638031, + "learning_rate": 1.2625838926174496e-05, + "loss": 0.9172, + "step": 991 + }, + { + "epoch": 1.534209509083881, + "grad_norm": 0.7867004871368408, + "learning_rate": 1.2583892617449666e-05, + "loss": 0.8743, + "step": 992 + }, + { + "epoch": 1.5357557015848473, + "grad_norm": 0.8075705170631409, + "learning_rate": 1.2541946308724834e-05, + "loss": 0.9021, + "step": 993 + }, + { + "epoch": 1.5373018940858136, + "grad_norm": 0.8144130110740662, + "learning_rate": 1.25e-05, + "loss": 0.8258, + "step": 994 + }, + { + "epoch": 1.5388480865867802, + "grad_norm": 0.8460942506790161, + "learning_rate": 1.2458053691275167e-05, + "loss": 0.8051, + "step": 995 + }, + { + "epoch": 1.5403942790877463, + "grad_norm": 1.0733917951583862, + "learning_rate": 1.2416107382550337e-05, + "loss": 0.9469, + "step": 996 + }, + { + "epoch": 1.5419404715887128, + "grad_norm": 0.6984366178512573, + "learning_rate": 1.2374161073825504e-05, + "loss": 0.6271, + "step": 997 + }, + { + "epoch": 1.5434866640896792, + "grad_norm": 0.5974945425987244, + "learning_rate": 1.2332214765100672e-05, + "loss": 0.6767, + "step": 998 + }, + { + "epoch": 1.5450328565906455, + "grad_norm": 0.5917332172393799, + "learning_rate": 1.2290268456375838e-05, + "loss": 0.7027, + "step": 999 + }, + { + "epoch": 1.546579049091612, + "grad_norm": 0.5879162549972534, + "learning_rate": 1.2248322147651008e-05, + "loss": 0.6444, + "step": 1000 + }, + { + "epoch": 1.5481252415925781, + "grad_norm": 0.5611408948898315, + "learning_rate": 1.2206375838926175e-05, + "loss": 0.6743, + "step": 1001 + }, + { + "epoch": 1.5496714340935447, + "grad_norm": 0.6579827666282654, + "learning_rate": 1.2164429530201343e-05, + "loss": 0.6863, + "step": 1002 + }, + { + "epoch": 1.551217626594511, + "grad_norm": 0.6543861031532288, + "learning_rate": 1.212248322147651e-05, + "loss": 0.6911, + "step": 1003 + }, + { + "epoch": 1.5527638190954773, + "grad_norm": 0.6316918134689331, + "learning_rate": 1.208053691275168e-05, + "loss": 0.7126, + "step": 1004 + }, + { + "epoch": 1.5543100115964439, + "grad_norm": 0.6070011258125305, + "learning_rate": 1.2038590604026846e-05, + "loss": 0.5802, + "step": 1005 + }, + { + "epoch": 1.55585620409741, + "grad_norm": 0.6225163340568542, + "learning_rate": 1.1996644295302013e-05, + "loss": 0.7565, + "step": 1006 + }, + { + "epoch": 1.5574023965983765, + "grad_norm": 0.6303391456604004, + "learning_rate": 1.1954697986577181e-05, + "loss": 0.7537, + "step": 1007 + }, + { + "epoch": 1.5589485890993429, + "grad_norm": 0.6572604179382324, + "learning_rate": 1.191275167785235e-05, + "loss": 0.7332, + "step": 1008 + }, + { + "epoch": 1.5604947816003092, + "grad_norm": 0.6323292255401611, + "learning_rate": 1.1870805369127518e-05, + "loss": 0.7537, + "step": 1009 + }, + { + "epoch": 1.5620409741012757, + "grad_norm": 0.621051549911499, + "learning_rate": 1.1828859060402684e-05, + "loss": 0.7371, + "step": 1010 + }, + { + "epoch": 1.5635871666022418, + "grad_norm": 0.6632164120674133, + "learning_rate": 1.1786912751677852e-05, + "loss": 0.7169, + "step": 1011 + }, + { + "epoch": 1.5651333591032084, + "grad_norm": 0.6632513999938965, + "learning_rate": 1.174496644295302e-05, + "loss": 0.726, + "step": 1012 + }, + { + "epoch": 1.5666795516041747, + "grad_norm": 0.6338618397712708, + "learning_rate": 1.1703020134228189e-05, + "loss": 0.7774, + "step": 1013 + }, + { + "epoch": 1.568225744105141, + "grad_norm": 0.6426889896392822, + "learning_rate": 1.1661073825503356e-05, + "loss": 0.712, + "step": 1014 + }, + { + "epoch": 1.5697719366061076, + "grad_norm": 0.6418123841285706, + "learning_rate": 1.1619127516778524e-05, + "loss": 0.692, + "step": 1015 + }, + { + "epoch": 1.5713181291070737, + "grad_norm": 0.6338704228401184, + "learning_rate": 1.1577181208053692e-05, + "loss": 0.7498, + "step": 1016 + }, + { + "epoch": 1.5728643216080402, + "grad_norm": 0.6129948496818542, + "learning_rate": 1.1535234899328859e-05, + "loss": 0.8721, + "step": 1017 + }, + { + "epoch": 1.5744105141090066, + "grad_norm": 0.6413542032241821, + "learning_rate": 1.1493288590604027e-05, + "loss": 0.8381, + "step": 1018 + }, + { + "epoch": 1.575956706609973, + "grad_norm": 0.6431681513786316, + "learning_rate": 1.1451342281879195e-05, + "loss": 0.7365, + "step": 1019 + }, + { + "epoch": 1.5775028991109394, + "grad_norm": 0.6414735317230225, + "learning_rate": 1.1409395973154363e-05, + "loss": 0.8006, + "step": 1020 + }, + { + "epoch": 1.5790490916119055, + "grad_norm": 0.6431716680526733, + "learning_rate": 1.136744966442953e-05, + "loss": 0.7759, + "step": 1021 + }, + { + "epoch": 1.580595284112872, + "grad_norm": 0.6646420359611511, + "learning_rate": 1.1325503355704698e-05, + "loss": 0.7038, + "step": 1022 + }, + { + "epoch": 1.5821414766138384, + "grad_norm": 0.7008923292160034, + "learning_rate": 1.1283557046979866e-05, + "loss": 0.719, + "step": 1023 + }, + { + "epoch": 1.5836876691148047, + "grad_norm": 0.6393096446990967, + "learning_rate": 1.1241610738255035e-05, + "loss": 0.7841, + "step": 1024 + }, + { + "epoch": 1.5852338616157713, + "grad_norm": 0.6879417300224304, + "learning_rate": 1.1199664429530201e-05, + "loss": 0.8218, + "step": 1025 + }, + { + "epoch": 1.5867800541167374, + "grad_norm": 0.6500439047813416, + "learning_rate": 1.115771812080537e-05, + "loss": 0.8026, + "step": 1026 + }, + { + "epoch": 1.588326246617704, + "grad_norm": 0.6967812180519104, + "learning_rate": 1.1115771812080538e-05, + "loss": 0.7494, + "step": 1027 + }, + { + "epoch": 1.5898724391186703, + "grad_norm": 0.6527352929115295, + "learning_rate": 1.1073825503355706e-05, + "loss": 0.8472, + "step": 1028 + }, + { + "epoch": 1.5914186316196366, + "grad_norm": 0.7033571004867554, + "learning_rate": 1.1031879194630873e-05, + "loss": 0.7553, + "step": 1029 + }, + { + "epoch": 1.5929648241206031, + "grad_norm": 0.6790737509727478, + "learning_rate": 1.098993288590604e-05, + "loss": 0.8045, + "step": 1030 + }, + { + "epoch": 1.5945110166215692, + "grad_norm": 0.7032893300056458, + "learning_rate": 1.0947986577181209e-05, + "loss": 0.8292, + "step": 1031 + }, + { + "epoch": 1.5960572091225358, + "grad_norm": 0.6649371981620789, + "learning_rate": 1.0906040268456376e-05, + "loss": 0.8032, + "step": 1032 + }, + { + "epoch": 1.5976034016235021, + "grad_norm": 0.7009012699127197, + "learning_rate": 1.0864093959731544e-05, + "loss": 0.9107, + "step": 1033 + }, + { + "epoch": 1.5991495941244684, + "grad_norm": 0.7047061324119568, + "learning_rate": 1.0822147651006712e-05, + "loss": 0.904, + "step": 1034 + }, + { + "epoch": 1.600695786625435, + "grad_norm": 0.7520581483840942, + "learning_rate": 1.078020134228188e-05, + "loss": 0.8972, + "step": 1035 + }, + { + "epoch": 1.602241979126401, + "grad_norm": 0.710966944694519, + "learning_rate": 1.0738255033557047e-05, + "loss": 0.8133, + "step": 1036 + }, + { + "epoch": 1.6037881716273676, + "grad_norm": 0.7045226693153381, + "learning_rate": 1.0696308724832215e-05, + "loss": 0.8474, + "step": 1037 + }, + { + "epoch": 1.605334364128334, + "grad_norm": 0.7222912311553955, + "learning_rate": 1.0654362416107383e-05, + "loss": 0.8577, + "step": 1038 + }, + { + "epoch": 1.6068805566293003, + "grad_norm": 0.7768924236297607, + "learning_rate": 1.0612416107382552e-05, + "loss": 0.8906, + "step": 1039 + }, + { + "epoch": 1.6084267491302668, + "grad_norm": 0.7447935342788696, + "learning_rate": 1.0570469798657718e-05, + "loss": 0.8571, + "step": 1040 + }, + { + "epoch": 1.609972941631233, + "grad_norm": 0.7860161662101746, + "learning_rate": 1.0528523489932887e-05, + "loss": 0.868, + "step": 1041 + }, + { + "epoch": 1.6115191341321995, + "grad_norm": 0.772621750831604, + "learning_rate": 1.0486577181208055e-05, + "loss": 0.9064, + "step": 1042 + }, + { + "epoch": 1.6130653266331658, + "grad_norm": 0.7927963733673096, + "learning_rate": 1.0444630872483221e-05, + "loss": 0.8628, + "step": 1043 + }, + { + "epoch": 1.6146115191341321, + "grad_norm": 0.8811314105987549, + "learning_rate": 1.040268456375839e-05, + "loss": 0.9203, + "step": 1044 + }, + { + "epoch": 1.6161577116350987, + "grad_norm": 1.0155308246612549, + "learning_rate": 1.0360738255033558e-05, + "loss": 0.9134, + "step": 1045 + }, + { + "epoch": 1.6177039041360648, + "grad_norm": 1.3558322191238403, + "learning_rate": 1.0318791946308726e-05, + "loss": 1.007, + "step": 1046 + }, + { + "epoch": 1.6192500966370313, + "grad_norm": 0.771702766418457, + "learning_rate": 1.0276845637583893e-05, + "loss": 0.5492, + "step": 1047 + }, + { + "epoch": 1.6207962891379977, + "grad_norm": 0.5778831243515015, + "learning_rate": 1.0234899328859061e-05, + "loss": 0.5969, + "step": 1048 + }, + { + "epoch": 1.622342481638964, + "grad_norm": 0.5768334865570068, + "learning_rate": 1.019295302013423e-05, + "loss": 0.7561, + "step": 1049 + }, + { + "epoch": 1.6238886741399305, + "grad_norm": 0.6801527142524719, + "learning_rate": 1.0151006711409397e-05, + "loss": 0.6649, + "step": 1050 + }, + { + "epoch": 1.6254348666408966, + "grad_norm": 0.6493854522705078, + "learning_rate": 1.0109060402684564e-05, + "loss": 0.6788, + "step": 1051 + }, + { + "epoch": 1.6269810591418632, + "grad_norm": 0.6070857644081116, + "learning_rate": 1.006711409395973e-05, + "loss": 0.6509, + "step": 1052 + }, + { + "epoch": 1.6285272516428295, + "grad_norm": 0.6425489783287048, + "learning_rate": 1.00251677852349e-05, + "loss": 0.7303, + "step": 1053 + }, + { + "epoch": 1.6300734441437958, + "grad_norm": 0.6311664581298828, + "learning_rate": 9.983221476510067e-06, + "loss": 0.6492, + "step": 1054 + }, + { + "epoch": 1.6316196366447624, + "grad_norm": 0.5983323454856873, + "learning_rate": 9.941275167785235e-06, + "loss": 0.7778, + "step": 1055 + }, + { + "epoch": 1.6331658291457285, + "grad_norm": 0.5775598287582397, + "learning_rate": 9.899328859060402e-06, + "loss": 0.7594, + "step": 1056 + }, + { + "epoch": 1.634712021646695, + "grad_norm": 0.6420764327049255, + "learning_rate": 9.857382550335572e-06, + "loss": 0.7812, + "step": 1057 + }, + { + "epoch": 1.6362582141476614, + "grad_norm": 0.6438013911247253, + "learning_rate": 9.815436241610738e-06, + "loss": 0.7245, + "step": 1058 + }, + { + "epoch": 1.6378044066486277, + "grad_norm": 0.6310963034629822, + "learning_rate": 9.773489932885907e-06, + "loss": 0.7321, + "step": 1059 + }, + { + "epoch": 1.6393505991495942, + "grad_norm": 0.6203198432922363, + "learning_rate": 9.731543624161075e-06, + "loss": 0.6914, + "step": 1060 + }, + { + "epoch": 1.6408967916505603, + "grad_norm": 0.638454258441925, + "learning_rate": 9.689597315436243e-06, + "loss": 0.7478, + "step": 1061 + }, + { + "epoch": 1.642442984151527, + "grad_norm": 0.6612226963043213, + "learning_rate": 9.64765100671141e-06, + "loss": 0.7209, + "step": 1062 + }, + { + "epoch": 1.6439891766524932, + "grad_norm": 0.6566651463508606, + "learning_rate": 9.605704697986576e-06, + "loss": 0.8045, + "step": 1063 + }, + { + "epoch": 1.6455353691534595, + "grad_norm": 0.6543892621994019, + "learning_rate": 9.563758389261746e-06, + "loss": 0.7727, + "step": 1064 + }, + { + "epoch": 1.647081561654426, + "grad_norm": 0.6707619428634644, + "learning_rate": 9.521812080536913e-06, + "loss": 0.7271, + "step": 1065 + }, + { + "epoch": 1.6486277541553922, + "grad_norm": 0.6582595705986023, + "learning_rate": 9.479865771812081e-06, + "loss": 0.7332, + "step": 1066 + }, + { + "epoch": 1.6501739466563587, + "grad_norm": 0.6757826209068298, + "learning_rate": 9.437919463087248e-06, + "loss": 0.7663, + "step": 1067 + }, + { + "epoch": 1.651720139157325, + "grad_norm": 0.698391318321228, + "learning_rate": 9.395973154362418e-06, + "loss": 0.7277, + "step": 1068 + }, + { + "epoch": 1.6532663316582914, + "grad_norm": 0.6885313987731934, + "learning_rate": 9.354026845637584e-06, + "loss": 0.7959, + "step": 1069 + }, + { + "epoch": 1.654812524159258, + "grad_norm": 0.664113461971283, + "learning_rate": 9.312080536912752e-06, + "loss": 0.7571, + "step": 1070 + }, + { + "epoch": 1.656358716660224, + "grad_norm": 0.6730982065200806, + "learning_rate": 9.270134228187919e-06, + "loss": 0.709, + "step": 1071 + }, + { + "epoch": 1.6579049091611906, + "grad_norm": 0.684267520904541, + "learning_rate": 9.228187919463089e-06, + "loss": 0.6932, + "step": 1072 + }, + { + "epoch": 1.659451101662157, + "grad_norm": 0.631272554397583, + "learning_rate": 9.186241610738255e-06, + "loss": 0.8569, + "step": 1073 + }, + { + "epoch": 1.6609972941631233, + "grad_norm": 0.7173067927360535, + "learning_rate": 9.144295302013424e-06, + "loss": 0.777, + "step": 1074 + }, + { + "epoch": 1.6625434866640898, + "grad_norm": 0.6663726568222046, + "learning_rate": 9.10234899328859e-06, + "loss": 0.7523, + "step": 1075 + }, + { + "epoch": 1.664089679165056, + "grad_norm": 0.689304530620575, + "learning_rate": 9.060402684563759e-06, + "loss": 0.7922, + "step": 1076 + }, + { + "epoch": 1.6656358716660224, + "grad_norm": 0.6620500683784485, + "learning_rate": 9.018456375838927e-06, + "loss": 0.8587, + "step": 1077 + }, + { + "epoch": 1.6671820641669888, + "grad_norm": 0.6905922889709473, + "learning_rate": 8.976510067114093e-06, + "loss": 0.7897, + "step": 1078 + }, + { + "epoch": 1.668728256667955, + "grad_norm": 0.6763781309127808, + "learning_rate": 8.934563758389262e-06, + "loss": 0.7886, + "step": 1079 + }, + { + "epoch": 1.6702744491689216, + "grad_norm": 0.684766948223114, + "learning_rate": 8.89261744966443e-06, + "loss": 0.8122, + "step": 1080 + }, + { + "epoch": 1.6718206416698878, + "grad_norm": 0.6934885382652283, + "learning_rate": 8.850671140939598e-06, + "loss": 0.7894, + "step": 1081 + }, + { + "epoch": 1.6733668341708543, + "grad_norm": 0.7051049470901489, + "learning_rate": 8.808724832214765e-06, + "loss": 0.8047, + "step": 1082 + }, + { + "epoch": 1.6749130266718206, + "grad_norm": 0.7332410216331482, + "learning_rate": 8.766778523489935e-06, + "loss": 0.7371, + "step": 1083 + }, + { + "epoch": 1.676459219172787, + "grad_norm": 0.7231751084327698, + "learning_rate": 8.724832214765101e-06, + "loss": 0.7804, + "step": 1084 + }, + { + "epoch": 1.6780054116737535, + "grad_norm": 0.7421995997428894, + "learning_rate": 8.68288590604027e-06, + "loss": 0.8364, + "step": 1085 + }, + { + "epoch": 1.6795516041747196, + "grad_norm": 0.7182170152664185, + "learning_rate": 8.640939597315436e-06, + "loss": 0.848, + "step": 1086 + }, + { + "epoch": 1.6810977966756862, + "grad_norm": 0.7546189427375793, + "learning_rate": 8.598993288590604e-06, + "loss": 0.8336, + "step": 1087 + }, + { + "epoch": 1.6826439891766525, + "grad_norm": 0.7409399747848511, + "learning_rate": 8.557046979865773e-06, + "loss": 0.8151, + "step": 1088 + }, + { + "epoch": 1.6841901816776188, + "grad_norm": 0.7315151691436768, + "learning_rate": 8.515100671140939e-06, + "loss": 0.8126, + "step": 1089 + }, + { + "epoch": 1.6857363741785854, + "grad_norm": 0.7949945330619812, + "learning_rate": 8.473154362416107e-06, + "loss": 0.8533, + "step": 1090 + }, + { + "epoch": 1.6872825666795515, + "grad_norm": 0.7690507769584656, + "learning_rate": 8.431208053691276e-06, + "loss": 0.846, + "step": 1091 + }, + { + "epoch": 1.688828759180518, + "grad_norm": 0.7926616072654724, + "learning_rate": 8.389261744966444e-06, + "loss": 0.8402, + "step": 1092 + }, + { + "epoch": 1.6903749516814843, + "grad_norm": 0.8056994676589966, + "learning_rate": 8.34731543624161e-06, + "loss": 0.8191, + "step": 1093 + }, + { + "epoch": 1.6919211441824507, + "grad_norm": 0.8308284282684326, + "learning_rate": 8.305369127516779e-06, + "loss": 0.9042, + "step": 1094 + }, + { + "epoch": 1.6934673366834172, + "grad_norm": 0.9019404053688049, + "learning_rate": 8.263422818791947e-06, + "loss": 0.8962, + "step": 1095 + }, + { + "epoch": 1.6950135291843833, + "grad_norm": 1.144425630569458, + "learning_rate": 8.221476510067115e-06, + "loss": 1.0569, + "step": 1096 + }, + { + "epoch": 1.6965597216853499, + "grad_norm": 0.7823913097381592, + "learning_rate": 8.179530201342282e-06, + "loss": 0.5956, + "step": 1097 + }, + { + "epoch": 1.6981059141863162, + "grad_norm": 0.5778486132621765, + "learning_rate": 8.13758389261745e-06, + "loss": 0.6246, + "step": 1098 + }, + { + "epoch": 1.6996521066872825, + "grad_norm": 0.6111435890197754, + "learning_rate": 8.095637583892618e-06, + "loss": 0.6681, + "step": 1099 + }, + { + "epoch": 1.701198299188249, + "grad_norm": 0.6174157857894897, + "learning_rate": 8.053691275167785e-06, + "loss": 0.6635, + "step": 1100 + }, + { + "epoch": 1.7027444916892152, + "grad_norm": 0.660089910030365, + "learning_rate": 8.011744966442953e-06, + "loss": 0.6994, + "step": 1101 + }, + { + "epoch": 1.7042906841901817, + "grad_norm": 0.6478524208068848, + "learning_rate": 7.969798657718121e-06, + "loss": 0.6157, + "step": 1102 + }, + { + "epoch": 1.705836876691148, + "grad_norm": 0.6385321617126465, + "learning_rate": 7.92785234899329e-06, + "loss": 0.6967, + "step": 1103 + }, + { + "epoch": 1.7073830691921144, + "grad_norm": 0.6223974227905273, + "learning_rate": 7.885906040268456e-06, + "loss": 0.7572, + "step": 1104 + }, + { + "epoch": 1.708929261693081, + "grad_norm": 0.6408361792564392, + "learning_rate": 7.843959731543624e-06, + "loss": 0.8266, + "step": 1105 + }, + { + "epoch": 1.710475454194047, + "grad_norm": 0.6401566863059998, + "learning_rate": 7.802013422818793e-06, + "loss": 0.7073, + "step": 1106 + }, + { + "epoch": 1.7120216466950136, + "grad_norm": 0.6204596757888794, + "learning_rate": 7.760067114093961e-06, + "loss": 0.6831, + "step": 1107 + }, + { + "epoch": 1.7135678391959799, + "grad_norm": 0.6418159604072571, + "learning_rate": 7.718120805369127e-06, + "loss": 0.6925, + "step": 1108 + }, + { + "epoch": 1.7151140316969462, + "grad_norm": 0.6294611692428589, + "learning_rate": 7.676174496644296e-06, + "loss": 0.7594, + "step": 1109 + }, + { + "epoch": 1.7166602241979128, + "grad_norm": 0.6831735968589783, + "learning_rate": 7.634228187919464e-06, + "loss": 0.6813, + "step": 1110 + }, + { + "epoch": 1.7182064166988789, + "grad_norm": 0.6496366262435913, + "learning_rate": 7.5922818791946305e-06, + "loss": 0.7343, + "step": 1111 + }, + { + "epoch": 1.7197526091998454, + "grad_norm": 0.6550260782241821, + "learning_rate": 7.5503355704698e-06, + "loss": 0.7389, + "step": 1112 + }, + { + "epoch": 1.7212988017008117, + "grad_norm": 0.6319887638092041, + "learning_rate": 7.508389261744966e-06, + "loss": 0.7011, + "step": 1113 + }, + { + "epoch": 1.722844994201778, + "grad_norm": 0.6578625440597534, + "learning_rate": 7.466442953020135e-06, + "loss": 0.7221, + "step": 1114 + }, + { + "epoch": 1.7243911867027446, + "grad_norm": 0.6553155779838562, + "learning_rate": 7.424496644295302e-06, + "loss": 0.7053, + "step": 1115 + }, + { + "epoch": 1.7259373792037107, + "grad_norm": 0.6502689719200134, + "learning_rate": 7.382550335570471e-06, + "loss": 0.8073, + "step": 1116 + }, + { + "epoch": 1.7274835717046773, + "grad_norm": 0.6422849893569946, + "learning_rate": 7.3406040268456375e-06, + "loss": 0.6863, + "step": 1117 + }, + { + "epoch": 1.7290297642056436, + "grad_norm": 0.6866220831871033, + "learning_rate": 7.298657718120807e-06, + "loss": 0.7959, + "step": 1118 + }, + { + "epoch": 1.73057595670661, + "grad_norm": 0.645169198513031, + "learning_rate": 7.256711409395973e-06, + "loss": 0.7729, + "step": 1119 + }, + { + "epoch": 1.7321221492075765, + "grad_norm": 0.654244601726532, + "learning_rate": 7.214765100671142e-06, + "loss": 0.7707, + "step": 1120 + }, + { + "epoch": 1.7336683417085426, + "grad_norm": 0.6695640087127686, + "learning_rate": 7.172818791946309e-06, + "loss": 0.714, + "step": 1121 + }, + { + "epoch": 1.735214534209509, + "grad_norm": 0.6387913227081299, + "learning_rate": 7.130872483221476e-06, + "loss": 0.7654, + "step": 1122 + }, + { + "epoch": 1.7367607267104754, + "grad_norm": 0.664368748664856, + "learning_rate": 7.0889261744966445e-06, + "loss": 0.7525, + "step": 1123 + }, + { + "epoch": 1.7383069192114418, + "grad_norm": 0.7358962893486023, + "learning_rate": 7.046979865771812e-06, + "loss": 0.747, + "step": 1124 + }, + { + "epoch": 1.7398531117124083, + "grad_norm": 0.6654751896858215, + "learning_rate": 7.00503355704698e-06, + "loss": 0.7669, + "step": 1125 + }, + { + "epoch": 1.7413993042133744, + "grad_norm": 0.6763463020324707, + "learning_rate": 6.963087248322148e-06, + "loss": 0.8077, + "step": 1126 + }, + { + "epoch": 1.742945496714341, + "grad_norm": 0.6764401197433472, + "learning_rate": 6.921140939597316e-06, + "loss": 0.7438, + "step": 1127 + }, + { + "epoch": 1.7444916892153073, + "grad_norm": 0.7182475924491882, + "learning_rate": 6.879194630872483e-06, + "loss": 0.728, + "step": 1128 + }, + { + "epoch": 1.7460378817162736, + "grad_norm": 0.6785516142845154, + "learning_rate": 6.8372483221476515e-06, + "loss": 0.7939, + "step": 1129 + }, + { + "epoch": 1.7475840742172402, + "grad_norm": 0.7003461122512817, + "learning_rate": 6.795302013422819e-06, + "loss": 0.7729, + "step": 1130 + }, + { + "epoch": 1.7491302667182063, + "grad_norm": 0.6882821917533875, + "learning_rate": 6.753355704697987e-06, + "loss": 0.9204, + "step": 1131 + }, + { + "epoch": 1.7506764592191728, + "grad_norm": 0.7342216372489929, + "learning_rate": 6.7114093959731546e-06, + "loss": 0.7385, + "step": 1132 + }, + { + "epoch": 1.7522226517201391, + "grad_norm": 0.7073139548301697, + "learning_rate": 6.669463087248322e-06, + "loss": 0.9268, + "step": 1133 + }, + { + "epoch": 1.7537688442211055, + "grad_norm": 0.7231860756874084, + "learning_rate": 6.62751677852349e-06, + "loss": 0.8596, + "step": 1134 + }, + { + "epoch": 1.755315036722072, + "grad_norm": 0.7145645618438721, + "learning_rate": 6.585570469798658e-06, + "loss": 0.8466, + "step": 1135 + }, + { + "epoch": 1.7568612292230381, + "grad_norm": 0.7454160451889038, + "learning_rate": 6.543624161073826e-06, + "loss": 0.7629, + "step": 1136 + }, + { + "epoch": 1.7584074217240047, + "grad_norm": 0.7323503494262695, + "learning_rate": 6.501677852348993e-06, + "loss": 0.8666, + "step": 1137 + }, + { + "epoch": 1.759953614224971, + "grad_norm": 0.7377263307571411, + "learning_rate": 6.4597315436241616e-06, + "loss": 0.8515, + "step": 1138 + }, + { + "epoch": 1.7614998067259373, + "grad_norm": 0.7750667929649353, + "learning_rate": 6.417785234899329e-06, + "loss": 0.8033, + "step": 1139 + }, + { + "epoch": 1.7630459992269039, + "grad_norm": 0.7829360365867615, + "learning_rate": 6.375838926174497e-06, + "loss": 0.8903, + "step": 1140 + }, + { + "epoch": 1.76459219172787, + "grad_norm": 0.7346689701080322, + "learning_rate": 6.333892617449665e-06, + "loss": 0.9772, + "step": 1141 + }, + { + "epoch": 1.7661383842288365, + "grad_norm": 0.7309790253639221, + "learning_rate": 6.291946308724833e-06, + "loss": 0.8317, + "step": 1142 + }, + { + "epoch": 1.7676845767298028, + "grad_norm": 0.8332825899124146, + "learning_rate": 6.25e-06, + "loss": 0.7692, + "step": 1143 + }, + { + "epoch": 1.7692307692307692, + "grad_norm": 0.8708672523498535, + "learning_rate": 6.2080536912751686e-06, + "loss": 0.946, + "step": 1144 + }, + { + "epoch": 1.7707769617317357, + "grad_norm": 0.9887493848800659, + "learning_rate": 6.166107382550336e-06, + "loss": 0.9102, + "step": 1145 + }, + { + "epoch": 1.7723231542327018, + "grad_norm": 1.3964698314666748, + "learning_rate": 6.124161073825504e-06, + "loss": 0.8942, + "step": 1146 + }, + { + "epoch": 1.7738693467336684, + "grad_norm": 0.9236196875572205, + "learning_rate": 6.082214765100672e-06, + "loss": 0.6239, + "step": 1147 + }, + { + "epoch": 1.7754155392346347, + "grad_norm": 0.5753183960914612, + "learning_rate": 6.04026845637584e-06, + "loss": 0.6674, + "step": 1148 + }, + { + "epoch": 1.776961731735601, + "grad_norm": 0.6132087111473083, + "learning_rate": 5.9983221476510064e-06, + "loss": 0.7186, + "step": 1149 + }, + { + "epoch": 1.7785079242365676, + "grad_norm": 0.586824893951416, + "learning_rate": 5.956375838926175e-06, + "loss": 0.7077, + "step": 1150 + }, + { + "epoch": 1.7800541167375337, + "grad_norm": 0.6105204224586487, + "learning_rate": 5.914429530201342e-06, + "loss": 0.7448, + "step": 1151 + }, + { + "epoch": 1.7816003092385002, + "grad_norm": 0.6133350133895874, + "learning_rate": 5.87248322147651e-06, + "loss": 0.7761, + "step": 1152 + }, + { + "epoch": 1.7831465017394665, + "grad_norm": 0.6162737011909485, + "learning_rate": 5.830536912751678e-06, + "loss": 0.7204, + "step": 1153 + }, + { + "epoch": 1.7846926942404329, + "grad_norm": 0.6450937390327454, + "learning_rate": 5.788590604026846e-06, + "loss": 0.7163, + "step": 1154 + }, + { + "epoch": 1.7862388867413994, + "grad_norm": 0.6387403011322021, + "learning_rate": 5.7466442953020134e-06, + "loss": 0.7454, + "step": 1155 + }, + { + "epoch": 1.7877850792423655, + "grad_norm": 0.6444848775863647, + "learning_rate": 5.704697986577182e-06, + "loss": 0.7048, + "step": 1156 + }, + { + "epoch": 1.789331271743332, + "grad_norm": 0.6471322774887085, + "learning_rate": 5.662751677852349e-06, + "loss": 0.75, + "step": 1157 + }, + { + "epoch": 1.7908774642442984, + "grad_norm": 0.6798081398010254, + "learning_rate": 5.620805369127517e-06, + "loss": 0.7096, + "step": 1158 + }, + { + "epoch": 1.7924236567452647, + "grad_norm": 0.6646496653556824, + "learning_rate": 5.578859060402685e-06, + "loss": 0.8017, + "step": 1159 + }, + { + "epoch": 1.7939698492462313, + "grad_norm": 0.6455252170562744, + "learning_rate": 5.536912751677853e-06, + "loss": 0.7323, + "step": 1160 + }, + { + "epoch": 1.7955160417471974, + "grad_norm": 0.6841678023338318, + "learning_rate": 5.49496644295302e-06, + "loss": 0.7373, + "step": 1161 + }, + { + "epoch": 1.797062234248164, + "grad_norm": 0.6608906388282776, + "learning_rate": 5.453020134228188e-06, + "loss": 0.7039, + "step": 1162 + }, + { + "epoch": 1.7986084267491302, + "grad_norm": 0.6611918807029724, + "learning_rate": 5.411073825503356e-06, + "loss": 0.768, + "step": 1163 + }, + { + "epoch": 1.8001546192500966, + "grad_norm": 0.6479055881500244, + "learning_rate": 5.3691275167785235e-06, + "loss": 0.7672, + "step": 1164 + }, + { + "epoch": 1.8017008117510631, + "grad_norm": 0.6645311117172241, + "learning_rate": 5.327181208053692e-06, + "loss": 0.8146, + "step": 1165 + }, + { + "epoch": 1.8032470042520292, + "grad_norm": 0.6700178980827332, + "learning_rate": 5.285234899328859e-06, + "loss": 0.7019, + "step": 1166 + }, + { + "epoch": 1.8047931967529958, + "grad_norm": 0.6892951130867004, + "learning_rate": 5.243288590604027e-06, + "loss": 0.7982, + "step": 1167 + }, + { + "epoch": 1.806339389253962, + "grad_norm": 0.6578270196914673, + "learning_rate": 5.201342281879195e-06, + "loss": 0.7844, + "step": 1168 + }, + { + "epoch": 1.8078855817549284, + "grad_norm": 0.6844699382781982, + "learning_rate": 5.159395973154363e-06, + "loss": 0.73, + "step": 1169 + }, + { + "epoch": 1.809431774255895, + "grad_norm": 0.67879718542099, + "learning_rate": 5.1174496644295305e-06, + "loss": 0.8256, + "step": 1170 + }, + { + "epoch": 1.810977966756861, + "grad_norm": 0.6748877167701721, + "learning_rate": 5.075503355704699e-06, + "loss": 0.7733, + "step": 1171 + }, + { + "epoch": 1.8125241592578276, + "grad_norm": 0.6248130798339844, + "learning_rate": 5.033557046979865e-06, + "loss": 0.7489, + "step": 1172 + }, + { + "epoch": 1.814070351758794, + "grad_norm": 0.6643280386924744, + "learning_rate": 4.9916107382550336e-06, + "loss": 0.726, + "step": 1173 + }, + { + "epoch": 1.8156165442597603, + "grad_norm": 0.6891630291938782, + "learning_rate": 4.949664429530201e-06, + "loss": 0.7442, + "step": 1174 + }, + { + "epoch": 1.8171627367607268, + "grad_norm": 0.6768823862075806, + "learning_rate": 4.907718120805369e-06, + "loss": 0.7376, + "step": 1175 + }, + { + "epoch": 1.818708929261693, + "grad_norm": 0.669154167175293, + "learning_rate": 4.8657718120805375e-06, + "loss": 0.7775, + "step": 1176 + }, + { + "epoch": 1.8202551217626595, + "grad_norm": 0.7224996089935303, + "learning_rate": 4.823825503355705e-06, + "loss": 0.7535, + "step": 1177 + }, + { + "epoch": 1.8218013142636258, + "grad_norm": 0.6954566240310669, + "learning_rate": 4.781879194630873e-06, + "loss": 0.77, + "step": 1178 + }, + { + "epoch": 1.8233475067645921, + "grad_norm": 0.6938985586166382, + "learning_rate": 4.7399328859060405e-06, + "loss": 0.7963, + "step": 1179 + }, + { + "epoch": 1.8248936992655587, + "grad_norm": 0.6518607139587402, + "learning_rate": 4.697986577181209e-06, + "loss": 0.7652, + "step": 1180 + }, + { + "epoch": 1.8264398917665248, + "grad_norm": 0.7072224617004395, + "learning_rate": 4.656040268456376e-06, + "loss": 0.7875, + "step": 1181 + }, + { + "epoch": 1.8279860842674913, + "grad_norm": 0.689007580280304, + "learning_rate": 4.6140939597315445e-06, + "loss": 0.8659, + "step": 1182 + }, + { + "epoch": 1.8295322767684576, + "grad_norm": 0.7117170095443726, + "learning_rate": 4.572147651006712e-06, + "loss": 0.7761, + "step": 1183 + }, + { + "epoch": 1.831078469269424, + "grad_norm": 0.6940242052078247, + "learning_rate": 4.530201342281879e-06, + "loss": 0.8305, + "step": 1184 + }, + { + "epoch": 1.8326246617703905, + "grad_norm": 0.7297004461288452, + "learning_rate": 4.488255033557047e-06, + "loss": 0.8451, + "step": 1185 + }, + { + "epoch": 1.8341708542713566, + "grad_norm": 0.7414118647575378, + "learning_rate": 4.446308724832215e-06, + "loss": 0.7932, + "step": 1186 + }, + { + "epoch": 1.8357170467723232, + "grad_norm": 0.7365830540657043, + "learning_rate": 4.404362416107382e-06, + "loss": 0.8153, + "step": 1187 + }, + { + "epoch": 1.8372632392732895, + "grad_norm": 0.7378683686256409, + "learning_rate": 4.362416107382551e-06, + "loss": 0.8795, + "step": 1188 + }, + { + "epoch": 1.8388094317742558, + "grad_norm": 0.7360830307006836, + "learning_rate": 4.320469798657718e-06, + "loss": 0.863, + "step": 1189 + }, + { + "epoch": 1.8403556242752224, + "grad_norm": 0.7365934252738953, + "learning_rate": 4.278523489932886e-06, + "loss": 0.8836, + "step": 1190 + }, + { + "epoch": 1.8419018167761885, + "grad_norm": 0.7860958576202393, + "learning_rate": 4.236577181208054e-06, + "loss": 0.8834, + "step": 1191 + }, + { + "epoch": 1.843448009277155, + "grad_norm": 0.8424365520477295, + "learning_rate": 4.194630872483222e-06, + "loss": 0.8303, + "step": 1192 + }, + { + "epoch": 1.8449942017781213, + "grad_norm": 0.8641267418861389, + "learning_rate": 4.152684563758389e-06, + "loss": 0.7928, + "step": 1193 + }, + { + "epoch": 1.8465403942790877, + "grad_norm": 0.8995117545127869, + "learning_rate": 4.110738255033558e-06, + "loss": 0.8474, + "step": 1194 + }, + { + "epoch": 1.8480865867800542, + "grad_norm": 0.9355427026748657, + "learning_rate": 4.068791946308725e-06, + "loss": 0.8516, + "step": 1195 + }, + { + "epoch": 1.8496327792810203, + "grad_norm": 1.3238755464553833, + "learning_rate": 4.026845637583892e-06, + "loss": 1.0038, + "step": 1196 + }, + { + "epoch": 1.8511789717819869, + "grad_norm": 0.8634078502655029, + "learning_rate": 3.984899328859061e-06, + "loss": 0.6067, + "step": 1197 + }, + { + "epoch": 1.8527251642829532, + "grad_norm": 0.5876993536949158, + "learning_rate": 3.942953020134228e-06, + "loss": 0.6623, + "step": 1198 + }, + { + "epoch": 1.8542713567839195, + "grad_norm": 0.6208466291427612, + "learning_rate": 3.901006711409396e-06, + "loss": 0.6686, + "step": 1199 + }, + { + "epoch": 1.855817549284886, + "grad_norm": 0.5899839401245117, + "learning_rate": 3.859060402684564e-06, + "loss": 0.7108, + "step": 1200 + }, + { + "epoch": 1.8573637417858522, + "grad_norm": 0.6139258742332458, + "learning_rate": 3.817114093959732e-06, + "loss": 0.6853, + "step": 1201 + }, + { + "epoch": 1.8589099342868187, + "grad_norm": 0.6186558604240417, + "learning_rate": 3.7751677852349e-06, + "loss": 0.6619, + "step": 1202 + }, + { + "epoch": 1.860456126787785, + "grad_norm": 0.6083664298057556, + "learning_rate": 3.7332214765100677e-06, + "loss": 0.7119, + "step": 1203 + }, + { + "epoch": 1.8620023192887514, + "grad_norm": 0.6227363348007202, + "learning_rate": 3.6912751677852355e-06, + "loss": 0.7251, + "step": 1204 + }, + { + "epoch": 1.863548511789718, + "grad_norm": 0.6329985857009888, + "learning_rate": 3.6493288590604033e-06, + "loss": 0.7796, + "step": 1205 + }, + { + "epoch": 1.865094704290684, + "grad_norm": 0.644888162612915, + "learning_rate": 3.607382550335571e-06, + "loss": 0.7747, + "step": 1206 + }, + { + "epoch": 1.8666408967916506, + "grad_norm": 0.6022448539733887, + "learning_rate": 3.565436241610738e-06, + "loss": 0.7027, + "step": 1207 + }, + { + "epoch": 1.868187089292617, + "grad_norm": 0.6584210991859436, + "learning_rate": 3.523489932885906e-06, + "loss": 0.6816, + "step": 1208 + }, + { + "epoch": 1.8697332817935832, + "grad_norm": 0.6227755546569824, + "learning_rate": 3.481543624161074e-06, + "loss": 0.7245, + "step": 1209 + }, + { + "epoch": 1.8712794742945498, + "grad_norm": 0.6115254759788513, + "learning_rate": 3.4395973154362416e-06, + "loss": 0.7511, + "step": 1210 + }, + { + "epoch": 1.8728256667955159, + "grad_norm": 0.638425886631012, + "learning_rate": 3.3976510067114095e-06, + "loss": 0.742, + "step": 1211 + }, + { + "epoch": 1.8743718592964824, + "grad_norm": 0.654451310634613, + "learning_rate": 3.3557046979865773e-06, + "loss": 0.7323, + "step": 1212 + }, + { + "epoch": 1.8759180517974487, + "grad_norm": 0.6516563892364502, + "learning_rate": 3.313758389261745e-06, + "loss": 0.6654, + "step": 1213 + }, + { + "epoch": 1.877464244298415, + "grad_norm": 0.6713635325431824, + "learning_rate": 3.271812080536913e-06, + "loss": 0.7639, + "step": 1214 + }, + { + "epoch": 1.8790104367993816, + "grad_norm": 0.6944072246551514, + "learning_rate": 3.2298657718120808e-06, + "loss": 0.7274, + "step": 1215 + }, + { + "epoch": 1.880556629300348, + "grad_norm": 0.6565424799919128, + "learning_rate": 3.1879194630872486e-06, + "loss": 0.7757, + "step": 1216 + }, + { + "epoch": 1.8821028218013143, + "grad_norm": 0.6998873949050903, + "learning_rate": 3.1459731543624164e-06, + "loss": 0.8059, + "step": 1217 + }, + { + "epoch": 1.8836490143022806, + "grad_norm": 0.6804831624031067, + "learning_rate": 3.1040268456375843e-06, + "loss": 0.7777, + "step": 1218 + }, + { + "epoch": 1.885195206803247, + "grad_norm": 0.6547567248344421, + "learning_rate": 3.062080536912752e-06, + "loss": 0.746, + "step": 1219 + }, + { + "epoch": 1.8867413993042135, + "grad_norm": 0.6822516322135925, + "learning_rate": 3.02013422818792e-06, + "loss": 0.7518, + "step": 1220 + }, + { + "epoch": 1.8882875918051798, + "grad_norm": 0.6750561594963074, + "learning_rate": 2.9781879194630873e-06, + "loss": 0.7261, + "step": 1221 + }, + { + "epoch": 1.8898337843061461, + "grad_norm": 0.6937829256057739, + "learning_rate": 2.936241610738255e-06, + "loss": 0.7716, + "step": 1222 + }, + { + "epoch": 1.8913799768071125, + "grad_norm": 0.6683257818222046, + "learning_rate": 2.894295302013423e-06, + "loss": 0.7657, + "step": 1223 + }, + { + "epoch": 1.8929261693080788, + "grad_norm": 0.6856616735458374, + "learning_rate": 2.852348993288591e-06, + "loss": 0.7833, + "step": 1224 + }, + { + "epoch": 1.8944723618090453, + "grad_norm": 0.719022274017334, + "learning_rate": 2.8104026845637587e-06, + "loss": 0.7462, + "step": 1225 + }, + { + "epoch": 1.8960185543100117, + "grad_norm": 0.6999730467796326, + "learning_rate": 2.7684563758389265e-06, + "loss": 0.7777, + "step": 1226 + }, + { + "epoch": 1.897564746810978, + "grad_norm": 0.710363507270813, + "learning_rate": 2.726510067114094e-06, + "loss": 0.7624, + "step": 1227 + }, + { + "epoch": 1.8991109393119443, + "grad_norm": 0.6629136204719543, + "learning_rate": 2.6845637583892617e-06, + "loss": 0.7578, + "step": 1228 + }, + { + "epoch": 1.9006571318129106, + "grad_norm": 0.6639029383659363, + "learning_rate": 2.6426174496644296e-06, + "loss": 0.8438, + "step": 1229 + }, + { + "epoch": 1.9022033243138772, + "grad_norm": 0.6924868226051331, + "learning_rate": 2.6006711409395974e-06, + "loss": 0.7844, + "step": 1230 + }, + { + "epoch": 1.9037495168148435, + "grad_norm": 0.7059327960014343, + "learning_rate": 2.5587248322147652e-06, + "loss": 0.7654, + "step": 1231 + }, + { + "epoch": 1.9052957093158098, + "grad_norm": 0.6732707619667053, + "learning_rate": 2.5167785234899326e-06, + "loss": 0.7158, + "step": 1232 + }, + { + "epoch": 1.9068419018167762, + "grad_norm": 0.712011456489563, + "learning_rate": 2.4748322147651005e-06, + "loss": 0.8323, + "step": 1233 + }, + { + "epoch": 1.9083880943177425, + "grad_norm": 0.7283968329429626, + "learning_rate": 2.4328859060402687e-06, + "loss": 0.86, + "step": 1234 + }, + { + "epoch": 1.909934286818709, + "grad_norm": 0.736113429069519, + "learning_rate": 2.3909395973154366e-06, + "loss": 0.8779, + "step": 1235 + }, + { + "epoch": 1.9114804793196754, + "grad_norm": 0.7328662276268005, + "learning_rate": 2.3489932885906044e-06, + "loss": 0.8673, + "step": 1236 + }, + { + "epoch": 1.9130266718206417, + "grad_norm": 0.7270045280456543, + "learning_rate": 2.3070469798657722e-06, + "loss": 0.8348, + "step": 1237 + }, + { + "epoch": 1.914572864321608, + "grad_norm": 0.7681441903114319, + "learning_rate": 2.2651006711409396e-06, + "loss": 0.818, + "step": 1238 + }, + { + "epoch": 1.9161190568225743, + "grad_norm": 0.7744415998458862, + "learning_rate": 2.2231543624161075e-06, + "loss": 0.8879, + "step": 1239 + }, + { + "epoch": 1.9176652493235409, + "grad_norm": 0.7644962072372437, + "learning_rate": 2.1812080536912753e-06, + "loss": 0.8545, + "step": 1240 + }, + { + "epoch": 1.9192114418245072, + "grad_norm": 0.7949373722076416, + "learning_rate": 2.139261744966443e-06, + "loss": 0.8603, + "step": 1241 + }, + { + "epoch": 1.9207576343254735, + "grad_norm": 0.8180006146430969, + "learning_rate": 2.097315436241611e-06, + "loss": 0.8146, + "step": 1242 + }, + { + "epoch": 1.9223038268264399, + "grad_norm": 0.8280307650566101, + "learning_rate": 2.055369127516779e-06, + "loss": 0.9628, + "step": 1243 + }, + { + "epoch": 1.9238500193274062, + "grad_norm": 0.8613501787185669, + "learning_rate": 2.013422818791946e-06, + "loss": 0.8326, + "step": 1244 + }, + { + "epoch": 1.9253962118283727, + "grad_norm": 0.9388262629508972, + "learning_rate": 1.971476510067114e-06, + "loss": 0.9428, + "step": 1245 + }, + { + "epoch": 1.926942404329339, + "grad_norm": 1.25608491897583, + "learning_rate": 1.929530201342282e-06, + "loss": 0.8971, + "step": 1246 + }, + { + "epoch": 1.9284885968303054, + "grad_norm": 0.8791813254356384, + "learning_rate": 1.88758389261745e-06, + "loss": 0.6034, + "step": 1247 + }, + { + "epoch": 1.9300347893312717, + "grad_norm": 0.5444540977478027, + "learning_rate": 1.8456375838926177e-06, + "loss": 0.6993, + "step": 1248 + }, + { + "epoch": 1.931580981832238, + "grad_norm": 0.6073552966117859, + "learning_rate": 1.8036912751677856e-06, + "loss": 0.7178, + "step": 1249 + }, + { + "epoch": 1.9331271743332046, + "grad_norm": 0.6113264560699463, + "learning_rate": 1.761744966442953e-06, + "loss": 0.6966, + "step": 1250 + }, + { + "epoch": 1.934673366834171, + "grad_norm": 0.5979108810424805, + "learning_rate": 1.7197986577181208e-06, + "loss": 0.7133, + "step": 1251 + }, + { + "epoch": 1.9362195593351372, + "grad_norm": 0.6315338611602783, + "learning_rate": 1.6778523489932886e-06, + "loss": 0.7721, + "step": 1252 + }, + { + "epoch": 1.9377657518361036, + "grad_norm": 0.6291902661323547, + "learning_rate": 1.6359060402684565e-06, + "loss": 0.7006, + "step": 1253 + }, + { + "epoch": 1.9393119443370699, + "grad_norm": 0.6238860487937927, + "learning_rate": 1.5939597315436243e-06, + "loss": 0.7787, + "step": 1254 + }, + { + "epoch": 1.9408581368380364, + "grad_norm": 0.6401642560958862, + "learning_rate": 1.5520134228187921e-06, + "loss": 0.7603, + "step": 1255 + }, + { + "epoch": 1.9424043293390028, + "grad_norm": 0.6174636483192444, + "learning_rate": 1.51006711409396e-06, + "loss": 0.7259, + "step": 1256 + }, + { + "epoch": 1.943950521839969, + "grad_norm": 0.6495472192764282, + "learning_rate": 1.4681208053691276e-06, + "loss": 0.6654, + "step": 1257 + }, + { + "epoch": 1.9454967143409354, + "grad_norm": 0.667812168598175, + "learning_rate": 1.4261744966442954e-06, + "loss": 0.7147, + "step": 1258 + }, + { + "epoch": 1.9470429068419017, + "grad_norm": 0.6613562703132629, + "learning_rate": 1.3842281879194633e-06, + "loss": 0.7437, + "step": 1259 + }, + { + "epoch": 1.9485890993428683, + "grad_norm": 0.7061516046524048, + "learning_rate": 1.3422818791946309e-06, + "loss": 0.6831, + "step": 1260 + }, + { + "epoch": 1.9501352918438346, + "grad_norm": 0.6739703416824341, + "learning_rate": 1.3003355704697987e-06, + "loss": 0.747, + "step": 1261 + }, + { + "epoch": 1.951681484344801, + "grad_norm": 0.6422568559646606, + "learning_rate": 1.2583892617449663e-06, + "loss": 0.7569, + "step": 1262 + }, + { + "epoch": 1.9532276768457673, + "grad_norm": 0.6601455211639404, + "learning_rate": 1.2164429530201344e-06, + "loss": 0.7461, + "step": 1263 + }, + { + "epoch": 1.9547738693467336, + "grad_norm": 0.6846182942390442, + "learning_rate": 1.1744966442953022e-06, + "loss": 0.7185, + "step": 1264 + }, + { + "epoch": 1.9563200618477001, + "grad_norm": 0.6746420860290527, + "learning_rate": 1.1325503355704698e-06, + "loss": 0.6806, + "step": 1265 + }, + { + "epoch": 1.9578662543486665, + "grad_norm": 0.6510487198829651, + "learning_rate": 1.0906040268456377e-06, + "loss": 0.7693, + "step": 1266 + }, + { + "epoch": 1.9594124468496328, + "grad_norm": 0.6752596497535706, + "learning_rate": 1.0486577181208055e-06, + "loss": 0.899, + "step": 1267 + }, + { + "epoch": 1.960958639350599, + "grad_norm": 0.698874294757843, + "learning_rate": 1.006711409395973e-06, + "loss": 0.7521, + "step": 1268 + }, + { + "epoch": 1.9625048318515654, + "grad_norm": 0.6726242899894714, + "learning_rate": 9.64765100671141e-07, + "loss": 0.7782, + "step": 1269 + }, + { + "epoch": 1.964051024352532, + "grad_norm": 0.6857203245162964, + "learning_rate": 9.228187919463089e-07, + "loss": 0.7344, + "step": 1270 + }, + { + "epoch": 1.9655972168534983, + "grad_norm": 0.7023079991340637, + "learning_rate": 8.808724832214765e-07, + "loss": 0.7533, + "step": 1271 + }, + { + "epoch": 1.9671434093544646, + "grad_norm": 0.6352121233940125, + "learning_rate": 8.389261744966443e-07, + "loss": 0.7554, + "step": 1272 + }, + { + "epoch": 1.968689601855431, + "grad_norm": 0.6913352608680725, + "learning_rate": 7.969798657718122e-07, + "loss": 0.7468, + "step": 1273 + }, + { + "epoch": 1.9702357943563973, + "grad_norm": 0.6950103640556335, + "learning_rate": 7.5503355704698e-07, + "loss": 0.8532, + "step": 1274 + }, + { + "epoch": 1.9717819868573638, + "grad_norm": 0.6684133410453796, + "learning_rate": 7.130872483221477e-07, + "loss": 0.8302, + "step": 1275 + }, + { + "epoch": 1.9733281793583302, + "grad_norm": 0.6860172152519226, + "learning_rate": 6.711409395973154e-07, + "loss": 0.7342, + "step": 1276 + }, + { + "epoch": 1.9748743718592965, + "grad_norm": 0.6672773957252502, + "learning_rate": 6.291946308724832e-07, + "loss": 0.7901, + "step": 1277 + }, + { + "epoch": 1.9764205643602628, + "grad_norm": 0.7022169828414917, + "learning_rate": 5.872483221476511e-07, + "loss": 0.7796, + "step": 1278 + }, + { + "epoch": 1.9779667568612291, + "grad_norm": 0.6778275370597839, + "learning_rate": 5.453020134228188e-07, + "loss": 0.8389, + "step": 1279 + }, + { + "epoch": 1.9795129493621957, + "grad_norm": 0.721585750579834, + "learning_rate": 5.033557046979866e-07, + "loss": 0.848, + "step": 1280 + }, + { + "epoch": 1.981059141863162, + "grad_norm": 0.6909308433532715, + "learning_rate": 4.6140939597315444e-07, + "loss": 0.8214, + "step": 1281 + }, + { + "epoch": 1.9826053343641283, + "grad_norm": 0.7320754528045654, + "learning_rate": 4.1946308724832216e-07, + "loss": 0.8179, + "step": 1282 + }, + { + "epoch": 1.9841515268650947, + "grad_norm": 0.7341198325157166, + "learning_rate": 3.7751677852349e-07, + "loss": 0.8212, + "step": 1283 + }, + { + "epoch": 1.985697719366061, + "grad_norm": 0.7166808843612671, + "learning_rate": 3.355704697986577e-07, + "loss": 0.8472, + "step": 1284 + }, + { + "epoch": 1.9872439118670275, + "grad_norm": 0.7207593321800232, + "learning_rate": 2.9362416107382555e-07, + "loss": 0.8408, + "step": 1285 + }, + { + "epoch": 1.9887901043679939, + "grad_norm": 0.7933880090713501, + "learning_rate": 2.516778523489933e-07, + "loss": 0.8781, + "step": 1286 + }, + { + "epoch": 1.9903362968689602, + "grad_norm": 0.7923394441604614, + "learning_rate": 2.0973154362416108e-07, + "loss": 0.9313, + "step": 1287 + }, + { + "epoch": 1.9918824893699265, + "grad_norm": 0.7550710439682007, + "learning_rate": 1.6778523489932886e-07, + "loss": 0.8893, + "step": 1288 + }, + { + "epoch": 1.9934286818708928, + "grad_norm": 0.8748595118522644, + "learning_rate": 1.2583892617449664e-07, + "loss": 0.9071, + "step": 1289 + }, + { + "epoch": 1.9949748743718594, + "grad_norm": 0.8392748236656189, + "learning_rate": 8.389261744966443e-08, + "loss": 0.9107, + "step": 1290 + }, + { + "epoch": 1.9965210668728257, + "grad_norm": 0.8647322058677673, + "learning_rate": 4.1946308724832215e-08, + "loss": 0.8547, + "step": 1291 + }, + { + "epoch": 1.998067259373792, + "grad_norm": 1.0320862531661987, + "learning_rate": 0.0, + "loss": 0.8732, + "step": 1292 + } + ], + "logging_steps": 1, + "max_steps": 1292, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.697955930285742e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}